library(data.table)
library(Rsubread)
library(GeneOverlap)

# Code for a more in-depth statistical analysis of H3K9me2 and sRNA levels at ATHILA and non-ATHILA elements

# Step 1: read the TAIR10 TE annotations that will be reformatted into custom
# featureCounts annotation files. Both input tables receive the same six
# column names.
te_columns <- c("te_name", "5prime_orientation", "start", "end", "family", "superfamily")

# All annotated TEs
tair10_te <- setnames(fread("TAIR10_Transposable_Elements.txt"), te_columns)
# TEs with K9me2 up-regulation in ddm1 vs ddm1rdr6
ddm1_up_te <- setnames(fread("ddm1_up_TE_intersection.txt"), te_columns)

# Chr IDs are implied (via the TE ID) but not directly specified in these
# files, so the tables still need reformatting further down.

# Keep only the rows whose family name contains "ATHILA"
ddm1_up_te <- ddm1_up_te[grepl("ATHILA", family)]
tair10_te <- tair10_te[grepl("ATHILA", family)]

# featureCounts Simplified Annotation Format (SAF) is a tab-separated table
# with the columns: GeneID - Chr - Start - End - Strand

# Convert a TE annotation table (columns te_name, start, end) into SAF layout.
#
# The chromosome is not a column of the annotation; it is taken from the TE ID
# (AT1TE... -> Chr1, ..., AT5TE... -> Chr5). Rows whose ID does not follow that
# scheme are dropped. The result is grouped by chromosome (Chr1 to Chr5) and
# keeps the input order within each chromosome. Strand is written as '-' for
# every element.
#
# Building the table in one go (instead of one cbind() per chromosome) also
# copes with a chromosome that has no matching element: cbind() ignores
# zero-length vectors, so the per-chromosome version produced a malformed
# fragment that could not be row-bound with the others.
te_to_saf <- function(te) {
  id_pattern <- "^AT([1-5])TE.*$"
  has_chr <- grepl(id_pattern, te$te_name)
  gene_id <- te$te_name[has_chr]
  chr <- sub(id_pattern, "Chr\\1", gene_id)
  # order() leaves ties in their original order, so rows stay in input order
  # inside each chromosome.
  by_chr <- order(chr)
  data.table(
    GeneID = gene_id[by_chr],
    Chr = chr[by_chr],
    Start = te$start[has_chr][by_chr],
    End = te$end[has_chr][by_chr],
    Strand = rep("-", length(by_chr))
  )
}

# ATHILA elements with K9me2 up-regulation in ddm1 vs ddm1rdr6
ddm1_up.saf <- te_to_saf(ddm1_up_te)

# All annotated ATHILA elements
tair10.saf <- te_to_saf(tair10_te)

# Drop from tair10.saf every entry that is already present in ddm1_up.saf
not_in_ddm1_up <- !(tair10.saf$GeneID %in% ddm1_up.saf$GeneID)
tair10.sub.saf <- tair10.saf[not_in_ddm1_up]

# Write both SAF tables as tab-separated files (header row, no row names,
# no quoting)
write.table(ddm1_up.saf, file = "ddm1_up.saf", sep = "\t", quote = FALSE,
            row.names = FALSE, col.names = TRUE)
write.table(tair10.sub.saf, file = "tair10.sub.saf", sep = "\t", quote = FALSE,
            row.names = FALSE, col.names = TRUE)

###########################
# Use featureCounts to assign reads overlapping with annotated features.
# Each genotype has three BAM files (*_21_22, *_23 and *_24) and is counted
# twice: against ddm1_up.saf (ATHILA elements with K9me2 up-regulation) and
# against tair10.sub.saf (the remaining ATHILA elements). Counting is
# unstranded (strandSpecific = 0).

bam_wt <- c("WT_21_22_whole.bam", "WT_23_whole.bam", "WT_24_whole.bam")
bam_ddm1 <- c("ddm1_21_22_whole.bam", "ddm1_23_whole.bam", "ddm1_24_whole.bam")
bam_ddm1rdr6 <- c("ddm1rdr6_21_22_whole.bam", "ddm1rdr6_23_whole.bam", "ddm1rdr6_24_whole.bam")

# WT genotype
wt.athila <- featureCounts(files = bam_wt, annot.ext = "ddm1_up.saf", strandSpecific = 0)
wt.other <- featureCounts(files = bam_wt, annot.ext = "tair10.sub.saf", strandSpecific = 0)

# ddm1 genotype
ddm1.athila <- featureCounts(files = bam_ddm1, annot.ext = "ddm1_up.saf", strandSpecific = 0)
ddm1.other <- featureCounts(files = bam_ddm1, annot.ext = "tair10.sub.saf", strandSpecific = 0)

# ddm1rdr6 genotype
ddm1rdr6.athila <- featureCounts(files = bam_ddm1rdr6, annot.ext = "ddm1_up.saf", strandSpecific = 0)
ddm1rdr6.other <- featureCounts(files = bam_ddm1rdr6, annot.ext = "tair10.sub.saf", strandSpecific = 0)

# Extract relevant data: one table per genotype and annotation, holding the
# featureCounts annotation columns followed by the per-BAM read counts.

# Combine the annotation and count matrix of a featureCounts result.
#
# The code further down looks the count columns up by name
# (e.g. WT.21.22.whole.bam), while the BAM files themselves are named with
# underscores (WT_21_22_whole.bam). The expected names are therefore assigned
# explicitly, in the order the BAM files were passed to featureCounts, so the
# later lookups do not depend on how featureCounts spells the column names.
# NOTE(review): confirm the naming used by the installed Rsubread version.
count_table <- function(fc, count_names) {
  counts <- fc$counts
  if (ncol(counts) != length(count_names)) {
    stop("Expected ", length(count_names), " count columns, found ",
         ncol(counts), call. = FALSE)
  }
  colnames(counts) <- count_names
  data.table(fc$annotation, counts)
}

wt_count_cols <- c("WT.21.22.whole.bam", "WT.23.whole.bam", "WT.24.whole.bam")
ddm1_count_cols <- c("ddm1.21.22.whole.bam", "ddm1.23.whole.bam", "ddm1.24.whole.bam")
ddm1rdr6_count_cols <- c("ddm1rdr6.21.22.whole.bam", "ddm1rdr6.23.whole.bam",
                         "ddm1rdr6.24.whole.bam")

wt.athila.dt <- count_table(wt.athila, wt_count_cols)
ddm1.athila.dt <- count_table(ddm1.athila, ddm1_count_cols)
ddm1rdr6.athila.dt <- count_table(ddm1rdr6.athila, ddm1rdr6_count_cols)

wt.other.dt <- count_table(wt.other, wt_count_cols)
ddm1.other.dt <- count_table(ddm1.other, ddm1_count_cols)
ddm1rdr6.other.dt <- count_table(ddm1rdr6.other, ddm1rdr6_count_cols)

# Per-library totals used as the RPM denominator.
# NOTE(review): presumably the total read count of each BAM file; the script
# does not show where these numbers come from.
wt.21_22 <- 2933894
wt.23 <- 1783414
wt.24 <- 7393410

ddm1.21_22 <- 5687665
ddm1.23 <- 1657199
ddm1.24 <- 6495236

ddm1rdr6.21_22 <- 2987883
ddm1rdr6.23 <- 1835242
ddm1rdr6.24 <- 9160371

# Library totals keyed by the count column they normalise
wt_lib_sizes <- c(WT.21.22.whole.bam = wt.21_22,
                  WT.23.whole.bam = wt.23,
                  WT.24.whole.bam = wt.24)
ddm1_lib_sizes <- c(ddm1.21.22.whole.bam = ddm1.21_22,
                    ddm1.23.whole.bam = ddm1.23,
                    ddm1.24.whole.bam = ddm1.24)
ddm1rdr6_lib_sizes <- c(ddm1rdr6.21.22.whole.bam = ddm1rdr6.21_22,
                        ddm1rdr6.23.whole.bam = ddm1rdr6.23,
                        ddm1rdr6.24.whole.bam = ddm1rdr6.24)

# Append the normalised columns for every count column named in `lib_sizes`
# (named numeric vector: name = count column, value = library total):
#   <count column>.rpm : count / (library total / 1e6) + 1
#                        (reads per million with a +1 pseudocount)
#   <sample>.log2      : log2 of the .rpm column, where <sample> is the count
#                        column name without its ".whole.bam" suffix
# All .rpm columns are added first, then all .log2 columns. `tbl` is a
# data.table; set() extends it in place and the table is returned as well.
add_rpm_log2 <- function(tbl, lib_sizes) {
  for (count_col in names(lib_sizes)) {
    per_million <- lib_sizes[[count_col]] / 1000000
    set(tbl, j = paste0(count_col, ".rpm"),
        value = tbl[[count_col]] / per_million + 1)
  }
  for (count_col in names(lib_sizes)) {
    set(tbl, j = sub(".whole.bam", ".log2", count_col, fixed = TRUE),
        value = log2(tbl[[paste0(count_col, ".rpm")]]))
  }
  tbl
}

######## H3K9me2 Regulated Athila elements ########

wt.athila.dt <- add_rpm_log2(wt.athila.dt, wt_lib_sizes)
ddm1.athila.dt <- add_rpm_log2(ddm1.athila.dt, ddm1_lib_sizes)
ddm1rdr6.athila.dt <- add_rpm_log2(ddm1rdr6.athila.dt, ddm1rdr6_lib_sizes)

######## Other Athila elements ########

wt.other.dt <- add_rpm_log2(wt.other.dt, wt_lib_sizes)
ddm1.other.dt <- add_rpm_log2(ddm1.other.dt, ddm1_lib_sizes)
ddm1rdr6.other.dt <- add_rpm_log2(ddm1rdr6.other.dt, ddm1rdr6_lib_sizes)

# Write the count tables for later use: tab-separated, header row, no row
# names, no quoting. List names are the output file names.
athila_outputs <- list(
  "wt.athila.counts.tsv" = wt.athila.dt,
  "ddm1.athila.counts.tsv" = ddm1.athila.dt,
  "ddm1rdr6.athila.counts.tsv" = ddm1rdr6.athila.dt,
  "wt.other.counts.tsv" = wt.other.dt,
  "ddm1.other.counts.tsv" = ddm1.other.dt,
  "ddm1rdr6.other.counts.tsv" = ddm1rdr6.other.dt
)

for (out_path in names(athila_outputs)) {
  write.table(athila_outputs[[out_path]], file = out_path, sep = "\t",
              quote = FALSE, row.names = FALSE, col.names = TRUE)
}

###################################### Performing the above analysis with COPIA elements ######################################

# Re-read the full TE annotation (tair10_te was narrowed to ATHILA elements
# above) and keep the rows whose family name contains "COPIA".
tair10_te <- fread("TAIR10_Transposable_Elements.txt")
names(tair10_te) <- c('te_name','5prime_orientation','start','end','family','superfamily')

tair10_te <- tair10_te[grepl("COPIA",tair10_te$family)]

# Build the SAF table (GeneID - Chr - Start - End - Strand) in one step.
# The chromosome is taken from the TE ID (AT1TE... -> Chr1, ..., AT5TE... ->
# Chr5); rows with any other ID are dropped. Rows are grouped by chromosome
# and keep the annotation order within each chromosome. Strand is '-' for
# every element.
#
# Doing this for all chromosomes at once also copes with a chromosome that has
# no COPIA element: cbind() ignores zero-length vectors, so a per-chromosome
# cbind() produced a malformed fragment that could not be row-bound with the
# others.
copia_id_pattern <- "^AT([1-5])TE.*$"
copia_has_chr <- grepl(copia_id_pattern, tair10_te$te_name)
copia_gene_id <- tair10_te$te_name[copia_has_chr]
copia_chr <- sub(copia_id_pattern, "Chr\\1", copia_gene_id)
# order() leaves ties in their original order
copia_by_chr <- order(copia_chr)

tair10.copia.saf <- data.table(
  GeneID = copia_gene_id[copia_by_chr],
  Chr = copia_chr[copia_by_chr],
  Start = tair10_te$start[copia_has_chr][copia_by_chr],
  End = tair10_te$end[copia_has_chr][copia_by_chr],
  Strand = rep("-", length(copia_by_chr))
)

write.table(tair10.copia.saf,file='tair10.copia.saf',col.names = TRUE,row.names = FALSE,quote = FALSE,sep = '\t')

#############

# Count reads over the COPIA annotation for one genotype's BAM files
# (unstranded counting).
count_copia <- function(bam_files) {
  featureCounts(files = bam_files, annot.ext = "tair10.copia.saf", strandSpecific = 0)
}

wt.copia <- count_copia(c("WT_21_22_whole.bam", "WT_23_whole.bam", "WT_24_whole.bam"))

ddm1.copia <- count_copia(c("ddm1_21_22_whole.bam", "ddm1_23_whole.bam", "ddm1_24_whole.bam"))

ddm1rdr6.copia <- count_copia(c("ddm1rdr6_21_22_whole.bam", "ddm1rdr6_23_whole.bam",
                                "ddm1rdr6_24_whole.bam"))


# Combine the annotation and count matrix of a COPIA featureCounts result.
#
# The normalisation code below looks the count columns up by name
# (e.g. WT.21.22.whole.bam), while the BAM files themselves are named with
# underscores (WT_21_22_whole.bam). The expected names are therefore assigned
# explicitly, in the order the BAM files were passed to featureCounts, so the
# later lookups do not depend on how featureCounts spells the column names.
# NOTE(review): confirm the naming used by the installed Rsubread version.
copia_count_table <- function(fc, count_names) {
  counts <- fc$counts
  if (ncol(counts) != length(count_names)) {
    stop("Expected ", length(count_names), " count columns, found ",
         ncol(counts), call. = FALSE)
  }
  colnames(counts) <- count_names
  data.table(fc$annotation, counts)
}

wt.copia.dt <- copia_count_table(
  wt.copia, c("WT.21.22.whole.bam", "WT.23.whole.bam", "WT.24.whole.bam"))
ddm1.copia.dt <- copia_count_table(
  ddm1.copia, c("ddm1.21.22.whole.bam", "ddm1.23.whole.bam", "ddm1.24.whole.bam"))
ddm1rdr6.copia.dt <- copia_count_table(
  ddm1rdr6.copia,
  c("ddm1rdr6.21.22.whole.bam", "ddm1rdr6.23.whole.bam", "ddm1rdr6.24.whole.bam"))

# Per-library totals used as the RPM denominator.
# NOTE(review): presumably the total read count of each BAM file; the script
# does not show where these numbers come from.
wt.21_22 <- 2933894
wt.23 <- 1783414
wt.24 <- 7393410

ddm1.21_22 <- 5687665
ddm1.23 <- 1657199
ddm1.24 <- 6495236

ddm1rdr6.21_22 <- 2987883
ddm1rdr6.23 <- 1835242
ddm1rdr6.24 <- 9160371

######## COPIA elements ########

# Append, for every count column named in `lib_sizes` (named numeric vector:
# name = count column, value = library total), first the RPM columns
#   <count column>.rpm : count / (library total / 1e6) + 1
# and then their log2 conversion
#   <sample>.log2      : log2 of the .rpm column, where <sample> is the count
#                        column name without its ".whole.bam" suffix.
# `tbl` is a data.table; set() extends it in place and the table is returned.
with_copia_rpm_log2 <- function(tbl, lib_sizes) {
  count_cols <- names(lib_sizes)
  rpm_cols <- paste0(count_cols, ".rpm")
  log2_cols <- sub(".whole.bam", ".log2", count_cols, fixed = TRUE)

  for (k in seq_along(count_cols)) {
    scale_factor <- lib_sizes[[k]] / 1000000
    set(tbl, j = rpm_cols[k], value = tbl[[count_cols[k]]] / scale_factor + 1)
  }
  # Log2 conversion
  for (k in seq_along(count_cols)) {
    set(tbl, j = log2_cols[k], value = log2(tbl[[rpm_cols[k]]]))
  }
  tbl
}

wt.copia.dt <- with_copia_rpm_log2(
  wt.copia.dt,
  c(WT.21.22.whole.bam = wt.21_22, WT.23.whole.bam = wt.23, WT.24.whole.bam = wt.24)
)

ddm1.copia.dt <- with_copia_rpm_log2(
  ddm1.copia.dt,
  c(ddm1.21.22.whole.bam = ddm1.21_22, ddm1.23.whole.bam = ddm1.23,
    ddm1.24.whole.bam = ddm1.24)
)

ddm1rdr6.copia.dt <- with_copia_rpm_log2(
  ddm1rdr6.copia.dt,
  c(ddm1rdr6.21.22.whole.bam = ddm1rdr6.21_22, ddm1rdr6.23.whole.bam = ddm1rdr6.23,
    ddm1rdr6.24.whole.bam = ddm1rdr6.24)
)

# Write the COPIA count tables: tab-separated, header row, no row names,
# no quoting. List names are the output file names.
copia_outputs <- list(
  "wt.copia.counts.tsv" = wt.copia.dt,
  "ddm1.copia.counts.tsv" = ddm1.copia.dt,
  "ddm1rdr6.copia.counts.tsv" = ddm1rdr6.copia.dt
)

for (copia_path in names(copia_outputs)) {
  write.table(copia_outputs[[copia_path]], file = copia_path, sep = "\t",
              quote = FALSE, row.names = FALSE, col.names = TRUE)
}

#########################################

# Contingency testing (Fisher's exact test) to determine the significance of
# the correlation between H3K9me2 and sRNAs at a subset of Athila elements.

library(GeneOverlap)

# The ID vectors tested below are not created anywhere in this script, so they
# must already exist in the session. Fail with an explicit message instead of
# an "object not found" error part-way through.
# NOTE(review): k9_high is presumably the set of TE IDs with high H3K9me2, and
# all_athila_id / other_id / copia_id presumably the GeneID columns of the SAF
# tables built above -- confirm and define them here.
overlap_inputs <- c("k9_high", "all_athila_id", "other_id", "copia_id")
absent_inputs <- overlap_inputs[!vapply(overlap_inputs, exists, logical(1))]
if (length(absent_inputs) > 0) {
  stop("Define these ID vectors before running the overlap tests: ",
       paste(absent_inputs, collapse = ", "), call. = FALSE)
}

# genome.size is the size of the background set for the test.
# NOTE(review): 2859 and 1553 are presumably the numbers of annotated ATHILA
# and COPIA elements -- confirm against the annotation.
# The test results are only stored in the go.*.test objects; nothing is
# printed or written to disk here.
go.athila <- newGeneOverlap(k9_high, all_athila_id,genome.size = 2859)
go.athila.test <- testGeneOverlap(go.athila)

go.athila.minus <- newGeneOverlap(k9_high, other_id,genome.size = 2859)
go.athila.minus.test <- testGeneOverlap(go.athila.minus)

go.copia <- newGeneOverlap(k9_high, copia_id,genome.size = 1553)
go.copia.test <- testGeneOverlap(go.copia)







