
# Make the following lists:

# Non expressed genes
# Expressed genes

# Non expressed TEs
# Expressed genic TEs
# Expressed non-genic TEs 

# Use the small RNA annotation (bed file) to do a bedtools closest to all the lists.

# ---- Packages and output location ----
# The code below calls dplyr verbs (anti_join, mutate, arrange, %>%), ggplot2
# and data.table unqualified, so they have to be attached before use.
library(dplyr)
library(ggplot2)
library(data.table)

# `out_dir` is used by every write.table() call in this script but is never
# assigned in it. Keep a value supplied by the session; otherwise fall back to
# the working directory, which is where the shell steps (sort / bedtools) and
# the later read.table() calls look for the files.
if (!exists("out_dir")) {
  out_dir <- "."
}

# ---- Input tables ----
# The bare object / file names that used to sit here were notes, not code
# (evaluating them fails with "object not found"); they are kept as comments.

# Geneannotation
# Clemannot_with_Maedafunction_withoutTEs.bed
Geneannotation <- read.table("Clemannot_with_Maedafunction_withoutTEs.bed",header = FALSE, sep="\t", stringsAsFactors=FALSE, quote="")

# TEannotation
# Rir_HGAP_ii_V2.fa.out.bed
# NOTE(review): the note names the .bed file but the code reads the
# ".copy.txt" version -- confirm which one is intended.
TEannotation <- read.table("Rir_HGAP_ii_V2.fa.out.copy.txt", header = FALSE, fill=TRUE)

# Genecounts
genecounts <- read.table("Seb_annotation_allcounts3.txt", header = TRUE)

# TEcounts
TEcounts <- read.table("MM100_TE2_allcounts.txt", header = TRUE, fill=TRUE)


# Normalise TRANSPOSON counts by RPKM

# Drop annotation rows whose first column is "*"
keep_row <- TEannotation$V1 != "*"
TEannotation <- TEannotation[keep_row, ]
# Give every remaining TE a fresh running number (stored in V15, later renamed
# Geneid) so that IDs are no longer duplicated
TEannotation$V15 <- seq.int(nrow(TEannotation))
# Eyeball the ends of both tables to check that the numbering lines up
tail(TEcounts)
tail(TEannotation)

# Sample columns 7..28 of the count table get one-letter names a..v.
# Conditions as annotated in the original script:
#   a-d = 0h, e-h = 24 e, i-l = 24 h, m-p = 48 h, q-t = 48 e, u-v = mays
names(TEcounts)[7:28] <- letters[1:22]

# Keep Geneid plus samples a..t, with the i-l group moved ahead of e-h
sample_order <- letters[c(1:4, 9:12, 5:8, 13:20)]
TEcounts <- TEcounts[, c("Geneid", sample_order)]

# BED-like annotation layout; V2 is carried along to check % divergence later
annotation_cols <- c("V5", "V6", "V7", "V10", "V11", "V15", "V2")
TEannotation <- TEannotation[, annotation_cols]
names(TEannotation)[6:7] <- c("Geneid", "percdiv")

# Element length from the two coordinate columns, then the >100 bp subset
TEannotation$length <- with(TEannotation, abs(V7 - V6))
long_enough <- TEannotation$length > 100
TEannotation_filt <- TEannotation[long_enough, ] #134979 left

# ---- TE counts: attach annotation, length-filter, normalise to RPKM ----

# Merge both data frames on Geneid (adds coordinates, repeat class,
# % divergence and length to every counted locus).
TEcounts <- merge(TEcounts, TEannotation, by = "Geneid")

# Keep only loci longer than 100 bp (134979 left in the recorded run)
TEcounts_filt <- TEcounts[TEcounts$length > 100, ]
rownames(TEcounts_filt) <- TEcounts_filt$Geneid

# Per-sample library size = column sums of the merged count table (computed
# before the length filter). The original note describes this as the number of
# mapped reads.
lib_size <- colSums(subset(TEcounts, select = c(a:t)))
head(lib_size, n = 20)

# Transform counts to RPKM.
# FIX: `data.frame / vector` recycles the vector down the rows (column-major),
# so dividing the count table directly by `lib_size` divided row i by the
# library size of sample ((i - 1) %% 20) + 1 instead of dividing each column by
# its own library size. sweep(..., 2, ...) applies one value per column.
# Dividing by the length vector afterwards is correct as written because it has
# one value per row.
te_counts <- subset(TEcounts_filt, select = c(a:t))
te_per_million <- sweep(te_counts, 2, lib_size / 1000000, "/")
TEcounts_RPKM <- te_per_million / (TEcounts_filt$length / 1000)
TEcounts_RPKM$Geneid <- rownames(TEcounts_RPKM)

# Expressed = RPKM >= 1 in more than 5 samples (the code uses `> 5`, i.e. at
# least 6 samples; earlier comments said "0.1 RPKM" / "at least 5").
# NOTE(review): the row counts recorded in the original comments (22778
# expressed, 112201 non-expressed) came from the mis-normalised table and will
# change.
n_samples_expressed <- rowSums(subset(TEcounts_RPKM, select = c(a:t)) >= 1)
TEcounts_RPKM_filt <- TEcounts_RPKM[n_samples_expressed > 5, ]

expressedTE <- TEcounts_RPKM_filt
# Join on the ID only; the implicit join on every shared column compared
# floating-point RPKM values as keys.
nonexpressedTE <- anti_join(TEcounts_RPKM, TEcounts_RPKM_filt, by = "Geneid")

# Bring coordinates and repeat class back in through the shared Geneid column
expressedTEannot <- merge(expressedTE, TEannotation, by="Geneid")
nonexpressedTEannot <- merge(nonexpressedTE, TEannotation, by="Geneid")

# Discard loci whose class (V11) mentions Unknown, Simple_repeat or
# Low_complexity
unwanted_class <- "Unknown|Simple_repeat|Low_complexity"
expressedTEannotknown <- dplyr::filter(expressedTEannot, !grepl(unwanted_class, V11))
nonexpressedTEannotknown <- dplyr::filter(nonexpressedTEannot, !grepl(unwanted_class, V11))

# Make bed files
# BED = chr   coord1    coord2     dots      strand      number       name/Geneid
# Geneid is selected three times on purpose: the copies come out as Geneid,
# Geneid.1 and Geneid.2; the first two are overwritten with the "." and "+"
# placeholder columns and the third keeps the real ID.
build_te_bed <- function(annotated) {
  bed <- subset(annotated, select=c(V5:V7, Geneid, Geneid, Geneid, V11))
  bed$Geneid <- "."
  bed$Geneid.1 <- "+"
  bed
}
expressedTEannotbed <- build_te_bed(expressedTEannotknown)
nonexpressedTEannotbed <- build_te_bed(nonexpressedTEannotknown)

# Headerless, tab-separated output for bedtools
write.table(
  expressedTEannotbed,
  paste(out_dir,"expressedTE.bed",sep="/"),
  col.names=FALSE, quote=FALSE, sep="\t", row.names=FALSE
)
write.table(
  nonexpressedTEannotbed,
  paste(out_dir,"nonexpressedTE.bed",sep="/"),
  col.names=FALSE, quote=FALSE, sep="\t", row.names=FALSE
)

# sort -k1,1 -k2,2n expressedTE.bed > expressedTE_sorted.bed
# sort -k1,1 -k2,2n nonexpressedTE.bed > nonexpressedTE_sorted.bed
# These bed files will be "bedtooled" to sRNA loci

# Do the same for Gene counts
# Normalise GENE counts by RPKM
# Rename the sample columns (7..28) of the gene count table to a..v.
# Conditions as annotated in the original script:
#   a-d = 0h, e-h = 24 e, i-l = 24 h, m-p = 48 h, q-t = 48 e, u-v = mays
names(genecounts)[7:28] <- letters[1:22]

# Per-sample library size = column sums over samples a..t.
# (This overwrites the `lib_size` computed for the TE table.)
lib_size <- colSums(subset(genecounts, select = c(a:t)))
head(lib_size, n = 20)

# Transform counts to RPKM.
# FIX: `data.frame / vector` recycles the vector down the rows, so dividing the
# count table directly by `lib_size` did not divide each sample column by its
# own library size. sweep(..., 2, ...) applies one value per column; the
# per-gene length vector (one value per row) is then applied as before.
gene_counts <- subset(genecounts, select = c(a:t))
gene_per_million <- sweep(gene_counts, 2, lib_size / 1000000, "/")
genecounts_RPKM <- gene_per_million / (genecounts$Length / 1000)

# Gene IDs are built as "g" + row name so they can be matched against the
# ClementID values of the annotation.
# FIX: the prefix used to be added to the expressed genes only, so the
# non-expressed table kept bare row names and its merge with the annotation
# could not match any prefixed ID. The ID is now built once, before the split.
# NOTE(review): this assumes row i of the count table is gene "g<i>" -- confirm
# against the Geneid column of the count file.
string1 <- "g"
genecounts_RPKM$Geneid <- paste0(string1, rownames(genecounts_RPKM))

# Expressed = RPKM >= 1 in more than 5 samples
n_samples_expressed <- rowSums(subset(genecounts_RPKM, select = c(a:t)) >= 1)
genecounts_RPKM_filt <- genecounts_RPKM[n_samples_expressed > 5, ]

expressedGene <- genecounts_RPKM_filt
# Join on the ID only rather than on every shared (floating-point) column
nonexpressedGene <- anti_join(genecounts_RPKM, genecounts_RPKM_filt, by = "Geneid")

# Pull the gene ID out of the annotation attribute column (V7): the text
# between "ClementID=" and the next ";" (or the end of the string). Rows
# without a ClementID get NA and therefore never match a gene.
Geneannotation2 <- Geneannotation
id_match <- regexpr("(?<=ClementID=)[^;]*", Geneannotation2$V7, perl = TRUE)
clement_id <- rep(NA_character_, nrow(Geneannotation2))
clement_id[id_match != -1] <- regmatches(Geneannotation2$V7, id_match)
Geneannotation2$V6 <- clement_id
names(Geneannotation2)[6] <- "Geneid"

# Use the Geneid column to attach coordinates and description to each gene
expressedGeneannot <- merge(expressedGene, Geneannotation2, by = "Geneid")
nonexpressedGeneannot <- merge(nonexpressedGene, Geneannotation2, by = "Geneid")

# Make bed files
# BED = chr   coord1    coord2     dots      strand      number       name/Geneid
expressedGeneannotbed <- subset(expressedGeneannot, select = c(V1:V5, Geneid, V7))
nonexpressedGeneannotbed <- subset(nonexpressedGeneannot, select = c(V1:V5, Geneid, V7))

# NOTE(review): with the ID fix above nonexpressedGene.bed is no longer empty,
# so downstream statements about sRNA loci at non-expressed genes need to be
# re-checked.
write.table(expressedGeneannotbed, paste(out_dir,"expressedGene.bed",sep="/"), col.names=FALSE, quote=FALSE, sep="\t", row.names=FALSE)
write.table(nonexpressedGeneannotbed, paste(out_dir,"nonexpressedGene.bed",sep="/"), col.names=FALSE, quote=FALSE, sep="\t", row.names=FALSE)

# sort -k1,1 -k2,2n expressedGene.bed > expressedGene_sorted.bed
# sort -k1,1 -k2,2n nonexpressedGene.bed > nonexpressedGene_sorted.bed
# These bed files are "bedtooled" to sRNA loci

# Shell step (run in a terminal, not in R). Kept as comments, like the `sort`
# commands elsewhere in this script, so that the file parses as R:
# bedtools closest -k 1 -d -a sRNAannotation_VF.bed -b expressedGene_sorted.bed > sRNA_closest_expressedGene.bed
# bedtools closest -k 1 -d -a sRNAannotation_VF.bed -b nonexpressedGene_sorted.bed > sRNA_closest_nonexpressedGene.bed
# bedtools closest -k 1 -d -a sRNAannotation_VF.bed -b expressedTE_sorted.bed > sRNA_closest_expressedTE.bed
# bedtools closest -k 1 -d -a sRNAannotation_VF.bed -b nonexpressedTE_sorted.bed > sRNA_closest_nonexpressedTE.bed

# All four result files are headerless, tab-separated and read without quote
# handling.
read_closest <- function(path) {
  read.table(path, header = FALSE, sep = "\t", stringsAsFactors = FALSE, quote = "")
}
sRNA_expressedGene <- read_closest("sRNA_closest_expressedGene.bed")
sRNA_nonexpressedGene <- read_closest("sRNA_closest_nonexpressedGene.bed")
sRNA_expressedTE <- read_closest("sRNA_closest_expressedTE.bed")
sRNA_nonexpressedTE <- read_closest("sRNA_closest_nonexpressedTE.bed")

# Remove rows that have no overlap or closest feature (a dot in V7).
# Row counts in the comments are from the recorded run.
sRNA_expressedGene <- sRNA_expressedGene[sRNA_expressedGene$V7 != ".", ]  #3104
sRNA_nonexpressedGene <- sRNA_nonexpressedGene[sRNA_nonexpressedGene$V7 != ".", ] # recorded run: no sRNA produced from non-expressed genes
sRNA_expressedTE <- sRNA_expressedTE[sRNA_expressedTE$V7 != ".", ] #3072
sRNA_nonexpressedTE <- sRNA_nonexpressedTE[sRNA_nonexpressedTE$V7 != ".", ] #3845

#Check the distances for each overlap list. We're not going to look at anything that does not overlap. Genes are sometimes really close to transposons (that's how the genome is organised)
# so it would be skewing all the results. Huge sRNA clusters mapping to a TE are sometimes right next to genes like kinases...

# Keep only true overlaps: V14 is the distance column and 0 means overlapping.
# The percentages are now computed from the tables instead of being typed in
# by hand; the figures from the recorded run are kept in the comments.
sRNA_expressedGene_overlap <- sRNA_expressedGene[sRNA_expressedGene$V14 == 0, ]
length(unique(sRNA_expressedGene_overlap$V4))
# recorded run: 434/3104*100 # 29 kinases, 1 btbpoz, 2 kelch, 3 sel1
nrow(sRNA_expressedGene_overlap) / nrow(sRNA_expressedGene) * 100

sRNA_expressedTE_overlap <- sRNA_expressedTE[sRNA_expressedTE$V14 == 0, ]
# recorded run: 283/3072*100 = 9.21%
nrow(sRNA_expressedTE_overlap) / nrow(sRNA_expressedTE) * 100

sRNA_nonexpressedTE_overlap <- sRNA_nonexpressedTE[sRNA_nonexpressedTE$V14 == 0, ]
# recorded run: 2068/3845*100 = 53.78%
nrow(sRNA_nonexpressedTE_overlap) / nrow(sRNA_nonexpressedTE) * 100


# Check how many unique sRNA loci are represented in each list
length(unique(sRNA_expressedGene_overlap$V4))
# 375
length(unique(sRNA_expressedTE_overlap$V4))
# 229
length(unique(sRNA_nonexpressedTE_overlap$V4))
# 1281


# Hierarchical analysis : Look at loci that overlap with TEs first, then from what's left, look at what overlaps with genes.
toremove <- rbind(sRNA_expressedTE_overlap, sRNA_nonexpressedTE_overlap)
sRNA_expressedGene_overlap2 <- anti_join(sRNA_expressedGene_overlap, toremove, by = "V4")
length(unique(sRNA_expressedGene_overlap2$V4))
#293

# The genes that produce sRNA appear to be next to CpG islands and methylated TEs.
# Looks like this:  ____ kinase___kinase___kinase/smallRNA___methylated TEs___smallRNA
# Looks like this:  ____ kinase___kinase___kinase/smallRNA___methylated region___smallRNA___gene

# Check out the distance between non-TE-derived sRNA loci VS CpG islands, and the distance of TE-derived sRNA loci vs CpG islands

# ---- Split gene-overlapping sRNA loci into classes by gene description (V13) ----

# Description keywords shared by several filters below. They used to be pasted
# in three times, and one copy had lost the final "transposable" alternative.
te_gene_pattern <- "reverse_transcriptase|CENP-b|crinkler_family_protein|zinc_finger_mym|DNA_polymerase_delta_subunit_1|DNA_polymerase_epsilon|ATPase_aaa|retrotransposon|integrase|pif1|zinc_finger_bed_domain|PIF1|ricesleeper|helicase-primase|helicase/primase|gag-pol|far1-related|ribonuclease_hi|ribonuclease_h|ATP_dependent_DNA_helicase|jockey|rve_super_family_integrase|transposase|transposable"
class_c_pattern <- "serine/threonine|serine-threonine|serine_threonine|btb/poz|btb-poz|calmodulin|sel1|kelch"

# Class B: description mentions hypothetical/provisional/tis13, or contains a
# ";", and does not look TE-related.
sRNA_expressedGene_overlap.class2 <- dplyr::filter(sRNA_expressedGene_overlap2, grepl("hypothetical|provisional|tis13", V13, ignore.case = TRUE))
sRNA_expressedGene_overlap.class22 <- dplyr::filter(sRNA_expressedGene_overlap2, grepl(";", V13, ignore.case = TRUE))
# FIX: stacking the two subsets with rbind() wrote every row that matches both
# criteria twice; one combined filter keeps each row once.
sRNA_expressedGene_overlap.classB <- dplyr::filter(
  sRNA_expressedGene_overlap2,
  grepl("hypothetical|provisional|tis13", V13, ignore.case = TRUE) |
    grepl(";", V13, ignore.case = TRUE)
)
sRNA_expressedGene_overlap.classB <- dplyr::filter(sRNA_expressedGene_overlap.classB, !grepl(te_gene_pattern, V13, ignore.case = TRUE))
length(unique(sRNA_expressedGene_overlap.classB$V4))
# 226 class B
write.table(sRNA_expressedGene_overlap.classB, paste(out_dir,"sRNAloci_ClassBgenes.bed",sep="/"), col.names=FALSE, quote=FALSE, sep="\t", row.names=FALSE)

# Class C: kinase / btb-poz / calmodulin / sel1 / kelch descriptions
sRNA_expressedGene_overlap.class3 <- dplyr::filter(sRNA_expressedGene_overlap2, grepl(class_c_pattern, V13, ignore.case = TRUE))
length(unique(sRNA_expressedGene_overlap.class3$V4))
# 31 class C
write.table(sRNA_expressedGene_overlap.class3, paste(out_dir,"sRNAloci_ClassCgenes.bed",sep="/"), col.names=FALSE, quote=FALSE, sep="\t", row.names=FALSE)

# TE-related gene descriptions
sRNA_expressedGene_overlap.te <- dplyr::filter(sRNA_expressedGene_overlap2, grepl(te_gene_pattern, V13, ignore.case = TRUE))
length(unique(sRNA_expressedGene_overlap.te$V4))
# 1 TE-like sRNA loci
write.table(sRNA_expressedGene_overlap.te, paste(out_dir,"sRNAloci_TE-relatedgenes.bed",sep="/"), col.names=FALSE, quote=FALSE, sep="\t", row.names=FALSE)

# Class A ("class 1"): whatever is left once class B, class C and TE-related
# loci are removed by locus ID (V4). The final keyword filter is a safety net;
# the anti-joins already drop every locus that has a matching row.
class1sRNAloci <- anti_join(sRNA_expressedGene_overlap2, sRNA_expressedGene_overlap.classB, by = "V4")
class1sRNAloci2 <- anti_join(class1sRNAloci, sRNA_expressedGene_overlap.class3, by = "V4")
class1sRNAloci3 <- anti_join(class1sRNAloci2, sRNA_expressedGene_overlap.te, by = "V4")
class1sRNAloci3 <- dplyr::filter(class1sRNAloci3, !grepl(paste(class_c_pattern, te_gene_pattern, sep = "|"), V13, ignore.case = TRUE))
length(unique(class1sRNAloci3$V4))
# 66 class1 loci
# FIX: the class A file used to be written from `class1sRNAloci`, which has
# only class B removed and therefore still contained class C and TE-related
# loci; write the fully filtered table instead.
write.table(class1sRNAloci3, paste(out_dir,"sRNAloci_classAgenes.bed",sep="/"), col.names=FALSE, quote=FALSE, sep="\t", row.names=FALSE)

# Class sizes for the bar chart. The values are typed in by hand.
# NOTE(review): 293-226-31-1 gives 35, but the unique class-1 count recorded
# above is 66 -- confirm which number belongs in the figure.
293-226-31-1
pie <- data.frame(
  group = c("Class A", "Class B", "Class C", "TE-related"),
  value = c(35, 226, 31, 1)
)
bp <- ggplot(pie, aes(x="", y=value, fill=group))+
  geom_bar(width = 1, stat = "identity")
bp


# Make pie chart for Figure 6a.
# NOTE: Total of 3495 sRNA loci in Figure 4. I removed from the annotation 428 loci that were depleted by either method. 3067 loci in final annotation

# Category sizes for the Figure 6a pie chart (values typed in by hand).
# NOTE(review): the values sum to 2970, not the 3067 loci mentioned in the
# note above, and "Non-expressed genes" is 0 -- confirm against the tables.
pie <- data.frame(
  group = c("Non-expressed TEs", "Expressed TEs", "Non-expressed genes", "Expressed genes", "Unannotated region"),
  value = c(1281, 229, 0, 196, 1264)
)
# FIX: this was head(df); no `df` object is created in this script, so it
# printed the stats::df density function instead of the data being plotted.
head(pie)

# Single stacked bar ...
bp <- ggplot(pie, aes(x="", y=value, fill=group))+
  geom_bar(width = 1, stat = "identity")
bp

# ... wrapped into polar coordinates to make the pie
pieplot <- bp + coord_polar("y", start=0) + theme_classic()
pieplot



#Redo the sRNA - TE overlaps, this time including a column with sRNAmethylationscore

sRNAannotation <- read.table("sRNAannotation_VF.bed",header = FALSE, sep="\t", stringsAsFactors=FALSE, quote="")
# Shell step (run in a terminal, not in R); kept as a comment so the script
# parses:
# bedtools map -c 4 -o median -a sRNAannotation_VF.bed -b methIGV.bed > sRNA_meth_overlap.bed
# Load sRNA-meth overlap.
sRNAmeth <- read.table("sRNA_meth_overlap.bed", header = FALSE, sep="\t", stringsAsFactors=FALSE, quote="")

# Merge with the TE data frames by small RNA id (V4). Columns present in both
# tables get .x (overlap table) / .y (methylation table) suffixes.
sRNAMETHexpressedTE <- merge(sRNA_expressedTE_overlap, sRNAmeth, by="V4")
sRNAMETHnonexpressedTE <- merge(sRNA_nonexpressedTE_overlap, sRNAmeth, by="V4")
# A locus that overlaps an expressed TE is not also counted as non-expressed
sRNAMETHnonexpressedTE <- anti_join(sRNAMETHnonexpressedTE, sRNAMETHexpressedTE, by="V4")
# NOTE(review): `sRNA_expressedGene_overlap3` is not created anywhere in this
# script (only `..._overlap` and `..._overlap2` are), so this line fails in a
# fresh session. Confirm which table was meant before relying on the result.
sRNAMETHexpressedGENE <- merge(sRNA_expressedGene_overlap3, sRNAmeth, by="V4")

# Collect the IDs of every locus already assigned to a TE or gene category
groupmeth1 <- as.data.frame(sRNAMETHexpressedTE$V4)
groupmeth2 <- as.data.frame(sRNAMETHnonexpressedTE$V4)
groupmeth3 <- as.data.frame(sRNAMETHexpressedGENE$V4)
names(groupmeth1)[1] <- "V4"
names(groupmeth2)[1] <- "V4"
names(groupmeth3)[1] <- "V4"

groupmeth <- unique(rbind(groupmeth1, groupmeth2, groupmeth3))

# Everything else is "unannotated". The join key is stated explicitly (V4 is
# the only column the two tables share).
sRNAMETHunannotated <- anti_join(sRNAmeth, groupmeth, by = "V4")
length(unique(sRNAMETHunannotated$V4))

# Remove lines that don't have a methylation value ("." in the mapped column)
sRNAMETHunannotated <- sRNAMETHunannotated[sRNAMETHunannotated$V7 != ".", ]
sRNAMETHexpressedTE <- sRNAMETHexpressedTE[sRNAMETHexpressedTE$V7.y != ".", ]
sRNAMETHnonexpressedTE <- sRNAMETHnonexpressedTE[sRNAMETHnonexpressedTE$V7.y != ".", ]
sRNAMETHexpressedGENE <- sRNAMETHexpressedGENE[sRNAMETHexpressedGENE$V7.y != ".", ]



################################################################################################
# plot the distribution of methylation scores for sRNA loci of each category (FIGURE 6B)
################################################################################################
library(ggbeeswarm)
sapply(sRNAMETHunannotated, class)

# Every table gets a constant `cat` column (the single x position of its
# violin) and its methylation column converted from text to numeric.
sRNAMETHunannotated$cat <- 1
sRNAMETHexpressedTE$cat <- 1
sRNAMETHnonexpressedTE$cat <- 1
sRNAMETHexpressedGENE$cat <- 1
sRNAMETHunannotated$V7 <- as.numeric(as.character(sRNAMETHunannotated$V7))
sRNAMETHexpressedTE$V7.y <- as.numeric(as.character(sRNAMETHexpressedTE$V7.y))
sRNAMETHnonexpressedTE$V7.y <- as.numeric(as.character(sRNAMETHnonexpressedTE$V7.y))
sRNAMETHexpressedGENE$V7.y <- as.numeric(as.character(sRNAMETHexpressedGENE$V7.y))

# One violin + quasirandom point cloud + red median dot; the four panels below
# differ only in the table and the y column.
meth_violin <- function(loci, mapping) {
  ggplot(data=loci, mapping)+
    geom_violin(fill="white", colour="black", alpha=0.05, width=0.8)+
    geom_quasirandom(method='tukeyDense', size=0.5, alpha=0.2) +
    stat_summary(fun=median, geom="point", size=3, color="red")+
    theme_classic() +
    theme(axis.text.x = element_text(size = 10, angle = 90)) +
    theme(axis.text.y = element_text(size = 4))+
    xlim(0.5,1.5)+
    ylim(0,1)
}

# Unannotated
nrow(sRNAMETHunannotated)
meth_violin(sRNAMETHunannotated, aes(x=cat, y=V7))

# Expressed TE
meth_violin(sRNAMETHexpressedTE, aes(x=cat, y=V7.y))

# Non-expressed TE
meth_violin(sRNAMETHnonexpressedTE, aes(x=cat, y=V7.y))

# Expressed genes
meth_violin(sRNAMETHexpressedGENE, aes(x=cat, y=V7.y))


#List the most sRNA-targeted TE classes (FIGURE 6C)
# Horizontal bar chart of the number of sRNA loci per TE/repeat class. The
# same plot was written out four times; only the input table and the axis
# limit differ. The layer order is kept exactly as it was.
# NOTE(review): theme_classic() is added last and is a complete theme, so it
# replaces the earlier theme_minimal() and theme(axis.text.x = ...) settings.
plot_te_class_counts <- function(class_counts, y_max) {
  ggplot(data=class_counts, aes(x= reorder(V13, -number_of_distinct_orders), y = number_of_distinct_orders)) +
    geom_bar(stat="identity", width=0.8) +
    theme_minimal() +
    scale_x_discrete(limits = rev) +
    theme(axis.text.x = element_text(size = 10, angle = 90)) +
    coord_flip() +
    xlab("") + ylab(" Top expressed TE/repeat classes                  ") +
    ylim(0, y_max) +
    theme_classic()
}

# --- sRNA loci overlapping NON-expressed TEs, counted per class (V13) ---
# `newnumber` is a unique running number per row, so the per-class count of
# distinct values is the number of rows in that class.
sRNAMETHnonexpressedTE$newnumber <- seq.int(nrow(sRNAMETHnonexpressedTE))
ZDT <- data.table(sRNAMETHnonexpressedTE)
Zcat <- ZDT[, .(number_of_distinct_orders = length(unique(newnumber))), by = V13]
Zcatord <- Zcat %>% arrange(desc(number_of_distinct_orders))
write.table(Zcatord, paste(out_dir,"sRNA-targeted_nonexpressedTEs.txt",sep="/"), col.names=TRUE, quote=FALSE, sep="\t", row.names=FALSE)

# The ".mod1" file is read back from the working directory; it is not
# produced by this script.
TEcat <- read.table("sRNA-targeted_nonexpressedTEs.mod1.txt", header = TRUE, sep="\t", stringsAsFactors=FALSE, quote="")

plot_te_class_counts(TEcat, 600)

# Recorded result (pasted notes, kept as comments so they are not executed):
# LTR/Gypsy       509
# LINE            285
# LINE/I          187
# DNA/hAT-Tag1    155
# DNA/Maverick    137
# DNA/CMC-EnSpm   105
# RC/Helitron      94
# DNA/MULE-MuDR    92
# DNA/hAT-Ac       76
# DNA/TcMar-Tc1    59

# --- sRNA loci overlapping EXPRESSED TEs, counted per class ---
sRNAMETHexpressedTE$newnumber <- seq.int(nrow(sRNAMETHexpressedTE))
ZDT <- data.table(sRNAMETHexpressedTE)
Zcat <- ZDT[, .(number_of_distinct_orders = length(unique(newnumber))), by = V13]
Zcatord <- Zcat %>% arrange(desc(number_of_distinct_orders))
# NOTE(review): unlike the non-expressed table, this `Zcatord` is never
# written out here; the ".mod1" file read below must come from elsewhere.

TEcat <- read.table("sRNA-targeted_expressedTEs.mod1.txt", header = TRUE, sep="\t", stringsAsFactors=FALSE, quote="")

plot_te_class_counts(TEcat, 40)

# Recorded result (pasted notes, kept as comments so they are not executed):
# DNA/TcMar-Tc1    36
# DNA/hAT-Ac       35
# DNA/CMC-EnSpm    35
# DNA/hAT-Tag1     27
# RC/Helitron      25
# LTR/Gypsy        24
# DNA/MULE-MuDR    15
# DNA/Maverick     14
# rRNA             10
# LINE/I            9
# LINE/L1-Tx1       8

# --- All targeted TEs regardless of expression ---
TEcat <- read.table("toptargetedTEs_all_regardlessofexpression.txt", header = TRUE, sep="\t", stringsAsFactors=FALSE, quote="")

plot_te_class_counts(TEcat, 600)


# --- Table used for Figure 6C ---
TEcat <- read.table("sRNA-targeted_TEs_all_modFig6C.txt", header = TRUE, sep="\t", stringsAsFactors=FALSE, quote="")
colSums(subset(TEcat, select=c(number_of_distinct_orders))) #2320

plot_te_class_counts(TEcat, 700)


#Check if TEs producing sRNAs are young or old
annotTE <- read.table("Rir_HGAP_ii_V2.fa.out.copy.txt", header = FALSE, fill=TRUE)

# Coordinates and class of the TE overlapped by each sRNA locus, for the
# non-expressed and the expressed TE tables
TEsRNAlociloc <- subset(sRNAMETHnonexpressedTE, select=c(V4, V7.x, V8, V9, V13))
TEsRNAlociloc2 <- subset(sRNAMETHexpressedTE, select=c(V4, V7.x, V8, V9, V13))

TEsRNAlociloc <- rbind(TEsRNAlociloc, TEsRNAlociloc2)
# Rename the three coordinate columns to the names used in the annotation
# table so they can serve as the merge key
names(TEsRNAlociloc)[2] <- "V5"
names(TEsRNAlociloc)[3] <- "V6"
names(TEsRNAlociloc)[4] <- "V7"

TEsRNAlociage <- merge(TEsRNAlociloc, annotTE, by=c("V5", "V6", "V7"))

#Make Kimura divergence bins
# Calculate the % divergence for all expressed TEs in bins of 10% divergence (0-10, 11-20, 21-30, 31-40, 41-50) using erangedf_filt_merg2 df (made in one of the next sections)
# Extract percdiv for each group
erangedf_filt_merg3 <- subset(TEsRNAlociage, select=c(V2, V11)) #V2-percdiv, V11=group
#Make bins
# NOTE(review): cut() intervals are left-open, so a divergence of exactly 0
# falls outside (0,10] and becomes NA; add include.lowest = TRUE if such
# copies should be counted in the first bin.
erangedf_filt_merg3$bins <- cut(erangedf_filt_merg3$V2, breaks=c(0,10,20,30,40,50))

# Count the bins for each TE category. The tables that followed each call in
# the original file were pasted console output; they are kept as comments
# because they are not valid R.
divgypsy <- dplyr::filter(erangedf_filt_merg3, grepl("LTR/Gypsy", V11))
unique(divgypsy[c("bins")])
table(divgypsy$bins)
# (0,10] (10,20] (20,30] (30,40] (40,50]
# 244     153     114      21       0

divline <- dplyr::filter(erangedf_filt_merg3, grepl("LINE", V11))
unique(divline[c("bins")])
table(divline$bins)
# (0,10] (10,20] (20,30] (30,40] (40,50]
# 326     196     157      24       0

divhattag <- dplyr::filter(erangedf_filt_merg3, grepl("DNA/hAT-Tag1", V11))
unique(divhattag[c("bins")])
table(divhattag$bins)
# (0,10] (10,20] (20,30] (30,40] (40,50]
# 66      17      81      17       0

divtcmartc1 <- dplyr::filter(erangedf_filt_merg3, grepl("DNA/TcMar", V11))
unique(divtcmartc1[c("bins")])
table(divtcmartc1$bins)
# (0,10] (10,20] (20,30] (30,40] (40,50]
# 41      54      51      17       0

divmaverick <- dplyr::filter(erangedf_filt_merg3, grepl("DNA/Maverick", V11))
unique(divmaverick[c("bins")])
table(divmaverick$bins)
# (0,10] (10,20] (20,30] (30,40] (40,50]
# 102      26      15       2       0

divcmc <- dplyr::filter(erangedf_filt_merg3, grepl("DNA/CMC-EnSpm", V11))
unique(divcmc[c("bins")])
table(divcmc$bins)
# (0,10] (10,20] (20,30] (30,40] (40,50]
# 41      24      68       4       0

divhatac <- dplyr::filter(erangedf_filt_merg3, grepl("DNA/hAT-Ac", V11))
unique(divhatac[c("bins")])
table(divhatac$bins)
# (0,10] (10,20] (20,30] (30,40] (40,50]
# 23      24      54       9       0

divhelitron <- dplyr::filter(erangedf_filt_merg3, grepl("Helitron", V11))
unique(divhelitron[c("bins")])
table(divhelitron$bins)
# (0,10] (10,20] (20,30] (30,40] (40,50]
# 20      39      44      16       0

divmule <- dplyr::filter(erangedf_filt_merg3, grepl("DNA/MULE", V11))
unique(divmule[c("bins")])
table(divmule$bins)
# (0,10] (10,20] (20,30] (30,40] (40,50]
# 39      26      34      11       0

# Remaining DNA elements: every "DNA" class not already counted above
divdnaother <- dplyr::filter(erangedf_filt_merg3, grepl("DNA", V11))
divdnaother2 <- dplyr::filter(divdnaother, !grepl("DNA/MULE|DNA/hAT-Ac|DNA/CMC-EnSpm|DNA/Maverick|DNA/TcMar|DNA/hAT-Tag1", V11))
unique(divdnaother2[c("bins")])
table(divdnaother2$bins)
# (0,10] (10,20] (20,30] (30,40] (40,50]
# 11      34      66       6       0


# Remaining LTR elements (everything except LTR/Gypsy). The `divdnaother`
# names are reused here, overwriting the DNA tables from the previous step.
divdnaother <- dplyr::filter(erangedf_filt_merg3, grepl("LTR", V11))
divdnaother2 <- dplyr::filter(divdnaother, !grepl("LTR/Gypsy", V11))
unique(divdnaother2[c("bins")])
table(divdnaother2$bins)
# (0,10] (10,20] (20,30] (30,40] (40,50]
# 1       4       9       0       0

# STACKED PLOT FIGURE 6C
# NOTE(review): machine-specific absolute path; every relative file path after
# this point resolves against it.
setwd("/Users/alexandradallaire/TEanalysis/rework")
# The table is stored under the name `summary`; calls to summary() elsewhere
# still reach the function.
summary <- read.table("expr_TE_categories_RPKM1_mod4.txt", header = TRUE, sep="\t", stringsAsFactors=FALSE, quote="")
# Just plot the top TE/repeat categories expressed

# Both charts share data and aesthetics: categories ordered by decreasing
# `number`, bars filled by `bin`.
category_bars <- ggplot(summary, aes(fill=bin, y=number, x=reorder(V11, -number)))

# Side-by-side bars
category_bars +
  geom_bar(position="dodge", stat="identity")

# Stacked, flipped to horizontal with the category order reversed
category_bars +
  geom_bar(position="stack", stat="identity")+
  theme_classic() +
  coord_flip()+
  scale_x_discrete(limits = rev)


# Print and export the unannotated loci
sRNAMETHunannotated
write.table(sRNAMETHunannotated, paste(out_dir,"sRNAMETHunannotated.bed",sep="/"), col.names=FALSE, quote=FALSE, sep="\t", row.names=FALSE)
# Shell step (run in a terminal, not in R):
# sort -k1,1 -k2,2n sRNAMETHunannotated.bed > sRNAMETHunannotated_sorted.bed

#CHECK THE LOCATION OF SHUFFLED SRNA LOCI
#Use bedtools to shuffle the location of our 1264 random small RNA loci. This way, we keep the same length distribution
#Group together unannotated and genic sRNA loci
# i.e. sRNA_expressedGene_overlap2 and anything that is not in
# sRNA_expressedTE_overlap or sRNA_nonexpressedTE_overlap
# (this sentence was a bare, unparseable line in the original file)
fd1 <- subset(sRNA_expressedGene_overlap2, select=c(V1:V6)) #GENE
fd2 <- subset(sRNA_expressedTE_overlap, select=c(V1:V6)) #TE
fd3 <- subset(sRNA_nonexpressedTE_overlap, select=c(V1:V6)) #TE
sRNAannotation$V7 <- seq.int(nrow(sRNAannotation))

# Loci already assigned to a gene or TE, one row per locus ID (V4)
questionloci1 <- rbind(fd1, fd2, fd3)
length(unique(questionloci1$V4))
questionloci1 <- questionloci1[!duplicated(questionloci1[ , c("V4")]), ]
# Loci not assigned to anything, then a random draw of 1264 of them.
# NOTE(review): sample() is not seeded, so the draw differs between runs
# (the bedtools shuffle below does use a fixed seed).
questionloci2 <- anti_join(sRNAannotation, questionloci1, by="V4")
questionloci4 <- questionloci2[sample(nrow(questionloci2), 1264), ]
write.table(questionloci4, paste(out_dir,"questionloci.bed",sep="/"), col.names=FALSE, quote=FALSE, sep="\t", row.names=FALSE)

# TE-overlapping loci, one row per locus ID
TEloci <- rbind(fd2, fd3)
TEloci <- TEloci[!duplicated(TEloci[ , c("V4")]), ]
write.table(TEloci, paste(out_dir,"TEloci.bed",sep="/"), col.names=FALSE, quote=FALSE, sep="\t", row.names=FALSE)

# Shuffle the location of the 1264 questionloci (an earlier comment said 1237,
# but the code above samples 1264).
# Shell step (run in a terminal, not in R).
# NOTE(review): -excl uses TEannotation_filt_knowns_sorted.bed, which is only
# written and sorted further down in this script -- it must exist first.
# bedtools shuffle -i questionloci.bed -g contiglengths.bed -excl TEannotation_filt_knowns_sorted.bed -seed 927442958 -noOverlapping > shuffledsRNAloci.bed

randomsRNAloci <- read.table("shuffledsRNAloci.bed", header = FALSE, sep="\t", stringsAsFactors=FALSE, quote="")
#Give them unique IDs in V7
randomsRNAloci$V7 <- seq.int(nrow(randomsRNAloci))
write.table(randomsRNAloci, paste(out_dir,"randomsRNAloci_id.bed",sep="/"), col.names=FALSE, quote=FALSE, sep="\t", row.names=FALSE)

#bedtools closest TE
# Known TEs only: drop Unknown / Low_complexity / rRNA / Simple_repeat /
# Satellite classes, but keep LTR/Unknown.
TEannotation_filt_ltrunknown <- dplyr::filter(TEannotation_filt, grepl("LTR/Unknown", V11))
TEannotation_filt2 <- dplyr::filter(TEannotation_filt, !grepl("Unknown|Low_complexity|rRNA|Simple_repeat|Satellite", V11))
TEannotation_filt3 <- rbind(TEannotation_filt2, TEannotation_filt_ltrunknown) #Add LTR/Unknown back
write.table(TEannotation_filt3, paste(out_dir,"TEannotation_filt_knowns.bed",sep="/"), col.names=FALSE, quote=FALSE, sep="\t", row.names=FALSE)
# Shell steps (run in a terminal, not in R):
# sort -k1,1 -k2,2n TEannotation_filt_knowns.bed > TEannotation_filt_knowns_sorted.bed
# sort -k1,1 -k2,2n randomsRNAloci_id.bed > randomsRNAloci_id_sorted.bed
#
# bedtools closest -k 1 -d -a randomsRNAloci_id_sorted.bed -b TEannotation_filt_knowns_sorted.bed > randomsRNAloci_closestsknownTE.bed
randomsRNAloci_closestTE <- read.table("randomsRNAloci_closestsknownTE.bed", header = FALSE, sep="\t", stringsAsFactors=FALSE, quote="")
length(unique(randomsRNAloci_closestTE$V4))
# One row per locus (the last occurrence is kept), then only loci that do not
# overlap a TE (distance in V16 greater than 0)
randomsRNAloci_closestTE <- randomsRNAloci_closestTE[!duplicated(randomsRNAloci_closestTE$V4, fromLast = TRUE), ]
randomsRNAloci_closestTE <- randomsRNAloci_closestTE[randomsRNAloci_closestTE$V16 > 0, ]

## Do the same for genes
# Shell steps (run in a terminal, not in R); kept as comments so the script
# parses.
# NOTE(review): sRNAMETHexpressedGENE.bed is not written anywhere in this
# script -- it has to be exported before these commands can run.
# sort -k1,1 -k2,2n sRNAMETHexpressedGENE.bed > sRNAMETHexpressedGENE_sorted.bed
# bedtools closest -k 1 -d -a sRNAMETHexpressedGENE_sorted.bed -b TEannotation_filt_knowns_sorted.bed > sRNAMETHexpressedGENE_closestTE.bed
sRNAMETHexpressedGENE_closestTE <- read.table("sRNAMETHexpressedGENE_closestTE.bed", header = FALSE, sep="\t", stringsAsFactors=FALSE, quote="")
# One row per V7 value, then only rows with a distance (V16) greater than 0
sRNAMETHexpressedGENE_closestTE <- sRNAMETHexpressedGENE_closestTE[!duplicated(sRNAMETHexpressedGENE_closestTE$V7), ]
sRNAMETHexpressedGENE_closestTE <- sRNAMETHexpressedGENE_closestTE[sRNAMETHexpressedGENE_closestTE$V16 > 0, ]

# Violin + quasirandom points + red median dot of the distance to the closest
# TE. The same layers were repeated for all three panels; the layer order is
# kept exactly as it was.
# NOTE(review): theme_classic() is added after the theme(axis.text.*) calls and
# is a complete theme, so those text-size settings are replaced by it.
distance_violin <- function(loci, mapping) {
  ggplot(data=loci, mapping=mapping)+
    geom_violin(fill="gray80", colour="black", alpha=1, width=1.25) +
    geom_quasirandom(method='tukeyDense', size=0.35, alpha=0.2, width = 0.6) +
    stat_summary(fun=median, geom="point", size=3, color="red")+
    theme(axis.text.x = element_text(size = 10, angle = 90)) +
    theme(axis.text.y = element_text(size = 4)) +
    theme_classic()+
    ylim(0,85000)
}

# Unannotated sRNA loci.
# NOTE(review): `sRNAMETHunannotated_closestTE` is never created in this script
# (no read.table() call loads it); it must already exist in the session.
distance_violin(sRNAMETHunannotated_closestTE, aes(x=V5, y=V17))

# Shuffled loci
distance_violin(randomsRNAloci_closestTE, aes(x=V5, y=V16))

# Gene-overlapping sRNA loci
distance_violin(sRNAMETHexpressedGENE_closestTE, aes(x=V4, y=V16))



# ---- Statistics on the distance to the closest TE ----
# The original file ran the first comparison twice. The first copy used the
# `.test` tables before they were built and set `group` on the gene table
# instead of the unannotated one; both copies carried the same pasted output.
# It is run once here, with every object created before it is used. Pasted
# console output is kept as comments because it is not valid R.
# NOTE(review): `sRNAMETHunannotated_closestTE` is not created in this script;
# it must already exist in the session.

# --- Comparison 1: unannotated sRNA loci (group 1) vs shuffled loci (group 2) ---
# FIX: `group` was never assigned on the unannotated table before being
# selected below.
sRNAMETHunannotated_closestTE$group <- 1
sRNAMETHunannotated_closestTE.test <- subset(sRNAMETHunannotated_closestTE, select=c(V17, group))
# The distance is in V17 for this table; rename it so the two tables stack
names(sRNAMETHunannotated_closestTE.test)[1] <- "V16"
randomsRNAloci_closestTE$group <- 2
randomsRNAloci_closestTE.test <- subset(randomsRNAloci_closestTE, select=c(V16, group))
merg1 <- rbind(sRNAMETHunannotated_closestTE.test, randomsRNAloci_closestTE.test)

kruskal.test(V16 ~ group, data = merg1)
# p-value < 2.2e-16

merg1$group <- ordered(merg1$group,
                       levels = c("1", "2"))

res.aov <- aov(V16 ~ group, data = merg1)
summary(res.aov)
#               Df    Sum Sq   Mean Sq F value Pr(>F)
# group          1 5.116e+09 5.116e+09   96.42 <2e-16 ***
# Residuals   2523 1.339e+11 5.306e+07


# --- Comparison 2: gene-overlapping loci (group 1) vs unannotated loci (group 2) ---
sRNAMETHexpressedGENE_closestTE$group <- 1
sRNAMETHunannotated_closestTE$group <- 2
sRNAMETHexpressedGENE_closestTE.test <- subset(sRNAMETHexpressedGENE_closestTE, select=c(V16, group))
sRNAMETHunannotated_closestTE.test <- subset(sRNAMETHunannotated_closestTE, select=c(V17, group))
names(sRNAMETHunannotated_closestTE.test)[1] <- "V16"
merg1 <- rbind(sRNAMETHexpressedGENE_closestTE.test, sRNAMETHunannotated_closestTE.test)
kruskal.test(V16 ~ group, data = merg1)
# p-value  = 2.881e-06

merg1$group <- ordered(merg1$group,
                       levels = c("1", "2"))

res.aov <- aov(V16 ~ group, data = merg1)
summary(res.aov)
#               Df    Sum Sq   Mean Sq F value   Pr(>F)
# group          1 7.663e+08 766297328   23.53 1.35e-06 ***
# Residuals   1548 5.041e+10  32564070
# ---
# Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1



