

####################################################################
##### Plot the methylation distribution for all TE categories #####
####################################################################

# All relative paths below (bedtools outputs read with read.table) are resolved
# against this working directory; out_dir is where this script writes BED files.
setwd("/Users/alexandradallaire/project0/fungalsmallrnas/RIR17genome/kinase")
out_dir <- "/Users/alexandradallaire/methanalysis"

# Shell step -- run in a terminal, NOT in R (kept as comments so the script parses).
# Both commands write to the same file, so whichever runs last wins: run in this
# order, TE_meth_overlap.bed holds the per-TE MEDIAN of column 4 of methIGV.bed.
#   bedtools map -c 4 -o mean -a Rir_HGAP_ii_V2.fa.out.TEONLY.sorted.bed -b methIGV.bed > TE_meth_overlap.bed
#   bedtools map -c 4 -o median -a Rir_HGAP_ii_V2.fa.out.TEONLY.sorted.bed -b methIGV.bed > TE_meth_overlap.bed

# Columns are unnamed (V1..V6); V6 is the value appended by bedtools map.
z1 <- read.table("TE_meth_overlap.bed", header = FALSE, sep = "\t",
                 stringsAsFactors = FALSE, quote = "")

# Remove lines that have a dot in the last column (V6), i.e. TEs for which
# bedtools reported no methylation value
z1filt <- z1[z1$V6 != ".", ]
sapply(z1filt, class)
# V6 may have been read as character because of the dots; convert to numeric
z1filt$V6 <- as.numeric(as.character(z1filt$V6))
sapply(z1filt, class)

# Per TE category (V4), count how many distinct start coordinates (V2) occur,
# then sort the categories from most to least represented.
ZDT <- data.table(z1filt)
Zcat <- ZDT[, list(number_of_distinct_orders = length(unique(V2))), by = V4]
Zcatord <- arrange(Zcat, desc(number_of_distinct_orders))
# Zcatord lists all TE categories which have an mCG score in the methylation
# data. (Those that have no coverage were removed above.)

# Exploratory boxplot of the methylation score (V6) for every TE category
ggplot(z1filt, aes(x = V4, y = V6)) +
  geom_boxplot() +
  theme_minimal() +
  theme(
    axis.text.x = element_text(size = 10, angle = 90),
    axis.text.y = element_text(size = 4)
  ) +
  coord_flip()

# Size filtering: keep TEs whose interval (End - Start) is longer than 100
z1filt$size <- z1filt$V3 - z1filt$V2
z1filt2 <- z1filt[z1filt$size > 100, ]

# Only keep TE classes that are represented at least 20 times in the list
tt <- table(z1filt2$V4)
df2 <- z1filt2[z1filt2$V4 %in% names(tt)[tt >= 20], ]

############################################################
############### FINAL PLOT Figure 2D ########################
############################################################
# Boxplot + jittered points of the methylation score (V6) per TE class (V4),
# classes ordered by decreasing mean score (reorder() default); the red point
# marks the median of each class.
ggplot(data = df2, mapping = aes(x = reorder(V4, -V6), y = V6)) +
  geom_boxplot(outlier.size = 0.01, width = 0.5, alpha = 0.25) +
  geom_jitter(color = "black", size = 0.01, alpha = 0.25) +
  stat_summary(fun = median, geom = "point", size = 1, color = "red") +
  # The complete theme must come BEFORE the theme() tweaks: a complete theme
  # added last replaces every earlier setting, which silently discarded the
  # axis text sizes/angle in the previous version of this plot.
  theme_classic() +
  theme(
    axis.text.x = element_text(size = 10, angle = 90),
    axis.text.y = element_text(size = 4)
  ) +
  coord_flip()

# Use the df2 dataframe to check scores of expressed TEs: overlap the
# methylation score of TEs with the expressed TEs (nongenicExprTE is created
# outside this section).
nongenicExprTE_cop <- nongenicExprTE
# Columns 23-25 of the expression table hold the TE coordinates
names(nongenicExprTE_cop)[23:25] <- c("Chr", "Start", "End")
names(df2)[1:3] <- c("Chr", "Start", "End")

# Merge the dataframes by coordinates and rename the methylation score column
exprTE_meth <- merge(nongenicExprTE_cop, df2, by = c("Chr", "Start", "End"))
# NOTE(review): position 32 is assumed to be df2's V6 after the merge -- confirm
names(exprTE_meth)[32] <- "MethScore"
# Give them group names: each subset is selected by a pattern on the TE class
# column (V4) and tagged with a coarse group label. The n= values are the row
# counts recorded by the original author.

# LTR elements, split into Gypsy and everything else
LTR <- dplyr::filter(exprTE_meth, grepl("LTR", V4)) # 240
is_gypsy <- grepl("LTR/Gypsy", LTR$V4)
gypsy <- dplyr::filter(LTR, is_gypsy) # n=186
gypsy$group <- "LTR/Gypsy"
ltrother <- dplyr::filter(LTR, !is_gypsy) # n=54
ltrother$group <- "LTR/Other"

# LINE elements
line <- dplyr::filter(exprTE_meth, grepl("LINE/", V4)) # n=170
line$group <- "LINE"

# Named DNA transposon superfamilies
hAT <- dplyr::filter(exprTE_meth, grepl("DNA/hAT", V4)) # n=356
hAT$group <- "DNA/hAT"
tcMar <- dplyr::filter(exprTE_meth, grepl("DNA/Tc", V4)) # n=258
tcMar$group <- "DNA/TcMar"
cmc <- dplyr::filter(exprTE_meth, grepl("DNA/CMC", V4)) # n=115
cmc$group <- "DNA/CMC"
mule <- dplyr::filter(exprTE_meth, grepl("DNA/MULE", V4)) # n=269
mule$group <- "DNA/MULE"
mav <- dplyr::filter(exprTE_meth, grepl("DNA/Maverick", V4)) # n=98
mav$group <- "DNA/Maverick"

# Helitrons
helitron <- dplyr::filter(exprTE_meth, grepl("Helitron", V4)) # n=106
helitron$group <- "Helitron"

# Remaining DNA transposons: every "DNA/" entry not captured by a pattern above
DNA <- dplyr::filter(exprTE_meth, grepl("DNA/", V4))
DNAother <- dplyr::filter(
  DNA,
  !(grepl("DNA/CMC", V4) |
      grepl("DNA/hAT", V4) |
      grepl("DNA/Maverick", V4) |
      grepl("DNA/MULE", V4) |
      grepl("DNA/Tc", V4) |
      grepl("Helitron", V4))
)
DNAother$group <- "DNA/Other" # 173

# Group them together
# n=1785 TEs we have methylation data for
exprTE_meth_group <- rbind(
  gypsy, ltrother, line, hAT, tcMar, cmc, mule, mav, helitron, DNAother
)

# Plot the distribution of methylation scores of expressed TEs per group:
# violin outline, jittered points, red point at the group median
ggplot(exprTE_meth_group, aes(x = reorder(group, -MethScore), y = MethScore)) +
  geom_violin(fill = "white", colour = "black", alpha = 0.05, width = 1) +
  geom_jitter(color = "Black", size = 0.5, alpha = 0.5) +
  stat_summary(fun = median, geom = "point", size = 1, color = "red") +
  coord_flip() +
  theme_classic() +
  theme(
    axis.text.x = element_text(size = 10, angle = 90),
    axis.text.y = element_text(size = 4)
  )
  

# Add a different color to dots of differentially expressed TEs:
# match the grouped expressed TEs to the DE TE table (deTEsinfo, created
# outside this section) by coordinates.
exprTE_meth_group_cop <- exprTE_meth_group[, c("Chr", "Start", "End", "group", "MethScore")]
deTEsinfo_cop <- subset(deTEsinfo, select = c(Chr, Start, End, group))
# Both inputs carry a `group` column, so the merge yields group.x / group.y
diffexprTE_meth <- merge(
  exprTE_meth_group_cop,
  deTEsinfo_cop,
  by = c("Chr", "Start", "End")
)
diffexprTE_meth$newgroup <- 2

# Plot the distribution of the differentially expressed TEs only
ggplot(diffexprTE_meth, aes(x = reorder(group.x, -MethScore), y = MethScore)) +
  geom_jitter(color = "springgreen1", size = 2, alpha = 1, pch = 18) +
  coord_flip() +
  ylim(0, 1) +
  theme_classic() +
  theme(
    axis.text.x = element_text(size = 10, angle = 90),
    axis.text.y = element_text(size = 4)
  )


### Test whether the methylation scores of expressed TEs differ from those of
### non-expressed TEs (rank-based test: Wilcoxon / Kruskal-Wallis chi-squared)
# Make a df containing the methscores of non-expressed TEs: rows of df2 with no
# match in exprTE_meth_group (natural join on all shared column names)
notexprTE_meth <- anti_join(df2, exprTE_meth_group)
notexprTE_meth$newgroup <- 2          # 2 = not expressed
names(notexprTE_meth)[4] <- "group"   # raw TE class (V4) becomes the group label
notexprTE_meth$V5 <- NULL
notexprTE_meth$size <- NULL
names(notexprTE_meth)[5] <- "MethScore"
exprTE_meth_group$Geneid <- NULL

exprTE_meth_group_cop$newgroup <- 1   # 1 = expressed
chisqdf <- rbind(exprTE_meth_group_cop, notexprTE_meth)
#kruskal.test(MethScore ~ newgroup, data = chisqdf)

# Within one TE group, compare scores of expressed (newgroup 1) against
# non-expressed (newgroup 2) TEs. The previous call,
#   group_by(newgroup) + wilcox.test(chisqdf$MethScore, newgroup, ...),
# tested the whole unfiltered score column against the constant 1/2 labels,
# which is not a comparison between the two sets of scores.
# NOTE(review): non-expressed rows keep the raw V4 class as `group`, whereas
# expressed rows use the coarse labels assigned earlier ("DNA/hAT", "LINE", ...);
# confirm the labels match before filtering, otherwise one side may be empty.
chisqdf %>%
  filter(group == "DNA/hAT") %>%
  summarise(
    p_value = wilcox.test(
      MethScore[newgroup == 1],
      MethScore[newgroup == 2],
      exact = FALSE
    )$p.value
  )

# P-values recorded by the author with the PREVIOUS (incorrect) call; kept for
# reference only -- regenerate with the corrected test above.
# DNA/hAT : p-value 7.40e-228
# DNA/Other : p-value 7.98e-113
# DNA/CMC : p-value 7.25e-76
# DNA/TcMar : p-value 1.53e-166
# LINE : p-value 6.42e-111
# DNA/MULE : p-value 1.86e-173
# LTR/Gypsy : p-value 4.50e-121
# DNA/Maverick : p-value 5.55e-65
# Helitron : p-value 4.16e-70
# LTR/Other : p-value 1.11e-36



#Merge diffexprTE_meth back to exprTE_meth
# Goal: one table of expressed TEs where newgroup = 2 marks differentially
# expressed TEs and newgroup = 1 marks the remaining (non-DE) expressed TEs.
# NOTE(review): `group` is only added to the per-class subsets that are combined
# into exprTE_meth_group, not to exprTE_meth itself -- unless nongenicExprTE
# already carries a `group` column this select fails; confirm the intended source.
exprTE_meth_tomerge <- subset(exprTE_meth, select=c(Chr, Start, End, group, Geneid, MethScore))
# NOTE(review): diffexprTE_meth is merged from tables that contain only
# Chr/Start/End/group/MethScore, so it appears to have no `Geneid` column -- confirm.
diffexprTE_meth_tomerge <- subset(diffexprTE_meth, select=c(Chr, Start, End, group.x, MethScore, Geneid, newgroup))
#Print Geneids that are not found in both dfs (non DE TEs)
missing <- data.frame(setdiff(exprTE_meth_tomerge$Geneid, diffexprTE_meth_tomerge$Geneid))
names(missing)[1] <- "Geneid"
# Keep only the expressed TEs whose Geneid is absent from the DE table
exprTE_meth_tomerge2 <- exprTE_meth_tomerge %>%
  filter(Geneid %in% missing$Geneid)
exprTE_meth_tomerge2$newgroup =1

# NOTE(review): rbind() on data frames matches columns by name; the two inputs
# use `group` vs `group.x`, so the names may need harmonising first -- confirm.
exprTE_meth_merge <- rbind(exprTE_meth_tomerge2, diffexprTE_meth_tomerge)



############################################################
############### PLOT Figure 2B     ########################
############################################################

# Get the % divergence of all TEs in the list (size is already there)
annotTE <- read.table("Rir_HGAP_ii_V2.fa.out.copy.txt", header = FALSE, fill = TRUE)
# Remove rows that have a * in the first column
annotTE <- annotTE[annotTE$V1 != "*", ]
# Renumber the TEs (Geneid) to remove ID duplicates
annotTE$V15 <- seq.int(nrow(annotTE))
# Reorganize the dataframe (similar to bed files); V2 is kept to check the
# % divergence later
annotTE <- annotTE[, c("V5", "V6", "V7", "V10", "V11", "V15", "V2")]
annotTE$length <- abs(annotTE$V7 - annotTE$V6)
names(annotTE) <- c("Chr", "Start", "End", "V10", "V11", "Geneid", "percdiv", "length")

names(df2)[1:3] <- c("Chr", "Start", "End")

# Merge the annotation and df2 dataframes based on coordinates
methTEdetails <- merge(df2, annotTE, by = c("Chr", "Start", "End"))

# Define groups for each methylation category: score >= 0.5 is high (group "A"),
# score < 0.5 is low (group "B")
methTEhigh <- methTEdetails[which(methTEdetails$V6 >= 0.5), ]
methTEhigh$group <- "A"
methTElow <- methTEdetails[which(methTEdetails$V6 < 0.5), ]
methTElow$group <- "B"

methTE_grouped <- rbind(methTEhigh, methTElow)

# Inspect column classes, then make sure length is numeric before taking log2
sapply(methTE_grouped, class)
methTE_grouped$length <- as.numeric(as.character(methTE_grouped$length))
methTE_grouped$loglength <- log2(methTE_grouped$length)

# Scatter of % divergence against TE length, coloured by methylation group
scatterPlot <- ggplot(methTE_grouped, aes(x = percdiv, y = length, colour = group)) +
  theme_classic() +
  geom_point(alpha = 0.5)
scatterPlot

# Marginal density plot of x (top panel)
xdensity <- ggplot(methTE_grouped, aes(x = percdiv, fill = group)) +
  theme_classic() +
  geom_density(alpha = 0.5) +
  scale_fill_manual(values = c("red", "blue"))
xdensity

# Marginal density plot of y (right panel)
ydensity <- ggplot(methTE_grouped, aes(x = length, fill = group)) +
  theme_classic() +
  geom_density(alpha = .5) +
  scale_fill_manual(values = c("red", "blue"))
ydensity



# Log scale: same three panels as above, using log2(length) on the y axis
# NOTE(review): group.colors is not referenced by any plot in this section; the
# scatter below uses its own colour vector -- confirm which palette is intended.
group.colors <- c(A = "brown2", B = "black")

scatterPlot <- ggplot(methTE_grouped, aes(percdiv, loglength, colour = factor(group))) +
  geom_point(alpha = 0.20) +
  theme_classic() +
  scale_color_manual(values = c("coral1", "slateblue"))
scatterPlot

# Marginal density plot of x (top panel)
xdensity <- ggplot(methTE_grouped, aes(percdiv, fill = group)) +
  geom_density(alpha = 0.5) +
  scale_fill_manual(values = c("red", "blue")) +
  theme_classic()
xdensity

# Do the high and low methylation groups differ in % divergence?
kruskal.test(percdiv ~ group, data = methTE_grouped)
# Result recorded by the author: p-value < 2.2e-16
# (kept as a comment: as bare text it was evaluated as the R expression
#  `p - value < 2.2e-16` and aborted a sourced run)

# Marginal density plot of y (right panel)
ydensity <- ggplot(methTE_grouped, aes(loglength, fill = group)) +
  geom_density(alpha = .5) +
  scale_fill_manual(values = c("red", "blue")) +
  theme_classic()
ydensity

# Do the high and low methylation groups differ in log2 length?
kruskal.test(loglength ~ group, data = methTE_grouped)
# Result recorded by the author: p-value < 2.2e-16








#######################################################################
################# TE metagene analysis (FIGURE 2C)     ################
#######################################################################

# TEs with methscore >0.5
methTEhigh <- read.table("methTE_high.txt", header = TRUE)

# Make bed file with methTEhigh
# BED = chr   coord1    coord2     dots      strand      number       name/Geneid
# Column order written: Chr, Start, End, Geneid, V5, size, V4 (TE class)
methTEhighbed <- subset(methTEhigh, select = c(Chr, Start, End, Geneid, V5, size, V4))
methTEhighbed$V5 <- "+" # every TE is written as plus-strand
write.table(methTEhighbed, file.path(out_dir, "methTEhigh.bed"),
            col.names = FALSE, quote = FALSE, sep = "\t", row.names = FALSE)

# Make upstream and downstream windows (2000 bp on both sides, -b 2000).
# Shell step -- run in a terminal, NOT in R (kept as a comment so the script parses).
# NOTE(review): the BED files are written to out_dir but read back with relative
# paths (working directory); confirm where bedtools is run / where outputs land.
#   bedtools  flank -i methTEhigh.bed -g contiglengths.bed -b 2000 > flank_methTEhigh.bed
methTEflanks <- read.table("flank_methTEhigh.bed", header = FALSE)
# Split the ones that don't have a left and right flanking sequence:
# element "FALSE" = TE ids (V4) seen once, element "TRUE" = ids seen more than once
df1 <- split(methTEflanks, duplicated(methTEflanks$V4) | duplicated(methTEflanks$V4, fromLast = TRUE))
df1

# NOTE: this reuses (overwrites) the name df2 from the earlier TE table
df2 <- as.data.frame(df1[[1]]) #These only have one left or one right. I'm not gonna use them
df3 <- as.data.frame(df1[[2]]) #These have a left and a right flank

# Separate the left and right flanks: first row per TE id = left, last = right
leftflank <- aggregate(. ~ V4, data = df3, FUN = head, 1)
rightflank <- aggregate(. ~ V4, data = df3, FUN = tail, 1)
# aggregate() puts the grouping column first; restore BED column order
leftflank <- subset(leftflank, select = c(V1, V2, V3, V4, V5, V6, V7))
rightflank <- subset(rightflank, select = c(V1, V2, V3, V4, V5, V6, V7))
write.table(leftflank, file.path(out_dir, "leftflank.bed"),
            col.names = FALSE, quote = FALSE, sep = "\t", row.names = FALSE)
write.table(rightflank, file.path(out_dir, "rightflank.bed"),
            col.names = FALSE, quote = FALSE, sep = "\t", row.names = FALSE)


# bedtools: attach every methIGV.bed record to the flank / TE body it overlaps.
# Shell step -- run in a terminal, NOT in R (kept as comments so the script parses).
#   bedtools intersect -wa -wb -a leftflank.bed -b methIGV.bed > methTEhigh_left.bed
#   bedtools intersect -wa -wb -a rightflank.bed -b methIGV.bed > methTEhigh_right.bed
#   bedtools intersect -wa -wb -a methTEhigh.bed -b methIGV.bed > methTEhigh_body.bed

# Columns V1-V7 come from the flank/TE BED, V8 onwards from methIGV.bed
leftmeth <- read.table("methTEhigh_left.bed", header = FALSE)
rightmeth <- read.table("methTEhigh_right.bed", header = FALSE)
bodymeth <- read.table("methTEhigh_body.bed", header = FALSE)

# Drop entries whose TE class (V7) is rRNA
bodymeth <- dplyr::filter(bodymeth, !grepl("rRNA", V7))
leftmeth <- dplyr::filter(leftmeth, !grepl("rRNA", V7))
rightmeth <- dplyr::filter(rightmeth, !grepl("rRNA", V7))

names(leftmeth)[4] <- "TEid"
names(rightmeth)[4] <- "TEid"
names(bodymeth)[4] <- "TEid"
names(leftmeth)[11] <- "meth"
names(rightmeth)[11] <- "meth"
names(bodymeth)[11] <- "meth"

# Position of each methylation record relative to its feature:
#  - body: percentage along the TE (V6 is the TE size written to the BED)
#    NOTE(review): the +8 offset is unexplained in this file -- confirm
#  - left flank: distance (negative) from the flank end
#  - right flank: distance from the flank start
bodymeth$pos1 <- (((bodymeth$V9 + 8) - (bodymeth$V2)) / bodymeth$V6 * 100)
leftmeth$pos1 <- -(leftmeth$V3 - leftmeth$V9)
rightmeth$pos1 <- (rightmeth$V9 - rightmeth$V2)

# Clamp out-of-range values to their limits, then place the three segments on
# one shared axis
bodymeth$pos1[bodymeth$pos1 < 0] <- 0
bodymeth$pos1[bodymeth$pos1 > 100] <- 100
leftmeth$pos1[leftmeth$pos1 < (-2000)] <- (-2000)
rightmeth$pos1[rightmeth$pos1 < 0] <- 0
rightmeth$pos1 <- (rightmeth$pos1) + 101 + 1900 # shift right flank past the body
bodymeth$pos1 <- bodymeth$pos1 * 20             # stretch 0-100 % to 0-2000

all <- rbind(leftmeth, bodymeth, rightmeth)

# Resulting pos1 layout (the author's earlier note read "left = -2000 to -1,
# body = 0 to 100, right = 101 to 2100", which predates the rescaling above):
#   left  = -2000 to 0
#   body  = 0 to 2000   (percentage of TE length x 20)
#   right = 2001 upwards (offset from flank start + 2001)

# Methylation along flank / body / flank for Helitrons only, with a polynomial
# smooth (degree 4) through the points
p <- ggplot(all[all$V7 == "RC/Helitron", ], aes(pos1, meth, color = V7)) +
  # alpha is a fixed setting, so it goes outside aes(); inside aes() the constant
  # is mapped through the alpha scale and does not give 10 % opacity
  geom_point(alpha = 0.1) +
  geom_smooth(colour = "black", formula = y ~ poly(x, 4)) +
  facet_wrap(. ~ V7) +
  # complete theme first, tweaks after: theme_classic() added last would reset
  # legend.position and bring the legend back
  theme_classic() +
  theme(legend.position = "none")
p

# Same plot for every TE class, one facet per class (degree-6 polynomial)
p <- ggplot(all, aes(pos1, meth, color = V7)) +
  geom_point(alpha = 0.1) +
  geom_smooth(colour = "black", formula = y ~ poly(x, 6)) +
  facet_wrap(. ~ V7) +
  theme_classic() +
  theme(legend.position = "none")

pdf("plot.pdf")
# print() explicitly: a bare `p` is not drawn when the script is run via source()
print(p)
dev.off()

# Make special bed files of the individual methylation records, so that records
# assigned to a flank can be checked against records assigned to a TE body
# BED = chr   coord1    coord2     dots      strand      number       name/Geneid
# (meth is selected twice on purpose to fill the 4th and 6th BED columns)
leftmeth_2bed <- subset(leftmeth, select = c(V8:V10, meth, V5, meth, pos1))
write.table(leftmeth_2bed, file.path(out_dir, "leftflank_mCG.bed"),
            col.names = FALSE, quote = FALSE, sep = "\t", row.names = FALSE)
rightmeth_2bed <- subset(rightmeth, select = c(V8:V10, meth, V5, meth, pos1))
write.table(rightmeth_2bed, file.path(out_dir, "rightflank_mCG.bed"),
            col.names = FALSE, quote = FALSE, sep = "\t", row.names = FALSE)
bodymeth_2bed <- subset(bodymeth, select = c(V8:V10, meth, V5, meth, pos1))
write.table(bodymeth_2bed, file.path(out_dir, "body_mCG.bed"),
            col.names = FALSE, quote = FALSE, sep = "\t", row.names = FALSE)

# Shell step -- run in a terminal, NOT in R (kept as comments so the script parses).
#   bedtools intersect -wa -wb -a leftflank_mCG.bed -b body_mCG.bed > left_intersect_body.bed
#   bedtools intersect -wa -wb -a rightflank_mCG.bed -b body_mCG.bed > right_intersect_body.bed

leftmeth2 <- read.table("left_intersect_body.bed", header = FALSE)
rightmeth2 <- read.table("right_intersect_body.bed", header = FALSE)
# V1:V7 are the coordinates of mCG sites that overlap with a TE body. Remove these rows from our initial df
leftmeth2 <- subset(leftmeth2, select = c(V1:V7))
rightmeth2 <- subset(rightmeth2, select = c(V1:V7))

# Rename the coordinate columns so they line up with V8:V10 (the methylation
# record coordinates) in leftmeth / rightmeth, then drop the matching rows
names(leftmeth2)[1] <- "V8"
names(leftmeth2)[2] <- "V9"
names(leftmeth2)[3] <- "V10"
newleft <- anti_join(leftmeth, leftmeth2, by = c("V8", "V9", "V10"))

names(rightmeth2)[1] <- "V8"
names(rightmeth2)[2] <- "V9"
names(rightmeth2)[3] <- "V10"
newright <- anti_join(rightmeth, rightmeth2, by = c("V8", "V9", "V10"))

# Keep only what the plots need: TE id, TE class, methylation score, position
newleft <- subset(newleft, select = c(TEid, V7, meth, pos1))
newright <- subset(newright, select = c(TEid, V7, meth, pos1))
newbodymeth <- subset(bodymeth, select = c(TEid, V7, meth, pos1))

all2 <- rbind(newleft, newbodymeth, newright)

# Helitrons only, after removing flank records that overlap a TE body;
# degree-6 polynomial fitted with lm
p <- ggplot(all2[all2$V7 == "RC/Helitron", ], aes(pos1, meth, color = V7)) +
  # alpha is a fixed setting, so it goes outside aes(); inside aes() the constant
  # is mapped through the alpha scale and does not give 10 % opacity
  geom_point(alpha = 0.1) +
  geom_smooth(colour = "black", method = lm, formula = y ~ poly(x, 6)) +
  facet_wrap(. ~ V7) +
  # complete theme first, tweaks after: theme_classic() added last would reset
  # legend.position and bring the legend back
  theme_classic() +
  theme(legend.position = "none")
p

# All TE classes, one facet per class
p <- ggplot(all2, aes(pos1, meth, color = V7)) +
  geom_point(alpha = 0.1) +
  geom_smooth(colour = "black", formula = y ~ poly(x, 6)) +
  facet_wrap(. ~ V7) +
  theme_classic() +
  theme(legend.position = "none")
pdf("plot_test2.pdf")
# print() explicitly: a bare `p` is not drawn when the script is run via source()
print(p)
dev.off()

# Find methylation records that were assigned both to a left flank and to a
# right flank (of neighbouring TEs).
# Shell step -- run in a terminal, NOT in R (kept as a comment so the script parses).
#   bedtools intersect -wa -wb -a leftflank_mCG.bed -b rightflank_mCG.bed > left_right_intersect.bed
leftrightintersect <- read.table("left_right_intersect.bed", header = FALSE)

# Remove from our leftmeth V8:V10 df the ones that are in leftrightintersect V1:V3
# (columns 1-3 are renamed to V8:V10 for the join; the original V8:V10 are moved
# out of the way as x, y, z)
names(leftrightintersect)[1] <- "V8"
names(leftrightintersect)[2] <- "V9"
names(leftrightintersect)[3] <- "V10"
names(leftrightintersect)[8] <- "x"
names(leftrightintersect)[9] <- "y"
names(leftrightintersect)[10] <- "z"
newleft2 <- anti_join(leftmeth, leftrightintersect, by = c("V8", "V9", "V10"))
# ...and also the ones that overlap a TE body
newleft3 <- anti_join(newleft2, leftmeth2, by = c("V8", "V9", "V10"))

# Reload the same dataframe and remove from our rightmeth V8:V10 df the ones that are in leftrightintersect V8:V10
leftrightintersect <- read.table("left_right_intersect.bed", header = FALSE)
newright2 <- anti_join(rightmeth, leftrightintersect, by = c("V8", "V9", "V10"))
newright3 <- anti_join(newright2, rightmeth2, by = c("V8", "V9", "V10"))

newleft3 <- subset(newleft3, select = c(TEid, V7, meth, pos1))
# Subset newright3 (not newright2): using newright2 here threw away the
# body-overlap filtering done on the line above, unlike the left flank
newright3 <- subset(newright3, select = c(TEid, V7, meth, pos1))

all3 <- rbind(newleft3, newbodymeth, newright3)

# Plot all TE classes together: smoothed methylation profile only (no points),
# degree-25 polynomial fitted with lm, y axis fixed to 0-1
p <- ggplot(all3, aes(pos1, meth, color = V7)) +
  geom_smooth(colour = "black", method = lm, formula = y ~ poly(x, 25), level = 0.999999999999999) +
  theme_classic() +
  theme(legend.position = "none") +
  ylim(0, 1)
pdf("plot_test6.pdf")
# print() explicitly: a bare `p` is not drawn when the script is run via source()
print(p)
dev.off()


# Multi-plot for different TE categories (FIGURE 2C)
# Modify all3 df to change the names of the TE categories we want to merge:
# All DNA together
# All LTR together
# All LINE together
# RC/Helitron
unique(all3$V7)

# Row counts recorded by the author (893852 total)
DNA <- dplyr::filter(all3, grepl("DNA", V7)) #490462
LTR <- dplyr::filter(all3, grepl("LTR", V7)) #126284
Heli <- dplyr::filter(all3, grepl("RC/Helitron", V7)) #20665
Line <- dplyr::filter(all3, grepl("LINE", V7)) #256441

# Sanity check: the four categories should add up to the recorded total
490462+126284+20665+256441

# Collapse the detailed TE classes into the four plotted categories
DNA$V7 <- "DNA"
LTR$V7 <- "LTR"
Heli$V7 <- "Helitron"
Line$V7 <- "LINE"

all4 <- rbind(DNA, LTR, Heli, Line)

# One facet per category; smoothed profile only, degree-12 polynomial
p <- ggplot(all4, aes(pos1, meth, color = V7)) +
  geom_smooth(colour = "black", formula = y ~ poly(x, 12), level = 0.999999999999999) +
  facet_wrap(. ~ V7) +
  theme_classic() +
  theme(legend.position = "none") +
  ylim(0, 1)
pdf("plot_test7.pdf")
# print() explicitly: a bare `p` is not drawn when the script is run via source()
print(p)
dev.off()




