
# Methylation scores of gene categories. Use the list of multi-copy orphan genes to separate multi-copy/repetitive orphan genes from non-repetitive/low-copy-number orphan genes (Class B). Class B genes are thereby split into 2 categories: high-copy-number (HCN) orphans and low-copy-number (LCN) orphans.

# Load the gene annotation (tab-separated BED, no header, quotes kept literal).
clemannot_funct3 <- read.table("Clemannot_with_Maedafunction_final.bed", header = FALSE, sep = "\t", stringsAsFactors = FALSE, quote = "")
# 22508 genes in total
# Load the multi-copy (HCN) orphan gene list.
# First set of coordinates are location of the repeat, 2nd set is the gene location/annotation.
HCN_orphan_list <- read.table("multicopy_orphan_genes.bed", header = FALSE, sep = "\t", stringsAsFactors = FALSE, quote = "")
# Our gene annotation contains 10299 orphan/Class B genes (out of a total of 22508 genes).
# 4039 are HCN orphan genes (39.21% of orphan genes are multi-copy, i.e. 17.9% of all genes are HCN orphans).
# 6260 are LCN orphan genes (60.78% of orphan genes are low-copy number, i.e. 27.81% of all genes are LCN orphans).

# Get a methylation score for all genes in the gene annotation.
# Shell step (run outside R before the read.table() below; it is not R code):
#   bedtools map -c 4 -o median -a Clemannot_with_Maedafunction_final.sorted.bed -b methIGV.bed > newgeneannotation.methscores.bed
newannotmeth <- read.table("newgeneannotation.methscores.bed", header = FALSE, sep = "\t", stringsAsFactors = FALSE, quote = "")

# First look at the raw V8 score column, before the "." placeholder rows are
# removed and before V8 is converted to numeric.
ggplot(newannotmeth, aes(x = V8)) +
  geom_bar(color = "black", fill = "black") +
  theme_classic() +
  xlab("mCG score") +
  ylab("Number Genes") +
  ggtitle("mCG scores of 22508 genes")

# Keep only genes that received a score ("." marks rows without one), then
# make the score numeric so it can be binned and thresholded.
newannotmeth <- newannotmeth[newannotmeth$V8 != ".", ]
newannotmeth$V8 <- as.numeric(as.character(newannotmeth$V8))

# PLOT FIGURE 3A
# The title count is taken from the filtered table instead of a hard-coded
# number (the previous literal, 23514, matched neither the 22508 annotated
# genes nor the rows remaining after filtering).
a <- ggplot(newannotmeth, aes(x = V8))
a + geom_histogram(bins = 80, color = "black", fill = "gray") +
  theme_classic() +
  xlab("mCG score") +
  ylab("Number Genes") +
  ggtitle(paste("mCG scores of", nrow(newannotmeth), "genes"))

# High-meth genes: median mCG score (V8) of at least 0.5.
geneHighmeth <- subset(newannotmeth, (V8 >= 0.5)) # 4022 genes are high-meth.
4022/22508*100
# 17.8692 % of genes have a median mCG score >= 0.5
# High-meth genes whose annotation (V7) says hypothetical/provisional/tis13.
unkngenesmeth <- dplyr::filter(geneHighmeth, grepl("hypothetical|provisional|tis13", V7, ignore.case = TRUE))
# 2116
# How many don't have a ";"?
unkngenesmeth2 <- dplyr::filter(geneHighmeth, !grepl(";", V7, ignore.case = TRUE))
# 1107
# 2116 + 1107 = 3223 unknown genes, no known domain
3223/4022*100
# 80.13426 % of High-meth genes have unknown function (class B, orphans)
# NOTE(review): HCN_orphan_meth is only created further down in this script
# (orphan split section), so that section must have been run before these lines.
HCN_orphan_highmeth <- subset(HCN_orphan_meth, (V8 >= 0.5)) # 1491 hi-copy number orphan genes are high-meth.
HCN_orphan_lowmeth <- subset(HCN_orphan_meth, (V8 < 0.5)) # 2548 hi-copy number orphan genes are low-meth.
# i.e. of the 80.1% high-meth classB, 29.6 is high-copynumber (repetitive)
# i.e. of the 80.1% high-meth classB, 50.5 is low-copynumber
# NOTE(review): 2548 is the HCN *low-meth* count from the subset above, yet the
# resulting 50.5 is labelled low-copy-number high-meth -- confirm the intended
# numerator/denominator before reusing these percentages.
1491/4039*100
2548/4039*100
0.36915*80.1
0.6308*80.1


# Class C (expanded gene families) among high-meth genes.
# 113 kinases (87 ser-thr.kin), 30 sel1, 19 kelch, 16 btbpoz, 17 calmodulin.kin
113+30+19+16+17
# 195 High-meth genes are class C
195/4022*100 #= 4.84 % High-meth genes are class C
# Successively strip class C families, unknown genes, entries without ";" and
# TE-related domains from the high-meth set; what is left in othergenesmeth5
# are the remaining annotated genes.
othergenesmeth2 <- dplyr::filter(geneHighmeth, !grepl("kelch|calmodulin|sel1|btb/poz|btb-poz|serine/threonine|serine-threonine|serine_threonine", V7, ignore.case = TRUE))
othergenesmeth3 <- dplyr::filter(othergenesmeth2, !grepl("hypothetical|provisional|tis13", V7, ignore.case = TRUE))
othergenesmeth4 <- dplyr::filter(othergenesmeth3, grepl(";", V7, ignore.case = TRUE))
othergenesmeth5 <- dplyr::filter(othergenesmeth4, !grepl("gag|reverse_transcriptas|zinc_finger_mym|far1|DNA_pol|transposase|ribonuclease_h|helicase|retrotransposon|CENP-b|RNase_h|crinkler|transposable|zinc_finger_bed_domain", V7, ignore.case = TRUE))

# TE-related domains (number of high-meth genes in parentheses) =
# reverse_transcriptase   (19)
# helicase   (45) (part of helitron)
# RNase_h (1)
# CENP-b_protein_1   (12) (part of Crypton-A)
# zinc_finger_mym-type_protein_2 (part of crypton-A)    (18)
# gag_pol (27)
# far-related gene (70)
# DNA_pol (43)
# transposase (34)
# ribonuclease_h (25)
# retrotransposon (4)
# zinc_finger_bed_domain (152)
# transposable (13)
# rve_super_family_integrase (2)
# nuclease_harbi1 (9)
19+45+1+12+18+27+70+43+34+25+4+152+13+2+9
474/4022*100 # 11.79 % TE-related

# crinkler_family_protein  (14)
14/4022*100 # 0.3480855 % crinkler effector

# Unknown genes overall: 3223 high-meth + 7076 low-meth.
3223+7076
# 10299  genes with no known protein domain in our annotation
10299/22508*100 # = 45% of genes with no known protein domain

# Low-meth genes: median mCG score (V8) below 0.5.
geneLowmeth <- newannotmeth[which(newannotmeth$V8 < 0.5), , drop = FALSE] # 18407
# Low-meth genes whose annotation (V7) says hypothetical/provisional/tis13.
unkngenesnonmeth <- dplyr::filter(
  geneLowmeth,
  grepl("hypothetical|provisional|tis13", V7, ignore.case = TRUE)
)
5263/18407*100
# Low-meth genes whose annotation (V7) contains no ";".
unkngenesnonmeth2 <- dplyr::filter(
  geneLowmeth,
  !grepl(";", V7, ignore.case = TRUE)
)
1813 + 5263
# 7076 unknown genes are low-methylated
7076/18407*100
# 38.4419% of low-meth genes have unknown function (Class B)
# NOTE(review): LCN_orphan_meth is created further down in this script, so that
# section has to be run before the two lines below.
LCN_orphan_highmeth <- LCN_orphan_meth[which(LCN_orphan_meth$V8 >= 0.5), , drop = FALSE] # 2056 low-copy number orphan genes are high-meth.
LCN_orphan_lowmeth <- LCN_orphan_meth[which(LCN_orphan_meth$V8 < 0.5), , drop = FALSE] # 5589 low-copy number orphan genes are low-meth.
# i.e. of the 38.4% low-meth classB, 11.2 is high-copynumber (repetitive)
# i.e. of the 38.4% low-meth classB, 27.2 is low-copynumber
# NOTE(review): 2056 is the LCN high-meth count from the line above, while the
# 11.2 derived from it is labelled high-copy-number -- confirm.
2056/7076*100
5589/7076*100
0.2905596*38.4419
38.4-11.2

# Class C (expanded gene families) among low-meth genes.
x <- dplyr::filter(geneLowmeth, grepl("kelch|calmodulin|sel1|btb/poz|btb-poz|serine/threonine|serine-threonine|serine_threonine", V7, ignore.case = TRUE))
3512/18407*100
# 3512 repeated genes = 19.0797 % of low-meth genes are class C
3512+195
3707/22508*100 # 16.5% of all genes are class C
# TE-related domains (number of low-meth genes in parentheses) =
# reverse_transcriptase   (0)
# CENP-b_protein_1   (2) (part of Crypton-A)
# zinc_finger_mym-type_protein_2 (crypton-A)    (14)
# gag_pol (1)
# far-related gene (14)
# DNA_pol (9)
# transposase (16)
# ribonuclease_h (7)
# retrotransposon (5)
# zinc_finger_bed_domain (21)
# transposable (5)
# rve_super_family_integrase (0)
# nuclease_harbi1 (4)
0+2+14+1+14+9+16+7+5+21+5+4
98/18407*100
# 0.53 % of low-meth genes are TE-related

# crinkler_family_protein  (34)
34/18407*100 # 0.1847123 % crinkler effector

# Low-meth (x) and high-meth (y) genes with class C families and a subset of
# TE-related/crinkler annotations removed. Note that x is reused here and no
# longer holds the class C subset computed at the top of this stanza.
x <- dplyr::filter(geneLowmeth, !grepl("kelch|calmodulin|sel1|btb/poz|btb-poz|serine/threonine|serine-threonine|serine_threonine|reverse_transcriptase|CENP-b|crinkler_family_protein|zinc_finger_mym", V7, ignore.case = TRUE))
y <- dplyr::filter(geneHighmeth, !grepl("kelch|calmodulin|sel1|btb/poz|btb-poz|serine/threonine|serine-threonine|serine_threonine|reverse_transcriptase|CENP-b|crinkler_family_protein|zinc_finger_mym", V7, ignore.case = TRUE))

# PIE CHARTS IN FIGURE 3C
# Each chart is a single stacked bar (geom_bar, stat = "identity") bent into a
# circle with coord_polar("y"). Values are the percentages worked out above.

# Highmeth
100-80.13-11.79-4.84-0.35
2.89 # class A
# 80.13 class B is further split into HCN and LCN
pie <- data.frame(
  group = c("ClassB_HCN", "ClassB_LCN", "Transposon-related domain", "ClassC", "ClassA", "crinkler"),
  value = c(29.6, 50.5, 11.79, 4.84, 2.89, 0.35)
)
# Preview the table being plotted. (Was head(df): no `df` object is created in
# this script, so that printed the head of the stats::df function instead.)
head(pie)
bp <- ggplot(pie, aes(x = "", y = value, fill = group)) +
  geom_bar(width = 1, stat = "identity")
bp
pieplot <- bp + coord_polar("y", start = 0) + theme_classic()
pieplot

# Lowmeth
100-38.4-19.08-0.53-0.18
41.81 # class A
# 38.4 class B is further split into HCN and LCN
pie <- data.frame(
  group = c("ClassB_HCN", "ClassB_LCN", "ClassC", "Transposon-related domain", "ClassA", "crinkler"),
  value = c(11.2, 27.2, 19.08, 0.53, 41.81, 0.18)
)
head(pie)
bp <- ggplot(pie, aes(x = "", y = value, fill = group)) +
  geom_bar(width = 1, stat = "identity")
bp
pieplot <- bp + coord_polar("y", start = 0) + theme_classic()
pieplot

# Metagene plot for high and low categories separately.
# Start with flanking regions.
# Shell steps (run outside R before the read.table() calls; not R code):
#   bedtools  flank -i LowmethGenes.bed -g contiglengths.bed -b 2000 > lowmethflank.bed
#   bedtools  flank -i HighmethGenes.bed -g contiglengths.bed -b 2000 > highmethflank.bed

# fill = TRUE pads short rows with NA instead of failing on ragged lines.
lowmethflanks <- read.table("lowmethflank.bed", header = FALSE, fill = TRUE, sep = "\t", stringsAsFactors = FALSE, quote = "")
highmethflanks <- read.table("highmethflank.bed", header = FALSE, fill = TRUE, sep = "\t", stringsAsFactors = FALSE, quote = "")

# Split a flank table into one left-flank row and one right-flank row per gene.
#
# flanks:      data frame read from a flank BED file; V7 identifies the gene.
# group_label: value written to the `group` column ("low" / "high").
# Returns a list with data frames `left` and `right` (columns V1..V7, group).
#
# Only genes that occur more than once in V7 (i.e. have both flanks) are used.
# The previous code picked the group by position (`split(...)[[1]]`), which is
# the *unpaired* group whenever at least one gene lacks a flank, and made
# `[[2]]` fail when every gene is paired; the logical mask below is used
# directly instead.
# Assumes the left flank row precedes the right flank row for each gene, since
# head()/tail() take the first/last row per V7 -- TODO confirm against the
# row order of the flank files.
split_gene_flanks <- function(flanks, group_label) {
  bed_cols <- c("V1", "V2", "V3", "V4", "V5", "V6", "V7")
  has_both <- duplicated(flanks$V7) | duplicated(flanks$V7, fromLast = TRUE)
  paired <- flanks[has_both, , drop = FALSE]
  if (nrow(paired) == 0L) {
    stop("no gene has both a left and a right flank", call. = FALSE)
  }
  # aggregate() puts V7 first; re-select to restore the V1..V7 column order.
  left <- aggregate(. ~ V7, data = paired, FUN = head, 1)[, bed_cols]
  right <- aggregate(. ~ V7, data = paired, FUN = tail, 1)[, bed_cols]
  left$group <- group_label
  right$group <- group_label
  list(left = left, right = right)
}

# Write a table as headerless, unquoted, tab-separated text into out_dir.
write_bed <- function(tbl, file_name) {
  write.table(tbl, file.path(out_dir, file_name), col.names = FALSE, quote = FALSE, sep = "\t", row.names = FALSE)
}

# LOW - keep the genes that have a left and a right flanking sequence
low_flanks <- split_gene_flanks(lowmethflanks, "low")
leftflanklow <- low_flanks$left
rightflanklow <- low_flanks$right
write_bed(leftflanklow, "leftflanklow.bed")
write_bed(rightflanklow, "rightflanklow.bed")

# HIGH - keep the genes that have a left and a right flanking sequence
high_flanks <- split_gene_flanks(highmethflanks, "high")
leftflankhigh <- high_flanks$left
rightflankhigh <- high_flanks$right
write_bed(leftflankhigh, "leftflankhigh.bed")
write_bed(rightflankhigh, "rightflankhigh.bed")

# Stack low and high so each side can be intersected in one pass.
allleftflanks <- rbind(leftflanklow, leftflankhigh)
allrightflanks <- rbind(rightflanklow, rightflankhigh)
write_bed(allleftflanks, "geneleftflank.bed")
write_bed(allrightflanks, "generightflank.bed")

# Gene-body table: drop the score column (V8), tag each gene with its
# methylation category, stack both categories and export them.
geneLowmeth[["V8"]] <- NULL
geneHighmeth[["V8"]] <- NULL
geneLowmeth[["group"]] <- "low"
geneHighmeth[["group"]] <- "high"
allgenes <- rbind(geneLowmeth, geneHighmeth)
write.table(
  allgenes,
  file = paste(out_dir, "allgenes.bed", sep = "/"),
  sep = "\t",
  quote = FALSE,
  row.names = FALSE,
  col.names = FALSE
)

# Intersect lists with the methylation data.
# Shell steps (run outside R before the read.table() calls; not R code):
#   bedtools intersect -wa -wb -a geneleftflank.bed -b methIGV.bed > methgene_left.bed
#   bedtools intersect -wa -wb -a generightflank.bed -b methIGV.bed > methgene_right.bed
#   bedtools intersect -wa -wb -a allgenes.bed -b methIGV.bed > methgene_body.bed
# NOTE(review): the -a inputs were written into out_dir above, whereas these
# commands and the reads below use bare file names -- presumably run from
# out_dir; confirm.

leftmeth <- read.table("methgene_left.bed", header = FALSE, fill = TRUE, sep = "\t", stringsAsFactors = FALSE, quote = "")
rightmeth <- read.table("methgene_right.bed", header = FALSE, fill = TRUE, sep = "\t", stringsAsFactors = FALSE, quote = "")
bodymeth <- read.table("methgene_body.bed", header = FALSE, fill = TRUE, sep = "\t", stringsAsFactors = FALSE, quote = "")

# Columns used below: V2/V3 = start/end of the gene or flank interval,
# V8 = gene category, V10 = start of the methylation record, V12 = meth score.
# Goal: put every methylation record on one shared x axis (pos1).

# Interval lengths.
bodymeth$length <- bodymeth$V3 - bodymeth$V2
leftmeth$length <- leftmeth$V3 - leftmeth$V2
rightmeth$length <- rightmeth$V3 - rightmeth$V2

# Raw positions: gene body as a percentage of gene length, left flank as a
# negative distance to the interval end, right flank as a distance from the
# interval start.
bodymeth$pos1 <- ((bodymeth$V10 + 8) - bodymeth$V2) / bodymeth$length * 100
leftmeth$pos1 <- -(leftmeth$V3 - leftmeth$V10)
rightmeth$pos1 <- rightmeth$V10 - rightmeth$V2

# Clamp out-of-range values to the limits of each segment.
bodymeth$pos1 <- replace(bodymeth$pos1, which(bodymeth$pos1 < 0), 0)
bodymeth$pos1 <- replace(bodymeth$pos1, which(bodymeth$pos1 > 100), 100)
leftmeth$pos1 <- replace(leftmeth$pos1, which(leftmeth$pos1 < (-2000)), (-2000))
rightmeth$pos1 <- replace(rightmeth$pos1, which(rightmeth$pos1 < 0), 0)

# Shift the right flank past the gene body and stretch the body from
# 0..100 to 0..2000 so all three segments share one scale.
rightmeth$pos1 <- rightmeth$pos1 + 101 + 1900
bodymeth$pos1 <- bodymeth$pos1 * 20

all <- rbind(leftmeth, bodymeth, rightmeth)

# Smoothed methylation profile (FIGURE 3B) for one gene category of `all`:
# meth score (V12) against metagene position (pos1), fitted with a degree-12
# polynomial and drawn on a fixed 0..1 y axis.
metagene_panel <- function(category) {
  ggplot(all[all$V8 == category, ], aes(pos1, V12, color = V8)) +
    theme(legend.position = "none") +
    geom_smooth(colour = "black", formula = y ~ poly(x, 12), level = 0.999999999999999) +
    facet_wrap(. ~ V8) +
    theme_classic() +
    theme(legend.position = "none") +
    ylim(0, 1)
}

# Low-meth genes (FIGURE 3B, left panel)
p <- metagene_panel("low")
p

# High-meth genes (FIGURE 3B, right panel)
p <- metagene_panel("high")
p







########################################
# Plot meth score of genes - Figure 3D #
########################################


# Split the scored genes into families by keyword search in the annotation
# column (V7), keeping one row per distinct V7 value.
first_per_annotation <- function(tbl) {
  tbl[!duplicated(tbl$V7), ]
}

ser.thrkin.methscore <- first_per_annotation(
  dplyr::filter(newannotmeth, grepl("serine/threonine|serine-threonine|serine_threonine", V7, ignore.case = TRUE))
)

calmodkin.methscore <- first_per_annotation(
  dplyr::filter(newannotmeth, grepl("calmodulin", V7, ignore.case = TRUE))
)

sel1.methscore <- first_per_annotation(
  dplyr::filter(newannotmeth, grepl("sel1", V7, ignore.case = TRUE))
)

kelch.methscore <- first_per_annotation(
  dplyr::filter(newannotmeth, grepl("kelch", V7, ignore.case = TRUE))
)

btb.poz.methscore <- first_per_annotation(
  dplyr::filter(newannotmeth, grepl("btb/poz|btb-poz", V7, ignore.case = TRUE))
)

# Everything that is neither a class C family nor TE-related.
# NOTE(review): `genes` is not created anywhere in the visible part of this
# script -- confirm which table it refers to (possibly newannotmeth).
nonrepeatgenes.methscore <- first_per_annotation(
  dplyr::filter(genes, !grepl("serine/threonine|serine-threonine|serine_threonine|btb/poz|btb-poz|calmodulin|sel1|kelch|reverse_transcriptase|CENP-b|crinkler_family_protein|zinc_finger_mym|DNA_polymerase_delta_subunit_1|DNA_polymerase_epsilon|ATPase_aaa|retrotransposon|integrase|pif1|zinc_finger_bed_domain|PIF1|ricesleeper|helicase-primase|helicase/primase|gag-pol|far1-related|ribonuclease_hi|ribonuclease_h|ATP_dependent_DNA_helicase|jockey|rve_super_family_integrase|transposase|transposable", V7, ignore.case = TRUE))
)

# Violin plots of the meth score for each class C family (Figure 3D).
# All five plots share the same layers, collected once here: violin outline,
# jittered points, fixed 0..1 y axis, red median dot, classic theme, no legend.
classC_violin_layers <- list(
  geom_violin(fill = "gray80", colour = "black", alpha = 1),
  geom_quasirandom(method = 'tukeyDense', size = 0.3, alpha = 0.15),
  ylim(0, 1),
  stat_summary(fun = median, geom = "point", size = 3, color = "red"),
  theme_classic(),
  theme(legend.position = "none",
        plot.title = element_text(size = 11))
)

# Ser/Thr kinases. Each stanza: one-column table of scores, dummy x value
# (cat), rows marked "." dropped, score made numeric, then plotted.
repeatmeth <- data.frame(ser.thrkin.methscore$V8)
repeatmeth$cat <- 1
repeatmeth <- repeatmeth[ser.thrkin.methscore$V8 != ".", ]
repeatmeth$ser.thrkin.methscore.V8 <- as.numeric(as.character(repeatmeth$ser.thrkin.methscore.V8))
ggplot(repeatmeth, aes(x = cat, y = ser.thrkin.methscore.V8)) +
  classC_violin_layers

# Calmodulin-dependent kinases
repeatmeth <- data.frame(calmodkin.methscore$V8)
repeatmeth$cat <- 1
repeatmeth <- repeatmeth[calmodkin.methscore$V8 != ".", ]
repeatmeth$calmodkin.methscore.V8 <- as.numeric(as.character(repeatmeth$calmodkin.methscore.V8))
ggplot(repeatmeth, aes(x = cat, y = calmodkin.methscore.V8)) +
  classC_violin_layers

# Sel1
repeatmeth <- data.frame(sel1.methscore$V8)
repeatmeth$cat <- 1
repeatmeth <- repeatmeth[sel1.methscore$V8 != ".", ]
repeatmeth$sel1.methscore.V8 <- as.numeric(as.character(repeatmeth$sel1.methscore.V8))
ggplot(repeatmeth, aes(x = cat, y = sel1.methscore.V8)) +
  classC_violin_layers

# Kelch
repeatmeth <- data.frame(kelch.methscore$V8)
repeatmeth$cat <- 1
repeatmeth <- repeatmeth[kelch.methscore$V8 != ".", ]
repeatmeth$kelch.methscore.V8 <- as.numeric(as.character(repeatmeth$kelch.methscore.V8))
ggplot(repeatmeth, aes(x = cat, y = kelch.methscore.V8)) +
  classC_violin_layers

# BTB/POZ
repeatmeth <- data.frame(btb.poz.methscore$V8)
repeatmeth$cat <- 1
repeatmeth <- repeatmeth[btb.poz.methscore$V8 != ".", ]
repeatmeth$btb.poz.methscore.V8 <- as.numeric(as.character(repeatmeth$btb.poz.methscore.V8))
ggplot(repeatmeth, aes(x = cat, y = btb.poz.methscore.V8)) +
  classC_violin_layers


# Only "core" Class A genes: annotation (V7) contains ";" and is not
# hypothetical/provisional/tis13.
nonrepeatgenes.methscore.standard <- dplyr::filter(nonrepeatgenes.methscore, grepl(";", V7, ignore.case = TRUE))
nonrepeatgenes.methscore.standard2 <- dplyr::filter(nonrepeatgenes.methscore.standard, !grepl("hypothetical|provisional|tis13", V7, ignore.case = TRUE))

# Violin plot of the Class A meth scores.
repeatmeth <- data.frame(nonrepeatgenes.methscore.standard2$V8)
repeatmeth$cat <- 1
repeatmeth <- repeatmeth[nonrepeatgenes.methscore.standard2$V8 != ".", ]
# data.frame() names the score column "nonrepeatgenes.methscore.standard2.V8".
# It is read by its full name here; the previous `$` access only worked through
# partial name matching. The numeric copy keeps the shorter name so the plot's
# y-axis label is unchanged.
repeatmeth$nonrepeatgenes.methscore.standard2 <- as.numeric(as.character(repeatmeth[["nonrepeatgenes.methscore.standard2.V8"]]))
repeatmeth %>%
  ggplot(aes(x = cat, y = nonrepeatgenes.methscore.standard2)) +
  geom_violin(fill = "gray80", colour = "black", alpha = 1) +
  geom_quasirandom(method = 'tukeyDense', size = 0.3, alpha = 0.10) +
  theme_classic() +
  ylim(0, 1) +
  stat_summary(fun = median, geom = "point", size = 3, color = "red") +
  theme(legend.position = "none",
        plot.title = element_text(size = 11))

# Only "unknown/orphan" Class B genes: annotation (V7) either has no ";" or
# says hypothetical/provisional/tis13. The two selections are stacked as-is.
nonrepeatgenes.methscore.unknown <- dplyr::filter(
  nonrepeatgenes.methscore,
  !grepl(";", V7, ignore.case = TRUE)
)
nonrepeatgenes.methscore.unknown2 <- dplyr::filter(
  nonrepeatgenes.methscore,
  grepl("hypothetical|provisional|tis13", V7, ignore.case = TRUE)
)
nonrepeatgenes.methscore.unknown3 <- rbind(
  nonrepeatgenes.methscore.unknown,
  nonrepeatgenes.methscore.unknown2
)

# Reload the Class A and Class B tables from disk; this replaces the objects
# of the same name computed above.
nonrepeatgenes.methscore.standard2 <- read.table("classAgenes.bed", header = FALSE, sep = "\t", stringsAsFactors = FALSE, quote = "")
nonrepeatgenes.methscore.unknown3 <- read.table("classBgenes.bed", header = FALSE, sep = "\t", stringsAsFactors = FALSE, quote = "") # All class B (orphan) genes


# Splitting repetitive vs non-repetitive orphan genes: a Class B gene is HCN
# when its V7 appears in column V12 of the multi-copy orphan list.
is_hcn_orphan <- nonrepeatgenes.methscore.unknown3$V7 %in% HCN_orphan_list$V12
HCN_orphan_meth <- nonrepeatgenes.methscore.unknown3[is_hcn_orphan, , drop = FALSE] # 4039 HCN orphans (ClassB repetitive)
LCN_orphan_meth <- nonrepeatgenes.methscore.unknown3[!is_hcn_orphan, , drop = FALSE] # 7645 LCN orphans (ClassB non-repetitive)

# Violin plots of the meth score for Class B genes: all, HCN only, LCN only.
# The three plots share these layers: violin outline, jittered points, classic
# theme, fixed 0..1 y axis, red median dot, no legend.
classB_violin_layers <- list(
  geom_violin(fill = "gray80", colour = "black", alpha = 1),
  geom_quasirandom(method = 'tukeyDense', size = 0.3, alpha = 0.10),
  theme_classic(),
  ylim(0, 1),
  stat_summary(fun = median, geom = "point", size = 3, color = "red"),
  theme(legend.position = "none",
        plot.title = element_text(size = 11))
)

# In each stanza data.frame() names the score column "<object>.V8". It is read
# by that full name (the previous `$` access relied on partial name matching);
# the numeric copy keeps the shorter name so the y-axis label is unchanged.

# Plot for all Class B genes
repeatmeth <- data.frame(nonrepeatgenes.methscore.unknown3$V8)
repeatmeth$cat <- 1
repeatmeth <- repeatmeth[nonrepeatgenes.methscore.unknown3$V8 != ".", ]
repeatmeth$nonrepeatgenes.methscore.unknown3 <- as.numeric(as.character(repeatmeth[["nonrepeatgenes.methscore.unknown3.V8"]]))
repeatmeth %>%
  ggplot(aes(x = cat, y = nonrepeatgenes.methscore.unknown3)) +
  classB_violin_layers

# Plot for High copy number Class B genes
repeatmeth <- data.frame(HCN_orphan_meth$V8)
repeatmeth$cat <- 1
repeatmeth <- repeatmeth[HCN_orphan_meth$V8 != ".", ]
repeatmeth$HCN_orphan_meth <- as.numeric(as.character(repeatmeth[["HCN_orphan_meth.V8"]]))
repeatmeth %>%
  ggplot(aes(x = cat, y = HCN_orphan_meth)) +
  classB_violin_layers

# Plot for Low copy number Class B genes
repeatmeth <- data.frame(LCN_orphan_meth$V8)
repeatmeth$cat <- 1
repeatmeth <- repeatmeth[LCN_orphan_meth$V8 != ".", ]
repeatmeth$LCN_orphan_meth <- as.numeric(as.character(repeatmeth[["LCN_orphan_meth.V8"]]))
repeatmeth %>%
  ggplot(aes(x = cat, y = LCN_orphan_meth)) +
  classB_violin_layers

# Non-parametric test: build one long table (methscore, group) from the
# per-family score vectors.

# Wrap a score vector as a data frame with the scores in ascending order
# (column methscore) and a constant group label (column group).
as_score_group <- function(scores, label) {
  tbl <- data.frame(scores)
  names(tbl)[1] <- "methscore"
  tbl$methscore <- tbl[order(tbl$methscore), ]
  tbl$group <- label
  tbl
}

test <- as_score_group(ser.thrkin.methscore$V8, 'serthr.kin')
test2 <- as_score_group(calmodkin.methscore$V8, 'calm.kin')
test5 <- as_score_group(sel1.methscore$V8, 'sel1')
test6 <- as_score_group(kelch.methscore$V8, 'kelch')
test7 <- as_score_group(btb.poz.methscore$V8, 'btb.poz')

# HCN orphan genes (Class B genes, HCN)
test8 <- as_score_group(HCN_orphan_meth$V8, 'HCN.unknowngenes')

# LCN orphan genes (Class B genes, LCN)
test9 <- as_score_group(LCN_orphan_meth$V8, 'LCN.unknowngenes')

# Class A genes: recomputed from nonrepeatgenes.methscore (annotation contains
# ";" and is not hypothetical/provisional/tis13).
nonrepeatgenes.methscore.standard <- dplyr::filter(
  nonrepeatgenes.methscore,
  grepl(";", V7, ignore.case = TRUE)
)
nonrepeatgenes.methscore.standard2 <- dplyr::filter(
  nonrepeatgenes.methscore.standard,
  !grepl("hypothetical|provisional|tis13", V7, ignore.case = TRUE)
)
test10 <- as_score_group(nonrepeatgenes.methscore.standard2$V8, 'nonrepeat.genes.standard')

# Stack all groups, drop "." rows and make the score numeric.
newdf <- rbind(test, test2, test5, test6, test7, test8, test9, test10)
newdf <- newdf[newdf$methscore != ".", ]
newdf$methscore <- as.numeric(as.character(newdf$methscore))

# Per-group descriptive statistics.
newdf %>%
  group_by(group) %>%
  summarise(
    count = n(),
    mean = mean(methscore, na.rm = TRUE),
    sd = sd(methscore, na.rm = TRUE),
    median = median(methscore, na.rm = TRUE),
    IQR = IQR(methscore, na.rm = TRUE)
  )

# Kruskal-Wallis H: do the meth scores differ between groups?
kruskal.test(methscore ~ group, data = newdf)
# ser.thr : p-value < 2.2e-16

# Multiple pairwise wilcoxon rank, Benjamini-Hochberg adjusted.
pairwise.wilcox.test(newdf$methscore, newdf$group,
                     p.adjust.method = "BH")
# Recorded output:
#                            btb.poz calm.kin HCN.unknowngenes kelch   LCN.unknowngenes nonrepeat.genes.standard sel1   
#   calm.kin                 0.07884 -        -                -       -                -                        -      
#   HCN.unknowngenes         < 2e-16 1.5e-14  -                -       -                -                        -      
#   kelch                    0.33970 0.33970  < 2e-16          -       -                -                        -      
#   LCN.unknowngenes         3.7e-14 5.0e-05  < 2e-16          2.2e-09 -                -                        -      
#   nonrepeat.genes.standard 0.32552 0.00120  < 2e-16          0.02685 < 2e-16          -                        -      
#   sel1                     2.7e-08 0.00061  1.4e-10          6.1e-06 0.07884          < 2e-16                  -      
#   serthr.kin               9.7e-05 0.20354  < 2e-16          0.00940 9.3e-16          < 2e-16                  0.00081

# Dunn's post-hoc test, Benjamini-Hochberg adjusted.
library(FSA)
dunnTest(methscore ~ group,
         data = newdf,
         method = "bh")
# Recorded output:
#                                 Comparison           Z       P.unadj         P.adj
# 1                           btb.poz - calm.kin  -1.6912913  9.078119e-02  1.105162e-01
# 2                   btb.poz - HCN.unknowngenes -11.8293384  2.753029e-32  1.927121e-31
# 3                  calm.kin - HCN.unknowngenes  -7.3664754  1.751983e-13  5.450613e-13
# 4                              btb.poz - kelch  -0.8416309  3.999946e-01  4.148092e-01
# 5                             calm.kin - kelch   0.8941899  3.712203e-01  3.997757e-01
# 6                     HCN.unknowngenes - kelch   9.8589005  6.273256e-23  2.927520e-22
# 7                   btb.poz - LCN.unknowngenes  -7.8519322  4.096760e-15  1.433866e-14
# 8                  calm.kin - LCN.unknowngenes  -4.0947188  4.226810e-05  7.890046e-05
# 9          HCN.unknowngenes - LCN.unknowngenes  10.0171255  1.281766e-23  7.177888e-23
# 10                    kelch - LCN.unknowngenes  -6.1236192  9.147338e-10  2.134379e-09
# 11          btb.poz - nonrepeat.genes.standard   0.9111994  3.621903e-01  4.056532e-01
# 12         calm.kin - nonrepeat.genes.standard   2.8210464  4.786728e-03  7.446021e-03
# 13 HCN.unknowngenes - nonrepeat.genes.standard  30.5726741 2.825762e-205 7.912135e-204
# 14            kelch - nonrepeat.genes.standard   1.9506826  5.109482e-02  6.502977e-02
# 15 LCN.unknowngenes - nonrepeat.genes.standard  24.7253956 5.703856e-135 7.985399e-134
# 16                              btb.poz - sel1  -4.8425823  1.281625e-06  2.563250e-06
# 17                             calm.kin - sel1  -2.5871733  9.676692e-03  1.354737e-02
# 18                     HCN.unknowngenes - sel1   4.9192783  8.686388e-07  1.870914e-06
# 19                                kelch - sel1  -3.8208567  1.329889e-04  2.327306e-04
# 20                     LCN.unknowngenes - sel1   0.9657116  3.341885e-01  3.898866e-01
# 21             nonrepeat.genes.standard - sel1  -7.3297705  2.305472e-13  5.868475e-13
# 22                        btb.poz - serthr.kin  -3.4401304  5.814340e-04  9.576561e-04
# 23                       calm.kin - serthr.kin  -0.8084940  4.188062e-01  4.188062e-01
# 24               HCN.unknowngenes - serthr.kin  13.8405432  1.451060e-43  1.354322e-42
# 25                          kelch - serthr.kin  -2.1751367  2.961988e-02  3.949317e-02
# 26               LCN.unknowngenes - serthr.kin   7.3449961  2.057651e-13  5.761424e-13
# 27       nonrepeat.genes.standard - serthr.kin  -8.3553394  6.524485e-17  2.609794e-16
# 28                           sel1 - serthr.kin   2.7044080  6.842621e-03  1.008386e-02















######################################
# Identity of TEs closest to kinases # (Figure 4A)
######################################

# Make a TE annotation that doesn't contain Simple_repeat, Unknown,
# Low_complexity, Satellite. Each step removes the rows whose V4 matches one
# label; the object named after a label holds the table *after* that label has
# been removed.
without_label <- function(tbl, label) {
  dplyr::filter(tbl, !grepl(label, V4))
}

TElist <- read.table("Rir_HGAP_ii_V2.fa.out.bed", header = FALSE, sep = "\t", stringsAsFactors = FALSE, quote = "")
TElist$count <- 1
Simple_repeat <- without_label(TElist, "Simple_repeat")
Unknown <- without_label(Simple_repeat, "Unknown")
Low_complexity <- without_label(Unknown, "Low_complexity")
Satellite <- without_label(Low_complexity, "Satellite")
write.table(
  Satellite,
  file = paste(out_dir, "Rir_HGAP_ii_V2.fa.out.TEONLY.bed", sep = "/"),
  sep = "\t",
  quote = FALSE,
  row.names = FALSE,
  col.names = FALSE
)

# Bedtools closest for class B genes: export the gene tables, find the nearest
# TE for every gene outside R, then load the results.
write.table(nonrepeatgenes.methscore.unknown3, file.path(out_dir, "classBgenes.bed"), col.names = FALSE, quote = FALSE, sep = "\t", row.names = FALSE)
write.table(HCN_orphan_meth, file.path(out_dir, "classB_HCNgenes.bed"), col.names = FALSE, quote = FALSE, sep = "\t", row.names = FALSE)
write.table(LCN_orphan_meth, file.path(out_dir, "classB_LCNgenes.bed"), col.names = FALSE, quote = FALSE, sep = "\t", row.names = FALSE)
# Shell steps (run outside R before the read.table() calls; not R code):
#   sort -k1,1 -k2,2n classB_HCNgenes.bed > classB_HCNgenes.sorted.bed
#   sort -k1,1 -k2,2n classB_LCNgenes.bed > classB_LCNgenes.sorted.bed
#   bedtools closest -k 1 -d -a classB_HCNgenes.sorted.bed -b Rir_HGAP_ii_V2.fa.out.TEONLY.sorted.bed > classB_HCN.closestTE.bed
#   bedtools closest -k 1 -d -a classB_LCNgenes.sorted.bed -b Rir_HGAP_ii_V2.fa.out.TEONLY.sorted.bed > classB_LCN.closestTE.bed
# NOTE(review): the commands that produce Rir_HGAP_ii_V2.fa.out.TEONLY.sorted.bed
# and classAgenes.closestTE.bed are not shown in this part of the script.

# Load
clos.classB_HCN <- read.table("classB_HCN.closestTE.bed", header = FALSE, sep = "\t", stringsAsFactors = FALSE, quote = "")
clos.classB_LCN <- read.table("classB_LCN.closestTE.bed", header = FALSE, sep = "\t", stringsAsFactors = FALSE, quote = "")
clos.class1 <- read.table("classAgenes.closestTE.bed", header = FALSE, sep = "\t", stringsAsFactors = FALSE, quote = "") # 8491

# For each gene class: count the distinct genes (V7) per closest-TE label
# (V12), express each count as a percentage of the class total, and compute the
# mean gene-to-TE distance (V14) per label.
# The percentage denominators are taken from the computed totals instead of
# hard-coded literals; the numbers in the trailing comments are the values
# recorded when the analysis was run.

# Count distinct genes (V7) per closest-TE label (V12) for CLASSB_HCN GENES
DT <- data.table(clos.classB_HCN)
clos.classb.hcn.list <- DT[, .(number_of_distinct_orders = length(unique(V7))), by = V12]
clos.classb.hcn.list <- clos.classb.hcn.list %>% arrange(desc(number_of_distinct_orders))
mean(clos.classB_HCN$V14) # 2657.279
classb_hcn_total <- sum(clos.classb.hcn.list$number_of_distinct_orders) # 4254
classb_hcn_total
clos.classb.hcn.list$norm <- clos.classb.hcn.list$number_of_distinct_orders / classb_hcn_total * 100
meanclassbhcn <- clos.classB_HCN %>%
  group_by(V12) %>%
  summarise(
    Mean_dist = mean(V14)
  )

# Count distinct genes (V7) per closest-TE label (V12) for CLASSB_LCN GENES
DT <- data.table(clos.classB_LCN)
clos.classb.lcn.list <- DT[, .(number_of_distinct_orders = length(unique(V7))), by = V12]
clos.classb.lcn.list <- clos.classb.lcn.list %>% arrange(desc(number_of_distinct_orders))
mean(clos.classB_LCN$V14) # 2694.034
classb_lcn_total <- sum(clos.classb.lcn.list$number_of_distinct_orders) # 7937
classb_lcn_total
clos.classb.lcn.list$norm <- clos.classb.lcn.list$number_of_distinct_orders / classb_lcn_total * 100
meanclassblcn <- clos.classB_LCN %>%
  group_by(V12) %>%
  summarise(
    Mean_dist = mean(V14)
  )

# Count distinct genes (V7) per closest-TE label (V12) for CLASS A GENES
DT <- data.table(clos.class1)
clos.class1.list <- DT[, .(number_of_distinct_orders = length(unique(V7))), by = V12]
clos.class1.list <- clos.class1.list %>% arrange(desc(number_of_distinct_orders))
mean(clos.class1$V14) # 3356.339
class1_total <- sum(clos.class1.list$number_of_distinct_orders) # 8319
class1_total
clos.class1.list$norm <- clos.class1.list$number_of_distinct_orders / class1_total * 100
meanclass1 <- clos.class1 %>%
  group_by(V12) %>%
  summarise(
    Mean_dist = mean(V14)
  )

# STATS: e.g. compare the number of MULEs, Gypsy and CMC (top 3) vs. all other
# TEs next to class B HCN genes with the same split next to Class A genes.
# CLASS B_HCN VS CLASS A -- totals used as denominators below.
sum(clos.classb.hcn.list[["number_of_distinct_orders"]]) # 4254
sum(clos.class1.list[["number_of_distinct_orders"]]) # 8319
# Fisher's exact tests, Class B HCN vs Class A, one TE family at a time.
# Layout of `dat`: row 1 = Class B HCN, row 2 = Class A (non-repetitive genes);
# column a = genes counted for the TE family, column b = all remaining genes
# (class total minus a; totals are 4254 and 8319).

# mule
# classB_hcn     non-repgenes (Class A)
a <- c(490, 891)
4254-490 #= 3764
b <- c(3764, 7428)
8319-891 #= 7428
dat <- data.frame(a, b)
chisq.test(dat)$expected
test <- fisher.test(dat)
test
test$p.value
# 0.175007

# gypsy     non-repgenes
a <- c(607, 1158)
b <- c(3647, 7161)
dat <- data.frame(a, b)
test <- fisher.test(dat)
test
test$p.value
# 0.606233

# CMC     non-repgenes
a <- c(310, 460)
b <- c(3944, 7859)
dat <- data.frame(a, b)
test <- fisher.test(dat)
test
test$p.value
# 0.0001344124

# CLASS B_LCN VS CLASS A -- totals used as denominators below.
sum(clos.classb.lcn.list[["number_of_distinct_orders"]]) # 7937
sum(clos.class1.list[["number_of_distinct_orders"]]) # 8319
# Fisher's exact tests, Class B LCN vs Class A, one TE family at a time.
# Layout of `dat`: row 1 = Class B LCN, row 2 = Class A (non-repetitive genes);
# column a = genes counted for the TE family, column b = all remaining genes
# (class total minus a; totals are 7937 and 8319).

# classB_lcn     non-repgenes (Class A)
a <- c(1055, 891)
7937-1055 #= 6882
b <- c(6882, 7428)
8319-891 #= 7428
dat <- data.frame(a, b)
chisq.test(dat)$expected
test <- fisher.test(dat)
test
test$p.value
# 4.305088e-07

# gypsy     non-repgenes
a <- c(1004, 1158)
b <- c(6933, 7161)
dat <- data.frame(a, b)
test <- fisher.test(dat)
test
test$p.value
# 0.01733098

# CMC     non-repgenes
a <- c(456, 460)
b <- c(7481, 7859)
dat <- data.frame(a, b)
test <- fisher.test(dat)
test
test$p.value
# 0.5630738

# Plot enrichment (lower panel of Figure 4A).
# Rename the TE-label column of each list so the two stay distinguishable once
# placed side by side.
names(clos.classb.hcn.list)[1] <- "class2.hcn"
names(clos.classb.lcn.list)[1] <- "class2.lcn"

# First ten rows of each list, side by side; the raw count columns are
# dropped so only the TE labels and normalised percentages remain.
top10 <- data.frame(clos.classb.hcn.list[1:10, ], clos.classb.lcn.list[1:10, ])
count_columns <- c(
  "number_of_distinct_orders",
  paste0("number_of_distinct_orders.", 1:6)
)
top10[intersect(count_columns, names(top10))] <- NULL

write.table(
  top10,
  file = paste(out_dir, "top10_reb1.txt", sep = "/"),
  sep = "\t",
  quote = FALSE,
  row.names = FALSE,
  col.names = TRUE
)
# The exported table was edited in Excel and saved as top10_reb1.mod.txt.
top33 <- read.table("top10_reb1.mod.txt", header = TRUE, sep = "\t", stringsAsFactors = FALSE, quote = "")

# One bar per TE, one facet per group, x labels rotated to stay readable.
ggplot(data = top33, aes(x = TE, y = norm)) +
  geom_bar(stat = "identity") +
  facet_wrap(. ~ group) +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 90))


########################################
# Distance plots Figure 4A top
########################################

library("ggbeeswarm")
# NOTE(review): no ggbeeswarm geom is used in this section (only geom_violin
# and stat_summary); the library call is kept in case other code relies on it.

# Rows of a closest-TE table whose TE annotation (column V12) matches
# `te_pattern` (a regular expression passed to grepl()).
closest_te_rows <- function(closest, te_pattern) {
  dplyr::filter(closest, grepl(te_pattern, V12))
}

# Violin plot of log10(V14 + 1) for one set of rows, with the median drawn as
# a red point. V14 is assumed to hold the gene-to-TE distance -- TODO confirm.
# NOTE(review): ylim(0, 5) discards values above 5 (V14 > 99999) before the
# density and the median are computed; coord_cartesian(ylim = c(0, 5)) would
# zoom without dropping data -- confirm which behaviour is intended.
plot_te_distance <- function(closest_te) {
  closest_te$cat <- 1 # single dummy x position so one violin is drawn
  ggplot(closest_te, aes(x = cat, y = log10(V14 + 1))) +
    geom_violin(fill = "gray80", colour = "black", alpha = 1) +
    stat_summary(fun = median, geom = "point", size = 1, color = "red") +
    theme_classic() +
    ylim(0, 5) +
    theme(
      legend.position = "none",
      plot.title = element_text(size = 11)
    )
}

# Kruskal-Wallis rank sum test of V14 between Class B rows (group 1) and
# Class A rows (group 2) for the same TE order.
compare_te_distance <- function(class_b, class_a) {
  class_b$group <- 1
  class_a$group <- 2
  keep <- c("V12", "V14", "group")
  merged <- rbind(class_b[, keep], class_a[, keep])
  kruskal.test(V14 ~ group, data = merged)
}

# Class A (non-repetitive genes) reference rows, one set per TE order.
class_a_mule <- closest_te_rows(clos.class1, "MULE")
class_a_gypsy <- closest_te_rows(clos.class1, "Gypsy")
class_a_cmc <- closest_te_rows(clos.class1, "CMC")

# The p-values in the trailing comments are the values recorded when the
# analysis was originally run.

#### Class B HCN: distance to nearby MULE
hcn_mule_dist <- closest_te_rows(clos.classB_HCN, "MULE")
plot_te_distance(hcn_mule_dist)
compare_te_distance(hcn_mule_dist, class_a_mule) # p-value = 8.654e-06

#### Class B HCN: distance to nearby Gypsy
hcn_gypsy_dist <- closest_te_rows(clos.classB_HCN, "Gypsy")
plot_te_distance(hcn_gypsy_dist)
compare_te_distance(hcn_gypsy_dist, class_a_gypsy) # p-value = 0.000234

#### Class B HCN: distance to nearby CMC
hcn_cmc_dist <- closest_te_rows(clos.classB_HCN, "CMC")
plot_te_distance(hcn_cmc_dist)
compare_te_distance(hcn_cmc_dist, class_a_cmc) # p-value = 0.02986

#### Class B LCN: distance to nearby MULE
lcn_mule_dist <- closest_te_rows(clos.classB_LCN, "MULE")
plot_te_distance(lcn_mule_dist)
compare_te_distance(lcn_mule_dist, class_a_mule) # p-value = 3.766e-07

#### Class B LCN: distance to nearby Gypsy
lcn_gypsy_dist <- closest_te_rows(clos.classB_LCN, "Gypsy")
plot_te_distance(lcn_gypsy_dist)
# NOTE(review): recorded as "p-value = < 0.000234", the same digits as the
# HCN Gypsy result above -- re-run to confirm the actual value.
compare_te_distance(lcn_gypsy_dist, class_a_gypsy) # p-value < 0.000234

#### Class B LCN: distance to nearby CMC
lcn_cmc_dist <- closest_te_rows(clos.classB_LCN, "CMC")
plot_te_distance(lcn_cmc_dist)
compare_te_distance(lcn_cmc_dist, class_a_cmc) # p-value = 0.0733


library(FSA)

# Stack the per-category tables into one data frame. The Dunn test below
# reads its `methscore` and `group` columns.
# NOTE(review): `test` must be the data frame defined earlier in the script,
# not a hypothesis-test result -- confirm it has not been reassigned since.
newdf <- rbind(
  test, test2, test5, test6, test7, test8, test9
)

# Dunn's pairwise comparisons of methscore between groups, with p-values
# adjusted by the "bh" method.
dunnTest(methscore ~ group, data = newdf, method = "bh")










