

# Use an annotation that doesn't contain genes with transposon-related protein domains.
# Input file: Clemannot_with_Maedafunction_noTEproteindomain_final.bed
# (kept as a comment: a bare file name on its own line is evaluated by R as an
# object name and stops the script with "object not found").

# Tab-separated, no header; quote = "" disables quote handling so that quote
# characters inside the annotation text are read literally.
clemannot_funct3 <- read.table(
  "Clemannot_with_Maedafunction_noTEproteindomain_final.bed",
  header = FALSE, sep = "\t", stringsAsFactors = FALSE, quote = ""
)

# Find the distance of the closest upstream gene, irrespective of the orientation of the gene.
# Modify the BED table to set all genes on the positive strand.
# NOTE(review): this assumes V5 is the strand column of this file — standard BED
# keeps the strand in column 6, so confirm against the input layout.
clemannot_funct3$V5 <- "+"

# Write the re-stranded table (no header, no row names, no quoting) into out_dir.
write.table(
  clemannot_funct3,
  file.path(out_dir, "Clemannot_with_Maedafunction_withoutTEs_Stranded.bed"),
  col.names = FALSE, quote = FALSE, sep = "\t", row.names = FALSE
)

# The commands below are run in a shell, outside R (sort, then bedtools closest).
# They produce twospeedtest5end.bed and twospeedtest3end.bed, which are read in next.
# sort -k1,1 -k2,2n Clemannot_with_Maedafunction.bed > Clemannot_with_Maedafunction_sorted.bed
# sort -k1,1 -k2,2n Clemannot_with_Maedafunction_withoutTEs_Stranded.bed > Clemannot_with_Maedafunction_withoutTEs_Stranded_sorted.bed
# bedtools closest -a Clemannot_with_Maedafunction_withoutTEs_Stranded_sorted.bed -b Clemannot_with_Maedafunction_withoutTEs_Stranded_sorted.bed -io -id -D a -k 1 > twospeedtest5end.bed
# bedtools closest -a Clemannot_with_Maedafunction_withoutTEs_Stranded_sorted.bed -b Clemannot_with_Maedafunction_withoutTEs_Stranded_sorted.bed -io -iu -D a -k 1 > twospeedtest3end.bed

# Load the two closest-gene tables and reduce each one to the columns used
# downstream: V7 (the key the tables are later merged on) and V15 (the distance).

# ---- 5' side ----
twospeedtest5end <- read.table(
  file = "twospeedtest5end.bed",
  sep = "\t", quote = "", header = FALSE, stringsAsFactors = FALSE
)
# Rows where no neighbouring gene was reported carry a dot in V8; drop them.
twospeedtest5end <- twospeedtest5end[!(twospeedtest5end$V8 == "."), ]
twospeedtest5end_sub <- twospeedtest5end[, c("V7", "V15"), drop = FALSE]
# Flip the sign of the 5' distances (presumably reported as negative values for
# upstream neighbours — confirm), so that they can be log-transformed later.
twospeedtest5end_sub$V15 <- -twospeedtest5end_sub$V15

# ---- 3' side ----
twospeedtest3end <- read.table(
  file = "twospeedtest3end.bed",
  sep = "\t", quote = "", header = FALSE, stringsAsFactors = FALSE
)
# Same filter as above: a dot in V8 marks a gene with no neighbour on this side.
twospeedtest3end <- twospeedtest3end[!(twospeedtest3end$V8 == "."), ]
twospeedtest3end_sub <- twospeedtest3end[, c("V7", "V15"), drop = FALSE]

# Histogram of the 3' distances (column V15), using 25-unit-wide bins and a
# centred plot title.
ggplot(data = twospeedtest3end, mapping = aes(x = V15)) +
  geom_histogram(binwidth = 25, fill = "white", color = "black") +
  labs(
    x = "Distance",
    y = "Number of genes",
    title = "3'end distance distribution"
  ) +
  theme_classic() +
  theme(plot.title = element_text(hjust = 0.5))

# Merge the 5' and 3' distance tables by ID (column V7).
# merge() suffixes the two distance columns: V15.x comes from the 5' table and
# V15.y from the 3' table. Only IDs present in both tables are kept.
# NOTE(review): if V7 values are not unique within a table, merge() returns
# every pairwise combination for the duplicated key — confirm V7 is unique.
twospeeddf <- merge(twospeedtest5end_sub, twospeedtest3end_sub, by = "V7")

# Save the merged table (tab-separated, no header, no row names, no quoting).
write.table(
  twospeeddf,
  file.path(out_dir, "twospeeddf.txt"),
  col.names = FALSE, quote = FALSE, sep = "\t", row.names = FALSE
)

# Plot the 5' and 3' distances against each other as a 2D-binned heatmap.
# V15.x is the 5' distance and V15.y is the 3' distance; both are shown on a
# log2 scale, so a distance of 0 becomes -Inf and is dropped from the plot.
ggplot(twospeeddf, aes(x = log2(V15.x), y = log2(V15.y))) +
  xlab("5' ITL (bp)") +  # closing parenthesis was missing in the label
  ylab("3' ITL (bp)") +
  geom_bin2d(bins = 40) +
  theme_classic()

# Assign every gene in twospeeddf to a class, based on the annotation text in V7:
#   classB - repeated genes: V7 matches one of the ser/thr, serine/threonine,
#            calmodulin, btb/poz, kelch or sel1 keywords (case-insensitive)
#   classC - unknown genes: V7 matches hypothetical/provisional/tis13, or V7
#            contains no ";" (presumably an entry without a function
#            description — confirm against the annotation format)
#   classA - every remaining gene
# The gene counts in the trailing comments are the ones recorded by the author.
repgenes <- dplyr::filter(twospeeddf, grepl("ser/thr|Serine/threonine|calmodulin|serine/threonine|serine-threonine|serine_threonine|btb/poz|btb-poz|kelch|sel1", V7, ignore.case = TRUE)) # 3698 genes
repgenes$group <- "classB"

unknowngenes.a <- dplyr::filter(twospeeddf, grepl("hypothetical|provisional|tis13", V7, ignore.case = TRUE)) # 7267 genes
unknowngenes.b <- dplyr::filter(twospeeddf, !grepl(";", V7, ignore.case = TRUE)) # 2838 genes
# A gene can satisfy both unknown-gene criteria; those duplicated rows are
# removed by unique() when the classes are bound together below.
unknowngenes <- rbind(unknowngenes.a, unknowngenes.b)
unknowngenes$group <- "classC"

# Group repgenes and unknowngenes together.
rep.unknowngenes <- rbind(repgenes, unknowngenes)

# Anti join: remove unknown and repeated genes from the whole list, so that only
# class A genes are left. The join keys are spelled out (they are the columns
# shared by both tables) instead of relying on the implicit natural join.
nonrepgenes <- dplyr::anti_join(
  twospeeddf, rep.unknowngenes,
  by = c("V7", "V15.x", "V15.y")
)
nonrepgenes$group <- "classA" # 7969 genes

# Bind class A, B and C together and drop fully duplicated rows.
# NOTE(review): a gene whose annotation matches both the repeated-gene and the
# unknown-gene keywords is kept twice (once as classB, once as classC), because
# unique() only removes rows that are identical in every column, group included.
# NOTE(review): the comment previously attached to this line read
# "n=1785 TEs we have methylation Data for", which does not appear to describe
# this table — confirm and remove.
twospeeddf_group <- unique(rbind(unknowngenes, nonrepgenes, repgenes))

# Plot the 5' vs 3' distance distribution (log2 scale) for all genes, i.e. for
# classes A, B and C together.
# The table is passed to ggplot() unfiltered. The former call
# subset(twospeeddf_group, group=classA) did not filter anything either:
# `group=classA` is not a filter condition, it is absorbed by `...` and ignored.
ggplot(twospeeddf_group, mapping = aes(x = log2(V15.x), y = log2(V15.y))) +
  geom_bin2d(bins = 70) +
  theme_classic() +
  xlim(0, 20) +
  ylim(0, 20) +
  scale_fill_gradient(low = "gray90", high = "gray20")


# Class A genes only: 2D-binned plot of the log2 5' distance (V15.x) against the
# log2 3' distance (V15.y), with both axes limited to 0-20.
classA <- dplyr::filter(twospeeddf_group, group == "classA")
ggplot(data = classA, mapping = aes(x = log2(V15.x), y = log2(V15.y))) +
  theme_classic() +
  lims(x = c(0, 20), y = c(0, 20)) +
  scale_fill_gradient(low = "lightgoldenrod1", high = "chocolate1") +
  geom_bin2d(bins = 70)

# Class B genes only: 2D-binned plot of the log2 5' distance (V15.x) against the
# log2 3' distance (V15.y), with both axes limited to 0-20.
classB <- dplyr::filter(twospeeddf_group, group == "classB")
ggplot(data = classB, mapping = aes(x = log2(V15.x), y = log2(V15.y))) +
  theme_classic() +
  lims(x = c(0, 20), y = c(0, 20)) +
  scale_fill_gradient(low = "lightsteelblue1", high = "mediumblue") +
  geom_bin2d(bins = 70)


# Class C genes only: 2D-binned plot of the log2 5' distance (V15.x) against the
# log2 3' distance (V15.y), with both axes limited to 0-20.
classC <- dplyr::filter(twospeeddf_group, group == "classC")
ggplot(data = classC, mapping = aes(x = log2(V15.x), y = log2(V15.y))) +
  theme_classic() +
  lims(x = c(0, 20), y = c(0, 20)) +
  scale_fill_gradient(low = "mistyrose1", high = "brown2") +
  geom_bin2d(bins = 70)



