### Script for manuscript ChIP plots

# Every relative path used below (input tables, "multiplot.R", "Rplots/...")
# is resolved against this working directory.
# NOTE(review): machine-specific absolute path -- adjust before running elsewhere.
setwd("C:/Users/Jon/Google Drive/PhD/Manuscript")

library(readr)
library(ggplot2)
library(RColorBrewer)
library(dplyr)
library(tidyr)
library(wesanderson)
library(viridis)
library(splitstackshape)
library(bedr)
library(plotly)
library(UpSetR)
library(VennDiagram)
source("multiplot.R")


#####################################################################################
## To select the blacklisted regions based on WT Input
#####################################################################################
# Read the WT input coverage track (4-column bedGraph: Chr, Start, End, CPM).
# "/" entries are read as NA.
covWT<-read.table("coverage_WT.bedGraph", header = FALSE, sep="\t", 
                  na.strings=c("/"), stringsAsFactors = FALSE, 
                  col.names = c("Chr","Start","End","CPM"))
# Discard the "Pt" and "Mt" chromosomes (presumably plastid / mitochondrion).
covWT<-covWT[covWT$Chr!="Pt",]
covWT<-covWT[covWT$Chr!="Mt",]
# Explicit print() so the histogram is also drawn when the script is source()d.
print(ggplot(covWT, aes(CPM)) +
  geom_histogram(binwidth = 0.1))

# Keep bins with CPM > 5. The is.na() guard prevents rows with a missing CPM
# from being returned as all-NA rows by logical subsetting.
covWT<-covWT[!is.na(covWT$CPM) & covWT$CPM>5,]
# Sort by decreasing CPM. dplyr::order_by() is a window-function helper and
# does not sort a data frame, so base order() is used instead.
covWT<-covWT[order(covWT$CPM, decreasing = TRUE),]

###########################################################################################
### To plot the distribution of peaks in genomic features
###########################################################################################

### for individual replicates

# Per-replicate peak annotation counts. Total is the summed Count of each
# Sample and is repeated on every row of that sample.
table <- read.delim("annotated_peaks_stats_ind.txt", header = TRUE, sep = "")
table <- mutate(group_by(table, Sample), Total = sum(Count))
table <- group_by(table, Sample, Annotations)

# Display order of the replicates on the x axis.
samples <- c("MBD1_rep1", "MBD1_rep2", "MBD2_rep1", "MBD2_rep2",
             "MBD4_rep1", "MBD4_rep2", "MBD5_rep1", "MBD5_rep2",
             "MBD6_rep1", "MBD6_rep2", "SUVH1_rep1", "SUVH1_rep2",
             "SUVH3_rep1", "SUVH3_rep2")

# Factor levels run from "Intergenic" to "TE" (the reverse of the TE-first
# listing) and fix the stacking order of the features.
table$Annotations <- factor(
  table$Annotations,
  levels = c("Intergenic", "TTS", "Intron", "Exon", "Promoter",
             "TE_TTS", "TE_gene", "TE_prom", "TE")
)
table$Sample <- factor(table$Sample, levels = samples)

# Proportion-stacked bar chart of peak counts per genomic feature.
#
# table: data frame with columns Sample, Count, Annotations and Total
#        (Total is written once above each bar, at y = 1.05).
# Returns the ggplot object.
plot.annot.distribution<-function(table) {
  # Total is repeated on every (Sample, Annotations) row; keep one label per
  # sample so the text is not drawn several times on top of itself.
  totals<-unique(data.frame(Sample=table$Sample, Total=table$Total))
  
  plot<-ggplot(table, aes(Sample, Count, fill=Annotations)) +
    geom_bar(stat="identity", position="fill", colour="black", show.legend = TRUE) +
    # "none" replaces the deprecated guides(colour = FALSE)
    theme_bw() + guides(colour = "none") +
    labs(title = paste("Distribution of peaks by genomic feature"), 
         x="",y="", fill="Feature") +
    scale_fill_manual(values = c("TE"="#000000","TE_prom"="#373738","TE_gene"="#636466",
                                 "TE_TTS"="#898a8c","Promoter"="#94efcb", "Exon"="#66e8e8",
                                 "Intron"="#51d199", "TTS"="#90baf9", "Intergenic"="#ddc8c3")) + 
    theme(axis.text.x = element_text(color="black", size=8, angle=90, vjust=0.5), 
          panel.grid = element_blank(), axis.ticks.x=element_blank()) + 
    annotate("text", x=totals$Sample, y=1.05, label=totals$Total, size = 5)
  
  plot
}  

# Explicit print(): a ggplot object is only auto-printed at top level, so
# without it the PDF stays empty when the script is run through source().
pdf("C:/Users/Jon/Google Drive/PhD/Papers/2020_mC_readers/Figures/distribution_annotations_individual.pdf", width = 10)
print(plot.annot.distribution(table))
dev.off()

### for merged samples

# Peak annotation counts for the merged samples; Total is the per-sample sum
# of Count.
table2<-read.delim("annotated_peaks_stats_merged.txt", header = TRUE, sep="") %>%
  group_by(Sample) %>%
  mutate(Total=sum(Count)) %>%
  group_by(Sample, Annotations)

# Display order of the merged samples (reused by later sections).
samples2<-c("MBD1","MBD2","MBD4","MBD5","MBD6","SUVH1","SUVH3")

table2$Annotations<-factor(table2$Annotations, levels = rev(c("TE","TE_prom","TE_gene",
                                                            "TE_TTS","Promoter", "Exon","Intron", "TTS", "Intergenic")))
table2$Sample<-factor(table2$Sample, levels=samples2)

# Explicit print() so the figure is written even when the script is source()d.
pdf("C:/Users/Jon/Google Drive/PhD/Papers/2020_mC_readers/Figures/distribution_annotations_merged.pdf", width = 10)
print(plot.annot.distribution(table2))
dev.off()

### for grouped samples

# Peak annotation counts for the peak groups; the Sample column holds the
# group name and Total is the per-group sum of Count.
table3<-read.delim("annotated_grouped_peaks.txt", header = TRUE, sep="") %>%
  group_by(Sample) %>%
  mutate(Total=sum(Count)) %>%
  group_by(Sample, Annotations)

# Display order of the peak groups.
groups<-c("common_peaks","MBD1_peaks","MBD2_5_6_peaks","MBD5_6_peaks","MBD5_6_SUVH1_3_peaks")

table3$Annotations<-factor(table3$Annotations, levels = rev(c("TE","TE_prom","TE_gene",
                                                              "TE_TTS","Promoter", "Exon","Intron", "TTS", "Intergenic")))
table3$Sample<-factor(table3$Sample, levels=groups)

# Explicit print() so the figure is written even when the script is source()d.
pdf("C:/Users/Jon/Google Drive/PhD/Papers/2020_mC_readers/Figures/distribution_annotations_grouped.pdf", width = 10)
print(plot.annot.distribution(table3))
dev.off()

################################################################
#### To plot Venn diagrams of peak numbers in each replicate and in the merged file
################################################################

# Arguments shared by every Merged / Rep1 / Rep2 Venn diagram below.
venn_common <- list(
  category   = c('Merged', 'Rep1', 'Rep2'),
  fill       = c('purple', 'blue', 'red'),
  cat.col    = c('purple', 'blue', 'red'),
  cat.cex    = 2,
  cat.pos    = c(-45, 45, 0),
  euler.d    = TRUE,
  scaled     = TRUE,
  alpha      = 0.2,
  print.mode = "percent",
  sigdigs    = 2
)

# Draw one triple Venn diagram on the current device.
#
# areas: named vector with area1, area2, area3, n12, n23, n13 and n123
# cex:   size of the area labels (1.5 unless overridden)
# ...:   further draw.triple.venn() arguments specific to one diagram
# Returns the value of draw.triple.venn().
draw_replicate_venn <- function(areas, cex = 1.5, ...) {
  do.call(draw.triple.venn,
          c(as.list(areas), venn_common, list(cex = cex, ...)))
}

# MBD1 is the only diagram with larger area labels and custom label distances.
pdf("C:/Users/Jon/Google Drive/PhD/Papers/2020_mC_readers/Figures/Venn/MBD1v2.pdf")
MBD1plot <- draw_replicate_venn(
  c(area1 = 124456, area2 = 107578, area3 = 37537,
    n12 = 96983, n23 = 32533, n13 = 37298, n123 = 32533),
  cex = 2, cat.dist = c(0.05, 0.05, 0.02)
)

dev.off()

pdf("C:/Users/Jon/Google Drive/PhD/Papers/2020_mC_readers/Figures/Venn/MBD2.pdf")
MBD2plot <- draw_replicate_venn(
  c(area1 = 212884, area2 = 167056, area3 = 72597,
    n12 = 148664, n23 = 58183, n13 = 72101, n123 = 58183)
)
dev.off()

pdf("C:/Users/Jon/Google Drive/PhD/Papers/2020_mC_readers/Figures/Venn/MBD4.pdf")
MBD4plot <- draw_replicate_venn(
  c(area1 = 53786, area2 = 18413, area3 = 28868,
    n12 = 16567, n23 = 10335, n13 = 28332, n123 = 10335)
)
dev.off()

pdf("C:/Users/Jon/Google Drive/PhD/Papers/2020_mC_readers/Figures/Venn/MBD5.pdf")
MBD5plot <- draw_replicate_venn(
  c(area1 = 695614, area2 = 532919, area3 = 551599,
    n12 = 525156, n23 = 453862, n13 = 543015, n123 = 453862)
)
dev.off()

pdf("C:/Users/Jon/Google Drive/PhD/Papers/2020_mC_readers/Figures/Venn/MBD6.pdf")
MBD6plot <- draw_replicate_venn(
  c(area1 = 477188, area2 = 437486, area3 = 207668,
    n12 = 411690, n23 = 195854, n13 = 205633, n123 = 195854)
)
dev.off()

pdf("C:/Users/Jon/Google Drive/PhD/Papers/2020_mC_readers/Figures/Venn/SUVH1.pdf")
SUVH1plot <- draw_replicate_venn(
  c(area1 = 235722, area2 = 132945, area3 = 163505,
    n12 = 127235, n23 = 98626, n13 = 159238, n123 = 98626)
)
dev.off()

pdf("C:/Users/Jon/Google Drive/PhD/Papers/2020_mC_readers/Figures/Venn/SUVH3.pdf")
SUVH3plot <- draw_replicate_venn(
  c(area1 = 639760, area2 = 485879, area3 = 313172,
    n12 = 472779, n23 = 277737, n13 = 310434, n123 = 277737)
)
dev.off()

###############################################
### To plot histogram of distance to centromere
###############################################

### for individual replicates

# Peak distances for the individual replicates; faceted by replicate (rows)
# and sample (columns).
distance<-read.delim("distance_to_centromeres_individual2.txt", stringsAsFactors = FALSE, header = TRUE)
distance$Sample<-factor(distance$Sample, levels = samples2)
distance$Rep<-as.factor(distance$Rep)

# Explicit print(): without it the PDF stays empty when the script is run
# through source(), because ggplot objects are only auto-printed at top level.
pdf("C:/Users/Jon/Google Drive/PhD/Papers/2020_mC_readers/Figures/Histogram_distance_individual.pdf", width=30, height = 10)
print(ggplot(distance, aes(Distance)) +
  geom_histogram(aes(y=..density..), binwidth = 2) +
  theme(panel.grid.minor = element_blank(), 
        panel.grid.major = element_blank(),
        axis.title.x = element_blank(),
        axis.ticks.x = element_blank(),
        axis.text.x = element_text(size=15),
        axis.text.y = element_text(size=20),
        axis.title.y = element_text(size=35),
        panel.background = element_rect(fill = 'white', colour = 'black'),
        strip.background = element_rect(fill = 'white', colour = 'black'),
        strip.text = element_text(size = 35)) +
  facet_grid(Rep~Sample) +
  # Three hand-placed x labels: "Centromere", an arrow, "Telomere".
  scale_x_continuous(breaks = c(15,55,90), labels = c("Centromere",expression(""%->%""),"Telomere")) +
  labs(fill="", y="Peak density"))
dev.off()

### for merged samples

# Peak distances for the merged samples; one facet per sample.
distance2<-read.delim("distance_to_centromeres_merged.txt", stringsAsFactors = FALSE, header = TRUE)
distance2$Sample<-factor(distance2$Sample, levels = samples2)

# Explicit print(): without it the PDF stays empty when the script is run
# through source(), because ggplot objects are only auto-printed at top level.
pdf("C:/Users/Jon/Google Drive/PhD/Papers/2020_mC_readers/Figures/Histogram_distance_merged.pdf", width=30, height = 10)
print(ggplot(distance2, aes(Distance)) +
  geom_histogram(aes(y=..density..), binwidth = 2) +
  theme(panel.grid.minor = element_blank(), 
        panel.grid.major = element_blank(),
        axis.title.x = element_blank(),
        axis.ticks.x = element_blank(),
        axis.text.x = element_text(size=15),
        axis.text.y = element_text(size=20),
        axis.title.y = element_text(size=35),
        panel.background = element_rect(fill = 'white', colour = 'black'),
        strip.background = element_rect(fill = 'white', colour = 'black'),
        strip.text = element_text(size = 35)) +
  facet_grid(~Sample) +
  # Three hand-placed x labels: "Centromere", an arrow, "Telomere".
  scale_x_continuous(breaks = c(15,55,90), labels = c("Centromere",expression(""%->%""),"Telomere")) +
  labs(fill="", y="Peak density"))
dev.off()


#############################################################################################
### To plot mC distribution in peaks
#############################################################################################

# Per-peak methylation table with one column per context (mCG, mCHG, mCHH)
# plus matching Cov_* and Nb_* columns.
total<-read.delim("all_peaks_mC.txt", stringsAsFactors = FALSE, header=TRUE)

total$Sample<-factor(total$Sample, levels=samples2)

# Long format: one row per peak and context, carrying the Coverage / Counts
# value of that same context.
# The fallback is NA_real_ rather than "ERROR": a character fallback makes
# ifelse() return character vectors, and the later `Counts >= min` /
# `Coverage >= cov` filters would then compare as strings ("10" < "3").
tidy_mC_data <- gather(total, Context, mC, mCG, mCHG, mCHH, na.rm = TRUE) %>%
  mutate(Coverage = ifelse(Context == "mCG", Cov_mCG, 
                           ifelse(Context == "mCHG", Cov_mCHG, 
                                  ifelse(Context == "mCHH", Cov_mCHH, NA_real_))),
         Counts = ifelse(Context == "mCG", Nb_mCG, 
                         ifelse(Context == "mCHG", Nb_mCHG, 
                                ifelse(Context == "mCHH", Nb_mCHH, NA_real_)))) %>%
  select(Sample, Peak_ID, Context, mC, Coverage, Counts) %>%
  group_by(Sample, Context)

### Function for plotting average methylation (per context) at peaks locations

# Boxplots (over jittered points) of peak methylation per sample, one facet
# per context.
#
# min: lowest accepted Counts value for a peak
# cov: lowest accepted Coverage value for a peak
# Reads the global tidy_mC_data and returns the ggplot object.
plot.Metavg.bycontext <- function(min, cov) {
  kept_peaks <- tidy_mC_data %>% filter(Counts>=min, Coverage>=cov)
  plot_title <- paste("Average methylation levels at peaks with at least",min,"cytosines covered",cov,"times or more")
  
  panel_theme <- theme(
    axis.text.x = element_text(color='black', size=20, angle=90, vjust=0.6),
    axis.text.y = element_text(color='black', size=10),
    axis.title.y = element_text(color='black', size=15),
    panel.grid.minor = element_line(colour = 'grey'),
    panel.grid.major.x = element_blank(),
    panel.grid.major.y = element_line(colour='lightgrey'),
    panel.background = element_rect(fill = 'white', colour = 'black'),
    strip.background = element_rect(fill = 'white', colour = 'black'),
    axis.ticks.x = element_blank(),
    axis.title.x = element_blank(),
    legend.key = element_blank(),
    strip.text = element_text(size = 20)
  )
  
  ggplot(kept_peaks, aes(Sample, mC)) +
    geom_jitter(alpha=0.5, color="grey", size=0.4) +
    geom_boxplot(aes(fill=Context), outlier.alpha = 0, alpha=0.5, show.legend = FALSE) +
    panel_theme +
    labs(title = plot_title, y="Methylation level (%)", size=10) +
    scale_fill_manual(values=wes_palette(n=3, name="Moonrise2")) +
    facet_grid(~Context)
}

# Boxplots (over jittered points) of peak methylation per context, one facet
# per sample.
#
# min: lowest accepted Counts value for a peak
# cov: lowest accepted Coverage value for a peak
# Reads the global tidy_mC_data and returns the ggplot object.
plot.Metavg.bysample <- function(min, cov) {
  kept_peaks <- tidy_mC_data %>% filter(Counts>=min, Coverage>=cov)
  plot_title <- paste("Average methylation levels at peaks with at least",min,"cytosines covered",cov,"times or more")
  
  panel_theme <- theme(
    axis.text.x = element_blank(),
    axis.text.y = element_text(color='black', size=10),
    axis.title.y = element_text(color='black', size=15),
    panel.grid.minor = element_line(colour = 'grey'),
    panel.grid.major.x = element_blank(),
    panel.grid.major.y = element_line(colour='lightgrey'),
    panel.background = element_rect(fill = 'white', colour = 'black'),
    strip.background = element_rect(fill = 'white', colour = 'black'),
    axis.ticks.x = element_blank(),
    axis.title.x = element_blank(),
    legend.key = element_blank(),
    strip.text = element_text(size = 20)
  )
  
  ggplot(kept_peaks, aes(Context, mC)) +
    geom_jitter(alpha=0.5, color="grey", size=0.4) +
    geom_boxplot(aes(fill=Context), outlier.alpha = 0, alpha=0.5) +
    panel_theme +
    labs(title = plot_title, y="Methylation level (%)", size=10, fill="") +
    scale_fill_manual(values=wes_palette(n=3, name="Moonrise2")) +
    facet_grid(~Sample)
}

# Boxplots of peak methylation per context, one facet per sample, without
# the jittered points and without grid lines.
#
# min: lowest accepted Counts value for a peak
# cov: lowest accepted Coverage value for a peak
# Reads the global tidy_mC_data and returns the ggplot object.
plot.Metavg.bysample.simple <- function(min, cov) {
  kept_peaks <- tidy_mC_data %>% filter(Counts>=min, Coverage>=cov)
  plot_title <- paste("Average methylation levels at peaks with at least",min,"cytosines covered",cov,"times or more")
  
  panel_theme <- theme(
    axis.text.x = element_blank(),
    axis.text.y = element_text(color='black', size=15),
    axis.title.y = element_text(color='black', size=20),
    panel.grid.minor = element_blank(),
    panel.grid.major.x = element_blank(),
    panel.grid.major.y = element_blank(),
    panel.background = element_rect(fill = 'white', colour = 'black'),
    strip.background = element_rect(fill = 'white', colour = 'black'),
    axis.ticks.x = element_blank(),
    axis.title.x = element_blank(),
    legend.key = element_blank(),
    strip.text = element_text(size = 20),
    legend.text = element_text(size = 15)
  )
  
  ggplot(kept_peaks, aes(Context, mC)) +
    geom_boxplot(aes(fill=Context), outlier.alpha = 0, alpha=0.5) +
    panel_theme +
    labs(title = plot_title, y="Methylation level (%)", size=10, fill="") +
    scale_fill_manual(values=wes_palette(n=3, name="Moonrise2")) +
    facet_grid(~Sample)
}

# Explicit print() calls: ggplot objects are only auto-printed at top level,
# so nothing would be drawn (and the PDF would stay empty) under source().
print(plot.Metavg.bysample(3,3))
print(plot.Metavg.bycontext(3,3))

pdf("C:/Users/Jon/Google Drive/PhD/Papers/2020_mC_readers/Figures/mC_boxplot_peaks.pdf",20,7)
print(plot.Metavg.bysample.simple(3,3))
dev.off()

### Function to plot mC distribution

# Density histograms of peak methylation (2-unit bins), flipped so that the
# methylation level runs along the vertical axis; one panel per
# context (rows) and sample (columns).
#
# min: lowest accepted Counts value for a peak
# cov: lowest accepted Coverage value for a peak
# Reads the global tidy_mC_data and returns the ggplot object.
plot.Metdistr <- function(min, cov) {
  mC_data<-filter(tidy_mC_data, Counts >= min, Coverage >= cov) %>%
    group_by(Sample, Context)
  
  ggplot(mC_data, aes(mC, ..density.., fill = Context, color= Context)) + 
    geom_histogram(binwidth=2) + coord_flip() +
    facet_grid(Context~Sample, scales = "free_x") +
    theme(panel.grid.minor = element_blank(), 
          panel.grid.major.x = element_blank(),
          panel.grid.major.y = element_line(colour='lightgrey'),
          panel.background = element_rect(fill = 'white', colour = 'black'),
          strip.background = element_rect(fill = 'white', colour = 'black'), 
          axis.ticks.x=element_blank(), legend.key=element_blank()) +
    # "none" replaces the deprecated guides(<aes> = FALSE) form
    labs(title = "Distribution of peaks according to their average methylation", 
         y="Frequency", x="Methylation level (%)") + guides(fill = "none", color = "none") +
    scale_fill_manual(values=wes_palette(n=3, name="Moonrise2")) + 
    scale_color_manual(values=wes_palette(n=3, name="Moonrise2")) 
}

# Explicit print() so the plot is also drawn when the script is source()d.
print(plot.Metdistr(3,3))

###############################################################################################
############### To plot RNA values

# Annotated grouped peaks; rows without an AGI identifier are dropped.
tableAGI<-read.delim("annotated_grouped_peaks_AGI.txt", header = TRUE, sep="") %>%
  filter(!is.na(AGI))

# One fold-change table per mutant (FC_<mutant>.txt).
RNAmutants<-c("MBD1_MBD2","mbd1_2_4","mbd1_2_5_6","mbd2_5_6","suvh1_3","SUVH1_SUVH3","MBD5_MBD6")

# Read every table first and bind once, instead of growing tableRNA with
# rbind() inside a loop. The reader is an anonymous function, so the global
# `table` object is no longer overwritten here.
# Result columns: AGI, logFC, logCPM, RNASample.
tableRNA<-do.call(rbind, lapply(RNAmutants, function(mutant) {
  read.delim(paste0("C:/Users/Jon/Google Drive/PhD/Server/R/MBD/Data/FC_tables/FC_",mutant,".txt"),
             header = TRUE,sep = "\t") %>%
    select(AGI, logFC, logCPM, RNASample)
}))

# Join the expression values to the annotated peaks on the gene identifier.
tableTot <- merge(x = tableRNA, y = tableAGI, by = "AGI")

# Display order of the peak groups.
groups <- c("common_peaks", "MBD1_peaks", "MBD2_5_6_peaks",
            "MBD5_6_peaks", "MBD5_6_SUVH1_3_peaks")

# Annotation levels run from "TTS" to "TE_prom" (reverse of the TE-first
# listing).
tableTot$Annotation <- factor(
  tableTot$Annotation,
  levels = c("TTS", "Intron", "Exon", "Promoter",
             "TE_TTS", "TE_gene", "TE_prom")
)
tableTot$Group <- factor(tableTot$Group, levels = groups)
summary(tableTot)

# Boxplot of logFC per RNA sample for one peak group and one annotation.
#
# group: value of tableTot$Group to keep
# annot: value of tableTot$Annotation to keep
# Reads the global tableTot and returns the ggplot object.
plot.boxplot.RNA.group<-function(group, annot) {
  tablet<-filter(tableTot, Group == group, Annotation == annot)
  
  # coord_cartesian() zooms the view to [-1, 1] without discarding data.
  # lims(y = ...) removed every value outside the range before the boxplot
  # statistics were computed, which biased the medians and quartiles.
  ggplot(tablet, aes(RNASample, logFC)) +
    geom_boxplot() + coord_cartesian(ylim=c(-1,1)) 
}

# Explicit print() so the plot is also drawn when the script is source()d.
print(plot.boxplot.RNA.group("MBD2_5_6_peaks", "Exon"))

###############################################################################################
############### To plot ChIP enrichment at TEs per family or superfamily, highlighting bound TEs

# Per-TE table with one enrichment column per protein, "Peaks_*" bound flags
# and methylation columns.
# NOTE(review): the object name `mean` shadows base::mean for variable
# lookup; calls such as mean(x) still resolve to the function.
mean<-read.delim("Total_TEs_means.txt", stringsAsFactors = FALSE, header = TRUE)
mean$Superfamily<-as.factor(mean$Superfamily)
mean$Family<-as.factor(mean$Family)

# Redefines `samples` for this section (now includes "WT").
samples<-c("MBD1","MBD2","MBD4","MBD5","MBD6","SUVH1","SUVH3","WT")
superfamilies<-as.character(unique(mean$Superfamily))
families<-as.character(unique(mean$Family))

# Long format: one row per TE and protein with its enrichment value.
# all_of() states that `samples` is an external character vector of column
# names; passing the bare vector is deprecated and ambiguous in tidyselect.
mean2<-mean %>%
  gather(Protein, Log2FC, all_of(samples), na.rm = TRUE) %>%
  select(TE_ID, Superfamily, Family, Length, Protein, Log2FC)
# Long format of the "Peaks_*" columns, joined to the enrichment values on
# (TE_ID, Protein) after stripping the "Peaks_" prefix.
mean_FC<-mean %>%
  gather(Protein, Bound, contains("Peaks"))
mean_FC$Protein<-gsub("Peaks_","",mean_FC$Protein)
mean_FC<-mean_FC %>% select(TE_ID, Protein, Bound) %>%
  merge(mean2, by=c("TE_ID", "Protein"))
mean_FC$Protein<-factor(mean_FC$Protein, levels=samples)
mean_FC$Bound<-factor(mean_FC$Bound, levels=c("Yes","No"))

# Long-format methylation tables (TE_ID, mC, Context, Mutant), one per
# genotype. Context is derived from the source column name.
# grepl() and ifelse() are vectorised, so the former rowwise() step was
# unnecessary and has been removed; the computed values are unchanged.
meanWT<-mean %>%
  select(TE_ID, mCG_WT, mCHG_WT, mCHH_WT) %>%
  gather(context, mC, c(mCG_WT, mCHG_WT, mCHH_WT)) %>%
  mutate(Context=ifelse(grepl("CG",context), "CG", 
                        ifelse(grepl("CHG",context),"CHG","CHH")), 
         Mutant="WT") %>%
  select(-context)

meanMBD<-mean %>%
  select(TE_ID, mCG_MBD5_6, mCHG_MBD5_6, mCHH_MBD5_6) %>%
  gather(context, mC, c(mCG_MBD5_6, mCHG_MBD5_6, mCHH_MBD5_6)) %>%
  mutate(Context=ifelse(grepl("CG",context), "CG", 
                        ifelse(grepl("CHG",context),"CHG","CHH")), 
         Mutant="MBD5_6") %>%
  select(-context)

meanSUVH<-mean %>%
  select(TE_ID, mCG_SUVH1_3, mCHG_SUVH1_3, mCHH_SUVH1_3) %>%
  gather(context, mC, c(mCG_SUVH1_3, mCHG_SUVH1_3, mCHH_SUVH1_3)) %>%
  mutate(Context=ifelse(grepl("CG",context), "CG", 
                        ifelse(grepl("CHG",context),"CHG","CHH")), 
         Mutant="SUVH1_3") %>%
  select(-context)

# Attach the stacked methylation values to the enrichment / bound table.
mean_mC<-merge(mean_FC, rbind(meanWT, meanMBD, meanSUVH), by = "TE_ID")
mean_mC$Mutant<-factor(mean_mC$Mutant, levels=c("WT","MBD5_6", "SUVH1_3"))
mean_mC$Context<-factor(mean_mC$Context, levels=c("CG","CHG", "CHH"))

# Violin plots (over jittered points coloured by Bound) of Log2FC per protein.
#
# tab:   data frame with TE_ID, Protein, Log2FC, Bound, Superfamily, Family
# super: "all" gives one facet per superfamily; any other value keeps that
#        superfamily only, facets by family, adds a dashed line at y = 1 and
#        uses `super` as the title.
# Returns the ggplot object.
plot.TE.superfamily<-function(tab, super) {
  
  # Theme shared by both variants.
  shared_theme <- theme(
    panel.grid.minor = element_blank(),
    panel.grid.major = element_blank(),
    axis.title.x = element_blank(),
    axis.text.x = element_text(size=10, angle=90, vjust = 0.5, hjust = 1),
    axis.text.y = element_text(size=10),
    axis.title.y = element_text(size=10),
    strip.text = element_text(size=10),
    panel.background = element_rect(fill = 'white', colour = 'black'),
    strip.background = element_rect(fill = 'white', colour = 'black'),
    legend.key = element_blank(),
    legend.position = "none"
  )
  
  # Layers shared by both variants.
  enrichment_layers <- function(te_rows) {
    ggplot(te_rows, aes(Protein, Log2FC)) +
      geom_jitter(aes(color=Bound), alpha = 0.5, size=0.5) +
      geom_violin(alpha=0.2, size=0.5) +
      scale_color_manual(values=c("Yes"="red","No"="grey50"))
  }
  
  if (super=="all") {
    te_rows <- unique(select(tab, TE_ID, Protein, Log2FC, Bound, Superfamily))
    return(enrichment_layers(te_rows) +
             facet_wrap(~Superfamily) +
             shared_theme)
  }
  
  te_rows <- unique(select(filter(tab, Superfamily==super),
                           TE_ID, Protein, Log2FC, Bound, Family))
  enrichment_layers(te_rows) +
    facet_wrap(~Family) +
    geom_hline(yintercept=1, color="black", linetype=2) +
    labs(title=paste0(super)) +
    shared_theme
}

# Explicit print(): without it the PDF stays empty when the script is run
# through source(), because ggplot objects are only auto-printed at top level.
pdf(file = "Rplots/All_superfamilies_ChIP_violin2.pdf", width = 20, height = 20)
print(plot.TE.superfamily(mean_FC,"all"))
dev.off()

# Boxplots of methylation per genotype, outlined by context and filled by
# bound status.
#
# tab:   data frame with TE_ID, Mutant, mC, Bound, Context, Superfamily, Family
# super: "all" gives one facet per superfamily; any other value keeps that
#        superfamily only, facets by family and uses `super` as the title.
# Returns the ggplot object.
plot.TE.superfamily.mC<-function(tab, super) {
  
  # Theme shared by both variants.
  shared_theme <- theme(
    panel.grid.minor = element_blank(),
    panel.grid.major = element_blank(),
    axis.title.x = element_blank(),
    axis.text.x = element_text(size=10, angle=90, vjust = 0.5, hjust = 1),
    axis.text.y = element_text(size=10),
    axis.title.y = element_text(size=10),
    strip.text = element_text(size=10),
    panel.background = element_rect(fill = 'white', colour = 'black'),
    strip.background = element_rect(fill = 'white', colour = 'black'),
    legend.key = element_blank()
  )
  
  # Layers shared by both variants.
  methylation_layers <- function(te_rows) {
    ggplot(te_rows, aes(Mutant, mC)) +
      geom_boxplot(aes(color=Context, fill=Bound), alpha=0.2) +
      scale_fill_manual(values=c("Yes"="red","No"="grey50")) +
      scale_color_manual(values=c("CG"="black","CHG"="brown","CHH"="purple"))
  }
  
  if (super=="all") {
    te_rows <- unique(select(tab, TE_ID, Mutant, mC, Bound, Superfamily, Context))
    return(methylation_layers(te_rows) +
             facet_wrap(~Superfamily) +
             shared_theme)
  }
  
  te_rows <- unique(select(filter(tab, Superfamily==super),
                           TE_ID, Mutant, mC, Bound, Family, Context))
  methylation_layers(te_rows) +
    facet_wrap(~Family) +
    labs(title=paste0(super)) +
    shared_theme
}

# Explicit print(): without it the PDF stays empty when the script is run
# through source(), because ggplot objects are only auto-printed at top level.
pdf(file = "Rplots/All_superfamilies_mC_boxplot.pdf", width = 20, height = 20)
print(plot.TE.superfamily.mC(mean_mC,"all"))
dev.off()

## For diff instead of mC levels

# Methylation difference (mutant minus WT) per TE, context and mutant, in
# long format and joined to the enrichment / bound table on TE_ID.
# grepl() and ifelse() are vectorised, so the former rowwise() step was
# unnecessary and has been removed; the computed values are unchanged.
mean_diff<-mean %>%
  mutate(Diff_CG_MBD=mCG_MBD5_6-mCG_WT, Diff_CG_SUVH=mCG_SUVH1_3-mCG_WT,
         Diff_CHG_MBD=mCHG_MBD5_6-mCHG_WT, Diff_CHG_SUVH=mCHG_SUVH1_3-mCHG_WT,
         Diff_CHH_MBD=mCHH_MBD5_6-mCHH_WT, Diff_CHH_SUVH=mCHH_SUVH1_3-mCHH_WT) %>%
  select(TE_ID, contains("Diff")) %>%
  gather(mutant, Diff, contains("Diff")) %>%
  # Context and Mutant are decoded from the "Diff_<context>_<mutant>" name.
  mutate(Context=ifelse(grepl("CG",mutant), "CG", ifelse(grepl("CHG",mutant), "CHG", "CHH")),
         Mutant=ifelse(grepl("MBD",mutant), "MBD5_6", "SUVH1_3")) %>%
  select(-mutant) %>%
  merge(mean_FC, by="TE_ID")
mean_diff$Mutant<-factor(mean_diff$Mutant, levels=c("MBD5_6", "SUVH1_3"))
mean_diff$Context<-factor(mean_diff$Context, levels=c("CG","CHG", "CHH"))

# Boxplots of the methylation difference per context for one TE family,
# outlined by mutant and filled by bound status, with a
# "<n> bound; <n> unbound" label above each context.
#
# tab: data frame with TE_ID, Family, Superfamily, Mutant, Diff, Bound, Context
# fam: family name to plot
# Returns the ggplot object.
plot.TE.family.mCDiff<-function(tab, fam) {
  
  fam_rows<-filter(tab, Family==fam)
  # Read the superfamily BEFORE the column is dropped by select(); reading it
  # afterwards always gave an empty value, so the title ended in "from ".
  super<-paste(unique(as.character(fam_rows$Superfamily)), collapse="/")
  
  tottesup<-fam_rows %>%
    select(TE_ID, Mutant, Diff, Bound, Context) %>% unique()
  
  # One label per context. Counting with sum() gives 0 when a Bound class is
  # absent, where the former spread() approach produced NA or failed on a
  # missing "No" column.
  labels<-tottesup %>%
    group_by(Context) %>%
    summarize(Bound_n=sum(Bound=="Yes", na.rm=TRUE),
              Unbound_n=sum(Bound=="No", na.rm=TRUE)) %>%
    ungroup() %>%
    mutate(label=paste0(Bound_n," bound; ",Unbound_n," unbound"))
  
  plot<-ggplot(tottesup, aes(Context, Diff)) +
    geom_boxplot(aes(color=Mutant, fill=Bound)) +
    scale_color_manual(values=c("WT"="grey50","MBD5_6"="blue","SUVH1_3"="orange")) +
    scale_fill_manual(values=c("Yes"="pink","No"="grey80")) +
    geom_hline(yintercept = 0, linetype=2, alpha=0.5) +
    labs(title=paste0(fam," from ",super)) +
    # Drawn once per context instead of once per data row, so the
    # semi-transparent text is no longer stacked on itself.
    annotate("text", x=labels$Context, y=105, label=labels$label, size = 4, alpha=0.5) +
    coord_cartesian(ylim=c(-100,100)) +
    theme(panel.grid.minor = element_blank(), 
          panel.grid.major = element_blank(),
          axis.title.x = element_blank(),
          axis.text.x = element_text(size=10, angle=90, vjust = 0.5, hjust = 1),
          axis.text.y = element_text(size=10),
          axis.title.y = element_text(size=10),
          strip.text = element_text(size=10),
          panel.background = element_rect(fill = 'white', colour = 'black'),
          strip.background = element_rect(fill = 'white', colour = 'black'),
          legend.key = element_blank()) 
  plot
}


# One page per TE family, all written to a single PDF.
pdf(file = "Rplots/Families_mC_diff.pdf")
for (fam in families) {
  print(x = plot.TE.family.mCDiff(tab = mean_diff, fam = fam))
}
dev.off()

##########################################################################################
### To plot RNA and distance to closest gene from bound and unbound TEs

# Redefines `samples` for this section: same protein list as in the TE
# enrichment section, but without "WT".
samples<-c("MBD1","MBD2","MBD4","MBD5","MBD6","SUVH1","SUVH3")
# tablebound<-read.delim("Bound_genes_complete.txt", stringsAsFactors = FALSE, header = TRUE)
# tablebound$Superfamily<-as.factor(tablebound$Superfamily)
# tablebound$Family<-as.factor(tablebound$Family)
# tablebound$Sample<-as.factor(tablebound$Sample)
# tablebound<-tablebound %>%
#   separate(GeneID, c("GeneType","AGI"), sep=":", stringAsFactors = FALSE) %>%
#   mutate(grouped=cut(Distance,
#                      breaks = c(-Inf, -2000, -1000, -200, 0, 200, 1000, 2000, +Inf),
#                      labels= c("up_over2000","up_1000to2000", "up_200to1000", "up_0to200",
#                                "down_0to200","down_200to1000","down_1000to2000","down_over2000")),
#          absgrouped=cut(abs(Distance),
#                         breaks = c(0, 100, 200, 500, 1000, +Inf),
#                         labels= c("0to100","100to200","200to500","500to1000","over1000"),
#                         include.lowest = TRUE)) %>%
#   rowwise() %>%
#   mutate(Group=ifelse(grouped=="up_0to200" && Distance==0, "bound", as.character(grouped))) %>%
#   mutate(AbsGroup=ifelse(absgrouped=="0to100" && Distance==0, "0", as.character(absgrouped))) %>%
#   select(-grouped, -absgrouped)
# tablebound$GeneType<-as.factor(tablebound$GeneType)
# tablebound$Group<-factor(tablebound$Group, levels=c("up_over2000","up_1000to2000", "up_200to1000", "up_0to200",
#                                                     "bound","down_0to200","down_200to1000","down_1000to2000","down_over2000"))
# tablebound$AbsGroup<-factor(tablebound$AbsGroup, levels=rev(c("0","0to100","100to200","200to500","500to1000","over1000")))

# Load the bound/unbound TE table and derive the distance bins used by the
# plots below. Columns read here: Superfamily, Family, Sample, Bound, GeneID
# and Distance.
tableboundunbound <- read.delim("Bound_unbound_genes_complete2.txt",
                                stringsAsFactors = FALSE, header = TRUE)
tableboundunbound$Superfamily <- as.factor(tableboundunbound$Superfamily)
tableboundunbound$Family <- as.factor(tableboundunbound$Family)
tableboundunbound$Sample <- as.factor(tableboundunbound$Sample)
tableboundunbound$Bound <- as.factor(tableboundunbound$Bound)

tableboundunbound <- tableboundunbound %>%
  # GeneID is split on ":" into GeneType and AGI. The previous call also passed
  # `stringAsFactors = FALSE`, which is not a parameter of tidyr::separate();
  # it has been dropped.
  separate(GeneID, c("GeneType", "AGI"), sep = ":") %>%
  # cut() intervals are right-closed by default, so Distance == 0 lands in
  # "up_0to100"; signed distances outside (-1000, 1000] become NA.
  # For the absolute bins include.lowest = TRUE keeps 0 inside "0to100";
  # absolute distances above 1000 become NA.
  mutate(grouped = cut(Distance,
                       breaks = c(-1000, -500, -200, -100, 0, 100, 200, 500, 1000),
                       labels = c("up_500to1000", "up_200to500", "up_100to200", "up_0to100",
                                  "down_0to100", "down_100to200", "down_200to500",
                                  "down_500to1000")),
         absgrouped = cut(abs(Distance),
                          breaks = c(0, 100, 200, 500, 1000),
                          labels = c("0to100", "100to200", "200to500", "500to1000"),
                          include.lowest = TRUE)) %>%
  # Give an exact distance of 0 its own "0" bin. The test is vectorised with
  # `&`, so no rowwise() pass is needed and the result is not left as a
  # rowwise tibble.
  mutate(Group = ifelse(grouped == "up_0to100" & Distance == 0,
                        "0", as.character(grouped)),
         AbsGroup = ifelse(absgrouped == "0to100" & Distance == 0,
                           "0", as.character(absgrouped))) %>%
  select(-grouped, -absgrouped)

tableboundunbound$GeneType <- as.factor(tableboundunbound$GeneType)
# Fix the bin order explicitly so plot axes follow distance rather than
# alphabetical order.
tableboundunbound$Group <- factor(tableboundunbound$Group,
                                  levels = c("up_500to1000", "up_200to500", "up_100to200",
                                             "up_0to100", "0", "down_0to100",
                                             "down_100to200", "down_200to500",
                                             "down_500to1000"))
tableboundunbound$AbsGroup <- factor(tableboundunbound$AbsGroup,
                                     levels = rev(c("0", "0to100", "100to200",
                                                    "200to500", "500to1000")))

# Distinct superfamily / family names, used as loop sequences further down.
superfamilies <- as.character(unique(tableboundunbound$Superfamily))
families <- as.character(unique(tableboundunbound$Family))

# Mutant backgrounds for which a gene fold-change table is read.
RNAmutants <- c("mbd1_2_5_6", "mbd2_5_6", "suvh1_3", "mbd1_2_4")

# Read each table into a global variable named FC_<mutant>.
for (mutant in RNAmutants) {
  assign(
    paste0("FC_", mutant),
    read.delim(
      paste0("C:/Users/Jon/Google Drive/PhD/Server/R/MBD/Data/FC_tables/FC_", mutant, ".txt"),
      header = TRUE,
      sep = "\t"
    )
  )
}

# Stack the four tables, then attach them to the TE/gene table by gene ID.
totRNA <- rbind(FC_mbd1_2_4, FC_mbd1_2_5_6, FC_mbd2_5_6, FC_suvh1_3)

tableboundunboundRNA <- merge(tableboundunbound, totRNA, by = "AGI")

# One PDF per sample, one page per TE superfamily: boxplots of gene logFC
# against the absolute-distance bin (AbsGroup), coloured by RNASample, with the
# number of rows per bin printed at y = -1.
#
# Fixes relative to the previous version:
#  * the loop iterated over `sample` (the base R function) instead of the
#    `samples` vector;
#  * it read `tableboundRNA`, which is never created (the code building the
#    bound-only table is commented out); the merged bound/unbound table is used
#    instead;
#  * the title was missing a space before "TEs".
for (sample in samples) {
  pdf(file = paste0("Rplots/", sample, "_RNA_diff.pdf"))
  for (super in superfamilies) {
    # NOTE(review): restricted to Bound == "Bound" so the data match the
    # "bound by" title and the bound-only table this loop was written for;
    # confirm this is the intended subset.
    ex <- tableboundunboundRNA %>%
      filter(Sample == sample, Superfamily == super, Bound == "Bound")
    # Row count per distance bin, shown as "n=<count>".
    sum <- ex %>%
      group_by(AbsGroup) %>%
      summarize(Tot = n()) %>%
      mutate(label = paste0("n=", Tot))
    ex2 <- merge(ex, sum, by = c("AbsGroup"))

    plot <- ggplot(ex2, aes(AbsGroup, logFC)) +
      geom_boxplot(aes(color = RNASample), alpha = 0.5) +
      geom_hline(yintercept = 0, linetype = 2) +
      geom_text(y = -1, aes(x = AbsGroup, label = label)) +
      labs(title = paste0("Expression of genes next to ", super, " TEs bound by ", sample)) +
      theme(panel.grid.minor = element_blank(),
            panel.grid.major = element_blank(),
            axis.title.x = element_blank(),
            axis.text.x = element_text(size = 10, angle = 90, vjust = 0.5, hjust = 1),
            axis.text.y = element_text(size = 10),
            axis.title.y = element_text(size = 10),
            strip.text = element_text(size = 10),
            panel.background = element_rect(fill = 'white', colour = 'black'),
            strip.background = element_rect(fill = 'white', colour = 'black'),
            legend.key = element_blank()) +
      # Zoom without dropping data, so box statistics still use all points.
      coord_cartesian(ylim = c(-1, 1))
    print(plot)
  }
  dev.off()
}

### For bound TEs by superfamilies
# One single-page PDF per sample: gene logFC per RNASample, one facet per TE
# superfamily, box outline coloured by Bound.
for (sample in samples) {
  pdf(file = paste0("Rplots/", sample, "_RNA_diff_superfamilies.pdf"), width = 12, height = 12)
  ex <- filter(tableboundunboundRNA, Sample == sample)

  # Facet strip text: "<Superfamily> (<rows> bound; <rows> unbound)".
  label <- ex %>%
    group_by(Superfamily, Bound) %>%
    summarize(Tot = n()) %>%
    spread(Bound, Tot) %>%
    mutate(label = paste0(Superfamily, " (", Bound, " bound; ", Unbound, " unbound)")) %>%
    select(Superfamily, label)
  ex2 <- merge(ex, label, by = "Superfamily")

  plot <- ggplot(ex2, aes(RNASample, logFC)) +
    geom_boxplot(aes(color = Bound, fill = RNASample), alpha = 0.5, outlier.alpha = 0.5) +
    scale_color_manual(values = c("Bound" = "red", "Unbound" = "grey")) +
    scale_fill_manual(values = c("mbd1_2_5_6" = "blue", "mbd2_5_6" = "green",
                                 "suvh1_3" = "orange", "mbd1_2_4" = "purple")) +
    geom_hline(yintercept = 0, linetype = 2) +
    facet_wrap(~label, ncol = 5) +
    labs(title = paste0("Differential expression of genes next to TEs bound (red box) or not by ", sample)) +
    guides(fill = FALSE, color = FALSE) +
    theme(
      panel.grid.minor = element_blank(),
      panel.grid.major = element_blank(),
      axis.title.x = element_blank(),
      axis.text.x = element_text(size = 10, angle = 90, vjust = 0.5, hjust = 1),
      axis.text.y = element_text(size = 10),
      axis.title.y = element_text(size = 10),
      strip.text = element_text(size = 10),
      panel.background = element_rect(fill = 'white', colour = 'black'),
      strip.background = element_rect(fill = 'white', colour = 'black')
    ) +
    coord_cartesian(ylim = c(-1, 1))
  print(plot)
  dev.off()
}

### For bound TEs by families
# One single-page PDF per sample: gene logFC per RNASample, one facet per TE
# family. Only families with at least 10 rows in both the Bound and the Unbound
# column survive the filter (and therefore the merge).
for (sample in samples) {
  pdf(file = paste0("Rplots/", sample, "_RNA_diff_families_over10.pdf"), width = 50, height = 50)
  ex <- filter(tableboundunboundRNA, Sample == sample)

  # Facet strip text: "<Family> (<rows> bound; <rows> unbound)".
  label <- ex %>%
    group_by(Family, Bound) %>%
    summarize(Tot = n()) %>%
    spread(Bound, Tot) %>%
    filter(Bound >= 10, Unbound >= 10) %>%
    mutate(label = paste0(Family, " (", Bound, " bound; ", Unbound, " unbound)")) %>%
    select(Family, label)
  ex2 <- merge(ex, label, by = "Family")

  plot <- ggplot(ex2, aes(RNASample, logFC)) +
    geom_boxplot(aes(color = Bound, fill = RNASample), alpha = 0.5, outlier.alpha = 0.5) +
    scale_color_manual(values = c("Bound" = "red", "Unbound" = "grey")) +
    scale_fill_manual(values = c("mbd1_2_5_6" = "blue", "mbd2_5_6" = "green",
                                 "suvh1_3" = "orange", "mbd1_2_4" = "purple")) +
    geom_hline(yintercept = 0, linetype = 2) +
    facet_wrap(~label) +
    labs(
      title = paste0("Differential expression of genes next to TEs bound or not by ", sample),
      color = paste0("Are TEs bound by ", sample, "?")
    ) +
    guides(fill = FALSE) +
    theme(
      panel.grid.minor = element_blank(),
      panel.grid.major = element_blank(),
      axis.title.x = element_blank(),
      axis.text.x = element_text(size = 10, angle = 90, vjust = 0.5, hjust = 1),
      axis.text.y = element_text(size = 10),
      axis.title.y = element_text(size = 10),
      strip.text = element_text(size = 10),
      panel.background = element_rect(fill = 'white', colour = 'black'),
      strip.background = element_rect(fill = 'white', colour = 'black'),
      legend.position = c(0.9, 0),
      legend.justification = c(1, 0),
      legend.direction = "vertical"
    ) +
    coord_cartesian(ylim = c(-1, 1))
  print(plot)
  dev.off()
}


### For bound TEs by distance

# One single-page PDF per sample: logFC of protein-coding genes against the
# absolute-distance bin, with Bound/Unbound row counts written under each bin.
for (sample in samples) {
  pdf(file = paste0("Rplots/", sample, "_RNA_diff_Absdistance.pdf"), width = 10, height = 10)
  ex <- filter(tableboundunboundRNA, Sample == sample, GeneType == "protein_coding_gene")

  # Per-bin counts, one text label for Bound and one for Unbound.
  sum <- ex %>%
    group_by(AbsGroup, Bound) %>%
    summarize(Tot = n()) %>%
    spread(Bound, Tot) %>%
    mutate(
      labelA = paste0("Bound=", Bound),
      labelB = paste0("Unbound=", Unbound)
    ) %>%
    select(AbsGroup, labelA, labelB)

  plot <- ggplot(ex, aes(AbsGroup, logFC)) +
    geom_boxplot(aes(color = Bound, fill = RNASample), alpha = 0.5, outlier.alpha = 0.2) +
    scale_color_manual(values = c("Bound" = "red", "Unbound" = "grey")) +
    scale_fill_manual(values = c("mbd1_2_5_6" = "blue", "mbd2_5_6" = "green",
                                 "suvh1_3" = "orange", "mbd1_2_4" = "purple")) +
    geom_hline(yintercept = 0, linetype = 2) +
    labs(
      title = paste0("Differential expression of genes next to TEs bound (red boxes) or not by ", sample),
      y = "RNA log2FC"
    ) +
    # The text layers get an empty data frame and take x/label straight from
    # the columns of `sum`.
    geom_text(mapping = aes(x = sum$AbsGroup, label = sum$labelA), data = data.frame(),
              y = -0.8, size = 2, alpha = 0.8) +
    geom_text(mapping = aes(x = sum$AbsGroup, label = sum$labelB), data = data.frame(),
              y = -0.9, size = 2, alpha = 0.8) +
    guides(fill = FALSE, color = FALSE) +
    theme(
      panel.grid.minor = element_blank(),
      panel.grid.major = element_blank(),
      axis.title.x = element_blank(),
      axis.text.x = element_text(size = 10, angle = 90, vjust = 0.5, hjust = 1),
      axis.text.y = element_text(size = 10),
      axis.title.y = element_text(size = 10),
      strip.text = element_text(size = 10),
      panel.background = element_rect(fill = 'white', colour = 'black'),
      strip.background = element_rect(fill = 'white', colour = 'black'),
      legend.position = "right",
      legend.direction = "vertical"
    ) +
    coord_cartesian(ylim = c(-1, 1))
  print(plot)
  dev.off()
}

##### To check RNA expression of TEs

# Read each TE fold-change table into a global variable named TE_FC_<mutant>.
for (mutant in RNAmutants) {
  assign(
    paste0("TE_FC_", mutant),
    read.delim(
      paste0("C:/Users/Jon/Google Drive/PhD/Server/R/MBD/Data/FC_tables/TE_FC_", mutant, ".txt"),
      header = TRUE,
      sep = "\t"
    )
  )
}

# Stack the tables and rename the ID column to TE_AGI, the key used to join
# them to the TE table.
totTERNA <- rename(
  rbind(TE_FC_mbd1_2_4, TE_FC_mbd1_2_5_6, TE_FC_mbd2_5_6, TE_FC_suvh1_3),
  TE_AGI = AGI
)

tableboundunboundTERNA <- merge(tableboundunbound, totTERNA, by = "TE_AGI")

# One single-page PDF per sample: TE logFC per RNASample, one facet per TE
# superfamily, box outline coloured by Bound.
for (sample in samples) {
  pdf(file = paste0("Rplots/", sample, "_TE_diff_superfamilies.pdf"), width = 12, height = 12)
  ex <- filter(tableboundunboundTERNA, Sample == sample)

  # Facet strip text: "<Superfamily> (<rows> bound; <rows> unbound)".
  label <- ex %>%
    group_by(Superfamily, Bound) %>%
    summarize(Tot = n()) %>%
    spread(Bound, Tot) %>%
    mutate(label = paste0(Superfamily, " (", Bound, " bound; ", Unbound, " unbound)")) %>%
    select(Superfamily, label)
  ex2 <- merge(ex, label, by = "Superfamily")

  plot <- ggplot(ex2, aes(RNASample, logFC)) +
    geom_boxplot(aes(color = Bound, fill = RNASample), alpha = 0.5, outlier.alpha = 0.5) +
    scale_color_manual(values = c("Bound" = "red", "Unbound" = "grey")) +
    scale_fill_manual(values = c("mbd1_2_5_6" = "blue", "mbd2_5_6" = "green",
                                 "suvh1_3" = "orange", "mbd1_2_4" = "purple")) +
    geom_hline(yintercept = 0, linetype = 2) +
    facet_wrap(~label, ncol = 5) +
    labs(title = paste0("Differential expression of TEs bound (red box) or not by ", sample)) +
    guides(fill = FALSE, color = FALSE) +
    theme(
      panel.grid.minor = element_blank(),
      panel.grid.major = element_blank(),
      axis.title.x = element_blank(),
      axis.text.x = element_text(size = 10, angle = 90, vjust = 0.5, hjust = 1),
      axis.text.y = element_text(size = 10),
      axis.title.y = element_text(size = 10),
      strip.text = element_text(size = 10),
      panel.background = element_rect(fill = 'white', colour = 'black'),
      strip.background = element_rect(fill = 'white', colour = 'black')
    ) +
    coord_cartesian(ylim = c(-2, 2))
  print(plot)
  dev.off()
}

### To filter genes that are regulated in suvh1_3 and compare the expression of the closest TE

# For every sample, write the distinct (TE_AGI, Bound) pairs whose gene row has
# a negative logFC in the suvh1_3 RNA sample. The files go to the working
# directory, tab-separated, without a header line.
for (sample in samples) {
  regul <- tableboundunboundRNA %>%
    filter(Sample == sample, RNASample == "suvh1_3", logFC < 0) %>%
    select(TE_AGI, Bound) %>%
    unique()

  write.table(
    regul,
    paste0(sample, "_TE_close_to_gene_downregulated_in_suvh13.txt"),
    sep = "\t",
    row.names = FALSE,
    col.names = FALSE
  )
}

# One single-page PDF for SUVH1 and one for SUVH3: TE logFC split by a
# `Regulate` category, box outline coloured by Bound.
# NOTE(review): `regul` is whatever the preceding export loop left behind, i.e.
# the table for the last element of `samples`, not for the `sample` of this
# loop -- confirm that is intended.
# NOTE(review): nothing in the visible script creates a `Regulate` column
# (`regul` only has TE_AGI and Bound), so group_by()/aes() below presumably
# fail unless the column comes from elsewhere -- verify.
for ( sample in c("SUVH1","SUVH3") ) {
  pdf(file = paste0("Rplots/",sample,"_TE_diffregul.pdf"), width = 12, height = 12)
  # merge() is called without `by`, so it joins on every column name shared
  # between the filtered table and `regul`.
  ex<-tableboundunboundTERNA %>% filter(Sample == sample) %>%
    merge(regul)
  # Row counts per Regulate category, one column per Bound level, turned into
  # the two text labels drawn under the boxes.
  sum<-ex %>% group_by(Bound,Regulate) %>%
    summarize(Tot=n()) %>%
    spread(Bound, Tot) %>%
    mutate(labelA=paste0("Bound: ",Bound), labelB=paste0("Unbound: ",Unbound))
  
  plot<-ggplot(ex, aes(Regulate, logFC)) +
    geom_boxplot(aes(color=Bound, fill=RNASample), alpha=0.5, outlier.alpha = 0.5) +
    scale_color_manual(values=c("Bound"="red", "Unbound"="grey")) +
    scale_fill_manual(values=c("mbd1_2_5_6"="blue","mbd2_5_6"="green",
                               "suvh1_3"="orange","mbd1_2_4"="purple")) +
    # The trailing data.frame() is matched positionally to `data`; x and label
    # are read directly from the columns of `sum`.
    geom_text(y=-0.8, aes(x=sum$Regulate, label=sum$labelA), size=4, alpha=0.8, data.frame()) +
    geom_text(y=-0.9, aes(x=sum$Regulate, label=sum$labelB), size=4, alpha=0.8, data.frame()) +
    geom_hline(yintercept = 0, linetype=2) +
    labs(title=paste0("Differential expression of TEs bound (red box) or not by ",sample)) +
    guides(fill=FALSE, color=FALSE) +
    theme(panel.grid.minor = element_blank(), 
          panel.grid.major = element_blank(),
          axis.title.x = element_blank(),
          axis.text.x = element_text(size=10, angle=90, vjust = 0.5, hjust = 1),
          axis.text.y = element_text(size=10),
          axis.title.y = element_text(size=10),
          strip.text = element_text(size=10),
          panel.background = element_rect(fill = 'white', colour = 'black'),
          strip.background = element_rect(fill = 'white', colour = 'black')) +
    # Zoom only; points outside the window still enter the box statistics.
    coord_cartesian(ylim=c(-1,1))
  print(plot)
  dev.off()
}

# TE logFC per superfamily across all RNA samples. Sample and Bound are dropped
# and duplicate rows removed before counting and plotting.
ex <- unique(select(tableboundunboundTERNA, -Sample, -Bound))

# Row count per Superfamily x RNASample, shown as "n=<count>".
sum <- ex %>%
  group_by(Superfamily, RNASample) %>%
  summarize(Tot = n()) %>%
  mutate(labelA = paste0("n=", Tot))

pdf(file = "Rplots/TE_diff_super.pdf", width = 20, height = 12)
plot <- ggplot(ex, aes(Superfamily, logFC)) +
  geom_boxplot(aes(fill = RNASample), outlier.alpha = 0.2) +
  scale_fill_manual(values = c("mbd1_2_5_6" = "blue", "mbd2_5_6" = "green",
                               "suvh1_3" = "orange", "mbd1_2_4" = "purple")) +
  # The text layer gets an empty data frame and reads x/label from `sum`.
  geom_text(mapping = aes(x = sum$Superfamily, label = sum$labelA), data = data.frame(),
            y = -2, size = 4, alpha = 0.8) +
  geom_hline(yintercept = 0, linetype = 2) +
  labs(title = "Differential expression of TEs") +
  guides(color = FALSE) +
  theme(
    panel.grid.minor = element_blank(),
    panel.grid.major = element_blank(),
    axis.title.x = element_blank(),
    axis.text.x = element_text(size = 10, angle = 90, vjust = 0.5, hjust = 1),
    axis.text.y = element_text(size = 10),
    axis.title.y = element_text(size = 10),
    strip.text = element_text(size = 10),
    panel.background = element_rect(fill = 'white', colour = 'black'),
    strip.background = element_rect(fill = 'white', colour = 'black')
  ) +
  coord_cartesian(ylim = c(-2, 2))
print(plot)
dev.off()

###################################################

# Gene clusters: logFC per cluster Type, split by RNASample (fill) and Bound
# (outline colour).
table <- read.delim("Clusters_AGI.txt", stringsAsFactors = FALSE, header = TRUE)
table$Type <- as.factor(table$Type)
table$Bound <- as.factor(table$Bound)

### To check gene differential expression at the gene clusters
RNAmutants <- c("mbd1_2_5_6", "mbd2_5_6", "suvh1_3", "mbd1_2_4")

# Re-read the per-mutant fold-change tables into globals named FC_<mutant>.
for (mutant in RNAmutants) {
  assign(
    paste0("FC_", mutant),
    read.delim(
      paste0("C:/Users/Jon/Google Drive/PhD/Server/R/MBD/Data/FC_tables/FC_", mutant, ".txt"),
      header = TRUE,
      sep = "\t"
    )
  )
}
totRNA <- rbind(FC_mbd1_2_4, FC_mbd1_2_5_6, FC_mbd2_5_6, FC_suvh1_3)

table <- merge(table, totRNA, by = "AGI")

# Row counts per Type, one column per Bound level, as text labels.
sum <- table %>%
  group_by(Type, Bound) %>%
  summarize(Tot = n()) %>%
  spread(Bound, Tot) %>%
  mutate(
    labelA = paste0("Bound=", Bound),
    labelB = paste0("Unbound=", Unbound)
  )

plot <- ggplot(table, aes(Type, logFC)) +
  geom_boxplot(aes(fill = RNASample, color = Bound), outlier.alpha = 0.2) +
  scale_fill_manual(values = c("mbd1_2_5_6" = "blue", "mbd2_5_6" = "green",
                               "suvh1_3" = "orange", "mbd1_2_4" = "purple")) +
  scale_color_manual(values = c("Bound" = "red", "Unbound" = "grey")) +
  # The text layers get an empty data frame and read x/label from `sum`.
  geom_text(mapping = aes(x = sum$Type, label = sum$labelA), data = data.frame(),
            y = -1.8, size = 2, alpha = 0.8) +
  geom_text(mapping = aes(x = sum$Type, label = sum$labelB), data = data.frame(),
            y = -1.9, size = 2, alpha = 0.8) +
  geom_hline(yintercept = 0, linetype = 2) +
  labs(title = "Differential expression of genes") +
  guides(color = FALSE) +
  theme(
    panel.grid.minor = element_blank(),
    panel.grid.major = element_blank(),
    axis.title.x = element_blank(),
    axis.text.x = element_text(size = 10, angle = 90, vjust = 0.5, hjust = 1),
    axis.text.y = element_text(size = 10),
    axis.title.y = element_text(size = 10),
    strip.text = element_text(size = 10),
    panel.background = element_rect(fill = 'white', colour = 'black'),
    strip.background = element_rect(fill = 'white', colour = 'black')
  ) +
  coord_cartesian(ylim = c(-2, 2))
plot

######## To plot gene expression (CPM)

# Mean CPM per genotype for the cluster genes, faceted by Type and Bound.
genextable <- read.delim(
  "C:/Users/Jon/Google Drive/PhD/Server/R/MBD/Data/DEG/Gene_expression_RS5x2.txt",
  stringsAsFactors = FALSE, header = TRUE
)
extable <- merge(table, genextable, by = "AGI")

# Same count table as for the logFC plot; it is not used by the plot below.
sum <- table %>%
  group_by(Type, Bound) %>%
  summarize(Tot = n()) %>%
  spread(Bound, Tot) %>%
  mutate(
    labelA = paste0("Bound=", Bound),
    labelB = paste0("Unbound=", Unbound)
  )

# Average the _a/_b/_c columns of each genotype, then reshape to long format
# (one row per gene x genotype).
extable2 <- extable %>%
  mutate(
    WT = (WT_a + WT_b + WT_c) / 3,
    mbd1_2_4 = (mbd1_2_4_a + mbd1_2_4_b + mbd1_2_4_c) / 3,
    mbd1_2_5_6 = (mbd1_2_5_6_a + mbd1_2_5_6_b + mbd1_2_5_6_c) / 3,
    mbd2_5_6 = (mbd2_5_6_a + mbd2_5_6_b + mbd2_5_6_c) / 3,
    suvh1_3 = (suvh1_3_a + suvh1_3_b + suvh1_3_c) / 3
  ) %>%
  select(AGI, Type, Bound, WT, mbd1_2_4, mbd1_2_5_6, mbd2_5_6, suvh1_3) %>%
  gather(Sample, meanCPM, c(WT, mbd1_2_4, mbd1_2_5_6, mbd2_5_6, suvh1_3))
extable2$Sample <- factor(
  extable2$Sample,
  levels = c("WT", "mbd1_2_4", "mbd2_5_6", "mbd1_2_5_6", "suvh1_3")
)

plot <- ggplot(extable2, aes(Sample, meanCPM)) +
  geom_boxplot(aes(fill = Sample), notch = TRUE, alpha = 0.5) +
  scale_fill_manual(values = c("WT" = "#000000", "mbd1_2_5_6" = "#4F4FCC", "mbd2_5_6" = "#9E61CB",
                               "suvh1_3" = "#E8824E", "mbd1_2_4" = "#70B6EC")) +
  guides(fill = FALSE) +
  theme(
    panel.grid.minor = element_blank(),
    panel.grid.major = element_blank(),
    axis.title.x = element_blank(),
    axis.text.x = element_text(size = 10, angle = 90, vjust = 0.5, hjust = 1),
    axis.text.y = element_text(size = 10),
    axis.title.y = element_text(size = 10),
    strip.text = element_text(size = 10),
    panel.background = element_rect(fill = 'white', colour = 'black'),
    strip.background = element_rect(fill = 'white', colour = 'black')
  ) +
  facet_grid(~Type*Bound) +
  coord_cartesian(ylim = c(0, 100))
plot

### Similar analysis on the H2AKub clusters

# Same mean-CPM boxplots, restricted to genes present in the H2AKub cluster
# table and faceted by Cluster.
H2AKubclusters <- read.delim("H2AKubdiff_clusters_AGI.txt",
                             stringsAsFactors = FALSE, header = TRUE)

extable3 <- merge(extable2, H2AKubclusters, by = "AGI")
# Fix the facet order.
extable3$Cluster <- factor(extable3$Cluster, levels = c("No_change", "Mid", "Low"))

plot <- ggplot(extable3, aes(Sample, meanCPM)) +
  geom_boxplot(aes(fill = Sample), notch = TRUE, alpha = 0.5) +
  scale_fill_manual(values = c("WT" = "#000000", "mbd1_2_5_6" = "#4F4FCC", "mbd2_5_6" = "#9E61CB",
                               "suvh1_3" = "#E8824E", "mbd1_2_4" = "#70B6EC")) +
  guides(fill = FALSE) +
  theme(
    panel.grid.minor = element_blank(),
    panel.grid.major = element_blank(),
    axis.title.x = element_blank(),
    axis.text.x = element_text(size = 10, angle = 90, vjust = 0.5, hjust = 1),
    axis.text.y = element_text(size = 10),
    axis.title.y = element_text(size = 10),
    strip.text = element_text(size = 10),
    panel.background = element_rect(fill = 'white', colour = 'black'),
    strip.background = element_rect(fill = 'white', colour = 'black')
  ) +
  facet_grid(~Cluster) +
  coord_cartesian(ylim = c(0, 200))
plot

####################################################

# Count, for SUVH1 and SUVH3, how many distinct genes from the DEG list sit in
# each Bound category.
table <- read.delim("DEG_common_suvhs.txt", stringsAsFactors = FALSE,
                    header = FALSE, col.names = c("AGI"))

# Number of rows in the list before it is joined to the TE/gene table.
sum <- summarize(table, Tot = n())
table <- merge(table, tableboundunboundRNA, by = "AGI")

summary <- table %>%
  filter(Sample %in% c("SUVH1", "SUVH3")) %>%
  select(Sample, Bound, AGI) %>%
  unique() %>%
  group_by(Sample, Bound) %>%
  summarize(Tot = n())

######### upset plots of DEGs

# UpSet plots of the up/down DEG sets of the three mbd mutants.
table <- read.delim("DEG_table_mutants.txt", stringsAsFactors = FALSE, header = TRUE)

# Rows whose four *_others columns sum to less than 4 (row-wise sum).
table2 <- table %>%
  rowwise() %>%
  mutate(None = sum(mbd1_2_4_others, mbd2_5_6_others, mbd1_2_5_6_others, suvh1_3_others)) %>%
  filter(None < 4)

# Drop every column whose name contains "others" or "suvh1_3".
table3 <- select(table, -contains("others"), -contains("suvh1_3"))

# Grouped by set, keeping the set order given in `sets`.
upset(
  as.data.frame(table3),
  point.size = 2, line.size = 1, text.scale = 2,
  sets = c("mbd1_2_4_up", "mbd2_5_6_up", "mbd1_2_5_6_up",
           "mbd1_2_4_down", "mbd2_5_6_down", "mbd1_2_5_6_down"),
  order.by = "freq", group.by = "sets", keep.order = TRUE
)

# Grouped by intersection degree.
upset(
  as.data.frame(table3),
  point.size = 2, line.size = 1, text.scale = 2,
  sets = c("mbd1_2_4_up", "mbd2_5_6_up", "mbd1_2_5_6_up",
           "mbd1_2_4_down", "mbd2_5_6_down", "mbd1_2_5_6_down"),
  order.by = "freq", group.by = "degree", keep.order = FALSE
)

###############################################
### To check annotations in each peak group

# Stacked 100% bar chart of annotation categories per peak group, with the
# group total written above each bar.
peaks <- read.delim("v2_annotated_grouped_peaks.txt", stringsAsFactors = FALSE, header = TRUE)
peaks$Group <- factor(
  peaks$Group,
  levels = c("random", "all_candidates", "MBD1", "MBD1_MBD2",
             "MBD2_MBD5_MBD6", "MBD4", "MBD5_MBD6",
             "MBD5_MBD6_SUVH1_SUVH3", "SUVH1_SUVH3"),
  labels = c("Random", "Common", "MBD1", "MBD1/2", "MBD2/5/6",
             "MBD4", "MBD5/6", "MBD5/6/SUVH1/3", "SUVH1/3")
)
peaks$Annotation <- factor(
  peaks$Annotation,
  levels = c("Intergenic", "TTS", "Intron", "Exon", "Promoter",
             "TE_TTS", "TE_gene", "TE_prom", "TE")
)

# Sum of Number per group, shown as "n=<total>".
tot <- peaks %>%
  group_by(Group) %>%
  summarize(Tot = sum(Number)) %>%
  mutate(label = paste0("n=", Tot))

ggplot(peaks, aes(Group, Number)) +
  geom_col(aes(fill = Annotation), position = "fill", color = "black") +
  scale_fill_manual(values = c("TE" = "#000000", "TE_prom" = "#373738", "TE_gene" = "#636466",
                               "TE_TTS" = "#898a8c", "Promoter" = "#0D9FDE", "Exon" = "#40B0E1",
                               "Intron" = "#6BBEE2", "TTS" = "#9ACEE5", "Intergenic" = "#DC9C86")) +
  # The text layer gets an empty data frame and reads x/label from `tot`.
  geom_text(mapping = aes(x = tot$Group, label = tot$label), data = data.frame(),
            y = 1.04, size = 5) +
  theme(
    panel.grid.minor = element_blank(),
    panel.grid.major = element_blank(),
    axis.title.x = element_blank(),
    axis.text.x = element_text(size = 10),
    axis.ticks = element_blank(),
    axis.text.y = element_text(size = 10),
    axis.title.y = element_blank(),
    panel.background = element_rect(fill = 'white', colour = 'black'),
    strip.background = element_rect(fill = 'white', colour = 'white')
  ) +
  scale_y_continuous(breaks = c(0, 0.25, 0.5, 0.75, 1),
                     labels = c("0", "25%", "50%", "75%", "100%")) +
  coord_cartesian(ylim = c(0, 1.04)) +
  labs(fill = "", title = "Annotations bound by each cluster of peaks")

###############################################################
### To plot CPM values of genes in each cluster and annotations

# Peak-to-gene table with relabelled groups, joined to the long-format mean CPM
# table (one row per gene x genotype).
peaks2 <- read.delim("annotated_grouped_peaks_AGI.txt", stringsAsFactors = FALSE, header = TRUE)
peaks2$Group <- factor(
  peaks2$Group,
  levels = c("random_peaks", "common_peaks", "MBD1_peaks", "MBD2_5_6_peaks",
             "MBD4_peaks", "MBD5_6_peaks", "MBD5_6_SUVH1_3_peaks"),
  labels = c("Random", "Common", "MBD1", "MBD2/5/6", "MBD4", "MBD5/6", "MBD5/6/SUVH1/3")
)
peaks2$Annotation <- factor(
  peaks2$Annotation,
  levels = c("Intergenic", "TTS", "Intron", "Exon", "Promoter", "TE_TTS",
             "TE_gene", "TE_prom", "TE")
)

genextable <- read.delim(
  "C:/Users/Jon/Google Drive/PhD/Server/R/MBD/Data/DEG/Gene_expression_RS5x2.txt",
  stringsAsFactors = FALSE, header = TRUE
)
# Average the _a/_b/_c columns of each genotype, then reshape to long format.
genextable2 <- genextable %>%
  mutate(
    WT = (WT_a + WT_b + WT_c) / 3,
    mbd1_2_4 = (mbd1_2_4_a + mbd1_2_4_b + mbd1_2_4_c) / 3,
    mbd1_2_5_6 = (mbd1_2_5_6_a + mbd1_2_5_6_b + mbd1_2_5_6_c) / 3,
    mbd2_5_6 = (mbd2_5_6_a + mbd2_5_6_b + mbd2_5_6_c) / 3,
    suvh1_3 = (suvh1_3_a + suvh1_3_b + suvh1_3_c) / 3
  ) %>%
  select(AGI, WT, mbd1_2_4, mbd1_2_5_6, mbd2_5_6, suvh1_3) %>%
  gather(Sample, meanCPM, c(WT, mbd1_2_4, mbd1_2_5_6, mbd2_5_6, suvh1_3))
genextable2$Sample <- factor(
  genextable2$Sample,
  levels = c("WT", "mbd1_2_4", "mbd2_5_6", "mbd1_2_5_6", "suvh1_3")
)

extable <- merge(peaks2, genextable2, by = "AGI")

# Draw the mean-CPM boxplots of one genotype, per annotation, faceted by peak
# group.
#
# sample: value of the Sample column of the global `extable` to plot.
# Prints the plot on the current device and returns it invisibly (the value of
# print()).
plotCPMAGI <- function(sample) {
  sample_cpm <- filter(extable, Sample == sample)

  # Row count per Group x Annotation, written under each box as "n=<count>".
  counts <- sample_cpm %>%
    group_by(Group, Annotation) %>%
    summarize(tot = n()) %>%
    mutate(label = paste0("n=", tot))

  # The wild type is labelled "Col0" on the y axis.
  y_title <- if (sample == "WT") {
    paste0("meanCPM Col0")
  } else {
    paste0("meanCPM ", sample)
  }

  cpm_plot <- ggplot(sample_cpm, aes(Annotation, meanCPM)) +
    geom_boxplot(aes(fill = Annotation), alpha = 0.5, outlier.alpha = 0.2) +
    geom_text(data = counts, y = -5, aes(x = Annotation, label = label), size = 4, angle = 45) +
    scale_fill_manual(values = c("TE" = "#000000", "TE_prom" = "#373738", "TE_gene" = "#636466",
                                 "TE_TTS" = "#898a8c", "Promoter" = "#0D9FDE", "Exon" = "#40B0E1",
                                 "Intron" = "#6BBEE2", "TTS" = "#9ACEE5", "Intergenic" = "#DC9C86")) +
    guides(fill = FALSE) +
    theme(
      panel.grid.minor = element_blank(),
      panel.grid.major = element_blank(),
      axis.title.x = element_blank(),
      axis.text.x = element_text(size = 10, angle = 90, vjust = 0.5, hjust = 1),
      axis.text.y = element_text(size = 10),
      axis.title.y = element_text(size = 15),
      strip.text = element_text(size = 10),
      panel.background = element_rect(fill = 'white', colour = 'black'),
      strip.background = element_rect(fill = 'white', colour = 'black')
    ) +
    ylab(y_title) +
    facet_grid(~Group) +
    coord_cartesian(ylim = c(0, 200))
  print(cpm_plot)
}

# Write one page per genotype to a single PDF; plotCPMAGI() prints each plot
# on the open device.
pdf("Rplots/CPM_AGI_clusters.pdf", width=20, height = 10)
for (sample in c("WT","mbd1_2_4","mbd2_5_6","mbd1_2_5_6","suvh1_3")) {
# Disabled alternative that loops over the peak-group labels instead:
# for (sample in c("Random", "Common", "MBD1","MBD2/5/6","MBD4","MBD5/6", "MBD5/6/SUVH1/3")) {
  plotCPMAGI(sample)
}
dev.off()

###############################################
### To plot histogram of distance to centromere

# Density-scaled histogram of Distance per peak group, with the number of rows
# per group written inside each facet.
distance <- select(
  read.delim("distance_to_centromeres_grouped.txt", stringsAsFactors = FALSE, header = TRUE),
  Group, Peak_ID, Distance
)
distance$Group <- factor(
  distance$Group,
  levels = c("random", "common", "MBD1", "MBD2_5_6", "MBD4", "MBD5_6", "MBD5_6_SUVH1_3"),
  labels = c("Random", "Common", "MBD1", "MBD2/5/6", "MBD4", "MBD5/6", "MBD5/6/SUVH1/3")
)

# Row count per group, shown as "n=<count>".
tab <- distance %>%
  group_by(Group) %>%
  summarize(tot = n()) %>%
  mutate(label = paste0("n=", tot))

pdf("Rplots/Distance_to_centromeres_group_histogram.pdf", width = 20, height = 2)
ggplot(distance, aes(Distance)) +
  geom_histogram(aes(y = ..density..), binwidth = 2) +
  geom_text(data = tab, x = 80, y = 0.04, aes(label = label), size = 5) +
  theme(
    panel.grid.minor = element_blank(),
    panel.grid.major = element_blank(),
    axis.title.x = element_blank(),
    axis.ticks.x = element_blank(),
    axis.text.x = element_text(size = 10),
    axis.text.y = element_text(size = 10),
    axis.title.y = element_text(size = 15),
    panel.background = element_rect(fill = 'white', colour = 'black'),
    strip.background = element_rect(fill = 'white', colour = 'black')
  ) +
  facet_grid(~Group) +
  # Three labelled positions on the x axis; the middle one is an arrow.
  scale_x_continuous(breaks = c(15, 55, 90),
                     labels = c("Centromere", expression(""%->%""), "Telomere")) +
  labs(fill = "", y = "Density")
dev.off()

### To plot density per annotations

# Join peaks to their centromere distance and collapse the annotation into
# three classes: "Genes", "TEs" and "Intergenic". Anything not listed in the
# two %in% sets (including NA) falls through to "Intergenic".
# NOTE(review): rowwise() leaves `distable` as a row-wise tibble; the ifelse()
# calls are vectorised, so it looks unnecessary -- confirm before removing.
distable<-merge(peaks2, distance, by=c("Peak_ID","Group")) %>%
  rowwise() %>%
  mutate(Type_annot=ifelse(Annotation %in% c("TTS","Intron","Exon","Promoter"), "Genes",
                           ifelse(Annotation %in% c("TE_TTS","TE_gene","TE_prom","TE"),"TEs","Intergenic")))

# Row count per Group x Type_annot, shown as "n<class>=<count>".
tab<-distable %>% group_by(Group,Type_annot) %>%
  summarize(tot=n()) %>%
  mutate(label=paste0("n",Type_annot,"=",tot))

# Only genes and TEs are drawn; `tab` above still includes the Intergenic rows.
distable2<-distable%>%filter(Type_annot!="Intergenic")
pdf("Rplots/Distance_to_centromeres_group_density.pdf", width=20, height = 2)
ggplot(distable2, aes(Distance)) +
  geom_density(aes(fill=Type_annot, color=Type_annot),alpha=0.2) +
  # NOTE(review): the y vector is hard-coded to 3 heights x 6 = 18 values, so
  # it presumably expects `tab` to have exactly 18 rows in this order; seven
  # group levels are declared for `distance`, so verify against the data.
  geom_text(data=tab, x=50, y=rep(c(0.035,0.0375,0.04),6), aes(label=label), size=3) +
  scale_fill_manual(values=c("TEs"="#000000","Genes"="#0D9FDE","Intergenic"="#DC9C86")) +
  scale_color_manual(values=c("TEs"="#000000","Genes"="#0D9FDE","Intergenic"="#DC9C86")) +
  theme(panel.grid.minor = element_blank(), 
        panel.grid.major = element_blank(),
        axis.title.x = element_blank(),
        axis.text.x = element_text(size=10),
        axis.ticks = element_blank(),
        axis.text.y = element_text(size=10),
        axis.title.y = element_blank(),
        panel.background = element_rect(fill = 'white', colour = 'black'),
        strip.background = element_rect(fill = 'white', colour = 'white')) +
  facet_grid(~Group) +
  # Hide the colour legend; the fill legend carries the same classes.
  guides(color=FALSE) +
  labs(fill="", title = "Distribution of distances to centromeres")
dev.off()

############################################################################################
### To plot CPM in the MBD1 peaks clustered by H2AKub difference with WT

# Genes of the MBD1 clusters joined to a log2 fold change computed from the
# mean CPM of each mutant over the mean CPM of WT.
genes <- read.delim("AGI_in_MBD1_clusters.txt", stringsAsFactors = FALSE, header = TRUE)
genes$Cluster <- as.factor(genes$Cluster)

genextable <- read.delim(
  "C:/Users/Jon/Google Drive/PhD/Server/R/MBD/Data/DEG/Gene_expression_RS5x2.txt",
  stringsAsFactors = FALSE, header = TRUE
)
# Replace the per-replicate table by the per-genotype means (wide format).
genextable <- genextable %>%
  mutate(
    WT = (WT_a + WT_b + WT_c) / 3,
    mbd1_2_4 = (mbd1_2_4_a + mbd1_2_4_b + mbd1_2_4_c) / 3,
    mbd1_2_5_6 = (mbd1_2_5_6_a + mbd1_2_5_6_b + mbd1_2_5_6_c) / 3,
    mbd2_5_6 = (mbd2_5_6_a + mbd2_5_6_b + mbd2_5_6_c) / 3,
    suvh1_3 = (suvh1_3_a + suvh1_3_b + suvh1_3_c) / 3
  ) %>%
  select(AGI, WT, mbd1_2_4, mbd1_2_5_6, mbd2_5_6, suvh1_3)

# Long format of the means, one row per gene x genotype.
genextable2 <- gather(genextable, Sample, meanCPM,
                      c(WT, mbd1_2_4, mbd1_2_5_6, mbd2_5_6, suvh1_3))
genextable2$Sample <- factor(
  genextable2$Sample,
  levels = c("WT", "mbd1_2_4", "mbd2_5_6", "mbd1_2_5_6", "suvh1_3")
)

# log2(mutant / WT) per gene, renamed back to the mutant names and reshaped to
# long format.
genextable3 <- genextable %>%
  mutate(
    logFCmbd1_2_4 = log2(mbd1_2_4 / WT),
    logFCmbd1_2_5_6 = log2(mbd1_2_5_6 / WT),
    logFCmbd2_5_6 = log2(mbd2_5_6 / WT),
    logFCsuvh1_3 = log2(suvh1_3 / WT)
  ) %>%
  select(
    AGI,
    mbd1_2_4 = logFCmbd1_2_4,
    mbd1_2_5_6 = logFCmbd1_2_5_6,
    mbd2_5_6 = logFCmbd2_5_6,
    suvh1_3 = logFCsuvh1_3
  ) %>%
  gather(Sample, logFC, c(mbd1_2_4, mbd1_2_5_6, mbd2_5_6, suvh1_3))
genextable3$Sample <- factor(
  genextable3$Sample,
  levels = c("mbd1_2_4", "mbd2_5_6", "mbd1_2_5_6", "suvh1_3")
)

extable <- merge(genes, genextable3, by = "AGI")

# logFC per mutant, one facet per cluster.
plot <- ggplot(extable, aes(Sample, logFC)) +
  geom_boxplot(aes(color = Sample), alpha = 0.2, notch = TRUE) +
  scale_color_manual(values = c("WT" = "#000000", "mbd1_2_5_6" = "#4F4FCC", "mbd2_5_6" = "#9E61CB",
                                "suvh1_3" = "#E8824E", "mbd1_2_4" = "#70B6EC")) +
  geom_hline(yintercept = 0) +
  guides(color = FALSE) +
  theme(
    panel.grid.minor = element_blank(),
    panel.grid.major = element_blank(),
    axis.title.x = element_blank(),
    axis.text.x = element_text(size = 10, angle = 90, vjust = 0.5, hjust = 1),
    axis.text.y = element_text(size = 10),
    axis.title.y = element_text(size = 15),
    strip.text = element_text(size = 10),
    panel.background = element_rect(fill = 'white', colour = 'black'),
    strip.background = element_rect(fill = 'white', colour = 'black')
  ) +
  facet_wrap(~Cluster) +
  coord_cartesian(ylim = c(-1, 1))
plot

# logFC per cluster, one facet per mutant; individual genes are overlaid as
# jittered points and the boxplot's own outlier points are made invisible.
plot <- ggplot(extable, aes(Cluster, logFC)) +
  geom_boxplot(alpha = 0.2, outlier.alpha = 0, notch = TRUE) +
  geom_jitter(aes(color = Cluster), alpha = 0.2) +
  # scale_color_manual(values=c("WT"="#000000", "mbd1_2_5_6"="#4F4FCC","mbd2_5_6"="#9E61CB",
                              # "suvh1_3"="#E8824E","mbd1_2_4"="#70B6EC")) +
  geom_hline(yintercept = 0) +
  guides(color = FALSE) +
  theme(
    panel.grid.minor = element_blank(),
    panel.grid.major = element_blank(),
    axis.title.x = element_blank(),
    axis.text.x = element_text(size = 10, angle = 90, vjust = 0.5, hjust = 1),
    axis.text.y = element_text(size = 10),
    axis.title.y = element_text(size = 15),
    strip.text = element_text(size = 10),
    panel.background = element_rect(fill = 'white', colour = 'black'),
    strip.background = element_rect(fill = 'white', colour = 'black')
  ) +
  facet_wrap(~Sample) +
  coord_cartesian(ylim = c(-1, 1))
plot

#####################################################################################
## To check log2FC enrichment instead of Peaks
#####################################################################################

# Plot histograms of the two coverage-ratio tracks of one sample.
#
# Reads "compare_CPM_ratio_<sample>.bedGraph" and
# "compare_CPM_rec_ratio_<sample>.bedGraph" from the working directory
# (tab-separated, no header, "/" read as NA), drops the rows whose Chr is "Pt"
# or "Mt", prints the quantiles of each CPM column and draws the two
# histograms in a single column with multiplot().
#
# sample: character string inserted in the two file names and plot titles.
# Returns the value of multiplot(); the function is called for its plot.
plotCoverage <- function(sample) {
  # Load one four-column bedGraph track, without its Pt / Mt rows.
  read_track <- function(prefix) {
    track <- read.table(paste0(prefix, sample, ".bedGraph"), header = FALSE, sep = "\t",
                        na.strings = c("/"), stringsAsFactors = FALSE,
                        col.names = c("Chr", "Start", "End", "CPM"))
    track[track$Chr != "Pt" & track$Chr != "Mt", ]
  }

  # 50-bin histogram of the CPM column, titled "<sample> coverage (<label>)".
  coverage_histogram <- function(track, label) {
    ggplot(track, aes(CPM)) +
      geom_histogram(bins = 50) +
      labs(title = paste0(sample, " coverage (", label, ")"))
  }

  cov <- read_track("compare_CPM_ratio_")
  plot1 <- coverage_histogram(cov, "ratio")

  # A bare quantile() call inside a function is evaluated and thrown away;
  # print() is required for the summary to reach the console.
  print(quantile(cov$CPM))

  cov2 <- read_track("compare_CPM_rec_ratio_")
  print(quantile(cov2$CPM))
  plot2 <- coverage_histogram(cov2, "rec ratio")

  multiplot(plot1, plot2, cols = 1)
}

plotCoverage("MBD5")

# Quantiles of the log2FC track of one sample.
#
# Reads "compare_CPM_log2FC_<sample>.bedGraph" from the working directory
# (tab-separated, no header, "/" read as NA), ignores the rows whose Chr is
# "Pt" or "Mt" and returns quantile() of the remaining values of the fourth
# column (named CPM).
#
# sample: character string inserted in the file name.
givequantile <- function(sample) {
  track_file <- paste0("compare_CPM_log2FC_", sample, ".bedGraph")
  track <- read.table(
    track_file,
    header = FALSE,
    sep = "\t",
    na.strings = c("/"),
    stringsAsFactors = FALSE,
    col.names = c("Chr", "Start", "End", "CPM")
  )

  # Rows to leave out of the summary.
  excluded <- track$Chr == "Pt" | track$Chr == "Mt"

  quantile(track$CPM[!excluded])
}

# Quantiles of the SUVH1 log2FC track (auto-printed at top level).
givequantile("SUVH1")

# Mean CPM of the chromosome 1 intervals lying entirely within 25653-25890.
# NOTE(review): `cov2` is only ever assigned inside functions in this section,
# so this statement relies on a global `cov2` created elsewhere -- confirm it
# exists and holds the intended track before trusting the value.
summarize(
  filter(
    cov2,
    Chr == 1,
    Start >= 25653, End >= 25653,
    Start <= 25890, End <= 25890
  ),
  Mean = mean(CPM)
)

###########

setwd("G:/My Drive/CSHL/Papers/2024_Cahn_et_al_mC_readers/Temp_revisions")
library(readr)
library(ggplot2)
library(dplyr)
library(tidyr)
library(wesanderson)

# Three-column table without header: Reader, State, Coverage.
tab <- read.delim(
  "Readers_in_chromatin_states.txt",
  header = FALSE,
  col.names = c("Reader", "State", "Coverage")
)

# One bar per Reader, split by State; position = "fill" rescales every bar to
# a total of 1 so the bars show proportions rather than raw Coverage.
tab %>%
  ggplot(aes(x = Reader, y = Coverage, fill = State)) +
  geom_bar(stat = "identity", position = "fill") +
  scale_fill_manual(
    values = c(
      "State_1" = "red",
      "State_2" = "salmon",
      "State_3" = "violet",
      "State_4" = "gold",
      "State_5" = "grey",
      "State_6" = "brown",
      "State_7" = "green",
      "State_8" = "lightblue",
      "State_9" = "blue"
    )
  ) +
  theme_bw() +
  theme(
    axis.title.x = element_blank(),
    axis.text.x = element_text(size = 10, angle = 45, vjust = 0.5, hjust = 0.5)
  )


# Peak counts per Sample and Annotations. Total is the sum of Count over all
# rows of a Sample, repeated on each of those rows; the table is left grouped
# by Sample and Annotations.
tab2 <- read.delim("annotated_peaks_stats_merged.txt", header = TRUE)
tab2 <- group_by(tab2, Sample)
tab2 <- mutate(tab2, Total = sum(Count))
tab2 <- group_by(tab2, Sample, Annotations)

# Explicit level order for Annotations (the listed order, reversed).
tab2$Annotations <- factor(
  tab2$Annotations,
  levels = rev(c(
    "TE", "TE_prom", "TE_gene", "TE_TTS",
    "Promoter", "Exon", "Intron", "TTS", "Intergenic"
  ))
)
# Bar chart of the proportion of peaks in each annotation class, per sample,
# with the total of each sample written above its bar.
#
# table: data frame with columns Sample, Count, Annotations and Total, where
#        Total is repeated on every row of a given Sample (as built for tab2).
# Returns the ggplot object.
plot.annot.nb <- function(table) {
  # Keep one label per Sample: handing the full Sample/Total columns to
  # annotate() would draw the same number once per annotation row, stacking
  # identical text on top of itself.
  totals <- distinct(ungroup(table), Sample, Total)

  ggplot(table, aes(Sample, Count, fill = Annotations)) +
    # position = "fill" rescales each bar to 1, hence the labels at y = 1.05.
    geom_bar(stat = "identity", position = "fill", colour = "black", show.legend = TRUE) +
    theme_bw() +
    # `guides(colour = FALSE)` is deprecated in ggplot2; "none" is the
    # supported spelling.
    guides(colour = "none") +
    labs(title = "Distribution of peak summits by annotation",
         y = "Distribution of peaks per annotation", x = "", fill = "Annotation") +
    theme(axis.text.x = element_text(color = "black", size = 8, angle = 90, vjust = 0.5),
          panel.grid = element_blank(), axis.ticks.x = element_blank()) +
    annotate("text", x = totals$Sample, y = 1.05, label = totals$Total, size = 5)
}

plot.annot.nb(tab2)

##

# Density histogram of the Distance column of the merged peak table.
# presumably Distance is a position between centromere and telomere (the x
# axis is labelled Centromere -> Telomere at 15 / 55 / 90) -- verify the unit.
tab3 <- read.delim("distance_to_centromeres_merged.txt", stringsAsFactors = FALSE, header = TRUE)

plot3 <- ggplot(tab3, aes(Distance)) +
  # after_stat(density) replaces the `..density..` notation, which is
  # deprecated in ggplot2 3.4.0 (after_stat() needs ggplot2 >= 3.3.0).
  geom_histogram(aes(y = after_stat(density)), binwidth = 2) +
  theme(panel.grid.minor = element_blank(),
        panel.grid.major = element_blank(),
        axis.title.x = element_blank(),
        axis.ticks.x = element_blank(),
        axis.text.x = element_text(size = 15),
        axis.text.y = element_text(size = 20),
        axis.title.y = element_text(size = 35),
        panel.background = element_rect(fill = 'white', colour = 'black'),
        strip.background = element_rect(fill = 'white', colour = 'black'),
        strip.text = element_text(size = 35)) +
  # Replace the numeric ticks with three text landmarks; the middle one is a
  # plotmath arrow.
  scale_x_continuous(breaks = c(15, 55, 90), labels = c("Centromere", expression(""%->%""), "Telomere")) +
  labs(fill = "", y = "Peak density")

plot3


tab4 <- read.delim("random_peaks_mc.txt", stringsAsFactors = FALSE, header = TRUE)

# Boxplots of mC per Sample over jittered individual values, one panel per
# Context.
#
# min:  keep only rows with Counts >= min.
# cov:  keep only rows with Coverage >= cov.
# data: table with columns Sample, mC, Context, Counts and Coverage. Defaults
#       to the global `tab4`, so existing two-argument calls are unchanged.
# Returns the ggplot object.
plot.Metavg.bycontext <- function(min, cov, data = tab4) {
  tablet <- filter(data, Counts >= min, Coverage >= cov)

  ggplot(tablet, aes(Sample, mC)) +
    geom_jitter(alpha = 0.5, color = "grey", size = 0.4) +
    # Outliers are hidden because every value is already drawn by the jitter layer.
    geom_boxplot(aes(fill = Context), outlier.alpha = 0, alpha = 0.5, show.legend = FALSE) +
    theme(axis.text.x = element_text(color = 'black', size = 20, angle = 90, vjust = 0.6),
          axis.text.y = element_text(color = 'black', size = 10),
          axis.title.y = element_text(color = 'black', size = 15),
          panel.grid.minor = element_line(colour = 'grey'), panel.grid.major.x = element_blank(),
          panel.grid.major.y = element_line(colour = 'lightgrey'),
          panel.background = element_rect(fill = 'white', colour = 'black'),
          strip.background = element_rect(fill = 'white', colour = 'black'),
          axis.ticks.x = element_blank(), axis.title.x = element_blank(),
          legend.key = element_blank(), strip.text = element_text(size = 20)) +
    # The former `size = 10` argument was dropped: inside labs() it only names
    # the legend of a size aesthetic, and no layer maps one.
    labs(title = paste("Average methylation levels at peaks with at least", min, "cytosines covered", cov, "times or more"),
         y = "Methylation level (%)") +
    scale_fill_manual(values = wes_palette(n = 3, name = "Moonrise2")) +
    facet_grid(~Context)
}

plot.Metavg.bycontext(0,0)
