#compare_encode_annotations.R

library(tidyverse)
library(ggplot2)
library(dplyr)
library(tidyr)
library(data.table)
library(tibble)
library(ggpubr)

# Shared plot settings: the two fill colours handed to scale_fill_manual() and
# the upper y-axis limit applied to every panel (also the height at which the
# p-value label is drawn).
colors <- c("#E69F00", "#666699")
ymax <- 0.04

# Placeholder -- edit before running. Every relative path used below is
# resolved against this directory.
setwd("PATH/TO/WORKING/DIR")
##########################################################################################
#import the table of DNMs
##########################################################################################
# Lifted-over coordinates of the DNMs (mm39 -> mm10, judging by the file
# name). The file is read without a header row and must have exactly three
# columns, which are named here.
dnms_liftover <- read.delim(
  file = "mm39_mm10_liftover_finalDNMs.bed",
  header = FALSE
)
colnames(dnms_liftover) <- c("mm10_chr", "mm10_start", "mm10_end")

# Table of DNM calls (read with its header row).
dnms <- read.delim(file = "MpileupDeepvariant_RepeatRemoved_35bpRemoved_BlaclListGenes_Homopolymer_ART_NAT_FINAL.tab")

# cbind() pairs the two tables purely by row position and may recycle a
# shorter table, so a row-count mismatch would attach coordinates to the wrong
# DNMs without any error. Fail loudly instead.
# NOTE(review): this only checks the row count; both files are assumed to list
# the DNMs in the same order -- confirm against the liftover step.
stopifnot(nrow(dnms) == nrow(dnms_liftover))
dnms <- cbind(dnms, dnms_liftover)

# Sanity checks: table dimensions and the number of distinct `pos` values.
dim(dnms)
length(unique(dnms$pos))

##########################################################################################
#add on encode annotations
##########################################################################################
# Add one 0/1 column per ENCODE mark to `dnms`: a DNM gets 1 when its
# (mm10_chr, mm10_start) pair is listed in that mark's BED file, else 0.

# Remember the columns present before annotation so that the NA -> 0 step
# below touches exactly the columns added by the loop.
cols_before <- colnames(dnms)

# `pattern` is a regular expression: escape the dot and anchor the extension
# so only files ending in ".bed" are picked up.
for (bed in list.files(path = "encode", pattern = "\\.bed$", full.names = TRUE)) {
  # The second dot-separated field of the file name is used as the mark name
  # (i.e. names are expected to look like <prefix>.<mark>.bed).
  mark <- unlist(strsplit(basename(bed), split = "\\."))[2]
  print(mark)
  # read.delim() errors on a zero-byte file, so empty files are skipped and
  # contribute no column.
  if (file.info(bed)$size > 0) {
    bedfile <- read.delim(bed, header = FALSE)
    # Only chromosome and start (the first two columns) are used as the join
    # key; any further columns are ignored. De-duplicating the key guarantees
    # the left join cannot multiply DNM rows.
    bedfile <- unique(bedfile[, 1:2])
    colnames(bedfile) <- c("mm10_chr", "mm10_start")
    bedfile[[mark]] <- rep(1, nrow(bedfile))
    dnms <- left_join(dnms, bedfile, by = c("mm10_chr", "mm10_start"))
  }
}

#View(dnms)

# DNMs with no match in a BED file come out of the join as NA; convert to 0.
encode_marks <- setdiff(colnames(dnms), cols_before)
dnms <- dnms %>%
  mutate(across(all_of(encode_marks), ~ replace_na(.x, 0)))
dim(dnms)

##########################################################################################
#perform stat comparisons and plotting
##########################################################################################
# For every mark x cohort combination: number of DNMs flagged as overlapping
# (overlap_ct), number of DNMs in the cohort (N) and their ratio (proportion).
fract_overlap <- dnms %>%
  # Select the mark columns by name (same range as the NA -> 0 step) rather
  # than by hard-coded position, so the result does not depend on how many
  # columns the input DNM table happens to have.
  pivot_longer(
    cols = CTCF:H3K9me3,
    names_to = "mark",
    values_to = "overlap"
  ) %>%
  group_by(mark, cohort) %>%
  # Drop the grouping so later steps work on a plain, ungrouped table.
  summarize(overlap_ct = sum(overlap), N = n(), .groups = "drop") %>%
  mutate(proportion = overlap_ct / N)

fract_overlap

# Marks that are tested and plotted, in panel order.
marks_to_plot <- c(
  "CTCF",
  "H3K27ac",  # active enhancer mark
  "H3K36me3",
  "H3K4me1",  # required for enhancers to activate transcription -- "poised"
  "H3K4me3",  # promoters
  "H3K9ac",   # active promoters
  "H3K9me3"
)

# Compare the proportion of DNMs overlapping one mark between cohorts and draw
# the per-cohort proportions as a bar plot annotated with the test p-value.
#
# Arguments:
#   mark_name    Name of the 0/1 overlap column in `dnms_tbl`; also the value
#                looked up in `overlap_tbl$mark` and used as the plot title.
#   dnms_tbl     DNM table with a `cohort` column and the overlap column.
#   overlap_tbl  Summary table with `mark`, `cohort` and `proportion` columns.
#   fill_colors  Fill colours passed to scale_fill_manual().
#   y_max        Upper y-axis limit; the p-value label is drawn at this height.
#
# Returns a ggplot object. The full prop.test() result is printed as a side
# effect so it still shows up in the console / rendered report.
plot_mark_overlap <- function(mark_name, dnms_tbl, overlap_tbl,
                              fill_colors = colors, y_max = ymax) {
  # Successes / failures per cohort, with cohorts as row names: the
  # two-column matrix form accepted by prop.test().
  counts <- dnms_tbl %>%
    group_by(cohort) %>%
    summarize(success = sum(.data[[mark_name]]), total = n()) %>%
    mutate(failure = total - success) %>%
    select(-total) %>%
    column_to_rownames("cohort")

  cat("\n", mark_name, "\n", sep = "")
  prop_test <- prop.test(x = as.matrix(counts))
  print(prop_test)

  # round(p, 3) would print "p = 0" for small p-values; report those as an
  # upper bound instead. The NA guard keeps an undefined p-value (e.g. no
  # overlaps in any cohort) from raising an error in `if`.
  p_val <- as.numeric(prop_test$p.value)
  p_label <- if (!is.na(p_val) && p_val < 0.001) {
    "p < 0.001"
  } else {
    paste0("p = ", round(p_val, 3))
  }

  # NOTE: ylim() removes bars taller than y_max (with a warning) rather than
  # clipping them; raise `ymax` if a proportion exceeds it.
  overlap_tbl %>%
    filter(mark == mark_name) %>%
    ggplot(mapping = aes(x = factor(cohort), y = proportion, fill = cohort)) +
    geom_bar(stat = "identity") +
    scale_fill_manual(values = fill_colors) +
    theme_classic(base_size = 14) +
    ylab("Proportion") +
    xlab("") +
    ylim(c(0, y_max)) +
    theme(legend.position = "none") +
    annotate(geom = "text", x = 1, y = y_max, label = p_label) +
    ggtitle(mark_name)
}

# One panel per mark. Keeping the plots in a list avoids single-letter plot
# variables (one of which, `c`, shadowed base::c).
# NOTE: the p-values are per-mark and not corrected for multiple testing.
mark_plots <- lapply(
  marks_to_plot,
  plot_mark_overlap,
  dnms_tbl = dnms,
  overlap_tbl = fract_overlap
)

combined_plot <- ggarrange(
  plotlist = mark_plots,
  ncol = 4, nrow = 2,
  labels = "AUTO"
)
combined_plot

# Pass the figure explicitly instead of relying on ggsave()'s last_plot()
# default, which depends on which plot happened to be created/printed last.
ggsave(
  filename = "encode/encode_proportionPlots_withP_facet.pdf",
  plot = combined_plot,
  height = 6, width = 10
)

#rmarkdown::render("compare_encode_annotations.R")

