#compare_gene_expression.R

#compare expression of genes within 2.5 kb of a de novo mutation (DNM) in the IVF and natural cohorts

library(ggplot2)
library(dplyr)
library(data.table)
library(RColorBrewer)
library(ggpubr)
library(tidyr)
library(tibble)

# Two fill colours shared by every plot in this script.
colors <- c("#E69F00", "#666699")

# All relative paths below are resolved against this directory.
setwd("/PATH/TO/WORKING/DIRECTORY")

# De novo mutation calls; later code reads the `chr`, `pos` and `class` columns.
# TRUE is spelled out because the shorthand `T` is an ordinary, reassignable
# variable.
dnm <- read.table("calls_withFlankGC.tsv", sep = "\t", header = TRUE)

# Per-gene expression table (gene_id, transcript_id(s), TPM and FPKM are used
# below) and the gene annotation table it is joined against ("Gene stable ID",
# chromosome name, gene start and gene end columns are used below).
ge <- fread("gene_expression/Bruce4ES-RNAseq-geneQuant.tsv", header = TRUE)
genes <- fread("gene_expression/gene_names.txt", header = TRUE)

################################################################################
#Reformat data frames
################################################################################

# Keep only the rows whose gene_id contains "ENS".
ge <- ge %>% filter(grepl("ENS", gene_id))

# Derive the join key by dropping everything from the first "." onwards
# ("<id>.<suffix>" -> "<id>"). sub() works element-wise, so it always yields
# exactly one label per row; IDs without a "." are kept unchanged.
ge <- ge %>%
  mutate(geneLabel = sub("\\..*$", "", gene_id)) %>%
  select(gene_id, geneLabel, `transcript_id(s)`, TPM, FPKM)

# Attach the annotation columns to every expression row, then keep the first
# row per geneLabel so each gene appears once even if the annotation table
# lists it several times.
data <- ge %>%
  left_join(genes, by = c("geneLabel" = "Gene stable ID")) %>%
  distinct(geneLabel, .keep_all = TRUE)

################################################################################
#Find DNMs that overlap or are in close proximity to expressed genes
################################################################################

# One TPM value per DNM; stays 0 when no gene lies within the window.
# NOTE(review): 0 therefore means both "no gene nearby" and "nearby gene with
# TPM 0" -- confirm that downstream tests are meant to pool the two.
ge_tpm <- numeric(nrow(dnm))

# seq_len() instead of 1:n so that an empty DNM table skips the loop rather
# than iterating over c(1, 0).
for (row in seq_len(nrow(dnm))) {
  # Genes on the same chromosome whose span, widened by 2500 bp on each side,
  # strictly contains the DNM position.
  overlap <- data %>%
    filter(
      `Chromosome/scaffold name` == dnm$chr[row],
      `Gene start (bp)` - 2500 < dnm$pos[row],
      `Gene end (bp)` + 2500 > dnm$pos[row]
    )

  if (nrow(overlap) == 1) {
    ge_tpm[row] <- overlap$TPM[1]
  } else if (nrow(overlap) > 1) {
    # Several candidate genes: report the row index and keep the highest TPM.
    print(row)
    ge_tpm[row] <- max(overlap$TPM)
  }
}

# Despite its name, mean_expression holds the single / maximum TPM chosen
# above. The cohort label "IVF" is rewritten to "ART".
dnm <- dnm %>%
  mutate(
    mean_expression = ge_tpm,
    class = sub("IVF", "ART", class)
  )


# Box plot of nearby-gene TPM per cohort, restricted to DNMs whose value is
# below 50.
ggplot(
  data = filter(dnm, mean_expression < 50),
  mapping = aes(x = class, y = mean_expression, fill = class)
) +
  geom_boxplot() +
  scale_fill_manual(values = colors) +
  theme_classic(base_size = 14) +
  theme(legend.position = "none")

# Split by cohort and compare the TPM distributions (all DNMs, unfiltered).
natural <- filter(dnm, class == "Natural")
ivf <- filter(dnm, class == "ART")

wilcox.test(x = natural$mean_expression, y = ivf$mean_expression)

#consider only genes that are expressed
dnm_expressedOnly <- filter(dnm, mean_expression > 0)
natural_expressedOnly <- dnm_expressedOnly %>% filter(class == "Natural")
ivf_expressedOnly <- dnm_expressedOnly %>% filter(class == "ART")

wilcox.test(natural_expressedOnly$mean_expression, ivf_expressedOnly$mean_expression)

# Panel of TPM per cohort for DNMs near expressed genes.
# The y axis is zoomed with coord_cartesian() rather than ylim(): ylim() sets
# scale limits, which discard values above 150 before the box statistics and
# the Wilcoxon test in stat_compare_means() are computed. Zooming keeps every
# observation in those computations, so the p-value on the figure matches the
# wilcox.test() call above.
a <-
  ggplot(
    dnm_expressedOnly,
    mapping = aes(x = class, y = mean_expression, fill = class)
  ) +
  geom_boxplot() +
  scale_fill_manual(values = colors) +
  theme_classic(base_size = 14) +
  theme(legend.position = "none") +
  ylab("TPM") +
  xlab("") +
  coord_cartesian(ylim = c(0, 150)) +
  # label.y pins the p-value inside the zoomed range; the default position is
  # derived from the data and could fall above the visible area.
  stat_compare_means(
    method = "wilcox.test",
    label = "p.format",
    label.y = 140
  )

################################################################################
#compare fraction of genes that are expressed between the two cohorts
################################################################################

# 0/1 indicator per DNM: 1 when a gene within the window has non-zero TPM.
# DNMs with no gene within the window stay 0.
# NOTE(review): presumably equivalent to `mean_expression > 0` as long as TPM
# is never negative -- confirm before replacing this second pass.
ge_tpm <- numeric(nrow(dnm))

# seq_len() instead of 1:n so that an empty DNM table skips the loop rather
# than iterating over c(1, 0).
for (row in seq_len(nrow(dnm))) {
  # Genes on the same chromosome whose span, widened by 2500 bp on each side,
  # strictly contains the DNM position.
  overlap <- data %>%
    filter(
      `Chromosome/scaffold name` == dnm$chr[row],
      `Gene start (bp)` - 2500 < dnm$pos[row],
      `Gene end (bp)` + 2500 > dnm$pos[row]
    )

  if (nrow(overlap) == 1) {
    ge_tpm[row] <- as.numeric(overlap$TPM[1] != 0)
  } else if (nrow(overlap) > 1) {
    # Several candidate genes: expressed if their TPMs sum to more than zero.
    ge_tpm[row] <- as.numeric(sum(overlap$TPM) > 0)
  }
}

dnm <- dnm %>% mutate(expressed = ge_tpm)

# Per-cohort share of DNMs flagged as expressed ("Active") and its complement
# ("Silent"), reshaped to one row per cohort/state for plotting.
dnm_fract <- dnm %>%
  group_by(class) %>%
  summarize(Active = sum(expressed) / n()) %>%
  mutate(Silent = 1 - Active) %>%
  pivot_longer(
    cols = c(Active, Silent),
    names_to = "state",
    values_to = "proportion"
  )

# Stacked bar per cohort showing the Active / Silent split.
b <- ggplot(dnm_fract, mapping = aes(x = class, y = proportion, fill = state)) +
  geom_bar(stat = "identity", position = "fill") +
  scale_fill_manual(values = colors) +
  theme_classic(base_size = 14) +
  ylab("Proportion") +
  xlab("")


# Re-split by cohort now that dnm carries the `expressed` indicator.
natural <- filter(dnm, class == "Natural")
ivf <- filter(dnm, class == "ART")

# Fraction of DNMs flagged as expressed in each cohort.
sum(natural$expressed) / nrow(natural)
sum(ivf$expressed) / nrow(ivf)

#proportion test:
# Two-column (success, failure) count table with one row per cohort.
counts <- dnm %>%
  group_by(class) %>%
  summarize(success = sum(expressed), failure = n() - sum(expressed)) %>%
  column_to_rownames("class")

prop.test(x = as.matrix(counts))

# Wilcoxon test on the 0/1 indicators; its p-value is the one written onto
# the proportion panel.
wt <- wilcox.test(natural$expressed, ivf$expressed)
wt

b <- b +
  annotate(
    geom = "text",
    x = 1,
    y = 1.05,
    label = paste0("p = ", round(wt$p.value, 3))
  )


#############################################################################
#supplementary figure for paper
#############################################################################

# Two-panel figure: proportion panel (b) on the left, TPM panel (a) on the
# right, auto-labelled A and B.
fig <- ggarrange(b, a, ncol = 2, nrow = 1, labels = "AUTO")
fig

# Pass the arranged figure explicitly instead of relying on ggsave()'s
# implicit last_plot() default, so the saved file is always this figure.
ggsave(
  filename = "gene_expression/DNM_transcriptionLevel_compare_2panel.pdf",
  plot = fig,
  height = 4,
  width = 8
)

#rmarkdown::render("compare_gene_expression.R")
