library(dplyr)
library(ggplot2)
library(XML)
library(TCGAbiolinks)
library(viridis)
library(DESeq2)
# Summary helper for stat_summary(fun.data = ...): returns the position at
# which to draw a text label (the mean of x) and the label itself (the number
# of values in x).
give.n <- function(x) {
  n_values <- length(x)
  centre <- mean(x)
  c(y = centre, label = n_values)
}

# Load the TCGA TF activation data (a list with one element per sample).
# NOTE: setwd() changes the working directory for the rest of the session;
# it is kept because the relative path below depends on it.
setwd("~/BRCA_TCGA/TF_each_sample/")
tcga_data <- readRDS(file = "../TF_df_all.RDS")

# Load the TCGA object and keep only its per-sample annotation (colData).
tcga_coldata <- readRDS(file = "~/BRCA_TCGA/tcga_data.RDS")
tcga_coldata <- as.data.frame(SummarizedExperiment::colData(tcga_coldata))

# Parse the morphology lookup table, saved locally from
# https://p53.iarc.fr/Morphology.aspx, and stack every HTML table it contains.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
questions <- readHTMLTable(
  doc = "~/BRCA_TCGA/morph_table.html",
  trim = TRUE,
  as.data.frame = TRUE,
  header = TRUE
)
morpho <- bind_rows(questions)


# Flatten the per-sample list into one long table. Each element receives a
# `samples` column and the list names are also passed to bind_rows() as the id.
samples <- names(tcga_data)
tcga_tfs <- lapply(samples, function(sample_id) {
  data.frame(tcga_data[[sample_id]], samples = sample_id)
})
names(tcga_tfs) <- names(tcga_data)
tcga_df <- bind_rows(tcga_tfs, .id = "samples")

# Evangelia advises against a cut-off; the author is inclined to disagree.
# No cut-off is used to generate figure 6A; FDR < 0.05 is used for figure 6B.
passes_fdr <- tcga_df$FDR < 0.05
tcga_df <- tcga_df[passes_fdr, ]

# Split the Rap1 module (Rap1_Mod) rows from every other regulon.
is_rap1_mod <- tcga_df$Regulon == "Rap1_Mod"
rap <- tcga_df[is_rap1_mod, ]
TFs <- tcga_df[!is_rap1_mod, ]

# Pair each sample's Rap1 row (.x columns) with its other regulons (.y columns).
cor_plot <- merge(rap, TFs, by = "samples")

# Attach sample annotation, then the morphology table.
plot_with_info <- merge(rap, tcga_coldata, by.x = "samples", by.y = "barcode")
plot_with_info <- merge(
  plot_with_info, morpho,
  by.x = "morphology", by.y = "Morpho_code"
)
names(plot_with_info)[names(plot_with_info) == "NES"] <- "RAP1_GEM_NES"

# Distinct (NES, morphology) pairs, restricted to morphologies that occur more
# than 10 times.
test <- unique(data.frame(
  NES = plot_with_info$RAP1_GEM_NES,
  variable = plot_with_info$Morphology
))
morphology_counts <- table(test$variable)
frequent_morphologies <-
  names(morphology_counts)[as.logical(morphology_counts > 10)]
test <- test[test$variable %in% frequent_morphologies, ]
# Plot: horizontal boxplots of the Rap1 module NES for each morphology
# (ordered by NES), with jittered samples, the group size printed at the group
# mean (give.n) and the mean marked by a red point.
# theme_ipsum() is namespace-qualified because hrbrthemes is only attached
# further down the script.
test %>%
  ggplot(aes(x = NES, y = reorder(variable, NES), fill = variable)) +
  geom_boxplot(outlier.shape = NA) +
  scale_fill_viridis(discrete = TRUE, alpha = 0.6) +
  geom_jitter(color = "black", size = 0.4, alpha = 0.9) +
  hrbrthemes::theme_ipsum() +
  theme(
    legend.position = "none",
    plot.title = element_text(size = 11)
  ) +
  ggtitle("Rap1 Gene expression module by clinical morphology") +
  xlab("") +
  stat_summary(
    fun.data = give.n, geom = "text",
    position = position_dodge(width = .75)
  ) +
  # `fun` replaces the deprecated `fun.y` argument of stat_summary().
  stat_summary(
    fun = mean, geom = "point",
    shape = 20, size = 5, color = "red", fill = "red"
  )
# Compare the Rap1 module NES of the "Infiltrating duct carcinoma" samples
# (x) with that of the "Metaplastic carcinoma" samples (y).
# First a two-sample Kolmogorov-Smirnov test with a one-sided alternative
# ("greater"), then a two-sample t-test with t.test() defaults.
ks.test(x = test$NES[test$variable == "Infiltrating duct carcinoma, NOS (C50._)"],
       y = test$NES[test$variable == "Metaplastic carcinoma, NOS"], 
       alternative = "greater")
t.test(x = test$NES[test$variable == "Infiltrating duct carcinoma, NOS (C50._)"],
        y = test$NES[test$variable == "Metaplastic carcinoma, NOS"])



# Vertical boxplot of NES per morphology with the individual samples overlaid
# as centred dots. `trim` is a geom_violin() argument that geom_boxplot() does
# not accept (it is ignored with a warning), so it is not passed here.
p <- ggplot(test, aes(x = variable, y = NES)) +
  geom_boxplot()

p + geom_dotplot(binaxis = "y", stackdir = "center", dotsize = 0.5)

# Overlaid density curves of NES, one per morphology.
ggplot(test, aes(x = NES, color = variable)) +
  geom_density()
library(hrbrthemes)
# Stacked density plot:
p <- ggplot(data = test, aes(x = NES, group = variable, fill = variable)) +
  geom_density(adjust = 1.5, position = "fill") +
  theme_ipsum()
p
# Regulons of interest.
TFs_from_network <- c(
  "KLF5", "ESRRA", "JARID2", "SMAD3", "E2F4",
  "SMAD4", "TP53", "RUNX2", "TCF7", "NFATC1"
)
TFs_from_figure <- c("NFKB1", "RELA", "TEAD1", "JARID2", "RUNX2", "RELB")

# Scatter plots with a linear fit, one facet per regulon in TFs_from_network:
# NES.x against NES.y from cor_plot, coloured by p.value.x. The regulon set is
# taken from TFs_from_network instead of repeating the literal vector.
p <- ggplot(
  data = cor_plot[cor_plot$Regulon.y %in% TFs_from_network, ],
  aes(x = NES.x, y = NES.y, colour = p.value.x)
) +
  geom_point(stat = "identity") +
  geom_smooth(method = lm, color = "red", fill = "#69b3a2", se = TRUE) +
  facet_wrap(~ Regulon.y, ncol = 2) +
  theme_ipsum() +
  theme(
    plot.title = element_text(size = 11)
  )
p


# Kendall correlation between NES.x and NES.y for every regulon in cor_plot.
# Regulons with two or fewer paired rows are skipped. Results are collected in
# two lists keyed by regulon name.
TF_pvalue <- list()
TF_estimate <- list()
for (TF in unique(cor_plot$Regulon.y)) {
  tf_rows <- cor_plot[cor_plot$Regulon.y == TF, ]
  if (length(tf_rows$NES.x) <= 2) {
    next
  }
  print(TF)
  result_sp <- cor.test(tf_rows$NES.x, tf_rows$NES.y, method = "kendall")
  # Single-bracket assignment stores the bare value, so unlist() later yields
  # names equal to the regulon names.
  TF_pvalue[TF] <- result_sp$p.value
  TF_estimate[TF] <- result_sp$estimate
}

# p value adjustment ("fdr" method), then report regulons below 1e-05.
TF_pvalue_adj <- p.adjust(TF_pvalue, method = "fdr")
is_significant <- TF_pvalue_adj < 1e-05
sig_tfs <- names(unlist(TF_estimate[is_significant]))
print(sig_tfs)
# NOTE(review): this replaces the adjusted p-values with the unadjusted list;
# confirm that this is intended.
TF_pvalue_adj <- TF_pvalue

# Volcano-style plot: Kendall estimate (x) against -log2 p-value (y), one
# point per regulon.
vol_plot <- data.frame(unlist(TF_estimate), unlist(TF_pvalue))
#vol_plot$unlist.TF_pvalue.<-p.adjust(vol_plot$unlist.TF_pvalue.)
# p.adjust() is called without `method`, so its default method applies.
vol_plot$p_val_adj <- p.adjust(vol_plot$unlist.TF_pvalue.)
# 0.0001 is added before taking the log so that a p-value of 0 stays finite.
vol_plot$p_val_adj <- -log2(vol_plot$p_val_adj + 0.0001)
vol_plot$unlist.TF_pvalue. <- -log2(vol_plot$unlist.TF_pvalue. + 0.0001)
vol_plot$names <- rownames(vol_plot)
library(ggrepel)
ggplot(vol_plot, aes(x = unlist.TF_estimate., y = unlist.TF_pvalue.)) +
  geom_point(size = 2, shape = 23) +
  geom_label_repel(
    # Label only regulons with estimate below -0.5 and -log2 p above 10.
    # The space in `< -0.5` is required: `x<-0.5` parses as an assignment,
    # which silently dropped the estimate condition.
    data = vol_plot %>%
      filter(unlist.TF_estimate. < -0.5 & unlist.TF_pvalue. > 10),
    aes(label = names), max.overlaps = 50
  ) +
  theme_ipsum() +
  theme(
    legend.position = "none",
    plot.title = element_text(size = 11)
  )
# NOTE(review): `subset_cor_plot` is not defined anywhere in the visible part
# of this script; confirm where it is created before running this line.
unique(subset_cor_plot$samples[subset_cor_plot$NES.x > 0])
