library(dplyr)
library(ggplot2)
library(XML)
library(viridis)
# library
library(ggridges)
library(ggplot2)
library(forcats)
library(hrbrthemes)
library(DESeq2)
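# NOTE: the diamonds dataset ships with ggplot2 (not base R); it appears unused
# in the visible part of this script (possibly left over from a plot template).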

# Summary helper for stat_summary(fun.data = ...).
# Returns a named numeric vector: `y` is the mean of `x` and `label` is the
# number of values in `x`.
give.n <- function(x) {
  centre <- mean(x)
  n_obs <- length(x)
  c(y = centre, label = n_obs)
}

# Load the TCGA TF activation data.
# NOTE(review): setwd() is kept because the relative path below (and possibly
# code outside this section) depends on it; prefer absolute paths in new code.
setwd("~/BRCA_TCGA/TF_each_sample/")
tcga_data <- readRDS(file = "../TF_df_all.RDS")

# Load the sample-level object and keep its colData as a plain data frame.
tcga_coldata <- readRDS(file = "~/BRCA_TCGA/tcga_data.RDS")
tcga_coldata <- as.data.frame(colData(tcga_coldata))

# Morphology lookup table, saved from https://p53.iarc.fr/Morphology.aspx
# TRUE is spelled out: the shorthand `T` is an ordinary variable that can be
# reassigned.
questions <- readHTMLTable(
  doc = "~/BRCA_TCGA/morph_table.html",
  trim = TRUE,
  as.data.frame = TRUE,
  header = TRUE
)
# Stack whatever tables were parsed from the page into one data frame.
morpho <- bind_rows(questions)


# Flatten tcga_data (a list with one element per sample) into one long data
# frame, tagging every row with the name of the sample it came from.
samples = names(tcga_data)
tcga_tfs<-lapply(samples,function(x){ data.frame(tcga_data[[x]], samples = x)})
names(tcga_tfs)<- names(tcga_data)
# NOTE(review): each element already carries a `samples` column (added above)
# and .id = "samples" asks for a column of the same name built from the list
# names. Both come from names(tcga_data); confirm how bind_rows() resolves the
# clash and drop one of the two.
tcga_df<-bind_rows(tcga_tfs, .id = "samples")
# Evangelia advises against applying a cut-off; the author disagrees and keeps
# only rows with an unadjusted p-value below 0.1. Adjusted p-values are not
# used because only specific TFs are examined, not all of them.
# NOTE(review): rows whose p.value is NA come through this subset as all-NA rows.
tcga_df<-tcga_df[tcga_df$p.value<0.1,]
# Split the rows into the Rap1_Mod regulon and every other regulon.
rap<-tcga_df[tcga_df$Regulon == "Rap1_Mod",]
TFs<-tcga_df[tcga_df$Regulon != "Rap1_Mod",]
# Pair each Rap1_Mod row with the other regulon rows of the same sample;
# columns shared by both inputs get the merge() suffixes .x (rap) and .y (TFs).
cor_plot<-merge(x=rap, y=TFs, by.x="samples", by.y="samples")
# Keep a hand-picked set of regulons.
# NOTE(review): the trailing "" also matches rows with an empty regulon name;
# confirm whether that entry is intentional.
subset_cor_plot<-cor_plot[cor_plot$Regulon.y %in% c("SMAD2", "TEAD1", "NFKBIA", "JUN", "NFKB1", "RELB", ""),]

# Attach sample annotation (samples == barcode), then the morphology lookup
# (morphology == Morpho_code). merge() keeps only matching rows by default.
plot_with_info<-merge(x = rap, y = tcga_coldata, by.x = "samples", by.y = "barcode")
plot_with_info<-merge(x = plot_with_info, y = morpho, by.x = "morphology", by.y = "Morpho_code")
colnames(plot_with_info)[colnames(plot_with_info) == "NES"]<-"RAP1_GEM_NES"
# Two-column plotting table of unique (NES, morphology) pairs.
test<-unique(data.frame(plot_with_info$RAP1_GEM_NES, plot_with_info$Morphology))
colnames(test)<-c("NES", "variable")
# Keep only morphologies that occur in more than 5 rows.
test<- test[test$variable %in% names(table(test$variable)[as.logical(table(test$variable) > 5 )]),]
# Plot: horizontal box plots of the Rap1 module NES per morphology, ordered by
# NES (reorder() summarises with the mean by default), with the individual
# points jittered on top.
test %>%
  ggplot(aes(x = NES, y = reorder(variable, NES), fill = variable)) +
  geom_boxplot() +
  scale_fill_viridis(discrete = TRUE, alpha = 0.6) +
  geom_jitter(color = "black", size = 0.4, alpha = 0.9) +
  theme_ipsum() +
  theme(
    legend.position = "none",
    plot.title = element_text(size = 11)
  ) +
  ggtitle("Rap1 Gene expression module by clinical morphology") +
  xlab("") +
  # Text label per group: give.n() supplies the group mean as the position and
  # the group size as the label.
  stat_summary(
    fun.data = give.n,
    geom = "text",
    position = position_dodge(width = .75)
  ) +
  # Red dot at the group mean. `fun` replaces `fun.y`, which has been
  # deprecated since ggplot2 3.3.0 and triggers a warning on current versions.
  stat_summary(
    fun = mean,
    geom = "point",
    shape = 20,
    size = 5,
    color = "red",
    fill = "red"
  )

# Two-sample Kolmogorov-Smirnov test comparing the NES distribution of lobular
# carcinoma (x) with that of metaplastic carcinoma (y).
# NOTE(review): for ks.test(), alternative = "greater" means the CDF of x lies
# above that of y, i.e. x tends to take SMALLER values - confirm this is the
# intended direction.
ks.test(x = test$NES[test$variable == "Lobular carcinoma, NOS (C50._)"],
        y = test$NES[test$variable == "Metaplastic carcinoma, NOS"], 
        alternative = "greater")
# Two-sample t-test with the default arguments (two-sided, Welch) comparing
# the NES of mixed infiltrating duct carcinoma with mucinous adenocarcinoma.
  t.test(x = test$NES[test$variable == "Infiltrating duct mixed with other types of carcinoma (C50._)"],
         y = test$NES[test$variable == "Mucinous adenocarcinoma"])
  
  
#head(diamonds)

# Ridgeline view of the same data: one NES density per morphology, ordered by
# NES via fct_reorder(), with the legend hidden.
test %>%
  ggplot(aes(x = NES, y = fct_reorder(variable, NES), fill = variable)) +
  geom_density_ridges() +
  theme_ridges() +
  theme(legend.position = "none")

# Chi-square goodness-of-fit test: do the per-morphology counts in sig_table
# follow the morphology proportions observed in `test`?
# NOTE(review): sig_table is not defined in the visible part of this script. It
# has to exist already with a Var1 key and a Freq column; merge() then yields
# Freq.x (from sig_table) and Freq.y (from non_sig_table).
non_sig_table <- data.frame(table(test$variable))
together_table <- merge(x = sig_table, y = non_sig_table, by = "Var1")

# Count vectors named by morphology.
list_sig <- setNames(together_table$Freq.x, together_table$Var1)
list_nonsig <- setNames(together_table$Freq.y, together_table$Var1)

# Expected probabilities are the non_sig_table counts rescaled to sum to 1.
print(chisq.test(x = list_sig, p = list_nonsig / sum(list_nonsig)))

library(ggplot2)
# Single stacked bar of the Freq.y counts, one coloured segment per Var1 level.
# geom_col() is the stat = "identity" form of geom_bar().
bp <- together_table %>%
  ggplot(aes(x = "", y = Freq.y, fill = Var1)) +
  geom_col(width = 1)
bp
