#!/usr/bin/Rscript

# AUTHOR: Charlie Barker using function by Ioannis Kamzolas
#
# Uses library DOROTHEA to find the enrichments of regulons from our differential expression data.
#
# INPUT:
#          DEGS.txt,                file summarising differentially expressed genes identified by deSEQ.R
#          human_network.csv        TF human regulons taken from https://saezlab.github.io/dorothea/
#
# OUTPUT:
#
#   TF.csv (out_path)               file describing significantly differentially regulated TFs.

library(dorothea)
library(org.Hs.eg.db)
library(readr)
library(ggplot2)
# Working directory of the analysis: the input .csv files are listed from here
# and the DOROTHEA results are written to its "dorot/" subfolder.
path<-"~/BRCA_TCGA/morp_vs_WT/"
# Make the relative paths used further down ("./", "./dorot/") resolve against `path`.
setwd(path)

#code kindly sent by Giannis Kamzolas

#  ---TF Activities---
# Inputs:
# 1) -Mouse regulon file (already created mouse network - we don't need to create it again and it's the same for the analysis of all the models)
#    -Human regulon file ------------//---------------------------//---------------------------//---------------------------//------------------
# 2) D.E. Signature (file from DESeq2 (with gene names in the last column))


library(viper)
#install.packages("purrr")
library("purrr")
#install.packages("dplyr")
library("dplyr")
library(gprofiler2)


#' Convert a regulon table into the nested list format passed to msviper().
#'
#' @param df Data frame with one row per TF-target interaction and the columns
#'   `tf`, `target`, `mor` and `likelihood`.
#' @return A list named by TF. Each element is a list with `tfmode` (the `mor`
#'   values named by their `target`) and `likelihood`.
df2regulon <- function(df) {
  # split() already names every group by its TF, so no per-group lookup of the
  # TF name is needed.
  regulon <- lapply(split(df, df$tf), function(dat) {
    list(
      tfmode = setNames(dat$mor, dat$target),
      likelihood = dat$likelihood
    )
  })
  return(regulon)
}



#' Estimate TF activities (plus a custom Rap1 regulon) from a DEG table.
#'
#' Reads the human regulon table, turns the differential expression table at
#' `mypath` into a signed z-score signature, runs msviper() on it and writes
#' the resulting activity table to `outfile`.
#'
#' @param mypath Path to a comma-separated DEG file. It must provide the
#'   columns `X` (Ensembl gene IDs), `log2FoldChange` and `padj`.
#' @param organism Only "HUMAN" is supported; any other value is an error.
#' @param outfile Path of the CSV file to write.
#' @param Rap1_Mod Regulon entry (list with `tfmode` and `likelihood`) that is
#'   added to the regulon list under the name "Rap1_Mod".
#' @return The activity table (columns Regulon, Size, NES, p.value, FDR,
#'   ordered by p-value), invisibly.
my_TFA_plus_rap1 <- function(mypath, organism, outfile, Rap1_Mod) #mypath is deseq filename
{
  # Fail early: only the human regulon file is wired in, so any other organism
  # would otherwise die later on a missing `Regulon_file` object.
  if (!identical(organism, "HUMAN")) {
    stop("my_TFA_plus_rap1() only supports organism = \"HUMAN\"", call. = FALSE)
  }

  ### Open the human regulon file
  Regulon_file <- read.csv("~/phenotype_networks/data/DOROTHEA/human_network.csv",
                           header = TRUE)
  ### Keep confidence levels A-D; %in% also drops rows whose confidence is NA
  ### instead of turning them into all-NA rows.
  Regulon_file <- Regulon_file[Regulon_file$confidence %in% c("A", "B", "C", "D"), ]

  # Read the DEGs file
  DEsignature <- read.table(file = mypath, sep = ",", header = TRUE)
  # Map the Ensembl IDs in column X to gene symbols
  ens2symbol <- AnnotationDbi::select(org.Hs.eg.db,
                                      keys = DEsignature$X,
                                      columns = "SYMBOL",
                                      keytype = "ENSEMBL")
  ens2symbol <- as_tibble(ens2symbol)
  DEsignature <- inner_join(DEsignature, ens2symbol, by = c("X" = "ENSEMBL"))

  # Drop genes without an adjusted p-value
  DEsignature <- DEsignature[!(is.na(DEsignature$padj) | DEsignature$padj == ""), ]
  # Exclude probes with unknown or duplicated gene symbol, so that every entry
  # of the signature has a unique, non-missing name (first occurrence is kept).
  has_symbol <- !is.na(DEsignature$SYMBOL) & DEsignature$SYMBOL != ""
  DEsignature <- DEsignature[has_symbol, ]
  DEsignature <- DEsignature[!duplicated(DEsignature$SYMBOL), ]

  # Estimate z-score values for the GES. Check VIPER manual for details:
  # two-sided adjusted p-value -> |z|, signed by the direction of the fold change.
  myStatistics <- matrix(DEsignature$log2FoldChange,
                         dimnames = list(DEsignature$SYMBOL, "log2FC"))
  myPvalue <- matrix(DEsignature$padj,
                     dimnames = list(DEsignature$SYMBOL, "adj.P.Val"))
  mySignature <- (qnorm(myPvalue / 2, lower.tail = FALSE) * sign(myStatistics))[, 1]
  mySignature <- mySignature[order(mySignature, decreasing = TRUE)]

  # Estimate TF activities, with the Rap1 module appended as one more regulon
  reg_rap <- df2regulon(Regulon_file)
  reg_rap$Rap1_Mod <- Rap1_Mod
  mrs <- msviper(ges = mySignature, regulon = reg_rap, ges.filter = FALSE, minsize = 4)

  TF_activities <- data.frame(Regulon = names(mrs$es$nes),
                              Size = mrs$es$size[names(mrs$es$nes)],
                              NES = mrs$es$nes,
                              p.value = mrs$es$p.value,
                              FDR = p.adjust(mrs$es$p.value, method = "fdr"))
  TF_activities <- TF_activities[order(TF_activities$p.value), ]

  # Save results
  write.csv(TF_activities, file = outfile)
  invisible(TF_activities)
}


#############################
#############################
#Viper - Transcription Factor Activities using the DEGs obtained by DESeq2
#############################
#############################

# Build the Rap1 regulon from the WGCNA modules and run the TF-activity
# estimation on every DEG file in the working directory.
source("~/phenotype_networks/scripts/get_wgcna.R")
# get modules from function
wgcnas <- get_wgcna(path_wgcna = "~/phenotype_networks/data/modules/ALLgenesprmodule.tab",
                    path_correlations = "~/phenotype_networks/data/modules/correlations.txt",
                    is.full = FALSE)
# check saddlebrown (Rap1): every member gets a mode of regulation of 1
tfmode <- rep(1, length(wgcnas$MEsaddlebrown))
# make it DOROTHEA readable: name the modes by the module members
names(tfmode) <- wgcnas$MEsaddlebrown
Rap1_Mod <- list()
Rap1_Mod$tfmode <- tfmode
Rap1_Mod$likelihood <- rep(1, length(wgcnas$MEsaddlebrown))
# list.files() takes a regular expression, not a glob: anchor on the extension
# so that only names ending in ".csv" are picked up.
cell_line_samples <- list.files(pattern = "\\.csv$")
# write.csv() does not create missing directories, so make sure the output
# folder exists before the first result is written.
dir.create("./dorot", showWarnings = FALSE)
###### Run DOROTHEA ######
for (cl in cell_line_samples) {
  file.name <- cl
  # read expression file from the working directory, write result to ./dorot/
  in.path <- paste0("./", file.name)
  out.path <- paste0("./dorot/", file.name)
  if (file.exists(in.path)) {
    # run dorothea
    my_TFA_plus_rap1(in.path, "HUMAN", out.path, Rap1_Mod)
  }
}

###### Analyse ######

# Stack every result table found under ./dorot/ into one data frame; the
# column C.L. records which input file each row came from.
complete.df <- data.frame()
for (cl in cell_line_samples) {
  dor.path <- paste0("./dorot/", cl)
  if (!file.exists(dor.path)) {
    next
  }
  dor.df <- data.frame(read.csv(file = dor.path), C.L. = cl)
  complete.df <- rbind(complete.df, dor.df)
}
# Strip the last four characters (the ".csv" extension) from the file names
complete.df$C.L. <- substr(complete.df$C.L.,
                           start = 1,
                           stop = nchar(complete.df$C.L.) - 4)
# Keep regulons with FDR below 0.1 and discard the "Intercept" table
complete.df <- complete.df[complete.df$FDR < 0.1, ]
complete.df <- complete.df[complete.df$C.L. != "Intercept", ]
##### Present #####
# Rows of the Rap1 module regulon
Rap1_activaiton <- complete.df[complete.df$X == "Rap1_Mod", ]

# Pair every regulon's NES (suffix .x) with the Rap1 module NES (suffix .y)
# of the same sample. All regulons are used; a hand-picked subset would be:
# complete.df[complete.df$X %in% c("ZNF584", "TEAD1", "RELA", "ARNT", "NFKB1", "REL", "RELB", "NFKBIA"),]
relevant.df <- complete.df
rap1_df <- complete.df[complete.df$X == "Rap1_Mod", ]
end_df <- merge(x = relevant.df, y = rap1_df, by.x = "C.L.", by.y = "C.L.")
end_df <- unique(end_df)

# Kendall correlation between each regulon's NES and the Rap1 module NES,
# computed only for regulons observed in more than one sample.
TF_pvalue <- list()
TF_estimate <- list()
for (TF in unique(end_df$X.x)) {
  print(TF)
  tf_rows <- end_df[end_df$X.x == TF, ]
  if (length(tf_rows$NES.x) > 1) {
    result_sp <- cor.test(tf_rows$NES.x, tf_rows$NES.y, method = "kendall")
    TF_pvalue[TF] <- result_sp$p.value
    TF_estimate[TF] <- result_sp$estimate
  }
}

# Volcano-style table: correlation estimate vs -log2(p-value).
# The column names are derived from these exact expressions.
vol_plot <- data.frame(unlist(TF_estimate), unlist(TF_pvalue))
# No method is given, so p.adjust() applies its default (Holm) correction.
vol_plot$TF_adj_pvalue. <- p.adjust(vol_plot$unlist.TF_pvalue.)
# The small offset keeps the transform finite for p-values of zero.
vol_plot$unlist.TF_pvalue. <- -log2(vol_plot$unlist.TF_pvalue. + 0.0001)
vol_plot$names <- rownames(vol_plot)
library(ggrepel)
ggplot(vol_plot, aes(x = unlist.TF_estimate., y = unlist.TF_pvalue.)) +
  geom_point(size = 2, shape = 23) +
  geom_label_repel(
    # Label only strongly anti-correlated regulons above -log2(0.05) = 4.321928.
    # The space in "< -0.5" matters: "<-0.5" parses as an assignment, which
    # silently drops the estimate threshold.
    data = vol_plot %>% filter(unlist.TF_estimate. < -0.5 & unlist.TF_pvalue. > 4.321928),
    aes(label = names), max.overlaps = 30
  ) + theme_bw()

# Scatter of regulon NES (x) against Rap1 module NES (y) with a linear fit,
# one facet per regulon of a fixed panel.
end_df_subset <- subset(
  end_df,
  X.x %in% c("SMAD2", "TEAD1", "NFKBIA", "JUN", "NFKB1", "RELA", "KLF6", "NR0B2")
)
p <- ggplot(data = end_df_subset, mapping = aes(x = NES.x, y = NES.y)) +
  geom_point(stat = "identity") +
  geom_smooth(method = lm, se = FALSE) +
  facet_wrap(~X.x, ncol = 4)
p
# FDR-adjust the correlation p-values and keep regulons below 0.01
TF_pvalue_adj <- p.adjust(TF_pvalue, method = "fdr")
cl_tfs <- names(unlist(TF_estimate[TF_pvalue_adj < 0.01]))
# Enrichment of the selected regulons, leaving out the Rap1 module itself.
# NOTE(review): enrichr() is not provided by any library() call visible in
# this file - confirm that the package defining it is attached elsewhere.
tf_enr <- enrichr(cl_tfs[cl_tfs != "Rap1_Mod"], "WikiPathway_2021_Human")

# Mutation table (no header row in the file, so column names are set by hand)
mut_brca <- read.csv("~/cell_shapes/data/BRCA_mut.csv", header = FALSE)
colnames(mut_brca) <- c("Sample", "COSMIC_id", "Cancer_type", "Gene", "Transcript",
                        "cDNA", "AA", "Classification", "gene_list",
                        "Recurrence filter", "Subs", "Truncating", "inframe")
# Sample names use "_" where the mutation table uses "-"
mut_brca$Sample <- gsub("-", "_", mut_brca$Sample)

# Samples carrying a mutation in each Rap1-related gene (family)
RAP1GAP <- mut_brca[mut_brca$Gene == "RAP1GAP", ]$Sample # GTPase-activating protein Rap1GAP
RAP1GAP <- "Hs_578T" # set manually because of a spelling difference; overrides the lookup above
RAP1GAP2 <- mut_brca[mut_brca$Gene == "RAP1GAP2", ]$Sample # RAP1 GTPase activating protein 2
RAP1GDS1 <- mut_brca[mut_brca$Gene == "RAP1GDS1", ]$Sample # Rap1 GTPase-GDP dissociation stimulator 1
DOCK4 <- mut_brca[mut_brca$Gene == "DOCK4", ]$Sample # DOCK4
RAPGEF <- mut_brca[grep("RAPGEF", mut_brca$Gene), ]$Sample # RAPGEF
SIPA1L <- mut_brca[grep("SIPA1L", mut_brca$Gene), ]$Sample # SIPA1L

# TF correlation analysis: pair the selected regulons with the Rap1 module NES
relevant.df <- complete.df[complete.df$X %in% c("SMAD2", "TEAD1", "NFKBIA", "JUN",
                                                "NFKB1", "RELA", "KLF6", "NR0B2"), ]
rap1_df <- complete.df[complete.df$X == "Rap1_Mod", ]
end_df_1 <- merge(x = relevant.df, y = rap1_df, by.x = "C.L.", by.y = "C.L.")
# De-duplicate the merge built on the line above (not the all-regulon end_df).
end_df_1 <- unique(end_df_1)

# One slice of the merged table per mutational background.
# NOTE(review): RAP1GAP2 is collected above but not included here - confirm
# whether that is intended.
dist_plot <- rbind(
  data.frame(end_df_1[end_df_1$C.L. %in% RAP1GAP, ], mutational_background = "RAP1GAP_mut"),
  data.frame(end_df_1[end_df_1$C.L. %in% RAP1GDS1, ], mutational_background = "RAP1GDS1_mut"),
  data.frame(end_df_1[end_df_1$C.L. %in% DOCK4, ], mutational_background = "DOCK4_mut"),
  data.frame(end_df_1[end_df_1$C.L. %in% RAPGEF, ], mutational_background = "RAPGEF_mut"),
  data.frame(end_df_1[end_df_1$C.L. %in% SIPA1L, ], mutational_background = "SIPA1L_mut")
)
# Keep one row per distinct (Rap1 module NES, background) pair
dist_plot <- unique(data.frame(RAP1_module_CS = dist_plot$NES.y,
                               mutational_background = dist_plot$mutational_background))

# Libraries
library(ggplot2)
library(hrbrthemes)
library(dplyr)
library(tidyr)
library(viridis)

# Density of the Rap1 module score, one filled curve per mutational background
p1 <- ggplot(
  data = dist_plot,
  mapping = aes(
    x = RAP1_module_CS,
    group = mutational_background,
    fill = mutational_background
  )
)
p1 <- p1 + geom_density(adjust = 1.5) + theme_ipsum()
p1

# Same scores as one box per mutational background
ggplot(dist_plot, aes(mutational_background, RAP1_module_CS)) +
  geom_boxplot()