#!/usr/bin/Rscript

# AUTHOR: Charlie Barker based on code by Eirini Petsalaki
#
# Uses enrichment strategy to identify TFs whose regulons are associated with GEMs (using EnrichR). Further
# enrichment of these identified TFs is used to find pathways that might regulate them. These pathways are then
# pulled from pathway commons and then are merged to form a contiguous signaling network. This script makes the 
# assumption that the features correlated with the GEMs are at least hypothesised to be regulated by similar or 
# overlapping pathways.
# 
# INPUT: 
#          module_names.csv             csv providing a mapping between WGCNA assigned GEM names and manually
#                                       annotated ones (picked from module_content.R).
#          ALLgenesprmodule.tab         gene content of modules derived from wgcna.R
#          correlations.txt             file describing what GEMS are correlated to what phenotype features (from wgcna.R)
#          strong.correlated.mods.csv   csv describing matrix of phenotype by modules
#          R-HSA-162582.txt             txt. file containing the signaling pathways on reactome https://reactome.org/content/detail/R-HSA-162582
#
# OUTPUT:   
#	   ~/phenotype_networks/data/BIOPAX                Directory containing all the biopax files describing the participants of the pathways 
#                                                    and the nature of their interactions 
#	   STable2.csv		                                 File (shown in sup table 2) showing which pathways are regulating which features. 
#    biopax_shape_pathway.sif                        SIF file describing the merging of all the files in the biopax file directory. 
#

library(enrichR)
library(gprofiler2)
library(readr)
library(data.table)
library(dplyr)
library(ggplot2)
source("~/phenotype_networks/scripts/get_wgcna.R")

# Pick a fresh random seed from the clock on every run and print it, so that
# a given run can be reproduced later by hard-coding the printed value.
set.seed(Sys.time())
seed <- sample(seq_len(20000), size = 1, replace = FALSE)
print(seed) #1446 #19607
set.seed(seed)

# Lookup table between the WGCNA module identifiers (column ME_names) and the
# manually curated module names (column new_name).
mod.names <- read.csv(file = "~/cell_shapes/data/module_names.csv")

# Module contents, loaded through the project helper sourced above; the result
# is used below as a named list with one element per module.
wgcnas <- get_wgcna(
  path_wgcna = "~/phenotype_networks/data/modules/ALLgenesprmodule.tab",
  path_correlations = "~/phenotype_networks/data/modules/correlations.txt",
  is.full = TRUE
)

# Restrict to the modules listed in the strong-correlation table.
correlations_table <- read_csv(file = "~/phenotype_networks/data/wgcna/strong.correlated.mods.csv") #../../genesprmodule/correlations.txt
wgcna.split <- wgcnas[correlations_table$modules]

# Long-format (value, module) table with the module ids replaced by their
# curated names; only consumed by the commented-out export below.
wgcna_df <- stack(wgcna.split)
wgcna_df[["ind"]] <- mod.names$new_name[match(wgcna_df[["ind"]], mod.names$ME_names)]
names(wgcna_df) <- c("Genes", "Module Name")
#write.csv(x = wgcna_df, file = "~/cell_shapes/manuscript/figures/cell shape figures/supplementary/figure_csv/module_contents.csv")

# Build a randomised control: every gene of every module is re-assigned to a
# module at random, with assignment probabilities proportional to the original
# module sizes.
mods <- names(wgcnas)
gene_list <- as.character(unlist(wgcnas, recursive = FALSE))
len_vec <- as.numeric(lengths(wgcnas))
prob_vec <- len_vec / sum(len_vec)

ss <- sample(seq_along(prob_vec), size = length(gene_list), replace = TRUE, prob = prob_vec)
# Split on a factor that carries one level per module. A plain split(gene_list, ss)
# silently drops any module index that was never drawn, which makes the list
# shorter than `mods` and causes setNames() to fail on the length mismatch.
shuffled_modules <- setNames(
  split(gene_list, factor(ss, levels = seq_along(mods))),
  mods
)
# Remove the `#` on the next line to run the rest of the script on the shuffled
# control instead of the real modules; as written the line only prints wgcna.split.
wgcna.split#<-shuffled_modules
#Enrichment of TFs using the following databases

#TRRUST_Transcription_Factors_2019

# Enrichr libraries queried for transcription-factor enrichment.
tf_db <- c("ENCODE_and_ChEA_Consensus_TFs_from_ChIP-X", "TRANSFAC_and_JASPAR_PWMs", "Transcription_Factor_PPIs", "ENCODE_TF_ChIP-seq_2015", "TRRUST_Transcription_Factors_2019")

# For every selected module: query Enrichr with the module's genes, stack the
# per-library result tables (tagging each row with its library in `id`), keep
# the terms with adjusted p < 0.05 and store them ordered by adjusted p-value.
# The per-module tables are collected in a list and bound once after the loop
# instead of growing a data frame with rbind() on every iteration.
tf.hits <- list()
for (module in names(wgcna.split)) {
  print(module)
  enriched_prizes <- enrichr(wgcna.split[[module]], tf_db)
  # seq_along() rather than 1:length(): an empty result must not be indexed
  # at position 1.
  enrichdt <- rbindlist(
    lapply(seq_along(enriched_prizes), function(x) {
      setDT(enriched_prizes[[x]])[, id := names(enriched_prizes)[x]]
    }),
    use.names = TRUE, fill = TRUE
  )
  enrichdt <- enrichdt[enrichdt$Adjusted.P.value < 0.05, ]
  enrichdt <- enrichdt[order(enrichdt$Adjusted.P.value), ]
  if (nrow(enrichdt) > 0) {
    # Column names are spelled out so they match what the rest of the script
    # expects (module, enrichdt.Term, enrichdt.id, ...).
    row.to.add <- data.frame(
      module = module,
      enrichdt.Term = enrichdt$Term,
      enrichdt.id = enrichdt$id,
      enrichdt.Overlap = enrichdt$Overlap,
      enrichdt.P.value = enrichdt$P.value,
      enrichdt.Adjusted.P.value = enrichdt$Adjusted.P.value,
      enrichdt.Odds.Ratio = enrichdt$Odds.Ratio,
      enrichdt.Genes = enrichdt$Genes
    )
    tf.hits[[length(tf.hits) + 1L]] <- row.to.add
  }
}
# One table of significant TF terms across all modules (an empty data frame
# when no module produced a significant term).
df.names <- as.data.frame(rbindlist(tf.hits, use.names = TRUE))


# Keep only the first space-separated token of every enriched term.
term_tokens <- strsplit(df.names$enrichdt.Term, " ")
df.names$enrichdt.Term <- unlist(lapply(term_tokens, head, n = 1L))

# Supplementary-table copy of the TF hits, with -log10 of the adjusted p-value
# added for plotting.
sup.table <- df.names
sup.table$minuslog10pvalue <- -log10(sup.table$enrichdt.Adjusted.P.value)

# Dot plot of module (x) against term (y); colour is the odds ratio and point
# size is -log10(adjusted p).
dots_tf <- ggplot(
  data = sup.table[sup.table$enrichdt.Adjusted.P.value < 0.05, ],
  aes(
    x = module,
    y = reorder(enrichdt.Term, -minuslog10pvalue),
    color = enrichdt.Odds.Ratio,
    size = minuslog10pvalue
  )
) +
  geom_point() +
  theme(axis.text.x = element_text(angle = 60, hjust = 1))

# Presentation column names, then swap the module ids for the curated names.
names(sup.table) <- c("Module Name", "TF", "Source Database", "Overlap", "P.value", "Adjusted P.value", "Odds Ratio", "Gene List", "-log10(P)")
sup.table$`Module Name` <- mod.names$new_name[match(sup.table$`Module Name`, mod.names$ME_names)]
#write.csv(x = sup.table, file = "~/cell_shapes/manuscript/figures/cell shape figures/supplementary/STable2.csv")

# Split the TF hits into one table per module, named by module id.
# group_split() returns an unnamed list whose order is decided by dplyr's
# grouping, which is not guaranteed to agree with sort(unique(...)) under the
# session locale. The names are therefore taken from group_keys() on the same
# grouped table, so name and content always line up.
tf.by.module <- group_by(df.names, module)
split.names <- group_split(tf.by.module)
names(split.names) <- as.character(group_keys(tf.by.module)$module)

# For every module with at least 10 distinct enriched TFs: query Enrichr's
# Reactome_2016 library with the module's TF terms, keep pathways with
# adjusted p < 0.05 and store them ordered by adjusted p-value.
# The per-module tables are collected in a list and bound once after the loop
# instead of growing a data frame with rbind() on every iteration.
reactome.hits <- list()
for (module in names(split.names)) {
  module_tfs <- split.names[[module]]$enrichdt.Term
  if (length(unique(module_tfs)) < 10) {
    next
  }
  print(module)
  enriched_prizes <- enrichr(module_tfs, "Reactome_2016")
  # seq_along() rather than 1:length(): an empty result must not be indexed
  # at position 1.
  enrichdt <- rbindlist(
    lapply(seq_along(enriched_prizes), function(x) {
      setDT(enriched_prizes[[x]])[, id := names(enriched_prizes)[x]]
    }),
    use.names = TRUE, fill = TRUE
  )
  enrichdt <- enrichdt[enrichdt$Adjusted.P.value < 0.05, ]
  enrichdt <- enrichdt[order(enrichdt$Adjusted.P.value), ]
  if (nrow(enrichdt) > 0) {
    # Column names are spelled out so they match what the rest of the script
    # expects (module, enrichdt.Term, enrichdt.id, ...).
    row.to.add <- data.frame(
      module = module,
      enrichdt.Term = enrichdt$Term,
      enrichdt.id = enrichdt$id,
      enrichdt.Overlap = enrichdt$Overlap,
      enrichdt.P.value = enrichdt$P.value,
      enrichdt.Adjusted.P.value = enrichdt$Adjusted.P.value,
      enrichdt.Odds.Ratio = enrichdt$Odds.Ratio,
      enrichdt.Genes = enrichdt$Genes
    )
    reactome.hits[[length(reactome.hits) + 1L]] <- row.to.add
  }
}
reactome.df <- as.data.frame(rbindlist(reactome.hits, use.names = TRUE))
if (nrow(reactome.df) == 0) {
  # Everything below needs the enrichment columns; fail with a clear message
  # rather than with an obscure error on the next line.
  stop("No module produced a significant Reactome pathway enrichment.", call. = FALSE)
}
# -log10 of the adjusted p-value. This was previously computed with log2,
# contradicting both the column name and the "-log10(P)" label it is given later.
reactome.df$minuslog10pvalue <- -log10(reactome.df$enrichdt.Adjusted.P.value)
#extract ONLY SIGNALING PATHWAYS
# Ids of the signalling pathways, one per line in the first column (X1).
pathway.ids <- read_csv("~/phenotype_networks/data/R-HSA-162582.txt", col_names = FALSE)
# The Reactome stable id is the last space-separated token of each term.
reactome.df$REACTOME_ID <- unlist(lapply(strsplit(reactome.df$enrichdt.Term, " "), tail, n = 1L))
filt.reactome.df <- reactome.df[reactome.df$REACTOME_ID %in% pathway.ids$X1, ]
dot.plot <- filt.reactome.df
# NOTE(review): the result of this call is not stored anywhere; it is only
# printed when the script runs at top level. Confirm whether it is still wanted.
p.adjust(filt.reactome.df$enrichdt.Adjusted.P.value)
dot.plot$module <- mod.names$new_name[match(dot.plot$module, mod.names$ME_names)]
# Cap infinite odds ratios at 10 so they can be plotted. The column is indexed
# directly: the former `dot.plot[mask, ]$col <- 10` form raises
# "replacement has 1 row, data has 0" whenever no odds ratio is infinite.
dot.plot$enrichdt.Odds.Ratio[is.infinite(dot.plot$enrichdt.Odds.Ratio)] <- 10
# Strip the trailing " Homo sapiens <id>" from each term. The suffix length is
# derived from the actual id instead of a fixed 26 characters, because Reactome
# stable ids do not all have the same number of digits.
# Assumes every term ends in " Homo sapiens <id>", as the fixed offset did.
dot.plot$enrichdt.Term <- substr(
  dot.plot$enrichdt.Term,
  1,
  nchar(dot.plot$enrichdt.Term) - nchar(" Homo sapiens ") - nchar(dot.plot$REACTOME_ID)
)
# Dot plot of module (x) against pathway (y); colour is log2(odds ratio) and
# point size is the minuslog10pvalue column.
dots <- ggplot(
  data = dot.plot[dot.plot$enrichdt.Adjusted.P.value < 0.05, ],
  aes(
    x = module,
    y = reorder(enrichdt.Term, -minuslog10pvalue),
    color = log2(enrichdt.Odds.Ratio),
    size = minuslog10pvalue
  )
) +
  geom_point() +
  theme(axis.text.x = element_text(angle = 60, hjust = 1))


# Split the filtered pathway hits into one table per module, named by module id.
# group_split() returns an unnamed list whose order is decided by dplyr's
# grouping, which is not guaranteed to agree with sort(unique(...)) under the
# session locale. The names are therefore taken from group_keys() on the same
# grouped table, so name and content always line up.
reactome.by.module <- group_by(filt.reactome.df, module)
split.names.reactome <- group_split(reactome.by.module)
names(split.names.reactome) <- as.character(group_keys(reactome.by.module)$module)



# Replace the module ids with their curated names and switch to presentation
# column headers. Note that from here on the Reactome stable ids live in the
# column called `ID` (formerly REACTOME_ID).
curated_idx <- match(filt.reactome.df$module, mod.names$ME_names)
filt.reactome.df$module <- mod.names$new_name[curated_idx]
names(filt.reactome.df) <- c("Module Name", "Reactome Pathway", "Database", "Overlap", "P value", "Adjusted P value", "Odds Ratio", "Genes", "-log10(P)", "ID")
#write.csv(x = filt.reactome.df, file = "~/cell_shapes/manuscript/figures/cell shape figures/supplementary/figure_csv/F1C.csv")

# Distinct pathway ids among the retained hits.
biopx1 <- unique(filt.reactome.df[["ID"]])

#Get the members of these pathways in the form of BIOPAX files

library(paxtoolsr)
library(plyr)
# Directory holding one BioPAX file per pathway, named by Reactome stable id.
biopath <- "~/phenotype_networks/data/BIOPAX"
dir.create(biopath, recursive = TRUE, showWarnings = FALSE)

# Download every enriched signalling pathway that is not cached yet.
# The ids are read from the `ID` column: the columns of filt.reactome.df were
# renamed above, so `filt.reactome.df$REACTOME_ID` is NULL at this point and
# looping over it never executed the body. unique() avoids revisiting a pathway
# that is enriched in more than one module.
for (pthwy.stble.id in unique(filt.reactome.df$ID)) { #for each pathway id per cell shape
  ## Generate file to save content into
  biopaxFile <- file.path(biopath, pthwy.stble.id)
  # Skip the download when the BioPAX file already exists; otherwise fetch
  # the pathway from Pathway Commons and save it in the BioPAX folder.
  if (!file.exists(biopaxFile)) {
    saveXML(
      getPc(
        paste("http://identifiers.org/reactome", pthwy.stble.id, sep = "/"),
        format = "BIOPAX"
      ),
      path.expand(biopaxFile)
    )
  }
  #get sif and dimensions of sif and print.
  #sif<-toSif(biopaxFile)
  print(pthwy.stble.id)
  #print(unique(c(sif$PARTICIPANT_A, sif$PARTICIPANT_B)))
  #print(dim(sif))
}


#merging biopax
#first we do one big pathway file, covering all pathways involving cells shapes generally speaking.
all_pathways <- list.files(biopath, full.names = TRUE)
if (length(all_pathways) < 2) {
  stop("Need at least two BioPAX files in ", biopath, " to merge.", call. = FALSE)
}
# Merge the first two files, then fold every remaining file into the result.
# Negative indexing is used because all_pathways[3:length(all_pathways)]
# evaluates to all_pathways[c(3, 2)] when there are exactly two files, which
# feeds an NA path and then file 2 again into mergeBiopax().
mega_biopax <- mergeBiopax(all_pathways[1], all_pathways[2])
for (pathways in all_pathways[-(1:2)]) {
  mega_biopax <- mergeBiopax(mega_biopax, pathways)
}
#then we zoom in, generating pathways specific to cell shapes. 

#options - collapse all the different types of interactions, or 
#visualise
library(igraph)
library("ggplot2")
# Convert the merged BioPAX document to SIF (one interaction per row).
mega_sif <- toSif(mega_biopax)

# Drop the interaction types we are not concerned with.
lesser_sif <- mega_sif[
  mega_sif$INTERACTION_TYPE != "chemical-affects" &
    mega_sif$INTERACTION_TYPE != "neighbor-of",
]

#WRITE

#write_delim(mega_sif,path = "~/phenotype_networks/data/biopax_shape_pathway.sif",delim = "\t")


