  #!/usr/bin/Rscript
  
  # AUTHOR: Charlie Barker
  #
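  # Uses enrichment strategy to identify TFs whose regulons are associated with GEMs (using EnrichR). This is a
  # permutation-based script: the enrichment is run on modules rebuilt by randomly re-assigning genes to modules,
  # rather than on the observed modules.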
  # 
  # INPUT: 
  #          module_names.csv             csv providing a mapping between WGCNA assigned GEM names and manually
  #                                       annotated ones (picked from module_content.R).
  #          ALLgenesprmodule.tab         gene content of modules derived from wgcna.R
  #          correlations.txt             file describing what GEMS are correlated to what phenotype features (from wgcna.R)
  #          strong.correlated.mods.csv   csv describing matrix of phenotype by modules
  #          R-HSA-162582.txt             txt. file containing the signaling pathways on reactome https://reactome.org/content/detail/R-HSA-162582
  #
  # OUTPUT:   
  #	   ~/phenotype_networks/data/BIOPAX                Directory containing all the biopax files describing the participants of the pathways 
  #                                                    and the nature of their interactions 
  #	   STable2.csv		                                 File (shown in sup table 2) showing which pathways are regulating which features. 
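  #    biopax_shape_pathway.sif                        SIF file describing the merging of all the files in the biopax file directory. 
  #    NOTE(review): none of the outputs listed above are written by the code in this file; the only file it writes
  #    is ./sim_pathways.csv (Reactome enrichment results for the permuted modules). Confirm and update this list.
  #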
  
  library(enrichR)
  library(gprofiler2)
  library(readr)
  library(data.table)
  library(dplyr)
  
  # Run-location switch: FALSE = local machine, TRUE = compute cluster.
  cluster.status <- FALSE

  # Packages this script depends on (the same set in both environments).
  packages <- c("enrichR", "gprofiler2", "readr", "data.table", "dplyr")

  if (!cluster.status) {
    # Local run: use R's default library search path.
    pack.lib <- NULL
    mypath <- "~/phenotype_networks"
  } else {
    # Cluster run: packages live in a user-specific library. Overwrite the
    # library search path held in .libPaths()'s environment, otherwise R will
    # not know where the dependencies are.
    pack.lib <- "/hps/software/users/petsalaki/users/cbarker/Rpackages"
    mypath <- "/nfs/research/petsalaki/users/cbarker/phenotype_networks"
    environment(.libPaths)$.lib.loc <- pack.lib
  }

  # Load packages. require() only returns FALSE when a package cannot be
  # attached, so collect the results and fail loudly instead of continuing
  # with missing dependencies.
  loaded <- suppressPackageStartupMessages(
    vapply(packages, require, logical(1),
           lib.loc = pack.lib, character.only = TRUE)
  )
  if (!all(loaded)) {
    stop("Could not load required package(s): ",
         paste(packages[!loaded], collapse = ", "),
         call. = FALSE)
  }

  # All relative paths below are resolved against the project directory.
  setwd(mypath)
  
  #function 
  
  
  
  
  # Load the helper script; get_wgcna() is presumably defined there (it is
  # called further down and not defined in this file).
  source("./scripts/get_wgcna.R")

  # Seed from the clock, draw a random seed between 1 and 20000, print it so
  # the run can be reproduced, then reseed the RNG with the drawn value.
  set.seed(Sys.time())
  seed <- sample.int(20000, size = 1)
  print(seed) #1446 #19607
  set.seed(seed)
  
  # Lookup table between WGCNA module names (ME_names) and the manually
  # annotated names (new_name).
  mod.names <- read.csv(file = "./data/module_names.csv")

  # Module contents as returned by get_wgcna().
  wgcnas <- get_wgcna(
    path_wgcna = "./data/modules/ALLgenesprmodule.tab",
    path_correlations = "./data/modules/correlations.txt",
    is.full = TRUE
  )
  correlations_table <- read_csv(file = "./data/wgcna/strong.correlated.mods.csv") #../../genesprmodule/correlations.txt

  # Keep only the modules listed in the correlation table and flatten them
  # into a two-column table (one row per gene/module pair).
  wgcna.split <- wgcnas[correlations_table$modules]
  wgcna_df <- stack(wgcna.split)
  # Swap the WGCNA module names for the annotated ones.
  wgcna_df[["ind"]] <- mod.names$new_name[match(wgcna_df[["ind"]], mod.names$ME_names)]
  names(wgcna_df) <- c("Genes", "Module Name")
  #write.csv(x = wgcna_df, file = "~/cell_shapes/manuscript/figures/cell shape figures/supplementary/figure_csv/module_contents.csv")
  
  # Build permuted modules: every gene is re-assigned to a module label drawn
  # at random, with each label's probability equal to that module's share of
  # all genes. The enrichment below runs on these permuted modules.
  mods <- names(wgcnas)
  gene_list <- as.character(unlist(wgcnas, recursive = FALSE))
  len_vec <- as.numeric(lengths(wgcnas))
  prob_vec <- len_vec / sum(len_vec)

  # Number of independent reshuffles of the gene-to-module assignment.
  n_permutations <- 100L

  # Preallocate one slot per permutation instead of growing a list with c().
  dupe_modules <- vector("list", n_permutations)
  for (iteration in seq_len(n_permutations)) {
    ss <- sample(seq_along(prob_vec), size = length(gene_list),
                 replace = TRUE, prob = prob_vec)
    # split() drops labels that were never drawn, so one permutation may
    # contain fewer modules than the original data.
    dupe_modules[[iteration]] <- split(gene_list, ss)
  }
  # Flatten one level: a single list of gene vectors across all permutations.
  dupe_modules <- unlist(dupe_modules, recursive = FALSE)
  names(dupe_modules) <- as.character(seq_along(dupe_modules))

  # NOTE: an extra, unused reshuffle that followed the loop was removed. Its
  # result was never read, and its setNames() call could fail whenever a
  # module label was not sampled (names vector longer than the list).
  wgcna.split <- dupe_modules
  #Enrichment of TFs using the following databases
  
  #TRRUST_Transcription_Factors_2019
  # Reactome identifiers (read without a header, so the column is X1); used
  # after the enrichment to restrict results to these pathways.
  pathway.ids <- read_csv("./data/R-HSA-162582.txt", col_names = FALSE)

  # Enrichr libraries queried for transcription-factor enrichment.
  tf_db <- c(
    "ENCODE_and_ChEA_Consensus_TFs_from_ChIP-X",
    "TRANSFAC_and_JASPAR_PWMs",
    "Transcription_Factor_PPIs",
    "ENCODE_TF_ChIP-seq_2015",
    "TRRUST_Transcription_Factors_2019"
  )

  # NOTE(review): df.names is not used anywhere in the visible code.
  df.names <- data.frame()

  # Stack the per-library tables returned by enrichr() into one data.table,
  # recording the library name of each row in an `id` column. Returns an empty
  # data.table when enrichr() returned nothing.
  combine_enrichr_results <- function(results) {
    if (length(results) == 0L) {
      return(data.table())
    }
    rbindlist(results, use.names = TRUE, fill = TRUE, idcol = "id")
  }

  # Upper bound on the number of permuted modules sent to Enrichr. head()
  # copes with fewer modules than the cap; indexing with [1:1000] would pad
  # the vector with NA names.
  max_modules <- 1000L
  modules_to_test <- head(names(wgcna.split), max_modules)

  # One slot per module; filled only when pathway results exist. Binding once
  # at the end avoids re-copying the accumulated table on every iteration.
  pathway_rows <- vector("list", length(modules_to_test))

  for (i in seq_along(modules_to_test)) {
    module <- modules_to_test[[i]]
    print(module)

    # Step 1: TF terms enriched among the genes of this module.
    enrichdt <- combine_enrichr_results(enrichr(wgcna.split[[module]], tf_db))
    if (nrow(enrichdt) == 0L) {
      next
    }
    enrichdt <- enrichdt[enrichdt$Adjusted.P.value < 0.05, ]
    enrichdt <- enrichdt[order(enrichdt$Adjusted.P.value), ]

    # Continue only when at least two TF terms are significant.
    if (nrow(enrichdt) > 1) {
      # The TF symbol is the first space-separated token of the term.
      TFs <- unlist(lapply(strsplit(enrichdt$Term, " "), head, n = 1L))
      print(TFs)

      # Step 2: Reactome pathways enriched among those TFs. Every returned
      # pathway is kept here; no significance cut-off is applied.
      pathway_dt <- combine_enrichr_results(enrichr(TFs, "Reactome_2016"))
      if (nrow(pathway_dt) != 0) {
        pathway_dt <- pathway_dt[order(pathway_dt$Adjusted.P.value), ]
        pathway_rows[[i]] <- data.frame(
          Mod_name = module,
          Term = pathway_dt$Term,
          ID = pathway_dt$id,
          Overlap = pathway_dt$Overlap,
          P.value = pathway_dt$P.value,
          Adj.P.value = pathway_dt$Adjusted.P.value,
          Odds.Ratio = pathway_dt$Odds.Ratio,
          Genes = pathway_dt$Genes
        )
      }
    }
  }

  # Combine the per-module tables; stays an empty data.frame when no module
  # produced any pathway result.
  pathway_rows <- Filter(Negate(is.null), pathway_rows)
  pathway.df <- if (length(pathway_rows) > 0L) {
    do.call(rbind, pathway_rows)
  } else {
    data.frame()
  }
  
  
  # Without any pathway rows there is no Term column to group on; stop with an
  # explicit message rather than an obscure grouping error.
  if (nrow(pathway.df) == 0L) {
    stop("No Reactome enrichment results were collected; nothing to aggregate.",
         call. = FALSE)
  }

  # One table per Reactome term.
  split.names <- pathway.df %>%
    group_by(Term) %>%
    group_split()
  # Name each table after the Term it actually contains. Deriving the names
  # from sort(unique(Term)) can mislabel tables, because the grouping order
  # and sort() do not necessarily use the same collation.
  names(split.names) <- vapply(
    split.names,
    function(grp) as.character(grp$Term[1]),
    character(1)
  )

  #so a problem is that we dont have all the pvalues for tests that have failed - if we give pathways that arent even enriched in algorithm an arbitary pvalue of 1, then we can still create an overall pvalue for each pathway. 
  complete_pathway_list <- list()
  for (pathway in names(split.names)) {
    print(pathway)
    observed <- data.frame(split.names[[pathway]])

    # Modules with no result for this pathway get a placeholder row (p = 1).
    missing_mods <- names(wgcna.split)[!names(wgcna.split) %in% observed$Mod_name]
    n_missing <- length(missing_mods)

    # Every column is built at length n_missing, so the frame is valid even
    # when nothing is missing. Mixing a zero-length column with length-one
    # constants makes data.frame() fail.
    row.to.add <- data.frame(
      Mod_name = missing_mods,
      Term = rep(pathway, n_missing),
      ID = rep("Reactome_2016", n_missing),
      Overlap = rep(NA, n_missing),
      P.value = rep(1, n_missing),
      Adj.P.value = rep(1, n_missing),
      Odds.Ratio = rep(0, n_missing),
      Genes = rep(NA, n_missing)
    )
    complete_pathway_list[[pathway]] <- rbind(observed, row.to.add)
  }

  # NOTE(review): the output is built from split.names (observed results
  # only), not from complete_pathway_list, so the p = 1 placeholder rows never
  # reach the written file. Confirm which of the two is intended.
  # merge() without `by` joins on all shared columns; all = TRUE keeps every row.
  merge_list <- Reduce(function(...) merge(..., all = TRUE), split.names)

  # The Reactome identifier is the last space-separated token of the term.
  merge_list$REACTOME_ID <- unlist(lapply(strsplit(merge_list$Term, " "), tail, n = 1L))
  # Keep only pathways listed in the Reactome identifier file.
  merge_list <- merge_list[merge_list$REACTOME_ID %in% pathway.ids$X1, ]
  write.csv(merge_list, file = "./sim_pathways.csv")