#!/usr/bin/Rscript

# AUTHOR:	c.g.b 
#


# Execution environment switch: TRUE when the script runs on the cluster
# (codon), FALSE when it runs on a local machine.
cluster.status <- TRUE

if (isTRUE(cluster.status)) {
  # Cluster: packages are installed in a shared library directory.
  pack.lib <- "/hps/software/users/petsalaki/Rpackages/4_0_3/"
  packages <- c("crayon", "BiocGenerics", "S4Vectors", "IRanges",
                "TCGAbiolinks", "GenomeInfoDb", "GenomicRanges",
                "MatrixGenerics", "SummarizedExperiment", "DESeq2", "dorothea",
                "org.Hs.eg.db", "readr", "viper", "dplyr", "gprofiler2",
                "glmGamPoi", "purrr")
  mypath <- "/nfs/research/petsalaki/users/cbarker/TCGA_BRCA/paper_days_to_death"
} else {
  # Local: rely on the default library search path.
  pack.lib <- NULL
  packages <- c("readr")
  mypath <- "/home/charlie/MelanomaProject/melanomaresistance/"
}

# Put the shared library at the front of the search path so that the
# dependencies of the listed packages can be resolved from it as well.
# This is skipped when pack.lib is NULL: overwriting the internal `.lib.loc`
# with NULL leaves .libPaths() empty, after which no package can be found.
# .libPaths() also keeps R's own library on the path.
if (!is.null(pack.lib)) {
  .libPaths(c(pack.lib, .libPaths()))
}

# Load packages. Loading is best-effort (require() returns FALSE instead of
# stopping), but every failure is reported in a single warning so that a
# missing package is visible at the top of the log.
loaded <- vapply(
  packages,
  function(pkg) {
    suppressPackageStartupMessages(
      require(pkg, lib.loc = pack.lib, character.only = TRUE)
    )
  },
  logical(1)
)
if (!all(loaded)) {
  warning("Package(s) that could not be loaded: ",
          paste(packages[!loaded], collapse = ", "),
          call. = FALSE)
}




####FUNCTIONS#####


# Convert a TF-target interaction table into the nested regulon list that
# this script passes to msviper().
#
# Args:
#   df: data frame with one row per interaction and the columns `tf`,
#       `target`, `mor` and `likelihood`.
#
# Returns:
#   A named list with one element per distinct value of `tf`. Each element is
#   a list with `tfmode` (the `mor` values named by `target`) and
#   `likelihood` (the `likelihood` values, in the same row order).
df2regulon <- function(df) {
  lapply(split(df, df$tf), function(dat) {
    list(
      tfmode = setNames(dat$mor, dat$target),
      likelihood = dat$likelihood
    )
  })
}



# Estimate regulon activities from a differential-expression table with
# msviper(), using the regulon CSV plus one extra custom regulon.
#
# Args:
#   lfc_df:       differential-expression table with the columns `name`
#                 (Ensembl gene id), `lfc` and `adj_pval`.
#   organism:     must be "HUMAN"; no other organism is implemented.
#   Rap1_Mod:     custom regulon (list with `tfmode` and `likelihood`) that is
#                 added to the regulon set under the name "Rap1_Mod".
#   regulon_path: optional path to the regulon CSV. It needs the columns
#                 `tf`, `target`, `mor`, `likelihood` and `confidence`.
#                 NULL (default) uses the cluster location of the human
#                 network file.
#
# Returns:
#   A data frame with one row per regulon (Regulon, Size, NES, p.value, FDR),
#   ordered by increasing p.value.
my_TFA_rap1 <- function(lfc_df, organism, Rap1_Mod, regulon_path = NULL) {
  # Fail early: only the human regulon file is wired in, so any other value
  # could not proceed past reading the regulons anyway.
  if (length(organism) != 1L || is.na(organism) || organism != "HUMAN") {
    stop("my_TFA_rap1(): unsupported organism; only \"HUMAN\" is implemented",
         call. = FALSE)
  }
  if (is.null(regulon_path)) {
    regulon_path <- "/nfs/research/petsalaki/users/cbarker/phenotype_networks/data/DOROTHEA/human_network.csv"
  }

  regulon_table <- read.csv(regulon_path, header = TRUE)
  # Keep interactions with confidence A-D. `%in%` drops rows whose confidence
  # is NA instead of turning them into all-NA rows.
  keep_conf <- regulon_table$confidence %in% c("A", "B", "C", "D")
  regulon_table <- regulon_table[keep_conf, ]

  # Map Ensembl ids to gene symbols and attach them to the DE table.
  ens2symbol <- AnnotationDbi::select(org.Hs.eg.db,
                                      keys = lfc_df$name,
                                      columns = "SYMBOL",
                                      keytype = "ENSEMBL")
  ens2symbol <- as_tibble(ens2symbol)
  lfc_df <- inner_join(lfc_df, ens2symbol, by = c("name" = "ENSEMBL"))

  # Drop genes without an adjusted p-value.
  # NOTE(review): rows with a missing or repeated SYMBOL are not removed
  # here; confirm whether that is intended.
  de_signature <- lfc_df[!is.na(lfc_df$adj_pval), ]

  # Signed z-score signature: two-sided adjusted p-value converted to a
  # normal quantile, signed by the direction of the log fold change.
  signature <- qnorm(de_signature$adj_pval / 2, lower.tail = FALSE) *
    sign(de_signature$lfc)
  names(signature) <- de_signature$SYMBOL
  signature <- signature[order(signature, decreasing = TRUE)]

  regulons <- df2regulon(regulon_table)
  regulons$Rap1_Mod <- Rap1_Mod

  # Estimate regulon activities.
  mrs <- msviper(ges = signature, regulon = regulons,
                 ges.filter = FALSE, minsize = 4)

  TF_activities <- data.frame(Regulon = names(mrs$es$nes),
                              Size = mrs$es$size[names(mrs$es$nes)],
                              NES = mrs$es$nes,
                              p.value = mrs$es$p.value,
                              FDR = p.adjust(mrs$es$p.value, method = "fdr"))
  TF_activities[order(TF_activities$p.value), ]
}

# Fit a glmGamPoi model contrasting a window of samples ("TEST") against all
# remaining samples ("CONTROL"), then estimate regulon activities from the
# resulting differential-expression table.
#
# Args:
#   sample:      name of the sample (a row name of colData(data)) at which
#                the TEST window starts.
#   data:        object accepted by colData() and glm_gp(); its `patient`
#                column in colData is overwritten with the TEST/CONTROL
#                labels on the local copy.
#   Rap1_Mod:    custom regulon forwarded to my_TFA_rap1().
#   window_size: number of consecutive samples (in column order, starting at
#                `sample`) labelled TEST; the window is truncated at the last
#                sample. NULL labels only `sample`.
#
# Returns:
#   The data frame returned by my_TFA_rap1().
glm_dorot <- function(sample, data, Rap1_Mod, window_size) {
  print(sample)

  sample_ids <- rownames(colData(data))
  first <- match(sample, sample_ids)
  if (is.na(first)) {
    stop("glm_dorot(): sample '", sample, "' not found in colData(data)",
         call. = FALSE)
  }

  # The window is anchored at `sample` itself. Previously it was anchored at
  # the first element of a global `samples_list` and ended at the absolute
  # index `window_size`, so the first sample was labelled TEST in every fit.
  if (is.null(window_size)) {
    last <- first
  } else {
    stopifnot(is.numeric(window_size), length(window_size) == 1L,
              !is.na(window_size), window_size >= 1)
    last <- min(first + window_size - 1, length(sample_ids))
  }
  test_idx <- seq(first, last)
  if (length(test_idx) >= length(sample_ids)) {
    stop("glm_dorot(): the TEST window covers every sample; no CONTROL left",
         call. = FALSE)
  }

  group <- rep("CONTROL", length(sample_ids))
  group[test_idx] <- "TEST"
  colData(data)$patient <- group

  fit <- glm_gp(data, design = ~ patient)
  de_res <- test_de(fit, contrast = patientTEST)
  # The large `lfc` values come from groups where nearly all counts are 0.
  # They are set to +/-Inf; only their sign is used downstream.
  de_res$lfc <- ifelse(abs(de_res$lfc) > 20, sign(de_res$lfc) * Inf, de_res$lfc)

  my_TFA_rap1(de_res, "HUMAN", Rap1_Mod)
}


########MAIN########

# Load the expression object stored as RDS.
data <- readRDS(file = "/nfs/research/petsalaki/users/cbarker/TCGA_BRCA/tcga_data_skcm.RDS")

# Keep only the samples (columns) whose `definition` annotation is not NA.
variable <- "definition"
data <- data[, !is.na(colData(data)[[variable]])]
print(data)

# Build the custom regulon from the genes of the WGCNA module
# `MEsaddlebrown`: every gene gets mode 1 and likelihood 1.
source("/nfs/research/petsalaki/users/cbarker/phenotype_networks/scripts/get_wgcna.R")
wgcnas <- get_wgcna(
  path_wgcna = "/nfs/research/petsalaki/users/cbarker/phenotype_networks/data/modules/ALLgenesprmodule.tab",
  path_correlations = "/nfs/research/petsalaki/users/cbarker/phenotype_networks/data/modules/correlations.txt",
  is.full = FALSE
)
tfmode <- setNames(rep(1, length(wgcnas$MEsaddlebrown)), wgcnas$MEsaddlebrown)
Rap1_Mod <- list(
  tfmode = tfmode,
  likelihood = rep(1, length(wgcnas$MEsaddlebrown))
)

# Every sample is tested in turn.
samples_list <- colnames(data)

# NOTE: `window_size` is set to 20 here, but the call below passes the
# literal 1 to glm_dorot().
window_size <- 20

# Run the glmGamPoi + regulon-activity analysis once per sample and name the
# results by sample.
TF_df <- setNames(
  lapply(samples_list, function(sample_id) {
    glm_dorot(sample_id,
              data = data,
              Rap1_Mod = Rap1_Mod,
              window_size = 1)
  }),
  samples_list
)

# Write the per-sample results uncompressed to the working directory.
saveRDS(object = TF_df, file = "TF_df_skcm.RDS", compress = FALSE)

