library(parallel)
library(randomForestSRC)
library(tidyverse)
library(pheatmap)

# Core counts picked up by randomForestSRC (rf.cores) and parallel (mc.cores).
options(rf.cores = 8)
options(mc.cores = 8)

# Positional command-line arguments, in this order:
#   stage tissue tfFilter clusterFilter predictorValue alternateTargets src
args <- commandArgs(trailingOnly = TRUE)

# Defaults for testing. They are used only when the script is started without
# any arguments (e.g. when stepping through it by hand).
stage <- "adult"
tissue <- "Rectal"
tfFilter <- "expressing_5pct"
clusterFilter <- "noHotClusterTargetsNoAlt"
predictorValue <- "distanceModified"
alternateTargets <- "closeAlternates"
src <- "RF_Model_4"
#dir <- "/net/waterston/vol9/ChipSeqPipeline/RF_Model_fly2"

nExpectedArgs <- 7L
if (length(args) >= nExpectedArgs) {
  # Full argument list supplied: it replaces every testing default.
  stage <- args[1]
  tissue <- args[2]
  tfFilter <- args[3]
  clusterFilter <- args[4]
  predictorValue <- args[5]
  alternateTargets <- args[6]
  src <- args[7]
} else if (length(args) == 0L) {
  message("No command-line arguments supplied; using the testing defaults.")
} else {
  # A partial argument list would leave some settings NA and send the script
  # to nonsensical paths, so refuse to continue.
  stop(
    "Expected ", nExpectedArgs, " arguments (stage tissue tfFilter ",
    "clusterFilter predictorValue alternateTargets src) but got ",
    length(args), ".",
    call. = FALSE
  )
}

dir <- str_c("/net/waterston/vol9/ChipSeqPipeline", src, sep = "/")  # source of model data

stageDir <- str_c(dir, stage, sep = "/")
tissueDir <- str_c(stageDir, "tissues", tissue, sep = "/")

# Entries of the tissue directory whose names do not end in "tsv" are taken to
# be sub-tissue directories; each one has a matching "<name>.tsv" file next
# to it.
tissueEntries <- list.files(path = tissueDir)
subTissueDirs <- tissueEntries[str_ends(tissueEntries, "tsv", negate = TRUE)]
if (length(subTissueDirs) == 0) {
  # list.files() returns an empty vector for a missing or empty directory;
  # stop here instead of failing later on an NA file path.
  stop("No sub-tissue directories found under ", tissueDir, call. = FALSE)
}
subTissueFiles <- str_c(subTissueDirs, ".tsv", sep = "")

# TF column of the stage-wide TF table.
stageTFs <- select(
  read_tsv(file = str_c(dir, stage, "allTFsExpr.tsv", sep = "/"), show_col_types = FALSE),
  TF
)

# Start from the first sub-tissue table and left-join every remaining one
# onto it (joined on the columns the tables share).
subTissueExprs <- read_tsv(
  file = str_c(tissueDir, subTissueFiles[1], sep = "/"),
  show_col_types = FALSE
)
for (tsvName in subTissueFiles[-1]) {
  df <- read_tsv(file = str_c(tissueDir, tsvName, sep = "/"), show_col_types = FALSE)
  subTissueExprs <- left_join(subTissueExprs, df)
}

# Every column except BaseGene names one cell type.
cellTypes <- colnames(select(subTissueExprs, -BaseGene))

# Variable-importance matrix (rows: stage TFs, columns: cell types). Entries
# stay NA for TFs that were not predictors in a given cell's model.
res <- matrix(
  NA_real_,
  nrow = nrow(stageTFs),
  ncol = length(cellTypes),
  dimnames = list(stageTFs$TF, cellTypes)
)
# Per-cell mean squared error between the response and the OOB predictions.
err <- setNames(numeric(length(cellTypes)), cellTypes)

# The primary and alternate target tables depend only on the stage and the
# filter settings, so they are read once here instead of once per cell.
clusterFilterDir <- str_c(stageDir, clusterFilter, sep = "/")
predictorValueDir <- str_c(clusterFilterDir, predictorValue, sep = "/")
primary <- read_tsv(
  file = str_c(predictorValueDir, "predictor.tsv", sep = "/"),
  col_names = TRUE,
  show_col_types = FALSE
)
altTargetDir <- str_c(predictorValueDir, alternateTargets, sep = "/")
alternate <- read_tsv(
  file = str_c(altTargetDir, "predictor.tsv", sep = "/"),
  col_names = TRUE,
  show_col_types = FALSE
)
if (nrow(alternate) > 0) {
  primary <- bind_rows(primary, alternate)
}

for (s in seq_along(subTissueDirs)) {
  subTissue <- subTissueDirs[s]
  subTissueExprs <- read_tsv(
    file = str_c(tissueDir, subTissueFiles[s], sep = "/"),
    show_col_types = FALSE
  )
  subTissueDir <- str_c(tissueDir, subTissue, sep = "/")
  cells <- list.files(path = subTissueDir)
  for (cell in cells) {
    # read the TFs to use for modeling the cell and the cell expression for all genes
    cellDir <- str_c(subTissueDir, cell, sep = "/")
    tfFilterDir <- str_c(cellDir, tfFilter, sep = "/")
    tfFilterFile <- str_c(tfFilterDir, "TFs.txt", sep = "/")
    cellTFs <- rename(
      read_tsv(file = tfFilterFile, col_names = FALSE, show_col_types = FALSE),
      TF = X1
    )
    cellExpr <- select(subTissueExprs, BaseGene, all_of(cell))

    # limit the predictor TFs
    predictors <- select(primary, BaseGene, all_of(cellTFs$TF))

    # One row per target: TF predictor columns plus this cell's expression as
    # the response. BaseGene is only the join key and is dropped afterwards.
    modelData <- left_join(predictors, cellExpr, by = "BaseGene") %>%
      select(-BaseGene)
    # NOTE(review): the formula is built from the raw cell name, so this
    # assumes cell names are syntactically valid R names - confirm.
    modelFormula <- as.formula(str_c(cell, "~."))
    fit <- rfsrc(modelFormula, data = as.data.frame(modelData), importance = TRUE)

    # NOTE(review): assumes every predictor TF is also a row of `res` (i.e.
    # listed in stageTFs); otherwise this assignment fails - confirm.
    cellImportance <- fit$importance
    res[names(cellImportance), cell] <- cellImportance
    err[cell] <- mean((fit$yvar - fit$predicted.oob)^2)
  }
}


# Output directory: one per source run and model configuration. The tissue is
# not part of the path, so runs for different tissues share this directory.
resultDir <- str_c(
  "/net/waterston/vol9/ChipSeqPipeline",
  str_c(src, "Results", sep = "_"),
  stage, tfFilter, clusterFilter, predictorValue, alternateTargets,
  sep = "/"
)
# The directory normally exists already after the first tissue has been
# processed, so the "already exists" warning is suppressed; a directory that
# is still missing afterwards is a real failure.
dir.create(path = resultDir, recursive = TRUE, showWarnings = FALSE)
if (!dir.exists(resultDir)) {
  stop("Could not create result directory ", resultDir, call. = FALSE)
}

# Raw variable importance, as RDS and as TSV with the TF names in a column.
vimpFile <- str_c(resultDir, str_c(tissue, ".vimp.rds", sep = ""), sep = "/")
saveRDS(res, file = vimpFile)
df <- as_tibble(res, rownames = "TF")
write_tsv(df, file = str_c(resultDir, str_c(tissue, ".vimp.tsv", sep = ""), sep = "/"))

# Heatmap values: importances at or below 1 are clamped to 1 (log value 0),
# the rest are log-transformed, and TF/cell pairs with no importance (NA in
# `res`) are set to -10.
res0 <- res
res0[!is.na(res0) & res0 <= 1] <- 1.0
res0 <- log(res0)
res0[is.na(res)] <- -10

# The heatmap is only drawn when there is more than one cell type.
if (length(cellTypes) > 1) {
  pngFile <- str_c(resultDir, str_c(tissue, ".vimp.png", sep = ""), sep = "/")
  pheatmap(res0, fontsize = 10, cellheight = 15, filename = pngFile, angle_col = 45)
}

# Per-cell model error.
write_tsv(
  tibble(cell = cellTypes, mse = err),
  file = str_c(resultDir, str_c(tissue, ".err.tsv", sep = ""), sep = "/")
)
