# Remove every object from the current workspace before the run starts.
rm(list = ls())

# Directory holding the input "<dataset>.RData" files. The trailing "/" is
# required: file names are appended to it with paste(..., sep="").
dataPath <- "~/Desktop/Projects/SubTyping/METABRIC/"
# Directory receiving the clustering results and plots (trailing "/" required
# for the same reason).
resultPath <- "~/Desktop/Projects/SubTyping/PackageAndTesting/CCResult/"

# dataPath="/wsu/home/ex/ex60/ex6091/Subtyping/PackageAndTesting/DataTCGA/"
# resultPath="/wsu/home/ex/ex60/ex6091/Subtyping/PackageAndTesting/CCResult/"

# IMPORTANT: restart R after installing ConsensusClusterPlus; otherwise the script crashes with a strange message:
# Error: lazy-load database 'P' is corrupt
library(preprocessCore)
library(ConsensusClusterPlus)

# Column-wise z-score scaling, i.e. the normalization used for SNF before the
# data types are concatenated.
#   x: numeric matrix, or any object as.matrix() can convert; every column is
#      scaled independently.
# Returns a matrix with the dimensions of as.matrix(x) in which each column
# has had its mean subtracted and has been divided by its standard deviation.
# Columns whose standard deviation is 0 are divided by 1 instead, so they are
# centred only.
standardNormalization <- function (x) 
{
    x <- as.matrix(x)
    colCenter <- apply(x, 2, mean)
    colSpread <- apply(x, 2, sd)
    colSpread[colSpread == 0] <- 1
    # Work on the transpose so that the per-column vectors recycle along the
    # features, then flip back to the original orientation.
    shifted <- t(x) - colCenter
    scaled <- shifted / colSpread
    t(scaled)
}

# Cluster counts of interest; only the maximum is used, as maxK for
# ConsensusClusterPlus.
clusRange <- 2:10

# Create directory `path` (including missing parents) unless it already
# exists; stop with a clear error when it cannot be created.
ensureDir <- function(path) {
    if (!dir.exists(path) && !dir.create(path, recursive = TRUE)) {
        stop("Cannot create directory: ", path, call. = FALSE)
    }
    invisible(path)
}

# Subtract from every row of `d` that row's median (NAs ignored).
medianCenterRows <- function(d) {
    sweep(d, 1, apply(d, 1, median, na.rm = TRUE))
}

# Median-centre the rows of `d` (features in rows, patients in columns) and
# run ConsensusClusterPlus on the result.
#   d:      numeric matrix, features x patients.
#   outDir: directory used as the ConsensusClusterPlus `title`; PNG plots are
#           written there. It is created when missing.
#   maxK:   largest number of clusters to evaluate.
# Returns the list produced by ConsensusClusterPlus.
runConsensusClustering <- function(d, outDir, maxK = max(clusRange)) {
    ensureDir(outDir)
    ConsensusClusterPlus(medianCenterRows(d), maxK = maxK, reps = 1000,
                         pItem = 0.8, pFeature = 1, title = outDir,
                         clusterAlg = "hc", distance = "pearson",
                         plot = "png", seed = 888)
}

for (dataset in c("METABRIC_discovery","METABRIC_validation")) {
    # The .RData file is expected to provide the objects used below:
    # mydatGE, mydatCNV, survival, survivalDFS and clinical.
    dataFile <- paste0(dataPath, dataset, ".RData")
    load(dataFile)

    t1 <- Sys.time()

    # Keep only the expression columns without missing values.
    mydatGE <- mydatGE[, colSums(is.na(mydatGE)) == 0, drop = FALSE]
    # Drop the first five CNV columns (presumably annotation -- TODO confirm)
    # and transpose so that the row names can be matched against patients.
    mydatCNV <- t(mydatCNV[, -c(1:5), drop = FALSE])

    # Restrict everything to the patients present in all three tables.
    patients <- rownames(survival)
    patients <- intersect(patients, rownames(mydatGE))
    patients <- intersect(patients, rownames(mydatCNV))
    if (length(patients) == 0) {
        stop("No patients shared between survival, GE and CNV data in ", dataset,
             call. = FALSE)
    }

    mydatGE <- mydatGE[patients, , drop = FALSE]
    mydatCNV <- mydatCNV[patients, , drop = FALSE]

    datasetDir <- paste0(resultPath, dataset, "/")

    # Each data type on its own.
    result_GE <- runConsensusClustering(t(mydatGE), paste0(datasetDir, "GE"))
    result_CNV <- runConsensusClustering(t(mydatCNV), paste0(datasetDir, "CNV"))

    # Both data types concatenated; every feature is z-scored first so the two
    # types are on a comparable scale.
    combined <- t(standardNormalization(cbind(mydatGE, mydatCNV)))
    result_All <- runConsensusClustering(combined, paste0(datasetDir, "All"))
    #icl_All = calcICL(result_All,title=path,plot="png")

    t2 <- Sys.time()

    resultFile <- paste0(resultPath, "CC_", dataset, ".RData")
    save(result_GE, result_CNV, result_All, dataset, survival, survivalDFS, clinical, patients, t1, t2, file=resultFile)
}


library(survival)
# Relate cluster membership to survival and draw the Kaplan-Meier curves of
# the clusters on the current graphics device.
#   survi:  data frame with the columns Survival (time) and Death (event),
#           one row per patient.
#   groups: cluster label per patient, in the same order as the rows of survi.
#   main:   plot title.
# The legend shows the score-test p-value of a Cox model fitted with exact tie
# handling and the concordance of a Cox model fitted with the default tie
# handling. Invisibly returns list(coxp, CI, fit).
plotClusterSurvival <- function(survi, groups, main) {
    groups <- factor(groups)
    names(groups) <- rownames(survi)

    exactFit <- coxph(Surv(time = Survival, event = Death) ~ groups, data = survi, ties = "exact")
    coxp <- round(summary(exactFit)$sctest[3], digits = 10)

    defaultFit <- coxph(Surv(time = Survival, event = Death) ~ groups, data = survi)
    CI <- round(summary(defaultFit)$concordance[1], 3)

    mfit <- survfit(Surv(Survival, Death == 1) ~ factor(groups), data = survi)
    # One colour per factor level: survfit orders the curves by level, so the
    # colours must not depend on the order in which labels first appear.
    plot(mfit, col = seq_len(nlevels(groups)), main = main, xlab = "Days", ylab = "Survival", lwd = 2)
    legend("top", legend = paste("Cox p-value:", coxp, ", CI=", CI, sep = ""))

    invisible(list(coxp = coxp, CI = CI, fit = mfit))
}

for (dataset in c("METABRIC_discovery","METABRIC_validation")) {
    resultFile <- paste0(resultPath, "CC_", dataset, ".RData")
    load(resultFile)

    # Number of clusters per data type; we get these from CDF plots.
    if (dataset == "METABRIC_discovery") {
        kGE <- 6
        kCNV <- 8
        kAll <- 10
    } else if (dataset == "METABRIC_validation") {
        kGE <- 8
        kCNV <- 4
        kAll <- 8
    }

    # Extract the cluster assignments before opening the PDF so that a bad
    # index cannot leave a graphics device open.
    groupGE <- result_GE[[kGE]]$consensusClass
    groupCNV <- result_CNV[[kCNV]]$consensusClass
    groupAll <- result_All[[kAll]]$consensusClass

    pdfFile <- paste0(resultPath, "CC_", dataset, ".pdf")
    pdf(pdfFile)
    # Close the device even when a model fit or plot fails.
    tryCatch({
        # Disease-free survival
        survi <- survivalDFS[patients,]
        plotClusterSurvival(survi, groupGE,
                            paste0("DFS survival curves for gene expression of ", dataset, " (CC)"))
        # NOTE(review): unlike the other five titles this one has no "of "
        # before the dataset name; kept as is so the output is unchanged.
        plotClusterSurvival(survi, groupCNV,
                            paste0("DFS survival curves for CNV ", dataset, " (CC)"))
        plotClusterSurvival(survi, groupAll,
                            paste0("DFS survival curves for combined data of ", dataset, " (CC)"))

        # Overall survival
        survi <- survival[patients,]
        plotClusterSurvival(survi, groupGE,
                            paste0("Overall survival curves for gene expression of ", dataset, " (CC)"))
        plotClusterSurvival(survi, groupCNV,
                            paste0("Overall survival curves for CNV of ", dataset, " (CC)"))
        plotClusterSurvival(survi, groupAll,
                            paste0("Overall survival curves for combined data of ", dataset, " (CC)"))
    }, finally = dev.off())
}

