# Remove every object listed by ls() from the current environment so the run
# starts without leftover variables.
# NOTE(review): if this file is source()d from an interactive session, this
# also deletes the caller's own objects; it does not detach packages or reset
# options.
rm(list=ls())

# Attach iClusterPlus, which provides tune.iClusterPlus used below
# (getBIC/getDevR presumably come from it as well - confirm against the
# installed package version).
library(iClusterPlus)

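# Alternative paths for a local (desktop) run; uncomment these and comment out
# the active paths below to use them.
# dataPath="~/Desktop/Projects/Subtyping/DataGEO/"
# resultPath="~/Desktop/Projects/SubTyping/PackageAndTesting/iClusterPlusResult/"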

# Input and output locations. Both values end in "/" because file names are
# appended to them by plain string concatenation further down.
dataPath <- paste0(
    "/wsu/home/ex/ex60/ex6091/Subtyping/PackageAndTesting/",
    "DataGEO/"
)
resultPath <- paste0(
    "/wsu/home/ex/ex60/ex6091/Subtyping/PackageAndTesting/",
    "iClusterPlusResult/"
)

# Run parameters.
# Upper bound on the number of genes kept per dataset.
maxGeneNo <- 2000
# Values passed as K to tune.iClusterPlus; K = 1..9 corresponds to 2..10
# clusters (number of clusters = K + 1).
kRange <- seq_len(9)
# Value passed as the `cpus` argument of tune.iClusterPlus.
nThread <- 60

# Fix the random number generator state once, before any dataset is processed.
set.seed(1)

# Tune iClusterPlus on every GEO dataset and store one result file per dataset.
#
# For each dataset the loop
#   1. loads <dataPath><dataset>.RData, which must contain the objects
#      gene_<dataset> (rows = samples, columns = genes) and group_<dataset>
#      (row names = sample ids, second column = sample label),
#   2. drops the samples labelled "healthy",
#   3. computes a PCA of the remaining samples on all genes,
#   4. keeps the maxGeneNo genes with the largest median absolute deviation,
#   5. runs tune.iClusterPlus once for every K in kRange,
#   6. saves cv.fit, dataset, data, group, pca, t1 and t2 to
#      <resultPath>iClusterPlus_<dataset>.RData.
#
# t1/t2 bracket the gene filtering and the tuning; the PCA is not timed.

# Make sure the output directory exists before the long-running tuning starts,
# instead of discovering the problem only when save() is reached.
if (!dir.exists(resultPath) && !dir.create(resultPath, recursive = TRUE)) {
    stop("Cannot create result directory: ", resultPath, call. = FALSE)
}

for (dataset in c("AML2004", "GSE10245", "GSE19188","GSE14924", "Brain2002", "GSE43580","Lung2001", "GSE15061")) {
    # Load into a throw-away environment so that the gene_*/group_* objects of
    # earlier datasets do not accumulate in the global environment, and so
    # that a missing object fails loudly (inherits = FALSE) instead of
    # silently picking up a same-named global.
    inputEnv <- new.env()
    load(paste0(dataPath, dataset, ".RData"), envir = inputEnv)
    data <- get(paste0("gene_", dataset), envir = inputEnv, inherits = FALSE)
    group <- get(paste0("group_", dataset), envir = inputEnv, inherits = FALSE)
    rm(inputEnv)

    # remove healthy samples; drop = FALSE keeps both objects two-dimensional
    # even if only a single sample remains
    healthyIds <- rownames(group)[group[, 2] == "healthy"]
    data <- data[!rownames(data) %in% healthyIds, , drop = FALSE]
    group <- group[!rownames(group) %in% healthyIds, , drop = FALSE]

    # PCA on the full gene set (before the gene filter below)
    pca <- prcomp(data)

    t1 <- Sys.time()

    # Keep only the maxGeneNo most variable genes, ranked by median absolute
    # deviation
    if (ncol(data) > maxGeneNo) {
        mads <- apply(data, MARGIN = 2, FUN = mad)
        keep <- order(mads, decreasing = TRUE)[seq_len(maxGeneNo)]
        data <- data[, keep, drop = FALSE]
    }

    # One tuning result per entry of kRange, stored by position. Indexing by
    # the value of K would leave NULL holes whenever kRange does not start
    # at 1.
    cv.fit <- lapply(kRange, function(k) {
        tune.iClusterPlus(cpus = nThread, dt1 = data, K = k, type = "gaussian")
    })

    t2 <- Sys.time()

    # The object names passed to save() are the names under which the results
    # are stored in the file, so they must stay as they are.
    resultFile <- paste0(resultPath, "iClusterPlus_", dataset, ".RData")
    save(cv.fit, dataset, data, group, pca, t1, t2, file = resultFile)
}

# Summarise the stored tuning results in a single PDF, one page per dataset.
# For every entry of cv.fit, the row with the smallest BIC is located and the
# deviance ratio at that row is plotted against the entry's position.
pdfFile <- paste0(resultPath, "iClusterPlus_GEO_Summary.pdf")
pdf(pdfFile)
pdfDevice <- dev.cur()

# tryCatch(finally = ...) closes the PDF device even when a result file is
# missing or malformed; otherwise the device would stay open and the file
# would be left incomplete. The error itself is still propagated.
tryCatch({
    for (dataset in c("GSE10245", "GSE19188", "GSE43580", "GSE15061", "GSE14924", "Lung2001", "AML2004", "Brain2002")) {
        resultFile <- paste0(resultPath, "iClusterPlus_", dataset, ".RData")

        # The result file also stores `dataset`, `data`, `group`, ...; load it
        # into a private environment so that it cannot overwrite the loop
        # variable or other globals, and pick out only the tuning results.
        resultEnv <- new.env()
        load(resultFile, envir = resultEnv)
        fits <- get("cv.fit", envir = resultEnv, inherits = FALSE)

        nK <- length(fits)
        BIC <- getBIC(fits)
        devR <- getDevR(fits)
        # Row index of the smallest BIC within every column
        minBICid <- apply(BIC, 2, which.min)
        # Matrix indexing picks devR[minBICid[i], i] for every column i at
        # once. The variable name is kept because plot() uses it as the
        # default y-axis label.
        devRatMinBIC <- devR[cbind(minBICid, seq_len(nK))]

        plot(devRatMinBIC, xlab = "k", main = paste0("iClusterPlus result, ", dataset))
    }
}, finally = dev.off(pdfDevice))

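# Interactive follow-up (not run): for the cv.fit, data and group objects
# currently in the workspace, extract the cluster assignments, pick one K and
# compare its clustering with the known groups via the Rand index.
# nK = length(cv.fit)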
# BIC=getBIC(cv.fit)
# devR = getDevR(cv.fit) 
# minBICid = apply(BIC,2,which.min)
# devRatMinBIC = rep(NA,nK)
# for(i in 1:nK){
#     devRatMinBIC[i] = devR[minBICid[i],i]
# } 
# 
# clusters=getClusters(cv.fit)
# rownames(clusters)=rownames(data)
# colnames(clusters)=paste("K=",2:(length(cv.fit)+1),sep="")
# 
# k=4
# best.cluster=clusters[,k]
# best.fit=cv.fit[[k]]$fit[[which.min(BIC[,k])]]
# library("flexclust")
# randIndex(best.cluster,group[,2], correct = F)
