# NOTE: this script used to start with rm(list = ls()). That call was dropped:
# it wipes the caller's workspace when the script is source()d, and nothing
# below depends on it (every object read here is assigned or load()ed first).

library(iClusterPlus)

# Input directory (one <dataset>.RData file per dataset) and output directory
# for the saved fits and the summary PDF.
# NOTE(review): the two paths differ in case ("Subtyping" vs "SubTyping");
# confirm both resolve on a case-sensitive file system.
dataPath <- "~/Desktop/Projects/Subtyping/DataGEO/"
resultPath <- "~/Desktop/Projects/SubTyping/PackageAndTesting/iClusterPlusResult/"

# Alternative paths (kept for reference, currently disabled):
# dataPath="/wsu/home/ex/ex60/ex6091/Subtyping/PackageAndTesting/DataGEO/"
# resultPath="/wsu/home/ex/ex60/ex6091/Subtyping/PackageAndTesting/iClusterPlusResult/"

# Values of K handed to tune.iClusterPlus(). The cluster columns are labelled
# "K=2" .. "K=10" further down, i.e. K = 1:9 corresponds to 2:10 clusters.
kRange <- 1:9
# Passed as the `cpus` argument of tune.iClusterPlus().
nThread <- 60

set.seed(1)


# Fit iClusterPlus for every dataset under several caps on the number of genes.
# A cap of 50000 is meant to exceed the gene count, so all genes are used
# (the filter below only runs when ncol(data) > maxGeneNo).
for (maxGeneNo in c(3000, 4000, 50000)) {
  for (dataset in c("AML2004", "GSE10245", "GSE19188", "GSE14924",
                    "Brain2002", "GSE43580", "Lung2001", "GSE15061")) {
    # The .RData file is expected to define gene_<dataset> and group_<dataset>.
    load(paste0(dataPath, dataset, ".RData"))
    data <- get(paste0("gene_", dataset))
    group <- get(paste0("group_", dataset))

    # Remove healthy samples (second column of `group` holds the label).
    # drop = FALSE keeps the two-dimensional shape even if one row remains.
    healthyIds <- rownames(group)[group[, 2] == "healthy"]
    data <- data[!rownames(data) %in% healthyIds, , drop = FALSE]
    group <- group[!rownames(group) %in% healthyIds, , drop = FALSE]

    # PCA is computed on the unfiltered gene set, before the timer starts.
    pca <- prcomp(data)

    t1 <- Sys.time()

    # Keep the maxGeneNo columns with the largest median absolute deviation.
    if (ncol(data) > maxGeneNo) {
      geneMad <- apply(data, MARGIN = 2, FUN = mad)
      topGenes <- order(geneMad, decreasing = TRUE)[seq_len(maxGeneNo)]
      data <- data[, topGenes, drop = FALSE]
    }

    # One tuned fit per K, stored at position K of the list.
    cv.fit <- list()
    for (k in kRange) {
      cv.fit[[k]] <- tune.iClusterPlus(cpus = nThread, dt1 = data, K = k,
                                       type = c("gaussian"))
    }

    t2 <- Sys.time()

    # Object names passed to save() are what later load() calls restore, so
    # they must stay exactly as they are.
    resultFile <- paste0(resultPath, "iClusterPlus_", dataset, "_", maxGeneNo,
                         ".RData")
    save(cv.fit, dataset, data, group, pca, t1, t2, file = resultFile)
  }
}

# Write one page per (gene cap, dataset) to a summary PDF: for each K, the
# deviance ratio of the model that has the smallest BIC.
pdfFile <- paste0(resultPath, "iClusterPlus_GEO_MoreAnalysis_Summary.pdf")
pdf(pdfFile)
# `finally` guarantees the PDF device is closed even if a load() or plot call
# fails part-way through; otherwise the device would stay open.
invisible(tryCatch({
  for (maxGeneNo in c(3000, 4000, 50000)) {
    for (dataset in c("GSE10245", "GSE19188", "GSE43580", "GSE15061",
                      "GSE14924", "Lung2001", "AML2004", "Brain2002")) {
      # Restores cv.fit (and the other objects saved with it).
      resultFile <- paste0(resultPath, "iClusterPlus_", dataset, "_",
                           maxGeneNo, ".RData")
      load(resultFile)

      nK <- length(cv.fit)
      BIC <- getBIC(cv.fit)
      devR <- getDevR(cv.fit)

      # Row of the minimum BIC in each column. which.min() returns integer(0)
      # for an all-NA column; map that to NA so the result stays a plain
      # integer vector instead of degrading to a list.
      minBICid <- vapply(seq_len(nK), function(i) {
        best <- which.min(BIC[, i])
        if (length(best) == 0L) NA_integer_ else as.integer(best)
      }, integer(1))

      # Matrix indexing picks devR[minBICid[i], i] for every i; an NA row
      # index yields NA, which routes to the placeholder plot below.
      devRatMinBIC <- devR[cbind(minBICid, seq_len(nK))]

      plotTitle <- paste0("iClusterPlus result, ", dataset, ", ", maxGeneNo)
      if (!anyNA(devRatMinBIC)) {
        plot(devRatMinBIC, xlab = "k", main = plotTitle)
      } else {
        # Placeholder page so the PDF keeps one page per combination.
        plot(1, xlab = "k", main = paste0(plotTitle, " NA NA"))
      }
    }
  }
}, finally = dev.off()))


## Model choices read off the BIC plots, and the adjusted Rand index (ARI) of
## the resulting cluster memberships against the known sample groups.
library("flexclust")

datasetNames <- c("GSE10245", "GSE19188", "GSE43580", "GSE15061",
                  "GSE14924", "Lung2001", "AML2004", "Brain2002")

# One row per dataset, one column per gene cap; NA means no model was chosen.
# NOTE(review): these values are used below as COLUMN INDICES into the
# getClusters() result, whose columns are labelled "K=2", "K=3", ... So a value
# of j selects the column labelled "K=<j+1>". Confirm that this is the intended
# reading of the plots (the x-axis there is the 1-based index k).
clusterNo <- data.frame(row.names = datasetNames,
                        X3000 = c(2, 6, 6, 9, 4, 6, 3, 2),
                        X4000 = c(2, 5, 5, 9, 3, 7, NA, 2),
                        X50000 = c(2, 3, 3, 6, 6, 3, NA, 1))

# Declare all three columns up front so ARI mirrors clusterNo; previously
# X50000 only came into existence as a side effect of the first assignment.
ARI <- data.frame(row.names = datasetNames,
                  X3000 = rep(NA_real_, length(datasetNames)),
                  X4000 = rep(NA_real_, length(datasetNames)),
                  X50000 = rep(NA_real_, length(datasetNames)))

for (maxGeneNo in c(3000, 4000, 50000)) {
  geneCol <- paste0("X", maxGeneNo)
  for (dataset in datasetNames) {
    chosenCol <- clusterNo[dataset, geneCol]
    # Nothing to score for this combination; skip the load() as well.
    if (is.na(chosenCol)) {
      next
    }

    # Restores cv.fit, data and group for this combination.
    resultFile <- paste0(resultPath, "iClusterPlus_", dataset, "_", maxGeneNo,
                         ".RData")
    load(resultFile)

    clusters <- getClusters(cv.fit)
    rownames(clusters) <- rownames(data)
    colnames(clusters) <- paste0("K=", 2:(length(cv.fit) + 1))
    memb <- clusters[, chosenCol]

    # correct = TRUE requests the chance-corrected (adjusted) Rand index.
    ARI[dataset, geneCol] <- randIndex(memb, group[, 2], correct = TRUE)
  }
}




