# Start from an empty workspace and set the folder that receives the PINS
# result files written by this script.
rm(list = ls())
resultPath <- "~/Desktop/Projects/Subtyping/PackageAndTesting/PINSResult/"

library(PINS)

# ---- Standard uniform data, 1 cluster ----
# 100 x 1000 matrix of U(0, 1) draws; no class structure is added.
nrow <- 100
ncol <- 1000

set.seed(1)
dataU <- matrix(runif(nrow * ncol, 0, 1), nrow = nrow, ncol = ncol)

resultU <- PerturbationClustering(data = dataU)

# Store the input matrix together with the clustering output.
resultFile <- paste0(resultPath, "PINS_Uniform1.RData")
save(dataU, resultU, file = resultFile)


# ---- Standard Gaussian data, 1 cluster ----
nrow <- 100
ncol <- 1000
Kmax <- 10

set.seed(1)
dataG <- matrix(rnorm(nrow * ncol, 0, 1), nrow = nrow, ncol = ncol)

resultG <- PerturbationClustering(data = dataG)

# Collect the Discrepancy$AUC vector of `loop` N(0, 1) data sets, one per row.
# Row 1 comes from dataG; rows 2..loop use freshly drawn matrices. No seed is
# set inside the loop, so the draws follow on from the seed set above.
loop <- 20
AUC <- matrix(NA, nrow = loop, ncol = 10)
AUC[1, ] <- resultG$Discrepancy$AUC
for (i in seq(2, loop)) {
  dataTMP <- matrix(rnorm(nrow * ncol, 0, 1), nrow = nrow, ncol = ncol)
  resultTMP <- PerturbationClustering(data = dataTMP)
  AUC[i, ] <- resultTMP$Discrepancy$AUC
}

resultFile <- paste0(resultPath, "PINS_Gaussian1.RData")
save(dataG, resultG, AUC, file = resultFile)


# ---- Gaussian data sets with 2-10 classes ----
Kmax <- 10
nrow <- 100
ncol <- 1000
for (classes in 2:10) {
  set.seed(1)
  dataG <- matrix(rnorm(nrow * ncol, 0, 1), nrow = nrow, ncol = ncol)
  rownames(dataG) <- seq(nrow)

  # Class labels: consecutive runs of nrow %/% classes rows per class; rows
  # left over by the integer division are assigned to the last class.
  str <- rep(seq_len(classes), each = nrow %/% classes)
  if (length(str) < nrow) {
    str <- c(str, rep(classes, nrow - length(str)))
  }
  group <- data.frame(row.names = seq(nrow), Sample = seq(nrow), Group = str)

  # Class i gets +2 added to its own block of 100 columns:
  # columns 100 * (i - 1) + 1 through 100 * i.
  for (i in seq_len(classes)) {
    rows <- rownames(group)[group[, 2] == i]
    cols <- (100 * (i - 1) + 1):(100 * i)
    dataG[rows, cols] <- dataG[rows, cols] + 2
  }

  resultG <- PerturbationClustering(data = dataG)

  # One result file per class count: PINS_Gaussian<classes>.RData
  resultFile <- paste0(resultPath, "PINS_Gaussian", classes, ".RData")
  save(dataG, resultG, group, file = resultFile)
}






# ---- Running time of PINS as a function of Kmax ----
# PINSPath and pdfPath are not assigned anywhere in this script (and the
# workspace is cleared at the top), so default both to resultPath unless the
# session already provides them.
if (!exists("PINSPath")) PINSPath <- resultPath
if (!exists("pdfPath")) pdfPath <- resultPath

dataG <- matrix(rnorm(200 * 10000, 0, 1), nrow = 200, ncol = 10000)

# X: the Kmax values to time. T: elapsed seconds (third element of
# system.time) for each value of X.
# NOTE: the name T is kept because it is the object name stored in
# SimulationKmax.RData. After this assignment T no longer stands for TRUE in
# the global environment, so code that runs later must spell out TRUE.
X <- seq(4, 20, by = 2)
T <- numeric(length(X))
for (i in seq_along(X)) {
  T[i] <- system.time(PerturbationClustering(data = dataG, Kmax = X[i]))[3]
}
save(T, X, dataG, file = paste0(PINSPath, "SimulationKmax.RData"))


# Plot running time (in minutes) against Kmax and write it to a PDF file.
pdfFile <- paste0(pdfPath, "PINS_Kmax_Simulation.pdf")
pdf(pdfFile)
par(tcl = 0.3, mgp = c(1.7, 0.4, 0), mar = c(3, 3, 2.5, 1))
plot(X, T / 60, xaxt = "n",
     xlab = "Maximum number of clusters (K)",
     ylab = "Running time (minute)",
     main = "Effect of K on PINS's running time",
     cex.lab = 1.4, cex.axis = 1.3, cex.main = 1.7, col = "red")
lines(X, T / 60, lwd = 2, col = "blue")
# The default x axis is suppressed above; label exactly the timed Kmax values.
axis(side = 1, at = X, labels = X, cex.axis = 1.3)
dev.off()





############ Check the sensitivity to noise variance
library(flexclust)
library(PINS)
library(ConsensusClusterPlus)
library(iClusterPlus)
library(SNFtool)
nrow <- 100
ncol <- 1000
classes <- 9

# MValues: the shifts added to each class's own block of 100 columns, one
# simulated data set per value. The vectors below are filled per data set.
MValues <- c(4, 3, 2, 1, 0.9, 0.8, 0.7, 0.6)
ARI_PINS <- NULL
ARI_SNF <- NULL
sigma <- NULL

CCPath <- "~/Desktop/Projects/Subtyping/PackageAndTesting/CCResult/"
iClusterPlusPath <- "~/Desktop/Projects/Subtyping/PackageAndTesting/iClusterPlusResult/"

# CCPath="/wsu/home/ex/ex60/ex6091/Subtyping/PackageAndTesting/CCResult/"
# iClusterPlusPath="/wsu/home/ex/ex60/ex6091/Subtyping/PackageAndTesting/iClusterPlusResult/"

# ---- PINS ----
for (ind in seq_along(MValues)) {
  set.seed(1)
  M <- MValues[ind]

  dataG <- matrix(rnorm(nrow * ncol, 0, 1), nrow = nrow, ncol = ncol)
  rownames(dataG) <- seq(nrow)

  # Class labels: consecutive runs of nrow %/% classes rows per class; the
  # leftover rows are assigned to the last class.
  str <- rep(seq_len(classes), each = nrow %/% classes)
  if (length(str) < nrow) {
    str <- c(str, rep(classes, nrow - length(str)))
  }
  group <- data.frame(row.names = seq(nrow), Sample = seq(nrow), Group = str)

  # Add M to class i's rows in columns 100 * (i - 1) + 1 through 100 * i.
  for (i in seq_len(classes)) {
    rows <- rownames(group)[group[, 2] == i]
    cols <- (100 * (i - 1) + 1):(100 * i)
    dataG[rows, cols] <- dataG[rows, cols] + M
  }

  # Despite its name, sds holds the per-column variances (FUN = var);
  # sigma records their median for this data set.
  sds <- apply(dataG, MARGIN = 2, FUN = var)
  sigma[ind] <- median(sds)

  # Cluster with PINS and score the partition against the planted labels.
  result <- PerturbationClustering(dataG)
  ARI_PINS[ind] <- randIndex(group[, 2], result$groups, correct = TRUE)
}
sigma
ARI_PINS

    
# ---- SNF ----
for (ind in seq_along(MValues)) {
  set.seed(1)
  M <- MValues[ind]

  dataG <- matrix(rnorm(nrow * ncol, 0, 1), nrow = nrow, ncol = ncol)
  rownames(dataG) <- seq(nrow)

  # Class labels: consecutive runs of nrow %/% classes rows per class; the
  # leftover rows are assigned to the last class.
  str <- rep(seq_len(classes), each = nrow %/% classes)
  if (length(str) < nrow) {
    str <- c(str, rep(classes, nrow - length(str)))
  }
  group <- data.frame(row.names = seq(nrow), Sample = seq(nrow), Group = str)

  # Add M to class i's rows in columns 100 * (i - 1) + 1 through 100 * i.
  for (i in seq_len(classes)) {
    rows <- rownames(group)[group[, 2] == i]
    cols <- (100 * (i - 1) + 1):(100 * i)
    dataG[rows, cols] <- dataG[rows, cols] + M
  }

  # SNF settings
  K <- 20       # number of neighbors, usually (10~30)
  alpha <- 0.5  # hyperparameter, usually (0.3~0.8)
  NIT <- 10     # number of iterations, usually (10~20); not used below

  # Normalise, build the pairwise distance and affinity matrices, estimate
  # the number of clusters from the affinity graph, then cluster spectrally.
  data <- standardNormalization(dataG)
  PSMgeneE <- dist2(as.matrix(data), as.matrix(data))
  W1 <- affinityMatrix(PSMgeneE, K, alpha)
  C <- estimateNumberOfClustersGivenGraph(W1, NUMC = 2:10)
  # The first element of the returned list is used as the cluster count.
  groupSNF <- spectralClustering(W1, C[[1]])
  ARI_SNF[ind] <- randIndex(group[, 2], groupSNF, correct = TRUE)
}
ARI_SNF
 

# ---- ConsensusClusterPlus: run and store results ----
# For every shift in MValues, rebuild the simulated data set, run
# ConsensusClusterPlus on it and save the output to
# <CCPath>CC_Simulation_<M>.RData.
for (ind in seq_along(MValues)) {
  set.seed(1)
  M <- MValues[ind]

  dataG <- matrix(rnorm(nrow * ncol, 0, 1), nrow = nrow, ncol = ncol)
  rownames(dataG) <- seq(nrow)

  # Class labels: consecutive runs of nrow/classes rows (fraction dropped)
  # per class; the leftover rows are assigned to the last class.
  str <- NULL
  for (i in 1:classes) {
    str <- c(str, rep(i, nrow / classes))
  }
  if (length(str) < nrow) {
    str <- c(str, rep(classes, nrow - length(str)))
  }
  group <- data.frame(row.names = seq(nrow), Sample = seq(nrow), Group = str)

  # Add M to class i's rows in columns 100 * (i - 1) + 1 through 100 * i.
  for (i in 1:classes) {
    rows <- rownames(group)[group[, 2] == i]
    cols <- (100 * (i - 1) + 1):(100 * i)
    dataG[rows, cols] <- dataG[rows, cols] + M
  }

  # Folder handed to ConsensusClusterPlus as `title`. Create it (and any
  # missing parent folders) on first use.
  path <- paste0(CCPath, "Simulation_", M)
  if (!file.exists(path)) dir.create(path, recursive = TRUE)

  # ConsensusClusterPlus receives the transposed matrix; each row of d is
  # centred on its median first.
  # TRUE is spelled out on purpose: the timing section earlier in this script
  # assigns a numeric vector to T, so `na.rm = T` would no longer mean TRUE.
  d <- t(dataG)
  d <- sweep(d, 1, apply(d, 1, median, na.rm = TRUE))
  results <- ConsensusClusterPlus(d, maxK = 10, reps = 1000, pItem = 0.8,
                                  pFeature = 1, title = path,
                                  clusterAlg = "hc", distance = "pearson",
                                  plot = "png", seed = 888)
  resultFile <- paste0(CCPath, "CC_Simulation_", M, ".RData")
  save(dataG, results, group, file = resultFile)
}



# ---- ConsensusClusterPlus: Rand index per data set ----
# k[ind] is the index into `results` whose consensusClass is scored for data
# set `ind`; here it is 9 for every data set.
k <- rep(9, length(MValues))
ARI_CC <- NULL
for (ind in seq_along(MValues)) {
  M <- MValues[ind]
  resultFile <- paste0(CCPath, "CC_Simulation_", M, ".RData")
  # Restores dataG, results and group as saved by the CC run.
  load(resultFile)
  memb <- results[[k[ind]]]$consensusClass
  ARI_CC[ind] <- randIndex(group$Group, memb)
}
ARI_CC

 

# ---- iClusterPlus: run and store results ----
for (ind in seq_along(MValues)) {
  set.seed(1)
  M <- MValues[ind]

  dataG <- matrix(rnorm(nrow * ncol, 0, 1), nrow = nrow, ncol = ncol)
  rownames(dataG) <- seq(nrow)

  # Class labels: consecutive runs of nrow %/% classes rows per class; the
  # leftover rows are assigned to the last class.
  str <- rep(seq_len(classes), each = nrow %/% classes)
  if (length(str) < nrow) {
    str <- c(str, rep(classes, nrow - length(str)))
  }
  group <- data.frame(row.names = seq(nrow), Sample = seq(nrow), Group = str)

  # Add M to class i's rows in columns 100 * (i - 1) + 1 through 100 * i.
  for (i in seq_len(classes)) {
    rows <- rownames(group)[group[, 2] == i]
    cols <- (100 * (i - 1) + 1):(100 * i)
    dataG[rows, cols] <- dataG[rows, cols] + M
  }

  # One tuning run per K = 1..9; cv.fit[[k]] holds the fit for K = k.
  # Each call requests 60 cpus.
  cv.fit <- list()
  for (k in 1:9) {
    cv.fit[[k]] <- tune.iClusterPlus(cpus = 60, dt1 = dataG, K = k,
                                     type = c("gaussian"))
  }
  resultFile <- paste0(iClusterPlusPath, "iClusterPlus_Simulation_", M, ".RData")
  save(dataG, cv.fit, group, file = resultFile)
}

# ---- iClusterPlus: Rand index per data set ----
ARI_iCluster <- NULL
# k[ind] is the column of `clusters` scored for data set `ind`. The column
# labels assigned below start at "K=2", so k = 9 selects the column labelled
# "K=10" and k = 1 the column labelled "K=2".
# NOTE(review): the simulated data have 9 classes; confirm that column 9
# (labelled "K=10") is the intended choice for the first three data sets.
k <- c(9, 9, 9, 1, 1, 1, 1, 1)
for (ind in seq_along(MValues)) {
  M <- MValues[ind]
  resultFile <- paste0(iClusterPlusPath, "iClusterPlus_Simulation_", M, ".RData")
  # Restores dataG, cv.fit and group as saved by the iClusterPlus run.
  load(resultFile)

  # For each fit in cv.fit, take the deviance ratio at the row where the BIC
  # of that fit's column is smallest, and plot the values for inspection.
  nK <- length(cv.fit)
  BIC <- getBIC(cv.fit)
  devR <- getDevR(cv.fit)
  minBICid <- apply(BIC, 2, which.min)
  devRatMinBIC <- rep(NA, nK)
  for (i in seq_len(nK)) {
    devRatMinBIC[i] <- devR[minBICid[i], i]
  }
  plot(devRatMinBIC)

  clusters <- getClusters(cv.fit)
  rownames(clusters) <- rownames(dataG)
  colnames(clusters) <- paste("K=", 2:(length(cv.fit) + 1), sep = "")
  memb <- clusters[, k[ind]]

  # TRUE is spelled out on purpose: the timing section earlier in this script
  # assigns a numeric vector to T, so `correct = T` would no longer mean TRUE.
  ARI_iCluster[ind] <- randIndex(memb, group[, 2], correct = TRUE)
}
############ Check the sensitivity to noise variance
