#' @title SubtypingOmicsData: Subtyping multi-omics data
#' @description Perform subtyping using multiple types of data
#' 
#' @param dataList list of data matrices or data frames. Each matrix represents a data type where the rows are samples and the columns are features. The matrices must have the same set of samples.
#' @param Kmax the maximum number of clusters. Default value is 10.
#' @param noisePercent the parameter to determine the noise standard deviation. Default is "med", i.e. the noise standard deviation is the median standard deviation of the features. If noisePercent is numeric, then the noise standard deviation is noisePercent * sd(data).
#' @param iter the number of perturbed datasets. Default value is 200.
#' @param kmIter the number of initial centers used in k-means clustering. Default value is 50.
#' @param agreementCutoff agreement threshold to be considered consistent. Default value is 0.5.
#' 
#' @details
#' 
#' The input is a list of data matrices where each matrix represents the molecular measurements of a data type. The matrices have the same number of rows (samples). The algorithm first partitions each data type using the function \emph{PerturbationClustering}. It then merges the connectivities across data types into similarity matrices. Similarity-based clustering algorithms, such as partitioning around medoids (pam), hierarchical clustering (hclust), and dynamicTreeCut, are used to partition the built similarity. The algorithm returns the partitioning (from different similarity-based algorithms) that agrees the most with individual data types. That completes Stage I.
#' 
#' In Stage II, the algorithm attempts to split each discovered group if there is a strong agreement between data types, or if the subtyping in Stage I is very unbalanced.
#'
#' @return
#' 
#' \emph{SubtypingOmicsData} returns a list with at least the following components:
#' \item{groups}{A vector of labels indicating the cluster to which each sample is allocated in Stage I}
#' \item{groups2}{A vector of labels indicating the cluster to which each sample is allocated in Stage II}
#' \item{dataTypeResult}{A list of results for individual data type. Each element of the list is the result of the \emph{PerturbationClustering} for the corresponding data matrix provided in dataList.}
#' 
#' @author
#' 
#' Tin Nguyen and Sorin Draghici
#' 
#' @references
#' 
#' Tin Nguyen, Rebecca Tagett, Diana Diaz, and Sorin Draghici (2015) A novel approach for data integration and disease subtyping. Submitted.
#' 
#' @seealso \code{\link{PerturbationClustering}}, \code{\link{hclust}}, \code{\link{pam}}, \code{\link{dynamicTreeCut}}, \code{\link{clusGap}}
#' 
#' @examples
#' 
#' #load the kidney cancer carcinoma data
#' data(KIRC)
#' #perform subtyping on the multi-omics data
#' dataList <- list (mydatGE, mydatME, mydatMI) 
#' names(dataList) = c("GE", "ME", "MI")
#' result=SubtypingOmicsData(dataList = dataList, Kmax = 10, noisePercent = "med", iter = 50)
#' # Plot the Kaplan-Meier curves and calculate Cox p-value
#' library(survival)
#' groups=result$groups;groups2=result$groups2
#' a <-intersect(unique(groups2), unique(groups));names(a) <- intersect(unique(groups2), unique(groups)); a[setdiff(unique(groups2), unique(groups))] <- seq(setdiff(unique(groups2), unique(groups)))+max(groups)
#' colors <- a[levels(factor(groups2))]
#' coxFit <- coxph(Surv(time = Survival, event = Death) ~ as.factor(groups2), data = survival, ties="exact")
#' mfit <- survfit(Surv(Survival, Death == 1) ~ as.factor(groups2), data = survival)
#' plot(mfit, col=colors, main = "Survival curves for KIRC, level 2", xlab = "Days", ylab="Survival", lwd=2)
#' legend("bottomright", legend = paste("Cox p-value:", round(summary(coxFit)$sctest[3],digits = 5), sep=""))
#' legend("bottomleft", fill=colors, legend=paste("Group ",levels(factor(groups2)), ": ", table(groups2)[levels(factor(groups2))], sep=""))
#' 
#' @export 
SubtypingOmicsData <- function (dataList, Kmax=10, noisePercent="med", iter=200, kmIter=50, agreementCutoff=0.5) {
  # Two-stage integrative subtyping.
  #   Stage I : cluster every data type with PerturbationClustering, merge the
  #             resulting connectivity matrices and partition the merged
  #             similarity.
  #   Stage II: try to split every Stage I group that has more than 30 samples.
  # Returns a list with groups (Stage I labels), groups2 (Stage II labels),
  # dataTypeResult (per-data-type PerturbationClustering results) and the
  # intermediate partitionings hierPart, pamPart and kmRP.
  if (length(dataList) == 0) {
    stop("dataList must contain at least one data matrix", call. = FALSE)
  }
  nTypes <- length(dataList)

  # ---- local helpers -------------------------------------------------------

  # Merge the connectivity matrices selected by each result's own k:
  #   orig = mean of the original connectivities,
  #   PW   = element-wise product of the original connectivities,
  #   pert = mean of the perturbed connectivities.
  # Reduce() folds left to right and returns the single element unchanged when
  # only one data type is given (the previous `2:length(x)` loop failed there).
  combineConnectivity <- function(results) {
    origList <- lapply(results, function(res) res$origS[[res$k]])
    pertList <- lapply(results, function(res) res$pertS[[res$k]])
    list(
      orig = Reduce(`+`, origList) / length(results),
      PW   = Reduce(`*`, origList),
      pert = Reduce(`+`, pertList) / length(results)
    )
  }

  # Fraction of off-diagonal entries of m that are exactly 0 or 1. nrow(m) is
  # subtracted from the count to discount the diagonal entries.
  pairAgreement <- function(m) {
    (sum(m == 0) + sum(m == 1) - nrow(m)) / (nrow(m)^2 - nrow(m))
  }

  # Re-run PerturbationClustering on a subset of samples for every data type,
  # with half of the cluster budget.
  clusterSubgroup <- function(samples) {
    lapply(dataList, function(dat) {
      PerturbationClustering(data=dat[samples,], Kmax=Kmax/2, noisePercent=noisePercent, iter=iter, kmIter=kmIter)
    })
  }

  # Cut the hierarchical clustering of a subgroup's product matrix and build
  # the Stage II labels "<parent group>-<sub group>".
  splitLabels <- function(PWM, g) {
    hcPWM <- hclust(dist(PWM))
    maxK <- min(Kmax/2, dim(unique(PWM, MARGIN=2))[2] - 1)
    maxHeightM <- findMaxHeight(hcPWM, maxK)
    paste(g, cutree(hcPWM, maxHeightM), sep="-")
  }

  # ---- Stage I: cluster each data type -------------------------------------

  dataTypeResult <- list()
  for (i in seq_len(nTypes)) {
    message(paste0("Data type: ", i))
    # noisePercent is forwarded as given; it used to be hard-coded to "med",
    # which silently ignored the caller's setting.
    dataTypeResult[[i]] <- PerturbationClustering(data=dataList[[i]], Kmax=Kmax, noisePercent=noisePercent, iter=iter, kmIter=kmIter)
  }

  merged <- combineConnectivity(dataTypeResult)
  orig <- merged$orig
  PW <- merged$PW
  pert <- merged$pert

  # Partition of each individual data type, derived from its perturbed
  # connectivity matrix at its own k.
  groupings <- lapply(dataTypeResult, function(res) {
    kmeansSS(res$pertS[[res$k]], res$k)$cluster
  })

  # Candidate partitionings of the merged similarity.
  hierPart <- clusterUsingHierarchical(orig = orig, pert = pert, Kmax=Kmax, groupings=groupings)
  pamPart <- clusterUsingPAM(orig = orig, pert = pert, Kmax=Kmax, groupings=groupings)

  hcP <- hclust(as.dist(1-pert), method="average")
  groupRP <- dynamicTreeCut::cutreeDynamic(hcP, distM=1-pert, cutHeight = 0.9*max(hcP$height))
  # If label 0 is present, shift all labels up by one so they start at 1.
  if (any(groupRP == 0, na.rm = TRUE)) groupRP <- groupRP + 1
  names(groupRP) <- rownames(orig)
  kmRP <- structure(list(cluster=groupRP), class="kmeans")

  agreement <- pairAgreement(orig)

  if (agreement > agreementCutoff) {
    # Data types agree strongly: partition the product matrix directly.
    hcW <- hclust(dist(PW))
    maxK <- min(Kmax, dim(unique(PW, MARGIN=2))[2])
    maxHeight <- findMaxHeight(hcW, maxK = maxK)
    groups <- cutree(hcW, maxHeight)

    message("Check if we can proceed to stage II")
    groups2 <- groups
    for (g in sort(unique(groups))) {
      miniGroup <- names(groups[groups==g])
      # this is just to make sure we don't split a group that is already very small
      if (length(miniGroup) > 30) {
        mergedM <- combineConnectivity(clusterSubgroup(miniGroup))
        # Split only if the data types also agree within the subgroup.
        if (pairAgreement(mergedM$orig) >= agreementCutoff) {
          groups2[miniGroup] <- splitLabels(mergedM$PW, g)
        }
      }
    }

  } else {
    # Weak agreement: take whichever of the hierarchical / PAM partitionings
    # has the smaller diff at its selected k ...
    if (hierPart$diff[hierPart$k] < pamPart$diff[pamPart$k]) {
      km <- hierPart$km
    } else {
      km <- pamPart$km
    }

    # ... unless the dynamic-tree-cut partitioning agrees better with the
    # individual data types.
    l1 <- groupings; l1[[length(l1)+1]] <- kmRP$cluster
    l2 <- groupings; l2[[length(l2)+1]] <- km$cluster
    if (clusterAgreement(l1, nrow(orig)) > clusterAgreement(l2, nrow(orig))) {
      km <- kmRP
    }

    groups <- km$cluster

    message("Check if can proceed to stage II")
    groups2 <- groups
    normalizedEntropy <- entropy(table(groups)) / log(length(unique(groups)))

    # Stage II is attempted only for unbalanced Stage I partitionings.
    # isTRUE() guards the single-group case, where the ratio is 0/0 = NaN and
    # a bare `if` would stop with an error; Stage II is skipped in that case.
    if (isTRUE(normalizedEntropy < 0.5)) {
      for (g in sort(unique(groups))) {
        miniGroup <- names(groups[groups==g])
        # this is just to make sure we don't split a group that is already very small
        if (length(miniGroup) > 30) {
          # this is to check if the data types in this group can be split:
          # count the data types for which the gap statistic selects more than
          # one cluster
          gapCount <- 0
          for (i in seq_len(nTypes)) {
            gap <- clusGap(prcomp(dataList[[i]][miniGroup,])$x, FUN=kmeans, K.max=Kmax/2, B=100)
            if (maxSE(gap$Tab[,"gap"], gap$Tab[,"SE.sim"], method="firstSEmax") > 1) {
              gapCount <- gapCount + 1
            }
          }
          # Split only when more than half of the data types support it.
          if (gapCount > nTypes/2) {
            mergedM <- combineConnectivity(clusterSubgroup(miniGroup))
            groups2[miniGroup] <- splitLabels(mergedM$PW, g)
          }
        }
      }
    }

  }

  result <- list()
  result$groups <- groups
  result$groups2 <- groups2
  result$dataTypeResult <- dataTypeResult
  result$hierPart <- hierPart
  result$pamPart <- pamPart
  result$kmRP <- kmRP

  result
}
