#' Extract mutational signatures from trinucleotide context.
#'
#' @description Decompose a matrix of 96 substitution classes into \code{n} signatures.
#'
#' @details This function decomposes a non-negative matrix into n signatures using NMF (method 'brunet').
#' Extracted signatures are compared against 30 experimentally validated signatures by calculating cosine similarity. See http://cancer.sanger.ac.uk/cosmic/signatures for details.
#'
#' When \code{n} is NULL, ranks \code{2:nTry} are tried and the first rank at which the cophenetic correlation coefficient drops
#' below that of the previous rank is used. If the coefficient never drops, the rank with the highest coefficient is used instead.
#' Substitution classes with zero mutations across all samples are set to a small value (0.1) before decomposition.
#'
#' @param mat Output of \code{\link{trinucleotideMatrix}}: a list whose \code{nmf_matrix} element is a matrix of dimension nx96 (samples by substitution classes).
#' @param n decompose matrix into n signatures. Default NULL. Tries to predict best value for \code{n} by running NMF on a range of values and chooses based on cophenetic correlation coefficient.
#' @param nTry tries up to this number of signatures before choosing best \code{n}. Default 6.
#' @param plotBestFitRes plots consensus heatmap for range of values tried (written to 'nmf_consensus.pdf' in the working directory). Default FALSE
#' @param parallel passed to the .opt argument of \code{\link{nmf}}. e.g, 'P4' for using 4 cores. See note on \code{\link{nmf}} for MAC users.
#' @return a list with decomposed scaled signatures (\code{signatures}), signature contributions in each sample (\code{contributions}),
#' a cosine similarity table against validated signatures (\code{coSineSimMat}) and the fitted NMF object (\code{nmfObj}).
#' @examples
#' \dontrun{
#' laml.tnm <- trinucleotideMatrix(maf = laml, ref_genome = 'hg19.fa', prefix = 'chr',
#' add = TRUE, useSyn = TRUE)
#' laml.sign <- extractSignatures(mat = laml.tnm, plotBestFitRes = FALSE)
#' }
#' @import NMF
#' @importFrom grDevices pdf boxplot.stats dev.off
#' @seealso \code{\link{trinucleotideMatrix}} \code{\link{plotSignatures}}
#' @export
extractSignatures = function(mat, n = NULL, nTry = 6, plotBestFitRes = FALSE, parallel = NULL){

    #Validate arguments early so that failures carry a clear message
    if(!is.list(mat) || is.null(mat$nmf_matrix)){
      stop("mat must be a list containing an 'nmf_matrix' element (see trinucleotideMatrix)", call. = FALSE)
    }
    if(!is.null(n) && !(is.numeric(n) && length(n) == 1 && !is.na(n) && n >= 1)){
      stop("n must be NULL or a single positive number", call. = FALSE)
    }

    #transpose matrix (substitution classes in rows, samples in columns)
    mat = t(mat$nmf_matrix)

    #Validation
    zeroMutIdx = which(rowSums(mat) == 0)

    if(length(zeroMutIdx) > 0){
      message('Warning : Found zero mutations for conversions ', paste(names(zeroMutIdx), collapse = ', '))
      #Add small value to avoid zero counts (maybe not appropriate). This happens when sample size is low or in cancers with low mutation rate.
      mat[zeroMutIdx,] = 0.1
    }

    #Notes:
    #Available methods for nmf decompositions are 'brunet', 'lee', 'ls-nmf', 'nsNMF', 'offset'.
    #But based on 21 breast cancer signatures data, default brunet seems to be working close to the results.
    #Sticking with default for now.

    if(is.null(n)){
      message('Estimating best rank..')
      if(!is.null(parallel)){
        nmfTry = NMF::nmfEstimateRank(mat, seq(2,nTry), method='brunet', nrun=10, seed=123456, .opt = parallel) #try nmf for a range of values
      }else{
        nmfTry = NMF::nmfEstimateRank(mat, seq(2,nTry), method='brunet', nrun=10, seed=123456) #try nmf for a range of values
      }

      if(plotBestFitRes){
        pdf('nmf_consensus.pdf', bg = 'white', pointsize = 9, width = 12, height = 12, paper = "special")
        #Always close the device, even when plotting fails, so it does not leak
        tryCatch(NMF::consensusmap(nmfTry), finally = dev.off())
        message('created nmf_consensus.pdf')
      }

      nmf.sum = summary(nmfTry) # Get summary of estimates
      data.table::setDT(nmf.sum)
      print(nmf.sum)

      #First point where cophenetic correlation coefficient starts decreasing
      copheneticDiff = c(0, diff(nmf.sum$cophenetic))
      bestFit = nmf.sum$rank[which(copheneticDiff < 0)][1]

      if(is.na(bestFit)){
        #Coefficient never decreases over the tried ranks; fall back to the rank with the highest coefficient
        bestFit = nmf.sum$rank[which.max(nmf.sum$cophenetic)]
        if(length(bestFit) != 1 || is.na(bestFit)){
          stop("Could not estimate best rank from cophenetic correlation coefficients. Please provide n.", call. = FALSE)
        }
      }

      plot(nmf.sum$rank, nmf.sum$cophenetic, axes = FALSE, pch = 16, col = "#D8B365", cex = 1.2, xlab = NA, ylab = NA)
      axis(side = 1, at = nmf.sum$rank, labels = nmf.sum$rank, lwd = 3, font = 2, cex.axis = 1.2)
      lines(x = nmf.sum$rank, y = round(nmf.sum$cophenetic, digits = 4), lwd = 3)
      points(nmf.sum$rank, nmf.sum$cophenetic, pch = 16, col = "#D8B365", cex = 1.6)
      axis(side = 2, at = round(nmf.sum$cophenetic, digits = 4), lwd = 3, font = 2, las = 2, cex = 1.4, cex.axis = 1.2)
      #Dashed vertical marker at the chosen rank
      segments(x0 = bestFit, y0 = 0, x1 = bestFit, y1 = nmf.sum$cophenetic[nmf.sum$rank == bestFit], lwd= 3, lty = 2, col = "maroon")
      title(main = "cophenetic metric", adj = 0, font.main = 4)

      message(paste('Using ',bestFit, ' as a best-fit rank based on decreasing cophenetic correlation coefficient.', sep=''))
      n = as.numeric(bestFit)
    }

    if(!is.null(parallel)){
      conv.mat.nmf = NMF::nmf(x = mat, rank = n, .opt = parallel, seed = 123456)
    }else{
      conv.mat.nmf = NMF::nmf(x = mat, rank = n, seed = 123456)
    }

    #Signatures
    w = NMF::basis(conv.mat.nmf)
    w = apply(w, 2, function(x) x/sum(x)) #Scale the signatures (basis) so that every signature sums to 1
    colnames(w) = paste('Signature', seq_len(ncol(w)), sep='_')

    #Contribution
    h = NMF::coef(conv.mat.nmf)
    colnames(h) = colnames(mat) #correct colnames (seems to be missing with low mutation load)
    if(n == 1){
      #For single signature, contribution will be 100% per sample.
      #Assign directly instead of h/h, which yields NaN for zero coefficients.
      h[] = 1
      rownames(h) = paste('Signature', '1', sep = '_')
    }else{
      h = apply(h, 2, function(x) x/sum(x)) #Scale contributions (coefs) so that every sample sums to 1
      rownames(h) = paste('Signature', seq_len(nrow(h)), sep='_')
    }

    #Validated signatures shipped with the package; rows are re-ordered to match the extracted signatures
    sigs = data.table::fread(input = system.file('extdata', 'signatures.txt', package = 'maftools'), stringsAsFactors = FALSE, data.table = FALSE)
    colnames(sigs) = gsub(pattern = ' ', replacement = '_', x = colnames(sigs))
    rownames(sigs) = sigs$Somatic_Mutation_Type
    sigs = sigs[,-c(1:3)]
    sigs = sigs[rownames(w),]

    #Proposed aetiology for each of the 30 validated signatures, in signature order
    aetiology = c("spontaneous deamination of 5-methylcytosine",
                  "APOBEC Cytidine Deaminase (C>T)", "defects in DNA-DSB repair by HR",
                  "exposure to tobacco (smoking) mutagens", "Unknown", "defective DNA mismatch repair",
                  "UV exposure", "Unknown", "defects in polymerase-eta", "defects in polymerase POLE",
                  "exposure to alkylating agents", "Unknown", "APOBEC Cytidine Deaminase (C>G)",
                  "Unknown", "defective DNA mismatch repair", "Unknown", "Unknown",
                  "Unknown", "Unknown", "defective DNA mismatch repair", "unknown",
                  "exposure to aristolochic acid", "Unknown", "exposures to aflatoxin",
                  "Unknown", "defective DNA mismatch repair", "Unknown", "Unknown",
                  "exposure to tobacco (chewing) mutagens", "Unknown")
    names(aetiology) = paste('Signature', seq_along(aetiology), sep = '_')

    message('Comparing against experimentally validated 30 signatures.. (See http://cancer.sanger.ac.uk/cosmic/signatures for details.)')

    #Cosine similarity of every extracted signature (rows) against every validated signature (columns),
    #computed in one step: dot products divided by the product of the vector norms.
    sigsMat = as.matrix(sigs)
    coSineMat = crossprod(w, sigsMat) / outer(sqrt(colSums(w^2)), sqrt(colSums(sigsMat^2)))
    coSineMat = round(coSineMat, digits = 3)
    dimnames(coSineMat) = list(colnames(w), colnames(sigs))

    for(i in seq_len(nrow(coSineMat))){
      #which.max picks a single (first) best match, so ties after rounding do not garble the message
      bestIdx = which.max(coSineMat[i,])
      bestSig = colnames(coSineMat)[bestIdx]
      ae = paste0("Aetiology: ", unname(aetiology[bestSig]), " [cosine-similarity: ", coSineMat[i, bestIdx], "]")
      message('Found ',rownames(coSineMat)[i], ' most similar to validated ', bestSig, '. ', ae)
    }

    return(list(signatures = w, contributions = h, coSineSimMat = coSineMat, nmfObj = conv.mat.nmf))
}
