options(stringsAsFactors = FALSE)
library('pROC')
library('caret')
library(doMC)
registerDoMC(cores = 12)


######## run random forest
#### 2016-02-09
# apparently, the small number of changed genes is preventing good learning
# maybe use sampling to help make it more balanced?
get_rf_fit_with_sampling <- function(my.features.all, my.training.idx, my.samples = 250, my.seed = 1234) {
  
  set.seed(my.seed)
  my.training <- my.features.all[my.training.idx,]
  
  my.training$age_change <- factor(my.training$age_change)
  
  # sample from constant in the same number as changing examples
  my.constant.idx <- which(my.training$age_change %in% 'CONSTANT') # indeces of constant examples
  my.changing.idx <- which(my.training$age_change != 'CONSTANT') # indeces of constant examples
  
  # use 10-fold cross-validation to build the model
  my.ctrl.opt <- trainControl(method = "cv", 
                              number = 10,
                              allowParallel=TRUE,
                              summaryFunction=getMultiClassConfusionMatrixBalancedAccuracy,
                              verbose=F,
                              classProbs = TRUE)
  
  # randomForest info for mtry parameter
  # Number of variables randomly sampled as candidates at each split. 
  # Note that the default values are different for classification (sqrt(p) where p is number of variables in x) and regression (p/3)
  #fineGrid <- expand.grid(mtry = seq( floor(sqrt(dim(my.heart.testing)[2])) - 4,floor(sqrt(dim(my.heart.testing)[2])) + 4,4))
  
  # prepare result structure
  my.rf.sampled.fits <- vector(mode='list',length=my.samples)
  
  for ( i in 1:my.samples) {
    
    my.constant.samples <- sample(my.constant.idx, length(my.changing.idx), replace = FALSE)
    
    my.sampled.training <- my.training[c(my.constant.samples,my.changing.idx),]
    
    fineGrid <- expand.grid(mtry = seq(3,21,3))

    # train model with caret train function
    my.rf.fit <- train( age_change ~ .,
                        data = my.sampled.training, # all but gene name
                        method="rf",
                        importance=TRUE,
                        trControl = my.ctrl.opt,
                        tuneGrid = fineGrid,
                        metric="balancedAcc"
                        )
    
    my.rf.sampled.fits[[i]] <- my.rf.fit
  }
  
  return (my.rf.sampled.fits)
  
}

####################
# 2016-08-03
# run without CONSTANT class

get_rf_fit_noCST <- function(my.features.all, my.training.idx, my.seed = 1234) {
  
  set.seed(my.seed)
  my.training <- my.features.all[my.training.idx,]
  
  # remove the constant class
  my.training2 <- data.frame(my.training[my.training$age_change != 'CONSTANT',])
  my.training2$age_change <- factor(my.training2$age_change)
  
  # use 10-fold cross-validation to build the model
  my.ctrl.opt <- trainControl(method = "cv",
                              number = 10,
                              allowParallel=TRUE,
                              verbose=F,
                              summaryFunction=twoClassSummary,
                              classProbs = TRUE)
  
  fineGrid <- expand.grid(mtry = seq(3,21,3))
  
  # train model with caret train function
  my.rf.fit <- train( age_change ~ .,
                      data = my.training2, # all but GeneName, age_FC, FDR
                      method="rf",
                      importance=TRUE,
                      trControl = my.ctrl.opt,
                      tuneGrid = fineGrid,
                      metric="ROC"
                      )
  
  return (my.rf.fit)
  
}



### Aaron functions
#getTwoClassBalancedAccuracy <- function (data, lev = NULL, model = NULL) {
#  require(pROC)
#  if (!all(levels(data[, "pred"]) == levels(data[, "obs"]))){
#    stop("levels of observed and predicted data do not match")
#  }
#  rocObject <- try(pROC:::roc(data$obs, data[, lev[1]]), silent = TRUE)
#  if (class(rocObject)[1] == "try-error") {
#    return(NA)
#  }else{
#    out <- ( sensitivity(data[, "pred"], data[, "obs"], lev[1]) + specificity(data[, "pred"], data[, "obs"], lev[2]) ) /2
#    names(out) <- "balancedAcc"
#    return(out)
#  }
#}

getMultiClassConfusionMatrixBalancedAccuracy <- function (data, lev = NULL, model = NULL) {
  
  if (!all(levels(data[, "pred"]) == levels(data[, "obs"]))){
    stop("levels of observed and predicted data do not match")
  }
  
  testConfMat <- try(confusionMatrix(data$pred, data$obs), silent = TRUE)
  if (class(testConfMat)[1] == "try-error") {
    return(NA)
  }else{
    # this gets the last trhee entries which are the 3 balanced accuracies
    
    out <- mean(testConfMat$byClass[,"Balanced Accuracy"])
    names(out) <- "balancedAcc"
    return(out)
  }
}

######## get data partition
get_training_idx <- function(my.features.all) {
    
  set.seed(1234)
  
  my.training.idx <- createDataPartition(my.features.all$age_change, p=0.67, list=FALSE) # 2/3 for training
  
  return(my.training.idx)
}




