####################################
## Random Forest for Lymphocytes ##
###################################

# Configuration -------------------
# Packages are attached in the order listed.
pks <- list("ggplot2", "tidyverse", "randomForest", "caret", "doParallel",
            "gridExtra", "grid")
# invisible() keeps the list returned by lapply() from being auto-printed;
# TRUE is spelled out because `T` is an ordinary, reassignable variable.
invisible(lapply(pks, library, character.only = TRUE, quietly = FALSE))
theme_set(theme_bw())

# NOTE(review): hard-coded, user-specific working directory; every relative
# path below ("data/...", "results/...") is resolved against it.
setwd(dir = "~/RMI2/Projet_TDD/20200114_Normalisation")
# recursive = TRUE also creates the parent "results" directory when missing.
if (!dir.exists(paths = "results/RandomForest_lympho")) {
  dir.create(path = "results/RandomForest_lympho", recursive = TRUE)
}


# functions ---------------------------------------------------------------
#' Plot the variable importance of a fitted model
#'
#' Reads the `IncNodePurity` column of `model$importance` and draws one point
#' per variable. Variables are turned into a factor whose levels follow
#' increasing `IncNodePurity`, so the plot keeps the importance ordering
#' instead of an alphabetical one.
#'
#' @param model A fitted object whose `importance` element is a matrix (or
#'   data frame) with row names and an `IncNodePurity` column.
#' @param model_name Title of the plot.
#' @return A ggplot object.
plotImp <- function(model, model_name) {
  imp <- as.data.frame(model$importance)
  if (!"IncNodePurity" %in% colnames(imp)) {
    stop("`model$importance` has no 'IncNodePurity' column", call. = FALSE)
  }
  imp$variable <- rownames(imp)
  order_var <- imp$variable[order(imp$IncNodePurity, decreasing = FALSE)]
  imp$variable <- factor(imp$variable, levels = order_var)

  ggplot(data = imp, aes(x = variable, y = IncNodePurity)) +
    geom_point(color = "black") +
    coord_flip() +
    ggtitle(model_name) +
    theme(legend.position = "none")
}


# loading and formatting data ----
# NOTE(review): read.csv2() defaults to dec = ","; with sep = "," values
# written with a "." decimal mark are presumably read as text and only become
# numbers in the conversion below -- confirm against the CSV.
# NOTE(review): the object name `table` hides base::table() for variable
# lookup; calls such as table(x) still resolve to the function.
table <- read.csv2(file = "data/2020-02-14_15-33-09_Subset_Data.csv",
                   sep = ",")
table <- as_tibble(table)
# Coerce every column to numeric, one column at a time. apply() on a data
# frame goes through as.matrix(), which format()s numeric columns as soon as
# one column is not numeric; converting per column avoids that round trip.
# Values that are not numbers become NA (with a coercion warning).
table <- as_tibble(lapply(table, function(x) as.numeric(as.character(x))))

# Missing peak / score / n4G values are replaced by 0.
table$n_m6ASeq_Peaks[is.na(table$n_m6ASeq_Peaks)] <- 0
table$total_m6ASeq_score[is.na(table$total_m6ASeq_score)] <- 0
table$n4G_CDS[is.na(table$n4G_CDS)] <- 0
table$n4G_UTR5[is.na(table$n4G_UTR5)] <- 0
table$n4G_UTR3[is.na(table$n4G_UTR3)] <- 0

# Derived columns used as predictors / responses further down.
# X31CDS.End_DG is divided by (CDS_length - 30).
table$X31CDS.End_DG <- table$X31CDS.End_DG / (table$CDS_length - 30)
# Per-region sums of the C and G columns.
table$UTR5_CG <- table$UTR5_C + table$UTR5_G
table$UTR3_CG <- table$UTR3_C + table$UTR3_G
table$CDS_CG <- table$CDS_C + table$CDS_G
# "non TDD" index: Deg3hTrip minus the CHX TDD index, per condition (LR / LA).
table$asIndexnonTDD_LR <- table$Deg3hTrip_LR - table$asIndexTDD_t3TripTripCHX.0hTrip_LR
table$asIndexnonTDD_LA <- table$Deg3hTrip_LA - table$asIndexTDD_t3TripTripCHX.0hTrip_LA


# Column names of the TDD indices (CHX and HAR) for the LR condition.
tdd_LR <- c("asIndexTDD_t3TripTripCHX.0hTrip_LR",
            "asIndexTDD_t3TripTripHAR.0hTrip_LR")

# Columns selected for the LR (resting lymphocytes) models.
index_LR <- c("CDS_length",
              "UTR5_length",
              "Exp_UTR3_Resting_CD4_T_cells_R1_Mean_l",
              "n_m6ASeq_Peaks",
              "X47UTR5.30CDS_DG",
              "X31CDS.End_DG",
              "total_m6ASeq_score",
              "RiboDens_LymphoR",
              "n4G_CDS",
              "n4G_UTR5",
              "n4G_UTR3",
              "UTR5_CG",
              "CDS_CG",
              "UTR3_CG",
              "asIndexnonTDD_LR")

# Column names of the TDD indices (CHX and HAR) for the LA condition.
tdd_LA <- c("asIndexTDD_t3TripTripCHX.0hTrip_LA",
            "asIndexTDD_t3TripTripHAR.0hTrip_LA")

# Columns selected for the LA (activated lymphocytes) models. The C+G columns
# use the `_CG` suffix, matching the columns this script derives and the LR
# list above (they were previously spelled `_GC`).
index_LA <- c("CDS_length",
              "UTR5_length",
              "Exp_UTR3_Activated_CD4_T_cells_A1_Mean_l",
              "n_m6ASeq_Peaks",
              "X47UTR5.30CDS_DG",
              "X31CDS.End_DG",
              "total_m6ASeq_score",
              "RiboDens_LymphoA",
              "n4G_CDS",
              "n4G_UTR5",
              "n4G_UTR3",
              "UTR5_CG",
              "CDS_CG",
              "UTR3_CG",
              "asIndexnonTDD_LA")

# Resting lymphocytes ------------------------------------------------------

## Training set ------------------------------------------------------------

sub_index <- c(index_LR, "asIndexTDD_t3TripTripCHX.0hTrip_LR")
# all_of() states explicitly that sub_index is an external character vector
# of column names (and errors if one of them is missing from `table`).
sub_data <- select(table, all_of(sub_index))

# Keep rows with a TDD index and a finite ribosome density; is.finite()
# rejects NA, NaN, Inf and -Inf in one test.
sub_data <- filter(sub_data,
                   !is.na(asIndexTDD_t3TripTripCHX.0hTrip_LR) &
                     is.finite(RiboDens_LymphoR))

# Drop rows that still have a missing value in any selected column.
sub_data <- na.exclude(sub_data)

set.seed(1518)
# One third of the rows is sampled for training; the rest is used for
# validation.
train <- sample(nrow(sub_data), round(nrow(sub_data) / 3))
train_set <- sub_data[train, ]
valid_set <- sub_data[-train, ]

# fit the random forests (fixed mtry and ntree) ----------

# NOTE(review): `~ .` uses every other column of train_set as a predictor, so
# each model also sees the two other response variables fitted here.
asIndexTDD_in_lymphoR <- randomForest(asIndexTDD_t3TripTripCHX.0hTrip_LR ~ .,
                                      data = train_set, mtry = 5, ntree = 2000)
RibosomeDensity_in_lymphoR <- randomForest(RiboDens_LymphoR ~ .,
                                           data = train_set, mtry = 5, ntree = 2000)
asIndexnonTDD_in_lymphoR <- randomForest(asIndexnonTDD_LR ~ .,
                                         data = train_set, mtry = 5, ntree = 2000)

# check model -------------------------------------------------------------
# For each model: predicted vs observed scatter plot with the identity line
# and Spearman correlation, on the training set and then on the validation
# set, followed by the variable importance plot.
# The former `method = 'rf'` argument was dropped from the predict() calls:
# predict() for randomForest objects has no such argument, so it was ignored.

## IndexTDD
pred_train <- predict(asIndexTDD_in_lymphoR, train_set)
plot(pred_train, train_set$asIndexTDD_t3TripTripCHX.0hTrip_LR)
abline(a = 0, b = 1)
cor(pred_train, train_set$asIndexTDD_t3TripTripCHX.0hTrip_LR, method = "spearman")

pred_valid_IndexTDD <- predict(asIndexTDD_in_lymphoR, valid_set)
plot(pred_valid_IndexTDD, valid_set$asIndexTDD_t3TripTripCHX.0hTrip_LR)
abline(a = 0, b = 1)
cor(pred_valid_IndexTDD, valid_set$asIndexTDD_t3TripTripCHX.0hTrip_LR, method = "spearman")

plotImp(asIndexTDD_in_lymphoR, "asIndexTDD_t3TripTripCHX.0hTrip_LR")

## RiboDens
pred_train_rd <- predict(RibosomeDensity_in_lymphoR, train_set)
plot(pred_train_rd, train_set$RiboDens_LymphoR)
abline(a = 0, b = 1)
cor(pred_train_rd, train_set$RiboDens_LymphoR, method = "spearman")

pred_valid_rd <- predict(RibosomeDensity_in_lymphoR, valid_set)
plot(pred_valid_rd, valid_set$RiboDens_LymphoR)
abline(a = 0, b = 1)
cor(pred_valid_rd, valid_set$RiboDens_LymphoR, method = "spearman")

# The title names the response of this model (it used to repeat the TDD
# index label).
plotImp(RibosomeDensity_in_lymphoR, "RiboDens_LymphoR")


## IndexnonTDD
pred_train_nonTDD <- predict(asIndexnonTDD_in_lymphoR, train_set)
plot(pred_train_nonTDD, train_set$asIndexnonTDD_LR)
abline(a = 0, b = 1)
cor(pred_train_nonTDD, train_set$asIndexnonTDD_LR, method = "spearman")

pred_valid_nonTDD <- predict(asIndexnonTDD_in_lymphoR, valid_set)
plot(pred_valid_nonTDD, valid_set$asIndexnonTDD_LR)
abline(a = 0, b = 1)
cor(pred_valid_nonTDD, valid_set$asIndexnonTDD_LR, method = "spearman")

# The title names the response of this model (it used to repeat the TDD
# index label).
plotImp(asIndexnonTDD_in_lymphoR, "asIndexnonTDD_LR")


# Turn the three n4G columns of sub_data into factors.
sub_data[["n4G_CDS"]] <- as.factor(sub_data[["n4G_CDS"]])
sub_data[["n4G_UTR3"]] <- as.factor(sub_data[["n4G_UTR3"]])
sub_data[["n4G_UTR5"]] <- as.factor(sub_data[["n4G_UTR5"]])

# Long format of the TDD index with n4G_UTR5 as the grouping factor.
msub_data <- reshape2::melt(
  select(sub_data, asIndexTDD_t3TripTripCHX.0hTrip_LR, n4G_UTR5)
)

# Box plot of the melted values per n4G_UTR5 level, then the level counts.
ggplot(msub_data, aes(x = n4G_UTR5, y = (value))) +
  geom_boxplot()
base::table(msub_data[["n4G_UTR5"]])

# Activated lymphocytes ---------------------------------------------------
## Separate training set and valid set
sub_index <- c(index_LA, "asIndexTDD_t3TripTripCHX.0hTrip_LA")
# all_of() states explicitly that sub_index is an external character vector
# of column names (and errors if one of them is missing from `table`).
sub_data <- select(table, all_of(sub_index))

# Keep rows with a TDD index and a finite ribosome density; is.finite()
# rejects NA, NaN, Inf and -Inf in one test.
sub_data <- filter(sub_data,
                   !is.na(asIndexTDD_t3TripTripCHX.0hTrip_LA) &
                     is.finite(RiboDens_LymphoA))

# Drop rows that still have a missing value in any selected column.
sub_data <- na.exclude(sub_data)

set.seed(1518)
# One third of the rows is sampled for training; the rest is used for
# validation.
train <- sample(nrow(sub_data), round(nrow(sub_data) / 3))
train_set <- sub_data[train, ]
valid_set <- sub_data[-train, ]

## Random forest
# NOTE(review): `~ .` uses every other column of train_set as a predictor, so
# each model also sees the two other response variables fitted here.
asIndexTDD_in_lymphoA <- randomForest(asIndexTDD_t3TripTripCHX.0hTrip_LA ~ .,
                                      data = train_set, mtry = 5, ntree = 2000)
RibosomeDensity_in_lymphoA <- randomForest(RiboDens_LymphoA ~ .,
                                           data = train_set, mtry = 5, ntree = 2000)
asIndexnonTDD_in_lymphoA <- randomForest(asIndexnonTDD_LA ~ .,
                                         data = train_set, mtry = 5, ntree = 2000)

# Print the summary of each fitted model.
asIndexTDD_in_lymphoA
RibosomeDensity_in_lymphoA
asIndexnonTDD_in_lymphoA


## testing the prediction

# Scatter plot of observed against predicted values with the identity line.
#   predicted, observed: numeric vectors of the same length
#   title: plot title
#   color: point colour
# Returns a ggplot object.
plot_pred_vs_obs <- function(predicted, observed, title, color) {
  ggplot(data.frame(predicted = unname(predicted), observed = observed),
         aes(x = predicted, y = observed)) +
    geom_point(alpha = 0.2, color = color) +
    xlab("predicted values") +
    ylab("observed values") +
    ggtitle(title) +
    geom_abline(slope = 1, intercept = 0)
}

# The former `method = 'rf'` argument was dropped from the predict() calls:
# predict() for randomForest objects has no such argument, so it was ignored.

### for the index TDD
pred_train <- predict(asIndexTDD_in_lymphoA, train_set)
plot_pred_vs_obs(pred_train, train_set$asIndexTDD_t3TripTripCHX.0hTrip_LA,
                 "Prediction of the TDD index in the training set",
                 "chocolate")
cor(pred_train, train_set$asIndexTDD_t3TripTripCHX.0hTrip_LA, method = "spearman")

pred_valid_IndexTDD <- predict(asIndexTDD_in_lymphoA, valid_set)
plot_pred_vs_obs(pred_valid_IndexTDD, valid_set$asIndexTDD_t3TripTripCHX.0hTrip_LA,
                 "Prediction of the TDD index in the validation set",
                 "chocolate")
cor(pred_valid_IndexTDD, valid_set$asIndexTDD_t3TripTripCHX.0hTrip_LA, method = "spearman")

plotImp(asIndexTDD_in_lymphoA, "asIndexTDD_t3TripTripCHX.0hTrip_LA")


### for Ribosomal occupancy
pred_train_rd <- predict(RibosomeDensity_in_lymphoA, train_set)
plot_pred_vs_obs(pred_train_rd, train_set$RiboDens_LymphoA,
                 "Prediction of the Ribosome occupancy in the training set",
                 "blue")
cor(pred_train_rd, train_set$RiboDens_LymphoA, method = "spearman")

pred_valid_rd <- predict(RibosomeDensity_in_lymphoA, valid_set)
plot_pred_vs_obs(pred_valid_rd, valid_set$RiboDens_LymphoA,
                 "Prediction of the Ribosome occupancy in the validation set",
                 "blue")
cor(pred_valid_rd, valid_set$RiboDens_LymphoA, method = "spearman")

plotImp(RibosomeDensity_in_lymphoA, "RiboDens_lymphoA")


### for Index non TDD
pred_train_nonTDD <- predict(asIndexnonTDD_in_lymphoA, train_set)
plot_pred_vs_obs(pred_train_nonTDD, train_set$asIndexnonTDD_LA,
                 "Prediction of the non TDD index in the training set",
                 "darkgreen")
cor(pred_train_nonTDD, train_set$asIndexnonTDD_LA, method = "spearman")

pred_valid_nonTDD <- predict(asIndexnonTDD_in_lymphoA, valid_set)
plot_pred_vs_obs(pred_valid_nonTDD, valid_set$asIndexnonTDD_LA,
                 "Prediction of the non TDD index in the validation set",
                 "darkgreen")
cor(pred_valid_nonTDD, valid_set$asIndexnonTDD_LA, method = "spearman")

plotImp(asIndexnonTDD_in_lymphoA, "asIndexnonTDD_LA")


# importance grids (formerly "old functions") --------------------

# Save an arranged grid of importance plots as a time-stamped PDF.
#   grid_plot: object returned by grid.arrange()
#   prefix:    start of the file name, e.g. "varImpPlot_LR_"
#   height:    page height in cm (the width is fixed at 35 cm)
#   out_dir:   output directory, created when missing
# The time stamp uses "-" between time fields so the file name contains no
# ":" or space. Returns the file path invisibly.
save_importance_pdf <- function(grid_plot, prefix, height,
                                out_dir = "results/RandomForest_lympho") {
  dir.create(out_dir, recursive = TRUE, showWarnings = FALSE)
  stamp <- format(Sys.time(), "%Y-%m-%d_%H-%M-%S")
  out_file <- file.path(out_dir, paste0(prefix, stamp, ".pdf"))
  ggsave(filename = out_file,
         plot = grid_plot,
         width = 35,
         height = height,
         units = "cm")
  invisible(out_file)
}

# The model lists are not defined anywhere else in this script; build them
# from the forests fitted above unless they already exist in the session.
if (!exists("mod_list_LR")) {
  mod_list_LR <- list(
    asIndexTDD_t3TripTripCHX.0hTrip_LR = asIndexTDD_in_lymphoR,
    RiboDens_LymphoR = RibosomeDensity_in_lymphoR,
    asIndexnonTDD_LR = asIndexnonTDD_in_lymphoR
  )
}
if (!exists("mod_list_LA")) {
  mod_list_LA <- list(
    asIndexTDD_t3TripTripCHX.0hTrip_LA = asIndexTDD_in_lymphoA,
    RiboDens_LymphoA = RibosomeDensity_in_lymphoA,
    asIndexnonTDD_LA = asIndexnonTDD_in_lymphoA
  )
}

# One importance plot per model, arranged on two columns, then saved.
p2 <- lapply(names(mod_list_LR), function(x) plotImp(mod_list_LR[[x]], x))
p3 <- do.call("grid.arrange", c(p2, ncol = 2))
save_importance_pdf(p3, "varImpPlot_LR_", height = 30)

p4 <- lapply(names(mod_list_LA), function(x) plotImp(mod_list_LA[[x]], x))
p5 <- do.call("grid.arrange", c(p4, ncol = 2))
save_importance_pdf(p5, "varImpPlot_LA_", height = 40)


