# All relative input paths and the output PDFs below are resolved against this
# directory.
# NOTE(review): the inputs read later start with '../Model_RData/', which from
# this directory resolves to a 'Model_RData' folder nested inside the parent
# 'Model_RData' folder -- confirm that layout exists on the analysis machine.
setwd('/Volumes/BB_Backup_3/BD_aging_project/2018-09_revision_analyses/Machine_learning/rerun_with_same_package_version/Model_RData/RF_GBM_summary_feature_importance')
# NOTE(review): pheatmap is not referenced in the visible part of this script --
# confirm whether it is still needed.
library('pheatmap')
# ggplot2 draws the balloon plots.
library('ggplot2')

# Keep strings read from the input tables as character vectors (not factors).
options(stringsAsFactors=F)
# 2017-03-15
# new runs with new data

# 2017-03-28
# corrected cerebellum runs

# 2017-03-29
# include OB with brain data

# 2017-04-20
# use bubble chart to include Gini and DecAccuracy in plotting

# 2018-09-20
# use values from run with updated caret


# 1000-step colour ramp from white through progressively darker reds; used as
# the colour scale for mean decrease in accuracy in the balloon plots.
my.col.palette.acc <- local({
  anchor.colours <- c("white", "#FF9999", "indianred1",
                      "firebrick1", "firebrick3", "firebrick4")
  make.ramp <- colorRampPalette(anchor.colours)
  make.ramp(1000)
})


# Rank product of per-feature scores across the four tissues.
#
# Args:
#   data4: data.frame (or list) holding numeric columns OB_V2, Liver_V2,
#          Heart_V2 and Cereb_V2, one entry per feature.
#
# Returns:
#   Numeric vector with one value per feature: the product of that feature's
#   rank within each of the four columns. rank() gives rank 1 to the smallest
#   value and averages ties, so small products mark features that score low
#   in every tissue. rank() keeps NA as NA by default, so a feature with a
#   missing score in any tissue gets an NA product.
#
# Stops with an error when one of the four columns is absent; `$` would
# otherwise return NULL for it instead of failing.
get_rank_prod <- function(data4) {
  tissue.cols <- c("OB_V2", "Liver_V2", "Heart_V2", "Cereb_V2")

  missing.cols <- setdiff(tissue.cols, names(data4))
  if (length(missing.cols) > 0) {
    stop("get_rank_prod: missing column(s): ",
         paste(missing.cols, collapse = ", "), call. = FALSE)
  }

  # One column of ranks per tissue, rows in the same order as data4
  my.rks <- do.call(cbind, lapply(tissue.cols, function(tissue.col) {
    rank(data4[[tissue.col]])
  }))

  apply(my.rks, 1, prod)
}


#####################################################
#### 1. Runs over samples (with CONSTANT class)  ####
#####################################################

## a. Gini score (node purity)
# One importance table per tissue. The first line of each file is skipped and
# no header is read, so read.table names the columns V1, V2, ...; each column
# name is then prefixed with its tissue so it stays identifiable after merging.
OB <- read.table(
  '../Model_RData/RF/Chromatin/OB/2018-09-20_Olfactory_Bulb_median_features_imp_gini_classification_over_samples.txt',
  skip = 1
)
colnames(OB) <- paste("OB", colnames(OB), sep = "_")

Liver <- read.table(
  '../Model_RData/RF/Chromatin/Liver/2018-09-20_Liver_median_features_imp_gini_classification_over_samples.txt',
  skip = 1
)
colnames(Liver) <- paste("Liver", colnames(Liver), sep = "_")

Heart <- read.table(
  '../Model_RData/RF/Chromatin/Heart/2018-09-20_Heart_median_features_imp_gini_classification_over_samples.txt',
  skip = 1
)
colnames(Heart) <- paste("Heart", colnames(Heart), sep = "_")

Cereb <- read.table(
  '../Model_RData/RF/Chromatin/Cerebellum/2018-09-20_Cerebellum_median_features_imp_gini_classification_over_samples.txt',
  skip = 1
)
colnames(Cereb) <- paste("Cereb", colnames(Cereb), sep = "_")

# Join the four tables on their first column (later used as the feature name).
# all.y = TRUE keeps every row of the second table even without a match in the
# first; the Heart/Cereb join keeps only rows present in both.
data1 <- merge(x = OB, y = Liver, by.x = "OB_V1", by.y = "Liver_V1", all.y = TRUE)
data2 <- merge(x = Heart, y = Cereb, by.x = "Heart_V1", by.y = "Cereb_V1")
data3 <- merge(x = data1, y = data2, by.x = "OB_V1", by.y = "Heart_V1", all.y = TRUE)

# Row names mirror the key column
rownames(data3) <- data3[["OB_V1"]]

results.samp.gini <- data3

## b. Mean decrease in accuracy
# Same layout as the Gini tables: first line skipped, default V1, V2, ...
# column names, each prefixed with its tissue before merging.
OB <- read.table(
  '../Model_RData/RF/Chromatin/OB/2018-09-20_Olfactory_Bulb_median_features_imp_dec_acc_classification_over_samples.txt',
  skip = 1
)
colnames(OB) <- paste("OB", colnames(OB), sep = "_")

Liver <- read.table(
  '../Model_RData/RF/Chromatin/Liver/2018-09-20_Liver_median_features_imp_dec_acc_classification_over_samples.txt',
  skip = 1
)
colnames(Liver) <- paste("Liver", colnames(Liver), sep = "_")

Heart <- read.table(
  '../Model_RData/RF/Chromatin/Heart/2018-09-20_Heart_median_features_imp_dec_acc_classification_over_samples.txt',
  skip = 1
)
colnames(Heart) <- paste("Heart", colnames(Heart), sep = "_")

Cereb <- read.table(
  '../Model_RData/RF/Chromatin/Cerebellum/2018-09-20_Cerebellum_median_features_imp_dec_acc_classification_over_samples.txt',
  skip = 1
)
colnames(Cereb) <- paste("Cereb", colnames(Cereb), sep = "_")

# Join on the first column of each table. all.y = TRUE keeps every row of the
# second table; the Heart/Cereb join keeps only rows present in both.
data1 <- merge(x = OB, y = Liver, by.x = "OB_V1", by.y = "Liver_V1", all.y = TRUE)
data2 <- merge(x = Heart, y = Cereb, by.x = "Heart_V1", by.y = "Cereb_V1")
data3 <- merge(x = data1, y = data2, by.x = "OB_V1", by.y = "Heart_V1", all.y = TRUE)

# Row names mirror the key column
rownames(data3) <- data3[["OB_V1"]]

results.samp.decAcc <- data3

#####################################################
# for sorting, weigh the impact of Gini and of DecAcc

# The two tables are combined row-by-row below, so they must list the same
# features in the same order; stop rather than silently mix up features.
stopifnot(identical(rownames(results.samp.gini), rownames(results.samp.decAcc)))

# Grand mean of the per-tissue column means (column 1, the feature key, is
# excluded). Parentheses keep the values echoed at top level as before.
(my.gini.grand.mean <- mean(vapply(results.samp.gini[, -1], mean, numeric(1)))) # recorded by original author: 8.584465
(my.decAcc.grand.mean <- mean(vapply(results.samp.decAcc[, -1], mean, numeric(1)))) # recorded by original author: 0.007118409

# scaling for ranking: bring the accuracy decrease onto the Gini scale so both
# measures contribute comparably to the combined score
scale.factor <- my.gini.grand.mean / my.decAcc.grand.mean
my.weighted.rssults <- results.samp.gini[, -1] + scale.factor * results.samp.decAcc[, -1]

# Rank product across tissues, sorted ascending; $ix holds the row order.
# sort() drops NA rank products by default.
my.rk.prod <- get_rank_prod(my.weighted.rssults)
my.rk.sorted <- sort(my.rk.prod, index.return = TRUE, decreasing = FALSE)


#### Gini is on different scales for the different tissues: scale per tissue?
# Express each tissue's Gini score as a percentage of that tissue's maximum.
my.gini.maxes <- vapply(results.samp.gini[, -1], max, numeric(1))
results.samp.gini.scaled <- results.samp.gini
for (k in 1:4) {
  # table column k + 1 pairs with entry k of my.gini.maxes (column 1 is the key)
  results.samp.gini.scaled[, k + 1] <- 100 * results.samp.gini[, k + 1] / my.gini.maxes[k]
}

# Score column of each tissue and the matching label used in the plot
my.tissues <- c("OB", "Liver", "Heart", "Cereb")
my.colnames <- c("OB_V2", "Liver_V2", "Heart_V2", "Cereb_V2")

# format for ggplot
# Long-format table with one row per (feature, tissue) pair, features in
# rank-product order. Built in one pass instead of growing with rbind().
my.ranked.rows <- my.rk.sorted$ix
my.ranked.features <- rownames(results.samp.gini.scaled)[my.ranked.rows]
# Look up accuracy rows by feature name (exact match) rather than by position
my.acc.rows <- match(my.ranked.features, rownames(results.samp.decAcc))

my.per.tissue <- lapply(seq_along(my.colnames), function(h) {
  data.frame(
    Feature_name = results.samp.gini.scaled[my.ranked.rows, "OB_V1"],
    Gini = results.samp.gini.scaled[my.ranked.rows, my.colnames[h]],
    # sized by the number of ranked rows: sort() may have dropped NA products,
    # so this can be shorter than the full table
    Tissue = rep(my.tissues[h], length(my.ranked.rows)),
    Mean_decrease_in_accuracy = results.samp.decAcc[my.acc.rows, my.colnames[h]],
    row.names = my.ranked.features,
    stringsAsFactors = FALSE
  )
})
my.complete.results <- do.call(rbind, my.per.tissue)

# make decrease in acc percentage
my.complete.results$Mean_decrease_in_accuracy <- 100 * my.complete.results$Mean_decrease_in_accuracy

# will make Gini the size and Dec in accuracy the color

# to preserve the wanted order
my.complete.results$Feature_name <- factor(my.complete.results$Feature_name,
                                           levels = unique(my.complete.results$Feature_name))

# Output file name is prefixed with today's date
my.pdfname <- paste(Sys.Date(), "RF_with_sampling_accuracy_balloon_plot.pdf", sep = "_")

# Balloon plot: one point per feature/tissue pair, point size mapped to Gini
# and colour mapped to Mean_decrease_in_accuracy.
my.plot <- ggplot(
  my.complete.results,
  aes(x = Tissue, y = Feature_name, colour = Mean_decrease_in_accuracy, size = Gini)
) +
  theme_bw() +
  geom_point(shape = 16) +
  ggtitle("Random Forest Feature Importance") +
  labs(x = "Tissue", y = "Feature") +
  scale_colour_gradientn(colours = my.col.palette.acc, space = "Lab",
                         na.value = "grey50", guide = "colourbar")
#my.plot <- my.plot + scale_size_area(max_size = 8)

pdf(my.pdfname, onefile = FALSE, height = 10, width = 10)
print(my.plot)
dev.off()


#####################################################
####    2. Simple runs  (*NO* CONSTANT class)    ####
#####################################################

## a. Gini score (node purity)
# One importance table per tissue for the runs without the constant class.
# The first line of each file is skipped and no header is read, so read.table
# names the columns V1, V2, ...; each name is then prefixed with its tissue.
OB <- read.table(
  '../Model_RData/RF/Chromatin/OB/2018-09-20_Olfactory_Bulb_RF_features_imp_gini_classification_noCST.txt',
  skip = 1
)
colnames(OB) <- paste("OB", colnames(OB), sep = "_")

Liver <- read.table(
  '../Model_RData/RF/Chromatin/Liver//2018-09-20_Liver_RF_features_imp_gini_classification_noCST.txt',
  skip = 1
)
colnames(Liver) <- paste("Liver", colnames(Liver), sep = "_")

Heart <- read.table(
  '../Model_RData/RF/Chromatin/Heart//2018-09-20_Heart_RF_features_imp_gini_classification_noCST.txt',
  skip = 1
)
colnames(Heart) <- paste("Heart", colnames(Heart), sep = "_")

Cereb <- read.table(
  '../Model_RData/RF/Chromatin/Cerebellum//2018-09-20_Cerebellum_RF_features_imp_gini_classification_noCST.txt',
  skip = 1
)
colnames(Cereb) <- paste("Cereb", colnames(Cereb), sep = "_")

# Join on the first column of each table. all.y = TRUE keeps every row of the
# second table; the Heart/Cereb join keeps only rows present in both.
data1 <- merge(x = OB, y = Liver, by.x = "OB_V1", by.y = "Liver_V1", all.y = TRUE)
data2 <- merge(x = Heart, y = Cereb, by.x = "Heart_V1", by.y = "Cereb_V1")
data3 <- merge(x = data1, y = data2, by.x = "OB_V1", by.y = "Heart_V1", all.y = TRUE)

# Row names mirror the key column
rownames(data3) <- data3[["OB_V1"]]

# The results.samp.* name is reused here although these are the noCST runs
results.samp.gini <- data3

## b. Mean decrease in accuracy
# Same layout as the Gini tables: first line skipped, default V1, V2, ...
# column names, each prefixed with its tissue before merging.
OB <- read.table(
  '../Model_RData/RF/Chromatin/OB/2018-09-20_Olfactory_Bulb_RF_features_imp_dec_acc_classification_noCST.txt',
  skip = 1
)
colnames(OB) <- paste("OB", colnames(OB), sep = "_")

Liver <- read.table(
  '../Model_RData/RF/Chromatin/Liver/2018-09-20_Liver_RF_features_imp_dec_acc_classification_noCST.txt',
  skip = 1
)
colnames(Liver) <- paste("Liver", colnames(Liver), sep = "_")

Heart <- read.table(
  '../Model_RData/RF/Chromatin/Heart/2018-09-20_Heart_RF_features_imp_dec_acc_classification_noCST.txt',
  skip = 1
)
colnames(Heart) <- paste("Heart", colnames(Heart), sep = "_")

Cereb <- read.table(
  '../Model_RData/RF/Chromatin/Cerebellum/2018-09-20_Cerebellum_RF_features_imp_dec_acc_classification_noCST.txt',
  skip = 1
)
colnames(Cereb) <- paste("Cereb", colnames(Cereb), sep = "_")

# Join on the first column of each table. all.y = TRUE keeps every row of the
# second table; the Heart/Cereb join keeps only rows present in both.
data1 <- merge(x = OB, y = Liver, by.x = "OB_V1", by.y = "Liver_V1", all.y = TRUE)
data2 <- merge(x = Heart, y = Cereb, by.x = "Heart_V1", by.y = "Cereb_V1")
data3 <- merge(x = data1, y = data2, by.x = "OB_V1", by.y = "Heart_V1", all.y = TRUE)

# Row names mirror the key column
rownames(data3) <- data3[["OB_V1"]]

# The results.samp.* name is reused here although these are the noCST runs
results.samp.decAcc <- data3

#####################################################
# for sorting, weigh the impact of Gini and of DecAcc

# The two tables are combined row-by-row below, so they must list the same
# features in the same order; stop rather than silently mix up features.
stopifnot(identical(rownames(results.samp.gini), rownames(results.samp.decAcc)))

# Grand mean of the per-tissue column means (column 1, the feature key, is
# excluded). Parentheses keep the values echoed at top level as before.
(my.gini.grand.mean <- mean(vapply(results.samp.gini[, -1], mean, numeric(1)))) # recorded by original author: 3.225598
(my.decAcc.grand.mean <- mean(vapply(results.samp.decAcc[, -1], mean, numeric(1)))) # recorded by original author: 0.005579576

# scaling for ranking: bring the accuracy decrease onto the Gini scale so both
# measures contribute comparably to the combined score
scale.factor <- my.gini.grand.mean / my.decAcc.grand.mean
my.weighted.rssults <- results.samp.gini[, -1] + scale.factor * results.samp.decAcc[, -1]

# Rank product across tissues, sorted ascending; $ix holds the row order.
# sort() drops NA rank products by default.
my.rk.prod <- get_rank_prod(my.weighted.rssults)
my.rk.sorted <- sort(my.rk.prod, index.return = TRUE, decreasing = FALSE)


#### Gini is on different scales for the different tissues: scale per tissue?
# Express each tissue's Gini score as a percentage of that tissue's maximum.
my.gini.maxes <- vapply(results.samp.gini[, -1], max, numeric(1))
results.samp.gini.scaled <- results.samp.gini
for (k in 1:4) {
  # table column k + 1 pairs with entry k of my.gini.maxes (column 1 is the key)
  results.samp.gini.scaled[, k + 1] <- 100 * results.samp.gini[, k + 1] / my.gini.maxes[k]
}

# Score column of each tissue and the matching label used in the plot
my.tissues <- c("OB", "Liver", "Heart", "Cereb")
my.colnames <- c("OB_V2", "Liver_V2", "Heart_V2", "Cereb_V2")

# format for ggplot
# Long-format table with one row per (feature, tissue) pair, features in
# rank-product order. Built in one pass instead of growing with rbind().
my.ranked.rows <- my.rk.sorted$ix
my.ranked.features <- rownames(results.samp.gini.scaled)[my.ranked.rows]
# Look up accuracy rows by feature name (exact match) rather than by position
my.acc.rows <- match(my.ranked.features, rownames(results.samp.decAcc))

my.per.tissue <- lapply(seq_along(my.colnames), function(h) {
  data.frame(
    Feature_name = results.samp.gini.scaled[my.ranked.rows, "OB_V1"],
    Gini = results.samp.gini.scaled[my.ranked.rows, my.colnames[h]],
    # sized by the number of ranked rows: sort() may have dropped NA products,
    # so this can be shorter than the full table
    Tissue = rep(my.tissues[h], length(my.ranked.rows)),
    Mean_decrease_in_accuracy = results.samp.decAcc[my.acc.rows, my.colnames[h]],
    row.names = my.ranked.features,
    stringsAsFactors = FALSE
  )
})
my.complete.results <- do.call(rbind, my.per.tissue)

# make decrease in acc percentage
my.complete.results$Mean_decrease_in_accuracy <- 100 * my.complete.results$Mean_decrease_in_accuracy

# will make Gini the size and Dec in accuracy the color

# to preserve the wanted order
my.complete.results$Feature_name <- factor(my.complete.results$Feature_name,
                                           levels = unique(my.complete.results$Feature_name))

# Output file name is prefixed with today's date
my.pdfname <- paste(Sys.Date(), "RF_NoConstant_accuracy_balloon_plot.pdf", sep = "_")

# Balloon plot: one point per feature/tissue pair, point size mapped to Gini
# and colour mapped to Mean_decrease_in_accuracy.
my.plot <- ggplot(
  my.complete.results,
  aes(x = Tissue, y = Feature_name, colour = Mean_decrease_in_accuracy, size = Gini)
) +
  theme_bw() +
  geom_point(shape = 16) +
  ggtitle("Random Forest Feature Importance") +
  labs(x = "Tissue", y = "Feature") +
  scale_colour_gradientn(colours = my.col.palette.acc, space = "Lab",
                         na.value = "grey50", guide = "colourbar")
#my.plot <- my.plot + scale_size_area(max_size = 8)

pdf(my.pdfname, onefile = FALSE, height = 10, width = 10)
print(my.plot)
dev.off()

