setwd('/Volumes/BB_Backup_3/BD_aging_project/2018-09_revision_analyses/Machine_learning/rerun_with_same_package_version/Model_RData/RF_GBM_summary_feature_importance')
library('pheatmap')
library('ggplot2')

# 2017-03-15
# new runs with new data

# 2017-03-28
# corrected cerebellum runs

# 2017-03-29
# include OB with brain data

# 2017-04-20
# use bubble chart to include Gini and DecAccuracy in plotting

# 2018-09-20
# use values from run with updated caret

# Colour ramps (1000 interpolated steps each), running from white to dark red.
# The first ramp has an extra near-white "snow" stop after "white"; the ".acc"
# ramp goes straight from white to the first pink.
my.col.palette <- colorRampPalette(
  c("white", "snow",
    "#FFCCCC", "#FF9999", "indianred1",
    "firebrick1", "firebrick3", "firebrick4")
)(1000)
my.col.palette.acc <- colorRampPalette(
  c("white",
    "#FFCCCC", "#FF9999", "indianred1",
    "firebrick1", "firebrick3", "firebrick4")
)(1000)


# Rank product of per-tissue values.
#
# Every column listed in `value.cols` is ranked on its own with rank() (base R
# defaults: ascending, so the smallest value gets rank 1), and the ranks found
# on each row are multiplied together.
#
# Args:
#   data4:      data frame (or list of equal-length vectors) with one entry per
#               feature; must contain every column named in `value.cols`.
#   value.cols: names of the columns to rank and combine. Defaults to the four
#               tissue columns used throughout this script.
#
# Returns:
#   Numeric vector with one rank product per row of `data4`, in row order.
#
# Errors:
#   Stops if `value.cols` is empty or if any requested column is absent.
#   (Reading an absent column with `$` yields NULL rather than an error, so the
#   check is done explicitly here.)
get_rank_prod <- function (data4,
                           value.cols = c("OB_V2", "Liver_V2", "Heart_V2", "Cereb_V2")) {
  if (length(value.cols) == 0) {
    stop("get_rank_prod: 'value.cols' must name at least one column", call. = FALSE)
  }
  missing.cols <- setdiff(value.cols, names(data4))
  if (length(missing.cols) > 0) {
    stop("get_rank_prod: missing column(s): ",
         paste(missing.cols, collapse = ", "), call. = FALSE)
  }

  # one column of ranks per requested column; cbind keeps this a matrix even
  # when data4 has a single row
  my.rks <- do.call(cbind, lapply(value.cols, function(my.col) rank(data4[[my.col]])))

  # product of the ranks across columns, row by row
  apply(my.rks, 1, prod)
}


#####################################################
#### 1. Runs over samples (with CONSTANT class)  ####
#####################################################

## (node purity)
# Median GBM feature importance from the runs over samples, one file per tissue.
# The files are read without a header and with the first line skipped, so
# read.table() names the columns V1, V2, ...; below, V1 is used as the feature
# key and V2 as the importance value.
OB    <- read.table('../Model_RData/GBM/Chromatin/OB/2018-09-20_Olfactory_Bulb_median_features_importance_classification_over_samples.txt', skip = 1)
Liver <- read.table('../Model_RData/GBM/Chromatin/Liver/2018-09-20_Liver_median_features_importance_classification_over_samples.txt', skip = 1)
Heart <- read.table('../Model_RData/GBM/Chromatin/Heart/2018-09-20_Heart_median_features_importance_classification_over_samples.txt', skip = 1)
Cereb <- read.table('../Model_RData/GBM/Chromatin/Cerebellum/2018-09-20_Cerebellum_median_features_importance_classification_over_samples.txt', skip = 1)

# prefix the column names with the tissue so they stay distinct after merging
colnames(OB)    <- paste("OB", colnames(OB), sep = "_")
colnames(Liver) <- paste("Liver", colnames(Liver), sep = "_")
colnames(Heart) <- paste("Heart", colnames(Heart), sep = "_")
colnames(Cereb) <- paste("Cereb", colnames(Cereb), sep = "_")

# join the four tables on the feature name; the key column ends up as "OB_V1".
# The all.y merges keep features that are absent from the left-hand table,
# which leaves NA in that table's value column.
data1 <- merge(OB, Liver, by.x = "OB_V1", by.y = "Liver_V1", all.y = TRUE)
data2 <- merge(Heart, Cereb, by.x = "Heart_V1", by.y = "Cereb_V1")

data3 <- merge(data1, data2, by.x = "OB_V1", by.y = "Heart_V1", all.y = TRUE)
rownames(data3) <- data3$OB_V1
results.samp.gini <- data3

# value column and display label for each tissue, in plotting order
my.colnames <- c("OB_V2", "Liver_V2", "Heart_V2", "Cereb_V2")
my.tissues  <- c("OB", "Liver", "Heart", "Cereb")

#### Importance is on a different scale in each tissue: express every tissue
#### as a percentage of its own maximum.
# na.rm = TRUE so that one feature missing from a tissue does not turn that
# tissue's whole scaled column into NA
my.gini.maxes <- vapply(results.samp.gini[my.colnames], max, numeric(1), na.rm = TRUE)
results.samp.gini.scaled <- results.samp.gini
for (my.col in my.colnames) {
  results.samp.gini.scaled[[my.col]] <- 100 * results.samp.gini[[my.col]] / my.gini.maxes[[my.col]]
}

# order features by their rank product across tissues, lowest product first
my.rk.prod   <- get_rank_prod(results.samp.gini.scaled[, -1])
my.rk.sorted <- sort(my.rk.prod, index.return = TRUE, decreasing = FALSE)

# format for ggplot: one row per (feature, tissue) pair, features in sorted order
my.sorted.results <- results.samp.gini.scaled[my.rk.sorted$ix, ]
my.per.tissue <- lapply(seq_along(my.colnames), function(h) {
  my.new <- my.sorted.results[, c("OB_V1", my.colnames[h])]
  my.new$Tissue <- rep(my.tissues[h], nrow(my.new))
  colnames(my.new) <- c("Feature_name", "Gini", "Tissue")
  my.new
})
my.complete.results <- do.call(rbind, my.per.tissue)

# to preserve the wanted order on both axes: features in sorted order, tissues
# in the order of my.tissues (a plain character column would be laid out
# alphabetically by ggplot)
my.complete.results$Feature_name <- factor(my.complete.results$Feature_name, levels = unique(my.complete.results$Feature_name))
my.complete.results$Tissue <- factor(my.complete.results$Tissue, levels = my.tissues)

my.pdfname <- paste(Sys.Date(), "summary_GBM_Importance_over_samples_balloon_plot.pdf", sep = "_")

# balloon plot: point size encodes the scaled importance; the plot object is
# built before the PDF device is opened
my.plot <- ggplot(my.complete.results, aes(x = Tissue, y = Feature_name, size = Gini)) +
  theme_bw() +
  geom_point(shape = 16, colour = "firebrick3")
my.plot <- my.plot + ggtitle("GBM Sampling Feature Importance") + labs(x = "Tissue", y = "Feature")
#my.plot <- my.plot + scale_size_area(max_size = 8)

pdf(my.pdfname, onefile = FALSE, height = 10, width = 10)
print(my.plot)
dev.off()

#####################################################
####    2. Simple runs  (*NO* CONSTANT class)    ####
#####################################################

##  (node purity)
# GBM feature importance from the simple runs without the constant class, one
# file per tissue. The files are read without a header and with the first line
# skipped, so read.table() names the columns V1, V2, ...; below, V1 is used as
# the feature key and V2 as the importance value.
OB    <- read.table('../Model_RData/GBM/Chromatin/OB/2018-09-20_Olfactory_Bulb_GBM_features_importance_classification_noCST.txt', skip = 1)
Liver <- read.table('../Model_RData/GBM/Chromatin/Liver/2018-09-20_Liver_GBM_features_importance_classification_noCST.txt', skip = 1)
Heart <- read.table('../Model_RData/GBM/Chromatin/Heart/2018-09-20_Heart_GBM_features_importance_classification_noCST.txt', skip = 1)
Cereb <- read.table('../Model_RData/GBM/Chromatin/Cerebellum/2018-09-20_Cerebellum_GBM_features_importance_classification_noCST.txt', skip = 1)

# prefix the column names with the tissue so they stay distinct after merging
colnames(OB)    <- paste("OB", colnames(OB), sep = "_")
colnames(Liver) <- paste("Liver", colnames(Liver), sep = "_")
colnames(Heart) <- paste("Heart", colnames(Heart), sep = "_")
colnames(Cereb) <- paste("Cereb", colnames(Cereb), sep = "_")

# join the four tables on the feature name; the key column ends up as "OB_V1".
# The all.y merges keep features that are absent from the left-hand table,
# which leaves NA in that table's value column.
data1 <- merge(OB, Liver, by.x = "OB_V1", by.y = "Liver_V1", all.y = TRUE)
data2 <- merge(Heart, Cereb, by.x = "Heart_V1", by.y = "Cereb_V1")

data3 <- merge(data1, data2, by.x = "OB_V1", by.y = "Heart_V1", all.y = TRUE)
rownames(data3) <- data3$OB_V1

results.samp.gini <- data3

# value column and display label for each tissue, in plotting order
my.colnames <- c("OB_V2", "Liver_V2", "Heart_V2", "Cereb_V2")
my.tissues  <- c("OB", "Liver", "Heart", "Cereb")

#### Importance is not on the same scale in the different tissues: express
#### every tissue as a percentage of its own maximum.
# na.rm = TRUE so that one feature missing from a tissue does not turn that
# tissue's whole scaled column into NA
my.gini.maxes <- vapply(results.samp.gini[my.colnames], max, numeric(1), na.rm = TRUE)
results.samp.gini.scaled <- results.samp.gini
for (my.col in my.colnames) {
  results.samp.gini.scaled[[my.col]] <- 100 * results.samp.gini[[my.col]] / my.gini.maxes[[my.col]]
}

# order features by their rank product across tissues, lowest product first
my.rk.prod   <- get_rank_prod(results.samp.gini.scaled[, -1])
my.rk.sorted <- sort(my.rk.prod, index.return = TRUE, decreasing = FALSE)

# format for ggplot: one row per (feature, tissue) pair, features in sorted order
my.sorted.results <- results.samp.gini.scaled[my.rk.sorted$ix, ]
my.per.tissue <- lapply(seq_along(my.colnames), function(h) {
  my.new <- my.sorted.results[, c("OB_V1", my.colnames[h])]
  my.new$Tissue <- rep(my.tissues[h], nrow(my.new))
  colnames(my.new) <- c("Feature_name", "Gini", "Tissue")
  my.new
})
my.complete.results <- do.call(rbind, my.per.tissue)

# to preserve the wanted order on both axes: features in sorted order, tissues
# in the order of my.tissues (a plain character column would be laid out
# alphabetically by ggplot)
my.complete.results$Feature_name <- factor(my.complete.results$Feature_name, levels = unique(my.complete.results$Feature_name))
my.complete.results$Tissue <- factor(my.complete.results$Tissue, levels = my.tissues)

my.pdfname <- paste(Sys.Date(), "summary_GBM_Importance_NoCST_balloon_plot.pdf", sep = "_")

# balloon plot: point size encodes the scaled importance; the plot object is
# built before the PDF device is opened
my.plot <- ggplot(my.complete.results, aes(x = Tissue, y = Feature_name, size = Gini)) +
  theme_bw() +
  geom_point(shape = 16, colour = "firebrick3")
# NOTE(review): this title is the same as the one on the over-samples plot,
# although the data here come from the noCST runs -- confirm whether it should
# say so.
my.plot <- my.plot + ggtitle("GBM Sampling Feature Importance") + labs(x = "Tissue", y = "Feature")
#my.plot <- my.plot + scale_size_area(max_size = 8)

pdf(my.pdfname, onefile = FALSE, height = 10, width = 10)
print(my.plot)
dev.off()