# qPCR fold changes (ddCt method, cerebellum): load the table, group the values
# per target / age / sex, draw box plots and run a rank-sum test.
options(stringsAsFactors = FALSE)
my.data <- read.csv('/Users/benayoun/Downloads/2018-08-17_Alex_qPCR.txt', sep = "\t", header = TRUE)

# Peek at the first group. This inspection was originally issued before
# my.data had been read, which fails in a fresh session; it now runs after
# the table is loaded.
my.data[1:5, 2]

# Row slices pick the age/sex group (1-5, 6-10, 11-15, 16-20) and columns 2-4
# pick the target, as reflected in the element names.
my.data.list <- list(
  "circ30_4m_F"   = my.data[1:5, 2],
  "circ30_20m_F"  = my.data[6:10, 2],
  "circ30_4m_M"   = my.data[11:15, 2],
  "circ30_20m_M"  = my.data[16:20, 2],
  "Aff3_4m_F"     = my.data[1:5, 3],
  "Aff3_20m_F"    = my.data[6:10, 3],
  "Aff3_4m_M"     = my.data[11:15, 3],
  "Aff3_20m_M"    = my.data[16:20, 3],
  "Cdr1as_4m_F"   = my.data[1:5, 4],
  "Cdr1as_20m_F"  = my.data[6:10, 4],
  "Cdr1as_4m_M"   = my.data[11:15, 4],
  "Cdr1as_20m_M"  = my.data[16:20, 4]
)

# First pass: wide y-range. The four colours are recycled over the 12 boxes,
# so each target gets two orchid boxes followed by two dodgerblue boxes.
boxplot(my.data.list,
        col = c("orchid", "orchid", "dodgerblue", "dodgerblue"),
        outline = FALSE, ylim = c(0, 5),
        las = 2, ylab = "Fold change in 20m vs 4m (Cerebellum, ddCt method)")

# Second pass: tighter y-range plus a reference line at fold change = 1.
boxplot(my.data.list,
        col = c("orchid", "orchid", "dodgerblue", "dodgerblue"),
        outline = FALSE, ylim = c(0, 2.5),
        las = 2, ylab = "Fold change in 20m vs 4m (Cerebellum, ddCt method)")
abline(h = 1, col = "red", lty = "dashed")

# Third pass: y-range 0-2, reference line, and vertical separators between
# the three targets (after boxes 4 and 8).
boxplot(my.data.list,
        col = c("orchid", "orchid", "dodgerblue", "dodgerblue"),
        outline = FALSE, ylim = c(0, 2),
        las = 2, ylab = "Fold change in 20m vs 4m (Cerebellum, ddCt method)")
abline(h = 1, col = "red", lty = "dashed")
abline(v = 4.5, col = "grey", lty = "dashed")
abline(v = 8.5, col = "grey", lty = "dashed")

# Rank-sum test, circ30 in females: 4m vs 20m.
wilcox.test(my.data.list$circ30_4m_F, my.data.list$circ30_20m_F)
# qPCR fold changes (ddCt method, cerebellum): reload the table, regroup the
# values, test 4m vs 20m for every target/sex, and overlay the raw points.
options(stringsAsFactors = FALSE)
my.data <- read.csv('/Users/benayoun/Downloads/2018-08-17_Alex_qPCR.txt', sep = "\t", header = TRUE)

# Row slices pick the age/sex group (1-5, 6-10, 11-15, 16-20) and columns 2-4
# pick the target, as reflected in the element names.
my.data.list <- list(
  "circ30_4m_F"   = my.data[1:5, 2],
  "circ30_20m_F"  = my.data[6:10, 2],
  "circ30_4m_M"   = my.data[11:15, 2],
  "circ30_20m_M"  = my.data[16:20, 2],
  "Aff3_4m_F"     = my.data[1:5, 3],
  "Aff3_20m_F"    = my.data[6:10, 3],
  "Aff3_4m_M"     = my.data[11:15, 3],
  "Aff3_20m_M"    = my.data[16:20, 3],
  "Cdr1as_4m_F"   = my.data[1:5, 4],
  "Cdr1as_20m_F"  = my.data[6:10, 4],
  "Cdr1as_4m_M"   = my.data[11:15, 4],
  "Cdr1as_20m_M"  = my.data[16:20, 4]
)

# Box plot with a reference line at fold change = 1 and separators between
# the three targets (after boxes 4 and 8).
boxplot(my.data.list,
        col = c("orchid", "orchid", "dodgerblue", "dodgerblue"),
        outline = FALSE, ylim = c(0, 2),
        las = 2, ylab = "Fold change in 20m vs 4m (Cerebellum, ddCt method)")
abline(h = 1, col = "red", lty = "dashed")
abline(v = 4.5, col = "grey", lty = "dashed")
abline(v = 8.5, col = "grey", lty = "dashed")

# Rank-sum tests, 4m vs 20m, per target and sex. Trailing comments are the
# p-values recorded when the tests were first run.
wilcox.test(my.data.list$circ30_4m_F, my.data.list$circ30_20m_F)
# NOTE(review): this comparison was run twice and annotated with two different
# p-values (0.1508 and 0.007937); re-run to confirm which one is correct.
wilcox.test(my.data.list$circ30_4m_M, my.data.list$circ30_20m_M)
wilcox.test(my.data.list$Aff3_4m_F, my.data.list$Aff3_20m_F)     # p-value = 0.1508
wilcox.test(my.data.list$Aff3_4m_M, my.data.list$Aff3_20m_M)     # p-value = 0.007937
wilcox.test(my.data.list$Cdr1as_4m_F, my.data.list$Cdr1as_20m_F) # p-value = 0.1508
wilcox.test(my.data.list$Cdr1as_4m_M, my.data.list$Cdr1as_20m_M) # p-value = 0.007937

# Same plot with outliers drawn, plus the raw values of the first group.
# (An earlier attempt passed the y values as a stray named argument and was
# followed by incomplete lines that do not parse; only the working call is kept.)
boxplot(my.data.list,
        col = c("orchid", "orchid", "dodgerblue", "dodgerblue"),
        outline = TRUE, ylim = c(0, 2),
        las = 2, ylab = "Fold change in 20m vs 4m (Cerebellum, ddCt method)")
abline(h = 1, col = "red", lty = "dashed")
abline(v = 4.5, col = "grey", lty = "dashed")
abline(v = 8.5, col = "grey", lty = "dashed")
points(rep(1, 5), my.data[1:5, 2], col = "grey")

# Final version: outliers hidden, raw values of every group overlaid at the
# x position of its box (boxes are drawn in list order at x = 1..12).
boxplot(my.data.list,
        col = c("orchid", "orchid", "dodgerblue", "dodgerblue"),
        outline = FALSE, ylim = c(0, 2),
        las = 2, ylab = "Fold change in 20m vs 4m (Cerebellum, ddCt method)")
abline(h = 1, col = "red", lty = "dashed")
abline(v = 4.5, col = "grey", lty = "dashed")
abline(v = 8.5, col = "grey", lty = "dashed")
for (i in seq_along(my.data.list)) {
  points(rep(i, length(my.data.list[[i]])), my.data.list[[i]], col = "grey")
}
# Restore a saved workspace image; the code below expects it to provide
# my.heart.features.v2 (presumably the heart feature matrix - confirm).
load("/Volumes/BB_Backup_3/BD_aging_project/Machine_learning_aging/Predict_Fold_change/2016-11-21_Complete_feature_matrices_FOLD_CHANGE_NA_RM.RData")

# List every available column, then preview a hand-picked subset.
colnames(my.heart.features.v2)
my.col.include <- c(
  "FPKM_3m", "H3K4me3_averageIntensity_3m", "H3K27ac_averageIntensity_3m",
  "SE_3m", "SE_score_3m", "SE_TSS_Dist", "maxbreadth_y", "BD_qt_y",
  "BD_stat_y", "H3K4me1_prom", "CTCF_prom", "Pol2_prom", "Pol2_peaks",
  "Pol2_abs_TSS_dist", "Pol2_MACS2_max_score", "CpG_promoter_percentage",
  "CG_promoter_percentage", "CTCF_peaks",
  "CTCF_abs_TSS_dist", "CTCF_MACS2_max_score", "Constitutive.Exon",
  "CpG_islands", "H3K27me3_prom", "DNAseI_prom", "Bivalent_status"
)
head(my.heart.features.v2[, my.col.include])

# Bioconductor bootstrap: the installer script is sourced and run twice (as in
# the recorded session), then two packages are installed.
# NOTE(review): biocLite may be superseded in current Bioconductor releases -
# confirm before re-running.
source("https://bioconductor.org/biocLite.R")
biocLite()
source("https://bioconductor.org/biocLite.R")
biocLite()
biocLite("DESeq2")
biocLite("DiffBind")
# Feature-importance summary plots: working directory, packages and palettes.
#
# Revision notes
#   2017-03-15  new runs with new data
#   2017-03-28  corrected cerebellum runs
#   2017-03-29  include OB with brain data
#   2017-04-20  use bubble chart to include Gini and DecAccuracy in plotting
#   2018-09-20  use values from run with updated caret

# All relative paths used further down are resolved against this directory.
setwd("/Volumes/BB_Backup_3/BD_aging_project/2018-09_revision_analyses/Machine_learning/rerun_with_same_package_version/Model_RData/RF_GBM_summary_feature_importance")

library(pheatmap)
library(ggplot2)

# 1000-step white-to-dark-red colour ramps; the second one omits "snow".
my.col.palette <- colorRampPalette(
  c("white", "snow", "#FFCCCC", "#FF9999", "indianred1", "firebrick1", "firebrick3", "firebrick4")
)(1000)
my.col.palette.acc <- colorRampPalette(
  c("white", "#FFCCCC", "#FF9999", "indianred1", "firebrick1", "firebrick3", "firebrick4")
)(1000)
# Rank product across the four tissues.
#
# data4: data frame with columns OB_V2, Liver_V2, Heart_V2 and Cereb_V2.
# Each column is ranked independently (ascending, ties averaged by rank());
# the value returned for a row is the product of its four ranks.
get_rank_prod <- function (data4) {
  tissue.ranks <- cbind(
    rank(data4$OB_V2),
    rank(data4$Liver_V2),
    rank(data4$Heart_V2),
    rank(data4$Cereb_V2)
  )
  # Multiply the ranks row by row
  apply(tissue.ranks, 1, prod)
}
#####################################################
#### 1. Runs over samples (with CONSTANT class)  ####
#####################################################
## GBM feature importance (node purity), one table per tissue.
## skip = 1 drops the first line of each file; columns get the default V1/V2 names.
OB    <- read.table("../Model_RData/GBM/Chromatin/OB/2018-09-20_Olfactory_Bulb_withBrainExtra_median_features_importance_classification_over_samples.txt", skip = 1)
Liver <- read.table("../Model_RData/GBM/Chromatin/Liver/2018-09-20_Liver_median_features_importance_classification_over_samples.txt", skip = 1)
Heart <- read.table("../Model_RData/GBM/Chromatin/Heart/2018-09-20_Heart_median_features_importance_classification_over_samples.txt", skip = 1)
Cereb <- read.table("../Model_RData/GBM/Chromatin/Cerebellum/2018-09-20_Cerebellum_median_features_importance_classification_over_samples.txt", skip = 1)

# Tag every column with its tissue so the merged table stays unambiguous.
colnames(OB)    <- paste0("OB_", colnames(OB))
colnames(Liver) <- paste0("Liver_", colnames(Liver))
colnames(Heart) <- paste0("Heart_", colnames(Heart))
colnames(Cereb) <- paste0("Cereb_", colnames(Cereb))

# Join the four tissues on the feature-name column (V1).
data1 <- merge(OB, Liver, by.x = "OB_V1", by.y = "Liver_V1", all.y = TRUE)
data2 <- merge(Heart, Cereb, by.x = "Heart_V1", by.y = "Cereb_V1")
data3 <- merge(data1, data2, by.x = "OB_V1", by.y = "Heart_V1", all.y = TRUE)
rownames(data3) <- data3[["OB_V1"]]
results.samp.gini <- data3

# Importance is on a different scale in each tissue: express each tissue
# column (2 to 5) as a percentage of that tissue's maximum.
my.gini.maxes <- sapply(results.samp.gini[, -1], max)
results.samp.gini.scaled <- results.samp.gini
for (h in 1:4) {
  results.samp.gini.scaled[, h + 1] <- 100 * results.samp.gini[, h + 1] / my.gini.maxes[h]
}

# Order the features by the product of their per-tissue ranks (ascending).
my.rk.prod <- get_rank_prod(results.samp.gini.scaled[, -1])
my.rk.sorted <- sort(my.rk.prod, decreasing = FALSE, index.return = TRUE)

my.colnames <- c("OB_V2", "Liver_V2", "Heart_V2", "Cereb_V2")
my.tissues <- c("OB", "Liver", "Heart", "Cereb")

# Long format for ggplot: one (feature, importance, tissue) row per pair,
# stacked tissue after tissue in rank-product order.
my.complete.results <- cbind(
  results.samp.gini.scaled[my.rk.sorted$ix, c("OB_V1", my.colnames[1])],
  rep(my.tissues[1], nrow(results.samp.gini.scaled))
)
colnames(my.complete.results) <- c("Feature_name", "Gini", "Tissue")
for (h in 2:length(my.colnames)) {
  my.new <- cbind(
    results.samp.gini.scaled[my.rk.sorted$ix, c("OB_V1", my.colnames[h])],
    rep(my.tissues[h], nrow(results.samp.gini.scaled))
  )
  colnames(my.new) <- colnames(my.complete.results)
  my.complete.results <- rbind(my.complete.results, my.new)
}

# Freeze the feature order (first appearance) so the y axis follows the ranking.
my.complete.results$Feature_name <- factor(my.complete.results$Feature_name,
                                           levels = unique(my.complete.results$Feature_name))

# Balloon plot: point size encodes the scaled importance.
my.pdfname <- paste(Sys.Date(), "summary_GBM_Importance_over_samples_balloon_plot.pdf", sep = "_")
pdf(my.pdfname, onefile = FALSE, height = 10, width = 10)
my.plot <- ggplot(my.complete.results, aes(x = Tissue, y = Feature_name, size = Gini)) +
  theme_bw() +
  geom_point(shape = 16, colour = "firebrick3") +
  ggtitle("GBM Sampling Feature Importance") +
  labs(x = "Tissue", y = "Feature")
# disabled option: my.plot <- my.plot + scale_size_area(max_size = 8)
print(my.plot)
dev.off()
#####################################################
####    2. Simple runs  (*NO* CONSTANT class)    ####
#####################################################
## GBM feature importance (node purity), one table per tissue.
## skip = 1 drops the first line of each file; columns get the default V1/V2 names.
OB    <- read.table("../Model_RData/GBM/Chromatin/OB/2018-09-20_Olfactory_Bulb_withBrainExtra_GBM_features_importance_classification_noCST.txt", skip = 1)
Liver <- read.table("../Model_RData/GBM/Chromatin/Liver/2018-09-20_Liver_GBM_features_importance_classification_noCST.txt", skip = 1)
Heart <- read.table("../Model_RData/GBM/Chromatin/Heart/2018-09-20_Heart_GBM_features_importance_classification_noCST.txt", skip = 1)
Cereb <- read.table("../Model_RData/GBM/Chromatin/Cerebellum/2018-09-20_Cerebellum_GBM_features_importance_classification_noCST.txt", skip = 1)

# Tag every column with its tissue so the merged table stays unambiguous.
colnames(OB)    <- paste0("OB_", colnames(OB))
colnames(Liver) <- paste0("Liver_", colnames(Liver))
colnames(Heart) <- paste0("Heart_", colnames(Heart))
colnames(Cereb) <- paste0("Cereb_", colnames(Cereb))

# Join the four tissues on the feature-name column (V1).
data1 <- merge(OB, Liver, by.x = "OB_V1", by.y = "Liver_V1", all.y = TRUE)
data2 <- merge(Heart, Cereb, by.x = "Heart_V1", by.y = "Cereb_V1")
data3 <- merge(data1, data2, by.x = "OB_V1", by.y = "Heart_V1", all.y = TRUE)
rownames(data3) <- data3[["OB_V1"]]
results.samp.gini <- data3

# Importance is not on the same scale across tissues: express each tissue
# column (2 to 5) as a percentage of that tissue's maximum.
my.gini.maxes <- sapply(results.samp.gini[, -1], max)
results.samp.gini.scaled <- results.samp.gini
for (h in 1:4) {
  results.samp.gini.scaled[, h + 1] <- 100 * results.samp.gini[, h + 1] / my.gini.maxes[h]
}

# Order the features by the product of their per-tissue ranks (ascending).
my.rk.prod <- get_rank_prod(results.samp.gini.scaled[, -1])
my.rk.sorted <- sort(my.rk.prod, decreasing = FALSE, index.return = TRUE)

my.colnames <- c("OB_V2", "Liver_V2", "Heart_V2", "Cereb_V2")
my.tissues <- c("OB", "Liver", "Heart", "Cereb")

# Long format for ggplot: one (feature, importance, tissue) row per pair,
# stacked tissue after tissue in rank-product order.
my.complete.results <- cbind(
  results.samp.gini.scaled[my.rk.sorted$ix, c("OB_V1", my.colnames[1])],
  rep(my.tissues[1], nrow(results.samp.gini.scaled))
)
colnames(my.complete.results) <- c("Feature_name", "Gini", "Tissue")
for (h in 2:length(my.colnames)) {
  my.new <- cbind(
    results.samp.gini.scaled[my.rk.sorted$ix, c("OB_V1", my.colnames[h])],
    rep(my.tissues[h], nrow(results.samp.gini.scaled))
  )
  colnames(my.new) <- colnames(my.complete.results)
  my.complete.results <- rbind(my.complete.results, my.new)
}

# Freeze the feature order (first appearance) so the y axis follows the ranking.
my.complete.results$Feature_name <- factor(my.complete.results$Feature_name,
                                           levels = unique(my.complete.results$Feature_name))

# Balloon plot: point size encodes the scaled importance.
my.pdfname <- paste(Sys.Date(), "summary_GBM_Importance_NoCST_balloon_plot.pdf", sep = "_")
pdf(my.pdfname, onefile = FALSE, height = 10, width = 10)
my.plot <- ggplot(my.complete.results, aes(x = Tissue, y = Feature_name, size = Gini)) +
  theme_bw() +
  geom_point(shape = 16, colour = "firebrick3") +
  ggtitle("GBM Sampling Feature Importance") +
  labs(x = "Tissue", y = "Feature")
# disabled option: my.plot <- my.plot + scale_size_area(max_size = 8)
print(my.plot)
dev.off()
# Feature-importance summary plots: working directory, packages, options and
# the colour ramp used for the accuracy scale.
#
# Revision notes
#   2017-03-15  new runs with new data
#   2017-03-28  corrected cerebellum runs
#   2017-03-29  include OB with brain data
#   2017-04-20  use bubble chart to include Gini and DecAccuracy in plotting
#   2018-09-20  use values from run with updated caret

# All relative paths used further down are resolved against this directory.
setwd("/Volumes/BB_Backup_3/BD_aging_project/2018-09_revision_analyses/Machine_learning/rerun_with_same_package_version/Model_RData/RF_GBM_summary_feature_importance")

library(pheatmap)
library(ggplot2)

# Keep character columns as character when data frames are built.
options(stringsAsFactors = FALSE)

# 1000-step white-to-dark-red colour ramp.
my.col.palette.acc <- colorRampPalette(
  c("white", "#FF9999", "indianred1", "firebrick1", "firebrick3", "firebrick4")
)(1000)
# Rank product across the four tissues.
#
# data4: data frame with columns OB_V2, Liver_V2, Heart_V2 and Cereb_V2.
# Each column is ranked independently (ascending, ties averaged by rank());
# the value returned for a row is the product of its four ranks.
get_rank_prod <- function (data4) {
  tissue.ranks <- cbind(
    rank(data4$OB_V2),
    rank(data4$Liver_V2),
    rank(data4$Heart_V2),
    rank(data4$Cereb_V2)
  )
  # Multiply the ranks row by row
  apply(tissue.ranks, 1, prod)
}
#####################################################
#### 1. Runs over samples (with CONSTANT class)  ####
#####################################################
## a. Random forest importance, Gini: one table per tissue.
## skip = 1 drops the first line of each file; columns get the default V1/V2 names.
## Earlier reads of other OB file variants were overwritten before use and
## have been dropped; the OB file below is the one the analysis actually used.
OB    <- read.table('../Model_RData/RF/Chromatin/OB/2018-09-20_Olfactory_Bulb_median_features_imp_gini_classification_over_samples.txt', skip = 1)
Liver <- read.table('../Model_RData/RF/Chromatin/Liver/2018-09-20_Liver_median_features_imp_gini_classification_over_samples.txt', skip = 1)
Heart <- read.table('../Model_RData/RF/Chromatin/Heart/2018-09-20_Heart_median_features_imp_gini_classification_over_samples.txt', skip = 1)
Cereb <- read.table('../Model_RData/RF/Chromatin/Cerebellum/2018-09-20_Cerebellum_median_features_imp_gini_classification_over_samples.txt', skip = 1)

# Tag every column with its tissue, then join the tissues on the feature name (V1).
colnames(OB)    <- paste("OB", colnames(OB), sep = "_")
colnames(Liver) <- paste("Liver", colnames(Liver), sep = "_")
colnames(Heart) <- paste("Heart", colnames(Heart), sep = "_")
colnames(Cereb) <- paste("Cereb", colnames(Cereb), sep = "_")
data1 <- merge(OB, Liver, by.x = "OB_V1", by.y = "Liver_V1", all.y = TRUE)
data2 <- merge(Heart, Cereb, by.x = "Heart_V1", by.y = "Cereb_V1")
data3 <- merge(data1, data2, by.x = "OB_V1", by.y = "Heart_V1", all.y = TRUE)
rownames(data3) <- data3$OB_V1
results.samp.gini <- data3

## b. Random forest importance, decrease in accuracy: same layout as above.
OB    <- read.table('../Model_RData/RF/Chromatin/OB/2018-09-20_Olfactory_Bulb_median_features_imp_dec_acc_classification_over_samples.txt', skip = 1)
Liver <- read.table('../Model_RData/RF/Chromatin/Liver/2018-09-20_Liver_median_features_imp_dec_acc_classification_over_samples.txt', skip = 1)
Heart <- read.table('../Model_RData/RF/Chromatin/Heart/2018-09-20_Heart_median_features_imp_dec_acc_classification_over_samples.txt', skip = 1)
Cereb <- read.table('../Model_RData/RF/Chromatin/Cerebellum/2018-09-20_Cerebellum_median_features_imp_dec_acc_classification_over_samples.txt', skip = 1)

colnames(OB)    <- paste("OB", colnames(OB), sep = "_")
colnames(Liver) <- paste("Liver", colnames(Liver), sep = "_")
colnames(Heart) <- paste("Heart", colnames(Heart), sep = "_")
colnames(Cereb) <- paste("Cereb", colnames(Cereb), sep = "_")
data1 <- merge(OB, Liver, by.x = "OB_V1", by.y = "Liver_V1", all.y = TRUE)
data2 <- merge(Heart, Cereb, by.x = "Heart_V1", by.y = "Cereb_V1")
data3 <- merge(data1, data2, by.x = "OB_V1", by.y = "Heart_V1", all.y = TRUE)
rownames(data3) <- data3$OB_V1
results.samp.decAcc <- data3

# Overall mean of each measure (values in the trailing comments were recorded
# when this was first run).
mean(sapply(results.samp.gini[, -1], mean))   # 8.576321
mean(sapply(results.samp.decAcc[, -1], mean)) # 0.00712978

# Scaling for ranking: bring the decrease in accuracy onto the Gini scale and
# add the two measures, then order features by the rank product of the sum.
# NOTE(review): the element-wise sum assumes both tables list the same
# features in the same row order - confirm.
scale.factor <- mean(sapply(results.samp.gini[, -1], mean)) / mean(sapply(results.samp.decAcc[, -1], mean))
my.weighted.rssults <- results.samp.gini[, -1] + scale.factor * results.samp.decAcc[, -1]
my.rk.prod <- get_rank_prod(my.weighted.rssults)
my.rk.sorted <- sort(my.rk.prod, index.return = TRUE, decreasing = FALSE)

# Gini is on a different scale in each tissue: express each tissue column
# (2 to 5) as a percentage of that tissue's maximum.
my.gini.maxes <- sapply(results.samp.gini[, -1], max)
results.samp.gini.scaled <- results.samp.gini
results.samp.gini.scaled[, 2] <- 100 * results.samp.gini[, 2] / my.gini.maxes[1]
results.samp.gini.scaled[, 3] <- 100 * results.samp.gini[, 3] / my.gini.maxes[2]
results.samp.gini.scaled[, 4] <- 100 * results.samp.gini[, 4] / my.gini.maxes[3]
results.samp.gini.scaled[, 5] <- 100 * results.samp.gini[, 5] / my.gini.maxes[4]

my.colnames <- c("OB_V2", "Liver_V2", "Heart_V2", "Cereb_V2")
my.tissues <- c("OB", "Liver", "Heart", "Cereb")

# Long format for ggplot: one (feature, Gini, tissue, decrease in accuracy)
# row per pair, stacked tissue after tissue in rank-product order.
my.complete.results <- cbind(results.samp.gini.scaled[my.rk.sorted$ix, c('OB_V1', my.colnames[1])],
                             rep(my.tissues[1], dim(results.samp.gini.scaled)[1]),
                             results.samp.decAcc[my.rk.sorted$ix, my.colnames[1]])
colnames(my.complete.results) <- c('Feature_name', 'Gini', 'Tissue', 'Mean_decrease_in_accuracy')
for (h in 2:length(my.colnames)) {
  my.new <- cbind(results.samp.gini.scaled[my.rk.sorted$ix, c('OB_V1', my.colnames[h])],
                  rep(my.tissues[h], dim(results.samp.gini.scaled)[1]),
                  results.samp.decAcc[my.rk.sorted$ix, my.colnames[h]])
  colnames(my.new) <- colnames(my.complete.results)
  my.complete.results <- rbind(my.complete.results, my.new)
}

# Express the decrease in accuracy as a percentage.
my.complete.results$Mean_decrease_in_accuracy <- 100 * my.complete.results$Mean_decrease_in_accuracy

# Freeze the feature order (first appearance) so the y axis follows the ranking.
my.complete.results$Feature_name <- factor(my.complete.results$Feature_name, levels = unique(my.complete.results$Feature_name))

# Balloon plot: Gini drives the point size, decrease in accuracy the colour.
my.pdfname <- paste(Sys.Date(), "RF_with_sampling_accuracy_balloon_plot.pdf", sep = "_")
pdf(my.pdfname, onefile = FALSE, height = 10, width = 10)
my.plot <- ggplot(my.complete.results, aes(x = Tissue, y = Feature_name, colour = Mean_decrease_in_accuracy, size = Gini)) + theme_bw() + geom_point(shape = 16)
my.plot <- my.plot + ggtitle("Random Forest Feature Importance") + labs(x = "Tissue", y = "Feature")
my.plot <- my.plot + scale_colour_gradientn(colours = my.col.palette.acc, space = "Lab", na.value = "grey50", guide = "colourbar")
#my.plot <- my.plot + scale_size_area(max_size = 8)
print(my.plot)
dev.off()
#####################################################
####    2. Simple runs  (*NO* CONSTANT class)    ####
#####################################################
## a. Gini score (node purity): one table per tissue.
## skip = 1 drops the first line of each file; columns get the default V1/V2 names.
## An earlier set of reads (including another OB file variant) was overwritten
## before use and has been dropped; the files below are the ones actually used.
OB    <- read.table('../Model_RData/RF/Chromatin/OB/2018-09-20_Olfactory_Bulb_RF_features_imp_gini_classification_noCST.txt', skip = 1)
Liver <- read.table('../Model_RData/RF/Chromatin/Liver//2018-09-20_Liver_RF_features_imp_gini_classification_noCST.txt', skip = 1)
Heart <- read.table('../Model_RData/RF/Chromatin/Heart//2018-09-20_Heart_RF_features_imp_gini_classification_noCST.txt', skip = 1)
Cereb <- read.table('../Model_RData/RF/Chromatin/Cerebellum//2018-09-20_Cerebellum_RF_features_imp_gini_classification_noCST.txt', skip = 1)

# Tag every column with its tissue, then join the tissues on the feature name (V1).
colnames(OB)    <- paste("OB", colnames(OB), sep = "_")
colnames(Liver) <- paste("Liver", colnames(Liver), sep = "_")
colnames(Heart) <- paste("Heart", colnames(Heart), sep = "_")
colnames(Cereb) <- paste("Cereb", colnames(Cereb), sep = "_")
data1 <- merge(OB, Liver, by.x = "OB_V1", by.y = "Liver_V1", all.y = TRUE)
data2 <- merge(Heart, Cereb, by.x = "Heart_V1", by.y = "Cereb_V1")
data3 <- merge(data1, data2, by.x = "OB_V1", by.y = "Heart_V1", all.y = TRUE)
rownames(data3) <- data3$OB_V1
results.samp.gini <- data3

## b. Decrease in accuracy: same layout as above. A first OB read whose file
## name contained a doubled underscore was superseded and has been dropped.
OB    <- read.table('../Model_RData/RF/Chromatin/OB/2018-09-20_Olfactory_Bulb_RF_features_imp_dec_acc_classification_noCST.txt', skip = 1)
Liver <- read.table('../Model_RData/RF/Chromatin/Liver/2018-09-20_Liver_RF_features_imp_dec_acc_classification_noCST.txt', skip = 1)
Heart <- read.table('../Model_RData/RF/Chromatin/Heart/2018-09-20_Heart_RF_features_imp_dec_acc_classification_noCST.txt', skip = 1)
Cereb <- read.table('../Model_RData/RF/Chromatin/Cerebellum/2018-09-20_Cerebellum_RF_features_imp_dec_acc_classification_noCST.txt', skip = 1)

colnames(OB)    <- paste("OB", colnames(OB), sep = "_")
colnames(Liver) <- paste("Liver", colnames(Liver), sep = "_")
colnames(Heart) <- paste("Heart", colnames(Heart), sep = "_")
colnames(Cereb) <- paste("Cereb", colnames(Cereb), sep = "_")
data1 <- merge(OB, Liver, by.x = "OB_V1", by.y = "Liver_V1", all.y = TRUE)
data2 <- merge(Heart, Cereb, by.x = "Heart_V1", by.y = "Cereb_V1")
data3 <- merge(data1, data2, by.x = "OB_V1", by.y = "Heart_V1", all.y = TRUE)
rownames(data3) <- data3$OB_V1
results.samp.decAcc <- data3

#####################################################
# For sorting, weigh the impact of Gini and of DecAcc. The values in the
# trailing comments were recorded when this was first run.
mean(sapply(results.samp.gini[, -1], mean))   # 3.232591
mean(sapply(results.samp.decAcc[, -1], mean)) # 0.005683469

# Scaling for ranking: bring the decrease in accuracy onto the Gini scale and
# add the two measures, then order features by the rank product of the sum.
# NOTE(review): the element-wise sum assumes both tables list the same
# features in the same row order - confirm.
scale.factor <- mean(sapply(results.samp.gini[, -1], mean)) / mean(sapply(results.samp.decAcc[, -1], mean))
my.weighted.rssults <- results.samp.gini[, -1] + scale.factor * results.samp.decAcc[, -1]
my.rk.prod <- get_rank_prod(my.weighted.rssults)
my.rk.sorted <- sort(my.rk.prod, index.return = TRUE, decreasing = FALSE)

# Gini is on a different scale in each tissue: express each tissue column
# (2 to 5) as a percentage of that tissue's maximum.
my.gini.maxes <- sapply(results.samp.gini[, -1], max)
results.samp.gini.scaled <- results.samp.gini
results.samp.gini.scaled[, 2] <- 100 * results.samp.gini[, 2] / my.gini.maxes[1]
results.samp.gini.scaled[, 3] <- 100 * results.samp.gini[, 3] / my.gini.maxes[2]
results.samp.gini.scaled[, 4] <- 100 * results.samp.gini[, 4] / my.gini.maxes[3]
results.samp.gini.scaled[, 5] <- 100 * results.samp.gini[, 5] / my.gini.maxes[4]

my.colnames <- c("OB_V2", "Liver_V2", "Heart_V2", "Cereb_V2")
my.tissues <- c("OB", "Liver", "Heart", "Cereb")

# Long format for ggplot: one (feature, Gini, tissue, decrease in accuracy)
# row per pair, stacked tissue after tissue in rank-product order.
my.complete.results <- cbind(results.samp.gini.scaled[my.rk.sorted$ix, c('OB_V1', my.colnames[1])],
                             rep(my.tissues[1], dim(results.samp.gini.scaled)[1]),
                             results.samp.decAcc[my.rk.sorted$ix, my.colnames[1]])
colnames(my.complete.results) <- c('Feature_name', 'Gini', 'Tissue', 'Mean_decrease_in_accuracy')
for (h in 2:length(my.colnames)) {
  my.new <- cbind(results.samp.gini.scaled[my.rk.sorted$ix, c('OB_V1', my.colnames[h])],
                  rep(my.tissues[h], dim(results.samp.gini.scaled)[1]),
                  results.samp.decAcc[my.rk.sorted$ix, my.colnames[h]])
  colnames(my.new) <- colnames(my.complete.results)
  my.complete.results <- rbind(my.complete.results, my.new)
}

# Express the decrease in accuracy as a percentage.
my.complete.results$Mean_decrease_in_accuracy <- 100 * my.complete.results$Mean_decrease_in_accuracy

# Freeze the feature order (first appearance) so the y axis follows the ranking.
my.complete.results$Feature_name <- factor(my.complete.results$Feature_name, levels = unique(my.complete.results$Feature_name))

# Balloon plot: Gini drives the point size, decrease in accuracy the colour.
my.pdfname <- paste(Sys.Date(), "RF_NoConstant_accuracy_balloon_plot.pdf", sep = "_")
pdf(my.pdfname, onefile = FALSE, height = 10, width = 10)
my.plot <- ggplot(my.complete.results, aes(x = Tissue, y = Feature_name, colour = Mean_decrease_in_accuracy, size = Gini)) + theme_bw() + geom_point(shape = 16)
my.plot <- my.plot + ggtitle("Random Forest Feature Importance") + labs(x = "Tissue", y = "Feature")
my.plot <- my.plot + scale_colour_gradientn(colours = my.col.palette.acc, space = "Lab", na.value = "grey50", guide = "colourbar")
#my.plot <- my.plot + scale_size_area(max_size = 8)
print(my.plot)
dev.off()
# Feature-importance summary plots: working directory, packages and palettes.
#
# Revision notes
#   2017-03-15  new runs with new data
#   2017-03-28  corrected cerebellum runs
#   2017-03-29  include OB with brain data
#   2017-04-20  use bubble chart to include Gini and DecAccuracy in plotting
#   2018-09-20  use values from run with updated caret

# All relative paths used further down are resolved against this directory.
setwd("/Volumes/BB_Backup_3/BD_aging_project/2018-09_revision_analyses/Machine_learning/rerun_with_same_package_version/Model_RData/RF_GBM_summary_feature_importance")

library(pheatmap)
library(ggplot2)

# 1000-step white-to-dark-red colour ramps; the second one omits "snow".
my.col.palette <- colorRampPalette(
  c("white", "snow", "#FFCCCC", "#FF9999", "indianred1", "firebrick1", "firebrick3", "firebrick4")
)(1000)
my.col.palette.acc <- colorRampPalette(
  c("white", "#FFCCCC", "#FF9999", "indianred1", "firebrick1", "firebrick3", "firebrick4")
)(1000)
# Rank product across the four tissues.
#
# data4: data frame with columns OB_V2, Liver_V2, Heart_V2 and Cereb_V2.
# Each column is ranked independently (ascending, ties averaged by rank());
# the value returned for a row is the product of its four ranks.
get_rank_prod <- function (data4) {
  tissue.ranks <- cbind(
    rank(data4$OB_V2),
    rank(data4$Liver_V2),
    rank(data4$Heart_V2),
    rank(data4$Cereb_V2)
  )
  # Multiply the ranks row by row
  apply(tissue.ranks, 1, prod)
}
#####################################################
#### 1. Runs over samples (with CONSTANT class)  ####
#####################################################
## GBM feature importance (node purity), one table per tissue.
## skip = 1 drops the first line of each file; columns get the default V1/V2 names.
OB    <- read.table("../Model_RData/GBM/Chromatin/OB/2018-09-20_Olfactory_Bulb_median_features_importance_classification_over_samples.txt", skip = 1)
Liver <- read.table("../Model_RData/GBM/Chromatin/Liver/2018-09-20_Liver_median_features_importance_classification_over_samples.txt", skip = 1)
Heart <- read.table("../Model_RData/GBM/Chromatin/Heart/2018-09-20_Heart_median_features_importance_classification_over_samples.txt", skip = 1)
Cereb <- read.table("../Model_RData/GBM/Chromatin/Cerebellum/2018-09-20_Cerebellum_median_features_importance_classification_over_samples.txt", skip = 1)

# Tag every column with its tissue so the merged table stays unambiguous.
colnames(OB)    <- paste0("OB_", colnames(OB))
colnames(Liver) <- paste0("Liver_", colnames(Liver))
colnames(Heart) <- paste0("Heart_", colnames(Heart))
colnames(Cereb) <- paste0("Cereb_", colnames(Cereb))

# Join the four tissues on the feature-name column (V1).
data1 <- merge(OB, Liver, by.x = "OB_V1", by.y = "Liver_V1", all.y = TRUE)
data2 <- merge(Heart, Cereb, by.x = "Heart_V1", by.y = "Cereb_V1")
data3 <- merge(data1, data2, by.x = "OB_V1", by.y = "Heart_V1", all.y = TRUE)
rownames(data3) <- data3[["OB_V1"]]
results.samp.gini <- data3

# Express each tissue column (2 to 5) as a percentage of that tissue's maximum.
my.gini.maxes <- sapply(results.samp.gini[, -1], max)
results.samp.gini.scaled <- results.samp.gini
for (h in 1:4) {
  results.samp.gini.scaled[, h + 1] <- 100 * results.samp.gini[, h + 1] / my.gini.maxes[h]
}

# Order the features by the product of their per-tissue ranks (ascending).
my.rk.prod <- get_rank_prod(results.samp.gini.scaled[, -1])
my.rk.sorted <- sort(my.rk.prod, decreasing = FALSE, index.return = TRUE)

my.colnames <- c("OB_V2", "Liver_V2", "Heart_V2", "Cereb_V2")
my.tissues <- c("OB", "Liver", "Heart", "Cereb")

# Long format for ggplot: one (feature, importance, tissue) row per pair,
# stacked tissue after tissue in rank-product order.
my.complete.results <- cbind(
  results.samp.gini.scaled[my.rk.sorted$ix, c("OB_V1", my.colnames[1])],
  rep(my.tissues[1], nrow(results.samp.gini.scaled))
)
colnames(my.complete.results) <- c("Feature_name", "Gini", "Tissue")
for (h in 2:length(my.colnames)) {
  my.new <- cbind(
    results.samp.gini.scaled[my.rk.sorted$ix, c("OB_V1", my.colnames[h])],
    rep(my.tissues[h], nrow(results.samp.gini.scaled))
  )
  colnames(my.new) <- colnames(my.complete.results)
  my.complete.results <- rbind(my.complete.results, my.new)
}

# Freeze the feature order (first appearance) so the y axis follows the ranking.
my.complete.results$Feature_name <- factor(my.complete.results$Feature_name,
                                           levels = unique(my.complete.results$Feature_name))

# Balloon plot: point size encodes the scaled importance.
my.pdfname <- paste(Sys.Date(), "summary_GBM_Importance_over_samples_balloon_plot.pdf", sep = "_")
pdf(my.pdfname, onefile = FALSE, height = 10, width = 10)
my.plot <- ggplot(my.complete.results, aes(x = Tissue, y = Feature_name, size = Gini)) +
  theme_bw() +
  geom_point(shape = 16, colour = "firebrick3") +
  ggtitle("GBM Sampling Feature Importance") +
  labs(x = "Tissue", y = "Feature")
# disabled option: my.plot <- my.plot + scale_size_area(max_size = 8)
print(my.plot)
dev.off()
#####################################################
####    2. Simple runs  (*NO* CONSTANT class)    ####
#####################################################
## GBM feature importance (node purity), one table per tissue.
## skip = 1 drops the first line of each file; columns get the default V1/V2 names.
OB    <- read.table("../Model_RData/GBM/Chromatin/OB/2018-09-20_Olfactory_Bulb_GBM_features_importance_classification_noCST.txt", skip = 1)
Liver <- read.table("../Model_RData/GBM/Chromatin/Liver/2018-09-20_Liver_GBM_features_importance_classification_noCST.txt", skip = 1)
Heart <- read.table("../Model_RData/GBM/Chromatin/Heart/2018-09-20_Heart_GBM_features_importance_classification_noCST.txt", skip = 1)
Cereb <- read.table("../Model_RData/GBM/Chromatin/Cerebellum/2018-09-20_Cerebellum_GBM_features_importance_classification_noCST.txt", skip = 1)

# Tag every column with its tissue so the merged table stays unambiguous.
colnames(OB)    <- paste0("OB_", colnames(OB))
colnames(Liver) <- paste0("Liver_", colnames(Liver))
colnames(Heart) <- paste0("Heart_", colnames(Heart))
colnames(Cereb) <- paste0("Cereb_", colnames(Cereb))

# Join the four tissues on the feature-name column (V1).
data1 <- merge(OB, Liver, by.x = "OB_V1", by.y = "Liver_V1", all.y = TRUE)
data2 <- merge(Heart, Cereb, by.x = "Heart_V1", by.y = "Cereb_V1")
data3 <- merge(data1, data2, by.x = "OB_V1", by.y = "Heart_V1", all.y = TRUE)
rownames(data3) <- data3[["OB_V1"]]
results.samp.gini <- data3

# Importance is not on the same scale across tissues: express each tissue
# column (2 to 5) as a percentage of that tissue's maximum.
my.gini.maxes <- sapply(results.samp.gini[, -1], max)
results.samp.gini.scaled <- results.samp.gini
for (h in 1:4) {
  results.samp.gini.scaled[, h + 1] <- 100 * results.samp.gini[, h + 1] / my.gini.maxes[h]
}

# Order the features by the product of their per-tissue ranks (ascending).
my.rk.prod <- get_rank_prod(results.samp.gini.scaled[, -1])
my.rk.sorted <- sort(my.rk.prod, decreasing = FALSE, index.return = TRUE)

my.colnames <- c("OB_V2", "Liver_V2", "Heart_V2", "Cereb_V2")
my.tissues <- c("OB", "Liver", "Heart", "Cereb")

# Long format for ggplot: one (feature, importance, tissue) row per pair,
# stacked tissue after tissue in rank-product order.
my.complete.results <- cbind(
  results.samp.gini.scaled[my.rk.sorted$ix, c("OB_V1", my.colnames[1])],
  rep(my.tissues[1], nrow(results.samp.gini.scaled))
)
colnames(my.complete.results) <- c("Feature_name", "Gini", "Tissue")
for (h in 2:length(my.colnames)) {
  my.new <- cbind(
    results.samp.gini.scaled[my.rk.sorted$ix, c("OB_V1", my.colnames[h])],
    rep(my.tissues[h], nrow(results.samp.gini.scaled))
  )
  colnames(my.new) <- colnames(my.complete.results)
  my.complete.results <- rbind(my.complete.results, my.new)
}

# Freeze the feature order (first appearance) so the y axis follows the ranking.
my.complete.results$Feature_name <- factor(my.complete.results$Feature_name,
                                           levels = unique(my.complete.results$Feature_name))

# Balloon plot: point size encodes the scaled importance.
my.pdfname <- paste(Sys.Date(), "summary_GBM_Importance_NoCST_balloon_plot.pdf", sep = "_")
pdf(my.pdfname, onefile = FALSE, height = 10, width = 10)
my.plot <- ggplot(my.complete.results, aes(x = Tissue, y = Feature_name, size = Gini)) +
  theme_bw() +
  geom_point(shape = 16, colour = "firebrick3") +
  ggtitle("GBM Sampling Feature Importance") +
  labs(x = "Tissue", y = "Feature")
# disabled option: my.plot <- my.plot + scale_size_area(max_size = 8)
print(my.plot)
dev.off()
