# Set the working directory. The output files written with write.table() at the
# end of this script use relative file names, so they are created in this folder.
setwd('/Volumes/MyBook_3/BD_aging_project/Machine_learning_aging/Predict_level')

library("DESeq2")

# 2015-12-08
# Generates the FPKM matrices needed for level prediction

##################################################################################### 
### READ ALL MATRICES

##################################################################################### 
# Convert a table of raw counts into FPKM (fragments per kb per million).
#
# Arguments:
#   my.matrix   genes (rows) x samples (columns) table of counts; a data.frame
#               or a numeric matrix.
#   my.lengths  numeric vector with one length (in bp) per row of my.matrix.
#
# Returns an object with the same shape and container type as my.matrix
# holding the FPKM values.
# Side effect: draws a log-scale boxplot of the FPKM values (+0.01 pseudo-count
# so that zeros can be shown on the log axis), one box per sample.
get_fpkm <- function(my.matrix, my.lengths) {

  # One length per gene is required; otherwise R would silently recycle.
  stopifnot(length(my.lengths) == nrow(my.matrix))

  my.lib.sizes <- colSums(my.matrix) / 1e6 # number of counts per library, in millions

  # Per-gene scaling: a vector of length nrow recycles down each column,
  # so every row is divided by its own length (in kb).
  my.rpk <- my.matrix / (1e-3 * my.lengths)

  # Per-sample scaling MUST be applied along columns. A plain
  # `my.rpk / my.lib.sizes` recycles the library sizes down the rows instead,
  # dividing most cells by the library size of the wrong sample.
  my.fpkms <- sweep(my.rpk, 2, my.lib.sizes, "/")

  boxplot(my.fpkms + 0.01, log = 'y', outline = FALSE, las = 2)

  my.fpkms
}
##################################################################################### 


####################################    Liver    #################################### 
# read in subread count matrix
# (skip = 1 drops the first line of the file so the second line is the header)
my.liver1 <- read.table('/Volumes/MyBook_3/BD_aging_project/RNAseq/Liver/STAR/Aging_Liver_counts_genes.txt',
                        skip = 1, header = TRUE, sep = "\t", stringsAsFactors = FALSE)
# keep column 1 (gene identifier), column 6 (length) and the 9 sample columns
my.liver <- my.liver1[, c(1, 6:15)]
rownames(my.liver) <- my.liver[, 1]
colnames(my.liver) <- c("GeneName", "Length",
                        paste0("3m", 1:3), paste0("12m", 1:3), paste0("29m", 1:3))

# Remove ERCC spike-in rows. The guard matters: when nothing matches,
# indexing with -integer(0) selects zero rows and would empty the table.
spikes.idx <- grep("ERCC-", rownames(my.liver))
if (length(spikes.idx) > 0) {
  my.liver <- my.liver[-spikes.idx, ]
}

my.lengths <- my.liver$Length # extract length calculated by subreads
# fpkm: fragment per kb per million
my.liver.fpkm <- get_fpkm(my.liver[, 3:11], my.lengths)
##################################################################################### 


####################################   Heart   #################################### 
# read in subread count matrix
# (skip = 1 drops the first line of the file so the second line is the header)
my.heart1 <- read.table('/Volumes/MyBook_3/BD_aging_project/RNAseq/Heart/STAR/Aging_Heart_counts_genes.txt',
                        skip = 1, header = TRUE, sep = "\t", stringsAsFactors = FALSE)
# keep column 1 (gene identifier), column 6 (length) and the 9 sample columns
my.heart <- my.heart1[, c(1, 6:15)]
rownames(my.heart) <- my.heart[, 1]
colnames(my.heart) <- c("GeneName", "Length",
                        paste0("3m", 1:3), paste0("12m", 1:3), paste0("29m", 1:3))

# Remove ERCC spike-in rows. The guard matters: when nothing matches,
# indexing with -integer(0) selects zero rows and would empty the table.
spikes.idx <- grep("ERCC-", rownames(my.heart))
if (length(spikes.idx) > 0) {
  my.heart <- my.heart[-spikes.idx, ]
}

my.lengths <- my.heart$Length # extract length calculated by subreads
# fpkm: fragment per kb per million
my.heart.fpkm <- get_fpkm(my.heart[, 3:11], my.lengths)
###################################################################################


#################################### Cerebellum #################################### 
# read in subread count matrix
# there were 2 nextseq runs based on poor clustering on flow cell
# will sum up count matrices
my.cereb1 <- read.table('/Volumes/MyBook_3/BD_aging_project/RNAseq/Cereb/1st_run/STAR/Aging_cerebellum_counts_genes.txt',
                        skip = 1, header = TRUE, sep = "\t", stringsAsFactors = FALSE)
my.cereb2 <- read.table('/Volumes/MyBook_3/BD_aging_project/RNAseq/Cereb/2nd_run/STAR/Aging_cerebellum_v2_counts_genes.txt',
                        skip = 1, header = TRUE, sep = "\t", stringsAsFactors = FALSE)

# The two runs are added by row position, so both tables must list the same
# genes in the same order; stop instead of silently summing mismatched rows.
stopifnot(identical(as.character(my.cereb1[, 1]), as.character(my.cereb2[, 1])))

# keep column 1 (gene identifier), column 6 (length) and the 9 sample columns
my.cereb <- my.cereb1[, c(1, 6:15)]
# columns 3:11 of my.cereb are columns 7:15 of the first run; add the
# corresponding sample columns of the second run
my.cereb[, 3:11] <- my.cereb[, 3:11] + my.cereb2[, 7:15]
rownames(my.cereb) <- my.cereb[, 1]
colnames(my.cereb) <- c("GeneName", "Length",
                        paste0("3m", 1:3), paste0("12m", 1:3), paste0("29m", 1:3))

# Remove ERCC spike-in rows. The guard matters: when nothing matches,
# indexing with -integer(0) selects zero rows and would empty the table.
spikes.idx <- grep("ERCC-", rownames(my.cereb))
if (length(spikes.idx) > 0) {
  my.cereb <- my.cereb[-spikes.idx, ]
}

my.lengths <- my.cereb$Length # extract length calculated by subreads
# fpkm: fragment per kb per million
my.cereb.fpkm <- get_fpkm(my.cereb[, 3:11], my.lengths)
####################################################################################


#################################### Olfactory Bulb #################################
# one of the 12mths samples was not analyzed
# read in subread count matrix
# (skip = 1 drops the first line of the file so the second line is the header)
my.ob1 <- read.table('/Volumes/MyBook_3/BD_aging_project/RNAseq/OB/STAR/Aging_OlfactoryBulb_counts_genes.txt',
                     skip = 1, header = TRUE, sep = "\t", stringsAsFactors = FALSE)
# keep column 1 (gene identifier), column 6 (length) and the 8 sample columns
my.ob <- my.ob1[, c(1, 6:14)]
rownames(my.ob) <- my.ob[, 1]
colnames(my.ob) <- c("GeneName", "Length",
                     paste0("3m", 1:3), paste0("12m", 1:2), paste0("29m", 1:3))

# Remove ERCC spike-in rows. The guard matters: when nothing matches,
# indexing with -integer(0) selects zero rows and would empty the table.
spikes.idx <- grep("ERCC-", rownames(my.ob))
if (length(spikes.idx) > 0) {
  my.ob <- my.ob[-spikes.idx, ]
}

my.lengths <- my.ob$Length # extract length calculated by subreads
# fpkm: fragment per kb per million
my.ob.fpkm <- get_fpkm(my.ob[, 3:10], my.lengths)
##################################################################################### 


####################################  NPCs pools  ###################################
# read in subread count matrix
# (skip = 1 drops the first line of the file so the second line is the header)
my.npc1 <- read.table('/Volumes/MyBook_3/BD_aging_project/RNAseq/NPC_Pool/STAR/Aging_NPCs_pool_counts_genes.txt',
                      skip = 1, header = TRUE, sep = "\t", stringsAsFactors = FALSE)
# keep column 1 (gene identifier), column 6 (length) and the 6 sample columns
my.npc <- my.npc1[, c(1, 6:12)]
rownames(my.npc) <- my.npc[, 1]
colnames(my.npc) <- c("GeneName", "Length",
                      paste0("3m", 1:2), paste0("12m", 1:2), paste0("29m", 1:2))

# Remove ERCC spike-in rows. The guard matters: when nothing matches,
# indexing with -integer(0) selects zero rows and would empty the table.
spikes.idx <- grep("ERCC-", rownames(my.npc))
if (length(spikes.idx) > 0) {
  my.npc <- my.npc[-spikes.idx, ]
}

my.lengths <- my.npc$Length # extract length calculated by subreads
# fpkm: fragment per kb per million
my.npc.fpkm <- get_fpkm(my.npc[, 3:8], my.lengths)
##################################################################################### 


# Overview plot: FPKM distributions of every sample from all five datasets,
# side by side on a log axis (+0.01 so zeros can be drawn).
boxplot(
  cbind(my.liver.fpkm, my.heart.fpkm, my.cereb.fpkm, my.ob.fpkm, my.npc.fpkm) + 0.01,
  log = 'y',
  outline = FALSE,
  las = 2
)

# Write one tab-separated table per dataset: gene names in the first column,
# followed by the per-sample FPKM columns. The cbind() calls are spelled out
# individually because the first column's header is taken from the expression.
invisible(Map(
  function(fpkm.table, out.file) {
    write.table(fpkm.table, file = out.file, sep = "\t", row.names = FALSE, quote = FALSE)
  },
  list(
    cbind(my.liver$GeneName, my.liver.fpkm),
    cbind(my.heart$GeneName, my.heart.fpkm),
    cbind(my.cereb$GeneName, my.cereb.fpkm),
    cbind(my.ob$GeneName, my.ob.fpkm),
    cbind(my.npc$GeneName, my.npc.fpkm)
  ),
  c(
    "2015-12-08_LIVER_FPKM_matrix_forML.txt",
    "2015-12-08_HEART_FPKM_matrix_forML.txt",
    "2015-12-08_CEREBELLUM_FPKM_matrix_forML.txt",
    "2015-12-08_OLFACTORY_BULB_FPKM_matrix_forML.txt",
    "2015-12-08_NPCs_FPKM_matrix_forML.txt"
  )
))

