options(stringsAsFactors = FALSE)

# load libraries for analysis
library(DESeq2)
library(pheatmap)
library('pvclust')
library('limma')
library('affy')
library(sva)


get_sorted_meta_data <- function(my.meta.data.patient,my.gene.counts,my.tissue) {
  
  my.correct.samples <- grep(my.tissue, my.meta.data.patient$Tissue_Type)
  my.datasets <- my.gene.counts[1,]
  
  my.cols <- which(my.datasets %in% my.meta.data.patient$Sample_ID[my.correct.samples])
  
  meta.sort <- sort(my.meta.data.patient$Sample_ID[my.correct.samples],index.return=T)
  data.sort <- sort(as.character(my.datasets[my.cols]),index.return=T)
  
  my.res <- list(my.meta.data.patient[my.correct.samples[meta.sort$ix],],my.gene.counts[,c(1,2,my.cols[data.sort$ix])])
  return(my.res)
}

get_sorted_meta_data_male_only <- function(my.meta.data.patient,my.gene.counts,my.tissue) {
  
  my.correct.samples <- intersect(grep(my.tissue, my.meta.data.patient$Tissue_Type),which(my.meta.data.patient$Gender %in% "Male"))
  
  my.datasets <- my.gene.counts[1,]
  
  my.cols <- which(my.datasets %in% my.meta.data.patient$Sample_ID[my.correct.samples])
  
  meta.sort <- sort(my.meta.data.patient$Sample_ID[my.correct.samples],index.return=T)
  data.sort <- sort(as.character(my.datasets[my.cols]),index.return=T)
  
  my.res <- list(my.meta.data.patient[my.correct.samples[meta.sort$ix],],my.gene.counts[,c(1,2,my.cols[data.sort$ix])])
  return(my.res)
}


#######################################################################################################################################
## This function takes in the RNAseq matrix and processes it through DEseq modeling with respect to age in months
# INPUT: my.matrix: count matrix from subreads, columns selected, rownames implemented
#        reps.3, reps.12, reps.29: replicates from each age, default is 3
#        my.tissue: name of the tissue

# OUTPUT: list with [[1]] DEseq result object and [[2]] normalized log2 count matrix

process_aging_rnaseq <- function(my.tissue, my.gene.counts, my.meta.data.patient) {
  
  my.sorted.data <- get_sorted_meta_data_male_only(my.meta.data.patient,my.gene.counts,my.tissue)
  
  my.matrix <- my.sorted.data[[2]][-1,-c(1:2)]
  my.matrix <- data.frame(my.matrix)
  my.matrix <- sapply(my.matrix,as.numeric)
  
  rownames(my.matrix) <- my.sorted.data[[2]][-1,2]
  colnames(my.matrix) <- my.sorted.data[[2]][1,-(1:2)]
  
  # make ages into numbers for DESeq2
  #   unique(my.sorted.data[[1]]$Age)
  #   [1] "60-69" "50-59" "30-39" "40-49" "20-29"
  my.sorted.data[[1]]$Age[my.sorted.data[[1]]$Age %in% "20-29"] <- 25
  my.sorted.data[[1]]$Age[my.sorted.data[[1]]$Age %in% "30-39"] <- 35
  my.sorted.data[[1]]$Age[my.sorted.data[[1]]$Age %in% "40-49"] <- 45
  my.sorted.data[[1]]$Age[my.sorted.data[[1]]$Age %in% "50-59"] <- 55
  my.sorted.data[[1]]$Age[my.sorted.data[[1]]$Age %in% "60-69"] <- 65
  my.sorted.data[[1]]$Age[my.sorted.data[[1]]$Age %in% "70-79"] <- 75
  my.sorted.data[[1]]$Age <- as.numeric(my.sorted.data[[1]]$Age)
  
  ncols <- dim(my.matrix)[2]
  
  # get output file prefix
  my.outprefix <- paste(Sys.Date(),my.tissue,"DESeq2_LINEAR_model_with_age",sep="_")
  
  # get the genes with no reads out
  my.null <- which(apply(my.matrix[,1:ncols], 1, sum) <= 1) # see deseq2 vignetter

  # need to remove samples for which I have no gentoype
  is.na(my.sorted.data[[1]]$PC1)
  my.filtered.metadata <- my.sorted.data[[1]][!is.na(my.sorted.data[[1]]$PC1),]
  
  
  my.filtered.matrix <- my.matrix[-my.null,!is.na(my.sorted.data[[1]]$PC1)]
  
  age <- my.sorted.data[[1]]$Age
  
  # design matrix
  dataDesign = data.frame( row.names = colnames( my.filtered.matrix ),
                           age = my.filtered.metadata$Age,
                           Sex = my.filtered.metadata$Gender,
                           RIN = my.filtered.metadata$RIN,
                           Ischemic_time = my.filtered.metadata$Ischemic_time,
                           RNA_batch = factor(my.filtered.metadata$RNA_batch),
                           Exp_batch = factor(my.filtered.metadata$Exp_Batch),
                           duplication_rate = my.filtered.metadata$duplication_rate,
                           Fixation_time  = my.filtered.metadata$Fixation_time,
                           Genotype_PC1 = my.filtered.metadata$PC1,
                           Genotype_PC2 = my.filtered.metadata$PC2,
                           Genotype_PC3 = my.filtered.metadata$PC3
  )
  
  
  
  # get matrix using age as a modeling covariate
  dds <- DESeqDataSetFromMatrix(countData = my.filtered.matrix,
                                colData = dataDesign,
                                design = ~ age + RIN + Ischemic_time + Fixation_time + Genotype_PC1 + Genotype_PC2 + Genotype_PC3 + RNA_batch)
  
  # run DESeq normalizations and export results
  dds.deseq <- DESeq(dds, fitType = "local")
  res <- results(dds.deseq, name= "age") # added the name of the tested variable: doesn't seem to be taken correctly by default for FC
  
  # plot dispersion
  my.disp.out <- paste(my.outprefix,"_dispersion_plot.pdf")
  
  pdf(my.disp.out)
  plotDispEsts(dds.deseq)
  dev.off()
  
  
  # normalized expression value
  tissue.cts <- log2( counts(dds.deseq, normalize = TRUE) + 0.01)
  #colnames(tissue.cts) <-paste(my.sorted.data[[1]]$Age,"y",sep="")
  
  # vsd transformation
  vsd <- getVarianceStabilizedData(dds.deseq)
  
  
  # regress out the non age variance
  full.model <- model.matrix(~  age + RIN + Ischemic_time + Fixation_time + Genotype_PC1 + Genotype_PC2 + Genotype_PC3 + RNA_batch,
                             data = dataDesign) # all variables
  fit <- lmFit(ExpressionSet(assayData=as.matrix(vsd)), full.model)
  fit.eb <- eBayes(fit)
  print(colnames(fit))
  
  ### Regress out batch
  mod <- coefficients(fit)[,-c(1:2)] %*% t(fit$design[,-c(1:2)]) ### I keep only age and intercept
  my.filtered.matrix.corrected <- vsd - mod
  colnames(my.filtered.matrix.corrected)<- paste(my.filtered.metadata$Age,"y_",colnames(vsd))
  
  # do MDS analysis
  mds.result <- cmdscale(1-cor(my.filtered.matrix.corrected,method="spearman"), k = 2, eig = FALSE, add = FALSE, x.ret = FALSE)
  x <- mds.result[, 1]
  y <- mds.result[, 2]
  
  my.col.choices <- colorRampPalette(c("coral","blueviolet","dodgerblue"))(length(unique(my.filtered.metadata$Age)))
  my.colors <- rep("",ncols)
  
  my.colors[my.filtered.metadata$Age == 25] <- my.col.choices[1]
  my.colors[my.filtered.metadata$Age == 35] <- my.col.choices[2]
  my.colors[my.filtered.metadata$Age == 45] <- my.col.choices[3]
  my.colors[my.filtered.metadata$Age == 55] <- my.col.choices[4]
  my.colors[my.filtered.metadata$Age == 65] <- my.col.choices[5]
  
  if(length(unique(my.filtered.metadata$Age)) > 5) {
    my.colors[my.filtered.metadata$Age == 75] <- my.col.choices[6]
  }
  
  my.mds.out <- paste(my.outprefix,"_MDS_plot.pdf")
  
  pdf(my.mds.out)
  plot(x, y, xlab = "MDS dimension 1", ylab = "MDS dimension 2",main="Multi-dimensional Scaling",cex=4, col=my.colors)
  text(x, y, paste(my.filtered.metadata$Age,"y",sep=""), cex=1)
  dev.off()
  
  # expression range
  my.exp.out <- paste(my.outprefix,"_Normalized_counts_boxplot.pdf")
  
  pdf(my.exp.out)
  boxplot(my.filtered.matrix.corrected, cex=0.5,ylab="Log2 DESeq2 Normalized counts", main = my.tissue,col=my.colors, outline=F)  
  dev.off()
  
  ### get the heatmap of aging changes at FDR5
  ## exclude NA
  res <- res[!is.na(res$padj),]
    
  genes.aging <- rownames(res)[res$padj < 0.05]
  my.num.aging <- length(genes.aging)
  
  my.age.sort <- sort(my.filtered.metadata$Age,index.return = T)
  
  if (my.num.aging > 0) {
    # heatmap drawing - only if there is at least one gene
    my.heatmap.out <- paste(my.outprefix,"_Heatmap_significant_genes.pdf")
    
    pdf(my.heatmap.out, height = 10, width = 30, onefile=F)
    my.heatmap.title <- paste(my.tissue," aging singificant (FDR<5%), ",my.num.aging, " genes",sep="")
    pheatmap(my.filtered.matrix.corrected[genes.aging,my.age.sort$ix],
             cluster_cols = F,
             cluster_rows = T,
             colorRampPalette(rev(c("#CC3333","#FF9999","#FFCCCC","white","#CCCCFF","#9999FF","#333399")))(50),
             show_rownames = F, scale="row",
             main = my.heatmap.title, cellwidth = 12)
    dev.off()
  }

  
 
  # output result tables to files
  my.out.ct.mat <- paste(my.outprefix,"_log2_counts_matrix.txt")
  my.out.ct.mat.cl <- paste(my.outprefix,"_log2_counts_matrix_cleaned.txt")
  my.out.stats <- paste(my.outprefix,"_all_genes_statistics.txt")
  my.out.fdr5 <- paste(my.outprefix,"_FDR5_genes_statistics.txt")
  my.out.rdata <- paste(my.outprefix,"_statistics.RData")
  
  write.table(tissue.cts, file = my.out.ct.mat , sep = "\t" , row.names = T, quote=F)
  write.table(res, file = my.out.stats , sep = "\t" , row.names = T, quote=F)
  write.table(res[genes.aging,], file = my.out.fdr5, sep = "\t" , row.names = T, quote=F)
  write.table(my.filtered.matrix.corrected, file = my.out.ct.mat.cl , sep = "\t" , row.names = T, quote=F)

  my.out.metadata <- paste(my.outprefix,"_metadata.txt")
  write.table(dataDesign, file = my.out.metadata , sep = "\t" , row.names = T, quote=F)
    
  return(res)
}

#######################################################################################################################################

