options(stringsAsFactors = FALSE)

# load libraries for analysis
library(DESeq2)
library(pheatmap)
library(Biobase)
library(limma)

# modified on 2015-12-02 to output annotated Gene name in the DESeq2 result object (for pathway analysis)

#######################################################################################################################################
## This function takes in the breadth matrix and processes it through DEseq modeling with respect to age in months
# INPUT: my.matrix: count matrix from subreads, columns selected, rownames implemented
#        reps.3, reps.12, reps.29: replicates from each age, default is 3
#        my.tissue: name of the tissue

# OUTPUT: list with [[1]] DEseq result object and [[2]] normalized log2 count matrix

process_aging_breadth<- function(my.tissue, my.matrix, my.annot, reps.3=2, reps.12=2, reps.29=2) {
  
  ncols <- dim(my.matrix)[2]
  
  # get output file prefix
  my.outprefix <- paste(Sys.Date(),my.tissue,"DESeq2_LINEAR_model_with_age_H3K4me3_breadth",sep="_")
  
  my.filtered.matrix <- my.matrix # for compatibility
  
  age <- as.numeric(c(rep(3,reps.3),rep(12,reps.12),rep(29,reps.29) )) # age in months
  
  # design matrix
  dataDesign = data.frame( row.names = colnames( my.filtered.matrix ), age = age )
  
  # get matrix using age as a modeling covariate
  dds <- DESeqDataSetFromMatrix(countData = my.filtered.matrix,
                                colData = dataDesign,
                                design = ~ age)
  
  # run DESeq normalizations and export results
  dds.deseq <- DESeq(dds)
  res.linear <- results(dds.deseq, name= "age") # added the name of the tested variable: doesn't seem to be taken correctly by default for FC
  
  # plot dispersion
  my.disp.out <- paste(my.outprefix,"_dispersion_plot.pdf",sep="")
  
  pdf(my.disp.out)
  plotDispEsts(dds.deseq)
  dev.off()
  
  
  my.maplot.out <- paste(my.outprefix,"_MA_plot.pdf",sep="")
  pdf(my.maplot.out)
  MAPlot_breadth(res.linear, my.tissue)
  dev.off()
  
  # normalized expression value
  tissue.cts <- log2( counts(dds.deseq, normalize = TRUE) + 0.01) # log2 norm of breadth
  colnames(tissue.cts) <- c(paste("3m",1:reps.3,sep=""),paste("12m",1:reps.12,sep=""),paste("29m",1:reps.29,sep=""))
  
  # do MDS analysis
  mds.result <- cmdscale(1-cor(tissue.cts,method="spearman"), k = 2, eig = FALSE, add = FALSE, x.ret = FALSE)
  x <- mds.result[, 1]
  y <- mds.result[, 2]
  
  my.colors <- c(rep("coral",reps.3), rep("blueviolet", reps.12),rep("dodgerblue",reps.29))
  
  my.mds.out <- paste(my.outprefix,"_MDS_plot.pdf",sep="")
  
  pdf(my.mds.out)
  plot(x, y, xlab = "MDS dimension 1", ylab = "MDS dimension 2",main="Multi-dimensional Scaling",cex=2)
  points(x, y, pch=16,col=my.colors,cex=2)
  legend("topleft",c("3m","12m","29m"),col=c("coral","blueviolet","dodgerblue"),pch=16,bty='n',pt.cex=2)
  dev.off()
  
  
  ##### PCA
  my.pos.var <- apply(tissue.cts,1,var) >0
  # do PCA analysis
  my.pca <- prcomp(t(tissue.cts[my.pos.var,]),scale = TRUE)
  x <- my.pca$x[,1]
  y <- my.pca$x[,2]
  
  my.summary <- summary(my.pca)
  
  my.pca.out <- paste(my.outprefix,"_PCA_plot.pdf",sep="")
  
  pdf(my.pca.out)
  plot(x,y, cex=2, 
       xlab = paste('PC1 (', round(100*my.summary$importance[,1][2],1),"%)", sep=""),
       ylab = paste('PC2 (', round(100*my.summary$importance[,2][2],1),"%)", sep=""),
       cex.lab = 1.5) 
  points(x,y, pch = 16, cex=3, col=NA,bg=my.colors)
  legend("topleft",c("3m","12m","29m"),col=c("coral","blueviolet","dodgerblue"),pch=16,bty='n',pt.cex=2)
  dev.off()
  #####
  
  
  # expression range
  my.exp.out <- paste(my.outprefix,"_Normalized_breadth_boxplot.pdf",sep="")
  
  pdf(my.exp.out)
  boxplot(tissue.cts,col=c(rep("coral",reps.3),rep("blueviolet",reps.12),rep("dodgerblue",reps.29)),
          cex=0.5,ylab="Log2 DESeq2 Normalized counts", main = my.tissue)  
  dev.off()
  
  ### get the heatmap of aging changes at FDR5
  ## exclude NA
  my.nas.bool <- is.na(res.linear$padj)
  res.linear <- res.linear[!is.na(res.linear$padj),]
  
  genes.aging <- rownames(res.linear)[res.linear$padj < 0.05]
  my.num.aging <- length(genes.aging)
  
  if (my.num.aging > 0) {
    # heatmap drawing - only if there is at least one gene
    my.heatmap.out <- paste(my.outprefix,"_Heatmap_significant_domains.pdf",sep="")
    
    pdf(my.heatmap.out)
    my.heatmap.title <- paste(my.tissue," aging significant (FDR<5%), ",my.num.aging, " domains",sep="")
    pheatmap(my.filtered.matrix[genes.aging,], # show called breadth, not DESeq norm breadth
             cluster_cols = F,
             cluster_rows = T,
             colorRampPalette(rev(c("#CC3333","#FF9999","#FFCCCC","white","#CCCCFF","#9999FF","#333399")))(50),
             show_rownames = F, scale="row",
             main = my.heatmap.title, cellwidth = 30)
    dev.off()
  }
  
  
  # remove NAs
  my.annot <-  my.annot[!my.nas.bool,]
  my.filtered.matrix <-  my.filtered.matrix[!my.nas.bool,]
  
  # get output files
  
  ## for all changes
  my.lost.de2 <- intersect(which(res.linear$padj < 0.05),which(res.linear$log2FoldChange < 0))
  my.gained.de2 <- intersect(which(res.linear$padj < 0.05),which(res.linear$log2FoldChange > 0))
  
  my.lost.out <- paste(my.outprefix,"_breadth_LINEAR_LOST_ALL.bed",sep="")
  my.gained.out <- paste(my.outprefix,"_breadth_LINEAR_GAINED_ALL.bed",sep="")
  my.out.all <-paste(my.outprefix,"_breadth_LINEAR_CHANGED_ALL.xls",sep="")
  
  write.table(cbind(my.annot[,1:7],res.linear)[my.lost.de2,c(2,3,4,1)], file=my.lost.out,sep="\t",quote=F,row.names=F,col.names=F)
  write.table(cbind(my.annot[,1:7],res.linear)[my.gained.de2,c(2,3,4,1)], file=my.gained.out,sep="\t",quote=F,row.names=F,col.names=F)
  write.table(cbind(my.annot[,1:7],res.linear)[union(my.lost.de2,my.gained.de2),], file=my.out.all,sep="\t",quote=F,row.names=F,col.names=F)
  
  ### get quartiles
  my.3m.qt <- get_my_av_qt (my.filtered.matrix[,1:2])
  my.12m.qt <- get_my_av_qt (my.filtered.matrix[,3:4])
  my.29m.qt <- get_my_av_qt (my.filtered.matrix[,5:6])
  
  my.null <- which(apply(my.filtered.matrix,1,sum)==0)
  my.bd.bckgd <- union(union(my.3m.qt[[20]],my.12m.qt[[20]]),my.29m.qt[[20]])
  
  my.all.bd.out <- paste(my.outprefix,"_AGING_ALL_broad_H3K4me3_domain.bed",sep="")
  my.all.k4.out <- paste(my.outprefix,"_AGING_ALL_broad_H3K4me3_domain.bed",sep="")
  
  
  write.table(my.annot[my.bd.bckgd,c(2,3,4,1)], file = my.all.bd.out,quote=F,row.names=F,col.names=F,sep="\t")
  write.table(my.annot[-my.null,c(2,3,4,1)], file = my.all.k4.out,quote=F,row.names=F,col.names=F,sep="\t")
  
  my.changedBD.de2 <- intersect(which(res.linear$padj < 0.05), my.bd.bckgd )
  my.lostBD.de2 <- intersect(my.changedBD.de2,which(res.linear$log2FoldChange < 0))
  my.gainedBD.de2 <- intersect(my.changedBD.de2,which(res.linear$log2FoldChange > 0))
  
  # show significant
  my.nums.bd.changes <- c(length(my.changedBD.de2),length(my.lostBD.de2), length(my.gainedBD.de2))
  names(my.nums.bd.changes) <- c("All changed BDs","Eroded BDs","Extended BDs")
  print(my.nums.bd.changes)
  
  
  my.heatmap.out.2 <- paste(my.outprefix,"_Heatmap_significant_BDs.pdf",sep="")
  
  pdf(my.heatmap.out.2)
  my.heatmap.title.2 <- paste(my.tissue," aging significant (FDR<5%), ",my.nums.bd.changes[1], " broad domains",sep="")
  pheatmap(my.filtered.matrix[my.changedBD.de2,], # show called breadth, not DESeq norm breadth
           cluster_cols = F,
           cluster_rows = T,
           colorRampPalette(rev(c("#CC3333","#FF9999","#FFCCCC","white","#CCCCFF","#9999FF","#333399")))(50),
           show_rownames = F, scale="row",
           main = my.heatmap.title.2, cellwidth = 30, border=NA)
  dev.off()
  
  
  # output result tables to files
  my.out.1 <- paste(my.outprefix,"_CHANGED_BROAD_ANNOT.xls",sep="")
  my.out.2 <- paste(my.outprefix,"_CHANGED_BROAD.bed",sep="")
  my.out.3 <- paste(my.outprefix,"_LOST_BROAD.bed",sep="")
  my.out.4 <- paste(my.outprefix,"_GAINED_BROAD.bed",sep="")
  
  write.table(cbind(my.annot[,1:7],res.linear)[my.changedBD.de2,], file=my.out.1,sep="\t",quote=F,row.names=F,col.names=F)
  write.table(cbind(my.annot[,1:7],res.linear)[my.changedBD.de2,c(2,3,4,1)], file=my.out.2,sep="\t",quote=F,row.names=F,col.names=F)
  write.table(cbind(my.annot[,1:7],res.linear)[my.lostBD.de2,c(2,3,4,1)], file=my.out.3,sep="\t",quote=F,row.names=F,col.names=F)
  write.table(cbind(my.annot[,1:7],res.linear)[my.gainedBD.de2,c(2,3,4,1)], file=my.out.4,sep="\t",quote=F,row.names=F,col.names=F)
  
  # create a slot for gene names
  res.linear.gnames <- res.linear
  res.linear.gnames$GeneName <- my.annot$Gene.Name
  #head(res.linear.gnames)
    
  return(res.linear.gnames)
  
}

#######################################################################################################################################
#######################################################################################################################################
#######################################################################################################################################
# same but with batch effect removal
process_aging_breadth_batch <- function(my.tissue, my.matrix, my.annot, my.batch, reps.3=4, reps.12=4, reps.29=4) {
  
  ncols <- dim(my.matrix)[2]
  
  # get output file prefix
  my.outprefix <- paste(Sys.Date(),my.tissue,"DESeq2_LINEAR_model_with_age_H3K4me3_breadth",sep="_")
  
  my.filtered.matrix <- my.matrix # for compatibility
  
  age <- as.numeric(c(rep(3,reps.3),rep(12,reps.12),rep(29,reps.29) )) # age in months
  
  # design matrix
  dataDesign = data.frame( row.names = colnames( my.filtered.matrix ), age = age , batch=my.batch)
  
  # get matrix using age as a modeling covariate
  dds <- DESeqDataSetFromMatrix(countData = my.filtered.matrix,
                                colData = dataDesign,
                                design = ~ age + age:batch + batch)
  
  # run DESeq normalizations and export results
  dds.deseq <- DESeq(dds)
  res.linear <- results(dds.deseq, name= "age") # added the name of the tested variable: doesn't seem to be taken correctly by default for FC
  
  # plot dispersion
  my.disp.out <- paste(my.outprefix,"_dispersion_plot.pdf",sep="")
  
  pdf(my.disp.out)
  plotDispEsts(dds.deseq)
  dev.off()
  
  
  my.maplot.out <- paste(my.outprefix,"_MA_plot.pdf",sep="")
  pdf(my.maplot.out)
  MAPlot_breadth(res.linear, my.tissue)
  dev.off()
  
  # normalized expression value
  tissue.cts <- log2( counts(dds.deseq, normalize = TRUE) + 0.01) # log2 norm of breadth
  colnames(tissue.cts) <- c(paste("3m",1:reps.3,sep=""),paste("12m",1:reps.12,sep=""),paste("29m",1:reps.29,sep=""))
  
  # regress out the non age variance
  full.model <- model.matrix(~ age + age:batch + batch, data = dataDesign) # all variables
  fit <- lmFit(ExpressionSet(assayData=as.matrix(my.filtered.matrix)), full.model)
  fit.eb <- eBayes(fit)
  print(colnames(fit))
  
  ### Regress out batch
  mod <- coefficients(fit)[,-c(1:2)] %*% t(fit$design[,-c(1:2)]) ### I keep only age and intercept
  my.filtered.matrix.corrected <- my.filtered.matrix - mod
  
  
  # do MDS analysis
  mds.result <- cmdscale(1-cor(my.filtered.matrix.corrected,method="spearman"), k = 2, eig = FALSE, add = FALSE, x.ret = FALSE)
  x <- mds.result[, 1]
  y <- mds.result[, 2]
  
  my.colors <- c(rep("coral",reps.3), rep("blueviolet", reps.12),rep("dodgerblue",reps.29))
  
  my.mds.out <- paste(my.outprefix,"_MDS_plot.pdf",sep="")
  
  pdf(my.mds.out)
  plot(x, y, xlab = "MDS dimension 1", ylab = "MDS dimension 2",main="Multi-dimensional Scaling",cex=2)
  points(x, y, pch=16,col=my.colors,cex=2)
  legend("topleft",c("3m","12m","29m"),col=c("coral","blueviolet","dodgerblue"),pch=16,bty='n',pt.cex=2)
  dev.off()
  
  
  ##### PCA
  my.pos.var <- apply(my.filtered.matrix.corrected,1,var) >0
  # do PCA analysis
  my.pca <- prcomp(t(my.filtered.matrix.corrected[my.pos.var,]),scale = TRUE)
  x <- my.pca$x[,1]
  y <- my.pca$x[,2]
  
  my.summary <- summary(my.pca)
  
  my.pca.out <- paste(my.outprefix,"_PCA_plot.pdf",sep="")
  
  pdf(my.pca.out)
  plot(x,y,cex=2, 
       xlab = paste('PC1 (', round(100*my.summary$importance[,1][2],1),"%)", sep=""),
       ylab = paste('PC2 (', round(100*my.summary$importance[,2][2],1),"%)", sep=""),
       cex.lab = 1.5) 
  points(x,y, pch = 16, cex=3, col=NA,bg=my.colors)
  legend("topleft",c("3m","12m","29m"),col=c("coral","blueviolet","dodgerblue"),pch=16,bty='n',pt.cex=2)
  dev.off()
  #####
  
  
  # expression range
  my.exp.out <- paste(my.outprefix,"_Normalized_breadth_boxplot.pdf",sep="")
  
  pdf(my.exp.out)
  boxplot(log2(my.filtered.matrix.corrected + 0.01),col=c(rep("coral",reps.3),rep("blueviolet",reps.12),rep("dodgerblue",reps.29)),
          cex=0.5,ylab="Log2 DESeq2 Normalized counts", main = my.tissue)  
  dev.off()
  
  ### get the heatmap of aging changes at FDR5
  ## exclude NA
  my.nas.bool <- is.na(res.linear$padj)
  res.linear <- res.linear[!is.na(res.linear$padj),]
  
  genes.aging <- rownames(res.linear)[res.linear$padj < 0.05]
  my.num.aging <- length(genes.aging)
  
  if (my.num.aging > 0) {
    # heatmap drawing - only if there is at least one gene
    my.heatmap.out <- paste(my.outprefix,"_Heatmap_significant_domains.pdf",sep="")
    
    pdf(my.heatmap.out)
    my.heatmap.title <- paste(my.tissue," aging significant (FDR<5%), ",my.num.aging, " domains",sep="")
    pheatmap(my.filtered.matrix.corrected[genes.aging,], # show called breadth, not DESeq norm breadth
             cluster_cols = F,
             cluster_rows = T,
             colorRampPalette(rev(c("#CC3333","#FF9999","#FFCCCC","white","#CCCCFF","#9999FF","#333399")))(50),
             show_rownames = F, scale="row",
             main = my.heatmap.title, cellwidth = 30)
    dev.off()
  }
  
  # remove NAs
  my.annot <-  my.annot[!my.nas.bool,]
  my.filtered.matrix <-  my.filtered.matrix[!my.nas.bool,]
  
  # get output files
  
  ## for all changes
  my.lost.de2 <- intersect(which(res.linear$padj < 0.05),which(res.linear$log2FoldChange < 0))
  my.gained.de2 <- intersect(which(res.linear$padj < 0.05),which(res.linear$log2FoldChange > 0))
  
  my.lost.out <- paste(my.outprefix,"_breadth_LINEAR_LOST_ALL.bed",sep="")
  my.gained.out <- paste(my.outprefix,"_breadth_LINEAR_GAINED_ALL.bed",sep="")
  my.out.all <-paste(my.outprefix,"_breadth_LINEAR_CHANGED_ALL.xls",sep="")
  
  write.table(cbind(my.annot[,1:7],res.linear)[my.lost.de2,c(2,3,4,1)], file=my.lost.out,sep="\t",quote=F,row.names=F,col.names=F)
  write.table(cbind(my.annot[,1:7],res.linear)[my.gained.de2,c(2,3,4,1)], file=my.gained.out,sep="\t",quote=F,row.names=F,col.names=F)
  write.table(cbind(my.annot[,1:7],res.linear)[union(my.lost.de2,my.gained.de2),], file=my.out.all,sep="\t",quote=F,row.names=F,col.names=F)
  
  ### get quartiles
  my.3m.qt <- get_my_av_qt (my.filtered.matrix[,1:4])
  my.12m.qt <- get_my_av_qt (my.filtered.matrix[,5:8])
  my.29m.qt <- get_my_av_qt (my.filtered.matrix[,9:12])
  
  my.null <- which(apply(my.filtered.matrix,1,sum)==0)
  my.bd.bckgd <- union(union(my.3m.qt[[20]],my.12m.qt[[20]]),my.29m.qt[[20]])
  
  my.all.bd.out <- paste(my.outprefix,"_AGING_ALL_broad_H3K4me3_domain.bed",sep="")
  my.all.k4.out <- paste(my.outprefix,"_AGING_ALL_broad_H3K4me3_domain.bed",sep="")
  
  
  write.table(my.annot[my.bd.bckgd,c(2,3,4,1)], file = my.all.bd.out,quote=F,row.names=F,col.names=F,sep="\t")
  write.table(my.annot[-my.null,c(2,3,4,1)], file = my.all.k4.out,quote=F,row.names=F,col.names=F,sep="\t")
  
  my.changedBD.de2 <- intersect(which(res.linear$padj < 0.05), my.bd.bckgd )
  my.lostBD.de2 <- intersect(my.changedBD.de2,which(res.linear$log2FoldChange < 0))
  my.gainedBD.de2 <- intersect(my.changedBD.de2,which(res.linear$log2FoldChange > 0))
  
  # show significant
  my.nums.bd.changes <- c(length(my.changedBD.de2),length(my.lostBD.de2), length(my.gainedBD.de2))
  names(my.nums.bd.changes) <- c("All changed BDs","Eroded BDs","Extended BDs")
  print(my.nums.bd.changes)
  
  
  my.heatmap.out.2 <- paste(my.outprefix,"_Heatmap_significant_BDs.pdf",sep="")
  if (my.changedBD.de2 > 0) {
    
    my.null.rows <- which(apply(my.filtered.matrix.corrected,1,sum) == 0)
    my.changedBD.de2.nonnull <- setdiff(my.changedBD.de2,my.null.rows)
    
    pdf(my.heatmap.out.2)
    my.heatmap.title.2 <- paste(my.tissue," aging significant (FDR<5%), ",my.nums.bd.changes[1], " broad domains",sep="")
    pheatmap(my.filtered.matrix.corrected[my.changedBD.de2.nonnull,], # show called breadth, not DESeq norm breadth
             cluster_cols = F,
             cluster_rows = T,
             colorRampPalette(rev(c("#CC3333","#FF9999","#FFCCCC","white","#CCCCFF","#9999FF","#333399")))(50),
             show_rownames = F, scale="row",
             main = my.heatmap.title.2, cellwidth = 30, border=NA)
    dev.off()
  }
  
  # output result tables to files
  my.out.1 <- paste(my.outprefix,"_CHANGED_BROAD_ANNOT.xls",sep="")
  my.out.2 <- paste(my.outprefix,"_CHANGED_BROAD.bed",sep="")
  my.out.3 <- paste(my.outprefix,"_LOST_BROAD.bed",sep="")
  my.out.4 <- paste(my.outprefix,"_GAINED_BROAD.bed",sep="")
  
  write.table(cbind(my.annot[,1:7],res.linear)[my.changedBD.de2,], file=my.out.1,sep="\t",quote=F,row.names=F,col.names=F)
  write.table(cbind(my.annot[,1:7],res.linear)[my.changedBD.de2,c(2,3,4,1)], file=my.out.2,sep="\t",quote=F,row.names=F,col.names=F)
  write.table(cbind(my.annot[,1:7],res.linear)[my.lostBD.de2,c(2,3,4,1)], file=my.out.3,sep="\t",quote=F,row.names=F,col.names=F)
  write.table(cbind(my.annot[,1:7],res.linear)[my.gainedBD.de2,c(2,3,4,1)], file=my.out.4,sep="\t",quote=F,row.names=F,col.names=F)
  
 # return(res.linear)
  
  # create a slot for gene names
  res.linear.gnames <- res.linear
  res.linear.gnames$GeneName <- my.annot$Gene.Name
  #head(res.linear.gnames)
  
  return(res.linear.gnames)
  
}
