options(stringsAsFactors = FALSE)

# load libraries for analysis
library(DESeq2)
library(pheatmap)
library('pvclust')
library(Biobase)
#library(limma)

#######################################################################################################################################
## This function takes in the height matrix and processes it through DEseq modeling with respect to age in months
# INPUT: my.matrix: count matrix from subreads, columns selected, rownames implemented
#        reps.3, reps.12, reps.29: replicates from each age, default is 3
#        my.tissue: name of the tissue
# OUTPUT: list of DESeq result object,with annotated genenames

process_aging_height <- function(my.tissue, my.matrix, my.annot, reps.3=2, reps.12=2, reps.29=2) {
  
  # the Diffbind has an offset in the name compared to HOMER (the beginning of diffbind is -1 HOMER)
  # put the -1 to be able to compare
  my.peak.annot <- read.csv(my.annot,header=T,sep="\t")
  rownames(my.peak.annot) <- paste(my.peak.annot$Chr,my.peak.annot$Start-1,my.peak.annot$End,sep="-")
  my.peak.annot$PeakName <- paste(my.peak.annot$Chr,my.peak.annot$Start-1,my.peak.annot$End,sep="-")
  
  my.matrix$PeakName <- rownames(my.matrix)
  
  # make it so gene name is there
  my.merged <- merge(my.matrix,my.peak.annot[,c(2:4,16,20)],by="PeakName")
  
  ncols <- dim(my.merged)[2]
  
  # get output file prefix
  my.outprefix <- paste(Sys.Date(),my.tissue,"DESeq2_LINEAR_model_with_age_H3K27ac_height_in_SE-Lite",sep="_")
  
  my.filtered.matrix <- my.matrix[,-c((ncols-4):ncols)] # for compatibility, remove the PeakName and coordinates column
  
  age <- as.numeric(c(rep(3,reps.3),rep(12,reps.12),rep(29,reps.29) )) # age in months
  
  # design matrix
  dataDesign = data.frame( row.names = colnames( my.filtered.matrix ), age = age )
  
  # get matrix using age as a modeling covariate
  dds <- DESeqDataSetFromMatrix(countData = round(my.filtered.matrix), # round the normalized counts for DESeq2
                                colData = dataDesign,
                                design = ~ age)
  
  # run DESeq normalizations and export results
  dds.deseq <- DESeq(dds)
  res.linear <- results(dds.deseq, name= "age") # added the name of the tested variable: doesn't seem to be taken correctly by default for FC
  
  # plot dispersion
  my.disp.out <- paste(my.outprefix,"_dispersion_plot.pdf",sep="")
  
  pdf(my.disp.out)
  plotDispEsts(dds.deseq)
  dev.off()
  
  
  my.maplot.out <- paste(my.outprefix,"_MA_plot.pdf",sep="")
  pdf(my.maplot.out)
  DESeq2::plotMA(res.linear)
  dev.off()
  
  # normalized expression value
  tissue.cts <- log2( counts(dds.deseq, normalize = TRUE) + 0.01) # log2 norm of height
  colnames(tissue.cts) <- c(paste("3m",1:reps.3,sep=""),paste("12m",1:reps.12,sep=""),paste("29m",1:reps.29,sep=""))
  
  # do MDS analysis
  mds.result <- cmdscale(1-cor(tissue.cts,method="spearman"), k = 2, eig = FALSE, add = FALSE, x.ret = FALSE)
  x <- mds.result[, 1]
  y <- mds.result[, 2]
  
  my.colors <- c(rep("coral",reps.3), rep("blueviolet", reps.12),rep("dodgerblue",reps.29))
  
  my.mds.out <- paste(my.outprefix,"_MDS_plot.pdf",sep="")
  
  pdf(my.mds.out)
  plot(x, y, xlab = "MDS dimension 1", ylab = "MDS dimension 2",main="Multi-dimensional Scaling",cex=2,pch=5)
  points(x, y, pch=18,col=my.colors,cex=2)
  legend("topleft",c("3m","12m","29m"),col=c("coral","blueviolet","dodgerblue"),pch=18,bty='n',pt.cex=1.5)
  dev.off()
  
  ##### PCA
  my.pos.var <- apply(tissue.cts,1,var) >0
  # do PCA analysis
  my.pca <- prcomp(t(tissue.cts[my.pos.var,]),scale = TRUE)
  x <- my.pca$x[,1]
  y <- my.pca$x[,2]
  
  my.summary <- summary(my.pca)
  
  my.pca.out <- paste(my.outprefix,"_PCA_plot.pdf",sep="")
  
  pdf(my.pca.out)
  plot(x,y, cex=2, 
       xlab = paste('PC1 (', round(100*my.summary$importance[,1][2],1),"%)", sep=""),
       ylab = paste('PC2 (', round(100*my.summary$importance[,2][2],1),"%)", sep=""),
       cex.lab = 1.5,pch=5) 
  points(x,y, pch=18, cex=3, col=my.colors)
  legend("topleft",c("3m","12m","29m"),col=c("coral","blueviolet","dodgerblue"),pch=16,bty='n',pt.cex=2)
  dev.off()
  #####
  
  
  
  # expression range
  my.exp.out <- paste(my.outprefix,"_Normalized_height_boxplot.pdf",sep="")
  
  pdf(my.exp.out)
  boxplot(tissue.cts,col=c(rep("coral",reps.3),rep("blueviolet",reps.12),rep("dodgerblue",reps.29)),
          cex=0.5,ylab="Log2 DESeq2/Diffbind Normalized counts", main = my.tissue)  
  dev.off()
  
  ### get the heatmap of aging changes at FDR5
  ## exclude NA
  my.nas.bool <- is.na(res.linear$padj)
  res.linear <- res.linear[!is.na(res.linear$padj),]
  
  genes.aging <- rownames(res.linear)[res.linear$padj < 0.05]
  my.num.aging <- length(genes.aging)
  
  if (my.num.aging > 1) {
    # heatmap drawing - only if there is at least one gene
    my.heatmap.out <- paste(my.outprefix,"_Heatmap_significant_domains.pdf",sep="")
    
    pdf(my.heatmap.out)
    my.heatmap.title <- paste(my.tissue," aging significant (FDR<5%), ",my.num.aging, " domains",sep="")
    pheatmap(my.filtered.matrix[genes.aging,], # show called height, not DESeq norm height
             cluster_cols = F,
             cluster_rows = T,
             colorRampPalette(rev(c("#CC3333","#FF9999","#FFCCCC","white","#CCCCFF","#9999FF","#333399")))(50),
             show_rownames = F, scale="row",
             main = my.heatmap.title, cellwidth = 30, border=NA)
    dev.off()
  }
  
  # # do clustering
  # my.pv <- pvclust(tissue.cts,nboot=1000)
  # my.heatmap.out <- paste(my.outprefix,"_PVCLUST_result.pdf",sep="")
  # 
  # pdf(my.heatmap.out)
  # plot(my.pv)
  # dev.off()
  
  # remove NAs
  my.filtered.matrix <-  my.filtered.matrix[!my.nas.bool,]
  my.merged <-  my.merged[!my.nas.bool,]
  
  # get output files
  
  ## for all changes
  my.decreased.de2 <- intersect(which(res.linear$padj < 0.05),which(res.linear$log2FoldChange < 0))
  my.increased.de2 <- intersect(which(res.linear$padj < 0.05),which(res.linear$log2FoldChange > 0))
  
  my.lost.out <- paste(my.outprefix,"_height_LINEAR_decreased_FDR5.xls",sep="")
  my.gained.out <- paste(my.outprefix,"_height_LINEAR_increased_FDR5.xls",sep="")
  
  write.table(cbind(my.merged[,c((ncols-3):ncols)],res.linear)[my.decreased.de2,], file=my.lost.out,sep="\t",quote=F,row.names=F,col.names=T)
  write.table(cbind(my.merged[,c((ncols-3):ncols)],res.linear)[my.increased.de2,], file=my.gained.out,sep="\t",quote=F,row.names=F,col.names=T)
  
  my.lost.out2 <- paste(my.outprefix,"_height_LINEAR_decreased_FDR5.bed",sep="")
  my.gained.out2 <- paste(my.outprefix,"_height_LINEAR_increased_FDR5.bed",sep="")
  my.background.out2 <- paste(my.outprefix,"_height_LINEAR_background.bed",sep="")
  
  write.table(cbind(my.merged[,c((ncols-3):ncols)])[my.decreased.de2,], file=my.lost.out2,sep="\t",quote=F,row.names=F,col.names=F)
  write.table(cbind(my.merged[,c((ncols-3):ncols)])[my.increased.de2,], file=my.gained.out2,sep="\t",quote=F,row.names=F,col.names=F)
  write.table(cbind(my.merged[,c((ncols-3):ncols)]), file=my.background.out2,sep="\t",quote=F,row.names=F,col.names=F)
  
  
  # show significant
  my.nums.bd.changes <- c(length(my.decreased.de2)+length(my.increased.de2),length(my.decreased.de2), length(my.increased.de2))
  names(my.nums.bd.changes) <- c("All changed domains","decreased domains","increased domains")
  print(my.nums.bd.changes)
  
  # create a slot for gene names
  res.linear.gnames <- res.linear
  
  my.genename.nas <- is.na(my.merged$Gene.Name)
  my.merged <- my.merged[!my.genename.nas,]
  res.linear.gnames <- res.linear.gnames[!my.genename.nas,]
  
  res.linear.gnames$GeneName <- as.vector(my.merged$Gene.Name)
  head(res.linear.gnames)
  
  # output result
  return(res.linear.gnames)
  
}




#######################################################################################################################################
## This function takes in the height matrix and processes it through DEseq modeling with respect to age in months
# INPUT: my.matrix: count matrix from subreads, columns selected, rownames implemented
#        reps.3, reps.12, reps.29: replicates from each age, default is 3
#        my.tissue: name of the tissue

# OUTPUT: list of DESeq result object,with annotated genenames
# 2016-02-24: add a batch modleing option for NPC

process_aging_height_batch <- function(my.tissue, my.matrix, my.annot, my.batch, reps.3=2, reps.12=2, reps.29=2) {
  
  # the Diffbind has an offset in the name compared to HOMER (the beginning of diffbind is -1 HOMER)
  # put the -1 to be able to compare
  my.peak.annot <- read.csv(my.annot,header=T,sep="\t")
  rownames(my.peak.annot) <- paste(my.peak.annot$Chr,my.peak.annot$Start-1,my.peak.annot$End,sep="-")
  my.peak.annot$PeakName <- paste(my.peak.annot$Chr,my.peak.annot$Start-1,my.peak.annot$End,sep="-")
  
  my.matrix$PeakName <- rownames(my.matrix)
  
  # make it so gene name is there
  my.merged <- merge(my.matrix,my.peak.annot[,c(2:4,16,20)],by="PeakName")
  
  ncols <- dim(my.merged)[2]
  
  # get output file prefix
  my.outprefix <- paste(Sys.Date(),my.tissue,"DESeq2_LINEAR_model_with_age_H3K27ac_height_in_SE-Lite",sep="_")
  
  my.filtered.matrix <- my.matrix[,-c((ncols-4):ncols)] # for compatibility, remove the PeakName and coordinates column
  
  age <- as.numeric(c(rep(3,reps.3),rep(12,reps.12),rep(29,reps.29) )) # age in months
  
  # design matrix
  dataDesign = data.frame( row.names = colnames( my.filtered.matrix ), age = age, batch = my.batch )
  
  # get matrix using age as a modeling covariate
  dds <- DESeqDataSetFromMatrix(countData = round(my.filtered.matrix), # round the normalized counts for DESeq2
                                colData = dataDesign,
                                design = ~ age + batch:age + batch)
  
  # run DESeq normalizations and export results
  dds.deseq <- DESeq(dds)
  res.linear <- results(dds.deseq, name= "age") # added the name of the tested variable: doesn't seem to be taken correctly by default for FC
  
  # plot dispersion
  my.disp.out <- paste(my.outprefix,"_dispersion_plot.pdf",sep="")
  
  pdf(my.disp.out)
  plotDispEsts(dds.deseq)
  dev.off()
  
  
  my.maplot.out <- paste(my.outprefix,"_MA_plot.pdf",sep="")
  pdf(my.maplot.out)
  DESeq2::plotMA(res.linear)
  dev.off()
  
  # normalized expression value
  tissue.cts <- log2( counts(dds.deseq, normalize = TRUE) + 0.01) # log2 norm of height
  colnames(tissue.cts) <- c(paste("3m",1:reps.3,sep=""),paste("12m",1:reps.12,sep=""),paste("29m",1:reps.29,sep=""))
 
  # regress out the non age variance
  full.model <- model.matrix(~ age + age:batch + batch, data = dataDesign) # all variables
  fit <- lmFit(ExpressionSet(assayData=as.matrix(my.filtered.matrix)), full.model)
  fit.eb <- eBayes(fit)
  print(colnames(fit))
  
  ### Regress out batch
  mod <- coefficients(fit)[,-c(1:2)] %*% t(fit$design[,-c(1:2)]) ### I keep only age and intercept
  my.filtered.matrix.corrected <- my.filtered.matrix - mod  
  
  # do MDS analysis
  mds.result <- cmdscale(1-cor(my.filtered.matrix.corrected,method="spearman"), k = 2, eig = FALSE, add = FALSE, x.ret = FALSE)
  x <- mds.result[, 1]
  y <- mds.result[, 2]
  
  my.colors <- c(rep("coral",reps.3), rep("blueviolet", reps.12),rep("dodgerblue",reps.29))
  
  my.mds.out <- paste(my.outprefix,"_MDS_plot.pdf",sep="")
  
  pdf(my.mds.out)
  plot(x, y, xlab = "MDS dimension 1", ylab = "MDS dimension 2",main="Multi-dimensional Scaling",cex=2,pch=5)
  points(x, y, pch=18,col=my.colors,cex=2)
  legend("topleft",c("3m","12m","29m"),col=c("coral","blueviolet","dodgerblue"),pch=18,bty='n',pt.cex=1.5)
  dev.off()
  
  
  ##### PCA
  my.pos.var <- apply(my.filtered.matrix.corrected,1,var) >0
  # do PCA analysis
  my.pca <- prcomp(t(my.filtered.matrix.corrected[my.pos.var,]),scale = TRUE)
  x <- my.pca$x[,1]
  y <- my.pca$x[,2]
  
  my.summary <- summary(my.pca)
  
  my.pca.out <- paste(my.outprefix,"_PCA_plot.pdf",sep="")
  
  pdf(my.pca.out)
  plot(x,y, cex=2, 
       xlab = paste('PC1 (', round(100*my.summary$importance[,1][2],1),"%)", sep=""),
       ylab = paste('PC2 (', round(100*my.summary$importance[,2][2],1),"%)", sep=""),
       cex.lab = 1.5,pch=5) 
  points(x,y, pch=18, cex=3, col=my.colors)
  legend("topleft",c("3m","12m","29m"),col=c("coral","blueviolet","dodgerblue"),pch=16,bty='n',pt.cex=2)
  dev.off()
  #####
  
  # expression range
  my.exp.out <- paste(my.outprefix,"_Normalized_height_boxplot.pdf",sep="")
  
  pdf(my.exp.out)
  boxplot(tissue.cts,col=c(rep("coral",reps.3),rep("blueviolet",reps.12),rep("dodgerblue",reps.29)),
          cex=0.5,ylab="Log2 DESeq2/Diffbind Normalized counts", main = my.tissue)  
  dev.off()
  
  ### get the heatmap of aging changes at FDR5
  ## exclude NA
  my.nas.bool <- is.na(res.linear$padj)
  res.linear <- res.linear[!is.na(res.linear$padj),]
  
  genes.aging <- rownames(res.linear)[res.linear$padj < 0.05]
  my.num.aging <- length(genes.aging)
  
  if (my.num.aging > 1) {
    # heatmap drawing - only if there is at least one gene
    my.heatmap.out <- paste(my.outprefix,"_Heatmap_significant_domains.pdf",sep="")
    
    pdf(my.heatmap.out)
    my.heatmap.title <- paste(my.tissue," aging significant (FDR<5%), ",my.num.aging, " domains",sep="")
    pheatmap(my.filtered.matrix.corrected[genes.aging,], # show called height, not DESeq norm height
             cluster_cols = F,
             cluster_rows = T,
             colorRampPalette(rev(c("#CC3333","#FF9999","#FFCCCC","white","#CCCCFF","#9999FF","#333399")))(50),
             show_rownames = F, scale="row",
             main = my.heatmap.title, cellwidth = 30, border=NA)
    dev.off()
  }
  
  # # do clustering
  # my.pv <- pvclust(tissue.cts,nboot=1000)
  # my.heatmap.out <- paste(my.outprefix,"_PVCLUST_result.pdf",sep="")
  # 
  # pdf(my.heatmap.out)
  # plot(my.pv)
  # dev.off()
  
  # remove NAs
  my.filtered.matrix <-  my.filtered.matrix[!my.nas.bool,]
  my.merged <-  my.merged[!my.nas.bool,]
  
  # get output files
  
  ## for all changes
  my.decreased.de2 <- intersect(which(res.linear$padj < 0.05),which(res.linear$log2FoldChange < 0))
  my.increased.de2 <- intersect(which(res.linear$padj < 0.05),which(res.linear$log2FoldChange > 0))
  
  my.lost.out <- paste(my.outprefix,"_height_LINEAR_decreased_FDR5.xls",sep="")
  my.gained.out <- paste(my.outprefix,"_height_LINEAR_increased_FDR5.xls",sep="")
  
  write.table(cbind(my.merged[,c((ncols-3):ncols)],res.linear)[my.decreased.de2,], file=my.lost.out,sep="\t",quote=F,row.names=F,col.names=T)
  write.table(cbind(my.merged[,c((ncols-3):ncols)],res.linear)[my.increased.de2,], file=my.gained.out,sep="\t",quote=F,row.names=F,col.names=T)
  
  my.lost.out2 <- paste(my.outprefix,"_height_LINEAR_decreased_FDR5.bed",sep="")
  my.gained.out2 <- paste(my.outprefix,"_height_LINEAR_increased_FDR5.bed",sep="")
  my.background.out2 <- paste(my.outprefix,"_height_LINEAR_background.bed",sep="")
  
  write.table(cbind(my.merged[,c((ncols-3):ncols)])[my.decreased.de2,], file=my.lost.out2,sep="\t",quote=F,row.names=F,col.names=F)
  write.table(cbind(my.merged[,c((ncols-3):ncols)])[my.increased.de2,], file=my.gained.out2,sep="\t",quote=F,row.names=F,col.names=F)
  write.table(cbind(my.merged[,c((ncols-3):ncols)]), file=my.background.out2,sep="\t",quote=F,row.names=F,col.names=F)
  
  
  # show significant
  my.nums.bd.changes <- c(length(my.decreased.de2)+length(my.increased.de2),length(my.decreased.de2), length(my.increased.de2))
  names(my.nums.bd.changes) <- c("All changed domains","decreased domains","increased domains")
  print(my.nums.bd.changes)
  
  # create a slot for gene names
  res.linear.gnames <- res.linear
  
  my.genename.nas <- is.na(my.merged$Gene.Name)
  my.merged <- my.merged[!my.genename.nas,]
  res.linear.gnames <- res.linear.gnames[!my.genename.nas,]
  
  res.linear.gnames$GeneName <- as.vector(my.merged$Gene.Name)
  head(res.linear.gnames)
  
  # output result
  return(res.linear.gnames)
  
}