options(stringsAsFactors = FALSE)

# load libraries for analysis
library(DESeq2)
library(pheatmap)
library('pvclust')
library(Biobase)
library(limma)

#######################################################################################################################################
## This function takes in the height matrix and processes it through DEseq modeling with respect to age in months
# INPUT: my.matrix: count matrix from subreads, columns selected, rownames implemented
#        reps.3, reps.12, reps.29: replicates from each age, default is 3
#        my.tissue: name of the tissue
# OUTPUT: list of DESeq result object,with annotated genenames

# my.tissue <- "Liver"
# my.matrix <- my.liver
# my.annot <- "../Diffbind/2018-09-13_Liver_H3K27ac_aging_diffbind.Repeat_ANNOT.txt"
# reps.3=2
# reps.12=2
# reps.29=2

process_aging_height_TE <- function(my.tissue, my.matrix, my.annot, reps.3=2, reps.12=2, reps.29=2) {
  
  # the Diffbind has an offset in the name compared to HOMER (the beginning of diffbind is -1 HOMER)
  # put the -1 to be able to compare
  my.peak.annot <- read.csv(my.annot,header=F,sep="\t")
  colnames(my.peak.annot) <- c("Diffbind_Peak","Elem_Unique_Name","Overlap","RepeatFamily", "Repeat_coord")
  
  my.matrix$Diffbind_Peak <- rownames(my.matrix)
  
  # make it so gene name is there
  my.merged <- merge(my.matrix,my.peak.annot[,c("Diffbind_Peak","Elem_Unique_Name","RepeatFamily")],by="Diffbind_Peak")
  ncols <- dim(my.merged)[2]
  
  # collapse repeats per family (lack of precision of single ended chip-seq)
  # "Diffbind_Peak"    "3m1" "3m2" "12m1" "12m2" "29m1" "29m2" "Elem_Unique_Name" "RepeatFamily"
  my.summed_repeats <- aggregate(my.merged[,-c(1,(ncols-1):ncols)],by=list(my.merged$RepeatFamily),FUN=sum)
  colnames(my.summed_repeats)[1] <- "Repeat"
  my.clean.names <-  gsub('(', "_", my.summed_repeats$Repeat, fixed = T)
  my.clean.names <-  gsub(')', "_", my.clean.names, fixed = T)
  rownames(my.summed_repeats) <- my.clean.names
  
  # get output file prefix
  my.outprefix <- paste(Sys.Date(),my.tissue,"DESeq2_LINEAR_model_with_age_H3K27ac_height_Repeats",sep="_")
  
  my.filtered.matrix <- my.summed_repeats[,-1] # for compatibility, remove the PeakName and coordinates column
  
  age <- as.numeric(c(rep(3,reps.3),rep(12,reps.12),rep(29,reps.29) )) # age in months
  
  # design matrix
  dataDesign = data.frame( row.names = colnames( my.filtered.matrix ), age = age )
  
  # get matrix using age as a modeling covariate
  dds <- DESeqDataSetFromMatrix(countData = round(my.filtered.matrix), # round the normalized counts for DESeq2
                                colData = dataDesign,
                                design = ~ age)
  
  # run DESeq normalizations and export results
  dds.deseq <- DESeq(dds)
  res.linear <- results(dds.deseq, name= "age") # added the name of the tested variable: doesn't seem to be taken correctly by default for FC
  
  # plot dispersion
  my.disp.out <- paste(my.outprefix,"_dispersion_plot.pdf",sep="")
  
  pdf(my.disp.out)
  plotDispEsts(dds.deseq)
  dev.off()
  
  # normalized expression value
  tissue.cts <- log2( counts(dds.deseq, normalize = TRUE) + 0.01) # log2 norm of height
  colnames(tissue.cts) <- c(paste("3m",1:reps.3,sep=""),paste("12m",1:reps.12,sep=""),paste("29m",1:reps.29,sep=""))
  
  # expression range
  my.exp.out <- paste(my.outprefix,"_Normalized_height_boxplot.pdf",sep="")
  
  pdf(my.exp.out)
  boxplot(tissue.cts,col=c(rep("coral",reps.3),rep("blueviolet",reps.12),rep("dodgerblue",reps.29)),
          cex=0.5,ylab="Log2 DESeq2/Diffbind Normalized counts", main = my.tissue)  
  dev.off()
  
  ### get the heatmap of aging changes at FDR5
  ## exclude NA
  my.nas.bool <- is.na(res.linear$padj)
  res.linear <- res.linear[!is.na(res.linear$padj),]
  
  genes.aging <- rownames(res.linear)[res.linear$padj < 0.05]
  my.num.aging <- length(genes.aging)
  
  if (my.num.aging > 1) {
    # heatmap drawing - only if there is at least one gene
    my.heatmap.out <- paste(my.outprefix,"_Heatmap_significant_domains.pdf",sep="")
    
    pdf(my.heatmap.out)
    my.heatmap.title <- paste(my.tissue," aging H3K27ac significant (FDR<5%), ",my.num.aging, " domains",sep="")
    pheatmap(my.filtered.matrix[genes.aging,], # show called height, not DESeq norm height
             cluster_cols = F,
             cluster_rows = T,
             colorRampPalette(rev(c("#CC3333","#FF9999","#FFCCCC","white","#CCCCFF","#9999FF","#333399")))(50),
             show_rownames = T, scale="row",
             main = my.heatmap.title, cellwidth = 30, border=NA)
    dev.off()
  }
  
  # remove NAs
  my.filtered.matrix <-  my.filtered.matrix[!my.nas.bool,]
  my.summed_repeats <-  my.summed_repeats[!my.nas.bool,]
  
  # get output files
  
  ## for all changes
  my.decreased.de2 <- intersect(which(res.linear$padj < 0.05),which(res.linear$log2FoldChange < 0))
  my.increased.de2 <- intersect(which(res.linear$padj < 0.05),which(res.linear$log2FoldChange > 0))
  
  my.lost.out <- paste(my.outprefix,"_height_LINEAR_decreased_FDR5.xls",sep="")
  my.gained.out <- paste(my.outprefix,"_height_LINEAR_increased_FDR5.xls",sep="")
  
  write.table(cbind(my.summed_repeats$Repeat,res.linear)[my.decreased.de2,], file=my.lost.out,sep="\t",quote=F,row.names=F,col.names=T)
  write.table(cbind(my.summed_repeats$Repeat,res.linear)[my.increased.de2,], file=my.gained.out,sep="\t",quote=F,row.names=F,col.names=T)
  
  # show significant
  my.nums.bd.changes <- c(length(my.decreased.de2)+length(my.increased.de2),length(my.decreased.de2), length(my.increased.de2))
  names(my.nums.bd.changes) <- c("All changed domains","decreased domains","increased domains")
  print(my.nums.bd.changes)
  
}
