# my.NPCs.data <- read.table("/Volumes/MyBook_3/BD_aging_project/ChIP-seq/NPC_cultures/New_rep_5_6/Combined_height/NPC_combined_Diffbind_matrix_concentration_forS2norm.txt",header=T,sep="\t")
# my.cols <- c(4:5,10:11,6:7,12:13,8:9,14:15) # reorder by age group
# my.NPCs <- my.NPCs.data[,my.cols]
#
# # generate unique rownames based on peak coordinates
# rownames(my.NPCs) <- paste(my.NPCs.data[,1],my.NPCs.data[,2],my.NPCs.data[,3],sep="-")
# colnames(my.NPCs) <- c("3m1","3m2","3m5","3m6","12m1","12m2","12m5","12m6","29m1","29m2","29m5","29m6")
#
# my.batch.per.seq <- factor(paste("experiment",rep(c(1,1,2,2),3),sep=""))
#
# # process data and save RData NPCsject
# my.NPCs.height.process.batch <- process_aging_height_batch("NPCs", my.NPCs,
#                                                      "/Volumes/MyBook_3/BD_aging_project/ChIP-seq/NPC_cultures/New_rep_5_6/Combined_height/HOMER_NPC_combined_Diffbind_peaks_coord.xls",
#                                                      my.batch.per.seq)
# save(my.NPCs.height.process.batch, file="H3K4me3_height_result_NPCs-combined_2016-06-01.RData")
# # All changed domains   decreased domains   increased domains
# # 12                   9                   3
# #####################################################################################
options(stringsAsFactors = FALSE)
# load libraries for analysis
library(DESeq2)
library(pheatmap)
library('pvclust')
library(Biobase)
library(limma)
#######################################################################################################################################
## This function takes in the height matrix and processes it through DEseq modeling with respect to age in months
# INPUT: my.matrix: count matrix from subreads, columns selected, rownames implemented
#        reps.3, reps.12, reps.29: replicates from each age, default is 3
#        my.tissue: name of the tissue
# OUTPUT: list of DESeq result object,with annotated genenames
# my.tissue <- "Liver"
# my.matrix <- my.liver
# my.annot <- "../Diffbind/2018-09-13_Liver_H3K4me3_aging_diffbind.Repeat_ANNOT.txt"
# reps.3=2
# reps.12=2
# reps.29=2
process_aging_height_TE <- function(my.tissue, my.matrix, my.annot, reps.3=2, reps.12=2, reps.29=2) {
# the Diffbind has an offset in the name compared to HOMER (the beginning of diffbind is -1 HOMER)
# put the -1 to be able to compare
my.peak.annot <- read.csv(my.annot,header=F,sep="\t")
colnames(my.peak.annot) <- c("Diffbind_Peak","Elem_Unique_Name","Overlap","RepeatFamily", "Repeat_coord")
my.matrix$Diffbind_Peak <- rownames(my.matrix)
# make it so gene name is there
my.merged <- merge(my.matrix,my.peak.annot[,c("Diffbind_Peak","Elem_Unique_Name","RepeatFamily")],by="Diffbind_Peak")
ncols <- dim(my.merged)[2]
# collapse repeats per family (lack of precision of single ended chip-seq)
# "Diffbind_Peak"    "3m1" "3m2" "12m1" "12m2" "29m1" "29m2" "Elem_Unique_Name" "RepeatFamily"
my.summed_repeats <- aggregate(my.merged[,-c(1,(ncols-1):ncols)],by=list(my.merged$RepeatFamily),FUN=sum)
colnames(my.summed_repeats)[1] <- "Repeat"
my.clean.names <-  gsub('(', "_", my.summed_repeats$Repeat, fixed = T)
my.clean.names <-  gsub(')', "_", my.clean.names, fixed = T)
rownames(my.summed_repeats) <- my.clean.names
# get output file prefix
my.outprefix <- paste(Sys.Date(),my.tissue,"DESeq2_LINEAR_model_with_age_H3K4me3_height_Repeats",sep="_")
my.filtered.matrix <- my.summed_repeats[,-1] # for compatibility, remove the PeakName and coordinates column
age <- as.numeric(c(rep(3,reps.3),rep(12,reps.12),rep(29,reps.29) )) # age in months
# design matrix
dataDesign = data.frame( row.names = colnames( my.filtered.matrix ), age = age )
# get matrix using age as a modeling covariate
dds <- DESeqDataSetFromMatrix(countData = round(my.filtered.matrix), # round the normalized counts for DESeq2
colData = dataDesign,
design = ~ age)
# run DESeq normalizations and export results
dds.deseq <- DESeq(dds)
res.linear <- results(dds.deseq, name= "age") # added the name of the tested variable: doesn't seem to be taken correctly by default for FC
# plot dispersion
my.disp.out <- paste(my.outprefix,"_dispersion_plot.pdf",sep="")
pdf(my.disp.out)
plotDispEsts(dds.deseq)
dev.off()
# normalized expression value
tissue.cts <- log2( counts(dds.deseq, normalize = TRUE) + 0.01) # log2 norm of height
colnames(tissue.cts) <- c(paste("3m",1:reps.3,sep=""),paste("12m",1:reps.12,sep=""),paste("29m",1:reps.29,sep=""))
# expression range
my.exp.out <- paste(my.outprefix,"_Normalized_height_boxplot.pdf",sep="")
pdf(my.exp.out)
boxplot(tissue.cts,col=c(rep("coral",reps.3),rep("blueviolet",reps.12),rep("dodgerblue",reps.29)),
cex=0.5,ylab="Log2 DESeq2/Diffbind Normalized counts", main = my.tissue)
dev.off()
### get the heatmap of aging changes at FDR5
## exclude NA
my.nas.bool <- is.na(res.linear$padj)
res.linear <- res.linear[!is.na(res.linear$padj),]
genes.aging <- rownames(res.linear)[res.linear$padj < 0.05]
my.num.aging <- length(genes.aging)
if (my.num.aging > 0) {
# heatmap drawing - only if there is at least one gene
my.heatmap.out <- paste(my.outprefix,"_Heatmap_significant_domains.pdf",sep="")
pdf(my.heatmap.out)
my.heatmap.title <- paste(my.tissue," aging H3K4me3 significant (FDR<5%), ",my.num.aging, " domains",sep="")
pheatmap(my.filtered.matrix[genes.aging,], # show called height, not DESeq norm height
cluster_cols = F,
cluster_rows = T,
colorRampPalette(rev(c("#CC3333","#FF9999","#FFCCCC","white","#CCCCFF","#9999FF","#333399")))(50),
show_rownames = T, scale="row",
main = my.heatmap.title, cellwidth = 30, border=NA)
dev.off()
}
# remove NAs
my.filtered.matrix <-  my.filtered.matrix[!my.nas.bool,]
my.summed_repeats <-  my.summed_repeats[!my.nas.bool,]
# get output files
## for all changes
my.decreased.de2 <- intersect(which(res.linear$padj < 0.05),which(res.linear$log2FoldChange < 0))
my.increased.de2 <- intersect(which(res.linear$padj < 0.05),which(res.linear$log2FoldChange > 0))
my.lost.out <- paste(my.outprefix,"_height_LINEAR_decreased_FDR5.xls",sep="")
my.gained.out <- paste(my.outprefix,"_height_LINEAR_increased_FDR5.xls",sep="")
write.table(cbind(my.summed_repeats$Repeat,res.linear)[my.decreased.de2,], file=my.lost.out,sep="\t",quote=F,row.names=F,col.names=T)
write.table(cbind(my.summed_repeats$Repeat,res.linear)[my.increased.de2,], file=my.gained.out,sep="\t",quote=F,row.names=F,col.names=T)
# show significant
my.nums.bd.changes <- c(length(my.decreased.de2)+length(my.increased.de2),length(my.decreased.de2), length(my.increased.de2))
names(my.nums.bd.changes) <- c("All changed domains","decreased domains","increased domains")
print(my.nums.bd.changes)
}
#
# #######################################################################################################################################
# #######################################################################################################################################
# #######################################################################################################################################
# # # same but with batch modeling for NPC H3K4me3
process_aging_height_TE_batch <- function(my.tissue, my.matrix, my.annot, my.batch, reps.3=4, reps.12=4, reps.29=4) {
# the Diffbind has an offset in the name compared to HOMER (the beginning of diffbind is -1 HOMER)
# put the -1 to be able to compare
my.peak.annot <- read.csv(my.annot,header=F,sep="\t")
colnames(my.peak.annot) <- c("Diffbind_Peak","Elem_Unique_Name","Overlap","RepeatFamily", "Repeat_coord")
my.matrix$Diffbind_Peak <- rownames(my.matrix)
# make it so gene name is there
my.merged <- merge(my.matrix,my.peak.annot[,c("Diffbind_Peak","Elem_Unique_Name","RepeatFamily")],by="Diffbind_Peak")
ncols <- dim(my.merged)[2]
# collapse repeats per family (lack of precision of single ended chip-seq)
# "Diffbind_Peak"    "3m1" "3m2" "12m1" "12m2" "29m1" "29m2" "Elem_Unique_Name" "RepeatFamily"
my.summed_repeats <- aggregate(my.merged[,-c(1,(ncols-1):ncols)],by=list(my.merged$RepeatFamily),FUN=sum)
colnames(my.summed_repeats)[1] <- "Repeat"
my.clean.names <-  gsub('(', "_", my.summed_repeats$Repeat, fixed = T)
my.clean.names <-  gsub(')', "_", my.clean.names, fixed = T)
rownames(my.summed_repeats) <- my.clean.names
# get output file prefix
my.outprefix <- paste(Sys.Date(),my.tissue,"DESeq2_LINEAR_model_with_age_H3K4me3_height_Repeats",sep="_")
my.filtered.matrix <- my.summed_repeats[,-1] # for compatibility, remove the PeakName and coordinates column
age <- as.numeric(c(rep(3,reps.3),rep(12,reps.12),rep(29,reps.29) )) # age in months
# design matrix
dataDesign = data.frame( row.names = colnames( my.filtered.matrix ), age = age , batch=my.batch)
# get matrix using age as a modeling covariate
dds <- DESeqDataSetFromMatrix(countData = round(my.filtered.matrix), # round the normalized counts for DESeq2
colData = dataDesign,
design = ~ age + age:batch + batch)
# run DESeq normalizations and export results
dds.deseq <- DESeq(dds)
res.linear <- results(dds.deseq, name= "age") # added the name of the tested variable: doesn't seem to be taken correctly by default for FC
# plot dispersion
my.disp.out <- paste(my.outprefix,"_dispersion_plot.pdf",sep="")
pdf(my.disp.out)
plotDispEsts(dds.deseq)
dev.off()
# normalized expression value
tissue.cts <- log2( counts(dds.deseq, normalize = TRUE) + 0.01) # log2 norm of height
colnames(tissue.cts) <- c(paste("3m",1:reps.3,sep=""),paste("12m",1:reps.12,sep=""),paste("29m",1:reps.29,sep=""))
# regress out the non age variance
full.model <- model.matrix(~ age + age:batch + batch, data = dataDesign) # all variables
fit <- lmFit(ExpressionSet(assayData=as.matrix(my.filtered.matrix)), full.model)
fit.eb <- eBayes(fit)
print(colnames(fit))
### Regress out batch
mod <- coefficients(fit)[,-c(1:2)] %*% t(fit$design[,-c(1:2)]) ### I keep only age and intercept
my.filtered.matrix.corrected <- my.filtered.matrix - mod
# expression range
my.exp.out <- paste(my.outprefix,"_Normalized_height_boxplot.pdf",sep="")
pdf(my.exp.out)
boxplot(tissue.cts,col=c(rep("coral",reps.3),rep("blueviolet",reps.12),rep("dodgerblue",reps.29)),
cex=0.5,ylab="Log2 DESeq2/Diffbind Normalized counts", main = my.tissue)
dev.off()
### get the heatmap of aging changes at FDR5
## exclude NA
my.nas.bool <- is.na(res.linear$padj)
res.linear <- res.linear[!is.na(res.linear$padj),]
genes.aging <- rownames(res.linear)[res.linear$padj < 0.05]
my.num.aging <- length(genes.aging)
if (my.num.aging > 0) {
# heatmap drawing - only if there is at least one gene
my.heatmap.out <- paste(my.outprefix,"_Heatmap_significant_domains.pdf",sep="")
pdf(my.heatmap.out)
my.heatmap.title <- paste(my.tissue," aging H3K4me3 significant (FDR<5%), ",my.num.aging, " domains",sep="")
pheatmap(my.filtered.matrix.corrected[genes.aging,], # show called height, not DESeq norm height
cluster_cols = F,
cluster_rows = T,
colorRampPalette(rev(c("#CC3333","#FF9999","#FFCCCC","white","#CCCCFF","#9999FF","#333399")))(50),
show_rownames = T, scale="row",
main = my.heatmap.title, cellwidth = 30, border=NA)
dev.off()
}
# remove NAs
my.filtered.matrix.corrected <-  my.filtered.matrix.corrected[!my.nas.bool,]
my.summed_repeats <-  my.summed_repeats[!my.nas.bool,]
# get output files
## for all changes
my.decreased.de2 <- intersect(which(res.linear$padj < 0.05),which(res.linear$log2FoldChange < 0))
my.increased.de2 <- intersect(which(res.linear$padj < 0.05),which(res.linear$log2FoldChange > 0))
my.lost.out <- paste(my.outprefix,"_height_LINEAR_decreased_FDR5.xls",sep="")
my.gained.out <- paste(my.outprefix,"_height_LINEAR_increased_FDR5.xls",sep="")
write.table(cbind(my.summed_repeats$Repeat,res.linear)[my.decreased.de2,], file=my.lost.out,sep="\t",quote=F,row.names=F,col.names=T)
write.table(cbind(my.summed_repeats$Repeat,res.linear)[my.increased.de2,], file=my.gained.out,sep="\t",quote=F,row.names=F,col.names=T)
# show significant
my.nums.bd.changes <- c(length(my.decreased.de2)+length(my.increased.de2),length(my.decreased.de2), length(my.increased.de2))
names(my.nums.bd.changes) <- c("All changed domains","decreased domains","increased domains")
print(my.nums.bd.changes)
}
my.NPCs.data <- read.table("../Diffbind/2018-09-13_NSPC_H3K4me3_aging_Repeats_DiffBind_normalized_counts.txt",header=T,sep="\t")
colnames(my.NPCs.data)
my.cols <- c(4:5,10:11,6:7,12:13,8:9,14:15) # reorder by age group
my.NPCs <- my.NPCs.data[,my.cols]
colnames(my.NPCs)
# generate unique rownames based on peak coordinates
rownames(my.NPCs) <- paste(my.NPCs.data[,1],my.NPCs.data[,2],my.NPCs.data[,3],sep="-")
colnames(my.NPCs) <- c("3m1","3m2","3m5","3m6","12m1","12m2","12m5","12m6","29m1","29m2","29m5","29m6")
my.batch.per.seq <- factor(paste("experiment",rep(c(1,1,2,2),3),sep=""))
process_aging_height_TE_batch("NPCs", my.NPCs,"../Diffbind/2018-09-13_NSPC_H3K4me3_aging_diffbind.Repeat_ANNOT.txt",my.batch.per.seq)
my.NPCs.data <- read.table("../Diffbind/2018-09-13_NSPC_H3K27ac_aging_Repeats_DiffBind_normalized_counts.txt",header=T,sep="\t")
my.NPCs.data
setwd('/Volumes/BB_Backup_3/BD_aging_project/2018-09_revision_analyses/TE_chromatin/RepeatMasker/H3K27ac_TEs')
source('height_analysis_functions_linear_K27ac_vREPEATS.R')
setwd('/Volumes/BB_Backup_3/BD_aging_project/2018-09_revision_analyses/TE_chromatin/RepeatMasker/H3K27ac_TEs')
source('height_analysis_functions_linear_K27ac_vREPEATS.R')
# 2018-09-13
# Run for repetitive elements
####################################    Liver    ####################################
# read in Diffbind vector matrix
my.liver.data <- read.table("../Diffbind/2018-09-13_Liver_H3K27ac_aging_Repeats_DiffBind_normalized_counts.txt",header=T,sep="\t")
my.liver <- my.liver.data[,4:9]
# generate unique rownames based on peak coordinates
rownames(my.liver) <- paste(my.liver.data[,1],my.liver.data[,2],my.liver.data[,3],sep="-")
colnames(my.liver) <- c("3m1","3m2","12m1","12m2","29m1","29m2")
# process data
process_aging_height_TE("Liver", my.liver, "../Diffbind/2018-09-13_Liver_H3K27ac_aging_diffbind.Repeat_ANNOT.txt")
# All changed domains   decreased domains   increased domains
# 39                  17                  22
#####################################################################################
####################################    heart    ####################################
# read in Diffbind vector matrix
my.heart.data <- read.table("../Diffbind/2018-09-13_Heart_H3K27ac_aging_Repeats_DiffBind_normalized_counts.txt",header=T,sep="\t")
my.heart <- my.heart.data[,4:9]
# generate unique rownames based on peak coordinates
rownames(my.heart) <- paste(my.heart.data[,1],my.heart.data[,2],my.heart.data[,3],sep="-")
colnames(my.heart) <- c("3m1","3m2","12m1","12m2","29m1","29m2")
# process data
process_aging_height_TE("heart", my.heart, "../Diffbind/2018-09-13_Heart_H3K27ac_aging_diffbind.Repeat_ANNOT.txt")
# All changed domains   decreased domains   increased domains
# 7                   0                   7
#####################################################################################
####################################    cerebellum    ####################################
# read in Diffbind vector matrix
my.cerebellum.data <- read.table("../Diffbind/2018-09-13_Cerebellum_H3K27ac_aging_Repeats_DiffBind_normalized_counts.txt",header=T,sep="\t")
my.cerebellum <- my.cerebellum.data[,4:9]
# generate unique rownames based on peak coordinates
rownames(my.cerebellum) <- paste(my.cerebellum.data[,1],my.cerebellum.data[,2],my.cerebellum.data[,3],sep="-")
colnames(my.cerebellum) <- c("3m1","3m2","12m1","12m2","29m1","29m2")
# process data
process_aging_height_TE("cerebellum", my.cerebellum, "../Diffbind/2018-09-13_Cerebellum_H3K27ac_aging_diffbind.Repeat_ANNOT.txt")
# All changed domains   decreased domains   increased domains
# 5                   5                   0
#####################################################################################
####################################    OB    ####################################
# read in Diffbind vector matrix
my.OB.data <- read.table("../Diffbind/2018-09-13_OB_H3K27ac_aging_Repeats_DiffBind_normalized_counts.txt",header=T,sep="\t")
my.OB <- my.OB.data[,4:9]
# generate unique rownames based on peak coordinates
rownames(my.OB) <- paste(my.OB.data[,1],my.OB.data[,2],my.OB.data[,3],sep="-")
colnames(my.OB) <- c("3m1","3m2","12m1","12m2","29m1","29m2")
# process data
process_aging_height_TE("OB", my.OB, "../Diffbind/2018-09-13_OB_H3K27ac_aging_diffbind.Repeat_ANNOT.txt")
# All changed domains   decreased domains   increased domains
# 0                   0                   0
#####################################################################################
####################################    NPCs    ####################################
# read in Diffbind vector matrix
my.NPCs.data <- read.table("../Diffbind/2018-09-13_NSPC_H3K27ac_aging_Repeats_DiffBind_normalized_counts.txt",header=T,sep="\t")
my.NPCs <- my.NPCs.data[,4:9]
# generate unique rownames based on peak coordinates
rownames(my.NPCs) <- paste(my.NPCs.data[,1],my.NPCs.data[,2],my.NPCs.data[,3],sep="-")
colnames(my.NPCs) <- c("3m1","3m2","12m1","12m2","29m1","29m2")
# process data and save RData NPCsject
process_aging_height_TE("NPCs", my.NPCs,"../Diffbind/2018-09-13_NSPC_H3K27ac_aging_diffbind.Repeat_ANNOT.txt")
# All changed domains   decreased domains   increased domains
# 0                   0                   0
#####################################################################################
options(stringsAsFactors = FALSE)
# load libraries for analysis
library(DESeq2)
library(pheatmap)
library('pvclust')
library(Biobase)
library(limma)
#######################################################################################################################################
## This function takes in the height matrix and processes it through DEseq modeling with respect to age in months
# INPUT: my.matrix: count matrix from subreads, columns selected, rownames implemented
#        reps.3, reps.12, reps.29: replicates from each age, default is 3
#        my.tissue: name of the tissue
# OUTPUT: list of DESeq result object,with annotated genenames
# my.tissue <- "Liver"
# my.matrix <- my.liver
# my.annot <- "../Diffbind/2018-09-13_Liver_H3K27ac_aging_diffbind.Repeat_ANNOT.txt"
# reps.3=2
# reps.12=2
# reps.29=2
process_aging_height_TE <- function(my.tissue, my.matrix, my.annot, reps.3=2, reps.12=2, reps.29=2) {
# the Diffbind has an offset in the name compared to HOMER (the beginning of diffbind is -1 HOMER)
# put the -1 to be able to compare
my.peak.annot <- read.csv(my.annot,header=F,sep="\t")
colnames(my.peak.annot) <- c("Diffbind_Peak","Elem_Unique_Name","Overlap","RepeatFamily", "Repeat_coord")
my.matrix$Diffbind_Peak <- rownames(my.matrix)
# make it so gene name is there
my.merged <- merge(my.matrix,my.peak.annot[,c("Diffbind_Peak","Elem_Unique_Name","RepeatFamily")],by="Diffbind_Peak")
ncols <- dim(my.merged)[2]
# collapse repeats per family (lack of precision of single ended chip-seq)
# "Diffbind_Peak"    "3m1" "3m2" "12m1" "12m2" "29m1" "29m2" "Elem_Unique_Name" "RepeatFamily"
my.summed_repeats <- aggregate(my.merged[,-c(1,(ncols-1):ncols)],by=list(my.merged$RepeatFamily),FUN=sum)
colnames(my.summed_repeats)[1] <- "Repeat"
my.clean.names <-  gsub('(', "_", my.summed_repeats$Repeat, fixed = T)
my.clean.names <-  gsub(')', "_", my.clean.names, fixed = T)
rownames(my.summed_repeats) <- my.clean.names
# get output file prefix
my.outprefix <- paste(Sys.Date(),my.tissue,"DESeq2_LINEAR_model_with_age_H3K27ac_height_Repeats",sep="_")
my.filtered.matrix <- my.summed_repeats[,-1] # for compatibility, remove the PeakName and coordinates column
age <- as.numeric(c(rep(3,reps.3),rep(12,reps.12),rep(29,reps.29) )) # age in months
# design matrix
dataDesign = data.frame( row.names = colnames( my.filtered.matrix ), age = age )
# get matrix using age as a modeling covariate
dds <- DESeqDataSetFromMatrix(countData = round(my.filtered.matrix), # round the normalized counts for DESeq2
colData = dataDesign,
design = ~ age)
# run DESeq normalizations and export results
dds.deseq <- DESeq(dds)
res.linear <- results(dds.deseq, name= "age") # added the name of the tested variable: doesn't seem to be taken correctly by default for FC
# plot dispersion
my.disp.out <- paste(my.outprefix,"_dispersion_plot.pdf",sep="")
pdf(my.disp.out)
plotDispEsts(dds.deseq)
dev.off()
# normalized expression value
tissue.cts <- log2( counts(dds.deseq, normalize = TRUE) + 0.01) # log2 norm of height
colnames(tissue.cts) <- c(paste("3m",1:reps.3,sep=""),paste("12m",1:reps.12,sep=""),paste("29m",1:reps.29,sep=""))
# expression range
my.exp.out <- paste(my.outprefix,"_Normalized_height_boxplot.pdf",sep="")
pdf(my.exp.out)
boxplot(tissue.cts,col=c(rep("coral",reps.3),rep("blueviolet",reps.12),rep("dodgerblue",reps.29)),
cex=0.5,ylab="Log2 DESeq2/Diffbind Normalized counts", main = my.tissue)
dev.off()
### get the heatmap of aging changes at FDR5
## exclude NA
my.nas.bool <- is.na(res.linear$padj)
res.linear <- res.linear[!is.na(res.linear$padj),]
genes.aging <- rownames(res.linear)[res.linear$padj < 0.05]
my.num.aging <- length(genes.aging)
if (my.num.aging > 02 - only if there is at least one gene
my.heatmap.out <- paste(my.outprefix,"_Heatmap_significant_domains.pdf",sep="")
pdf(my.heatmap.out)
my.heatmap.title <- paste(my.tissue," aging H3K27ac significant (FDR<5%), ",my.num.aging, " domains",sep="")
pheatmap(my.filtered.matrix[genes.aging,], # show called height, not DESeq norm height
cluster_cols = F,
cluster_rows = T,
colorRampPalette(rev(c("#CC3333","#FF9999","#FFCCCC","white","#CCCCFF","#9999FF","#333399")))(50),
show_rownames = T, scale="row",
main = my.heatmap.title, cellwidth = 30, border=NA)
dev.off()
}
# remove NAs
my.filtered.matrix <-  my.filtered.matrix[!my.nas.bool,]
my.summed_repeats <-  my.summed_repeats[!my.nas.bool,]
# get output files
## for all changes
my.decreased.de2 <- intersect(which(res.linear$padj < 0.05),which(res.linear$log2FoldChange < 0))
my.increased.de2 <- intersect(which(res.linear$padj < 0.05),which(res.linear$log2FoldChange > 0))
my.lost.out <- paste(my.outprefix,"_height_LINEAR_decreased_FDR5.xls",sep="")
my.gained.out <- paste(my.outprefix,"_height_LINEAR_increased_FDR5.xls",sep="")
write.table(cbind(my.summed_repeats$Repeat,res.linear)[my.decreased.de2,], file=my.lost.out,sep="\t",quote=F,row.names=F,col.names=T)
write.table(cbind(my.summed_repeats$Repeat,res.linear)[my.increased.de2,], file=my.gained.out,sep="\t",quote=F,row.names=F,col.names=T)
# show significant
my.nums.bd.changes <- c(length(my.decreased.de2)+length(my.increased.de2),length(my.decreased.de2), length(my.increased.de2))
names(my.nums.bd.changes) <- c("All changed domains","decreased domains","increased domains")
print(my.nums.bd.changes)
}
options(stringsAsFactors = FALSE)
# load libraries for analysis
library(DESeq2)
library(pheatmap)
library('pvclust')
library(Biobase)
library(limma)
#######################################################################################################################################
## This function takes in the height matrix and processes it through DEseq modeling with respect to age in months
# INPUT: my.matrix: count matrix from subreads, columns selected, rownames implemented
#        reps.3, reps.12, reps.29: replicates from each age, default is 3
#        my.tissue: name of the tissue
# OUTPUT: list of DESeq result object,with annotated genenames
# my.tissue <- "Liver"
# my.matrix <- my.liver
# my.annot <- "../Diffbind/2018-09-13_Liver_H3K27ac_aging_diffbind.Repeat_ANNOT.txt"
# reps.3=2
# reps.12=2
# reps.29=2
process_aging_height_TE <- function(my.tissue, my.matrix, my.annot, reps.3=2, reps.12=2, reps.29=2) {
# the Diffbind has an offset in the name compared to HOMER (the beginning of diffbind is -1 HOMER)
# put the -1 to be able to compare
my.peak.annot <- read.csv(my.annot,header=F,sep="\t")
colnames(my.peak.annot) <- c("Diffbind_Peak","Elem_Unique_Name","Overlap","RepeatFamily", "Repeat_coord")
my.matrix$Diffbind_Peak <- rownames(my.matrix)
# make it so gene name is there
my.merged <- merge(my.matrix,my.peak.annot[,c("Diffbind_Peak","Elem_Unique_Name","RepeatFamily")],by="Diffbind_Peak")
ncols <- dim(my.merged)[2]
# collapse repeats per family (lack of precision of single ended chip-seq)
# "Diffbind_Peak"    "3m1" "3m2" "12m1" "12m2" "29m1" "29m2" "Elem_Unique_Name" "RepeatFamily"
my.summed_repeats <- aggregate(my.merged[,-c(1,(ncols-1):ncols)],by=list(my.merged$RepeatFamily),FUN=sum)
colnames(my.summed_repeats)[1] <- "Repeat"
my.clean.names <-  gsub('(', "_", my.summed_repeats$Repeat, fixed = T)
my.clean.names <-  gsub(')', "_", my.clean.names, fixed = T)
rownames(my.summed_repeats) <- my.clean.names
# get output file prefix
my.outprefix <- paste(Sys.Date(),my.tissue,"DESeq2_LINEAR_model_with_age_H3K27ac_height_Repeats",sep="_")
my.filtered.matrix <- my.summed_repeats[,-1] # for compatibility, remove the PeakName and coordinates column
age <- as.numeric(c(rep(3,reps.3),rep(12,reps.12),rep(29,reps.29) )) # age in months
# design matrix
dataDesign = data.frame( row.names = colnames( my.filtered.matrix ), age = age )
# get matrix using age as a modeling covariate
dds <- DESeqDataSetFromMatrix(countData = round(my.filtered.matrix), # round the normalized counts for DESeq2
colData = dataDesign,
design = ~ age)
# run DESeq normalizations and export results
dds.deseq <- DESeq(dds)
res.linear <- results(dds.deseq, name= "age") # added the name of the tested variable: doesn't seem to be taken correctly by default for FC
# plot dispersion
my.disp.out <- paste(my.outprefix,"_dispersion_plot.pdf",sep="")
pdf(my.disp.out)
plotDispEsts(dds.deseq)
dev.off()
# normalized expression value
tissue.cts <- log2( counts(dds.deseq, normalize = TRUE) + 0.01) # log2 norm of height
colnames(tissue.cts) <- c(paste("3m",1:reps.3,sep=""),paste("12m",1:reps.12,sep=""),paste("29m",1:reps.29,sep=""))
# expression range
my.exp.out <- paste(my.outprefix,"_Normalized_height_boxplot.pdf",sep="")
pdf(my.exp.out)
boxplot(tissue.cts,col=c(rep("coral",reps.3),rep("blueviolet",reps.12),rep("dodgerblue",reps.29)),
cex=0.5,ylab="Log2 DESeq2/Diffbind Normalized counts", main = my.tissue)
dev.off()
### get the heatmap of aging changes at FDR5
## exclude NA
my.nas.bool <- is.na(res.linear$padj)
res.linear <- res.linear[!is.na(res.linear$padj),]
genes.aging <- rownames(res.linear)[res.linear$padj < 0.05]
my.num.aging <- length(genes.aging)
if (my.num.aging > 1) {
# heatmap drawing - only if there is at least one gene
my.heatmap.out <- paste(my.outprefix,"_Heatmap_significant_domains.pdf",sep="")
pdf(my.heatmap.out)
my.heatmap.title <- paste(my.tissue," aging H3K27ac significant (FDR<5%), ",my.num.aging, " domains",sep="")
pheatmap(my.filtered.matrix[genes.aging,], # show called height, not DESeq norm height
cluster_cols = F,
cluster_rows = T,
colorRampPalette(rev(c("#CC3333","#FF9999","#FFCCCC","white","#CCCCFF","#9999FF","#333399")))(50),
show_rownames = T, scale="row",
main = my.heatmap.title, cellwidth = 30, border=NA)
dev.off()
}
# remove NAs
my.filtered.matrix <-  my.filtered.matrix[!my.nas.bool,]
my.summed_repeats <-  my.summed_repeats[!my.nas.bool,]
# get output files
## for all changes
my.decreased.de2 <- intersect(which(res.linear$padj < 0.05),which(res.linear$log2FoldChange < 0))
my.increased.de2 <- intersect(which(res.linear$padj < 0.05),which(res.linear$log2FoldChange > 0))
my.lost.out <- paste(my.outprefix,"_height_LINEAR_decreased_FDR5.xls",sep="")
my.gained.out <- paste(my.outprefix,"_height_LINEAR_increased_FDR5.xls",sep="")
write.table(cbind(my.summed_repeats$Repeat,res.linear)[my.decreased.de2,], file=my.lost.out,sep="\t",quote=F,row.names=F,col.names=T)
write.table(cbind(my.summed_repeats$Repeat,res.linear)[my.increased.de2,], file=my.gained.out,sep="\t",quote=F,row.names=F,col.names=T)
# show significant
my.nums.bd.changes <- c(length(my.decreased.de2)+length(my.increased.de2),length(my.decreased.de2), length(my.increased.de2))
names(my.nums.bd.changes) <- c("All changed domains","decreased domains","increased domains")
print(my.nums.bd.changes)
}
####################################    cerebellum    ####################################
# read in Diffbind vector matrix
my.cerebellum.data <- read.table("../Diffbind/2018-09-13_Cerebellum_H3K27ac_aging_Repeats_DiffBind_normalized_counts.txt",header=T,sep="\t")
my.cerebellum <- my.cerebellum.data[,4:9]
# generate unique rownames based on peak coordinates
rownames(my.cerebellum) <- paste(my.cerebellum.data[,1],my.cerebellum.data[,2],my.cerebellum.data[,3],sep="-")
colnames(my.cerebellum) <- c("3m1","3m2","12m1","12m2","29m1","29m2")
# process data
process_aging_height_TE("cerebellum", my.cerebellum, "../Diffbind/2018-09-13_Cerebellum_H3K27ac_aging_diffbind.Repeat_ANNOT.txt")
# All changed domains   decreased domains   increased domains
# 135                  65                  70
#####################################################################################
####################################    OB    ####################################
# read in Diffbind vector matrix
my.OB.data <- read.table("../Diffbind/2018-09-13_OB_H3K27ac_aging_Repeats_DiffBind_normalized_counts.txt",header=T,sep="\t")
my.OB <- my.OB.data[,4:9]
# generate unique rownames based on peak coordinates
rownames(my.OB) <- paste(my.OB.data[,1],my.OB.data[,2],my.OB.data[,3],sep="-")
colnames(my.OB) <- c("3m1","3m2","12m1","12m2","29m1","29m2")
# process data
process_aging_height_TE("OB", my.OB, "../Diffbind/2018-09-13_OB_H3K27ac_aging_diffbind.Repeat_ANNOT.txt")
# All changed domains   decreased domains   increased domains
# 0                   0                   0
#####################################################################################
####################################    NPCs    ####################################
# read in Diffbind vector matrix
my.NPCs.data <- read.table("../Diffbind/2018-09-13_NSPC_H3K27ac_aging_Repeats_DiffBind_normalized_counts.txt",header=T,sep="\t")
my.NPCs <- my.NPCs.data[,4:9]
# generate unique rownames based on peak coordinates
rownames(my.NPCs) <- paste(my.NPCs.data[,1],my.NPCs.data[,2],my.NPCs.data[,3],sep="-")
colnames(my.NPCs) <- c("3m1","3m2","12m1","12m2","29m1","29m2")
# process data and save RData NPCsject
process_aging_height_TE("NPCs", my.NPCs,"../Diffbind/2018-09-13_NSPC_H3K27ac_aging_diffbind.Repeat_ANNOT.txt")
# All changed domains   decreased domains   increased domains
# 0                   0                   0
