# Work from the directory that holds the kallisto results; the relative paths
# used further down (input table and generated files) resolve against it.
setwd('/Volumes/MyBook_3/BD_aging_project/Public_datasets/Liver_files_for_BB/kallisto_results')
# Keep strings as character vectors (no automatic factor conversion).
options(stringsAsFactors = FALSE)

# 2017-05-15
# analyze Fish Liver RNAseq
# using param's kallisto runs

library(DESeq2)
library(pheatmap)
library('pvclust')

# Read the kallisto mapping table (tab-delimited, with a header row).
my.data <- read.csv("2017-05-10_Liver_Nfur_aging_kallisto_mapping.txt",
                    sep = "\t", header = TRUE)

# Sum the per-transcript values of columns 6:30 within each gene (ENS_GID),
# so that DESeq2 is run on genes rather than on transcripts.
my.data.per.gene <- aggregate(my.data[, 6:30],
                              by = list(my.data$ENS_GID),
                              FUN = sum)

# Round counts (DESeq2 needs integers). Column 1 is the grouping key created
# by aggregate(); columns 2:26 carry the aggregated values.
my.data.per.gene[, 2:26] <- round(my.data.per.gene[, 2:26])
rownames(my.data.per.gene) <- my.data.per.gene$Group.1
colnames(my.data.per.gene)[1] <- 'GeneName'

# Flag the genes with (almost) no reads: total over all columns <= 5
# (see the DESeq2 vignette).
my.null <- which(rowSums(my.data.per.gene[, 2:26]) <= 5)

# Drop the flagged genes through a logical keep-mask instead of `-my.null`:
# when no gene is flagged, my.null is integer(0) and x[-integer(0), ] would
# select zero rows rather than all of them.
my.keep <- rep(TRUE, nrow(my.data.per.gene))
my.keep[my.null] <- FALSE
my.filtered.matrix <- my.data.per.gene[my.keep, 2:26] # 23634 genes

# Age in weeks: five values, each repeated five times, paired positionally
# with the columns of the filtered count matrix.
my.age <- rep(c(5, 12, 20, 27, 39), each = 5)

# Sample annotation table: one row per count-matrix column, age as the only
# covariate.
dataDesign <- data.frame(age = my.age,
                         row.names = colnames(my.filtered.matrix))

# Build the DESeq2 data set with age as a numeric modeling covariate.
dds <- DESeqDataSetFromMatrix(
  countData = my.filtered.matrix,
  colData = dataDesign,
  design = ~ age
)

# Run the DESeq2 pipeline on the data set.
dds.deseq <- DESeq(dds)

# Extract the statistics for the tested variable, selected by name.
res <- results(dds.deseq, name = "age")

# Plot the dispersion estimates to a dated PDF.
# paste0() is used because paste() separates its arguments with a space by
# default, which put a space between the date and the rest of the file name.
my.disp.out <- paste0(Sys.Date(), "_dispersion_plot.pdf")

pdf(my.disp.out)
plotDispEsts(dds.deseq)
dev.off()

# Build sample labels: age, "w", then the running sample index 1..25
# (e.g. "5w1", "12w6").
my.sample.names <- paste0(my.age, "w", 1:25)

# Log2 of the DESeq2-normalized counts; the 0.01 offset keeps zero counts
# finite. The argument is spelled `normalized` in full rather than relying on
# partial matching of `normalize`.
tissue.cts <- log2(counts(dds.deseq, normalized = TRUE) + 0.01)
colnames(tissue.cts) <- my.sample.names

# Classical MDS on the sample-to-sample dissimilarity 1 - Spearman correlation,
# keeping the first two dimensions.
my.sample.dissim <- 1 - cor(tissue.cts, method = "spearman")
mds.result <- cmdscale(my.sample.dissim, k = 2, eig = FALSE, add = FALSE, x.ret = FALSE)
x <- mds.result[, 1]
y <- mds.result[, 2]

# Five-step color gradient, one color per age value; every color is repeated
# five times so that my.colors lines up with the 25 samples.
my.palette <- colorRampPalette(c("coral", "blueviolet", "dodgerblue"))(5)
my.colors <- rep(my.palette, each = 5)

my.mds.out <- paste(Sys.Date(), "killifish_liver_aging_analysis_MDS_plot.pdf", sep = "_")

pdf(my.mds.out)
# Open circles first, then filled colored dots on top, then small grey labels
# below each point.
plot(x, y,
     main = "Multi-dimensional Scaling",
     xlab = "MDS dimension 1",
     ylab = "MDS dimension 2",
     cex = 2)
points(x, y, col = my.colors, pch = 16, cex = 2)
text(x, y, labels = my.sample.names, col = "grey", cex = 0.5, pos = 1)
legend("topleft",
       legend = c("5w", "12w", "20w", "27w", "39w"),
       col = my.palette,
       pch = 16, bty = 'n', pt.cex = 2)
dev.off()


#### PCA #### 
# Keep only the genes whose log2 normalized expression varies across samples;
# a constant gene cannot be rescaled to unit variance.
my.pos.var <- apply(tissue.cts,1,var) >0
# do PCA analysis: samples as rows, genes as centered and scaled variables.
# `scale.` is spelled in full instead of relying on partial matching of `scale`.
my.pca <- prcomp(t(tissue.cts[my.pos.var,]), scale. = TRUE)
x <- my.pca$x[,1]
y <- my.pca$x[,2]
z <- my.pca$x[,3]

my.summary <- summary(my.pca)
# Importance of components:
# PC1     PC2      PC3      PC4      PC5      PC6      PC7      PC8      PC9     PC10     PC11     PC12     PC13     PC14     PC15     PC16     PC17     PC18     PC19     PC20     PC21
# Standard deviation     64.0411 59.2012 45.90342 41.60039 37.62856 35.71833 30.80461 30.31115 28.76348 26.37342 24.35950 24.10144 23.28181 22.55657 21.99715 21.30630 20.88535 20.40694 20.24318 19.55692 18.56820
# Proportion of Variance  0.1735  0.1483  0.08916  0.07322  0.05991  0.05398  0.04015  0.03887  0.03501  0.02943  0.02511  0.02458  0.02293  0.02153  0.02047  0.01921  0.01846  0.01762  0.01734  0.01618  0.01459
# Cumulative Proportion   0.1735  0.3218  0.41098  0.48421  0.54412  0.59810  0.63825  0.67712  0.71213  0.74156  0.76667  0.79125  0.81418  0.83571  0.85618  0.87539  0.89385  0.91147  0.92881  0.94499  0.95958

# Axis labels "PCk (xx.x%)": row 2 of the importance table is the proportion
# of variance of each component.
my.pc.labels <- paste0("PC", 1:3, " (",
                       round(100 * my.summary$importance[2, 1:3], 1), "%)")

# The date and the file name are joined with "_" (they were previously glued
# together without any separator).
my.pca.out <- paste(Sys.Date(), "killifish_liver_aging_PCA_plot.pdf", sep = "_")

pdf(my.pca.out)
par(mfrow=c(2,2))
# Panel 1: PC1 vs PC2 (with the age legend)
plot(x, y, pch = 1, cex = 3,
     xlab = my.pc.labels[1],
     ylab = my.pc.labels[2],
     cex.lab = 1.5, main = "PC1 vs. PC2")
points(x, y, pch = 16, cex = 3, col = my.colors)
text(x, y, my.sample.names, col = "grey", cex = 0.5, pos = 1)
legend("topleft", c("5w","12w","20w","27w","39w"), col = my.colors[c(1,6,11,16,21)], pch = 16, bty = 'n', pt.cex = 2)

# Panel 2: PC1 vs PC3
plot(x, z, pch = 1, cex = 3,
     xlab = my.pc.labels[1],
     ylab = my.pc.labels[3],
     cex.lab = 1.5, main = "PC1 vs. PC3")
points(x, z, pch = 16, cex = 3, col = my.colors)
text(x, z, my.sample.names, col = "grey", cex = 0.5, pos = 1)

# Panel 3: PC2 vs PC3 (the title previously read "PC1 vs. PC3")
plot(y, z, pch = 1, cex = 3,
     xlab = my.pc.labels[2],
     ylab = my.pc.labels[3],
     cex.lab = 1.5, main = "PC2 vs. PC3")
points(y, z, pch = 16, cex = 3, col = my.colors)
text(y, z, my.sample.names, col = "grey", cex = 0.5, pos = 1)
par(mfrow=c(1,1))
dev.off()
#### #### #### #### #### #### #### 

# Expression range: one box per sample of the log2 normalized counts,
# written to a dated PDF.
# paste0() is used because paste() separates its arguments with a space by
# default, which put a space between the date and the rest of the file name.
my.exp.out <- paste0(Sys.Date(), "_Normalized_counts_boxplot.pdf")

pdf(my.exp.out)
boxplot(tissue.cts, names = my.sample.names, col = my.colors, cex = 0.5,
        ylab = "Log2 DESeq2 Normalized counts", las = 2)
dev.off()

### get the heatmap of aging changes at FDR5
## exclude genes whose adjusted p-value is NA
res <- res[!is.na(res$padj),]

genes.aging <- rownames(res)[res$padj < 0.05]
my.num.aging <- length(genes.aging) # 2145

my.heatmap.out <- paste(Sys.Date(),"_Heatmap_significant_genes.pdf", sep = "_")
my.heatmap.title <- paste("Aging significant (FDR<5%), ",my.num.aging, " genes",sep="")

# heatmap drawing - only if there is at least one gene. Without this guard an
# empty gene set would open the PDF and then fail inside pheatmap(), leaving
# the graphics device open.
if (my.num.aging > 0) {
  pdf(my.heatmap.out, width = 8, height = 5, onefile = FALSE)
  # drop = FALSE keeps a one-gene selection as a matrix; rows are clustered
  # only when there are at least two of them. Samples stay in their original
  # column order and each gene is scaled across samples.
  pheatmap(tissue.cts[genes.aging, , drop = FALSE],
           cluster_cols = FALSE,
           cluster_rows = my.num.aging > 1,
           color = colorRampPalette(rev(c("#CC3333","#FF9999","#FFCCCC","white","#CCCCFF","#9999FF","#333399")))(50),
           show_rownames = FALSE, scale = "row",
           main = my.heatmap.title, cellwidth = 25)
  dev.off()
} else {
  message("No gene passes FDR < 5%: heatmap not drawn")
}

# Volcano plot: log2 fold change against -log10 adjusted p-value, with the
# genes at padj < 0.05 overlaid in red.
my.sig <- res$padj < 0.05

my.volcano.out <- paste(Sys.Date(), "_Volcano_plot.pdf", sep = "_")

# Coordinates of the significant genes for the overlay.
my.sig.lfc <- res$log2FoldChange[my.sig]
my.sig.neglog.padj <- -log10(res$padj)[my.sig]

pdf(my.volcano.out)
# The x/y expressions are passed to smoothScatter() as written because its
# default axis labels are taken from them.
smoothScatter(res$log2FoldChange, -log10(res$padj),
              col = "grey",
              xlim = c(-1.5, 1.5))
points(my.sig.lfc, my.sig.neglog.padj, col = "red", cex = 0.6)
dev.off()

# Hierarchical clustering of the samples (columns of tissue.cts) with
# bootstrap support from pvclust, using 100 bootstrap replicates.
my.pv <- pvclust(tissue.cts, nboot = 100)
# paste0() is used because paste() separates its arguments with a space by
# default, which put a space between the date and the rest of the file name.
# Note that this reuses (overwrites) the my.heatmap.out variable.
my.heatmap.out <- paste0(Sys.Date(), "_PVCLUST_result.pdf")

pdf(my.heatmap.out)
plot(my.pv)
dev.off()

# Write the result tables and save the statistics object.
# All output names share the dated prefix; each suffix is appended with "_".
my.outprefix <- paste(Sys.Date(), "Killifish_RNAseq_aging", sep = "_")
my.name.output <- function(my.suffix) {
  paste(my.outprefix, my.suffix, sep = "_")
}

my.out.ct.mat <- my.name.output("_log2_counts_matrix.txt")
my.out.stats <- my.name.output("_all_genes_statistics.txt")
my.out.fdr5 <- my.name.output("_FDR5_genes_statistics.txt")
my.out.rdata <- my.name.output("_statistics.RData")

# Tab-separated, unquoted, with gene identifiers as row names:
# the log2 normalized count matrix ...
write.table(tissue.cts, file = my.out.ct.mat,
            sep = "\t", row.names = TRUE, quote = FALSE)
# ... the statistics of all genes retained in res ...
write.table(res, file = my.out.stats,
            sep = "\t", row.names = TRUE, quote = FALSE)
# ... and the statistics of the genes at padj < 0.05 only.
write.table(res[genes.aging, ], file = my.out.fdr5,
            sep = "\t", row.names = TRUE, quote = FALSE)

# Keep the statistics object for reuse in later analyses.
save(res, file = my.out.rdata)




############################################################################################################################################
###########################################                   Mouse orthologs                    ###########################################
############################################################################################################################################
options(stringsAsFactors = FALSE)
library(bitops)

# Compare the killifish liver results with the mouse liver results.

# Ortholog names come from Param's BLAST best-hit table: tab-delimited, no
# header row, so the columns are named V1, V2, ...
my.orthology <- read.csv(
  '/Volumes/MyBook_3/BD_aging_project/Public_datasets/Liver_files_for_BB/Mouse_blast_results/BestHits_nfur-mmus_1e-3.txt',
  sep = "\t",
  header = FALSE
)

# Return the first element of `vec` (NA for an empty atomic vector).
get_first <- function(vec) {
  vec[1L]
}

# Keep the part before the first "|" of each identifier in column V1
# (used below as the killifish name).
my.nfur.split <- strsplit(my.orthology$V1, "|", fixed = TRUE)
my.nfur.names <- unlist(lapply(my.nfur.split, get_first))
# length(my.nfur.names)
# [1] 21073

# Same treatment for column V2 (used below as the mouse name).
my.mouse.split <- strsplit(my.orthology$V2, "|", fixed = TRUE)
my.mouse.names <- unlist(lapply(my.mouse.split, get_first))
length(my.mouse.names)
# [1] 21073

# Two-column lookup table pairing each killifish name with its mouse hit.
my.orth.table <- data.frame(Nfur_Symbol = my.nfur.names,
                            Mouse_Symbol = my.mouse.names)

# Load a previously saved RData file; the code below uses the
# `my.liver.RNAseq.process` list from it (first element = mouse liver table).
# NOTE(review): load() restores objects under their saved names. If this file
# also holds an object named `res`, it would replace the killifish results
# before they are copied on the next line - confirm the file's contents.
load("/Volumes/MyBook_3/BD_aging_project/RNAseq/All_tissues_analysis/DEseq2_runs/Separate/RNA_seq_result_Liver_2015-11-19.RData")

# Expose the row names as explicit key columns so both tables can be merged.
my.liver.nfur.process <- res
my.liver.nfur.process$Nfur_Symbol <- rownames(my.liver.nfur.process)
my.liver.RNAseq.process[[1]]$Mouse_Symbol <- rownames(my.liver.RNAseq.process[[1]])

#### Liver #####
# Step 1: attach the mouse name to every killifish gene found in the ortholog
# table (the only shared column is Nfur_Symbol).
my.liver.nfur.df <- data.frame(my.liver.nfur.process)
my.liver.merge1 <- merge(my.liver.nfur.df, my.orth.table, by = "Nfur_Symbol")

# Step 2: join with the mouse statistics. Statistics columns present in both
# tables get the suffix .x (mouse) or .y (killifish).
my.liver.mouse.df <- data.frame(my.liver.RNAseq.process[[1]])
my.liver.merge2 <- merge(my.liver.mouse.df, my.liver.merge1, by = 'Mouse_Symbol')

# Significance flags at FDR 5%. In the merged table, .x columns come from the
# mouse table and .y columns from the killifish table. A missing adjusted
# p-value counts as "not significant", so the flags never contain NA and can
# be summed or used as indices directly.
my.mouse.padj <- my.liver.merge2$padj.x
my.nfur.padj <- my.liver.merge2$padj.y
my.mouse.sig <- !is.na(my.mouse.padj) & my.mouse.padj < 0.05 # 313
my.nfur.sig <- !is.na(my.nfur.padj) & my.nfur.padj < 0.05 # 1465
# Plain logical AND replaces bitAnd(...) > 0, which was applied to logicals.
my.both.sig <- my.mouse.sig & my.nfur.sig # 44

# Killifish log2 fold changes of the orthologs of mouse genes that are
# significantly down (resp. up); which() drops any NA from the comparison.
my.mouse.lfc <- my.liver.merge2$log2FoldChange.x
my.nfur.lfc <- my.liver.merge2$log2FoldChange.y
my.nfur.lfc.mouse.down <- my.nfur.lfc[which(my.mouse.sig & my.mouse.lfc < 0)]
my.nfur.lfc.mouse.up <- my.nfur.lfc[which(my.mouse.sig & my.mouse.lfc > 0)]

# One-sample Wilcoxon tests against 0: are the killifish changes shifted in
# the same direction as in mouse?
test.mouse.neg <- wilcox.test(my.nfur.lfc.mouse.down, alternative = "less")
test.mouse.pos <- wilcox.test(my.nfur.lfc.mouse.up, alternative = "greater")

# Boxplot of the .y (killifish) log2 fold changes for the orthologs of mouse
# genes flagged significant, split by the sign of the .x (mouse) fold change.
# The p-values of the two Wilcoxon tests are printed above the boxes.
# NOTE(review): age is coded in weeks (see my.age), so these log2FC would be
# per week; the "per y" axis label looks stale - confirm before changing it.
my.mouse.down <- my.mouse.sig & (my.liver.merge2$log2FoldChange.x < 0)
my.mouse.up <- my.mouse.sig & (my.liver.merge2$log2FoldChange.x > 0)
my.orth.lfc.down <- my.liver.merge2$log2FoldChange.y[my.mouse.down]
my.orth.lfc.up <- my.liver.merge2$log2FoldChange.y[my.mouse.up]

my.orth.plot.out <- paste(Sys.Date(), "LogFC_of_mouse_orthologs_Liver_Nfur.pdf", sep = "_")

pdf(my.orth.plot.out)
boxplot(my.orth.lfc.down,
        my.orth.lfc.up,
        names = c("Mouse Ortholog Down", "Mouse Ortholog Up"),
        col = c("cadetblue2", "brown1"),
        ylim = c(-0.08, 0.08),
        ylab = "nfur Liver aging log2FC per y",
        main = "Liver (Nfur)")
text(1, 0.07, signif(test.mouse.neg$p.value, digits = 3))
text(2, 0.07, signif(test.mouse.pos$p.value, digits = 3))
# Reference line at no change.
abline(h = 0, lty = "dashed", col = "red", lwd = 2)
dev.off()
