################################################################
## Analyze the statistical significance of correlations between marks
## covering the origins.
##
## Authors: Jacques van Helden and Benoit Ballester
################################################################

## The configuration file should be loaded first, either from the web site or locally
## source("http://www.bigre.ulb.ac.be/courses/statistics_bioinformatics/R-files/config.R")

library("stats4bioinfo")

## Load some custom libraries
#source(file.path(dir.util, "util.R"))

## Code from Benoit Ballester to compute correlations.
##
## Read a table of mark values, compute the Pearson correlation matrix
## between the marks, and draw it as a heatmap clustered by average linkage.
##
## Argument:
##   ori   path to a whitespace-delimited table with a header line.
##         Columns 9 to 43 are the ones entering the correlation matrix.
##
## Value:
##   (invisibly) the row index vector hm$rowInd returned by heatmap.2.
##
## Relies on brewer.pal() and heatmap.2(), which must be available when the
## function is called (the RColorBrewer and gplots libraries are loaded
## further down in this file).
calc.correlations <- function(ori) {
  ## Label of the data set: file name without its last 4 characters
  ## (presumably the dot and a 3-letter extension)
  file.name <- basename(ori)
  base <- substr(file.name, 1, nchar(file.name) - 4)
  cat(base, sep="\n")

  ##-- Read the input table
  marks.table <- read.table(ori, header=TRUE, sep="")

  ##-- Keep column 4 and the 35 columns 9 to 43
  selected <- data.frame(marks.table[,4], marks.table[,9:43])

  ##-- Pearson correlation between the 35 retained columns
  ##-- (use="all.obs" makes cor() fail if there are missing values)
  mark.cor <- cor(selected[,2:36], use="all.obs", method="pearson")

  ##-- Reversed RdBu palette interpolated to 100 colours
  palette <- rev(colorRampPalette(brewer.pal(9,"RdBu"))(100))

  ##-- Clustering function passed to heatmap.2: average linkage on the
  ##-- default dist() distances
  average.linkage <- function(x) {
    hclust(dist(x), method = "average", members = NULL)
  }

  hm <- heatmap.2(mark.cor,
                  scale="none",
                  trace="none",
                  col=palette,
                  main=paste("Correlation between Marks\n", base),
                  hclustfun = average.linkage)

  ##-- Row order of the heatmap, returned without being printed
  invisible(hm$rowInd)
}

## When force.order is TRUE, the sorted heatmaps use the fixed mark order
## defined below rather than the order found by hierarchical clustering.
force.order <- TRUE

## Fixed order of the 42 marks (names must be spelled exactly as the
## row/column names of the correlation matrices).
mark.order <- c(
  "H3K56ac", "H3k27ac", "p300", "TBP", "Pol2_Ser5P", "Pol2_global",
  "H3K4me2", "Dnase", "H3k09ac", "CpG", "H3K4me3", "ES_UMR",
  "H3k36me3", "X5hmC", "X5fC", "ES_LMR", "H3k04me1", "H2A.Z",
  "Ac_H2A.Z", "H3k09me3", "Sox2", "Nanog", "Ctcf", "Eset",
  "bivalent_domain", "H3K27me3", "Ezh2", "Ring1b", "Smc3", "Smc1",
  "Gquad", "Pol2_Ser2P", "Rest", "Smarc4", "Wdr5", "Klf4",
  "CHD4", "E2f1", "Med1", "Oct.04", "mof", "ES_FMR"
)



################################################################
## Required libraries
library(gplots) ## Required for heatmaps.2
library(RColorBrewer)
## Colour palette shared by all heatmaps: RdBu palette from RColorBrewer,
## interpolated to 100 levels and reversed
cols <- rev(colorRampPalette(brewer.pal(9, "RdBu"))(100))

## Directory layout: result tables go to dir.results, figures to its
## "figures" sub-directory; both live under the main project directory
dir.main <- "~/replication_origins"
dir.results <- file.path(dir.main, "analysis", "f4_rnase", "marks", "mark_correlations_stats")
dir.figures <- file.path(dir.results, "figures")

## Create the output directories (with their parents) if they do not exist yet
dir.create(dir.results, showWarnings = FALSE, recursive = TRUE)
dir.create(dir.figures, showWarnings = FALSE, recursive = TRUE)

## All subsequent relative paths are resolved from the main directory
setwd(dir.main)

################################################################
## Read matrix with correlation of chromatin marks at replication origins

## Two types of origin regions are analyzed in turn:
##   "peaks": SWEMBL peaks matching at least one SICER zone
##   "zones": SICER zones overlapping at least one SWEMBL peak
##
## For each type, the script
##   1. reads the correlation matrix between marks at origins,
##   2. reads the correlation matrices obtained with random regions,
##   3. draws heatmaps (origins, random mean, hybrid, difference, z-score),
##   4. exports a table with all statistics per pair of marks,
##   5. draws the distribution of random correlations for significant pairs.
##
## The number of random replicates (rand.rep) is computed inside the loop,
## after the random matrix files of the current type have been listed.

for (type in c("zones", "peaks")) {

  verbose(paste("Analyzing correlations between markrs and origin", type), 1)
  if (type == "peaks") {
    matrix.dir <- file.path(dir.main, "analysis", "f4_rnase", "marks", "mark_correlations2")
    matrix.file <- file.path(matrix.dir, "f4ori_es_matrix_corr.mat")
  } else {
    matrix.dir <- file.path(dir.main, "analysis", "f4_rnase", "marks", "mark_correlations_sicer")
    matrix.file <- file.path(matrix.dir, "ES_indiff_C3_BN_vs_F4_RNAse_SICER_SWEMBLmatch_corr.mat")
  }

  ## Prefix shared by most output files of the current type
  rand.prefix <- paste0("mark_correlations_in_random_", type)

  ## ##############################################################
  ## Read the correlation matrix between marks at origins
  ori.cor <- read.table(matrix.file)
  mark.nb <- nrow(ori.cor)
  mark.names <- rownames(ori.cor)
  print(mark.names)
  if (mark.nb < 2) {
    stop(paste("The correlation matrix must contain at least two marks:", matrix.file))
  }

  ## ##############################################################
  ## Load correlation matrices obtained with random peaks/zones (selected with
  ## RSAT random-genome-fragments), in order to estimate a mark-specific
  ## expected distribution of correlation values.
  rand.cor.dir <- file.path(matrix.dir, "random_matrix1000")
  rand.cor.files <- list.files(rand.cor.dir, pattern="rand_marks_matrix.*_corr.mat")

  ## Number of replicates for the random simulations of the current type
  rand.rep <- length(rand.cor.files)
  if (rand.rep == 0) {
    stop(paste("Cannot find random correlation matrices in dir\n", rand.cor.dir))
  }
  verbose(paste("Reading matrices for", rand.rep, "random repetitions"))

  ## 3D array storing the random matrices: mark x mark x replicate
  rand.array <- array(NA, dim=c(mark.nb, mark.nb, rand.rep))
  for (m in seq_along(rand.cor.files)) {
    rand.file <- rand.cor.files[m]
    if (m %% 100 == 1) {
      verbose(paste(sep="", "Reading ", type, " random correlation matrix ", m, "/", rand.rep, " ", rand.file))
    }
    rand.mat <- as.matrix(read.table(file.path(rand.cor.dir, rand.file)))

    ## Check that the random matrix has the same shape and the same mark
    ## names as the origin matrix
    if (!identical(dim(rand.mat), dim(ori.cor))) {
      stop(paste("Dimensions of random matrix", rand.file, "do not correspond to the reference matrix"))
    }
    if (sum(rownames(rand.mat) != rownames(ori.cor)) > 0) {
      stop(paste("Row names in random matrix", rand.file, "do not correspond to the reference matrix"))
    }
    if (sum(colnames(rand.mat) != colnames(ori.cor)) > 0) {
      stop(paste("Column names in random matrix", rand.file, "do not correspond to the reference matrix"))
    }

    rand.array[,,m] <- rand.mat
  }

  ## ##############################################################
  ## Draw heatmaps of correlation matrices

  ## Heatmap of the original correlation matrix, with marks ordered
  ## according to the result of the hierarchical clustering
  x11(width=7, height=7)
  hm <- heatmap.2(as.matrix(ori.cor), scale="none", trace="none",
                  main=paste("Correlation between marks\nat origin", type),
                  col=cols, breaks=seq(-1, 1, 2/length(cols)))
  dev.copy2pdf(file=file.path(dir.figures, paste0("mark_correlations_at_ori_", type, "_heatmap.pdf")),
               width=7, height=7)

  ## Select the order of the marks to generate comparable heatmaps
  ## for real data and random control. These comparable heatmaps will
  ## thus have no tree displayed, so we keep both figure types.
  if (force.order) {
    hm.order <- mark.order
  } else {
    hm.order <- rownames(ori.cor)[hm$rowInd]
  }
  print(hm.order)

  ## The matrices are indexed by mark name below: report unknown names
  ## explicitly rather than failing later with a subscript error
  unknown.marks <- setdiff(hm.order, intersect(rownames(ori.cor), colnames(ori.cor)))
  if (length(unknown.marks) > 0) {
    stop(paste("Marks not found in the correlation matrix", matrix.file, ":",
               paste(unknown.marks, collapse=", ")))
  }

  ## Heatmap of the origin correlations with the re-ordered rows and columns
  x11(width=7, height=7)
  heatmap.2(as.matrix(ori.cor[rev(hm.order), hm.order]),
            Rowv=FALSE, Colv=FALSE, dendrogram="none", scale="none", trace="none",
            main=paste("Correlation between marks\nat origin", type),
            col=cols, breaks=seq(-1, 1, 2/length(cols)))
  dev.copy2pdf(file=file.path(dir.figures, paste0("mark_correlations_at_ori_", type, "_heatmap_sorted.pdf")),
               width=7, height=7)

  ## Mean of the random correlations for each pair of marks
  verbose(paste("Computing mean correlations for random", type))
  rand.mean <- apply(rand.array, c(1,2), mean, na.rm=TRUE)
  rownames(rand.mean) <- rownames(ori.cor)
  colnames(rand.mean) <- colnames(ori.cor)
  write.table(rand.mean,
              file=file.path(dir.results, paste0(rand.prefix, "_mean.tab")),
              sep="\t", quote=FALSE, row.names=TRUE, col.names=NA)

  ## Heatmap of the mean correlation between marks in random peaks/zones
  x11(width=7, height=7)
  heatmap.2(rand.mean[rev(hm.order), hm.order],
            Rowv=FALSE, Colv=FALSE, dendrogram="none", scale="none", trace="none",
            main=paste("Mean correlations betw. marks\nin random", type),
            col=cols, breaks=seq(-1, 1, 2/length(cols)))
  dev.copy2pdf(file=file.path(dir.figures, paste0(rand.prefix, "_mean_heatmap.pdf")),
               width=7, height=7)

  ## Hybrid matrix combining origin correlations and random means.
  ## With marks sorted by hm.order, row i gets the random means from the
  ## diagonal onwards, so that one triangle (plus the diagonal) holds the
  ## random means and the other triangle holds the origin correlations.
  verbose("Computing hybrid map")
  cor.hybrid <- ori.cor[hm.order, hm.order]
  rand.mean.reordered <- rand.mean[hm.order, hm.order]
  hybrid.ncol <- ncol(cor.hybrid)
  for (i in seq_len(nrow(cor.hybrid))) {
    cor.hybrid[i, i:hybrid.ncol] <- rand.mean.reordered[i, i:hybrid.ncol]
  }
  ## Reverse rows and columns for the exported tab-delimited file: origin
  ## correlations end up in the upper triangle, random means in the lower one
  cor.hybrid <- cor.hybrid[rev(seq_len(nrow(cor.hybrid))), rev(seq_len(ncol(cor.hybrid)))]
  write.table(cor.hybrid,
              file=file.path(dir.results, paste0(rand.prefix, "_vs_ori_hybrid_matrix.tab")),
              sep="\t", quote=FALSE, row.names=TRUE, col.names=NA)

  ## Hybrid heatmap showing mark correlations in origins versus random
  ## peaks/zones (rows and columns are selected by name, so the reversal
  ## above does not affect the figure)
  x11(width=7, height=7)
  heatmap.2(as.matrix(cor.hybrid[rev(hm.order), hm.order]),
            Rowv=FALSE, Colv=FALSE, dendrogram="none", scale="none", trace="none",
            main=paste("Correlation betw. marks\norigins vs random ", type, "\n(hybrid map)"),
            col=cols, breaks=seq(-1, 1, 2/length(cols)))
  dev.copy2pdf(file=file.path(dir.figures, paste0(rand.prefix, "_vs_ori_hybrid_heatmap.pdf")),
               width=7, height=7)

  ## Difference between correlations at origins and in random peaks/zones
  verbose(paste("Computing difference between correlation within actual and random", type))
  cor.diff <- as.matrix(ori.cor - rand.mean)
  write.table(cor.diff,
              file=file.path(dir.results, paste0(rand.prefix, "_vs_ori_diff.tab")),
              sep="\t", quote=FALSE, row.names=TRUE, col.names=NA)

  ## Heatmap of the difference, with the same mark order as above
  x11(width=7, height=7)
  heatmap.2(cor.diff[rev(hm.order), hm.order],
            Rowv=FALSE, Colv=FALSE, dendrogram="none", scale="none", trace="none",
            main=paste("Correlation diff. betw. marks\norigins vs random", type),
            col=cols, breaks=seq(-0.4, 0.4, 0.8/length(cols)))
  dev.copy2pdf(file=file.path(dir.figures, paste0(rand.prefix, "_vs_ori_diff.pdf")),
               width=7, height=7)

  ## Heatmap of the difference, re-clustered on the basis of the differences
  x11(width=7, height=7)
  zlim.cor.diff <- 0.25
  heatmap.2(cor.diff,
            scale="none", trace="none",
            main=paste("Correlation diff. betw. marks\norigins vs random", type),
            zlim=c(-zlim.cor.diff, zlim.cor.diff),
            col=cols, breaks=seq(-zlim.cor.diff, zlim.cor.diff, 2*zlim.cor.diff/length(cols)))
  dev.copy2pdf(file=file.path(dir.figures, paste0(rand.prefix, "_vs_ori_diff_reclusterized.pdf")),
               width=7, height=7)

  ## Standard deviation of the random correlations for each pair of marks
  rand.sd <- apply(rand.array, c(1,2), sd, na.rm=TRUE)
  rownames(rand.sd) <- rownames(ori.cor)
  colnames(rand.sd) <- colnames(ori.cor)
  write.table(rand.sd,
              file=file.path(dir.results, paste0(rand.prefix, "_vs_ori_stdev.tab")),
              sep="\t", quote=FALSE, row.names=TRUE, col.names=NA)

  ## z-score of each correlation value. Undefined values (e.g. 0/0 when
  ## both the difference and the standard deviation are null) are set to 0.
  cor.z <- as.matrix(cor.diff / rand.sd)
  cor.z[is.na(cor.z)] <- 0
  write.table(cor.z,
              file=file.path(dir.results, paste0(rand.prefix, "_vs_ori_zscore.tab")),
              sep="\t", quote=FALSE, row.names=TRUE, col.names=NA)

  ## Heatmap of the correlation z-score.
  ## The colour scale limit depends on the data type: z-scores reach
  ## higher values for peaks than for zones.
  x11(width=7, height=7)
  zlim.zscore <- quantile(cor.z, 0.995)
  heatmap.2(cor.z[rev(hm.order), hm.order],
            Rowv=FALSE, Colv=FALSE, dendrogram="none", scale="none", trace="none",
            main="Correlation gain (z-score)", zlim=c(-zlim.zscore, zlim.zscore),
            col=cols, breaks=seq(-zlim.zscore, zlim.zscore, 2*zlim.zscore/length(cols)))
  dev.copy2pdf(file=file.path(dir.figures, paste0(rand.prefix, "_vs_ori_zscore.pdf")),
               width=7, height=7)

  ## Heatmap of the correlation z-score, re-clustered on the basis of z-scores
  x11(width=7, height=7)
  heatmap.2(cor.z,
            scale="none", trace="none",
            main="Correlation gain (z-scores), re-clusterized",
            col=cols, breaks=seq(-zlim.zscore, zlim.zscore, 2*zlim.zscore/length(cols)))
  dev.copy2pdf(file=file.path(dir.figures, paste0(rand.prefix, "_vs_ori_zscore_reclusterized.pdf")),
               width=7, height=7)

  ################################################################
  ## Define a threshold on the z-score, using a Bonferroni correction
  ## on the number of pairs of marks.
  ##    alpha = 0.01 / nb mark pairs
  nb.cor <- mark.nb*(mark.nb-1)/2
  z.threshold <- qnorm(0.01/nb.cor, lower.tail=FALSE)

  ################################################################
  ## Export a summary table with all the statistics.
  ## One row per pair of marks (m1 < m2); rows are collected in a
  ## pre-allocated list and bound together once at the end.
  verbose("Exporting table with all stats between mark pairs", 1)
  result.rows <- vector("list", nb.cor)
  row.nb <- 0
  for (m1 in seq_len(mark.nb - 1)) {
    for (m2 in (m1+1):mark.nb) {
      mark1 <- mark.names[m1]
      mark2 <- mark.names[m2]

      ref.cor <- ori.cor[m1,m2]
      z <- cor.z[m1,m2]

      ## A histogram is only drawn for pairs passing the z-score threshold
      if (abs(z) > z.threshold) {
        hist.file <- paste0("rand_marks_correlation_distrib_", mark1, "_vs_", mark2, ".pdf")
      } else {
        hist.file <- ""
      }

      ## Estimate a non-parametric p-value by counting the number of
      ## random observations more extreme than the actual correlation
      ## between marks in the peaks/zones (on the side given by the
      ## sign of the z-score)
      rand.values <- rand.array[m1,m2,]
      n.rand.smaller <- sum(rand.values < ref.cor)
      n.rand.greater <- sum(rand.values > ref.cor)
      if (z < 0) {
        pval.sim <- n.rand.smaller / rand.rep
      } else {
        pval.sim <- n.rand.greater / rand.rep
      }

      row.nb <- row.nb + 1
      result.rows[[row.nb]] <- data.frame(m1=m1,
                                          m2=m2,
                                          mark1=mark1,
                                          mark2=mark2,
                                          cor=round(ref.cor, digits=3),
                                          rand.mean=round(rand.mean[m1,m2], digits=3),
                                          diff=round(cor.diff[m1,m2], digits=3),
                                          sd=round(rand.sd[m1,m2], digits=3),
                                          z=round(z, digits=2),
                                          pval.norm=pnorm(abs(z), lower.tail=FALSE),
                                          n.rand.greater=n.rand.greater,
                                          n.rand.smaller=n.rand.smaller,
                                          pval.sim=pval.sim,
                                          histogram.file=hist.file)
    }
  }
  result.frame <- do.call(rbind, result.rows)

  ## Quick checks, only informative when this section is run interactively
  ## line by line (inside the loop these expressions are not printed)
  dim(result.frame) ## Check dimensions of the result frame
  head(result.frame[order(result.frame$pval.sim),]) ## Check lowest p-values in the simulations
  tail(result.frame[order(result.frame$pval.sim),]) ## Check highest p-values in the simulations

  ## Sort the result frame by increasing p-value of the normal approximation
  result.frame <- result.frame[order(result.frame$pval.norm),]

  write.table(result.frame,
              file=file.path(dir.results, paste0(rand.prefix, "_all_statistics.tab")),
              sep="\t", quote=FALSE, row.names=FALSE)

  ## Legend of the columns of the summary table
  result.frame.legend <- c(m1="index of the first mark in the correlation matrices",
                           m2="index of the second mark in the correlation matrices",
                           mark1="name fo the first mark",
                           mark2="name fo the second mark",
                           cor=paste("correlation between a pair of marks inside the origin", type),
                           rand.mean=paste("Mean correlation between marks in", rand.rep, "random selections of", type),
                           diff="cor - rand.mean",
                           sd = paste("standard deviation of the correlations in random", type),
                           z = paste("z-score of the correlations in random", type),
                           pval.norm = paste("p-value of the z-score, according to a normal distribution (beware: this distribution is questionable, it is just indicative)"),
                           n.rand.greater=paste("Number of random", type, "whose correlation is greater than in origin", type),
                           n.rand.smaller=paste("Number of random", type, "whose correlation is smaller than in origin", type),
                           pval.sim=paste("non-parametric estimate of the p-value, based on the frequency of random selection withb more extreme correlation than in oritin", type),
                           histogram.file="Name of the histogram file (if exists)")
  write.table(result.frame.legend,
              file=file.path(dir.results, paste0(rand.prefix, "_all_statistics_legend.tab")),
              sep="\t", quote=FALSE, row.names=TRUE, col.names=FALSE)

  ## ##############################################################
  ## Draw the distribution of random correlation values for each
  ## significant pair of marks. A single graphics window is re-used
  ## for all the histograms of the current type.
  verbose("Exporting distribution plots", 1)
  x11(width=7, height=3)
  dir.cor.distrib <- file.path(dir.figures, "correlation_distributions", type)
  dir.create(dir.cor.distrib, showWarnings=FALSE, recursive=TRUE)

  par(cex=0.8)
  par(mar=c(2,2,2,0))
  hist.col <- c("rand"="#88DDFF", "cor"="darkgreen")
  hist.breaks <- seq(from=-1, to=1, by=0.01)

  for (m1 in seq_len(mark.nb - 1)) {
    for (m2 in (m1+1):mark.nb) {
      mark1 <- mark.names[m1]
      mark2 <- mark.names[m2]

      ref.cor <- ori.cor[m1,m2]
      z <- cor.z[m1,m2]

      ## Skip non-significant correlations based on a threshold on the
      ## z-score, in order to avoid drawing non-informative histograms
      ## (the info about non-significant correlations is in the table).
      ## The test mirrors the one used to define histogram.file above.
      if (abs(z) <= z.threshold) {next}

      ## Plot the distribution of random correlation values; a first
      ## pass without plotting gives the highest bar, used to leave room
      ## above the histogram for the annotations
      verbose(paste("Distribution of random correlations between", mark1, "and", mark2), 2)
      rand.values <- rand.array[m1,m2,]
      h <- hist(rand.values, breaks=hist.breaks, plot=FALSE)
      max.count <- max(h$counts)
      hist(rand.values, breaks=hist.breaks, col=hist.col["rand"],
           ylim=c(0, max.count*1.6),
           main=paste("correlation", mark1, "vs", mark2),
           xlab=NA, ylab=NA)

      ## Draw an arrow at the actual correlation value, with the statistics
      arrows(ref.cor, max.count*1.2, ref.cor, max.count*1.1,
             col=hist.col["cor"], lwd=2, angle=30, length=0.05, code=2)
      text(ref.cor, max.count*1.2, pos=3,
           col=hist.col["cor"],
           labels=paste(sep="\n",
             paste(sep="", "corr = ", round(ref.cor, digits=3)),
             paste(sep="", "rand mean = ", round(rand.mean[m1,m2], digits=3),
                   "; sd = ", round(rand.sd[m1,m2], digits=3)),
             paste(sep="", "diff = ", round(cor.diff[m1,m2], digits=3),
                   "; z = ", round(z, digits=2))
             ), cex=1
           )

      legend("topleft", c(paste(rand.rep, "random marks"), "actual marks"),
             col=c(hist.col["rand"], hist.col["cor"]), lwd=5)

      ## Export the histogram under the file name registered in the
      ## summary table for this pair of marks
      pair.rows <- result.frame$m1 == m1 & result.frame$m2 == m2
      hist.file <- as.character(result.frame[pair.rows, "histogram.file"])
      if (length(hist.file) == 1 && nzchar(hist.file)) {
        dev.copy2pdf(file=file.path(dir.cor.distrib, hist.file), width=7, height=3)
      }
    }
  }
}

verbose("Job done")
  
################################################################

