################################################################
## Compute k-mer occurrences per slice

## Analysis of replication origins, nascent strands data from
## Christelle Cayrou and Marcel Mechali.
##
## R script by Jacques van Helden
##
## Running this script requires to first run the script config.R
##    source('http://pedagogix-tagc.univ-mrs.fr/courses/statistics_bioinformatics/R-files/config.R')


## Load some libraries
#library(stats4bioinfo)
# source('http://pedagogix-tagc.univ-mrs.fr/courses/statistics_bioinformatics/R-files/config.R')
source('/Users/jvanheld/statistics_bioinformatics/R-files/config.R')

library(gplots) ## Required for heatmaps.2
library(RColorBrewer) ## Required for nice colors on correlation heatmaps
correl.colors <- rev(colorRampPalette(brewer.pal(9,"RdBu"))(100))


## source('/Users/jvanheld/statistics_bioinformatics/R-files/config.R')
## Output formats for exported figures. NOTE(review): the export.plot()
## calls visible in this script pass their formats explicitly, so this
## variable may only be read elsewhere (e.g. in config.R) -- confirm.
export.formats.plots <- c("png", "pdf")
verbosity <- 1 ## NOTE(review): presumably the level read by verbose() from config.R -- confirm
plot.a.lot <- FALSE ## Generate extra plots (which are not saved) for data exploration
# png.resol <- 72

## This variable should be adapted according to the computer on which
## R is running. All other paths are defined relative to dir.main
dir.main <- '/Users/jvanheld/replication_origins'

## Relative paths used further down (e.g. the bed file with the peak
## coordinates) are resolved from this working directory
setwd(dir.main)

## Thresholds used to classify peaks according to strand polarity
alpha <- 1e-2 ## Upper bound on the binomial p-value to call a significant enrichment (G-rich k-mers on the left, or C-rich k-mers on the right)
balanced.threshold <- 0.4 ## Peaks whose two p-values both exceed this bound are called "balanced"
mkv <- 0 ## Markov order (part of the input file prefix)


## One colour per peak class, plus the colour used to draw the alpha
## threshold lines on the p-value plot
class.colors <- c(leftG = "#DDBB00",
                  rightC = "#008833",
                  both = "#884400",
                  twilight = "#BBBBBB",
                  balanced = "#6666FF",
                  alpha = "red")

## Accumulators filled while iterating over the k-mer lengths
peaks.per.class.summary <- data.frame() ## One row per k-mer length, one column per class
results <- list() ## Per k-mer length and per slice: occurrence and match tables


oligo.lengths <- 5:8
# oligo.lengths <- 6
# pos.ol <- 6
for (pos.ol in oligo.lengths) {
  
  ## Number of k-mer clusters expected for each supported oligonucleotide
  ## length. The value is part of the input directory and file names, and
  ## it is checked against the content of the pattern file further down.
  nb.clusters.per.length <- c("5"=12, "6"=12, "7"=12, "8"=11)
  if (!(as.character(pos.ol) %in% names(nb.clusters.per.length))) {
    ## Fail early rather than silently re-using the value left by a
    ## previous iteration (or hitting an undefined nb.clusters later on)
    stop(paste("No cluster number defined for oligonucleotide length", pos.ol))
  }
  nb.clusters <- nb.clusters.per.length[[as.character(pos.ol)]]
  
  ## Suffix identifying the k-mer length (e.g. "6nt"); re-used for cluster
  ## names, file names, titles and as key in the results list
  pos.ol.suffix <- paste0(pos.ol, "nt")
  G.cluster <- paste0("G-rich.", pos.ol.suffix)
  C.cluster <- paste0("C-rich.", pos.ol.suffix)
  
  ## Prefix shared by all input and output files for this k-mer length
  file.prefix <- paste0("SWEMBL_ES_indiff_C3_BN_vs_F4_RNAse_R0.002_SICERmatch_summits_",
                        pos.ol.suffix, "_ci50-1str-noov_bg_mkv", mkv)
  
  ## Directory with the position-analysis results
  dir.pos <- file.path(dir.main, "analysis", "motifs", "position_analysis", file.prefix)
  
  ## Directory with the occurrence tables per slice (also receives the outputs)
  dir.occ <- file.path(dir.main, "analysis", "motifs", "strand_polarity",
                       paste0("occurrences_per_slice_", pos.ol.suffix, "_k", nb.clusters))
  
  ## Positional slices, keyed by their side; the values are the suffixes
  ## found in the slice file names
  slice.names <- c(left = "-450_-151",
                   central = "-150_149",
                   right = "+150_+449")
  nb.slices <- length(slice.names)
  
  ## Expected number of peaks, set manually to simplify the code and to
  ## check the consistency of the input tables
  nb.peaks <- 65019
  ## nb.peaks <- 2001 ## TEMPORARY FOR QUICK TEST
  
  # ################################################################################
  # ## Read the complete file with all oligonucleotide profiles (and chi2 statistics)
  # pos.file <- file.path(dir.pos, paste(sep="", file.prefix, ".tab"))
  # pos.profiles <- read.table(pos.file, comment.char=";", header=1)
  # names(pos.profiles)
  # dim(pos.profiles)
  # 
  # ## Analyze distribution of chi2 values
  # chi.values <- pos.profiles$chi2
  
  ################################################################################
  ## Load the table assigning each pattern to a cluster (two columns:
  ## the pattern and its identifier)
  pattern.file <- file.path(dir.occ, paste0(file.prefix, "_patterns_k", nb.clusters, "_all.tab"))
  patterns.clusters <- read.table(pattern.file)
  names(patterns.clusters) <- c("pattern", "pattern.id")
  
  ## Keep the patterns and their identifiers as plain vectors, which are
  ## handier in the rest of the script
  pattern <- as.vector(patterns.clusters$pattern)
  pattern.id <- as.vector(patterns.clusters$pattern.id)
  nb.patterns <- length(pattern)
  
  ## The cluster name is the identifier stripped from everything up to
  ## (and including) its first dot
  patterns.clusters$cluster <- sub(pattern='[^.]*\\.', replacement="", x=pattern.id, perl=TRUE)
  table(patterns.clusters$cluster)
  cluster.names <- unique(patterns.clusters$cluster)
  
  ## The pattern file must contain exactly the expected number of clusters
  if (length(cluster.names) != nb.clusters) {
    stop(paste("Invalid number of clusters in pattern file", pattern.file, "(expected:", nb.clusters, ")."))
  }
  
  # head(patterns.clusters)
  
  
  ## Allocate the summary tables, filled with NA until the slices are read
  new.na.table <- function(n.rows, col.labels, row.labels=NULL) {
    na.table <- data.frame(matrix(NA, nrow=n.rows, ncol=length(col.labels)))
    names(na.table) <- col.labels
    if (!is.null(row.labels)) {
      rownames(na.table) <- row.labels
    }
    na.table
  }
  
  ## Occurrences per peak (rows) and per cluster (columns), for one slice
  occ.per.cluster <- new.na.table(nb.peaks, cluster.names)
  
  ## Total occurrences (all peaks) for each cluster, one row per slice
  total.per.cluster <- new.na.table(nb.slices, cluster.names, slice.names)
  
  ## Proportion of peaks covered by each cluster, one row per slice
  coverage.per.cluster <- new.na.table(nb.slices, cluster.names, slice.names)
  
  ## Proportion of peaks covered by each pattern, one row per slice
  coverage.per.pattern <- new.na.table(nb.slices, pattern, slice.names)
  
  ## Read and pre-process occurrence tables
  slice.number <- 1
  for (slice.number in 1:nb.slices) {
    ## Identify the current slice and the prefix shared by all its files
    slice.name <- slice.names[slice.number]
    verbose(paste("Treating slice", slice.number, slice.name), 1)
    
    slice.prefix <- file.path(dir.occ, paste(sep="", file.prefix, "_slice_", slice.name))
    
    ## Initialize the list of results for current slice
    results.one.slice <- list()
    
    ## Read occurrence table, indicating the number of occurrences per
    ## peak, for each one of the selected patterns
    verbose(paste("Slice", slice.name, "; Reading",pos.ol.suffix,"occurrence table"), 1)
    occ.slice.file <- paste(sep="", slice.prefix, ".tab")
    ## TEMPORARY occ.slice.file <- file.path(dir.occ, paste(sep="", file.prefix, "_slice_", slice.name, "_top2000.tab"))
    occ.table <- read.delim(occ.slice.file, header=1, row.names=1, comment.char=';')
    ## Restore the correct column names, which were modified by read.delim
    names(occ.table) <- sub(names(occ.table), pattern="G.rich", replacement = "G-rich")
    names(occ.table) <- sub(names(occ.table), pattern="C.rich", replacement = "C-rich")
    
    ## Suppress the row with totals per pattern (a copy is kept for
    ## confirmation)
    total.from.file <- occ.table["total",]
    occ.table <- occ.table[rownames(occ.table) != "total",]
    peak.names <- rownames(occ.table)
    
    ## Only retain the columns with patterns (the original data file
    ## contains a column with total occurrences of all patterns).
    ## drop=FALSE keeps a data frame even if a single pattern is selected.
    occ.table <- occ.table[,pattern.id, drop=FALSE]
    results.one.slice$occ.table <- occ.table  ## Store the occurrence table in a result list, we will re-use it later
    
    ## Check the dimensions of the occurrence table: nb.peaks rows (the
    ## "total" row has been removed above) and nb.patterns columns
    print(dim(occ.table))
    verbose(paste("Occurrence table", nrow(occ.table), "rows x ", ncol(occ.table), "columns"))
    if (length(peak.names) != nb.peaks) {
      stop(paste("Invalid number of peaks in file", occ.slice.file, 
                 "(found:", length(peak.names), "; expected:", nb.peaks, ")."))
    }
    if (ncol(occ.table) != nb.patterns) {
      stop(paste("Invalid number of columns in file", occ.slice.file, 
                 "(found:", ncol(occ.table), "; expected:", nb.patterns, ")."))
    }
    
    ## Count the number of occurrences per cluster of oligonucleotides, for
    ## the current slice: for each peak, sum the occurrences of all the
    ## patterns assigned to the cluster. rowSums() replaces a row-wise
    ## apply(), which is much slower on tens of thousands of peaks.
    verbose(paste("Slice", slice.name, "; Counting", pos.ol.suffix, "occurrences per cluster"), 1)
    for (cluster.name in cluster.names) {
      cluster.columns <- (patterns.clusters$cluster == cluster.name)
      ## drop=FALSE: keep a table even when the cluster has a single pattern
      occ.per.cluster[,cluster.name] <- rowSums(occ.table[, cluster.columns, drop=FALSE])
    }
    rownames(occ.per.cluster) <- peak.names
    write.table(occ.per.cluster, file=paste(sep="", slice.prefix, "_occurrences_per_cluster.tab"), col.names = NA, row.names = TRUE, sep="\t", quote=FALSE)
    
    ## Total occurrences per cluster over all peaks. colSums() computes in
    ## double precision, so large totals cannot overflow the integer range.
    total.per.cluster[slice.number,] <- colSums(occ.per.cluster, na.rm=TRUE)
    results.one.slice$occ.per.cluster <- occ.per.cluster
    
    ## Compute match table: 1 or 0 for each peak, depending on whether it has or not the pattern
    verbose(paste("Slice", slice.name, "; Computing", pos.ol.suffix, "match table"), 1)
    match.table <- (occ.table >0)*1 ## A trick to convert Boolean table into integer values
    results.one.slice$match.table <- match.table  ## Store the matching table in a result list, we will re-use it later
    
    ## Compute the coverage per pattern (proportion of peaks covered by each pattern)
    verbose(paste("Slice", slice.name, "; Computing coverage per", pos.ol.suffix), 1)
    coverage.per.pattern[slice.number,] <- colSums(match.table)/nb.peaks
    
    ## Peak coverage per pattern cluster for each slice
    verbose(paste("Slice", slice.name, "; Computing coverage per", pos.ol.suffix, "cluster"), 1)
    match.per.cluster <- (occ.per.cluster >0)*1 ## A trick to convert Boolean table into integer values
    coverage.per.cluster[slice.number,] <- colSums(match.per.cluster, na.rm=TRUE)/nb.peaks
    
    ## Keep the tables of this slice, indexed by k-mer length and slice name
    results[[pos.ol.suffix]][[slice.name]] <- results.one.slice
    
    ## Draw correlation heatmap between patterns (occurrences per peak in
    ## the current slice); peaks with missing values are discarded
    verbose(paste("Correlation between", pos.ol.suffix, "for slice", slice.name), 1)
    pattern.correlation <- cor(na.omit(occ.table))
    # range(pattern.correlation)
    x11(width=15, height=15)
    hm <- heatmap.2(as.matrix(pattern.correlation),  scale="none", trace="none", 
                    margins = c(10,10), cexRow = 0.6, cexCol = 0.6,
                    main=paste("Slice", slice.name, "Correlation between ", pos.ol, "-mers"),
                    col=correl.colors, breaks=seq(-1,1,2/length(correl.colors)))
    export.plot(file.prefix = file.path(dir.occ,paste(sep="",file.prefix, "_pattern_correlation_heatmap_", slice.name)), 
                export.formats=c("pdf","png"), width=15, height=15)
    silence <- dev.off()
    
    
    ## Draw correlation heatmap between clusters
    verbose(paste("Correlation between", pos.ol.suffix, "clusters for slice", slice.name), 1)
    cluster.correlation <- cor(na.omit(occ.per.cluster))
    # range(cluster.correlation)
    x11(width=12, height=12)
    hm <- heatmap.2(as.matrix(cluster.correlation),  scale="none", trace="none",  margins=c(10,10),
                    main=paste(pos.ol,"-mer clusters correl. in slice ", slice.name),
                    col=correl.colors, breaks=seq(-1,1,2/length(correl.colors)))
    ## The argument is spelled out in full (file.prefix, as in all the
    ## other export.plot() calls) rather than relying on partial matching
    ## of the abbreviation "file"
    export.plot(file.prefix = file.path(dir.occ,paste(sep="",file.prefix, "_cluster_correlation_heatmap_", slice.name)), 
                export.formats=c("pdf","png"), width=12, height=12)
    silence <- dev.off()
    
  }
  
  
  ## Write the three per-slice summary tables as tab-delimited files,
  ## each under the common file prefix followed by its own suffix
  verbose(paste("Saving summary tables for", pos.ol.suffix), 1)
  summary.tables <- list("_coverage_per_pattern.tab" = coverage.per.pattern,
                         "_coverage_per_cluster.tab" = coverage.per.cluster,
                         "_total_per_cluster.tab" = total.per.cluster)
  for (table.suffix in names(summary.tables)) {
    write.table(summary.tables[[table.suffix]], 
                file=file.path(dir.occ, paste0(file.prefix, table.suffix)), 
                col.names = NA, row.names = TRUE, sep="\t")
  }
  
  ## Colors for the barplots: one per slice, one per cluster. The "Paired"
  ## palette is used up to 12 clusters, rainbow colors beyond that.
  slice.colors <- terrain.colors(nb.slices)
  cluster.colors <- if (nb.clusters > 12) {
    rainbow(nb.clusters)
  } else {
    brewer.pal(nb.clusters, "Paired")
  }
  
  
  ################################################################
  ## Draw barplots showing the coverage per slice for each pattern cluster in each position slice
  
  
  ## Legend labels shared by the two barplots grouped by cluster
  slice.legend <- paste("Position slice", slice.names)
  
  ## Barplot 1: peak coverage, bars grouped by cluster and colored by slice
  x11(width=12,height=8)
  par(mai=c(1.5,1,1,0.2))
  barplot(as.matrix(coverage.per.cluster), 
          main=paste("Peak coverage per", pos.ol.suffix,"cluster"), 
          beside=TRUE,  las=2,
          col=slice.colors)
  legend("topright", slice.legend, fill=slice.colors)
  coverage.plot.prefix <- file.path(dir.occ, paste0(file.prefix, "_coverage_per_cluster"))
  export.plot(file.prefix=coverage.plot.prefix, 
              export.formats=c("pdf", "png"), width=12, height=8)
  silence <- dev.off()
  
  ## Barplot 2: total occurrences, bars grouped by cluster and colored by slice
  x11(width=12, height=8)
  barplot(as.matrix(total.per.cluster), 
          main=paste("Occurrences per",pos.ol.suffix,"cluster"), 
          ylab="Total occurrences", 
          beside=TRUE,  las=2,
          col=slice.colors)
  legend("topright", slice.legend, fill=slice.colors)
  total.plot.prefix <- file.path(dir.occ, paste0(file.prefix, "_total_occ_per_cluster"))
  export.plot(file.prefix=total.plot.prefix, 
              export.formats=c("pdf", "png"), width=12, height=8)
  silence <- dev.off()
  
  
  ## Barplot 3: total occurrences, bars grouped by slice and colored by
  ## cluster (transposed table); the legend is drawn by barplot() itself
  x11(width=10,height=8)
  par(mai=c(1,1,1,1))
  barplot(t(as.matrix(total.per.cluster)), 
          main=paste0("Occurrences per slice: ", pos.ol.suffix),
          beside=TRUE,  las=1,
          legend.text=cluster.names,
          col=cluster.colors)
  slice.plot.prefix <- file.path(dir.occ, paste0(file.prefix, "_total_occ_per_slice"))
  export.plot(file.prefix=slice.plot.prefix, 
              export.formats=c("pdf", "png"), width=12, height=8)
  silence <- dev.off()
  
  
  ################################################################
  ## Annotate each peak according to the polarity of G-rich and C-rich motifs
  ## Load the peak coordinates (bed file, path relative to the working
  ## directory) and index the rows by peak ID
  bed.file <- "analysis/peaks/ORIGINS/SWEMBL_ES_indiff_C3_BN_vs_F4_RNAse_R0.002_SICER_match/SWEMBL_ES_indiff_C3_BN_vs_F4_RNAse_R0.002_SICERmatch_summits.bed"
  verbose(paste("Reading peak coordinates from bed file", bed.file), 1)
  bed <- read.delim(bed.file, row.names=NULL, header=FALSE, as.is=TRUE)
  names(bed) <- c("Chrom", "start", "end", "ID", "score", "strand")
  rownames(bed) <- bed[, "ID"]
  
  ## Placeholder column for the motif-based peak class
  bed$motif.class <- rep(NA, times=nrow(bed))
  
  ## Occurrences per cluster in the left-side and right-side slices
  left.occ <- results[[pos.ol.suffix]][[slice.names["left"]]]$occ.per.cluster
  right.occ <- results[[pos.ol.suffix]][[slice.names["right"]]]$occ.per.cluster
  
  ## G-rich and C-rich signals on each side. The values are attached by
  ## position. NOTE(review): this assumes the bed rows come in the same
  ## order as the peaks of the occurrence tables -- confirm.
  bed$leftG <- as.numeric(unlist(left.occ[G.cluster]))
  bed$leftC <- as.numeric(unlist(left.occ[C.cluster]))
  bed$rightG <- as.numeric(unlist(right.occ[G.cluster]))
  bed$rightC <- as.numeric(unlist(right.occ[C.cluster]))
  
  # max(bed$rightG)
  
  ## Draw distribution of left/right signals (exploration only, not saved).
  ## class.colors has no "rightG" or "leftC" entry, so looking them up
  ## gave NA and those histograms were drawn without fill. Each histogram
  ## is now coloured by motif type: the "leftG" colour for G-rich signals,
  ## the "rightC" colour for C-rich signals.
  if (plot.a.lot) {
    x11(width=8, height=8)
    par(mfrow=c(4,1))
    hist.breaks <- (0:120)-0.5 ## Unit-width bins centred on the integer counts
    hist.xlim <- c(0,40)
    hist(bed$leftG, breaks=hist.breaks, main="left G-rich", xlab="Occurrences per slice (300bp)", ylab="Number of peaks", xlim=hist.xlim, col=class.colors["leftG"])
    hist(bed$rightG, breaks=hist.breaks,  main="right G-rich", xlab="Occurrences per slice (300bp)", ylab="Number of peaks", xlim=hist.xlim, col=class.colors["leftG"])
    hist(bed$leftC, breaks=hist.breaks,  main="left C-rich", xlab="Occurrences per slice (300bp)", ylab="Number of peaks", xlim=hist.xlim, col=class.colors["rightC"])
    hist(bed$rightC, breaks=hist.breaks,  main="right C-rich", xlab="Occurrences per slice (300bp)", ylab="Number of peaks", xlim=hist.xlim, col=class.colors["rightC"])
    par(mfrow=c(1,1))
  }
  
  ## Gather the occurrences per cluster of the three slices in a single
  ## table: one row per peak, columns prefixed by the side of the slice
  verbose(paste("Analyzing occurrences of",pos.ol.suffix, "clusters per slice"))
  occ.of.slice <- function(side) {
    results[[pos.ol.suffix]][[slice.names[side]]]$occ.per.cluster
  }
  occ.per.cluster.and.slice <- data.frame(
    "left"=occ.of.slice("left"),
    "central"=occ.of.slice("central"),
    "right"=occ.of.slice("right"))
  
  ## Correlation heatmap between all cluster/slice combinations
  ## (peaks with missing values are discarded)
  occ.per.cluster.and.slice.correlation <- cor(na.omit(occ.per.cluster.and.slice))
  x11(width=12, height=12)
  hm <- heatmap.2(as.matrix(occ.per.cluster.and.slice.correlation),  scale="none", trace="none", 
                  margins = c(10,10), cexRow = 0.9, cexCol = 0.9,
                  main=paste("Correlation between ", pos.ol, "-mers in different slices"),
                  col=correl.colors, breaks=seq(-1,1,2/length(correl.colors)))
  ## NOTE(review): slice.name still holds the last value taken in the
  ## slice loop, so the file name carries the suffix of the last slice
  ## although the plot covers all slices -- confirm whether intended.
  heatmap.prefix <- file.path(dir.occ, paste0(file.prefix, "_clusters_per_slice_correlation_heatmap_", slice.name))
  export.plot(file.prefix = heatmap.prefix, 
              export.formats=c("pdf","png"), width=12, height=12)
  silence <- dev.off()
  
  ## For hexanucleotides only: correlation between a hand-picked set of
  ## signals, either clusters or specific patterns
  if (pos.ol == 6) {
    central.occ.table <- results[[pos.ol.suffix]][[slice.names["central"]]]$occ.table
    selected.columns <- c("left.G.rich.6nt", 
                          "left.C.rich.6nt",
                          "right.G.rich.6nt",
                          "right.C.rich.6nt",
                          "central.cluster1")
    selected.signals <- cbind(occ.per.cluster.and.slice[, selected.columns],
                              central.CACACA = central.occ.table$CACACA.cluster1,
                              central.TGTGTG = central.occ.table$TGTGTG.cluster1)
    selected.cor <- cor(selected.signals)
    
    ## Heatmap with the correlation coefficients written in the cells
    x11(width=8, height=8)
    hm <- heatmap.2(as.matrix(selected.cor),  scale="none", trace="none", 
                    margins = c(10,10), cexRow = 1, cexCol = 1,
                    cellnote = round(digits=2, selected.cor), notecol="blue",
                    main=paste("Correlation between selected ", pos.ol, "-mers in different slices"),
                    col=correl.colors, breaks=seq(-1,1,2/length(correl.colors)))
    ## NOTE(review): slice.name is the value left by the slice loop (last
    ## slice), although this plot combines several slices -- confirm.
    selected.prefix <- file.path(dir.occ, paste0(file.prefix, "_selected_patterns_per_slice_correlation_heatmap_", slice.name))
    export.plot(file.prefix = selected.prefix, 
                export.formats=c("pdf","png"), width=8, height=8)
    silence <- dev.off()
  }
  
  ################################################################
  ## Compute strand polarity parameters
  verbose(paste("Computing strand polarity parameters for", pos.ol.suffix))
  bed$leftG.vs.rightC <- bed$leftG/bed$rightC ## Fold-change
  bed$leftG.plus.rightC <- bed$leftG + bed$rightC ## Sum of left G and right C
  
  ## Binomial enrichment p-value: probability to observe at least `hits`
  ## successes among hits + others trials, with a success probability of 0.5
  enrich.pval <- function(hits, others) {
    pbinom(q=hits-1, size=hits + others, prob = 0.5, lower.tail=FALSE)
  }
  
  ## G-rich k-mer enrichment on the left of the origins: leftG relative
  ## to leftG + leftC
  bed$leftG.vs.leftC.enrich.pval <- enrich.pval(bed$leftG, bed$leftC)
  bed$leftG.vs.leftC.enrich <- (bed$leftG.vs.leftC.enrich.pval < alpha)*1
  
  ## Control: leftC relative to leftC + leftG.
  ## As expected, the number of significant peaks is much smaller.
  bed$leftC.vs.leftG.enrich.pval <- enrich.pval(bed$leftC, bed$leftG)
  bed$leftC.vs.leftG.enrich <- (bed$leftC.vs.leftG.enrich.pval < alpha)*1
  
  ## C-rich k-mer enrichment on the right of the origins: rightC relative
  ## to rightC + rightG
  bed$rightC.vs.rightG.enrich.pval <- enrich.pval(bed$rightC, bed$rightG)
  bed$rightC.vs.rightG.enrich <- (bed$rightC.vs.rightG.enrich.pval < alpha)*1
  
  ## Control: rightG relative to rightG + rightC.
  ## As expected, the number of significant peaks is much smaller.
  bed$rightG.vs.rightC.enrich.pval <- enrich.pval(bed$rightG, bed$rightC)
  bed$rightG.vs.rightC.enrich <- (bed$rightG.vs.rightC.enrich.pval < alpha)*1
  
  ## Contingency tables for interactive inspection (their value is not
  ## printed automatically when evaluated inside the for loop)
  table(bed[, c("leftG.vs.leftC.enrich", "rightC.vs.rightG.enrich")])
  table(bed[, c("leftC.vs.leftG.enrich", "rightG.vs.rightC.enrich")])
  ## Note: among 65019 peaks, 9518 are enriched in right C-rich motifs, 9682 in G-rich left motifs, but only 499 in both !
  
  ## Less interesting approach (but I tried): relative enrichment of leftG
  ## versus rightC, with a binomial test assuming that all occurrences were
  ## thrown at random between the left slice and the RC right slice.
  ## Less interesting because it does not allow us to detect simultaneous
  ## enrichment in left G-rich and right C-rich k-mers.
  bed$leftG.vs.rightC.enrich.pval <- enrich.pval(bed$leftG, bed$rightC)
  bed$leftG.vs.rightC.enrich <- (bed$leftG.vs.rightC.enrich.pval < alpha)*1
  bed$rightC.vs.leftG.enrich.pval <- enrich.pval(bed$rightC, bed$leftG)
  bed$rightC.vs.leftG.enrich <- (bed$rightC.vs.leftG.enrich.pval < alpha)*1
  
  table(bed[, c("leftG.vs.rightC.enrich", "rightC.vs.leftG.enrich")])
  table(bed[, c("leftG.vs.leftC.enrich", "leftG.vs.rightC.enrich")])
  
  ## Assign each feature to a class according to its enrichment status.
  ## The assignments are applied in order, later ones overriding earlier ones.
  bed$motif.class <- "twilight"
  bed$motif.class[(bed$leftG.vs.leftC.enrich==1)] <- "leftG"
  # sum(bed$leftG.vs.leftC.enrich)
  bed$motif.class[(bed$rightC.vs.rightG.enrich==1)] <- "rightC"
  bed$motif.class[(bed$leftG.vs.leftC.enrich & bed$rightC.vs.rightG.enrich)] <- "both"
  bed$motif.class[(bed$leftG.vs.leftC.enrich.pval >balanced.threshold) & (bed$rightC.vs.rightG.enrich.pval > balanced.threshold)] <- "balanced"
  table (bed$motif.class)
  
  ## Adapt strand orientation for features with significant enrichment of either leftG or rightC
  bed$strand <- rep(".", times=nrow(bed))
  bed[bed$motif.class=="leftG", "strand"] <- "+"
  bed[bed$motif.class=="rightC", "strand"] <- "-"
  table(bed$strand)
  
  ## Export a table with the number of peaks per G-rich/C-rich motif polarity class.
  ## The classes are declared as factor levels, so that a class without any
  ## peak is reported with a count of 0 instead of being dropped: the summary
  ## then has the same columns for every k-mer length, which rbind() requires.
  polarity.classes <- c("balanced", "both", "leftG", "rightC", "twilight")
  peaks.per.class <- data.frame(table(factor(bed$motif.class, levels=polarity.classes)))
  names(peaks.per.class) <- c("Polarity.class", "Peaks")
  row.names(peaks.per.class) <- peaks.per.class$Polarity.class
  peaks.per.class <- peaks.per.class[,rev(names(peaks.per.class))]
  write.table(peaks.per.class, 
              file=file.path(dir.occ, paste(sep="", file.prefix, "_peaks_per_class.tab")), 
              col.names = TRUE, row.names = FALSE, sep="\t", quote=FALSE)
  
  ## Collect a summary of peaks per class for each k-mer length
  ## (one row per k-mer length, one column per class). 
  ## We will then compute the p-value of joint leftG and rightC
  peaks.per.class.summary <- rbind(peaks.per.class.summary, 
                                   t(data.frame(Peaks=peaks.per.class$Peaks, row.names=peaks.per.class$Polarity.class)))
  row.names(peaks.per.class.summary)[nrow(peaks.per.class.summary)]<- pos.ol.suffix
  
  ## Color of each peak, according to its class
  bed$color <- class.colors[bed$motif.class]
  table(bed$color)
  
  ## Scatter plots comparing the G-rich and C-rich signals, with one dot
  ## per peak, colored according to its class. The axis labels are built
  ## from the current k-mer length (they were hard-coded as "6-mers",
  ## which was wrong for the other lengths).
  kmer.label <- paste(sep="", pos.ol, "-mers")
  
  ## Compare occurrences of left G and left C signals
  x11(width=8, height=8)
  plot(bed[, c("leftG", "leftC")], col=bed$color, 
       main=paste(sep="", pos.ol, "nt; leftG versus leftC occurrences"),
       xlab=paste("G-rich", kmer.label, "in left slice"),
       ylab=paste("C-rich", kmer.label, "in left slice"),
       panel.first=grid(col="#BBBBBB", lty="solid"))
  legend("topright", legend=names(class.colors), col=class.colors, pch=20, cex=1.2, bg="white", bty="o")
  export.plot(file.prefix=file.path(dir.occ, paste(sep="", file.prefix,"_leftG_vs_leftC_occ")), 
              export.formats=c("png", "pdf"), width=8, height=8)
  silence <- dev.off()
  
  ## Compare occurrences of right G and right C signals
  x11(width=8, height=8)
  plot(bed[, c("rightG", "rightC")], col=bed$color, 
       main=paste(sep="", pos.ol, "nt; rightG versus rightC occurrences"),
       xlab=paste("G-rich", kmer.label, "in right slice"),
       ylab=paste("C-rich", kmer.label, "in right slice"),
       panel.first=grid(col="#BBBBBB", lty="solid"))
  legend("topright", legend=names(class.colors), col=class.colors, pch=20, cex=1.2, bg="white", bty="o")
  export.plot(file.prefix=file.path(dir.occ, paste(sep="", file.prefix,"_rightG_vs_rightC_occ")), 
              export.formats=c("png", "pdf"), width=8, height=8)
  silence <- dev.off()
  
  ## Compare occurrences of left G and right C signals
  x11(width=8, height=8)
  plot(bed[, c("leftG", "rightC")], col=bed$color, 
       main=paste(sep="", pos.ol, "nt; leftG versus rightC occurrences"),
       xlab=paste("G-rich", kmer.label, "in left slice"),
       ylab=paste("C-rich", kmer.label, "in right slice"),
       panel.first=grid(col="#BBBBBB", lty="solid"))
  legend("topright", legend=names(class.colors), col=class.colors, pch=20, cex=1.2, bg="white", bty="o")
  export.plot(file.prefix=file.path(dir.occ, paste(sep="", file.prefix,"_leftG_vs_rightC_occ")), 
              export.formats=c("png", "pdf"), width=8, height=8)
  silence <- dev.off()
  
  ## Plot the p-values of leftG and rightC enrichment on logarithmic
  ## axes, with lines marking the alpha and "balanced" thresholds.
  ## The title names the right C-rich p-value shown on the Y axis
  ## (it previously read "rightG").
  x11(width=8, height=8)
  plot(bed$leftG.vs.leftC.enrich.pval, bed$rightC.vs.rightG.enrich.pval, col=bed$color, log="xy", xlim=c(1e-40,1), ylim=c(1e-40,1),
       main=paste(sep="", pos.ol, "nt; leftG versus rightC p-values"),
       xlab="left G-rich p-value",
       ylab="right C-rich p-value",
       panel.first=grid(equilogs = FALSE, col="#BBBBBB", lty="solid"))
  abline(h=alpha, col=class.colors["alpha"])
  abline(v=alpha, col=class.colors["alpha"])
  abline(h=balanced.threshold, col=class.colors["balanced"])
  abline(v=balanced.threshold, col=class.colors["balanced"])
  legend("bottomleft", legend=names(class.colors), col=class.colors, pch=20, cex=1.2, bg="white", bty="o")
  ## NOTE(review): unlike the other export.plot() calls, this prefix
  ## already ends with ".pdf"; kept as is to preserve the names of the
  ## output files -- confirm whether the extension should be removed.
  export.plot(file.prefix=file.path(dir.occ, paste(sep="", file.prefix, "_leftG_vs_rightC_Pvalue.pdf")), 
              export.formats=c("pdf", "png"), width=8, height=8)
  silence <- dev.off()
  
  
  
  
  # plot(bed$leftG.vs.rightC.enrich.pval, bed$rightC.vs.leftG.enrich.pval, col=bed$color, log="xy")
  
  ## P-value histograms (exploration only, drawn on the current device
  ## and not saved)
  if (plot.a.lot) {
    par(mfrow=c(4,1))
    hist(bed$leftG.vs.leftC.enrich.pval    , breaks=100, xlab="Binomial p-value", ylab="Number of peaks", main="left G-rich vs left C-rich", col=class.colors["leftG"])
    hist(bed$rightC.vs.rightG.enrich.pval, breaks=100, xlab="Binomial p-value", ylab="Number of peaks", main="right C-rich vs right G-rich", col=class.colors["rightC"])
    hist(bed$leftG.vs.rightC.enrich.pval  , breaks=100, xlab="Binomial p-value", ylab="Number of peaks", main="left G-rich vs right C-rich", col=class.colors["leftG"])
    hist(bed$rightC.vs.leftG.enrich.pval  , breaks=100, xlab="Binomial p-value", ylab="Number of peaks", main="right C-rich vs left G-rich", col=class.colors["rightC"])
    par(mfrow=c(1,1))
  }
  
  ## Draw an M-A plot (not very good).
  ## M is the log2 fold-change between leftG and rightC, A is half the
  ## log2 of their product. Peaks with a null count on either side get
  ## infinite or undefined values.
  bed$M <- log2(bed$leftG.vs.rightC)
  bed$A <- log2(bed[, "leftG"]*bed[, "rightC"])/2
  
  x11(width=8, height=8)
  plot(bed$A,bed$M, col=bed$color,
       main=paste("M-A plot of left G and right C",pos.ol.suffix,"signals"),
       xlab="A = log2(leftG * rightC)/2",
       ylab="M = log2(leftG / rightC) = log2-fold-change",
       panel.first=grid(lty="solid", col="#BBBBBB"))
  abline(h=0, col="black")
  ## NOTE(review): unlike most export.plot() calls, this prefix already
  ## ends with ".pdf"; kept as is to preserve the names of the output
  ## files -- confirm whether the extension should be removed.
  export.plot(file.prefix=file.path(dir.occ, paste(sep="", file.prefix, "_MA_plot.pdf")), 
              export.formats=c("pdf", "png"), width=8, height=8)
  silence <- dev.off()
  
  ## Plot M as a function of the sum of left G and right C occurrences.
  ## The X axis label now describes the plotted variable
  ## (bed$leftG.plus.rightC = leftG + rightC).
  x11(width=8, height=8)
  plot(bed$leftG.plus.rightC, bed$M, col=bed$color,
       xlab="left G + right C",
       ylab="M = log2(leftG / rightC) = log2-fold-change",
       panel.first=grid(col="#BBBBBB",lty="solid"))
  ## NOTE(review): same remark as above about the ".pdf" in the prefix.
  export.plot(file.prefix=file.path(dir.occ, paste(sep="", file.prefix, "_M-GC_plot.pdf")), 
              export.formats=c("png", "pdf"), width=8, height=8)
  silence <- dev.off()
  
  # ## Volcano plot
  # plot( bed$leftG.vs.leftC.enrich.pval, bed$M, log="")
  
  
  ## Export the bed table, annotated with the polarity statistics and
  ## with the occurrences per cluster and slice, as a tab-delimited file
  file.annotated.bed <- file.path(dir.occ, paste0(file.prefix, "_kmer_occ_annot.bed"))
  verbose(paste("Exporting annotated bed", file.annotated.bed), 1)
  to.export <- cbind(bed, occ.per.cluster.and.slice)
  write.table(to.export, file=file.annotated.bed, col.names = TRUE, row.names = FALSE, sep="\t", quote=FALSE)
  
  ################################################################
  ## Principal component analysis with occurrences per slice and cluster
  verbose(paste("Principal component maps for", pos.ol.suffix), 1)
  pr <- prcomp(occ.per.cluster.and.slice)
  
  ## Two panels side by side: components 1 and 2, then components 3 and 2
  ## (sharing the same Y axis), with peaks colored according to their class
  x11(width=12, height=6)
  par(mfrow=c(1,2))
  pc.panels <- list(list(components=1:2, title="Principal components 1 vs 2 for"),
                    list(components=3:2, title="Principal components 2 vs 3 for"))
  for (pc.panel in pc.panels) {
    plot(pr$x[,pc.panel$components], col=bed$color, panel.first=grid(col="#BBBBBB", lty="solid"),
         main=paste(pc.panel$title, pos.ol.suffix))
    abline(v=0)
    abline(h=0)
  }
  par(mfrow=c(1,1))
  pca.prefix <- file.path(dir.occ, paste0(file.prefix, "_PC1-3"))
  export.plot(file.prefix=pca.prefix, 
              export.formats=c("pdf","png"), width=12, height=6)
  silence <- dev.off()


#   ########
#   for (class in 1:4) {
#     class.bed.file <- file.path("analysis", "f4_rnase", "data", 
#                                paste(sep="", "ori_mes_class", class ,".bed"))
#     class.bed <- read.delim(class.bed.file)
#     peak.ids <- 
#   }
}



