################################################################
## Analyze associations between peaks and various genomic features
## (CpG, G4, ...).

## Root directory of the project.
## THIS SHOULD BE ADAPTED TO THE LOCAL MAIN DIRECTORY
dir.main <- '~/replication_origins/'

## Peak association table: comma-separated values with a header line.
## Alternative input file, kept for reference:
## file.peak.assoc <- file.path(dir.main, 'analysis/peaks_fev2013/mes_sw_k20_ordered3_all_g4cpgtimingannot.merged.bin2')
file.peak.assoc <- file.path(
  dir.main,
  'analysis/peaks_fev2013/mes_sw_k20_ordered3_all_g4cpgtimingannot.parsed.csv'
)
peak.assoc <- read.table(
  file = file.peak.assoc,
  header = TRUE,
  sep = ',',
  quote = '"'
)

## Show the column names as they were read from the file
names(peak.assoc)

## FIX A BUG: COLUMN HEADERS WERE SWAPPED BETWEEN SWEMBL AND BINS2 FILE.
## The order of the two substitutions matters: "length" has to become
## "counts" first (inner call), and only then can "reads" become "length"
## (outer call); the other way round both headers would end up as "counts".
names(peak.assoc) <- sub(
  "reads", "length",
  sub("length", "counts", names(peak.assoc))
)

## Compute density: row-wise ratio of the counts column to the length column
peak.assoc[, "density"] <- peak.assoc[, "counts"] / peak.assoc[, "length"]

## Regroup clusters: map each order.k value to a cluster label.
##
## Names of the vector are the order.k values (as character strings), values
## are the cluster labels. Labels containing "R" are the ones whose bin
## profiles get reversed further down in this script.
##
## Every order.k value from 1 to 20 must appear exactly once as a name:
## a duplicated name would make a by-name lookup return the first match only,
## and a missing name would return NA.
##
## NOTE: other parts of this script index this vector directly with order.k.
## If order.k is numeric that indexing is positional, so the entries must
## also stay sorted from "1" to "20".
k2cluster <- c("1"="1",
               "2"="2",
               "3"="3",
               "4"="4L",
               "5"="5L",
               "6"="6L",
               "7"="7L",
               "8"="8L",
               "9"="9L",
               "10"="10L",
               "11"="11",
               "12"="10R",
               "13"="9R",
               "14"="8R",
               "15"="7R",
               "16"="6R",
               "17"="5R",
               "18"="4R",
               "19"="12",
               "20"="12"
               )

## Cluster labels without the order.k names (one entry per order.k value,
## so the label "12" appears twice)
cluster.names <- as.vector(k2cluster)
## Label every row of the peak table with the cluster found for its order.k
peak.assoc[, "cluster"] <- k2cluster[peak.assoc[["order.k"]]]

## Regroup clusters: map each order.k value to a group label.
##
## Names of the vector are the order.k values (as character strings), values
## are the group labels. Several order.k values share one group label
## (e.g. "4" and "18" both map to "4LR").
##
## Every order.k value from 1 to 20 must appear exactly once as a name:
## a duplicated name would make a by-name lookup return the first match only,
## and a missing name would return NA.
##
## NOTE: other parts of this script index this vector directly with order.k.
## If order.k is numeric that indexing is positional, so the entries must
## also stay sorted from "1" to "20".
k2group <- c("1"="1",
             "2"="2",
             "3"="3",
             "4"="4LR",
             "5"="5LR",
             "6"="6LR",
             "7"="7LR",
             "8"="8LR",
             "9"="9LR",
             "10"="10LR",
             "11"="11",
             "12"="10LR",
             "13"="9LR",
             "14"="8LR",
             "15"="7LR",
             "16"="6LR",
             "17"="5LR",
             "18"="4LR",
             "19"="12",
             "20"="12"
             )
## Label every row of the peak table with the group found for its order.k
peak.assoc[, "group"] <- k2group[peak.assoc[["order.k"]]]

## Order for displaying groups on maps
group.order <- c(
  "1", "2", "3",
  "4LR", "5LR", "6LR", "7LR", "8LR", "9LR", "10LR",
  "11", "12"
)

## Check the cluster -> group conversion by cross-tabulating the three
## labellings pairwise: each row of a table should have a single
## non-zero cell.
table(peak.assoc[["order.k"]], peak.assoc[["cluster"]])
table(peak.assoc[["order.k"]], peak.assoc[["group"]])
table(peak.assoc[["cluster"]], peak.assoc[["group"]])

## Contingency table counting the peaks for each combination of the
## cpg column (rows) and the g4 column (columns)
table(peak.assoc[["cpg"]], peak.assoc[["g4"]])

## Two-level label telling whether each peak overlaps at least one G4:
## start with "G4-" everywhere, then switch to "G4+" where g4 >= 1
peak.assoc[, "overlaps.g4"] <- "G4-"
peak.assoc[peak.assoc[["g4"]] >= 1, "overlaps.g4"] <- "G4+"

## Same contingency table, with the G4+/G4- label instead of the raw g4 value
table(peak.assoc[["cpg"]], peak.assoc[["overlaps.g4"]])

## Summary statistics by groups (CpG classes, G4+ versus G4-).
## The INDICES expressions are kept exactly as written because by() uses
## their deparsed text as the label of the printed output.
by(data = peak.assoc[, "counts"],
   INDICES = peak.assoc[, "overlaps.g4"],
   FUN = summary)
by(data = peak.assoc[, "counts"],
   INDICES = peak.assoc[, "cpg"],
   FUN = summary)
by(data = peak.assoc[, "density"],
   INDICES = peak.assoc[, "overlaps.g4"],
   FUN = summary)

## Box plot of the density, one box per G4+/G4- class
boxplot(density ~ overlaps.g4, data = peak.assoc)



## Compute the distribution of early/late origins across groups:
## count the peaks for each timing value (rows) and group (columns)
timing.vs.cluster.occ <- table(peak.assoc$timing, peak.assoc$group)

## Put the group columns in display order
timing.vs.cluster.occ <- timing.vs.cluster.occ[, group.order]

## Relative frequencies: repartition of each timing group (Early/Late/None)
## among the clusters. The vector of row totals has one entry per row, so it
## is recycled down the columns and every cell is divided by the total of
## its own row.
timing.vs.cluster.freq <- timing.vs.cluster.occ / rowSums(timing.vs.cluster.occ)

## Bar colour and title label for each timing value
time.colors <- c("L"="#0000BB",
                 "E"="#00BBFF",
                 "N"="#BBBBBB"
                 )
time.labels <- c("L"="Late", "E"="Early", "N"="None")

## One bar plot per timing value, stacked vertically, all on the same
## y scale so that the panels can be compared directly
x11(height=12, width=8)
par(mfrow=c(3,1))
## seq_len() yields an empty sequence for a table without rows (1:0 would
## run the body twice), and the index is called i rather than t so that no
## global variable named like base::t() is left behind.
for (i in seq_len(nrow(timing.vs.cluster.freq))) {
  group <- rownames(timing.vs.cluster.freq)[i]
  barplot(timing.vs.cluster.freq[i, ],
          main=paste(time.labels[group], "; cluster | timing"),
          ylab="Frequency",
          col=time.colors[group],
          ylim=c(0, max(timing.vs.cluster.freq)),
          xlab="Group")
}
## Back to a single panel per page
par(mfrow=c(1,1))



################################################################
## Reverse the "R" clusters

## Load the bins file (no header line)
file.bins <- file.path(
  dir.main,
  "analysis/peaks_fev2013/mes_hm/7kb/q99/mes_swembl_002_100_7kb_ordered3_q99_rownames.txt"
)
bins <- read.table(file = file.bins, header = FALSE)

## Width of one bin, used to build the labels of the bin columns
bin.width <- 100

## Compute column names.
## The table has 5 leading columns (tmp.nb, chr, start, end, summit) and
## 2 trailing columns (k.cluster, order.k); everything in between is one
## column per bin.
dim(bins)
bin.nb <- ncol(bins) - 7
read.columns <- 5 + (1:bin.nb)

## Bin columns are labelled i * bin.width + bin.offset for i = 1..bin.nb,
## which makes the labels symmetric around 0
bin.offset <- -bin.width * ((bin.nb + 1) / 2)
names(bins) <- c(
  "tmp.nb", "chr", "start", "end", "summit",
  (1:bin.nb) * bin.width + bin.offset,
  "k.cluster", "order.k"
)

## Quick look at the result
bins[1:10, 1:8]
names(bins)

## Label each row of the bins table with its cluster and its group,
## both looked up from order.k
bins[, "cluster"] <- k2cluster[bins[["order.k"]]]
bins[, "group"] <- k2group[bins[["order.k"]]]

table(bins[["cluster"]]) ## Count peaks per cluster
table(bins[["group"]]) ## Count peaks per group

## Named logical vector: TRUE for the cluster labels that contain "R"
clusters.to.flip <- setNames(grepl("R", cluster.names), cluster.names)

## Mark the rows whose cluster has to be flipped, and count them
bins[, "flipped"] <- clusters.to.flip[bins[["cluster"]]]
table(bins[["flipped"]])

## bins.bk <- bins

## For the marked rows, write the bin values back in reverse column order
bins[bins[["flipped"]], read.columns] <- bins[bins[["flipped"]], rev(read.columns)]

library(RColorBrewer)
## 20 colours interpolated from the 9-colour "PuBuGn" ColorBrewer palette
col1 <- colorRampPalette(colors = brewer.pal(n = 9, name = "PuBuGn"))(20)

## Draw the bin values as an image; the matrix is transposed before plotting
image(t(as.matrix(bins[, read.columns])), col = col1)

