####################################################################
##### DEPENDS ON THE PREVIOUS EXECUTION OF THE FOLLOWING SCRIPTS:
##### - file_paths.R
####################################################################

source('00_library_of_functions.R')
library(TxDb.Mmusculus.UCSC.mm9.knownGene)
library(compEpiTools)

## load paths to primary files
##
## According to the original author's notes, the RData file provides:
##   inputBamFile, mycChIPbamfiles, pol2ChIPbamfiles,
##   mycPeaksfiles, pol2Peaksfiles, pol2BroadPeaksfiles,
##   bampaths_4su_rep1, bampaths_4su_rep2, bampaths_4su_rep3,
##   bampaths_total_rep1, bampaths_total_rep2, bampaths_total_rep3
## Only the BAM and peak-file paths are used in this script.

load(file=file.path('..', 'data', 'paths_3T9_mycER.RData'))

## import the peak calls (BED files)
## NOTE(review): bedfiles2granges() is not defined in this file (presumably in
## 00_library_of_functions.R); its results are indexed per sample with [[i]]
## further down, so it is expected to return one set of ranges per file.

mycPeaks       <- bedfiles2granges(mycPeaksfiles)
pol2Peaks      <- bedfiles2granges(pol2Peaksfiles)
pol2BroadPeaks <- bedfiles2granges(pol2BroadPeaksfiles)

## tiles of the genome without features

# Transcripts extended by 10 kb on each side, made strand-agnostic and merged
# into non-overlapping ranges. trim() is applied right after the extension
# (presumably to clip ranges that run past the chromosome ends).
txRanges <- trim(transcripts(TxDb.Mmusculus.UCSC.mm9.knownGene)+10000)
strand(txRanges) <- '*'
txRanges <- reduce(txRanges)

# "Features" = extended transcripts plus every Myc / Pol2 / broad Pol2 peak set.
# Reduce() folds union() over the list left to right, exactly like an explicit
# loop starting from the first element, but does not rely on 2:length(...),
# which counts backwards (2, 1) when the list holds a single element.
featuresTmp <- c(list(txRanges), mycPeaks, pol2Peaks, pol2BroadPeaks)
features <- Reduce(union, featuresTmp)

# One range per chromosome covering it from position 1 to its full length.
# lapply() always returns a list, so do.call('c', ...) does not depend on
# sapply() declining to simplify; seq_along() is safe on empty input.
chrlen <- seqlengths(TxDb.Mmusculus.UCSC.mm9.knownGene)
chrgr <- do.call('c', lapply(seq_along(chrlen),
	function(i) GRanges(names(chrlen)[i], IRanges(start=1, end=chrlen[i]))))

# Genome minus features, cut into tiles of (at most) 10 kb; only tiles wider
# than 9 kb are kept so that all tiles have comparable size.
nofeatures <- setdiff(chrgr, features)
nofeaturetiles <- unlist(tile(nofeatures, width=10000))
nofeaturetiles <- nofeaturetiles[width(nofeaturetiles)>9000]

## calculate the scaling factors samples/input

# Fixed seed: the same 1000 feature-free tiles are drawn on every run.
set.seed(1)
nofeaturetilesample <- nofeaturetiles[sample.int(length(nofeaturetiles), 1000)]

# Coverage of the sampled tiles in the input and in every ChIP library.
# NOTE(review): Nnorm/Snorm are both switched off here, presumably yielding
# un-normalised counts -- confirm against the compEpiTools GRcoverage docs.
inputnofeature <- GRcoverage(nofeaturetilesample, inputBamFile,
	Nnorm=FALSE, Snorm=FALSE)
mycnofeature <- sapply(mycChIPbamfiles, function(bam)
	GRcoverage(nofeaturetilesample, bam, Nnorm=FALSE, Snorm=FALSE))
polnofeature <- sapply(pol2ChIPbamfiles, function(bam)
	GRcoverage(nofeaturetilesample, bam, Nnorm=FALSE, Snorm=FALSE))

# Per-sample scale = slope of a through-the-origin regression of the ChIP
# background signal on the input background signal (one value per column).
inputMycScales <- apply(mycnofeature, 2,
	function(chip) coef(lm(chip ~ 0 + inputnofeature)))
inputPol2Scales <- apply(polnofeature, 2,
	function(chip) coef(lm(chip ~ 0 + inputnofeature)))

## calculate scaling factors between samples

# Coverage of each sample's own Myc peak set, in the input library and in the
# matching ChIP library (sample i <-> mycPeaks[[i]] <-> mycChIPbamfiles[i]).
# lapply() is used on purpose: sapply() would collapse the result into a
# matrix whenever all peak sets happen to have the same number of ranges (or
# there is a single sample), and the [[i]] lookups below would then pick one
# scalar instead of the i-th sample's coverage vector.
inputMycFeature <- lapply(seq_along(mycPeaks), function(i)
	GRcoverage(mycPeaks[[i]], inputBamFile, Nnorm=FALSE, Snorm=FALSE))
mycSamplesFeature <- lapply(seq_along(mycPeaks), function(i)
	GRcoverage(mycPeaks[[i]], mycChIPbamfiles[i], Nnorm=FALSE, Snorm=FALSE))

# Background-subtracted ("NR") signal: ChIP minus input scaled by the
# sample-specific input scaling factor.
mycSamplesFeatureNR <- lapply(seq_along(mycPeaks), function(i)
	mycSamplesFeature[[i]]-inputMycFeature[[i]]*inputMycScales[i])

# Total background-subtracted signal per sample, expressed relative to the
# median sample; used later to bring samples onto a common scale.
mycSamplesFeatCoverageNR <- vapply(mycSamplesFeatureNR, sum, numeric(1))
mycSampleNorm <- mycSamplesFeatCoverageNR/median(mycSamplesFeatCoverageNR)

# Per-sample Pol2 peak set: narrow and broad peak calls merged.
allPol2Peaks <- lapply(seq_along(pol2Peaks), function(i)
	union(pol2Peaks[[i]], pol2BroadPeaks[[i]]))

# txRanges is rebuilt here with a 5 kb extension; this overwrites the 10 kb
# version used for the feature-free tiles.
txRanges <- trim(transcripts(TxDb.Mmusculus.UCSC.mm9.knownGene)+5000)
strand(txRanges) <- '*'
txRanges <- reduce(txRanges)

# Pol2 "features" of a sample = its peaks plus all extended transcripts.
allPol2Features <- lapply(seq_along(allPol2Peaks), function(i)
	union(allPol2Peaks[[i]], txRanges))

# Coverage of each sample's feature set in the input and in the matching ChIP
# library. lapply() guarantees one vector per sample; sapply() would return a
# matrix if all feature sets had equal length (or with a single sample), and
# the [[i]] lookups below would then silently pick single values.
inputPol2Feature <- lapply(seq_along(allPol2Features), function(i)
	GRcoverage(allPol2Features[[i]], inputBamFile, Nnorm=FALSE, Snorm=FALSE))
pol2SamplesFeature <- lapply(seq_along(allPol2Features), function(i)
	GRcoverage(allPol2Features[[i]], pol2ChIPbamfiles[i], Nnorm=FALSE, Snorm=FALSE))

# Background-subtracted ("NR") signal: ChIP minus scaled input.
pol2SamplesFeatureNR <- lapply(seq_along(allPol2Features), function(i)
	pol2SamplesFeature[[i]]-inputPol2Feature[[i]]*inputPol2Scales[i])

# Total background-subtracted signal per sample relative to the median sample.
pol2SamplesFeatCoverageNR <- vapply(pol2SamplesFeatureNR, sum, numeric(1))
pol2SampleNorm <- pol2SamplesFeatCoverageNR/median(pol2SamplesFeatCoverageNR)

## calculate enrichment over genomic features (TSS, GB, TES)
##
## NOTE(review): make_annotation() is not defined in this file (presumably in
## 00_library_of_functions.R). All coverages below are computed with
## Snorm=TRUE, unlike the scaling-factor section which uses Snorm=FALSE.

# --- Myc: TSS regions with promoter_limits of 2000/2000 ---
mycTSSgr <- make_annotation('genome', annotationFeature='TSS',
	promoter_limits=c(2000, 2000),
	txdb=TxDb.Mmusculus.UCSC.mm9.knownGene)
mycTSSinput <- GRcoverage(mycTSSgr, inputBamFile, Nnorm=FALSE, Snorm=TRUE)
mycTSS <- sapply(mycChIPbamfiles, function(bam)
	GRcoverage(mycTSSgr, bam, Nnorm=FALSE, Snorm=TRUE))

# --- Pol2: TSS, gene body (GB) and termination site (TES) regions ---
# NOTE(review): no txdb= is passed for the Pol2 annotations, unlike the Myc
# one above; this relies on make_annotation()'s default -- confirm it is mm9.
polTSSgr <- make_annotation('genome', annotationFeature='TSS',
	promoter_limits=c(50, 700), termination_limits=c(1000, 4000))
polGBgr <- make_annotation('genome', annotationFeature='GB',
	promoter_limits=c(50, 700), termination_limits=c(1000, 4000))
polTESgr <- make_annotation('genome', annotationFeature='TES',
	promoter_limits=c(50, 700), termination_limits=c(1000, 4000))

# ChIP coverage: one column per Pol2 library
pol2TSS <- sapply(pol2ChIPbamfiles, function(bam)
	GRcoverage(polTSSgr, bam, Nnorm=FALSE, Snorm=TRUE))
pol2GB <- sapply(pol2ChIPbamfiles, function(bam)
	GRcoverage(polGBgr, bam, Nnorm=FALSE, Snorm=TRUE))
pol2TES <- sapply(pol2ChIPbamfiles, function(bam)
	GRcoverage(polTESgr, bam, Nnorm=FALSE, Snorm=TRUE))

# input coverage over the same regions
polTSSinput <- GRcoverage(polTSSgr, inputBamFile, Nnorm=FALSE, Snorm=TRUE)
polGBinput <- GRcoverage(polGBgr, inputBamFile, Nnorm=FALSE, Snorm=TRUE)
polTESinput <- GRcoverage(polTESgr, inputBamFile, Nnorm=FALSE, Snorm=TRUE)

## normalize
##
## For every feature matrix (rows = regions, columns = samples):
##   1. subtract the input signal scaled by each sample's input scaling factor
##   2. divide each column by the sample's between-sample normalisation factor
## The double transpose makes the division run column-wise.

mycTSSn <- local({
	background <- sapply(inputMycScales, function(k) k*mycTSSinput)
	t(t(mycTSS-background)/mycSampleNorm)
})
polTSSn <- local({
	background <- sapply(inputPol2Scales, function(k) k*polTSSinput)
	t(t(pol2TSS-background)/pol2SampleNorm)
})
polGBn <- local({
	background <- sapply(inputPol2Scales, function(k) k*polGBinput)
	t(t(pol2GB-background)/pol2SampleNorm)
})
polTESn <- local({
	background <- sapply(inputPol2Scales, function(k) k*polTESinput)
	t(t(pol2TES-background)/pol2SampleNorm)
})

# label rows with the names of the annotated regions
rownames(mycTSSn) <- names(mycTSSgr)
rownames(polTSSn) <- names(polTSSgr)
rownames(polGBn)  <- names(polGBgr)
rownames(polTESn) <- names(polTESgr)

## call peaks on genomic features (TSS, GB, TES)
##
## 0/1 indicator per region and sample: 1 when the region overlaps at least
## one peak of that sample. Columns are labelled with the names of the
## corresponding BAM file vectors.

mycTSSPeaks <- sapply(mycPeaks, function(peaks)
	(countOverlaps(mycTSSgr, peaks)>0)*1)
colnames(mycTSSPeaks) <- names(mycChIPbamfiles)

polTSSPeaks <- sapply(allPol2Peaks, function(peaks)
	(countOverlaps(polTSSgr, peaks)>0)*1)
colnames(polTSSPeaks) <- names(pol2ChIPbamfiles)

polGBPeaks <- sapply(allPol2Peaks, function(peaks)
	(countOverlaps(polGBgr, peaks)>0)*1)
colnames(polGBPeaks) <- names(pol2ChIPbamfiles)

polTESPeaks <- sapply(allPol2Peaks, function(peaks)
	(countOverlaps(polTESgr, peaks)>0)*1)
colnames(polTESPeaks) <- names(pol2ChIPbamfiles)

## saturate low signal based on peak calling
##
## Floor = one tenth of the 2.5th percentile of the normalised signal observed
## in regions that overlap a peak; every value below the floor is raised to it.
## minval is reused for each matrix in turn.

minval <- quantile(mycTSSn[mycTSSPeaks==1], probs=0.025)/10
mycTSSn <- replace(mycTSSn, mycTSSn<minval, minval)

minval <- quantile(polTSSn[polTSSPeaks==1], probs=0.025)/10
polTSSn <- replace(polTSSn, polTSSn<minval, minval)

minval <- quantile(polGBn[polGBPeaks==1], probs=0.025)/10
polGBn <- replace(polGBn, polGBn<minval, minval)

minval <- quantile(polTESn[polTESPeaks==1], probs=0.025)/10
polTESn <- replace(polTESn, polTESn<minval, minval)

## save
##
## Objects written to ../data/ChIPseq_data.RData:
##   - annotated regions
##   - input scaling and between-sample normalisation factors
##   - peak indicator matrices
##   - normalised (and floored) signal matrices

save(
	list=c(
		'mycTSSgr', 'polTSSgr', 'polGBgr', 'polTESgr',
		'inputMycScales', 'mycSampleNorm', 'inputPol2Scales', 'pol2SampleNorm',
		'mycTSSPeaks', 'polTSSPeaks', 'polGBPeaks', 'polTESPeaks',
		'mycTSSn', 'polTSSn', 'polGBn', 'polTESn'
	),
	file=file.path('..', 'data', 'ChIPseq_data.RData')
)
