####################################################################
##### DEPENDS ON THE PREVIOUS EXECUTION OF THE FOLLOWING SCRIPTS:
##### - file_paths.R
####################################################################

library(devtools)
load_all('INSPEcT')
source('00_library_of_functions.R')
library(TxDb.Mmusculus.UCSC.mm9.knownGene)
library(RUVSeq)

#########################################
## quantify exonic and intronic signal ########
## from BAM files  ########################
######################

## Load the file paths used by this script (see the dependency on
## file_paths.R stated in the header). According to the original note,
## the RData file provides:
##   inputBamFile, mycChIPbamfiles, pol2ChIPbamfiles,
##   mycPeaksfiles, pol2Peaksfiles, pol2BroadPeaksfiles,
##   bampaths_4su_rep1, bampaths_4su_rep2, bampaths_4su_rep3,
##   bampaths_total_rep1, bampaths_total_rep2, bampaths_total_rep3
load(file = file.path('..', 'data', 'paths_3T9_mycER.RData'))

## Each replicate is processed by makeRPKMs() from its 4sU and total BAM
## paths and written to its own .rds file right after it is computed.

## replicate 1
rpkmsAndCounts_rep1 <- makeRPKMs(TxDb.Mmusculus.UCSC.mm9.knownGene,
	bampaths_4su_rep1, bampaths_total_rep1)
saveRDS(
	rpkmsAndCounts_rep1,
	file = file.path('..', 'data', '3T9MycER_rpkmsAndCounts_rep1.rds')
)

## replicate 2
rpkmsAndCounts_rep2 <- makeRPKMs(TxDb.Mmusculus.UCSC.mm9.knownGene,
	bampaths_4su_rep2, bampaths_total_rep2)
saveRDS(
	rpkmsAndCounts_rep2,
	file = file.path('..', 'data', '3T9MycER_rpkmsAndCounts_rep2.rds')
)

## replicate 3
rpkmsAndCounts_rep3 <- makeRPKMs(TxDb.Mmusculus.UCSC.mm9.knownGene,
	bampaths_4su_rep3, bampaths_total_rep3)
saveRDS(
	rpkmsAndCounts_rep3,
	file = file.path('..', 'data', '3T9MycER_rpkmsAndCounts_rep3.rds')
)

## make reports of total-RNA and labeled-RNA libraries
## (the per-replicate `stat` tables are bound by column and transposed)

total_stats <- t(cbind(
	rpkmsAndCounts_rep1$counts$total$stat,
	rpkmsAndCounts_rep2$counts$total$stat,
	rpkmsAndCounts_rep3$counts$total$stat
	))

foursu_stats <- t(cbind(
	rpkmsAndCounts_rep1$counts$foursu$stat,
	rpkmsAndCounts_rep2$counts$foursu$stat,
	rpkmsAndCounts_rep3$counts$foursu$stat
	))

## Written to ../data like every other file this script reads or writes;
## the path used to be 'data' relative to the working directory.
## NOTE(review): confirm that no downstream script expects the old location.
save(total_stats, foursu_stats, 
	file=file.path('..','data','rna_libraries_stats.RData'))

###########################################
## normalize libraries with RUVSeq ############
##########################################

## Read the per-replicate makeRPKMs() results back from the .rds files,
## so this part can be run without repeating the quantification.
rpkmsAndCounts_rep1 <- readRDS(file.path('..', 'data', '3T9MycER_rpkmsAndCounts_rep1.rds'))
rpkmsAndCounts_rep2 <- readRDS(file.path('..', 'data', '3T9MycER_rpkmsAndCounts_rep2.rds'))
rpkmsAndCounts_rep3 <- readRDS(file.path('..', 'data', '3T9MycER_rpkmsAndCounts_rep3.rds'))

## time points of one time course
tpts <- c(0, 1/6, 1/3, 1/2, 1, 1.5, 2, 4, 8, 12, 16)

## condition factor: time points rounded to two decimals, repeated once
## per replicate (three time courses)
x <- as.factor(rep(round(tpts, 2), times = 3))

############ 4sU-seq libraries

## exonic and intronic read counts of the 4sU libraries, one object per
## replicate (tcA, tcB, tcC)
tcA_4sU_exons <- rpkmsAndCounts_rep1$counts$foursu$exonCounts
tcB_4sU_exons <- rpkmsAndCounts_rep2$counts$foursu$exonCounts
tcC_4sU_exons <- rpkmsAndCounts_rep3$counts$foursu$exonCounts
tcA_4sU_introns <- rpkmsAndCounts_rep1$counts$foursu$intronCounts
tcB_4sU_introns <- rpkmsAndCounts_rep2$counts$foursu$intronCounts
tcC_4sU_introns <- rpkmsAndCounts_rep3$counts$foursu$intronCounts

## tag every column with its replicate; the intron matrices reuse the
## exon labels so that both feature types share the same column names
colnames(tcA_4sU_exons) <- paste(colnames(tcA_4sU_exons), 'tcA', sep='_')
colnames(tcB_4sU_exons) <- paste(colnames(tcB_4sU_exons), 'tcB', sep='_')
colnames(tcC_4sU_exons) <- paste(colnames(tcC_4sU_exons), 'tcC', sep='_')
colnames(tcA_4sU_introns) <- colnames(tcA_4sU_exons)
colnames(tcB_4sU_introns) <- colnames(tcB_4sU_exons)
colnames(tcC_4sU_introns) <- colnames(tcC_4sU_exons)

## the three replicates side by side
labeledGenesExons <- cbind(tcA_4sU_exons, tcB_4sU_exons, tcC_4sU_exons)
labeledGenesIntrons <- cbind(tcA_4sU_introns, tcB_4sU_introns, tcC_4sU_introns)

## keep only genes that have both features (and that are expressed):
## a feature passes when more than one third of the libraries have more
## than 5 reads
filter <- apply(labeledGenesExons, 1,
	function(counts) length(counts[counts>5]) > ncol(labeledGenesExons)/3)
## Index the row names directly: subsetting the matrix first collapses it
## to a plain vector (rownames() then returns NULL) when one gene passes.
labeledGenesExonsExpressed <- rownames(labeledGenesExons)[filter]
filter <- apply(labeledGenesIntrons, 1,
	function(counts) length(counts[counts>5]) > ncol(labeledGenesIntrons)/3)
labeledGenesIntronsExpressed <- rownames(labeledGenesIntrons)[filter]
expressedLabeledGenes <- intersect(labeledGenesExonsExpressed, labeledGenesIntronsExpressed)
## drop = FALSE keeps the two-dimensional shape even for a single gene
labeledGenesExons <- labeledGenesExons[expressedLabeledGenes, , drop = FALSE]
labeledGenesIntrons <- labeledGenesIntrons[expressedLabeledGenes, , drop = FALSE]

## normalize using upper quartile the whole dataset ( introns and exons )

## Exon and intron rows are stacked into one matrix; the 'e' / 'i'
## prefixes keep the row names distinct and are used below to split the
## normalized set again.
rownames(labeledGenesExons) <- paste0('e', rownames(labeledGenesExons))
rownames(labeledGenesIntrons) <- paste0('i', rownames(labeledGenesIntrons))
labeledGenesExInt <- rbind(labeledGenesExons, labeledGenesIntrons)
setExInt <- betweenLaneNormalization(
	newSeqExpressionSet(
		as.matrix(labeledGenesExInt),
		phenoData = data.frame(x, row.names = colnames(labeledGenesExInt))
	),
	which = "upper"
)

## split back into exon and intron sets and strip the prefixes
setEx <- setExInt[grep('^e', featureNames(setExInt))]
featureNames(setEx) <- sub('^e', '', featureNames(setEx))
setInt <- setExInt[grep('^i', featureNames(setExInt))]
featureNames(setInt) <- sub('^i', '', featureNames(setInt))

## the sum of the read counts at the gene level is supposed not to change
## and build the glm on them
labeledGenes <- assayData(setEx)$normalizedCounts +
	assayData(setInt)$normalizedCounts
set <- newSeqExpressionSet(
	as.matrix(labeledGenes),
	phenoData = data.frame(x, row.names = colnames(labeledGenes))
)
design <- model.matrix(~x, data = pData(set))
y <- DGEList(counts = counts(set), group = x)
y <- estimateGLMCommonDisp(y, design)
y <- estimateGLMTagwiseDisp(y, design)
fit <- glmFit(y, design)
## deviance residuals of the gene-level fit, passed to RUVr below
res <- residuals(fit, type = "deviance")

## apply the model to exons and introns
## (same control genes, k and residuals for both feature types)

setExNormLabeled <- RUVr(setEx, expressedLabeledGenes, k = 1, res)
setIntNormLabeled <- RUVr(setInt, expressedLabeledGenes, k = 1, res)

############ RNA-seq libraries

## exonic and intronic read counts of the total RNA libraries, one object
## per replicate (tcA, tcB, tcC)
tcA_total_exons <- rpkmsAndCounts_rep1$counts$total$exonCounts
tcB_total_exons <- rpkmsAndCounts_rep2$counts$total$exonCounts
tcC_total_exons <- rpkmsAndCounts_rep3$counts$total$exonCounts
tcA_total_introns <- rpkmsAndCounts_rep1$counts$total$intronCounts
tcB_total_introns <- rpkmsAndCounts_rep2$counts$total$intronCounts
tcC_total_introns <- rpkmsAndCounts_rep3$counts$total$intronCounts

## tag every column with its replicate; the intron matrices reuse the
## exon labels so that both feature types share the same column names
colnames(tcA_total_exons) <- paste(colnames(tcA_total_exons), 'tcA', sep='_')
colnames(tcB_total_exons) <- paste(colnames(tcB_total_exons), 'tcB', sep='_')
colnames(tcC_total_exons) <- paste(colnames(tcC_total_exons), 'tcC', sep='_')
colnames(tcA_total_introns) <- colnames(tcA_total_exons)
colnames(tcB_total_introns) <- colnames(tcB_total_exons)
colnames(tcC_total_introns) <- colnames(tcC_total_exons)

## the three replicates side by side
totalGenesExons <- cbind(tcA_total_exons, tcB_total_exons, tcC_total_exons)
totalGenesIntrons <- cbind(tcA_total_introns, tcB_total_introns, tcC_total_introns)

## keep only genes that have both features (and that are expressed):
## a feature passes when more than one third of the libraries have more
## than 5 reads. The number of libraries is taken from the total-RNA
## matrices themselves, not from the 4sU matrix, so this section does not
## silently depend on both data sets having the same number of columns.
filter <- apply(totalGenesExons, 1,
	function(counts) length(counts[counts>5]) > ncol(totalGenesExons)/3)
## Index the row names directly: subsetting the matrix first collapses it
## to a plain vector (rownames() then returns NULL) when one gene passes.
totalGenesExonsExpressed <- rownames(totalGenesExons)[filter]
filter <- apply(totalGenesIntrons, 1,
	function(counts) length(counts[counts>5]) > ncol(totalGenesIntrons)/3)
totalGenesIntronsExpressed <- rownames(totalGenesIntrons)[filter]
expressedTotalGenes <- intersect(totalGenesExonsExpressed, totalGenesIntronsExpressed)
## drop = FALSE keeps the two-dimensional shape even for a single gene
totalGenesExons <- totalGenesExons[expressedTotalGenes, , drop = FALSE]
totalGenesIntrons <- totalGenesIntrons[expressedTotalGenes, , drop = FALSE]

## normalize using upper quartile the whole dataset ( introns and exons )

## Exon and intron rows are stacked into one matrix; the 'e' / 'i'
## prefixes keep the row names distinct and are used below to split the
## normalized set again.
rownames(totalGenesExons) <- paste0('e', rownames(totalGenesExons))
rownames(totalGenesIntrons) <- paste0('i', rownames(totalGenesIntrons))
totalGenesExInt <- rbind(totalGenesExons, totalGenesIntrons)
setExInt <- betweenLaneNormalization(
	newSeqExpressionSet(
		as.matrix(totalGenesExInt),
		phenoData = data.frame(x, row.names = colnames(totalGenesExInt))
	),
	which = "upper"
)

## split back into exon and intron sets and strip the prefixes
setEx <- setExInt[grep('^e', featureNames(setExInt))]
featureNames(setEx) <- sub('^e', '', featureNames(setEx))
setInt <- setExInt[grep('^i', featureNames(setExInt))]
featureNames(setInt) <- sub('^i', '', featureNames(setInt))

## the sum of the read counts at the gene level is supposed not to change
## and build the glm on them
totalGenes <- assayData(setEx)$normalizedCounts +
	assayData(setInt)$normalizedCounts
set <- newSeqExpressionSet(
	as.matrix(totalGenes),
	phenoData = data.frame(x, row.names = colnames(totalGenes))
)
design <- model.matrix(~x, data = pData(set))
y <- DGEList(counts = counts(set), group = x)
y <- estimateGLMCommonDisp(y, design)
y <- estimateGLMTagwiseDisp(y, design)
fit <- glmFit(y, design)
## deviance residuals of the gene-level fit, passed to RUVr below
res <- residuals(fit, type = "deviance")

## apply the model to exons and introns
## (same control genes, k and residuals for both feature types)

setExNormTotal <- RUVr(setEx, expressedTotalGenes, k = 1, res)
setIntNormTotal <- RUVr(setInt, expressedTotalGenes, k = 1, res)

############ merge 4sU-seq and RNA-seq libraries

## normalized counts stored in the RUVr outputs of the two data sets
labeledCountsExons <- assayData(setExNormLabeled)$normalizedCounts
labeledCountsIntrons <- assayData(setIntNormLabeled)$normalizedCounts
totalCountsExons <- assayData(setExNormTotal)$normalizedCounts
totalCountsIntrons <- assayData(setIntNormTotal)$normalizedCounts

## genes present in both the labeled and the total data set
labeledFeatures <- rownames(labeledCountsExons)
totalFeatures <- rownames(totalCountsExons)
cGenes <- intersect(labeledFeatures, totalFeatures)

## restrict the four matrices to the common genes, in the same row order
labeledCountsExons <- labeledCountsExons[cGenes, ]
labeledCountsIntrons <- labeledCountsIntrons[cGenes, ]
totalCountsExons <- totalCountsExons[cGenes, ]
totalCountsIntrons <- totalCountsIntrons[cGenes, ]

save(
	labeledCountsExons, labeledCountsIntrons,
	totalCountsExons, totalCountsIntrons,
	file = file.path('..', 'data', '3T9MycER_counts.RData')
)

## make deseq datasets

library(DESeq2)

## Both data sets use a one-factor design on the time point; the factor
## is the time series (2 significant digits) repeated for the 3 replicates.

## total RNA, exonic counts
tCE <- DESeq(
	DESeqDataSetFromMatrix(
		countData = totalCountsExons,
		colData = DataFrame(tpts = factor(rep(signif(tpts, 2), 3))),
		design = ~tpts
	)
)

## 4sU-labeled RNA, exonic counts
lCE <- DESeq(
	DESeqDataSetFromMatrix(
		countData = labeledCountsExons,
		colData = DataFrame(tpts = factor(rep(signif(tpts, 2), 3))),
		design = ~tpts
	)
)

save(tCE, lCE, file = file.path('..', 'data', '3T9MycER_DESeq2.RData'))

############ transform counts into (pseudo)rpkms

# library size: median, over the libraries, of the summed exonic and
# intronic counts

labeledCounts <- rbind(labeledCountsExons, labeledCountsIntrons)
totalCounts <- rbind(totalCountsExons, totalCountsIntrons)
pseudoLibSizeTotal <- median(colSums(totalCounts))
pseudoLibSizeLabeled <- median(colSums(labeledCounts))

# featureWidth: per-gene sum of the widths in the replicate 1 annotation,
# restricted to (and ordered as) the common genes

exonWidth <- sapply(width(rpkmsAndCounts_rep1$annotation$exon), sum)[cGenes]
intronsWidth <- sapply(width(rpkmsAndCounts_rep1$annotation$intron), sum)[cGenes]

# rpkms: counts / feature width / pseudo library size * 10^9
# (the width vector has one entry per row, so it is recycled down the
# columns; the library size is a single number)
labeledRpkmsExons <- labeledCountsExons / exonWidth / pseudoLibSizeLabeled * 10^9
labeledRpkmsIntrons <- labeledCountsIntrons / intronsWidth / pseudoLibSizeLabeled * 10^9
totalRpkmsExons <- totalCountsExons / exonWidth / pseudoLibSizeTotal * 10^9
totalRpkmsIntrons <- totalCountsIntrons / intronsWidth / pseudoLibSizeTotal * 10^9

save(
	exonWidth, intronsWidth,
	pseudoLibSizeLabeled, pseudoLibSizeTotal,
	file = file.path('..', 'data', '3T9MycER_normfactors.RData')
)

save(
	labeledRpkmsExons, labeledRpkmsIntrons,
	totalRpkmsExons, totalRpkmsIntrons,
	file = file.path('..', 'data', '3T9MycER_rpkms.RData')
)

############ first-guess rates

## reload the (pseudo)rpkms and redefine the time points, so this part
## can be started from the saved file
load(file.path('..', 'data', '3T9MycER_rpkms.RData'))
tpts <- c(0, 1/6, 1/3, 1/2, 1, 1.5, 2, 4, 8, 12, 16)

mycerIds <- newINSPEcT(
	rep(tpts, times = 3),   # time point of every library (3 replicates)
	1/12,                   # NOTE(review): presumably the labeling time - confirm
	labeledRpkmsExons,
	totalRpkmsExons,
	labeledRpkmsIntrons,
	totalRpkmsIntrons,
	totalMedianNorm = FALSE,
	degDuringPulse = FALSE
)

############ smoothing of rates

modelingParams(mycerIds)$useSigmoidFun <- FALSE
thresholds(mycerIds)$chisquare <- .01
## same Brown threshold for the three rates
thresholds(mycerIds)$brown[c('synthesis', 'processing', 'degradation')] <- .05

# this step may take several hours, 
# for this reason it has been split into parts 
# (only preparation of the groups here)

## genes are assigned, in order, to N consecutive chunks of nN genes each
nGenes <- length(featureNames(mycerIds))
N <- 20
nN <- ceiling(nGenes/N)
groups <- ceiling(seq_len(nGenes)/nN)

## one RData file per chunk, each holding an object named mycerIds_group
for (i in seq_len(N)) {
	mycerIds_group <- mycerIds[groups == i]
	save(
		mycerIds_group,
		file = file.path('..', 'data', paste0('mycerIds_group_', i, '.RData'))
	)
}
