####################################################################
##### DEPENDS ON THE PREVIOUS EXECUTION OF THE FOLLOWING SCRIPTS:
##### - file_paths.R
####################################################################

source('00_library_of_functions.R')
library(DEXSeq)

library(TxDb.Mmusculus.UCSC.mm9.knownGene)
txdb <- TxDb.Mmusculus.UCSC.mm9.knownGene
library(GenomicAlignments)

##### retrieve annotation

# Build the transcript annotation from the TxDb object. This call is slow;
# the timing recorded by the original author was:
#    user  system elapsed 
# 157.868   0.292 158.855 
txAnn <- make_transcripts_annotation( txdb )

# Flatten one element of the annotation: unlist it, strip its metadata
# columns and return the ranges in sorted order.
flattenAnnotation <- function(annEntry) {
	flat <- unlist(annEntry)
	mcols(flat) <- NULL
	sort(flat)
}

txAnnSimple <- lapply(txAnn, flattenAnnotation)

# Short aliases used by the rest of the script:
#   first letter  f / c / t = 5'UTR / CDS / 3'UTR
#   second letter e / i     = exons / introns
# exons
fe <- txAnnSimple$fiveUTRexons
ce <- txAnnSimple$CDSexons
te <- txAnnSimple$threeUTRexons
# introns
fi <- txAnnSimple$fiveUTRintrons
ci <- txAnnSimple$CDSintrons
ti <- txAnnSimple$threeUTRintrons

### bam files

# Load the BAM/peak path vectors. According to the original author's note the
# file provides the objects listed below; only the bampaths_total_rep*
# vectors are used in this script.
load(
	# inputBamFile,mycChIPbamfiles,pol2ChIPbamfiles,
	# mycPeaksfiles,pol2Peaksfiles,pol2BroadPeaksfiles,
	# bampaths_4su_rep1,bampaths_4su_rep2,bampaths_4su_rep3,
	# bampaths_total_rep1,bampaths_total_rep2,bampaths_total_rep3,
	file=file.path('..','data','paths_3T9_mycER.RData')
	)

### count the reads of one BAM file on every feature type

# Imports the reads of `bamfile` and counts them on the six annotations
# (fe, ce, te, fi, ci, ti) defined above.
# Returns a list with one `$counts` element per annotation, named
# fecounts, cecounts, tecounts, ficounts, cicounts, ticounts.
countBamFeatures <- function(bamfile) {

	print(bamfile)
	bamReads <- importBamFile(bamfile, countMultiMappingReads=FALSE, isPairedEnd=FALSE)

	# exons
	fecounts <- countFeat(fe, bamReads, allowMultiOverlap=FALSE, strandSpecific=FALSE)
	cecounts <- countFeat(ce, bamReads, allowMultiOverlap=FALSE, strandSpecific=FALSE)
	tecounts <- countFeat(te, bamReads, allowMultiOverlap=FALSE, strandSpecific=FALSE)

	# introns: only reads whose `onfeature` flag is FALSE for all three exon
	# classes are counted (presumably reads not assigned to any exon -- confirm
	# against countFeat). The subset is computed once and shared by the three
	# intron counts.
	intronReads <- bamReads[!(tecounts$onfeature | fecounts$onfeature | cecounts$onfeature)]
	ficounts <- countFeat(fi, intronReads, allowMultiOverlap=FALSE, strandSpecific=FALSE)
	cicounts <- countFeat(ci, intronReads, allowMultiOverlap=FALSE, strandSpecific=FALSE)
	ticounts <- countFeat(ti, intronReads, allowMultiOverlap=FALSE, strandSpecific=FALSE)

	list(
		fecounts = fecounts$counts, 
		cecounts = cecounts$counts, 
		tecounts = tecounts$counts, 
		ficounts = ficounts$counts, 
		cicounts = cicounts$counts, 
		ticounts = ticounts$counts
	)

}

### make counts: one 'dexseq_counts_<tpidx>.rds' file per time point

# The three 0h samples are the same for every time point, so they are counted
# during the first iteration only and reused afterwards.
zeroHourCounts <- NULL

for( tpidx in 2:11 ) {

	# NB: the 'total_2h_*' labels are used for the treated samples of every
	# time point, whatever the value of tpidx.
	bamfiles <- c(
		total_0h_rep1=bampaths_total_rep1[1],
		total_0h_rep2=bampaths_total_rep2[1],
		total_0h_rep3=bampaths_total_rep3[1],
		total_2h_rep1=bampaths_total_rep1[tpidx],
		total_2h_rep2=bampaths_total_rep2[tpidx],
		total_2h_rep3=bampaths_total_rep3[tpidx]
		)

	print(bamfiles)

	if( is.null(zeroHourCounts) ) {
		zeroHourCounts <- lapply(bamfiles[1:3], countBamFeatures)
	}

	# same layout as before: six named samples, 0h replicates first
	dexseq_counts <- c(zeroHourCounts, lapply(bamfiles[4:6], countBamFeatures))

	saveRDS(dexseq_counts, file=file.path('..','data',paste0('dexseq_counts_',tpidx,'.rds')))

}

##### helpers shared by every time point

# Rename the ranges of `gr` as '<tag>.<index>', gene by gene.
#   gr  : GRanges whose names identify the gene each range belongs to
#   tag : character prefix for the new names (e.g. 'fe', 'ci')
# Ranges are grouped by their current name; within a group the index follows
# the order of the ranges when the strand of the first range is '+', and the
# reverse order otherwise. The groups are then unlisted back into a single
# GRanges, as in the original implementation.
annotateGR <- function(gr, tag)
{
	tmp <- split(gr, names(gr))
	# lapply (not sapply): the result must always be a plain list of GRanges
	unlist(GRangesList(lapply(tmp, function(x) {
		idx <- seq_along(x)
		if( as.character(strand(x))[1]!="+" ) idx <- rev(idx)
		names(x) <- paste(tag,idx,sep=".")
		x
		})))
}

# Names of the genes with at least one feature that, in at least one sample,
# has more than 10 reads AND a density above 20, where density is
# counts / feature width * 10^3.
#   countmat  : features x samples count matrix (row names = gene names)
#   featwidth : width of each feature, in the row order of `countmat`
expressedGenes <- function(countmat, featwidth)
{
	densmat <- countmat/featwidth*10^3
	unique(names(which(apply(countmat>10 & densmat>20, 1, any))))
}

# Attach the counts to the annotation as metadata columns, keep the ranges
# belonging to `genes` and relabel them with annotateGR().
# Works on a local copy: the annotation passed by the caller is not modified.
expressedFeatures <- function(gr, countmat, genes, tag)
{
	mcols(gr) <- countmat
	annotateGR(gr[names(gr) %in% genes], tag)
}

for( tpidx in 2:11 ) {

	dexseq_counts <- readRDS(file=file.path('..','data',paste0('dexseq_counts_',tpidx,'.rds')))

	##### gather counts from different samples into matrices (features x samples)

	femat <- sapply(dexseq_counts, '[[', 'fecounts')
	cemat <- sapply(dexseq_counts, '[[', 'cecounts')
	temat <- sapply(dexseq_counts, '[[', 'tecounts')
	fimat <- sapply(dexseq_counts, '[[', 'ficounts')
	cimat <- sapply(dexseq_counts, '[[', 'cicounts')
	timat <- sapply(dexseq_counts, '[[', 'ticounts')

	##### filter according to a minimum of read counts and density ... 

	fenames <- expressedGenes(femat, width(fe))
	cenames <- expressedGenes(cemat, width(ce))
	tenames <- expressedGenes(temat, width(te))
	finames <- expressedGenes(fimat, width(fi))
	cinames <- expressedGenes(cimat, width(ci))
	tinames <- expressedGenes(timat, width(ti))

	##### ... and according to coherence between annotations (take coding exons as the reference)

	fenames <- fenames[ fenames %in% cenames ]
	tenames <- tenames[ tenames %in% cenames ]
	finames <- finames[ finames %in% cenames ]
	cinames <- cinames[ cinames %in% cenames ]
	tinames <- tinames[ tinames %in% cenames ]

	##### actually filter and integrate with annotation (!!this can be integrated with the generation
		# of the annotation)

	exprfe <- expressedFeatures(fe, femat, fenames, 'fe')	# 5' utr exons
	exprce <- expressedFeatures(ce, cemat, cenames, 'ce')	# coding exons
	exprte <- expressedFeatures(te, temat, tenames, 'te')	# 3' utr exons
	exprfi <- expressedFeatures(fi, fimat, finames, 'fi')	# 5' utr introns
	exprci <- expressedFeatures(ci, cimat, cinames, 'ci')	# coding introns
	exprti <- expressedFeatures(ti, timat, tinames, 'ti')	# 3' utr introns

	###### create DEX-seq data structure for all data

	allfeatfeat <- sort(c(exprfe,exprce,exprte,exprfi,exprci,exprti))
	featureRanges <- granges(allfeatfeat)

	# range names are split on '.': the first piece becomes the groupID, the
	# remaining pieces joined by '_' become the featureID
	nameParts <- strsplit(names(allfeatfeat),'\\.')
	groupID <- vapply(nameParts, '[[', character(1), 1)
	featureID <- vapply(nameParts, function(x) paste(x[-1],collapse='_'), character(1))

	countData <- as.matrix(mcols(allfeatfeat))
	colnames(countData) <- NULL
	# one row per count column: the first three are labelled 'WT', the last
	# three 'SH'
	sampleData <- data.frame(
		condition=c('WT','WT','WT','SH','SH','SH')
		)
	names(featureRanges) <- NULL

	allfeatDEX <- DEXSeqDataSet( 
		countData, sampleData=sampleData,
		featureID=featureID, groupID=groupID, 
		featureRanges=featureRanges)

	## test for differential expression

	allfeatDEX <- estimateSizeFactors( allfeatDEX )

	# estimate dispersions independently for each feature type, selecting the
	# rows by their tag

	feDEX <- estimateDispersions( allfeatDEX[grep('fe',rownames(allfeatDEX))] )
	ceDEX <- estimateDispersions( allfeatDEX[grep('ce',rownames(allfeatDEX))] )
	teDEX <- estimateDispersions( allfeatDEX[grep('te',rownames(allfeatDEX))] )
	fiDEX <- estimateDispersions( allfeatDEX[grep('fi',rownames(allfeatDEX))] )
	ciDEX <- estimateDispersions( allfeatDEX[grep('ci',rownames(allfeatDEX))] )
	tiDEX <- estimateDispersions( allfeatDEX[grep('ti',rownames(allfeatDEX))] )

	# put the six subsets back together before testing
	allfeatDEX <- sort(rbind(feDEX,ceDEX,teDEX,fiDEX,ciDEX,tiDEX))
	allfeatDEX <- testForDEU( allfeatDEX )
	allfeatDEX <- estimateExonFoldChanges( allfeatDEX, fitExpToVar="condition")

	saveRDS(allfeatDEX, file=file.path('..','data',paste0('allfeatDEX_',tpidx,'.rds')))

}

library(compEpiTools)

# TES and TSS annotations, requested with zero-width promoter and termination
# limits, plus a per-gene length vector; all three are indexed by gene ID below.
mm9TES <- make_annotation(
	"genome", annotationFeature="TES", 
	promoter_limits=c(0,0), termination_limits=c(0,0)
	)
mm9TSS <- make_annotation(
	"genome", annotationFeature="TSS", 
	promoter_limits=c(0,0), termination_limits=c(0,0)
	)
mm9TXlen <- gene_length('genome')

# time-point label -> index used in the 'allfeatDEX_<idx>.rds' file names
tpidxs <- setNames(
	2:11,
	c('10min','20min','30min','1h','1.5h','2h','4h','8h','12h','16h')
	)

data_folder <- 'data/'

# For every time point: read the fitted DEXSeq object, extract the results
# table and add three columns:
#   TSSdist    - absolute distance between the feature midpoint and the gene TSS
#   TESdist    - absolute distance between the feature midpoint and the gene TES
#   TSSreldist - TSSdist divided by the gene length
DEXresALL <- lapply(tpidxs, function(idx) {
	print(idx)
	rdsPath <- file.path('..',data_folder, 
		paste0('allfeatDEX_',idx,'.rds'))
	res <- DEXSeqResults(readRDS(file=rdsPath))
	## annotate with TSS distances
	genes <- res$groupID
	featMid <- start(GRmidpoint(res$genomicData))
	res$TSSdist <- abs(featMid - start(mm9TSS[genes]))
	res$TESdist <- abs(featMid - start(mm9TES[genes]))
	res$TSSreldist <- res$TSSdist/mm9TXlen[genes]
	res
	})

# NOTE(review): this is written with save(), so despite the '.rds' extension
# the file must be read back with load(), not readRDS() -- confirm what the
# downstream readers expect before changing either side.
save(DEXresALL, file=file.path('..','data','DEXresALL.rds'))

### pre-process the DEX-seq results to save a reduced version on the disk

# Replace the 'log2fold_WT_SH' column with its negation, stored under the
# name 'log2fold_t_c'.
DEXresALL <- lapply(DEXresALL, function(res) {
	res$log2fold_t_c <- -res$log2fold_WT_SH
	res$log2fold_WT_SH <- NULL
	res
	})

# columns identifying a feature (merge keys) and columns kept in the output
keyCols <- c('groupID','featureID','TSSdist','TESdist','TSSreldist')
keptCols <- c(keyCols,'padj','log2fold_t_c')

# NOTE(review): both save() calls below write under 'data', whereas the files
# written earlier in this script go under '..','data' -- confirm the intended
# working directory.

### on exons

## select exonic features from the DEXseq dataset (featureID has 'e' as
## second character)
exonsDEXresALL <- lapply(DEXresALL, function(res) {
	res[grepl('^.e',res$featureID),]
	})

## select genes in which a single event occurred
exonsDEXresALLfilt <- lapply(exonsDEXresALL, function(res) 
	oneEventGenes(res, padj_thresh=.05, foldchange_thresh=0))

## merge in a unique data.frame (one padj / log2fold pair per time point)
exonsList <- lapply(exonsDEXresALLfilt, function(res) 
	data.frame(res[,keptCols]))
exonsDF <- merge_list(exonsList, by=keyCols, all=TRUE)

save(exonsDF, file=file.path('data','dexseq_exonsDF.RData'))

### on introns

## select intronic features from the DEXseq dataset (featureID has 'i' as
## second character)
intronsDEXresALL <- lapply(DEXresALL, function(res) {
	res[grepl('^.i',res$featureID),]
	})

## select genes in which a single event occurred
intronsDEXresALLfilt <- lapply(intronsDEXresALL, function(res) 
	oneEventGenes(res, padj_thresh=.05, foldchange_thresh=0))

## merge in a unique data.frame (one padj / log2fold pair per time point)
intronsList <- lapply(intronsDEXresALLfilt, function(res) 
	data.frame(res[,keptCols]))
intronsDF <- merge_list(intronsList, by=keyCols, all=TRUE)

save(intronsDF, file=file.path('data','dexseq_intronsDF.RData'))

### save sample genes for plotting purposes

# library() rather than require(): a missing package must stop the script
# here instead of failing later with a less obvious error
library(org.Mm.eg.db)
symb2eg <- as.list(org.Mm.egSYMBOL2EG)

# gene symbol -> gene ID, to match the groupID column of the results
gene1_id <- symb2eg[['Wdr59']]
gene2_id <- symb2eg[['Arhgap44']]

res30min <- DEXresALL[['30min']]
res1h <- DEXresALL[['1h']]

# Exonic features of gene 1 at 30min and intronic features of gene 2 at 1h.
# The patterns are anchored on the second character of featureID, as in the
# exon / intron selections earlier in this script; %in% keeps the row filter
# well defined when a symbol maps to no ID or to several IDs.
dexseq_gene1 <- res30min[grepl('^.e',res30min$featureID) & res30min$groupID %in% gene1_id,]
dexseq_gene2 <- res1h[grepl('^.i',res1h$featureID) & res1h$groupID %in% gene2_id,]

# NOTE(review): written under 'data', whereas the files written earlier in
# this script go under '..','data' -- confirm the intended working directory.
save(dexseq_gene1, dexseq_gene2, file=file.path('data','dexseq_genes2plot.RData'))
