####################################################################
##### DEPENDS ON THE PREVIOUS EXECUTION OF THE FOLLOWING SCRIPTS:
##### - make_INSPEcT_smoothrates_combine.R
##### - make_ChIPseq.R
####################################################################

source('00_library_of_functions.R')

## ChIP-seq data

# Load the ChIP-seq objects saved by make_ChIPseq.R. The names listed below
# are the ones this file is expected to provide (only some are used here).
load(
	# mycTSSgr,polTSSgr,polGBgr,polTESgr,
	# inputMycScales,mycSampleNorm,inputPol2Scales,pol2SampleNorm,
	# mycTSSPeaks,polTSSPeaks,polGBPeaks,polTESPeaks,
	# mycTSSn,polTSSn,polGBn,polTESn,
	file=file.path('..','data','ChIPseq_data.RData'))

# make Log-Ratios

# log2 ratio of every column to the first column of the same matrix
# (row-wise, since the first column is recycled down the columns);
# columns are relabelled as <prefix>_<original column name>
chipLogRatioToFirst <- function(signal, prefix) {
	ratios <- log2(signal/signal[,1])
	colnames(ratios) <- paste(prefix, colnames(ratios), sep="_")
	ratios
}

mycTSSLR <- chipLogRatioToFirst(mycTSSn, 'mycTSSLR')
polTSSLR <- chipLogRatioToFirst(polTSSn, 'polTSSLR')
polGBLR <- chipLogRatioToFirst(polGBn, 'polGBLR')
polTESLR <- chipLogRatioToFirst(polTESn, 'polTESLR')

# saturate absolute values

# divide the whole matrix by its 97.5th percentile and cap the result at 1;
# columns are relabelled as <prefix>_<original column name>
chipSaturateSignal <- function(signal, prefix) {
	scaled <- signal/quantile(signal,.975)
	scaled[scaled>1] <- 1
	colnames(scaled) <- paste(prefix, colnames(scaled), sep="_")
	scaled
}

mycTSS <- chipSaturateSignal(mycTSSn, 'mycTSS')
polTSS <- chipSaturateSignal(polTSSn, 'polTSS')
polGB <- chipSaturateSignal(polGBn, 'polGB')
polTES <- chipSaturateSignal(polTESn, 'polTES')

## INSPEcT data

library(INSPEcT)
# `mycerIds` is not defined in this file; presumably it comes from this
# RData file - confirm against make_INSPEcT_smoothrates_combine.R
load(file.path('..','data','3T9MycER_INSPEcTdataset.RData'))

# For each modeled rate: rank-transform the rate matrix with matrixRank()
# (defined outside this file) and divide by the largest rank, then bind the
# five resulting matrices side by side in the order listed below.
inspectRankData <- do.call(cbind, lapply(
	c('total', 'preMRNA', 'synthesis', 'degradation', 'processing'),
	function(rateName) {
		rates <- viewModelRates(mycerIds, rateName)
		matrixRank(rates) / max(matrixRank(rates),na.rm=TRUE)
	}))

# 5 rates x 11 time points: the time labels are recycled under each rate
colnames(inspectRankData) <- paste(
	rep(c('mrna','premrna','synthesis','degradation','processing'), each=11),
	c('0min','10min','20min','30min','1h','1.5h','2h','4h','8h','12h','16h'),
	sep='_')

## model, logratios

# modeled rates as returned by viewModelRates(), keyed by rate name
inspectRates <- lapply(
	c(total='total', preMRNA='preMRNA', synthesis='synthesis',
		degradation='degradation', processing='processing'),
	function(rateName) viewModelRates(mycerIds, rateName))

# insert mature = total - preMRNA right after total, keeping the order
# total, mature, preMRNA, synthesis, degradation, processing
inspectRates <- c(
	inspectRates['total'],
	list(mature=inspectRates[['total']]-inspectRates[['preMRNA']]),
	inspectRates[c('preMRNA','synthesis','degradation','processing')]
	)

# log2 of each rate minus log2 of its first column, all bound column-wise
inspectLogratios <- do.call('cbind', lapply(inspectRates, function(rates) {
	logRates <- log2(rates)
	logRates-logRates[,1]
	}))

# 6 rates x 11 time points: the time labels are recycled under each rate
colnames(inspectLogratios) <- paste(
	rep(c('mrnaLR','maturemrnaLR','premrnaLR',
		'synthesisLR','degradationLR','processingLR'), each=11),
	c('0min','10min','20min','30min','1h','1.5h','2h','4h','8h','12h','16h'),
	sep='_')

## gather features [allfeatures.rds]

# genes present both in the INSPEcT object and in the ChIP-seq TSS ranges
cGenes <- intersect(featureNames(mycerIds), names(mycTSSgr))

# One wide matrix, rows = cGenes. The saturated ChIP-seq signals and the
# rank data are multiplied by log2(1.5); the log-ratio matrices are taken
# as they are. Column order: absolute features first, then log-ratios.
allfeatures <- do.call(cbind, c(
	lapply(
		list(mycTSS, polTSS, polGB, polTES, inspectRankData),
		function(featureBlock) log2(1.5)*featureBlock[cGenes,]),
	lapply(
		list(mycTSSLR, polTSSLR, polGBLR, polTESLR, inspectLogratios),
		function(featureBlock) featureBlock[cGenes,])
	))

# NOTE: written with save(), so despite the .rds extension this file has to
# be read back with load(), not readRDS()
save(allfeatures, file=file.path('..','data','allfeatures.rds'))

## define the set of genes [filtered.rds]

# bound: genes whose row of mycTSSPeaks has at least one positive entry
bound <- local({
	hasPeak <- apply(mycTSSPeaks>0,1,any)
	names(hasPeak[hasPeak])
})
# chr [1:11141] "100017" "100019" "100034361" "100034726" "100034739" "100036521" "100037258" "100037262" ...

# expressed: genes whose maximum modeled total RNA over the columns exceeds 1
mycerMRNA <- viewModelRates(mycerIds, 'total')
expressed <- rownames(mycerMRNA)[which(apply(mycerMRNA,1,max)>1)]
# chr [1:9439] "100017" "100019" "100034361" "100037258" ...

# differentially regulated: class string without a '0', and a total-RNA or
# synthesis log-ratio exceeding log2(1.2) in absolute value at some time point
inspectClass <- geneClass(mycerIds)
isUnchanged <- grepl('0',inspectClass)
inspectDEG <- names(inspectClass)[!isUnchanged]
isDEGrow <- rownames(allfeatures) %in% inspectDEG
totalLR <- allfeatures[isDEGrow,grep('^mrnaLR',colnames(allfeatures))]
syntLR <- allfeatures[isDEGrow,grep('^synthesisLR',colnames(allfeatures))]
totFCgenes <- rownames(totalLR)[apply(abs(totalLR), 1, max)>log2(1.2)]
synFCgenes <- rownames(syntLR)[apply(abs(syntLR), 1, max)>log2(1.2)]
DEG <- union(totFCgenes, synFCgenes)
str(DEG)
# chr [1:6415] "100017" "100037258" "100037278" "100038417" "100038538" "100038712" "100038847" "100039684" ...

# filter: bound AND expressed AND differentially regulated
# (all files below are written with save(): read them back with load())
filtered <- Reduce(intersect, list(bound, expressed, DEG))
str(filtered)
# chr [1:4909] "100017" "100037258" "100038538" "100039864" "100039968" "100040617" "100040736" "100042970" ...
save(filtered, file=file.path('..','data','filtered.rds'))

# k1 DEG: class string contains 'a' and the synthesis fold-change passes
DEGk1 <- intersect(names(inspectClass)[grep('a',inspectClass)], synFCgenes)
filteredk1var <- intersect(filtered, DEGk1)
str(filteredk1var)
# chr [1:4651] "100017" "100037258" "100038538" "100039864" "100039968" "100040617" "100040736" "100042970" ...
save(filteredk1var, file=file.path('..','data','filteredk1var.rds'))

# filter for k1k2: k1 genes whose class string also contains 'c'
DEGk2 <- names(inspectClass)[grep('c',inspectClass)]
filteredk1k2var <- intersect(filteredk1var, DEGk2)
str(filteredk1k2var)
# chr [1:1333] "100038538" "100039864" "100040617" "100043424" "100201" "100206" "100340" "100383" "100465" ...
save(filteredk1k2var, file=file.path('..','data','filteredk1k2var.rds'))

# filter for k3: filtered genes whose class string contains 'b'
DEGk3 <- names(inspectClass)[grep('b',inspectClass)]
filteredk3var <- intersect(filtered, DEGk3)
str(filteredk3var)
# chr [1:976] "100039968" "100182" "100216455" "100273" "100494" "100504663" "100608" "100609" "101148" ...
save(filteredk3var, file=file.path('..','data','filteredk3var.rds'))

## up and down genes

# Synthesis log-ratios of the filtered genes, without the first matching
# column (the 0min column, where the ratio to itself is 0 by construction).
# drop=FALSE keeps a matrix even when `filtered` holds a single gene.
synthesisLR <- allfeatures[filtered,
	grep('synthesisLR', colnames(allfeatures))[-1], drop=FALSE]

# For each gene pick the log-ratio with the largest absolute value; its sign
# decides whether the gene is called up or down. seq_len() (instead of
# 1:nrow) keeps the row index empty, rather than c(1, 0), when no gene
# passed the filters.
synthMaxResp <- synthesisLR[cbind(
	seq_len(nrow(synthesisLR)),
	apply(abs(synthesisLR), 1, which.max))]
upGenes <- filtered[synthMaxResp>0]
downGenes <- filtered[synthMaxResp<0]

# written with save(): read back with load(), despite the .rds extension
save(upGenes, file=file.path('..','data','upGenes.rds'))
save(downGenes, file=file.path('..','data','downGenes.rds'))

## cluster

# A feature is weighted by repeating its column name: the 0min features
# appear steady_weighth times each, the synthesis log-ratios
# synthesis_weight times each, the Pol2 log-ratios once.
steady_weighth <- 15
synthesis_weight <- 4
pol2modelingfeatures <- c(
	## steady state features
	rep(c("polTSS_0min","polGB_0min","polTES_0min","synthesis_0min"),
		each=steady_weighth),
	## polTSSLR
	"polTSSLR_10min","polTSSLR_20min","polTSSLR_30min","polTSSLR_2h","polTSSLR_4h",
	# polGBLR
	"polGBLR_10min","polGBLR_20min","polGBLR_30min","polGBLR_2h","polGBLR_4h",
	# polTESLR
	"polTESLR_10min","polTESLR_20min","polTESLR_30min","polTESLR_2h","polTESLR_4h",
	# synthesisLR
	rep(c("synthesisLR_10min","synthesisLR_20min","synthesisLR_30min",
		"synthesisLR_2h","synthesisLR_4h"),synthesis_weight)
	)

# cluster the k1-variable genes on the (repeated) feature columns;
# aicKmeans() is defined outside this file
pol2modelingdata <- allfeatures[filteredk1var,pol2modelingfeatures]
pol2modelingclust <- aicKmeans(
	pol2modelingdata, 40,
	stabilize=.05, by=1, iter.max=40, nstart=20, seed=0
	)
save(pol2modelingfeatures, pol2modelingclust,
	file=file.path('..','data','pol2modelingclustering.RData'))

## make pseudogenes

library(GenomicRanges)

# `synthesis` is not defined in this file; presumably it comes from this
# RData file - confirm against make_INSPEcT_smoothrates_combine.R
load(file=file.path('..','data','3T9MycER_smooth_rates.RData'))

# Reload the ChIP-seq objects. The path is built relative to '..' like every
# other data path in this script (it was 'data/ChIPseq_data.RData', which
# points to a different location than the first load of the same file).
load(
	# mycTSSgr,polTSSgr,polGBgr,polTESgr,
	# inputMycScales,mycSampleNorm,inputPol2Scales,pol2SampleNorm,
	# mycTSSPeaks,polTSSPeaks,polGBPeaks,polTESPeaks,
	# mycTSSn,polTSSn,polGBn,polTESn,
	file=file.path('..','data','ChIPseq_data.RData'))

# multiply each normalized signal by the width of its own region
pol2TSS <- polTSSn * width(polTSSgr)
pol2GB  <- polGBn * width(polGBgr)
pol2TES <- polTESn * width(polTESgr)

# label each matrix with the names of the ranges it was computed from
# (the three assignments were previously rotated: GB<-TSS, TES<-GB, TSS<-TES)
rownames(pol2TSS) <- names(polTSSgr)
rownames(pol2GB)  <- names(polGBgr)
rownames(pol2TES) <- names(polTESgr)

# (normalize to obtain smaller numbers)

# keep columns 1-4, 7 and 8 of the synthesis rates; presumably the time
# points shared with the ChIP-seq data (0, 10, 20, 30 min, 2 h, 4 h) - confirm
synthesis <- synthesis[,c(1,2,3,4,7,8)]/50
pol2GB <- pol2GB/5e4
pol2TES <- pol2TES/5e4
pol2TSS <- pol2TSS/5e4

# For every cluster with more than 50 members, collect the Pol2 (TSS, gene
# body, TES) and synthesis rows of the genes assigned to that cluster.
pseudogenesDist <- lapply(
	which(pol2modelingclust$kmeans$size>50),
	function(clusterId) {
		memberIds <- names(which(pol2modelingclust$kmeans$cluster==clusterId))
		list(
			pol2tss=pol2TSS[memberIds,],
			pol2gb=pol2GB[memberIds,],
			pol2tes=pol2TES[memberIds,],
			synthesis=synthesis[memberIds,]
			)
	})

# One pseudogene per retained cluster: the column-wise medians of the four
# matrices, bound as columns x (TSS), y (gene body), z (TES), a (synthesis).
pseudogenes <- lapply(pseudogenesDist, function(pol2_i) {
	columnMedians <- lapply(
		pol2_i[c('pol2tss','pol2gb','pol2tes','synthesis')],
		function(clusterData) apply(clusterData,2,median))
	cbind(
		x=columnMedians[['pol2tss']],
		y=columnMedians[['pol2gb']],
		z=columnMedians[['pol2tes']],
		a=columnMedians[['synthesis']]
		)
	})

saveRDS(pseudogenes, file.path('..','data','pseudogenes.rds'))
saveRDS(pseudogenesDist, file.path('..','data','pseudogenesDist.rds'))

##############################
## make bootstrap sets #########
##############################

# number of partitions each cluster is split into; every partition is held
# out once as a test set while the remaining ones form the training set
nBootstraps <- 10

# Assign the elements of `vec`, by position, to `n` consecutive bins.
#   vec: any vector; only its length is used
#   n:   number of bins
# Returns a numeric vector as long as `vec` holding the bin index of each
# position, computed as ceiling(position / length(vec) * n).
vectorbinning <- function(vec, n) {
  positions <- seq_along(vec)
  total <- length(vec)
  ceiling(positions/total*n)
}

clusters <- seq_along(pseudogenes)
bootstraps <- 1:nBootstraps

# one training set and one test set per (cluster, partition) pair, stored
# cluster by cluster in the order they are generated
N <- length(bootstraps)*length(clusters)
trainingSets <- rep(list(NA), N)
testSets <- rep(list(NA), N)

# column-wise medians of the selected rows of the four cluster matrices,
# bound as columns x (TSS), y (gene body), z (TES), a (synthesis)
medianPseudogene <- function(clusterData, rows) {
	cbind(
		x=apply(clusterData[['pol2tss']][rows,],2,median),
		y=apply(clusterData[['pol2gb']][rows,],2,median),
		z=apply(clusterData[['pol2tes']][rows,],2,median),
		a=apply(clusterData[['synthesis']][rows,],2,median)
		)
}

count <- 0

for( clustNumber in clusters ) {

	# the seed is reset for every cluster, so the shuffling of the partition
	# labels is reproducible cluster by cluster
	set.seed(1)
	pol2_i <- pseudogenesDist[[clustNumber]]
	clustMembers <- seq_len(nrow(pol2_i[['pol2tss']]))
	# equally sized partition labels (1..nBootstraps), randomly permuted
	bootClass    <- sample(vectorbinning(clustMembers, nBootstraps))

	for( i in bootstraps ) {

		count <- count + 1
		heldOut <- bootClass == i
		# training set: every member outside partition i
		trainingSets[[count]] <- medianPseudogene(pol2_i, clustMembers[!heldOut])
		# test set: the members of partition i
		testSets[[count]] <- medianPseudogene(pol2_i, clustMembers[heldOut])

	}

}

# divide into 4 tranches

# group consecutive sets into 4 tranches; vectorbinning() yields the tranche
# index (1..4) of every list position
trainingSets <- split(trainingSets, vectorbinning(trainingSets, 4))

testSets <- split(testSets, vectorbinning(testSets, 4))

# written with save(): read back with load(), despite the .rds extension
save(trainingSets, testSets,
	file=file.path('..','data','bootstraps_tranches.rds'))


