## Gill et al. 2023 Genome Research Supplementary Code

#### Functions and Pre-processing ####
#### Create Mappable tiles Function ####
# Tile a genome into fixed-width windows, keep the sufficiently mappable
# tiles and annotate them with simple sequence features.
#
# Arguments:
#   genomeName           name of the BSgenome package (also the name of the
#                        genome object it provides)
#   tilewidth            tile width in bp; the last tile of every chromosome is
#                        cut at the chromosome end and can therefore be shorter
#   mapfile              whitespace-separated file with five fields per record
#                        (id, chr, strand, start, end); only chr, start and end
#                        are read
#   minFractionMappable  minimal fraction of a tile's bases that must be covered
#                        by regions listed in `mapfile`
#
# Returns the selected tiles as a GRanges with two metadata columns:
#   percGC    percentage of G+C among the counted bases
#   obsexpCG  observed / expected CpG ratio
createMappableTiles <- function(genomeName = "BSgenome.Mmusculus.UCSC.mm10", 
                                tilewidth = 5000, 
                                mapfile="/work2/gpeters/Hroyo/ZZZ_bowtie_mappable_genome_v2/mm10.aln2_l50_m1.coord", 
                                minFractionMappable=0.8) {
  
  # library() stops with an error when a package is missing, whereas require()
  # only returns FALSE and lets the function fail later with an obscure message
  library(genomeName, character.only=TRUE)
  library(GenomicRanges)
  gnm <- get(genomeName)
  #regFile <- paste0("/work2/gpeters/Hroyo/ZZZ_Generic/Tiles_", gsub(".","_",genomeName, fixed=T) ,"_",tilewidth,"b_",minFractionMappable*100,"percMappable_MapFile-",gsub(".","_",basename(mapfile), fixed=T),".rds")
  
  #if(file.exists(regFile)) {
  #	message("reading pre-existing regions files")
  #	tiles.select <- readRDS(regFile)
  
  #	} else {
  message("creating new regions object")
  
  # load uniquely mappable regions (fields declared as NULL are skipped by scan)
  message("  loading mappable regions")
  mapcoord <- scan(mapfile, what=list(id=NULL,chr="",strand=NULL,start=1L,end=1L), quiet=TRUE)
  mapgr <- GRanges(mapcoord$chr, IRanges(start=mapcoord$start, end=mapcoord$end), seqlengths=seqlengths(gnm))
  # merge overlapping/adjacent regions so that no base is counted twice below;
  # this is a no-op for the per-tile sums when the regions are already disjoint
  mapgr <- reduce(mapgr)
  
  # generate coordinates for tiling windows
  chrlength <- seqlengths(gnm)
  tiles <- tileGenome(chrlength, tilewidth=tilewidth, cut.last.tile.in.chrom=TRUE)
  
  # remove unmappable tiles
  message("  calculating mappable fraction per tile")
  ov <- findOverlaps(tiles, mapgr)
  qh <- queryHits(ov)
  sh <- subjectHits(ov)
  # length of each tile/region intersection (zero, one or several per tile)
  ovwidth <- pmin(end(tiles)[qh], end(mapgr)[sh]) - pmax(start(tiles)[qh], start(mapgr)[sh]) + 1
  # total mappable length per tile; tiles without any overlap keep the value 0
  mapposPerTile <- numeric(length(tiles))
  if (length(ovwidth) > 0) {
    mappedPerHitTile <- tapply(ovwidth, qh, sum)
    mapposPerTile[as.integer(names(mappedPerHitTile))] <- mappedPerHitTile
  }
  # divide by the actual tile width: the last tile of a chromosome is shorter
  # than `tilewidth`, so dividing by `tilewidth` under-estimated its fraction
  fractionMappablePerTile <- mapposPerTile / width(tiles)
  keep <- fractionMappablePerTile >= minFractionMappable
  tiles.select <- tiles[keep]
  print(summary(keep))
  
  # calculate sequence features for tiles
  message("  extracting sequence features")
  tiles.seq <- getSeq(gnm, tiles.select)
  tiles.1 <- oligonucleotideFrequency(tiles.seq, width=1)
  tiles.2 <- oligonucleotideFrequency(tiles.seq, width=2)
  nBases <- rowSums(tiles.1)
  # drop=FALSE keeps a matrix even when a single tile was selected
  tiles.percGC <- rowSums(tiles.1[, c("C","G"), drop=FALSE]) * 100 / nBases # GC percent
  # observed CpG count over the count expected from the C and G frequencies;
  # only this dinucleotide is stored, so the other 15 ratios are not computed
  tiles.obsexpCG <- tiles.2[, "CG"] / (tiles.1[, "C"] / nBases * tiles.1[, "G"])
  
  # store sequence features in tiles
  mcols(tiles.select) <- DataFrame(percGC   = tiles.percGC,
                                   obsexpCG = tiles.obsexpCG)
  #}
  message("done")
  return(tiles.select)
}
####



##### Figure 1 Generation: #####
#### Panel C & D ####
library(TxDb.Mmusculus.UCSC.mm10.knownGene)
library(BSgenome.Mmusculus.UCSC.mm10)
library(QuasR)
library(rtracklayer)
library(parallel)
library(edgeR)
library(ggplot2)
library(ggrepel)

# QuasR sample sheet and output directory for the Figure 1 C/D analysis
sampleFile <- '/work2/gpeters/rohmalex/work/Mark/RNA-seq_181127/sampleFileDedupUniqueMappersProperPairs.txt'
saveDir <- '/work2/gpeters/rohmalex/work/Mark/RNA-seq_181127/31_transcriptomeAssembly/analysis/'


# create 5kb mappable tiles
tiles.5kb <- createMappableTiles()

# discard every tile overlapping an ENCODE blacklisted region
encode.blacklist.gr <- import.bed("/work2/gpeters/ozonevge/genomes/mm10/ENCODE_blacklist/mm10.blacklist.bed.gz")
ov <- findOverlaps(query=encode.blacklist.gr, subject=tiles.5kb)
remove <- unique(subjectHits(ov))
# x[-integer(0)] selects nothing at all, so only subset when at least one tile
# has to be removed
if (length(remove) > 0) {
  tiles.5kb <- tiles.5kb[-remove]
}
# keep only chr1-chr19, chrX and chrY; tiles on other sequences are pruned
seqlevels(tiles.5kb, pruning.mode='coarse') <- paste0('chr', c(1:19, 'X', 'Y'))

# create qproject
qproj <- qAlign(sampleFile=sampleFile, genome='BSgenome.Mmusculus.UCSC.mm10', paired='rf')

# library sizes: the 'mapped' column of the alignment statistics, one per sample
alignment.stats <- alignmentStats(qproj)
library.sizes <- as.integer(alignment.stats[, 'mapped'])

# short sample labels: strip ':genome', split on '_', ignore the first field and
# shorten every remaining field that starts with an upper-case letter or a
# digit to that single character
sample.ids <- gsub(':genome', '', rownames(alignment.stats))
names(library.sizes) <- vapply(strsplit(sample.ids, '_'), function(fields) {
  kept <- fields[2:length(fields)]
  paste0(gsub('^([A-Z0-9]).*', '\\1', kept), collapse='')
}, character(1))

# count alignments on UCSC known genes and genomic tiles
# A 10-worker cluster is created for the two qCount() calls and stopped right
# after them. Both calls use the same read selection (orientation='opposite',
# useRead='first', mapqMin=255L).
cl = makeCluster(10)
# gene-level counts on the UCSC knownGene annotation; the first column of the
# result is used further down as the gene length in rpkm()
counts.genes = qCount(proj=qproj, query=TxDb.Mmusculus.UCSC.mm10.knownGene, reportLevel='gene',
                      orientation='opposite', useRead='first', mapqMin=255L, clObj=cl)
# counts on the blacklist-filtered 5kb mappable tiles
counts.tiles.5kb = qCount(proj=qproj, query=tiles.5kb,
                          orientation='opposite', useRead='first', mapqMin=255L, clObj=cl)
stopCluster(cl)

# save counts matrix
# NOTE(review): only the gene counts are saved, and to an absolute path outside
# saveDir -- confirm this is intended
saveRDS(object=counts.genes,
        file='/tungstenfs/groups/gpeters/rohmalex/work/Mark/RNA-seq_181127/31_transcriptomeAssembly/Rdata/counts_UCSCknownGenes.rds')


# TMM normalization using total number of aligned reads
# psc is the pseudocount added to the RPKM values before taking log2
psc <- 0.1
TMMnormFactors <- calcNormFactors(object=counts.genes[, -1],
                                  lib.size=library.sizes,
                                  method='TMM')

# log2(RPKM + psc) per gene; column 1 of the count table holds the feature
# width, the effective library size is library size x TMM factor
log2.rpkm.genes <- log2(psc + rpkm(y=counts.genes[, -1],
                                   gene.length=counts.genes[, 1],
                                   lib.size=library.sizes*TMMnormFactors,
                                   log=FALSE))
# qqnorm(log2.rpkm.genes[, 1][which(log2.rpkm.genes[, 1] > log2(psc))])
# qqline(log2.rpkm.genes[, 1][which(log2.rpkm.genes[, 1] > log2(psc))])

# same transformation for the 5kb tiles, re-using the gene-derived TMM factors
log2.rpkm.tiles <- log2(psc + rpkm(y=counts.tiles.5kb[, -1],
                                   gene.length=counts.tiles.5kb[, 1],
                                   lib.size=library.sizes*TMMnormFactors,
                                   log=FALSE))

# compute PCA
# The gene-level and tile-level analyses are the same three steps, so they are
# written once as helpers and applied to both matrices.

# Percentage of the total variance carried by each principal component,
# rounded to one decimal.
pcaPercentVariance <- function(pca) {
  variances <- pca$sdev^2
  round(100*variances/sum(variances), 1)
}

# Scores on PC1/PC2 with the sample label and the cell type (sample label with
# all digits removed).
pcaScoresDf <- function(pca, sampleNames) {
  scores <- as.data.frame(pca$x[, 1:2])
  scores$sample <- sampleNames
  scores$celltype <- gsub('[0-9]', '', scores$sample)
  scores
}

# PC1 vs PC2 scatter plot coloured by cell type, with repelled sample labels.
pcaBiplot <- function(scores, pctVar, title) {
  ggplot(scores, aes(x=PC1, y=PC2, color=factor(celltype, levels=c('LZ','PD','RS','EES','LES')),
                     label=sample)) +
    geom_point() + geom_text_repel() + theme_bw() + theme(legend.position='none') +
    xlab(paste0('PC1 (var explained : ', pctVar[1], ')')) +
    ylab(paste0('PC2 (var explained : ', pctVar[2], ')')) +
    ggtitle(title)
}

# log2(rpkm) on genes (samples as rows; centred, not scaled)
pca.genes <- prcomp(x=t(log2.rpkm.genes), center=TRUE, scale.=FALSE)
sdev.genes <- pcaPercentVariance(pca.genes)
pca.genes.df <- pcaScoresDf(pca.genes, names(library.sizes))
pl <- pcaBiplot(pca.genes.df, sdev.genes, 'PC1 vs PC2 on log2(rpkm_genes)')
ggsave(filename=paste0(saveDir, '00_biplot_logRpkm_genes.png'),
       plot=pl, width=5, height=5, units='in', dpi=200)

# log2(rpkm) on 5kb mappable tiles 
pca.tiles <- prcomp(x=t(log2.rpkm.tiles), center=TRUE, scale.=FALSE)
sdev.tiles <- pcaPercentVariance(pca.tiles)
pca.tiles.df <- pcaScoresDf(pca.tiles, names(library.sizes))
pl <- pcaBiplot(pca.tiles.df, sdev.tiles, 'PC1 vs PC2 on log2(rpkm_5kbMappableTiles)')
ggsave(filename=paste0(saveDir, '00_biplot_logRpkm_5kbMappableTiles.png'),
       plot=pl, width=5, height=5, units='in', dpi=200)



# 2. Proportion of reads mapping to tiles that are mapped to known genes

# per sample: 100 x (total count over genes) / (total count over 5kb tiles);
# the first column (feature width) of both count tables is left out
percDf <- data.frame(sample=factor(names(library.sizes),
                                   levels=paste0(rep(c('LZ','PD','RS','EES','LES'), each=3), 1:3)),
                     percTotalReadsWithinKnownGenes=round(100*colSums(counts.genes)[-1]/colSums(counts.tiles.5kb)[-1], 1),
                     stringsAsFactors=FALSE)
pl <- ggplot(data=percDf, aes(x=sample, y=percTotalReadsWithinKnownGenes)) +
  geom_col() +
  scale_y_continuous(breaks=seq(0, 100, by=20)) +
  theme(axis.text.x=element_text(angle=90, hjust=1))
ggsave(filename=paste0(saveDir, '01_percentageReadsFromTilesWithinKnownGenes_barplot.png'),
       plot=pl, width=5, height=5, units='in', dpi=200)
####




#### Panel F ####
library(QuasR)
library(BSgenome.Mmusculus.UCSC.mm10)
library(rtracklayer)
library(parallel)
library(GenomicFeatures)
library(TxDb.Mmusculus.UCSC.mm10.knownGene)
library(ggplot2)
library(reshape2)
library(mgcv)
library(GGally)
library(ggrepel)
library(viridis)
library(ggcorrplot)
library(plyr)
library(gtools)

# output / input locations for panel F
savedir <- '/work2/gpeters/rohmalex/work/Mark/RNA-seq_181127/31_transcriptomeAssembly/analysis/wholeProcess_190410/'
Rdatadir <- '/work2/gpeters/rohmalex/work/Mark/RNA-seq_181127/31_transcriptomeAssembly/wholeProcess_190410/82_Rdata/'
txomeGtf <- '/work2/gpeters/rohmalex/work/Mark/RNA-seq_181127/31_transcriptomeAssembly/wholeProcess_190410/42_modelMerged/globalTranscriptome.gtf'


# 1 : presence/absence of gene (known or novel) in each individual txome #
genesOriginFile <- '/work2/gpeters/rohmalex/work/Mark/RNA-seq_181127/31_transcriptomeAssembly/wholeProcess_190410/42_modelMerged/globalTranscriptome_presenceOfGenesInIndividualCelltypes.txt'
genesOriginDf <- read.table(file=genesOriginFile, header=TRUE, sep='\t', stringsAsFactors=FALSE)

# self-named vectors so that the nested lapply() below returns named lists:
# loop1 = the distinct overlap categories, loop2 = columns 3 to 7 of the table
loop1 <- setNames(nm=unique(genesOriginDf$overlapWithKnownGene))
loop2 <- setNames(nm=colnames(genesOriginDf)[3:7])

# for every category and every column of loop2: ids of the genes flagged
# 'present' in that column
genesPresence <- lapply(loop1, function(ov) {
  inCategory <- genesOriginDf$overlapWithKnownGene == ov
  lapply(loop2, function(ct) {
    genesOriginDf$gene_id[genesOriginDf[, ct] == 'present' & inCategory]
  })
})
saveRDS(genesPresence, paste0(Rdatadir, 'presenceOfPredictedGenesInIndividualTxomes.rds'))

# 2 : Generate Barplot for presence absence patterns
celltypes <- c('LZ', 'PD', 'RS', 'EES', 'LES')

# all 32 absent('a')/present('p') combinations over 5 positions; the first row
# is removed and the column order reversed
res2 <- permutations(n=2, r=5, v=c('a', 'p'), repeats.allowed=TRUE)
res2 <- res2[-1, 5:1]
# one 5-letter string per remaining pattern
res2.str <- unname(apply(res2, 1, paste0, collapse=''))

# long format for the dot grid drawn under the bar plot: x = pattern index,
# y = 5..1 down the five cell types
res2.df <- data.frame(x=rep(1:31, each=5),
                      y=rep(5:1, 31),
                      celltype=rep(c('LZ','PD','RS','EES','LES'), times=31),
                      presence=as.character(t(res2)),
                      stringsAsFactors=FALSE)

# dot colour: one colour per cell type when present, grey when absent
presentColors <- c(LZ='#e8a438', PD='#e84a38', RS='#c238e8', EES='#3b38e8', LES='#38b0e8')
res2.df$color <- unname(presentColors[res2.df$celltype])
res2.df$color[res2.df$presence == 'a'] <- '#636363'
res2.df$color <- factor(res2.df$color, levels=unique(res2.df$color))

# presence/absence pattern of every gene: each of columns 3 to 7 is reduced to
# its leading 'a' or 'p' and the five letters are concatenated
genesOriginDf$pattern <- do.call(paste0, unname(lapply(genesOriginDf[, 3:7], function(status) {
  gsub('^([ap]).*', '\\1', status)
})))

# Percentage (2 decimals) of the genes of one overlap category that show each
# pattern of res2.str.
patternPercentages <- function(category) {
  inCategory <- genesOriginDf$overlapWithKnownGene == category
  out <- data.frame(pattern=res2.str,
                    stringsAsFactors=FALSE)
  out$pct <- vapply(out$pattern, function(p) {
    round(100*sum(inCategory & genesOriginDf$pattern == p)/sum(inCategory), digits=2)
  }, numeric(1), USE.NAMES=FALSE)
  out$pattern <- factor(out$pattern, levels=unique(out$pattern))
  out$overlapsGencode <- category
  out
}

res2.barplot.known <- patternPercentages('known')
res2.barplot.novel <- patternPercentages('novel')


# bar plot: percentage of genes per presence/absence pattern, one facet per
# overlap category (known / novel)
pl3 <- ggplot(data=rbind(res2.barplot.known,res2.barplot.novel) , aes(x=pattern, y=pct)) +
  geom_col() + facet_wrap(.~overlapsGencode, nrow=2) +
  ylab('Percentage of genes') + ggtitle('Percentage of genes predicted in the different celltypes') +
  theme(axis.text.x=element_text(angle=90))
ggsave(filename=paste0(savedir, '31_barPlot_knownGenes_presenceInIndividualTxomes.png'),
       plot=pl3, width=7, height=6, dpi=250)

# dot grid used as x-axis legend of the bar plot; the factor levels of
# res2.df$color are the colour codes themselves, so they are passed as values
pl4 <- ggplot(data=res2.df, aes(x=x, y=y, color=color)) + 
  geom_point() +
  scale_colour_manual(values=levels(res2.df$color)) +
  # theme(axis.line = element_blank(),
  #       panel.grid.major = element_blank(),
  #       panel.grid.minor = element_blank(),
  #       panel.border = element_blank(),
  #       panel.background = element_blank(), axis.title=element_blank(), axis.text = element_blank()) + 
  # 'none' replaces the deprecated guides(color=FALSE)
  guides(color='none') + theme_classic() + coord_cartesian(ylim=c(0,6))
ggsave(filename=paste0(savedir, '31_barPlot_knownGenes_presenceInIndividualTxomes_XAXISPATTERN.png'),
       plot=pl4, width=4, height=1, dpi=200)
####


#### Supplementary Figure S1, S2 & S3 Generation ####
library(QuasR)
library(BSgenome.Mmusculus.UCSC.mm10)
library(rtracklayer)
library(parallel)
library(GenomicFeatures)
library(TxDb.Mmusculus.UCSC.mm10.knownGene)
library(ggplot2)
library(reshape2)
library(mgcv)
library(GGally)
library(ggrepel)
library(viridis)
library(ggcorrplot)
library(plyr)
library(edgeR)


savedir <- '/work2/gpeters/rohmalex/work/Mark/RNA-seq_181127/31_transcriptomeAssembly/analysis/wholeProcess_190410/'
Rdatadir <- '/work2/gpeters/rohmalex/work/Mark/RNA-seq_181127/31_transcriptomeAssembly/wholeProcess_190410/82_Rdata/'
txomeGtf <- '/work2/gpeters/rohmalex/work/Mark/RNA-seq_181127/31_transcriptomeAssembly/wholeProcess_190410/42_modelMerged/globalTranscriptome.gtf'


### Load data
exprAllSamples <- readRDS(paste0(Rdatadir, 'log2_rpkm_psc1_genes_allExternalSamples.rds'))
metadataDf <- readRDS(paste0(Rdatadir, 'metadata_withGeneStatistics.rds'))
# lookup table gene id -> known/novel flag
geneId2isKnown <- metadataDf[, c('gene_id', 'overlapWithKnownGeneBool')]


### 1 : 3 boxplots for the distribution of expression (all genes, known, novel) for all 
###     samples for RS and ES and Gaysinkaya's samples for meiotic celltypes
# select relevant samples
exprChosenSamples <- exprAllSamples[, grep('(Gill|Erkek|Gaysinkaya)', colnames(exprAllSamples)), drop=FALSE]
# exclude by logical mask: x[, -grep(...)] would drop every column if no
# column name matched
exprChosenSamples <- exprChosenSamples[, !grepl('H3f3b|PreLeptotene', colnames(exprChosenSamples)), drop=FALSE]
# average replicates (and L+Z and P+D for Gaysinkaya)
celltypesTmp <- c('Leptotene|Zygotene', 'Pachytene|Diplotene', 'Round_Spermatid', 'Elongating_Spermatid')
exprChosenSamplesAvg <- lapply(celltypesTmp, function(e){
  dataCt <- exprChosenSamples[, grep(e, colnames(exprChosenSamples)), drop=FALSE]
  # split the columns of this cell type into the 'Gill' samples and the others
  isGill <- grepl('Gill', colnames(dataCt))
  dataCtExp <- list(dataCt[, isGill, drop=FALSE],
                    dataCt[, !isGill, drop=FALSE])
  # label of the second group = text before the first '_' of its first column
  secondName <- gsub('_.*', '', colnames(dataCtExp[[2]])[1])
  ct <- gsub('\\|', '_', e)
  names(dataCtExp) <- c(paste('Gill', ct, sep='_'),
                        paste(secondName, ct, sep='_'))
  dataAvg <- lapply(dataCtExp, function(ee){
    # Always reduce to a plain vector, also for a single replicate: cbind()
    # names vectors after the list names, but a one-column matrix keeps its
    # own column name (this produced 'Erkek_Elongating_Spermatid_1' before
    # and required a regex clean-up of the column names).
    if(ncol(ee) >= 1){
      ee <- rowMeans(ee)
    }
    return(ee)
  })
  # return matrix
  return(do.call(cbind, dataAvg))
})
exprChosenSamplesAvg <- as.data.frame(do.call(cbind, exprChosenSamplesAvg))
exprChosenSamplesAvg$gene_id <- rownames(exprChosenSamplesAvg)

# transform data for plot: one row per gene x averaged sample
exprChosenSamplesAvgMelt <- melt(exprChosenSamplesAvg, id.vars=c('gene_id'))
# column names are <author>_<celltype>: split them at the first underscore
exprChosenSamplesAvgMelt$celltype <- sub('^[^_]+_', '', exprChosenSamplesAvgMelt$variable)
exprChosenSamplesAvgMelt$author <- sub('^([^_]+)_.*', '\\1', exprChosenSamplesAvgMelt$variable)
# attach the known/novel flag of every gene
knownIdx <- match(exprChosenSamplesAvgMelt$gene_id, geneId2isKnown$gene_id)
exprChosenSamplesAvgMelt$overlapWithKnownGeneBool <- geneId2isKnown$overlapWithKnownGeneBool[knownIdx]

# violin + narrow boxplot per author, one facet per cell type
pl <- ggplot(data=exprChosenSamplesAvgMelt, aes(x=overlapWithKnownGeneBool, y=value, fill=author)) +
  geom_violin(position=position_dodge(width=0.8)) +
  geom_boxplot(width=0.1, outlier.alpha=0.3, outlier.size=0.7, position=position_dodge(width=0.8)) +
  facet_wrap(.~factor(celltype, levels=gsub('\\|', '_', celltypesTmp))) +
  ylab('log2(rpkm+0.1)') +
  scale_x_discrete(name='Predicted genes', breaks=c(TRUE, FALSE), labels=c('known', 'novel'))
ggsave(filename=paste0(savedir, '71_boxplotsExpressionKnownVsNovelGenes_perCelltype.png'), plot=pl,
       device='png', width=6, height=6, units='in', dpi=200)
# - add Kaessmann's mouse testis RNA-seq data on the log2 rpkm distribution plots

# Load data
counts.KaessmannsRNA <- readRDS(file='/tungstenfs/groups/gpeters/rohmalex/work/Mark/RNA-seq_181127/32_transcriptomeValidation/172_RIBOseqAnalysis_qCount/RData/counts.KaessmannsRNA.rds')
alnStats.KaessmannsRNA <- readRDS(file='/tungstenfs/groups/gpeters/rohmalex/work/Mark/RNA-seq_181127/32_transcriptomeValidation/172_RIBOseqAnalysis_qCount/RData/alnStats.KaessmannsRNA.rds')

# The averaged values are later cbind()-ed to exprChosenSamplesAvg row by row,
# so the gene order must be the same; stop instead of silently misaligning.
stopifnot(identical(exprChosenSamplesAvg$gene_id, rownames(counts.KaessmannsRNA)))

# log2(RPKM + 0.1); column 1 of the count table is the gene length
log2rpkm.KaessmannsRNA <- log2(edgeR::rpkm(y=counts.KaessmannsRNA[, -1],
                                           gene.length=counts.KaessmannsRNA[, 1],
                                           lib.size=alnStats.KaessmannsRNA$mapped,
                                           log=FALSE) + 0.1)

# remove liver data (logical mask: a negative grep() index would drop every
# column if no column name contained 'liver')
log2rpkm.KaessmannsRNA <- log2rpkm.KaessmannsRNA[, !grepl('liver', colnames(log2rpkm.KaessmannsRNA)), drop=FALSE]

# avg across replicates
# Replicates share the column name once '_1' / '_2' / '_3' is removed. Columns
# are grouped by exact equality of that name (not by substring search), and
# drop=FALSE keeps rowMeans() working for a group with a single replicate.
replicateGroup <- gsub('_[123]', '', colnames(log2rpkm.KaessmannsRNA))
log2rpkm.KaessmannsRNA.avg <- sapply(unique(replicateGroup), function(e){
  return(rowMeans(log2rpkm.KaessmannsRNA[, replicateGroup == e, drop=FALSE]))
})
colnames(log2rpkm.KaessmannsRNA.avg) <- paste0(colnames(log2rpkm.KaessmannsRNA.avg), '_avg')
log2rpkm.KaessmannsRNA.avg <- as.data.frame(log2rpkm.KaessmannsRNA.avg)


# Kaessmann columns to add, named with the <author>_<celltype> labels used for
# the other samples. Renaming happens before cbind(), by name, so it does not
# depend on how many columns exprChosenSamplesAvg has (previously the columns
# at positions 10:12 were renamed).
kaessmannCols <- c(Kaessmann_Spermatocyte='mouse_spermatocytes_rna_avg',
                   Kaessmann_Round_Spermatid='mouse_roundSpermatids_rna_avg',
                   Kaessmann_Elongating_Spermatid='mouse_elongatingSpermatids_rna_avg')
kaessmannAvg <- log2rpkm.KaessmannsRNA.avg[, kaessmannCols]
colnames(kaessmannAvg) <- names(kaessmannCols)
exprChosenSamplesAvgBis <- cbind(exprChosenSamplesAvg, kaessmannAvg)

# long format and annotation, as for the plot without Kaessmann's data
exprChosenSamplesAvgBisMelt <- melt(exprChosenSamplesAvgBis, id.vars=c('gene_id'))
exprChosenSamplesAvgBisMelt$celltype <- gsub('^[^_]+_', '', exprChosenSamplesAvgBisMelt$variable)
exprChosenSamplesAvgBisMelt$celltype <- factor(exprChosenSamplesAvgBisMelt$celltype,
                                               levels=c('Leptotene_Zygotene','Pachytene_Diplotene','Spermatocyte',
                                                        'Round_Spermatid','Elongating_Spermatid'))
exprChosenSamplesAvgBisMelt$author <- gsub('^([^_]+)_.*', '\\1', exprChosenSamplesAvgBisMelt$variable)
exprChosenSamplesAvgBisMelt$overlapWithKnownGeneBool <- geneId2isKnown[match(exprChosenSamplesAvgBisMelt$gene_id, geneId2isKnown$gene_id),
                                                                       'overlapWithKnownGeneBool']
plBis <- ggplot(data=exprChosenSamplesAvgBisMelt, aes(x=overlapWithKnownGeneBool, y=value, fill=author)) +
  geom_violin(position=position_dodge(width=0.8)) +
  geom_boxplot(width=0.1, outlier.alpha=0.3, outlier.size=0.7, position=position_dodge(width=0.8)) +
  facet_wrap(.~celltype) +
  ylab('log2(rpkm+0.1)') + scale_x_discrete(name='Predicted genes', breaks=c(TRUE, FALSE), labels=c('known', 'novel'))
ggsave(filename=paste0(savedir, '71bis_boxplotsExpressionKnownVsNovelGenes_perCelltype_withKaessmannsData.png'), plot=plBis,
       width=6, height=6, units='in', dpi=200)

# H3K4me3 in promoters of novel and known genes
library(GenomicRanges)
library(QuasR)
library(BSgenome.Mmusculus.UCSC.mm10)
library(parallel)
library(reshape2)
library(ComplexHeatmap)
library(circlize)


# use the cairo backend for bitmap graphics devices
options(bitmapType='cairo')


# output directories of the H3K4me3 analysis (created when missing) and the
# directory holding the merged transcriptome files
baseDir <- '/tungstenfs/groups/gpeters/rohmalex/work/Mark/RNA-seq_181127/32_transcriptomeValidation/'
outDir <- paste0(baseDir, '120_K4me3aroundTSStoConfirmTx/')
outDataDir <- paste0(outDir, 'RData/')
outPlotDir <- paste0(outDir, 'plots/')
invisible(lapply(c(outDir, outDataDir, outPlotDir), dir.create, showWarnings=FALSE))
inpDataDir <- '/tungstenfs/groups/gpeters/rohmalex/work/Mark/RNA-seq_181127/31_transcriptomeAssembly/wholeProcess_190410/42_modelMerged/'



# smoothing parameters and pseudocount for profiles
# (psc was 0.1 in the RNA-seq part above and is overwritten here)
smooth.k <- 5
psc <- 0.0
trim <- 0.03

# Clamp the values of x to a central quantile range: everything below the
# trim/2 quantile is raised to it, everything above the 1-trim/2 quantile is
# lowered to it. NAs are ignored for the quantiles and left untouched.
trim.vals <- function(x, trim=0.01){
  bounds <- quantile(x, probs=c(trim/2, 1-trim/2), na.rm=TRUE)
  lower <- bounds[1]
  upper <- bounds[2]
  x[which(x < lower)] <- lower
  x[which(x > upper)] <- upper
  x
}


# Functions to normalize and smooth profiles

# Smooth every row of a matrix with a centred running mean of width k.
#   x  numeric matrix, one profile per row
#   k  window width; k = 1 leaves the values unchanged
# Returns a matrix of the same dimensions. Row names are kept, column names
# are not. The floor(k/2) columns at each end, where the full window does not
# fit, are filled with the nearest fully smoothed column.
smoothRows <- function(x, k=3) {
  # running mean; stats:: is explicit because filter() is masked as soon as
  # dplyr is attached
  tmp <- t(apply(x, 1, function(xx) as.vector(stats::filter(xx, rep(1, k)/k))))
  # endrule := "constant"
  nNA <- floor(k/2)
  # nothing to fill for k = 1 (1:0 and (ncol+1):ncol are not empty ranges in R
  # and the second one indexed past the last column)
  if (nNA > 0) {
    tmp[, 1:nNA] <- tmp[, nNA+1]
    tmp[, (ncol(tmp)-nNA+1):ncol(tmp)] <- tmp[, ncol(tmp)-nNA]
  }
  tmp
}

# Turn a matrix of raw counts into a smoothed, outlier-clamped profile.
#   matr         count matrix (rows = regions, columns = positions)
#   norm.factor  value the counts are divided by before multiplying by 1e+6
#   smooth.k     running-mean width handed to smoothRows()
#   psc          pseudocount added to the counts before scaling
#   trim         total fraction clamped: values outside the trim/2 and
#                1-trim/2 quantiles of the whole matrix are set to them
# Returns the processed matrix with the column names of `matr`.
norm.smooth.trim.profile <- function(matr, norm.factor, smooth.k, psc, trim){
  scaled <- (matr + psc)/norm.factor * 1e+6
  smoothed <- smoothRows(scaled, k=smooth.k)
  # smoothRows() does not keep column names
  colnames(smoothed) <- colnames(scaled)
  
  limits <- quantile(smoothed, c(trim/2,1-trim/2), na.rm=TRUE)
  smoothed[which(smoothed < limits[1])] <- limits[1]
  smoothed[which(smoothed > limits[2])] <- limits[2]
  smoothed
}



## 1. 5' of every tx + label main/secondary isoform and known/novel

txListFile <- paste0(inpDataDir, 'globalTranscriptome_fastaIdentifiersWithGencodeInfo.txt')
# one row per tx, format : <exonsCoordinates>_<geneId>_<(mono/multi)-exonic>[_<gencodeGeneId>_<gencodeGeneName>_<gencodeGeneType>_<gffcompareOverlapType>]
# exonsCoordinates is chr1:4492046-4492668(-);chr1:4496291-4497586(-)
# -> if only 3 values when splitting, the tx is novel, if 7, the tx is known
txExprFile <- paste0(inpDataDir, 'meanExpression_allTx.tsv')
# one row per tx + header ; first column is txId (exonsCoordinates) then one column per celltype ; expression value in TPM

txomeMetadataDf <- read.table(file=txListFile, header=FALSE, stringsAsFactors=FALSE)
colnames(txomeMetadataDf) <- c('fastaIdLong')

# Split all identifiers at once instead of building one data.frame per
# transcript and rbind()-ing tens of thousands of them.
# field 1 = exon coordinates, field 2 = gene id; exactly 3 fields = novel tx,
# any other number of fields = known tx
idFields <- strsplit(txomeMetadataDf$fastaIdLong, '_')
dataTmp <- data.frame(fastaId=vapply(idFields, function(f) f[1], character(1)),
                      geneId=vapply(idFields, function(f) f[2], character(1)),
                      overlapsGencode=ifelse(lengths(idFields) == 3, 'novel', 'known'),
                      stringsAsFactors=FALSE)

txomeMetadataDf <- cbind(txomeMetadataDf, dataTmp)


# identify TSS of every transcript
# For each transcript the ';'-separated exon list in fastaId is parsed:
#   chr    = leading 'chr' + 1-2 characters of [0-9A-Z] of the first exon
#   strand = the '+' or '-' between parentheses at the end of the first exon
#   TSS    = start coordinate of the first listed exon for '+' transcripts,
#            end coordinate of the last listed exon for '-' transcripts
# NOTE(review): this assumes exons are listed in ascending genomic order on
# both strands, as in the example 'chr1:4492046-4492668(-);chr1:4496291-4497586(-)'.
# NOTE(review): sub() returns its input unchanged when a pattern does not match
# (e.g. a sequence name not covered by the chr pattern); such rows would carry
# the whole exon string as chr / NA as TSS -- verify none exist.
TSSdata = lapply(1:nrow(txomeMetadataDf), function(i){
  exonsCoordList = strsplit(txomeMetadataDf$fastaId[i], ';')[[1]]
  chr = sub('^(chr[0-9A-Z]{1,2}):.*', '\\1', exonsCoordList[1])
  strand = sub('.*\\(([+-])\\)$', '\\1', exonsCoordList[1])
  # ifelse() evaluates both candidates; only the one matching the strand is kept
  TSS = as.integer(ifelse(strand == '+', 
                          sub('.*:([0-9]+)-.*', '\\1', exonsCoordList[1]),
                          sub('.*-([0-9]+)\\(-)$', '\\1', exonsCoordList[length(exonsCoordList)])))
  return(data.frame(chr=chr, TSS=TSS, strand=strand, stringsAsFactors=FALSE))
})
# stack the one-row data.frames into a single table (same row order as txomeMetadataDf)
TSSdata = do.call(rbind, TSSdata)

# append chr / TSS / strand to the transcript metadata
txomeMetadataDf = cbind(txomeMetadataDf, TSSdata)


# identify main and alternative transcripts (main will be the one with highest mean expression across all 5 stages)

txomeExprDf <- read.table(file=txExprFile, header=TRUE, sep='\t', stringsAsFactors=FALSE)
# mean over all expression columns (everything but the first, id, column)
txomeExprDf$globalMean <- unname(apply(as.matrix(txomeExprDf[, -1]), 1, mean))

# Everything below indexes txomeExprDf with row positions taken from
# txomeMetadataDf, so both tables must list the same transcripts in the same
# order; stop instead of silently pairing the wrong rows.
stopifnot(identical(txomeMetadataDf$fastaId, txomeExprDf$name))


txomeMetadataDf$isoform <- 'alternative'
# row positions grouped by gene (genes in order of first appearance), then the
# id of the most expressed transcript of every gene; grouping once avoids
# re-scanning the whole table for each gene
rowsByGene <- split(seq_len(nrow(txomeMetadataDf)),
                    factor(txomeMetadataDf$geneId, levels=unique(txomeMetadataDf$geneId)))
mainIsoformsIds <- unlist(lapply(rowsByGene, function(rows){
  txomeExprDf$name[rows[which.max(txomeExprDf$globalMean[rows])]]
}))

txomeMetadataDf$isoform[which(txomeMetadataDf$fastaId %in% mainIsoformsIds)] <- 'main'
table(txomeMetadataDf$isoform)
# alternative        main 
#       26570       27289

txomeMetadataDf$globalMeanExpr <- txomeExprDf$globalMean

# save data.frame
saveRDS(object=txomeMetadataDf, file=paste0(outDataDir, 'txomeMetadataDf.rds'))



## /!\ DISCARD FROM ALL DATA.FRAMES ALL TX WHICH HAVE THE SAME TSS
nrow(unique(txomeMetadataDf[, c('chr', 'TSS', 'strand')])) # 41296

# one key per (chr, TSS, strand)
txomeMetadataDf$chr.TSS.strand <- paste(txomeMetadataDf$chr, txomeMetadataDf$TSS, txomeMetadataDf$strand, sep='.')
length(unique(txomeMetadataDf$chr.TSS.strand)) # 41296

# keys shared by several transcripts, and the row positions of each of them
# (grouped once instead of re-scanning the table for every key)
dupKeys <- unique(txomeMetadataDf$chr.TSS.strand[which(duplicated(txomeMetadataDf$chr.TSS.strand))])
rowsByDupKey <- split(seq_len(nrow(txomeMetadataDf)), txomeMetadataDf$chr.TSS.strand)[dupKeys]

# all tx with the same TSS belong to the same gene ?
table(vapply(rowsByDupKey, function(rows){
  length(unique(as.character(txomeMetadataDf$geneId[rows]))) == 1
}, logical(1)))
# FALSE  TRUE 
#     2  8512

# per shared TSS, rows to discard:
#  - exactly one main isoform: discard all the others
#  - otherwise: keep only the one with the highest expression
rowsToDiscard <- unlist(lapply(rowsByDupKey, function(rows){
  isMain <- txomeMetadataDf$isoform[rows] == 'main'
  if(sum(isMain) == 1){
    rows[which(!isMain)]
  } else {
    rows[-which.max(txomeMetadataDf$globalMeanExpr[rows])]
  }
}), use.names=FALSE)
txToDiscard <- as.character(txomeMetadataDf$fastaId[rowsToDiscard])

length(txToDiscard) # 12563

table(txomeMetadataDf$isoform[which(txomeMetadataDf$fastaId %in% txToDiscard)])
# alternative        main 
#       12561           2


# logical mask: df[-which(...), ] would return an empty table if there were
# nothing to discard
txomeMetadataDf.uniqTSS <- txomeMetadataDf[!(txomeMetadataDf$fastaId %in% txToDiscard), ]
nrow(txomeMetadataDf.uniqTSS) #41296


# save txomeMetadataDf.uniqTSS
saveRDS(object=txomeMetadataDf.uniqTSS, file=paste0(outDataDir, 'txomeMetadataDf.uniqTSS.rds'))



table(txomeMetadataDf$overlapsGencode)
# known novel 
# 43940  9919
table(txomeMetadataDf.uniqTSS$overlapsGencode)
# known novel 
# 32675  8621



# 2. Consider TSS +/- 2kb and split these regions into 100 40bp bins

# one 1-bp, stranded range per unique TSS, named by the transcript id (fastaId)
TSSgr = makeGRangesFromDataFrame(df=txomeMetadataDf.uniqTSS, keep.extra.columns=FALSE, ignore.strand=FALSE,
                                 seqnames.field='chr', start.field='TSS', end.field='TSS', strand.field='strand')
names(TSSgr) = txomeMetadataDf.uniqTSS$fastaId

# strand-aware window of 2000 bp upstream and 2000 bp downstream of each TSS
# NOTE(review): TSSgr carries no seqlengths here, so windows reaching past a
# chromosome end are not trimmed -- confirm this cannot happen
TSSpm2kbGr = promoters(TSSgr, upstream=2000, downstream=2000)

# 100 tiles per window, flattened into a single GRanges
TSSpm2kbTilesGr = unlist(tile(x=TSSpm2kbGr, n=100), use.names=FALSE)

# TSSpm2kbTilesGr = tile(x=TSSpm2kbGr, n=100) #GRangesList
# # revert order of tiles for regions on the - strand to have the downstream tiles before the upstream ones in any case
# TSSpm2kbTilesGr = lapply(TSSpm2kbTilesGr, function(e){
#   if(as.character(strand(e[1])) == '+') return(e)
#   else return(rev(e))
# })
# TSSpm2kbTilesGr = do.call(c, TSSpm2kbTilesGr)

# Use as names the original names (the fastaId), and append the position relative to anchor [TSS] (e.g. "...:-1981")
# NOTE(review): the lookup TSSgr[names(TSSpm2kbTilesGr)] requires every tile to
# carry the name of its parent window after unlist(..., use.names=FALSE);
# verify that names(TSSpm2kbTilesGr) is not NULL with the installed
# GenomicRanges version
# position = tile centre minus TSS coordinate (genomic orientation at this point)
pos.rel.to.TSS <- start(resize(TSSpm2kbTilesGr, width=1, fix='center')) - start(TSSgr[names(TSSpm2kbTilesGr)])
# there is a gap of 1 between the positions for tiles on the + or - strand so we add 1 to the ones on the + strand to have
# consistent values everywhere
pos.rel.to.TSS[which(strand(TSSpm2kbTilesGr) == '+')] = pos.rel.to.TSS[which(strand(TSSpm2kbTilesGr) == '+')] + 1
# the strand was not taken into account so far so that regions on the - strand have the upstream tiles before the downstream ones
# and the positions relative to TSS are negative
# -> flip the sign on the - strand so that negative always means upstream
pos.rel.to.TSS[which(strand(TSSpm2kbTilesGr) == '-')] = -pos.rel.to.TSS[which(strand(TSSpm2kbTilesGr) == '-')]

# final tile names: <fastaId>:<position relative to TSS>
names(TSSpm2kbTilesGr) <- paste(names(TSSpm2kbTilesGr), pos.rel.to.TSS, sep=":")




# 3. Count K4me3 signal in these bins

# QuasR project on the H3K4me3 sample sheet (single-end: paired='no').
# Note: this overwrites the sampleFile variable of the RNA-seq part above.
sampleFile = '/tungstenfs/groups/gpeters/rohmalex/work/Mark/RNA-seq_181127/32_transcriptomeValidation/120_spermatogenesis_H3K4me3_BAM.qsample'
qProj = qAlign(sampleFile=sampleFile, genome='BSgenome.Mmusculus.UCSC.mm10', paired='no')

# alignment statistics with the ':genome' suffix removed from the row names;
# the 'mapped' column is used below as normalisation factor
alnStats = as.data.frame(alignmentStats(qProj))
rownames(alnStats) = gsub(':genome$', '', rownames(alnStats))

# count alignments per tile on a 20-worker cluster, regardless of strand
# (orientation='any'), with mapqMin=255L
clObj = makeCluster(20)
counts.TSSpm2kbTiles = qCount(proj=qProj, query=TSSpm2kbTilesGr, mapqMin=255L, orientation='any', clObj=clObj)
stopCluster(clObj)


# save counts matrix
saveRDS(object=counts.TSSpm2kbTiles, file=paste0(outDataDir, 'counts.H3K4me3.TSSpm2kbTiles.uniqueMappers.rds'))


# 4. Normalize and smooth the signal

## Normalize and smooth profiles, loop across all matrices

# per-tile counts without the first (width) column, and the number of mapped
# reads of every sample as normalisation factor
tiles.RC <- as.data.frame(counts.TSSpm2kbTiles[, -1])
norm.factors <- alnStats$mapped[match(colnames(tiles.RC), rownames(alnStats))]
names(norm.factors) <- colnames(tiles.RC)

# tile names are '<region>:<position>': split them back into the two parts
tiles.RC$region <- sub('^(.*\\)):-?[0-9]{1,4}$', '\\1', rownames(tiles.RC))
tiles.RC$pos <- sub('.*:(-?[0-9]{1,4})$', '\\1', rownames(tiles.RC))

# regions keep their order of appearance, positions are sorted numerically;
# both become factors so that the matrices built below follow that order
region.order <- unique(tiles.RC$region)
pos.order <- as.character(sort(as.numeric(unique(tiles.RC$pos))))
tiles.RC$pos <- factor(tiles.RC$pos, levels=pos.order)
tiles.RC$region <- factor(tiles.RC$region, levels=region.order)

# Create list of matrices for each sample
# (every column whose name does not contain 'region' or 'pos' is a sample)
sample.names <- grep("region|pos", colnames(tiles.RC), perl=TRUE, invert=TRUE, value=TRUE)


# one region x position count matrix per sample
merged.profiles <- lapply(setNames(nm=sample.names), function(smnm){
  acast(data=tiles.RC,
        formula=region ~ pos,
        value.var=smnm)
})

# scale by the sample's mapped reads, smooth along positions, clamp outliers
merged.smooth.profiles <- lapply(setNames(nm=names(merged.profiles)), function(nm){
  norm.smooth.trim.profile(merged.profiles[[nm]],
                           norm.factor=norm.factors[nm],
                           smooth.k=smooth.k,
                           psc=psc,
                           trim=trim)
})


# gene order on the heatmap : rank by mean over LZ from highest to lowest
# (the first matrix of the list is taken as the LZ sample)


genes.order <- order(rowMeans(merged.smooth.profiles[[1]]), decreasing=TRUE)


# tmpa = do.call(cbind, merged.smooth.profiles)
# colnames(tmpa) = paste(rep(names(merged.smooth.profiles), each=100), colnames(tmpa), sep=':')
# 
# tmpb = as.matrix(tmpa[genes.order, ])
# rownames(tmpb) = NULL
# colnames(tmpb) = NULL
# tmpc = reshape2::melt(tmpb)
# colnames(tmpc) = c('y', 'x', 'value')
# 
# tmpd = reshape2::melt(tmpb[1:4000, ])
# colnames(tmpd) = c('y', 'x', 'value')
# 
# ggplot(data=tmpd, aes(x=x, y=y)) +
#   geom_raster(aes(fill=value)) + 
#   scale_fill_viridis()
# 
# 
# tmpe = tmpb[which(as.character(strand(TSSgr)) == '+'), ]
# tmpe = reshape2::melt(tmpe[1:4000, ])
# colnames(tmpe) = c('y', 'x', 'value')
# tmpf = tmpb[which(as.character(strand(TSSgr)) == '-'), ]
# tmpf = reshape2::melt(tmpf[1:4000, ])
# colnames(tmpf) = c('y', 'x', 'value')
# tmpg = tmpb[which(txomeMetadataDf.uniqTSS$overlapsGencode == 'novel'), ]
# tmpg = reshape2::melt(tmpg[1:4000, ])
# colnames(tmpg) = c('y', 'x', 'value')
# 
# 
# ggplot(data=tmpf, aes(x=x, y=y)) +
#   geom_raster(aes(fill=value)) + 
#   scale_fill_viridis()





# define color palettes for each heatmap
# Packages are attached once, up front, with library(): require() inside the
# loop only returned FALSE when a package was missing.
library(RColorBrewer)
library(circlize)
# One colour mapping per sample: 9 equally spaced breaks between the smallest
# and the largest value of that sample's matrix, from light grey through the
# 'Oranges' palette (without its lightest colour).
col.pal <- lapply(merged.smooth.profiles, function(e){
  # na.rm=TRUE: a single NA would otherwise turn every break into NA
  range.val <- range(e, na.rm=TRUE)
  colorRamp2(breaks=seq(range.val[1], range.val[2], length.out=9),
             colors=c("grey95", brewer.pal(9, "Oranges")[-1]))
})



# annotation part
# The annotation vectors are reordered with genes.order, which was computed on
# the profile matrices: both must list the same transcripts in the same order.
stopifnot(identical(txomeMetadataDf.uniqTSS$fastaId, rownames(merged.smooth.profiles[[1]])))
annot.htmp <- rowAnnotation(overlapsGencode=txomeMetadataDf.uniqTSS$overlapsGencode[genes.order],
                            isoform=txomeMetadataDf.uniqTSS$isoform[genes.order],
                            col=list(overlapsGencode=c("known"="#1036e0", "novel"="#f7b645"),
                                     isoform=c("main"='#e01036', "alternative"='#73d65a')))

# first heatmap carries the row annotation on its left
htmp.list <- Heatmap(matrix=merged.smooth.profiles[[1]][genes.order, ],
                     name=names(merged.smooth.profiles)[1],
                     col=col.pal[[1]],
                     cluster_columns=FALSE,
                     cluster_rows=FALSE,
                     show_row_names=FALSE,
                     show_column_names=FALSE,
                     show_heatmap_legend=TRUE,
                     left_annotation=annot.htmp)
# remaining samples, same row order; seq_along(...)[-1] is empty when there is
# a single sample, whereas 2:length(...) would count down from 2 to 1
for (i in seq_along(merged.smooth.profiles)[-1]){
  htmp.list <- htmp.list + Heatmap(matrix=merged.smooth.profiles[[i]][genes.order, ],
                                   name=names(merged.smooth.profiles)[i],
                                   col=col.pal[[i]],
                                   cluster_columns=FALSE,
                                   cluster_rows=FALSE,
                                   show_row_names=FALSE,
                                   show_column_names=FALSE,
                                   show_heatmap_legend=TRUE)
}

pdf(paste0(outPlotDir, '01_heatmap_K4me3_LZtoES_TSSpm2kb_orderedByRowMeansLZ_uniqTSS.pdf'))
draw(htmp.list)
dev.off()


## 6. Discard overlapping regions

# Overlapping TSS +/-2kb windows: self-overlap of TSSpm2kbGr, keeping only the rows with
# queryHits < subjectHits so that each pair of windows is listed once.
ov = as.data.frame(findOverlaps(TSSpm2kbGr))
ov = ov[with(ov, queryHits < subjectHits), ] # 16636

# # Number of individual regions overlapping only one other region -> found only once in subjectHits and not found in queryHits
# table(!duplicated(ov$subjectHits) & !(ov$subjectHits %in% unique(ov$queryHits)))
# # FALSE  TRUE 
# #  8162  8474
# 
# ovRed = ov[which(!duplicated(ov$subjectHits) & !(ov$subjectHits %in% unique(ov$queryHits))), ]
# 
# identical(names(TSSpm2kbGr), txomeMetadataDf.uniqTSS$fastaId) #TRUE
# 
# # loop across these and discard the most lowly expressed (mean across all 5 stages)
# toDiscard = sapply(1:nrow(ovRed), function(i){
#   j = which.min(txomeMetadataDf.uniqTSS$globalMeanExpr[as.integer(ovRed[i, ])])
#   return(ovRed[i, j])
# })
# 
# ov2 = as.data.frame(findOverlaps(TSSpm2kbGr[-toDiscard]))
# ov2 = ov2[which(ov2$queryHits < ov2$subjectHits), ] # 4733


# Greedy removal of overlapping windows: while any two of the remaining TSS +/-2kb windows
# overlap, take the first reported overlapping pair and discard the member with the lowest
# globalMeanExpr. toDiscard2 accumulates the discarded row indices, expressed relative to
# the full TSSpm2kbGr / txomeMetadataDf.uniqTSS (not to the reduced objects).
toDiscard2 = integer(0)
overlapsBool = TRUE
while(overlapsBool){
  if(length(toDiscard2) == 0){
    redGr = TSSpm2kbGr
    redMetadataDf = txomeMetadataDf.uniqTSS
  }
  else {
    redGr = TSSpm2kbGr[-toDiscard2]
    redMetadataDf = txomeMetadataDf.uniqTSS[-toDiscard2, ]
  }
  ov3 = as.data.frame(findOverlaps(redGr))
  ov3 = ov3[which(ov3$queryHits < ov3$subjectHits), ]
  # nothing overlaps (possible on the very first pass): stop here instead of indexing
  # row 1 of an empty table
  if(nrow(ov3) == 0){
    overlapsBool = FALSE
    break
  }
  # indices (within the reduced objects) of the two windows forming the first pair
  pairIdx = as.integer(ov3[1, ])
  j = which.min(redMetadataDf$globalMeanExpr[pairIdx])
  if(length(j) == 0){
    stop("cannot choose which overlapping region to discard: globalMeanExpr is NA for both ",
         redMetadataDf$fastaId[pairIdx[1]], " and ", redMetadataDf$fastaId[pairIdx[2]])
  }
  loserIdx = pairIdx[j]
  # translate the reduced index back to a row of the full table through fastaId
  toDiscard2 = c(toDiscard2, which(txomeMetadataDf.uniqTSS$fastaId == redMetadataDf$fastaId[loserIdx]))
  # finished once every remaining overlap involved the window that was just discarded
  if(!any(ov3$queryHits != loserIdx & ov3$subjectHits != loserIdx)){overlapsBool = FALSE}
}

length(toDiscard2) #11914

# Sanity check on the retained windows. They are selected positively because
# x[-toDiscard2] would select nothing at all if toDiscard2 were empty.
keptIdx = setdiff(seq_along(TSSpm2kbGr), toDiscard2)
ov4 = as.data.frame(findOverlaps(TSSpm2kbGr[keptIdx]))
ov4 = ov4[which(ov4$queryHits < ov4$subjectHits), ] #0
stopifnot(nrow(ov4) == 0)

length(keptIdx) #29382


# Rows are dropped by position below, so the profile matrices must be in the same row
# order as TSSpm2kbGr (which toDiscard2 indexes).
stopifnot(identical(rownames(merged.profiles[[1]]), names(TSSpm2kbGr))) # TRUE

# Keep only the non-overlapping windows in every profile matrix. Rows are selected
# positively: e[-toDiscard2, ] would drop every row if toDiscard2 were empty, and
# drop=FALSE keeps a matrix even if a single row remains.
merged.profiles2 = lapply(merged.profiles, function(e){
  e[setdiff(seq_len(nrow(e)), toDiscard2), , drop=FALSE]
})


# Same normalisation / smoothing / trimming as for the full set, applied to the reduced
# matrices; the result is a list named like merged.profiles2.
merged.smooth.profiles2 <- sapply(names(merged.profiles2), function(nm){
  norm.smooth.trim.profile(merged.profiles2[[nm]],
                           norm.factor=norm.factors[nm],
                           smooth.k=smooth.k,
                           psc=psc,
                           trim=trim)
}, simplify=FALSE, USE.NAMES=TRUE)



### CGI status and GC content
# Retained windows, selected positively: TSSpm2kbGr[-toDiscard2] would be empty if
# toDiscard2 were empty.
TSSpm2kbGr2 = TSSpm2kbGr[setdiff(seq_along(TSSpm2kbGr), toDiscard2)] # 29382 4kb promoter regions

# CGI: TRUE when the window overlaps at least one range of the CpG island object
CGIgr = readRDS(file='/tungstenfs/groups/gpeters/rohmalex/data/cpgIslandExt.granges.rds')
TSSpm2kbGr2$CGI = overlapsAny(TSSpm2kbGr2, CGIgr)

# percGC: share of C and G among the counted single-nucleotide frequencies of each window
library(Biostrings)
library(BSgenome.Mmusculus.UCSC.mm10)
TSSpm2kbGr2.seq <- getSeq(BSgenome.Mmusculus.UCSC.mm10, TSSpm2kbGr2)
TSSpm2kbGr2.freq <- oligonucleotideFrequency(TSSpm2kbGr2.seq, width=1)
# drop=FALSE so that rowSums() still receives a matrix when there is a single window
TSSpm2kbGr2.percGC <- rowSums(TSSpm2kbGr2.freq[, c("C","G"), drop=FALSE]) *100 /rowSums(TSSpm2kbGr2.freq) # GC percent
TSSpm2kbGr2$percGC = TSSpm2kbGr2.percGC



# row order for the reduced set: decreasing mean signal in the first profile matrix
genes.order2 = order(rowMeans(merged.smooth.profiles2[[1]]), decreasing=TRUE)



# define color palettes for each heatmap
# library() instead of require(): a missing package fails here with a clear message
# instead of a later "could not find function". Both packages are still attached, as the
# top-level colorRamp2() / brewer.pal() call for col.pal.pctGC below needs them.
library(RColorBrewer)
library(circlize)
# one mapping per profile matrix, spanning that matrix's own value range in 9 steps
col.pal2 <- lapply(merged.smooth.profiles2, function(e){
  range.val <- range(e)
  colorRamp2(breaks=seq(range.val[1], range.val[2], length.out=9),
             colors=c("grey95", brewer.pal(9, "Oranges")[-1]))
})

# GC percent palette, spanning the observed percGC range in 9 steps
col.pal.pctGC <- list(pctGC=colorRamp2(breaks=seq(min(TSSpm2kbGr2$percGC), max(TSSpm2kbGr2$percGC), length.out=9),
                                       colors=c("grey95", brewer.pal(9, "Blues")[-1])))


# annotation part
# Rows of txomeMetadataDf.uniqTSS that were retained, selected positively: x[-toDiscard2]
# would select nothing if toDiscard2 were empty.
keep02.idx = setdiff(seq_len(nrow(txomeMetadataDf.uniqTSS)), toDiscard2)
# annotations are matched to heatmap rows by position, so the order must agree
stopifnot(identical(txomeMetadataDf.uniqTSS$fastaId[keep02.idx], rownames(merged.smooth.profiles2[[1]])))
annot.htmp2 = rowAnnotation(overlapsGencode=txomeMetadataDf.uniqTSS$overlapsGencode[keep02.idx][genes.order2],
                            isoform=txomeMetadataDf.uniqTSS$isoform[keep02.idx][genes.order2],
                            CGI=TSSpm2kbGr2$CGI[genes.order2],
                            col=list(overlapsGencode=c("known"="#1036e0", "novel"="#f7b645"),
                                     isoform=c("main"='#e01036', "alternative"='#73d65a'),
                                     CGI=c("TRUE"='#000000', 'FALSE'='#ffffff')))

# narrow GC percent column first; it carries the row annotation on its left
htmp.list2 = Heatmap(matrix=TSSpm2kbGr2$percGC[genes.order2],
                     name='pctGC',
                     col=col.pal.pctGC[[1]],
                     cluster_columns=FALSE,
                     cluster_rows=FALSE,
                     show_row_names=FALSE,
                     show_column_names=FALSE,
                     show_heatmap_legend=TRUE,
                     width=unit(16, "points"),
                     left_annotation=annot.htmp2)

# one heatmap per profile matrix, appended to the right in the same row order
# (seq_along() is safe for an empty list, unlike 1:length())
for (i in seq_along(merged.smooth.profiles2)){
  htmp.list2 = htmp.list2 + Heatmap(matrix=merged.smooth.profiles2[[i]][genes.order2, ],
                                    name=names(merged.smooth.profiles2)[i],
                                    col=col.pal2[[i]],
                                    cluster_columns=FALSE,
                                    cluster_rows=FALSE,
                                    show_row_names=FALSE,
                                    show_column_names=FALSE,
                                    show_heatmap_legend=TRUE)
}

pdf(paste0(outPlotDir, '02_heatmap_K4me3_LZtoES_TSSpm2kb_orderedByRowMeansLZ_uniqTSSandNonOverlappingTSSregions.pdf'))
draw(htmp.list2)
dev.off()



# Metadata restricted to the retained (non-overlapping) windows. Rows are selected
# positively: df[-toDiscard2, ] would return zero rows if toDiscard2 were empty.
txomeMetadataDf.uniqTSS.nonOverlappingRegions = txomeMetadataDf.uniqTSS[setdiff(seq_len(nrow(txomeMetadataDf.uniqTSS)), toDiscard2), , drop=FALSE]

saveRDS(object=txomeMetadataDf.uniqTSS.nonOverlappingRegions, file=paste0(outDataDir, 'txomeMetadataDf.uniqTSSandNonOverlappingTSSregions.rds'))



## Split by known/novel and main/alternative isoform, and within each group order by hierarchical clustering
# group sizes (printed)
table(txomeMetadataDf.uniqTSS.nonOverlappingRegions[, c('overlapsGencode', 'isoform')])
#                 isoform
# overlapsGencode alternative  main
#           known        2779 19318
#           novel         145  7140

# combined group label "<overlapsGencode>.<isoform>", e.g. "known.main"
txomeMetadataDf.uniqTSS.nonOverlappingRegions$ovGencode.isoform = with(
  txomeMetadataDf.uniqTSS.nonOverlappingRegions,
  paste(overlapsGencode, isoform, sep='.')
)


# Hierarchical clustering (euclidean distance, complete linkage) of the rows of each
# overlapsGencode/isoform group, using the smoothed profiles of all cell types side by side.
# Each list element holds the group's row indices (initial.idx, relative to
# merged.smooth.profiles2) and the hclust object computed on those rows.
ovGencode.isoform.groups = c('known.main', 'known.alternative', 'novel.main', 'novel.alternative')
# built once here rather than once per worker
allCelltypes.profiles = do.call(cbind, merged.smooth.profiles2)

gene.order.ovGencode.isoform.hclust = mclapply(ovGencode.isoform.groups,
                                               function(e){
                                                 initial.idx = which(txomeMetadataDf.uniqTSS.nonOverlappingRegions$ovGencode.isoform == e)
                                                 # drop=FALSE keeps a matrix for single-row groups; 'euclidean' is the documented spelling
                                                 hcl = hclust(d=dist(x=allCelltypes.profiles[initial.idx, , drop=FALSE], method='euclidean'), method='complete')
                                                 return(list(initial.idx=initial.idx, hclust.obj=hcl))
                                               },
                                               mc.cores=20L)
names(gene.order.ovGencode.isoform.hclust) = ovGencode.isoform.groups

# mclapply() only warns when a worker fails and returns a "try-error" (or NULL) in its
# place; refuse to save such a result.
failed.groups = vapply(gene.order.ovGencode.isoform.hclust,
                       function(res){is.null(res) || inherits(res, 'try-error')},
                       logical(1))
if(any(failed.groups)){
  stop("hierarchical clustering failed for group(s): ",
       paste(ovGencode.isoform.groups[failed.groups], collapse=', '))
}

saveRDS(object=gene.order.ovGencode.isoform.hclust,
        file=paste0(outDataDir, 'hclust_uniqTSSandNonOverlappingTSSregions_TSSpm2kb_splitByOverlapsGencodeAndIsoform_allCelltypes.rds'))


# Lookup table for the group-wise hclust ordering. The groups are stacked in list order:
#   initial : row numbers in txomeMetadataDf.uniqTSS.nonOverlappingRegions / merged.smooth.profiles2
#   hclust  : the group's hclust leaf order, shifted by the number of rows in all preceding
#             groups so that it indexes into the stacked 'initial' column
hclust.group.sizes = vapply(gene.order.ovGencode.isoform.hclust,
                            function(e){length(e[['initial.idx']])},
                            integer(1))
hclust.group.offsets = unname(cumsum(c(0L, hclust.group.sizes))[seq_along(hclust.group.sizes)])
genes.order3df = data.frame(
  initial=unlist(lapply(gene.order.ovGencode.isoform.hclust, function(e){e$initial.idx}),
                 use.names=FALSE),
  hclust=unlist(Map(function(e, offset){e$hclust.obj$order + offset},
                    gene.order.ovGencode.isoform.hclust,
                    hclust.group.offsets),
                use.names=FALSE)
)
sapply(genes.order3df, range)
#      initial hclust
# [1,]       1      1
# [2,]   29382  29382


# final row order: groups in list order, each in its own hclust leaf order
genes.order3 = genes.order3df$initial[genes.order3df$hclust]


# annotation part
# row annotations (Gencode overlap, isoform type, CGI overlap) in the genes.order3 row order
annot.htmp3 = rowAnnotation(overlapsGencode=txomeMetadataDf.uniqTSS.nonOverlappingRegions$overlapsGencode[genes.order3],
                            isoform=txomeMetadataDf.uniqTSS.nonOverlappingRegions$isoform[genes.order3],
                            CGI=TSSpm2kbGr2$CGI[genes.order3],
                            col=list(overlapsGencode=c("known"="#1036e0", "novel"="#f7b645"),
                                     isoform=c("main"='#e01036', "alternative"='#73d65a'),
                                     CGI=c("TRUE"='#000000', 'FALSE'='#ffffff')))


# narrow GC percent column first; it carries the row annotation on its left
htmp.list3 = Heatmap(matrix=TSSpm2kbGr2$percGC[genes.order3],
                     name='pctGC',
                     col=col.pal.pctGC[[1]],
                     cluster_columns=FALSE,
                     cluster_rows=FALSE,
                     show_row_names=FALSE,
                     show_column_names=FALSE,
                     show_heatmap_legend=TRUE,
                     width=unit(16, "points"),
                     left_annotation=annot.htmp3)

# one heatmap per profile matrix, appended to the right in the same row order
# (seq_along() is safe for an empty list, unlike 1:length())
for (i in seq_along(merged.smooth.profiles2)){
  htmp.list3 = htmp.list3 + Heatmap(matrix=merged.smooth.profiles2[[i]][genes.order3, ],
                                    name=names(merged.smooth.profiles2)[i],
                                    col=col.pal2[[i]],
                                    cluster_columns=FALSE,
                                    cluster_rows=FALSE,
                                    show_row_names=FALSE,
                                    show_column_names=FALSE,
                                    show_heatmap_legend=TRUE)
}

pdf(paste0(outPlotDir, '03_heatmap_K4me3_LZtoES_TSSpm2kb_orderedByOverlapsGencodeAndIsoformAndHclust_uniqTSSandNonOverlappingTSSregions.pdf'))
draw(htmp.list3)
dev.off()




## Split by known/novel and main/alternative isoform, and within each group order by decreasing rowMeans(merged.smooth.profiles2[[1]])

# For each group: initial.idx = the group's row numbers in merged.smooth.profiles2,
# final.idx = positions within the group, ranked by decreasing mean signal in the first
# profile matrix.
decreasingLZ.groups = c('known.main', 'known.alternative', 'novel.main', 'novel.alternative')
gene.order.ovGencode.isoform.decreasingLZ = lapply(decreasingLZ.groups,
                                                   function(e){
                                                     initial.idx = which(txomeMetadataDf.uniqTSS.nonOverlappingRegions$ovGencode.isoform == e)
                                                     # drop=FALSE: a single-row group must stay a matrix for rowMeans()
                                                     data = merged.smooth.profiles2[[1]][initial.idx, , drop=FALSE]
                                                     return(list(initial.idx=initial.idx,
                                                                 final.idx=order(rowMeans(data), decreasing=TRUE)))
                                                   })
names(gene.order.ovGencode.isoform.decreasingLZ) = decreasingLZ.groups



# data.frame with correspondence between initial row number (for txomeMetadataDf.uniqTSS.nonOverlappingRegions and merged.smooth.profiles2)
# and new order after splitting into overlapsGencode/isoform and decreasing rowMeans(LZ).
# Groups are stacked in list order; each group's within-group ranks are shifted by the
# number of rows in the preceding groups so that they index into the stacked 'initial' column.
genes.order4df = do.call(rbind, lapply(seq_along(gene.order.ovGencode.isoform.decreasingLZ), function(i){
  e = gene.order.ovGencode.isoform.decreasingLZ[[i]]
  res = data.frame(initial=e$initial.idx, final=e$final.idx)
  n.before = sum(vapply(seq_len(i-1),
                        function(j){length(gene.order.ovGencode.isoform.decreasingLZ[[j]][['initial.idx']])},
                        integer(1)))
  res$final = res$final + n.before
  return(res)
}))

genes.order4 = genes.order4df$initial[genes.order4df$final]


# annotation part
# row annotations (Gencode overlap, isoform type, CGI overlap) in the genes.order4 row order
annot.htmp4 = rowAnnotation(overlapsGencode=txomeMetadataDf.uniqTSS.nonOverlappingRegions$overlapsGencode[genes.order4],
                            isoform=txomeMetadataDf.uniqTSS.nonOverlappingRegions$isoform[genes.order4],
                            CGI=TSSpm2kbGr2$CGI[genes.order4],
                            col=list(overlapsGencode=c("known"="#1036e0", "novel"="#f7b645"),
                                     isoform=c("main"='#e01036', "alternative"='#73d65a'),
                                     CGI=c("TRUE"='#000000', 'FALSE'='#ffffff')))


# narrow GC percent column first; it carries the row annotation on its left
htmp.list4 = Heatmap(matrix=TSSpm2kbGr2$percGC[genes.order4],
                     name='pctGC',
                     col=col.pal.pctGC[[1]],
                     cluster_columns=FALSE,
                     cluster_rows=FALSE,
                     show_row_names=FALSE,
                     show_column_names=FALSE,
                     show_heatmap_legend=TRUE,
                     width=unit(16, "points"),
                     left_annotation=annot.htmp4)
# one heatmap per profile matrix, appended to the right in the same row order
# (seq_along() is safe for an empty list, unlike 1:length())
for (i in seq_along(merged.smooth.profiles2)){
  htmp.list4 = htmp.list4 + Heatmap(matrix=merged.smooth.profiles2[[i]][genes.order4, ],
                                    name=names(merged.smooth.profiles2)[i],
                                    col=col.pal2[[i]],
                                    cluster_columns=FALSE,
                                    cluster_rows=FALSE,
                                    show_row_names=FALSE,
                                    show_column_names=FALSE,
                                    show_heatmap_legend=TRUE)
}

pdf(paste0(outPlotDir, '04_heatmap_K4me3_LZtoES_TSSpm2kb_orderedByOverlapsGencodeAndIsoformAndDecreasingRowMeansLZ_uniqTSSandNonOverlappingTSSregions.pdf'))
draw(htmp.list4)
dev.off()





## Hierarchical clustering of the whole dataset
# Complete-linkage clustering of all rows on the euclidean distance between their smoothed
# profiles (all cell types side by side). 'euclidean' is the documented spelling of the
# distance method; the previous 'euclidian' selected the same distance.

hclust.obj5 = hclust(d=dist(x=do.call(cbind, merged.smooth.profiles2), method='euclidean'), method='complete')

saveRDS(object=hclust.obj5, file=paste0(outDataDir, 'hclust_uniqTSSandNonOverlappingTSSregions_TSSpm2kb_allCelltypes.rds'))


# annotation part
# row annotations (Gencode overlap, isoform type, CGI overlap) in hclust leaf order
annot.htmp5 = rowAnnotation(overlapsGencode=txomeMetadataDf.uniqTSS.nonOverlappingRegions$overlapsGencode[hclust.obj5$order],
                            isoform=txomeMetadataDf.uniqTSS.nonOverlappingRegions$isoform[hclust.obj5$order],
                            CGI=TSSpm2kbGr2$CGI[hclust.obj5$order],
                            col=list(overlapsGencode=c("known"="#1036e0", "novel"="#f7b645"),
                                     isoform=c("main"='#e01036', "alternative"='#73d65a'),
                                     CGI=c("TRUE"='#000000', 'FALSE'='#ffffff')))


# narrow GC percent column first; it carries the row annotation on its left
htmp.list5 = Heatmap(matrix=TSSpm2kbGr2$percGC[hclust.obj5$order],
                     name='pctGC',
                     col=col.pal.pctGC[[1]],
                     cluster_columns=FALSE,
                     cluster_rows=FALSE,
                     show_row_names=FALSE,
                     show_column_names=FALSE,
                     show_heatmap_legend=TRUE,
                     width=unit(16, "points"),
                     left_annotation=annot.htmp5)

# one heatmap per profile matrix, appended to the right in the same row order
# (seq_along() is safe for an empty list, unlike 1:length())
for (i in seq_along(merged.smooth.profiles2)){
  htmp.list5 = htmp.list5 + Heatmap(matrix=merged.smooth.profiles2[[i]][hclust.obj5$order, ],
                                    name=names(merged.smooth.profiles2)[i],
                                    col=col.pal2[[i]],
                                    cluster_columns=FALSE,
                                    cluster_rows=FALSE,
                                    show_row_names=FALSE,
                                    show_column_names=FALSE,
                                    show_heatmap_legend=TRUE)
}

pdf(paste0(outPlotDir, '05_heatmap_K4me3_LZtoES_TSSpm2kb_orderedByHclust_uniqTSSandNonOverlappingTSSregions.pdf'))
draw(htmp.list5)
dev.off()




# Nested binning of the rows, instead of a plain ranking on rowMeans(LZ):
# the range of rowMeans(LZ) is cut into k equal-width bins (bin 1 = highest values); within
# each LZ bin the range of rowMeans(PD) is cut into k bins, then RS within each PD bin, then
# ES within each RS bin. A level's bin id is (parent bin id - 1) * k + own bin (1..k), so
# sorting on the last level (orderES) sorts by LZ bin, then PD bin, then RS bin, then ES bin.


k = 5

# per-region mean signal in each of the first four profile matrices
tmpDf = data.frame(regionId=rownames(merged.smooth.profiles2[[1]]),
                   LZ=rowMeans(merged.smooth.profiles2[[1]]),
                   PD=rowMeans(merged.smooth.profiles2[[2]]),
                   RS=rowMeans(merged.smooth.profiles2[[3]]),
                   ES=rowMeans(merged.smooth.profiles2[[4]]),
                   stringsAsFactors=FALSE)
rownames(tmpDf) = NULL

binning.signal.cols = c('LZ', 'PD', 'RS', 'ES')
binning.order.cols = c('orderLZ', 'orderPD', 'orderRS', 'orderES')

# top level: k+1-bin reverses the cut() numbering so that bin 1 holds the highest values
tmpDf$orderLZ = k+1-cut(x=tmpDf$LZ, breaks=k, labels=FALSE)

# lower levels: re-bin each parent bin separately on the next signal column
for (level in 2:4){
  parent.col = binning.order.cols[level-1]
  child.col = binning.order.cols[level]
  signal.col = binning.signal.cols[level]
  tmpDf[[child.col]] = 0
  for (e in unique(tmpDf[[parent.col]])){
    members = which(tmpDf[[parent.col]] == e)
    tmpDf[[child.col]][members] = (e-1)*k + k+1-cut(x=tmpDf[[signal.col]][members], breaks=k, labels=FALSE)
  }
}


genes.order6 = order(tmpDf$orderES)


# annotation part
# row annotations (Gencode overlap, isoform type, CGI overlap) in the genes.order6 row order
annot.htmp6 = rowAnnotation(overlapsGencode=txomeMetadataDf.uniqTSS.nonOverlappingRegions$overlapsGencode[genes.order6],
                            isoform=txomeMetadataDf.uniqTSS.nonOverlappingRegions$isoform[genes.order6],
                            CGI=TSSpm2kbGr2$CGI[genes.order6],
                            col=list(overlapsGencode=c("known"="#1036e0", "novel"="#f7b645"),
                                     isoform=c("main"='#e01036', "alternative"='#73d65a'),
                                     CGI=c("TRUE"='#000000', 'FALSE'='#ffffff')))


# narrow GC percent column first; it carries the row annotation on its left
htmp.list6 = Heatmap(matrix=TSSpm2kbGr2$percGC[genes.order6],
                     name='pctGC',
                     col=col.pal.pctGC[[1]],
                     cluster_columns=FALSE,
                     cluster_rows=FALSE,
                     show_row_names=FALSE,
                     show_column_names=FALSE,
                     show_heatmap_legend=TRUE,
                     width=unit(16, "points"),
                     left_annotation=annot.htmp6)
# one heatmap per profile matrix, appended to the right in the same row order
# (seq_along() is safe for an empty list, unlike 1:length())
for (i in seq_along(merged.smooth.profiles2)){
  htmp.list6 = htmp.list6 + Heatmap(matrix=merged.smooth.profiles2[[i]][genes.order6, ],
                                    name=names(merged.smooth.profiles2)[i],
                                    col=col.pal2[[i]],
                                    cluster_columns=FALSE,
                                    cluster_rows=FALSE,
                                    show_row_names=FALSE,
                                    show_column_names=FALSE,
                                    show_heatmap_legend=TRUE)
}

pdf(paste0(outPlotDir, '06_heatmap_K4me3_LZtoES_TSSpm2kb_orderedByDecreasingRowMeansLZthenPDthenRSthenES_uniqTSSandNonOverlappingTSSregions_k5.pdf'))
draw(htmp.list6)
dev.off()



## illustration with k=5
# Visual check of the nested binning on 1000 randomly drawn regions: one single-column
# heatmap per level (orderLZ, orderPD, orderRS, orderES), rows sorted by the final orderES.
# NOTE(review): sample() is not seeded here, so the drawn subset differs between runs.
tmpbDf = tmpDf[, 6:9]
tmpbDf = tmpbDf[sample(x=seq_len(nrow(tmpbDf)), size=1000, replace=FALSE), ]
# global order must be taken before the bin ids are collapsed below
tmpbDf$globalOrder = order(tmpbDf$orderES)
# Collapse the cumulative bin ids of the lower levels back to the bin within their parent
# (1..k). Modular arithmetic on k replaces the former regex on the last decimal digit,
# which gave the same values but was only valid for k = 5.
for (order.col in c('orderPD', 'orderRS', 'orderES')){
  tmpbDf[[order.col]] = ((tmpbDf[[order.col]] - 1) %% k) + 1
}
# library() instead of require(): fail here with a clear message if a package is missing
library(RColorBrewer)
library(circlize)
# one 5-step grey-to-blue palette per level, spanning that column's observed range
colTmp <- lapply(1:4, function(i){
  range.val <- range(tmpbDf[, i])
  colorRamp2(breaks=seq(range.val[1], range.val[2], length.out=5),
             colors=c("grey95", brewer.pal(5, "Blues")[-1]))
})
htmp.tmp = Heatmap(matrix=as.matrix(tmpbDf[tmpbDf$globalOrder, 1, drop=FALSE]), col=colTmp[[1]], cluster_columns=FALSE, cluster_rows=FALSE, name='orderLZ', show_row_names=FALSE, show_column_names=FALSE, show_heatmap_legend=FALSE)
for (i in 2:4){
  htmp.tmp = htmp.tmp + Heatmap(matrix=as.matrix(tmpbDf[tmpbDf$globalOrder, i, drop=FALSE]), col=colTmp[[i]], cluster_columns=FALSE, cluster_rows=FALSE, name=colnames(tmpbDf)[i], show_row_names=FALSE, show_column_names=FALSE, show_heatmap_legend=FALSE)
}
draw(htmp.tmp)






# kmeans clustering and decreasing rowSums(do.call(cbind, merged.smooth.profiles2)) within each cluster
# -> try different numbers of clusters
# The combined profile matrix and its row sums do not depend on nClust: build them once.
kmeans.7.data = do.call(cbind, merged.smooth.profiles2)
kmeans.7.rowsums = rowSums(kmeans.7.data)
for (nClust in c(4,8,12,16)){
  set.seed(1234) # same seed for every nClust, so each clustering is reproducible
  kmeans.7 = kmeans(x=kmeans.7.data, centers=nClust)
  # rows sorted by cluster id, then by total signal, both decreasing
  kmeans.7.ord = order(kmeans.7$cluster, kmeans.7.rowsums, decreasing=TRUE)
  # cluster id of each row in plotting order, used to split the rows by cluster
  kmeans.7.split = kmeans.7$cluster[kmeans.7.ord]
  
  # annotation part
  annot.htmp7 = rowAnnotation(overlapsGencode=txomeMetadataDf.uniqTSS.nonOverlappingRegions$overlapsGencode[kmeans.7.ord],
                              isoform=txomeMetadataDf.uniqTSS.nonOverlappingRegions$isoform[kmeans.7.ord],
                              CGI=TSSpm2kbGr2$CGI[kmeans.7.ord],
                              col=list(overlapsGencode=c("known"="#1036e0", "novel"="#f7b645"),
                                       isoform=c("main"='#e01036', "alternative"='#73d65a'),
                                       CGI=c("TRUE"='#000000', 'FALSE'='#ffffff')))
  
  
  # The split is now also given on this first (main) heatmap, as in the 08-10 plots:
  # in a heatmap list the row settings of the main heatmap are the ones applied to all
  # heatmaps, so a split given only on the later heatmaps would not take effect.
  htmp.list7 = Heatmap(matrix=TSSpm2kbGr2$percGC[kmeans.7.ord],
                       name='pctGC',
                       col=col.pal.pctGC[[1]],
                       cluster_columns=FALSE,
                       cluster_rows=FALSE,
                       show_row_names=FALSE,
                       show_column_names=FALSE,
                       show_heatmap_legend=TRUE,
                       width=unit(16, "points"),
                       split=kmeans.7.split,
                       left_annotation=annot.htmp7)
  # one heatmap per profile matrix (seq_along() is safe for an empty list)
  for (i in seq_along(merged.smooth.profiles2)){
    htmp.list7 = htmp.list7 + Heatmap(matrix=merged.smooth.profiles2[[i]][kmeans.7.ord, ],
                                      name=names(merged.smooth.profiles2)[i],
                                      col=col.pal2[[i]],
                                      cluster_columns=FALSE,
                                      cluster_rows=FALSE,
                                      show_row_names=FALSE,
                                      show_column_names=FALSE,
                                      split=kmeans.7.split,
                                      show_heatmap_legend=TRUE)
  }
  
  pdf(paste0(outPlotDir, '07_heatmap_K4me3_LZtoES_TSSpm2kb_orderedByKmeansAndDecreasingRowsums_', nClust, 'clusters_uniqTSSandNonOverlappingTSSregions.pdf'))
  draw(htmp.list7)
  dev.off()
}




# 7. Heatmap with K4me3 signal in promoters and transcript expression (salmon)

# Append log2(value + 0.01) of every txomeExprDf column except the first to the metadata
# of TSSpm2kbGr2, matching rows through names(TSSpm2kbGr2) and txomeExprDf$name.
exprRowIdx = match(names(TSSpm2kbGr2), txomeExprDf$name)
mcols(TSSpm2kbGr2) = cbind(mcols(TSSpm2kbGr2), log2(txomeExprDf[exprRowIdx, -1] + 0.01))
# metadata columns 3 to 7 are renamed from *_TPM to *_log2TPM
colnames(mcols(TSSpm2kbGr2))[3:7] = gsub('_TPM', '_log2TPM', colnames(mcols(TSSpm2kbGr2))[3:7])
# The log2TPM values span a wide range, so each of these columns is clamped to its own
# 1st and 99th percentiles: anything at or below the 1st percentile is set to it, anything
# at or above the 99th percentile is set to that.
for (i in 3:7){
  colValues = mcols(TSSpm2kbGr2)[, i]
  res = quantile(colValues, probs=c(0.01, 0.99))
  colValues[which(colValues <= res[1])] = res[1]
  colValues[which(colValues >= res[2])] = res[2]
  mcols(TSSpm2kbGr2)[, i] = colValues
}

# 3 ways of clustering : either using only K4me3 (as was done in kmeans.7), using only expression or using both
# data for kmeans clustering : K4me3 profiles (100 bins) + expression for each transcript in all stages, repeated 100 times each
kmeans.8.data = do.call(cbind, merged.smooth.profiles2)
# metadata columns 3 to 7 (the log2TPM columns), each repeated 100 times
kmeans.8.data = cbind(kmeans.8.data, mcols(TSSpm2kbGr2)[, rep(3:7, each=100)])

# color palette for transcript expression
# library() instead of require(): a missing package fails here with a clear message
# instead of a later "could not find function".
library(RColorBrewer)
library(circlize)
# one 9-step grey-to-green mapping per expression column, spanning that column's range
col.palExpr <- lapply(3:7, function(i){
  range.val <- range(mcols(TSSpm2kbGr2)[, i])
  colorRamp2(breaks=seq(range.val[1], range.val[2], length.out=9),
             colors=c("grey95", brewer.pal(9, "Greens")[-1]))
})
names(col.palExpr) = colnames(mcols(TSSpm2kbGr2))[3:7]

for (nClust in c(4,8,12,16)){
  set.seed(1234)
  kmeans.8.K4only = kmeans(x=do.call(cbind, merged.smooth.profiles2), centers=nClust)
  kmeans.8.K4only.ord = order(kmeans.8.K4only$cluster, rowSums(do.call(cbind, merged.smooth.profiles2)), decreasing=TRUE)
  
  set.seed(1234)
  kmeans.8.exprOnly = kmeans(x=mcols(TSSpm2kbGr2)[, 3:7], centers=nClust)
  kmeans.8.exprOnly.ord = order(kmeans.8.exprOnly$cluster, rowSums(as.matrix(mcols(TSSpm2kbGr2)[, 3:7])), decreasing=TRUE)
  
  set.seed(1234)
  kmeans.8.K4andExpr = kmeans(x=kmeans.8.data, centers=nClust)
  kmeans.8.K4andExpr.ord = order(kmeans.8.K4andExpr$cluster, rowSums(do.call(cbind, merged.smooth.profiles2)), decreasing=TRUE)
  
  ## K4 only
  
  # annotation part
  annot.htmp8 = rowAnnotation(overlapsGencode=txomeMetadataDf.uniqTSS.nonOverlappingRegions$overlapsGencode[kmeans.8.K4only.ord],
                              isoform=txomeMetadataDf.uniqTSS.nonOverlappingRegions$isoform[kmeans.8.K4only.ord],
                              CGI=TSSpm2kbGr2$CGI[kmeans.8.K4only.ord],
                              col=list(overlapsGencode=c("known"="#1036e0", "novel"="#f7b645"),
                                       isoform=c("main"='#e01036', "alternative"='#73d65a'),
                                       CGI=c("TRUE"='#000000', 'FALSE'='#ffffff')))
  
  
  htmp.list8 = Heatmap(matrix=TSSpm2kbGr2$percGC[kmeans.8.K4only.ord],
                       name='pctGC',
                       col=col.pal.pctGC[[1]],
                       cluster_columns=F,
                       cluster_rows=F,
                       show_row_names=F,
                       show_column_names=F,
                       show_heatmap_legend=T,
                       width=unit(16, "points"),
                       split=kmeans.8.K4only$cluster[kmeans.8.K4only.ord],
                       left_annotation=annot.htmp8)
  for (i in 1:length(merged.smooth.profiles2)){
    htmp.list8 = htmp.list8 + Heatmap(matrix=merged.smooth.profiles2[[i]][kmeans.8.K4only.ord, ],
                                      name=names(merged.smooth.profiles2)[i],
                                      col=col.pal2[[i]],
                                      cluster_columns=F,
                                      cluster_rows=F,
                                      show_row_names=F,
                                      show_column_names=F,
                                      width=unit(50, "points"),
                                      split=kmeans.8.K4only$cluster[kmeans.8.K4only.ord],
                                      show_heatmap_legend=T)
  }
  for (i in 3:7){
    htmp.list8 = htmp.list8 + Heatmap(matrix=as.numeric(mcols(TSSpm2kbGr2)[kmeans.8.K4only.ord, i]),
                                      name=colnames(mcols(TSSpm2kbGr2))[i],
                                      col=col.palExpr[[(i-2)]],
                                      cluster_columns=F,
                                      cluster_rows=F,
                                      show_row_names=F,
                                      show_column_names=F,
                                      width=unit(16, "points"),
                                      split=kmeans.8.K4only$cluster[kmeans.8.K4only.ord],
                                      show_heatmap_legend=T)
  }
  
  pdf(file=paste0(outPlotDir, '08_heatmap_K4me3_LZtoES_TSSpm2kb_txExpression_LZtoLES_orderedByKmeansConsideringOnlyK4AndDecreasingRowsumsK4_', nClust, 'clusters_uniqTSSandNonOverlappingTSSregions.pdf'),
      width=10)
  draw(htmp.list8)
  dev.off()
  
  
  ## expression only
  
  # annotation part
  annot.htmp9 = rowAnnotation(overlapsGencode=txomeMetadataDf.uniqTSS.nonOverlappingRegions$overlapsGencode[kmeans.8.exprOnly.ord],
                              isoform=txomeMetadataDf.uniqTSS.nonOverlappingRegions$isoform[kmeans.8.exprOnly.ord],
                              CGI=TSSpm2kbGr2$CGI[kmeans.8.exprOnly.ord],
                              col=list(overlapsGencode=c("known"="#1036e0", "novel"="#f7b645"),
                                       isoform=c("main"='#e01036', "alternative"='#73d65a'),
                                       CGI=c("TRUE"='#000000', 'FALSE'='#ffffff')))
  
  
  htmp.list9 = Heatmap(matrix=TSSpm2kbGr2$percGC[kmeans.8.exprOnly.ord],
                       name='pctGC',
                       col=col.pal.pctGC[[1]],
                       cluster_columns=F,
                       cluster_rows=F,
                       show_row_names=F,
                       show_column_names=F,
                       show_heatmap_legend=T,
                       width=unit(16, "points"),
                       split=kmeans.8.exprOnly$cluster[kmeans.8.exprOnly.ord],
                       left_annotation=annot.htmp9)
  for (i in 1:length(merged.smooth.profiles2)){
    htmp.list9 = htmp.list9 + Heatmap(matrix=merged.smooth.profiles2[[i]][kmeans.8.exprOnly.ord, ],
                                      name=names(merged.smooth.profiles2)[i],
                                      col=col.pal2[[i]],
                                      cluster_columns=F,
                                      cluster_rows=F,
                                      show_row_names=F,
                                      show_column_names=F,
                                      width=unit(50, "points"),
                                      split=kmeans.8.exprOnly$cluster[kmeans.8.exprOnly.ord],
                                      show_heatmap_legend=T)
  }
  for (i in 3:7){
    htmp.list9 = htmp.list9 + Heatmap(matrix=as.numeric(mcols(TSSpm2kbGr2)[kmeans.8.exprOnly.ord, i]),
                                      name=colnames(mcols(TSSpm2kbGr2))[i],
                                      col=col.palExpr[[(i-2)]],
                                      cluster_columns=F,
                                      cluster_rows=F,
                                      show_row_names=F,
                                      show_column_names=F,
                                      width=unit(16, "points"),
                                      split=kmeans.8.exprOnly$cluster[kmeans.8.exprOnly.ord],
                                      show_heatmap_legend=T)
  }
  
  pdf(file=paste0(outPlotDir, '09_heatmap_K4me3_LZtoES_TSSpm2kb_txExpression_LZtoLES_orderedByKmeansConsideringOnlyExpressionAndDecreasingRowsumsExpression_', nClust, 'clusters_uniqTSSandNonOverlappingTSSregions.pdf'),
      width=10)
  draw(htmp.list9)
  dev.off()
  
  
  ## K4 and expression
  
  # annotation part
  annot.htmp10 = rowAnnotation(overlapsGencode=txomeMetadataDf.uniqTSS.nonOverlappingRegions$overlapsGencode[kmeans.8.K4andExpr.ord],
                               isoform=txomeMetadataDf.uniqTSS.nonOverlappingRegions$isoform[kmeans.8.K4andExpr.ord],
                               CGI=TSSpm2kbGr2$CGI[kmeans.8.K4andExpr.ord],
                               col=list(overlapsGencode=c("known"="#1036e0", "novel"="#f7b645"),
                                        isoform=c("main"='#e01036', "alternative"='#73d65a'),
                                        CGI=c("TRUE"='#000000', 'FALSE'='#ffffff')))
  
  
  htmp.list10 = Heatmap(matrix=TSSpm2kbGr2$percGC[kmeans.8.K4andExpr.ord],
                        name='pctGC',
                        col=col.pal.pctGC[[1]],
                        cluster_columns=F,
                        cluster_rows=F,
                        show_row_names=F,
                        show_column_names=F,
                        show_heatmap_legend=T,
                        width=unit(16, "points"),
                        split=kmeans.8.K4andExpr$cluster[kmeans.8.K4andExpr.ord],
                        left_annotation=annot.htmp10)
  for (i in 1:length(merged.smooth.profiles2)){
    htmp.list10 = htmp.list10 + Heatmap(matrix=merged.smooth.profiles2[[i]][kmeans.8.K4andExpr.ord, ],
                                        name=names(merged.smooth.profiles2)[i],
                                        col=col.pal2[[i]],
                                        cluster_columns=F,
                                        cluster_rows=F,
                                        show_row_names=F,
                                        show_column_names=F,
                                        width=unit(50, "points"),
                                        split=kmeans.8.K4andExpr$cluster[kmeans.8.K4andExpr.ord],
                                        show_heatmap_legend=T)
  }
  for (i in 3:7){
    htmp.list10 = htmp.list10 + Heatmap(matrix=as.numeric(mcols(TSSpm2kbGr2)[kmeans.8.K4andExpr.ord, i]),
                                        name=colnames(mcols(TSSpm2kbGr2))[i],
                                        col=col.palExpr[[(i-2)]],
                                        cluster_columns=F,
                                        cluster_rows=F,
                                        show_row_names=F,
                                        show_column_names=F,
                                        width=unit(16, "points"),
                                        split=kmeans.8.K4andExpr$cluster[kmeans.8.K4andExpr.ord],
                                        show_heatmap_legend=T)
  }
  
  pdf(file=paste0(outPlotDir, '10_heatmap_K4me3_LZtoES_TSSpm2kb_txExpression_LZtoLES_orderedByKmeansConsideringK4AndExpressionAndDecreasingRowsumsK4_', nClust, 'clusters_uniqTSSandNonOverlappingTSSregions.pdf'),
      width=10)
  draw(htmp.list10)
  dev.off()
}
####


#### Figure 2 Generation: ####
## Make a plot of the highest values for expression for known and novel genes
# Paths used by the Figure 2 section. `local_packages_path` and `txomeGtf`
# are defined here but not used in the statements directly below.
local_packages_path <- "/tungstenfs/scratch/gbioinfo_work/gpeters/gillmark/Packages/"

library(BSgenome.Mmusculus.UCSC.mm10)
library(GenomicFeatures)

savedir <- "/tungstenfs/scratch/gbioinfo_work/gpeters/gillmark/Post_Replicative_MaleGermCell_RNA_Seq_Comparison/Figure_Generation_NovelGeneStats/"
Rdatadir <- "/work2/gpeters/rohmalex/work/Mark/RNA-seq_181127/31_transcriptomeAssembly/wholeProcess_190410/82_Rdata/"
txomeGtf <- "/work2/gpeters/rohmalex/work/Mark/RNA-seq_181127/31_transcriptomeAssembly/wholeProcess_190410/42_modelMerged/globalTranscriptome.gtf"

### Load data
# Tab-separated per-gene metadata table (with header); strings stay character.
metadataFile <- "/work2/gpeters/rohmalex/work/Mark/RNA-seq_181127/31_transcriptomeAssembly/wholeProcess_190410/42_modelMerged/globalTranscriptome_gencodeMetadataAndExpression.txt"
metadataDf <- read.table(
  file = metadataFile,
  header = TRUE,
  sep = "\t",
  stringsAsFactors = FALSE
)
# all exons, no redundancy and merged when possible
exonsGr <- readRDS(paste0(Rdatadir, "exons_gr.rds"))


### overlapWithKnownGene
# TRUE when `overlapWithKnownGene` is 'full' or 'partial', FALSE otherwise
# (including NA). Vectorised: one %in% call instead of a per-gene sapply, so
# the column no longer carries the input strings as names.
metadataDf$overlapWithKnownGeneBool = metadataDf$overlapWithKnownGene %in% c('full', 'partial')
# FALSE  TRUE 
#  7367 19922


### isMultiExonic
# `mostExpressedIsoform` is split on ';'; a gene is multi-exonic when the split
# yields more than one piece.
metadataDf$isMultiExonic = lengths(strsplit(metadataDf$mostExpressedIsoform, ';', fixed=TRUE)) > 1
# FALSE  TRUE 
#  6823 20466


### geneLength
# Span of the most expressed isoform: the first captured start and the last
# captured end of the 'chrN:start-end(strand)' string. Strings that do not
# match the pattern (chromosome names other than chr + 1-2 characters of
# [0-9XY]) give NA.
# NOTE(review): the length is end - start (no +1) — confirm this is intended
# for the coordinate convention of the input table.
metadataDf$geneLength = local({
  spanPattern = '^chr[0-9XY]{1,2}:([0-9]+).*-([0-9]+)\\([+-]\\)$'
  isoStart = as.numeric(sub(spanPattern, '\\1', metadataDf$mostExpressedIsoform))
  isoEnd = as.numeric(sub(spanPattern, '\\2', metadataDf$mostExpressedIsoform))
  isoEnd - isoStart
})

## log2 geneLength
metadataDf[,"log2GeneLength"] <- log((metadataDf$geneLength),2)


### exonsLength
# Sum of (end - start) over every exon of the most expressed isoform. Each
# 'chrN:start-end(strand)' piece is rewritten to 'start;end', so after
# splitting on ';' the coordinates alternate start, end, start, end, ...
metadataDf$exonsLength = vapply(metadataDf$mostExpressedIsoform, function(e){
  res = as.integer(strsplit(gsub('chr[0-9XY]{1,2}:([0-9]+)-([0-9]+)\\([+-]\\)', '\\1;\\2', e), ';')[[1]])
  startIdx = seq(from=1, to=length(res)-1, by=2)
  sum(res[startIdx + 1] - res[startIdx])
}, integer(1), USE.NAMES=FALSE)


### exonic GC percentage
# -> we consider all exons from a given gene, not only the most expressed isoform
exons.seq = getSeq(BSgenome.Mmusculus.UCSC.mm10, exonsGr)
# per-exon counts of each single nucleotide
exons.baseFreq = as.data.frame(oligonucleotideFrequency(exons.seq, width=1))
# number of exons whose counted bases are fewer than the exon width
sum(rowSums(exons.baseFreq) < width(exonsGr)) #1 -> no stretches of N among exonic sequences
# aggregate base counts per gene
genes.baseFreq = aggregate(x=exons.baseFreq, by=list(gene_id=exonsGr$gene_id), FUN=sum)
# reorder to the row order of metadataDf; genes without exons become NA rows
genes.baseFreq = genes.baseFreq[match(metadataDf$gene_id, genes.baseFreq$gene_id), ]
rownames(genes.baseFreq) = NULL
# GC% = 100 * (C + G) / (A + C + G + T), rounded to one decimal. Computed on
# whole columns at once rather than subsetting the data frame row by row.
metadataDf$GCpercentage = round(100*rowSums(genes.baseFreq[, c('C','G')])/rowSums(genes.baseFreq[, c('A','C','G','T')]), 1)


### autosomal or sex chromosome
# TRUE unless the isoform string mentions chrX or chrY (grepl is vectorised,
# so no per-gene sapply is needed)
metadataDf$isAutosomal = !grepl('chr[XY]', metadataDf$mostExpressedIsoform)
# FALSE  TRUE 
#  1012 26277


## Highest expression level
## Select cell type with the highest expression level and keep it as a new column for plotting
# Row-wise maximum over columns 9:13 via pmax(), which avoids coercing the
# data frame to a matrix as apply() does. NA in any column yields NA.
# NOTE(review): columns 9:13 are assumed to be the per-cell-type expression
# columns — confirm against the input table.
metadataDf[, "maxExpression"] <- do.call(pmax, unname(as.list(metadataDf[, 9:13])))
metadataDf[,"log2TPM"] <- log((metadataDf$maxExpression+0.1),2)

## Many of the known genes have expression values of 0, as they are not expressed in testis
## To create a fair comparison between the novel and known genes, filter for expression above 0.5 TPM before
## plotting

## Create a data frame keeping only the genes where max expression is above 0.5 TPM for plotting
# which() drops genes whose maxExpression is NA instead of adding all-NA rows
filtered_metadataDf <- metadataDf[which(metadataDf$maxExpression > 0.5),]


### Save dataframe
saveRDS(metadataDf, file='/tungstenfs/scratch/gbioinfo_work/gpeters/gillmark/Post_Replicative_MaleGermCell_RNA_Seq_Comparison/Figure_Generation_NovelGeneStats/metadata_withGeneStatistics.rds')
saveRDS(filtered_metadataDf, file='/tungstenfs/scratch/gbioinfo_work/gpeters/gillmark/Post_Replicative_MaleGermCell_RNA_Seq_Comparison/Figure_Generation_NovelGeneStats/filtered_metadata_withGeneStatistics.rds')


library(ggplot2)
library(grid)
library(gridExtra)
# Human-readable known/novel label for both the full and the filtered table.
metadataDf$overlapWithKnownGene2 <- ifelse(metadataDf$overlapWithKnownGeneBool, "known", "novel")
filtered_metadataDf$overlapWithKnownGene2 <- ifelse(filtered_metadataDf$overlapWithKnownGeneBool, "known", "novel")
#
# One row per class, holding the fraction of its genes that are mono-exonic
# (freqMono) and the fraction that are not autosomal (freqX).
tmp0 <- data.frame(
  overlapsGencode = c("known", "novel"),
  stringsAsFactors = FALSE
)
tmp0$freqMono <- vapply(tmp0$overlapsGencode, function(geneClass) {
  inClass <- metadataDf$overlapWithKnownGene2 == geneClass
  sum(inClass & !metadataDf$isMultiExonic) / sum(inClass)
}, numeric(1), USE.NAMES = FALSE)
tmp0$freqX <- vapply(tmp0$overlapsGencode, function(geneClass) {
  inClass <- metadataDf$overlapWithKnownGene2 == geneClass
  sum(inClass & !metadataDf$isAutosomal) / sum(inClass)
}, numeric(1), USE.NAMES = FALSE)


# Six panels comparing known and novel genes, arranged in a 3-column grid and
# written to a PNG in `savedir`.

# Peak TPM (filtered genes only). coord_cartesian() needs a numeric
# c(min, max); the previous single string "0,10000" was not a valid limit.
pl1 = ggplot(data=filtered_metadataDf, aes(x=overlapWithKnownGene2, y=maxExpression)) + 
  geom_boxplot(outlier.alpha = 0.3, width=.1) + ylab('TPM') + xlab('') + 
  coord_cartesian(ylim=c(0, 10000))

pl2 = ggplot(data=tmp0, aes(x=overlapsGencode, y=freqMono)) +
  geom_col() + ylab('Frequency of mono-exonic') + xlab('')

pl3 = ggplot(data=metadataDf, aes(x=overlapWithKnownGene2, y=log2GeneLength)) +
  geom_violin() + geom_boxplot(width=.1) + xlab('') + ylab('log2 Gene length')

pl4 = ggplot(data=metadataDf, aes(x=overlapWithKnownGene2, y=log2(exonsLength))) +
  geom_violin() + geom_boxplot(width=.1) + xlab('') + ylab('log2 Exon length')

pl5 = ggplot(data=metadataDf, aes(x=overlapWithKnownGene2, y=GCpercentage)) +
  geom_violin() + geom_boxplot(width=.1) + xlab('') + ylab('% GC')

pl6 = ggplot(data=tmp0, aes(x=overlapsGencode, y=freqX)) +
  geom_col() + ylab('Frequency of sex chromosome') + xlab('')

# Explicit list instead of looking the panels up by name with get()
bigPl = arrangeGrob(grobs=list(pl1, pl2, pl3, pl4, pl5, pl6), ncol=3)

ggsave(filename=paste0(savedir, '61_statisticsForKnownVsNovelGenes_withoutLegends_2.png'), plot=bigPl,
       device='png', width=6, height=5, units='in', dpi=300)

# Exploratory histogram of log2TPM grouped by known/novel; it is built here
# but not saved by the statements in this section.
testplot = ggplot(data = metadataDf,aes(group=overlapWithKnownGene2,x=log2TPM)) +
  geom_histogram() + facet_grid()

## Stats for genes with specific expression profiles

### 2 : plot statistics on the following subsets of novel genes :
# LZ (12%), PD (10%), RS (14%), PD+RS (8.8%), EES+LES (20%), LZ+PD+RS+EES+LES (2%) ###
## library(VennDiagram)
# load data on presence of predicted genes in individual txomes
genesPresence = readRDS(paste0(Rdatadir, 'presenceOfPredictedGenesInIndividualTxomes.rds'))
# get.venn.partitions() lives in VennDiagram, which is not attached here (the
# library() call above is commented out), so it is called through its namespace.
vennPartitionsNovel = VennDiagram::get.venn.partitions(x=genesPresence$novel)
vennPartitionsNovel$..set..
# Row indices of the partitions listed above.
# NOTE(review): these indices depend on the partition order produced for this
# particular input — confirm against the printed `..set..` column.
setsToKeep = c(31,30,28,26,8,1) # in order mentioned above

# One data frame per kept partition: the metadata rows of its genes, tagged
# with the partition name (text inside the first parentheses, '∩' shown as '+').
novelPlotDf = lapply(setsToKeep, function(i){
  genesToKeep = vennPartitionsNovel$..values..[[i]]
  setName = gsub('^\\(([^)]+)\\).*', '\\1', vennPartitionsNovel$..set..[i])
  setName = gsub('∩', '+', setName)
  datTmp = subset(metadataDf, gene_id %in% genesToKeep)
  datTmp$predictedIn = setName
  return(datTmp)
})
novelPlotDf = as.data.frame(do.call(rbind, novelPlotDf))
setsToKeepFormattedNames = unique(novelPlotDf$predictedIn)
# the longest label gets a line break so it fits under the axis
setsToKeepFormattedNames[6] = 'LZ+PD+RS\n+EES+LES'
novelPlotDf$predictedIn2 = novelPlotDf$predictedIn
novelPlotDf$predictedIn2[novelPlotDf$predictedIn2 == 'LZ+PD+RS+EES+LES'] = 'LZ+PD+RS\n+EES+LES'
# add number of genes in every subset as annotation
annotation = data.frame(x=setsToKeepFormattedNames,
                        y=0.95,
                        label=vennPartitionsNovel$..count..[setsToKeep])


# Per-subset fraction of mono-exonic genes (freqMono) and of genes that are
# not autosomal (freqX) among the novel genes.
tmp1 = data.frame(predictedIn2=setsToKeepFormattedNames,
                  stringsAsFactors=FALSE)
tmp1$freqMono = vapply(tmp1$predictedIn2, function(setLabel){
  inSet = novelPlotDf$predictedIn2 == setLabel
  sum(inSet & !novelPlotDf$isMultiExonic)/sum(inSet)
}, numeric(1), USE.NAMES=FALSE)
tmp1$freqX = vapply(tmp1$predictedIn2, function(setLabel){
  inSet = novelPlotDf$predictedIn2 == setLabel
  sum(inSet & !novelPlotDf$isAutosomal)/sum(inSet)
}, numeric(1), USE.NAMES=FALSE)

novelPlotDf$predictedIn2 <- as.factor(novelPlotDf$predictedIn2)
library(dplyr)

# Put the subsets in display order. The previous code called fct_relevel(),
# which belongs to forcats and is not attached by library(dplyr); this base-R
# relevel moves the listed levels to the front in the given order and keeps
# any other existing level after them.
novelPlotDf$predictedIn2 <- local({
  wantedOrder <- c("LZ","PD","RS","PD+RS","EES+LES","LZ+PD+RS\n+EES+LES")
  currentLevels <- levels(novelPlotDf$predictedIn2)
  factor(novelPlotDf$predictedIn2,
         levels = c(intersect(wantedOrder, currentLevels),
                    setdiff(currentLevels, wantedOrder)))
})

# Violin plus narrow boxplot of log2(max TPM + 0.1) for each novel-gene
# subset, saved as a 2 x 3 inch PDF.
pl1 <- ggplot(
  data = novelPlotDf,
  mapping = aes(x = predictedIn2, y = log(maxExpression + 0.1, 2))
) +
  geom_violin() +
  geom_boxplot(outlier.alpha = 0.3, width = 0.1) +
  labs(y = "log2(TPM + 0.1)") +
  theme(axis.title.x = element_blank())

ggsave(
  filename = "/tungstenfs/scratch/gbioinfo_work/gpeters/gillmark/Post_Replicative_MaleGermCell_RNA_Seq_Comparison/Figure_Generation_NovelGeneStats/02_Figure2_log2_TPM_expression_pattern.pdf",
  plot = pl1,
  width = 2,
  height = 3,
  units = "in"
)



## pl2 = ggplot(data=tmp1, aes(x=factor(predictedIn2, levels=setsToKeepFormattedNames), y=freqMono)) +
##   geom_col() + ylab('Frequency of mono-exonic') + xlab('') +
##   theme(axis.text.x=element_text(size=4)) 

## pl3 = ggplot(data=novelPlotDf, aes(x=factor(predictedIn2, levels=setsToKeepFormattedNames),
##                                    y=log2(geneLength))) + ylab('log2 Gene length') +
##   geom_violin() + geom_boxplot(width=.1, outlier.alpha=0.3) +
##   xlab('') + theme(axis.text.x=element_text(size=4))

## pl4 = ggplot(data=novelPlotDf, aes(x=factor(predictedIn2, levels=setsToKeepFormattedNames), y=log2(exonsLength))) +
##   geom_violin() + geom_boxplot(width=.1, outlier.alpha=0.3) + ylab('log2 Exon length') +
##   xlab('') + theme(axis.text.x=element_text(size=4))

## pl5 = ggplot(data=novelPlotDf, aes(x=factor(predictedIn2, levels=setsToKeepFormattedNames), y=GCpercentage)) +
##   geom_violin() + geom_boxplot(width=.1, outlier.alpha=0.3) + ylab('% GC') +
##   xlab('') + theme(axis.text.x=element_text(size=4))

## pl6 = ggplot(data=tmp1, aes(x=factor(predictedIn2, levels=setsToKeepFormattedNames), y=freqX)) +
##   geom_col() + ylab('Frequency of sex chromosome') + xlab('') +
##   theme(axis.text.x=element_text(size=4))

## bigPl = arrangeGrob(grobs=lapply(1:6, function(i) get(paste0('pl', i))),
##                     ncol=3)

## ggsave(filename=paste0(savedir, '62_statisticsForSubsetsOfNovelGenes_withoutLegends_2.png'), plot=bigPl,
##        device='png', width=8, height=6, units='in', dpi=250)


## Reload the saved gene statistics (filtered and unfiltered tables)
scriptnb <- "02_"

filtered_meta_df <- readRDS(
  file = "/tungstenfs/scratch/gbioinfo_work/gpeters/gillmark/Post_Replicative_MaleGermCell_RNA_Seq_Comparison/Figure_Generation_NovelGeneStats/filtered_metadata_withGeneStatistics.rds"
)

meta_df <- readRDS(
  file = "/tungstenfs/scratch/gbioinfo_work/gpeters/gillmark/Post_Replicative_MaleGermCell_RNA_Seq_Comparison/Figure_Generation_NovelGeneStats/metadata_withGeneStatistics.rds"
)

# known/novel label derived from the logical overlap flag
meta_df$overlapWithKnownGene2 <- ifelse(meta_df$overlapWithKnownGeneBool, "known", "novel")

# Violin plus narrow boxplot of log2(max TPM + 0.1), known vs novel genes,
# drawn from the unfiltered table.
TPM_plot <- ggplot(
  data = meta_df,
  mapping = aes(x = overlapWithKnownGene2, y = log(maxExpression + 0.1, 2))
) +
  geom_violin() +
  geom_boxplot(width = 0.1) +
  labs(y = "log2(TPM + 0.1)") +
  theme(axis.title.x = element_blank())

ggsave(
  filename = "/tungstenfs/scratch/gbioinfo_work/gpeters/gillmark/Post_Replicative_MaleGermCell_RNA_Seq_Comparison/Figure_Generation_NovelGeneStats/02_Figure2_log2_TPM_plot_novel_known.pdf",
  plot = TPM_plot,
  width = 2,
  height = 3,
  units = "in"
)
####
####

#### Figure 3 Generation: ####
#####
# PhastCons gives the probability for a given base to belong to a conserved element
# I computed the mean probability across exonic bases vs intronic bases for multi-exonic tx and the mean 
# probability across exonic bases vs the same number of bases downstream of the tx and not belonging to any
# known or newly predicted transcript for mono-exonic tx
# -> for mono-exonic tx, "intronicBases" in the table relates to the downstream untranscribed bases
# /!\ not all bases have a probability so I also kept the percentage of bases that were used to compute the mean
#
# Plot :
# 1. Intronic vs exonic conservation, facetted by overlapsGencode and exonicCategory
# 2. Exonic conservation vs tx expression (for tx with expression >= 0.5 TPM), facetted by overlapsGencode and 
#    exonicCategory
# 3. Intronic conservation vs tx expression (for tx with expression >= 0.5 TPM), facetted by overlapsGencode and 
#    exonicCategory
# Meeting from 21/08/19 :
# 5. Consider the set of known tx, novel tx, and take a set of "random" transcripts by assigning random start sites
#    in the genome and considering the structure of the novel tx (same exon structure)
#    Plot a histogram for the exonic conservation score for these 3 sets, we would expect the known to have a 
#    wide gaussian, the novel to have a narrower gaussian centered more left, and ideally the random set to have
#    a narrow gaussian centered even more to the left
# 6. Using conservation scores computed from an alignment of 6 close species, either considering the phylogenetic
#    model from UCSC (for euarchontoglires) or one I built from the alignment of the 6 species with phyloFit :
#    6.1. Make plot 1
#    6.2. Make plot 5 -> cumulative density instead of histogram
# 7. Correlation between the conservation scores : compare the known subsets from the 3 sources of scores,
#    compare the novel subsets from the 3 sources of scores (order of tx is the same between files)
# 8. Cumulative density of conservation scores (from the 6 species alignment and the phylogenetic model
#    computed with phyloFit) for tx that are repeat expression, tx that are chimeras initiated at repeats and
#    the rest
# 9. Cumulative plot of conservation scores generated from alignment of 6 species and computed phylogenetic model,
#    removing the tx coming from repeats, for the 4 subsets of tx (known, novel, randomKnown, randomNovel), and 
#    facetted by exonicCategory
# 10. Split the tx according to which celltype(s) they were found into, then plot exonic conservation distribution
#####


library(ggplot2)


# Output directories for the conservation analysis (these overwrite the
# `savedir` / `Rdatadir` values set for the previous figure).
savedir <- "/work2/gpeters/rohmalex/work/Mark/RNA-seq_181127/32_transcriptomeValidation/01_analysis/conservation/"
Rdatadir <- "/work2/gpeters/rohmalex/work/Mark/RNA-seq_181127/32_transcriptomeValidation/00_Rdata/"


### load data
consDf <- read.table(
  file = "/work2/gpeters/rohmalex/work/Mark/RNA-seq_181127/32_transcriptomeValidation/32_conservation_mm10.60way.phastCons.tsv",
  header = TRUE,
  sep = "\t",
  stringsAsFactors = FALSE
)

# /!\ for some tx, we had probabilities for none of the bases so the mean conservation score is "None" and the
# pct bases informative is 0
# -> as.numeric() turns those "None" entries into NA
consDf[["exonicMeanConservationScore"]] <- as.numeric(consDf[["exonicMeanConservationScore"]])
consDf[["intronicMeanConservationScore"]] <- as.numeric(consDf[["intronicMeanConservationScore"]])

# Diagnostics on missing scores and on the share of informative bases; the
# trailing comments record the values obtained on the original data.
sum(is.na(consDf$exonicMeanConservationScore)) #764 tx have no exonic conservation information
sum(is.na(consDf$intronicMeanConservationScore)) #783 tx have no intronic conservation information
sum(is.na(consDf$exonicMeanConservationScore) & is.na(consDf$intronicMeanConservationScore)) #394 common
table(consDf$exonicCategory[is.na(consDf$exonicMeanConservationScore)])
# mono-exonic multi-exonic 
#         471          293
table(consDf$exonicCategory[is.na(consDf$intronicMeanConservationScore)])
# mono-exonic multi-exonic 
#         423          360
table(consDf$pctExonicBasesInformative > 50)
# FALSE  TRUE 
#  2721 51138
table(consDf$pctIntronicBasesInformative > 50)
# FALSE  TRUE 
#  3581 50278
table(consDf$pctIntronicBasesInformative > 50 & consDf$pctExonicBasesInformative > 50)
# FALSE  TRUE 
#  4990 48869 

## keep only the transcripts for which we have more than 50% bases informative for both the exons and the introns
consDf_higherConf = subset(consDf, pctExonicBasesInformative > 50 & pctIntronicBasesInformative > 50)
# percentage of transcripts removed by the filter, computed from the actual
# row counts rather than hard-coded totals
round(100*(nrow(consDf)-nrow(consDf_higherConf))/nrow(consDf)) # 9% transcripts lost

## save data
saveRDS(consDf, paste0(Rdatadir, "conservation_phastCons_60way.rds"))
saveRDS(consDf_higherConf, paste0(Rdatadir, "conservation_phastCons_60way_txWithAtLeast50pctBasesInformative.rds"))


## plots

# "known" where overlapsGencode is TRUE, "novel" where it is FALSE. ifelse()
# is vectorised, so the whole column is converted in one call; this also
# keeps a vector (not an empty list) when the table has no rows.
consDf_higherConf$overlapsGencode2 = ifelse(consDf_higherConf$overlapsGencode, "known", 'novel')


##########
### 1. ###
##########

# Hex-binned scatterplots of intronic vs exonic mean conservation.
# scale_fill_viridis() is called through the viridis namespace because this
# section only attaches ggplot2. The transcript count in the titles is taken
# from the data instead of being hard-coded.
# NOTE(review): the titles say "at least 50 bases" while the table was
# filtered on percent of informative bases > 50 — confirm the wording.

# all tx, facetted by overlapsGencode2
pl = ggplot(data=consDf_higherConf,
            aes(x=exonicMeanConservationScore, y=intronicMeanConservationScore)) +
  geom_hex(bins=70) + viridis::scale_fill_viridis() +
  facet_wrap(.~overlapsGencode2) +
  ggtitle(paste0("All tx for which we have at least 50 bases intronic and exonic covered (", nrow(consDf_higherConf), ")")) +
  theme(plot.title=element_text(size=8))
ggsave(filename=paste0(savedir, "71_conservation_intronicVsExonicScatterplot_allTx_facettedByOverlapsGencode.png"),
       plot=pl, width=5, height=5, units='in', dpi=200)

# all tx, facetted by overlapsGencode2 and mono/multi-exonic
pl = ggplot(data=consDf_higherConf,
            aes(x=exonicMeanConservationScore, y=intronicMeanConservationScore)) +
  geom_hex(bins=70) + viridis::scale_fill_viridis() +
  facet_grid(overlapsGencode2~exonicCategory) +
  ggtitle(paste0("All tx for which we have at least 50 bases intronic and exonic covered (", nrow(consDf_higherConf), ")")) +
  theme(plot.title=element_text(size=8))
ggsave(filename=paste0(savedir, "72_conservation_intronicVsExonicScatterplot_allTx_facettedByOverlapsGencodeAndExonicCategory.png"),
       plot=pl, width=5, height=5, units='in', dpi=200)

# for the tx with intronicMeanConservationScore >= 0.75, check their gencodeTranscriptType

# Returns the k-th '_'-separated field of each identifier, or NA when the
# identifier has 3 fields or fewer. Replaces two mclapply() passes (the
# `parallel` package is not attached in this section) with one vectorised
# strsplit() per call.
fastaIdentifierField = function(ids, k){
  vapply(strsplit(ids, '_', fixed=TRUE),
         function(eSplit){
           if(length(eSplit)>3){
             eSplit[k]
           } else {
             NA_character_
           }},
         character(1))
}

# field 6 is used as the GENCODE transcript type, field 7 as the gffcompare class code
gencodeTranscriptType = fastaIdentifierField(consDf_higherConf$fastaIdentifier, 6)
gffcompareClassCode = fastaIdentifierField(consDf_higherConf$fastaIdentifier, 7)

consDf_higherConf$gencodeTranscriptType = gencodeTranscriptType
consDf_higherConf$gffcompareClassCode = gffcompareClassCode

table(consDf_higherConf$gencodeTranscriptType[consDf_higherConf$intronicMeanConservationScore >= 0.75])
# antisense                          lincRNA          nonsense-mediated-decay 
#        12                                5                                2 
# processed-pseudogene             processed-transcript                   protein-coding 
#                   50                                6                               32 
# transcribed-processed-pseudogene           unprocessed-pseudogene 
#                                1                                3 
table(consDf_higherConf$gffcompareClassCode[consDf_higherConf$intronicMeanConservationScore >= 0.75 &
                                              consDf_higherConf$gencodeTranscriptType == "protein-coding"])
# =  c  e  i  j  k  o 
# 3  1  1  8 11  2  6



###############
### 2. & 3. ###
###############

## read in expression values for all transcripts across celltypes and try to correlate exonicMeanConservationScore
## with expression
expressionDf = read.table(file="/work2/gpeters/rohmalex/work/Mark/RNA-seq_181127/31_transcriptomeAssembly/wholeProcess_190410/42_modelMerged/meanExpression_allTx.tsv",
                          header=T, sep='\t', stringsAsFactors=F)
# The cbind() below pairs rows by position, so the two tables must list the
# same transcripts in the same order. This was previously only printed
# (expected TRUE); it is now enforced so a mismatch stops the script.
stopifnot(identical(expressionDf$name, gsub('_.*$', '', consDf$fastaIdentifier)))
# same row selection as the one used to build consDf_higherConf
sel = which(consDf$pctExonicBasesInformative > 50 & consDf$pctIntronicBasesInformative > 50)
expressionDf = expressionDf[sel, ]

consAndExprDf = cbind(consDf_higherConf, expressionDf)

# For each cell type, keep the transcripts with mean TPM > 0.5 and save two
# hex-binned scatterplots against log2(TPM + 0.1): exonic conservation (file
# prefix 73_) and intronic conservation (file prefix 74_). Each facet is
# annotated with its transcript count.
invisible(lapply(c('LZ','PD','RS','EES','LES'), function(e){
  tpmCol = paste0('mean_TPM_', e)
  # which() drops transcripts whose TPM is NA instead of adding all-NA rows
  tmpDf = consAndExprDf[which(consAndExprDf[, tpmCol] > 0.5), ]
  annot = data.frame(overlapsGencode2=c("known", "known", "novel", "novel"),
                     exonicCategory=c("mono-exonic", "multi-exonic", "mono-exonic", "multi-exonic"))
  annot$count = vapply(seq_len(nrow(annot)), function(i){
    sum(tmpDf$overlapsGencode2 == annot$overlapsGencode2[i] &
          tmpDf$exonicCategory == annot$exonicCategory[i])
  }, integer(1))
  # Layers shared by both plots. scale_fill_viridis() is called through the
  # viridis namespace because this section only attaches ggplot2.
  sharedLayers = list(
    geom_hex(bins=70), viridis::scale_fill_viridis(),
    facet_grid(overlapsGencode2~exonicCategory),
    ggtitle(paste0("Tx for which we have at least 50 bases intronic and exonic\ncovered and expression > 0.5 TPM (", nrow(tmpDf), ")")),
    xlab(paste0("log2(mean_TPM_", e, " + 0.1)")), theme(plot.title=element_text(size=8)),
    geom_text(data=annot, aes(x=13, y=0.9, label=count)))
  pl1 = ggplot(data=tmpDf,
               aes(x=log2(.data[[tpmCol]]+0.1), y=exonicMeanConservationScore)) +
    sharedLayers
  ggsave(filename=paste0(savedir, "73_conservation_exonicConservationVsExpressionScatterplot_txWithExprGT0.5TPM_facettedByOverlapsGencodeAndExonicCategory_", e, ".png"),
         plot=pl1, width=5, height=5, units='in', dpi=200)
  pl2 = ggplot(data=tmpDf,
               aes(x=log2(.data[[tpmCol]]+0.1), y=intronicMeanConservationScore)) +
    sharedLayers
  ggsave(filename=paste0(savedir, "74_conservation_intronicConservationVsExpressionScatterplot_txWithExprGT0.5TPM_facettedByOverlapsGencodeAndExonicCategory_", e, ".png"),
         plot=pl2, width=5, height=5, units='in', dpi=200)
}))




###############
### 5. & 6. ###
###############

# 2 random subsets : one generated with the structure of the known tx and a random TSS from the same chromosome,
# and one generated from the structure of the novel tx
# all 3 files contain conservation scores from the same randomly generated tx

## conservation from alignment of 6 species
## with mod file from UCSC
cons_6species_UCSCmod <- read.table(
  file = "/work2/gpeters/rohmalex/work/Mark/RNA-seq_181127/32_transcriptomeValidation/35_conservation_mm10reducedMultipleAlignment_UCSCphylogeneticModel.tsv",
  header = TRUE,
  sep = "\t",
  stringsAsFactors = FALSE
)

# /!\ for some tx, we had probabilities for none of the bases so the mean conservation score is "None" and the
# pct bases informative is 0
# -> as.numeric() turns those "None" entries into NA
cons_6species_UCSCmod[["exonicMeanConservationScore"]] <- as.numeric(cons_6species_UCSCmod[["exonicMeanConservationScore"]])

sum(is.na(cons_6species_UCSCmod[["exonicMeanConservationScore"]])) #4640 tx have no exonic conservation information

table(cons_6species_UCSCmod[["pctExonicBasesInformative"]] > 50)
# FALSE  TRUE 
# 16077 91641

## keep only the transcripts for which we have more than 50% bases informative
cons_6species_UCSCmod_informativeTx <- cons_6species_UCSCmod[
  which(cons_6species_UCSCmod[["pctExonicBasesInformative"]] > 50),
]

table(cons_6species_UCSCmod_informativeTx[["overlapsGencode"]])
# known       novel randomKnown randomNovel 
# 41222        8272       34471        7676 

# OLD : histogram
# Overlaid, semi-transparent histograms of the exonic mean conservation score,
# one fill colour per transcript set, facetted by exonic category.
# The data hold four sets (known, novel, randomKnown, randomNovel), so the
# manual fill scale needs four colours; with three ggplot2 stops with
# "Insufficient values in manual scale".
pl = ggplot(data=cons_6species_UCSCmod_informativeTx, aes(x=exonicMeanConservationScore, fill=overlapsGencode)) +
  geom_histogram(position='identity', alpha=0.5, bins=50) + facet_wrap(~exonicCategory) +
  ggtitle('Mean conservation across exons for known, novel, and random tx\nConservation scores from alignment of 6 species close to mouse') +
  theme(plot.title=element_text(size=10), legend.position=c(0.2, 0.8)) +
  scale_fill_manual(values=c("#3fdb14", "#db3214", "#1450db", "#dbb014"))
ggsave(filename=paste0(savedir, "75_exonicConservation_knownVsNovelVsRandom_6speciesAlignment_facettedByExonicCategory.png"),
       plot=pl, width=5, height=5, units='in', dpi=200)

# Empirical cumulative distribution of the exonic mean conservation score,
# one line per transcript set, facetted by exonic category; the x axis is
# reversed so that high scores appear on the left.
pl <- ggplot(
  data = cons_6species_UCSCmod_informativeTx,
  mapping = aes(x = exonicMeanConservationScore, color = overlapsGencode)
) +
  stat_ecdf(position = "identity") +
  facet_wrap(~exonicCategory) +
  scale_x_reverse() +
  labs(
    y = "Cumulative density",
    title = "Mean conservation across exons for known, novel, and random tx\nConservation scores from alignment of 6 species close to mouse\nphylogenetic model from UCSC for euarchontoglires"
  ) +
  theme(plot.title = element_text(size = 10), legend.position = c(0.2, 0.8))
ggsave(
  filename = paste0(savedir, "81_exonicConservation_knownVsNovelVsRandom_6speciesAlignment_phylogeneticModelFromUCSC_facettedByExonicCategory.png"),
  plot = pl,
  width = 5,
  height = 5,
  units = "in",
  dpi = 200
)



#################
## conservation from alignment of 6 species
## with mod file computed with phyloFit
cons_6species_computedMod <- read.table(
  file = "/work2/gpeters/rohmalex/work/Mark/RNA-seq_181127/32_transcriptomeValidation/36_conservation_mm10reducedMultipleAlignment_computedPhylogeneticModel.tsv",
  header = TRUE,
  sep = "\t",
  stringsAsFactors = FALSE
)

# /!\ for some tx, we had probabilities for none of the bases so the mean conservation score is "None" and the
# pct bases informative is 0
# -> as.numeric() turns those "None" entries into NA
cons_6species_computedMod[["exonicMeanConservationScore"]] <- as.numeric(cons_6species_computedMod[["exonicMeanConservationScore"]])

sum(is.na(cons_6species_computedMod[["exonicMeanConservationScore"]])) #4715 tx have no exonic conservation information

table(cons_6species_computedMod[["pctExonicBasesInformative"]] > 50)
# FALSE  TRUE 
# 16424 91294 

## keep only the transcripts for which we have more than 50% bases informative
cons_6species_computedMod_informativeTx <- cons_6species_computedMod[
  which(cons_6species_computedMod[["pctExonicBasesInformative"]] > 50),
]

table(cons_6species_computedMod_informativeTx[["overlapsGencode"]])
# known       novel randomKnown randomNovel 
# 41169        8217       34277        7631 

# plot cumulative density
# (one line per transcript set, facetted by exonic category, x axis reversed)
pl <- ggplot(
  data = cons_6species_computedMod_informativeTx,
  mapping = aes(x = exonicMeanConservationScore, color = overlapsGencode)
) +
  stat_ecdf(position = "identity") +
  facet_wrap(~exonicCategory) +
  scale_x_reverse() +
  labs(
    y = "Cumulative density",
    title = "Mean conservation across exons for known, novel, and random tx\nConservation scores from alignment of 6 species close to mouse\nphylogenetic model computed with phyloFit and the 6 species alignment"
  ) +
  theme(plot.title = element_text(size = 10), legend.position = c(0.2, 0.8))
ggsave(
  filename = paste0(savedir, "82_exonicConservation_knownVsNovelVsRandom_6speciesAlignment_phylogeneticModelComputed_facettedByExonicCategory.png"),
  plot = pl,
  width = 5,
  height = 5,
  units = "in",
  dpi = 200
)



#################
## conservation from alignment of 60 species
cons_60species <- read.table(
  file = "/work2/gpeters/rohmalex/work/Mark/RNA-seq_181127/32_transcriptomeValidation/37_conservation_mm10.60multipleAlignment.tsv",
  header = TRUE,
  sep = "\t",
  stringsAsFactors = FALSE
)

# /!\ for some tx, we had probabilities for none of the bases so the mean conservation score is "None" and the
# pct bases informative is 0
# -> as.numeric() turns those "None" entries into NA
cons_60species[["exonicMeanConservationScore"]] <- as.numeric(cons_60species[["exonicMeanConservationScore"]])

sum(is.na(cons_60species[["exonicMeanConservationScore"]])) #3284 tx have no exonic conservation information

table(cons_60species[["pctExonicBasesInformative"]] > 50)
# FALSE  TRUE 
# 12994 94724

## keep only the transcripts for which we have more than 50% bases informative
cons_60species_informativeTx <- cons_60species[
  which(cons_60species[["pctExonicBasesInformative"]] > 50),
]

table(cons_60species_informativeTx[["overlapsGencode"]])
# known       novel randomKnown randomNovel 
# 42529        8604       35662        7929 

# cumulative distribution of the exonic score, one line per transcript set,
# facetted by exonic category, x axis reversed
pl <- ggplot(
  data = cons_60species_informativeTx,
  mapping = aes(x = exonicMeanConservationScore, color = overlapsGencode)
) +
  stat_ecdf(position = "identity") +
  facet_wrap(~exonicCategory) +
  scale_x_reverse() +
  labs(
    y = "Cumulative density",
    title = "Mean conservation across exons for known, novel, and random tx\nConservation scores from alignment of 60 vertebrates"
  ) +
  theme(plot.title = element_text(size = 10), legend.position = c(0.2, 0.8))
ggsave(
  filename = paste0(savedir, "83_exonicConservation_knownVsNovelVsRandom_60speciesAlignment_facettedByExonicCategory.png"),
  plot = pl,
  width = 5,
  height = 5,
  units = "in",
  dpi = 200
)



##########
### 7. ###
##########

# Correlation between the mean exonic scores from the 3 sources (tx are in the same order in each file)
# Start from the UCSC-model table, rename its 4th and 5th columns with a
# source suffix, then append the matching columns of the other two sources.
allConservationScores <- data.frame(cons_6species_UCSCmod)
colnames(allConservationScores)[4:5] <- c(
  "exonicMeanConservationScore_6species_UCSCmod",
  "pctExonicBasesInformative_6species_UCSCmod"
)
allConservationScores[["exonicMeanConservationScore_6species_computedMod"]] <- cons_6species_computedMod[["exonicMeanConservationScore"]]
allConservationScores[["pctExonicBasesInformative_6species_computedMod"]] <- cons_6species_computedMod[["pctExonicBasesInformative"]]
allConservationScores[["exonicMeanConservationScore_60species"]] <- cons_60species[["exonicMeanConservationScore"]]
allConservationScores[["pctExonicBasesInformative_60species"]] <- cons_60species[["pctExonicBasesInformative"]]


# keep the transcripts with more than 50% informative exonic bases in all
# three sources (rows where the condition is NA are dropped)
allConservationScores_informativeTx <- allConservationScores[
  which(
    allConservationScores[["pctExonicBasesInformative_6species_UCSCmod"]] > 50 &
      allConservationScores[["pctExonicBasesInformative_6species_computedMod"]] > 50 &
      allConservationScores[["pctExonicBasesInformative_60species"]] > 50
  ),
]

# Pearson correlation between the 6-species (computed model) and 60-species
# mean exonic scores, computed separately for each transcript class.
sapply(c('known', 'novel', 'randomKnown', 'randomNovel'), function(txClass){
  inClass <- allConservationScores_informativeTx$overlapsGencode == txClass
  cor(
    allConservationScores_informativeTx$exonicMeanConservationScore_6species_computedMod[inClass],
    allConservationScores_informativeTx$exonicMeanConservationScore_60species[inClass]
  )
})
#     known       novel randomKnown randomNovel 
# 0.9297871   0.8376683   0.7875205   0.7997629

# scatterplot: one hexbin panel per transcript class (60-species score against
# 6-species computed-model score), arranged on a single page with shared
# axis labels
library(gridExtra)
plList <- lapply(c('known', 'novel', 'randomKnown', 'randomNovel'), function(txClass){
  classRows <- allConservationScores_informativeTx$overlapsGencode == txClass
  ggplot(data=allConservationScores_informativeTx[classRows, ],
         aes(x=exonicMeanConservationScore_6species_computedMod,
             y=exonicMeanConservationScore_60species)) +
    geom_hex(bins=80) +
    scale_fill_viridis() +
    ggtitle(txClass) +
    theme(legend.position="none",
          axis.title.x=element_blank(),
          axis.title.y=element_blank(),
          plot.title=element_text(hjust=0.5))
})
# draw on the current device ...
grid.arrange(grobs=plList,
             left="exonicMeanConservationScore_60species",
             bottom="exonicMeanConservationScore_6species_computedMod")
# ... and build the same layout as an object so it can be written to disk
grob <- arrangeGrob(grobs=plList,
                    left="exonicMeanConservationScore_60species",
                    bottom="exonicMeanConservationScore_6species_computedMod")
ggsave(filename=paste0(savedir, "84_exonicConservation_60speciesVs6speciesAlignment_facettedByKnownOrNovelOrRandom.png"),
       plot=grob, width=5, height=5, units='in', dpi=200)


# Same per-class comparison as the Pearson one, using Spearman rank correlation.
sapply(c('known', 'novel', 'randomKnown', 'randomNovel'), function(txClass){
  inClass <- allConservationScores_informativeTx$overlapsGencode == txClass
  cor(
    allConservationScores_informativeTx$exonicMeanConservationScore_6species_computedMod[inClass],
    allConservationScores_informativeTx$exonicMeanConservationScore_60species[inClass],
    method='spearman'
  )
})
#     known       novel randomKnown randomNovel 
# 0.9266962   0.7576728   0.7382213   0.7258910 



##########
### 8. ###
##########

# Separate the transcripts depending on whether they come from repeat
# expression or are chimeras initiated at a repeat

# The repeat-associated transcripts are read once and reused for both checks.
txComingFromRepeats = readRDS(paste0(Rdatadir, 'txComingFromRepeats.rds'))

# tx_fastaId is compared with the first '_'-separated field of the fasta
# identifiers used in the conservation tables.
# Check 1: are all repeat-associated tx present in the full conservation table?
table(txComingFromRepeats$tx_fastaId %in%
        vapply(allConservationScores$fastaIdentifier, function(e){
          strsplit(e, '_')[[1]][1]
        }, character(1), USE.NAMES=FALSE))
# TRUE 
# 3591

# Check 2: how many of them survive the "informative" filter?
table(txComingFromRepeats$tx_fastaId %in%
        vapply(allConservationScores_informativeTx$fastaIdentifier, function(e){
          strsplit(e, '_')[[1]][1]
        }, character(1), USE.NAMES=FALSE))
# FALSE  TRUE 
#  1058  2533 
# -> only 2533 of the 3591 repeat-associated tx are in the informative set; the
#    other 1058 were filtered out because <= 50% of their exonic bases had a
#    conservation score in at least one of the three scorings

# Annotate every informative transcript with its repeat-overlap category.
# The full fasta identifier is kept in longFastaIdentifier, and fastaIdentifier
# is reduced to its first '_'-separated field so it can be compared with
# txComingFromRepeats$tx_fastaId.
# NOTE(review): these two lines are not idempotent - running them a second time
# copies the already-shortened id into longFastaIdentifier.
allConservationScores_informativeTx$longFastaIdentifier = allConservationScores_informativeTx$fastaIdentifier
allConservationScores_informativeTx$fastaIdentifier = sapply(allConservationScores_informativeTx$fastaIdentifier, function(e){strsplit(e, '_')[[1]][1]})
# Default category; overwritten below for transcripts listed in txComingFromRepeats.
allConservationScores_informativeTx$repeatOverlap = 'no'
# Restrict the repeat table to entries that have a row in the informative set.
# NOTE(review): single-bracket subsetting without a comma selects elements of a
# vector-like object (e.g. a GRanges) but would select columns of a plain
# data.frame - confirm the class of txComingFromRepeats.
txComingFromRepeats_subsetWithCons = txComingFromRepeats[which(txComingFromRepeats$tx_fastaId %in%
                                                                 allConservationScores_informativeTx$fastaIdentifier)]
# match() returns the position of the first matching fastaIdentifier, so if a
# short id occurs in several rows only the first one is relabelled.
# NOTE(review): if potentialCategory is a factor, this assignment into a
# character column stores its integer codes rather than its labels - confirm it
# is character.
allConservationScores_informativeTx$repeatOverlap[match(
  txComingFromRepeats_subsetWithCons$tx_fastaId,
  allConservationScores_informativeTx$fastaIdentifier)] = txComingFromRepeats_subsetWithCons$potentialCategory


# ECDF of the mean exonic conservation score, split by repeat-overlap category.
# The legend labels carry the number of transcripts per category
# ("<category>_<count>").
tmp = data.frame(allConservationScores_informativeTx)
# number of transcripts per category, named by category
tmp2 = sapply(c('no', 'chimeraStartingAtRepeat', 'repeatExpression'), function(e){sum(tmp$repeatOverlap == e)})
# Both the labels and the factor levels are built from the computed counts, so
# they always agree with the data (previously the levels hard-coded the counts
# 88761 / 2376 / 157, and any change in the input turned every value into NA).
# Level order follows the category order given above.
tmp$repeatOverlap = factor(
  paste(tmp$repeatOverlap, tmp2[as.character(tmp$repeatOverlap)], sep='_'),
  levels=paste(names(tmp2), tmp2, sep='_'))
pl = ggplot(data=tmp, aes(x=exonicMeanConservationScore_6species_computedMod, color=repeatOverlap)) +
  stat_ecdf(position='identity') + 
  scale_x_reverse() + ylab("Cumulative density") +
  ggtitle('Mean conservation across exons for tx coming from a repeat or not\nConservation scores from alignment of 6 species') +
  theme(plot.title=element_text(size=10), legend.position=c(0.3, 0.8))
ggsave(filename=paste0(savedir, "85_exonicConservation_splitByRepeatOverlap_6speciesAlignment_phylogeneticModelComputed.png"),
       plot=pl, width=5, height=5, units='in', dpi=200)



##########
### 9. ###
##########

# Cumulative density plot, removing tx coming from repeats
tmpDf <- data.frame(cons_6species_computedMod_informativeTx)
# reduce each fasta identifier to its first '_'-separated field
tmpDf$fastaIdentifier <- sapply(tmpDf$fastaIdentifier, function(longId){
  strsplit(longId, '_')[[1]][1]
})
# keep the transcripts whose short id has no match among the repeat-derived tx
tmpDf <- tmpDf[is.na(match(tmpDf$fastaIdentifier, txComingFromRepeats$tx_fastaId)), ]
pl <- ggplot(data=tmpDf, aes(x=exonicMeanConservationScore, color=overlapsGencode)) +
  stat_ecdf(position='identity') +
  facet_wrap(~exonicCategory) +
  scale_x_reverse() +
  labs(y="Cumulative density") +
  ggtitle('Mean conservation across exons for known, novel, and random tx\nConservation scores from alignment of 6 species close to mouse\nphylogenetic model computed with phyloFit and the 6 species alignment') +
  theme(plot.title=element_text(size=10),
        legend.position=c(0.2, 0.8))
ggsave(
  filename=paste0(savedir, "86_exonicConservation_knownVsNovelVsRandom_6speciesAlignment_phylogeneticModelComputed_txComingFromRepeatsRemoved_facettedByExonicCategory.png"),
  plot=pl, width=5, height=5, units='in', dpi=200
)



###########
### 10. ###
###########

# Split the tx according to the celltype(s) in which they were found, then plot the exonic conservation distribution
# The file 
# /work2/gpeters/rohmalex/work/Mark/RNA-seq_181127/31_transcriptomeAssembly/wholeProcess_190410/42_modelMerged/globalTranscriptome_presenceOfGenesInIndividualCelltypes.txt
# contains the list of genes (not tx) and if they are present in the individual txomes
# cons_6species_computedMod_informativeTx contains the longFastaIdentifier -> split into fastaIdentifier and gene_id
# -> in genesPresenceInIndTxomesDf, keep only genes that are present in cons_6species_computedMod_informativeTx
# -> [average the conservation score by gene (check if the variance is low between isoforms)]
#    -> if we average we will give more weight to some exons (present in many isoforms) and less to others (present
#       in only one isoform), so I will consider the max score across the isoforms as readout for the gene 
# -> create one column summarizing the pattern of presence : 'PPPAA', 'AAPAA', ...
# -> add one column with the overlap with repeats
# -> boxplots for exonic conservation for all patterns of presence of gene (removing the genes coming from repeats),
#    facetted by overlapsGencode

# Per-gene presence/absence table across the individual celltype transcriptomes
genesPresenceInIndTxomesDf = read.table(file='/work2/gpeters/rohmalex/work/Mark/RNA-seq_181127/31_transcriptomeAssembly/wholeProcess_190410/42_modelMerged/globalTranscriptome_presenceOfGenesInIndividualCelltypes.txt',
                                        header=TRUE, sep='\t', stringsAsFactors=FALSE)

# cons_6species_computedMod_informativeTx contains longFastaIdentifier : split into fastaIdentifier and gene_id
# (1st and 2nd '_'-separated fields; a missing field yields NA)
cons_6species_computedMod_informativeTx$longFastaIdentifier = cons_6species_computedMod_informativeTx$fastaIdentifier
cons_6species_computedMod_informativeTx$fastaIdentifier = vapply(
  strsplit(cons_6species_computedMod_informativeTx$longFastaIdentifier, '_'),
  function(fields){ fields[1] }, character(1))
cons_6species_computedMod_informativeTx$gene_id = vapply(
  strsplit(cons_6species_computedMod_informativeTx$longFastaIdentifier, '_'),
  function(fields){ fields[2] }, character(1))

# keep only the genes that have at least one informative transcript
genesPresenceInIndTxomesDf_informativeGenes = genesPresenceInIndTxomesDf[genesPresenceInIndTxomesDf$gene_id %in% cons_6species_computedMod_informativeTx$gene_id, ]
# -> 24089/27289 remaining

# consider max(conservation) across isoforms of each gene
# The maxima are computed in one grouped pass and then looked up by gene id,
# instead of rescanning the whole transcript table once per gene.
maxConservationByGene = tapply(cons_6species_computedMod_informativeTx$exonicMeanConservationScore,
                               cons_6species_computedMod_informativeTx$gene_id,
                               max)
# as.character() makes sure the lookup is by name even if gene_id was read as a number
genesPresenceInIndTxomesDf_informativeGenes$exonicMeanConservationScore_highestIsoform =
  as.vector(maxConservationByGene[as.character(genesPresenceInIndTxomesDf_informativeGenes$gene_id)])

# Summarise the presence of each gene across the five celltypes as one string:
# for every celltype column, a value starting with a lowercase letter is
# reduced to that letter (other values are left as they are), and the five
# results are concatenated in the order LZ, PD, RS, EES, LES.
# Done column-wise (vectorised) rather than row by row, so it is also safe for
# an empty table.
genesPresenceInIndTxomesDf_informativeGenes$presencePattern = do.call(
  paste0,
  unname(lapply(genesPresenceInIndTxomesDf_informativeGenes[, c('LZ','PD','RS','EES','LES')],
                function(presenceCol){ gsub('^([a-z]).*', '\\1', presenceCol) })))

# number of genes per (presence pattern, known/novel) combination
annotDf = as.data.frame(table(genesPresenceInIndTxomesDf_informativeGenes$presencePattern, genesPresenceInIndTxomesDf_informativeGenes$overlapWithKnownGene))
colnames(annotDf) = c('presencePattern', 'overlapWithKnownGene', 'count')

# Order the patterns by decreasing number of novel genes. The ordering is
# computed and applied on the same 'novel' subset, so it no longer depends on
# how as.data.frame(table()) lays out its rows.
annotDfNovel = annotDf[annotDf$overlapWithKnownGene == 'novel', ]
genesPresenceInIndTxomesDf_informativeGenes$presencePattern = factor(genesPresenceInIndTxomesDf_informativeGenes$presencePattern,
                                                                     levels=as.character(annotDfNovel$presencePattern)[order(annotDfNovel$count, decreasing=TRUE)])


# flag genes that also appear among the repeat-derived transcripts
genesPresenceInIndTxomesDf_informativeGenes$repeatOverlap =
  genesPresenceInIndTxomesDf_informativeGenes$gene_id %in% txComingFromRepeats$gene_id

# boxplots: conservation of the best isoform per presence pattern, known vs
# novel genes, genes coming from repeats excluded; the per-pattern gene counts
# are printed above the boxes (known in red, novel in blue)
# (optional facetting kept for reference: facet_wrap(~overlapWithKnownGene, nrow=2))
pl <- ggplot(
  data=genesPresenceInIndTxomesDf_informativeGenes[
    which(!genesPresenceInIndTxomesDf_informativeGenes$repeatOverlap), ],
  aes(x=presencePattern, y=exonicMeanConservationScore_highestIsoform, fill=overlapWithKnownGene)
) +
  geom_boxplot(outlier.alpha=0.1, outlier.size=1) +
  geom_text(data=subset(annotDf, overlapWithKnownGene == 'known'),
            aes(x=presencePattern, y=0.9, label=count),
            size=1.6, color="#e33010") +
  geom_text(data=subset(annotDf, overlapWithKnownGene == 'novel'),
            aes(x=presencePattern, y=0.85, label=count),
            size=1.6, color="#2ca4e6") +
  scale_fill_manual(name="overlapsGencode", values=c("#e33010", "#2ca4e6")) +
  theme(axis.text.x=element_text(angle=90, vjust=0.5))
ggsave(
  filename=paste0(savedir, "87_exonicConservationVsCelltypeOfOrigin_geneLevel_knownVsNovel_6speciesAlignment_phylogeneticModelComputed_txComingFromRepeatsRemoved.png"),
  plot=pl, width=7, height=4, units='in', dpi=200
)
####



#### Figure 4 Generation: ####

# /!\ considering each individual transcript's transcript_type
# in the gencode file, they put gene_type and transcript_type which can differ
# 
# Analysis of results from CPAT which identifies coding potential within transcript sequences
# Outputs a text file with mRNA_size, ORF_size, Fickett_score, Hexamer_score, coding_prob and the 
# transcripts' fasta identifiers as row names
#
# 1. The tool was run on all transcripts and we have for each of the ones overlapping a known transcript
# the corresponding known transcript id (obtained by running gffcompare on the merged transcriptome and Gencode 
# transcripts), the gene_id, the transcript_type and the gffcompare_class_code
# We can check whether predicted transcripts overlapping a gencode transcript classified as protein_coding
# get coding_prob closer to 1 and other types of non-coding transcripts get a value closer to 0 to
# assess the validity of the tool and set a threshold between 0 and 1 to discriminate between coding
# and non coding
# We can also check if we get different coding_prob depending on the gffcompare_class_code
# 1.1. For the known multi-exonic and then mono-exonic : boxplots for coding_prob for the different 
#      transcript_type
# 1.2. For the known multi-exonic and then mono-exonic : boxplots for coding_prob for the different 
#      transcript_type and gffcompare_class_code
#
# 2. Distribution of coding_prob for the novel transcripts, multi-exonic vs mono-exonic and autosomes vs sex chromosomes

# Gencode documentation on gene_type : (https://www.gencodegenes.org/pages/biotypes.html)

# ~~~ coding ~~~
# "PROTEIN-CODING"
# "IG-V-GENE"
# "TR-C-GENE"

# ~~~ maybeCoding ~~~
# "RETAINED-INTRON"
# "TEC"
# "NONSENSE-MEDIATED-DECAY"
# "POLYMORPHIC-PSEUDOGENE"

# ~~~ nonCoding ~~~
# "SNRNA"
# "SNORNA"
# "MIRNA"
# "SCARNA"
# "RRNA"
# "MISC-RNA"
# "RIBOZYME"
# "PROCESSED-TRANSCRIPT"
# "NON-STOP-DECAY"
# "SENSE-INTRONIC"
# "SENSE-OVERLAPPING"
# "ANTISENSE"
# "LINCRNA"
# "PSEUDOGENE"
# "UNITARY-PSEUDOGENE"
# "PROCESSED-PSEUDOGENE"
# "UNPROCESSED-PSEUDOGENE"
# "TRANSCRIBED-UNITARY-PSEUDOGENE"
# "TRANSCRIBED-PROCESSED-PSEUDOGENE"
# "TRANSCRIBED-UNPROCESSED-PSEUDOGENE"
# "BIDIRECTIONAL-PROMOTER-LNCRNA"
# "MACRO-LNCRNA"
#####


library(ggplot2)

## Basic file setup
# Output locations for the CPAT analysis: plots and R data live in
# sub-directories of baseDir.
baseDir <- '/tungstenfs/scratch/gbioinfo_work/gpeters/gillmark/Post_Replicative_MaleGermCell_RNA_Seq_Comparison/Coding_Potential_Analysis/CPAT/'
plotDir <- paste0(baseDir, "Plots/")
dataDir <- paste0(baseDir, "RData/")
# create the directories (parent first); nothing is reported if one already
# exists or cannot be created
invisible(lapply(c(baseDir, plotDir, dataDir), dir.create, showWarnings=FALSE))
# prefix identifying this part of the analysis
scriptnb <- "14_"


### Read the output from CPAT and extract all the information from the fasta identifier
cpatData = read.table(file='/work2/gpeters/rohmalex/work/Mark/RNA-seq_181127/32_transcriptomeValidation/13_CPAT_oneTranscriptTypePerIsoform/allTranscripts',
                      header=TRUE, sep='\t', stringsAsFactors=FALSE)
# the fasta identifiers are the row names of the CPAT output; move them to a column
cpatData$fastaId = rownames(cpatData)
rownames(cpatData) = NULL

# Fields encoded in a fasta identifier, in output column order. Also used as
# the vapply() template, so the result always has these 8 named rows, even for
# an empty table.
fastaIdFieldTemplate = c(isOnAutosome='',
                         gene_id='',
                         exonicCategory='',
                         overlapsGencodeTx='',
                         gencode_transcript_id='',
                         gencode_gene_name='',
                         gencode_transcript_type='',
                         gffcompare_class_code='')

# Split one fasta identifier on '_' and return its fields as a named character
# vector. Identifiers with exactly 3 fields are treated as not overlapping a
# known transcript; otherwise fields 4-7 carry the gencode annotation.
# NOTE(review): this assumes no individual field (e.g. a gene name) contains
# '_' - confirm against the fasta headers.
parseCpatFastaId = function(fastaId){
  fastaIdSplit = strsplit(fastaId, '_')[[1]]
  res = fastaIdFieldTemplate
  if(length(fastaIdSplit) == 3){ # does not overlap a known transcript
    res['overlapsGencodeTx'] = 'F'
    # 'NA' strings are turned into real NA values further down
    res[c('gencode_transcript_id', 'gencode_gene_name',
          'gencode_transcript_type', 'gffcompare_class_code')] = 'NA'
  } else { # overlaps with a known transcript
    res['overlapsGencodeTx'] = 'T'
    res['gencode_transcript_id'] = fastaIdSplit[4]
    res['gencode_gene_name'] = fastaIdSplit[5]
    res['gencode_transcript_type'] = fastaIdSplit[6]
    res['gffcompare_class_code'] = fastaIdSplit[7]
  }
  # first field is tested for a chrX / chrY prefix (case-insensitive)
  res['isOnAutosome'] = as.character(!grepl('^chr[XY]', fastaIdSplit[1], ignore.case=TRUE))
  res['gene_id'] = fastaIdSplit[2]
  res['exonicCategory'] = fastaIdSplit[3]
  return(res)
}

# 8 x n character matrix (one column per transcript), transposed into a data frame
fastaIdDf = vapply(cpatData$fastaId, parseCpatFastaId, fastaIdFieldTemplate, USE.NAMES=FALSE)
fastaIdDf = as.data.frame(t(fastaIdDf), stringsAsFactors=FALSE)
fastaIdDf$isOnAutosome = as.logical(fastaIdDf$isOnAutosome)
fastaIdDf$overlapsGencodeTx = as.logical(fastaIdDf$overlapsGencodeTx)
fastaIdDf[fastaIdDf == 'NA'] = NA

cpatData = cbind(cpatData, fastaIdDf)

table(cpatData$gencode_transcript_type)
# Collapse the gencode transcript types into three coding-potential classes.
# Vectorised over the whole column. Anything not listed - including the NA
# type of transcripts that do not overlap a known transcript - ends up in
# 'nonCoding'.
cpatData$gencode_transcript_type2 = ifelse(
  cpatData$gencode_transcript_type %in% c("IG-V-GENE", "TR-C-GENE", "PROTEIN-CODING"),
  'coding',
  ifelse(cpatData$gencode_transcript_type %in% c("RETAINED-INTRON", "TEC", "POLYMORPHIC-PSEUDOGENE", "NONSENSE-MEDIATED-DECAY"),
         'maybeCoding', 'nonCoding'))


# Multi-exonic transcripts that overlap a gencode transcript
cpatDataKnownMulti <- cpatData[
  which(cpatData$overlapsGencodeTx & cpatData$exonicCategory == "MULTI-EXONIC"), ]

# Per transcript type: number of transcripts (printed above each box) and
# coding-potential class
annot <- data.frame(gencode_transcript_type=unique(cpatDataKnownMulti$gencode_transcript_type),
                    stringsAsFactors=FALSE)
annot$count <- sapply(annot$gencode_transcript_type, function(txType){
  sum(cpatDataKnownMulti$gencode_transcript_type == txType)
})
annot$gencode_transcript_type2 <- sapply(annot$gencode_transcript_type, function(txType){
  if (txType %in% c("IG-V-GENE", "TR-C-GENE", "PROTEIN-CODING")) {
    'coding'
  } else if (txType %in% c("RETAINED-INTRON", "TEC", "POLYMORPHIC-PSEUDOGENE", "NONSENSE-MEDIATED-DECAY")) {
    'maybeCoding'
  } else {
    'nonCoding'
  }
})

# x-axis order: coding types first, then maybe-coding, then every other type
# in order of first appearance
tx_type_ordered <- unique(c('PROTEIN-CODING', 'IG-V-GENE', "TR-C-GENE",
                            "RETAINED-INTRON", 'TEC', 'POLYMORPHIC-PSEUDOGENE', "NONSENSE-MEDIATED-DECAY",
                            unique(annot$gencode_transcript_type)))

pl1 <- ggplot(data=cpatDataKnownMulti,
              aes(x=factor(gencode_transcript_type, levels=tx_type_ordered),
                  y=coding_prob,
                  fill=gencode_transcript_type2)) +
  geom_boxplot() +
  geom_text(data=annot, aes(x=gencode_transcript_type, y=1.1, label=count), size=1.2) +
  scale_fill_discrete(name='potential') +
  labs(x='gencode_transcript_type') +
  ggtitle('Multi-exonic transcripts overlapping a gencode transcript', 'All transcripts') +
  theme(axis.text.x=element_text(angle=90, hjust=1, size=5))
ggsave(
  filename=paste0(plotDir, '20_cpat_oneTranscriptTypePerIsoform_boxplots_codingProbabilityByTranscriptType_knownMultiExonic.png'),
  plot=pl1, width=6, height=5, units='in', dpi=200
)


# Mono-exonic transcripts that overlap a gencode transcript
cpatDataKnownMono = subset(cpatData, overlapsGencodeTx & exonicCategory == "MONO-EXONIC")

# Per transcript type: number of transcripts (printed above each box) and
# coding-potential class
annot = data.frame(gencode_transcript_type=unique(cpatDataKnownMono$gencode_transcript_type),
                   stringsAsFactors=FALSE)
annot$count = vapply(annot$gencode_transcript_type, function(e){
  sum(cpatDataKnownMono$gencode_transcript_type == e)
}, integer(1))
# Same type -> class mapping as the one used for cpatData$gencode_transcript_type2
# (TR-C-GENE is 'coding', POLYMORPHIC-PSEUDOGENE is 'maybeCoding'), so the
# annotation layer can never disagree with the fill of the boxes.
annot$gencode_transcript_type2 = ifelse(
  annot$gencode_transcript_type %in% c("IG-V-GENE", "TR-C-GENE", "PROTEIN-CODING"),
  'coding',
  ifelse(annot$gencode_transcript_type %in% c("RETAINED-INTRON", "TEC", "POLYMORPHIC-PSEUDOGENE", "NONSENSE-MEDIATED-DECAY"),
         'maybeCoding', 'nonCoding'))

# x-axis order: coding types first, then maybe-coding, then every other type
# in order of first appearance (same priority list as the multi-exonic plot)
tx_type_ordered = c('PROTEIN-CODING', 'IG-V-GENE', "TR-C-GENE",
                    "RETAINED-INTRON", 'TEC', 'POLYMORPHIC-PSEUDOGENE', "NONSENSE-MEDIATED-DECAY",
                    unique(annot$gencode_transcript_type))
tx_type_ordered = tx_type_ordered[!duplicated(tx_type_ordered)]

pl2 = ggplot(data=cpatDataKnownMono, aes(x=factor(gencode_transcript_type, levels=tx_type_ordered),
                                         y=coding_prob, fill=gencode_transcript_type2)) +
  geom_boxplot() +
  theme(axis.text.x=element_text(angle=90, hjust=1, size=5)) +
  xlab('gencode_transcript_type') + ggtitle('Mono-exonic transcripts overlapping a gencode transcript', 'All transcripts') +
  geom_text(data=annot, aes(x=gencode_transcript_type, y=1.1, label=count), size=2) +
  scale_fill_discrete(name='potential')

# written as a fixed-size PNG and as a PDF at the default device size
ggsave(filename=paste0(plotDir, 
                       '21_cpat_oneTranscriptTypePerIsoform_boxplots_codingProbabilityByTranscriptType_knownMonoExonic.png'),
       plot=pl2, width=6, height=5, units='in', dpi=200)

ggsave(filename=paste0(plotDir, 
                       '21_cpat_oneTranscriptTypePerIsoform_boxplots_codingProbabilityByTranscriptType_knownMonoExonic.pdf'),
       plot=pl2)


# Count the multi-exonic known transcripts per gencode transcript type in order
# to pick the most abundant types
annot <- data.frame(gencode_transcript_type=unique(cpatDataKnownMulti$gencode_transcript_type),
                    stringsAsFactors=FALSE)
annot$count <- sapply(annot$gencode_transcript_type, function(txType){
  sum(cpatDataKnownMulti$gencode_transcript_type == txType)
})
annot$gencode_transcript_type2 <- sapply(annot$gencode_transcript_type, function(txType){
  if (txType %in% c("IG-V-GENE", "TR-C-GENE", "PROTEIN-CODING")) {
    'coding'
  } else if (txType %in% c("RETAINED-INTRON", "TEC", "POLYMORPHIC-PSEUDOGENE", "NONSENSE-MEDIATED-DECAY")) {
    'maybeCoding'
  } else {
    'nonCoding'
  }
})

head(annot[order(annot$count, decreasing=TRUE), ])
#   gencode_transcript_type count gencode_transcript_type2
# 1          PROTEIN-CODING 29283                   coding
# 3                 LINCRNA  3836                nonCoding
# 2               ANTISENSE  2135                nonCoding
# 4 NONSENSE-MEDIATED-DECAY  1928              maybeCoding
# 6    PROCESSED-TRANSCRIPT  1340                nonCoding
# 5    PROCESSED-PSEUDOGENE   448                nonCoding
# -> PROTEIN-CODING, LINCRNA, ANTISENSE, NONSENSE-MEDIATED-DECAY, PROCESSED-TRANSCRIPT

# Restrict to the five most abundant types and tabulate the gffcompare class
# codes within each of them
mostAbundantTxTypesDf <- cpatDataKnownMulti[
  cpatDataKnownMulti$gencode_transcript_type %in%
    c('PROTEIN-CODING','LINCRNA','ANTISENSE','NONSENSE-MEDIATED-DECAY','PROCESSED-TRANSCRIPT'), ]
sapply(unique(mostAbundantTxTypesDf$gencode_transcript_type), function(txType){
  ofThisType <- mostAbundantTxTypesDf$gencode_transcript_type == txType
  table(mostAbundantTxTypesDf$gffcompare_class_code[ofThisType])
})
# $`PROTEIN-CODING`
#    =     C     I     J     K     O     Y 
# 9535   343  1115 17241   865   172    12 
# $ANTISENSE
#   =    C    I    J    K    O 
# 434    7   47 1444   93  110 
# $LINCRNA
#   =    C    I    J    K    O    Y 
# 778   19  135 2623  169   85   27 
# $`NONSENSE-MEDIATED-DECAY`
#   =    C    I    J    K    O 
# 280   17   54 1567    8    2 
# $`PROCESSED-TRANSCRIPT`
#   =   C   I   J   K   O 
# 503  16  59 708  37  17

# Annotation table for the violin plots: one row per (transcript type, class
# code) combination, with the number of transcripts and the percentage that
# this number represents within the transcript type.
classCodesTmp = unique(mostAbundantTxTypesDf$gffcompare_class_code)
# transcript types shown in the plots; the grid size below is derived from
# this vector instead of a hard-coded repeat count
txTypesTmp = c('PROTEIN-CODING','LINCRNA','ANTISENSE','NONSENSE-MEDIATED-DECAY','PROCESSED-TRANSCRIPT')

annot = data.frame(gencode_transcript_type=rep(txTypesTmp, each=length(classCodesTmp)),
                   gffcompare_class_code=rep(classCodesTmp, times=length(txTypesTmp)))
# as.character() keeps the comparisons valid if data.frame() turned the
# columns into factors; NA comparisons count as "no match"
annot$count = vapply(seq_len(nrow(annot)), function(i){
  sum(mostAbundantTxTypesDf$gencode_transcript_type == as.character(annot$gencode_transcript_type[i]) &
        mostAbundantTxTypesDf$gffcompare_class_code == as.character(annot$gffcompare_class_code[i]),
      na.rm=TRUE)
}, integer(1))

# percentage within the transcript type, reusing the counts computed above
annot$pct = round(100 * annot$count / vapply(seq_len(nrow(annot)), function(i){
  sum(mostAbundantTxTypesDf$gencode_transcript_type == as.character(annot$gencode_transcript_type[i]),
      na.rm=TRUE)
}, integer(1)), 1)

# coding probability per class code, one facet row per transcript type,
# labelled with the transcript counts
pl3 = ggplot(data=mostAbundantTxTypesDf, aes(x=gffcompare_class_code, y= coding_prob)) + geom_violin() +
  facet_wrap(.~gencode_transcript_type, nrow=5) +
  geom_text(data=annot, aes(x=gffcompare_class_code, y=1.1, label=count), size=2)

ggsave(filename=paste0(plotDir,
                       '22_cpat_oneTranscriptTypePerIsoform_violinplots_codingProbabilityByTranscriptType_splitByTranscriptType_knownMultiExonic.png'),
       plot=pl3, width=4, height=6, units='in', dpi=200)

ggsave(filename=paste0(plotDir, '22_cpat_oneTranscriptTypePerIsoform_violinplots_codingProbabilityByTranscriptType_splitByTranscriptType_knownMultiExonic.pdf'),
       plot=pl3)

# same plot, labelled with the percentages instead of the counts
pl31 = ggplot(data=mostAbundantTxTypesDf, aes(x=gffcompare_class_code, y= coding_prob)) + geom_violin() +
  facet_wrap(.~gencode_transcript_type, nrow=5) +
  geom_text(data=annot, aes(x=gffcompare_class_code, y=1.1, label=pct), size=2)

ggsave(filename=paste0(plotDir,
                       '22_cpat_oneTranscriptTypePerIsoform_violinplots_codingProbabilityByTranscriptType_splitByTranscriptType_knownMultiExonic_percentage.png'),
       plot=pl31, width=4, height=6, units='in', dpi=200)

ggsave(filename=paste0(plotDir,
                       '22_cpat_oneTranscriptTypePerIsoform_violinplots_codingProbabilityByTranscriptType_splitByTranscriptType_knownMultiExonic_percentage.pdf'),
       plot=pl31)

# Count the mono-exonic known transcripts per gencode transcript type in order
# to pick the most abundant types
annot <- data.frame(gencode_transcript_type=unique(cpatDataKnownMono$gencode_transcript_type),
                    stringsAsFactors=FALSE)
annot$count <- sapply(annot$gencode_transcript_type, function(txType){
  sum(cpatDataKnownMono$gencode_transcript_type == txType)
})
annot$gencode_transcript_type2 <- sapply(annot$gencode_transcript_type, function(txType){
  if (txType %in% c("IG-V-GENE", "TR-C-GENE", "PROTEIN-CODING")) {
    'coding'
  } else if (txType %in% c("RETAINED-INTRON", "TEC", "POLYMORPHIC-PSEUDOGENE", "NONSENSE-MEDIATED-DECAY")) {
    'maybeCoding'
  } else {
    'nonCoding'
  }
})

head(annot[order(annot$count, decreasing=TRUE), ])
#    gencode_transcript_type count gencode_transcript_type2
# 1           PROTEIN-CODING  1812                   coding
# 2     PROCESSED-PSEUDOGENE   732                nonCoding
# 7                  LINCRNA   267                nonCoding
# 6                ANTISENSE   165                nonCoding
# 5                      TEC   147              maybeCoding
# 10    PROCESSED-TRANSCRIPT   133                nonCoding
# -> PROTEIN-CODING, PROCESSED-PSEUDOGENE, LINCRNA, ANTISENSE

# Restrict to the four selected types and tabulate the gffcompare class codes
# within each of them
mostAbundantTxTypesMonoDf <- cpatDataKnownMono[
  cpatDataKnownMono$gencode_transcript_type %in%
    c('PROTEIN-CODING','PROCESSED-PSEUDOGENE','LINCRNA','ANTISENSE'), ]
sapply(unique(mostAbundantTxTypesMonoDf$gencode_transcript_type), function(txType){
  ofThisType <- mostAbundantTxTypesMonoDf$gencode_transcript_type == txType
  table(mostAbundantTxTypesMonoDf$gffcompare_class_code[ofThisType])
})
#   PROTEIN-CODING PROCESSED-PSEUDOGENE ANTISENSE LINCRNA
# =            149                  236        22      12
# C            221                   88        16      14
# E            118                   22        51      55
# I           1005                    1        33     127
# K            141                  308        24      25
# O            135                   56        13      17
# P             43                   21         6      17
# Annotation table for the mono-exonic transcripts: one row per (transcript
# type, class code) combination, with the number of transcripts and the
# percentage that this number represents within the transcript type.
classCodesTmp = unique(mostAbundantTxTypesMonoDf$gffcompare_class_code)
# transcript types covered; the grid size below is derived from this vector
# instead of a hard-coded repeat count
txTypesMonoTmp = c('PROTEIN-CODING','PROCESSED-PSEUDOGENE','LINCRNA','ANTISENSE')

annot = data.frame(gencode_transcript_type=rep(txTypesMonoTmp, each=length(classCodesTmp)),
                   gffcompare_class_code=rep(classCodesTmp, times=length(txTypesMonoTmp)))
# as.character() keeps the comparisons valid if data.frame() turned the
# columns into factors; NA comparisons count as "no match"
annot$count = vapply(seq_len(nrow(annot)), function(i){
  sum(mostAbundantTxTypesMonoDf$gencode_transcript_type == as.character(annot$gencode_transcript_type[i]) &
        mostAbundantTxTypesMonoDf$gffcompare_class_code == as.character(annot$gffcompare_class_code[i]),
      na.rm=TRUE)
}, integer(1))

# percentage within the transcript type, reusing the counts computed above
annot$pct = round(100 * annot$count / vapply(seq_len(nrow(annot)), function(i){
  sum(mostAbundantTxTypesMonoDf$gencode_transcript_type == as.character(annot$gencode_transcript_type[i]),
      na.rm=TRUE)
}, integer(1)), 1)

# save data
saveRDS(cpatData, file=paste0(dataDir, 'CPAT_df.rds'))

####
## Take only those genes with high coding probability from CPAT prediction to perform
## further ORFik analysis on

## Load libraries
library(ggplot2)
library("ORFik", lib.loc ="/tungstenfs/groups/gpeters/DB/R_libs")
library(GenomicFeatures)
library(data.table)
library(GenomicRanges)
library(BSgenome.Mmusculus.UCSC.mm10)
library(rtracklayer)
library(AnnotationDbi)
library(dplyr)
library(stringr)
library(Biostrings)
library(tidyverse)

## Basic file setup
# Output locations for the CPAT analysis: plots and R data live in
# sub-directories of baseDir.
baseDir <- '/tungstenfs/scratch/gbioinfo_work/gpeters/gillmark/Post_Replicative_MaleGermCell_RNA_Seq_Comparison/Coding_Potential_Analysis/CPAT/'
plotDir <- paste0(baseDir, "Plots/")
dataDir <- paste0(baseDir, "RData/")
# create the directories (parent first); nothing is reported if one already
# exists or cannot be created
invisible(lapply(c(baseDir, plotDir, dataDir), dir.create, showWarnings=FALSE))
# prefix added to the files written by this part of the analysis
scriptnb <- "15_"

## Load CPAT data and derive the columns used for plotting:
##  - Known_Novel    : "known" / "novel" label from overlapsGencodeTx
##  - exonicCategory : lower-cased to match the figures in the rest of the paper
##  - Protein_length : predicted amino acid length of the ORF (ORF_size/3 - 1)
CPAT_data <- readRDS(paste0(dataDir, "CPAT_df.rds")) %>%
  mutate(
    Known_Novel = case_when(
      overlapsGencodeTx ~ "known",
      !overlapsGencodeTx ~ "novel"
    ),
    exonicCategory = tolower(exonicCategory),
    Protein_length = ORF_size / 3 - 1
  )

## Make some basic plots of CPAT data
## Predicted protein length against coding probability, one panel per
## exonic category (rows) and known/novel status (columns)
Protein_length_vs_CPAT_score <- ggplot(
  CPAT_data,
  aes(x = Protein_length, y = coding_prob)
) +
  geom_point() +
  labs(x = "Predicted protein length (amino acids)",
       y = "Coding Probability") +
  facet_grid(exonicCategory ~ Known_Novel)

ggsave(
  filename = paste0(plotDir, scriptnb, "Protein_length_vs_CPAT_score.pdf"),
  plot = Protein_length_vs_CPAT_score
)

## Subsets used below: all known (Gencode-overlapping) transcripts, the
## transcripts whose type is classified as "coding" (to see the distribution
## in "real" protein coding genes), and all novel transcripts

Gencode_CPAT <- CPAT_data %>% filter(Known_Novel == "known")

Gencode_coding <- CPAT_data %>% filter(gencode_transcript_type2 == "coding")

Novel_CPAT <- CPAT_data %>% filter(Known_Novel == "novel")

## Protein length vs. CPAT score for known genes, one panel per combination
## of exonic category and coding-potential class
Protein_length_vs_CPAT_score_known <- ggplot(
  Gencode_CPAT,
  aes(x = Protein_length, y = coding_prob)
) +
  geom_point() +
  labs(x = "Predicted protein length (amino acids)",
       y = "Coding Probability") +
  facet_wrap(exonicCategory ~ gencode_transcript_type2)

ggsave(
  filename = paste0(plotDir, scriptnb, "Protein_length_vs_CPAT_score_KnownProteinCoding.pdf"),
  plot = Protein_length_vs_CPAT_score_known
)

## Protein length distribution of known coding genes, split by exonic
## category (the plot object is only created here, it is not written to disk)
Protein_length_known_coding <- ggplot(
  Gencode_coding,
  aes(x = Protein_length)
) +
  geom_histogram(bins = 100) +
  facet_grid(~exonicCategory)

#summary(Gencode_coding)
#coding_prob
#Min.   :0.0000186      
#1st Qu.:0.6811106 
#Median :0.9925440  
#Mean   :0.7852492                     
#3rd Qu.:0.9999754                     
#Max.   :1.0000000 

#Protein_length
#-1  
#1st Qu.: 151  
#Median : 332  
#Mean   : 466  
#3rd Qu.: 596  
#Max.   :6993 

#summary(Novel_CPAT)
#coding_prob
#Min.   :0.00000       
#1st Qu.:0.03588  
#Median :0.06972  
#Mean   :0.12302                     
#3rd Qu.:0.13794                     
#Max.   :1.00000

#Protein_length   
#Min.   :  -1.00  
#1st Qu.:  44.00  
#Median :  65.00  
#Mean   :  76.93  
#3rd Qu.:  91.00  
#Max.   :3528.00

## Flag every transcript as "high" or "low" coding potential (both the
## coding-probability and the protein-length cutoff must be met for "high"),
## then store the grouping columns as factors
CPAT_data <- CPAT_data %>%
  mutate(HighCoding = case_when(
    coding_prob >= 0.68 & Protein_length >= 151 ~ "high",
    TRUE ~ "low"
  )) %>%
  mutate(across(
    all_of(c("HighCoding", "exonicCategory", "isOnAutosome", "Known_Novel")),
    as.factor
  ))

#    Known_Novel exonicCategory isOnAutosome HighCoding     n
# 1        known   multi-exonic         TRUE       high 23978
# 2        known   multi-exonic         TRUE        low 15360
# 3        novel   multi-exonic         TRUE        low  6344
# 4        novel    mono-exonic         TRUE        low  3242
# 5        known    mono-exonic         TRUE        low  2802
# 6        known   multi-exonic        FALSE       high   695
# 7        known    mono-exonic         TRUE       high   581
# 8        known   multi-exonic        FALSE        low   328
# 9        known    mono-exonic        FALSE        low   124
# 10       novel   multi-exonic         TRUE       high    99
# 11       novel   multi-exonic        FALSE        low    83
# 12       novel    mono-exonic         TRUE       high    77
# 13       known    mono-exonic        FALSE       high    72
# 14       novel    mono-exonic        FALSE        low    41
# 15       novel    mono-exonic        FALSE       high    25
# 16       novel   multi-exonic        FALSE       high     8

## Generate a frequency table for known vs. novel for exon number
## autosomal location, and high coding probability
CPAT_data %>%
  count(Known_Novel, exonicCategory, isOnAutosome, HighCoding, sort = TRUE)

## Subset the novel genes based on the 1st quartile values for known
## protein-coding genes (coding_prob >= 0.68, Protein_length >= 151)
Novel_coding <- filter(Novel_CPAT, coding_prob >= 0.68 & Protein_length >= 151)

saveRDS(Novel_coding, file = paste0(dataDir, scriptnb, "Novel_coding_CPAT.rds"))


## Compare the distribution of values for these genes with a random subset of
## genes of the same size from the known protein-coding set.
## The sample size is taken from Novel_coding instead of being hard-coded, so
## the two groups stay size-matched if the thresholds or the input change.
## NOTE(review): no seed is set in this section, so the sampled subset differs
## between runs unless a seed was set earlier in the file.
Gencode_coding_sample <- sample_n(Gencode_coding,
                                  size = nrow(Novel_coding))

Sampled_set_merge <- rbind(Gencode_coding_sample,
                           Novel_coding)

## Overlaid histograms (sampled known vs. novel coding genes), first for the
## CPAT coding probability ...
Sampled_plot_coding_prob <-
  ggplot(Sampled_set_merge, aes(x = coding_prob, fill = Known_Novel)) +
  geom_histogram(bins = 100, alpha = 0.5) +
  labs(x = "Coding Probability")
ggsave(
  filename = paste0(plotDir, scriptnb,
                    "Sampled_histogram_Coding_gene_Coding_prob.pdf"),
  plot = Sampled_plot_coding_prob
)

## ... and then for the predicted protein length
Sampled_plot_protein_length <-
  ggplot(Sampled_set_merge, aes(x = Protein_length, fill = Known_Novel)) +
  geom_histogram(bins = 100, alpha = 0.5) +
  labs(x = "Protein length (amino acids)")
ggsave(
  filename = paste0(plotDir, scriptnb,
                    "Sampled_histogram_Coding_gene_Protein_length.pdf"),
  plot = Sampled_plot_protein_length
)


## Build a TxDb from the de novo assembly and pull out the novel coding genes

## GTF of the de novo assembly (merged over all cell types)
DeNovo_Mouse_GTF_annotation <- "/tungstenfs/groups/gpeters/rohmalex/work/Mark/RNA-seq_181127/31_transcriptomeAssembly/wholeProcess_190410/42_modelMerged/globalTranscriptome.gtf"

## Two-column (name, value) metadata table handed to makeTxDbFromGFF()
MetaData_names <- c("Organism", "Taxonomy ID", "Genome")
MetaData_values <- c("Mus musculus", "10090", "mm10")
Meta_Data_dataframe <- data.frame(name = MetaData_names,
                                  value = MetaData_values)

## TxDb built from the GTF file, carrying the metadata above
DeNovo_TxDb <- makeTxDbFromGFF(
  DeNovo_Mouse_GTF_annotation,
  dataSource = "Rohmer_Assembly",
  organism = "Mus musculus",
  metadata = Meta_Data_dataframe
)

## Gene IDs of the novel transcripts that passed the CPAT thresholds
Novel_coding_IDs <- unique(Novel_coding$gene_id)

## Columns available in the TxDb
cols <- columns(DeNovo_TxDb)

## Transcripts grouped by gene, then flattened to one GRanges whose names are
## the gene IDs
DeNovo_grList <- transcriptsBy(DeNovo_TxDb, by = "gene")
DeNovo_gr <- unlist(DeNovo_grList)

## Keep only the transcripts belonging to the novel coding genes
Novel_coding.gr <- DeNovo_gr[names(DeNovo_gr) %in% Novel_coding_IDs]
## NOTE(review): transcripts() is called on a GRanges here, not on a TxDb -
## confirm this call is intended / succeeds
Novel_coding.gr_tx <- transcripts(Novel_coding.gr)
Novel_coding.gr[1,]

## Interactive inspection of a single transcript's exons
exonsBy(DeNovo_TxDb, "tx", use.names = TRUE)[[45708]]

## Make a table with all of the information about these genes
## Remove the irrelevant columns from the dataframe
unneeded_columns <- c("Fickett_score",
                      "Hexamer_score",
                      "isOnAutosome",
                      "overlapsGencodeTx",
                      "gencode_transcript_id",
                      "gencode_gene_name",
                      "gencode_transcript_type",
                      "gffcompare_class_code",
                      "gencode_transcript_type2",
                      "Known_Novel")
## select() is namespace-qualified because AnnotationDbi (whose columns()
## generic is used on the TxDb in this script) also exports a select() that
## does not accept data frames; all_of() makes the use of an external
## character vector explicit and errors if a listed column is missing.
Novel_coding_df <- dplyr::select(Novel_coding,
                                 -dplyr::all_of(unneeded_columns))

## Reorder the columns to make the dataframe more logical
Novel_coding_df <-
  Novel_coding_df %>%
  relocate(gene_id,
           coding_prob,
           mRNA_size,
           ORF_size,
           Protein_length,
           exonicCategory,
           fastaId)

## Get the chromosome, start and strand from the fastaId column
## (judging by the patterns below, fastaId looks like "chr:start-end(strand)")

## everything before the first ":" is the chromosome
Novel_coding_df$Chromosome <- sub("\\:.*", "", Novel_coding_df$fastaId)

## text between ":" and the first "-" is the start; the delimiters captured by
## the pattern are trimmed off with str_sub(). Stored as character.
Novel_coding_df$Start_site <- str_extract(Novel_coding_df$fastaId, ":(.*?)-")
Novel_coding_df$Start_site <- str_sub(Novel_coding_df$Start_site, 2, -2)

## text between the parentheses is the strand
Novel_coding_df$Strand <- str_extract(Novel_coding_df$fastaId, "\\((.*?)\\)")
Novel_coding_df$Strand <- str_sub(Novel_coding_df$Strand, 2, -2)
Novel_coding_df$Strand <- as.factor(Novel_coding_df$Strand)

## NOTE(review): Novel_gene_ids is not defined in this section (the IDs
## extracted above are stored as Novel_coding_IDs) - confirm it is defined
## earlier in the file and is the intended set.
Novel_coding.gr <- subset(DeNovo_gr, names(DeNovo_gr) %in% Novel_gene_ids)


## Get the transcript sequences for the samples
## (exons grouped per transcript, named by transcript ID)
transcripts <- exonsBy(DeNovo_TxDb, by = "tx", use.names = TRUE)
DeNovo_transcript_seq <- extractTranscriptSeqs(BSgenome.Mmusculus.UCSC.mm10,
                                               transcripts)

## Convert the transcript IDs to gene IDs for matching.
## Transcript IDs have the form "<gene_id>.<isoform number>" (e.g.
## "MSTRG.14901.1"); strip the trailing ".<digits>" instead of a fixed two
## characters so that isoform numbers >= 10 also map back to their gene.
Transcript_names <- names(DeNovo_transcript_seq)
All_gene_names <- sub("\\.[0-9]+$", "", Transcript_names)

## NOTE(review): Novel_gene_ids is not defined in this section - confirm it is
## defined earlier in the file.
Overlap_case <- All_gene_names %in% Novel_gene_ids
Overlap_indices <- which(Overlap_case)

Novel_transcript_sequences <- DeNovo_transcript_seq[Overlap_indices]


## Flatten the ORF list and derive Gene_ID / Tx_ID columns from the ORF names
ORFs.gr <- unlist(ORFs)

ORF_metadata <- mcols(ORFs.gr)

## Gene_ID: the "MSTRG.<n>" prefix (the pattern also captures one trailing
## character, which str_sub() trims off again)
ORF_metadata$Gene_ID <- str_sub(
  str_extract(ORF_metadata$names, "MSTRG.(\\d+)."),
  1, -2
)

## Tx_ID: the "MSTRG.<n>.<m>" prefix
ORF_metadata$Tx_ID <- str_extract(ORF_metadata$names, "MSTRG.(\\d+.)(\\d+)")

ORFs.gr$Gene_ID <- ORF_metadata$Gene_ID
ORFs.gr$Tx_ID <- ORF_metadata$Tx_ID

## Keep the ORFs of genes with high CPAT scores and regroup them per ORF name
High_CPAT_ORFs <- ORFs.gr[ORFs.gr$Gene_ID %in% Novel_coding_IDs]
High_CPAT_ORFs <- groupGRangesBy(High_CPAT_ORFs, High_CPAT_ORFs$names)
# length(High_CPAT_ORFs)
# 819 (number of ORFs in the genes with high CPAT score)

## Keep ORFs of at least 453 nt, i.e. 151 codons
High_CPAT_ORFs <- High_CPAT_ORFs[widthPerGroup(High_CPAT_ORFs) >= 453]
# length(High_CPAT_ORFs)
# 443

## Browser track (BED12) of the retained ORFs
export.bed12(High_CPAT_ORFs,
             file = paste0(dataDir, scriptnb, "High_CPAT_ORFs.bed"))

## Nucleotide sequences of the retained ORFs, written as FASTA
High_CPAT_seq <- extractTranscriptSeqs(
  BSgenome.Mmusculus.UCSC.mm10::Mmusculus,
  High_CPAT_ORFs
)

writeXStringSet(
  High_CPAT_seq,
  filepath = paste0(dataDir, scriptnb, "High_CPAT_ORF_sequences.fasta")
)

## Translated protein sequences, stored both as RDS and as FASTA
High_CPAT_aa_seq <- Biostrings::translate(High_CPAT_seq)

saveRDS(High_CPAT_aa_seq,
        file = paste0(dataDir, scriptnb, "Novel_gene_aa_sequences.rds"))

writeXStringSet(
  High_CPAT_aa_seq,
  filepath = paste0(dataDir, scriptnb, "High_CPAT_protein_sequences.fasta")
)

### Transcript IDs listed as MS-confirmed coding transcripts
Confirmed_transcripts <- c(
  "MSTRG.14901.1",
  "MSTRG.18522.1",
  "MSTRG.21743.1",
  "MSTRG.21752.1",
  "MSTRG.22002.1",
  "MSTRG.26070.1",
  "MSTRG.4320.1",
  "MSTRG.6398.1",
  "MSTRG.10450.1",
  "MSTRG.12384.1",
  "MSTRG.12829.1",
  "MSTRG.12535.1",
  "MSTRG.26789.1",
  "MSTRG.26792.1",
  "MSTRG.26837.1",
  "MSTRG.26874.1",
  "MSTRG.27007.1",
  "MSTRG.26430.1",
  "MSTRG.26734.1"
)

### Look up their sequences by name in the full transcript sequence set
Coding_transcript_seq <- DeNovo_transcript_seq[Confirmed_transcripts]
####




#### Figure 5 Generation: ####
#### create annotation table for transcriptome vs repeats relationship ####

### CUTOFFS ###
## minimum exon-vs-repeat Jaccard index passed to the annotation below
min_jaccard_index <- 0.4


### INPUT FILES ###
## merged de novo transcriptome (GTF) and the repeat annotation (RDS)
globTrx.gtf <- "/tungstenfs/groups/gpeters/rohmalex/work/Mark/RNA-seq_181127/31_transcriptomeAssembly/wholeProcess_190410/42_modelMerged/compressedIndexedGtf/globalTranscriptome.gtf.gz"
repeats_rds <- "/tungstenfs/scratch/gbioinfo_work/gpeters/ozonevge/genomes/mm10/repeats/rmskAlignBaseline_Non_overlapingmin5bp_withDistToGene_23Aug17.rds"

### LOAD TRANSCRIPTOME (transcript and exon records in one object) ###
library(rtracklayer)
glob_trx_gr <- import(globTrx.gtf)

### LOAD REPEATS ###
repeats.gr <- readRDS(repeats_rds)


### function for transcriptome annotation reg.to intersection with repeats
## Annotate a transcriptome with the repeat element overlapping each exon.
##
## Arguments:
##   trxs_gr     - GRanges holding both "transcript" and "exon" records (column
##                 `type`), with `transcript_id` and `exon_number` columns
##   reps_gr     - named GRanges of repeats with `repName`, `repClass` and
##                 `repFamily` columns
##   min.JI      - minimum Jaccard index (intersection / union width) an
##                 exon-repeat overlap needs to be kept
##   min.overlap - minimum overlap width passed to findOverlaps()
##
## Returns a GRanges of transcripts and exons (ordered by transcript, type and
## strand-aware exon number). Exons carry the best-overlapping repeat (highest
## Jaccard index); transcripts carry ExonRepInt / FirstExonRepInt flags and the
## repeat annotation of their first exon. Strand is ignored for the overlaps.
annotate_repeat_intersections <- function(trxs_gr,reps_gr,min.JI = 0.0,min.overlap = 5L){
  ## split transcripts and exons
  transcripts_gr <- trxs_gr[which(trxs_gr$type == "transcript")]
  exons_gr <- trxs_gr[which(trxs_gr$type == "exon")]

  ## renumber exons taking strand into account: on the "-" strand the
  ## numbering is reversed within each transcript
  exons_gr$exon_number <- as.integer(exons_gr$exon_number)
  totexnnumb <- tapply(exons_gr$exon_number, exons_gr$transcript_id, max)
  exons_gr$strand_exon_number <- ifelse(
    strand(exons_gr) != "-",
    exons_gr$exon_number,
    totexnnumb[exons_gr$transcript_id] - exons_gr$exon_number + 1
  )
  exons_gr$exon_id <- paste(exons_gr$transcript_id,
                            exons_gr$strand_exon_number, sep = ":")

  ### OVERLAP EXONS WITH REPEATS ###
  ## one row per exon/repeat hit; repeat columns are NA later on for exons
  ## without any retained hit
  exons_vs_reps <- as.data.frame(findOverlaps(exons_gr, reps_gr,
                                              ignore.strand = TRUE,
                                              minoverlap = min.overlap))
  exons_vs_reps$exon_id <- exons_gr$exon_id[exons_vs_reps$queryHits]
  exons_vs_reps$repElement <- names(reps_gr)[exons_vs_reps$subjectHits]
  exons_vs_reps$repName <- reps_gr$repName[exons_vs_reps$subjectHits]
  exons_vs_reps$repClass <- reps_gr$repClass[exons_vs_reps$subjectHits]
  exons_vs_reps$repFamily <- reps_gr$repFamily[exons_vs_reps$subjectHits]

  ## widths of intersection and union per hit, and their ratio (Jaccard index)
  hit_exons <- exons_gr[exons_vs_reps$queryHits]
  hit_reps <- reps_gr[exons_vs_reps$subjectHits]
  exons_vs_reps$intersect_len <- width(pintersect(hit_exons, hit_reps,
                                                  ignore.strand = TRUE))
  exons_vs_reps$union_len <- width(punion(hit_exons, hit_reps,
                                          ignore.strand = TRUE))
  exons_vs_reps$jaccard_exon_vs_rep <-
    exons_vs_reps$intersect_len / exons_vs_reps$union_len

  ## select overlaps with Jaccard index of at least the cutoff
  exons_vs_reps <- exons_vs_reps[exons_vs_reps$jaccard_exon_vs_rep >= min.JI, ]

  ## per exon, keep the hit with the highest Jaccard index
  exons_vs_reps_unique <- exons_vs_reps[order(exons_vs_reps$queryHits,
                                              -exons_vs_reps$jaccard_exon_vs_rep), ]
  exons_vs_reps_unique <-
    exons_vs_reps_unique[!duplicated(exons_vs_reps_unique$queryHits), ]

  ## add repeat info to the exon metadata; columns are selected by name so the
  ## result does not depend on the column order of the hit table
  rep_cols <- c("repElement", "repName", "repClass", "repFamily",
                "intersect_len", "union_len", "jaccard_exon_vs_rep")
  exon_metdat <- mcols(exons_gr)
  exon_metdat <- cbind(
    exon_metdat,
    exons_vs_reps_unique[match(exon_metdat$exon_id,
                               exons_vs_reps_unique$exon_id), rep_cols]
  )
  mcols(exons_gr) <- exon_metdat

  ## add repeat annotation for transcripts
  ## columns for transcript annotation
  ## 1. ExonRepInt - logical
  ## 2. FirstExonRepInt  - logical
  ## 3. FirstExonRepElement - chr
  ## 4. FirstExonRepName - chr
  ## 5. FirstExonRepClass - chr
  ## 6. FirstExonRepFamily - chr
  ## 7. intersect_len - integer
  ## 8. union_len - integer
  ## 9. jaccard_exon_vs_rep - numeric
  has_rep <- !is.na(exons_gr$repElement)
  is_first_exon <- exons_gr$strand_exon_number == 1

  trx_ids_int_rep <- unique(exons_gr$transcript_id[which(has_rep)])
  transcripts_gr$ExonRepInt <- transcripts_gr$transcript_id %in% trx_ids_int_rep

  first_exons_inrep_gr <- exons_gr[which(is_first_exon & has_rep)]
  transcripts_gr$FirstExonRepInt <-
    transcripts_gr$transcript_id %in% first_exons_inrep_gr$transcript_id

  ## row of the repeat-overlapping first exon for each transcript (NA if none);
  ## computed once and reused for every copied column
  first_idx <- match(transcripts_gr$transcript_id,
                     first_exons_inrep_gr$transcript_id)
  transcripts_gr$FirstExonRepElement <- first_exons_inrep_gr$repElement[first_idx]
  transcripts_gr$FirstExonRepName <- first_exons_inrep_gr$repName[first_idx]
  transcripts_gr$FirstExonRepClass <- first_exons_inrep_gr$repClass[first_idx]
  transcripts_gr$FirstExonRepFamily <- first_exons_inrep_gr$repFamily[first_idx]
  transcripts_gr$intersect_len <- first_exons_inrep_gr$intersect_len[first_idx]
  transcripts_gr$union_len <- first_exons_inrep_gr$union_len[first_idx]
  transcripts_gr$jaccard_exon_vs_rep <-
    first_exons_inrep_gr$jaccard_exon_vs_rep[first_idx]

  ## library() (not require()) so a missing genome package fails right here
  library(BSgenome.Mmusculus.UCSC.mm10)
  seqinfo(transcripts_gr) <- seqinfo(BSgenome.Mmusculus.UCSC.mm10)
  seqinfo(exons_gr) <- seqinfo(BSgenome.Mmusculus.UCSC.mm10)
  trxs_gr_w_rep <- c(transcripts_gr,
                     exons_gr)
  trxs_gr_w_rep <- trxs_gr_w_rep[order(trxs_gr_w_rep$transcript_id,
                                       trxs_gr_w_rep$type,
                                       trxs_gr_w_rep$strand_exon_number)]

  return(trxs_gr_w_rep)

}


### ANNOTATE TRANSCRIPTOME ###
## repeat annotation of the real transcriptome at the chosen Jaccard cutoff
glob_trx_gr_w_rep <- annotate_repeat_intersections(
  trxs_gr = glob_trx_gr,
  reps_gr = repeats.gr,
  min.JI = min_jaccard_index
)

## write the annotated transcriptome as GTF into the tables folder
export(
  object = glob_trx_gr_w_rep,
  con = paste0(tables.folder, scriptnb,
               "globalTranscriptome_w_repannot_minJaccardIndex_",
               min_jaccard_index, ".gtf")
)

## and as RDS (relative path, i.e. into the current working directory)
saveRDS(
  object = glob_trx_gr_w_rep,
  file = paste0("01_globalTranscriptome_w_repannot_minJaccardIndex_",
                min_jaccard_index, ".rds")
)
####


#### shuffling transcriptome to get null distribution for repeat intersection ####



### INPUT FILES ###
## same transcriptome GTF and repeat annotation as in the previous section
globTrx.gtf <- "/tungstenfs/groups/gpeters/rohmalex/work/Mark/RNA-seq_181127/31_transcriptomeAssembly/wholeProcess_190410/42_modelMerged/compressedIndexedGtf/globalTranscriptome.gtf.gz"
repeats_rds <- "/tungstenfs/scratch/gbioinfo_work/gpeters/ozonevge/genomes/mm10/repeats/rmskAlignBaseline_Non_overlapingmin5bp_withDistToGene_23Aug17.rds"

### LOAD TRANSCRIPTOME (transcript and exon records in one object) ###
library(rtracklayer)
glob_trx_gr <- import(globTrx.gtf)

### LOAD REPEATS ###
repeats.gr <- readRDS(repeats_rds)

### FUNCTIONS FOR ANNOTATION AND SHUFFLING ###

### function for transcriptome annotation reg.to intersection with repeats
## Annotate a transcriptome with the repeat element overlapping each exon,
## WITHOUT applying a Jaccard-index cutoff (this redefinition drops the
## min.JI argument; every overlap of at least `min.overlap` is considered).
##
## Arguments:
##   trxs_gr     - GRanges holding both "transcript" and "exon" records (column
##                 `type`), with `transcript_id` and `exon_number` columns
##   reps_gr     - named GRanges of repeats with `repName`, `repClass` and
##                 `repFamily` columns
##   min.overlap - minimum overlap width passed to findOverlaps()
##
## Returns a GRanges of transcripts and exons (ordered by transcript, type and
## strand-aware exon number). Exons carry the best-overlapping repeat (highest
## Jaccard index); transcripts carry ExonRepInt / FirstExonRepInt flags and the
## repeat annotation of their first exon. Strand is ignored for the overlaps.
annotate_repeat_intersections <- function(trxs_gr,reps_gr,
                                          #min.JI = 0.0,
                                          min.overlap = 5L){
  ## split transcripts and exons
  transcripts_gr <- trxs_gr[which(trxs_gr$type == "transcript")]
  exons_gr <- trxs_gr[which(trxs_gr$type == "exon")]

  ## renumber exons taking strand into account: on the "-" strand the
  ## numbering is reversed within each transcript
  exons_gr$exon_number <- as.integer(exons_gr$exon_number)
  totexnnumb <- tapply(exons_gr$exon_number, exons_gr$transcript_id, max)
  exons_gr$strand_exon_number <- ifelse(
    strand(exons_gr) != "-",
    exons_gr$exon_number,
    totexnnumb[exons_gr$transcript_id] - exons_gr$exon_number + 1
  )
  exons_gr$exon_id <- paste(exons_gr$transcript_id,
                            exons_gr$strand_exon_number, sep = ":")

  ### OVERLAP EXONS WITH REPEATS ###
  ## one row per exon/repeat hit; repeat columns are NA later on for exons
  ## without any hit
  exons_vs_reps <- as.data.frame(findOverlaps(exons_gr, reps_gr,
                                              ignore.strand = TRUE,
                                              minoverlap = min.overlap))
  exons_vs_reps$exon_id <- exons_gr$exon_id[exons_vs_reps$queryHits]
  exons_vs_reps$repElement <- names(reps_gr)[exons_vs_reps$subjectHits]
  exons_vs_reps$repName <- reps_gr$repName[exons_vs_reps$subjectHits]
  exons_vs_reps$repClass <- reps_gr$repClass[exons_vs_reps$subjectHits]
  exons_vs_reps$repFamily <- reps_gr$repFamily[exons_vs_reps$subjectHits]

  ## widths of intersection and union per hit, and their ratio (Jaccard index)
  hit_exons <- exons_gr[exons_vs_reps$queryHits]
  hit_reps <- reps_gr[exons_vs_reps$subjectHits]
  exons_vs_reps$intersect_len <- width(pintersect(hit_exons, hit_reps,
                                                  ignore.strand = TRUE))
  exons_vs_reps$union_len <- width(punion(hit_exons, hit_reps,
                                          ignore.strand = TRUE))
  exons_vs_reps$jaccard_exon_vs_rep <-
    exons_vs_reps$intersect_len / exons_vs_reps$union_len

  ## (no Jaccard cutoff is applied in this version of the function)

  ## per exon, keep the hit with the highest Jaccard index
  exons_vs_reps_unique <- exons_vs_reps[order(exons_vs_reps$queryHits,
                                              -exons_vs_reps$jaccard_exon_vs_rep), ]
  exons_vs_reps_unique <-
    exons_vs_reps_unique[!duplicated(exons_vs_reps_unique$queryHits), ]

  ## add repeat info to the exon metadata; columns are selected by name so the
  ## result does not depend on the column order of the hit table
  rep_cols <- c("repElement", "repName", "repClass", "repFamily",
                "intersect_len", "union_len", "jaccard_exon_vs_rep")
  exon_metdat <- mcols(exons_gr)
  exon_metdat <- cbind(
    exon_metdat,
    exons_vs_reps_unique[match(exon_metdat$exon_id,
                               exons_vs_reps_unique$exon_id), rep_cols]
  )
  mcols(exons_gr) <- exon_metdat

  ## add repeat annotation for transcripts
  ## columns for transcript annotation
  ## 1. ExonRepInt - logical
  ## 2. FirstExonRepInt  - logical
  ## 3. FirstExonRepElement - chr
  ## 4. FirstExonRepName - chr
  ## 5. FirstExonRepClass - chr
  ## 6. FirstExonRepFamily - chr
  ## 7. intersect_len - integer
  ## 8. union_len - integer
  ## 9. jaccard_exon_vs_rep - numeric
  has_rep <- !is.na(exons_gr$repElement)
  is_first_exon <- exons_gr$strand_exon_number == 1

  trx_ids_int_rep <- unique(exons_gr$transcript_id[which(has_rep)])
  transcripts_gr$ExonRepInt <- transcripts_gr$transcript_id %in% trx_ids_int_rep

  first_exons_inrep_gr <- exons_gr[which(is_first_exon & has_rep)]
  transcripts_gr$FirstExonRepInt <-
    transcripts_gr$transcript_id %in% first_exons_inrep_gr$transcript_id

  ## row of the repeat-overlapping first exon for each transcript (NA if none);
  ## computed once and reused for every copied column
  first_idx <- match(transcripts_gr$transcript_id,
                     first_exons_inrep_gr$transcript_id)
  transcripts_gr$FirstExonRepElement <- first_exons_inrep_gr$repElement[first_idx]
  transcripts_gr$FirstExonRepName <- first_exons_inrep_gr$repName[first_idx]
  transcripts_gr$FirstExonRepClass <- first_exons_inrep_gr$repClass[first_idx]
  transcripts_gr$FirstExonRepFamily <- first_exons_inrep_gr$repFamily[first_idx]
  transcripts_gr$intersect_len <- first_exons_inrep_gr$intersect_len[first_idx]
  transcripts_gr$union_len <- first_exons_inrep_gr$union_len[first_idx]
  transcripts_gr$jaccard_exon_vs_rep <-
    first_exons_inrep_gr$jaccard_exon_vs_rep[first_idx]

  ## library() (not require()) so a missing genome package fails right here
  library(BSgenome.Mmusculus.UCSC.mm10)
  seqinfo(transcripts_gr) <- seqinfo(BSgenome.Mmusculus.UCSC.mm10)
  seqinfo(exons_gr) <- seqinfo(BSgenome.Mmusculus.UCSC.mm10)
  trxs_gr_w_rep <- c(transcripts_gr,
                     exons_gr)
  trxs_gr_w_rep <- trxs_gr_w_rep[order(trxs_gr_w_rep$transcript_id,
                                       trxs_gr_w_rep$type,
                                       trxs_gr_w_rep$strand_exon_number)]

  return(trxs_gr_w_rep)

}



## function for shuffling trxs
## Move every element of a GRangesList to a random genomic position.
##
## Each list element is relocated as one unit: a random chromosome longer than
## the element's span is drawn from `chr.set`, a random start is drawn on it,
## and all ranges of the element are shifted by the same offset, so their
## relative layout is preserved.
##
## Arguments:
##   x       - GRangesList with known seqlengths (no NA allowed)
##   chr.set - names of the chromosomes that may receive shuffled elements
##   ncores  - number of cores handed to mcmapply()
##
## Returns a GRangesList with the same names and element sizes as `x`.
randomizeGRangesList <- function(x,
                                 ## NOTE(review): the second argument of
                                 ## unique() is `incomparables`, so this
                                 ## default evaluates to chr1..chr19, chrX,
                                 ## chrY regardless of seqlevelsInUse(x);
                                 ## left unchanged to keep the default as is
                                 chr.set = unique(paste0("chr",c(1:19,"X","Y")),seqlevelsInUse(x)),
                                 ncores = 10){
  library(parallel)
  stopifnot(!any(is.na(seqlengths(x))))
  chrlens <- seqlengths(x)
  chrlens <- chrlens[names(chrlens) %in% chr.set]

  pick_rand_pos <- function(gr, chrlens){
    ## span covered by this element and its width
    gspan <- range(gr)
    spw <- width(gspan)

    ## pick a random chromosome that is longer than the span
    candidate_chrs <- names(chrlens)[chrlens > spw]
    if (length(candidate_chrs) == 0L) {
      stop("no chromosome in 'chr.set' is long enough for a span of width ",
           spw, call. = FALSE)
    }
    rchr <- sample(candidate_chrs, size = 1)

    ## pick a random start on that chromosome; sample.int() draws from
    ## 1..max_start like sample(1:max_start, 1) but cannot silently fall back
    ## to c(1, 0) when max_start is 0
    max_start <- chrlens[[rchr]] - spw - 1
    rpos <- if (max_start >= 1) sample.int(max_start, size = 1) else 1L

    ## move to the new chromosome; the old coordinates can temporarily lie
    ## beyond its end, which only triggers an out-of-bound warning
    suppressWarnings(seqnames(gr)[seq_along(gr)] <- rchr)

    ## shift so that the span starts at rpos. Warnings are muffled around the
    ## call only: a tryCatch(warning = ...) handler would abandon the
    ## assignment and return the ranges unshifted.
    gr <- suppressWarnings(IRanges::shift(gr, shift = rpos - start(gspan)))
    return(gr)
  }

  ## randomize every element; SIMPLIFY = FALSE guarantees a plain list
  rx <- GRangesList(mcmapply(FUN = pick_rand_pos,
                             gr = x,
                             MoreArgs = list(chrlens = chrlens),
                             SIMPLIFY = FALSE,
                             mc.cores = ncores))
  return(rx)
}


## Build the shuffled transcriptomes: records are grouped per gene so that a
## gene's transcripts and exons are relocated together
glob_trx_gr_grl <- split(glob_trx_gr, f = glob_trx_gr$gene_id)
library(BSgenome.Mmusculus.UCSC.mm10)
seqinfo(glob_trx_gr_grl) <- seqinfo(BSgenome.Mmusculus.UCSC.mm10)


ncores <- 30
nrounds <- 100
## NOTE(review): the shuffling runs in forked workers (mcmapply); with the
## default RNG kind a seed set here may not make those draws reproducible -
## confirm against the `parallel` RNG documentation
set.seed(100345524)
library(tictoc)
tic()
shuffled_trxs_repannot <- lapply(seq_len(nrounds), function(ri) {
  ## 1) relocate every gene to a random position
  message("shuffling round ", ri, "...")
  shuffled_grl <- randomizeGRangesList(glob_trx_gr_grl, ncores = ncores)

  ## 2) annotate the shuffled records against the repeats (no JI cutoff)
  message("annotating rep annotation in shuffling round ", ri, "...")
  annotate_repeat_intersections(trxs_gr = unlist(shuffled_grl),
                                reps_gr = repeats.gr)
})
toc()

## save shuffled trxs (list with one annotated GRanges per round)
saveRDS(
  object = shuffled_trxs_repannot,
  file = paste0("02_shuffled_globalTranscriptome_w_repannot_nrounds_",
                nrounds, "_r2.rds")
)
####
#### Jaccard CUTOFFS ####
min_jaccard_index <- 0.0


library(GenomicRanges)

## The re-annotated real + shuffled transcriptomes are cached in an RDS file
## named after the cutoff; they are rebuilt only when that file is missing.
trxs_annot_rds <- paste0("02_Trx_real_and_shuffled_annotation_minJI_",min_jaccard_index,".rds")
if(!file.exists(trxs_annot_rds)){
  ## load annotations
  ## NOTE(review): these file names differ from the ones written by the
  ## saveRDS() calls earlier in this script ("..._minJaccardIndex_<x>.rds" and
  ## "..._nrounds_100_r2.rds") - confirm the intended inputs. The code further
  ## down iterates over names(trxs_list), so the shuffled list is expected to
  ## be named.
  glob_trx_gr <- readRDS("01_globalTranscriptome_w_repannot.rds")
  shuffled_trxs_list <- readRDS("02_shuffled_globalTranscriptome_w_repannot_nrounds_100.rds")

  trxs_list <- c(list("real_trx"=glob_trx_gr),
                 shuffled_trxs_list)
  rm(shuffled_trxs_list)
  rm(glob_trx_gr)
  ### load gffcompare output to get codes
  gffcomp_data <- read.table("/tungstenfs/scratch/gbioinfo_work/gpeters/rohmalex/work/Mark/RNA-seq_181127/31_transcriptomeAssembly/wholeProcess_190410/42_modelMerged/gffcmp.globalTranscriptome.gtf.tmap",
                             header=TRUE)
  row.names(gffcomp_data) <- gffcomp_data$qry_id
  table(gffcomp_data$class_code)

  ##### FUNCTION FOR REANNOTATING THE TRANSCRIPTS WRT MONO- MULTI-EXON, OVERLAP WITH GENCODE, REPEATS ETC ######
  ## Arguments:
  ##   trxs_gr - annotated GRanges of transcripts and exons (as produced by
  ##             annotate_repeat_intersections)
  ##   min.JI  - minimum exon-vs-repeat Jaccard index; repeat annotation of
  ##             exons below it (or without overlap) is set to NA
  ##   gffcmp  - gffcompare table with qry_id, ref_id, ref_gene_id, class_code
  ## Returns the GRanges with exon counts, mono-/multiexonic type, gffcompare
  ## codes and the transcript-level repeat columns recomputed for `min.JI`.
  reannotate_JI_cutoff <- function(trxs_gr,min.JI,gffcmp){
    ## split transcripts and exons
    transcripts_gr <- trxs_gr[which(trxs_gr$type == "transcript")]
    exons_gr <- trxs_gr[which(trxs_gr$type == "exon")]

    ## number of exons per transcript -> mono- vs multiexonic
    trx_exon_counts <- elementNROWS(split(exons_gr$exon_id,exons_gr$transcript_id))
    transcripts_gr$exon_counts <- trx_exon_counts[as.character(transcripts_gr$transcript_id)]

    transcripts_gr$transcript_type <- ifelse(transcripts_gr$exon_counts == 1,"monoexonic","multiexonic")

    ### add gffcompare data to transcripts (row lookup computed once)
    gff_idx <- match(transcripts_gr$transcript_id, gffcmp$qry_id)
    transcripts_gr$ref_id <- gffcmp$ref_id[gff_idx]
    transcripts_gr$ref_gene_id <- gffcmp$ref_gene_id[gff_idx]
    transcripts_gr$class_code <- gffcmp$class_code[gff_idx]

    ## filter exons acc.to JI: the mask is computed once from the untouched
    ## Jaccard column, so the result does not depend on the order in which the
    ## columns are blanked, and in-place NA assignment keeps each column's type
    passes_ji <- !is.na(exons_gr$jaccard_exon_vs_rep) &
      exons_gr$jaccard_exon_vs_rep >= min.JI
    for (rep_col in c("intersect_len", "union_len", "jaccard_exon_vs_rep",
                      "repElement", "repName", "repClass", "repFamily")) {
      col_vals <- mcols(exons_gr)[[rep_col]]
      col_vals[!passes_ji] <- NA
      mcols(exons_gr)[[rep_col]] <- col_vals
    }

    ## add annotation for transcripts
    has_rep <- !is.na(exons_gr$repElement)
    is_first_exon <- exons_gr$strand_exon_number == 1

    trx_ids_int_rep <- unique(exons_gr$transcript_id[which(has_rep)])
    transcripts_gr$ExonRepInt <- transcripts_gr$transcript_id %in% trx_ids_int_rep

    first_exons_inrep_gr <- exons_gr[which(is_first_exon & has_rep)]
    transcripts_gr$FirstExonRepInt <-
      transcripts_gr$transcript_id %in% first_exons_inrep_gr$transcript_id

    ## row of the repeat-overlapping first exon per transcript (NA if none)
    first_idx <- match(transcripts_gr$transcript_id,
                       first_exons_inrep_gr$transcript_id)
    transcripts_gr$FirstExonRepElement <- first_exons_inrep_gr$repElement[first_idx]
    transcripts_gr$FirstExonRepName <- first_exons_inrep_gr$repName[first_idx]
    transcripts_gr$FirstExonRepClass <- first_exons_inrep_gr$repClass[first_idx]
    transcripts_gr$FirstExonRepFamily <- first_exons_inrep_gr$repFamily[first_idx]
    transcripts_gr$intersect_len <- first_exons_inrep_gr$intersect_len[first_idx]
    transcripts_gr$union_len <- first_exons_inrep_gr$union_len[first_idx]
    transcripts_gr$jaccard_exon_vs_rep <-
      first_exons_inrep_gr$jaccard_exon_vs_rep[first_idx]


    trxs_gr_w_rep <- c(transcripts_gr,
                       exons_gr)
    trxs_gr_w_rep <- trxs_gr_w_rep[order(trxs_gr_w_rep$transcript_id,trxs_gr_w_rep$type,trxs_gr_w_rep$strand_exon_number)]

    return(trxs_gr_w_rep)
  }



  ## re-annotate the real and every shuffled transcriptome, then cache
  trxs_list <- endoapply(trxs_list,FUN = reannotate_JI_cutoff,
                         min.JI = min_jaccard_index,
                         gffcmp=gffcomp_data)
  saveRDS(trxs_list,
          file = trxs_annot_rds)
  gc()

} else {
  message("Loading previously created annotation")
  trxs_list <- readRDS(trxs_annot_rds)
}



### One data frame with the metadata of all multiexonic transcripts whose
### first exon overlaps a repeat, stacked over the real and the shuffled sets
multiexon_tss_trxs <- do.call(
  rbind,
  lapply(names(trxs_list), function(set_name) {
    set_mcols <- mcols(trxs_list[[set_name]])
    picked <- data.frame(subset(
      set_mcols,
      type == "transcript" & transcript_type == "multiexonic" & FirstExonRepInt
    ))
    ## remember which set the rows came from, and whether it is the real one
    picked$trx_name <- set_name
    picked$trx_type <- if (set_name == "real_trx") "real_trx" else "shuffled"
    picked
  })
)

## Reverse ECDF (1 - ECDF) of the first-exon Jaccard index per repeat name:
## one grey line per shuffled set, one red line for the real transcriptome
library(ggplot2)
multiexon_tss_trxs$trx_type <- factor(multiexon_tss_trxs$trx_type,
                                      levels = c("shuffled", "real_trx"))

## restrict to repeat names that occur in the real transcriptome
pres_repNames <- unique(
  multiexon_tss_trxs$FirstExonRepName[multiexon_tss_trxs$trx_name == "real_trx"]
)

multiexon_tss_trxs <-
  multiexon_tss_trxs[multiexon_tss_trxs$FirstExonRepName %in% pres_repNames, ]

## all repeat names, one facet each
repnm_revecdf <- ggplot(multiexon_tss_trxs) +
  facet_wrap(~FirstExonRepName) +
  geom_line(
    data = multiexon_tss_trxs[multiexon_tss_trxs$trx_type == "shuffled", ],
    mapping = aes(x = jaccard_exon_vs_rep,
                  y = after_stat(1-y),
                  group = trx_name,
                  alpha = trx_type),
    stat = "ecdf",
    color = "lightgrey",
    alpha = 0.25
  ) +
  geom_line(
    data = multiexon_tss_trxs[multiexon_tss_trxs$trx_type == "real_trx", ],
    mapping = aes(x = jaccard_exon_vs_rep,
                  y = after_stat(1-y),
                  group = trx_name,
                  alpha = trx_type),
    stat = "ecdf",
    color = "red",
    alpha = 1
  ) +
  theme_bw()
ggsave(
  filename = paste0(plots.folder, scriptnb, "RevECDF_repName_JI.pdf"),
  plot = repnm_revecdf,
  width = 40, height = 40
)

## same plot for a hand-picked set of repeat names (drawn, not saved)
seldat <- subset(
  multiexon_tss_trxs,
  FirstExonRepName %in% c("MER117","ORR1A2","ORR1A3","B3","IAPLTR3")
)
ggplot(seldat) +
  facet_wrap(~FirstExonRepName) +
  geom_line(
    data = seldat[seldat$trx_type == "shuffled", ],
    mapping = aes(x = jaccard_exon_vs_rep,
                  y = after_stat(1-y),
                  group = trx_name,
                  alpha = trx_type),
    stat = "ecdf",
    color = "lightgrey",
    alpha = 0.25
  ) +
  geom_line(
    data = seldat[seldat$trx_type == "real_trx", ],
    mapping = aes(x = jaccard_exon_vs_rep,
                  y = after_stat(1-y),
                  group = trx_name,
                  alpha = trx_type),
    stat = "ecdf",
    color = "red",
    alpha = 1
  ) +
  theme_bw()




## perform the following test
## 1. perform KS test comparing JI distribution for real_trx to combined shuffled trxs. get p-value
## 2. perform KS test comparing each round of permutation to combined shuffled trxs without the current shuffling round
## 3. from step 2 obtain null p-value distribution
## 4. estimate FDR

## Rows of the real (observed) transcriptome only, cached for later use
real_trx_data <- multiexon_tss_trxs[multiexon_tss_trxs$trx_name == "real_trx", ]
saveRDS(
  object = real_trx_data,
  file = "03_Multiexonic_firstexon_TEoverlap_observed_transcripts_metadat.rds"
)


## repeat names to be tested: those seen in the real transcriptome
sel_repNames <- unique(real_trx_data$FirstExonRepName)
library(parallel)
ks_pval_data <- do.call(rbind,mclapply(sel_repNames,
                                       function(repnm){
                                         message("Starting ",repnm)
                                         ## perform KS test for real vs shuffled
                                         repnm_dat <- subset(multiexon_tss_trxs,FirstExonRepName == repnm)
                                         #browser()
                                         real_ks_res <- ks.test(x=subset(repnm_dat,trx_name == "real_trx")$jaccard_exon_vs_rep,
                                                                y=subset(repnm_dat,trx_type == "shuffled")$jaccard_exon_vs_rep,
                                                                alternative = "less",
                                                                exact = TRUE)
                                         
                                         null_pvals <- do.call(rbind,lapply(unique(subset(repnm_dat,trx_type == "shuffled")$trx_name),
                                                                            function(shufname){
                                                                              message("Processing ",repnm," ",shufname)
                                                                              shuff_ks_res <- ks.test(x=subset(repnm_dat,trx_name == shufname)$jaccard_exon_vs_rep,
                                                                                                      y=subset(repnm_dat,trx_name != shufname)$jaccard_exon_vs_rep,
                                                                                                      alternative = "less",
                                                                                                      exact = TRUE)
                                                                              data.frame(trx_name = shufname,
                                                                                         ks_pval = shuff_ks_res$p.value,
                                                                                         ks_Dstat = shuff_ks_res$statistic)
                                                                            }))
                                         pval_dat <- rbind(data.frame(trx_name = "real_trx",
                                                                      ks_pval = real_ks_res$p.value,
                                                                      ks_Dstat = real_ks_res$statistic),
                                                           null_pvals)
                                         pval_dat$trx_type <- ifelse(pval_dat$trx_name == "real_trx","real_trx","shuffled")
                                         pval_dat$FirstExonRepName <- repnm
                                         return(pval_dat)
                                       },mc.cores = 50))

ks_pval_data <- ks_pval_data[order(ks_pval_data$ks_pval),]

saveRDS(object = ks_pval_data,
        file = paste0("03_KS_pval_statistics_for_real_and_shuffled_trxs_100rand.rds"))

## Histogram of null (shuffled) KS p-values per repeat name, with the observed
## p-value drawn as a red vertical line.
ks_hist <- local({
  null_rows <- subset(ks_pval_data, trx_type == "shuffled")
  obs_rows <- subset(ks_pval_data, trx_type == "real_trx")
  ggplot(ks_pval_data) +
    facet_wrap(~FirstExonRepName) +
    geom_histogram(data = null_rows, mapping = aes(x = ks_pval), fill = "blue") +
    geom_vline(data = obs_rows, mapping = aes(xintercept = ks_pval), color = "red") +
    theme_bw()
})
ggsave(
  filename = "03_Histograms_KS_pvalue_and_null_pval.pdf",
  plot = ks_hist,
  width = 40,
  height = 40
)


## Same comparison shown as the ECDF of the null p-values, with the observed
## p-value printed next to its marker line.
ks_ecdf <- local({
  null_rows <- subset(ks_pval_data, trx_type == "shuffled")
  obs_rows <- subset(ks_pval_data, trx_type == "real_trx")
  ggplot(ks_pval_data) +
    facet_wrap(~FirstExonRepName) +
    geom_step(data = null_rows, mapping = aes(x = ks_pval), color = "blue", stat = "ecdf") +
    geom_vline(data = obs_rows, mapping = aes(xintercept = ks_pval), color = "red") +
    geom_text(
      data = obs_rows,
      mapping = aes(x = ks_pval, y = 1,
                    label = paste0("KS p-val: ", format(ks_pval, digits = 2))),
      nudge_x = 0.01, hjust = 0, vjust = 1, color = "red", size = 1
    ) +
    theme_bw()
})
ggsave(
  filename = "03_ECDF_KS_pvalue_and_null_pval.pdf",
  plot = ks_ecdf,
  width = 40,
  height = 40
)


## Close-up of three repeat names (displayed, not saved).
test_ks_data <- subset(ks_pval_data, FirstExonRepName %in% c("IAPLTR3", "RLTR31C_MM", "MERVL"))
test_ks_hist <- local({
  null_rows <- subset(test_ks_data, trx_type == "shuffled")
  obs_rows <- subset(test_ks_data, trx_type == "real_trx")
  ggplot(test_ks_data) +
    facet_wrap(~FirstExonRepName) +
    geom_step(data = null_rows, mapping = aes(x = ks_pval), color = "blue", stat = "ecdf") +
    geom_vline(data = obs_rows, mapping = aes(xintercept = ks_pval), color = "red") +
    geom_text(
      data = obs_rows,
      mapping = aes(x = ks_pval, y = 1,
                    label = paste0("KS p-val: ", format(ks_pval, digits = 2))),
      nudge_x = 0.01, hjust = 0, vjust = 1, color = "red"
    ) +
    theme_bw()
})
test_ks_hist

## D statistic against p-value for a single repeat name, coloured by
## observed / shuffled.
test_ks_data <- subset(ks_pval_data, FirstExonRepName %in% c("IAPLTR3"))

ggplot(test_ks_data, aes(x = ks_Dstat, y = ks_pval)) +
  geom_point(aes(color = trx_type))

#### obtaining p-values of comparing KS stats of real to shuffled

## For each repeat name, turn the observed KS p-value into an empirical
## permutation p-value: the share of shuffling rounds whose KS p-value is at
## least as small as the observed one, with the usual +1 correction so the
## estimate is never exactly zero.
repnames_aggr_pvals <- do.call(rbind, lapply(sel_repNames, function(repnm) {
  realdat <- subset(ks_pval_data, FirstExonRepName == repnm & trx_type == "real_trx")
  shuffdat <- subset(ks_pval_data, FirstExonRepName == repnm & trx_type == "shuffled")

  ## number of shuffling rounds with a p-value at or below the observed one
  nfp <- sum(shuffdat$ks_pval <= realdat$ks_pval)

  realdat$perm_pval <- (nfp + 1) / (nrow(shuffdat) + 1)
  realdat
}))
## p.adjust() defaults to method = "holm" (family-wise error rate). This column
## is meant to hold a false discovery rate, so Benjamini-Hochberg is requested
## explicitly.
repnames_aggr_pvals$fdr <- p.adjust(repnames_aggr_pvals$perm_pval, method = "BH")
repnames_aggr_pvals <- repnames_aggr_pvals[order(repnames_aggr_pvals$ks_pval),]

## add info of how many trx overlap with each repeat
repname_counts <- table(real_trx_data$FirstExonRepName)
repnames_aggr_pvals$FirstExonRepName_count <- repname_counts[as.character(repnames_aggr_pvals$FirstExonRepName)]
## save results of this test
saveRDS(repnames_aggr_pvals,file=paste0(rds.folder,scriptnb,"KS_permutation_test_results_summarizes.rds"))



#### TEST FOR CALCULATING Z-SCORES AT CERTAIN CUTOFFS FOR JI ####
## Jaccard-index cutoffs at which the reverse ECDF is evaluated
ji_cutvec <- c(0.2,
               0.4,
               0.6,
               0.8
)

## For every repeat name: evaluate 1 - ECDF of the Jaccard index at each cutoff
## for the observed and for every shuffled transcriptome, then compare the
## observed value with the shuffled ones by a z-score and by a one-sided
## Wilcoxon test. Returns one row per (repeat name, cutoff).
zscore_JIcuts_data <- do.call(rbind, lapply(sel_repNames, function(repnm) {
  message("Starting ", repnm)

  repnm_dat <- subset(multiexon_tss_trxs, FirstExonRepName == repnm)

  ## one ECDF per transcriptome (observed and each shuffling round)
  ecdf_list <- tapply(repnm_dat$jaccard_exon_vs_rep, repnm_dat$trx_name,
                      ecdf, simplify = FALSE)

  ## rows = transcriptomes, columns = cutoffs; entries are 1 - ECDF(cutoff)
  cutecdfs <- do.call(rbind, lapply(ecdf_list, function(e) {
    ecut <- 1 - e(ji_cutvec)
    names(ecut) <- ji_cutvec
    ecut
  }))

  ## rows are located by pattern on the transcriptome name
  real_idx <- grep("real_trx", row.names(cutecdfs))
  shuff_idx <- grep("shuffled", row.names(cutecdfs))

  ## z-score of the observed value against the shuffled values, per cutoff
  zscores_ecdfs <- apply(cutecdfs, 2, function(x) {
    (x[real_idx] - mean(x[shuff_idx], na.rm = TRUE)) / sd(x[shuff_idx], na.rm = TRUE)
  })

  ## one-sided Wilcoxon test (observed greater than shuffled), per cutoff
  wilc_res <- apply(cutecdfs, 2, function(x) {
    wilcox.test(x = x[real_idx],
                y = x[shuff_idx],
                alternative = "greater")$p.value
  })

  data.frame(FirstExonRepName = repnm,
             JIcutoff = as.numeric(names(zscores_ecdfs)),
             revECDF_Zscore = zscores_ecdfs,
             wilc_Pval = wilc_res)
}))

zscore_JIcuts_data$FirstExonRepName_count <- repname_counts[as.character(zscore_JIcuts_data$FirstExonRepName)]
## p.adjust() defaults to Holm; the column is an FDR, so request
## Benjamini-Hochberg explicitly.
zscore_JIcuts_data$fdr_wilc <- p.adjust(zscore_JIcuts_data$wilc_Pval, method = "BH")
ggplot(zscore_JIcuts_data,aes(x=wilc_Pval))+
  geom_histogram()
## save
saveRDS(object = zscore_JIcuts_data,
        file = paste0("03_Zscore_for_revECDFs_at_JI_cutoffs.rds"))

## create matrix: rows = repeat names, columns = JI cutoffs, values = z-scores.
## Only repeat names with at least `min_trxs` observed transcripts are kept.
library(reshape2)
min_trxs <- 50
zscore_mat <- acast(subset(zscore_JIcuts_data, FirstExonRepName_count >= min_trxs),
                    formula = FirstExonRepName ~ JIcutoff,
                    value.var = "revECDF_Zscore")
## non-finite z-scores are treated as missing
zscore_mat[!is.finite(zscore_mat)] <- NA
## order by max zscore
rowmax <- apply(zscore_mat, 1, max, na.rm = TRUE)
rowmax <- sort(rowmax, decreasing = TRUE)
## drop = FALSE keeps a matrix (with row names) even if only one repeat name
## is left; later code relies on row.names(zscore_mat)
zscore_mat <- zscore_mat[names(rowmax), , drop = FALSE]


library(ComplexHeatmap)
## quick looks: repeat names with max z-score >= 2, then all of them
Heatmap(zscore_mat[rowmax >= 2, , drop = FALSE], cluster_rows = FALSE, cluster_columns = FALSE)
Heatmap(zscore_mat, cluster_rows = FALSE, cluster_columns = FALSE)

## Figure heatmap: z-scores for every repeat name retained in zscore_mat,
## annotated on the left with the number of observed transcripts.
library(circlize)
library(RColorBrewer)
## colour scale over the z-score range 5 .. -5
zscore_col <- colorRamp2(
  breaks = seq(5, -5, length.out = 11),
  colors = brewer.pal(11, "Spectral")
)

trx_counts <- repname_counts[row.names(zscore_mat)]
zscore_hm <- local({
  ## text annotation with the transcript count of each row
  count_anno <- rowAnnotation(
    "# trxs" = anno_text(
      trx_counts,
      just = 1,
      location = unit(1, "npc"),
      width = unit(1, "cm")
    ),
    show_annotation_name = TRUE,
    annotation_label = "transcripts",
    annotation_name_side = "top",
    border = TRUE
  )
  legend_opts <- list(
    title = "Z-score",
    direction = "horizontal",
    legend_width = unit(0.2, "npc"),
    at = c(-5, -2.5, 0, 2.5, 5),
    title_position = "topcenter"
  )
  Heatmap(
    zscore_mat,
    col = zscore_col,
    cluster_rows = FALSE,
    cluster_columns = FALSE,
    left_annotation = count_anno,
    column_names_side = "top",
    column_names_rot = 0,
    column_names_centered = TRUE,
    row_names_side = "left",
    heatmap_legend_param = legend_opts
  )
})

pdf("03_Heatmap_permZscores_for_revECDF_at_JIcuts.pdf", width = 3.5, height = 10)
draw(zscore_hm, heatmap_legend_side = "bottom")
dev.off()

## Figure heatmap restricted to the ten repeat names with the highest maximum
## z-score, with larger fonts.
sel_zscores_mat <- zscore_mat[head(names(rowmax), 10), ]
trx_counts <- repname_counts[row.names(sel_zscores_mat)]
sel_zscore_hm <- local({
  ## text annotation with the transcript count of each row
  count_anno <- rowAnnotation(
    "# trxs" = anno_text(
      trx_counts,
      just = 1,
      location = unit(1, "npc"),
      width = unit(1, "cm"),
      gp = gpar(fontsize = 14)
    ),
    show_annotation_name = TRUE,
    annotation_label = "transcripts",
    annotation_name_side = "top",
    border = TRUE
  )
  legend_opts <- list(
    title = "Z-score",
    direction = "horizontal",
    legend_width = unit(0.2, "npc"),
    at = c(-5, -2.5, 0, 2.5, 5),
    title_position = "topcenter"
  )
  Heatmap(
    sel_zscores_mat,
    col = zscore_col,
    cluster_rows = FALSE,
    cluster_columns = FALSE,
    left_annotation = count_anno,
    column_names_side = "top",
    column_names_rot = 0,
    column_names_centered = TRUE,
    column_names_gp = gpar(fontsize = 14, fontface = "bold"),
    row_names_gp = gpar(fontsize = 18, fontface = "bold"),
    row_names_side = "left",
    heatmap_legend_param = legend_opts
  )
})

pdf("03_Heatmap_SELECTED_permZscores_for_revECDF_at_JIcuts.pdf", width = 3.5, height = 4)
draw(sel_zscore_hm, heatmap_legend_side = "bottom")
dev.off()


### plot rev cumulatives for the 6 repNames with the highest max z-score
## (the variable keeps its historical name `top4repNames`)
## head() returns fewer entries instead of NA padding when fewer than six
## repeat names are available.
top4repNames <- head(sort(rowmax, decreasing = TRUE), 6)
## as.vector() strips the table class: a 1-d table passed to data.frame() is
## expanded into two columns (<name>.Var1 / <name>.Freq) rather than a single
## `trx_counts` column.
trxcounts <- data.frame(FirstExonRepName = names(top4repNames),
                        trx_counts = as.vector(trx_counts[names(top4repNames)]))
seldat <- subset(multiexon_tss_trxs, FirstExonRepName %in% trxcounts$FirstExonRepName)
library(ggrastr)
## one panel per repeat name, ordered by decreasing max z-score;
## shuffled transcriptomes in transparent red, observed transcripts in black
revcum_plots <- ggplot(seldat)+
  facet_wrap(~factor(FirstExonRepName,
                     levels = c(trxcounts$FirstExonRepName))
  )+
  geom_line(data = subset(seldat, trx_type == "shuffled"),
            aes(x = jaccard_exon_vs_rep,
                y = after_stat(1 - y),
                group = trx_name,
                alpha = trx_type), stat = "ecdf",
            color = "red",
            alpha = 0.05,
            show.legend = TRUE)+
  geom_line(data = subset(seldat, trx_type == "real_trx"),
            aes(x = jaccard_exon_vs_rep,
                y = after_stat(1 - y),
                group = trx_name,
                alpha = trx_type), stat = "ecdf",
            color = "black",
            linewidth = 1.3,
            alpha = 1,
            show.legend = TRUE)+
  labs(y = "1-ECDF(x)", x = "Jaccard index")+
  scale_x_continuous(breaks = c(0, 0.5, 1))+
  theme_bw()+
  theme(strip.background = element_rect(fill = NA),
        strip.text = element_text(face = "bold", size = 18),
        axis.title = element_text(size = 16),
        axis.text = element_text(size = 14))

## rasterise only the line layers
revcum_plots <- rasterise(revcum_plots, layers = "Line", dpi = 500)

ggsave(filename = paste0("03_revECDF_JI_SELECTED_repNames_v1.pdf"),
       plot = revcum_plots,
       width = 8, height = 5.5)



#### PLOT GENOMEWIDE EXPRESSION OF ENRICHED TE AND EXPRESSION OF TRANSCRIPTS ####
## Read counts aggregated per repeat name
repeats_RC <- readRDS("RDS/04_readcounts_repeats_repName_aggr.rds")

## QuasR project, used only to look up the number of mapped reads per sample
sampleFile <- "/tungstenfs/groups/gpeters/rohmalex/work/Mark/RNA-seq_181127/31_transcriptomeAssembly/wholeProcess_190410/102_MarksRNAseq_BAM.qsample"
genome.mm10.ercc <- "/work2/gpeters/ozonevge/genomes/mm10/BSgenome.Mmusculus.UCSC.mm10.merged.ERCC92.fa"

library(QuasR)

proj <- qAlign(
  sampleFile = sampleFile,
  genome = genome.mm10.ercc,
  paired = "fr"
)

mapstat <- alignmentStats(proj)
## strip the ":genome" part from the sample row names
row.names(mapstat) <- gsub(":genome", "", row.names(mapstat))

## Keep rows whose first column is at least 1000
## (presumably the aggregated repeat length -- TODO confirm)
repeats_RC <- repeats_RC[repeats_RC[, 1] >= 1000, ]


## log2 RPKM; library sizes are the mapped read counts of the sample columns
## (every column of repeats_RC except the first)
repeats_lRPKM <- get_log2RPKM(
  rc = repeats_RC,
  lib.size = mapstat[colnames(repeats_RC)[-1], "mapped"]
)

## Stage label of each sample column: five stages with three consecutive
## replicates each, in column order
stages <- factor(
  rep(c("LZ", "PD", "RS", "EES", "LES"), each = 3),
  levels = c("LZ", "PD", "RS", "EES", "LES")
)
names(stages) <- colnames(repeats_lRPKM)

## Mean log2 RPKM across the replicates of each stage
repeats_mean_lRPKM <- do.call(
  cbind,
  tapply(
    names(stages), stages,
    function(snms) {
      rowMeans(repeats_lRPKM[, snms, drop = FALSE])
    },
    simplify = FALSE
  )
)
## Expression relative to the row mean, per sample and per stage
repeats_relExpr <- repeats_lRPKM - rowMeans(repeats_lRPKM)
repeats_mean_relExpr <- repeats_mean_lRPKM - rowMeans(repeats_mean_lRPKM)


### load expression of transcripts
library(tximport)
## all salmon quant.sf files below the quantification folder
quant_files <- list.files(path ="/tungstenfs/scratch/gbioinfo_work/gpeters/rohmalex/work/Mark/RNA-seq_181127/31_transcriptomeAssembly/wholeProcess_190410/52_quantificationMergedTranscriptome",
                          recursive = T,full.names = T,
                          pattern = "quant.sf$")
library(stringr)
## sample name = 13th "/"-separated path component, with the "1647F<n>_" prefix
## and the "_quant" suffix removed and "Gill_" prepended.
## NOTE(review): the index 13 is tied to the depth of the path above.
snames<- do.call(rbind,str_split(quant_files,"\\/"))[,13]
snames <- gsub("^1647F\\d+_","",snames,perl=T)
snames <- gsub("_quant$","",snames,perl=T)
snames <- paste0("Gill_",snames)

names(quant_files) <- snames
## transcript-level import (no gene-level summarisation)
trx_salm_quant <- tximport(quant_files, type = "salmon",
                           txIn = TRUE,
                           txOut = TRUE)
trx_salm_abund <- trx_salm_quant[["abundance"]]

## rename trx ids: map the FASTA identifiers used by salmon to transcript_id
library(rtracklayer)
trx_ids_conv_table <- import("/tungstenfs/scratch/gbioinfo_work/gpeters/rohmalex/work/Mark/RNA-seq_181127/31_transcriptomeAssembly/wholeProcess_190410/42_modelMerged/globalTranscriptome.gtf")
trx_ids_conv_table <- trx_ids_conv_table[which(trx_ids_conv_table$type == "transcript")]
trx_ids_conv_table <- mcols(trx_ids_conv_table)
## The two identifier sets are expected to be identical. A bare setequal() call
## is silently ignored when the script runs non-interactively, so report a
## mismatch explicitly.
if (!setequal(trx_ids_conv_table$fastaIdentifier, row.names(trx_salm_abund))) {
  warning("fastaIdentifier values in the GTF and salmon transcript names differ; ",
          "salmon transcripts missing from the GTF get NA row names",
          call. = FALSE)
}
row.names(trx_salm_abund) <- trx_ids_conv_table$transcript_id[match(row.names(trx_salm_abund),
                                                                    trx_ids_conv_table$fastaIdentifier)]

## log2 abundance with a pseudocount of 1
trx_salm_log2abund <- log2(trx_salm_abund + 1)

## calculate mean across replicates (sample-to-stage assignment from `stages`)
trx_salm_log2abund_mean <- do.call(cbind,
                                   tapply(names(stages),stages,
                                          function(snms){
                                            rowMeans(trx_salm_log2abund[,snms,drop=F])
                                          },simplify = F))


## expression relative to the mean over stages
trx_salm_relExp <- trx_salm_log2abund_mean - rowMeans(trx_salm_log2abund_mean)

## Relative expression of the selected repeat names (rows in the same order as
## the selected z-score heatmap)
sel_repnames_expr <- repeats_mean_relExpr[row.names(sel_zscores_mat), ]

## shared colour scale for both expression heatmaps (-1 .. 1)
exp_col <- colorRamp2(
  breaks = seq(-1, 1, length.out = 11),
  colors = rev(brewer.pal(11, "RdBu"))
)
sel_repnames_expr_hm <- Heatmap(
  sel_repnames_expr,
  col = exp_col,
  cluster_rows = FALSE,
  cluster_columns = FALSE,
  heatmap_legend_param = list(
    title = "rel.expr. (log2)",
    legend_direction = "horizontal"
  ),
  column_title = "TE genomewide",
  column_names_side = "top",
  column_title_gp = gpar(fontsize = 18),
  column_names_rot = 0,
  column_names_centered = TRUE,
  row_names_side = "left",
  row_names_gp = gpar(fontsize = 14, fontface = "bold"),
  column_names_gp = gpar(fontsize = 14, fontface = "bold")
)

## Observed transcripts whose first exon overlaps one of the selected repeats
sel_real_trx_dat <- subset(
  real_trx_data,
  FirstExonRepName %in% row.names(sel_zscores_mat)
)

## Average relative expression of those transcripts, per repeat name and stage
sel_trxs_aver_expr <- do.call(
  rbind,
  tapply(
    sel_real_trx_dat$transcript_id,
    sel_real_trx_dat$FirstExonRepName,
    function(trxids) {
      colMeans(trx_salm_relExp[trxids, , drop = FALSE], na.rm = TRUE)
    }
  )
)
sel_trxs_aver_expr <- sel_trxs_aver_expr[row.names(sel_zscores_mat), ]

sel_trxs_expr_hm <- Heatmap(
  sel_trxs_aver_expr,
  col = exp_col,
  cluster_rows = FALSE,
  cluster_columns = FALSE,
  show_heatmap_legend = FALSE,
  column_title = "TE overlapping transcripts",
  column_names_side = "top",
  column_title_gp = gpar(fontsize = 18),
  column_names_rot = 0,
  column_names_centered = TRUE,
  row_names_side = "left",
  row_names_gp = gpar(fontsize = 14, fontface = "bold"),
  column_names_gp = gpar(fontsize = 14, fontface = "bold")
)

## Both heatmaps side by side in one PDF
pdf(
  file = "03_Heatmap_relExpr_TEgenomewide_and_overlap_Trxs.pdf",
  width = 6,
  height = 4
)
draw(
  sel_repnames_expr_hm + sel_trxs_expr_hm,
  heatmap_legend_side = "bottom",
  gap = unit(6, "mm")
)
dev.off()


#### enrichment of gff compare codes
class_codes <- c("=","c","k","m","n","j","e","o","s","x","i","y","p","u")

## For every (repeat name, class code) pair build the 2x2 table
## "first exon overlaps this repeat" x "transcript has this class code" over all
## observed transcripts and test it with Fisher's exact test. The chi-squared
## test is run only to obtain the expected count of the TRUE/TRUE cell.
class_code_enr <- do.call(rbind, lapply(sel_repNames, function(repnm) {
  message("Starting ", repnm)

  repnm_dat <- real_trx_data
  ## explicit levels keep the table 2x2 even when a category is empty
  repnm_dat$isSelRep <- factor(with(repnm_dat, FirstExonRepName == repnm),
                               levels = c(TRUE, FALSE))

  do.call(rbind, lapply(class_codes, function(clcode) {
    repnm_dat$isClasscode <- factor(repnm_dat$class_code == clcode,
                                    levels = c(TRUE, FALSE))

    tbl <- table(repnm_dat[, c("isSelRep", "isClasscode")])
    fishres <- fisher.test(tbl)
    suppressWarnings(ctres <- chisq.test(tbl))

    data.frame(repName = repnm,
               class_code = clcode,
               obsNtrx = ctres$observed["TRUE", "TRUE"],
               expNtrx = ctres$expected["TRUE", "TRUE"],
               fisher.Pval = fishres$p.value)
  }))
}))
ggplot(class_code_enr,aes(x=fisher.Pval))+
  geom_histogram(bins=50)
## p.adjust() defaults to Holm (family-wise error rate); the column is an FDR,
## so Benjamini-Hochberg is requested explicitly.
class_code_enr$fdr <- p.adjust(class_code_enr$fisher.Pval, method = "BH")
class_code_enr <- class_code_enr[order(class_code_enr$fisher.Pval),]


## plot volcano
## enrichment = log2 ratio of observed to expected counts, pseudocount of 1
class_code_enr$enr <- with(class_code_enr,log2((obsNtrx + 1)/(expNtrx + 1)))

## Description of each class code, used in the point labels
## (typos in the displayed text corrected: "transfrag", "the opposite")
class_codes_desc <- c("=" = "exact match",
                      "c" = "contained in ref.",
                      "k" = "containment of ref.",
                      "m" = "retained intron, all introns matched or retained",
                      "n" = "retained intron, not all introns matched",
                      "j" = "multi-exon with at least 1 junction match",
                      "e" = "single exon transfrag partially covering an intron",
                      "o" = "other same strand overlap with ref. exons",
                      "s" = "intron match on the opposite strand",
                      "x" = "exonic overlap on the opposite strand",
                      "i" = "fully contained within a ref. intron",
                      "y" = "contains a ref. within its introns",
                      "p" = "possible polymerase run-on (no overlap)",
                      "u" = "unknown, intergenic")
class_code_enr$class_code_descr <- class_codes_desc[as.character(class_code_enr$class_code)]

library(ggrepel)
library(ggrastr)
## Points with fdr <= 0.05 and enr >= 1 are labelled; the dashed lines mark
## the same two thresholds.
class_enr_pl <- ggplot(class_code_enr,aes(x=enr,y=-log10(fdr))) + 
  geom_point(size=4)+
  geom_vline(xintercept = 1,linewidth =1,linetype="dashed",color="lightgrey")+
  geom_hline(yintercept = -log10(0.05),linewidth =1,linetype="dashed",color="lightgrey")+
  geom_label_repel(data=subset(class_code_enr,fdr <= 0.05 & enr >= 1),
                   aes(label=paste0(obsNtrx," transcripts\n5' exon overlapping ",repName,"\n",class_code_descr,"(",class_code,")")),
                   direction = "y",
                   min.segment.length = 0.00001,
                   xlim = c(4,NA),
                   hjust=0)+
  coord_cartesian(xlim=c(-4,14))+
  scale_x_continuous(breaks = c(-2,0,1,5,10),
                     minor_breaks = 0)+
  scale_y_continuous(breaks = c(0,-log10(0.05),2,4,6),
                     labels = c("0","5%","2","4","6"),
                     minor_breaks = 0)+
  labs(x=expression(log[2]~frac(obs.+1,exp.+1)),
       y=expression(-log[10]~"(FDR)"))+
  theme_bw()+
  theme(axis.title = element_text(face="bold",size=16),
        axis.text = element_text(size=14))
class_enr_pl <- rasterise(class_enr_pl,dpi=300)
ggsave(filename = paste0("03_classcode_volcano_scatter.pdf"),
       plot = class_enr_pl,
       width=6,
       height = 6)





### plot total counts for repeat overlaps
##### FUNCTION TO GET STATS ON REP OVERLAP AND CLASS CODE #####
## Count transcripts per repeat-overlap category.
##
## Args:
##   trx: annotation object supporting `trx$type`, one-dimensional `[` and
##        `mcols()` (presumably a GRanges imported from a GTF -- TODO confirm).
##        Its metadata columns must include `ExonRepInt`, `FirstExonRepInt`
##        and `transcript_type`.
##
## Returns:
##   data.frame with one row per category and the columns
##     trx_group - "non-overlap", "monoexonic", "first exon overlap" or
##                 "other exon overlap" (plus NA if any transcript is
##                 unclassified)
##     counts    - number of transcripts in the category
##     frac      - counts divided by the total number of transcripts
##
## Note: despite the function name, class codes do not enter the result.
get_repov_classcode_stats <- function(trx){
  ## extract transcripts only and work on their metadata columns
  trx <- trx[trx$type == "transcript"]
  trx <- data.frame(mcols(trx))

  ## Categories are assigned in this order of precedence:
  ## 1. no exon overlaps a repeat
  ## 2. monoexonic transcript overlapping a repeat
  ## 3. first exon overlaps a repeat
  ## 4. only other exons overlap a repeat
  trx$trxVSrep_type <- factor(with(trx,
                                   ifelse(!ExonRepInt,"non-overlap",
                                          ifelse(transcript_type == "monoexonic","monoexonic",
                                                 ifelse(FirstExonRepInt,"first exon overlap",
                                                        "other exon overlap"
                                                 )))),
                              levels = c("non-overlap","monoexonic",
                                         "first exon overlap",
                                         "other exon overlap"
                              ))

  ## fixed factor levels make empty categories appear with a count of 0
  repovcounts <- as.data.frame(table(trx$trxVSrep_type,useNA="ifany"))
  repovcounts$frac <- repovcounts$Freq/sum(repovcounts$Freq)

  colnames(repovcounts) <- c("trx_group","counts","frac")
  return(repovcounts)
}

#test_ann <- get_repov_classcode_stats(trxs_list[["real_trx"]])

## per-transcriptome counts of the repeat-overlap groups: one call of
## get_repov_classcode_stats() per element of trxs_list, tagged with the
## element name and stacked into a single data.frame
trx_group_stats_v2 <- do.call(
  rbind,
  lapply(names(trxs_list), function(set_name) {
    set_stats <- get_repov_classcode_stats(trxs_list[[set_name]])
    set_stats$transcriptome <- set_name
    set_stats
  })
)

## everything that is not the "real_trx" set is labelled as shuffled
trx_group_stats_v2$trxtype <- ifelse(
  trx_group_stats_v2$transcriptome == "real_trx",
  "real_trx",
  "shuffled"
)


## for each transcript group: keep the row(s) of the real transcriptome and
## attach the mean and sd of the shuffled transcriptomes (counts and fractions)
trx_group_mean_sd_v2 <- do.call(
  rbind,
  by(trx_group_stats_v2, trx_group_stats_v2$trx_group,
     function(grp) {
       observed <- subset(grp, trxtype == "real_trx")
       null_sets <- subset(grp, trxtype == "shuffled")

       ## null distribution of the counts
       observed$mean_cnt_null <- mean(null_sets$counts, na.rm = TRUE)
       observed$sd_cnt_null <- sd(null_sets$counts, na.rm = TRUE)

       ## null distribution of the fractions
       observed$mean_frac_null <- mean(null_sets$frac, na.rm = TRUE)
       observed$sd_frac_null <- sd(null_sets$frac, na.rm = TRUE)

       observed
     },
     simplify = FALSE)
)

## bar label: percentage of real transcripts in the group, in parentheses
trx_group_mean_sd_v2$perc_text <- paste0("(",format(trx_group_mean_sd_v2$frac*100,digits = 2),"%)")
# (disabled alternative: a trx_group2 column renaming "first exon overlap" to
#  "5' exon overlap"; the renaming is done through the axis labels below)

## horizontal bar chart of the real counts with the shuffled mean +/- sd on top
trx_group_plot <- ggplot(trx_group_mean_sd_v2) +
  # observed counts
  geom_bar(
    aes(x = trx_group, y = counts),
    stat = "identity", fill = "black", color = "black", width = 0.6
  ) +
  # percentage label next to each bar
  geom_text(
    aes(x = trx_group, y = counts, label = perc_text),
    vjust = 0.5, hjust = 0, nudge_y = 500, size = 8
  ) +
  # null expectation from the shuffled transcriptomes
  geom_pointrange(
    aes(
      x = trx_group,
      y = mean_cnt_null,
      ymin = mean_cnt_null - sd_cnt_null,
      ymax = mean_cnt_null + sd_cnt_null,
      color = "random"
    ),
    size = 0.7
  ) +
  scale_color_manual(
    values = c("random" = "red"),
    guide = guide_legend(title = NULL)
  ) +
  scale_y_continuous(position = "right") +
  scale_x_discrete(
    limits = (c("other exon overlap","first exon overlap","monoexonic","non-overlap")),
    labels = c("other\nexon\noverlap","5' exon\noverlap","mono\nexonic","no\noverlap")
  ) +
  # (a log10 y scale with expansion(mult = c(0,0.2), add = 0) is disabled here)
  coord_flip(ylim = c(0, 30000)) +
  labs(y = "# transcripts") +
  theme_bw() +
  theme(
    axis.title.y = element_blank(),
    axis.title.x = element_text(face = "bold", size = 18),
    axis.text.y = element_text(face = "bold", size = 18),
    axis.text.x = element_text(face = "bold", size = 12),
    legend.position = "none"
  )
trx_group_plot
ggsave(
  filename = paste0("03_Trx_counts_wrt_repOvrlp_minJaccard_",min_jaccard_index,".pdf"),
  plot = trx_group_plot,
  width = 6,
  height = 7
)
####



#### Figure 6 Generation: ####
###
# I counted the signal in exons, in 2kb antisense regions and, for H3K4me3, in a 2kb region centered around
# the K4me3 anchor (11140 anchors with associated genes). From the heatmaps, 2 log2rpkm of antisense signal
# seems to be a good threshold between "on" and "off".
# Instead of thresholding the antisense signal directly, we can also consider the ratio antisense/sense
# to look at the transcripts which have more antisense transcription than expected
# 
# 1. Look at the distribution of ratio antisense/sense transcription


library(GenomicRanges)
library(reshape2)
library(dplyr)
library(QuasR)
library(BSgenome.Mmusculus.UCSC.mm10)
library(parallel)
library(ComplexHeatmap)
library(Biostrings)
library(ggplot2)
library(viridis)


## use the cairo bitmap device for PNG output
options(bitmapType='cairo')


## input directory (counts, peak calls) and output directory for the plots
baseDir <- '/tungstenfs/groups/gpeters/rohmalex/work/Mark/RNA-seq_181127/32_transcriptomeValidation/130_K4me3forAntisenseTranscription/'
plotDir <- "/tungstenfs/scratch/gbioinfo_work/gpeters/gillmark/Post_Replicative_MaleGermCell_RNA_Seq_Comparison/plots/134b_analysisOfFeaturesAssociatedWithAntisenseTranscription/"
## recursive = TRUE: also create missing parent directories; without it the call
## fails silently (warnings are suppressed) and every later ggsave() would error
dir.create(plotDir, showWarnings=FALSE, recursive=TRUE)
dataDir <- paste0(baseDir, 'RData/')


## Load the data
## (list used below through log2.rpkm.exons, log2.rpkm.antisense, log2.rpkm.K4me3,
##  K4anchorsGR and K4me3toCountGR)
data.list <- readRDS(file=paste0(dataDir, 'list_countsExonsAndAntisenseAndK4me3_txomeMetadataDf_exonsAndK4me3regionsToCountGR.rds'))

# Read the K4me3 peaks originally called in each celltype (one narrowPeak file per sample);
# the anchors are associated to these celltype-specific peaks further below
repNames <- c('Chu_Leptotene_Zygotene_H3K4me3', 'Chu_Pachytene_Diplotene_H3K4me3',
              'Erkek_Round_Spermatid_H3K4me3', 'Erkek_Elongating_Spermatid_H3K4me3')

narrowPeakData <- setNames(
  lapply(repNames, function(sampleName) {
    peakFile <- paste0(baseDir, sampleName, '/', sampleName, '_peaks.narrowPeak')
    # the first line of the file is skipped
    peaks <- read.table(file=peakFile, header=FALSE, skip=1,
                        sep='\t', stringsAsFactors=FALSE)
    colnames(peaks) <- c('chr', 'start', 'end', 'name', 'score', 'strand',
                         'enrichment', 'pvalue', 'qvalue', 'summitStart')
    # last column is added to the peak start (presumably a summit offset -> coordinate)
    peaks$summitStart <- peaks$start + peaks$summitStart
    peaks
  }),
  repNames
)

# Associate every anchor of data.list$K4anchorsGR with a peak of each celltype.
# A peak appears on several rows of narrowPeakData when it has several summits, so duplicated
# ranges are collapsed first. The anchors come from a union of the summits of all celltypes
# (one representative summit for summits less than 1kb apart), hence an anchor is assigned its
# nearest peak, and only if that peak is at most 1kb away.
narrowPeak.wAnchor <- lapply(narrowPeakData, function(peakDf){
  peakGr <- unique(
    makeGRangesFromDataFrame(df=peakDf,
                             seqnames.field='chr', start.field='start', end.field='end',
                             strand.field='strand', keep.extra.columns=FALSE)
  )
  # nearest peak of each anchor, strand ignored, kept when within 1kb
  nearestPeak <- distanceToNearest(x=data.list$K4anchorsGR, subject=peakGr, ignore.strand=TRUE)
  nearestPeak <- nearestPeak[which(mcols(nearestPeak)$distance <= 1000)]
  # one output range per retained anchor: the peak, the anchor id and the peak width
  anchoredPeaks <- peakGr[subjectHits(nearestPeak)]
  anchoredPeaks$anchorId <- data.list$K4anchorsGR$senseGeneId[queryHits(nearestPeak)]
  anchoredPeaks$peakWidth <- width(anchoredPeaks)
  anchoredPeaks
})

####
# number of anchor-associated peaks per sample
sapply(narrowPeak.wAnchor, length)
# Chu_Leptotene_Zygotene_H3K4me3    Chu_Pachytene_Diplotene_H3K4me3      Erkek_Round_Spermatid_H3K4me3 
#                           6008                               5991                              10391 
# Erkek_Elongating_Spermatid_H3K4me3 
#                              10109

# short celltype label: strip the 'Chu_'/'Erkek_' prefix and the '_H3K4me3' suffix from the
# sample name, then drop lowercase letters and underscores so that only the capitals remain
for (e in names(narrowPeak.wAnchor)){
  stageName <- gsub('^(Chu|Erkek)_(.*)_H3K4me3$', '\\2', e)
  narrowPeak.wAnchor[[e]]$celltype <- gsub('[_a-z]', '', stageName, ignore.case=FALSE)
}
####
# all samples stacked in one data.frame
narrowPeak.wAnchor.allCelltypesDf <- do.call(rbind, lapply(narrowPeak.wAnchor, as.data.frame))

# number of celltypes in which each anchor has an associated peak
# (deparse.level = 0 keeps the tables free of a variable-name header)
resA <- table(unlist(lapply(narrowPeak.wAnchor, function(gr) gr$anchorId)), deparse.level = 0)
table(resA, deparse.level = 0)
#    1    2    3    4 
# 1178 3468 1591 4903
# -> 1178/11140 (11%) anchors are specific to one celltype, 3468/11140 (31%) are found in 2 celltypes,
#    1591/11140 (14%) are found in 3 celltypes and 4903/11140 (44%) are found in all celltypes

# which celltypes contribute the anchors found in exactly 1, 2 or 3 celltypes
for (i in 1:3){
  print(paste('i =', i))
  anchorsInNCelltypes <- names(resA[which(resA == i)])
  celltypeCombination <- sapply(anchorsInNCelltypes, function(anchor){
    rowsOfAnchor <- which(narrowPeak.wAnchor.allCelltypesDf$anchorId == anchor)
    paste(narrowPeak.wAnchor.allCelltypesDf$celltype[rowsOfAnchor], collapse=':')
  })
  print(table(celltypeCombination, deparse.level = 0))
}
# "i = 1"
#  ES  LZ  PD  RS 
# 528 155  27 468 
# 
# "i = 2"
# LZ:ES LZ:PD LZ:RS PD:ES PD:RS RS:ES 
#    29     4   112     4   192  3127 
# 
# "i = 3"
# LZ:PD:ES LZ:PD:RS LZ:RS:ES PD:RS:ES 
#        2       73      730      786 
# -> the anchors coming from a single celltype mostly come from RS (40%) or ES (45%)
# -> the anchors coming from 2 celltypes mostly come from RS and ES (90%)
# -> the anchors coming from 3 celltypes mostly come from PD, RS and ES (49%) and LZ, RS and ES (46%)



# 1. Distribution of ratio antisense/sense transcription
# -> scatterplot with exons vs antisense
# -> bin the K4me3 data and boxplot with corresponding exonic and antisense signals
# ratio antisense/sense : remove NAs and Inf values

# long format: the 5 expression columns are stacked one celltype after the other;
# K4me3 column 4 is used twice (for both EES and LES)
pl1Df <- data.frame(
  exons     = unlist(lapply(1:5, function(j) data.list$log2.rpkm.exons[, j])),
  antisense = unlist(lapply(1:5, function(j) data.list$log2.rpkm.antisense[, j])),
  K4me3     = unlist(lapply(c(1,2,3,4,4), function(j) data.list$log2.rpkm.K4me3[, j])),
  celltype  = rep(c('LZ','PD','RS','EES','LES'), each=nrow(data.list$log2.rpkm.exons)),
  stringsAsFactors=FALSE
)
# celltypes as a factor, in order of appearance
pl1Df$celltype <- factor(pl1Df$celltype, levels=unique(pl1Df$celltype))

# exonic vs antisense signal, one panel per celltype, red dashed diagonal
pl1a <- ggplot(data=pl1Df, aes(x=exons, y=antisense)) +
  geom_hex(bins=50) +
  scale_fill_viridis(trans='log2') +
  facet_wrap(.~celltype) +
  geom_abline(slope=1, intercept=0, color='red', linetype='dashed') +
  ggtitle('Scatter with exonic expression vs antisense for all 11140 anchors, in all celltypes\nlog scale for density') +
  theme(plot.title=element_text(size=11))
ggsave(
  filename=paste0(plotDir, '01_scatter_exonicVsAntisense_splitByCelltype_allAnchors.png'),
  plot=pl1a, width=7, height=5, dpi=200
)

# bin the K4me3 signal (values outside [0, 17] become NA)
pl1Df$K4me3Bin <- cut(x=pl1Df$K4me3, breaks=c(0, 3, 5, 7, 9, 11, 17), include.lowest=TRUE)
# long format with one row per (anchor, celltype, measured variable)
pl1DfTmp <- reshape2::melt(pl1Df, id.vars=c('K4me3Bin', 'celltype'))
# drop the K4me3 rows: only exons and antisense are plotted.
# A logical filter is used instead of `-which(...)`, which would silently drop
# every row if no row matched.
pl1DfTmp <- pl1DfTmp[which(pl1DfTmp$variable != 'K4me3'), ]
# exonic and antisense signal per K4me3 bin, one panel per celltype
pl1b <- ggplot(data=pl1DfTmp, aes(x=K4me3Bin, y=value, fill=variable)) +
  geom_boxplot(outlier.alpha=0.1) + facet_wrap(.~celltype) +
  ggtitle('Distribution of exonic and antisense expression,\nstratified by bins of K4me3 expression at anchor +/- 1kb') +
  ylab('log2rpkm') + theme(plot.title=element_text(size=11))
ggsave(filename=paste0(plotDir, '01_boxplots_exonicAndAntisenseVsK4me3Bins_splitByCelltype_allAnchors.png'), plot=pl1b,
       width=7, height=5, dpi=200)


# element-wise ratio antisense/sense of the log2rpkm values
antisenseRatio <- data.list$log2.rpkm.antisense/data.list$log2.rpkm.exons
dim(antisenseRatio)
# 11140    5
# we will discard rows with NAs and rows with Inf values later in a celltype-specific manner

# summary over the anchors whose ratio is valid in every celltype.
# Rows are selected with a logical mask rather than `-which(...)`: with no invalid
# row, `-which(...)` is an empty index and would return zero rows instead of all.
invalidRatioRow <- rowSums(is.na(antisenseRatio)) >= 1 |
  rowSums(is.infinite(antisenseRatio)) >= 1
summary(antisenseRatio[!invalidRatioRow, , drop=FALSE])
#    LZ.antisense      PD.antisense      RS.antisense    EES.antisense      LES.antisense     
# Min.   : 0.0000   Min.   :0.00000   Min.   :0.0000   Min.   : 0.00000   Min.   : 0.00000  
# 1st Qu.: 0.0000   1st Qu.:0.00000   1st Qu.:0.0000   1st Qu.: 0.00000   1st Qu.: 0.00000  
# Median : 0.1582   Median :0.08679   Median :0.1386   Median : 0.06278   Median : 0.05089  
# Mean   : 0.2447   Mean   :0.18682   Mean   :0.2084   Mean   : 0.14356   Mean   : 0.12278  
# 3rd Qu.: 0.3596   3rd Qu.:0.26601   3rd Qu.:0.3141   3rd Qu.: 0.20347   3rd Qu.: 0.16465  
# Max.   :14.1614   Max.   :5.59612   Max.   :7.6087   Max.   :12.03399   Max.   :11.78500  

####
# same long table as pl1Df plus the per-celltype ratio antisense/sense
pl2Df <- data.frame(exons=unlist(lapply(1:5, function(i) data.list$log2.rpkm.exons[, i])),
                    antisense=unlist(lapply(1:5, function(i) data.list$log2.rpkm.antisense[, i])),
                    K4me3=unlist(lapply(c(1,2,3,4,4), function(i) data.list$log2.rpkm.K4me3[, i])),
                    antisenseRatio=unlist(lapply(1:5, function(i) data.list$log2.rpkm.antisense[, i]/data.list$log2.rpkm.exons[, i])),
                    celltype=rep(c('LZ','PD','RS','EES','LES'), each=nrow(data.list$log2.rpkm.antisense)),
                    stringsAsFactors=FALSE)
# add anchorId
pl2Df$anchorId <- rep(rownames(data.list$log2.rpkm.exons), 5)
# remove rows where antisenseRatio is NA (incl. NaN) or +/-Inf, i.e. keep the finite ratios.
# `df[-which(bad), ]` is avoided: with no bad row it would remove ALL rows.
pl2Df <- pl2Df[is.finite(pl2Df$antisenseRatio), ]
pl2Df$celltype <- factor(pl2Df$celltype, levels=unique(pl2Df$celltype))

# ratio antisense/exonic against the exonic signal (linear colour scale);
# red dashed line at ratio = 1
pl2a <- ggplot(data=pl2Df, aes(x=exons, y=antisenseRatio)) +
  geom_hex(bins=50) +
  scale_fill_viridis() +
  facet_wrap(.~celltype, scales='free_y') +
  geom_hline(yintercept=1, color='red', linetype='dashed') +
  ggtitle('Scatter with ratio antisense/exonic expression vs exonic for the valid\nanchors, in all celltypes') +
  theme(plot.title=element_text(size=11))
ggsave(
  filename=paste0(plotDir, '02_scatter_antisenseRatioVsExonic_splitByCelltype_allValidAnchors.png'),
  plot=pl2a, width=7, height=5, dpi=200
)

# ratio antisense/exonic against the antisense signal (log2 colour scale);
# red dashed line at ratio = 1
pl2b <- ggplot(data=pl2Df, aes(x=antisense, y=antisenseRatio)) +
  geom_hex(bins=50) +
  scale_fill_viridis(trans='log2') +
  facet_wrap(.~celltype, scales='free_y') +
  geom_hline(yintercept=1, color='red', linetype='dashed') +
  ggtitle('Scatter with ratio antisense/exonic expression vs antisense for the valid\nanchors, in all celltypes\nlog scale for density') +
  theme(plot.title=element_text(size=11))
ggsave(
  filename=paste0(plotDir, '02_scatter_antisenseRatioVsAntisense_splitByCelltype_allValidAnchors.png'),
  plot=pl2b, width=7, height=5, dpi=200
)

# K4me3 signal against the ratio antisense/exonic (log2 colour scale).
# The ratio is on the x axis in this plot, so the ratio = 1 reference is a
# VERTICAL line (it was drawn as a horizontal line at K4me3 = 1, as in pl2a/pl2b
# where the ratio is on the y axis).
# NOTE(review): confirm that the reference line is meant to mark ratio = 1.
pl2c <- ggplot(data=pl2Df, aes(x=antisenseRatio, y=K4me3)) +
  geom_hex(bins=50) + scale_fill_viridis(trans='log2') +
  facet_wrap(.~celltype, scales='free') +
  geom_vline(xintercept=1, color='red', linetype='dashed') + 
  ggtitle('Scatter with K4me3 vs ratio antisense/exonic expression for the valid\nanchors, in all celltypes\nlog scale for density') +
  theme(plot.title=element_text(size=11))
ggsave(filename=paste0(plotDir, '02_scatter_K4me3VsAntisenseRatio_splitByCelltype_allValidAnchors.png'),
       plot=pl2c, width=7, height=5, dpi=200)

# bin the ratio antisense/sense (values outside [0, 15] become NA)
pl2Df$antisenseRatioBin <- cut(x=pl2Df$antisenseRatio, breaks=c(0,0.5,1,1.5,2,3,15), include.lowest=TRUE)

# annotation table with the number of anchors per (bin, celltype).
# The grid is built from the bin LEVELS (not from unique(), which breaks when a bin
# is empty or when some ratios fall outside the breaks) and its size follows the
# number of levels instead of a hard-coded 6.
pl2dBinLevels <- levels(pl2Df$antisenseRatioBin)
pl2dCelltypes <- c('LZ','PD','RS','EES','LES')
pl2dAnnotDf <- data.frame(antisenseRatioBin=factor(rep(pl2dBinLevels, times=length(pl2dCelltypes)),
                                                   levels=pl2dBinLevels),
                          celltype=rep(pl2dCelltypes, each=length(pl2dBinLevels)),
                          stringsAsFactors=FALSE)
# cross-tabulation bins x celltypes; rows with an NA bin are not counted.
# as.vector() reads the table column by column, i.e. bins vary fastest within a
# celltype, which is the row order of pl2dAnnotDf.
pl2dCounts <- table(pl2Df$antisenseRatioBin,
                    factor(as.character(pl2Df$celltype), levels=pl2dCelltypes))
pl2dAnnotDf$count <- as.vector(pl2dCounts)
pl2dAnnotDf$celltype <- factor(pl2dAnnotDf$celltype, levels=pl2dCelltypes)

# K4me3 signal per ratio bin, one panel per celltype, with the counts written at y = 15
pl2d <- ggplot(data=pl2Df, aes(x=antisenseRatioBin, y=K4me3)) +
  geom_boxplot(outlier.alpha=0.1) + facet_wrap(.~celltype) +
  geom_text(data=pl2dAnnotDf, aes(x=antisenseRatioBin, y=15, label=count), size=2) + 
  ggtitle('Distribution of K4me3 signal at anchor +/- 1kb,\nstratified by bins of antisense/sense expression ratio') +
  ylab('H3K4me3, log2rpkm') + theme(plot.title=element_text(size=11), axis.text.x=element_text(size=8))
ggsave(filename=paste0(plotDir, '02_boxplots_K4me3VsAntisenseRatioBins_splitByCelltype_allValidAnchors.png'), 
       plot=pl2d, width=7, height=5, dpi=200)



# 2. Check if the transcripts and K4me3 peaks associated with antisense txion/increased antisense txion have specific
#    features :
#    - length of the K4 peak -> load the initial K4 peak calling data and associate the anchors with their corresponding
#      celltype-specific K4 peaks
#    - GC% at promoter (2kb region centered around anchor)
#    - presence of CGI at promoter
#    - gene expression level
#    - motif in the K4 region
# -> we stratify the data based on either antisense transcription (bin breaks 0, 2, 4, 6, 11)
#    or ratio antisense/sense expression (bin breaks 0, 0.5, 1, 1.5, 2, 3, 15)

## Length of K4 peak
# if no peak was called in a given celltype, write NA for peakWidth
# /!\ we have data for 'ES' and not separately for EES and LES so we use the same data for both
pl1Df$anchorId <- rep(rownames(data.list$log2.rpkm.exons), 5)
# two feature tables: all anchors (antisenseOnly) and anchors with a valid ratio (antisenseRatio)
antisenseTranscriptionFeatures.list <- list(
  antisenseOnly=pl1Df,
  antisenseRatio=pl2Df)
# bin the antisense signal (values outside [0, 11] become NA)
antisenseTranscriptionFeatures.list$antisenseOnly$antisenseBin <- cut(x=antisenseTranscriptionFeatures.list$antisenseOnly$antisense,
                                                                      breaks=c(0, 2, 4, 6, 11), include.lowest=TRUE)

# Width of the K4me3 peak associated with each (anchor, celltype) row of featureDf.
#   featureDf : data.frame with the columns anchorId and celltype
#   peakDf    : data.frame with the columns anchorId, celltype and peakWidth
# Returns one value per row of featureDf: the width of the first matching peak of
# peakDf, or NA when no peak was called for that anchor in that celltype.
# 'EES' and 'LES' are both looked up under the 'ES' peaks.
# A match() per celltype replaces the row-by-row scan of the whole peak table.
lookupK4me3PeakWidth <- function(featureDf, peakDf){
  ct <- as.character(featureDf$celltype)
  ct[ct %in% c('EES','LES')] <- 'ES'
  peakWidth <- rep(NA, nrow(featureDf))
  for (k in unique(ct)){
    rowsOfCt <- which(ct == k)
    peaksOfCt <- peakDf[which(peakDf$celltype == k), , drop=FALSE]
    # match() returns the first hit; an NA anchorId never matches
    hit <- match(featureDf$anchorId[rowsOfCt], peaksOfCt$anchorId, incomparables=NA)
    peakWidth[rowsOfCt] <- peaksOfCt$peakWidth[hit]
  }
  peakWidth
}

antisenseTranscriptionFeatures.list$antisenseOnly$K4me3PeakWidth <- lookupK4me3PeakWidth(
  antisenseTranscriptionFeatures.list$antisenseOnly, narrowPeak.wAnchor.allCelltypesDf)
sapply(unique(antisenseTranscriptionFeatures.list$antisenseOnly$celltype), function(e) 
  sum(is.na(antisenseTranscriptionFeatures.list$antisenseOnly$K4me3PeakWidth[which(antisenseTranscriptionFeatures.list$antisenseOnly$celltype == e)])))
#   LZ   PD   RS  EES  LES
# 5132 5149  749 1031 1031
# -> anchors which have no called peak in the given celltype
# -> many anchors were coming from peaks in the latest celltypes and have no corresponding peaks (considering
#    overlap with anchor +/- 1kb) in LZ and PD

antisenseTranscriptionFeatures.list$antisenseRatio$K4me3PeakWidth <- lookupK4me3PeakWidth(
  antisenseTranscriptionFeatures.list$antisenseRatio, narrowPeak.wAnchor.allCelltypesDf)
sapply(unique(antisenseTranscriptionFeatures.list$antisenseRatio$celltype), function(e) 
  sum(is.na(antisenseTranscriptionFeatures.list$antisenseRatio$K4me3PeakWidth[which(antisenseTranscriptionFeatures.list$antisenseRatio$celltype == e)])))
#   LZ   PD  RS   EES  LES
# 4544 4438  685  949  947


# Exonic vs antisense signal, one panel per celltype; the anchors without an associated
# called peak in the given celltype (K4me3PeakWidth is NA) are overlaid as red crosses

# all anchors
pl3aNoPeak <- antisenseTranscriptionFeatures.list$antisenseOnly[
  which(is.na(antisenseTranscriptionFeatures.list$antisenseOnly$K4me3PeakWidth)), ]
pl3a <- ggplot(data=antisenseTranscriptionFeatures.list$antisenseOnly, aes(x=exons, y=antisense)) +
  geom_hex(bins=50) +
  scale_fill_viridis(trans='log2') +
  facet_wrap(.~celltype) +
  geom_abline(slope=1, intercept=0, color='red', linetype='dashed') +
  geom_point(data=pl3aNoPeak, aes(x=exons, y=antisense), pch='+', color='red', alpha=0.3) +
  ggtitle('Scatter with exonic expression vs antisense for all 11140 anchors, in all celltypes\nlog scale for density\nAnchors with no associated called peak at +/- 1kb in red') +
  theme(plot.title=element_text(size=11))
ggsave(
  filename=paste0(plotDir, '03_scatter_exonicVsAntisense_anchorsNotAssociatedToAK4PeakInRed_splitByCelltype_allAnchors.png'),
  plot=pl3a, width=7, height=5, dpi=200
)

# anchors with a valid antisense/sense ratio
pl3bNoPeak <- antisenseTranscriptionFeatures.list$antisenseRatio[
  which(is.na(antisenseTranscriptionFeatures.list$antisenseRatio$K4me3PeakWidth)), ]
pl3b <- ggplot(data=antisenseTranscriptionFeatures.list$antisenseRatio, aes(x=exons, y=antisense)) +
  geom_hex(bins=50) +
  scale_fill_viridis(trans='log2') +
  facet_wrap(.~celltype) +
  geom_abline(slope=1, intercept=0, color='red', linetype='dashed') +
  geom_point(data=pl3bNoPeak, aes(x=exons, y=antisense), pch='+', color='red', alpha=0.3) +
  ggtitle('Scatter with exonic expression vs antisense for the valid anchors, in all celltypes\nlog scale for density\nAnchors with no associated called peak at +/- 1kb in red') +
  theme(plot.title=element_text(size=9))
ggsave(
  filename=paste0(plotDir, '03_scatter_exonicVsAntisense_anchorsNotAssociatedToAK4PeakInRed_splitByCelltype_allValidAnchors.png'),
  plot=pl3b, width=7, height=5, dpi=200
)


# Exonic signal vs K4me3 signal, one panel per celltype; the anchors without an associated
# called peak in the given celltype (K4me3PeakWidth is NA) are overlaid as red crosses

# all anchors
pl3cNoPeak <- antisenseTranscriptionFeatures.list$antisenseOnly[
  which(is.na(antisenseTranscriptionFeatures.list$antisenseOnly$K4me3PeakWidth)), ]
pl3c <- ggplot(data=antisenseTranscriptionFeatures.list$antisenseOnly, aes(x=exons, y=K4me3)) +
  geom_hex(bins=50) +
  scale_fill_viridis(trans='log2') +
  facet_wrap(.~celltype) +
  geom_abline(slope=1, intercept=0, color='red', linetype='dashed') +
  geom_point(data=pl3cNoPeak, aes(x=exons, y=K4me3), pch='+', color='red', alpha=0.3, size=2) +
  ggtitle('Scatter with K4me3 signal vs exonic expression for all 11140 anchors, in all celltypes\nlog scale for density\nAnchors with no associated called peak at +/- 1kb in red') +
  theme(plot.title=element_text(size=11))
ggsave(
  filename=paste0(plotDir, '03_scatter_K4me3VsExonic_anchorsNotAssociatedToAK4PeakInRed_splitByCelltype_allAnchors.png'),
  plot=pl3c, width=7, height=5, dpi=200
)

# anchors with a valid antisense/sense ratio
pl3dNoPeak <- antisenseTranscriptionFeatures.list$antisenseRatio[
  which(is.na(antisenseTranscriptionFeatures.list$antisenseRatio$K4me3PeakWidth)), ]
pl3d <- ggplot(data=antisenseTranscriptionFeatures.list$antisenseRatio, aes(x=exons, y=K4me3)) +
  geom_hex(bins=50) +
  scale_fill_viridis(trans='log2') +
  facet_wrap(.~celltype) +
  geom_abline(slope=1, intercept=0, color='red', linetype='dashed') +
  geom_point(data=pl3dNoPeak, aes(x=exons, y=K4me3), pch='+', color='red', alpha=0.3, size=2) +
  ggtitle('Scatter with K4me3 signal vs exonic expression for the valid anchors, in all celltypes\nlog scale for density\nAnchors with no associated called peak at +/- 1kb in red') +
  theme(plot.title=element_text(size=9))
ggsave(
  filename=paste0(plotDir, '03_scatter_K4me3VsExonic_anchorsNotAssociatedToAK4PeakInRed_splitByCelltype_allValidAnchors.png'),
  plot=pl3d, width=7, height=5, dpi=200
)


# boxplot with K4me3 peak length for each antisenseBin or antisenseRatioBin, for each celltype
# add number of anchors in each bin

# annotation grid: every antisense bin within every celltype (sizes follow the factor
# levels instead of a hard-coded number of celltypes)
pl4aBinLevels <- levels(antisenseTranscriptionFeatures.list$antisenseOnly$antisenseBin)
pl4aCelltypes <- levels(antisenseTranscriptionFeatures.list$antisenseOnly$celltype)
pl4aAnnot <- data.frame(antisenseBin=rep(pl4aBinLevels, times=length(pl4aCelltypes)),
                        celltype=rep(pl4aCelltypes, each=length(pl4aBinLevels)),
                        stringsAsFactors=FALSE)
# number of anchors WITH a called peak per (bin, celltype). The cross-tabulation skips
# rows whose bin is NA; summing `bin == ...` directly would turn the count into NA as
# soon as one anchor of the celltype has an NA bin.
# as.vector() reads the table column by column (bins vary fastest), matching the grid.
pl4aHasPeak <- !is.na(antisenseTranscriptionFeatures.list$antisenseOnly$K4me3PeakWidth)
pl4aAnnot$count <- as.vector(table(
  antisenseTranscriptionFeatures.list$antisenseOnly$antisenseBin[pl4aHasPeak],
  antisenseTranscriptionFeatures.list$antisenseOnly$celltype[pl4aHasPeak]))
pl4aAnnot$antisenseBin <- factor(pl4aAnnot$antisenseBin, levels=unique(pl4aAnnot$antisenseBin))
pl4aAnnot$celltype <- factor(pl4aAnnot$celltype, levels=unique(pl4aAnnot$celltype))
# peak width per antisense bin, one panel per celltype, counts written at y = 5000
pl4a <- ggplot(data=antisenseTranscriptionFeatures.list$antisenseOnly, aes(x=antisenseBin, y=K4me3PeakWidth)) +
  geom_boxplot(outlier.alpha=0.3) +
  facet_wrap(.~celltype) +
  geom_text(data=pl4aAnnot, aes(x=antisenseBin, y=5000, label=count), size=2) +
  ggtitle('Distribution of K4me3 peak width for all 11140 anchors, stratified by antisense signal, in all celltypes\nSome anchors have no associated K4me3 peak in some celltypes') +
  theme(plot.title=element_text(size=9))
ggsave(filename=paste0(plotDir, '04_boxplot_K4me3PeakWidth_stratifiedByAntisenseSignal_splitByCelltype_allAnchors.png'),
       plot=pl4a, width=7, height=5, dpi=200)


# annotation grid: every antisense-ratio bin within every celltype (sizes follow the
# factor levels instead of a hard-coded number of celltypes)
pl4bBinLevels <- levels(antisenseTranscriptionFeatures.list$antisenseRatio$antisenseRatioBin)
pl4bCelltypes <- levels(antisenseTranscriptionFeatures.list$antisenseRatio$celltype)
pl4bAnnot <- data.frame(antisenseRatioBin=rep(pl4bBinLevels, times=length(pl4bCelltypes)),
                        celltype=rep(pl4bCelltypes, each=length(pl4bBinLevels)),
                        stringsAsFactors=FALSE)
# number of anchors WITH a called peak per (bin, celltype). The cross-tabulation skips
# rows whose bin is NA; summing `bin == ...` directly would turn the count into NA as
# soon as one anchor of the celltype has an NA bin.
# as.vector() reads the table column by column (bins vary fastest), matching the grid.
pl4bHasPeak <- !is.na(antisenseTranscriptionFeatures.list$antisenseRatio$K4me3PeakWidth)
pl4bAnnot$count <- as.vector(table(
  antisenseTranscriptionFeatures.list$antisenseRatio$antisenseRatioBin[pl4bHasPeak],
  antisenseTranscriptionFeatures.list$antisenseRatio$celltype[pl4bHasPeak]))
pl4bAnnot$antisenseRatioBin <- factor(pl4bAnnot$antisenseRatioBin, levels=unique(pl4bAnnot$antisenseRatioBin))
pl4bAnnot$celltype <- factor(pl4bAnnot$celltype, levels=unique(pl4bAnnot$celltype))
# peak width per ratio bin, one panel per celltype, counts written at y = 5000
pl4b <- ggplot(data=antisenseTranscriptionFeatures.list$antisenseRatio, aes(x=antisenseRatioBin, y=K4me3PeakWidth)) +
  geom_boxplot(outlier.alpha=0.3) +
  facet_wrap(.~celltype) +
  geom_text(data=pl4bAnnot, aes(x=antisenseRatioBin, y=5000, label=count), size=2) +
  ggtitle('Distribution of K4me3 peak width for the valid anchors, stratified by antisense ratio, in all celltypes\nSome anchors have no associated K4me3 peak in some celltypes') +
  theme(plot.title=element_text(size=9), axis.text.x=element_text(size=8))
ggsave(filename=paste0(plotDir, '04_boxplot_K4me3PeakWidth_stratifiedByAntisenseRatio_splitByCelltype_allValidAnchors.png'),
       plot=pl4b, width=7, height=5, dpi=200)


## Scatters antisense or antisense ratio vs K4me3PeakWidth and colored by exonic expression
# ggplot(data=antisenseTranscriptionFeatures.list$antisenseOnly, aes(x=antisense, y=K4me3PeakWidth, color=exons)) +
#   geom_point(alpha=0.2, size=1) +
#   facet_wrap(.~celltype)
# 
# ggplot(data=antisenseTranscriptionFeatures.list$antisenseOnly, aes(x=K4me3, y=K4me3PeakWidth)) +
#   geom_hex(bins=50) +
#   facet_wrap(.~celltype)




## GC% at promoter (2kb region centered around anchor)
# per-region annotation taken from the metadata of the K4me3 counting regions
resB <- as.data.frame(mcols(data.list$K4me3toCountGR)[c('senseGeneId', 'CGI', 'percGC')])

# add pctGC and CGI to both feature tables, matching anchorId to senseGeneId
for (featureSet in c('antisenseOnly', 'antisenseRatio')){
  regionOfAnchor <- match(antisenseTranscriptionFeatures.list[[featureSet]]$anchorId, resB$senseGeneId)
  antisenseTranscriptionFeatures.list[[featureSet]]$pctGC <- resB$percGC[regionOfAnchor]
  antisenseTranscriptionFeatures.list[[featureSet]]$CGI <- resB$CGI[regionOfAnchor]
}

# GC% per antisense bin, one panel per celltype
pl5a <- ggplot(data=antisenseTranscriptionFeatures.list$antisenseOnly, aes(x=antisenseBin, y=pctGC)) +
  geom_boxplot() +
  facet_wrap(.~celltype) +
  ggtitle('Distribution of GC% in anchor +/- 1kb for all 11140 anchors, stratified by antisense signal, in all celltypes\nSome anchors have no associated K4me3 peak in some celltypes') +
  theme(plot.title=element_text(size=9))
ggsave(
  filename=paste0(plotDir, '05_boxplot_pctGC_stratifiedByAntisenseSignal_splitByCelltype_allAnchors.png'),
  plot=pl5a, width=7, height=5, dpi=200
)

# GC% per antisense-ratio bin, one panel per celltype
pl5b <- ggplot(data=antisenseTranscriptionFeatures.list$antisenseRatio, aes(x=antisenseRatioBin, y=pctGC)) +
  geom_boxplot() +
  facet_wrap(.~celltype) +
  ggtitle('Distribution of GC% in anchor +/- 1kb for the valid anchors,\nstratified by antisense ratio, in all celltypes\nSome anchors have no associated K4me3 peak in some celltypes') +
  theme(plot.title=element_text(size=9), axis.text.x=element_text(size=8))
ggsave(
  filename=paste0(plotDir, '05_boxplot_pctGC_stratifiedByAntisenseRatio_splitByCelltype_allValidAnchors.png'),
  plot=pl5b, width=7, height=5, dpi=200
)


## Presence of CGI at promoter
resC <- antisenseTranscriptionFeatures.list$antisenseOnly[, c('antisenseBin', 'celltype', 'CGI')]
# one row per observed (antisense bin, celltype) combination
resC2 <- unique(resC[, c('antisenseBin', 'celltype')])
# percentage (1 decimal) of the anchors of each group whose CGI value is set.
# Numerator and denominator use the SAME set of rows: previously the denominator
# summed the raw comparison, so a single NA antisenseBin in a celltype turned the
# fraction of every bin of that celltype into NA.
# A group whose bin is NA selects no row and gets NaN.
resC2$fracCGI <- vapply(seq_len(nrow(resC2)), function(i){
  inGroup <- which(resC$antisenseBin == resC2$antisenseBin[i] &
                     resC$celltype == resC2$celltype[i])
  round(100*sum(resC$CGI[inGroup])/length(inGroup), 1)
}, numeric(1))
pl6a <- ggplot(data=resC2, aes(x=antisenseBin, y=fracCGI, group=celltype)) +
  geom_line() +
  facet_wrap(.~celltype) +
  ggtitle('Fraction of anchors +/- 1kb which overlap a CGI for all 11140 anchors, stratified by antisense signal, in all celltypes') +
  theme(plot.title=element_text(size=9))
ggsave(filename=paste0(plotDir, '06_fractionOfPromotersOverlappingWithCGI_stratifiedByAntisenseSignal_splitByCelltype_allAnchors.png'),
       plot=pl6a, width=7, height=5, dpi=200)

resD <- antisenseTranscriptionFeatures.list$antisenseRatio[, c('antisenseRatioBin', 'celltype', 'CGI')]
# one row per observed (antisense-ratio bin, celltype) combination
resD2 <- unique(resD[, c('antisenseRatioBin', 'celltype')])
# percentage (1 decimal) of the anchors of each group whose CGI value is set.
# Numerator and denominator use the SAME set of rows: previously the denominator
# summed the raw comparison, so a single NA antisenseRatioBin in a celltype turned
# the fraction of every bin of that celltype into NA.
# A group whose bin is NA selects no row and gets NaN.
resD2$fracCGI <- vapply(seq_len(nrow(resD2)), function(i){
  inGroup <- which(resD$antisenseRatioBin == resD2$antisenseRatioBin[i] &
                     resD$celltype == resD2$celltype[i])
  round(100*sum(resD$CGI[inGroup])/length(inGroup), 1)
}, numeric(1))
pl6b <- ggplot(data=resD2, aes(x=antisenseRatioBin, y=fracCGI, group=celltype)) +
  geom_line() +
  facet_wrap(.~celltype) +
  ggtitle('Fraction of anchors +/- 1kb which overlap a CGI for the valid anchors,\nstratified by antisense ratio, in all celltypes') +
  theme(plot.title=element_text(size=8), axis.text.x=element_text(size=8))
ggsave(filename=paste0(plotDir, '06_fractionOfPromotersOverlappingWithCGI_stratifiedByAntisenseRatio_splitByCelltype_allValidAnchors.png'),
       plot=pl6b, width=7, height=5, dpi=200)


## Gene expression level

# exonic signal per antisense bin, one panel per celltype (all anchors)
pl7a <- ggplot(
  data=antisenseTranscriptionFeatures.list$antisenseOnly,
  aes(x=antisenseBin, y=exons)
) +
  geom_boxplot() +
  facet_wrap(.~celltype) +
  ggtitle('Distribution of exonic expression for all 11140 anchors, stratified by antisense signal, in all celltypes') +
  theme(plot.title=element_text(size=9))
ggsave(
  filename=paste0(plotDir, '07_boxplot_exonicExpression_stratifiedByAntisenseSignal_splitByCelltype_allAnchors.png'),
  plot=pl7a, width=7, height=5, dpi=200
)

# exonic signal per antisense-ratio bin, one panel per celltype (valid anchors)
pl7b <- ggplot(
  data=antisenseTranscriptionFeatures.list$antisenseRatio,
  aes(x=antisenseRatioBin, y=exons)
) +
  geom_boxplot() +
  facet_wrap(.~celltype) +
  ggtitle('Distribution of exonic expression for the valid anchors,\nstratified by antisense ratio, in all celltypes') +
  theme(plot.title=element_text(size=9), axis.text.x=element_text(size=8))
ggsave(
  filename=paste0(plotDir, '07_boxplot_exonicExpression_stratifiedByAntisenseRatio_splitByCelltype_allValidAnchors.png'),
  plot=pl7b, width=7, height=5, dpi=200
)


## K4me3 level

# K4me3 signal per antisense bin, one panel per celltype (all anchors)
pl8a <- ggplot(
  data=antisenseTranscriptionFeatures.list$antisenseOnly,
  aes(x=antisenseBin, y=K4me3)
) +
  geom_boxplot() +
  facet_wrap(.~celltype) +
  ggtitle('Distribution of K4me3 signal in anchor +/- 1kb for all 11140 anchors, stratified by antisense signal, in all celltypes') +
  theme(plot.title=element_text(size=9))
ggsave(
  filename=paste0(plotDir, '08_boxplot_K4me3signal_stratifiedByAntisenseSignal_splitByCelltype_allAnchors.png'),
  plot=pl8a, width=7, height=5, dpi=200
)

# K4me3 signal per antisense-ratio bin, one panel per celltype (valid anchors)
pl8b <- ggplot(
  data=antisenseTranscriptionFeatures.list$antisenseRatio,
  aes(x=antisenseRatioBin, y=K4me3)
) +
  geom_boxplot() +
  facet_wrap(.~celltype) +
  ggtitle('Distribution of K4me3 signal in anchor +/- 1kb for the valid anchors,\nstratified by antisense ratio, in all celltypes') +
  theme(plot.title=element_text(size=9), axis.text.x=element_text(size=8))
ggsave(
  filename=paste0(plotDir, '08_boxplot_K4me3signal_stratifiedByAntisenseRatio_splitByCelltype_allValidAnchors.png'),
  plot=pl8b, width=7, height=5, dpi=200
)

# Load liver data (different anchors than testis)

# Read the serialized liver list from disk; the names() call is an
# interactive sanity check of its components (expected output noted below).
data.list.liver <- readRDS('/tungstenfs/groups/gpeters/rohmalex/work/Mark/RNA-seq_181127/32_transcriptomeValidation/140_K4me3forAntisenseTranscription_inExternalDatasets/MouseLiver/RData/list_countsExonsAndAntisenseAndK4me3_txomeMetadataDf_exonsAndK4me3regionsToCountGR.rds')
names(data.list.liver)
# [1] "log2.rpkm.exons"     "log2.rpkm.antisense" "log2.rpkm.K4me3"     "K4me3toCountGR"      "exonsToCountGR"     
# [6] "K4anchorsGR"


# Long-format table for the testis + liver comparison.
# Testis part: columns 1..5 of the exons/antisense matrices are stacked on top
# of each other and labelled LZ, PD, RS, EES, LES (one run of rows per label).
# For K4me3 the column index vector is c(1,2,3,4,4), i.e. column 4 is reused
# for the last two labels.
pl1BisDf <- data.frame(
  exons=unlist(lapply(seq_len(5), function(j) data.list$log2.rpkm.exons[, j])),
  antisense=unlist(lapply(seq_len(5), function(j) data.list$log2.rpkm.antisense[, j])),
  K4me3=unlist(lapply(c(1,2,3,4,4), function(j) data.list$log2.rpkm.K4me3[, j])),
  celltype=rep(c('LZ','PD','RS','EES','LES'),
               each=nrow(data.list$log2.rpkm.exons)),
  stringsAsFactors=FALSE
)

# Liver part: exons/antisense are row means over all liver columns; K4me3 is
# taken from the single 'liver.h3k4me3' column.
pl1BisDfTmp <- data.frame(
  exons=rowMeans(data.list.liver$log2.rpkm.exons),
  antisense=rowMeans(data.list.liver$log2.rpkm.antisense),
  K4me3=data.list.liver$log2.rpkm.K4me3[, 'liver.h3k4me3'],
  celltype='liver',
  stringsAsFactors=FALSE
)

# Append liver below testis and fix the facet order to order of appearance.
pl1BisDf <- rbind(pl1BisDf, pl1BisDfTmp)
pl1BisDf$celltype <- factor(pl1BisDf$celltype,
                            levels=unique(pl1BisDf$celltype))

# Hex-binned density of exonic vs antisense expression, one facet per celltype
# (testis stages plus liver). Fill uses a log2-transformed viridis scale and
# the dashed red line marks y = x.
pl1Bisa <- ggplot(pl1BisDf, aes(x=exons, y=antisense)) +
  geom_hex(bins=50) +
  scale_fill_viridis(trans='log2') +
  facet_wrap(~celltype) +
  geom_abline(slope=1, intercept=0, color='red', linetype='dashed') +
  labs(title='Scatter with exonic expression vs antisense for all 11140 testis anchors, in all\ntestis celltypes and 9554 liver anchors\nlog scale for density') +
  theme(plot.title=element_text(size=11))
ggsave(paste0(plotDir, '01bis_scatter_exonicVsAntisense_splitByCelltype_allAnchors_plusLiver.png'),
       plot=pl1Bisa, width=7, height=5, dpi=200)

# Bin the K4me3 signal, then compare exonic and antisense expression per bin.
# Breaks span 0..17 with the lowest edge included; values outside that range
# fall in no interval and get an NA bin.
pl1BisDf$K4me3Bin <- cut(x=pl1BisDf$K4me3, breaks=c(0, 3, 5, 7, 9, 11, 17), include.lowest=TRUE)

# Long format: one row per (anchor, measured variable); K4me3Bin and celltype
# are carried along as identifiers.
pl1BisDfTmp <- reshape2::melt(pl1BisDf, id.vars=c('K4me3Bin', 'celltype'))

# Keep only the exons/antisense rows. A logical mask is used instead of
# -which(...): a negative index built from an empty which() would select zero
# rows rather than leaving the table untouched.
pl1BisDfTmp <- pl1BisDfTmp[pl1BisDfTmp$variable != 'K4me3', ]

pl1Bisb <- ggplot(data=pl1BisDfTmp, aes(x=K4me3Bin, y=value, fill=variable)) +
  geom_boxplot(outlier.alpha=0.1) + facet_wrap(.~celltype) +
  ggtitle('Distribution of exonic and antisense expression,\nstratified by bins of K4me3 expression at anchor +/- 1kb') +
  ylab('log2rpkm') + theme(plot.title=element_text(size=11))
ggsave(filename=paste0(plotDir, '01bis_boxplots_exonicAndAntisenseVsK4me3Bins_splitByCelltype_allAnchors_plusLiver.png'), plot=pl1Bisb,
       width=7, height=5, dpi=200)


# Separate between CGI and non CGI promoters
# -> info in data.list$K4me3toCountGR and data.list.liver$K4me3toCountGR
# Attach the CGI flag to every row: the testis flags are repeated once per
# testis celltype (5 times), followed by the liver flags, matching the row
# order in which pl1BisDf was stacked.
pl1BisDf$CGI <- c(rep(data.list$K4me3toCountGR$CGI, 5), data.list.liver$K4me3toCountGR$CGI)

# Turn the flag into a label in one step. This replaces the previous two-pass
# relabelling, which relied on the column being silently coerced from logical
# to character by the first partial assignment. NA flags stay NA.
pl1BisDf$CGI <- ifelse(pl1BisDf$CGI, 'CGI', 'nonCGI')

# Long format with CGI kept as an identifier, then drop the melted K4me3 rows.
# A logical mask is used instead of -which(...), which would return zero rows
# if nothing matched.
pl1BisDfTmp <- reshape2::melt(pl1BisDf, id.vars=c('K4me3Bin', 'celltype', 'CGI'))
pl1BisDfTmp <- pl1BisDfTmp[pl1BisDfTmp$variable != 'K4me3', ]

# One facet per CGI class x celltype combination, laid out on 4 rows.
pl1Bisc <- ggplot(data=pl1BisDfTmp, aes(x=K4me3Bin, y=value, fill=variable)) +
  geom_boxplot(outlier.alpha=0.1) + facet_wrap(.~CGI+celltype, nrow=4) +
  ggtitle('Distribution of exonic and antisense expression,\nstratified by bins of K4me3 expression at anchor +/- 1kb') +
  ylab('log2rpkm') + theme(plot.title=element_text(size=11))
ggsave(filename=paste0(plotDir, '01bis_boxplots_exonicAndAntisenseVsK4me3Bins_splitByCelltypeAndCGIpromoter_allAnchors_plusLiver.png'), plot=pl1Bisc,
       width=7, height=7, dpi=250)

# Same boxplots restricted to rows labelled 'CGI' (rows with an NA label are
# excluded), one facet per celltype.
pl1Bisd <- ggplot(pl1BisDfTmp[pl1BisDfTmp$CGI %in% 'CGI', ],
                  aes(x=K4me3Bin, y=value, fill=variable)) +
  geom_boxplot(outlier.alpha=0.1) +
  facet_wrap(~celltype) +
  labs(title='Distribution of exonic and antisense expression,\nstratified by bins of K4me3 expression at anchor +/- 1kb\nFor CGI promoters',
       y='log2rpkm') +
  theme(plot.title=element_text(size=11))
ggsave(paste0(plotDir, '01bis_boxplots_exonicAndAntisenseVsK4me3Bins_splitByCelltype_CGIpromoters_allAnchors_plusLiver.png'),
       plot=pl1Bisd, width=7, height=5, dpi=200)
# Same boxplots restricted to rows labelled 'nonCGI' (rows with an NA label
# are excluded), one facet per celltype.
pl1Bise <- ggplot(pl1BisDfTmp[pl1BisDfTmp$CGI %in% 'nonCGI', ],
                  aes(x=K4me3Bin, y=value, fill=variable)) +
  geom_boxplot(outlier.alpha=0.1) +
  facet_wrap(~celltype) +
  labs(title='Distribution of exonic and antisense expression,\nstratified by bins of K4me3 expression at anchor +/- 1kb\nFor non CGI promoters',
       y='log2rpkm') +
  theme(plot.title=element_text(size=11))
ggsave(paste0(plotDir, '01bis_boxplots_exonicAndAntisenseVsK4me3Bins_splitByCelltype_nonCGIpromoters_allAnchors_plusLiver.png'),
       plot=pl1Bise, width=7, height=5, dpi=200)



## P-values between the boxplots on pctGC vs antisenseBin
# Annotation table for the pctGC boxplots: for each of the 5 testis celltypes,
# one row per pair of adjacent antisense bins (levels 1-2, 2-3, 3-4).
# x = 1.5 / 2.5 / 3.5 are the label positions used by geom_text (halfway
# between consecutive integer positions); y = 80 is a fixed label height.
pl5BisAnnotDf <- data.frame(celltype=rep(c('LZ','PD','RS','EES','LES'), each=3),
                            antisenseBin1=rep(levels(antisenseTranscriptionFeatures.list$antisenseOnly$antisenseBin)[1:3], 5),
                            antisenseBin2=rep(levels(antisenseTranscriptionFeatures.list$antisenseOnly$antisenseBin)[2:4], 5),
                            x=rep(c(1.5,2.5,3.5), 5),
                            y=80,
                            stringsAsFactors=FALSE)

# For every row, run t.test() (default settings) on pctGC between the two bins
# within that celltype and store the p-value as text in scientific notation.
# vapply() with character(1) guarantees a character vector whatever the number
# of rows, and seq_len() is safe for a zero-row table (1:0 would iterate twice).
pl5BisAnnotDf$pval <- vapply(seq_len(nrow(pl5BisAnnotDf)), function(i) {
  feat <- antisenseTranscriptionFeatures.list$antisenseOnly
  inCelltype <- feat$celltype == pl5BisAnnotDf$celltype[i]
  x <- feat$pctGC[which(inCelltype & feat$antisenseBin == pl5BisAnnotDf$antisenseBin1[i])]
  y <- feat$pctGC[which(inCelltype & feat$antisenseBin == pl5BisAnnotDf$antisenseBin2[i])]
  formatC(t.test(x=x, y=y)$p.value, format="e", digits=1)
}, FUN.VALUE=character(1))

# Convert the identifier columns to factors: celltype in order of appearance,
# bins with the same level set as the plotted data.
pl5BisAnnotDf$celltype <- factor(pl5BisAnnotDf$celltype, levels=unique(pl5BisAnnotDf$celltype))
pl5BisAnnotDf$antisenseBin1 <- factor(pl5BisAnnotDf$antisenseBin1, levels=levels(antisenseTranscriptionFeatures.list$antisenseOnly$antisenseBin))
pl5BisAnnotDf$antisenseBin2 <- factor(pl5BisAnnotDf$antisenseBin2, levels=levels(antisenseTranscriptionFeatures.list$antisenseOnly$antisenseBin))

# pctGC per antisense-signal bin, one facet per celltype, with the p-value
# strings from pl5BisAnnotDf drawn at their (x, y) positions.
pl5Bisa <- ggplot(antisenseTranscriptionFeatures.list$antisenseOnly,
                  aes(x=antisenseBin, y=pctGC)) +
  geom_boxplot() +
  facet_wrap(~celltype) +
  geom_text(data=pl5BisAnnotDf, aes(x=x, y=y, label=pval), size=3) +
  labs(title='Distribution of GC% in anchor +/- 1kb for all 11140 anchors, stratified by antisense signal, in all celltypes\nSome anchors have no associated K4me3 peak in some celltypes') +
  theme(plot.title=element_text(size=9))
ggsave(paste0(plotDir, '05bis_boxplot_pctGC_stratifiedByAntisenseSignal_splitByCelltype_allAnchors_withPvalues.png'),
       plot=pl5Bisa, width=7, height=5, dpi=200)

####