############ Script: heatmap of ERV repeat expression
# author: JAttig
# date: June 21st 2017
# Rversion: 3.3.2
#
#
### save base stats of TPM expression tables for RNA-seq of tumour and benign tissue
# 1) identify transcripts with less than 1 TPM across all tumour types in any sample
# 2) identify transcriptstoo widely expressed in benign
# 3) calculate median of all transcripts in each tumour type
# 4) calculate 75th percentile of all transcripts in each tumour
# 5) identify and deselect all transcriptswith 75th percentile less than 1 TPM in all tumour-type



setwd("/Users/attigj/Documents/Jan.Crick/computational.analysis/playground/")



### packages
suppressMessages(library(parallel))
suppressMessages(library(reshape2))
library("plyr")




####################
##### retrieve annotation
annot.colnames <-  c("sampleID", "consortium", "project_disease_type", "tissue_origin", "organismpart", "tumourbenign", "gender")
key.colnames <-  c("project_disease_type", "tissue_origin", "tumourbenign")

load("/camp/lab/kassiotisg/working/ERVAXX/sample_annotation/TCGAdata.DiscoverySet.sampleannotation.RData")
DiscoverySet$sampleID <- gsub("_gdc_realn_rehead.bam", "", DiscoverySet$file_name)

# arrange sample.annot by tissue group
DiscoverySet <- arrange(DiscoverySet, tissue_origin, tumourbenign, project_disease_type)




####################
##### inputtable: TPM values in each sample

TPMtable.ma <- readRDS("./TPMtables/TCGAsamples.TPMtable.ma.rds")



####################
##### inputtable: sort by groups
### select tumour samples and match TPMtable and sample annotation
DiscoverySetTumours <- DiscoverySet [ DiscoverySet$tumourbenign == "Primary" |
                                         (DiscoverySet$tumourbenign == "Metastasis" & DiscoverySet$tissue_origin == "Skin"), ]

### QC !!! check all samples are in annotation file
summary(colnames(TPMtable.ma) %in% DiscoverySetTumours$sampleID)
# all accounted for


#arrange count.Table according to sample annotation
m <- match( DiscoverySetTumours$sampleID, colnames(TPMtable.ma))
summary(is.na(m)) ###QC. 
TPMtable.ma <- TPMtable.ma [ , m]

if ( !identical(DiscoverySetTumours$sampleID,  colnames(TPMtable.ma) )){
   stop("not all files have metadata annotation associated with them", call.=FALSE)
}





####################
##### identify transcripts with less than 1 TPM across all tumour types in any sample

maxTPM.perTranscript <- apply(TPMtable.ma,1,max)
summary(maxTPM.perTranscript < 1)  #248765 out of 1001931 are less than 1TPM in all the samples
del <- maxTPM.perTranscript < 1
TPMtable.ma <- TPMtable.ma [ !del , ]


####################
##### too widely expressed in benign
##### all transcripts with TPM > 10 in 10% of the benign samples

normals.TPMmatrix <- readRDS("./normalsTPM.ma.rds")

filter <- normals.TPMmatrix >10 
filter <- rowSums( filter) >  (ncol(filter) * 0.1)  
transcriptsOfInterest.lvl1 <- names(filter)  [!filter]



                
####################
##### calculate median of all transcripts in each tumour type

require("doParallel")
require("dplyr")

##### implementation with foreach, subsetting and ddplyr
TPMtable.df <- as.data.frame(TPMtable.ma)
TPMtable.df <- cbind( t(TPMtable.df), DiscoverySetTumours [ , key.colnames ])
trancsriptNames <- rownames(TPMtable.ma)

registerDoParallel(cores=24)  ### significant boost by parallisation ... 
medians.list <- foreach(i=1:length(trancsriptNames)) %dopar% {
   tmp.df <- .subset2(TPMtable.df, i ) 
   tmp.df <- cbind( DiscoverySetTumours, transcript = tmp.df )
   transcript.median <- tmp.df %>%
      group_by(project_disease_type, tissue_origin, tumourbenign) %>% 
      summarise(median = median( transcript ))
   
   transcript.median$transcript = trancsriptNames[i]
   return(transcript.median)
}


require("data.table")
medians.mergedtable <- data.table::rbindlist(medians.list)
head(medians.mergedtable)

saveRDS(medians.mergedtable, file = "TCGAsamples.medians.rds")
TCGAsamples.medians <- readRDS("TCGAsamples.medians.rds")




####################
##### calculate 75th percentile of all transcripts in each tumour type
### to test for minimum expression in at least 25% of samples have min TPM of 1!

require("doParallel")
require("dplyr")

##### implementation with foreach, subsetting and ddplyr
### did extensive performance and implementation testing on this ... it's slower but more functional than data.table

TPMtable.df <- as.data.frame(TPMtable.ma)
TPMtable.df <- cbind( t(TPMtable.df), DiscoverySetTumours [ , key.colnames ])
transcriptNames <- rownames(TPMtable.ma)

registerDoParallel(cores=24)  ### significant boost by parallisation on camp ... 
upperQuartile.ls <- foreach(i=1:length(transcriptNames)) %dopar% {
   tmp.df <- .subset2(TPMtable.df, i ) 
   tmp.df <- cbind( DiscoverySetTumours, transcript = tmp.df )
   transcript.upperQuartile <- tmp.df %>%
      group_by(project_disease_type, tissue_origin, tumourbenign) %>% 
      summarise(upperQuartile = quantile(transcript, probs=0.75, na.rm=FALSE))
   
   transcript.upperQuartile$transcript = transcriptNames[i]
   return(transcript.upperQuartile)
}

upperQuartiles.df <- data.table::rbindlist(upperQuartile.ls)



#################### 
##### identify and deselect all transcripts with 75th percentile less than 1 TPM in all tumour-type
### select from upperQuartiles.df by sorting on median value

upperQuartiles.df <- arrange(upperQuartiles.df, desc(upperQuartile))
MaxupperQuartiles.df <- upperQuartiles.df [ !duplicated(upperQuartiles.df$transcript) ,]

# remove transcripts with max(median(TPM)) < 1 from tables
transcriptsOfInterest.lvl2 <- MaxupperQuartiles.df$transcript [ MaxupperQuartiles.df$upperQuartile >= 1]





q("no")