############ Script: annotate Trinity/GMAP flattened_transcriptome.gff3
# author: JAttig
# date: June 29th 2017
# Rversion: 3.3.2
#
#
### annotate flattened_transcriptome.gff3 
# used cuffcompare to infer the corresponding transcript in GENCODEv24 basic annotation 
# infer flattened_transcriptome.gff3 strand
# add cuffcompare-annotations onto transcripts
# add annotations onto each exon if it overlaps with LTR element
#
### NOTES
# update April 2018: did not add GENCODE details based on cuffcompare but based on exon overlaps (exact > partial; then collapsed)



setwd("/Users/attigj/Documents/Jan.Crick/computational.analysis/playground/")
source("/Users/attigj/Documents/R.sourcefunctions/convert.magic.R")
# courtesy of Bill Petti, see
source("https://gist.githubusercontent.com/BillPetti/c83a471face493466fa3/raw/e94e5b6641eb8c34b99eee8161fdefe014ec7219/convert.magic.R")


source("/Users/attigj/Documents/R.sourcefunctions/rangeoverlapper.R")
# courtesy of Thomas Girke, see
source("http://faculty.ucr.edu/~tgirke/Documents/R_BioCond/My_R_Scripts/rangeoverlapper.R")



library("rtracklayer")
library("GenomicRanges")
require("data.table")
   
     

#################### import transcriptome genomic annotation
### load as data.table
transcriptome.dt <- fread("gzcat < ./flattened_transcriptome.gff3.gz", sep="\t", sep2=";") 
colnames(transcriptome.dt) <- c("seqnames", "source", "type", "start", "end", "score", "phase", "strand", "group")

transcriptome.dt [,  c("ID", "Parent") := tstrsplit(group, ";", fixed=TRUE, names= c("ID", "Parent")) ]
transcriptome.dt [, c("score", "phase", "source", "group") := NULL ]
transcriptome.dt [, strand := factor("*", levels=c("+", "-", "*")) ]
transcriptome.dt [, Parent := gsub("Parent=", "", Parent)]
transcriptome.dt [, ID := gsub("ID=", "", ID)]


### visual inspection
str(transcriptome.dt)
summary(transcriptome.dt$type)
summary(is.na(transcriptome.dt$type))


##### separate exons and transcripts
exons.dt <- transcriptome.dt [ transcriptome.dt$type == "exon" ,]
transcripts.dt <- transcriptome.dt [ transcriptome.dt$type == "transcript" ,]

### generate GRange
exons.gr <- as( transcriptome.dt [ transcriptome.dt$type == "exon" , ],  "GRanges") 
exons.gr <- sort.GenomicRanges(exons.gr)






#################### transcript annotation
########## count number of exons for each transcript
exonnumber <- exons.dt [ , .N , by = "Parent"]
summary(exonnumber$N)


########## sum length each transcript
transcript.length <- as.data.table(data.frame(Parent = exons.gr$Parent, width = width(exons.gr)))
setkey(transcript.length, Parent)
transcript.length <- transcript.length [ , sum(width), by = "Parent"]
transcript.length [, Parent := as.character(Parent)]
setnames(transcript.length, "V1", "length")

#add both onto transcripts
m <- match( exonnumber$Parent, transcript.length [, Parent])
exonnumber$transcriptLength <- transcript.length$length[m]

setkey(transcripts.dt, ID)
setkey(exonnumber, Parent)
setnames(exonnumber, c("Parent", "N"), c("ID", "exonNumber"))
transcripts.dt <- exonnumber [transcripts.dt, on = "ID"]


# doublecheck transcript length assignment
m <- match( transcripts.dt [, ID], transcript.length [, Parent])
transcripts.dt [ , length := transcript.length [ m, length]]




########## number and length of ORFs in each trancript
ORFs.dt <- fread("./flattened_transcriptome.orf_scanner.gff3" )
colnames(ORFs.dt) <- c("ID", "source", "type", "start", "end", "score", "strand", "phase", "name")
setkey(ORFs.dt, ID)
ORFs.dt [, length := end - start ]
ORFs.dt [ ,`:=`(ORFnumber = .N), by = "ID" ]
ORFs.dt <- setorder(ORFs.dt, ID, -length)
ORFs.dt [ , c("source", "type", "start", "end", "score", "strand", "phase", "name") := NULL ]
ORFs.dt <- ORFs.dt [ !duplicated(ID) , ]
transcripts.dt <- ORFs.dt [transcripts.dt, on = "ID"]





####################  output of cuffcompare
### save .gff as .RData and split transcripts and exons
cuffcompareOutput.gr <- import.gff(gzfile("flattened_transcriptome.onGENCODEv24basic.gtf.gz"))
cuffcompareOutput.dt <- as.data.table(as.data.frame(cuffcompareOutput.gr))
setnames(cuffcompareOutput.dt, c("oId", "gene_name", "nearest_ref"), c("ID", "nearest_ref_gene", "nearest_ref_ensembl_transcriptID"))

### sort cuffcompare datatable
cuffcompareOutput.dt [, c("score", "phase", "source") := NULL ]
cuffcompareOutput.dt [, strand := factor("*", levels=c("+", "-", "*")) ]

cuffcompareOutput.transcriptref <- cuffcompareOutput.dt[, c("ID", "nearest_ref_ensembl_transcriptID", "nearest_ref_gene")]
setkeyv(cuffcompareOutput.transcriptref, c("ID", "nearest_ref_ensembl_transcriptID", "nearest_ref_gene"))
setorder(cuffcompareOutput.transcriptref, ID)

### collapse duplicates
cuffcompareOutput.transcriptref <- unique(cuffcompareOutput.transcriptref)
#cuffcompareOutput.transcriptref <- cuffcompareOutput.transcriptref [, lapply( .SD, function(column) paste(unique(column), collapse=";")) , .SDcols = -c("ID"), by = ID  ]


### add to transcripts.dt
transcripts.dt <- merge(transcripts.dt, cuffcompareOutput.transcriptref, 
                        all.x = T, all.y=FALSE, by = "ID")


### QC
transcripts.dt [ ID == "transa448f15a974bbc97", ]




#################### annotate exons
#there is no strand associated with annotation!

##### set strand based on identical exons in GENCODE
### GENCODE v24:
load("/Users/attigj/Documents/Jan.Crick/computational.analysis/annotation.files_local/Homo_sapiens/GENCODEannotation/gencode.v24.basic.exons.RData")
mcols(GENCODE.exons) <- subset(mcols(GENCODE.exons), select = -c(group, score, phase, source, type))


### identical Ranges only
overlaps <- findOverlaps( exons.gr, GENCODE.exons,type = "equal" )  #4,782,302 matches | 1,616,552 exons
overlaps.df <- as.data.frame(overlaps)
overlaps.df$ID = as.character( exons.gr$ID [ overlaps.df$queryHits ] )
overlaps.df <- convert.magic(overlaps.df, c("numeric", "numeric", "character"))

exons.gr$exactAnnotated <- exons.gr$ID %in% overlaps.df$ID
exons.gr$GENCODEexonID <- NA
exons.gr$GENCODEexonID [ overlaps.df$queryHits ] <- GENCODE.exons$exonID [ overlaps.df$subjectHits ]


### table partial annotation
overlaps <- as.data.table(findOverlaps( exons.gr, GENCODE.exons,type = "any" ))
overlaps [ , ID := as.character( exons.gr$ID [ overlaps$queryHits ] ) ]
exons.gr$partialAnnotated <- exons.gr$ID %in% overlaps$ID



########## infer strand were possible based on identical matches
##### exons of the same transcript are on the same strand...
overlaps.df$likelyStrand <- as.character(strand(GENCODE.exons)) [ overlaps.df$subjectHits ]

### ref.table with transript strand information
overlaps.df$transcript <- as.character(exons.gr$Parent) [ overlaps.df$queryHits ]
strand.ref <- overlaps.df [ c("transcript", "likelyStrand")]
strand.ref <- strand.ref [ !duplicated(strand.ref) ,]
# are there transcripts with more than one strand associated ? set them to *
summary(duplicated(strand.ref$transcript))
bothStrands <- strand.ref$transcript [ duplicated(strand.ref$transcript) ]
strand.ref$likelyStrand [ strand.ref$transcript %in% bothStrands ] <- "*"


#add onto exons... match selects by default the first match on duplicates 
m <- match( as.character(exons.gr$Parent), strand.ref$transcript )
exons.gr$likelyStrand <- as.factor(strand.ref$likelyStrand [ m ])
strand(exons.gr) <- ifelse( is.na( exons.gr$likelyStrand), "*", as.character(exons.gr$likelyStrand))


#add onto transcripts
m <- match( as.character(transcripts.dt$ID), strand.ref$transcript )
transcripts.dt$likelyStrand <- as.factor(strand.ref$likelyStrand [ m ])
transcripts.dt$strand <- ifelse( is.na( transcripts.dt$likelyStrand), "*", as.character(transcripts.dt$likelyStrand))




########## add gencode details
##### on exons:
### from cuffcompare .. 
cuffcompareOutput.transcriptref <- unique(cuffcompareOutput.transcriptref)
setnames(cuffcompareOutput.transcriptref, "ID", "Parent")
mcols.temp <- as.data.frame(mcols(exons.gr))
mcols.temp <- as.data.table(mcols.temp)
setkey(mcols.temp, Parent)
mcols.temp <- cuffcompareOutput.transcriptref [ mcols.temp, on ="Parent", nomatch=NA ]

exons.gr [ exons.gr$Parent == "transe2cb3632f4571b02"]
mcols.temp [ mcols.temp$Parent == "transe2cb3632f4571b02"]

if ( identical( mcols.temp$ID, exons.gr$ID )) { 
   mcols(exons.gr) <- mcols.temp
   rm(mcols.temp) } else { 
      print("mcols.temp is not in the same order as exons.gr, resorting ... ")
      m <- match( as.character(exons.gr$ID), mcols.temp$ID )
      mcols(exons.gr) <- mcols.temp [ m ]
   }



###  based on GENCODE exonID ... 
# add ID from partial annotation

overlaps <- as.data.table(findOverlaps( exons.gr, GENCODE.exons,type = "any", ignore.strand=FALSE ))
overlaps [ , ID := as.character( exons.gr$ID [ overlaps$queryHits ] ) ]
overlaps [ , GENCODEexonID := as.character( GENCODE.exons$exonID [ overlaps$subjectHits ] ) ]

#collapse duplicates by largest overlap
source("/Users/attigj/Documents/R.sourcefunctions/rangeoverlapper.R")



rangeInfo <- olRanges(exons.gr, GENCODE.exons )
overlaps <- as.data.table(as.data.frame(mcols(rangeInfo)[ c("ID", "Qindex", "Sindex", "OLlength", "OLpercQ", "OLpercS", "OLtype")]))
setkey(overlaps, ID)

#advantage protein coding genes and transcript support level 1 ...
overlaps <- cbind( overlaps, as.data.frame(mcols(GENCODE.exons)) [ overlaps [ , Sindex], ]  )

overlaps <- overlaps [ OLpercQ > 10 & (OLtype != "contained" | OLpercQ > 20) , ]
overlaps [ , ttypeSimpel :=  factor("other", levels= c("protein_coding", "lincRNA", "smallRNA", "other")) ]
overlaps [ transcript_type %in% c("protein_coding", "nonsense_mediated_decay"), ttypeSimpel := "protein_coding" ]
overlaps [ transcript_type %in% c("3prime_overlapping_ncrna", "antisense", "bidirectional_promoter_lncrna", "lincRNA", "macro_lncRNA", "processed_transcript", "sense_intronic", "sense_overlapping" ), 
           ttypeSimpel := "lincRNA" ]
overlaps [ transcript_type %in% c("miRNA", "Mt_tRNA", "Mt_rRNA", "snRNA", "scaRNA", "sRNA", "snoRNA", "rRNA" ), ttypeSimpel := "smallRNA" ]

setorder(overlaps, ID, transcript_support_level, ttypeSimpel, -OLpercQ)
overlaps <- overlaps [ !duplicated(ID), ]; rm(rangeInfo); gc()
table(overlaps$OLtype)

### exons.gr mcols - create reference and only replace GENCODEexonID if it's not present already
temp.exonRef.dt <- data.table( queryHits = 1:length(exons.gr), GENCODEexonID = exons.gr$GENCODEexonID, 
                               Parent = exons.gr$Parent) 
setkey(temp.exonRef.dt, queryHits)

temp.exonRef.dt <- merge(temp.exonRef.dt, overlaps, by.x = "queryHits", by.y = "Qindex", all.x = T)
#QC: temp.exonRef.dt [ Parent == "trans001cfff345c5d253" ]


#override entries based on GENCODEexonID to unify exactMatches and bestMatches
m <- match(temp.exonRef.dt$GENCODEexonID, GENCODE.exons$exonID)
temp.exonRef.dt [ which(!is.na(m)), c("geneID", "gene_type", "genelevel", "transcriptID", "transcript_type",
                               "transcript_support_level") :=
                     as.data.frame(mcols(GENCODE.exons) [m [ !is.na(m)], c("geneID", "gene_type", "genelevel", "transcriptID", "transcript_type", "transcript_support_level")])]
temp.exonRef.dt <- temp.exonRef.dt [ , c("queryHits", "GENCODEexonID", "Parent", "OLpercQ", "geneID", "gene_type", "genelevel", "transcriptID", "transcript_type",
                                         "transcript_support_level")]
setnames(temp.exonRef.dt, "Parent", "carryovertranscriptID")
setnames(temp.exonRef.dt, "carryovertranscriptID", "Parent")

exons.gr$GENCODEexonID <- NULL
exons.gr$queryHits <- NULL

mcols(exons.gr) <- cbind(mcols(exons.gr), temp.exonRef.dt  [ , -c("queryHits")] )
exons.gr [ exons.gr$Parent == "trans001cfff345c5d253"]




########## add gencode details
### on transcripts
setkey(temp.exonRef.dt, Parent)
temp.exonRef.dt <- temp.exonRef.dt [ , -c("GENCODEexonID", "OLpercQ")]
exonRef.collapsed.dt <- temp.exonRef.dt [, lapply( .SD, function(column) paste( sort(unique(column)), collapse=";")) , .SDcols = -c("Parent"), by = Parent  ]
setnames(exonRef.collapsed.dt, c("Parent", "transcriptID", "queryHits"), c("ID", "ensembl_transcriptID", "exonHits"))

transcripts.dt [ , c("geneID", "ensembl_transcriptID", "gene_type", "transcript_type", "transcript_support_level") := NULL ]
key(exonRef.collapsed.dt)

#bin by ID
setkey(transcripts.dt, ID)
transcripts.dt <- exonRef.collapsed.dt [ transcripts.dt, ]
transcripts.dt [ ID =="trans001cfff345c5d253", ]





##### annotation type: non-annotated vs partially annotated
# count if all exons of a transcript are outside GENCODE

exons.gr
countExons.dt <- data.table( ID = exons.gr$Parent, exactAnnotated = exons.gr$exactAnnotated, partialAnnotated = exons.gr$partialAnnotated )
setkey(countExons.dt, ID)

countExons.dt <- countExons.dt [ , list( exactAnnotated = sum(exactAnnotated), partialAnnotated = sum(partialAnnotated)), by = key(countExons.dt)]
transcripts.dt <- countExons.dt [transcripts.dt , ]

transcripts.dt [ , summaryAnnotation := ifelse( partialAnnotated == 0, "unannotated",
                                                ifelse( exactAnnotated == exonNumber, "known", "partial" ))]




#################### repeat annotation
##### add to exons.gr and cuffcompareOutput.transcriptref

### repeat annotation
print(load("~/Documents/Jan.Crick/computational.analysis/annotation.files_local/Homo_sapiens/RptMasking/GRCh38.78/Rpts.Dfam2annotated.GRCh38.78.GRange.RData"))
repeat.gtf
seqnames(repeat.gtf)

mcols(repeat.gtf) <- mcols(repeat.gtf)[ c("repFamily", "repeatIDs")]
repeat.gtf$repFamily <- gsub("?", "", repeat.gtf$repFamily, fixed=TRUE )


##### annotate if any exon overlaps with an LTR
# find overlaps of LTR repeats and exons
LTRrepeats.gtf <- repeat.gtf [ grepl("LTR", repeat.gtf$repeatIDs) ]
seqlevels(exons.gr) [ (! (seqlevels(exons.gr) %in% seqlevels(LTRrepeats.gtf)))]
overlaps <- findOverlaps( exons.gr, LTRrepeats.gtf,type = "any", ignore.strand = TRUE )  #273274 matches | 234,895 exons
overlaps.dt <- as.data.table(overlaps)
overlaps.dt <- overlaps.dt [, c("repClass", "repName", "repeatIDs") := as( mcols(LTRrepeats.gtf) [ overlaps.dt$subjectHits, c("repClass", "repName", "repeatIDs")], "list") ]
setkey(overlaps.dt, queryHits, subjectHits)

#table all columns
overlaps.collapsed.dt <- overlaps.dt [, lapply( .SD, function(column) paste(unique( column ), collapse=";")) , .SDcols = -c("queryHits"), by = queryHits  ]
overlaps.collapsed.dt$ID <- exons.gr$ID [ overlaps.collapsed.dt$queryHits ]

# merge with exons.gr
mcols.temp <- as.data.frame(mcols(exons.gr))
mcols.temp <- as.data.table(mcols.temp)
setkey(mcols.temp, ID)
setkey(overlaps.collapsed.dt, ID)
mcols.temp <- merge(mcols.temp, overlaps.collapsed.dt [ , .(ID, queryHits, repClass, repName, repeatIDs) ], by= "ID", all.x=TRUE)
exons.gr$LTRderived <- ifelse( (1:length(exons.gr)) %in% queryHits(overlaps), TRUE, FALSE  )

if ( identical( mcols.temp$ID, exons.gr$ID )) { 
   mcols(exons.gr) <- mcols.temp
   rm(mcols.temp) } else { 
      print("mcols.temp is not in the same order as exons.gr, resorting ... ")
      m <- match( as.character(exons.gr$ID), mcols.temp$ID )
      mcols(exons.gr) <- mcols.temp [ m ]
   }


##### annotate if any exon overlaps with any repeat
repeat.gtf <- repeat.gtf [ repeat.gtf$repFamily %in% c("LTR", "LINE", "SINE", "RC", "Retroposon")]
table(repeat.gtf$repFamily)

overlaps <- findOverlaps( exons.gr, repeat.gtf,type = "any", ignore.strand = TRUE )  #1,998,654 matches | 1,391,986 exons
overlaps.dt <- as.data.table(overlaps)
overlaps.dt <- overlaps.dt [, c("repClass", "repeatIDs") := as( mcols(repeat.gtf) [ overlaps.dt$subjectHits, c("repFamily", "repeatIDs")], "list") ]
setkey(overlaps.dt, queryHits, subjectHits)

#table all columns
overlaps.collapsed.dt <- overlaps.dt [, lapply( .SD, function(column) paste(unique( column ), collapse=";")) , .SDcols = -c("queryHits"), by = queryHits  ]
overlaps.collapsed.dt$ID <- exons.gr$ID [ overlaps.collapsed.dt$queryHits ]

# merge with exons.gr
mcols.temp <- as.data.frame(mcols(exons.gr))
mcols.temp <- as.data.table(mcols.temp)
setkey(mcols.temp, ID)
setnames(mcols.temp, c("repClass", "repeatIDs"), paste("LTR", c("repClass", "repeatIDs"), sep="/"))

setkey(overlaps.collapsed.dt, ID)
mcols.temp <- merge(mcols.temp, overlaps.collapsed.dt [ , .(ID, queryHits, repClass, repeatIDs) ], by= "ID", all.x=TRUE)
mcols.temp [ , queryHits := NULL ]

if ( identical( mcols.temp$ID, exons.gr$ID )) { 
   mcols(exons.gr) <- mcols.temp
   rm(mcols.temp) } else { 
      print("mcols.temp is not in the same order as exons.gr, resorting ... ")
      m <- match( as.character(exons.gr$ID), mcols.temp$ID )
      mcols(exons.gr) <- mcols.temp [ m ]
   }

exons.gr$RTEoverlap <- ifelse( (1:length(exons.gr)) %in% queryHits(overlaps), TRUE, FALSE  )
exons.gr [ exons.gr$Parent == "trans001cfff345c5d253" ]


### impose onto transcripts:
# add Parent to overlaps.dt as ID column and merge 
overlaps.dt$ID <- exons.gr$Parent [ overlaps.dt$queryHits ]
setkey(overlaps.dt, ID)

#collapse across transcripts
overlaps.dt [, c("queryHits", "subjectHits") := NULL ]
overlaps.dt <- unique(overlaps.dt)
overlaps.collapsed.dt2 <- overlaps.dt [, lapply( .SD, function(column) paste(sort(unique( column ), collapse=";"))) , .SDcols = -c("ID"), by = ID  ]

transcripts.dt$spansRTErepeat <- ifelse( transcripts.dt$ID %in% overlaps.collapsed.dt2$ID, TRUE, FALSE  )
setkey(transcripts.dt, ID)
#setnames(transcripts.dt, c("repClass", "repName", "repeatIDs"), paste("LTR", c("repClass", "repName", "repeatIDs"), sep="/"))

setkey(overlaps.collapsed.dt2, ID)
transcripts.dt <- overlaps.collapsed.dt2 [transcripts.dt, on = "ID"]


####################
###### QC
transcripts.dt [ transcripts.dt$ID =="transa448f15a974bbc97"]
exons.gr [ exons.gr$Parent == "transa448f15a974bbc97"]



####################
##### export
saveRDS(exons.gr, file = "./exons.gr_fullyannotated_updApril2018.rds")
saveRDS(transcripts.dt, file = "./transcripts.dt_fullyannotated_updApril2018.rds")
#################### 



##### regenerate full gtf
exons.gr$geneID <- NA
exons.gr$queryHits <- NULL
exons.gr$type <- "exon"

transcripts.dt [, c("gene_type", "transcript_type", "transcript_support_level") := NULL ]
transcripts.gr <- as(transcripts.dt, "GRanges")
mcols(transcripts.gr)
transcripts.gr$exactAnnotated <- NA
transcripts.gr$GENCODEexonID <- NA
transcripts.gr$type <- "transcript"

# reshuffle exons
mcols(exons.gr) <- mcols(exons.gr) [ colnames(mcols(exons.gr)) %in% colnames(mcols(transcripts.gr)) ]
mcols(transcripts.gr) <- mcols(transcripts.gr) [ colnames(mcols(transcripts.gr)) %in% colnames(mcols(exons.gr)) ]
mcols(transcripts.gr) <- mcols(transcripts.gr) [ colnames(mcols(exons.gr))]  #fix order of columns

transcriptome.gr <- append(exons.gr, transcripts.gr)

### cufflinks doesn't link empty string as geneID
transcriptome.gr$geneID [ transcriptome.gr$geneID == "" ] <- NA

### export
transcriptome.gr <- sort.GenomicRanges(transcriptome.gr)
export(transcriptome.gr, "flattened_transcriptome_fullyannotated_updApril2018.gtf", format ="gff3")


############ session Info
# R version 3.3.2 (2016-10-31)
# Platform: x86_64-apple-darwin13.4.0 (64-bit)
# Running under: macOS Sierra 10.12.6

# locale:
#   [1] en_GB.UTF-8/en_GB.UTF-8/en_GB.UTF-8/C/en_GB.UTF-8/en_GB.UTF-8

# attached base packages:
#   [1] parallel  stats4    stats     graphics  grDevices utils     datasets  methods   base     

# other attached packages:
#   [1] data.table_1.10.4    rtracklayer_1.34.2   GenomicRanges_1.26.2 GenomeInfoDb_1.10.3 
# [5] IRanges_2.8.1        S4Vectors_0.12.1     BiocGenerics_0.20.0 

# loaded via a namespace (and not attached):
#   [1] lattice_0.20-34            XML_3.98-1.5               Rsamtools_1.26.2          
# [4] Biostrings_2.42.1          GenomicAlignments_1.10.1   bitops_1.0-6              
# [7] grid_3.3.2                 zlibbioc_1.20.0            XVector_0.14.0            
# [10] Matrix_1.2-8               BiocParallel_1.8.1         tools_3.3.2               
# [13] Biobase_2.34.0             RCurl_1.95-4.8             yaml_2.1.14               
# [16] SummarizedExperiment_1.4.0


