# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# ~~~~~~~~~ Calculate cells per gene for lentiviral libraries, coding genes and also coding and non-coding genes ~~~~~~~~~~~
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~





# ----------- Create table genecode_cen.bed from gencode, ensembl and ucsc ----------------------
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# ~~~ Go to START HERE (below) if gencode_gtf_ensembl_ucsc already constructed, to save time ~~~~~
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

# Create gene table from gencode and ensembl websites, supplemented with information from UCSC table browser
# All gencode tables are 1 based counting. ucsc tables are 0-based. I will use 1-based counting in my tables.

# ~~~~~~~ Download latest gencode GTF from gencode website ~~~~~~~~~~~
library(rtracklayer)
# Read the GENCODE v31 annotation GTF and convert the imported object straight
# into a plain data frame.
gencode_gtf <- as.data.frame(
  import("gencode.v31.annotation.gtf"),
  stringsAsFactors = FALSE
)

# Strip the ".<digits>" version suffix from gene_id and transcript_id so the
# IDs can eventually be merged with the unversioned IDs in gencode_ensembl.
gencode_gtf[c("gene_id", "transcript_id")] <- lapply(
  gencode_gtf[c("gene_id", "transcript_id")],
  function(ids) gsub("\\.[0-9]+", "", ids)
)




# Keep gene-level records only (drops exon, CDS, etc.).
gencode_gtf <- gencode_gtf[gencode_gtf$type == "gene", ]

# Keep only the columns needed downstream and rename them to the names used in
# the rest of this script. transcript_id is not kept: records of type "gene"
# have no associated transcript.
gencode_gtf <- setNames(
  gencode_gtf[, c(
    "seqnames", "start", "end", "width", "strand",
    "type", "gene_id", "gene_type", "gene_name"
  )],
  c(
    "Chromosome", "geneS", "geneE", "geneLength", "strand",
    "type", "gene_id", "gene_type", "geneSymbol"
  )
)

# Convert every factor column to character.
# vapply()/lapply() are used instead of sapply(): sapply() simplifies its result
# to a matrix, a vector or a list depending on the number of columns and rows,
# whereas lapply() always returns one list element per column, which is exactly
# what a data-frame column assignment expects.
fctr.cols <- vapply(gencode_gtf, is.factor, logical(1))
gencode_gtf[fctr.cols] <- lapply(gencode_gtf[fctr.cols], as.character)


# Drop the "type" column; all other columns keep their order.
# Reminder: ensembl and gencode coordinates are 1-based, ucsc coordinates are
# 0-based, so this must be corrected for whenever ucsc coordinates are used.
gencode_gtf <- gencode_gtf[, setdiff(names(gencode_gtf), "type")]

dim(gencode_gtf)
# [1] 60603     8


# gencode_gtf has exactly one row per gene_id, but some geneSymbols (eg RF00019)
# map to several gene_id / tx_id.

# number of distinct gene symbols
sum(!duplicated(gencode_gtf$geneSymbol))
# [1] 59050

# number of distinct gene IDs
sum(!duplicated(gencode_gtf$gene_id))
# [1] 60603


# ~~~~~~~~~~ gencode v31 from ensembl ~~~~~~~~~~~~~~~

# Supplement info with gencode downloaded from ensembl biomart web site using structure option 

# library(seqinr)
# gencode_ensembl <- read.fasta("martquery_0115051407_592.txt")
# gencode_ensembl <- do.call(rbind,strsplit(names(gencode_ensembl),'\\|'))
# gencode_ensembl <- as.data.frame(gencode_ensembl)

# # replace empty entries with NA:
# is.na(gencode_ensembl) <- do.call(cbind,lapply(gencode_ensembl, FUN = function(x) {x==""}))



# gencode_ensembl <- read.table("martquery_0115213848_796.txt", header=TRUE, stringsAsFactors = FALSE, sep = "\t",quote="")

# Or supplement with gencode info downloaded from ensembl using biomaRt. This is **superior** to above.

# library(biomaRt)
# ensembl <- useDataset("hsapiens_gene_ensembl", mart=useMart("ensembl"))


# gencode_ensembl <- list()

# # download from ensembl. Download by chromosomes, or else ensembl times out.

# for (i in c(1:22,"X","Y","MT")) {

# gencode_ensembl[[i]] <- getBM(
						# attributes=c(
							# 'chromosome_name',
							# 'start_position',
							# 'end_position',
							# 'strand',
							# 'ensembl_gene_id',
							# 'ensembl_transcript_id',
							# 'gene_biotype',
							# 'external_gene_name',
							# 'transcript_length',
							# '5_utr_start',
							# '5_utr_end',
							# '3_utr_start',
							# '3_utr_end', 
							# 'cds_length', 
							# 'description'
						# ), 
			      # filters = 'chromosome_name', 
			      # values = i, 
			      # mart = ensembl
			      # )
			      
# }


# gencode_ensembl <- do.call(rbind, gencode_ensembl)





# chrOrder <- c(1:22,"X","Y","MT")
# gencode_ensembl$chromosome_name <-factor(gencode_ensembl$chromosome_name, levels=chrOrder)
# gencode_ensembl <- gencode_ensembl[order(gencode_ensembl$chromosome_name, gencode_ensembl$start_position), ]
# gencode_ensembl$chromosome_name <- as.character(gencode_ensembl$chromosome_name)






# save 'frozen' version for archival purposes:			      
# write.table(gencode_ensembl, "gencode_ensembl_v31.txt",quote=FALSE,sep="\t",row.names=FALSE,col.names=TRUE)

# Load the 'frozen' biomaRt table.
# read.delim is used (cf https://kbroman.org/blog/2017/08/08/eof-within-quoted-string/)
# because a gene description entry contains a quote mark that was introduced
# when gencode_ensembl was produced via biomaRt.
gencode_ensembl <- read.delim(
  file = "gencode_ensembl_v31.txt",
  header = TRUE,
  sep = "\t",
  stringsAsFactors = FALSE
)

# # alternative approach: https://kbroman.org/blog/2017/08/08/eof-within-quoted-string/
# gencode_ensembl <- read.table("gencode_ensembl_v31.txt", header=TRUE, sep="\t", stringsAsFactors=FALSE, check.names=FALSE, quote="", fill=FALSE)
			      
# Rename the mitochondrial chromosome from "MT" to "M".
gencode_ensembl[gencode_ensembl$chromosome_name == "MT", "chromosome_name"] <- "M"

# Replace the column names (assigned by position) with the names used in this script.
names(gencode_ensembl) <- c(
  "Chromosome", "geneS", "geneE", "strand", "gene_id", "tx_id", "gene_type",
  "geneSymbol", "txLength", "5utrS", "5utrE", "3utrS", "3utrE", "cdsLength",
  "gene_description"
)

# Prefix chromosome names with "chr".
gencode_ensembl$Chromosome <- paste0("chr", gencode_ensembl$Chromosome)

# Recode strand from 1 / -1 to "+" / "-".
gencode_ensembl[gencode_ensembl$strand == -1, "strand"] <- "-"
gencode_ensembl[gencode_ensembl$strand == 1, "strand"] <- "+"

dim(gencode_ensembl)
# [1] 513708     15




# For every gene keep the rows whose txLength is the maximum for that gene.
# Grouping is by gene_id rather than geneSymbol so that repetitive genes that
# share a geneSymbol but have different gene_id (eg RF00019) stay separate.
gencode_ensembl_agg <- merge(
  x = aggregate(txLength ~ gene_id, data = gencode_ensembl, FUN = max),
  y = gencode_ensembl,
  all.x = TRUE,
  sort = FALSE
)

dim(gencode_ensembl_agg)
# [1] 106682     15

# Several rows can share the maximal txLength; keep the first one per gene_id.
gencode_ensembl_agg <- gencode_ensembl_agg[!duplicated(gencode_ensembl_agg$gene_id), ]

dim(gencode_ensembl_agg)
# [1] 60558    15

# gencode_gtf is a bit bigger:
dim(gencode_gtf)
# [1] 60603     8



# Choose the biggest 5'UTR and the biggest 3'UTR within each tx_id.
# This does not necessarily choose the longest txLength or cdsLength; use
# gencode_ensembl_agg for that information.

# Return one row per tx_id: the row with the largest UTR of the requested kind.
#
# tbl        data frame with a tx_id column and the two UTR coordinate columns
# start_col  name of the UTR start column (eg "5utrS")
# end_col    name of the UTR end column (eg "5utrE")
# diff_col   name of the UTR length column to add (eg "5utrDiff")
#
# The result is ordered by tx_id and has default row names. Ties are resolved
# in favour of the row that comes first in tbl. diff_col is NA wherever the
# start or end coordinate of the chosen row is NA.
#
# Rows are ranked with order() and the first row per tx_id is kept with
# duplicated(). This picks the same rows as splitting by tx_id, taking
# which.max() in every group and rbind()-ing the pieces, but avoids building and
# binding one data frame per transcript.
pick_longest_utr_per_tx <- function(tbl, start_col, end_col, diff_col) {
  # add 1 to account for 1-based table
  utr_len <- tbl[[end_col]] - tbl[[start_col]] + 1
  # rows without a UTR get length 0 so that they can be ranked
  utr_len[is.na(utr_len)] <- 0
  tbl[[diff_col]] <- utr_len

  # rows without a tx_id cannot be assigned to a transcript
  tbl <- tbl[!is.na(tbl$tx_id), , drop = FALSE]

  # sort by tx_id, longest UTR first; remaining ties keep their original order
  tbl <- tbl[order(tbl$tx_id, -tbl[[diff_col]]), , drop = FALSE]
  tbl <- tbl[!duplicated(tbl$tx_id), , drop = FALSE]

  # replace 0s with NAs if either the start or the end coordinate is NA
  tbl[is.na(tbl[[start_col]]) | is.na(tbl[[end_col]]), diff_col] <- NA
  rownames(tbl) <- NULL
  tbl
}

# Biggest 5'UTR in each tx_id:
gencode_ensembl_5utr <- pick_longest_utr_per_tx(gencode_ensembl, "5utrS", "5utrE", "5utrDiff")

dim(gencode_ensembl_5utr)
# [1] 226721     16

# Biggest 3'UTR in each tx_id:
gencode_ensembl_3utr <- pick_longest_utr_per_tx(gencode_ensembl, "3utrS", "3utrE", "3utrDiff")

dim(gencode_ensembl_3utr)
# [1] 226721     16



# Combine the 5'UTR and 3'UTR tables. Each side first loses the coordinate
# columns of the other UTR; the merge then joins on every remaining shared column.
# NB tx_id is one of the join columns, so the max 5utr and max 3utr must sit in
# the same transcript. That is good: it selects tx_id with both max 5utr and max
# 3utr, hence most likely bona fide full length transcripts. Note that tx_id
# refers to the max 5utr and 3utr, not necessarily to the max txLength.
gencode_ensembl_utr <- merge(
  x = gencode_ensembl_5utr[, !(names(gencode_ensembl_5utr) %in% c("3utrS", "3utrE"))],
  y = gencode_ensembl_3utr[, !(names(gencode_ensembl_3utr) %in% c("5utrS", "5utrE"))]
)

dim(gencode_ensembl_utr)
# [1] 226721     17

# # Here is strategy if do not wish to enforce same tx_id for gencode_ensembl_5utr and gencode_ensembl_3utr:
# gencode_ensembl_utr <- gencode_ensembl_utr <- merge(gencode_ensembl_5utr[,setdiff(names(gencode_ensembl_5utr), c("tx_id","3utrS", "3utrE"))],gencode_ensembl_3utr[,setdiff(names(gencode_ensembl_3utr), c("tx_id","5utrS", "5utrE"))])

# dim(gencode_ensembl_utr)
# # [1] 208984     16

# # makes only ~1% difference in dataframe size, so most max 5utr are in same tx as max 3utr
# 208984/206534
# # [1] 1.011862



# Join the longest-transcript table with the UTR table on all shared columns.
# NB tx_id in gencode_ensembl_agg refers to max txLength, while tx_id in
# gencode_ensembl_utr refers to the tx_id with max 5utr and 3utr. No obvious
# reason for them to match so well, but they do.
gencode_ensembl_final <- merge(
  x = gencode_ensembl_agg[, c(
    "gene_id", "txLength", "Chromosome", "geneS", "geneE", "strand", "tx_id",
    "gene_type", "geneSymbol", "cdsLength", "gene_description"
  )],
  y = gencode_ensembl_utr[, c(
    "Chromosome", "geneS", "geneE", "strand", "gene_id", "tx_id", "gene_type",
    "geneSymbol", "txLength", "cdsLength", "gene_description",
    "5utrS", "5utrE", "5utrDiff", "3utrS", "3utrE", "3utrDiff"
  )]
)

# gencode_ensembl_final has exactly as many rows as gencode_ensembl_agg even
# though all.x=TRUE was not used. Hence 100% exact matches of tx_id between the
# two data frames, ie tx_id refers to max txLength, max 5utr and max 3utr.
dim(gencode_ensembl_agg)
# [1] 60558    15

dim(gencode_ensembl_final)
# [1] 60558    17



# Here is match if do not insist on tx_id matching in gencode_ensembl_agg and gencode_ensembl_utr. Very minimal difference, ~0.049%. However, the very slight increases must represent artifactual expansion based on exact matches of tx_id in two dataframes, as described above. Best to keep tx_id as above.

# gencode_ensembl_final <- merge(gencode_ensembl_agg[,c("gene_id","txLength","Chromosome","geneS","geneE","strand","gene_type","geneSymbol","cdsLength","gene_description")],gencode_ensembl_utr[,c("Chromosome","geneS","geneE","strand","gene_id","gene_type","geneSymbol","txLength","cdsLength","gene_description", "5utrS", "5utrE", "5utrDiff", "3utrS","3utrE", "3utrDiff")])

# dim(gencode_ensembl_final)
# # [1] 58705    16

# 58705/58676
# # [1] 1.000494





# Sizes of the two tables being compared
dim(gencode_gtf)
# [1] 60603     8

dim(gencode_ensembl_final)
# [1] 60558    17


# No geneSymbol of gencode_gtf is missing from gencode_ensembl_final:
dim(gencode_gtf[
  !(gencode_gtf$geneSymbol %in% gencode_ensembl_final$geneSymbol),
  c("geneSymbol", "gene_type", "Chromosome")
])
# [1] 0   3


# No geneSymbol of gencode_ensembl_final is missing from gencode_gtf:
dim(gencode_ensembl_final[
  !(gencode_ensembl_final$geneSymbol %in% gencode_gtf$geneSymbol),
  c("geneSymbol", "gene_type", "Chromosome")
])
# [1] 0   3

# keep every row of gencode_gtf, leave some out from gencode_ensembl_final
dim(merge(x = gencode_gtf, y = gencode_ensembl_final, all.x = TRUE))
# [1] 60603    18

# keep every row of gencode_ensembl_final, leave some out from gencode_gtf
dim(merge(x = gencode_gtf, y = gencode_ensembl_final, all.y = TRUE))
# [1] 60558    18

# keep every row of both
dim(merge(x = gencode_gtf, y = gencode_ensembl_final, all = TRUE))
# [1] 60603    18


# Some gene symbols occur more than once in gencode_gtf and in gencode_ensembl_final.
# Once repeats are ignored, both tables hold the same set of symbols, so the
# extra rows in gencode_gtf are due to repeated genes.

# gencode_gtf: rows vs distinct symbols
nrow(gencode_gtf)
# [1] 60603

sum(!duplicated(gencode_gtf$geneSymbol))
# [1] 59050


# gencode_ensembl_final: rows vs distinct symbols
nrow(gencode_ensembl_final)
# [1] 60558

sum(!duplicated(gencode_ensembl_final$geneSymbol))
# [1] 59050

# Extra repeated genes in gencode_gtf confirmed by comparing the symbols that
# are duplicated in each table.

# Symbols duplicated in gencode_ensembl_final but not in gencode_gtf: none.
setdiff(
  gencode_ensembl_final$geneSymbol[duplicated(gencode_ensembl_final$geneSymbol)],
  gencode_gtf$geneSymbol[duplicated(gencode_gtf$geneSymbol)]
)
# character(0)


# Symbols duplicated in gencode_gtf but not in gencode_ensembl_final:
setdiff(
  gencode_gtf$geneSymbol[duplicated(gencode_gtf$geneSymbol)],
  gencode_ensembl_final$geneSymbol[duplicated(gencode_ensembl_final$geneSymbol)]
)
#  [1] "AL954722.1" "PLCXD1"     "GTPBP6"     "LINC00685"  "PPP2R3B"    "AL732314.6" "AL732314.4" "FABP5P13"   "KRT18P53"   "SHOX"       "AL672277.1" "RPL14P5"   
# [13] "CRLF2"      "CSF2RA"     "MIR3690"    "RNA5SP498"  "IL3RA"      "SLC25A6"    "LINC00106"  "ASMTL-AS1"  "ASMTL"      "P2RY8"      "AKAP17A"    "ASMT"      
# [25] "AL683807.1" "AL683807.2" "DHRSX"      "DHRSX-IT1"  "ZBED1"      "MIR6089"    "CD99P1"     "LINC00102"  "CD99"       "SPRY3"      "AMD1P2"     "DPH3P2"    
# [37] "VAMP7"      "ELOCP24"    "TRPC6P"     "IL9R"       "AJ271736.1" "WASIR1"     "WASH6P"     "DDX11L16"  


# The extra entries mostly seem to be genes repeated on X and Y. First example:
gencode_gtf[gencode_gtf$geneSymbol == "AL954722.1", ]
        # Chromosome  geneS  geneE geneLength strand               gene_id              gene_type geneSymbol
# 2779282       chrX 253743 255091       1349      +       ENSG00000228572 unprocessed_pseudogene AL954722.1
# 2871982       chrY 253743 255091       1349      + ENSG00000228572_PAR_Y unprocessed_pseudogene AL954722.1


gencode_ensembl_final[gencode_ensembl_final$geneSymbol == "AL954722.1", ]
              # gene_id txLength Chromosome  geneS  geneE strand           tx_id              gene_type geneSymbol cdsLength            gene_description 5utrS 5utrE
# 28166 ENSG00000228572      259       chrX 253743 255091      + ENST00000431238 unprocessed_pseudogene AL954722.1        NA regucalcin (RGN) pseudogene    NA    NA
      # 5utrDiff 3utrS 3utrE 3utrDiff
# 28166       NA    NA    NA       NA


# Second example:

gencode_gtf[gencode_gtf$geneSymbol == "WASIR1", ]
        # Chromosome     geneS     geneE geneLength strand               gene_id gene_type geneSymbol
# 2871884       chrX 156014623 156016837       2215      -       ENSG00000185203    lncRNA     WASIR1
# 2881646       chrY  57201143  57203357       2215      - ENSG00000185203_PAR_Y    lncRNA     WASIR1



gencode_ensembl_final[gencode_ensembl_final$geneSymbol == "WASIR1", ]
              # gene_id txLength Chromosome     geneS     geneE strand           tx_id gene_type geneSymbol cdsLength
# 15914 ENSG00000185203     1054       chrX 156014623 156016837      - ENST00000399966    lncRNA     WASIR1        NA
                                                       # gene_description 5utrS 5utrE 5utrDiff 3utrS 3utrE 3utrDiff
# 15914 WASH and IL9R antisense RNA 1 [Source:HGNC Symbol;Acc:HGNC:38513]    NA    NA       NA    NA    NA       NA

# From https://www.gencodegenes.org/pages/faq.html:
# "What is the difference between GENCODE GTF and Ensembl GTF?
# The gene annotation is the same in both files. The only exception is that the genes which are common to the human chromosome X and Y PAR regions can be found twice in the GENCODE GTF, while they are shown only for chromosome X in the Ensembl file."



# Therefore decided to use the genes in gencode_gtf: every gencode_gtf row is
# kept and annotated with gencode_ensembl_final where a match exists. The
# gencode website also says this is the best set of genes for most users
# (ie high quality!).
gencode_gtf_ensembl <- merge(
  x = gencode_gtf,
  y = gencode_ensembl_final,
  all.x = TRUE
)

dim(gencode_gtf_ensembl)
# [1] 60603    18

# # If wish to keep all genes in both data frames do this instead. Gives same results:
# gencode_gtf_ensembl <- merge(gencode_gtf,gencode_ensembl_final,all=TRUE)



# ~~~~~~~~ ucsc gencode v31 ~~~~~~~~~~~~~~~~~~~~~~~

# ucsc. Only adds exon counts for each gene.
# ucsc tables use 0-based counting. I ignore geneS and geneE in UCSC and will use 1-based counting of ensembl tables.
# On ucsc table browser: "Genes and Gene Predictions", "ALL GENCODE V31"
# output format: "selected fields from primary and related tables"
# Creates ucsc_gencode_v31.txt. Selected fields shown in colnames(), below

# Read the headerless, tab-separated UCSC export and name its columns.
gencode_ucsc <- read.table(
  file = "ucsc_gencode_v31.txt",
  header = FALSE,
  sep = "\t",
  stringsAsFactors = FALSE
)
names(gencode_ucsc) <- c(
  "tx_id", "Chromosome", "strand", "geneS", "geneE", "cdsS", "cdsE",
  "exonCount", "exonS", "exonE", "score", "geneSymbol"
)

# Strip the ".<digits>" version suffix from tx_id to allow the merge with
# gencode_gtf_ensembl.
gencode_ucsc[["tx_id"]] <- gsub("\\.[0-9]+", "", gencode_ucsc[["tx_id"]])


dim(gencode_ucsc)
# [1] 100040      12

# Add exonCount from UCSC, keeping every row of gencode_gtf_ensembl.
# The combination of "tx_id" and "geneSymbol" is not quite unique for duplicated
# genes in gencode_ucsc, which yields 28 duplicated chrX gene rows; unique()
# removes them.
gencode_gtf_ensembl_ucsc <- unique(
  merge(
    x = gencode_gtf_ensembl,
    y = gencode_ucsc[, c("tx_id", "geneSymbol", "exonCount")],
    all.x = TRUE
  )
)

# Turn every empty-string cell into NA.
gencode_gtf_ensembl_ucsc[gencode_gtf_ensembl_ucsc == ""] <- NA


# Put the columns into their final order.
gencode_gtf_ensembl_ucsc <- gencode_gtf_ensembl_ucsc[, c(
  "Chromosome", "gene_id", "tx_id", "geneSymbol", "strand",
  "geneS", "geneE", "geneLength", "txLength", "cdsLength",
  "5utrS", "5utrE", "5utrDiff", "3utrS", "3utrE", "3utrDiff",
  "exonCount", "gene_type", "gene_description"
)]


# Sort by chromosome (chr1..chr22, chrX, chrY, chrM), then by gene start.
# The chromosome rank is taken from match() so the Chromosome column itself is
# never rewritten. A factor()/as.character() round trip would silently turn any
# chromosome name that is not listed in chrOrder into NA; here such rows keep
# their name and simply sort last.
chrOrder <- paste0("chr", c(1:22, "X", "Y", "M"))
gencode_gtf_ensembl_ucsc <- gencode_gtf_ensembl_ucsc[
  order(
    match(gencode_gtf_ensembl_ucsc$Chromosome, chrOrder),
    gencode_gtf_ensembl_ucsc$geneS
  ),
]



# remember gencode is 1-based
dim(gencode_gtf_ensembl_ucsc)
# [1] 60603    19


# write.table(gencode_gtf_ensembl_ucsc, "gencode_gtf_ensembl_ucsc_v31.txt",quote=FALSE,sep="\t",row.names=FALSE,col.names=TRUE)



# ~~~~~~~~~~ START HERE if gencode_gtf_ensembl_ucsc has already been constructed ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# ~~~~~~~~~~ Reads the 'frozen' archived v31 table instead of rebuilding it ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# ~~~~~~~~~~ Note use of read.delim; check.names=FALSE keeps column names such as "5utrS" unchanged ~~~~~~~~~~~~~~~~~~~~
gencode_gtf_ensembl_ucsc <- read.delim(
  file = "gencode_gtf_ensembl_ucsc_v31.txt",
  header = TRUE,
  sep = "\t",
  stringsAsFactors = FALSE,
  check.names = FALSE
)
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~




# ------------- Total cells per gene for protein-coding library --------------------


# Number of protein-coding genes (rows whose gene_type is "protein_coding")

dim(gencode_gtf_ensembl_ucsc[gencode_gtf_ensembl_ucsc$gene_type == "protein_coding", ])
# [1] 19975    19

# Cells per gene, assuming 2e4 genes and 1e7 cells in a 75 cm2 flask:

1e7 / 2e4
# [1] 500 <<<<<<<<<<<<<< use in paper

# Total number of genes (all gene types)
dim(gencode_gtf_ensembl_ucsc)
# [1] 60603    19


# Ratio of all genes to protein-coding genes, ~ 3x
nrow(gencode_gtf_ensembl_ucsc) /
  nrow(gencode_gtf_ensembl_ucsc[gencode_gtf_ensembl_ucsc$gene_type == "protein_coding", ])
# [1] 3.033942 <<<<<<<<<<<<<<<<< use in paper

























