library(ggplot2)
library(cowplot) #used with plot_grid 


#----------------Aesthetics ---------------------------


# Shared plot theme: no grid, no panel background, no legend, hairline axes.
theme2 <- local({
  # One hairline definition reused for both axis lines and the tick marks.
  hairline <- element_line(colour = "black", size = 0.1)

  theme(
    # outer margin around the whole plot, in cm
    plot.margin = unit(c(t = 1.3, r = 0.5, b = 1.3, l = 0.5), "cm"),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.background = element_blank(),
    legend.position = "none",
    axis.line.x = hairline,
    axis.line.y = hairline,
    axis.ticks = hairline,
    # numbers on tick marks of x and y axes
    axis.text = element_text(size = 12),
    # titles of x and y axes
    axis.title = element_text(size = 14),
    # margin() takes top, right, bottom, left: the y title gets extra space on its right
    axis.title.y = element_text(margin = margin(t = 0, r = 13, b = 0, l = 0)),
    # the x title gets extra space above it
    axis.title.x = element_text(margin = margin(t = 10, r = 0, b = 0, l = 0)),
    # can provide "A", "B" via ggtitle, but plot_grid was used because it can
    # shift the label further left; negative hjust pushes the title left
    plot.title = element_text(size = 32, face = "bold", hjust = -0.14),
    # hjust = 0.5 centres the subtitle
    plot.subtitle = element_text(size = 14, face = "plain", hjust = 0.5)
  )
})






# darkest two hues from 3-class PuBuGn in color brewer
# cb1<-rep(c("#1c9099", "#a6bddb"), 12)

# # darkest two hues from 3-class PuBu in color brewer
# cb1<-rep(c("#2b8cbe", "#a6bddb"), 12)


# #attractive pinks, greys
# cb1<-c("#999999", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7","#999999", "#E69F00", "#56B4E9", "#E69F00", "#009E73", "#F0E442", "#0072B2", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7","#999999", "#D55E00", "#CC79A7")

# cb1_rev <- c("#CC79A7", "#D55E00", "#0072B2", "#F0E442", "#009E73", "#56B4E9", "#E69F00","#999999", "#CC79A7", "#D55E00", "#0072B2", "#D55E00", "#F0E442", "#009E73", "#56B4E9", "#0072B2", "#F0E442", "#009E73", "#56B4E9", "#E69F00","#999999", "#CC79A7", "#E69F00","#999999")

# #'4-class RdBu'
# cb2 <- c('#ca0020','#f4a582','#92c5de','#0571b0','#ca0020','#f4a582','#92c5de','#0571b0','#ca0020','#f4a582','#92c5de','#f4a582','#0571b0','#ca0020','#f4a582','#92c5de','#0571b0','#ca0020','#f4a582','#92c5de','#0571b0','#ca0020','#92c5de','#0571b0')

# #'4-class RdYlBu'
# cb3 <- c('#d7191c','#fdae61','#abd9e9','#2c7bb6','#d7191c','#fdae61','#abd9e9','#2c7bb6','#d7191c','#fdae61','#abd9e9','#fdae61','#2c7bb6','#d7191c','#fdae61','#abd9e9','#2c7bb6','#d7191c','#fdae61','#abd9e9','#2c7bb6','#d7191c','#abd9e9','#2c7bb6')
	
	
# size_point <- 0.3
# size_hline <- 0.1




# attractive grey and skyblue color scheme
# cb1<-rep(c("grey", "skyblue"), 12)

# Axis tick labels for the 23 chromosome positions: every third autosome
# (1, 4, ..., 19) is numbered, the last position is "X", the rest are blank.
labels <- rep("", 23)
labels[seq(1, 19, by = 3)] <- as.character(seq(1, 19, by = 3))
labels[23] <- "X"






# ----------- Create table genecode_cen.bed from gencode, ensembl and ucsc ----------------------
# ~~~ Go to START HERE (below) if gencode_gtf_ensembl_ucsc already constructed, to save time ~~~~~

# Create gene table from gencode and ensembl websites, supplemented with information from the UCSC table browser
# All gencode tables are 1 based counting. ucsc tables are 0-based. I will use 1-based counting in my tables.

# ~~~~~~~ Download latest gencode GTF from gencode website ~~~~~~~~~~~
library(rtracklayer)

# Import the GENCODE v31 annotation and flatten it into a plain data frame.
gencode_gtf <- import("gencode.v31.annotation.gtf")
gencode_gtf <- as.data.frame(gencode_gtf, stringsAsFactors = FALSE)

# remove version nos from gene_id, transcript_id to allow eventual merge with gencode_ensembl
gencode_gtf$gene_id <- gsub("\\.[0-9]+", "", gencode_gtf$gene_id)
gencode_gtf$transcript_id <- gsub("\\.[0-9]+", "", gencode_gtf$transcript_id)


# get rid of unneeded columns. Got rid of tx_id because when select type == "gene" (below), no associated tx_id
gencode_gtf <- gencode_gtf[, c("seqnames", "start", "end", "width", "strand", "type", "gene_id", "gene_type", "gene_name")]
colnames(gencode_gtf) <- c("Chromosome", "geneS", "geneE", "geneLength", "strand", "type", "gene_id", "gene_type", "geneSymbol")

# get rid of unneeded types, eg exon, CDS, etc
gencode_gtf <- gencode_gtf[gencode_gtf$type == "gene", ]

# Convert any remaining factor columns to character. vapply()/lapply() keep
# the result type fixed regardless of how many factor columns there are,
# whereas sapply() goes through a type-unstable simplification to a matrix.
fctr.cols <- vapply(gencode_gtf, is.factor, logical(1))
gencode_gtf[fctr.cols] <- lapply(gencode_gtf[fctr.cols], as.character)


# Get rid of "type". But remember to correct for ensembl and gencode being 1 based and ucsc being 0 based!!!
gencode_gtf <- gencode_gtf[, c("Chromosome", "geneS", "geneE", "geneLength", "strand", "gene_id", "gene_type", "geneSymbol")]

dim(gencode_gtf)
# [1] 60603     8


# gencode_gtf has only one entry per gene_id, but multiple gene_id and tx_id for some geneSymbols, eg RF00019

length(unique(gencode_gtf$geneSymbol))
# [1] 59050

length(unique(gencode_gtf$gene_id))
# [1] 60603


# ~~~~~~~~~~ gencode v31 from ensembl ~~~~~~~~~~~~~~~

# Supplement info with gencode downloaded from ensembl biomart web site using structure option 

# library(seqinr)
# gencode_ensembl <- read.fasta("martquery_0115051407_592.txt")
# gencode_ensembl <- do.call(rbind,strsplit(names(gencode_ensembl),'\\|'))
# gencode_ensembl <- as.data.frame(gencode_ensembl)

# # replace empty entries with NA:
# is.na(gencode_ensembl) <- do.call(cbind,lapply(gencode_ensembl, FUN = function(x) {x==""}))



# gencode_ensembl <- read.table("martquery_0115213848_796.txt", header=TRUE, stringsAsFactors = FALSE, sep = "\t",quote="")

# Or supplement with gencode info downloaded from ensembl using biomaRt. This is **superior** to above.

# library(biomaRt)
# ensembl <- useDataset("hsapiens_gene_ensembl", mart=useMart("ensembl"))


# gencode_ensembl <- list()

# # download from ensembl. Download by chromosomes, or else ensembl times out.

# for (i in c(1:22,"X","Y","MT")) {

# gencode_ensembl[[i]] <- getBM(
						# attributes=c(
							# 'chromosome_name',
							# 'start_position',
							# 'end_position',
							# 'strand',
							# 'ensembl_gene_id',
							# 'ensembl_transcript_id',
							# 'gene_biotype',
							# 'external_gene_name',
							# 'transcript_length',
							# '5_utr_start',
							# '5_utr_end',
							# '3_utr_start',
							# '3_utr_end', 
							# 'cds_length', 
							# 'description'
						# ), 
			      # filters = 'chromosome_name', 
			      # values = i, 
			      # mart = ensembl
			      # )
			      
# }


# gencode_ensembl <- do.call(rbind, gencode_ensembl)





# chrOrder <- c(1:22,"X","Y","MT")
# gencode_ensembl$chromosome_name <-factor(gencode_ensembl$chromosome_name, levels=chrOrder)
# gencode_ensembl <- gencode_ensembl[order(gencode_ensembl$chromosome_name, gencode_ensembl$start_position), ]
# gencode_ensembl$chromosome_name <- as.character(gencode_ensembl$chromosome_name)






# save 'frozen' version for archival purposes:			      
# write.table(gencode_ensembl, "gencode_ensembl_v31.txt",quote=FALSE,sep="\t",row.names=FALSE,col.names=TRUE)

# Use 'frozen' table from biomaRt. # Use read.delim cf https://kbroman.org/blog/2017/08/08/eof-within-quoted-string/
# quote mark in a gene description entry introduced during production of gencode_ensembl via biomaRt
gencode_ensembl <- read.delim(
  file = "gencode_ensembl_v31.txt",
  header = TRUE,
  sep = "\t",
  stringsAsFactors = FALSE
)

# # alternative approach: https://kbroman.org/blog/2017/08/08/eof-within-quoted-string/
# gencode_ensembl <- read.table("gencode_ensembl_v31.txt", header=TRUE, sep="\t", stringsAsFactors=FALSE, check.names=FALSE, quote="", fill=FALSE)

# Normalise the table to the conventions used by gencode_gtf:
# mitochondrial chromosome called "M", short column names, "+"/"-" strand,
# and a "chr" prefix on chromosome names.
gencode_ensembl <- local({
  ens <- gencode_ensembl

  ens[ens$chromosome_name == "MT", "chromosome_name"] <- "M"

  names(ens) <- c(
    "Chromosome", "geneS", "geneE", "strand", "gene_id", "tx_id",
    "gene_type", "geneSymbol", "txLength",
    "5utrS", "5utrE", "3utrS", "3utrE",
    "cdsLength", "gene_description"
  )

  ens[ens$strand == 1, "strand"] <- "+"
  ens[ens$strand == -1, "strand"] <- "-"

  ens$Chromosome <- paste0("chr", ens$Chromosome)
  ens
})

dim(gencode_ensembl)
# [1] 513708     15




# Choose the longest txLength for each gene entry. gene_id is used rather than
# geneSymbol so that repetitive genes sharing a geneSymbol but having different
# gene_id (eg RF00019) keep separate entries.
gencode_ensembl_agg <- merge(
  x = aggregate(txLength ~ gene_id, data = gencode_ensembl, FUN = max),
  y = gencode_ensembl,
  all.x = TRUE,
  sort = FALSE
)

dim(gencode_ensembl_agg)
# [1] 106682     15


# Several rows can share the maximal txLength; keep the first row per gene_id.
gencode_ensembl_agg <- gencode_ensembl_agg[!duplicated(gencode_ensembl_agg[["gene_id"]]), ]


dim(gencode_ensembl_agg)
# [1] 60558    15


# gencode_gtf is a bit bigger:
dim(gencode_gtf)
# [1] 60603     8



# Keep, for every tx_id, the single row holding its longest UTR.
#
# tx_table: data frame with a tx_id column plus "<prime>utrS" / "<prime>utrE"
#           start/end coordinate columns (1-based, inclusive).
# prime:    "5" or "3", selecting which UTR columns are used.
#
# Returns tx_table reduced to one row per tx_id (sorted by tx_id, row names
# reset) with an added "<prime>utrDiff" column holding the UTR length, or NA
# where either coordinate is missing.
#
# A stable sort followed by !duplicated() is used instead of rbind-ing one
# data frame per transcript, which is extremely slow for hundreds of thousands
# of transcripts. Ties are resolved as which.max() would: first row wins.
.longest_utr_per_tx <- function(tx_table, prime) {
  start_col <- paste0(prime, "utrS")
  end_col <- paste0(prime, "utrE")
  diff_col <- paste0(prime, "utrDiff")

  # add 1 to account for 1-based table
  tx_table[[diff_col]] <- tx_table[[end_col]] - tx_table[[start_col]] + 1
  # treat missing UTRs as length 0 while ranking
  tx_table[[diff_col]][is.na(tx_table[[diff_col]])] <- 0

  # rows without a tx_id cannot be grouped (split() dropped them as well)
  tx_table <- tx_table[!is.na(tx_table$tx_id), , drop = FALSE]

  # order() is stable, so within a tx_id the first row with the largest UTR
  # comes first; as.factor() gives the same group order as split() would
  longest_first <- order(as.factor(tx_table$tx_id), -tx_table[[diff_col]])
  tx_table <- tx_table[longest_first, , drop = FALSE]
  tx_table <- tx_table[!duplicated(tx_table$tx_id), , drop = FALSE]

  # replace 0s with NAs if either coordinate is NA
  utr_missing <- is.na(tx_table[[start_col]]) | is.na(tx_table[[end_col]])
  tx_table[utr_missing, diff_col] <- NA

  rownames(tx_table) <- NULL
  tx_table
}

# Choose biggest 5utr in each tx_id. Does not necessarily choose longest txLength or cdsLength. Use gencode_ensembl_agg for that information.
gencode_ensembl_5utr <- .longest_utr_per_tx(gencode_ensembl, "5")

dim(gencode_ensembl_5utr)
# [1] 226721     16

# Choose biggest 3utr in each tx_id. Does not necessarily choose longest txLength or cdsLength. Use gencode_ensembl_agg for that information.
gencode_ensembl_3utr <- .longest_utr_per_tx(gencode_ensembl, "3")

dim(gencode_ensembl_3utr)
# [1] 226721     16



# Combine the per-transcript 5'UTR and 3'UTR tables.
# NB tx_id is among the shared columns, so the merge requires the max 5utr and
# max 3utr to come from the same transcript. That favours tx_id with both a max
# 5utr and a max 3utr, which are the most likely bona fide full length
# transcripts. Note that tx_id refers to the max 5utr and 3utr, not
# necessarily the txLength.
gencode_ensembl_utr <- merge(
  x = gencode_ensembl_5utr[, !(names(gencode_ensembl_5utr) %in% c("3utrS", "3utrE"))],
  y = gencode_ensembl_3utr[, !(names(gencode_ensembl_3utr) %in% c("5utrS", "5utrE"))]
)


dim(gencode_ensembl_utr)
# [1] 226721     17

# # Here is strategy if do not wish to enforce same tx_id for gencode_ensembl_5utr and gencode_ensembl_3utr:
# gencode_ensembl_utr <- gencode_ensembl_utr <- merge(gencode_ensembl_5utr[,setdiff(names(gencode_ensembl_5utr), c("tx_id","3utrS", "3utrE"))],gencode_ensembl_3utr[,setdiff(names(gencode_ensembl_3utr), c("tx_id","5utrS", "5utrE"))])

# dim(gencode_ensembl_utr)
# # [1] 208984     16

# # makes only ~1% difference in dataframe size, so most max 5utr are in same tx as max 3utr
# 208984/206534
# # [1] 1.011862



# Combine the longest-transcript table with the UTR table.
# NB tx_id in gencode_ensembl_agg refers to max txLength. tx_id in gencode_ensembl_utr refers to tx_id with max 5utr and 3utr. No obvious reason for them to match so well but they do.
gencode_ensembl_final <- merge(
  x = gencode_ensembl_agg[, c(
    "gene_id", "txLength", "Chromosome", "geneS", "geneE", "strand",
    "tx_id", "gene_type", "geneSymbol", "cdsLength", "gene_description"
  )],
  y = gencode_ensembl_utr[, c(
    "Chromosome", "geneS", "geneE", "strand", "gene_id", "tx_id",
    "gene_type", "geneSymbol", "txLength", "cdsLength", "gene_description",
    "5utrS", "5utrE", "5utrDiff", "3utrS", "3utrE", "3utrDiff"
  )]
)

# gencode_ensembl_final has exactly as many rows as gencode_ensembl_agg even though all.x=TRUE was not used, so every tx_id in gencode_ensembl_agg found a match. Hence tx_id refers to maximum txLength, max 5utr and max 3utr alike.
dim(gencode_ensembl_agg)
# [1] 60558    15

dim(gencode_ensembl_final)
# [1] 60558    17



# Here is match if do not insist on tx_id matching in gencode_ensembl_agg and gencode_ensembl_utr. Very minimal difference, ~0.049%. However, the very slight increases must represent artifactual expansion based on exact matches of tx_id in two dataframes, as described above. Best to keep tx_id as above.

# gencode_ensembl_final <- merge(gencode_ensembl_agg[,c("gene_id","txLength","Chromosome","geneS","geneE","strand","gene_type","geneSymbol","cdsLength","gene_description")],gencode_ensembl_utr[,c("Chromosome","geneS","geneE","strand","gene_id","gene_type","geneSymbol","txLength","cdsLength","gene_description", "5utrS", "5utrE", "5utrDiff", "3utrS","3utrE", "3utrDiff")])

# dim(gencode_ensembl_final)
# # [1] 58705    16

# 58705/58676
# # [1] 1.000494





dim(gencode_gtf)
# [1] 60603     8

dim(gencode_ensembl_final)
# [1] 60558    17


# Gene symbols in gencode_gtf that are missing from gencode_ensembl_final (none):

dim(gencode_gtf[!(gencode_gtf$geneSymbol %in% gencode_ensembl_final$geneSymbol), c("geneSymbol", "gene_type", "Chromosome")])
# [1] 0   3


# Gene symbols in gencode_ensembl_final that are missing from gencode_gtf (none):
dim(gencode_ensembl_final[!(gencode_ensembl_final$geneSymbol %in% gencode_gtf$geneSymbol), c("geneSymbol", "gene_type", "Chromosome")])
# [1] 0   3

# include all genes in gencode_gtf, leave some out from gencode_ensembl_final
dim(merge(x = gencode_gtf, y = gencode_ensembl_final, all.x = TRUE))
# [1] 60603    18

# include all genes in gencode_ensembl_final, leave some out from gencode_gtf
dim(merge(x = gencode_gtf, y = gencode_ensembl_final, all.y = TRUE))
# [1] 60558    18

# include all genes
dim(merge(x = gencode_gtf, y = gencode_ensembl_final, all = TRUE))
# [1] 60603    18


# Some gene symbols are repeated in gencode_gtf and in gencode_ensembl_final.
# However, once repeats are ignored, the two tables hold the same symbols.
# Thus the extra rows in gencode_gtf are due to repeated genes.

# genes repeated in gencode_gtf
length(gencode_gtf[["geneSymbol"]])
# [1] 60603

length(unique(gencode_gtf[["geneSymbol"]]))
# [1] 59050


# Some genes repeated in gencode_ensembl_final
length(gencode_ensembl_final[["geneSymbol"]])
# [1] 60558

length(unique(gencode_ensembl_final[["geneSymbol"]]))
# [1] 59050

# Extra repeated genes in gencode_gtf confirmed by:

# Symbols duplicated in gencode_ensembl_final but not duplicated in gencode_gtf (none).
# The recorded output below is kept as a comment; it was previously left
# uncommented and therefore executed as code.
setdiff(gencode_ensembl_final[duplicated(gencode_ensembl_final$geneSymbol), c("geneSymbol")], gencode_gtf[duplicated(gencode_gtf$geneSymbol), c("geneSymbol")])
# character(0)


# Symbols duplicated in gencode_gtf but not duplicated in gencode_ensembl_final:
setdiff(gencode_gtf[duplicated(gencode_gtf$geneSymbol), c("geneSymbol")], gencode_ensembl_final[duplicated(gencode_ensembl_final$geneSymbol), c("geneSymbol")])
#  [1] "AL954722.1" "PLCXD1"     "GTPBP6"     "LINC00685"  "PPP2R3B"    "AL732314.6" "AL732314.4" "FABP5P13"   "KRT18P53"   "SHOX"       "AL672277.1" "RPL14P5"   
# [13] "CRLF2"      "CSF2RA"     "MIR3690"    "RNA5SP498"  "IL3RA"      "SLC25A6"    "LINC00106"  "ASMTL-AS1"  "ASMTL"      "P2RY8"      "AKAP17A"    "ASMT"      
# [25] "AL683807.1" "AL683807.2" "DHRSX"      "DHRSX-IT1"  "ZBED1"      "MIR6089"    "CD99P1"     "LINC00102"  "CD99"       "SPRY3"      "AMD1P2"     "DPH3P2"    
# [37] "VAMP7"      "ELOCP24"    "TRPC6P"     "IL9R"       "AJ271736.1" "WASIR1"     "WASH6P"     "DDX11L16"  


# Mostly seems to be X and Y repeated genes. For example, the same symbol has a
# chrX row and a "_PAR_Y" chrY row in gencode_gtf, but only a chrX row in
# gencode_ensembl_final:
gencode_gtf[gencode_gtf$geneSymbol == "AL954722.1",]
        # Chromosome  geneS  geneE geneLength strand               gene_id              gene_type geneSymbol
# 2779282       chrX 253743 255091       1349      +       ENSG00000228572 unprocessed_pseudogene AL954722.1
# 2871982       chrY 253743 255091       1349      + ENSG00000228572_PAR_Y unprocessed_pseudogene AL954722.1


gencode_ensembl_final[gencode_ensembl_final$geneSymbol == "AL954722.1",]
              # gene_id txLength Chromosome  geneS  geneE strand           tx_id              gene_type geneSymbol cdsLength            gene_description 5utrS 5utrE
# 28166 ENSG00000228572      259       chrX 253743 255091      + ENST00000431238 unprocessed_pseudogene AL954722.1        NA regucalcin (RGN) pseudogene    NA    NA
      # 5utrDiff 3utrS 3utrE 3utrDiff
# 28166       NA    NA    NA       NA


# And a second example:

gencode_gtf[gencode_gtf$geneSymbol == "WASIR1",]
        # Chromosome     geneS     geneE geneLength strand               gene_id gene_type geneSymbol
# 2871884       chrX 156014623 156016837       2215      -       ENSG00000185203    lncRNA     WASIR1
# 2881646       chrY  57201143  57203357       2215      - ENSG00000185203_PAR_Y    lncRNA     WASIR1



gencode_ensembl_final[gencode_ensembl_final$geneSymbol == "WASIR1",]
              # gene_id txLength Chromosome     geneS     geneE strand           tx_id gene_type geneSymbol cdsLength
# 15914 ENSG00000185203     1054       chrX 156014623 156016837      - ENST00000399966    lncRNA     WASIR1        NA
                                                       # gene_description 5utrS 5utrE 5utrDiff 3utrS 3utrE 3utrDiff
# 15914 WASH and IL9R antisense RNA 1 [Source:HGNC Symbol;Acc:HGNC:38513]    NA    NA       NA    NA    NA       NA


# From https://www.gencodegenes.org/pages/faq.html:
# "What is the difference between GENCODE GTF and Ensembl GTF?
# The gene annotation is the same in both files. The only exception is that the genes which are common to the human chromosome X and Y PAR regions can be found twice in the GENCODE GTF, while they are shown only for chromosome X in the Ensembl file."



# Therefore decided to use genes in gencode_gtf. Also the gencode web site says it is the best set of genes for most users (ie high quality!)
# all.x = TRUE keeps every gencode_gtf row, filling the ensembl-only columns
# with NA where there is no matching row in gencode_ensembl_final.
gencode_gtf_ensembl <- merge(
  x = gencode_gtf,
  y = gencode_ensembl_final,
  all.x = TRUE
)

dim(gencode_gtf_ensembl)
# [1] 60603    18

# # If wish to keep all genes in both data frames do this instead. Gives same results:
# gencode_gtf_ensembl <- merge(gencode_gtf,gencode_ensembl_final,all=TRUE)



# ~~~~~~~~ ucsc gencode v31 ~~~~~~~~~~~~~~~~~~~~~~~

# ucsc. Only adds exon counts for each gene.
# ucsc tables use 0-based counting. I ignore geneS and geneE in UCSC and will use 1-based counting of ensembl tables.
# On ucsc table browser: "Genes and Gene Predictions", "ALL GENCODE V31"
# output format: "selected fields from primary and related tables"
# Creates ucsc_gencode_v31.txt. Selected fields shown in colnames(), below

# Headerless UCSC table browser export; column names are assigned here.
gencode_ucsc <- read.table(
  file = "ucsc_gencode_v31.txt",
  header = FALSE,
  sep = "\t",
  stringsAsFactors = FALSE
)
names(gencode_ucsc) <- c(
  "tx_id", "Chromosome", "strand", "geneS", "geneE", "cdsS", "cdsE",
  "exonCount", "exonS", "exonE", "score", "geneSymbol"
)

# remove version nos from tx_id to allow merge with gencode_gtf_ensembl
gencode_ucsc[["tx_id"]] <- gsub("\\.[0-9]+", "", gencode_ucsc[["tx_id"]])


dim(gencode_ucsc)
# [1] 100040      12

# Attach exonCount from the UCSC table (matched on tx_id and geneSymbol).
gencode_gtf_ensembl_ucsc <- merge(
  x = gencode_gtf_ensembl,
  y = gencode_ucsc[, c("tx_id", "geneSymbol", "exonCount")],
  all.x = TRUE
)

# Because combo of "tx_id" and "geneSymbol" not quite unique for duplicated genes in gencode_ucsc, end up with 28 duplicated chrX gene rows.
gencode_gtf_ensembl_ucsc <- unique(gencode_gtf_ensembl_ucsc)

# replace empty entries with NA:
is.na(gencode_gtf_ensembl_ucsc) <- do.call(
  cbind,
  lapply(gencode_gtf_ensembl_ucsc, function(column) column == "")
)


# Put the columns into their final order.
gencode_gtf_ensembl_ucsc <- gencode_gtf_ensembl_ucsc[, c(
  "Chromosome", "gene_id", "tx_id", "geneSymbol", "strand",
  "geneS", "geneE", "geneLength", "txLength", "cdsLength",
  "5utrS", "5utrE", "5utrDiff", "3utrS", "3utrE", "3utrDiff",
  "exonCount", "gene_type", "gene_description"
)]


# Sort by chromosome (karyotype order given by chrOrder), then by gene start.
chrOrder <- paste0("chr", c(1:22, "X", "Y", "M"))
gencode_gtf_ensembl_ucsc <- local({
  genes <- gencode_gtf_ensembl_ucsc
  chr_rank <- match(genes$Chromosome, chrOrder)
  # names outside chrOrder become NA, exactly as a factor round trip would do
  genes$Chromosome <- chrOrder[chr_rank]
  genes[order(chr_rank, genes$geneS), ]
})


# remember gencode is 1-based
dim(gencode_gtf_ensembl_ucsc)
# [1] 60603    19


# write.table(gencode_gtf_ensembl_ucsc, "gencode_gtf_ensembl_ucsc_v31.txt",quote=FALSE,sep="\t",row.names=FALSE,col.names=TRUE)



# ~~~~~~~~~~ if desired, can START HERE for gencode_gtf_ensembl_ucsc ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# ~~~~~~~~~~ Also permits use of 'frozen' archived v31 gencode tables ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# ~~~~~~~~~~ Note use of read.delim ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Replaces whatever was built above with the archived v31 table.
# check.names = FALSE keeps column names such as "5utrS" unmodified.
gencode_gtf_ensembl_ucsc <- read.delim(
  file = "gencode_gtf_ensembl_ucsc_v31.txt",
  header = TRUE,
  sep = "\t",
  stringsAsFactors = FALSE,
  check.names = FALSE
)
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~





# ------------- Prepare gene count data for plotting ---------------------------------------------------


# Tab-delimited locus tables; all three are read with identical options.
g_unique <- read.delim(
  file = "growth_loci_unique.txt",
  header = TRUE, sep = "\t", stringsAsFactors = FALSE, check.names = FALSE
)
d_unique <- read.delim(
  file = "paclitaxel_loci_unique.txt",
  header = TRUE, sep = "\t", stringsAsFactors = FALSE, check.names = FALSE
)
Ix <- read.delim(
  file = "Ix_loci.txt",
  header = TRUE, sep = "\t", stringsAsFactors = FALSE, check.names = FALSE
)

 
# number of g, d and Ix loci on each chromosome

# Count the rows of a locus table per chromosome.
#
# loci:      data frame with geneSymbol and Chromosome columns.
# count_col: name to give the resulting count column.
#
# Returns a data frame with one row per chromosome: Chromosome and <count_col>.
.count_loci_per_chromosome <- function(loci, count_col) {
  # exclude centromeres because ensembl does not include them (nor does the crispr data)
  loci <- loci[loci$geneSymbol != "CEN", ]
  # placeholder column; only the number of rows per chromosome matters
  loci[[count_col]] <- rep(0, nrow(loci))
  aggregate(
    reformulate("Chromosome", response = count_col),
    data = loci,
    FUN = function(x) sum(!is.na(x))
  )
}

g_qtls_per_gene <- .count_loci_per_chromosome(g_unique, "g_count")
d_qtls_per_gene <- .count_loci_per_chromosome(d_unique, "d_count")
Ix_qtls_per_gene <- .count_loci_per_chromosome(Ix, "Ix_count")


# Outer merges: a chromosome missing from one table gets NA for that count,
# which really means zero loci. The NAs are replaced by indexing the column
# vector, which (unlike df[is.na(df$col), ]$col <- 0) also works when there
# is nothing to replace.
g_d_qtls_per_gene <- merge(g_qtls_per_gene, d_qtls_per_gene, all = TRUE)
g_d_qtls_per_gene$g_count[is.na(g_d_qtls_per_gene$g_count)] <- 0
g_d_qtls_per_gene$d_count[is.na(g_d_qtls_per_gene$d_count)] <- 0

rh_qtls_per_gene <- merge(g_d_qtls_per_gene, Ix_qtls_per_gene, all = TRUE)
rh_qtls_per_gene$g_count[is.na(rh_qtls_per_gene$g_count)] <- 0
rh_qtls_per_gene$d_count[is.na(rh_qtls_per_gene$d_count)] <- 0
rh_qtls_per_gene$Ix_count[is.na(rh_qtls_per_gene$Ix_count)] <- 0


# Add results for crispr knockouts

# from Sabatini
crispr <- read.table("aac7041_SM_Table_S3.txt", header = TRUE, sep = "\t", stringsAsFactors = FALSE)
colnames(crispr)[1] <- "geneSymbol"
# which() drops genes whose adjusted p-value is NA; plain logical indexing
# would turn each of them into an all-NA row.
crispr_kbm7 <- crispr[which(crispr$KBM7.adjusted.p.value < 0.05), ]

# merge using data frame takes forever. Use data.table:
library(data.table)
dt <- data.table(crispr_kbm7, key = names(crispr_kbm7))
dt2 <- data.table(gencode_gtf_ensembl_ucsc, key = names(gencode_gtf_ensembl_ucsc))
dt <- dt[, c("geneSymbol")]
dt2 <- dt2[, c("geneSymbol", "Chromosome")]
# Update join: each hit picks up the Chromosome of the gencode row with the
# same geneSymbol; symbols absent from gencode keep an NA Chromosome.
setDT(dt)[dt2, Chromosome := i.Chromosome, on = c(geneSymbol = "geneSymbol")]


# Count hits per chromosome (the formula interface drops rows with NA Chromosome).
crispr_kbm7_gene <- as.data.frame(dt)
crispr_kbm7_gene$kbm7_count <- rep(0, nrow(crispr_kbm7_gene))
crispr_kbm7_gene <- aggregate(kbm7_count ~ Chromosome, data = crispr_kbm7_gene, function(x) sum(!is.na(x)))


rh_qtls_per_gene <- merge(rh_qtls_per_gene, crispr_kbm7_gene, all.x = TRUE, all.y = FALSE)
# A chromosome without any hit has zero hits, not NA (NA would make the later
# sums and chi-squared tests fail).
rh_qtls_per_gene$kbm7_count[is.na(rh_qtls_per_gene$kbm7_count)] <- 0


# from Moffat
crispr <- read.table("mmc3-1.txt", header = TRUE, sep = "\t", stringsAsFactors = FALSE)
colnames(crispr)[1] <- "geneSymbol"
# which() drops genes whose numTKOHits is NA; plain logical indexing would
# turn each of them into an all-NA row.
crispr_tko <- crispr[which(crispr$numTKOHits == 5), ]


# merge using data frame takes forever. Use data.table:
library(data.table)
dt <- data.table(crispr_tko, key = names(crispr_tko))
dt2 <- data.table(gencode_gtf_ensembl_ucsc, key = names(gencode_gtf_ensembl_ucsc))
dt <- dt[, c("geneSymbol")]
dt2 <- dt2[, c("geneSymbol", "Chromosome")]
# Update join: each hit picks up the Chromosome of the gencode row with the
# same geneSymbol; symbols absent from gencode keep an NA Chromosome.
setDT(dt)[dt2, Chromosome := i.Chromosome, on = c(geneSymbol = "geneSymbol")]



# Count hits per chromosome (the formula interface drops rows with NA Chromosome).
crispr_tko_gene <- as.data.frame(dt)
crispr_tko_gene$tko_count <- rep(0, nrow(crispr_tko_gene))
crispr_tko_gene <- aggregate(tko_count ~ Chromosome, data = crispr_tko_gene, function(x) sum(!is.na(x)))


rh_qtls_per_gene <- merge(rh_qtls_per_gene, crispr_tko_gene, all.x = TRUE, all.y = FALSE)
# A chromosome without any hit has zero hits, not NA (NA would make the later
# sums and chi-squared tests fail).
rh_qtls_per_gene$tko_count[is.na(rh_qtls_per_gene$tko_count)] <- 0




# Add the number of gencode genes on each chromosome (one table row = one gene).
gencode_ensembl_genes <- local({
  genes <- gencode_gtf_ensembl_ucsc
  # placeholder column; only the number of rows per chromosome matters
  genes$ensembl_genes <- 0
  aggregate(
    ensembl_genes ~ Chromosome,
    data = genes,
    FUN = function(v) sum(!is.na(v))
  )
})



rh_qtls_per_gene <- merge(
  x = rh_qtls_per_gene,
  y = gencode_ensembl_genes,
  all.x = TRUE,
  all.y = FALSE
)



# Normalised counts: each chromosome's count per gene, divided by the overall
# count per gene, so that 1 means "as many hits as the gene content predicts".
rh_qtls_per_gene <- local({
  tbl <- rh_qtls_per_gene
  genes <- tbl[["ensembl_genes"]]
  for (src in c("g", "d", "Ix", "kbm7", "tko")) {
    hits <- tbl[[paste0(src, "_count")]]
    tbl[[paste0("norm_", src, "_count")]] <- (hits / genes) * (sum(genes) / sum(hits))
  }
  tbl
})



# Sort rows into karyotype order. Chromosome is left as a factor.
rh_qtls_per_gene <- local({
  tbl <- rh_qtls_per_gene
  tbl$Chromosome <- factor(tbl$Chromosome, levels = paste0("chr", c(seq_len(22), "X", "Y")))
  tbl <- tbl[order(tbl$Chromosome), ]
  # tbl$Chromosome <- as.character(tbl$Chromosome)
  rownames(tbl) <- NULL
  tbl
})





# -----------------------------------------------------------
# ------------------------ Overall Calx ---------------------
# -----------------------------------------------------------


# ----- RH and CRISPR growth genes omnibus comparison -------


# Each call below is a chi-squared goodness-of-fit test: the first argument
# holds the observed per-chromosome counts and p holds the reference counts,
# which rescale.p=TRUE converts into proportions summing to 1.
# The recorded console output is kept as comments beneath each call.

# RH growth genes significantly different from ensembl genes

chisq.test(rh_qtls_per_gene$g_count,p= rh_qtls_per_gene$ensembl_genes,rescale.p=TRUE)

	# Chi-squared test for given probabilities

# data:  rh_qtls_per_gene$g_count
# X-squared = 75.214, df = 22, p-value = 9.761e-08 <<<<<<<<<<<<<<<< use in paper


# same test again, extracting the p-value at full precision
chisq.test(rh_qtls_per_gene$g_count,p= rh_qtls_per_gene$ensembl_genes,rescale.p=TRUE)$p.value
# [1] 9.760575e-08


# kbm7 crispr growth genes are different from ensembl genes
chisq.test(rh_qtls_per_gene$kbm7_count,p= rh_qtls_per_gene$ensembl_genes,rescale.p=TRUE)

	# Chi-squared test for given probabilities

# data:  rh_qtls_per_gene$kbm7_count
# X-squared = 80.785, df = 22, p-value = 1.203e-08


# RH growth genes are different from kbm7 crispr genes
chisq.test(rh_qtls_per_gene$g_count,p= rh_qtls_per_gene$kbm7_count,rescale.p=TRUE)

	# Chi-squared test for given probabilities

# data:  rh_qtls_per_gene$g_count
# X-squared = 150.96, df = 22, p-value < 2.2e-16

# same test again, extracting the p-value (the printout above only shows "< 2.2e-16")
chisq.test(rh_qtls_per_gene$g_count,p= rh_qtls_per_gene$kbm7_count,rescale.p=TRUE)$p.value
# [1] 3.153036e-21


# TKO crispr genes are different from ensembl genes

chisq.test(rh_qtls_per_gene$tko_count,p= rh_qtls_per_gene$ensembl_genes,rescale.p=TRUE)

	# Chi-squared test for given probabilities

# data:  rh_qtls_per_gene$tko_count
# X-squared = 53.068, df = 22, p-value = 0.0002222



# RH growth genes are different from TKO crispr genes

chisq.test(rh_qtls_per_gene$g_count,p= rh_qtls_per_gene$tko_count,rescale.p=TRUE)

	# Chi-squared test for given probabilities

# data:  rh_qtls_per_gene$g_count
# X-squared = 224.79, df = 22, p-value < 2.2e-16

# same test again, extracting the p-value (the printout above only shows "< 2.2e-16")
chisq.test(rh_qtls_per_gene$g_count,p= rh_qtls_per_gene$tko_count,rescale.p=TRUE)$p.value
# [1] 1.498391e-35



# But kbm7 crispr and TKO crispr genes are not different from each other

chisq.test(rh_qtls_per_gene$tko_count,p= rh_qtls_per_gene$kbm7_count,rescale.p=TRUE)

	# Chi-squared test for given probabilities

# data:  rh_qtls_per_gene$tko_count
# X-squared = 16.607, df = 22, p-value = 0.7847




#---------- g individual comparisons -----------

# For each chromosome in turn, test (1 df chi-squared goodness of fit) whether
# its share of RH growth loci differs from its share of gencode genes:
# observed = c(loci on this chromosome, loci on all others),
# expected proportions = c(genes on this chromosome, genes on all others).
# One result row per row of rh_qtls_per_gene; the number of rows is taken from
# the table rather than hard-coded, and the result is allocated up front.
n_chr <- nrow(rh_qtls_per_gene)

g_surfeit_deficit <- data.frame(
  "Chromosome" = as.character(rh_qtls_per_gene$Chromosome),
  "chi.sq" = rep(NA_real_, n_chr),
  "df" = rep(NA_real_, n_chr),
  "p" = rep(NA_real_, n_chr),
  "obs" = rep(NA_real_, n_chr),
  "exp" = rep(NA_real_, n_chr),
  "obs/exp" = rep(NA_real_, n_chr),
  "sig" = rep(NA_character_, n_chr),
  stringsAsFactors = FALSE,
  check.names = FALSE
)

for (i in seq_len(n_chr)) {
  ans <- chisq.test(
    c(rh_qtls_per_gene[i, "g_count"], sum(rh_qtls_per_gene[, "g_count"][-i])),
    p = c(rh_qtls_per_gene[i, "ensembl_genes"], sum(rh_qtls_per_gene[, "ensembl_genes"][-i])),
    rescale.p = TRUE
  )
  g_surfeit_deficit[i, "chi.sq"] <- ans$statistic
  g_surfeit_deficit[i, "df"] <- ans$parameter
  g_surfeit_deficit[i, "p"] <- ans$p.value
  # first element = the chromosome under test
  g_surfeit_deficit[i, "obs"] <- ans$observed[1]
  g_surfeit_deficit[i, "exp"] <- ans$expected[1]
}

g_surfeit_deficit[, "obs/exp"] <- g_surfeit_deficit[, "obs"] / g_surfeit_deficit[, "exp"]

# Significance stars: p < 0.001 "***", < 0.01 "**", < 0.05 "*", else "n.s.".
# Intervals are closed on the left, matching the >= comparisons of the
# thresholds; an NA p-value gives an NA label instead of an error.
g_surfeit_deficit$sig <- as.character(cut(
  g_surfeit_deficit$p,
  breaks = c(-Inf, 0.001, 0.01, 0.05, Inf),
  labels = c("***", "**", "*", "n.s."),
  right = FALSE
))


g_surfeit_deficit$Chromosome <- factor(g_surfeit_deficit$Chromosome, levels = paste0("chr", c(1:22, "X", "Y")))
g_surfeit_deficit <- g_surfeit_deficit[order(g_surfeit_deficit$Chromosome), ]
# g_surfeit_deficit$Chromosome <- as.character(g_surfeit_deficit$Chromosome)
row.names(g_surfeit_deficit) <- NULL

# obs/exp gives exactly same numbers as rh_qtls_per_gene$norm_g_count

g_surfeit_deficit
   # Chromosome       chi.sq df           p obs      exp   obs/exp  sig
# 1        chr1 1.926988e-05  1 0.996497500  78 77.96305 1.0004740 n.s.
# 2        chr2 5.323397e+00  1 0.021040949  77 59.79400 1.2877547    *
# 3        chr3 3.137946e+00  1 0.076490076  57 45.38701 1.2558660 n.s.
# 4        chr4 9.196351e+00  1 0.002424980  56 37.77738 1.4823686   **
# 5        chr5 1.039492e+01  1 0.001263624  63 42.50846 1.4820580   **
# 6        chr6 9.927524e-01  1 0.319070603  50 43.59148 1.1470132 n.s.
# 7        chr7 2.701582e-02  1 0.869443896  44 42.95022 1.0244419 n.s.
# 8        chr8 5.629966e-01  1 0.453055373  31 35.36909 0.8764715 n.s.
# 9        chr9 1.608508e+00  1 0.204701632  26 33.16030 0.7840700 n.s.
# 10      chr10 4.446217e-01  1 0.504900164  37 33.23155 1.1133996 n.s.
# 11      chr11 1.121320e+00  1 0.289634337  55 47.88080 1.1486860 n.s.
# 12      chr12 1.369175e+00  1 0.241953666  36 43.52023 0.8272016 n.s.
# 13      chr13 2.587032e+00  1 0.107742086  27 19.90758 1.3562672 n.s.
# 14      chr14 5.010081e+00  1 0.025200134  20 32.51904 0.6150243    *
# 15      chr15 4.370626e-01  1 0.508543641  28 31.64978 0.8846824 n.s.
# 16      chr16 2.546586e+00  1 0.110532894  27 36.42361 0.7412775 n.s.
# 17      chr17 5.885165e+00  1 0.015268967  28 43.60573 0.6421175    *
# 18      chr18 9.768596e-02  1 0.754624442  19 17.69879 1.0735194 n.s.
# 19      chr19 9.518524e+00  1 0.002034081  23 42.63671 0.5394412   **
# 20      chr20 4.704630e+00  1 0.030081482  11 20.76260 0.5297989    * <<<<<<<<<<<<<< use in paper
# 21      chr21 7.255938e+00  1 0.007066697   3 12.42621 0.2414252   **
# 22      chr22 2.345398e+00  1 0.125653471  13 19.72233 0.6591514 n.s.
# 23       chrX 3.983187e+00  1 0.045956539  46 34.51408 1.3327896    * <<<<<<<<<<<<<<< use in paper


#---------- kbm7 individual comparisons -----------

# For each chromosome in turn, test (1 df chi-squared goodness of fit) whether
# its share of KBM7 CRISPR hits differs from its share of gencode genes:
# observed = c(hits on this chromosome, hits on all others),
# expected proportions = c(genes on this chromosome, genes on all others).
# One result row per row of rh_qtls_per_gene; the number of rows is taken from
# the table rather than hard-coded, and the result is allocated up front.
n_chr <- nrow(rh_qtls_per_gene)

kbm7_surfeit_deficit <- data.frame(
  "Chromosome" = as.character(rh_qtls_per_gene$Chromosome),
  "chi.sq" = rep(NA_real_, n_chr),
  "df" = rep(NA_real_, n_chr),
  "p" = rep(NA_real_, n_chr),
  "obs" = rep(NA_real_, n_chr),
  "exp" = rep(NA_real_, n_chr),
  "obs/exp" = rep(NA_real_, n_chr),
  "sig" = rep(NA_character_, n_chr),
  stringsAsFactors = FALSE,
  check.names = FALSE
)

for (i in seq_len(n_chr)) {
  ans <- chisq.test(
    c(rh_qtls_per_gene[i, "kbm7_count"], sum(rh_qtls_per_gene[, "kbm7_count"][-i])),
    p = c(rh_qtls_per_gene[i, "ensembl_genes"], sum(rh_qtls_per_gene[, "ensembl_genes"][-i])),
    rescale.p = TRUE
  )
  kbm7_surfeit_deficit[i, "chi.sq"] <- ans$statistic
  kbm7_surfeit_deficit[i, "df"] <- ans$parameter
  kbm7_surfeit_deficit[i, "p"] <- ans$p.value
  # first element = the chromosome under test
  kbm7_surfeit_deficit[i, "obs"] <- ans$observed[1]
  kbm7_surfeit_deficit[i, "exp"] <- ans$expected[1]
}

kbm7_surfeit_deficit[, "obs/exp"] <- kbm7_surfeit_deficit[, "obs"] / kbm7_surfeit_deficit[, "exp"]

# Significance stars: p < 0.001 "***", < 0.01 "**", < 0.05 "*", else "n.s.".
# Intervals are closed on the left, matching the >= comparisons of the
# thresholds; an NA p-value gives an NA label instead of an error.
kbm7_surfeit_deficit$sig <- as.character(cut(
  kbm7_surfeit_deficit$p,
  breaks = c(-Inf, 0.001, 0.01, 0.05, Inf),
  labels = c("***", "**", "*", "n.s."),
  right = FALSE
))


kbm7_surfeit_deficit$Chromosome <- factor(kbm7_surfeit_deficit$Chromosome, levels = paste0("chr", c(1:22, "X", "Y")))
kbm7_surfeit_deficit <- kbm7_surfeit_deficit[order(kbm7_surfeit_deficit$Chromosome), ]
# kbm7_surfeit_deficit$Chromosome <- as.character(kbm7_surfeit_deficit$Chromosome)
row.names(kbm7_surfeit_deficit) <- NULL

# obs/exp gives exactly same numbers as rh_qtls_per_gene$norm_kbm7_count

kbm7_surfeit_deficit
   # Chromosome      chi.sq df            p obs       exp   obs/exp  sig
# 1        chr1  1.48200848  1 2.234605e-01 216 199.60364 1.0821446 n.s.
# 2        chr2  1.02602711  1 3.110936e-01 141 153.08662 0.9210472 n.s.
# 3        chr3  0.04404098  1 8.337774e-01 114 116.20135 0.9810557 n.s.
# 4        chr4  5.10259449  1 2.389008e-02  75  96.71893 0.7754428    *
# 5        chr5  0.93463151  1 3.336623e-01  99 108.83160 0.9096623 n.s.
# 6        chr6  0.05418541  1 8.159341e-01 114 111.60438 1.0214653 n.s.
# 7        chr7  5.04871156  1 2.464426e-02  87 109.96260 0.7911781    *
# 8        chr8 12.96912463  1 3.166700e-04  57  90.55314 0.6294646  ***
# 9        chr9  0.18620608  1 6.660932e-01  81  84.89813 0.9540846 n.s.
# 10      chr10  1.50144386  1 2.204493e-01  74  85.08055 0.8697640 n.s.
# 11      chr11  6.49427776  1 1.082223e-02 150 122.58604 1.2236303    *
# 12      chr12  0.11073037  1 7.393142e-01 108 111.42196 0.9692883 n.s.
# 13      chr13  5.12198219  1 2.362462e-02  35  50.96807 0.6867045    *
# 14      chr14  0.48872611  1 4.844961e-01  77  83.25635 0.9248543 n.s.
# 15      chr15  0.05285426  1 8.181691e-01  79  81.03083 0.9749375 n.s.
# 16      chr16  4.82121868  1 2.811146e-02 114  93.25295 1.2224814    *
# 17      chr17 22.07334373  1 2.624287e-06 160 111.64086 1.4331670  ***
# 18      chr18  0.63613562  1 4.251136e-01  40  45.31306 0.8827478 n.s.
# 19      chr19 15.30354969  1 9.154433e-05 149 109.15995 1.3649694  ***
# 20      chr20  0.15582514  1 6.930297e-01  56  53.15710 1.0534810 n.s.
# 21      chr21  0.02113418  1 8.844140e-01  31  31.81400 0.9744139 n.s.
# 22      chr22  0.12733200  1 7.212145e-01  53  50.49377 1.0496343 n.s.
# 23       chrX  1.03407429  1 3.092037e-01  79  88.36411 0.8940282 n.s.



#---------- tko individual comparisons -----------

 
# Per-chromosome surfeit/deficit tests for the TKO CRISPR genes.
# For every row (chromosome) of rh_qtls_per_gene a two-cell chi-square
# goodness-of-fit test is run: tko_count on that chromosome vs. the sum over all
# other chromosomes, against the proportions given by ensembl_genes for the same
# split (rescale.p = TRUE turns the gene counts into probabilities).
# The number of tests comes from nrow(rh_qtls_per_gene) instead of a hard-coded 23,
# and the result table is assembled in one step rather than grown cell by cell.
tko_tests <- lapply(seq_len(nrow(rh_qtls_per_gene)), function(i) {
	chisq.test(
		c(rh_qtls_per_gene[i, "tko_count"], sum(rh_qtls_per_gene[-i, "tko_count"])),
		p = c(rh_qtls_per_gene[i, "ensembl_genes"], sum(rh_qtls_per_gene[-i, "ensembl_genes"])),
		rescale.p = TRUE
	)
})

# Pull one numeric component out of every htest object; for "observed" and
# "expected" the first element is the chromosome being tested.
tko_field <- function(field) {
	vapply(tko_tests, function(ans) as.numeric(ans[[field]][1]), numeric(1))
}

tko_surfeit_deficit <- data.frame(
	"Chromosome" = as.character(rh_qtls_per_gene[["Chromosome"]]),
	"chi.sq" = tko_field("statistic"),
	"df" = tko_field("parameter"),
	"p" = tko_field("p.value"),
	"obs" = tko_field("observed"),
	"exp" = tko_field("expected"),
	stringsAsFactors = FALSE,
	check.names = FALSE
)
tko_surfeit_deficit[["obs/exp"]] <- tko_surfeit_deficit[["obs"]] / tko_surfeit_deficit[["exp"]]

# Star coding of the P value: p < 0.001 "***", [0.001, 0.01) "**", [0.01, 0.05) "*",
# p >= 0.05 "n.s.". cut() leaves a missing P value as NA instead of raising the
# "missing values are not allowed in subscripted assignments" error.
tko_surfeit_deficit[["sig"]] <- as.character(cut(
	tko_surfeit_deficit[["p"]],
	breaks = c(-Inf, 0.001, 0.01, 0.05, Inf),
	labels = c("***", "**", "*", "n.s."),
	right = FALSE
))

# Put the rows in karyotype order (chr1..chr22, chrX, chrY) and renumber them.
tko_surfeit_deficit$Chromosome <- factor(tko_surfeit_deficit$Chromosome, levels = paste0("chr", c(1:22, "X", "Y")))
tko_surfeit_deficit <- tko_surfeit_deficit[order(tko_surfeit_deficit$Chromosome), ]
# tko_surfeit_deficit$Chromosome <- as.character(tko_surfeit_deficit$Chromosome)
row.names(tko_surfeit_deficit) <- NULL

# obs/exp gives exactly same numbers as rh_qtls_per_gene$norm_tko_count

tko_surfeit_deficit
   # Chromosome       chi.sq df            p obs      exp   obs/exp  sig
# 1        chr1 9.547818e-02  1 0.7573249923  75 72.49196 1.0345975 n.s.
# 2        chr2 6.060135e-01  1 0.4362927720  50 55.59793 0.8993141 n.s.
# 3        chr3 4.418334e-01  1 0.5062388735  38 42.20195 0.9004323 n.s.
# 4        chr4 5.131920e+00  1 0.0234897282  22 35.12634 0.6263107    *
# 5        chr5 1.507756e+00  1 0.2194818099  32 39.52541 0.8096058 n.s.
# 6        chr6 7.369563e-03  1 0.9315887139  40 40.53243 0.9868642 n.s.
# 7        chr7 2.602884e+00  1 0.1066694774  30 39.93617 0.7511988 n.s.
# 8        chr8 1.504489e+00  1 0.2199818872  26 32.88705 0.7905848 n.s.
# 9        chr9 9.380346e-04  1 0.9755667303  31 30.83326 1.0054077 n.s.
# 10      chr10 8.082974e-01  1 0.3686241093  26 30.89951 0.8414372 n.s.
# 11      chr11 3.705472e+00  1 0.0542343215  57 44.52074 1.2803021 n.s.
# 12      chr12 1.671667e-01  1 0.6826422517  43 40.46617 1.0626159 n.s.
# 13      chr13 2.344491e+00  1 0.1257266336  12 18.51056 0.6482787 n.s.
# 14      chr14 2.071859e+00  1 0.1500385196  38 30.23700 1.2567383 n.s.
# 15      chr15 1.947333e+00  1 0.1628745441  22 29.42874 0.7475685 n.s.
# 16      chr16 3.822111e+00  1 0.0505804701  45 33.86756 1.3287049 n.s.
# 17      chr17 1.429669e+01  1 0.0001561392  64 40.54568 1.5784667  ***
# 18      chr18 1.232483e+00  1 0.2669246819  12 16.45677 0.7291830 n.s.
# 19      chr19 4.735229e+00  1 0.0295509173  53 39.64466 1.3368761    *
# 20      chr20 4.013062e+00  1 0.0451490905  28 19.30557 1.4503585    *
# 21      chr21 5.253674e-01  1 0.4685610566  14 11.55419 1.2116814 n.s.
# 22      chr22 6.220542e-01  1 0.4302849545  15 18.33831 0.8179600 n.s.
# 23       chrX 3.307160e+00  1 0.0689786042  22 32.09203 0.6855284 n.s.

 
 

# -------------- RH paclitaxel genes omnibus comparison --------------------------


# RH paclitaxel genes vs ensembl genes, not sig diff 

# Omnibus goodness-of-fit test: per-chromosome paclitaxel counts (d_count) against
# the distribution implied by ensembl_genes (rescaled to probabilities).
chisq.test(
	x = rh_qtls_per_gene$d_count,
	p = rh_qtls_per_gene$ensembl_genes,
	rescale.p = TRUE
)

# Recorded output:
#   Chi-squared test for given probabilities
#
# data:  rh_qtls_per_gene$d_count
# X-squared = 22.771, df = 22, p-value = 0.4148 <<<<<<<<<< use in paper
#
# Warning message:
# In chisq.test(rh_qtls_per_gene$d_count, p = rh_qtls_per_gene$ensembl_genes,  :
#   Chi-squared approximation may be incorrect


# Follow-up on the warning above: repeat the test with a simulated P value
# (10000 replicates). The P value is practically the same; the asymptotic
# chi-square P value is slightly larger, i.e. a little more conservative.
chisq.test(
	x = rh_qtls_per_gene$d_count,
	p = rh_qtls_per_gene$ensembl_genes,
	rescale.p = TRUE,
	simulate.p.value = TRUE,
	B = 1e4
)

# Recorded output:
#   Chi-squared test for given probabilities with simulated p-value (based on 10000 replicates)
#
# data:  rh_qtls_per_gene$d_count
# X-squared = 22.771, df = NA, p-value = 0.4

library(XNomial)

# Attempt the exact multinomial test. For these counts xmulti() refuses to run and
# stops with an error (originally recorded output below). Left uncaught, that error
# aborts a non-interactive run of this script (source() / Rscript) before any of
# the later sections execute, so it is caught here and reported as a message; the
# Monte Carlo version (xmonte) that follows is the one actually used.
invisible(tryCatch(
	xmulti(rh_qtls_per_gene$d_count, rh_qtls_per_gene$ensembl_genes, detail = 2),
	error = function(e) message("xmulti not run: ", conditionMessage(e))
))
# Originally recorded output of the bare xmulti() call:
# Full enumeration requires examination of 2.142582e+15 tables.
# Error in xmulti(rh_qtls_per_gene$d_count, rh_qtls_per_gene$ensembl_genes,  : 
  # This operation could take more than several minutes.
    # • The monte carlo version, "xmonte" is recommended for this case. 
    # • To override this cutoff, change the parameter 'safety' to something greater than the required number of trials.

# Monte Carlo multinomial goodness-of-fit test of d_count against the
# ensembl_genes ratios, 1e7 random trials. The recorded output shows the
# log-likelihood-ratio (LLR) statistic being reported for this call.
d_ensembl <- xmonte(
	rh_qtls_per_gene$d_count,
	rh_qtls_per_gene$ensembl_genes,
	ntrials = 1e7,
	detail = 3
)

# Recorded output:
# P value (LLR) = 0.2868436 ± 0.000143
#  1e+07  random trials
#  Observed:  3 2 2 0 0 3 5 0 3 2 3 1 1 0 2 2 1 0 2 2 0 0 0 
#  Expected Ratio:  5471 4196 3185 2651 2983 3059 3014 2482 2327 2332 3360 3054 1397 2282 2221 2556 3060 1242 2992 1457 872 1384 2422


# Individual components of the LLR result.
d_ensembl[["observedLLR"]]
# [1] -14.24168

d_ensembl[["pLLR"]]
# [1] 0.2868436 

d_ensembl[["standard.error"]]
# [1] 0.000143026


# Same Monte Carlo test with the chi-square statistic, run for comparison only;
# the LLR version above is the preferred one.

d_ensembl_2 <- xmonte(
	rh_qtls_per_gene$d_count,
	rh_qtls_per_gene$ensembl_genes,
	ntrials = 1e7,
	statName = "Chisq",
	detail = 3
)


# Recorded output:
# P value (Chisq) = 0.4008998 ± 0.000155
#  1e+07  random trials
#  Observed:  3 2 2 0 0 3 5 0 3 2 3 1 1 0 2 2 1 0 2 2 0 0 0 
#  Expected Ratio:  5471 4196 3185 2651 2983 3059 3014 2482 2327 2332 3360 3054 1397 2282 2221 2556 3060 1242 2992 1457 872 1384 2422


# Individual components of the chi-square result.
d_ensembl_2[["observedChi"]]
# [1] 22.7709

d_ensembl_2[["pChi"]]
# [1] 0.4008998

d_ensembl_2[["standard.error"]]
# [1] 0.0001549771





#---------- d individual comparisons -----------

 
# Per-chromosome surfeit/deficit tests for the RH paclitaxel genes.
# For every row (chromosome) of rh_qtls_per_gene a two-cell chi-square
# goodness-of-fit test is run: d_count on that chromosome vs. the sum over all
# other chromosomes, against the proportions given by ensembl_genes for the same
# split (rescale.p = TRUE turns the gene counts into probabilities).
# The number of tests comes from nrow(rh_qtls_per_gene) instead of a hard-coded 23,
# and the result table is assembled in one step rather than grown cell by cell.
d_tests <- lapply(seq_len(nrow(rh_qtls_per_gene)), function(i) {
	chisq.test(
		c(rh_qtls_per_gene[i, "d_count"], sum(rh_qtls_per_gene[-i, "d_count"])),
		p = c(rh_qtls_per_gene[i, "ensembl_genes"], sum(rh_qtls_per_gene[-i, "ensembl_genes"])),
		rescale.p = TRUE
	)
})

# Pull one numeric component out of every htest object; for "observed" and
# "expected" the first element is the chromosome being tested.
d_field <- function(field) {
	vapply(d_tests, function(ans) as.numeric(ans[[field]][1]), numeric(1))
}

d_surfeit_deficit <- data.frame(
	"Chromosome" = as.character(rh_qtls_per_gene[["Chromosome"]]),
	"chi.sq" = d_field("statistic"),
	"df" = d_field("parameter"),
	"p" = d_field("p.value"),
	"obs" = d_field("observed"),
	"exp" = d_field("expected"),
	stringsAsFactors = FALSE,
	check.names = FALSE
)
d_surfeit_deficit[["obs/exp"]] <- d_surfeit_deficit[["obs"]] / d_surfeit_deficit[["exp"]]

# Star coding of the P value: p < 0.001 "***", [0.001, 0.01) "**", [0.01, 0.05) "*",
# p >= 0.05 "n.s.". cut() leaves a missing P value as NA instead of raising the
# "missing values are not allowed in subscripted assignments" error.
d_surfeit_deficit[["sig"]] <- as.character(cut(
	d_surfeit_deficit[["p"]],
	breaks = c(-Inf, 0.001, 0.01, 0.05, Inf),
	labels = c("***", "**", "*", "n.s."),
	right = FALSE
))

# Put the rows in karyotype order (chr1..chr22, chrX, chrY) and renumber them.
d_surfeit_deficit$Chromosome <- factor(d_surfeit_deficit$Chromosome, levels = paste0("chr", c(1:22, "X", "Y")))
d_surfeit_deficit <- d_surfeit_deficit[order(d_surfeit_deficit$Chromosome), ]
# d_surfeit_deficit$Chromosome <- as.character(d_surfeit_deficit$Chromosome)
row.names(d_surfeit_deficit) <- NULL

# obs/exp gives exactly same numbers as rh_qtls_per_gene$norm_d_count

d_surfeit_deficit
   # Chromosome      chi.sq df           p obs       exp   obs/exp  sig
# 1        chr1 0.003569397  1 0.952359186   3 3.1002850 0.9676530 n.s.
# 2        chr2 0.064532401  1 0.799470617   2 2.3777730 0.8411232 n.s.
# 3        chr3 0.022280335  1 0.881343754   2 1.8048634 1.1081171 n.s.
# 4        chr4 1.571702588  1 0.209959929   0 1.5022584 0.0000000 n.s.
# 5        chr5 1.778834012  1 0.182292568   0 1.6903948 0.0000000 n.s.
# 6        chr6 0.975098442  1 0.323411925   3 1.7334622 1.7306405 n.s.
# 7        chr7 6.680901038  1 0.009745135   5 1.7079618 2.9274659   ** <<<<<<<<<<<< use in paper
# 8        chr8 1.467183615  1 0.225790582   0 1.4064901 0.0000000 n.s.
# 9        chr9 2.230289420  1 0.135328496   3 1.3186553 2.2750449 n.s.
# 10      chr10 0.362466090  1 0.547139812   2 1.3214887 1.5134447 n.s.
# 11      chr11 0.668267301  1 0.413656358   3 1.9040317 1.5756040 n.s.
# 12      chr12 0.324996066  1 0.568620549   1 1.7306288 0.5778246 n.s.
# 13      chr13 0.056143793  1 0.812698051   1 0.7916465 1.2631900 n.s.
# 14      chr14 1.344283313  1 0.246280007   0 1.2931549 0.0000000 n.s.
# 15      chr15 0.453542181  1 0.500657611   2 1.2585876 1.5890828 n.s.
# 16      chr16 0.219392434  1 0.639503244   2 1.4484241 1.3808110 n.s.
# 17      chr17 0.327419216  1 0.567182714   1 1.7340289 0.5766917 n.s.
# 18      chr18 0.718688837  1 0.396574331   0 0.7038117 0.0000000 n.s.
# 19      chr19 0.057558359  1 0.810397523   2 1.6954949 1.1795966 n.s.
# 20      chr20 1.711903396  1 0.190738601   2 0.8256471 2.4223424 n.s.
# 21      chr21 0.501429127  1 0.478872848   0 0.4941416 0.0000000 n.s.
# 22      chr22 0.802797919  1 0.370258152   0 0.7842797 0.0000000 n.s.
# 23       chrX 1.430223874  1 0.231727906   0 1.3724895 0.0000000 n.s.






# -------------- RH Ix genes omnibus comparison --------------------------


# RH Ix genes vs ensembl genes, sig diff 

# Omnibus goodness-of-fit test: per-chromosome Ix counts (Ix_count) against the
# distribution implied by ensembl_genes (rescaled to probabilities).
chisq.test(
	x = rh_qtls_per_gene$Ix_count,
	p = rh_qtls_per_gene$ensembl_genes,
	rescale.p = TRUE
)

# Recorded output:
# Chi-squared test for given probabilities
#
# data:  rh_qtls_per_gene$Ix_count
# X-squared = 50.945, df = 22, p-value = 0.0004363 <<<<<<<<<< use in paper
#
# Warning message:
# In chisq.test(rh_qtls_per_gene$Ix_count, p = rh_qtls_per_gene$ensembl_genes,  :
#   Chi-squared approximation may be incorrect


# Follow-up on the warning above: repeat the test with a simulated P value
# (10000 replicates). Both P values are significant and of similar magnitude;
# here the asymptotic chi-square P value (0.0004363) is the smaller of the two,
# i.e. slightly less conservative than the simulated one (0.0009999).
chisq.test(
	x = rh_qtls_per_gene$Ix_count,
	p = rh_qtls_per_gene$ensembl_genes,
	rescale.p = TRUE,
	simulate.p.value = TRUE,
	B = 1e4
)

# Recorded output:
# Chi-squared test for given probabilities with simulated p-value (based on 10000 replicates)
#
# data:  rh_qtls_per_gene$Ix_count
# X-squared = 50.945, df = NA, p-value = 0.0009999

library(XNomial)

# Attempt the exact multinomial test. For these counts xmulti() refuses to run and
# stops with an error (originally recorded output below). Left uncaught, that error
# aborts a non-interactive run of this script (source() / Rscript) before any of
# the later sections execute, so it is caught here and reported as a message; the
# Monte Carlo version (xmonte) that follows is the one actually used.
invisible(tryCatch(
	xmulti(rh_qtls_per_gene$Ix_count, rh_qtls_per_gene$ensembl_genes, detail = 2),
	error = function(e) message("xmulti not run: ", conditionMessage(e))
))
# Originally recorded output of the bare xmulti() call:
# Full enumeration requires examination of 1.963937e+19 tables.
# Error in xmulti(rh_qtls_per_gene$Ix_count, rh_qtls_per_gene$ensembl_genes,  : 
  # This operation could take more than several minutes.
    # • The monte carlo version, "xmonte" is recommended for this case. 
    # • To override this cutoff, change the parameter 'safety' to something greater than the required number of trials.

# Monte Carlo multinomial goodness-of-fit test of Ix_count against the
# ensembl_genes ratios, 1e7 random trials. The recorded output shows the
# log-likelihood-ratio (LLR) statistic being reported for this call.
Ix_ensembl <- xmonte(
	rh_qtls_per_gene$Ix_count,
	rh_qtls_per_gene$ensembl_genes,
	ntrials = 1e7,
	detail = 3
)

# Recorded output:
# P value (LLR) = 0.0001326 ± 3.641e-06
#  1e+07  random trials
#  Observed:  6 6 1 0 0 10 6 0 4 1 2 3 4 0 4 3 2 0 2 3 0 0 0 
#  Expected Ratio:  5471 4196 3185 2651 2983 3059 3014 2482 2327 2332 3360 3054 1397 2282 2221 2556 3060 1242 2992 1457 872 1384 2422



# Individual components of the LLR result.
Ix_ensembl[["observedLLR"]]
# [1] -28.22111

Ix_ensembl[["pLLR"]]
# [1] 0.0001326 

Ix_ensembl[["standard.error"]]
# [1] 3.641187e-06


# Same Monte Carlo test with the chi-square statistic, run for comparison only;
# the LLR version above is the preferred one.

Ix_ensembl_2 <- xmonte(
	rh_qtls_per_gene$Ix_count,
	rh_qtls_per_gene$ensembl_genes,
	ntrials = 1e7,
	statName = "Chisq",
	detail = 3
)


# Recorded output:
# P value (Chisq) = 0.0011208 ± 1.058e-05
#  1e+07  random trials
#  Observed:  6 6 1 0 0 10 6 0 4 1 2 3 4 0 4 3 2 0 2 3 0 0 0 
#  Expected Ratio:  5471 4196 3185 2651 2983 3059 3014 2482 2327 2332 3360 3054 1397 2282 2221 2556 3060 1242 2992 1457 872 1384 2422



# Individual components of the chi-square result.
Ix_ensembl_2[["observedChi"]]
# [1] 50.94538

Ix_ensembl_2[["pChi"]]
# [1] 0.0011208

Ix_ensembl_2[["standard.error"]]
# [1] 1.058085e-05





#---------- Ix individual comparisons -----------

 
# Per-chromosome surfeit/deficit tests for the RH Ix genes.
# For every row (chromosome) of rh_qtls_per_gene a two-cell chi-square
# goodness-of-fit test is run: Ix_count on that chromosome vs. the sum over all
# other chromosomes, against the proportions given by ensembl_genes for the same
# split (rescale.p = TRUE turns the gene counts into probabilities).
# The number of tests comes from nrow(rh_qtls_per_gene) instead of a hard-coded 23,
# and the result table is assembled in one step rather than grown cell by cell.
Ix_tests <- lapply(seq_len(nrow(rh_qtls_per_gene)), function(i) {
	chisq.test(
		c(rh_qtls_per_gene[i, "Ix_count"], sum(rh_qtls_per_gene[-i, "Ix_count"])),
		p = c(rh_qtls_per_gene[i, "ensembl_genes"], sum(rh_qtls_per_gene[-i, "ensembl_genes"])),
		rescale.p = TRUE
	)
})

# Pull one numeric component out of every htest object; for "observed" and
# "expected" the first element is the chromosome being tested.
Ix_field <- function(field) {
	vapply(Ix_tests, function(ans) as.numeric(ans[[field]][1]), numeric(1))
}

Ix_surfeit_deficit <- data.frame(
	"Chromosome" = as.character(rh_qtls_per_gene[["Chromosome"]]),
	"chi.sq" = Ix_field("statistic"),
	"df" = Ix_field("parameter"),
	"p" = Ix_field("p.value"),
	"obs" = Ix_field("observed"),
	"exp" = Ix_field("expected"),
	stringsAsFactors = FALSE,
	check.names = FALSE
)
Ix_surfeit_deficit[["obs/exp"]] <- Ix_surfeit_deficit[["obs"]] / Ix_surfeit_deficit[["exp"]]

# Star coding of the P value: p < 0.001 "***", [0.001, 0.01) "**", [0.01, 0.05) "*",
# p >= 0.05 "n.s.". cut() leaves a missing P value as NA instead of raising the
# "missing values are not allowed in subscripted assignments" error.
Ix_surfeit_deficit[["sig"]] <- as.character(cut(
	Ix_surfeit_deficit[["p"]],
	breaks = c(-Inf, 0.001, 0.01, 0.05, Inf),
	labels = c("***", "**", "*", "n.s."),
	right = FALSE
))

# Put the rows in karyotype order (chr1..chr22, chrX, chrY) and renumber them.
Ix_surfeit_deficit$Chromosome <- factor(Ix_surfeit_deficit$Chromosome, levels = paste0("chr", c(1:22, "X", "Y")))
Ix_surfeit_deficit <- Ix_surfeit_deficit[order(Ix_surfeit_deficit$Chromosome), ]
# Ix_surfeit_deficit$Chromosome <- as.character(Ix_surfeit_deficit$Chromosome)
row.names(Ix_surfeit_deficit) <- NULL

# obs/exp gives exactly same numbers as rh_qtls_per_gene$norm_Ix_count

Ix_surfeit_deficit
   # Chromosome       chi.sq df            p obs       exp   obs/exp  sig
# 1        chr1  0.136325571  1 7.119616e-01   6 5.1975366 1.1543930 n.s.
# 2        chr2  1.093765343  1 2.956384e-01   6 3.9862664 1.5051678 n.s.
# 3        chr3  1.432325365  1 2.313853e-01   1 3.0258004 0.3304911 n.s.
# 4        chr4  2.634913162  1 1.045378e-01   0 2.5184920 0.0000000 n.s.
# 5        chr5  2.982162902  1 8.418670e-02   0 2.8338972 0.0000000 n.s.
# 6        chr6 18.246792969  1 1.940524e-05  10 2.9060984 3.4410397  ***
# 7        chr7  3.617779727  1 5.716512e-02   6 2.8633477 2.0954493 n.s.
# 8        chr8  2.459690179  1 1.168018e-01   0 2.3579393 0.0000000 n.s.
# 9        chr9  1.506692001  1 2.196446e-01   4 2.2106868 1.8093924 n.s.
# 10      chr10  0.693780542  1 4.048814e-01   1 2.2154369 0.4513782 n.s.
# 11      chr11  0.471573716  1 4.922641e-01   2 3.1920532 0.6265560 n.s.
# 12      chr12  0.003534249  1 9.525940e-01   3 2.9013484 1.0340020 n.s.
# 13      chr13  5.511201753  1 1.889506e-02   4 1.3271721 3.0139271    * <<<<<<<<<< use in paper
# 14      chr14  2.253651437  1 1.332995e-01   0 2.1679361 0.0000000 n.s.
# 15      chr15  1.758055296  1 1.848673e-01   4 2.1099852 1.8957479 n.s.
# 16      chr16  0.140618360  1 7.076671e-01   3 2.4282405 1.2354625 n.s.
# 17      chr17  0.298224197  1 5.849978e-01   2 2.9070485 0.6879830 n.s.
# 18      chr18  1.204860697  1 2.723523e-01   0 1.1799197 0.0000000 n.s.
# 19      chr19  0.262790072  1 6.082098e-01   2 2.8424474 0.7036190 n.s.
# 20      chr20  1.933195366  1 1.644093e-01   3 1.3841731 2.1673590 n.s.
# 21      chr21  0.840631184  1 3.592163e-01   0 0.8284138 0.0000000 n.s.
# 22      chr22  1.345867099  1 2.460019e-01   0 1.3148219 0.0000000 n.s.
# 23       chrX  2.397728260  1 1.215116e-01   0 2.3009383 0.0000000 n.s.





# --------------------------------------------------------------------------
# ---------------- Plot histograms -----------------------------------------
# --------------------------------------------------------------------------



#----------------- human RH growth genes/ensembl genes ---------------------


# balloon_scale <- 5 # inflation factor for significant points	
# size_point <- 0.1*(1 + balloon_scale*(logP$log10p_g_avg/max(logP$log10p_g_avg))) # scale significant points


# Rows of the growth surfeit/deficit table whose test is significant
# (sig is anything other than "n.s."); they supply the star labels.
sig_rh_growth <- g_surfeit_deficit[g_surfeit_deficit$sig != "n.s.", ]


# Panel: grey bars of norm_g_count per chromosome, with the significance stars
# drawn 0.05 above each significant chromosome's obs/exp value.
# NOTE(review): `labels` is not defined in this section; presumably the
# chromosome tick labels are set up earlier in the script - confirm.
p1 <- ggplot() +
	geom_bar(
		data = rh_qtls_per_gene,
		mapping = aes(x = Chromosome, y = norm_g_count),
		stat = "identity",
		position = position_dodge(width = 0.5),
		fill = "grey65"
	) +
	geom_text(
		data = sig_rh_growth,
		mapping = aes(x = Chromosome, y = obs/exp + 0.05, label = sig),
		size = 2.5
	) +
	theme2 +
	scale_x_discrete(labels = labels, expand = c(0.062, 0)) +
	# alternative y label kept for reference:
	# ylab(expression('Loci per gene ('*''%*%''*10^{-2}*')')) + 
	labs(
		x = "Chromosome",
		y = expression(Obs*'/'*Exp),
		subtitle = "Growth"
	)
print(p1)



# ------------------ human RH paclitaxel genes/ensembl genes ------------------------

# Rows of the paclitaxel surfeit/deficit table whose test is significant
# (sig is anything other than "n.s."); they supply the star labels.
sig_rh_paclitaxel <- d_surfeit_deficit[d_surfeit_deficit$sig != "n.s.", ]


# Panel: grey bars of norm_d_count per chromosome, with the significance stars
# drawn 0.05 above each significant chromosome's obs/exp value.
# NOTE(review): `labels` is not defined in this section; presumably the
# chromosome tick labels are set up earlier in the script - confirm.
p2 <- ggplot() +
	geom_bar(
		data = rh_qtls_per_gene,
		mapping = aes(x = Chromosome, y = norm_d_count),
		stat = "identity",
		fill = "grey65"
	) +
	geom_text(
		data = sig_rh_paclitaxel,
		mapping = aes(x = Chromosome, y = obs/exp + 0.05, label = sig),
		size = 2.5
	) +
	theme2 +
	scale_x_discrete(labels = labels, expand = c(0.062, 0)) +
	# alternative y label kept for reference:
	# ylab(expression('Loci per gene ('*''%*%''*10^{-2}*')')) + 
	labs(
		x = "Chromosome",
		y = expression(Obs*'/'*Exp),
		subtitle = "Paclitaxel"
	)
print(p2)


# ------------------ human RH Ix genes/ensembl genes ------------------------


# Rows of the Ix surfeit/deficit table whose test is significant
# (sig is anything other than "n.s."); they supply the star labels.
sig_rh_Ix <- Ix_surfeit_deficit[Ix_surfeit_deficit$sig != "n.s.", ]


# Panel: grey bars of norm_Ix_count per chromosome, with the significance stars
# drawn 0.05 above each significant chromosome's obs/exp value.
# NOTE(review): `labels` is not defined in this section; presumably the
# chromosome tick labels are set up earlier in the script - confirm.
p3 <- ggplot() +
	geom_bar(
		data = rh_qtls_per_gene,
		mapping = aes(x = Chromosome, y = norm_Ix_count),
		stat = "identity",
		fill = "grey65"
	) +
	geom_text(
		data = sig_rh_Ix,
		mapping = aes(x = Chromosome, y = obs/exp + 0.05, label = sig),
		size = 2.5
	) +
	theme2 +
	scale_x_discrete(labels = labels, expand = c(0.062, 0)) +
	# alternative y label kept for reference:
	# ylab(expression('Loci per gene ('*''%*%''*10^{-2}*')')) + 
	labs(
		x = "Chromosome",
		y = expression(Obs*'/'*Exp),
		subtitle = "Ix"
	)
print(p3)



# ------------------ kbm7 crispr genes/ensembl genes ------------------------


# Rows of the KBM7 surfeit/deficit table whose test is significant
# (sig is anything other than "n.s."); they supply the star labels.
sig_kbm7_crisp <- kbm7_surfeit_deficit[kbm7_surfeit_deficit$sig != "n.s.", ]

# Panel: grey bars of norm_kbm7_count per chromosome, with the significance stars
# drawn 0.05 above each significant chromosome's obs/exp value.
# NOTE(review): `labels` is not defined in this section; presumably the
# chromosome tick labels are set up earlier in the script - confirm.
p4 <- ggplot() +
	geom_bar(
		data = rh_qtls_per_gene,
		mapping = aes(x = Chromosome, y = norm_kbm7_count),
		stat = "identity",
		fill = "grey65"
	) +
	geom_text(
		data = sig_kbm7_crisp,
		mapping = aes(x = Chromosome, y = obs/exp + 0.05, label = sig),
		size = 2.5
	) +
	theme2 +
	scale_x_discrete(labels = labels, expand = c(0.062, 0)) +
	# alternative y label kept for reference:
	# ylab(expression('Loci per gene ('*''%*%''*10^{-2}*')')) + 
	labs(
		x = "Chromosome",
		y = expression(Obs*'/'*Exp),
		subtitle = "KBM7 CRISPR"
	)
print(p4)


# ------------------ tko crispr genes/ensembl genes ------------------------


# Rows of the TKO surfeit/deficit table whose test is significant
# (sig is anything other than "n.s."); they supply the star labels.
sig_tko_crisp <- tko_surfeit_deficit[tko_surfeit_deficit$sig != "n.s.", ]


# Panel: grey bars of norm_tko_count per chromosome, with the significance stars
# drawn 0.05 above each significant chromosome's obs/exp value.
# NOTE(review): `labels` is not defined in this section; presumably the
# chromosome tick labels are set up earlier in the script - confirm.
p5 <- ggplot() +
	geom_bar(
		data = rh_qtls_per_gene,
		mapping = aes(x = Chromosome, y = norm_tko_count),
		stat = "identity",
		fill = "grey65"
	) +
	geom_text(
		data = sig_tko_crisp,
		mapping = aes(x = Chromosome, y = obs/exp + 0.05, label = sig),
		size = 2.5
	) +
	theme2 +
	scale_x_discrete(labels = labels, expand = c(0.062, 0)) +
	# alternative y label kept for reference:
	# ylab(expression('Loci per gene ('*''%*%''*10^{-2}*')')) +  
	labs(
		x = "Chromosome",
		y = expression(Obs*'/'*Exp),
		subtitle = "TKO CRISPR"
	)
print(p5)



#------------------Make file --------------------------


# Decided to omit p4, p5

# Write panels A-C (p1, p2, p3) to qtls_per_gene_4.pdf on a 2 x 2 grid.
# The grid is print()ed explicitly: a bare plot_grid() call is only drawn through
# top-level autoprinting, so the PDF would come out empty when this script is run
# via source(). This also matches the explicit print(p1) ... print(p5) calls.
pdf("qtls_per_gene_4.pdf", width = 7.5, height = 6.67)
print(plot_grid(p1, p2, p3, labels = c("A", "B", "C"), ncol = 2, nrow = 2, label_size = 14))
dev.off()




































