
library(XNomial)


# ----------- functions ---------------------

# Standard error of the mean of x: sqrt(variance / n), where both the
# variance and n are computed on the non-NA entries only.
sem <- function(x) {
	n_obs <- sum(!is.na(x))
	sqrt(var(x, na.rm=TRUE) / n_obs)
}

# Compare two numeric samples: print a two-sample t-test followed by
# descriptive statistics for each sample.
#
# Args:
#   a, b: numeric vectors; may contain NA.
#
# Output (printed): the t.test(a, b) result, then mean, SEM, SD and the number
# of non-missing values for `a` and then for `b`, each as a "label = value"
# string.
#
# Returns (invisibly) the last printed string, as print() does.
compare <- function(a,b) {
	print(t.test(a,b))

	# Print the four summary lines for one sample. Missing values are dropped
	# from mean and sd (na.rm=TRUE) so they agree with sem() and with the
	# reported count, both of which already skip NA; without it a single NA
	# would turn the printed mean and sd into NA.
	describe <- function(label, x) {
		print(paste0("mean of ", label, " = ", mean(x, na.rm=TRUE)))
		print(paste0("sem of ", label, " = ", sem(x)))
		print(paste0("sd of ", label, " = ", sd(x, na.rm=TRUE)))
		print(paste0("number in ", label, " = ", sum(!is.na(x))))
	}

	describe("a", a)
	describe("b", b)
}


# ------------------------- data ----------------------------------------

# Read the growth (g_unique) and paclitaxel (d_unique) locus tables.
# Both files are tab-delimited with a header row; strings stay character and
# column names are kept exactly as written in the files.
g_unique <- read.delim(
	"growth_loci_unique.txt",
	header = TRUE,
	sep = "\t",
	stringsAsFactors = FALSE,
	check.names = FALSE
)
d_unique <- read.delim(
	"paclitaxel_loci_unique.txt",
	header = TRUE,
	sep = "\t",
	stringsAsFactors = FALSE,
	check.names = FALSE
)



 # ---------------- Percent d cr and nc genes --------------------------------

dim(d_unique)
# [1] 38 50

# Drop the rows whose geneSymbol is "CEN"
d_unique_no_cen <- d_unique[d_unique$geneSymbol != "CEN", ]

# cr: non-CEN rows with gene_type "protein_coding"
d_unique_no_cen_cr <- nrow(d_unique_no_cen[d_unique_no_cen$gene_type == "protein_coding", ])
d_unique_no_cen_cr
# [1] 19 <<<<<<<<<<<< use in paper

# cr + nc: all non-CEN rows
d_unique_no_cen_cr_nc <- nrow(d_unique_no_cen)
d_unique_no_cen_cr_nc
# [1] 34  <<<<<<<<<<<< use in paper

# cr fraction of the non-CEN rows
d_unique_no_cen_cr / d_unique_no_cen_cr_nc
# [1] 0.5588235 <<<<<<<<<<<< use in paper

# RH nc hit rate
# nc: non-CEN rows with any gene_type other than "protein_coding"
d_unique_no_cen_nc <- nrow(d_unique_no_cen[d_unique_no_cen$gene_type != "protein_coding", ])
d_unique_no_cen_nc
# [1] 15 <<<<<<<<<<<< use in paper

d_unique_no_cen_nc / d_unique_no_cen_cr_nc
# [1] 0.4411765 <<<<<<<<<<<< use in paper



# --------- Differences in log10P vals for cr and nc d genes, CEN excluded --------------


# CEN excluded, cr genes have significantly lower logP vals than nc

# a = log10P of protein-coding rows, b = log10P of all other rows;
# rows with geneSymbol "CEN" are left out of both groups.
compare(
	a = d_unique[
		d_unique$geneSymbol != "CEN" & d_unique$gene_type == "protein_coding",
		"log10P"
	],
	b = d_unique[
		d_unique$geneSymbol != "CEN" & d_unique$gene_type != "protein_coding",
		"log10P"
	]
)

	# Welch Two Sample t-test

# data:  a and b
# t = -2.1474, df = 28.535, p-value = 0.04039
# alternative hypothesis: true difference in means is not equal to 0
# 95 percent confidence interval:
 # -10.0291774  -0.2409359
# sample estimates:
# mean of x mean of y 
 # 14.32531  19.46037 

# [1] "mean of a = 14.3253090725951"
# [1] "sem of a = 1.4926095185676"
# [1] "sd of a = 6.50613405360305"
# [1] "number in a = 19"
# [1] "mean of b = 19.4603657059652"
# [1] "sem of b = 1.86820479562434"
# [1] "sd of b = 7.23552606075791"
# [1] "number in b = 15"




# -------- Evaluate nc cr enrichment in d_unique ----------------
# ----- Following logic in g_loci_text_blurb_1.R -----------------
# ---- Difficult to be certain using theoretical calculations, therefore sample -------------
#---- compare nc_cr d loci with all locations sampled from logP file --------


# gene_sample.txt constructed from nc_cr_sample_1.R

gene_sample <- read.delim(
	"gene_sample.txt",
	header = TRUE,
	sep = "\t",
	stringsAsFactors = FALSE,
	check.names = FALSE
)

dim(gene_sample)
# [1] 305391     23

# Drop the rows whose geneSymbol is "CEN"
gene_sample_no_cen <- gene_sample[gene_sample$geneSymbol != "CEN", ]



# Fraction of cr among all genes in gene_sample: the expected cr hit rate from sampling each position in logP.
# Gives very different results to the theoretical calculation (not in this file; NOTE(review): presumably the one in g_loci_text_blurb_1.R).
# The observed hit rate in d_unique_no_cen is not significantly different from this expectation (chi-squared test below).

gene_sample_no_cen_cr <- nrow(gene_sample_no_cen[gene_sample_no_cen$gene_type == "protein_coding", ])
gene_sample_no_cen_cr
# [1] 153517

gene_sample_no_cen_nc <- nrow(gene_sample_no_cen[gene_sample_no_cen$gene_type != "protein_coding", ])
gene_sample_no_cen_nc
# [1] 137947

gene_sample_no_cen_cr_nc <- nrow(gene_sample_no_cen)
gene_sample_no_cen_cr_nc
# [1] 291464

gene_sample_no_cen_cr / gene_sample_no_cen_cr_nc
# [1] 0.52671


# Observed d cr/nc counts against the sampled cr/nc counts; rescale.p=TRUE
# turns the sampled counts into proportions.
chisq.test(
	c(d_unique_no_cen_cr, d_unique_no_cen_nc),
	p = c(gene_sample_no_cen_cr, gene_sample_no_cen_nc),
	rescale.p = TRUE
)

	# Chi-squared test for given probabilities

# data:  c(d_unique_no_cen_cr, d_unique_no_cen_nc)
# X-squared = 0.14066, df = 1, p-value = 0.7076




# -------- However, d_unique are non-redundant. To be fair, we should make gene_sample non-redundant ------------
# This sampling approach is more realistic.
# Makes little difference in calculation. Now cr are 0.441917 of genes in gene_sample, and RH paclitaxel genes at 0.5588235 are still non-significantly over-represented. This is different from g_unique, where cr growth genes are significantly overrepresented; cf g_loci_text_blurb_1.R


# Keep only the first row seen for each ensembl_gene_id
gene_sample_no_cen_unique <- gene_sample_no_cen[
	!duplicated(gene_sample_no_cen$ensembl_gene_id),
]



gene_sample_no_cen_unique_cr <- nrow(gene_sample_no_cen_unique[gene_sample_no_cen_unique$gene_type == "protein_coding", ])
gene_sample_no_cen_unique_cr
# [1] 18184


gene_sample_no_cen_unique_nc <- nrow(gene_sample_no_cen_unique[gene_sample_no_cen_unique$gene_type != "protein_coding", ])
gene_sample_no_cen_unique_nc
# [1] 22964


gene_sample_no_cen_unique_cr_nc <- nrow(gene_sample_no_cen_unique)
gene_sample_no_cen_unique_cr_nc
# [1] 41148

# cr fraction of the non-redundant sample
gene_sample_no_cen_unique_cr / gene_sample_no_cen_unique_cr_nc
# [1] 0.441917 <<<<<<<<< use in paper

# nc fraction of the non-redundant sample
gene_sample_no_cen_unique_nc / gene_sample_no_cen_unique_cr_nc
# [1] 0.558083 <<<<<<<<< use in paper


# Observed d cr/nc counts against the non-redundant sampled cr/nc counts
chisq.test(
	c(d_unique_no_cen_cr, d_unique_no_cen_nc),
	p = c(gene_sample_no_cen_unique_cr, gene_sample_no_cen_unique_nc),
	rescale.p = TRUE
)

	# Chi-squared test for given probabilities

# data:  c(d_unique_no_cen_cr, d_unique_no_cen_nc)
# X-squared = 1.8842, df = 1, p-value = 0.1699 <<<<<<<<<<<<<<<< use in paper




# ------------ Differences in nc cr in g_unique and d_unique ----------------------



# observed RH g cr hit rate
# remove CEN
g_unique_no_cen <- g_unique[g_unique$geneSymbol != "CEN", ]

# cr: non-CEN rows with gene_type "protein_coding"
g_unique_no_cen_cr <- nrow(g_unique_no_cen[g_unique_no_cen$gene_type == "protein_coding", ])
g_unique_no_cen_cr
# [1] 442 <<<<<<<<<<<< use in paper

# cr + nc: all non-CEN rows
g_unique_no_cen_cr_nc <- nrow(g_unique_no_cen)
g_unique_no_cen_cr_nc
# [1] 855 <<<<<<<<<<<< use in paper

g_unique_no_cen_cr / g_unique_no_cen_cr_nc
# [1] 0.5169591 <<<<<<<<<<<< use in paper

# RH nc hit rate
g_unique_no_cen_nc <- nrow(g_unique_no_cen[g_unique_no_cen$gene_type != "protein_coding", ])
g_unique_no_cen_nc
# [1] 413 <<<<<<<<<<<< use in paper

g_unique_no_cen_nc / g_unique_no_cen_cr_nc
# [1] 0.4830409 <<<<<<<<<<<< use in paper


# The cr/nc split in d_unique is not significantly different from that in g_unique,
# perhaps because d_unique lacks power

chisq.test(
	x = c(d_unique_no_cen_cr, d_unique_no_cen_nc),
	p = c(g_unique_no_cen_cr, g_unique_no_cen_nc),
	rescale.p = TRUE
)

	# Chi-squared test for given probabilities

# data:  c(d_unique_no_cen_cr, d_unique_no_cen_nc)
# X-squared = 0.23863, df = 1, p-value = 0.6252


# -------------- Overlap in d and g genes -----------------------

# Gene symbols present in both tables ("CEN" rows still included)
intersect(g_unique$geneSymbol, d_unique$geneSymbol)
# [1] "CEN"        "LSAMP"      "KHDRBS2"    "AC074389.2" "SEMA3D"     "TBC1D12"    "AK6P1"      "RN7SL584P"  "GATAD2A"    


# Growth rows whose symbol also occurs in the paclitaxel table
g_unique[
	g_unique$geneSymbol %in% d_unique$geneSymbol,
	c("Chromosome","pos","conc","log10P","sig_coef","dist","geneSymbol")
]
    # Chromosome       pos conc    log10P   sig_coef    dist geneSymbol
# 44        chr1 121970000  avg 16.163295 -0.0931521   56460        CEN
# 191       chr3 116650000    0 14.664971 -0.1955521       0      LSAMP
# 354       chr6  61740000   75 42.007826 -0.3358581       0    KHDRBS2
# 385       chr7   1620000   75 19.223579 -0.5102915     654 AC074389.2
# 398       chr7  85120000   75 21.176183 -0.5422764       0     SEMA3D
# 506      chr10  94440000    0 22.879916 -0.2762953       0    TBC1D12
# 538      chr11  54280000   75 22.464474 -0.2432813       0        CEN
# 587      chr12  34460000   75 18.175422 -0.2741533 -210043      AK6P1
# 660      chr15  16980000   75 19.525177 -0.2873017  103674        CEN
# 661      chr15  20100000    0  9.705184  0.0775562   -3972  RN7SL584P
# 771      chr19  19460000   75 20.278043 -0.3192729       0    GATAD2A
# 827       chrX  58750000   75 19.221598 -0.2016729       0        CEN



dim(g_unique)
# [1] 859  50

# g_unique_no_cen and d_unique_no_cen are the CEN-free tables built earlier in this script
dim(g_unique_no_cen)
# [1] 855  50 


# Non-CEN growth rows whose symbol is also a non-CEN paclitaxel symbol
dim(g_unique_no_cen[
	g_unique_no_cen$geneSymbol %in% d_unique_no_cen$geneSymbol,
	c("Chromosome","pos","conc","log10P","sig_coef","dist","geneSymbol")
])
# [1] 8  7

intersect(g_unique_no_cen$geneSymbol, d_unique_no_cen$geneSymbol)
# [1] "LSAMP"      "KHDRBS2"    "AC074389.2" "SEMA3D"     "TBC1D12"    "AK6P1"      "RN7SL584P"  "GATAD2A"  

# Overlap as a fraction of the non-CEN g_unique rows
sum(g_unique_no_cen$geneSymbol %in% d_unique_no_cen$geneSymbol) / nrow(g_unique_no_cen)
# [1] 0.009356725



dim(d_unique)
# [1] 38 50


dim(d_unique_no_cen)
# [1] 34 50

# Non-CEN paclitaxel rows whose symbol is also a non-CEN growth symbol
dim(d_unique_no_cen[
	d_unique_no_cen$geneSymbol %in% g_unique_no_cen$geneSymbol,
	c("Chromosome","pos","wk","log10P","sig_coef","dist","geneSymbol")
])
# [1] 8 7

intersect(d_unique_no_cen$geneSymbol, g_unique_no_cen$geneSymbol)
# [1] "LSAMP"      "KHDRBS2"    "AC074389.2" "SEMA3D"     "TBC1D12"    "AK6P1"      "RN7SL584P"  "GATAD2A"  

# Overlap as a fraction of the non-CEN d_unique rows
sum(d_unique_no_cen$geneSymbol %in% g_unique_no_cen$geneSymbol) / nrow(d_unique_no_cen)
# [1] 0.2352941


gencode_gtf_ensembl_ucsc <- read.delim(
	"gencode_gtf_ensembl_ucsc_v31.txt",
	header = TRUE,
	sep = "\t",
	stringsAsFactors = FALSE,
	check.names = FALSE
)

dim(gencode_gtf_ensembl_ucsc)
# [1] 60603    19

# Flag each gencode row: 1 if its gene_id appears among the ensembl_gene_id
# values of the growth / paclitaxel table, 0 otherwise.
gencode_gtf_ensembl_ucsc$growth_uniq <- as.numeric(
	gencode_gtf_ensembl_ucsc$gene_id %in% g_unique$ensembl_gene_id
)
gencode_gtf_ensembl_ucsc$paclitaxel_uniq <- as.numeric(
	gencode_gtf_ensembl_ucsc$gene_id %in% d_unique$ensembl_gene_id
)



# 2 x 2 cross-tabulation of the two flags
table(gencode_gtf_ensembl_ucsc[,c("growth_uniq","paclitaxel_uniq")])
           # paclitaxel_uniq
# growth_uniq     0     1
          # 0 59722    26
          # 1   847     8




# Exact test of association between the two flags
fisher.test(table(gencode_gtf_ensembl_ucsc[,c("growth_uniq","paclitaxel_uniq")]))

	# Fisher's Exact Test for Count Data

# data:  table(gencode_gtf_ensembl_ucsc[, c("growth_uniq", "paclitaxel_uniq")])
# p-value = 1.994e-08
# alternative hypothesis: true odds ratio is not equal to 1
# 95 percent confidence interval:
  # 8.460366 49.534598
# sample estimates:
# odds ratio 
  # 21.69037 


# Same test, p-value at full printed precision
fisher.test(table(gencode_gtf_ensembl_ucsc[,c("growth_uniq","paclitaxel_uniq")]))$p.value
# [1] 1.994426e-08




# Chi-squared version of the same comparison. One expected count is 0.48
# (see $expected below), hence the approximation warning at the end.
str(chisq.test(table(gencode_gtf_ensembl_ucsc[,c("growth_uniq","paclitaxel_uniq")])))
# List of 9
 # $ statistic: Named num 104
  # ..- attr(*, "names")= chr "X-squared"
 # $ parameter: Named int 1
  # ..- attr(*, "names")= chr "df"
 # $ p.value  : num 1.76e-24
 # $ method   : chr "Pearson's Chi-squared test with Yates' continuity correction"
 # $ data.name: chr "table(gencode_gtf_ensembl_ucsc[, c(\"growth_uniq\", \"paclitaxel_uniq\")])"
 # $ observed : 'table' int [1:2, 1:2] 59722 847 26 8
  # ..- attr(*, "dimnames")=List of 2
  # .. ..$ growth_uniq    : chr [1:2] "0" "1"
  # .. ..$ paclitaxel_uniq: chr [1:2] "0" "1"
 # $ expected : num [1:2, 1:2] 59714.48 854.52 33.52 0.48
  # ..- attr(*, "dimnames")=List of 2
  # .. ..$ growth_uniq    : chr [1:2] "0" "1"
  # .. ..$ paclitaxel_uniq: chr [1:2] "0" "1"
 # $ residuals: 'table' num [1:2, 1:2] 0.0308 -0.2573 -1.2989 10.8583
  # ..- attr(*, "dimnames")=List of 2
  # .. ..$ growth_uniq    : chr [1:2] "0" "1"
  # .. ..$ paclitaxel_uniq: chr [1:2] "0" "1"
 # $ stdres   : 'table' num [1:2, 1:2] 10.9 -10.9 -10.9 10.9
  # ..- attr(*, "dimnames")=List of 2
  # .. ..$ growth_uniq    : chr [1:2] "0" "1"
  # .. ..$ paclitaxel_uniq: chr [1:2] "0" "1"
 # - attr(*, "class")= chr "htest"
# Warning message:
# In chisq.test(table(gencode_gtf_ensembl_ucsc[, c("growth_uniq",  :
  # Chi-squared approximation may be incorrect














































