# compare fxn

# Standard error of the mean of x, ignoring missing values: the square root
# of (sample variance / number of non-NA observations).
# Returns NA when x has fewer than two non-NA values, because var() is NA there.
sem <- function(x) {
  n_obs <- sum(!is.na(x))
  sqrt(var(x, na.rm = TRUE) / n_obs)
}

# Compare two numeric samples and print a report.
#
# Prints, in order:
#   * the two-sample t-test of a vs b (t.test() defaults, i.e. Welch),
#   * the unrounded p-value of that test,
#   * mean, sem, sd and number of non-NA values for a, then the same for b.
#
# a, b: numeric vectors; NAs are dropped from every summary statistic.
# Called for its printed output. The value of the last print() call is
# returned invisibly, as before.
compare <- function(a, b) {
  # Run the test once and reuse it for both the printed summary and the
  # exact p-value (it used to be computed twice).
  welch <- t.test(a, b)
  print(welch)

  print(paste0("exact P value = ", welch$p.value))

  print(paste0("mean of a = ", mean(a, na.rm = TRUE)))
  print(paste0("sem of a = ", sem(a)))
  print(paste0("sd of a = ", sd(a, na.rm = TRUE)))
  print(paste0("number in a = ", sum(!is.na(a))))

  print(paste0("mean of b = ", mean(b, na.rm = TRUE)))
  print(paste0("sem of b = ", sem(b)))
  print(paste0("sd of b = ", sd(b, na.rm = TRUE)))
  print(paste0("number in b = ", sum(!is.na(b))))
}



#-------- general observations on centromeres ---------------

# Load the unique growth and paclitaxel locus tables (tab-delimited with a
# header row; column names are kept exactly as written in the files).
g_unique <- read.delim(
  "growth_loci_unique.txt",
  header = TRUE, sep = "\t", stringsAsFactors = FALSE, check.names = FALSE
)
# NOTE(review): d_unique is not referenced again in the visible part of this
# script -- confirm whether it is still needed.
d_unique <- read.delim(
  "paclitaxel_loci_unique.txt",
  header = TRUE, sep = "\t", stringsAsFactors = FALSE, check.names = FALSE
)

dim(g_unique)
# [1] 859  50 <<<<<<< use in paper


# Number of CEN in g_unique <<<<<<<<<<<<<<<<<<<<<<<<<<< use in paper
g_unique[g_unique$geneSymbol == "CEN", c(1:11, 42:43)]
    # Chromosome      posS      posE       pos conc   log10P   sig_coef   dist ensembl_gene_id ensembl_tx_id geneSymbol coef_g_75nM  coef_g_avg
# 44        chr1 121470000 122470000 121970000  avg 16.16330 -0.0931521  56460            cen1          <NA>        CEN  -0.2177887 -0.09315210
# 538      chr11  53780000  54780000  54280000   75 22.46447 -0.2432813      0           cen11          <NA>        CEN  -0.2432813 -0.06453220
# 660      chr15  16480000  17480000  16980000   75 19.52518 -0.2873017 103674           cen15          <NA>        CEN  -0.2873017 -0.07923161
# 827       chrX  58250000  59250000  58750000   75 19.22160 -0.2016729      0            cenX          <NA>        CEN  -0.2016729 -0.03127083


# Proportion of g_unique that are CEN.
# Computed from the table (4 CEN rows out of 859) instead of being typed in as
# 4/859, so the value cannot drift out of step with the data.

sum(g_unique$geneSymbol == "CEN") / nrow(g_unique)
# [1] 0.004656577 <<<<<<<<<<<<<<< use in paper





# CEN have significantly lower logP vals than other genes

compare(
  g_unique[g_unique$geneSymbol == "CEN", "log10P"],
  g_unique[g_unique$geneSymbol != "CEN", "log10P"]
)

	# Welch Two Sample t-test <<<<<<<<<<<<<<< use in paper

# data:  a and b
# t = -3.1895, df = 3.9523, p-value = 0.0338
# alternative hypothesis: true difference in means is not equal to 0
# 95 percent confidence interval:
 # -8.2498386 -0.5516674
# sample estimates:
# mean of x mean of y 
 # 19.34364  23.74439 

# [1] "exact P value = 0.0337973252785756"
# [1] "mean of a = 19.3436361535961"
# [1] "sem of a = 1.28782917419748"
# [1] "sd of a = 2.57565834839495"
# [1] "number in a = 4"
# [1] "mean of b = 23.7443891941396"
# [1] "sem of b = 0.495172701360252"
# [1] "sd of b = 14.4790394559656"
# [1] "number in b = 855"








# ---------------- Percent cr and nc genes --------------------------------


# see peak_g_1.R for construction of gencode_gtf_ensembl_ucsc. This file lacks CEN
gencode_gtf_ensembl_ucsc <- read.delim(
  "gencode_gtf_ensembl_ucsc_v31.txt",
  header = TRUE, sep = "\t", stringsAsFactors = FALSE, check.names = FALSE
)



# Number of cr (gene_type "protein_coding"), nc (any other gene_type) and all
# genes in gencode. The cr fraction is the expected cr hit rate if all genes,
# cr and nc, are equivalent.

(gencode_no_cen_cr <- nrow(
  gencode_gtf_ensembl_ucsc[gencode_gtf_ensembl_ucsc$gene_type == "protein_coding", ]
))
# [1] 19975

(gencode_no_cen_nc <- nrow(
  gencode_gtf_ensembl_ucsc[gencode_gtf_ensembl_ucsc$gene_type != "protein_coding", ]
))
# [1] 40628

(gencode_no_cen_cr_nc <- nrow(gencode_gtf_ensembl_ucsc))
# [1] 60603

gencode_no_cen_cr / gencode_no_cen_cr_nc
# [1] 0.3296041


# Observed RH cr hit rate, higher than gencode.
# CEN rows are removed first.
g_unique_no_cen <- g_unique[g_unique$geneSymbol != "CEN", ]

(g_unique_no_cen_cr <- nrow(
  g_unique_no_cen[g_unique_no_cen$gene_type == "protein_coding", ]
))
# [1] 442 <<<<<<<<<<<< use in paper

(g_unique_no_cen_cr_nc <- nrow(g_unique_no_cen))
# [1] 855 <<<<<<<<<<<< use in paper

g_unique_no_cen_cr / g_unique_no_cen_cr_nc
# [1] 0.5169591 <<<<<<<<<<<< use in paper

# RH nc hit rate
(g_unique_no_cen_nc <- nrow(
  g_unique_no_cen[g_unique_no_cen$gene_type != "protein_coding", ]
))
# [1] 413 <<<<<<<<<<<< use in paper

g_unique_no_cen_nc / g_unique_no_cen_cr_nc
# [1] 0.4830409 <<<<<<<<<<<< use in paper


# g loci significantly more likely to be cr than nc compared to gencode.
# Goodness-of-fit test of the observed cr/nc counts against the gencode
# cr/nc counts (rescaled to probabilities by rescale.p).

chisq_vs_gencode_counts <- chisq.test(
  c(g_unique_no_cen_cr, g_unique_no_cen_nc),
  p = c(gencode_no_cen_cr, gencode_no_cen_nc),
  rescale.p = TRUE
)
chisq_vs_gencode_counts

	# Chi-squared test for given probabilities

# data:  c(g_unique_no_cen_cr, g_unique_no_cen_nc)
# X-squared = 135.82, df = 1, p-value < 2.2e-16

chisq_vs_gencode_counts$p.value
# [1] 2.181678e-31


# However, result could be confounded because cr genes are longer than nc genes in gencode
gencode_is_cr <- gencode_gtf_ensembl_ucsc$gene_type == "protein_coding"
gencode_is_nc <- gencode_gtf_ensembl_ucsc$gene_type != "protein_coding"

mean(gencode_gtf_ensembl_ucsc$geneLength[gencode_is_cr])
# [1] 68279.93

mean(gencode_gtf_ensembl_ucsc$geneLength[gencode_is_nc])
# [1] 15101.03

# ratio
mean(gencode_gtf_ensembl_ucsc$geneLength[gencode_is_cr]) /
  mean(gencode_gtf_ensembl_ucsc$geneLength[gencode_is_nc])
# [1] 4.52154

# on the other hand, nc genes are more numerous in gencode (see above)

# key calx is percent genome covered by cr and nc in gencode and RH growth genes
# percent genome covered by cr in gencode (numerator: summed cr geneLength, in bp)

sum(gencode_gtf_ensembl_ucsc$geneLength[gencode_is_cr])
# [1] 1363891503

# Human genome excluding mitochondrion 3088286401 bp from human_chr_lengths_1.R
# excluding chrY human genome is 3088286401-57227415

human_noY_noM <- 3088286401 - 57227415




# Hence percent genome covered by cr genes (incl introns) in gencode is roughly 2.2 times nc
# (0.4499719 vs 0.2024127, see recorded values below)
(gencode_no_cen_cr_geneLength <-
  sum(gencode_gtf_ensembl_ucsc$geneLength[gencode_is_cr]) / human_noY_noM)
# [1] 0.4499719


# percent genome covered by nc genes (incl introns) in gencode
(gencode_no_cen_nc_geneLength <-
  sum(gencode_gtf_ensembl_ucsc$geneLength[gencode_is_nc]) / human_noY_noM)
# [1] 0.2024127

# Hence, based on geneLength in gencode, expect ~69% hit rate for cr genes. RH growth genes have significantly fewer cr genes (and higher nc genes)
gencode_no_cen_cr_geneLength / (gencode_no_cen_cr_geneLength + gencode_no_cen_nc_geneLength)
# [1] 0.6897341


chisq_vs_geneLength <- chisq.test(
  c(g_unique_no_cen_cr, g_unique_no_cen_nc),
  p = c(gencode_no_cen_cr_geneLength, gencode_no_cen_nc_geneLength),
  rescale.p = TRUE
)
chisq_vs_geneLength

	# Chi-squared test for given probabilities

# data:  c(g_unique_no_cen_cr, g_unique_no_cen_nc)
# X-squared = 119.26, df = 1, p-value < 2.2e-16



chisq_vs_geneLength$p.value
# [1] 9.163736e-28


# But confounded by introns. What about txLength? Even excluding introns, cr tx lengths still give ~1.9 fold higher coverage than nc tx length
# (0.02547526 vs 0.01310518, see recorded values below)


# Percent genome covered by cr txs (excl introns)
(gencode_no_cen_cr_txLength <-
  sum(gencode_gtf_ensembl_ucsc$txLength[gencode_is_cr], na.rm = TRUE) / human_noY_noM)
# [1] 0.02547526



# percent genome covered by nc txs (excl introns)
(gencode_no_cen_nc_txLength <-
  sum(gencode_gtf_ensembl_ucsc$txLength[gencode_is_nc], na.rm = TRUE) / human_noY_noM)
# [1] 0.01310518


# Therefore, based on txLength, expect ~66% hit rate for cr genes. RH growth genes have significantly fewer cr genes (and higher nc genes). txLength is not quite as significant as geneLength, but still very significant.
gencode_no_cen_cr_txLength / (gencode_no_cen_cr_txLength + gencode_no_cen_nc_txLength)
# [1] 0.6603155



chisq_vs_txLength <- chisq.test(
  c(g_unique_no_cen_cr, g_unique_no_cen_nc),
  p = c(gencode_no_cen_cr_txLength, gencode_no_cen_nc_txLength),
  rescale.p = TRUE
)
chisq_vs_txLength

	# Chi-squared test for given probabilities

# data:  c(g_unique_no_cen_cr, g_unique_no_cen_nc)
# X-squared = 78.338, df = 1, p-value < 2.2e-16


chisq_vs_txLength$p.value
# [1] 8.683037e-19




# ---- But difficult to be certain using theoretical calx, therefore sample -------------
#---- compare nc_cr g loci with all locations sampled from logP file --------


# gene_sample.txt constructed from nc_cr_sample_1.R

gene_sample <- read.delim(
  "gene_sample.txt",
  header = TRUE, sep = "\t", stringsAsFactors = FALSE, check.names = FALSE
)

dim(gene_sample)
# [1] 305391     23

gene_sample_no_cen <- gene_sample[gene_sample$geneSymbol != "CEN", ]



# Number of cr and all genes in gene_sample. Expected cr hit rate by sampling each position in logP.
# Gives very different results to theoretical calx above
# Now not significantly different to observed hit rate in g_unique_no_cen

(gene_sample_no_cen_cr <- nrow(
  gene_sample_no_cen[gene_sample_no_cen$gene_type == "protein_coding", ]
))
# [1] 153517

(gene_sample_no_cen_nc <- nrow(
  gene_sample_no_cen[gene_sample_no_cen$gene_type != "protein_coding", ]
))
# [1] 137947

(gene_sample_no_cen_cr_nc <- nrow(gene_sample_no_cen))
# [1] 291464

gene_sample_no_cen_cr / gene_sample_no_cen_cr_nc
# [1] 0.52671


chisq.test(
  c(g_unique_no_cen_cr, g_unique_no_cen_nc),
  p = c(gene_sample_no_cen_cr, gene_sample_no_cen_nc),
  rescale.p = TRUE
)

	# Chi-squared test for given probabilities

# data:  c(g_unique_no_cen_cr, g_unique_no_cen_nc)
# X-squared = 0.32611, df = 1, p-value = 0.568





# -------- However, g_unique are non-redundant. To be fair, we should make gene_sample non-redundant ------------
# Makes enormous difference in calculation. Now cr are 0.44 of genes in gene_sample, and RH growth genes at 0.517 are significantly over-represented!
# Note this is opposite direction to theoretical calx above using geneLength and txLength
# However, this sampling approach is more realistic

# Keep only the first row seen for each ensembl_gene_id.
first_row_per_gene <- !duplicated(gene_sample_no_cen$ensembl_gene_id)
gene_sample_no_cen_unique <- gene_sample_no_cen[first_row_per_gene, ]



(gene_sample_no_cen_unique_cr <- nrow(
  gene_sample_no_cen_unique[gene_sample_no_cen_unique$gene_type == "protein_coding", ]
))
# [1] 18184


(gene_sample_no_cen_unique_nc <- nrow(
  gene_sample_no_cen_unique[gene_sample_no_cen_unique$gene_type != "protein_coding", ]
))
# [1] 22964


(gene_sample_no_cen_unique_cr_nc <- nrow(gene_sample_no_cen_unique))
# [1] 41148

gene_sample_no_cen_unique_cr / gene_sample_no_cen_unique_cr_nc
# [1] 0.441917 <<<<<<<<< use in paper

gene_sample_no_cen_unique_nc / gene_sample_no_cen_unique_cr_nc
# [1] 0.558083 <<<<<<<<< use in paper


chisq.test(
  c(g_unique_no_cen_cr, g_unique_no_cen_nc),
  p = c(gene_sample_no_cen_unique_cr, gene_sample_no_cen_unique_nc),
  rescale.p = TRUE
)

	# Chi-squared test for given probabilities

# data:  c(g_unique_no_cen_cr, g_unique_no_cen_nc)
# X-squared = 19.523, df = 1, p-value = 9.942e-06 <<<<<<<<<<<<<<<< use in paper



# --------------- logP vals cr vs nc --------------------------------------

# including CEN: log10P of cr (protein_coding) loci vs all other loci
compare(
  g_unique$log10P[g_unique$gene_type == "protein_coding"],
  g_unique$log10P[g_unique$gene_type != "protein_coding"]
)

	# Welch Two Sample t-test 

# data:  a and b
# t = -0.37357, df = 832.44, p-value = 0.7088
# alternative hypothesis: true difference in means is not equal to 0
# 95 percent confidence interval:
 # -2.313471  1.573658
# sample estimates:
# mean of x mean of y 
 # 23.54433  23.91423 

# [1] "exact P value = 0.708818709017806"
# [1] "mean of a = 23.5443262735282"
# [1] "sem of a = 0.648357220861902"
# [1] "sd of a = 13.6309299735178"
# [1] "number in a = 442"
# [1] "mean of b = 23.9142328362213"
# [1] "sem of b = 0.748405816437236"
# [1] "sd of b = 15.2828792429354"
# [1] "number in b = 417"

# excluding CEN: same comparison on the table with CEN rows removed
compare(
  g_unique_no_cen$log10P[g_unique_no_cen$gene_type == "protein_coding"],
  g_unique_no_cen$log10P[g_unique_no_cen$gene_type != "protein_coding"]
)

	# Welch Two Sample t-test <<<<<<<<<<<<<<<<<<<< use in paper

# data:  a and b
# t = -0.4161, df = 824.61, p-value = 0.6774
# alternative hypothesis: true difference in means is not equal to 0
# 95 percent confidence interval:
 # -2.367950  1.539602
# sample estimates:
# mean of x mean of y 
 # 23.54433  23.95850 

# [1] "exact P value = 0.677447819416332"
# [1] "mean of a = 23.5443262735282"
# [1] "sem of a = 0.648357220861902"
# [1] "sd of a = 13.6309299735178"
# [1] "number in a = 442"
# [1] "mean of b = 23.9585001164404"
# [1] "sem of b = 0.755257510444029"
# [1] "sd of b = 15.3486463124574"
# [1] "number in b = 413"




# ------- mean distance g_unique_no_cen vs gene_sample_no_cen_unique ---------

# To keep power equal, must choose same number of genes from gene_sample_no_cen_unique as in g_unique_no_cen

# Go to START HERE (below) if sampling already done, to save time.

## DO NOT DELETE!!!
# ans <- data.frame(mean.sample=numeric(),sd.sample=numeric())

# # number permutations
# # takes about 20 min for 1e5 samples
# n <- 1e5

# for(i in c(1:n)) {
	# print(i)
	# sample.dist <- gene_sample_no_cen_unique[sample(c(1:nrow(gene_sample_no_cen_unique)),dim(g_unique_no_cen)[1],replace=FALSE),"dist"]
	# ans[i,"mean.sample"] <- mean(sample.dist) # measures bias (systematic error, trueness or previously, accuracy)
	# ans[i,"sd.sample"] <- sd(sample.dist) # precision (random errors)
# }


# write.table(ans,"gene_sample_mean_dist.txt",quote=FALSE,sep="\t",row.names=FALSE,col.names=TRUE)


# START HERE
# Load the saved permutation results: one row per random draw, with columns
# mean.sample and sd.sample (mean and sd of "dist" for that draw), as written
# by the commented-out sampling loop above.
ans <- read.table("gene_sample_mean_dist.txt",header=TRUE,sep="\t",stringsAsFactors=FALSE)

# Number of permutations. The P value calculations below divide by (n + 1),
# but n was only ever set inside the commented-out sampling loop, so starting
# from here left it undefined. Take it from the saved table instead.
n <- nrow(ans)



# Bias. Highly significant in g_unique compared to sample

mean(g_unique_no_cen$dist)
# [1] -1854.125 

# no sem for single observation
sem(mean(g_unique_no_cen$dist))
# [1] NA


# Observed mean distance, reused in every comparison against the sampled means.
obs_mean_dist <- mean(g_unique_no_cen$dist)

mean(ans$mean.sample)
# [1] 11626.93 

sd(ans$mean.sample)
# [1] 3065.098 

# meaningless sem because can make arbitrarily small by ramping up number of samples
sem(ans$mean.sample)
# [1] 9.692692

# meaningless t test because power can be arbitrarily ramped up by increasing samples
bias_t_test <- t.test(ans$mean.sample, mu = obs_mean_dist)
bias_t_test

	# One Sample t-test

# data:  ans$mean.sample
# t = 1390.8, df = 99999, p-value < 2.2e-16
# alternative hypothesis: true mean is not equal to -1854.125
# 95 percent confidence interval:
 # 11607.93 11645.93
# sample estimates:
# mean of x 
 # 11626.93  

bias_t_test$p.value
# [1] 0

# However, no overlap between mean(g_unique_no_cen$dist) and lowest sample.
# Therefore p ~< 10^-5 (1 / number of samples)
min(ans$mean.sample)
# [1] 6444.676

# Actual P val calx, one sided. cf https://stats.stackexchange.com/questions/109207/p-values-equal-to-0-in-permutation-test, comment by Trisoloriansunscreen
p_lower_tail <- (sum(ans$mean.sample <= obs_mean_dist) + 1) / (n + 1)
p_lower_tail
# [1] 9.9999e-06 

# Two sided. Right hand term swings obs value to opposite of mean of null.
p_lower_tail +
  (sum(ans$mean.sample >= 2 * mean(ans$mean.sample) - obs_mean_dist) + 1) / (n + 1)
# [1] 0.004129959

# Two sided using median(ans$mean.sample) (see below for explanation)
p_lower_tail +
  (sum(ans$mean.sample >= 2 * median(ans$mean.sample) - obs_mean_dist) + 1) / (n + 1)
# [1] 0.01116989

# For graph:
hist(ans$mean.sample, xlim = c(-10e3, 5e4), breaks = 100)
abline(v = obs_mean_dist, col = "red", lwd = 0.5)










# Precision. No significant diff between g_unique and sample.

(obs_sd_dist <- sd(g_unique_no_cen$dist))
# [1] 42170.62 

# no sem for single observation
sem(obs_sd_dist)
# [1] NA

mean(ans$sd.sample)
# [1] 52417.86 

sd(ans$sd.sample)
# [1] 73736.6 

# One sided P value
p_upper_tail <- (sum(ans$sd.sample >= obs_sd_dist) + 1) / (n + 1)
p_upper_tail
# [1] 0.101009

# Two sided P value. ans$sd.sample is so non-normal that mean(ans$sd.sample)
# (52417.86) is greater than sd(g_unique_no_cen$dist) (42170.62), even though
# the graph below suggests otherwise. Reflecting the observed value about the
# mean therefore gives a "two sided" P value > 1 (in fact, 1.00095), so the
# median has to be used as the centre instead.
p_upper_tail +
  (sum(ans$sd.sample <= 2 * mean(ans$sd.sample) - obs_sd_dist) + 1) / (n + 1)
# [1] 1.00095

# Two sided P val using median(ans$sd.sample)
p_upper_tail +
  (sum(ans$sd.sample <= 2 * median(ans$sd.sample) - obs_sd_dist) + 1) / (n + 1)
# [1] 0.101029 


# For graph:
hist(ans$sd.sample, breaks = 100)
abline(v = obs_sd_dist, col = "red", lwd = 0.5)


#------- overlap between petal core fitness genes and RH-BSA g genes---------


petal <- read.table("petal.txt", header = TRUE, stringsAsFactors = FALSE, sep = "\t")
g_unique <- read.delim(
  "growth_loci_unique.txt",
  header = TRUE, sep = "\t", stringsAsFactors = FALSE, check.names = FALSE
)

dim(g_unique)
# [1] 859  50


# Rows treated as "core" below: numTKOHits of at least 3.
is_core <- petal$numTKOHits >= 3

dim(petal[is_core, ])
# [1] 1580    9


# Gene symbols present both among the core rows and in g_unique.
core_in_rh <- intersect(petal$Gene[is_core], g_unique$geneSymbol)

length(core_in_rh)
# [1] 21

core_in_rh
# [1] "ABCB7"  "BRCA2"  "DHX37"  "ELAC2"  "KANSL1" "MCM5"   "MRPS11" "NBAS"   "NUP155" "RHPN1"  "RIOK1"  "RUVBL1" "SHOC2"  "SHQ1"   "SMC2"   "SMC5"   "SPC24" 
# [18] "TBCE"   "TRRAP"  "USPL1"  "WDR18" 

# 0/1 indicator columns: core (numTKOHits >= 3) and RH (Gene found in g_unique).
petal$core <- 0
petal[is_core, "core"] <- 1

petal$RH <- 0
petal[petal$Gene %in% g_unique$geneSymbol, "RH"] <- 1

# Fisher test consistent with lack of overlap between RH genes and CRISPR genes
# Did not include in paper, because already adequately discussed

fisher.test(table(petal[, c("core", "RH")]))

	# Fisher's Exact Test for Count Data

# data:  table(petal[, c("core", "RH")])
# p-value = 0.006094
# alternative hypothesis: true odds ratio is not equal to 1
# 95 percent confidence interval:
 # 0.3356615 0.8565174
# sample estimates:
# odds ratio 
 # 0.5502926  



# ----------- Number nc genes, excluding CEN -----------------

dim(g_unique[g_unique$geneSymbol != "CEN" & g_unique$gene_type != "protein_coding", ])
# [1] 413  50

#------------ genes with -AS in name (antisense) ----------------
# Selected by "antisense RNA" appearing in gene_description, not by the symbol.

is_antisense <- grepl("antisense RNA", g_unique[, "gene_description"])

sum(is_antisense)
# [1] 20


g_unique[is_antisense, "geneSymbol"]
 # [1] "NFIA-AS2"     "GNG12-AS1"    "ZRANB2-AS2"   "EIF1B-AS1"    "ARHGEF3-AS1"  "SYNPR-AS1"    "TAPT1-AS1"    "TMEM161B-AS1" "LIX1-AS1"     "P4HA2-AS1"   
# [11] "STK32A-AS1"   "SAP30L-AS1"   "LY86-AS1"     "RNF217-AS1"   "MNX1-AS1"     "SMC5-AS1"     "PGR-AS1"      "TBX5-AS1"     "DNAH17-AS1"   "PTCHD1-AS"  




























