library(XNomial)


# ----------- functions ---------------------

# Standard error of the mean of x, ignoring missing values:
# sqrt(variance of the non-NA values / number of non-NA values).
sem <- function(x) {
  n_obs <- sum(!is.na(x))
  variance <- var(x, na.rm = TRUE)
  sqrt(variance / n_obs)
}

# Compare two numeric samples.
#
# Prints the result of t.test(a, b), the unrounded p-value of that test,
# and then the mean, standard error (via sem()), standard deviation and
# number of non-missing observations for each sample.
#
# a, b: numeric vectors; missing values are allowed.
#
# Missing values are ignored in every summary statistic, so the printed
# mean / sd agree with sem(), with the reported counts and with t.test(),
# all of which already drop NAs.
#
# Returns (invisibly, through print) the last printed string,
# "number in b = <n>".
compare <- function(a, b) {
  welch <- t.test(a, b)
  print(welch)

  # print() of the htest object rounds the p-value; report it in full too.
  print(paste0("exact P value = ", welch$p.value))

  print(paste0("mean of a = ", mean(a, na.rm = TRUE)))
  print(paste0("sem of a = ", sem(a)))
  print(paste0("sd of a = ", sd(a, na.rm = TRUE)))
  print(paste0("number in a = ", sum(!is.na(a))))

  print(paste0("mean of b = ", mean(b, na.rm = TRUE)))
  print(paste0("sem of b = ", sem(b)))
  print(paste0("sd of b = ", sd(b, na.rm = TRUE)))
  print(paste0("number in b = ", sum(!is.na(b))))
}


# ------------------------- data ----------------------------------------

# Tab-delimited locus tables with a header row. Strings stay as character
# vectors and column names are kept exactly as written in the files.
g_unique <- read.delim(
  "growth_loci_unique.txt",
  header = TRUE,
  sep = "\t",
  stringsAsFactors = FALSE,
  check.names = FALSE
)
d_unique <- read.delim(
  "paclitaxel_loci_unique.txt",
  header = TRUE,
  sep = "\t",
  stringsAsFactors = FALSE,
  check.names = FALSE
)
Ix <- read.delim(
  "Ix_loci.txt",
  header = TRUE,
  sep = "\t",
  stringsAsFactors = FALSE,
  check.names = FALSE
)



 # ---------------- Percent Ix cr and nc genes --------------------------------

# Rows and columns of the full Ix table.
dim(Ix)
# [1] 62 50 <<<<<<<<<<<<< use in paper

# remove CEN
Ix_no_cen <- Ix[Ix$geneSymbol != "CEN", ]

# Rows whose gene_type is "protein_coding" (cr).
(Ix_no_cen_cr <- nrow(
  Ix_no_cen[Ix_no_cen$gene_type == "protein_coding", ]
))
# [1] 26 <<<<<<<<<<<< use in paper

# All rows left after removing CEN (cr + nc).
(Ix_no_cen_cr_nc <- nrow(Ix_no_cen))
# [1] 57  <<<<<<<<<<<< use in paper

# Fraction of rows that are cr.
Ix_no_cen_cr / Ix_no_cen_cr_nc
# [1] 0.4561404 <<<<<<<<<<<< use in paper

# RH nc hit rate
# Rows whose gene_type is anything other than "protein_coding" (nc).
(Ix_no_cen_nc <- nrow(
  Ix_no_cen[Ix_no_cen$gene_type != "protein_coding", ]
))
# [1] 31 <<<<<<<<<<<< use in paper

# Fraction of rows that are nc.
Ix_no_cen_nc / Ix_no_cen_cr_nc
# [1] 0.5438596 <<<<<<<<<<<< use in paper



# --------- Differences in log10P vals for cr and nc Ix genes, CEN excluded --------------


# With CEN excluded, the log10P values of cr genes are not significantly different from those of nc genes

# First sample: log10P of non-CEN rows with gene_type "protein_coding";
# second sample: log10P of non-CEN rows with any other gene_type.
compare(
  Ix[["log10P"]][Ix$geneSymbol != "CEN" & Ix$gene_type == "protein_coding"],
  Ix[["log10P"]][Ix$geneSymbol != "CEN" & Ix$gene_type != "protein_coding"]
)

	# Welch Two Sample t-test

# data:  a and b
# t = 0.81886, df = 36.468, p-value = 0.4182
# alternative hypothesis: true difference in means is not equal to 0
# 95 percent confidence interval:
 # -1.152568  2.714719
# sample estimates:
# mean of x mean of y 
 # 12.39047  11.60940 

# [1] "exact P value = 0.418183623453641"
# [1] "mean of a = 12.3904744907595"
# [1] "sem of a = 0.858203467806406"
# [1] "sd of a = 4.37599622897786"
# [1] "number in a = 26"
# [1] "mean of b = 11.6093986577617"
# [1] "sem of b = 0.416321890259485"
# [1] "sd of b = 2.31798218405279"
# [1] "number in b = 31"




# -------- Evaluate nc cr enrichment in d_unique ----------------
# ----- Following logic in g_loci_text_blurb_1.R -----------------
# ---- Difficult to be certain using theoretical calculations, therefore sample -------------
#---- compare nc_cr g loci with all locations sampled from logP file --------


# gene_sample.txt constructed from nc_cr_sample_1.R

# Tab-delimited table of sampled gene locations, with a header row.
gene_sample <- read.delim(
  "gene_sample.txt",
  header = TRUE,
  sep = "\t",
  stringsAsFactors = FALSE,
  check.names = FALSE
)

dim(gene_sample)
# [1] 305391     23

# Same CEN exclusion as applied to Ix.
gene_sample_no_cen <- gene_sample[gene_sample$geneSymbol != "CEN", ]



# percent of cr and all genes in gene_sample. Expected cr hit rate by sampling each position in logP. 
# Now not significantly different to observed hit rate in g_unique_no_cen

# Rows of the CEN-free sample whose gene_type is "protein_coding" (cr).
(gene_sample_no_cen_cr <- nrow(
  gene_sample_no_cen[gene_sample_no_cen$gene_type == "protein_coding", ]
))
# [1] 153517

# Rows of the CEN-free sample with any other gene_type (nc).
(gene_sample_no_cen_nc <- nrow(
  gene_sample_no_cen[gene_sample_no_cen$gene_type != "protein_coding", ]
))
# [1] 137947

# All rows of the CEN-free sample.
(gene_sample_no_cen_cr_nc <- nrow(gene_sample_no_cen))
# [1] 291464

# cr fraction in the sample.
gene_sample_no_cen_cr / gene_sample_no_cen_cr_nc
# [1] 0.52671


# Goodness-of-fit test of the observed Ix cr/nc counts against the sample's
# cr/nc counts; rescale.p = TRUE rescales those counts to sum to 1.
chisq.test(
  c(Ix_no_cen_cr, Ix_no_cen_nc),
  p = c(gene_sample_no_cen_cr, gene_sample_no_cen_nc),
  rescale.p = TRUE
)

	# Chi-squared test for given probabilities

# data:  c(Ix_no_cen_cr, Ix_no_cen_nc)
# X-squared = 1.1387, df = 1, p-value = 0.2859





# -------- However, Ix are non-redundant. To be fair, we should make gene_sample non-redundant ------------
# This sampling approach is more realistic.
# Makes little difference in calculation. Now cr are 0.441917 of genes in gene_sample, and RH Ix genes at 0.4561404 are still non-significantly over-represented. This is different from g_unique, where cr growth genes are significantly overrepresented; cf g_loci_text_blurb_1.R


# Keep only the first row seen for each ensembl_gene_id.
gene_sample_no_cen_unique <- gene_sample_no_cen[
  !duplicated(gene_sample_no_cen$ensembl_gene_id),
]



# Rows of the de-duplicated sample whose gene_type is "protein_coding" (cr).
(gene_sample_no_cen_unique_cr <- nrow(
  gene_sample_no_cen_unique[
    gene_sample_no_cen_unique$gene_type == "protein_coding",
  ]
))
# [1] 18184


# Rows of the de-duplicated sample with any other gene_type (nc).
(gene_sample_no_cen_unique_nc <- nrow(
  gene_sample_no_cen_unique[
    gene_sample_no_cen_unique$gene_type != "protein_coding",
  ]
))
# [1] 22964


# All rows of the de-duplicated sample.
(gene_sample_no_cen_unique_cr_nc <- nrow(gene_sample_no_cen_unique))
# [1] 41148

# cr fraction in the de-duplicated sample.
gene_sample_no_cen_unique_cr / gene_sample_no_cen_unique_cr_nc
# [1] 0.441917 <<<<<<<<< use in paper

# nc fraction in the de-duplicated sample.
gene_sample_no_cen_unique_nc / gene_sample_no_cen_unique_cr_nc
# [1] 0.558083 <<<<<<<<< use in paper


# Goodness-of-fit test of the observed Ix cr/nc counts against the
# de-duplicated sample's cr/nc counts (rescaled to probabilities).
chisq.test(
  c(Ix_no_cen_cr, Ix_no_cen_nc),
  p = c(gene_sample_no_cen_unique_cr, gene_sample_no_cen_unique_nc),
  rescale.p = TRUE
)

	# Chi-squared test for given probabilities

# data:  c(Ix_no_cen_cr, Ix_no_cen_nc)
# X-squared = 0.046756, df = 1, p-value = 0.8288 <<<<<<<<<<<<<<<< use in paper




































