# --------- Examine g and d centromeres -------------------


library(XNomial)

# ----------- functions ------------------------

# Standard error of the mean of x, ignoring NA values:
# sqrt(sample variance / number of non-NA observations).
sem <- function(x) {
	sample_var <- var(x, na.rm = TRUE)
	n_obs <- sum(!is.na(x))
	sqrt(sample_var / n_obs)
}

# Compare two numeric samples with a two-sample t-test (t.test defaults) and
# print per-sample summary statistics.
#
# Args:
#   a, b: numeric vectors; NA values are dropped by the summaries below.
#
# Prints, in order: the t.test(a, b) result, its unrounded p-value, then for
# a and for b the mean, SEM (via sem()), SD and number of non-NA values.
# Called for its printed output; the value is that of the final print() call.
compare <- function(a,b) {
	# Run the test once and reuse the result. The arguments stay named a and b
	# so the printed "data:" label remains "a and b".
	tt <- t.test(a,b)
	print(tt)

	print(paste0("exact P value = ", tt$p.value))

	# Print mean / sem / sd / n for one sample under the given label.
	print_summary <- function(values, label) {
		print(paste0("mean of ", label, " = ", mean(values, na.rm = TRUE)))
		print(paste0("sem of ", label, " = ", sem(values)))
		print(paste0("sd of ", label, " = ", sd(values, na.rm = TRUE)))
		print(paste0("number in ", label, " = ", sum(!is.na(values))))
	}

	print_summary(a, "a")
	print_summary(b, "b")
}



# ------------------------- data ----------------------------------------

# Unique growth (g) and paclitaxel (d) loci tables. read.delim() already
# defaults to header = TRUE and sep = "\t"; check.names = FALSE keeps the
# column names exactly as they appear in the files.
g_unique <- read.delim(
	"growth_loci_unique.txt",
	stringsAsFactors = FALSE,
	check.names = FALSE
)
d_unique <- read.delim(
	"paclitaxel_loci_unique.txt",
	stringsAsFactors = FALSE,
	check.names = FALSE
)


# -------------- examine centromeres -------------------------

# CEN (centromere) rows of each table, subset once and reused below
g_cen <- g_unique[g_unique$geneSymbol == "CEN", ]
d_cen <- d_unique[d_unique$geneSymbol == "CEN", ]

# 859 unique g loci incl. CEN
dim(g_unique)
# [1] 859  50 <<<<<<< use in paper

# 38 unique d loci incl. CEN
dim(d_unique)
# [1] 38 50 <<<<<<< use in paper



# 4 g CEN
dim(g_cen)
# [1]  4 50 <<<<<<< use in paper


# Freq of g CEN (CEN rows / all g rows)
nrow(g_cen) / nrow(g_unique)
# [1] 0.004656577 <<<<<<< use in paper


# All g CEN copy numbers go down (sig_coef < 0)
g_cen[, c("Chromosome", "ensembl_gene_id", "geneSymbol", "conc", "log10P", "sig_coef")]
    # Chromosome ensembl_gene_id geneSymbol conc   log10P   sig_coef
# 44        chr1            cen1        CEN  avg 16.16330 -0.0931521
# 538      chr11           cen11        CEN   75 22.46447 -0.2432813
# 660      chr15           cen15        CEN   75 19.52518 -0.2873017
# 827       chrX            cenX        CEN   75 19.22160 -0.2016729



# 4 d CEN
dim(d_cen)
# [1] 4 50 <<<<<<< use in paper

# Freq of d CEN (CEN rows / all d rows)
nrow(d_cen) / nrow(d_unique)
# [1] 0.1052632 <<<<<<<<<< use in paper


# Is freq of d CEN significantly higher than g CEN? Yes.

# c(number of CEN loci, number of non-CEN loci) for one loci table.
# %in% never yields NA, so rows with a missing geneSymbol count as non-CEN.
count_cen <- function(loci) {
	n_cen <- sum(loci$geneSymbol %in% "CEN")
	c(n_cen, nrow(loci) - n_cen)
}

# Counts are computed once and shared by every test below:
# d counts are the observations, g counts supply the expected ratio.
d_cen_counts <- count_cen(d_unique) # 4 34
g_cen_counts <- count_cen(g_unique) # 4 855

# Goodness-of-fit test of the d counts against the g proportions
# (rescale.p = TRUE turns the g counts into probabilities).
cen_chisq <- chisq.test(x = d_cen_counts, p = g_cen_counts, rescale.p = TRUE)
# Warning message (raised when the test is run):
  # Chi-squared approximation may be incorrect

cen_chisq

	# Chi-squared test for given probabilities

# data:  d_cen_counts
# X-squared = 82.984, df = 1, p-value < 2.2e-16

cen_chisq$p.value
# [1] 8.270242e-20




# <<<<<<< use in paper >>>>>>>>>


# Use xmulti to account for chisq approx being incorrect
# xmulti is much more conservative

# Run the exact test once; the call itself prints the detail = 3 summary,
# and the stored result is inspected below without re-running it.
cen_xmulti <- xmulti(obs = d_cen_counts, expr = g_cen_counts, detail = 3)

# P value  (LLR)  =  3.058e-05
# P value (Prob)  =  3.058e-05
# P value (Chisq) =  3.058e-05

# Observed:  4 34 
# Expected ratio:  4 855 
# Total number of tables:  39 



str(cen_xmulti)

# List of 11
 # $ obs               : int [1:2] 4 34
 # $ expr              : num [1:2] 4 855
 # $ statType          : chr "LLR"
 # $ pLLR              : num 3.06e-05
 # $ pProb             : num 3.06e-05
 # $ pChi              : num 3.06e-05
 # $ observedLLR       : num -8.85
 # $ observedProb      : num 2.96e-05
 # $ observedChi       : num 83
 # $ asymptotic.p.value: num 8.27e-20
 # $ cases.examined    : num 39


cen_xmulti$pLLR
# [1] 3.058054e-05


cen_xmulti$observedLLR
# [1] -8.849753


# ^^^^^^^^^^^ use in paper ^^^^^^^^^^^^^



# All d CEN copy numbers go down (sig_coef < 0)
d_cen_cols <- c("Chromosome", "ensembl_gene_id", "geneSymbol", "wk", "log10P", "sig_coef")
d_unique[d_unique$geneSymbol == "CEN", d_cen_cols] # <<<<<<<<<< use in paper
   # Chromosome ensembl_gene_id geneSymbol wk   log10P    sig_coef
# 24      chr11           cen11        CEN  6 31.37027 -0.01864165
# 30      chr16           cen16        CEN  6 17.47496 -0.01352808
# 36      chr20           cen20        CEN  6 16.51963 -0.01516259
# 38       chrX            cenX        CEN  6 39.72011 -0.01776110


# g d CEN overlap: centromere ids present in both tables
g_cen_ids <- g_unique[g_unique$geneSymbol == "CEN", "ensembl_gene_id"]
d_cen_ids <- d_unique[d_unique$geneSymbol == "CEN", "ensembl_gene_id"]
intersect(g_cen_ids, d_cen_ids)
# [1] "cen11" "cenX"     <<<<<<< use in paper




# <<<<<<< use in paper >>>>>>>>>


# Is the CEN overlap between g and d significant? No. Probably because there
# are too few CEN observations and hence low power.

# One row per centromere label (cen1..cen22, cenX) with 0/1 indicators for
# whether that centromere appears among the g CEN and the d CEN loci.
cen_labels <- paste0("cen", c(1:22, "X"))
g_cen_ids <- g_unique[g_unique$geneSymbol == "CEN", "ensembl_gene_id"]
d_cen_ids <- d_unique[d_unique$geneSymbol == "CEN", "ensembl_gene_id"]

cen_overlap <- data.frame(
	Chromosome = cen_labels,
	g_cen = as.numeric(cen_labels %in% g_cen_ids),
	d_cen = as.numeric(cen_labels %in% d_cen_ids)
)


# 2 x 2 table of the two indicators, tested for association
fisher.test(table(cen_overlap[, c("g_cen", "d_cen")]))

	# Fisher's Exact Test for Count Data

# data:  table(cen_overlap[, c("g_cen", "d_cen")])
# p-value = 0.1246
# alternative hypothesis: true odds ratio is not equal to 1
# 95 percent confidence interval:
   # 0.3579547 167.8886949
# sample estimates:
# odds ratio 
  # 7.369274   

# ^^^^^^^^^^^ use in paper ^^^^^^^^^^^^^





# Are p vals for g and d cens different? No.

# log10P values of the CEN loci in each table, and of the non-CEN d loci
g_cen_log10p <- g_unique[g_unique$geneSymbol == "CEN", "log10P"]
d_cen_log10p <- d_unique[d_unique$geneSymbol == "CEN", "log10P"]
d_other_log10p <- d_unique[d_unique$geneSymbol != "CEN", "log10P"]

compare(g_cen_log10p, d_cen_log10p)

	# Welch Two Sample t-test <<<<<<<<<<<<<<<<<<<<< use in paper

# data:  a and b
# t = -1.201, df = 3.3139, p-value = 0.3085
# alternative hypothesis: true difference in means is not equal to 0
# 95 percent confidence interval:
 # -24.33850  10.48328
# sample estimates:
# mean of x mean of y 
 # 19.34364  26.27124 

# [1] "exact P value = 0.308490574164713"
# [1] "mean of a = 19.3436361535961"
# [1] "sem of a = 1.28782917419748"
# [1] "sd of a = 2.57565834839495"
# [1] "number in a = 4"
# [1] "mean of b = 26.2712445566508"
# [1] "sem of b = 5.62242973398471"
# [1] "sd of b = 11.2448594679694"
# [1] "number in b = 4"




# <<<<<<< use in paper >>>>>>>>>

# Are p vals for d_unique cen and d_unique non-cen different? No.

compare(d_cen_log10p, d_other_log10p)

	# Welch Two Sample t-test

# data:  a and b
# t = 1.6816, df = 3.2966, p-value = 0.183
# alternative hypothesis: true difference in means is not equal to 0
# 95 percent confidence interval:
 # -7.742236 27.103174
# sample estimates:
# mean of x mean of y 
 # 26.27124  16.59078 

# [1] "exact P value = 0.182979600530888"
# [1] "mean of a = 26.2712445566508"
# [1] "sem of a = 5.62242973398471"
# [1] "sd of a = 11.2448594679694"
# [1] "number in a = 4"
# [1] "mean of b = 16.590775234376"
# [1] "sem of b = 1.23666088167073"
# [1] "sd of b = 7.21091011125902"
# [1] "number in b = 34"

# ^^^^^^^^^^^ use in paper ^^^^^^^^^^^^^



# ---------- g and d copy number changes -----------------



# Copy number goes down (sig_coef < 0) for all but 7 g_unique loci

sum(g_unique$sig_coef < 0)
# [1] 852


# Loci that do not go down; >= 0 matches the d_unique count below.
# Same result as a strict > 0 here, since 852 + 7 = 859 leaves no zeros.
sum(g_unique$sig_coef >= 0)
# [1] 7


# Total number of g_unique loci. length() of the coefficient vector itself:
# length(x < 0) would only measure the logical vector, not count negatives.
length(g_unique$sig_coef)
# [1] 859




# Copy number goes down (sig_coef < 0) for all but 7 d_unique loci

sum(d_unique$sig_coef < 0)
# [1] 31

sum(d_unique$sig_coef >= 0)
# [1] 7 <<<<<<<<<<<<< use in paper

# Total number of d_unique loci
length(d_unique$sig_coef)
# [1] 38 <<<<<<<<<<<<< use in paper


# The 7 d_unique loci whose copy number does not go down
d_unique[d_unique$sig_coef >= 0, c("Chromosome", "pos", "wk", "log10P", "sig_coef", "dist", "geneSymbol")]
   # Chromosome       pos  wk    log10P    sig_coef   dist geneSymbol
# 6        chr3  27020000   6 16.466247 0.019296561  90085      NEK10
# 18       chr9 133350000   3  6.124421 0.012001199      0      RPL7A
# 19      chr10  21790000 avg  7.914584 0.015221526      0     DNAJC1
# 20      chr10  94390000 avg  7.554392 0.016172217  12504    TBC1D12
# 21      chr11  10930000   6 17.882010 0.029605551 -21029  ZBED5-AS1
# 22      chr11  11450000   4 13.182948 0.015906147      0    GALNT18
# 23      chr11  13060000   3  6.566386 0.009412885      0 AC013762.1



























