# --------- Examine g, d and Ix centromeres -------------------


library(XNomial)

# ----------- functions ------------------------

# Standard error of the mean of x, ignoring missing values:
# sqrt(sample variance / number of non-NA observations).
sem <- function(x) {
	sample_var <- var(x, na.rm = TRUE)
	n_obs <- sum(!is.na(x))
	sqrt(sample_var / n_obs)
}

# Compare two numeric samples. Prints, in order:
#   - the two-sample t-test of a vs b (t.test() defaults),
#   - the unrounded P value of that test,
#   - mean, SEM, SD and number of non-missing values for a, then for b.
# a, b: numeric vectors; missing values are ignored by the summary statistics.
# Called for its printed output.
compare <- function(a,b) {
	# Run the test once and reuse it for both the full report and the exact
	# P value. The call is kept as t.test(a,b) so that the "data:" line of the
	# printed report still reads "a and b".
	tt <- t.test(a,b)
	print(tt)
	
	print(paste0("exact P value = ", tt$p.value))
	
	# Print the four descriptive lines for one sample; `label` is the name
	# shown in the output ("a" or "b").
	describe <- function(x, label) {
		print(paste0("mean of ", label, " = ", mean(x, na.rm = TRUE)))
		print(paste0("sem of ", label, " = ", sem(x)))
		print(paste0("sd of ", label, " = ", sd(x, na.rm = TRUE)))
		print(paste0("number in ", label, " = ", sum(!is.na(x))))
	}
	
	describe(a, "a")
	describe(b, "b")
	
}



# ------------------------- data ----------------------------------------

# Load the three locus tables (tab-delimited text with a header row).
# check.names=FALSE keeps the column names exactly as written in the files;
# stringsAsFactors=FALSE keeps text columns as character vectors.
g_unique <- read.delim(
	"growth_loci_unique.txt",
	header = TRUE, sep = "\t",
	stringsAsFactors = FALSE, check.names = FALSE
)
d_unique <- read.delim(
	"paclitaxel_loci_unique.txt",
	header = TRUE, sep = "\t",
	stringsAsFactors = FALSE, check.names = FALSE
)
Ix <- read.delim(
	"Ix_loci.txt",
	header = TRUE, sep = "\t",
	stringsAsFactors = FALSE, check.names = FALSE
)


# -------------- examine centromeres -------------------------

# 859 unique g loci incl. CEN
dim(g_unique)
# [1] 859  50 <<<<<<< use in paper


# 4 g CEN
# which() drops loci whose geneSymbol is NA; indexing with the bare `==`
# result would return an all-NA row for each of them and inflate the count.
g_cen_rows <- g_unique[which(g_unique$geneSymbol=="CEN"),]
dim(g_cen_rows)
# [1]  4 50 <<<<<<< use in paper


# Freq of g CEN
# Derived from the tables instead of typed-in counts, so it stays in step with the data.
nrow(g_cen_rows)/nrow(g_unique)
# [1] 0.004656577


# All g CEN copy numbers go down (sig_coef < 0)
g_cen_rows[,c("Chromosome","ensembl_gene_id","geneSymbol","conc","log10P","sig_coef")]
    # Chromosome ensembl_gene_id geneSymbol conc   log10P   sig_coef
# 44        chr1            cen1        CEN  avg 16.16330 -0.0931521
# 538      chr11           cen11        CEN   75 22.46447 -0.2432813
# 660      chr15           cen15        CEN   75 19.52518 -0.2873017
# 827       chrX            cenX        CEN   75 19.22160 -0.2016729



# 4 d CEN
# which() drops loci whose geneSymbol is NA; indexing with the bare `==`
# result would return an all-NA row for each of them and inflate the count.
d_cen_rows <- d_unique[which(d_unique$geneSymbol=="CEN"),]
dim(d_cen_rows)
# [1] 4 50 <<<<<<< use in paper

# 38 unique d loci incl. CEN
dim(d_unique)
# [1] 38 50 <<<<<<< use in paper

# Freq of d CEN
# Derived from the tables instead of typed-in counts, so it stays in step with the data.
nrow(d_cen_rows)/nrow(d_unique)
# [1] 0.1052632


# All d CEN copy numbers go down (sig_coef < 0)
d_cen_rows[,c("Chromosome","ensembl_gene_id","geneSymbol","wk","log10P","sig_coef")]
   # Chromosome ensembl_gene_id geneSymbol wk   log10P    sig_coef
# 24      chr11           cen11        CEN  6 31.37027 -0.01864165
# 30      chr16           cen16        CEN  6 17.47496 -0.01352808
# 36      chr20           cen20        CEN  6 16.51963 -0.01516259
# 38       chrX            cenX        CEN  6 39.72011 -0.01776110

# 62 unique Ix loci incl. CEN
dim(Ix)
# [1] 62 50 <<<<<<< use in paper


# 5 Ix CEN
# which() drops loci whose geneSymbol is NA; indexing with the bare `==`
# result would return an all-NA row for each of them and inflate the count.
Ix_cen_rows <- Ix[which(Ix$geneSymbol=="CEN"),]
dim(Ix_cen_rows)
# [1]  5 50 <<<<<<< use in paper


# Freq of Ix CEN
# Derived from the tables instead of typed-in counts, so it stays in step with the data.
nrow(Ix_cen_rows)/nrow(Ix)
# [1] 0.08064516


# All Ix CEN copy numbers go down (sig_coef < 0)
Ix_cen_rows[,c("Chromosome","ensembl_gene_id","geneSymbol","wk_conc","log10P","sig_coef")]
   # Chromosome ensembl_gene_id geneSymbol wk_conc    log10P     sig_coef <<<<<<<<<<<<<< use in paper
# 36      chr11           cen11        CEN      Ix 19.636416 -0.003718164
# 45      chr15           cen15        CEN      Ix 16.578741 -0.004334794
# 50      chr16           cen16        CEN      Ix  9.665248 -0.003083569
# 60      chr20           cen20        CEN      Ix 13.012052 -0.003178911
# 62       chrX            cenX        CEN      Ix 22.030698 -0.003550042


# Centromere IDs (ensembl_gene_id of the CEN rows) in each table, extracted once.
# which() skips loci whose geneSymbol is NA, so no NA id can reach intersect().
g_cen_ids <- g_unique[which(g_unique$geneSymbol=="CEN"),"ensembl_gene_id"]
d_cen_ids <- d_unique[which(d_unique$geneSymbol=="CEN"),"ensembl_gene_id"]
Ix_cen_ids <- Ix[which(Ix$geneSymbol=="CEN"),"ensembl_gene_id"]

# g d CEN overlap
intersect(g_cen_ids,d_cen_ids)
# [1] "cen11" "cenX"       <<<<<<< use in paper

# Ix g CEN overlap
intersect(Ix_cen_ids,g_cen_ids)
# [1] "cen11" "cen15" "cenX"   <<<<<<< use in paper

# Ix d CEN overlap
intersect(Ix_cen_ids,d_cen_ids)
# [1] "cen11" "cen16" "cen20" "cenX" <<<<<<< use in paper







# is cen overlap between Ix and g significant? Yes.

# One row per centromere name (cen1..cen22, cenX); each indicator records
# whether that centromere is among the CEN rows of the Ix / g table.
cen_names <- paste0("cen",c(1:22,"X"))
Ix_cen_ids <- Ix[which(Ix$geneSymbol=="CEN"),"ensembl_gene_id"]
g_cen_ids <- g_unique[which(g_unique$geneSymbol=="CEN"),"ensembl_gene_id"]

# The indicators are factors with both levels (0 and 1) declared, so table()
# always yields a 2 x 2 table. With plain 0/1 numbers, a table with no (or all)
# centromeres would lose a dimension and fisher.test() would stop, because it
# requires at least 2 rows and columns.
cen_overlap <- data.frame(
	Chromosome = cen_names,
	Ix_cen = factor(as.integer(cen_names %in% Ix_cen_ids), levels = c(0,1)),
	g_cen = factor(as.integer(cen_names %in% g_cen_ids), levels = c(0,1))
)


# Call text kept as before so the "data:" line of the printed result is unchanged.
fisher.test(table(cen_overlap[,c("Ix_cen","g_cen")]))

	# Fisher's Exact Test for Count Data <<<<<<<<<<<<<<<<<< use in paper

# data:  table(cen_overlap[, c("Ix_cen", "g_cen")])
# p-value = 0.02089
# alternative hypothesis: true odds ratio is not equal to 1
# 95 percent confidence interval:
    # 1.087766 1394.226903
# sample estimates:
# odds ratio 
  # 19.82415
       
       
       
# is cen overlap between Ix and d significant? Yes.

# One row per centromere name (cen1..cen22, cenX); each indicator records
# whether that centromere is among the CEN rows of the Ix / d table.
cen_names <- paste0("cen",c(1:22,"X"))
Ix_cen_ids <- Ix[which(Ix$geneSymbol=="CEN"),"ensembl_gene_id"]
d_cen_ids <- d_unique[which(d_unique$geneSymbol=="CEN"),"ensembl_gene_id"]

# The indicators are factors with both levels (0 and 1) declared, so table()
# always yields a 2 x 2 table. With plain 0/1 numbers, a table with no (or all)
# centromeres would lose a dimension and fisher.test() would stop, because it
# requires at least 2 rows and columns.
cen_overlap <- data.frame(
	Chromosome = cen_names,
	Ix_cen = factor(as.integer(cen_names %in% Ix_cen_ids), levels = c(0,1)),
	d_cen = factor(as.integer(cen_names %in% d_cen_ids), levels = c(0,1))
)


# Call text kept as before so the "data:" line of the printed result is unchanged.
fisher.test(table(cen_overlap[,c("Ix_cen","d_cen")]))


	# Fisher's Exact Test for Count Data <<<<<<<<<<<<<<<<<<<<<<<< use in paper

# data:  table(cen_overlap[, c("Ix_cen", "d_cen")])
# p-value = 0.0005647
# alternative hypothesis: true odds ratio is not equal to 1
# 95 percent confidence interval:
 # 4.194984      Inf
# sample estimates:
# odds ratio 
       # Inf






# Are p vals for Ix and g cens different? No.

# Row masks for the CEN loci of each table; the log10P columns of those rows
# are the two samples handed to compare().
Ix_is_cen <- Ix$geneSymbol=="CEN"
g_is_cen <- g_unique$geneSymbol=="CEN"

compare(Ix[Ix_is_cen,"log10P"],g_unique[g_is_cen,"log10P"])

	# Welch Two Sample t-test <<<<<<<<<<<<<<<<<<< use in paper

# data:  a and b
# t = -1.2297, df = 6.2041, p-value = 0.2634
# alternative hypothesis: true difference in means is not equal to 0
# 95 percent confidence interval:
 # -9.395084  3.077073
# sample estimates:
# mean of x mean of y 
 # 16.18463  19.34364 

# [1] "exact P value = 0.26338912209298"
# [1] "mean of a = 16.1846309693915"
# [1] "sem of a = 2.22277146820476"
# [1] "sd of a = 4.97026810135285"
# [1] "number in a = 5"
# [1] "mean of b = 19.3436361535961"
# [1] "sem of b = 1.28782917419748"
# [1] "sd of b = 2.57565834839495"
# [1] "number in b = 4"





# Are p vals for Ix and d cens different? No.

# Row masks for the CEN loci of each table; the log10P columns of those rows
# are the two samples handed to compare().
Ix_is_cen <- Ix$geneSymbol=="CEN"
d_is_cen <- d_unique$geneSymbol=="CEN"

compare(Ix[Ix_is_cen,"log10P"],d_unique[d_is_cen,"log10P"])

	# Welch Two Sample t-test <<<<<<<<<<<<<<<< use in paper

# data:  a and b
# t = -1.6684, df = 3.9389, p-value = 0.1717
# alternative hypothesis: true difference in means is not equal to 0
# 95 percent confidence interval:
 # -26.975835   6.802607
# sample estimates:
# mean of x mean of y 
 # 16.18463  26.27124 

# [1] "exact P value = 0.171677351103179"
# [1] "mean of a = 16.1846309693915"
# [1] "sem of a = 2.22277146820476"
# [1] "sd of a = 4.97026810135285"
# [1] "number in a = 5"
# [1] "mean of b = 26.2712445566508"
# [1] "sem of b = 5.62242973398471"
# [1] "sd of b = 11.2448594679694"
# [1] "number in b = 4"

























