# test overlap between GWAS significant g, d and Ix loci, including CEN.

library(psych)
library(vcd)
library(limma)


# ---- Load locus tables ----
# g / d are the non-unique growth / paclitaxel locus tables; g_unique /
# d_unique are their "unique" counterparts; Ix holds the Ix loci.
# All files are tab-delimited with a header row; column names are kept
# verbatim (check.names = FALSE).
g <- read.delim("growth_loci.txt", sep = "\t", stringsAsFactors = FALSE, header = TRUE, check.names = FALSE)
d <- read.delim("paclitaxel_loci.txt", sep = "\t", stringsAsFactors = FALSE, header = TRUE, check.names = FALSE)
g_unique <- read.delim("growth_loci_unique.txt", header = TRUE, sep = "\t", stringsAsFactors = FALSE, check.names = FALSE)
d_unique <- read.delim("paclitaxel_loci_unique.txt", header = TRUE, sep = "\t", stringsAsFactors = FALSE, check.names = FALSE)
Ix <- read.delim("Ix_loci.txt", header = TRUE, sep = "\t", stringsAsFactors = FALSE, check.names = FALSE)

# Replace every value of `x` that appears in names(map) by the mapped label.
# Values without a mapping (including NA) are left untouched, and a label
# that matches no row is simply skipped instead of raising
# "replacement has 1 row, data has 0".
recode_labels <- function(x, map) {
  hit <- match(as.character(x), names(map))
  found <- !is.na(hit)
  if (any(found)) {
    x[found] <- unname(map[hit[found]])
  }
  x
}

g_bind <- g
d_bind <- d
Ix_bind <- Ix

# Prefix the condition labels so growth and drug groups cannot collide once
# the tables are stacked.
g_bind$conc <- recode_labels(
  g_bind$conc,
  c("0" = "g_0nM", "8" = "g_8nM", "25" = "g_25nM", "75" = "g_75nM", "avg" = "g_avg")
)

d_bind$wk <- recode_labels(
  d_bind$wk,
  c("2" = "d_w2", "3" = "d_w3", "4" = "d_w4", "6" = "d_w6", "avg" = "d_avg")
)

# Give both tables a common grouping column. The column is renamed by name
# rather than by position, so a change in column order cannot silently
# rename the wrong column.
names(g_bind)[names(g_bind) == "conc"] <- "wk_conc"
names(d_bind)[names(d_bind) == "wk"] <- "wk_conc"

# rbind() matches data-frame columns by name, so Ix is assumed to already
# carry a wk_conc column (with value "Ix") -- TODO confirm against Ix_loci.txt.
g_and_d <- rbind(g_bind, d_bind, Ix_bind)


#no d loci w1
# Group keys as stored in g_and_d$wk_conc, and the matching display labels
# used for the row / column names of the overlap tables (same order).
k <- as.character(c("g_0nM","g_8nM","g_25nM","g_75nM","g_avg", "d_w2", "d_w3","d_w4","d_w6","d_avg", "Ix"))
overlap_labels <- c("g 0nM", "g 8nM", "g 25nM", "g 75nM", "g avg", "d w2", "d w3", "d w4", "d w6", "d avg", "Ix")
stopifnot(length(k) == length(overlap_labels))

# A locus is identified by chromosome + gene id. The key is built once for
# the whole table and then split per group, instead of being rebuilt for
# every cell of the matrix.
locus_key <- paste(g_and_d$Chromosome, g_and_d$ensembl_gene_id, sep = "\r")
keys_by_group <- lapply(k, function(grp) locus_key[g_and_d$wk_conc %in% grp])

# overlap_counts[i, j] = number of rows of group i whose locus also occurs in
# group j. The diagonal is therefore the number of rows in each group.
n_groups <- length(k)
overlap_counts <- matrix(0, nrow = n_groups, ncol = n_groups,
                         dimnames = list(overlap_labels, overlap_labels))
for (i in seq_len(n_groups)) {
  for (j in seq_len(n_groups)) {
    overlap_counts[i, j] <- sum(keys_by_group[[i]] %in% keys_by_group[[j]])
  }
}

g_and_d_overlap_frac <- data.frame(overlap_counts, check.names = FALSE)

# Row-normalised sharing: every row divided by its own diagonal entry.
# A length-nrow vector recycles down the columns of a matrix, so element
# [i, j] is divided by the i-th diagonal value.
g_and_d_overlap_dec <- data.frame(overlap_counts / diag(overlap_counts), check.names = FALSE)


# Note: g_and_d_overlap_frac is built from the non-unique g and d tables.
# Entry [i, j] is the number of rows of group i whose (Chromosome, ensembl_gene_id)
# pair also occurs in group j; the diagonal is the number of rows in each group.

# <<<<<<<<<<<<<<<<<< use in paper >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
g_and_d_overlap_frac
       # g 0nM g 8nM g 25nM g 75nM g avg d w2 d w3 d w4 d w6 d avg Ix
# g 0nM    409   305    162      3   142    1    4    2    2     4  2
# g 8nM    305   460    238      5   207    0    0    0    0     0  0
# g 25nM   162   238    466     16   414    0    0    0    0     0  2
# g 75nM     3     5     16     55    16    0    4    4    7     4 15
# g avg    142   207    414     16   446    0    0    0    0     0  2
# d w2       1     0      0      0     0    2    2    2    2     2  0
# d w3       4     0      0      4     0    2   26   17   15    23  7
# d w4       2     0      0      4     0    2   17   19   15    18  8
# d w6       2     0      0      7     0    2   15   15   25    16 14
# d avg      4     0      0      4     0    2   23   18   16    25  8
# Ix         2     0      2     15     2    0    7    8   14     8 62
# ^^^^^^^^^^^^^^^^^^^^ use in paper ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^




# Row-normalised sharing: each row of the count matrix divided by that row's diagonal entry.
g_and_d_overlap_dec
            # g 0nM      g 8nM     g 25nM      g 75nM      g avg        d w2        d w3        d w4        d w6       d avg          Ix
# g 0nM  1.00000000 0.74572127 0.39608802 0.007334963 0.34718826 0.002444988 0.009779951 0.004889976 0.004889976 0.009779951 0.004889976
# g 8nM  0.66304348 1.00000000 0.51739130 0.010869565 0.45000000 0.000000000 0.000000000 0.000000000 0.000000000 0.000000000 0.000000000
# g 25nM 0.34763948 0.51072961 1.00000000 0.034334764 0.88841202 0.000000000 0.000000000 0.000000000 0.000000000 0.000000000 0.004291845
# g 75nM 0.05454545 0.09090909 0.29090909 1.000000000 0.29090909 0.000000000 0.072727273 0.072727273 0.127272727 0.072727273 0.272727273
# g avg  0.31838565 0.46412556 0.92825112 0.035874439 1.00000000 0.000000000 0.000000000 0.000000000 0.000000000 0.000000000 0.004484305
# d w2   0.50000000 0.00000000 0.00000000 0.000000000 0.00000000 1.000000000 1.000000000 1.000000000 1.000000000 1.000000000 0.000000000
# d w3   0.15384615 0.00000000 0.00000000 0.153846154 0.00000000 0.076923077 1.000000000 0.653846154 0.576923077 0.884615385 0.269230769
# d w4   0.10526316 0.00000000 0.00000000 0.210526316 0.00000000 0.105263158 0.894736842 1.000000000 0.789473684 0.947368421 0.421052632
# d w6   0.08000000 0.00000000 0.00000000 0.280000000 0.00000000 0.080000000 0.600000000 0.600000000 1.000000000 0.640000000 0.560000000
# d avg  0.16000000 0.00000000 0.00000000 0.160000000 0.00000000 0.080000000 0.920000000 0.720000000 0.640000000 1.000000000 0.320000000
# Ix     0.03225806 0.00000000 0.03225806 0.241935484 0.03225806 0.000000000 0.112903226 0.129032258 0.225806452 0.129032258 1.000000000


gene_num <- 60603 # num cr and nc genes, GENCODE v31, https://www.gencodegenes.org/human/stats.html. See also gencode_gtf_ensembl_ucsc_v31.txt, which has # rows == 60603.

# add 23 CEN, because including CENs in overlaps
gene_num <- gene_num + 23

# Fisher exact test for every pair of groups in the overlap count matrix.
# Results are collected in numeric matrices (initialised with NA_real_, not
# the string "NA") so p-values and odds ratios keep full double precision
# instead of being coerced to character on assignment. Downstream
# as.numeric() calls on these tables keep working unchanged.
fisher_dimnames <- list(rownames(g_and_d_overlap_frac), colnames(g_and_d_overlap_frac))
fisher_pval <- matrix(NA_real_,
                      nrow = nrow(g_and_d_overlap_frac),
                      ncol = ncol(g_and_d_overlap_frac),
                      dimnames = fisher_dimnames)
fisher_odds_ratio <- fisher_pval

# No log.p option for fisher.test

for (i in seq_len(nrow(g_and_d_overlap_frac))) {
  for (j in seq_len(ncol(g_and_d_overlap_frac))) {
    # 2 x 2 contingency table over all genes: in both groups, only in
    # group j, only in group i, in neither.
    n_both <- g_and_d_overlap_frac[i, j]
    n_j_only <- g_and_d_overlap_frac[j, j] - n_both
    n_i_only <- g_and_d_overlap_frac[i, i] - n_both
    n_neither <- gene_num - n_both - n_j_only - n_i_only

    # Run the test once and read both statistics from the same result.
    fit <- fisher.test(matrix(c(n_both, n_j_only, n_i_only, n_neither), 2, 2))
    fisher_pval[i, j] <- fit$p.value
    fisher_odds_ratio[i, j] <- fit$estimate[[1]]
  }
}

g_and_d_fish_pval <- data.frame(fisher_pval, check.names = FALSE)
g_and_d_fish_odds_ratio <- data.frame(fisher_odds_ratio, check.names = FALSE)
	


# Fisher exact test p-values for every pair of groups (same row / column layout as the count matrix).
g_and_d_fish_pval
                       # g 0nM                g 8nM                g 25nM                g 75nM                 g avg                 d w2                 d w3
# g 0nM                      0                    0 1.09200187261342e-239   0.00616800549548198 7.66246815977211e-202    0.013447159175353  2.7130946102272e-05
# g 8nM                      0                    0                     0  6.26384115165444e-05                     0                    1                    1
# g 25nM 1.09200187261342e-239                    0                     0  2.59490218411265e-21                     0                    1                    1
# g 75nM   0.00616800549548197 6.26384115165443e-05  2.59490218411268e-21 1.16971852099428e-190   1.2866648639961e-21                    1 8.92591193279054e-09
# g avg  7.66246815977254e-202                    0                     0  1.28666486399609e-21                     0                    1                    1
# d w2       0.013447159175353                    1                     1                     1                     1 5.44150871270568e-10 1.76849033162935e-07
# d w3    2.71309461022721e-05                    1                     1  8.92591193279054e-09                     1 1.76849033162935e-07 1.81474536990845e-98
# d w4     0.00719610452211717                    1                     1  2.32509854237576e-09                     1  9.3049798987267e-08 9.42981229392097e-59
# d w6        0.01229307552561                    1                     1  1.61348822235117e-16                     1  1.6324526138117e-07  6.0114025671735e-47
# d avg   2.30797548257963e-05                    1                     1  7.55777973654602e-09                     1  1.6324526138117e-07 2.01932101700747e-82
# Ix        0.0658887431859247                    1    0.0824992865323019  2.56446538729253e-33    0.0765328131252135                    1 5.33696013760736e-16
                       # d w4                 d w6                d avg                    Ix
# g 0nM   0.00719610452211717     0.01229307552561 2.30797548257963e-05    0.0658887431859247
# g 8nM                     1                    1                    1                     1
# g 25nM                    1                    1                    1    0.0824992865323019
# g 75nM 2.32509854237576e-09 1.61348822235117e-16 7.55777973654602e-09  2.56446538729256e-33
# g avg                     1                    1                    1    0.0765328131252136
# d w2    9.3049798987267e-08  1.6324526138117e-07  1.6324526138117e-07                     1
# d w3   9.42981229392124e-59 6.01140256717342e-47 2.01932101700752e-82  5.33696013760733e-16
# d w4   1.64361583398136e-74 3.01902138867694e-50 4.78803007330236e-64  5.59940108027233e-20
# d w6    3.0190213886769e-50 4.22982246776233e-95 2.62371357238681e-51  1.23869333118355e-36
# d avg   4.7880300733025e-64 2.62371357238681e-51 4.22982246776233e-95  7.97472143331389e-19
# Ix     5.59940108027237e-20 1.23869333118355e-36 7.97472143331395e-19 9.69604000439668e-212





# Fisher exact test odds-ratio estimates for every pair of groups (same layout as the p-value table).
g_and_d_fish_odds_ratio
                  # g 0nM            g 8nM           g 25nM           g 75nM            g avg             d w2             d w3             d w4             d w6
# g 0nM               Inf 1112.74079505459 129.053930828731 8.54623353164983  104.58741584785 146.793794765525 26.9926680643548 17.3915529187817 12.8566517104643
# g 8nM  1112.74079505459              Inf 280.892811959627 13.2065095410543 204.300405292217                0                0                0                0
# g 25nM 129.053930828732 280.892811959626              Inf 54.8093992123365 8965.34407375267                0                0                0                0
# g 75nM 8.54623353164983 13.2065095410543 54.8093992123366              Inf 57.4124886657971                0 214.082394443306 315.537403235159 489.995020386973
# g avg   104.58741584785 204.300405292217 8965.34407375269 57.4124886657971              Inf                0                0                0                0
# d w2   146.793794765525                0                0                0                0              Inf              Inf              Inf              Inf
# d w3   26.9926680643547                0                0 214.082394443305                0              Inf              Inf 4503599627370496 8107.19869209992
# d w4   17.3915529187817                0                0 315.537403235159                0              Inf 4503599627370496              Inf 11603.3065711261
# d w6   12.8566517104643                0                0 489.995020386972                0              Inf 8107.19869209992 11603.3065711261              Inf
# d avg   28.311543890846                0                0 225.557897016407                0              Inf 4503599627370496 4503599627370496 13888.1720584246
# Ix     4.92597205239277                0 4.31733050386964 481.843095534527 4.51376543930841                0 399.523225208023 818.443169772864 1521.89398299836
                  # d avg               Ix
# g 0nM   28.311543890846 4.92597205239277
# g 8nM                 0                0
# g 25nM                0 4.31733050386964
# g 75nM 225.557897016407 481.843095534528
# g avg                 0  4.5137654393084
# d w2                Inf                0
# d w3   4503599627370496 399.523225208024
# d w4   4503599627370496 818.443169772865
# d w6   13888.1720584246 1521.89398299837
# d avg               Inf 526.755846613837
# Ix     526.755846613837              Inf



# ----------------------------------------- Compare g vs g, d vs d, and g vs d sharing ------------------------ 


# Sharing ratio. Could also use differences in pvals (or logP vals if fisher.test allowed), and odds ratios. Decided to use odds ratios.
# Also, some p vals are 0, so cannot compare -log10P. Some odds ratios are Inf, so cannot compare ensemble odds ratios either (except for specific comparisons without 0 and/or Inf). Could use sharing in those ensemble situations.
# Gives very similar results to g_d_comb_logP_1.R

# Standard error of the mean of `x`; NA values are excluded from both the
# variance and the observation count.
sem <- function(x) {
  n_obs <- sum(!is.na(x))
  sample_var <- var(x, na.rm = TRUE)
  sqrt(sample_var / n_obs)
}

# Print a Welch two-sample t-test of `a` against `b`, followed by the exact
# p-value and, for each sample, its mean, SEM, SD and number of non-NA values.
#
# a, b: numeric vectors (NA values are ignored in the summaries).
#
# The test is computed once and reused for both the printed report and the
# exact p-value. Called for its printed output.
compare <- function(a,b) {
	welch <- t.test(a,b)
	print(welch)

	print(paste0("exact P value = ", welch$p.value))

	# One summary block per sample; `label` is the name shown in the output.
	describe_sample <- function(x, label) {
		print(paste0("mean of ", label, " = ", mean(x, na.rm = TRUE)))
		print(paste0("sem of ", label, " = ", sem(x)))
		print(paste0("sd of ", label, " = ", sd(x, na.rm = TRUE)))
		print(paste0("number in ", label, " = ", sum(!is.na(x))))
	}

	describe_sample(a, "a")
	describe_sample(b, "b")
}


# compare g-g and d-d sharing. Significantly higher for d-d, largely because of week 2 overlapping with nearly all drug loci. P val becomes insignificant if wk2 omitted (not shown)


# Upper-triangle sharing values within the growth groups vs within the drug groups.
gg_sharing <- g_and_d_overlap_dec[1:5, 1:5]
dd_sharing <- g_and_d_overlap_dec[6:10, 6:10]
compare(gg_sharing[upper.tri(gg_sharing)], dd_sharing[upper.tri(dd_sharing)])

	# Welch Two Sample t-test

# data:  a and b
# t = -4.3851, df = 14.217, p-value = 0.000601
# alternative hypothesis: true difference in means is not equal to 0
# 95 percent confidence interval:
 # -0.7150264 -0.2457691
# sample estimates:
# mean of x mean of y 
# 0.3688249 0.8492227 

# [1] "exact P value = 0.000600999761062176"
# [1] "mean of a = 0.36882492599276"
# [1] "sem of a = 0.0953738819709266"
# [1] "sd of a = 0.301598696320197"
# [1] "number in a = 10"
# [1] "mean of b = 0.849222672064777"
# [1] "sem of b = 0.0539015114263782"
# [1] "sd of b = 0.170451545432946"
# [1] "number in b = 10"



# Compare g-g and g-d sharing, significant difference, g-g higher than g-d:

# Upper-triangle growth-growth sharing vs every growth (row) x drug (column) sharing value.
gg_sharing_block <- g_and_d_overlap_dec[1:5, 1:5]
gd_sharing_block <- g_and_d_overlap_dec[1:5, 6:10]
compare(gg_sharing_block[upper.tri(gg_sharing_block)], unlist(gd_sharing_block, use.names = FALSE))

	# Welch Two Sample t-test

# data:  a and b
# t = 3.6999, df = 9.0879, p-value = 0.004839
# alternative hypothesis: true difference in means is not equal to 0
# 95 percent confidence interval:
 # 0.1377777 0.5696930
# sample estimates:
 # mean of x  mean of y 
# 0.36882493 0.01508958 

# [1] "exact P value = 0.00483923793262738"
# [1] "mean of a = 0.36882492599276"
# [1] "sem of a = 0.0953738819709266"
# [1] "sd of a = 0.301598696320197"
# [1] "number in a = 10"
# [1] "mean of b = 0.0150895754612136"
# [1] "sem of b = 0.0066606844100933"
# [1] "sd of b = 0.0333034220504665"
# [1] "number in b = 25"



# Compare d-d and g-d sharing, significant difference, d-d higher than g-d

# Upper-triangle drug-drug sharing vs every growth (row) x drug (column) sharing value.
dd_sharing_block <- g_and_d_overlap_dec[6:10, 6:10]
gd_sharing_values <- unlist(g_and_d_overlap_dec[1:5, 6:10], use.names = FALSE)
compare(dd_sharing_block[upper.tri(dd_sharing_block)], gd_sharing_values)

	# Welch Two Sample t-test

# data:  a and b
# t = 15.358, df = 9.2761, p-value = 6.573e-08
# alternative hypothesis: true difference in means is not equal to 0
# 95 percent confidence interval:
 # 0.7118273 0.9564389
# sample estimates:
 # mean of x  mean of y 
# 0.84922267 0.01508958 

# [1] "exact P value = 6.57316692059667e-08"
# [1] "mean of a = 0.849222672064777"
# [1] "sem of a = 0.0539015114263782"
# [1] "sd of a = 0.170451545432946"
# [1] "number in a = 10"
# [1] "mean of b = 0.0150895754612136"
# [1] "sem of b = 0.0066606844100933"
# [1] "sd of b = 0.0333034220504665"
# [1] "number in b = 25"


# In fact, main source of g-d sharing is from g_75nM, as one might expect!

# Growth rows x drug columns of the sharing matrix, selected by label.
g_and_d_overlap_dec[c("g 0nM", "g 8nM", "g 25nM", "g 75nM", "g avg"), c("d w2", "d w3", "d w4", "d w6", "d avg")]
              # d w2        d w3        d w4        d w6       d avg
# g 0nM  0.002444988 0.009779951 0.004889976 0.004889976 0.009779951
# g 8nM  0.000000000 0.000000000 0.000000000 0.000000000 0.000000000
# g 25nM 0.000000000 0.000000000 0.000000000 0.000000000 0.000000000
# g 75nM 0.000000000 0.072727273 0.072727273 0.127272727 0.072727273
# g avg  0.000000000 0.000000000 0.000000000 0.000000000 0.000000000


# Difference in sharing ratio of g-d for 75 nM is significantly higher than rest of g-d (also repeated below):

# g 75nM row of the growth x drug sharing block vs all other growth rows.
# Both samples are derived from the same row label, so the "rest" can never
# drift away from the selected row (previously the complement was taken by
# hard-coded position 4).
gd_sharing <- g_and_d_overlap_dec[1:5, 6:10]
gd_sharing_is_75nM <- rownames(gd_sharing) == "g 75nM"
compare(as.vector(as.matrix(gd_sharing[gd_sharing_is_75nM, ])), as.vector(as.matrix(gd_sharing[!gd_sharing_is_75nM, ])))

	# Welch Two Sample t-test

# data:  a and b
# t = 3.3319, df = 4.01, p-value = 0.02894
# alternative hypothesis: true difference in means is not equal to 0
# 95 percent confidence interval:
 # 0.01130877 0.12369456
# sample estimates:
  # mean of x   mean of y 
# 0.069090909 0.001589242 

# [1] "exact P value = 0.0289441934090892"
# [1] "mean of a = 0.0690909090909091"
# [1] "sem of a = 0.0202464158648364"
# [1] "sd of a = 0.0452723621745045"
# [1] "number in a = 5"
# [1] "mean of b = 0.00158924205378973"
# [1] "sem of b = 0.000715583618854046"
# [1] "sd of b = 0.0032001872306859"
# [1] "number in b = 20"


# Also,  -log10P  of g-d for 75 nM significantly higher than rest of g-d:

# -log10(P) of the g 75nM row of the growth x drug block vs all other growth rows.
# Both samples are derived from the same row label (previously the
# complement was taken by hard-coded position 4).
gd_pval <- g_and_d_fish_pval[1:5, 6:10]
gd_pval_is_75nM <- rownames(gd_pval) == "g 75nM"
compare(-log10(as.numeric(as.matrix(gd_pval[gd_pval_is_75nM, ]))), -log10(as.numeric(as.matrix(gd_pval[!gd_pval_is_75nM, ]))))

	# Welch Two Sample t-test

# data:  a and b
# t = 2.9183, df = 4.1446, p-value = 0.04148
# alternative hypothesis: true difference in means is not equal to 0
# 95 percent confidence interval:
  # 0.4533086 14.2725983
# sample estimates:
# mean of x mean of y 
# 8.1193492 0.7563958 

# [1] "exact P value = 0.0414820843379339"
# [1] "mean of a = 8.1193492016393"
# [1] "sem of a = 2.50066303556204"
# [1] "sd of a = 5.59165253633771"
# [1] "number in a = 5"
# [1] "mean of b = 0.756395765603084"
# [1] "sem of b = 0.335031923473759"
# [1] "sd of b = 1.49830831103966"
# [1] "number in b = 20"



# Significant diff using odds ratio of g-d for 75 nM vs rest of g-d:
# <<<<<<<<<<<< do NOT use in paper >>>>>>>>>>>>>>>>>>>>
# <<<<<<<<<<<<< data too sparse >>>>>>>>>>>>>>>>>>>>>>>>

# Growth rows x drug columns of the odds-ratio table, selected by label.
g_and_d_fish_odds_ratio[c("g 0nM", "g 8nM", "g 25nM", "g 75nM", "g avg"), c("d w2", "d w3", "d w4", "d w6", "d avg")]
                   # d w2             d w3             d w4             d w6            d avg
# g 0nM  146.793794765525 26.9926680643548 17.3915529187817 12.8566517104643  28.311543890846
# g 8nM                 0                0                0                0                0
# g 25nM                0                0                0                0                0
# g 75nM                0 214.082394443306 315.537403235159 489.995020386973 225.557897016407
# g avg                 0                0                0                0                0



# Odds ratios of the g 75nM row of the growth x drug block vs all other growth rows.
# Both samples are derived from the same row label (previously the
# complement was taken by hard-coded position 4).
gd_odds_ratio <- g_and_d_fish_odds_ratio[1:5, 6:10]
gd_odds_ratio_is_75nM <- rownames(gd_odds_ratio) == "g 75nM"
compare(as.numeric(as.matrix(gd_odds_ratio[gd_odds_ratio_is_75nM, ])), as.numeric(as.matrix(gd_odds_ratio[!gd_odds_ratio_is_75nM, ])))

# # 	Welch Two Sample t-test <<<<<<<<<<<<<<<<<< do NOT use in paper, data too sparse

# data:  a and b
# t = 2.975, df = 4.0697, p-value = 0.04006
# alternative hypothesis: true difference in means is not equal to 0
# 95 percent confidence interval:
  # 17.33609 457.49838
# sample estimates:
# mean of x mean of y 
# 249.03454  11.61731 

# [1] "exact P value = 0.0400593456340094"
# [1] "mean of a = 249.034543016369"
# [1] "sem of a = 79.459295014681"
# [1] "sd of a = 177.676385097037"
# [1] "number in a = 5"
# [1] "mean of b = 11.6173105674986"
# [1] "sem of b = 7.40680471950154"
# [1] "sd of b = 33.1242376977434"
# [1] "number in b = 20"



# ^^^^^^^^^^^ do NOT use in paper ^^^^^^^^^^^^^^^^
# ^^^^^^^^^^ data too sparse ^^^^^^^^^^^^^^^^^^^^^^


# However, spot quote that highest g/d overlap is at g 75 nM and d 6 wks, cf tables above.
# odds ratio = 489.995020386972, P = 1.61348822235117e-16 <<<<<<<<<<<<< use in paper





# g-d for 75 nM significantly higher than rest of g-d:
# <<<<<<<<<<<< do not use in paper >>>>>>>>>>>>>>>>>>>>

# Growth rows x drug columns of the sharing matrix, selected by label.
g_and_d_overlap_dec[c("g 0nM", "g 8nM", "g 25nM", "g 75nM", "g avg"), c("d w2", "d w3", "d w4", "d w6", "d avg")]
              # d w2        d w3        d w4        d w6       d avg
# g 0nM  0.002444988 0.009779951 0.004889976 0.004889976 0.009779951
# g 8nM  0.000000000 0.000000000 0.000000000 0.000000000 0.000000000
# g 25nM 0.000000000 0.000000000 0.000000000 0.000000000 0.000000000
# g 75nM 0.000000000 0.072727273 0.072727273 0.127272727 0.072727273
# g avg  0.000000000 0.000000000 0.000000000 0.000000000 0.000000000

# g 75nM row of the growth x drug sharing block vs all other growth rows.
# Both samples are derived from the same row label (previously the
# complement was taken by hard-coded position 4).
gd_share_block <- g_and_d_overlap_dec[1:5, 6:10]
gd_share_is_75nM <- rownames(gd_share_block) == "g 75nM"
compare(as.numeric(as.matrix(gd_share_block[gd_share_is_75nM, ])), as.numeric(as.matrix(gd_share_block[!gd_share_is_75nM, ])))

	# Welch Two Sample t-test

# data:  a and b
# t = 3.3319, df = 4.01, p-value = 0.02894
# alternative hypothesis: true difference in means is not equal to 0
# 95 percent confidence interval:
 # 0.01130877 0.12369456
# sample estimates:
  # mean of x   mean of y 
# 0.069090909 0.001589242 

# [1] "exact P value = 0.0289441934090892"
# [1] "mean of a = 0.0690909090909091"
# [1] "sem of a = 0.0202464158648364"
# [1] "sd of a = 0.0452723621745045"
# [1] "number in a = 5"
# [1] "mean of b = 0.00158924205378973"
# [1] "sem of b = 0.000715583618854046"
# [1] "sd of b = 0.0032001872306859"
# [1] "number in b = 20"



# ^^^^^^^^^^^ do not use in paper ^^^^^^^^^^^^^^^^





# Numbers overlapping loci in non-unique g and non-unique d:

# Overlap counts of each growth group (rows) against each drug group (columns).
gd_counts <- g_and_d_overlap_frac[1:5, 6:10]
gd_counts
       # d w2 d w3 d w4 d w6 d avg
# g 0nM     1    4    2    2     4
# g 8nM     0    0    0    0     0
# g 25nM    0    0    0    0     0
# g 75nM    0    4    4    7     4
# g avg     0    0    0    0     0

sum(gd_counts)
# [1] 32

# Alternatively, count the rows of d whose locus occurs anywhere in g. However, the sum above
# is likely more reliable because it counts each contingency separately.
d_locus_key <- paste(d$Chromosome, d$ensembl_gene_id, sep = "\r")
g_locus_key <- paste(g$Chromosome, g$ensembl_gene_id, sep = "\r")
sum(d_locus_key %in% g_locus_key)
# [1] 28






# ----------------------------------------- Compare Ix vs g, Ix vs d, sharing ------------------------ 

# For Ix loci expect symmetry with d and g, with largest effects at highest g and d levels.

# # Check Ix overlap using sharing ratio. Could also use differences in pvals (or logP vals if fisher.test allowed), and odds ratios. Decided to use odds ratio.


# Ix-row sharing with the five growth groups vs with the five drug groups.
ix_sharing_with_g <- as.numeric(unlist(g_and_d_overlap_dec["Ix", 1:5], use.names = FALSE))
ix_sharing_with_d <- as.numeric(unlist(g_and_d_overlap_dec["Ix", 6:10], use.names = FALSE))
compare(ix_sharing_with_g, ix_sharing_with_d)

	# Welch Two Sample t-test

# data:  a and b
# t = -0.90874, df = 7.6923, p-value = 0.3911
# alternative hypothesis: true difference in means is not equal to 0
# 95 percent confidence interval:
 # -0.18350266  0.08027686
# sample estimates:
 # mean of x  mean of y 
# 0.06774194 0.11935484 

# [1] "exact P value = 0.391069969635478"
# [1] "mean of a = 0.067741935483871"
# [1] "sem of a = 0.043994134506406"
# [1] "sd of a = 0.0983738753675929"
# [1] "number in a = 5"
# [1] "mean of b = 0.119354838709677"
# [1] "sem of b = 0.035921060405355"
# [1] "sd of b = 0.0803219328902499"
# [1] "number in b = 5"




# Also, perhaps surprisingly, the Ix vs g OR is not significantly lower than the Ix vs d OR:

# Ix-row odds ratios against the five growth groups vs against the five drug groups.
ix_or_with_g <- as.numeric(unlist(g_and_d_fish_odds_ratio["Ix", 1:5], use.names = FALSE))
ix_or_with_d <- as.numeric(unlist(g_and_d_fish_odds_ratio["Ix", 6:10], use.names = FALSE))
compare(ix_or_with_g, ix_or_with_d)

	# Welch Two Sample t-test

# data:  a and b
# t = -2.0428, df = 5.114, p-value = 0.09527
# alternative hypothesis: true difference in means is not equal to 0
# 95 percent confidence interval:
 # -1246.9470   138.5405
# sample estimates:
# mean of x mean of y 
 # 99.12003 653.32324 

# [1] "exact P value = 0.0952672548747031"
# [1] "mean of a = 99.1200327060196"
# [1] "sem of a = 95.6849368808552"
# [1] "sd of a = 213.958023288369"
# [1] "number in a = 5"
# [1] "mean of b = 653.323244918617"
# [1] "sem of b = 253.864603214193"
# [1] "sd of b = 567.658509867947"
# [1] "number in b = 5"





# Ix vs g sharing lower, but not significantly, than Ix vs d:

# Ix-row sharing with the five growth groups vs with the five drug groups.
ix_row_sharing <- g_and_d_overlap_dec["Ix", ]
compare(as.numeric(unlist(ix_row_sharing[, 1:5], use.names = FALSE)), as.numeric(unlist(ix_row_sharing[, 6:10], use.names = FALSE)))

	# Welch Two Sample t-test 

# data:  a and b
# t = -0.90874, df = 7.6923, p-value = 0.3911
# alternative hypothesis: true difference in means is not equal to 0
# 95 percent confidence interval:
 # -0.18350266  0.08027686
# sample estimates:
 # mean of x  mean of y 
# 0.06774194 0.11935484 

# [1] "exact P value = 0.391069969635478"
# [1] "mean of a = 0.067741935483871"
# [1] "sem of a = 0.043994134506406"
# [1] "sd of a = 0.0983738753675929"
# [1] "number in a = 5"
# [1] "mean of b = 0.119354838709677"
# [1] "sem of b = 0.035921060405355"
# [1] "sd of b = 0.0803219328902499"
# [1] "number in b = 5"






# Most of the 21 loci (see also below) that overlap between Ix and non-unique g occur at g 75 nM:

ix_counts <- g_and_d_overlap_frac["Ix", ]
ix_locus_key <- paste(Ix$Chromosome, Ix$ensembl_gene_id, sep = "\r")

ix_counts[, 1:5]
   # g 0nM g 8nM g 25nM g 75nM g avg
# Ix     2     0      2     15     2


# Alternative way to count the loci overlapping between Ix and non-unique g:
# number of rows of g whose locus occurs in Ix.
sum(paste(g$Chromosome, g$ensembl_gene_id, sep = "\r") %in% ix_locus_key)
# [1] 21


# Ix loci that overlap with non-unique d loci
ix_counts[, 6:10]
   # d w2 d w3 d w4 d w6 d avg
# Ix    0    7    8   14     8

sum(ix_counts[, 6:10])
# [1] 37


# Alternatively: number of rows of d whose locus occurs in Ix.
sum(paste(d$Chromosome, d$ensembl_gene_id, sep = "\r") %in% ix_locus_key)
# [1] 37





# The overlap for Ix vs g 75nM has a significantly higher OR than Ix vs the other growth groups
# (columns 1:3 and 5, i.e. g 0nM, g 8nM, g 25nM and g avg).
# One-sample t-test: the four other odds ratios are tested against the single g 75nM odds ratio, passed as mu.

t.test(as.numeric(g_and_d_fish_odds_ratio["Ix",c(1:3,5)]),mu=as.numeric(g_and_d_fish_odds_ratio["Ix",4]))

	# One Sample t-test

# data:  as.numeric(g_and_d_fish_odds_ratio["Ix", c(1:3, 5)])
# t = -414.77, df = 3, p-value = 3.091e-08
# alternative hypothesis: true mean is not equal to 481.8431
# 95 percent confidence interval:
 # -0.2314141  7.1099481
# sample estimates:
# mean of x 
 # 3.439267 
 

# Exact p-value of the one-sample test of the other growth-group odds ratios against the g 75nM odds ratio.
ix_or_one_sample <- t.test(
  as.numeric(g_and_d_fish_odds_ratio["Ix", c(1:3, 5)]),
  mu = as.numeric(g_and_d_fish_odds_ratio["Ix", 4]),
  alternative = "two.sided"
)
ix_or_one_sample$p.value
# [1] 3.09054e-08 


# However, it is more conservative (and accurate) to use a pooled-variance two-sample test, since the
# single-observation sample (the g 75nM odds ratio) is not a theoretical value, but is empirical.
# var.equal=TRUE is needed here: the second sample has a single observation.

t.test(as.numeric(g_and_d_fish_odds_ratio["Ix",c(1:3,5)]),as.numeric(g_and_d_fish_odds_ratio["Ix",4]),var.equal=TRUE)

	# Two Sample t-test

# data:  as.numeric(g_and_d_fish_odds_ratio["Ix", c(1:3, 5)]) and as.numeric(g_and_d_fish_odds_ratio["Ix", 4])
# t = -185.49, df = 3, p-value = 3.455e-07
# alternative hypothesis: true difference in means is not equal to 0
# 95 percent confidence interval:
 # -486.6117 -470.1959
# sample estimates:
 # mean of x  mean of y 
  # 3.439267 481.843096 







# Mean and SEM of the Ix odds ratios against g 0nM, g 8nM, g 25nM and g avg (columns 1:3 and 5).
ix_other_g_or <- as.numeric(unlist(g_and_d_fish_odds_ratio["Ix", c(1:3, 5)], use.names = FALSE))
mean(ix_other_g_or)
# [1] 3.439267

sem(ix_other_g_or)
# [1] 1.153415 









# The overlap for Ix vs g 75nM has significantly higher sharing than Ix vs the other growth groups
# (columns 1:3 and 5, i.e. g 0nM, g 8nM, g 25nM and g avg). Pooled-variance two-sample test against
# the single g 75nM sharing value.

t.test(as.numeric(g_and_d_overlap_dec["Ix",c(1:3,5)]),as.numeric(g_and_d_overlap_dec["Ix",4]),var.equal=TRUE)

	# Two Sample t-test 

# data:  as.numeric(g_and_d_overlap_dec["Ix", c(1:3, 5)]) and as.numeric(g_and_d_overlap_dec["Ix", 4])
# t = -12.075, df = 3, p-value = 0.001222
# alternative hypothesis: true difference in means is not equal to 0
# 95 percent confidence interval:
 # -0.2751304 -0.1603535
# sample estimates:
 # mean of x  mean of y 
# 0.02419355 0.24193548 

 

# Exact p-value of the pooled-variance test of the other growth-group sharing values against g 75nM.
ix_sharing_two_sample <- t.test(
  as.numeric(g_and_d_overlap_dec["Ix", c(1:3, 5)]),
  as.numeric(g_and_d_overlap_dec["Ix", 4]),
  var.equal = TRUE
)
ix_sharing_two_sample$p.value
# [1] 0.001222401


# Mean and SEM of the Ix sharing values with g 0nM, g 8nM, g 25nM and g avg (columns 1:3 and 5).
ix_other_g_sharing <- as.numeric(unlist(g_and_d_overlap_dec["Ix", c(1:3, 5)], use.names = FALSE))
mean(ix_other_g_sharing)
# [1] 0.02419355

sem(ix_other_g_sharing)
# [1] 0.008064516 





# -------------- Number of genes overlap between Ix and g_unique and d_unique -----------------------

# Decided to quote the numbers directly in the paper body rather than create a figure like the one for g and d.
# In addition, created Venn diagram to show overlap of Ix loci with g_unique, d_unique (see below)

# 15 out of 62 Ix loci overlap with 859 g_unique

# TRUE for every Ix row whose (Chromosome, ensembl_gene_id) pair occurs in g_unique.
ix_in_g_unique <- paste(Ix$Chromosome, Ix$ensembl_gene_id, sep = "\r") %in% paste(g_unique$Chromosome, g_unique$ensembl_gene_id, sep = "\r")

dim(Ix[ix_in_g_unique, ])
# [1] 15 50 <<<<<<<<<<< use in paper

dim(Ix)
# [1] 62 50 <<<<<<<<<<< use in paper

# Fraction of Ix loci that overlap (computed from the data rather than typed in by hand)
sum(ix_in_g_unique) / nrow(Ix)
# [1] 0.2419355

# All overlapping growth genes, except 2 (ie 13 out of 15), are 75nM in g_unique

# Chromosome and ensembl_gene_id are concatenated to cover the rare case of genes with an identical ensembl_gene_id on different chromosomes. Similar to the approach used to make g_and_d_overlap_frac above (though there are actually no identical ensembl_gene_id values in g_unique or d_unique).

locus_report_cols_g <- c("Chromosome", "pos", "conc", "log10P", "ensembl_gene_id", "geneSymbol", "dist")
g_unique_in_ix <- paste(g_unique$Chromosome, g_unique$ensembl_gene_id, sep = "\r") %in% paste(Ix$Chromosome, Ix$ensembl_gene_id, sep = "\r")
g_unique[g_unique_in_ix, locus_report_cols_g]
    # Chromosome       pos conc    log10P ensembl_gene_id geneSymbol    dist
# 3         chr1   9050000   75  23.19961 ENSG00000142583     SLC2A5       0
# 91        chr2  29470000   75  20.53619 ENSG00000171094        ALK       0
# 343       chr6  31870000  avg  27.14249 ENSG00000204385    SLC44A4       0
# 354       chr6  61740000   75  42.00783 ENSG00000112232    KHDRBS2       0
# 385       chr7   1620000   75  19.22358 ENSG00000231476 AC074389.2     654
# 386       chr7   5800000   75  19.90818 ENSG00000011275     RNF216  -18262
# 398       chr7  85120000   75  21.17618 ENSG00000153993     SEMA3D       0
# 413       chr7 123690000   75  19.79111 ENSG00000106299       WASL       0
# 538      chr11  54280000   75  22.46447           cen11        CEN       0
# 587      chr12  34460000   75  18.17542 ENSG00000256614      AK6P1 -210043
# 660      chr15  16980000   75  19.52518           cen15        CEN  103674
# 668      chr15  48020000   75  18.91021 ENSG00000259754 AC092078.2       0
# 714      chr16  83400000   25 111.12911 ENSG00000140945      CDH13       0
# 771      chr19  19460000   75  20.27804 ENSG00000167491    GATAD2A       0
# 827       chrX  58750000   75  19.22160            cenX        CEN       0

# Nearly all overlapping growth genes (except 6, ie 15 out of 21) are also 75 nM in g (see also g_and_d_overlap_frac above):

g_in_ix <- paste(g$Chromosome, g$ensembl_gene_id, sep = "\r") %in% paste(Ix$Chromosome, Ix$ensembl_gene_id, sep = "\r")
g[g_in_ix, c("Chromosome", "pos", "conc", "log10P", "ensembl_gene_id", "geneSymbol", "dist")]
     # Chromosome       pos conc     log10P ensembl_gene_id geneSymbol    dist
# 4          chr1   9050000   75  23.199612 ENSG00000142583     SLC2A5       0
# 184        chr2  29470000   75  20.536194 ENSG00000171094        ALK       0
# 746        chr6  31870000   25  26.032704 ENSG00000204385    SLC44A4       0
# 747        chr6  31870000   75  20.909363 ENSG00000204385    SLC44A4       0
# 748        chr6  31870000  avg  27.142494 ENSG00000204385    SLC44A4       0
# 767        chr6  61740000   75  42.007826 ENSG00000112232    KHDRBS2       0
# 827        chr7   1620000   75  19.223579 ENSG00000231476 AC074389.2     654
# 828        chr7   5800000   75  19.908180 ENSG00000011275     RNF216  -18262
# 847        chr7  85120000   75  21.176183 ENSG00000153993     SEMA3D       0
# 881        chr7 123690000    0   7.201145 ENSG00000106299       WASL       0
# 882        chr7 123690000   75  19.791108 ENSG00000106299       WASL       0
# 1155      chr11  54280000   75  22.464474           cen11        CEN       0
# 1240      chr12  34460000   75  18.175422 ENSG00000256614      AK6P1 -210043
# 1411      chr15  16980000   75  19.525177           cen15        CEN  103674
# 1433      chr15  48020000   75  18.910210 ENSG00000259754 AC092078.2       0
# 1535      chr16  83400000   25 111.129115 ENSG00000140945      CDH13       0
# 1536      chr16  83400000  avg 110.411876 ENSG00000140945      CDH13       0
# 1537      chr16  83420000   75  45.308429 ENSG00000140945      CDH13       0
# 1647      chr19  19460000   75  20.278043 ENSG00000167491    GATAD2A       0
# 1759       chrX  58750000   75  19.221598            cenX        CEN       0
# 1760       chrX  62080000    0   8.959412            cenX        CEN       0



# 14 out of 62 Ix loci overlap with 38 d_unique

# TRUE for every Ix row whose (Chromosome, ensembl_gene_id) pair occurs in d_unique.
ix_in_d_unique <- paste(Ix$Chromosome, Ix$ensembl_gene_id, sep = "\r") %in% paste(d_unique$Chromosome, d_unique$ensembl_gene_id, sep = "\r")

dim(Ix[ix_in_d_unique, ])
# [1] 14 50 <<<<<<<<<<<<<< use in paper

dim(Ix)
# [1] 62 50 <<<<<<<<<<<<<< use in paper

# ratio overlap (computed from the data rather than typed in by hand)
sum(ix_in_d_unique) / nrow(Ix)
# [1] 0.2258065

dim(d_unique)
# [1] 38 50 <<<<<<<<<<<<<< use in paper







# All overlapping Ix genes are wk 6 in d_unique

locus_report_cols_d <- c("Chromosome", "pos", "wk", "log10P", "ensembl_gene_id", "geneSymbol", "dist")
d_unique_in_ix <- paste(d_unique$Chromosome, d_unique$ensembl_gene_id, sep = "\r") %in% paste(Ix$Chromosome, Ix$ensembl_gene_id, sep = "\r")
d_unique[d_unique_in_ix, locus_report_cols_d]
   # Chromosome       pos wk   log10P ensembl_gene_id geneSymbol    dist
# 2        chr1 120440000  6 25.15358 ENSG00000270231      NBPF8       0
# 3        chr1 149010000  6 20.94135 ENSG00000178104    PDE4DIP       0
# 9        chr6  61910000  6 29.55090 ENSG00000112232    KHDRBS2       0
# 11       chr7   1620000  6 17.18092 ENSG00000231476 AC074389.2     654
# 14       chr7  85100000  6 21.27563 ENSG00000153993     SEMA3D       0
# 16       chr9  19690000  6 18.00797 ENSG00000155886    SLC24A2       0
# 24      chr11  54270000  6 31.37027           cen11        CEN       0
# 25      chr12  34390000  6 27.86534 ENSG00000256614      AK6P1 -140043
# 26      chr13  70380000  6 20.50014 ENSG00000202433   RNU6-54P   79464
# 30      chr16  36310000  6 17.47496           cen16        CEN    1159
# 33      chr19  19460000  6 17.80277 ENSG00000167491    GATAD2A       0
# 36      chr20  29950000  6 16.51963           cen20        CEN       0
# 37      chr20  41780000  6 19.87393 ENSG00000212224    RF00568  -67401
# 38       chrX  58840000  6 39.72011            cenX        CEN       0



# In the full (non-unique) d table, by contrast, the loci shared with Ix are
# not restricted to wk 6 (see also g_and_d_overlap_frac above):

d[
  is.element(
    with(d, paste(Chromosome, ensembl_gene_id, sep = "\r")),
    with(Ix, paste(Chromosome, ensembl_gene_id, sep = "\r"))
  ),
  c("Chromosome", "pos", "wk", "log10P", "ensembl_gene_id", "geneSymbol", "dist")
]
   # Chromosome       pos  wk    log10P ensembl_gene_id geneSymbol    dist
# 2        chr1 120440000   6 25.153583 ENSG00000270231      NBPF8       0
# 3        chr1 120470000   3  6.977060 ENSG00000270231      NBPF8   -2262
# 4        chr1 120470000   4 18.199233 ENSG00000270231      NBPF8   -2262
# 5        chr1 120470000 avg  9.151064 ENSG00000270231      NBPF8   -2262
# 6        chr1 148990000   4 13.776576 ENSG00000178104    PDE4DIP       0
# 7        chr1 148990000 avg  7.234504 ENSG00000178104    PDE4DIP       0
# 8        chr1 149010000   6 20.941351 ENSG00000178104    PDE4DIP       0
# 24       chr6  61900000   3  7.133516 ENSG00000112232    KHDRBS2       0
# 25       chr6  61910000   4 15.692691 ENSG00000112232    KHDRBS2       0
# 26       chr6  61910000   6 29.550898 ENSG00000112232    KHDRBS2       0
# 27       chr6  61910000 avg  8.692963 ENSG00000112232    KHDRBS2       0
# 32       chr7   1620000   6 17.180916 ENSG00000231476 AC074389.2     654
# 38       chr7  85100000   6 21.275626 ENSG00000153993     SEMA3D       0
# 43       chr9  19690000   6 18.007965 ENSG00000155886    SLC24A2       0
# 56      chr11  51160000   3 10.361451           cen11        CEN       0
# 57      chr11  51160000   4 20.667361           cen11        CEN       0
# 58      chr11  51160000 avg 12.445448           cen11        CEN       0
# 59      chr11  54270000   6 31.370275           cen11        CEN       0
# 60      chr12  34390000   6 27.865343 ENSG00000256614      AK6P1 -140043
# 61      chr12  34410000   3 10.504746 ENSG00000256614      AK6P1 -160043
# 62      chr12  34410000   4 19.731760 ENSG00000256614      AK6P1 -160043
# 63      chr12  34410000 avg 12.361427 ENSG00000256614      AK6P1 -160043
# 64      chr13  70380000   3  6.310634 ENSG00000202433   RNU6-54P   79464
# 65      chr13  70380000   4 13.006915 ENSG00000202433   RNU6-54P   79464
# 66      chr13  70380000   6 20.500142 ENSG00000202433   RNU6-54P   79464
# 67      chr13  70380000 avg  7.605702 ENSG00000202433   RNU6-54P   79464
# 77      chr16  36310000   6 17.474960           cen16        CEN    1159
# 78      chr16  36340000   3  6.923263           cen16        CEN       0
# 79      chr16  36340000   4 12.654739           cen16        CEN       0
# 80      chr16  36340000 avg  8.089122           cen16        CEN       0
# 87      chr19  19460000   6 17.802773 ENSG00000167491    GATAD2A       0
# 92      chr20  29950000   6 16.519634           cen20        CEN       0
# 93      chr20  41780000   6 19.873927 ENSG00000212224    RF00568  -67401
# 94       chrX  58840000   3 13.903361            cenX        CEN       0
# 95       chrX  58840000   4 28.477798            cenX        CEN       0
# 96       chrX  58840000   6 39.720109            cenX        CEN       0
# 97       chrX  58840000 avg 16.820132            cenX        CEN       0


# Ix loci that occur in g_unique OR d_unique (or both).
# Testing membership in the pooled key vector is the same as OR-ing the two
# separate membership tests.

# how many
dim(Ix[
  with(Ix, paste(Chromosome, ensembl_gene_id, sep = "\r")) %in%
    c(
      with(g_unique, paste(Chromosome, ensembl_gene_id, sep = "\r")),
      with(d_unique, paste(Chromosome, ensembl_gene_id, sep = "\r"))
    ),
])
# [1] 22 50 <<<<<<<<<<<<<<<<< use in paper


# which ones
Ix[
  with(Ix, paste(Chromosome, ensembl_gene_id, sep = "\r")) %in%
    c(
      with(g_unique, paste(Chromosome, ensembl_gene_id, sep = "\r")),
      with(d_unique, paste(Chromosome, ensembl_gene_id, sep = "\r"))
    ),
  c("Chromosome", "pos", "log10P", "ensembl_gene_id", "geneSymbol", "dist", "gene_type")
]
   # Chromosome       pos    log10P ensembl_gene_id geneSymbol    dist                          gene_type
# 1        chr1   9050000  9.480375 ENSG00000142583     SLC2A5       0                     protein_coding
# 5        chr1 120440000 15.762313 ENSG00000270231      NBPF8       0 transcribed_unprocessed_pseudogene
# 6        chr1 149010000 13.848713 ENSG00000178104    PDE4DIP       0                     protein_coding
# 7        chr2  29470000  9.707829 ENSG00000171094        ALK       0                     protein_coding
# 17       chr6  31870000 10.177431 ENSG00000204385    SLC44A4       0                     protein_coding
# 20       chr6  61740000 27.133638 ENSG00000112232    KHDRBS2       0                     protein_coding
# 24       chr7   1620000 11.725745 ENSG00000231476 AC074389.2     654                             lncRNA
# 25       chr7   5800000 11.579497 ENSG00000011275     RNF216  -18262                     protein_coding
# 28       chr7  85100000 20.102790 ENSG00000153993     SEMA3D       0                     protein_coding
# 29       chr7 123690000 21.947363 ENSG00000106299       WASL       0                     protein_coding
# 30       chr9  19690000 12.363473 ENSG00000155886    SLC24A2       0                     protein_coding
# 36      chr11  54270000 19.636416           cen11        CEN       0                         centromere
# 38      chr12  34460000 15.386505 ENSG00000256614      AK6P1 -210043               processed_pseudogene
# 42      chr13  70310000 14.864337 ENSG00000202433   RNU6-54P  149464                              snRNA
# 45      chr15  16980000 16.578741           cen15        CEN  103674                         centromere
# 49      chr15  48020000  8.573601 ENSG00000259754 AC092078.2       0                             lncRNA
# 50      chr16  36690000  9.665248           cen16        CEN       0                         centromere
# 53      chr16  83430000 10.621957 ENSG00000140945      CDH13       0                     protein_coding
# 56      chr19  19450000 12.653597 ENSG00000167491    GATAD2A       0                     protein_coding
# 60      chr20  29950000 13.012052           cen20        CEN       0                         centromere
# 61      chr20  41740000 16.464143 ENSG00000212224    RF00568  -27401                             snoRNA
# 62       chrX  58750000 22.030698            cenX        CEN       0                         centromere




# Overlap of Ix with the union of g_unique and d_unique, tested with a 2x2
# Fisher's exact test against all genes (gene_num is defined earlier in the file).

# Number of Ix loci found in g_unique OR d_unique
(Ix_N_g_N_d <- nrow(Ix[
  with(Ix, paste(Chromosome, ensembl_gene_id, sep = "\r")) %in%
    c(
      with(g_unique, paste(Chromosome, ensembl_gene_id, sep = "\r")),
      with(d_unique, paste(Chromosome, ensembl_gene_id, sep = "\r"))
    ),
]))
# [1] 22 <<<<<<<< use in paper

# Total number of Ix loci
(Ix_num <- nrow(Ix))
# [1] 62

# Pool g_unique and d_unique (column 5 is dropped from both before stacking),
# then count each shared locus only once.
g_d_unique <- rbind(g_unique[, -5], d_unique[, -5])

(g_union_d <- length(unique(with(g_d_unique, paste(Chromosome, ensembl_gene_id, sep = "\r")))))
# [1] 887


# 2x2 table, filled column-wise:
#   column 1 = (Ix and union, union not Ix), column 2 = (Ix not union, neither)
fisher.test(
  matrix(
    c(
      Ix_N_g_N_d,
      g_union_d - Ix_N_g_N_d,
      Ix_num - Ix_N_g_N_d,
      gene_num - Ix_N_g_N_d - (g_union_d - Ix_N_g_N_d) - (Ix_num - Ix_N_g_N_d)
    ),
    2, 2
  )
)

	# Fisher's Exact Test for Count Data <<<<<<<<<<<<< use in paper

# data:  
# p-value < 2.2e-16
# alternative hypothesis: true odds ratio is not equal to 1
# 95 percent confidence interval:
 # 21.39318 65.79632
# sample estimates:
# odds ratio 
  # 37.94709 

# same test again, extracting the exact p-value
fisher.test(
  matrix(
    c(
      Ix_N_g_N_d,
      g_union_d - Ix_N_g_N_d,
      Ix_num - Ix_N_g_N_d,
      gene_num - Ix_N_g_N_d - (g_union_d - Ix_N_g_N_d) - (Ix_num - Ix_N_g_N_d)
    ),
    2, 2
  )
)[["p.value"]]
# [1] 6.609756e-25 <<<<<<<<<<<<<< use in paper




# Overlap of Ix with the loci shared by g_unique AND d_unique.
# Simplification: g_unique and d_unique are pooled into one set (their
# intersection) so a 2x2 test can be used; a 2x2x2 test may be more appropriate.

# Ix loci present in both g_unique and d_unique
(Ix_g_d <- nrow(Ix[
  with(Ix, paste(Chromosome, ensembl_gene_id, sep = "\r")) %in%
    intersect(
      with(g_unique, paste(Chromosome, ensembl_gene_id, sep = "\r")),
      with(d_unique, paste(Chromosome, ensembl_gene_id, sep = "\r"))
    ),
]))
# [1] 7 <<<<<<<<<< use in paper



# g_unique loci that are also d_unique loci
(g_d <- nrow(g_unique[
  with(g_unique, paste(Chromosome, ensembl_gene_id, sep = "\r")) %in%
    with(d_unique, paste(Chromosome, ensembl_gene_id, sep = "\r")),
]))
# [1] 10 <<<<<<<<<< use in paper


# 2x2 table, filled column-wise:
#   column 1 = (Ix and g&d, g&d not Ix), column 2 = (Ix not g&d, neither)
fisher.test(
  matrix(
    c(
      Ix_g_d,
      g_d - Ix_g_d,
      Ix_num - Ix_g_d,
      gene_num - Ix_g_d - (g_d - Ix_g_d) - (Ix_num - Ix_g_d)
    ),
    2, 2
  )
)
	# Fisher's Exact Test for Count Data

# data:  
# p-value < 2.2e-16
# alternative hypothesis: true odds ratio is not equal to 1
# 95 percent confidence interval:
   # 563.7131 16384.0000
# sample estimates:
# odds ratio 
  # 2447.407 <<<<<<<<<<<<< use in paper


# same test again, extracting the exact p-value
fisher.test(
  matrix(
    c(
      Ix_g_d,
      g_d - Ix_g_d,
      Ix_num - Ix_g_d,
      gene_num - Ix_g_d - (g_d - Ix_g_d) - (Ix_num - Ix_g_d)
    ),
    2, 2
  )
)[["p.value"]]
# [1] 9.860528e-20 <<<<<<<<<<<<< use in paper




# --------------- 2x2x2 Fisher test for Ix vs g_unique vs d_unique -------------------

# https://rcompanion.org/handbook/H_06.html
# For simplicity's sake decided to stick with analysis above for paper (i.e. Intersection of Ix AND g_unique AND d_unique), because no worries about correlated co-variates, interactions and the like
#
# venn_a ... venn_g are the seven segments of the three-set Venn diagram
# (Ix, g_unique, d_unique); venn_none is everything outside all three sets.
# They are deliberately NOT named a ... g: assigning to `g` and `d` would
# overwrite the growth / paclitaxel locus tables of the same name, which are
# used again further down (dim(g), dim(d)), and `c` would shadow base::c.
#
# Segment meaning (n == NOT):
#   venn_a     Ix,  d,  g
#   venn_b     nIx, d,  g
#   venn_c     Ix,  nd, g
#   venn_d     nIx, nd, g
#   venn_e     Ix,  d,  ng
#   venn_f     Ix,  nd, ng
#   venn_g     nIx, d,  ng
#   venn_none  nIx, nd, ng
#
# Correction relative to the earlier version of this section: the count of
# "Ix NOT g minus (Ix AND g AND d)" was stored as the (Ix, nd, g) cell, but
# Ix NOT g contains no g loci at all; it only gave 40 because the Ix-d-only and
# triple-overlap counts happen to be equal (7).  The (Ix, nd, g) and (Ix, nd, ng)
# cells were therefore swapped (40 / 8 instead of 8 / 40) and the (nIx, nd, g)
# cell, which is derived from them, was 809 instead of 841.
#
# The "expected" values below are derived from counts recorded elsewhere in this
# file (Ix = 62, g_unique = 859, d_unique = 38, Ix&g = 15, Ix&d = 14, g&d = 10,
# Ix&g&d = 7) and from gene_num = 60626 as implied by the previously recorded
# table; they have not been re-run -- TODO confirm by running this section.

# locus keys (Chromosome + ensembl_gene_id) of the three sets
Ix_key <- paste(Ix$Chromosome, Ix$ensembl_gene_id, sep = "\r")
g_unique_key <- paste(g_unique$Chromosome, g_unique$ensembl_gene_id, sep = "\r")
d_unique_key <- paste(d_unique$Chromosome, d_unique$ensembl_gene_id, sep = "\r")

# Ix AND g AND d
(venn_a <- sum(Ix_key %in% g_unique_key & Ix_key %in% d_unique_key))
# expected: 7

# g NOT Ix, and the d loci among them: nIx AND d AND g
ag_nIx <- g_unique_key[!(g_unique_key %in% Ix_key)]
(venn_b <- sum(d_unique_key %in% ag_nIx))
# expected: 3

# d NOT Ix, minus those shared with g: nIx AND d AND ng
ad_nIx <- d_unique_key[!(d_unique_key %in% Ix_key)]
(venn_g <- length(ad_nIx) - venn_b)
# expected: 21

# Ix AND d, minus the triple overlap: Ix AND d AND ng
aIx_ad <- Ix_key[Ix_key %in% d_unique_key]
(venn_e <- length(aIx_ad) - venn_a)
# expected: 7

# Ix NOT g holds only (Ix, d, ng) and (Ix, nd, ng); removing the former leaves
# Ix AND nd AND ng
aIx_ng <- Ix_key[!(Ix_key %in% g_unique_key)]
(venn_f <- length(aIx_ng) - venn_e)
# expected: 40

# Ix NOT d holds only (Ix, nd, g) and (Ix, nd, ng); removing the latter leaves
# Ix AND nd AND g
aIx_nd <- Ix_key[!(Ix_key %in% d_unique_key)]
(venn_c <- length(aIx_nd) - venn_f)
# expected: 8

# g only: nIx AND nd AND g
(venn_d <- nrow(g_unique) - venn_a - venn_b - venn_c)
# expected: 841

# outside all three sets: nIx AND nd AND ng
(venn_none <- gene_num - venn_a - venn_b - venn_c - venn_d - venn_e - venn_f - venn_g)
# expected: 59699


# Two 2x2 strata (rows Ix / nIx, columns d / nd): growth loci, then non-growth loci
matrix_1 <- matrix(c(venn_a, venn_b, venn_c, venn_d), 2)
matrix_2 <- matrix(c(venn_e, venn_f, venn_g, venn_none), 2, byrow = TRUE)

fish_list <- list(matrix_1, matrix_2)

fish_list
# expected:
# [[1]]
#      [,1] [,2]
# [1,]    7    8
# [2,]    3  841
#
# [[2]]
#      [,1]  [,2]
# [1,]    7    40
# [2,]   21 59699


# Long-format contingency table; eg nIx means NOT Ix, etc.
# Row order is kept as before because later code reorders rows by position.
cont_table_l <- data.frame(
  Ix = c("Ix", "nIx", "Ix", "nIx", "Ix", "Ix", "nIx", "nIx"),
  paclitaxel = c("d", "d", "nd", "nd", "d", "nd", "d", "nd"),
  growth = c("g", "g", "g", "g", "ng", "ng", "ng", "ng"),
  counts = as.numeric(c(venn_a, venn_b, venn_c, venn_d, venn_e, venn_f, venn_g, venn_none)),
  stringsAsFactors = FALSE
)

# factor levels follow order of first appearance: Ix/nIx, d/nd, g/ng
cont_table_l$Ix <- factor(cont_table_l$Ix, levels = unique(cont_table_l$Ix))
cont_table_l$paclitaxel <- factor(cont_table_l$paclitaxel, levels = unique(cont_table_l$paclitaxel))
cont_table_l$growth <- factor(cont_table_l$growth, levels = unique(cont_table_l$growth))


cont_table_l
# expected:
#    Ix paclitaxel growth counts
# 1  Ix          d      g      7
# 2 nIx          d      g      3
# 3  Ix         nd      g      8
# 4 nIx         nd      g    841
# 5  Ix          d     ng      7
# 6  Ix         nd     ng     40
# 7 nIx          d     ng     21
# 8 nIx         nd     ng  59699

cont_table_xtab <- xtabs(counts ~ Ix + paclitaxel + growth, data = cont_table_l)

# identical to fish_list above
cont_table_xtab
# expected:
# , , growth = g
#
#      paclitaxel
# Ix        d    nd
#   Ix      7     8
#   nIx     3   841
#
# , , growth = ng
#
#      paclitaxel
# Ix        d    nd
#   Ix      7    40
#   nIx    21 59699



# flattened table
ftable(cont_table_xtab)
# expected:
#                growth     g    ng
# Ix  paclitaxel
# Ix  d                     7     7
#     nd                    8    40
# nIx d                     3    21
#     nd                  841 59699

# NOTE: the outputs recorded further down in this 2x2x2 section
# (mantelhaen.test, woolf_test, the reordered table and its fisher.test) were
# produced with the earlier counts (40 / 809 / 8 / 59731) and must be
# regenerated before they are quoted.



# cf https://rcompanion.org/handbook/H_06.html
# Cochran-Mantel-Haenszel test on the 2x2x2 table built above by
# xtabs(counts ~ Ix + paclitaxel + growth): Ix vs paclitaxel, with growth as
# the stratifying (third) dimension.
# NOTE(review): every recorded output in this section reflects the counts held
# in cont_table_l at the time it was generated; regenerate if those counts change.

mantelhaen.test(cont_table_xtab)

	# Mantel-Haenszel chi-squared test with continuity correction

# data:  cont_table_xtab
# Mantel-Haenszel X-squared = 323.04, df = 1, p-value < 2.2e-16
# alternative hypothesis: true common odds ratio is not equal to 1
# 95 percent confidence interval:
  # 30.97145 293.55593
# sample estimates:
# common odds ratio 
         # 95.35121 

# same test, extracting the exact p-value
mantelhaen.test(cont_table_xtab)$p.value
# [1] 3.157035e-72


# Woolf test of homogeneity of the odds ratios across the strata.
# Significant, so C-M-H test not appropriate
woolf_test(cont_table_xtab)

	# Woolf-test on Homogeneity of Odds Ratios (no 3-Way assoc.)

# data:  cont_table_xtab
# X-squared = 19.224, df = 1, p-value = 1.162e-05


# For fun group by Ix (Ix becomes the third, stratifying dimension)
# Now mantelhaen n.s., but Woolf stays sig.

mantelhaen.test(xtabs(counts ~ paclitaxel + growth + Ix, data= cont_table_l))

	# Mantel-Haenszel chi-squared test with continuity correction

# data:  xtabs(counts ~ paclitaxel + growth + Ix, data = cont_table_l)
# Mantel-Haenszel X-squared = 0.080823, df = 1, p-value = 0.7762
# alternative hypothesis: true common odds ratio is not equal to 1
# 95 percent confidence interval:
 # 0.3106049 2.0870525
# sample estimates:
# common odds ratio 
         # 0.805139 

# Woolf test for the Ix-stratified arrangement
woolf_test(xtabs(counts ~ paclitaxel + growth + Ix, data= cont_table_l))

	# Woolf-test on Homogeneity of Odds Ratios (no 3-Way assoc.)

# data:  xtabs(counts ~ paclitaxel + growth + Ix, data = cont_table_l)
# X-squared = 19.224, df = 1, p-value = 1.162e-05






# Collapse the 2x2x2 table to a 4 x 2 (c x r) table.
# Rows 2 and 3 of cont_table_l are swapped so that, read in groups of four,
# the counts run Ix/d, Ix/nd, nIx/d, nIx/nd within each growth level.

cont_table_2_l <- cont_table_l[c(1, 3, 2, 4, 5, 6, 7, 8), ]

cont_table_2_l
   # Ix paclitaxel growth counts
# 1  Ix          d      g      7
# 3  Ix         nd      g     40
# 2 nIx          d      g      3
# 4 nIx         nd      g    809
# 5  Ix          d     ng      7
# 6  Ix         nd     ng      8
# 7 nIx          d     ng     21
# 8 nIx         nd     ng  59731


# 4-row matrix of the counts (column 1 = g, column 2 = ng); same as ftable above
matrix(cont_table_2_l$counts, 4)
    # [,1]  [,2]
# [1,]    7     7
# [2,]   40     8
# [3,]    3    21
# [4,]  809 59731



# Fisher's exact test on the 4 x 2 table -- not sure this makes sense?
fisher.test(matrix(cont_table_2_l$counts, 4))
	# Fisher's Exact Test for Count Data

# data:  matrix(cont_table_2_l$counts, 4)
# p-value < 2.2e-16
# alternative hypothesis: two.sided

# same test, extracting the exact p-value
fisher.test(matrix(cont_table_2_l$counts, 4))[["p.value"]]
# [1] 1.127011e-76










# Ix loci found in neither g_unique nor d_unique
# (NOT in g_unique AND NOT in d_unique).

# how many
dim(Ix[
  !(with(Ix, paste(Chromosome, ensembl_gene_id, sep = "\r")) %in%
    with(g_unique, paste(Chromosome, ensembl_gene_id, sep = "\r"))) &
    !(with(Ix, paste(Chromosome, ensembl_gene_id, sep = "\r")) %in%
      with(d_unique, paste(Chromosome, ensembl_gene_id, sep = "\r"))),
])
# [1] 40 50 <<<<<<<<<<<<< use in paper

# which ones
Ix[
  !(with(Ix, paste(Chromosome, ensembl_gene_id, sep = "\r")) %in%
    with(g_unique, paste(Chromosome, ensembl_gene_id, sep = "\r"))) &
    !(with(Ix, paste(Chromosome, ensembl_gene_id, sep = "\r")) %in%
      with(d_unique, paste(Chromosome, ensembl_gene_id, sep = "\r"))),
  c("Chromosome", "pos", "log10P", "ensembl_gene_id", "geneSymbol", "dist", "gene_type")
]
   # Chromosome       pos    log10P ensembl_gene_id geneSymbol    dist                          gene_type
# 2        chr1  37020000 13.880037 ENSG00000163873      GRIK3       0                     protein_coding
# 3        chr1  40310000  9.576563 ENSG00000049089     COL9A2       0                     protein_coding
# 4        chr1 114790000 10.621649 ENSG00000052723      SIKE1   -9316                     protein_coding
# 8        chr2  94770000 10.106101 ENSG00000232502 AC073464.1   -8885               processed_pseudogene
# 9        chr2 120060000 14.049254 ENSG00000115109    EPB41L5       0                     protein_coding
# 10       chr2 136320000  9.732317 ENSG00000230037      UBBP1    9441               processed_pseudogene
# 11       chr2 139790000 12.763401 ENSG00000223554 AC078851.1   34896                             lncRNA
# 12       chr2 183880000  9.558082 ENSG00000286879 AC093639.3   17950                             lncRNA
# 13       chr3 155220000 10.502740 ENSG00000241336  LINC01487   20919                             lncRNA
# 14       chr6  11650000 10.497604 ENSG00000277473    RF00017   -5658                           misc_RNA
# 15       chr6  23720000 12.641160 ENSG00000219453 AL133270.1  -70227               processed_pseudogene
# 16       chr6  27980000  9.842610 ENSG00000216629     OR2W4P   -1926             unprocessed_pseudogene
# 18       chr6  47430000  8.693617 ENSG00000287485 AL451166.1  -32603                             lncRNA
# 19       chr6  52700000 10.751348 ENSG00000220377     GSTA8P       0             unprocessed_pseudogene
# 21       chr6  89180000 12.063463 ENSG00000146276     GABRR1       0                     protein_coding
# 22       chr6 100250000  9.109667 ENSG00000216378 AL080285.1  -71809               processed_pseudogene
# 23       chr6 131910000 11.138313 ENSG00000236673 AL117378.1       0                             lncRNA
# 26       chr7  27000000 10.095119 ENSG00000005020      SKAP2   -4762                     protein_coding
# 27       chr7  81800000 13.555833 ENSG00000019991        HGF  -29563                     protein_coding
# 31       chr9  42230000 13.681361 ENSG00000275649 AL445584.2    1811                             lncRNA
# 32       chr9  61260000  9.763546 ENSG00000275676 BX005040.1  -28138                             lncRNA
# 33       chr9  67240000 12.724401 ENSG00000276386  CNTNAP3P2       0             unprocessed_pseudogene
# 34      chr10 103200000  9.228262 ENSG00000076685      NT5C2   -6695                     protein_coding
# 35      chr11  11030000  9.771507 ENSG00000254401 AC111188.1       0                             lncRNA
# 37      chr11  68770000  9.247329 ENSG00000110090      CPT1A       0                     protein_coding
# 39      chr12  37500000 12.250643 ENSG00000258368    ZNF970P   75587             unprocessed_pseudogene
# 40      chr12  45670000 10.756353 ENSG00000273015 AC008124.1   48046                             lncRNA
# 41      chr13  41610000  8.957138 ENSG00000102763       VWA8       0                     protein_coding
# 43      chr13  82160000 10.861969 ENSG00000214182     PTMAP5 -468929   transcribed_processed_pseudogene
# 44      chr13  85320000  8.748368 ENSG00000226317  LINC00351   43601                             lncRNA
# 46      chr15  20630000 14.011334 ENSG00000258707 AC023310.2   -2636               processed_pseudogene
# 47      chr15  28630000 10.739942 ENSG00000206149    HERC2P9       0 transcribed_unprocessed_pseudogene
# 48      chr15  39720000 11.404134 ENSG00000150667      FSIP1       0                     protein_coding
# 51      chr16  50980000 13.905400 ENSG00000261637  LINC02127   37544                             lncRNA
# 52      chr16  55370000  9.306379 ENSG00000240760   RPL31P56  -16913               processed_pseudogene
# 54      chr17   4510000  9.494814 ENSG00000183018 AC118754.1       0                     protein_coding
# 55      chr17  10450000  8.651159 ENSG00000264424       MYH4       0                     protein_coding
# 57      chr19  58178808 15.256552 ENSG00000286632 AC008751.3   -1233                             lncRNA
# 58      chr20   2580000  9.976862 ENSG00000149488       TMC2       0                     protein_coding
# 59      chr20  12070000 11.734056 ENSG00000132640      BTBD3 -143392                     protein_coding





# ------------- Overall Fishers for overlap between g_unique and d_unique, Ix and g_unique, Ix and d_unique -----------------------------

# Sizes of the locus tables (all include CEN).
# NOTE(review): g and d must still be the locus data frames read at the top of
# the file when this runs; dim() of anything else will not give rows x columns.

# non-unique growth genes
dim(g)
# [1] 1836   50


# non-unique paclitaxel genes
dim(d)
# [1] 97 50


# unique growth genes
(g_num <- nrow(g_unique))
# [1] 859 <<<<<<<<<<<<<<<<<<<< use in paper


# unique paclitaxel genes
(d_num <- nrow(d_unique))
# [1] 38 <<<<<<<<<<<<<<<<<<<< use in paper


# Ix genes
(Ix_num <- nrow(Ix))
# [1] 62 <<<<<<<<<<<<<<<<<<<< use in paper


# Loci shared by d_unique and g_unique (both including CEN),
# counted from the d_unique side
(g_d_ovlp <- nrow(d_unique[
  with(d_unique, paste(Chromosome, ensembl_gene_id, sep = "\r")) %in%
    with(g_unique, paste(Chromosome, ensembl_gene_id, sep = "\r")),
]))
# [1] 10 <<<<<<<<<<<<<<<<<<<< use in paper


# the same overlap counted from the g_unique side, as a cross-check
nrow(g_unique[
  with(g_unique, paste(Chromosome, ensembl_gene_id, sep = "\r")) %in%
    with(d_unique, paste(Chromosome, ensembl_gene_id, sep = "\r")),
])
# [1] 10 <<<<<<<<<<<<<<<<<<<< same result


# <<<<<<<<<<< use in paper >>>>>>>>>>>>>>>>>>>>>>>>
# The shared loci as they appear in g_unique:
# many overlaps (7 out of 10) of unique g and unique d are due to conc 75 nM
g_unique[
  with(g_unique, paste(Chromosome, ensembl_gene_id, sep = "\r")) %in%
    with(d_unique, paste(Chromosome, ensembl_gene_id, sep = "\r")),
  c("Chromosome", "conc", "log10P", "geneSymbol", "dist", "ensembl_gene_id")
]
    # Chromosome conc    log10P geneSymbol    dist ensembl_gene_id
# 191       chr3    0 14.664971      LSAMP       0 ENSG00000185565
# 354       chr6   75 42.007826    KHDRBS2       0 ENSG00000112232
# 385       chr7   75 19.223579 AC074389.2     654 ENSG00000231476
# 398       chr7   75 21.176183     SEMA3D       0 ENSG00000153993
# 506      chr10    0 22.879916    TBC1D12       0 ENSG00000108239
# 538      chr11   75 22.464474        CEN       0           cen11
# 587      chr12   75 18.175422      AK6P1 -210043 ENSG00000256614
# 661      chr15    0  9.705184  RN7SL584P   -3972 ENSG00000239471
# 771      chr19   75 20.278043    GATAD2A       0 ENSG00000167491
# 827       chrX   75 19.221598        CEN       0            cenX
# <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<




# Ix loci (including CEN) that are also among the 859 unique growth loci (including CEN)
(Ix_g_ovlp <- nrow(Ix[
  with(Ix, paste(Chromosome, ensembl_gene_id, sep = "\r")) %in%
    with(g_unique, paste(Chromosome, ensembl_gene_id, sep = "\r")),
]))
# [1] 15 <<<<<<<<<<<<<<<<<<<< use in paper


# Ix loci (including CEN) that are also among the 38 unique paclitaxel loci (including CEN)
(Ix_d_ovlp <- nrow(Ix[
  with(Ix, paste(Chromosome, ensembl_gene_id, sep = "\r")) %in%
    with(d_unique, paste(Chromosome, ensembl_gene_id, sep = "\r")),
]))
# [1] 14 <<<<<<<<<<<<<<<<<<<< use in paper


# Pairwise 2x2 Fisher's exact tests on the set overlaps.
# gene_num is total number genes in GENCODE v31, including CEN, see above.
# Each matrix is filled column-wise as
#   (overlap, first set only, second set only, in neither set).

# overlap g and d
fisher.test(
  matrix(
    c(
      g_d_ovlp,
      g_num - g_d_ovlp,
      d_num - g_d_ovlp,
      gene_num - (g_num - g_d_ovlp) - (d_num - g_d_ovlp) - g_d_ovlp
    ),
    2, 2
  )
)

  
	# Fisher's Exact Test for Count Data <<<<<<<<<<<<<<<<<<< use in paper

# data:  
# p-value = 1.023e-10
# alternative hypothesis: true odds ratio is not equal to 1
# 95 percent confidence interval:
 # 10.85312 53.48333
# sample estimates:
# odds ratio 
  # 25.12388 




# exact p-value, g vs d
fisher.test(
  matrix(
    c(
      g_d_ovlp,
      g_num - g_d_ovlp,
      d_num - g_d_ovlp,
      gene_num - (g_num - g_d_ovlp) - (d_num - g_d_ovlp) - g_d_ovlp
    ),
    2, 2
  )
)[["p.value"]]
# [1] 1.022859e-10





# overlap Ix and g
fisher.test(
  matrix(
    c(
      Ix_g_ovlp,
      g_num - Ix_g_ovlp,
      Ix_num - Ix_g_ovlp,
      gene_num - (g_num - Ix_g_ovlp) - (Ix_num - Ix_g_ovlp) - Ix_g_ovlp
    ),
    2, 2
  )
)

 
	# Fisher's Exact Test for Count Data <<<<<<<<<<<<< use in paper

# data:  
# p-value = 8.282e-15
# alternative hypothesis: true odds ratio is not equal to 1
# 95 percent confidence interval:
 # 11.67759 41.28613
# sample estimates:
# odds ratio 
  # 22.57287




# exact p-value, Ix vs g
fisher.test(
  matrix(
    c(
      Ix_g_ovlp,
      g_num - Ix_g_ovlp,
      Ix_num - Ix_g_ovlp,
      gene_num - (g_num - Ix_g_ovlp) - (Ix_num - Ix_g_ovlp) - Ix_g_ovlp
    ),
    2, 2
  )
)[["p.value"]]
# [1] 8.281682e-15 <<<<<<<<<<<<<<<< use in paper






# overlap Ix and d
fisher.test(
  matrix(
    c(
      Ix_d_ovlp,
      d_num - Ix_d_ovlp,
      Ix_num - Ix_d_ovlp,
      gene_num - (d_num - Ix_d_ovlp) - (Ix_num - Ix_d_ovlp) - Ix_d_ovlp
    ),
    2, 2
  )
)
 
  
	# Fisher's Exact Test for Count Data <<<<<<<<<<<<< use in paper

# data:  
# p-value < 2.2e-16
# alternative hypothesis: true odds ratio is not equal to 1
# 95 percent confidence interval:
  # 330.813 1600.000
# sample estimates:
# odds ratio 
  # 730.8746




# exact p-value, Ix vs d
fisher.test(
  matrix(
    c(
      Ix_d_ovlp,
      d_num - Ix_d_ovlp,
      Ix_num - Ix_d_ovlp,
      gene_num - (d_num - Ix_d_ovlp) - (Ix_num - Ix_d_ovlp) - Ix_d_ovlp
    ),
    2, 2
  )
)[["p.value"]]
# [1] 2.661419e-33 <<<<<<<<<<<<<<<< use in paper





# -------------- Overlap in Ix and g_unique and d_unique genes, non-centromeric -----------------------
# These calculations can be ignored, except for the number of non-cen paclitaxel loci.

# Only non-centromeric genes are tallied here, because gencode does not have CEN.

gencode_gtf_ensembl_ucsc <- read.delim(
  "gencode_gtf_ensembl_ucsc_v31.txt",
  header = TRUE,
  sep = "\t",
  stringsAsFactors = FALSE,
  check.names = FALSE
)

dim(gencode_gtf_ensembl_ucsc)
# [1] 60603    19

# membership flags, 0 = gene not in the set
gencode_gtf_ensembl_ucsc[["growth_uniq"]] <- 0
gencode_gtf_ensembl_ucsc[["paclitaxel_uniq"]] <- 0
gencode_gtf_ensembl_ucsc[["Ix"]] <- 0


# set a flag to 1 where the gene's Chromosome + gene_id matches a locus of the set
gencode_gtf_ensembl_ucsc$growth_uniq[
  paste(gencode_gtf_ensembl_ucsc$Chromosome, gencode_gtf_ensembl_ucsc$gene_id, sep = "\r") %in%
    with(g_unique, paste(Chromosome, ensembl_gene_id, sep = "\r"))
] <- 1
gencode_gtf_ensembl_ucsc$paclitaxel_uniq[
  paste(gencode_gtf_ensembl_ucsc$Chromosome, gencode_gtf_ensembl_ucsc$gene_id, sep = "\r") %in%
    with(d_unique, paste(Chromosome, ensembl_gene_id, sep = "\r"))
] <- 1
gencode_gtf_ensembl_ucsc$Ix[
  paste(gencode_gtf_ensembl_ucsc$Chromosome, gencode_gtf_ensembl_ucsc$gene_id, sep = "\r") %in%
    with(Ix, paste(Chromosome, ensembl_gene_id, sep = "\r"))
] <- 1


# growth_uniq x Ix cross-tabulation of the GENCODE genes
table(gencode_gtf_ensembl_ucsc[c("growth_uniq", "Ix")])
           # Ix
# growth_uniq     0     1
          # 0 59703    45
          # 1   843    12 



# number non-cen Ix loci (total of the second column, Ix == 1)
colSums(table(gencode_gtf_ensembl_ucsc[c("growth_uniq", "Ix")]))[[2]]
# [1] 57 


          
# number non-cen growth loci (total of the second row, growth_uniq == 1)
rowSums(table(gencode_gtf_ensembl_ucsc[c("growth_uniq", "Ix")]))[[2]]
# [1] 855 




# paclitaxel_uniq x Ix cross-tabulation of the GENCODE genes
table(gencode_gtf_ensembl_ucsc[c("paclitaxel_uniq", "Ix")])
               # Ix
# paclitaxel_uniq     0     1
              # 0 60522    47
              # 1    24    10 


# number non-cen Ix loci (total of the second column, Ix == 1)
colSums(table(gencode_gtf_ensembl_ucsc[c("paclitaxel_uniq", "Ix")]))[[2]]
# [1] 57 



# number non-cen paclitaxel loci (total of the second row, paclitaxel_uniq == 1)
rowSums(table(gencode_gtf_ensembl_ucsc[c("paclitaxel_uniq", "Ix")]))[[2]]
# [1] 34 <<<<<<<<<<<< use in paper



# Fisher's exact test on the growth_uniq x Ix cross-tabulation of the GENCODE
# genes (non-centromeric only, since the GENCODE table has no CEN entries).
fisher.test(table(gencode_gtf_ensembl_ucsc[,c("growth_uniq","Ix")]))

	# Fisher's Exact Test for Count Data

# data:  table(gencode_gtf_ensembl_ucsc[, c("growth_uniq", "Ix")])
# p-value = 2.279e-11
# alternative hypothesis: true odds ratio is not equal to 1
# 95 percent confidence interval:
  # 9.055877 36.448795
# sample estimates:
# odds ratio 
  # 18.88151  
  
  
# same test, extracting the exact p-value
fisher.test(table(gencode_gtf_ensembl_ucsc[,c("growth_uniq","Ix")]))$p.value
# [1] 2.27929e-11






# Fisher's exact test on the paclitaxel_uniq x Ix cross-tabulation.
fisher.test(table(gencode_gtf_ensembl_ucsc[,c("paclitaxel_uniq","Ix")]))

	# Fisher's Exact Test for Count Data

# data:  table(gencode_gtf_ensembl_ucsc[, c("paclitaxel_uniq", "Ix")])
# p-value < 2.2e-16
# alternative hypothesis: true odds ratio is not equal to 1
# 95 percent confidence interval:
  # 215.9957 1302.7578
# sample estimates:
# odds ratio 
  # 526.3133


# same test, extracting the exact p-value
fisher.test(table(gencode_gtf_ensembl_ucsc[,c("paclitaxel_uniq","Ix")]))$p.value
# [1] 3.02552e-23





# ---------------- Comprehensive overlap tables for g_unique, d_unique and Ix, including CEN -------------------



# Working copies of the three locus tables; the originals are left untouched.
g_unique_bind <- g_unique
d_unique_bind <- d_unique
Ix_bind <- Ix

# Relabel concentration / week values so that every row carries a group label
# that is unique across the combined table.
# The relabelling is done by lookup rather than with
# `df[df$col == value, ]$col <- label`, because that form stops with an error
# as soon as one value has no matching row (zero-row replacement).  Values
# that are not in the lookup are left as they are.
g_unique_labels <- c(
  "0" = "g_unique_0nM",
  "8" = "g_unique_8nM",
  "25" = "g_unique_25nM",
  "75" = "g_unique_75nM",
  "avg" = "g_unique_avg"
)
g_unique_hit <- match(as.character(g_unique_bind$conc), names(g_unique_labels))
g_unique_bind$conc[!is.na(g_unique_hit)] <-
  unname(g_unique_labels[g_unique_hit[!is.na(g_unique_hit)]])

# weeks 3, 4, 6 and the average are relabelled, as before
d_unique_labels <- c(
  "3" = "d_unique_w3",
  "4" = "d_unique_w4",
  "6" = "d_unique_w6",
  "avg" = "d_unique_avg"
)
d_unique_hit <- match(as.character(d_unique_bind$wk), names(d_unique_labels))
d_unique_bind$wk[!is.na(d_unique_hit)] <-
  unname(d_unique_labels[d_unique_hit[!is.na(d_unique_hit)]])

# Give column 5 the same name in both tables so they can be stacked by name
# (the code below selects groups through wk_conc, so column 5 is taken to be
# the conc / wk column relabelled above).
colnames(g_unique_bind)[5] <- "wk_conc"
colnames(d_unique_bind)[5] <- "wk_conc"

# one long table: unique growth loci, unique paclitaxel loci, Ix loci
g_unique_and_d <- rbind(g_unique_bind, d_unique_bind, Ix_bind)


# Pairwise overlap between the groups of g_unique_and_d.
# Entry [i, j] = number of loci (rows) of group i whose Chromosome +
# ensembl_gene_id key also occurs in group j; the diagonal is the group size.

# no d_unique loci w1, w2
g_unique_and_d_unique_overlap_frac <- data.frame(
  "g 0nM" = numeric(), "g 8nM" = numeric(), "g 25nM" = numeric(),
  "g 75nM" = numeric(), "g avg" = numeric(),
  "d w3" = numeric(), "d w4" = numeric(), "d w6" = numeric(), "d avg" = numeric(),
  "Ix" = numeric(),
  check.names = FALSE
)

# group labels in wk_conc, in the same order as the columns above
k <- c(
  "g_unique_0nM", "g_unique_8nM", "g_unique_25nM", "g_unique_75nM", "g_unique_avg",
  "d_unique_w3", "d_unique_w4", "d_unique_w6", "d_unique_avg",
  "Ix"
)

# the loop bounds come from k, so labels and columns must line up one-to-one
stopifnot(length(k) == ncol(g_unique_and_d_unique_overlap_frac))

# locus keys of every group, computed once instead of once per (i, j) pair
g_unique_and_d_keys <- lapply(k, function(label) {
  group_rows <- g_unique_and_d[g_unique_and_d$wk_conc == label, ]
  paste(group_rows$Chromosome, group_rows$ensembl_gene_id, sep = "\r")
})

for (i in seq_along(k)) {
  for (j in seq_along(k)) {
    g_unique_and_d_unique_overlap_frac[i, j] <-
      sum(g_unique_and_d_keys[[i]] %in% g_unique_and_d_keys[[j]])
  }
}

row.names(g_unique_and_d_unique_overlap_frac) <- colnames(g_unique_and_d_unique_overlap_frac)

# same table with every row divided by its diagonal entry, i.e. the overlap as
# a fraction of the row group's size
g_unique_and_d_unique_overlap_dec <- g_unique_and_d_unique_overlap_frac
for (i in seq_along(k)) {
  g_unique_and_d_unique_overlap_dec[i, ] <-
    g_unique_and_d_unique_overlap_dec[i, ] / g_unique_and_d_unique_overlap_dec[i, i]
}



g_unique_and_d_unique_overlap_frac
       # g 0nM g 8nM g 25nM g 75nM g avg d w3 d w4 d w6 d avg Ix
# g 0nM    102     0      0      0     0    0    0    1     2  0
# g 8nM      0   261      0      0     0    0    0    0     0  0
# g 25nM     0     0    317      0     0    0    0    0     0  1
# g 75nM     0     0      0     40     0    0    0    7     0 13
# g avg      0     0      0      0   139    0    0    0     0  1
# d w3       0     0      0      0     0    3    0    0     0  0
# d w4       0     0      0      0     0    0    4    0     0  0
# d w6       1     0      0      7     0    0    0   25     0 14
# d avg      2     0      0      0     0    0    0    0     6  0
# Ix         0     0      1     13     1    0    0   14     0 62



g_unique_and_d_unique_overlap_dec
           # g 0nM g 8nM     g 25nM    g 75nM      g avg d w3 d w4        d w6      d avg          Ix
# g 0nM  1.0000000     0 0.00000000 0.0000000 0.00000000    0    0 0.009803922 0.01960784 0.000000000
# g 8nM  0.0000000     1 0.00000000 0.0000000 0.00000000    0    0 0.000000000 0.00000000 0.000000000
# g 25nM 0.0000000     0 1.00000000 0.0000000 0.00000000    0    0 0.000000000 0.00000000 0.003154574
# g 75nM 0.0000000     0 0.00000000 1.0000000 0.00000000    0    0 0.175000000 0.00000000 0.325000000
# g avg  0.0000000     0 0.00000000 0.0000000 1.00000000    0    0 0.000000000 0.00000000 0.007194245
# d w3   0.0000000     0 0.00000000 0.0000000 0.00000000    1    0 0.000000000 0.00000000 0.000000000
# d w4   0.0000000     0 0.00000000 0.0000000 0.00000000    0    1 0.000000000 0.00000000 0.000000000
# d w6   0.0400000     0 0.00000000 0.2800000 0.00000000    0    0 1.000000000 0.00000000 0.560000000
# d avg  0.3333333     0 0.00000000 0.0000000 0.00000000    0    0 0.000000000 1.00000000 0.000000000
# Ix     0.0000000     0 0.01612903 0.2096774 0.01612903    0    0 0.225806452 0.00000000 1.000000000








# Pairwise Fisher's exact tests of locus overlap between conditions.
# For every pair (i, j) a 2x2 table is built from the overlap count table:
#   shared   rows of condition i whose locus also occurs in condition j
#   only_j   remaining rows of condition j
#   only_i   remaining rows of condition i
#   neither  gene_num minus the three counts above
# gene_num is defined earlier in the file (presumably the total number of
# genes considered -- confirm against its definition).
#
# NOTE: the result tables are initialised with the string "NA", so every
# column is character and the assigned p-values / odds ratios are stored as
# text. This is kept as-is because the code that follows converts the values
# back with as.numeric() and the recorded listings show the text form.
g_and_d_fish_pval_unique <- as.data.frame(matrix("NA", ncol = ncol(g_unique_and_d_unique_overlap_frac), nrow = nrow(g_unique_and_d_unique_overlap_frac)),stringsAsFactors=FALSE)
colnames(g_and_d_fish_pval_unique) <- colnames(g_unique_and_d_unique_overlap_frac)
rownames(g_and_d_fish_pval_unique) <- rownames(g_unique_and_d_unique_overlap_frac)

# Same shape and labels for the odds ratios.
g_and_d_fish_odds_ratio_unique <- g_and_d_fish_pval_unique

# No log.p option for fisher.test

for(i in seq_len(nrow(g_unique_and_d_unique_overlap_frac))){
	for(j in seq_len(ncol(g_unique_and_d_unique_overlap_frac))){
		# Named "shared" rather than "union": this is the intersection, and the
		# old name also shadowed base::union.
		shared <- g_unique_and_d_unique_overlap_frac[i,j]
		only_j <- g_unique_and_d_unique_overlap_frac[j,j] - shared
		only_i <- g_unique_and_d_unique_overlap_frac[i,i] - shared
		neither <- gene_num - shared - only_j - only_i
		# Run the test once per cell and read both statistics from the same fit.
		fit <- fisher.test(matrix(c(shared,only_j,only_i,neither),2,2))
		g_and_d_fish_pval_unique[i,j] <- fit$p.value
		g_and_d_fish_odds_ratio_unique[i,j] <- fit$estimate[[1]]
	}
}
	







# Show the Fisher's exact test p-value for every pair of conditions. The
# values print as text because the table columns are character; the commented
# listing that follows is the recorded console output.
g_and_d_fish_pval_unique
                      # g 0nM             g 8nM            g 25nM                g 75nM             g avg                 d w3                 d w4                 d w6
# g 0nM                     0                 1                 1                     1                 1                    1                    1   0.0412308246082184
# g 8nM                     1                 0 0.650319737828759                     1                 1                    1                    1                    1
# g 25nM                    1 0.650319737828759                 0                     1                 1                    1                    1                    1
# g 75nM                    1                 1                 1 4.08203953758543e-144                 1                    1                    1 1.48815018979312e-17
# g avg                     1                 1                 1                     1                 0                    1                    1                    1
# d w3                      1                 1                 1                     1                 1 2.69274975886068e-14                    1                    1
# d w4                      1                 1                 1                     1                 1                    1 1.77671824809771e-18                    1
# d w6     0.0412308246082185                 1                 1  1.48815018979312e-17                 1                    1                    1 4.22982246776233e-95
# d avg  4.18592184858298e-05                 1                 1                     1                 1                    1                    1                    1
# Ix                        1                 1 0.277616353074987  4.08657509290991e-30 0.132712857140962                    1                    1 1.23869333118355e-36
                      # d avg                    Ix
# g 0nM  4.18592184858297e-05                     1
# g 8nM                     1                     1
# g 25nM                    1     0.277616353074987
# g 75nM                    1  4.08657509290991e-30
# g avg                     1     0.132712857140962
# d w3                      1                     1
# d w4                      1                     1
# d w6                      1  1.23869333118355e-36
# d avg   1.4503955585683e-26                     1
# Ix                        1 9.69604000439668e-212




# Show the matching odds-ratio estimates (Inf on the diagonal, where a
# condition is tested against itself); recorded console output follows.
g_and_d_fish_odds_ratio_unique
                  # g 0nM g 8nM           g 25nM           g 75nM            g avg d w3 d w4             d w6            d avg               Ix
# g 0nM               Inf     0                0                0                0    0    0 24.9304215491534 303.718905776364                0
# g 8nM                 0   Inf                0                0                0    0    0                0                0                0
# g 25nM                0     0              Inf                0                0    0    0                0                0 3.12537672925761
# g 75nM                0     0                0              Inf                0    0    0  710.64193213778                0 585.512692140646
# g avg                 0     0                0                0              Inf    0    0                0                0 7.17814058256903
# d w3                  0     0                0                0                0  Inf    0                0                0                0
# d w4                  0     0                0                0                0    0  Inf                0                0                0
# d w6   24.9304215491534     0                0  710.64193213778                0    0    0              Inf                0 1521.89398299837
# d avg  303.718905776365     0                0                0                0    0    0                0              Inf                0
# Ix                    0     0 3.12537672925761 585.512692140644 7.17814058256903    0    0 1521.89398299836                0              Inf



# Most significant spot comparison is g_unique d_unique overlap between g 75 nM and d w6, cf tables above.
# OR = 710.64193213778, P = 1.48815018979312e-17 <<<<<<<<<<<<<<<<< use in paper




# Overlap of g_unique at 75 nM versus the other g_unique/d_unique comparisons. However, the number of observations is too sparse -- do NOT use in paper.
# Use spot comparison above, instead.

# Welch two-sample t-test on the odds ratios of the four d conditions (rows 6:9 = d w3, d w4, d w6, d avg):
#   x = their odds ratios against g 75nM,
#   y = their odds ratios against the other g conditions (columns 1:3 and 5 = g 0nM, g 8nM, g 25nM, g avg).
# as.numeric() is needed because the odds-ratio table was initialised from a character matrix.
t.test(as.numeric(g_and_d_fish_odds_ratio_unique[6:9,"g 75nM"]), as.numeric(unlist(g_and_d_fish_odds_ratio_unique[6:9,c(1:3,5)])))

	# Welch Two Sample t-test

# data:  as.numeric(g_and_d_fish_odds_ratio_unique[6:9, "g 75nM"]) and as.numeric(unlist(g_and_d_fish_odds_ratio_unique[6:9, c(1:3, 5)]))
# t = 0.8794, df = 3.0685, p-value = 0.4426
# alternative hypothesis: true difference in means is not equal to 0
# 95 percent confidence interval:
 # -404.3642  718.6040
# sample estimates:
# mean of x mean of y 
# 177.66048  20.54058 

# Summary statistics of the two groups tested above. sem() is defined outside
# this section (presumably the standard error of the mean -- confirm).
mean(as.numeric(g_and_d_fish_odds_ratio_unique[6:9,"g 75nM"]))
# [1] 177.6605


sem(as.numeric(g_and_d_fish_odds_ratio_unique[6:9,"g 75nM"]))
# [1] 177.6605


mean(as.numeric(unlist(g_and_d_fish_odds_ratio_unique[6:9,c(1:3,5)])))
# [1] 20.54058


sem(as.numeric(unlist(g_and_d_fish_odds_ratio_unique[6:9,c(1:3,5)])))
# [1] 18.94246




# Are ORs of Ix vs g (columns 1:5) significantly greater than ORs of Ix vs paclitaxel (columns 6:9)?
# compare() is a helper defined outside this section; per the recorded output below it reports a
# two-sided Welch two-sample t-test together with the mean, sem, sd and n of each input vector.

compare(as.numeric(g_and_d_fish_odds_ratio_unique["Ix",1:5]),as.numeric(g_and_d_fish_odds_ratio_unique["Ix",6:9]))

	# Welch Two Sample t-test

# data:  a and b
# t = -0.65666, df = 3.5663, p-value = 0.5513
# alternative hypothesis: true difference in means is not equal to 0
# 95 percent confidence interval:
 # -1421.2202   898.5997
# sample estimates:
# mean of x mean of y 
 # 119.1632  380.4735 

# [1] "exact P value = 0.551333520964833"
# [1] "mean of a = 119.163241890494"
# [1] "sem of a = 116.594814475268"
# [1] "sd of a = 260.713930990677"
# [1] "number in a = 5"
# [1] "mean of b = 380.47349574959"
# [1] "sem of b = 380.47349574959"
# [1] "sd of b = 760.94699149918"
# [1] "number in b = 4"












# Compare 15 loci in common between the 62 Ix loci and the 859 unique growth loci including centromeres
# Test if growth 75 nM has significantly higher overlap with Ix than the other growth conditions (columns 1:3 and 5).
# use var.equal=TRUE, rather than mu, because more accurate and also more conservative
# NOTE: the second sample is the single Ix vs g 75nM odds ratio, so the pooled variance comes
# entirely from the four values of the first sample (df = 3 in the recorded output).
t.test(as.numeric(g_and_d_fish_odds_ratio_unique["Ix",c(1:3,5)]),as.numeric(g_and_d_fish_odds_ratio_unique["Ix","g 75nM"]),var.equal = TRUE)

	# Two Sample t-test

# data:  as.numeric(g_and_d_fish_odds_ratio_unique["Ix", c(1:3, 5)]) and as.numeric(g_and_d_fish_odds_ratio_unique["Ix", "g 75nM"])
# t = -153.19, df = 3, p-value = 6.134e-07
# alternative hypothesis: true difference in means is not equal to 0
# 95 percent confidence interval:
 # -595.0470 -570.8266
# sample estimates:
 # mean of x  mean of y 
  # 2.575879 585.512692 


# Same test again, extracting only the unrounded p-value.
t.test(as.numeric(g_and_d_fish_odds_ratio_unique["Ix",c(1:3,5)]),as.numeric(g_and_d_fish_odds_ratio_unique["Ix","g 75nM"]),var.equal = TRUE)$p.value
# [1] 6.133558e-07 

# Summary statistics of the two groups tested above. sem() is defined outside
# this section (presumably the standard error of the mean -- confirm).
mean(as.numeric(g_and_d_fish_odds_ratio_unique["Ix",c(1:3,5)]))
# [1] 2.575879 

sem(as.numeric(g_and_d_fish_odds_ratio_unique["Ix",c(1:3,5)]))
# [1] 1.70179 

mean(as.numeric(g_and_d_fish_odds_ratio_unique["Ix","g 75nM"]))
# [1] 585.5127 

# Only one value in this group, hence the NA recorded below.
sem(as.numeric(g_and_d_fish_odds_ratio_unique["Ix","g 75nM"]))
# [1] NA






# ------------------------ Ix CEN ---------------------------


# Number of Ix loci (rows) and number of columns in the Ix table.
dim(Ix)
# [1] 62 50 <<<<<<<<<<<<<< use in paper


# Number of Ix loci whose geneSymbol is "CEN" (centromeres).
dim(Ix[Ix$geneSymbol=="CEN",])
# [1]  5 50 <<<<<<<<<<<< use in paper

# Ix CEN
# List position, significance and identifiers of the centromeric Ix loci.
Ix[Ix$geneSymbol=="CEN",c("Chromosome","pos","log10P","wk_conc","ensembl_gene_id","geneSymbol")]
   # Chromosome      pos    log10P wk_conc ensembl_gene_id geneSymbol <<<<<<<<<<<<<< use in paper
# 36      chr11 54270000 19.636416      Ix           cen11        CEN
# 45      chr15 16980000 16.578741      Ix           cen15        CEN
# 50      chr16 36690000  9.665248      Ix           cen16        CEN
# 60      chr20 29950000 13.012052      Ix           cen20        CEN
# 62       chrX 58750000 22.030698      Ix            cenX        CEN



# Do Ix CEN and Ix non-CEN have different logP vals? No.
# First argument: log10P of the centromeric Ix loci; second: log10P of all other Ix loci.
# compare() is a helper defined outside this section; per the recorded output below it reports
# a Welch two-sample t-test plus the mean, sem, sd and n of each input.

compare(Ix[Ix$geneSymbol == "CEN",c("log10P")], Ix[Ix$geneSymbol != "CEN",c("log10P")])

	# Welch Two Sample t-test <<<<<<<<<<<<<< use in paper

# data:  a and b
# t = 1.8602, df = 4.3352, p-value = 0.1308
# alternative hypothesis: true difference in means is not equal to 0
# 95 percent confidence interval:
 # -1.890604 10.328508
# sample estimates:
# mean of x mean of y 
 # 16.18463  11.96568 

# [1] "exact P value = 0.130820484727277"
# [1] "mean of a = 16.1846309693915"
# [1] "sem of a = 2.22277146820476"
# [1] "sd of a = 4.97026810135285"
# [1] "number in a = 5"
# [1] "mean of b = 11.965678862287"
# [1] "sem of b = 0.450755267975524"
# [1] "sd of b = 3.4031276440413"
# [1] "number in b = 57"





# --------------- Venn diagram ---------------------------


# source("http://www.bioconductor.org/biocLite.R")

# class(biocLite)
# # [1] "function"

# biocLite("limma")


# Above not needed once limma downloaded. Simply use
# library(limma)



# to make vennCount work
# Load the gene annotation table (tab-delimited, header row, column names kept verbatim).
# It supplies the full set of rows against which growth / paclitaxel / Ix membership is flagged below.

gencode_gtf_ensembl_ucsc <- read.delim("gencode_gtf_ensembl_ucsc_v31.txt",header=TRUE,sep="\t",stringsAsFactors=FALSE,check.names=FALSE)

# Sanity check of the table size (rows, columns).
dim(gencode_gtf_ensembl_ucsc)
# [1] 60603    19

# include CEN
# Append one placeholder annotation row per centromere (chr1-chr22 and chrX) so that
# centromeric loci, which carry ids such as "cen11" in the loci tables, can be matched
# on (Chromosome, gene_id) in the same way as genes.
# Start from a zero-row copy so the placeholder rows have the annotation table's columns.
cen <- gencode_gtf_ensembl_ucsc[0,]
# Create 23 all-NA rows; only the fields assigned below are filled in.
cen[1:23,] <- NA
cen$Chromosome <- paste0("chr",c(1:22,"X"))
cen$gene_id <- paste0("cen",c(1:22,"X"))
cen$geneSymbol <- rep("CEN",23)
cen$gene_type <- rep("centromere",23)
cen$gene_description <- rep("centromere",23)


# Annotation table extended with the 23 centromere rows.
gencode_gtf_ensembl_ucsc_cen <- rbind(gencode_gtf_ensembl_ucsc,cen)

# Sanity check: 23 more rows than the original annotation table.
dim(gencode_gtf_ensembl_ucsc_cen)
# [1] 60626    19



# Flag every annotation row (genes plus centromeres) by whether its
# (Chromosome, gene id) pair occurs among the unique growth loci, the unique
# paclitaxel loci and the Ix loci. The key is the same for all three flags, so
# it is built once; %in% never returns NA, so each flag is strictly TRUE/FALSE.
annot_key <- paste(gencode_gtf_ensembl_ucsc_cen$Chromosome, gencode_gtf_ensembl_ucsc_cen$gene_id, sep = "\r")

gencode_gtf_ensembl_ucsc_cen$growth <- annot_key %in% paste(g_unique$Chromosome, g_unique$ensembl_gene_id, sep = "\r")
gencode_gtf_ensembl_ucsc_cen$paclitaxel <- annot_key %in% paste(d_unique$Chromosome, d_unique$ensembl_gene_id, sep = "\r")
gencode_gtf_ensembl_ucsc_cen$Ix <- annot_key %in% paste(Ix$Chromosome, Ix$ensembl_gene_id, sep = "\r")


# Keep only the three membership flags; this is the input for the Venn counts
# and the 2x2 tables further down.
venn <- gencode_gtf_ensembl_ucsc_cen[,c("growth","paclitaxel","Ix")]


# Peek at the first rows of the membership flags; recorded console output follows.
head(venn)
  # growth paclitaxel    Ix
# 1  FALSE      FALSE FALSE
# 2  FALSE      FALSE FALSE
# 3  FALSE      FALSE FALSE
# 4  FALSE      FALSE FALSE
# 5  FALSE      FALSE FALSE
# 6  FALSE      FALSE FALSE


# Count the annotation rows in each of the 8 growth / paclitaxel / Ix membership
# combinations (vennCounts comes from limma, loaded at the top of the file).
vennCount <- vennCounts(venn)

# Show the counts; recorded console output follows.
vennCount
  # growth paclitaxel Ix Counts
# 1      0          0  0  59699
# 2      0          0  1     40
# 3      0          1  0     21
# 4      0          1  1      7
# 5      1          0  0    841
# 6      1          0  1      8
# 7      1          1  0      3
# 8      1          1  1      7


# --------------- Make figure -------------------

# Draw the three-set Venn diagram (growth, paclitaxel, Ix) into venn_1.pdf on a 7.5 x 7.5 inch page.
pdf("venn_1.pdf",width=7.5,height=7.5,useDingbats=FALSE)
vennDiagram(vennCount,circle.col=c("red","blue","green3"),lwd=0.5,counts.col=c("blue"))
# Close the PDF device so the file is completed.
dev.off()

# -----------------------------------------------

# The venn table also makes the Ix vs g, Ix vs d and g vs d overlap calculations easier. All of these Venn calculations were also done above by hand, and all agree.


# growth vs paclitaxel

# 2x2 contingency table of the growth and paclitaxel membership flags over all annotation rows (genes plus centromeres).
table(venn[,c("growth","paclitaxel")])
       # paclitaxel
# growth  FALSE  TRUE
  # FALSE 59739    28
  # TRUE    849    10
  

# Fisher's exact test of association between the two flags.
fisher.test(table(venn[,c("growth","paclitaxel")]))

	# Fisher's Exact Test for Count Data

# data:  table(venn[, c("growth", "paclitaxel")])
# p-value = 1.023e-10
# alternative hypothesis: true odds ratio is not equal to 1
# 95 percent confidence interval:
 # 10.85312 53.48333
# sample estimates:
# odds ratio 
  # 25.12388 

# Same test, extracting only the unrounded p-value.
fisher.test(table(venn[,c("growth","paclitaxel")]))$p.value
# [1] 1.022859e-10


# Ix vs growth

# 2x2 contingency table of the growth and Ix membership flags over all annotation rows (genes plus centromeres).
table(venn[,c("growth","Ix")])
       # Ix
# growth  FALSE  TRUE
  # FALSE 59720    47
  # TRUE    844    15

# Fisher's exact test of association between the two flags.
fisher.test(table(venn[,c("growth","Ix")]))

	# Fisher's Exact Test for Count Data

# data:  table(venn[, c("growth", "Ix")])
# p-value = 8.282e-15
# alternative hypothesis: true odds ratio is not equal to 1
# 95 percent confidence interval:
 # 11.67759 41.28613
# sample estimates:
# odds ratio 
  # 22.57287 

# Same test, extracting only the unrounded p-value.
fisher.test(table(venn[,c("growth","Ix")]))$p.value
# [1] 8.281682e-15



# Ix vs paclitaxel

# 2x2 contingency table of the paclitaxel and Ix membership flags over all annotation rows (genes plus centromeres).
table(venn[,c("paclitaxel","Ix")])
          # Ix
# paclitaxel FALSE  TRUE
     # FALSE 60540    48
     # TRUE     24    14

# Fisher's exact test of association between the two flags.
fisher.test(table(venn[,c("paclitaxel","Ix")]))

	# Fisher's Exact Test for Count Data

# data:  table(venn[, c("paclitaxel", "Ix")])
# p-value < 2.2e-16
# alternative hypothesis: true odds ratio is not equal to 1
# 95 percent confidence interval:
  # 330.813 1600.000
# sample estimates:
# odds ratio 
  # 730.8746 

# The printed summary only reports "< 2.2e-16"; extract the exact p-value.
fisher.test(table(venn[,c("paclitaxel","Ix")]))$p.value
# [1] 2.661419e-33



# Ix vs growth OR paclitaxel

# 2x2 contingency table: rows = flagged for growth or paclitaxel (or both), columns = flagged for Ix.
table(c(venn$growth | venn$paclitaxel),venn$Ix)
       
        # FALSE  TRUE
  # FALSE 59699    40
  # TRUE    865    22
  
# Fisher's exact test of association between the combined flag and Ix.
fisher.test(table(c(venn$growth | venn$paclitaxel),venn$Ix))

	# Fisher's Exact Test for Count Data

# data:  table(c(venn$growth | venn$paclitaxel), venn$Ix)
# p-value < 2.2e-16
# alternative hypothesis: true odds ratio is not equal to 1
# 95 percent confidence interval:
 # 21.39318 65.79632
# sample estimates:
# odds ratio 
  # 37.94709 

# The printed summary only reports "< 2.2e-16"; extract the exact p-value.
fisher.test(table(c(venn$growth | venn$paclitaxel),venn$Ix))$p.value
# [1] 6.609756e-25



# Ix vs growth AND paclitaxel

# 2x2 contingency table: rows = flagged for both growth and paclitaxel, columns = flagged for Ix.
table(c(venn$growth & venn$paclitaxel),venn$Ix)

        # FALSE  TRUE
  # FALSE 60561    55
  # TRUE      3     7

# Fisher's exact test of association between the combined flag and Ix.
fisher.test(table(c(venn$growth & venn$paclitaxel),venn$Ix))
	# Fisher's Exact Test for Count Data

# data:  table(c(venn$growth & venn$paclitaxel), venn$Ix)
# p-value < 2.2e-16
# alternative hypothesis: true odds ratio is not equal to 1
# 95 percent confidence interval:
   # 563.7131 16384.0000
# sample estimates:
# odds ratio 
  # 2447.407 



# The printed summary only reports "< 2.2e-16"; extract the exact p-value.
fisher.test(table(c(venn$growth & venn$paclitaxel),venn$Ix))$p.value
# [1] 9.860528e-20



































