#calculations performed in text for g_d_overlap figure discussion


# Load the tab-delimited log10 P table (header row present, strings kept as
# character rather than converted to factors).
logP <- read.table(
  file = "log10P_human.txt",
  header = TRUE,
  sep = "\t",
  stringsAsFactors = FALSE
)



# Standard error of the mean of x, ignoring NA entries:
# sqrt(sample variance / number of non-missing values).
sem <- function(x) {
  n_obs <- sum(!is.na(x))
  sqrt(var(x, na.rm = TRUE) / n_obs)
}

# Print a two-sample t-test of a versus b, its P value at full precision, and
# the mean, SEM, SD and non-missing count of each sample.
# Relies on sem() from this file. The last printed string is returned invisibly.
compare <- function(a, b) {
  welch <- t.test(a, b)
  print(welch)

  print(paste0("exact P value = ", welch$p.value))

  n_a <- sum(!is.na(a))
  n_b <- sum(!is.na(b))

  print(paste0("mean of a = ", mean(a, na.rm = TRUE)))
  print(paste0("sem of a = ", sem(a)))
  print(paste0("sd of a = ", sd(a, na.rm = TRUE)))
  print(paste0("number in a = ", n_a))

  print(paste0("mean of b = ", mean(b, na.rm = TRUE)))
  print(paste0("sem of b = ", sem(b)))
  print(paste0("sd of b = ", sd(b, na.rm = TRUE)))
  print(paste0("number in b = ", n_b))
}





library(psych)

# Modified version of psych::corr.test, which provides more accurate P vals and even more accurate -log10P values:

# corr_test(): variant of psych::corr.test that additionally reports P values
# computed on the log scale (accurate for very small P) and -log10(P) values.
#
# Arguments:
#   x, y       data frame / matrix of variables. With y = NULL the columns of
#              x are correlated with each other; otherwise x against y.
#   use        missing-value handling passed to cor().
#   method     correlation method passed to cor().
#   adjust     p.adjust() method for multiple testing, or "none".
#   alpha      significance level used for the confidence intervals.
#   ci         if TRUE, compute Fisher-z confidence intervals.
#   minlength  minimum length given to abbreviate() when labelling CI rows.
#
# Side effects: prints the unadjusted P matrix and the -log10(P) matrix.
#
# Returns a list of class c("psych", "corr.test") with elements:
#   r, n, t    correlations, pairwise sample sizes (a scalar when constant),
#              and t statistics
#   p          two-sided P values; when y is NULL the upper triangle is
#              adjusted by `adjust`, otherwise every entry is adjusted
#   accurate_P the unadjusted two-sided P values (the matrix that is printed)
#   log10P     -log10 of the unadjusted two-sided P values
#   se, sef    standard error of r, and of Fisher z (NULL when ci = FALSE)
#   adjust, sym, ci, ci.adj, Call
corr_test <- function (x, y = NULL, use = "pairwise", method = "pearson", 
    adjust = "holm", alpha = 0.05, ci = TRUE, minlength = 5) 
{
    cl <- match.call()
    if (is.null(y)) {
        r <- cor(x, use = use, method = method)
        sym <- TRUE
        n <- t(!is.na(x)) %*% (!is.na(x))
    }
    else {
        r <- cor(x, y, use = use, method = method)
        sym <- FALSE
        n <- t(!is.na(x)) %*% (!is.na(y))
    }
    # Collapse n to a scalar when every pair has the same number of observations.
    if ((use == "complete") || (min(n) == max(n))) 
        n <- min(n)
    # NB: from here on `t` is the matrix of t statistics, not base::t().
    t <- (r * sqrt(n - 2))/sqrt(1 - r^2)
    # Two-sided P as -2 * expm1(log(lower tail)), i.e. 2 * (1 - pt(|t|)).
    p <- -2 * expm1(pt(abs(t), (n - 2), log.p = TRUE))
    # Keep the raw values: `p` is clipped and multiplicity-adjusted below, and
    # accurate_P must stay equal to what is printed here.
    accurate_p <- p
    print(paste0("Accurate P vals:"))
    print(p)
    # -log10(2 * upper tail), computed entirely on the log scale so that it
    # remains finite where the P value itself underflows to 0.
    log10P <- -(log(2) + pt(abs(t), (n - 2), log.p = TRUE, lower.tail = FALSE))/log(10)
    print(paste0("Even more accurate-log10P vals:"))
    print(log10P)
    se <- sqrt((1 - r * r)/(n - 2))
    nvar <- ncol(r)
    p[p > 1] <- 1
    if (adjust != "none") {
        if (is.null(y)) {
            # Symmetric case: only the upper triangle is adjusted.
            lp <- upper.tri(p)
            pa <- p[lp]
            pa <- p.adjust(pa, adjust)
            p[upper.tri(p, diag = FALSE)] <- pa
        }
        else {
            p[] <- p.adjust(p, adjust)
        }
    }
    z <- fisherz(r[lower.tri(r)])
    if (ci) {
        if (min(n) < 4) {
            warning("Number of subjects must be greater than 3 to find confidence intervals.")
        }
        if (sym) {
            ncors <- nvar * (nvar - 1)/2
        }
        else ncors <- prod(dim(r))
        if (adjust != "holm") {
            dif.corrected <- qnorm(1 - alpha/(2 * ncors))
        }
        else {
            ord <- order(abs(z), decreasing = FALSE)
            dif.corrected <- qnorm(1 - alpha/(2 * order(ord)))
        }
        # alpha is re-used from here on as the upper-tail quantile level.
        alpha <- 1 - alpha/2
        dif <- qnorm(alpha)
        if (sym) {
            if (is.matrix(n)) {
                sef <- 1/sqrt(n[lower.tri(n)] - 3)
            }
            else {
                sef <- 1/sqrt(n - 3)
            }
            lower <- fisherz2r(z - dif * sef)
            upper <- fisherz2r(z + dif * sef)
            lower.corrected <- fisherz2r(z - dif.corrected * 
                sef)
            upper.corrected <- fisherz2r(z + dif.corrected * 
                sef)
            ci <- data.frame(lower = lower, r = r[lower.tri(r)], 
                upper = upper, p = p[lower.tri(p)])
            ci.adj <- data.frame(lower.adj = lower.corrected, 
                upper.adj = upper.corrected)
            cnR <- abbreviate(colnames(r), minlength = minlength)
            k <- 1
            # seq_len() so that a single-column input yields no pair labels
            # instead of iterating over c(1, 0).
            for (i in seq_len(nvar - 1)) {
                for (j in (i + 1):nvar) {
                  rownames(ci)[k] <- paste(cnR[i], cnR[j], sep = "-")
                  k <- k + 1
                }
            }
        }
        else {
            n.x <- NCOL(x)
            n.y <- NCOL(y)
            z <- fisherz(r)
            if (adjust != "holm") {
                dif.corrected <- qnorm(1 - (1 - alpha)/(n.x * 
                  n.y))
            }
            else {
                ord <- order(abs(z), decreasing = FALSE)
                dif.corrected <- qnorm(1 - (1 - alpha)/(order(ord)))
            }
            sef <- 1/sqrt(n - 3)
            lower <- as.vector(fisherz2r(z - dif * sef))
            upper <- as.vector(fisherz2r(z + dif * sef))
            lower.corrected <- fisherz2r(z - dif.corrected * 
                sef)
            upper.corrected <- fisherz2r(z + dif.corrected * 
                sef)
            ci <- data.frame(lower = lower, r = as.vector(r), 
                upper = upper, p = as.vector(p))
            ci.adj <- data.frame(lower.adj = as.vector(lower.corrected), 
                r = as.vector(r), upper.adj = as.vector(upper.corrected))
            cnR <- abbreviate(rownames(r), minlength = minlength)
            cnC <- abbreviate(colnames(r), minlength = minlength)
            k <- 1
            for (i in 1:NCOL(y)) {
                for (j in 1:NCOL(x)) {
                  rownames(ci)[k] <- paste(cnR[j], cnC[i], sep = "-")
                  k <- k + 1
                }
            }
        }
    }
    else {
        ci <- sef <- ci.adj <- NULL
    }
    result <- list(r = r, n = n, t = t, p = p, accurate_P = accurate_p, log10P = log10P, se = se, sef = sef, 
        adjust = adjust, sym = sym, ci = ci, ci.adj = ci.adj, 
        Call = cl)
    class(result) <- c("psych", "corr.test")
    return(result)
}


#--------correlations (and P values) between logP values growth ------------
#-------- down sample every 1 Mb to remove autocorrelation ---------------

# Pearson correlation matrix of columns 10-15, using every 100th row and
# pairwise-complete observations.
cor(
  logP[seq(1, nrow(logP), by = 100), 10:15],
  use = "pairwise.complete.obs",
  method = "pearson"
)
             # log10p_d_w1 log10p_d_w2 log10p_d_w3 log10p_d_w4 log10p_d_w6 log10p_d_avg
# log10p_d_w1   1.00000000   0.4719453   0.1055700  0.01251672 -0.01926074   0.07511894
# log10p_d_w2   0.47194529   1.0000000   0.8337715  0.69836937  0.58894065   0.79857216
# log10p_d_w3   0.10557001   0.8337715   1.0000000  0.97334083  0.91196427   0.99788875
# log10p_d_w4   0.01251672   0.6983694   0.9733408  1.00000000  0.97424496   0.98591782
# log10p_d_w6  -0.01926074   0.5889406   0.9119643  0.97424496  1.00000000   0.93257394
# log10p_d_avg  0.07511894   0.7985722   0.9978888  0.98591782  0.93257394   1.00000000


# corr.test gives quick R values and P values for a data frame, but prints them to only 2 decimal places. For more accurate correlations use cor(); for P values use cor.test() on selected columns, or use the results from chart.Correlation() (shown in the figure in the paper).

# psych::corr.test() on every 100th row of columns 10-15. adjust = "none"
# turns off the multiple-testing correction. The printed "Call:" line echoes
# this expression as written, so its spelling affects the output.
corr.test(logP[seq(1,nrow(logP),by=100),c(10:15)], method="pearson",use = "pairwise.complete.obs", adjust = "none")
# Call:corr.test(x = logP[seq(1, nrow(logP), by = 100), c(10:15)], use = "pairwise.complete.obs", 
    # method = "pearson", adjust = "none")
# Correlation matrix 
             # log10p_d_w1 log10p_d_w2 log10p_d_w3 log10p_d_w4 log10p_d_w6 log10p_d_avg
# log10p_d_w1         1.00        0.47        0.11        0.01       -0.02         0.08
# log10p_d_w2         0.47        1.00        0.83        0.70        0.59         0.80
# log10p_d_w3         0.11        0.83        1.00        0.97        0.91         1.00
# log10p_d_w4         0.01        0.70        0.97        1.00        0.97         0.99
# log10p_d_w6        -0.02        0.59        0.91        0.97        1.00         0.93
# log10p_d_avg        0.08        0.80        1.00        0.99        0.93         1.00
# Sample Size 
# [1] 3054
# Probability values (Entries above the diagonal are adjusted for multiple tests.) 
             # log10p_d_w1 log10p_d_w2 log10p_d_w3 log10p_d_w4 log10p_d_w6 log10p_d_avg
# log10p_d_w1         0.00           0           0        0.49        0.29            0
# log10p_d_w2         0.00           0           0        0.00        0.00            0
# log10p_d_w3         0.00           0           0        0.00        0.00            0
# log10p_d_w4         0.49           0           0        0.00        0.00            0
# log10p_d_w6         0.29           0           0        0.00        0.00            0
# log10p_d_avg        0.00           0           0        0.00        0.00            0

 # To see confidence intervals of the correlations, print with the short=FALSE option
 


# Same rows and columns through the local corr_test() defined earlier in this
# file, which prints the accurate P values and -log10 P values before the
# result is auto-printed. adjust = "none": no multiple-testing correction.
corr_test(logP[seq(1,nrow(logP),by=100),c(10:15)], method="pearson",use = "pairwise.complete.obs", adjust = "none")
# [1] "Accurate P vals:"
               # log10p_d_w1   log10p_d_w2  log10p_d_w3 log10p_d_w4   log10p_d_w6 log10p_d_avg
# log10p_d_w1   0.000000e+00 3.132861e-169 4.972924e-09   0.4892794  2.872972e-01 3.246177e-05
# log10p_d_w2  3.132861e-169  0.000000e+00 0.000000e+00   0.0000000 1.252862e-284 0.000000e+00
# log10p_d_w3   4.972924e-09  0.000000e+00 0.000000e+00   0.0000000  0.000000e+00 0.000000e+00
# log10p_d_w4   4.892794e-01  0.000000e+00 0.000000e+00   0.0000000  0.000000e+00 0.000000e+00
# log10p_d_w6   2.872972e-01 1.252862e-284 0.000000e+00   0.0000000  0.000000e+00 0.000000e+00
# log10p_d_avg  3.246177e-05  0.000000e+00 0.000000e+00   0.0000000  0.000000e+00 0.000000e+00
# [1] "Even more accurate-log10P vals:"
             # log10p_d_w1 log10p_d_w2 log10p_d_w3 log10p_d_w4  log10p_d_w6 log10p_d_avg
# log10p_d_w1          Inf    168.5041    8.303388    0.310443    0.5416686     4.488628
# log10p_d_w2  168.5040589         Inf  789.100006  444.976039  283.9020966   674.636495
# log10p_d_w3    8.3033882    789.1000         Inf 1953.508221 1182.7126838  3625.920638
# log10p_d_w4    0.3104430    444.9760 1953.508221         Inf 1976.0712094  2372.275081
# log10p_d_w6    0.5416686    283.9021 1182.712684 1976.071209          Inf  1352.375180
# log10p_d_avg   4.4886278    674.6365 3625.920638 2372.275081 1352.3751799          Inf
# Call:corr_test(x = logP[seq(1, nrow(logP), by = 100), c(10:15)], use = "pairwise.complete.obs", 
    # method = "pearson", adjust = "none")
# Correlation matrix 
             # log10p_d_w1 log10p_d_w2 log10p_d_w3 log10p_d_w4 log10p_d_w6 log10p_d_avg
# log10p_d_w1         1.00        0.47        0.11        0.01       -0.02         0.08
# log10p_d_w2         0.47        1.00        0.83        0.70        0.59         0.80
# log10p_d_w3         0.11        0.83        1.00        0.97        0.91         1.00
# log10p_d_w4         0.01        0.70        0.97        1.00        0.97         0.99
# log10p_d_w6        -0.02        0.59        0.91        0.97        1.00         0.93
# log10p_d_avg        0.08        0.80        1.00        0.99        0.93         1.00
# Sample Size 
# [1] 3054
# Probability values (Entries above the diagonal are adjusted for multiple tests.) 
             # log10p_d_w1 log10p_d_w2 log10p_d_w3 log10p_d_w4 log10p_d_w6 log10p_d_avg
# log10p_d_w1         0.00           0           0        0.49        0.29            0
# log10p_d_w2         0.00           0           0        0.00        0.00            0
# log10p_d_w3         0.00           0           0        0.00        0.00            0
# log10p_d_w4         0.49           0           0        0.00        0.00            0
# log10p_d_w6         0.29           0           0        0.00        0.00            0
# log10p_d_avg        0.00           0           0        0.00        0.00            0

 # To see confidence intervals of the correlations, print with the short=FALSE option


#-------------worst and insignificant R value ---------------

# between d_wk1 and d_wk4

# Pearson test between week 1 and week 4 on every 100th row.
# cor.test() takes no `use` argument (it would be swallowed by `...`); it
# drops incomplete pairs itself, so none is passed here.
cor.test(logP[seq(1,nrow(logP),by=100),"log10p_d_w1"], logP[seq(1,nrow(logP),by=100),"log10p_d_w4"],method="pearson")

	# Pearson's product-moment correlation <<<<<<<<<<<< mention n.s. in paper

# data:  logP[seq(1, nrow(logP), by = 100), "log10p_d_w1"] and logP[seq(1, nrow(logP), by = 100), "log10p_d_w4"]
# t = 0.69154, df = 3052, p-value = 0.4893
# alternative hypothesis: true correlation is not equal to 0
# 95 percent confidence interval:
 # -0.02296214  0.04796408
# sample estimates:
       # cor 
# 0.01251672  

# Full-precision P value of the week 1 vs week 4 Pearson test.
# cor.test() takes no `use` argument; it drops incomplete pairs itself.
cor.test(logP[seq(1,nrow(logP),by=100),"log10p_d_w1"], logP[seq(1,nrow(logP),by=100),"log10p_d_w4"],method="pearson")$p.value
# [1] 0.4892794




# Even worse,  between d_wk1 and d_wk6

# Pearson test between week 1 and week 6 on every 100th row.
# cor.test() takes no `use` argument (it would be swallowed by `...`); it
# drops incomplete pairs itself, so none is passed here.
cor.test(logP[seq(1,nrow(logP),by=100),"log10p_d_w1"], logP[seq(1,nrow(logP),by=100),"log10p_d_w6"],method="pearson")

	# Pearson's product-moment correlation <<<<<<<<<<<< mention n.s. in paper

# data:  logP[seq(1, nrow(logP), by = 100), "log10p_d_w1"] and logP[seq(1, nrow(logP), by = 100), "log10p_d_w6"]
# t = -1.0643, df = 3052, p-value = 0.2873
# alternative hypothesis: true correlation is not equal to 0
# 95 percent confidence interval:
 # -0.05469204  0.01621900
# sample estimates:
        # cor 
# -0.01926074 


# Full-precision P value of the week 1 vs week 6 Pearson test.
# cor.test() takes no `use` argument; it drops incomplete pairs itself.
cor.test(logP[seq(1,nrow(logP),by=100),"log10p_d_w1"], logP[seq(1,nrow(logP),by=100),"log10p_d_w6"],method="pearson")$p.value
# [1] 0.2872972



# ------------ Least significant of significant P values ---------------------



# cor.test() with its default settings between the week 1 and average columns
# on every 100th row. The printed "data:" line echoes both argument
# expressions as written.
cor.test(logP[seq(1,nrow(logP),by=100),"log10p_d_w1"],logP[seq(1,nrow(logP),by=100),"log10p_d_avg"])

# # 	Pearson's product-moment correlation <<<<<<<<<<<< use in paper

# data:  logP[seq(1, nrow(logP), by = 100), "log10p_d_w1"] and logP[seq(1, nrow(logP), by = 100), "log10p_d_avg"]
# t = 4.1617, df = 3052, p-value = 3.246e-05
# alternative hypothesis: true correlation is not equal to 0
# 95 percent confidence interval:
 # 0.03975621 0.11029374
# sample estimates:
       # cor 
# 0.07511894 


# ---------------- mean R --------------------------------


# Run corr_test() once and reuse its pairwise correlations ($ci$r holds one
# value per distinct pair of columns) for both the mean and its SEM, so the
# diagnostic matrices that corr_test() prints appear only once.
pairwise_r <- corr_test(logP[seq(1,nrow(logP),by=100),c(10:15)], method="pearson",use = "pairwise.complete.obs", adjust = "none")$ci$r

ans_mean <- mean(pairwise_r)

ans_mean
# [1] 0.622765 <<<<<<<<<< use in paper


sem_mean <- sem(pairwise_r)

sem_mean
# [1] 0.1013176 <<<<<<<<<< use in paper






#--------overlap between GWAS significant growth loci------------

# ***** For full combined analysis of g and d together, see g_d_comb_fish_1.R **********
# 60603 is total number of genes, cr and nc, according to GENCODE v31. Also see dim(gencode_gtf_ensembl_ucsc) in peak_g_1.R, which is 60603.


# Standard error of the mean of x, ignoring NA entries. This repeats the
# definition given near the top of this file.
sem <- function(x) {
  n_obs <- sum(!is.na(x))
  sqrt(var(x, na.rm = TRUE) / n_obs)
}

# Print a two-sample t-test of a versus b, its P value at full precision, and
# the mean, SEM, SD and non-missing count of each sample. This repeats the
# definition given near the top of this file. Relies on sem().
# The last printed string is returned invisibly.
compare <- function(a, b) {
  welch <- t.test(a, b)
  print(welch)

  print(paste0("exact P value = ", welch$p.value))

  n_a <- sum(!is.na(a))
  n_b <- sum(!is.na(b))

  print(paste0("mean of a = ", mean(a, na.rm = TRUE)))
  print(paste0("sem of a = ", sem(a)))
  print(paste0("sd of a = ", sd(a, na.rm = TRUE)))
  print(paste0("number in a = ", n_a))

  print(paste0("mean of b = ", mean(b, na.rm = TRUE)))
  print(paste0("sem of b = ", sem(b)))
  print(paste0("sd of b = ", sd(b, na.rm = TRUE)))
  print(paste0("number in b = ", n_b))
}



# Load the five tab-delimited loci tables. Column names are kept exactly as
# written in the files (check.names = FALSE) and strings stay character.
g <- read.delim(
  file = "growth_loci.txt",
  header = TRUE, sep = "\t", stringsAsFactors = FALSE, check.names = FALSE
)
d <- read.delim(
  file = "paclitaxel_loci.txt",
  header = TRUE, sep = "\t", stringsAsFactors = FALSE, check.names = FALSE
)
g_unique <- read.delim(
  file = "growth_loci_unique.txt",
  header = TRUE, sep = "\t", stringsAsFactors = FALSE, check.names = FALSE
)
d_unique <- read.delim(
  file = "paclitaxel_loci_unique.txt",
  header = TRUE, sep = "\t", stringsAsFactors = FALSE, check.names = FALSE
)
Ix <- read.delim(
  file = "Ix_loci.txt",
  header = TRUE, sep = "\t", stringsAsFactors = FALSE, check.names = FALSE
)

# Work on copies so the tables as read from disk stay untouched.
g_bind <- g
d_bind <- d
Ix_bind <- Ix

# Recode the concentration / week values into labelled strings.
# The column is indexed directly (`$conc[rows]`) instead of through a row
# subset of the frame (`[rows, ]$conc`): the latter errors when no row
# matches. which() also drops any NA comparisons.
g_bind$conc[which(g_bind$conc == 0)] <- "g_0nM"
g_bind$conc[which(g_bind$conc == 8)] <- "g_8nM"
g_bind$conc[which(g_bind$conc == 25)] <- "g_25nM"
g_bind$conc[which(g_bind$conc == 75)] <- "g_75nM"
g_bind$conc[which(g_bind$conc == "avg")] <- "g_avg"


d_bind$wk[which(d_bind$wk == 2)] <- "d_w2"
d_bind$wk[which(d_bind$wk == 3)] <- "d_w3"
d_bind$wk[which(d_bind$wk == 4)] <- "d_w4"
d_bind$wk[which(d_bind$wk == 6)] <- "d_w6"
d_bind$wk[which(d_bind$wk == "avg")] <- "d_avg"

# Give both tables a common name for their 5th column so they can be stacked.
# NOTE(review): assumes column 5 is `conc` in g and `wk` in d - confirm
# against the input files.
colnames(g_bind)[5] <- "wk_conc"
colnames(d_bind)[5] <- "wk_conc"

# rbind() requires identical column names across the three tables.
g_and_d <- rbind(g_bind,d_bind,Ix_bind)



# no significant loci wk 1:

# Pairwise overlap of loci between the d groups.
# A locus is identified by its (Chromosome, ensembl_gene_id) pair.
#   d_overlap_frac[i, j]: number of rows in group i whose locus also occurs
#                         in group j (the diagonal is the size of group i).
#   d_overlap_dec[i, j]:  the same count divided by the size of group i.
# Rows whose wk_conc is NA belong to no group.

k <- c("d_w2", "d_w3", "d_w4", "d_w6", "d_avg")
overlap_labels <- c("d wk2", "d wk3", "d wk4", "d wk6", "d avg")
n_groups <- length(k)

# Build each locus key once, then split the keys by group.
locus_key <- paste(g_and_d[["Chromosome"]], g_and_d[["ensembl_gene_id"]], sep = "\r")
keys_by_group <- lapply(k, function(label) locus_key[which(g_and_d$wk_conc == label)])

overlap_counts <- matrix(0, nrow = n_groups, ncol = n_groups,
	dimnames = list(overlap_labels, overlap_labels))

for (i in seq_len(n_groups)) {
	for (j in seq_len(n_groups)) {
		overlap_counts[i, j] <- sum(keys_by_group[[i]] %in% keys_by_group[[j]])
	}
}

# as.data.frame() on a matrix keeps the row and column labels as given.
d_overlap_frac <- as.data.frame(overlap_counts)

# Divide every row by its own diagonal entry (the group size).
d_overlap_dec <- as.data.frame(sweep(overlap_counts, 1, diag(overlap_counts), "/"))


d_overlap_frac
      # d wk2 d wk3 d wk4 d wk6 d avg
# d wk2     2     2     2     2     2
# d wk3     2    26    17    15    23
# d wk4     2    17    19    15    18
# d wk6     2    15    15    25    16
# d avg     2    23    18    16    25


d_overlap_dec
           # d wk2     d wk3     d wk4     d wk6     d avg
# d wk2 1.00000000 1.0000000 1.0000000 1.0000000 1.0000000
# d wk3 0.07692308 1.0000000 0.6538462 0.5769231 0.8846154
# d wk4 0.10526316 0.8947368 1.0000000 0.7894737 0.9473684
# d wk6 0.08000000 0.6000000 0.6000000 1.0000000 0.6400000
# d avg 0.08000000 0.9200000 0.7200000 0.6400000 1.0000000



gene_num <- 60603 # num cr and nc genes, GENCODE v31. See also gencode_gtf_ensembl_ucsc_v31.txt, which has # rows == 60603.

# add CEN
gene_num <- gene_num + 23

# Result tables are numeric (filled with NA until computed) so the P values
# and odds ratios stay usable as numbers. Filling with the string "NA" would
# make every column character and store the results as text.
d_fish_pval <- as.data.frame(matrix(NA_real_, ncol = ncol(d_overlap_frac), nrow = nrow(d_overlap_frac)))
colnames(d_fish_pval) <- colnames(d_overlap_frac)
rownames(d_fish_pval) <- rownames(d_overlap_frac)

d_fish_odds_ratio <- d_fish_pval

# No log.p option for fisher.test

# Fisher's exact test on the 2x2 table of every pair of groups (i, j):
#   in both | in j only
#   in i only | in neither (out of gene_num genes)
for (i in seq_len(nrow(d_overlap_frac))) {
	for (j in seq_len(ncol(d_overlap_frac))) {
		n_both <- d_overlap_frac[i, j]
		n_j_only <- d_overlap_frac[j, j] - n_both
		n_i_only <- d_overlap_frac[i, i] - n_both
		n_neither <- gene_num - n_both - n_j_only - n_i_only
		# One test per cell; both the P value and odds ratio come from it.
		fish <- fisher.test(matrix(c(n_both, n_j_only, n_i_only, n_neither), 2, 2))
		d_fish_pval[i, j] <- fish$p.value
		d_fish_odds_ratio[i, j] <- fish$estimate[[1]]
	}
}
	


d_fish_pval
                     # d wk2                d wk3                d wk4                d wk6                d avg
# d wk2 5.44150871270568e-10 1.76849033162935e-07  9.3049798987267e-08  1.6324526138117e-07  1.6324526138117e-07
# d wk3 1.76849033162935e-07 1.81474536990845e-98 9.42981229392124e-59 6.01140256717342e-47 2.01932101700752e-82
# d wk4  9.3049798987267e-08 9.42981229392097e-59 1.64361583398136e-74 3.01902138867694e-50 4.78803007330236e-64
# d wk6  1.6324526138117e-07  6.0114025671735e-47  3.0190213886769e-50 4.22982246776233e-95 2.62371357238681e-51
# d avg  1.6324526138117e-07 2.01932101700747e-82  4.7880300733025e-64 2.62371357238681e-51 4.22982246776233e-95


d_fish_odds_ratio
      # d wk2            d wk3            d wk4            d wk6            d avg
# d wk2   Inf              Inf              Inf              Inf              Inf
# d wk3   Inf              Inf 4503599627370496 8107.19869209992 4503599627370496
# d wk4   Inf 4503599627370496              Inf 11603.3065711261 4503599627370496
# d wk6   Inf 8107.19869209992 11603.3065711261              Inf 13888.1720584246
# d avg   Inf 4503599627370496 4503599627370496 13888.1720584246              Inf



# Spot test of 'worst' P value overlap between wk 3 and wk 2 agrees with general d_fish_pval table:


# 2x2 table for wk2 (2 loci) vs wk3 (26 loci), which share 2 loci:
#   2 in both, 2-2 = 0 in wk2 only, 26-2 = 24 in wk3 only, and the remaining
#   gene_num-2-2+2-26+2 = gene_num - 26 in neither.
fisher.test(matrix(c(2,2-2, 26-2, gene_num-2-2+2-26+2),2,2))

       
	# Fisher's Exact Test for Count Data <<<<<<<<<<<<<<< use in paper

# data:  
# p-value = 1.768e-07
# alternative hypothesis: true odds ratio is not equal to 1
# 95 percent confidence interval:
 # 448.7717      Inf
# sample estimates:
# odds ratio 
       # Inf 
       
  
# Full-precision P value of the wk2 vs wk3 spot test. Cell counts are
# 2 shared, 2 - 2 = 0 in wk2 only, 26 - 2 = 24 in wk3 only, and
# gene_num - 26 in neither.
fisher.test(
  matrix(c(2, 0, 24, gene_num - 26), nrow = 2, ncol = 2)
)$p.value
# [1] 1.76849e-07




















