#calculations performed in text for g_d_overlap figure discussion


# Read the tab-separated file log10P_human.txt; its first line supplies the column names
# and text columns are kept as character vectors.
logP <- read.table(
	file = "log10P_human.txt",
	header = TRUE,
	sep = "\t",
	stringsAsFactors = FALSE
)




# Standard error of the mean of x with NA values ignored:
# sqrt(sample variance / number of non-missing observations).
sem <- function(x) {
	n_obs <- sum(!is.na(x))
	sqrt(var(x, na.rm = TRUE) / n_obs)
}

# Print a two-sample comparison of the numeric vectors a and b:
# the t.test(a, b) report, its unrounded p-value, and then the mean, SEM, SD
# and number of non-missing values of each vector (NAs are ignored in the summaries).
compare <- function(a,b) {
	# the test is run once and the result reused for both printouts
	welch <- t.test(a,b)
	print(welch)

	print(paste0("exact P value = ", welch$p.value))

	# prints the four summary lines for one sample; tags holds the four line prefixes
	describe <- function(v, tags) {
		print(paste0(tags[1], mean(v, na.rm = TRUE)))
		print(paste0(tags[2], sem(v)))
		print(paste0(tags[3], sd(v, na.rm = TRUE)))
		print(paste0(tags[4], sum(!is.na(v))))
	}

	describe(a, c("mean of a = ", "sem of a = ", "sd of a = ", "number in a = "))
	describe(b, c("mean of b = ", "sem of b = ", "sd of b = ", "number in b = "))
}





library(psych)

# Modified version of psych::corr.test, which provides more accurate P vals and even more accurate -log10P values:
# the P values and -log10 P values are derived from log-scale t-distribution probabilities
# (pt(..., log.p = TRUE)).
#
# Arguments:
#   x, y       data passed to cor(). If y is NULL the columns of x are correlated with each
#              other; otherwise the columns of x are correlated with the columns of y.
#   use, method  passed on to cor().
#   adjust     method handed to p.adjust(); "none" skips the multiple-testing adjustment.
#   alpha      significance level used for the confidence intervals.
#   ci         if TRUE, confidence intervals are built with the Fisher z transform
#              (psych::fisherz / psych::fisherz2r).
#   minlength  passed to abbreviate() when the row names of the ci table are built.
#
# Side effects: every call prints the P value matrix and the -log10 P matrix.
#
# Returns a list of class c("psych", "corr.test") with elements r, n, t, p, accurate_P,
# log10P, se, sef, adjust, sym, ci, ci.adj and Call. accurate_P is the same object as p
# (i.e. after capping at 1 and after any adjustment); log10P is computed before that
# step and is never adjusted.

corr_test <- function (x, y = NULL, use = "pairwise", method = "pearson", 
    adjust = "holm", alpha = 0.05, ci = TRUE, minlength = 5) 
{
    cl <- match.call()
    if (is.null(y)) {
        r <- cor(x, use = use, method = method)
        sym <- TRUE
        # pairwise counts: number of rows in which both columns are non-missing
        n <- t(!is.na(x)) %*% (!is.na(x))
    }
    else {
        r <- cor(x, y, use = use, method = method)
        sym = FALSE
        n <- t(!is.na(x)) %*% (!is.na(y))
    }
    # collapse n to a single number when all pairwise counts agree (or use == "complete")
    if ((use == "complete") | (min(n) == max(n))) 
        n <- min(n)
    # t statistic of each correlation, with n - 2 degrees of freedom
    t <- (r * sqrt(n - 2))/sqrt(1 - r^2)
    # two-sided P value 2 * (1 - P(T <= |t|)), evaluated as -2 * expm1(log P(T <= |t|))
    p <- -2 * expm1(pt(abs(t), (n - 2), log.p = TRUE))
    print(paste0("Accurate P vals:"))
    print(p)
    # -log10 of the two-sided P value, from the log of the upper tail:
    # -(log(2) + log P(T > |t|)) / log(10)
    log10P <- -(log(2) + pt(abs(t), (n - 2), log.p = TRUE, lower.tail = FALSE))/log(10)
    print(paste0("Even more accurate-log10P vals:"))
    print(log10P)
    se <- sqrt((1 - r * r)/(n - 2))
    nvar <- ncol(r)
    # cap P values at 1
    p[p > 1] <- 1
    if (adjust != "none") {
        if (is.null(y)) {
            # symmetric case: only the upper triangle is adjusted and written back,
            # the lower triangle keeps the unadjusted values
            lp <- upper.tri(p)
            pa <- p[lp]
            pa <- p.adjust(pa, adjust)
            p[upper.tri(p, diag = FALSE)] <- pa
        }
        else {
            p[] <- p.adjust(p, adjust)
        }
    }
    # Fisher z transform of the lower-triangle correlations
    z <- fisherz(r[lower.tri(r)])
    if (ci) {
        if (min(n) < 4) {
            warning("Number of subjects must be greater than 3 to find confidence intervals.")
        }
        # number of correlations the corrected intervals account for
        if (sym) {
            ncors <- nvar * (nvar - 1)/2
        }
        else ncors <- prod(dim(r))
        # critical value for the corrected intervals: alpha divided by ncors,
        # or, for "holm", by order(ord), the rank of each |z| (smallest = 1)
        if (adjust != "holm") {
            dif.corrected <- qnorm(1 - alpha/(2 * ncors))
        }
        else {
            ord <- order(abs(z), decreasing = FALSE)
            dif.corrected <- qnorm(1 - alpha/(2 * order(ord)))
        }
        # NOTE: alpha is overwritten here with 1 - alpha/2; every later use of alpha
        # (including the non-symmetric branch below) sees this new value
        alpha <- 1 - alpha/2
        dif <- qnorm(alpha)
        if (sym) {
            # standard error on the Fisher z scale
            if (is.matrix(n)) {
                sef <- 1/sqrt(n[lower.tri(n)] - 3)
            }
            else {
                sef <- 1/sqrt(n - 3)
            }
            lower <- fisherz2r(z - dif * sef)
            upper <- fisherz2r(z + dif * sef)
            lower.corrected <- fisherz2r(z - dif.corrected * 
                sef)
            upper.corrected <- fisherz2r(z + dif.corrected * 
                sef)
            # p here comes from the lower triangle, i.e. the unadjusted P values
            ci <- data.frame(lower = lower, r = r[lower.tri(r)], 
                upper = upper, p = p[lower.tri(p)])
            ci.adj <- data.frame(lower.adj = lower.corrected, 
                upper.adj = upper.corrected)
            # label each row of ci with the abbreviated names of its pair of columns
            cnR <- abbreviate(colnames(r), minlength = minlength)
            k <- 1
            for (i in 1:(nvar - 1)) {
                for (j in (i + 1):nvar) {
                  rownames(ci)[k] <- paste(cnR[i], cnR[j], sep = "-")
                  k <- k + 1
                }
            }
        }
        else {
            n.x <- NCOL(x)
            n.y <- NCOL(y)
            # x-versus-y case: z is recomputed for the full r matrix
            z <- fisherz(r)
            # alpha already holds 1 - alpha/2 at this point, so (1 - alpha) is half of
            # the alpha passed in by the caller
            if (adjust != "holm") {
                dif.corrected <- qnorm(1 - (1 - alpha)/(n.x * 
                  n.y))
            }
            else {
                ord <- order(abs(z), decreasing = FALSE)
                dif.corrected <- qnorm(1 - (1 - alpha)/(order(ord)))
            }
            sef <- 1/sqrt(n - 3)
            lower <- as.vector(fisherz2r(z - dif * sef))
            upper <- as.vector(fisherz2r(z + dif * sef))
            lower.corrected <- fisherz2r(z - dif.corrected * 
                sef)
            upper.corrected <- fisherz2r(z + dif.corrected * 
                sef)
            ci <- data.frame(lower = lower, r = as.vector(r), 
                upper = upper, p = as.vector(p))
            ci.adj <- data.frame(lower.adj = as.vector(lower.corrected), 
                r = as.vector(r), upper.adj = as.vector(upper.corrected))
            # row labels combine abbreviated row (x) and column (y) names of r
            cnR <- abbreviate(rownames(r), minlength = minlength)
            cnC <- abbreviate(colnames(r), minlength = minlength)
            k <- 1
            for (i in 1:NCOL(y)) {
                for (j in 1:NCOL(x)) {
                  rownames(ci)[k] <- paste(cnR[j], cnC[i], sep = "-")
                  k <- k + 1
                }
            }
        }
    }
    else {
        # no confidence intervals requested
        ci <- sef <- ci.adj <- NULL
    }
    # p and accurate_P hold the same matrix; log10P holds the unadjusted -log10 P values
    result <- list(r = r, n = n, t = t, p = p, accurate_P = p, log10P = log10P, se = se, sef = sef, 
        adjust = adjust, sym = sym, ci = ci, ci.adj = ci.adj, 
        Call = cl)
    class(result) <- c("psych", "corr.test")
    return(result)
}


#-------- correlations (and P values) between growth logP values ------------
#-------- down-sample to every 100th row (every 1 Mb) to remove autocorrelation ---------------

# Pearson correlation matrix of columns 5 to 9 of logP, taking every 100th row
# and using pairwise-complete observations.
cor(
	x = logP[seq(from = 1, to = nrow(logP), by = 100), 5:9],
	use = "pairwise.complete.obs",
	method = "pearson"
)
              # log10p_g_0nM log10p_g_8nM log10p_g_25nM log10p_g_75nM log10p_g_avg
# log10p_g_0nM     1.0000000    0.9868138     0.8764049     0.2145261    0.8563488
# log10p_g_8nM     0.9868138    1.0000000     0.9403100     0.3227331    0.9253467
# log10p_g_25nM    0.8764049    0.9403100     1.0000000     0.5892128    0.9990887
# log10p_g_75nM    0.2145261    0.3227331     0.5892128     1.0000000    0.6211084
# log10p_g_avg     0.8563488    0.9253467     0.9990887     0.6211084    1.0000000


# corr.test (psych) gives quick R values and P values for a whole data frame, but prints them to only 2 decimal places. For more accurate correlations use cor(); for P values use cor.test() on selected columns, or use the results from chart.Correlation() (shown in Fig in paper).
# adjust = "none" requests P values without multiple-testing adjustment.

corr.test(logP[seq(1,nrow(logP),by=100),c(5:9)], method="pearson",use = "pairwise.complete.obs", adjust = "none")
# Call:corr.test(x = logP[seq(1, nrow(logP), by = 100), c(5:9)], use = "pairwise.complete.obs", 
    # method = "pearson", adjust = "none")
# Correlation matrix 
              # log10p_g_0nM log10p_g_8nM log10p_g_25nM log10p_g_75nM log10p_g_avg
# log10p_g_0nM          1.00         0.99          0.88          0.21         0.86
# log10p_g_8nM          0.99         1.00          0.94          0.32         0.93
# log10p_g_25nM         0.88         0.94          1.00          0.59         1.00
# log10p_g_75nM         0.21         0.32          0.59          1.00         0.62
# log10p_g_avg          0.86         0.93          1.00          0.62         1.00
# Sample Size 
# [1] 3054
# Probability values (Entries above the diagonal are adjusted for multiple tests.) 
              # log10p_g_0nM log10p_g_8nM log10p_g_25nM log10p_g_75nM log10p_g_avg
# log10p_g_0nM             0            0             0             0            0
# log10p_g_8nM             0            0             0             0            0
# log10p_g_25nM            0            0             0             0            0
# log10p_g_75nM            0            0             0             0            0
# log10p_g_avg             0            0             0             0            0

 # To see confidence intervals of the correlations, print with the short=FALSE option

# Same data and settings run through the modified corr_test(), which also prints the
# P value matrix and the -log10 P matrix before the usual corr.test report.
corr_test(logP[seq(1,nrow(logP),by=100),c(5:9)], method="pearson",use = "pairwise.complete.obs", adjust = "none")
# [1] "Accurate P vals:"
              # log10p_g_0nM log10p_g_8nM log10p_g_25nM log10p_g_75nM log10p_g_avg
# log10p_g_0nM  0.000000e+00  0.00000e+00  0.000000e+00  3.991574e-33            0
# log10p_g_8nM  0.000000e+00  0.00000e+00  0.000000e+00  5.697480e-75            0
# log10p_g_25nM 0.000000e+00  0.00000e+00  0.000000e+00 5.919166e-285            0
# log10p_g_75nM 3.991574e-33  5.69748e-75 5.919166e-285  0.000000e+00            0
# log10p_g_avg  0.000000e+00  0.00000e+00  0.000000e+00  0.000000e+00            0
# [1] "Even more accurate-log10P vals:"
              # log10p_g_0nM log10p_g_8nM log10p_g_25nM log10p_g_75nM log10p_g_avg
# log10p_g_0nM           Inf   2415.54355      970.2931      32.39886     877.7445
# log10p_g_8nM    2415.54355          Inf     1430.4960      74.24432    1287.3736
# log10p_g_25nM    970.29309   1430.49605           Inf     284.22774    4182.3129
# log10p_g_75nM     32.39886     74.24432      284.2277           Inf     324.6468
# log10p_g_avg     877.74455   1287.37355     4182.3129     324.64675          Inf
# Call:corr_test(x = logP[seq(1, nrow(logP), by = 100), c(5:9)], use = "pairwise.complete.obs", 
    # method = "pearson", adjust = "none")
# Correlation matrix 
              # log10p_g_0nM log10p_g_8nM log10p_g_25nM log10p_g_75nM log10p_g_avg
# log10p_g_0nM          1.00         0.99          0.88          0.21         0.86
# log10p_g_8nM          0.99         1.00          0.94          0.32         0.93
# log10p_g_25nM         0.88         0.94          1.00          0.59         1.00
# log10p_g_75nM         0.21         0.32          0.59          1.00         0.62
# log10p_g_avg          0.86         0.93          1.00          0.62         1.00
# Sample Size 
# [1] 3054
# Probability values (Entries above the diagonal are adjusted for multiple tests.) 
              # log10p_g_0nM log10p_g_8nM log10p_g_25nM log10p_g_75nM log10p_g_avg
# log10p_g_0nM             0            0             0             0            0
# log10p_g_8nM             0            0             0             0            0
# log10p_g_25nM            0            0             0             0            0
# log10p_g_75nM            0            0             0             0            0
# log10p_g_avg             0            0             0             0            0

 # To see confidence intervals of the correlations, print with the short=FALSE option


#-------------worst but still significant R value ---------------
# between g 75 nM and g 0 nM
# (the smallest off-diagonal entry of the correlation matrix recorded above, r = 0.2145261)
# NOTE(review): `use` is not a named argument of cor.test(); presumably it is absorbed by `...`
# and has no effect here - confirm.


cor.test(logP[seq(1,nrow(logP),by=100),"log10p_g_0nM"], logP[seq(1,nrow(logP),by=100),"log10p_g_75nM"],method="pearson",use = "pairwise.complete.obs")

	# Pearson's product-moment correlation  <<<<<<<<<<<<<<<<< use in paper

# data:  logP[seq(1, nrow(logP), by = 100), "log10p_g_0nM"] and logP[seq(1, nrow(logP), by = 100), "log10p_g_75nM"]
# t = 12.134, df = 3052, p-value < 2.2e-16
# alternative hypothesis: true correlation is not equal to 0
# 95 percent confidence interval:
 # 0.1804304 0.2481070
# sample estimates:
      # cor 
# 0.2145261 

# Same test again, extracting the numeric P value that the printed report only shows as "< 2.2e-16".
cor.test(logP[seq(1,nrow(logP),by=100),"log10p_g_0nM"], logP[seq(1,nrow(logP),by=100),"log10p_g_75nM"],method="pearson",use = "pairwise.complete.obs")$p.value
# [1] 3.991574e-33  <<<<<<<<<<<<<<<<< use in paper




# ---------------- mean R --------------------------------


# Mean and SEM of the pairwise Pearson correlations between the five growth columns
# (the r column of the ci table returned by corr_test, one entry per pair of columns).
# corr_test() is run once and its result reused: it previously ran twice on identical
# input, repeating the whole computation and printing its diagnostic matrices twice.
pairwise_r <- corr_test(logP[seq(1,nrow(logP),by=100),c(5:9)], method="pearson",use = "pairwise.complete.obs", adjust = "none")$ci$r

ans_mean <- mean(pairwise_r)

ans_mean
# [1] 0.7331893 <<<<<<<<<< use in paper


sem_mean <- sem(pairwise_r)

sem_mean
# [1] 0.08952389 <<<<<<<<<< use in paper








#-------- comparing R values for 75 nM comparisons with other comparisons ----------




# Compare the correlations that involve the 75 nM column with all remaining pairwise correlations.
# The correlation matrix is computed once (it used to be recomputed three times inside one
# expression) and the two groups are picked by column name instead of by fixed positions.
r_growth <- cor(logP[seq(1,nrow(logP),by=100),c(5:9)], method="pearson",use = "pairwise.complete.obs")
is_75nM <- colnames(r_growth) == "log10p_g_75nM"
# a: 75 nM against each of the other columns
r_with_75nM <- r_growth["log10p_g_75nM", !is_75nM]
# b: upper-triangle pairs (column-major order) in which neither member is 75 nM
r_without_75nM <- r_growth[upper.tri(r_growth) & !outer(is_75nM, is_75nM, "|")]
compare(r_with_75nM, r_without_75nM)

	# Welch Two Sample t-test <<<<<<<<<<<<<<<<< use in paper

# data:  a and b
# t = -4.8157, df = 3.3325, p-value = 0.01334
# alternative hypothesis: true difference in means is not equal to 0
# 95 percent confidence interval:
 # -0.8024968 -0.1851506
# sample estimates:
# mean of x mean of y 
# 0.4368951 0.9307188 

# [1] "exact P value = 0.0133446211383975"
# [1] "mean of a = 0.436895123306205"
# [1] "sem of a = 0.0998399337376448"
# [1] "sd of a = 0.19967986747529"
# [1] "number in a = 4"
# [1] "mean of b = 0.930718811794001"
# [1] "sem of b = 0.0233971745423795"
# [1] "sd of b = 0.0573111390516664"
# [1] "number in b = 6"



# ---------------------- Comparing  -log10P values for 75 nM comparisons with other comparisons ----------


# -log10 P matrix for the down-sampled growth columns, taken from corr_test()
# (which also prints its diagnostic matrices when called).
acc_logP_vals <- corr_test(
	logP[seq(1,nrow(logP),by=100),c(5:9)],
	method = "pearson",
	use = "pairwise.complete.obs",
	adjust = "none"
)$log10P


# a: the 75 nM row without column 4 (its own diagonal entry)
logP_75nM <- acc_logP_vals["log10p_g_75nM", -4]
# b: upper-triangle entries in column-major order; in a 5 x 5 matrix positions 4:6 and 10
# are the pairs that contain column 4, so they are dropped
logP_upper <- as.matrix(acc_logP_vals)[upper.tri(acc_logP_vals)]
compare(logP_75nM, logP_upper[-c(4:6, 10)])

	# Welch Two Sample t-test

# data:  a and b
# t = -3.2301, df = 5.2016, p-value = 0.02193
# alternative hypothesis: true difference in means is not equal to 0
# 95 percent confidence interval:
 # -3004.6806  -358.8151
# sample estimates:
# mean of x mean of y 
 # 178.8794 1860.6273 

# [1] "exact P value = 0.0219294395882935"
# [1] "mean of a = 178.87941628206"
# [1] "sem of a = 73.4571553646612"
# [1] "sd of a = 146.914310729322"
# [1] "number in a = 4"
# [1] "mean of b = 1860.62727554235"
# [1] "sem of b = 515.446156320456"
# [1] "sd of b = 1262.58007286397"
# [1] "number in b = 6"



#--------overlap between GWAS significant growth loci------------


# Worst overlap is between 75 nM and 0 nM. 
# ***** For full combined analysis of g and d together, see g_d_comb_fish_1.R **********
# 60603 is total number of genes, cr and nc, according to GENCODE v31


# Locus tables: tab-delimited with a header row; check.names=FALSE keeps the column
# names exactly as written in the files.
g <- read.delim("growth_loci.txt",sep="\t",stringsAsFactors=FALSE,header=TRUE,check.names=FALSE)
d <- read.delim("paclitaxel_loci.txt",sep="\t",stringsAsFactors=FALSE,header=TRUE,check.names=FALSE)
g_unique <- read.delim("growth_loci_unique.txt",header=TRUE,sep="\t",stringsAsFactors=FALSE,check.names=FALSE)
d_unique <- read.delim("paclitaxel_loci_unique.txt",header=TRUE,sep="\t",stringsAsFactors=FALSE,check.names=FALSE)
Ix <- read.delim("Ix_loci.txt",header=TRUE,sep="\t",stringsAsFactors=FALSE,check.names=FALSE)

g_bind <- g
d_bind <- d
Ix_bind <- Ix

# Replace the values of x that occur in names(map) with their mapped labels; all other
# values are left untouched. Unlike `df[mask, ]$col <- value`, this does not stop with an
# error when a label has no matching rows or when x contains NA.
relabel_values <- function(x, map) {
	hit <- match(as.character(x), names(map))
	found <- !is.na(hit)
	if (any(found)) {
		x[found] <- unname(map[hit[found]])
	}
	x
}

g_bind$conc <- relabel_values(
	g_bind$conc,
	c("0" = "g_0nM", "8" = "g_8nM", "25" = "g_25nM", "75" = "g_75nM", "avg" = "g_avg")
)


# No significant paclitaxel loci for week 1
d_bind$wk <- relabel_values(
	d_bind$wk,
	c("2" = "d_w2", "3" = "d_w3", "4" = "d_w4", "6" = "d_w6", "avg" = "d_avg")
)

# give column 5 of both tables the same name so that the tables can be stacked
colnames(g_bind)[5] <- "wk_conc"
colnames(d_bind)[5] <- "wk_conc"

g_and_d <- rbind(g_bind,d_bind,Ix_bind)




# Overlap between the growth conditions.
# g_overlap_frac[i, j] = number of loci of condition i that are also present in condition j,
# where a locus is identified by its Chromosome and ensembl_gene_id.
g_overlap_frac <- data.frame("g 0nM"=numeric(), "g 8nM"=numeric(), "g 25nM"=numeric(), "g 75nM"=numeric(), "g avg"=numeric(), check.names=FALSE)

k <- as.character(c("g_0nM","g_8nM","g_25nM","g_75nM","g_avg"))

# Locus keys of every condition, built once (they used to be rebuilt for every cell of the table).
g_locus_keys <- lapply(k, function(cond) {
	with(g_and_d[g_and_d$wk_conc==cond,], paste(Chromosome, ensembl_gene_id, sep = "\r"))
})

for(i in seq_along(k))
{
	for(j in seq_along(k))
	{
		g_overlap_frac[i,j] <- sum(g_locus_keys[[i]] %in% g_locus_keys[[j]])
	}
}

row.names(g_overlap_frac) <- c(colnames(g_overlap_frac))

# g_overlap_dec: each row of the count table divided by that row's diagonal entry,
# i.e. the share of the row condition's loci found in each column condition.
g_overlap_dec <- g_overlap_frac
for(i in seq_len(nrow(g_overlap_dec))) {
	g_overlap_dec[i,] <- g_overlap_dec[i,]/g_overlap_dec[i,i]
}


g_overlap_frac
       # g 0nM g 8nM g 25nM g 75nM g avg
# g 0nM    409   305    162      3   142
# g 8nM    305   460    238      5   207
# g 25nM   162   238    466     16   414
# g 75nM     3     5     16     55    16
# g avg    142   207    414     16   446


# Print the row-normalised table: each row of g_overlap_frac divided by that row's diagonal entry.
g_overlap_dec
            # g 0nM      g 8nM    g 25nM      g 75nM     g avg
# g 0nM  1.00000000 0.74572127 0.3960880 0.007334963 0.3471883
# g 8nM  0.66304348 1.00000000 0.5173913 0.010869565 0.4500000
# g 25nM 0.34763948 0.51072961 1.0000000 0.034334764 0.8884120
# g 75nM 0.05454545 0.09090909 0.2909091 1.000000000 0.2909091
# g avg  0.31838565 0.46412556 0.9282511 0.035874439 1.0000000



# Total number of genes used in the Fisher tests below:
# 60603 cr and nc genes (GENCODE v31; see also gencode_gtf_ensembl_ucsc_v31.txt, which has 60603 rows)
# plus the 23 CEN entries.
gene_num <- sum(c(gencode_v31 = 60603, CEN = 23))


# Fisher's exact test of the locus overlap for every pair of growth conditions.
# NOTE: both result tables are pre-filled with the string "NA", so their columns are character
# and every p-value / odds ratio assigned below is stored as text; code that needs numbers
# converts with as.numeric().
g_fish_pval <- as.data.frame(matrix("NA", ncol = ncol(g_overlap_frac), nrow = nrow(g_overlap_frac)),stringsAsFactors=FALSE)
colnames(g_fish_pval) <- colnames(g_overlap_frac)
rownames(g_fish_pval) <- rownames(g_overlap_frac)

g_fish_odds_ratio <- g_fish_pval

# No log.p option for fisher.test

for(i in seq_len(nrow(g_overlap_frac))){
	for(j in seq_len(ncol(g_overlap_frac))){
		# 2 x 2 table counts: loci in both conditions, only in condition j,
		# only in condition i, and in neither (out of gene_num genes)
		n_both <- g_overlap_frac[i,j]
		n_col_only <- g_overlap_frac[j,j] - n_both
		n_row_only <- g_overlap_frac[i,i] - n_both
		n_neither <- gene_num - n_both - n_col_only - n_row_only
		# one exact test per cell supplies both the p-value and the odds-ratio estimate
		fish <- fisher.test(matrix(c(n_both,n_col_only,n_row_only,n_neither),2,2))
		g_fish_pval[i,j] <- fish$p.value
		g_fish_odds_ratio[i,j] <- fish$estimate[[1]]
	}
}



g_fish_pval
                       # g 0nM                g 8nM                g 25nM                g 75nM                 g avg
# g 0nM                      0                    0 1.09200187261342e-239   0.00616800549548198 7.66246815977211e-202 <<<<<<<<<<<<< use in paper
# g 8nM                      0                    0                     0  6.26384115165444e-05                     0
# g 25nM 1.09200187261342e-239                    0                     0  2.59490218411265e-21                     0
# g 75nM   0.00616800549548197 6.26384115165443e-05  2.59490218411268e-21 1.16971852099428e-190   1.2866648639961e-21
# g avg  7.66246815977254e-202                    0                     0  1.28666486399609e-21                     0


# Print the table of odds-ratio estimates from the pairwise Fisher tests.
g_fish_odds_ratio
                  # g 0nM            g 8nM           g 25nM           g 75nM            g avg
# g 0nM               Inf 1112.74079505459 129.053930828731 8.54623353164983  104.58741584785
# g 8nM  1112.74079505459              Inf 280.892811959627 13.2065095410543 204.300405292217
# g 25nM 129.053930828732 280.892811959626              Inf 54.8093992123365 8965.34407375267
# g 75nM 8.54623353164983 13.2065095410543 54.8093992123366              Inf 57.4124886657971
# g avg   104.58741584785 204.300405292217 8965.34407375269 57.4124886657971              Inf



# Spot test of 'worst' P value overlap between 75 nM and 0 nM agrees with general g_fish_pval table:
# literal counts: 3 shared loci, 55 loci at 75 nM, 409 loci at 0 nM (see the g_overlap_frac table above)


fisher.test(matrix(c(3,55-3, 409-3, gene_num-3-55+3-409+3),2,2))


	# Fisher's Exact Test for Count Data <<<<<<<<<<< use in paper

# data:  
# p-value = 0.006168
# alternative hypothesis: true odds ratio is not equal to 1
# 95 percent confidence interval:
  # 1.700668 26.567059
# sample estimates:
# odds ratio 
  # 8.546234 


  
# P value of the same spot test, extracted as a number
fisher.test(matrix(c(3,55-3, 409-3, gene_num-3-55+3-409+3),2,2))$p.value
# [1] 0.006168005 <<<<<<<<<< use in paper

# Sharing for 75 nM and 0 nM: the 75 nM row of the row-normalised overlap table, 0 nM column
g_overlap_dec["g 75nM","g 0nM"]
# [1] 0.05454545


#------------- Compare mean sharing of 75 nM and all others ---------------


# a: the 75 nM row of the row-normalised overlap table without column 4 (its own diagonal entry)
share_75nM <- as.numeric(g_overlap_dec["g 75nM", -4])
# b: upper-triangle entries in column-major order; in a 5 x 5 table positions 4:6 and 10
# are the pairs that contain column 4, so they are dropped
share_upper <- as.matrix(g_overlap_dec)[upper.tri(g_overlap_dec)]
compare(share_75nM, share_upper[-c(4:6, 10)])

	# Welch Two Sample t-test

# data:  a and b
# t = -3.4826, df = 7.9708, p-value = 0.008335
# alternative hypothesis: true difference in means is not equal to 0
# 95 percent confidence interval:
 # -0.6245428 -0.1267544
# sample estimates:
# mean of x mean of y 
# 0.1818182 0.5574668 

# [1] "exact P value = 0.00833532927243128"
# [1] "mean of a = 0.181818181818182"
# [1] "sem of a = 0.0634195440498677"
# [1] "sd of a = 0.126839088099735"
# [1] "number in a = 4"
# [1] "mean of b = 0.557466812754572"
# [1] "sem of b = 0.0872505615404337"
# [1] "sd of b = 0.213719355545365"
# [1] "number in b = 6"



#------------- Compare OR of 75 nM and all others ---------------

# a: odds ratios in the 75 nM row without column 4 (its own diagonal entry), converted to numbers
or_75nM <- as.numeric(g_fish_odds_ratio["g 75nM", -4])
# b: upper-triangle odds ratios in column-major order; in a 5 x 5 table positions 4:6 and 10
# are the pairs that contain column 4, so they are dropped before the conversion to numbers
or_upper <- as.matrix(g_fish_odds_ratio)[upper.tri(g_fish_odds_ratio)]
compare(or_75nM, as.numeric(or_upper[-c(4:6, 10)]))


	# Welch Two Sample t-test <<<<<<<<<<<< use in paper

# data:  a and b
# t = -1.2251, df = 5.0008, p-value = 0.2751
# alternative hypothesis: true difference in means is not equal to 0
# 95 percent confidence interval:
 # -5471.374  1939.388
# sample estimates:
 # mean of x  mean of y 
  # 33.49366 1799.48657 

# [1] "exact P value = 0.275093962838428"
# [1] "mean of a = 33.4936577377095"
# [1] "sem of a = 13.1034781134938"
# [1] "sd of a = 26.2069562269875"
# [1] "number in a = 4"
# [1] "mean of b = 1799.48657212261"
# [1] "sem of b = 1441.46809156825"
# [1] "sd of b = 3530.86130484567"
# [1] "number in b = 6"















