
library(corrplot)
library(PerformanceAnalytics)
library(ggplot2)
library(magick)
library(cowplot)
library(RColorBrewer)





# ------------------------------------- fxn -------------------------------------------

# Standard error of the mean of `x`, ignoring missing values.
# Computed as sqrt(variance / n), where n counts only the non-NA entries of x.
sem <- function(x) {
	n_obs <- sum(!is.na(x))
	sqrt(var(x, na.rm = TRUE) / n_obs)
}

# Compare two numeric samples and print the results.
#
# Runs t.test(a, b) with its default settings, prints the full test report and
# the unrounded p-value, then prints the mean, standard error (via sem()),
# standard deviation and number of non-missing values for each sample.
#
# Args:
#   a, b: numeric vectors; NAs are skipped by the summary statistics.
#
# Called for its printed output; like the last print() it wraps, it returns
# the final printed string invisibly.
compare <- function(a,b) {
	# Run the test once and reuse the result: the inputs in this script can hold
	# over a million values, so a second t.test() call only to read the p-value
	# doubles the cost for no benefit.
	test_result <- t.test(a, b)
	print(test_result)

	# The printed report truncates small p-values to "< 2.2e-16"; show the stored
	# value instead (this is 0 when the p-value underflows double precision).
	print(paste0("exact P value = ", test_result$p.value))

	# Print the four summary lines for one sample; `label` is "a" or "b".
	describe <- function(x, label) {
		print(paste0("mean of ", label, " = ", mean(x, na.rm = TRUE)))
		print(paste0("sem of ", label, " = ", sem(x)))
		print(paste0("sd of ", label, " = ", sd(x, na.rm = TRUE)))
		print(paste0("number in ", label, " = ", sum(!is.na(x))))
	}

	describe(a, "a")
	describe(b, "b")
}




# -------------- Read in human and hamster logP dataframes ----------------------------


# Human and hamster logP tables: tab-separated text files with a header row.
logP_human <- read.table(
	"log10P_human.txt",
	header = TRUE,
	sep = "\t",
	stringsAsFactors = FALSE
)
logP_hamster <- read.table(
	"log10P_hamster.txt",
	header = TRUE,
	sep = "\t",
	stringsAsFactors = FALSE
)

# Threshold tables: tab-separated, no header row. The first column supplies the
# row names (thresholds are later looked up by log10p_* name) and the value
# column is labelled "threshold".
human_thresh <- read.table(
	"human_thresh_95.txt",
	header = FALSE,
	sep = "\t",
	row.names = 1,
	stringsAsFactors = FALSE
)
colnames(human_thresh) <- "threshold"

hamster_thresh <- read.table(
	"hamster_thresh_95.txt",
	header = FALSE,
	sep = "\t",
	row.names = 1,
	stringsAsFactors = FALSE
)
colnames(hamster_thresh) <- "threshold"

# Alignment count tables for each species: tab-separated with a header row.
align_human <- read.table(
	"RH_pool_human_total_align.txt",
	header = TRUE,
	sep = "\t",
	stringsAsFactors = FALSE
)
align_hamster <- read.table(
	"RH_pool_hamster_total_align.txt",
	header = TRUE,
	sep = "\t",
	stringsAsFactors = FALSE
)




# ---------------- Abundance hamster-specific reads vs human-specific reads ---------------


# Mean of the element-wise ratio of the hamster-aligned/human-unaligned column
# to the human-aligned/hamster-unaligned column.
mean(
	align_hamster$hamster_aligned_and_human_unaligned /
		align_human$human_aligned_and_hamster_unaligned
)
# [1] 83.70776  <<<<<<<<<<<<<<<<<<< use in paper

# Standard error of the mean of the same ratio.
sem(
	align_hamster$hamster_aligned_and_human_unaligned /
		align_human$human_aligned_and_hamster_unaligned
)
# [1] 15.81248  <<<<<<<<<<<<<<<<<<< use in paper


# ----------- Number of hamster contigs ------------------------------------

# Number of distinct Contig_ID values: count the entries that are not repeats
# of an earlier entry.
sum(!duplicated(logP_hamster$Contig_ID))
# [1] 132       <<<<<<<<< use in paper


#---------- Compare logP vals hamster and human for growth ---------------

# Compare avg P values

# Pool human columns 5-9 and hamster columns 6-10 into one vector each and
# compare them (presumably the growth log10p_g_* columns; confirm the column
# order against the input file headers).
compare(
	unlist(logP_human[, 5:9]),
	unlist(logP_hamster[, 6:10])
)

	# Welch Two Sample t-test

# data:  a and b
# t = 877.96, df = 1711700, p-value < 2.2e-16
# alternative hypothesis: true difference in means is not equal to 0
# 95 percent confidence interval:
 # 6.933678 6.964704
# sample estimates:
# mean of x mean of y 
 # 8.725532  1.776341 

# [1] "exact P value = 0"
# [1] "mean of a = 8.72553180459999"
# [1] "sem of a = 0.00768213284782494"
# [1] "sd of a = 9.49281314534797"
# [1] "number in a = 1526955"
# [1] "mean of b = 1.77634073460231"
# [1] "sem of b = 0.00190636546809316"
# [1] "sd of b = 1.98838579084312"
# [1] "number in b = 1087900"

# to get accurate p val:
# log10 of the one-tailed P value for t = 877.96 on 1711700 df (both copied from
# the printed t-test report above). log.p = TRUE keeps the result on the log
# scale so it does not underflow to 0.
pt(-877.96, df = 1711700, log.p = TRUE) / log(10) # convert loge to log10
# [1] -138192.2

# Find remainder
abs(pt(-877.96, df = 1711700, log.p = TRUE) / log(10)) %% 1
# [1] 0.2381985

# Fractional exponent that remains once the next whole power of 10 is split off
1 - 0.2381985
# [1] 0.7618015

# Check: whole power of 10 plus the fractional part reproduces the log10 P value
0.7618015 - 138193
# [1] -138192.2 #find nearest whole power of 10

2 * 10^0.7618015
# [1] 11.55664 # multiply by 2, to make 2 tailed

# 11.55664 * 10^-138193 = 1.155664 * 10^-138192 # answer



# Avg P vals in 1 Mb steps:

# Same pooled comparison, keeping only every 100th row of each table
# (the 1 Mb steps referred to above).
compare(
	unlist(logP_human[seq(1, nrow(logP_human), by = 100), 5:9]),
	unlist(logP_hamster[seq(1, nrow(logP_hamster), by = 100), 6:10])
)

	# Welch Two Sample t-test

# data:  a and b
# t = 87.767, df = 17120, p-value < 2.2e-16 
# alternative hypothesis: true difference in means is not equal to 0
# 95 percent confidence interval:
 # 6.820795 7.132414
# sample estimates:
# mean of x mean of y 
 # 8.761717  1.785113 

# [1] "exact P value = 0"
# [1] "mean of a = 8.76171730043391"
# [1] "sem of a = 0.0771458629961305"
# [1] "sd of a = 9.53305635088605"
# [1] "number in a = 15270"
# [1] "mean of b = 1.78511286220531"
# [1] "sem of b = 0.0191632994958106"
# [1] "sd of b = 1.99887085063531"
# [1] "number in b = 10880"


# Makes hardly any difference

# Explore P values for growth points exceeding the GWAS significance threshold. Use 10 kb steps here, because GWAS thresholding already excludes noise and 1 Mb steps might miss important points. Without 1 Mb steps:

# For each growth column, keep the values that exceed that column's threshold
# (looked up by row name in the species' threshold table), pool the survivors
# across columns, and compare human against hamster.
compare(
	unlist(lapply(
		c("log10p_g_0nM", "log10p_g_8nM", "log10p_g_25nM", "log10p_g_75nM", "log10p_g_avg"),
		function(col) logP_human[logP_human[[col]] > human_thresh[col, ], col]
	)),
	unlist(lapply(
		c("log10p_g_0nM", "log10p_g_8nM", "log10p_g_25nM", "log10p_g_75nM", "log10p_g_avg"),
		function(col) logP_hamster[logP_hamster[[col]] > hamster_thresh[col, ], col]
	))
)



	# Welch Two Sample t-test

# data:  a and b
# t = 665.09, df = 307500, p-value < 2.2e-16
# alternative hypothesis: true difference in means is not equal to 0
# 95 percent confidence interval:
 # 12.59193 12.66637
# sample estimates:
# mean of x mean of y 
# 21.797973  9.168823 

# [1] "exact P value = 0"
# [1] "mean of a = 21.7979727074327"
# [1] "sem of a = 0.0168629456963335"
# [1] "sd of a = 10.284544548375"
# [1] "number in a = 371966"
# [1] "mean of b = 9.16882296256231"
# [1] "sem of b = 0.00872965689430662"
# [1] "sd of b = 1.46788083950494"
# [1] "number in b = 28274"




# Same, in 1 Mb steps, just for completeness:



# Thresholded growth comparison on every 100th row only. Each species' row
# subsample is taken once, then every growth column is filtered against its own
# threshold and the surviving values are pooled.
compare(
	local({
		human_1mb <- logP_human[seq(1, nrow(logP_human), by = 100), ]
		unlist(lapply(
			c("log10p_g_0nM", "log10p_g_8nM", "log10p_g_25nM", "log10p_g_75nM", "log10p_g_avg"),
			function(col) human_1mb[human_1mb[[col]] > human_thresh[col, ], col]
		))
	}),
	local({
		hamster_1mb <- logP_hamster[seq(1, nrow(logP_hamster), by = 100), ]
		unlist(lapply(
			c("log10p_g_0nM", "log10p_g_8nM", "log10p_g_25nM", "log10p_g_75nM", "log10p_g_avg"),
			function(col) hamster_1mb[hamster_1mb[[col]] > hamster_thresh[col, ], col]
		))
	})
)





	# Welch Two Sample t-test             <<<<<<<<<<<<<<<<<< use in paper

# data:  a and b
# t = 67.501, df = 3237, p-value < 2.2e-16       
# alternative hypothesis: true difference in means is not equal to 0
# 95 percent confidence interval:
 # 12.32606 13.06356
# sample estimates:
# mean of x mean of y 
# 21.859787  9.164979 

# [1] "exact P value = 0"
# [1] "mean of a = 21.8597868780633"
# [1] "sem of a = 0.168128145395501"
# [1] "sd of a = 10.2819674209389"
# [1] "number in a = 3740"
# [1] "mean of b = 9.16497874185774"
# [1] "sem of b = 0.0842800831689494"
# [1] "sd of b = 1.4426426441627"
# [1] "number in b = 293"






#---------- Compare logP vals hamster and human for drug ---------------

# Pool human columns 10-15 and hamster columns 11-16 into one vector each and
# compare them (presumably the drug log10p_d_* columns; confirm the column
# order against the input file headers).
compare(
	unlist(logP_human[, 10:15]),
	unlist(logP_hamster[, 11:16])
)

	# Welch Two Sample t-test

# data:  a and b
# t = -69.253, df = 2803400, p-value < 2.2e-16
# alternative hypothesis: true difference in means is not equal to 0
# 95 percent confidence interval:
 # -0.1587228 -0.1499858
# sample estimates:
# mean of x mean of y 
 # 1.129691  1.284045 

# [1] "exact P value = 0"
# [1] "mean of a = 1.12969091396507"
# [1] "sem of a = 0.00143362330743598"
# [1] "sd of a = 1.94061236942082"
# [1] "number in a = 1832346"
# [1] "mean of b = 1.28404522947747"
# [1] "sem of b = 0.0017066092555753"
# [1] "sd of b = 1.94993083969882"
# [1] "number in b = 1305480"

# Hamster higher -logP vals than human! Not just due to inclusion of w1 and w2 (insignificant P vals in human):

# Same pooled comparison with the first two columns of the range dropped
# (human 12-15, hamster 13-16), i.e. without w1 and w2 per the note above.
compare(
	unlist(logP_human[, 12:15]),
	unlist(logP_hamster[, 13:16])
)

	# Welch Two Sample t-test

# data:  a and b
# t = -47.059, df = 1877900, p-value < 2.2e-16
# alternative hypothesis: true difference in means is not equal to 0
# 95 percent confidence interval:
 # -0.1555706 -0.1431299
# sample estimates:
# mean of x mean of y 
 # 1.511282  1.660632 

# [1] "exact P value = 0"
# [1] "mean of a = 1.51128151396799"
# [1] "sem of a = 0.00205085355771213"
# [1] "sd of a = 2.26669331163551"
# [1] "number in a = 1221564"
# [1] "mean of b = 1.66063174936606"
# [1] "sem of b = 0.00242207098502589"
# [1] "sd of b = 2.25957285611082"
# [1] "number in b = 870320"

# Weeks 4 and 6 alone. (These are the only weeks in d_unique.)

# Pooled comparison restricted to two columns per species
# (human 13-14, hamster 14-15).
compare(
	unlist(logP_human[, 13:14]),
	unlist(logP_hamster[, 14:15])
)

	# Welch Two Sample t-test

# data:  a and b
# t = 18.905, df = 982500, p-value < 2.2e-16
# alternative hypothesis: true difference in means is not equal to 0
# 95 percent confidence interval:
 # 0.09120167 0.11229980
# sample estimates:
# mean of x mean of y 
 # 2.005180  1.903429 

# [1] "exact P value = 1.07589256269476e-79"
# [1] "mean of a = 2.00517957584471"
# [1] "sem of a = 0.00364530125378695"
# [1] "sd of a = 2.84889563512383"
# [1] "number in a = 610782"
# [1] "mean of b = 1.90342884290541"
# [1] "sem of b = 0.00395987394224582"
# [1] "sd of b = 2.61219651041032"
# [1] "number in b = 435160"

# Now human -logP > hamster.

# Results 1 Mb steps:

# All wks:
# Every 100th row only; human columns 10-15 against hamster columns 11-16.
compare(
	unlist(logP_human[seq(1, nrow(logP_human), by = 100), 10:15]),
	unlist(logP_hamster[seq(1, nrow(logP_hamster), by = 100), 11:16])
)

	# Welch Two Sample t-test

# data:  a and b
# t = -7.0974, df = 27777, p-value = 1.301e-12
# alternative hypothesis: true difference in means is not equal to 0
# 95 percent confidence interval:
 # -0.2019170 -0.1145269
# sample estimates:
# mean of x mean of y 
 # 1.126339  1.284561 

# [1] "exact P value = 1.30139553320317e-12"
# [1] "mean of a = 1.12633936429708"
# [1] "sem of a = 0.014212834035572"
# [1] "sd of a = 1.92393690926636"
# [1] "number in a = 18324"
# [1] "mean of b = 1.28456133976506"
# [1] "sem of b = 0.0171745607659017"
# [1] "sd of b = 1.96241434402155"
# [1] "number in b = 13056"


# Exclude wks 1 and 2 with no sig human loci:
# Every 100th row only; human columns 12-15 against hamster columns 13-16.
compare(
	unlist(logP_human[seq(1, nrow(logP_human), by = 100), 12:15]),
	unlist(logP_hamster[seq(1, nrow(logP_hamster), by = 100), 13:16])
)

	# Welch Two Sample t-test

# data:  a and b
# t = -4.8908, df = 18596, p-value = 1.013e-06
# alternative hypothesis: true difference in means is not equal to 0
# 95 percent confidence interval:
 # -0.21748529 -0.09303717
# sample estimates:
# mean of x mean of y 
 # 1.506257  1.661518 

# [1] "exact P value = 1.01259359242518e-06"
# [1] "mean of a = 1.50625666617006"
# [1] "sem of a = 0.0203219215051686"
# [1] "sd of a = 2.24610100186475"
# [1] "number in a = 12216"
# [1] "mean of b = 1.66151789671052"
# [1] "sem of b = 0.0243884300182278"
# [1] "sd of b = 2.2753241956334"
# [1] "number in b = 8704"


# wks 4 and 6, only wks with significant human loci in d_unique:
# Every 100th row only; human columns 13-14 against hamster columns 14-15.
compare(
	unlist(logP_human[seq(1, nrow(logP_human), by = 100), 13:14]),
	unlist(logP_hamster[seq(1, nrow(logP_hamster), by = 100), 14:15])
)

	# Welch Two Sample t-test

# data:  a and b
# t = 1.7332, df = 9742.5, p-value = 0.08309
# alternative hypothesis: true difference in means is not equal to 0
# 95 percent confidence interval:
 # -0.01220765  0.19861426
# sample estimates:
# mean of x mean of y 
 # 1.998237  1.905033 

# [1] "exact P value = 0.0830925213413966"
# [1] "mean of a = 1.99823675387596"
# [1] "sem of a = 0.0360888048986235"
# [1] "sd of a = 2.82047344122909"
# [1] "number in a = 6108"
# [1] "mean of b = 1.90503345038705"
# [1] "sem of b = 0.039867183529881"
# [1] "sd of b = 2.63002573903335"
# [1] "number in b = 4352"



# Compare P values for drug points that exceed the GWAS threshold. Use 10 kb steps, because with a GWAS threshold applied, 1 Mb steps may miss important points and whole peaks could easily be missed:

# For each drug column, keep the values that exceed that column's threshold
# (looked up by row name in the species' threshold table), pool the survivors
# across columns, and compare human against hamster.
compare(
	unlist(lapply(
		c("log10p_d_w1", "log10p_d_w2", "log10p_d_w3", "log10p_d_w4", "log10p_d_w6", "log10p_d_avg"),
		function(col) logP_human[logP_human[[col]] > human_thresh[col, ], col]
	)),
	unlist(lapply(
		c("log10p_d_w1", "log10p_d_w2", "log10p_d_w3", "log10p_d_w4", "log10p_d_w6", "log10p_d_avg"),
		function(col) logP_hamster[logP_hamster[[col]] > hamster_thresh[col, ], col]
	))
)



	# Welch Two Sample t-test

# data:  a and b
# t = 100.58, df = 4750.8, p-value < 2.2e-16
# alternative hypothesis: true difference in means is not equal to 0
# 95 percent confidence interval:
 # 9.284905 9.654039
# sample estimates:
# mean of x mean of y 
 # 20.78000  11.31053 

# [1] "exact P value = 0"
# [1] "mean of a = 20.7799975400792"
# [1] "sem of a = 0.0924340743232153"
# [1] "sd of a = 6.14321614547189"
# [1] "number in a = 4417"
# [1] "mean of b = 11.3105257789811"
# [1] "sem of b = 0.0178649639408595"
# [1] "sd of b = 2.69743208270272"
# [1] "number in b = 22798"

# Human drug -logP vals > hamster





# ---------
# Same, but 1 Mb steps:
#
# NOTE: the separator line above must stay commented out. As a bare run of
# dashes, R parses it as unary minus operators applied to the compare() call
# below, which fails with "invalid argument to unary operator" as soon as
# compare() returns (and aborts the script when it is run via source()).

# Thresholded drug comparison on every 100th row only. Each species' row
# subsample is taken once, then every drug column is filtered against its own
# threshold and the surviving values are pooled.
compare(
	local({
		human_1mb <- logP_human[seq(1, nrow(logP_human), by = 100), ]
		unlist(lapply(
			c("log10p_d_w1", "log10p_d_w2", "log10p_d_w3", "log10p_d_w4", "log10p_d_w6", "log10p_d_avg"),
			function(col) human_1mb[human_1mb[[col]] > human_thresh[col, ], col]
		))
	}),
	local({
		hamster_1mb <- logP_hamster[seq(1, nrow(logP_hamster), by = 100), ]
		unlist(lapply(
			c("log10p_d_w1", "log10p_d_w2", "log10p_d_w3", "log10p_d_w4", "log10p_d_w6", "log10p_d_avg"),
			function(col) hamster_1mb[hamster_1mb[[col]] > hamster_thresh[col, ], col]
		))
	})
)




	# Welch Two Sample t-test

# data:  a and b
# t = 9.559, df = 46, p-value = 1.674e-12      <<<<<<<<<<<<<<<<<<<<<<<<<<<<<< use in paper
# alternative hypothesis: true difference in means is not equal to 0
# 95 percent confidence interval:
  # 7.371074 11.303474
# sample estimates:
# mean of x mean of y 
 # 20.66575  11.32848 

# [1] "exact P value = 1.67442846921101e-12"
# [1] "mean of a = 20.6657540235948"
# [1] "sem of a = 0.960419130959215"
# [1] "sd of a = 6.37069979774185"
# [1] "number in a = 44"
# [1] "mean of b = 11.3284800727107"
# [1] "sem of b = 0.178147783745906"
# [1] "sd of b = 2.72513707687104"
# [1] "number in b = 234"

# Similar to 10 kb steps, but as expected, less powerful.
# Use in paper because more conservative than 10 kb steps and consistent with rest of paper.
# In addition similarity of 10 kb steps and (non-overlapping) 1 Mb steps suggests few discrepant peaks missed by 1 Mb steps.













