library(mgcv)
library(multcomp)

 # Input tables: all are tab-separated with a header row, and character
 # columns are kept as character (no automatic factor conversion).
 copy_raw <- read.table(
   "RH_human_gseq.txt",
   header = TRUE, sep = "\t", stringsAsFactors = FALSE
 )
 cell <- read.table(
   "cell_label_info.txt",
   header = TRUE, sep = "\t", stringsAsFactors = FALSE
 )
 # uses mapped human reads, cf human_AIC_1.R
 reads <- read.table(
   "RH_pool_human_total_align.txt",
   header = TRUE, sep = "\t", stringsAsFactors = FALSE
 )
 # Column totals of copy_raw from column 5 onward, named by column
 # (these are the same columns that are later reshaped into `copy`).
 sum_reads <- colSums(copy_raw[, 5:ncol(copy_raw)])
 

# Thin copy_raw to every 100th row (rows 1, 101, 201, ...); the original
# author's stated purpose is to prevent autocorrelation between rows.

copy_raw_sub <- copy_raw[seq(from = 1, to = nrow(copy_raw), by = 100), ]


# Start the timer; the elapsed time is printed after the model-fitting loop.
ptm <- proc.time()

# Reshape the thinned table from wide to long format:
#   * columns 5..ncol (one per sample) are stacked into a single `copy` column;
#   * `RH_ID` records which original column each value came from;
#   * reshape() adds an `id` column identifying the source row, because no
#     `idvar` is supplied (assumes copy_raw has no `id` column of its own --
#     TODO confirm).
sample_cols <- colnames(copy_raw_sub)[5:ncol(copy_raw_sub)]

copy_raw_sub_l <- reshape(
  copy_raw_sub,
  varying = sample_cols,
  v.names = "copy",
  timevar = "RH_ID",
  times = sample_cols,
  # One row name per long-format row. A fixed 1:1e6 makes reshape() fail
  # (NA row names) as soon as rows x samples exceeds one million.
  new.row.names = seq_len(nrow(copy_raw_sub) * length(sample_cols)),
  direction = "long"
)
  
# Derive week, conc and pool from tokens embedded in RH_ID (the name of the
# wide-format column each long row came from). Each variable starts at 0 and
# is overwritten for rows whose RH_ID contains the matching token.
#
# Values are assigned with column-level logical indexing
# (`df$col[mask] <- value`), which is a harmless no-op when no RH_ID contains
# a token. The row-subset form `df[mask, ]$col <- value` stops with
# "replacement has 1 row, data has 0" in that situation and copies the whole
# data frame on every assignment.

# week: tokens "_w0_", "_w1_", "_w2_", "_w3_", "_w4_", "_w6_"
copy_raw_sub_l$week <- 0
for (week_value in c(0, 1, 2, 3, 4, 6)) {
  has_token <- grepl(paste0("_w", week_value, "_"), copy_raw_sub_l$RH_ID)
  copy_raw_sub_l$week[has_token] <- week_value
}

# conc: tokens "_d0", "_d8", "_d25", "_d75" (applied in this order, so a
# later token wins if an ID were to match more than one)
copy_raw_sub_l$conc <- 0
for (conc_value in c(0, 8, 25, 75)) {
  has_token <- grepl(paste0("_d", conc_value), copy_raw_sub_l$RH_ID)
  copy_raw_sub_l$conc[has_token] <- conc_value
}

# pool: tokens "RH1_" ... "RH6_"; rows matching none keep pool 0
copy_raw_sub_l$pool <- 0
for (pool_value in 1:6) {
  has_token <- grepl(paste0("RH", pool_value, "_"), copy_raw_sub_l$RH_ID)
  copy_raw_sub_l$pool[has_token] <- pool_value
}



# Attach annotation and read-depth information to the long table.
#
# merge() is called without `by`, so it joins on every column name the two
# data frames share and keeps only rows that have a match in both.
# NOTE(review): which columns are shared with `cell` is not visible here --
# confirm the join keys are the intended ones.
copy_raw_sub_l <- merge(copy_raw_sub_l,cell)
# Look up each row's column total by name: sum_reads is named by the
# wide-format column names, and RH_ID holds those same names.
copy_raw_sub_l$sum_reads <- sum_reads[copy_raw_sub_l$RH_ID]
# Join columns 1-5 and 9 of `reads`, again on the shared column names.
copy_raw_sub_l  <- merge(copy_raw_sub_l,reads[,c(1:5,9)])
# Positional rename: column 13 of the merged table becomes "total_reads",
# which is used as the offset in the model below.
# NOTE(review): presumably column 13 is the column taken from reads[, 9];
# this depends on the merge column order -- confirm if inputs change.
colnames(copy_raw_sub_l)[13] <- "total_reads"


# pool and cell are used in s(..., bs = "re") terms of the gam() call below,
# so convert them to factors.
copy_raw_sub_l$pool <- as.factor(copy_raw_sub_l$pool)
copy_raw_sub_l$cell <- as.factor(copy_raw_sub_l$cell)

# Goodness of fit, following
# https://stats.idre.ucla.edu/r/dae/poisson-regression/
# One row per group: residual deviance, residual degrees of freedom, and the
# upper-tail chi-square probability of that deviance.
gof <- data.frame(
  chi.sq = numeric(0),
  df = numeric(0),
  p = numeric(0)
)

# Groups are indexed by the `id` column of the long table.
ngroup <- max(copy_raw_sub_l$id)

for (i in 1:ngroup) {
  cat("i = ",i,"/",ngroup, "\n")
  tryCatch(
    {
      # Negative-binomial GAM for group i: week x conc fixed effects, pool and
      # cell as random effects, log(total_reads) as offset.
      m1_nb <- gam(
        copy ~ week * conc + s(pool, bs = "re") + s(cell, bs = "re") + offset(log(total_reads)),
        data = subset(copy_raw_sub_l, copy_raw_sub_l$id == i),
        family = nb,
        method = "REML"
      )

      gof[i, c("chi.sq", "df", "p")] <- cbind(
        m1_nb$deviance,
        m1_nb$df.residual,
        pchisq(m1_nb$deviance, m1_nb$df.residual, lower.tail = FALSE)
      )
    },
    # A failed fit is reported and skipped; no row is written for that group.
    error = function(e) {
      cat ("Error on line ", i, ": ", conditionMessage(e),"\n")
    }
  )
}

# Disabled export kept from the original script; `ans` is not defined in the
# visible part of this file.
# write.table(ans,"overdisp_1.txt",quote=FALSE, sep="\t",row.names=FALSE)


# Elapsed time since `ptm` was taken.
print(proc.time() - ptm)

# ----------- Numbers to quote in paper -----------------

# First rows of the goodness-of-fit table. The commented lines below appear
# to be output pasted from an earlier run (columns: chi.sq, df, p) and are
# not regenerated by this script.
# NOTE(review): row 1 shows a near-zero deviance with df = 111, unlike the
# other rows -- worth checking whether that group is degenerate.
head(gof)
        # chi.sq        df         p
# 1 5.099595e-14 111.00000 1.0000000
# 2 9.429475e+01  91.11546 0.3889336
# 3 9.134100e+01  89.45507 0.4247333
# 4 9.756206e+01  91.86574 0.3224637
# 5 1.070573e+02  97.68367 0.2426961
# 6 9.979315e+01  92.95862 0.2952587


# Standard error of the mean of `x`, ignoring NAs: the square root of the
# sample variance divided by the number of non-missing values.
# Returns NA when fewer than two non-missing values are present.
sem <- function(x) {
  n_obs <- sum(!is.na(x))
  sample_var <- var(x, na.rm = TRUE)
  sqrt(sample_var / n_obs)
}

# Compare two numeric samples `a` and `b`.
# Prints the t.test(a, b) result (Welch two-sample test by default), its
# unrounded p-value, and for each sample the mean, standard error (via
# sem()), standard deviation and count, all with NAs ignored.
# Called for its printed output; the last printed string is returned
# invisibly.
compare <- function(a, b) {
  # Print one labelled value in the form: [1] "<label><value>"
  show_line <- function(label, value) {
    print(paste0(label, value))
  }

  welch <- t.test(a, b)
  print(welch)

  show_line("exact P value = ", welch$p.value)

  show_line("mean of a = ", mean(a, na.rm = TRUE))
  show_line("sem of a = ", sem(a))
  show_line("sd of a = ", sd(a, na.rm = TRUE))
  show_line("number in a = ", sum(!is.na(a)))

  show_line("mean of b = ", mean(b, na.rm = TRUE))
  show_line("sem of b = ", sem(b))
  show_line("sd of b = ", sd(b, na.rm = TRUE))
  show_line("number in b = ", sum(!is.na(b)))
}

# --------------- GOF calx ---------------------------
# Mean and standard error of each goodness-of-fit column across groups.
# A group whose fit failed can leave an NA row in `gof`; sem() already drops
# NAs, so the means drop them too (na.rm = TRUE). Without that, a single
# failed fit turns every mean into NA while the SEMs stay numeric.
# The commented values are the results recorded from an earlier run.

mean(gof$chi.sq, na.rm = TRUE)
# [1] 92.93585 <<<<<<<<< use in paper

sem(gof$chi.sq)
# [1] 0.3628703 <<<<<<<<< use in paper

mean(gof$df, na.rm = TRUE)
# [1] 93.5899 <<<<<<<<< use in paper

sem(gof$df)
# [1] 0.1024745 <<<<<<<<< use in paper

mean(gof$p, na.rm = TRUE)
# [1] 0.3908706

sem(gof$p)
# [1] 0.00254113

mean(-log10(gof$p), na.rm = TRUE)
# [1] 0.4269322 <<<<<<<<< use in paper

sem(-log10(gof$p))
# [1] 0.002139665 <<<<<<<<< use in paper































