# ------------------------------- Permutation threshold for human genome ----------------------
# ------ Permute and find maximum -log10P value of genome segments using Hoffman cluster  -----
# ---------------------------------- Final segment for each genome shuffle --------------------
# ------------------------------ Post-process to put segments together ------------------------
# ----------------- And find maximum -log10P value for whole genome with each shuffle ---------



# Shuffle within pools and fixed effects for growth and paclitaxel, pools for Ix
# Slower than shuffling within pools only and takes ~25 days per genome scan.


# -------------- Script for Hoffman ----------------
# --------------- Background --------------------

# Create directory Output in Hoffman to accept informational output from runJobs.com
# run job as 
# >>>>>>>>> ./master_final_human <<<<<<<<<<<
# which runs these:
# qsub -cwd -V -N PJ -l h_data=4G,h_rt=04:00:00 -M eplau -m n -t 305001-305001:391 runJob_final.com
# Maxvmem = xxxG. Wallclock Time   = xxh yy min.


# Make sure mgcv and multcomp are installed for the appropriate R version, e.g. R/3.5.0
# in R use install.packages('package_name', dependencies=TRUE)
# CA server did not work, but IA did
# cf https://www.hoffman2.idre.ucla.edu/software/r/
# 'module load R/3.5.0' before running qsub


# Main output dataframes are placed in the same directory in which runJobs.com is run.
# afterwards, process in R using results_qsub_3 or similar
# cf http://www.maths.lancs.ac.uk/~rowlings/HPC/RJobs/
# https://www.ccn.ucla.edu/wiki/index.php/Hoffman2:Submitting_Jobs

# dim(RH_human) ## after removal of chrY
# # [1] 305391    119
# 305391/5e3
# # [1] 61.0782
# 61*5e3
# # [1] 305000 # maximum row before administering final step by hand, which goes from 305001 to 305391, with output log10p_raw_sub_62.txt

# ------------- Begin here ---------------

# Run one permutation pass over a chunk of the human genome bins.
#
# Inputs (all taken from the environment, none from arguments):
#   * commandArgs()[1]        - label of this shuffle; only used in the output
#                               file name.
#   * SGE_TASK_ID             - first row (1-based, after chrY removal) of the
#                               chunk to analyse.
#   * SGE_TASK_STEPSIZE       - number of rows in the chunk.
#   * RH_human_gseq.txt, cell_label_info.txt, RH_pool_human_total_align.txt
#                             - tab-separated tables read from the working
#                               directory.
#
# For every row of the chunk the copy/total_reads pairs are permuted three
# ways (within pool x conc, within pool x week, within pool), a negative
# binomial GAM is fitted to each permuted data set, and -log10 P values are
# derived from the glht test statistics. The per-column maxima of those
# -log10 P values over the chunk are written to
# /u/flashscratch/d/desmond/human_max_shuff_gam_<label>_batch_62.txt.
# Elapsed time is printed at the end.
main <- function() {

  # ---------------- local helpers ------------------------------------------

  # Derive a numeric label for each sample ID from substring patterns.
  #   ids      - character vector of sample IDs.
  #   patterns - regular expressions passed to grepl(), applied in order.
  #   values   - label assigned for each pattern (same length as patterns).
  # IDs matching no pattern keep the default 0; a later pattern overrides an
  # earlier one. Unlike `df[mask, ]$col <- v`, this does not stop with
  # "replacement has 1 row, data has 0" when a pattern matches nothing.
  label_from_id <- function(ids, patterns, values) {
    labels <- rep(0, length(ids))
    for (p in seq_along(patterns)) {
      labels[grepl(patterns[p], ids)] <- values[p]
    }
    labels
  }

  # Permute the (copy, total_reads) pairs within strata and return the
  # modified data frame.
  #   dat            - long-format data with columns id, pool, copy,
  #                    total_reads (and `stratum`, when given).
  #   n_id           - ids 1..n_id are processed.
  #   stratum        - optional name of an extra column to stratify by.
  #   stratum_levels - values of `stratum` to visit, in order.
  # Strata are id x pool (pools 1 to 6), further split by `stratum` when it is
  # supplied. copy and total_reads are moved together, so each count stays
  # paired with its own offset.
  shuffle_within_strata <- function(dat, n_id, stratum = NULL,
                                    stratum_levels = NULL) {
    value_cols <- c("copy", "total_reads")
    for (i in seq_len(n_id)) {
      # Locate this marker's rows once instead of re-scanning the whole
      # frame for every pool/level combination.
      rows_i <- which(dat$id == i)
      pool_i <- dat$pool[rows_i]
      level_i <- if (is.null(stratum)) NULL else dat[[stratum]][rows_i]
      for (k in 1:6) {
        in_pool <- pool_i == k
        strata <- if (is.null(stratum)) {
          list(rows_i[in_pool])
        } else {
          lapply(stratum_levels, function(l) rows_i[in_pool & level_i == l])
        }
        for (rows in strata) {
          n_rows <- length(rows)
          # Skip empty strata: 1:0 expands to c(1, 0) and would index a
          # row that does not exist.
          if (n_rows == 0L) next
          dat[rows, value_cols] <- dat[rows, value_cols][sample.int(n_rows), ]
        }
      }
    }
    dat
  }

  # ---------------- job parameters -----------------------------------------

  args <- commandArgs(trailingOnly = TRUE)
  # Called gam_label (not gam) so it cannot be confused with mgcv::gam().
  gam_label <- args[1]

  j <- as.numeric(Sys.getenv("SGE_TASK_ID"))
  step <- as.numeric(Sys.getenv("SGE_TASK_STEPSIZE"))
  if (is.na(j) || is.na(step)) {
    stop("SGE_TASK_ID and SGE_TASK_STEPSIZE must be set to numeric values",
         call. = FALSE)
  }
  cat("Starting run j = ",j,"\n")

  # ---------------- libraries ----------------------------------------------

  library(mgcv)
  library(multcomp)

  # ---------------- Prepare human gseq data --------------------------------

  RH_human <- read.table("RH_human_gseq.txt", header = TRUE, sep = "\t",
                         stringsAsFactors = FALSE)

  # Code below is to get rid of up and down ramps for copy number changes.
  # Should not be used here, where doing P val analyses.
  # # Get rows at beginning of each chromosome:
  # RH_human_start <- RH_human[RH_human$posS == 0 & RH_human$posE == 1e6,]

  # # Get rid of ramp ups and ramp downs:
  # RH_human <- RH_human[c(0,diff(RH_human$pos)) == 1e4,]

  # # combine RH_human without ramps and RH_human_start:
  # RH_human <- rbind(RH_human_start,RH_human)

  # Sort by chromosome (chr1..chr22, chrX, chrY) and then by position.
  chrOrder <- paste("chr", c(1:22, "X", "Y"), sep = "")
  RH_human$Chromosome <- factor(RH_human$Chromosome, levels = chrOrder)
  RH_human <- RH_human[order(RH_human$Chromosome, RH_human$pos), ]
  RH_human$Chromosome <- as.character(RH_human$Chromosome)

  # # Transform chr1 etc. to numbers
  # RH_human$Chromosome <- gsub('chr', '', RH_human$Chromosome)
  # RH_human[RH_human$Chromosome == "X","Chromosome"] <- 23
  # RH_human[RH_human$Chromosome == "Y","Chromosome"] <- 24
  # chrOrder<-c(1:24)
  # RH_human$Chromosome <-factor(RH_human$Chromosome, levels=chrOrder)
  # RH_human <- RH_human[order(RH_human$Chromosome, RH_human$pos), ]
  # RH_human$Chromosome <- as.numeric(RH_human$Chromosome)

  # # Compute chromosome size
  # gen_coord <- aggregate(pos~Chromosome,FUN=max,data=RH_human)
  # colnames(gen_coord)[2] <- "chr_size"
  # gen_coord$Chromosome <-factor(gen_coord$Chromosome, levels=chrOrder)
  # gen_coord <- gen_coord[order(gen_coord$Chromosome), ]
  # gen_coord$Chromosome <- as.numeric(gen_coord$Chromosome)

  # # Use cumsum to make genome coordinates
  # gen_coord$coord <- c(0,cumsum(gen_coord$chr_size)[-24])

  # # merge genome coordinates with RH_human
  # RH_human <- merge(RH_human,gen_coord[,c("Chromosome","coord")])
  # RH_human$Chromosome <-factor(RH_human$Chromosome, levels=chrOrder)
  # RH_human <- RH_human[order(RH_human$Chromosome, RH_human$pos), ]
  # RH_human$Chromosome <- as.numeric(RH_human$Chromosome)

  # RH_human$coord <- RH_human$pos + RH_human$coord

  # get rid of chrY, because no chrY seq in hamster genome
  RH_human <- RH_human[RH_human$Chromosome != "chrY", ]

  # # Get rid of unneeded coord column at end of RH_human
  # RH_human <- RH_human[,-ncol(RH_human)]

  # ---------------- Read in and prepare ancillary tables -------------------

  cell <- read.table("cell_label_info.txt", sep = "\t",
                     stringsAsFactors = FALSE, header = TRUE)
  # Per-sample read totals over all bins; columns 5 onward are the samples.
  sum_reads <- colSums(RH_human[, 5:ncol(RH_human)])
  # uses mapped human reads, cf human_AIC_1.R
  reads <- read.table("RH_pool_human_total_align.txt", sep = "\t",
                      stringsAsFactors = FALSE, header = TRUE)

  # ---------------- Take chunk of human data and prepare for gam loop ------

  if (j < 1 || j > nrow(RH_human)) {
    stop("SGE_TASK_ID (", j, ") is outside the ", nrow(RH_human),
         " rows of RH_human", call. = FALSE)
  }
  # Clamp the end of the chunk so a final, shorter chunk does not pull in
  # all-NA rows from beyond the end of the table.
  last_row <- min(j + step - 1, nrow(RH_human))
  RH_human_sub <- RH_human[j:last_row, ]

  ptm <- proc.time()

  # Wide -> long: one row per (bin, sample). reshape() adds an `id` column
  # numbering the rows of the chunk 1..nrow(RH_human_sub).
  sample_cols <- colnames(RH_human_sub)[5:ncol(RH_human_sub)]
  RH_human_sub_l <- reshape(
    RH_human_sub,
    varying = sample_cols,
    v.names = "copy",
    timevar = "RH_ID",
    times = sample_cols,
    # Sized from the data rather than a fixed 1:1e6, so larger chunks work.
    new.row.names = seq_len(nrow(RH_human_sub) * length(sample_cols)),
    direction = "long"
  )

  # Decode week, concentration and pool from the sample ID. The columns are
  # added in this order on purpose: a column is renamed by position below.
  weeks <- c(0, 1, 2, 3, 4, 6)
  concs <- c(0, 8, 25, 75)
  pools <- 1:6
  sample_id <- RH_human_sub_l$RH_ID
  RH_human_sub_l$week <- label_from_id(sample_id, paste0("_w", weeks, "_"), weeks)
  RH_human_sub_l$conc <- label_from_id(sample_id, paste0("_d", concs), concs)
  RH_human_sub_l$pool <- label_from_id(sample_id, paste0("RH", pools, "_"), pools)

  RH_human_sub_l <- merge(RH_human_sub_l, cell)
  RH_human_sub_l$sum_reads <- sum_reads[RH_human_sub_l$RH_ID]
  RH_human_sub_l <- merge(RH_human_sub_l, reads[, c(1:5, 9)])
  # NOTE(review): positional rename - assumes column 13 after the two merges
  # is the total-read column taken from `reads`; confirm against the input
  # files if their layout changes.
  colnames(RH_human_sub_l)[13] <- "total_reads"

  # Factors are required for the s(..., bs = "re") random-effect terms.
  RH_human_sub_l$pool <- as.factor(RH_human_sub_l$pool)
  RH_human_sub_l$cell <- as.factor(RH_human_sub_l$cell)

  ngroup <- max(RH_human_sub_l$id)

  # ---------------- Permutations -------------------------------------------
  # Separate permuted data sets for the growth, paclitaxel and Ix tests.

  # shuffle within pools and paclitaxel conc for growth
  RH_human_sub_l_growth <- shuffle_within_strata(RH_human_sub_l, ngroup,
                                                 "conc", concs)

  # shuffle within pools and weeks growth for paclitaxel
  RH_human_sub_l_paclitaxel <- shuffle_within_strata(RH_human_sub_l, ngroup,
                                                     "week", weeks)

  # shuffle within pools for Ix
  RH_human_sub_l_Ix <- shuffle_within_strata(RH_human_sub_l, ngroup)

  # # shuffle ignoring pool index
  # nExpts <- nrow(RH_human_sub_l[RH_human_sub_l$id==1,])
  # for (i in 1: ngroup) {
  # RH_human_sub_l[RH_human_sub_l$id == i,c("copy","total_reads")]	<- RH_human_sub_l[RH_human_sub_l$id == i,c("copy","total_reads")][sample(c(1:nExpts)),]
  # }

  # ---------------- Fit models and collect statistics ----------------------

  # One row per bin: 4 locus columns, 12 -log10 P columns, 12 coefficients.
  log10p_raw_sub <- data.frame(
    Chromosome = character(),
    posS = integer(),
    posE = integer(),
    pos = numeric(),
    log10p_g_0nM = numeric(),
    log10p_g_8nM = numeric(),
    log10p_g_25nM = numeric(),
    log10p_g_75nM = numeric(),
    log10p_g_avg = numeric(),
    log10p_d_w1 = numeric(),
    log10p_d_w2 = numeric(),
    log10p_d_w3 = numeric(),
    log10p_d_w4 = numeric(),
    log10p_d_w6 = numeric(),
    log10p_d_avg = numeric(),
    log10p_g_d_Ix = numeric(),
    coef_g_0nM = numeric(),
    coef_g_8nM = numeric(),
    coef_g_25nM = numeric(),
    coef_g_75nM = numeric(),
    coef_g_avg = numeric(),
    coef_d_w1 = numeric(),
    coef_d_w2 = numeric(),
    coef_d_w3 = numeric(),
    coef_d_w4 = numeric(),
    coef_d_w6 = numeric(),
    coef_d_avg = numeric(),
    coef_g_d_Ix = numeric(),
    stringsAsFactors = FALSE
  )

  for (i in seq_len(ngroup)) {
    cat("i = ",i,"/",ngroup, "\n")
    # A failed fit is reported and skipped so one bad bin does not abort the
    # whole chunk; its row is simply left unfilled.
    tryCatch({

      m1_growth <- gam(
        copy ~ week * conc + s(pool, bs = "re") + s(cell, bs = "re") +
          offset(log(total_reads)),
        data = RH_human_sub_l_growth[RH_human_sub_l_growth$id == i, ],
        family = nb, method = "REML"
      )

      m1_paclitaxel <- gam(
        copy ~ week * conc + s(pool, bs = "re") + s(cell, bs = "re") +
          offset(log(total_reads)),
        data = RH_human_sub_l_paclitaxel[RH_human_sub_l_paclitaxel$id == i, ],
        family = nb, method = "REML"
      )

      m1_Ix <- gam(
        copy ~ week * conc + s(pool, bs = "re") + s(cell, bs = "re") +
          offset(log(total_reads)),
        data = RH_human_sub_l_Ix[RH_human_sub_l_Ix$id == i, ],
        family = nb, method = "REML"
      )

      # Growth effect at each concentration; 27 is the mean of 0, 8, 25, 75.
      glht_growth <- glht(m1_growth, linfct = c(
        "week == 0",
        "week + 8*week:conc == 0",
        "week + 25*week:conc == 0",
        "week + 75*week:conc == 0",
        "week + (27)*week:conc == 0"))

      # Drug effect at each week; 3.2 is the mean of 1, 2, 3, 4, 6.
      glht_drug <- glht(m1_paclitaxel, linfct = c(
        "conc + 1*week:conc == 0",
        "conc + 2*week:conc == 0",
        "conc + 3*week:conc == 0",
        "conc + 4*week:conc == 0",
        "conc + 6*week:conc == 0",
        "conc + (3.2)*week:conc == 0"))

      glht_omni <- glht(m1_Ix)

      # Each summary is computed once and reused for statistic and estimate.
      growth_test <- summary(glht_growth, test = adjusted("none"))$test
      drug_test <- summary(glht_drug, test = adjusted("none"))$test
      Ix_test <- summary(glht_omni, test = adjusted("none"))$test

      growth_stat <- growth_test$tstat
      drug_stat <- drug_test$tstat
      Ix_stat <- Ix_test$tstat["week:conc"]
      growth_coef <- growth_test$coefficients
      drug_coef <- drug_test$coefficients
      Ix_coef <- Ix_test$coefficients["week:conc"]

      # Two-sided P values from the normal distribution, as -log10.
      log10p_raw_sub[i, ] <- c(
        RH_human_sub[i, 1:4],
        -log10(2 * pnorm(-abs(growth_stat))),
        -log10(2 * pnorm(-abs(drug_stat))),
        -log10(2 * pnorm(-abs(Ix_stat))),
        growth_coef,
        drug_coef,
        Ix_coef
      )

    }, error = function(e) {cat ("Error on line ", i, ": ", conditionMessage(e),"\n")})
  }

  # Too cluttered to write full data file
  # write.table(log10p_raw_sub,paste0("/u/flashscratch/d/desmond/human_shuff_gam_", gam_label, "_batch_", (((j-1)/step)+1), ".txt"),quote=FALSE, sep="\t",row.names=FALSE)

  # Maximum of each -log10 P column (columns 5 to 16) over the chunk.
  human_max_shuff <- vapply(log10p_raw_sub[, 5:16], max, numeric(1),
                            na.rm = TRUE)

  write.table(
    human_max_shuff,
    paste0("/u/flashscratch/d/desmond/human_max_shuff_gam_", gam_label, "_batch_62.txt"),
    quote = FALSE, sep = "\t", row.names = TRUE, col.names = FALSE
  )

  print(proc.time() - ptm)

}

# Entry point: run the permutation job as soon as this script is executed.
main()

# afterwards, process log10p_raw_sub files in R in Hoffman using procHumanBatchShuff_1.R