# ----------- Find -log10P values for hamster genome segments using Hoffman2 cluster ----------
# -------------------------- Post-process to put all segments together ------------------------



# --------------------------------------- Background ------------------------------------------


# Create directory Output in Hoffman to accept informational output from runJobs.com
# run job as:
# qsub -cwd -V -N PJ -l h_data=4G,h_rt=04:00:00 -M eplau -m n -t 1-215001:5000 runJobs.com 
# Maxvmem = 2.066G. Wallclock Time   = 36.5 min.




# make sure mgcv and multcomp are loaded into appropriate R version, eg R/3.5.0
# in R use install.packages('package_name', dependencies=TRUE)
# CA server did not work, but IA did
# cf https://www.hoffman2.idre.ucla.edu/software/r/
# 'module load R/3.5.0' before running qsub


# Main output dataframes are placed in the same directory in which runJobs.com is run.
# afterwards, process in R using Post_Hoffman_process_1.R or similar
# cf http://www.maths.lancs.ac.uk/~rowlings/HPC/RJobs/
# https://www.ccn.ucla.edu/wiki/index.php/Hoffman2:Submitting_Jobs





# dim(RH_hamster) ## after removal of chrY
# # [1] 217580    119
# 217580/5e3
# # [1] 43.516
# 43*5e3
# # [1] 215000 # maximum row before administering final step by hand, which goes from 215001 to 217580, with output log10p_raw_sub_44.txt




# ------------- Begin here ---------------


# Array-job parameters taken from the environment set up by the scheduler.
# Further down, j is used as the first row of this task's chunk of RH_hamster
# and step as the number of rows in the chunk. Both are NA when the
# corresponding environment variable is not set.
sge_env <- Sys.getenv(c("SGE_TASK_ID", "SGE_TASK_STEPSIZE"))
j <- as.numeric(sge_env[["SGE_TASK_ID"]])
step <- as.numeric(sge_env[["SGE_TASK_STEPSIZE"]])
cat("Starting run j = ", j, "\n")


 
# ---------------- libraries -----------------------------------------------

library(mgcv)
library(multcomp)

#----------------- Prepare hamster gseq data ---------------------


# Load the hamster segment table: tab-delimited with a header row; text
# columns are kept as character vectors rather than converted to factors.
RH_hamster <- read.table(
  file = "RH_hamster_gseq.txt",
  header = TRUE,
  sep = "\t",
  stringsAsFactors = FALSE
)

# Code below is to get rid of up and down ramps for copy number changes. 
# Should not be used here, where doing P val analyses.
# # Get rows at beginning of each chromosome:
# RH_hamster_start <- RH_hamster[RH_hamster$posS == 0 & RH_hamster$posE == 1e6,]

# # Get rid of ramp ups and ramp downs:
# RH_hamster <- RH_hamster[c(0,diff(RH_hamster$pos)) == 1e4,]

# # combine RH_hamster without ramps and RH_hamster_start:
# RH_hamster <- rbind(RH_hamster_start,RH_hamster)


# Sort rows by chromosome (chr1..chr10, chrX, chrY, in that order) and then by
# position within each chromosome.
chrOrder <- paste0("chr", c(1:10, "X", "Y"))
chr_key <- factor(RH_hamster$Chromosome, levels = chrOrder)
row_order <- order(chr_key, RH_hamster$pos)
RH_hamster <- RH_hamster[row_order, ]
# Store the chromosome as character again. Because it goes through the factor,
# any chromosome name that is not listed in chrOrder comes back as NA.
RH_hamster$Chromosome <- as.character(chr_key[row_order])


# # Transform chr1 etc. to numbers
# RH_hamster$Chromosome <- gsub('chr', '', RH_hamster$Chromosome)
# RH_hamster[RH_hamster$Chromosome == "X","Chromosome"] <- 23
# RH_hamster[RH_hamster$Chromosome == "Y","Chromosome"] <- 24
# chrOrder<-c(1:24)
# RH_hamster$Chromosome <-factor(RH_hamster$Chromosome, levels=chrOrder)
# RH_hamster <- RH_hamster[order(RH_hamster$Chromosome, RH_hamster$pos), ]
# RH_hamster$Chromosome <- as.numeric(RH_hamster$Chromosome)

# # Compute chromosome size
# gen_coord <- aggregate(pos~Chromosome,FUN=max,data=RH_hamster)
# colnames(gen_coord)[2] <- "chr_size"
# gen_coord$Chromosome <-factor(gen_coord$Chromosome, levels=chrOrder)
# gen_coord <- gen_coord[order(gen_coord$Chromosome), ]
# gen_coord$Chromosome <- as.numeric(gen_coord$Chromosome)

# # Use cumsum to make genome coordinates
# gen_coord$coord <- c(0,cumsum(gen_coord$chr_size)[-24])

# # merge genome coordinates with RH_hamster
# RH_hamster <- merge(RH_hamster,gen_coord[,c("Chromosome","coord")])
# RH_hamster$Chromosome <-factor(RH_hamster$Chromosome, levels=chrOrder)
# RH_hamster <- RH_hamster[order(RH_hamster$Chromosome, RH_hamster$pos), ]
# RH_hamster$Chromosome <- as.numeric(RH_hamster$Chromosome)

# RH_hamster$coord <- RH_hamster$pos + RH_hamster$coord

# Drop the chrY rows (there is no chrY sequence in the hamster genome) and, in
# the same subsetting call, the first column (Contig_ID).
RH_hamster <- RH_hamster[RH_hamster$Chromosome != "chrY", -1]


# # Get rid of unneeded coord column at end of RH_hamster
# RH_hamster <- RH_hamster[,-ncol(RH_hamster)]


# ------------------ Read in and prepare ancillary tables -------------------------------

# Cell label table; merged into the long-format chunk further down.
cell <- read.table(
  file = "cell_label_info.txt",
  header = TRUE,
  sep = "\t",
  stringsAsFactors = FALSE
)

# Alignment totals; uses mapped hamster reads, cf hamster_AIC_1.R
reads <- read.table(
  file = "RH_pool_hamster_total_align.txt",
  header = TRUE,
  sep = "\t",
  stringsAsFactors = FALSE
)

# Column sums over columns 5..ncol of RH_hamster, computed on the whole table
# (not just this task's chunk). The result is a vector named by column name.
sum_reads <- colSums(RH_hamster[, 5:ncol(RH_hamster)])


# -------- Take chunk of hamster data and prepare for gam loop ---------------------------

# Rows of RH_hamster handled by this task: `step` rows starting at row j.
# The last chunk is cut off at the end of the table; indexing past the last
# row would otherwise add all-NA rows that each fail in the model loop.
n_segments <- nrow(RH_hamster)
if (is.na(j) || is.na(step) || j < 1 || step < 1 || j > n_segments) {
  stop(
    "Invalid chunk: j = ", j, ", step = ", step,
    ", rows available = ", n_segments,
    call. = FALSE
  )
}
chunk_end <- min(j + step - 1, n_segments)
RH_hamster_sub <- RH_hamster[j:chunk_end, ]


ptm <- proc.time()

# Wide -> long: columns 5..ncol become one row each, with the column name in
# RH_ID and the value in copy. reshape() also adds an `id` column numbering
# the rows of RH_hamster_sub (1, 2, ...), which the model loop uses to select
# one segment at a time.
sample_cols <- colnames(RH_hamster_sub)[5:ncol(RH_hamster_sub)]
RH_hamster_sub_l <- reshape(RH_hamster_sub,
  varying = sample_cols,
  v.names = "copy",
  timevar = "RH_ID",
  times = sample_cols,
  # Plain 1..N row names, sized to the actual data instead of a fixed 1e6.
  new.row.names = seq_len(nrow(RH_hamster_sub) * length(sample_cols)),
  direction = "long")

# Decode week, concentration and pool from substrings of RH_ID. Rows matching
# none of the patterns keep the default 0. Assigning through a logical index
# on the column is a no-op when a pattern matches nothing, so a label that is
# absent from the data does not raise an error.
RH_hamster_sub_l$week <- 0
for (wk in c(0, 1, 2, 3, 4, 6)) {
  hit <- grepl(paste0("_w", wk, "_"), RH_hamster_sub_l$RH_ID)
  RH_hamster_sub_l$week[hit] <- wk
}

RH_hamster_sub_l$conc <- 0
for (dose in c(0, 8, 25, 75)) {
  hit <- grepl(paste0("_d", dose), RH_hamster_sub_l$RH_ID)
  RH_hamster_sub_l$conc[hit] <- dose
}

RH_hamster_sub_l$pool <- 0
for (pool_no in 1:6) {
  hit <- grepl(paste0("RH", pool_no, "_"), RH_hamster_sub_l$RH_ID)
  RH_hamster_sub_l$pool[hit] <- pool_no
}


# Attach the cell labels; merge() joins on the column names the two tables
# have in common.
RH_hamster_sub_l <- merge(RH_hamster_sub_l, cell)
# Look up each row's column total by its RH_ID (sum_reads is named by column).
RH_hamster_sub_l$sum_reads <- sum_reads[RH_hamster_sub_l$RH_ID]
# Attach columns 1-5 and 9 of the alignment table, again joining on the
# shared column names.
RH_hamster_sub_l <- merge(RH_hamster_sub_l, reads[, c(1:5, 9)])
# NOTE(review): positional rename; presumably column 13 is the 9th column of
# `reads` after the merge. Confirm if either input table changes layout.
colnames(RH_hamster_sub_l)[13] <- "total_reads"


# pool and cell enter the model as random effects, which need factors.
RH_hamster_sub_l$pool <- as.factor(RH_hamster_sub_l$pool)
RH_hamster_sub_l$cell <- as.factor(RH_hamster_sub_l$cell)




# Empty result table, one row per segment to be filled in by the model loop:
# four segment-coordinate columns, then a -log10 P column and a coefficient
# column for each of the twelve tested effects (growth at each concentration
# and on average, drug at each week and on average, and the interaction).
effect_labels <- c(
  "g_0nM", "g_8nM", "g_25nM", "g_75nM", "g_avg",
  "d_w1", "d_w2", "d_w3", "d_w4", "d_w6", "d_avg",
  "g_d_Ix"
)
stat_col_names <- c(
  paste0("log10p_", effect_labels),
  paste0("coef_", effect_labels)
)
empty_cols <- c(
  list(
    Chromosome = character(),
    posS = integer(),
    posE = integer(),
    pos = numeric()
  ),
  setNames(rep(list(numeric()), length(stat_col_names)), stat_col_names)
)
log10p_raw_sub <- do.call(
  data.frame,
  c(empty_cols, list(stringsAsFactors = FALSE))
)


# Number of segments in this chunk. The `id` column numbers the rows of
# RH_hamster_sub, so id i corresponds to RH_hamster_sub[i, ].
ngroup <- max(RH_hamster_sub_l$id)


# Two-sided normal P value for statistic z, returned as -log10(P).
# Computed on the log scale so that very large |z| gives a finite value
# rather than Inf when the P value underflows double precision. cf
# https://stackoverflow.com/questions/46416027/how-to-compute-p-values-from-z-scores-in-r-when-the-z-score-is-large-pvalue-muc
neg_log10_p <- function(z) {
  -(log(2) + pnorm(abs(z), lower.tail = FALSE, log.p = TRUE)) / log(10)
}


# Fit one model per segment and store its statistics in row i of
# log10p_raw_sub. If anything fails for a segment, the error is reported and
# row i is not assigned: it is left as an all-NA row when a later segment
# succeeds, and is absent from the output when no later segment does.
for (i in seq_len(ngroup)) {
  cat("i = ",i,"/",ngroup, "\n")
  tryCatch({

    # Negative binomial GAM: week, conc and their interaction as linear terms,
    # random effects for pool and cell, offset log(total_reads).
    m1 <- gam(
      copy ~ week * conc + s(pool, bs = "re") + s(cell, bs = "re") + offset(log(total_reads)),
      data = subset(RH_hamster_sub_l, RH_hamster_sub_l$id == i),
      family = nb,
      method = "REML"
    )

    # Growth (week slope) at conc 0, 8, 25 and 75, and at 27, the mean of
    # those four concentrations.
    glht_growth <- glht(m1, linfct = c(
      "week == 0",
      "week + 8*week:conc == 0",
      "week + 25*week:conc == 0",
      "week + 75*week:conc == 0",
      "week + (27)*week:conc == 0"))

    # Drug (conc slope) at weeks 1, 2, 3, 4 and 6, and at 3.2, the mean of
    # those five weeks.
    glht_drug <- glht(m1, linfct = c(
      "conc + 1*week:conc == 0",
      "conc + 2*week:conc == 0",
      "conc + 3*week:conc == 0",
      "conc + 4*week:conc == 0",
      "conc + 6*week:conc == 0",
      "conc + (3.2)*week:conc == 0"))

    # glht without linfct; only the "week:conc" entry is used below.
    glht_omni <- glht(m1)

    # Unadjusted tests; each summary is computed once and reused for both the
    # test statistics and the estimates.
    growth_test <- summary(glht_growth, test = adjusted("none"))$test
    drug_test <- summary(glht_drug, test = adjusted("none"))$test
    omni_test <- summary(glht_omni, test = adjusted("none"))$test

    growth_stat <- growth_test$tstat
    drug_stat <- drug_test$tstat
    Ix_stat <- omni_test$tstat["week:conc"]
    growth_coef <- growth_test$coefficients
    drug_coef <- drug_test$coefficients
    Ix_coef <- omni_test$coefficients["week:conc"]

    # Row layout: 4 coordinate columns, 5 + 6 + 1 -log10 P values, then
    # 5 + 6 + 1 coefficients, matching the columns of log10p_raw_sub.
    log10p_raw_sub[i, ] <- c(
      RH_hamster_sub[i, 1:4],
      neg_log10_p(growth_stat),
      neg_log10_p(drug_stat),
      neg_log10_p(Ix_stat),
      growth_coef,
      drug_coef,
      Ix_coef
    )

  }, error = function(e) {
    cat ("Error on line ", i, ": ", conditionMessage(e),"\n")
  })
}


# Output file index derived from the first row of the chunk: 1 for the chunk
# starting at row 1, 2 for the chunk starting at row step + 1, and so on.
chunk_index <- ((j - 1) / step) + 1
out_path <- paste0("/u/flashscratch/d/desmond/log10p_raw_sub_", chunk_index, ".txt")

# Tab-delimited, unquoted, with a header row and without row names.
write.table(
  log10p_raw_sub,
  file = out_path,
  quote = FALSE,
  sep = "\t",
  row.names = FALSE
)


# Time elapsed since ptm was recorded.
print(proc.time() - ptm)


# afterwards, process log10p_raw_sub files in R in Hoffman using Post_Hoffman_process_1.R