# Used median values because CEN caused strong right skewed outliers


library(ggplot2)
library(cowplot)

# for summarySE()
library(Rmisc)


library(lme4)
library(multcomp)
library(sjstats)
library(performance)
library(insight)
library(emmeans)

# from https://stackoverflow.com/questions/8197559/emulate-ggplot2-default-color-palette
# function for default ggplot2 colors

# Emulate the default ggplot2 discrete colour palette.
#
# Args:
#   n: number of colours wanted (non-negative whole number).
#
# Returns:
#   Character vector of n hex colour strings; character(0) when n is 0.
gg_color_hue <- function(n) {
  # n + 1 evenly spaced hue angles from 15 to 375; the last angle (375) is the
  # first one (15) plus a full turn, so only the first n are kept.
  hues <- seq(15, 375, length.out = n + 1)
  # seq_len(n) rather than 1:n: for n = 0, 1:0 is c(1, 0) and would select one
  # colour instead of none.
  hcl(h = hues, l = 65, c = 100)[seq_len(n)]
}

# Shared ggplot2 theme: no grid lines or panel background, thin black axis
# lines and ticks, legend hidden (legend.position = "none").
theme2 <- theme(
  plot.margin = unit(c(t = 1.3, r = 0.7, b = 1.3, l = 0.7), "cm"),
  panel.grid.major = element_blank(),
  panel.grid.minor = element_blank(),
  panel.background = element_blank(),
  legend.position = "none",
  axis.line.x = element_line(colour = "black", size = 0.1),
  axis.line.y = element_line(colour = "black", size = 0.1),
  axis.ticks = element_line(colour = "black", size = 0.1),
  # tick-mark labels on both axes
  axis.text = element_text(size = 12),
  # x and y axis titles
  axis.title = element_text(size = 14),
  # margin() is (top, right, bottom, left): 13 pt to the right of the y title
  axis.title.y = element_text(margin = margin(0, 13, 0, 0)),
  # 10 pt above the x title
  axis.title.x = element_text(margin = margin(10, 0, 0, 0)),
  # panel letters ("A", "B") could be supplied via ggtitle(), but plot_grid()
  # labels were used instead because they can be shifted further left
  plot.title = element_text(size = 32, face = "bold", hjust = -0.14),
  # hjust = 0.5 centres the subtitle (larger values shift it right)
  plot.subtitle = element_text(size = 14, face = "plain", hjust = 0.5),
  legend.margin = margin(t = 15, r = 0, b = 0, l = -5, unit = "pt"),
  legend.box.margin = margin(t = 15, r = -10, b = 0, l = -5, unit = "pt"),
  legend.key.height = unit(0.1, "cm"),
  legend.key.width = unit(0.3, "cm"),
  legend.spacing.y = unit(0.1, "cm"),
  legend.spacing.x = unit(0.1, "cm"),
  legend.title = element_text(size = 9),
  legend.text = element_text(size = 8)
  # legend.title.align=0.0
)


# Plot size constants; presumably point size and horizontal-line width for
# plots further down the script (not used in this section).
size_point <- 0.3
size_hline <- 0.1


# # --------- Combine human and hamster align files to deduce human retention frequency from bulk seq reads -----------------


# Per-sample alignment counts against the human genome (tab-delimited with a
# header row; one row per RH sample, keyed by RH_ID).
human <- read.table(
  file = "RH_pool_human_total_align.txt",
  header = TRUE,
  sep = "\t",
  stringsAsFactors = FALSE
)

dim(human)
# [1] 115   10

head(human)

      # RH_ID pool week conc cell Total_reads human_unaligned human_aligned human_aligned_and_hamster_unaligned
# 1 RH1_w0_d0    1    0    0    1    42000172        41601530        398642                              298433
# 2 RH1_w1_d0    1    1    0    2    47971567        47496634        474933                              372475
# 3 RH1_w2_d0    1    2    0    2    38276132        37777814        498318                              310062
# 4 RH1_w3_d0    1    3    0    2    34816772        34444880        371892                              288187
# 5 RH1_w4_d0    1    4    0    2    36328023        35818437        509586                              326399
# 6 RH1_w6_d0    1    6    0    2    50327669        49691822        635847                              527642
  # human_aligned_and_hamster_aligned
# 1                            100209
# 2                            102458
# 3                            188256
# 4                             83705
# 5                            183187
# 6                            108205


# Per-sample alignment counts against the hamster genome (same layout as the
# human table: tab-delimited, header row, keyed by RH_ID).
hamster <- read.table(
  file = "RH_pool_hamster_total_align.txt",
  header = TRUE,
  sep = "\t",
  stringsAsFactors = FALSE
)

dim(hamster)
# [1] 115   10

head(hamster)
      # RH_ID pool week conc cell Total_reads hamster_unaligned hamster_aligned hamster_aligned_and_human_unaligned
# 1 RH1_w0_d0    1    0    0    1    42000172           4304785        37695387                            37595178
# 2 RH1_w1_d0    1    1    0    2    47971567           4982337        42989230                            42886773
# 3 RH1_w2_d0    1    2    0    2    38276132           3876901        34399231                            34210971
# 4 RH1_w3_d0    1    3    0    2    34816772           3585610        31231162                            31147459
# 5 RH1_w4_d0    1    4    0    2    36328023           3623841        32704182                            32520994
# 6 RH1_w6_d0    1    6    0    2    50327669           5325221        45002448                            44894243
  # hamster_aligned_and_human_aligned
# 1                            100209
# 2                            102457
# 3                            188260
# 4                             83703
# 5                            183188
# 6                            108205

# Pair, per sample (RH_ID), the reads that align to exactly one of the two
# genomes: hamster-only reads and human-only reads.
Human_retent <- merge(
  x = hamster[, c("RH_ID", "hamster_aligned_and_human_unaligned")],
  y = human[, c("RH_ID", "human_aligned_and_hamster_unaligned")],
  by = "RH_ID"
)

# Retention = 2 * human-only reads / hamster-only reads.
# Factor of 2 because the human fragments are haploid and A23 is diploid.
Human_retent$human_retent <- with(
  Human_retent,
  2 * human_aligned_and_hamster_unaligned / hamster_aligned_and_human_unaligned
)


dim(Human_retent)
# [1] 115   4


head(Human_retent)
       # RH_ID hamster_aligned_and_human_unaligned human_aligned_and_hamster_unaligned human_retent
# 1  RH1_w0_d0                            37595178                              298433  0.015876132
# 2  RH1_w1_d0                            42886773                              372475  0.017370157
# 3 RH1_w1_d25                            49651729                              321998  0.012970263
# 4 RH1_w1_d75                            59267597                              247321  0.008345909
# 5  RH1_w1_d8                            55742797                              425169  0.015254670
# 6  RH1_w2_d0                            34210971                              310062  0.018126466

# # -------- Correct human retention because different lengths human and hamster genome. ----------
# # -------- Human and hamster genome lengths including mitochondrion from A23_HEK_mito_copy_num_1.R --------------

# Genome lengths (including mitochondrion) of the human and hamster
# references; picr is presumably the hamster assembly name.
hg38 <- 3088286401
picr <- 2368939474

# Rescale the read-count ratio by hamster/human genome length, so that the
# retention is not biased by the human genome being the longer target.
Human_retent[["corr_human_retent"]] <-
  Human_retent[["human_retent"]] * (picr / hg38)




# --------- Calculate retention of human DNA in RH pools using 1 Mb windows ------------



#----------------- Prepare human retain RH pools ---------------------


# Windowed human read counts for every RH sample (tab-delimited, header row).
RH_human <- read.table(
  file = "RH_human_gseq.txt",
  header = TRUE,
  sep = "\t",
  stringsAsFactors = FALSE
)

# Keep the first full window of each chromosome (0 to 1 Mb) aside, because
# the step filter below necessarily drops it.
RH_human_start <- RH_human[with(RH_human, posS == 0 & posE == 1e6), ]

# Drop ramp-up / ramp-down rows: retain only rows whose pos is exactly 1e4
# beyond the previous row's pos.
RH_human <- RH_human[c(0, diff(RH_human[["pos"]])) == 1e4, ]

# Put the chromosome-start windows back in front of the filtered rows.
RH_human <- rbind(RH_human_start, RH_human)


# Sort rows by chromosome (chr1..chr22, chrX, chrY) and then by position.
chrOrder <- paste0("chr", c(1:22, "X", "Y"))
RH_human$Chromosome <- factor(RH_human$Chromosome, levels = chrOrder)
RH_human <- RH_human[order(RH_human$Chromosome, RH_human$pos), ]
RH_human$Chromosome <- as.character(RH_human$Chromosome)


# Transform chr1 etc. to numbers (X -> 23, Y -> 24).
RH_human$Chromosome <- gsub('chr', '', RH_human$Chromosome)
RH_human[RH_human$Chromosome == "X", "Chromosome"] <- 23
RH_human[RH_human$Chromosome == "Y", "Chromosome"] <- 24
chrOrder <- c(1:24)
RH_human$Chromosome <- factor(RH_human$Chromosome, levels = chrOrder)
RH_human <- RH_human[order(RH_human$Chromosome, RH_human$pos), ]
# as.numeric() on a factor returns the level index; that equals the chromosome
# number here only because the levels are exactly 1:24 in order.
RH_human$Chromosome <- as.numeric(RH_human$Chromosome)

# Compute chromosome size as the largest window position per chromosome.
gen_coord <- aggregate(pos ~ Chromosome, FUN = max, data = RH_human)
colnames(gen_coord)[2] <- "chr_size"
gen_coord$Chromosome <- factor(gen_coord$Chromosome, levels = chrOrder)
gen_coord <- gen_coord[order(gen_coord$Chromosome), ]
gen_coord$Chromosome <- as.numeric(gen_coord$Chromosome)

# Genome-wide offset of each chromosome = summed size of all chromosomes before
# it. Drop the last cumulative sum by position instead of hard-coding [-24], so
# the vector length matches gen_coord even if a chromosome is absent.
gen_coord$coord <- c(0, head(cumsum(gen_coord$chr_size), -1))

# Attach each chromosome's offset to RH_human. merge() re-sorts the rows and
# resets row names, so restore the chromosome/position order afterwards.
RH_human <- merge(RH_human, gen_coord[, c("Chromosome", "coord")], by = "Chromosome")
RH_human$Chromosome <- factor(RH_human$Chromosome, levels = chrOrder)
RH_human <- RH_human[order(RH_human$Chromosome, RH_human$pos), ]
RH_human$Chromosome <- as.numeric(RH_human$Chromosome)

# Genome coordinate = position within chromosome + chromosome offset.
RH_human$coord <- RH_human$pos + RH_human$coord

# Remove chrY (coded 24). Its reads should in principle contribute to chrX, but
# large parts of chrY are non-pseudoautosomal, so keeping it artifactually
# lowers the genome-wide median. There is also no chrY sequence in the hamster
# genome.
RH_human <- RH_human[RH_human$Chromosome != 24, ]


#----------------- Prepare hamster retain RH pools ---------------------

# Windowed hamster read counts for every RH sample (tab-delimited, header row).
RH_hamster <- read.table(
  file = "RH_hamster_gseq.txt",
  header = TRUE,
  sep = "\t",
  stringsAsFactors = FALSE
)

# Keep the first full window of each chromosome (0 to 1 Mb) aside, because
# the step filter below necessarily drops it.
RH_hamster_start <- RH_hamster[with(RH_hamster, posS == 0 & posE == 1e6), ]

# Drop ramp rows (the hamster data has ramp downs, not ramp ups): retain only
# rows whose pos is exactly 1e4 beyond the previous row's pos.
RH_hamster <- RH_hamster[c(0, diff(RH_hamster[["pos"]])) == 1e4, ]

# Put the chromosome-start windows back in front of the filtered rows.
RH_hamster <- rbind(RH_hamster_start, RH_hamster)


# # get rid of contigs with only one entry:
# RH_hamster <- RH_hamster[!(RH_hamster$Contig_ID %in% aggregate(pos ~ Contig_ID, 
          # data = RH_hamster, 
          # FUN = function(x){NROW(x)})[aggregate(pos ~ Contig_ID, 
          # data = RH_hamster, 
          # FUN = function(x){NROW(x)})$pos==1,"Contig_ID"]),]

# Sort rows by chromosome (chr1..chr10, chrX) and then by position.
chrOrder <- paste0("chr", c(1:10, "X"))
RH_hamster$Chromosome <- factor(RH_hamster$Chromosome, levels = chrOrder)
RH_hamster <- RH_hamster[order(RH_hamster$Chromosome, RH_hamster$pos), ]
RH_hamster$Chromosome <- as.character(RH_hamster$Chromosome)


# ----------- calculate seq reads retention ------------

# Chose non-overlapping 1 Mb windows because they seemed more conservative. On the other hand, it is much easier to exclude TK1 precisely using overlapping windows (see below). In the end, decided to go with non-overlapping windows. For seq reads retention, the relative difference in retention between overlapping and non-overlapping windows was negligible (< 1%).


# Per-sample read statistics over non-overlapping 1 Mb windows.
#
# Replaces six copy-pasted loops that located each sample's column with a
# regex substring grep(). Human columns are now addressed by exact name, and
# the hamster lookup is validated so a missing or ambiguous column stops with
# a clear message instead of yielding NA or an obscure error.

# Sample columns of RH_human: column 5 up to the one before the last column.
rh_sample_ids <- colnames(RH_human)[5:(ncol(RH_human) - 1)]

# Every 100th row, i.e. non-overlapping windows; computed once, not per sample.
human_windows <- RH_human[seq(1, nrow(RH_human), 1e2), rh_sample_ids, drop = FALSE]
hamster_rows <- seq(1, nrow(RH_hamster), 1e2)

# Hamster window reads for one sample. Uses the column named exactly like the
# human sample column; if there is none, falls back to a literal substring
# match, as the original lookup did.
hamster_window_reads <- function(sample_id) {
  col <- which(colnames(RH_hamster) == sample_id)
  if (length(col) != 1L) {
    col <- grep(sample_id, colnames(RH_hamster), fixed = TRUE)
  }
  if (length(col) != 1L) {
    stop(
      "Expected exactly one RH_hamster column for sample ", sample_id,
      ", found ", length(col),
      call. = FALSE
    )
  }
  RH_hamster[hamster_rows, col]
}

# Apply a one-sample summary to every sample; returns a numeric vector named
# by sample ID, in RH_human column order.
per_sample <- function(summarise_sample) {
  vapply(rh_sample_ids, summarise_sample, numeric(1))
}

median_human_reads <- per_sample(function(id) median(human_windows[[id]]))

median_hamster_reads <- per_sample(function(id) median(hamster_window_reads(id)))

mean_human_reads <- per_sample(function(id) mean(human_windows[[id]]))

mean_hamster_reads <- per_sample(function(id) mean(hamster_window_reads(id)))


# Non-overlapping 1 Mb windows. Factor of 2 because human frags haploid, A23 diploid.
median_retent_seq <- per_sample(function(id) {
  2 * median(human_windows[[id]] / median(hamster_window_reads(id)))
})

mean_retent_seq <- per_sample(function(id) {
  2 * mean(human_windows[[id]] / mean(hamster_window_reads(id)))
})




# # Overlapping 1 Mb windows. Factor of 2 because human frags haploid, A23 diploid.
# median_retent_seq <- numeric()
# for(i in colnames(RH_human[,c(5:(ncol(RH_human)-1))])) {
	# median_retent_seq[grep(i,colnames(RH_human[,c(5:(ncol(RH_human)-1))]))] <- 2*median(RH_human[,i]/(median(RH_hamster[seq(1,nrow(RH_hamster),1e2),c(grep(i,colnames(RH_hamster)))])))
	# names(median_retent_seq)[grep(i,colnames(RH_human[,c(5:(ncol(RH_human)-1))]))] <- i
	# }
	
	
# mean_retent_seq <- numeric()
# for(i in colnames(RH_human[,c(5:(ncol(RH_human)-1))])) {
	# mean_retent_seq[grep(i,colnames(RH_human[,c(5:(ncol(RH_human)-1))]))] <- 2*mean(RH_human[,i]/(mean(RH_hamster[seq(1,nrow(RH_hamster),1e2),c(grep(i,colnames(RH_hamster)))])))
	# names(mean_retent_seq)[grep(i,colnames(RH_human[,c(5:(ncol(RH_human)-1))]))] <- i
	# }



# Turn a sample-named numeric vector into a two-column data frame:
# <value_col> holding the values and RH_ID holding the former names, with
# row names reset to 1..n.
named_vector_to_frame <- function(values, value_col) {
  out <- as.data.frame(values)
  names(out) <- value_col
  out$RH_ID <- row.names(out)
  row.names(out) <- seq_len(nrow(out))
  out
}

# Read-count summaries.
median_human_reads <- named_vector_to_frame(median_human_reads, "median_human_reads")
median_hamster_reads <- named_vector_to_frame(median_hamster_reads, "median_hamster_reads")
mean_human_reads <- named_vector_to_frame(mean_human_reads, "mean_human_reads")
mean_hamster_reads <- named_vector_to_frame(mean_hamster_reads, "mean_hamster_reads")

# Sequence-read retention estimates.
median_retent_seq <- named_vector_to_frame(median_retent_seq, "median_retent_seq")
mean_retent_seq <- named_vector_to_frame(mean_retent_seq, "mean_retent_seq")



# -------- find middle TK1 -------------

# Gene annotation table (tab-delimited; check.names = FALSE keeps column names
# such as "5utrS" exactly as they appear in the file).
gencode_gtf_ensembl_ucsc <- read.delim("gencode_gtf_ensembl_ucsc_v31.txt",header=TRUE,sep="\t",stringsAsFactors=FALSE,check.names=FALSE)


gencode_gtf_ensembl_ucsc[gencode_gtf_ensembl_ucsc$geneSymbol=="TK1",]
      # Chromosome         gene_id           tx_id geneSymbol strand    geneS    geneE geneLength txLength cdsLength    5utrS    5utrE 5utrDiff    3utrS    3utrE 3utrDiff
# 49401      chr17 ENSG00000167900 ENST00000588734        TK1      - 78174091 78187233      13143     1681       804 78186995 78187233      239 78174121 78174758      638
      # exonCount      gene_type                                       gene_description
# 49401         6 protein_coding thymidine kinase 1 [Source:HGNC Symbol;Acc:HGNC:11830]


# Midpoint of TK1 = mean of gene start and gene end.
# The two values must be combined with c(): in mean(geneS, geneE) the second
# positional argument is `trim`, so the previous call returned geneS (the gene
# start, 78174091) rather than the midpoint (78180662).
is_TK1 <- gencode_gtf_ensembl_ucsc$geneSymbol == "TK1"
TK1_coord <- mean(c(
  gencode_gtf_ensembl_ucsc[is_TK1, "geneS"],
  gencode_gtf_ensembl_ucsc[is_TK1, "geneE"]
))





# ----------- find peak TK1 reads ----------------

# For more accurate search of TK1 peak, restrict search to within delta of TK1, because CEN and TEL becomes larger than TK1 in some samples
# Half-width (bp) of the search region around TK1.
delta_1 <- 1e6

# Windows on chromosome 17 whose pos lies within delta_1 of TK1_coord.
RH_human_TK1_subset <- RH_human[
  with(
    RH_human,
    Chromosome == 17 &
      pos >= TK1_coord - delta_1 &
      pos <= TK1_coord + delta_1
  ),
]

# Highest read count per sample column (column 5 up to the one before last)
# inside that region.
TK1_max <- apply(
  X = RH_human_TK1_subset[, 5:(ncol(RH_human_TK1_subset) - 1)],
  MARGIN = 2,
  FUN = max
)


TK1_max
 # RH1_w0_d0  RH1_w1_d0  RH1_w1_d8 RH1_w1_d25 RH1_w1_d75  RH1_w2_d0  RH1_w2_d8 RH1_w2_d25  RH1_w3_d0  RH1_w3_d8 RH1_w3_d25 RH1_w3_d75  RH1_w4_d0  RH1_w4_d8 RH1_w4_d25 
       # 997       1398       1531       1255        866       1805       1600       1220       1930       1668       1180        247       2370       2052       1237 
# RH1_w4_d75  RH1_w6_d0  RH1_w6_d8 RH1_w6_d25 RH1_w6_d75  RH2_w0_d0  RH2_w1_d0  RH2_w1_d8 RH2_w1_d25 RH2_w1_d75  RH2_w2_d0  RH2_w2_d8 RH2_w2_d25  RH2_w3_d0  RH2_w3_d8 
       # 154       3180       3951       2402        226       2636       3662       3431       3430       2759       3531       4211       1961       3648       3421 
# RH2_w3_d25 RH2_w3_d75  RH2_w4_d0  RH2_w4_d8 RH2_w4_d25  RH2_w6_d0  RH2_w6_d8 RH2_w6_d25 RH2_w6_d75  RH3_w0_d0  RH3_w1_d0  RH3_w1_d8 RH3_w1_d25 RH3_w1_d75  RH3_w2_d0 
      # 3072       5611       4168       4248       5095       6020       6337       4804       5334       3302       2347       2125       2113       2434       2447 
 # RH3_w2_d8 RH3_w2_d25  RH3_w3_d0  RH3_w3_d8 RH3_w3_d25 RH3_w3_d75  RH3_w4_d0  RH3_w4_d8 RH3_w4_d25 RH3_w4_d75  RH3_w6_d0  RH3_w6_d8 RH3_w6_d25 RH3_w6_d75  RH4_w0_d0 
      # 2548       2441       2651       3494       2630       3174       3606       2848       2872       3553       4209       3405       3437       3579       3125 
 # RH4_w1_d0  RH4_w1_d8 RH4_w1_d25 RH4_w1_d75  RH4_w2_d0  RH4_w2_d8 RH4_w2_d25  RH4_w3_d0  RH4_w3_d8 RH4_w3_d25 RH4_w3_d75  RH4_w4_d0  RH4_w4_d8 RH4_w4_d25 RH4_w4_d75 
      # 3902       3917       3704       3637       4935       3396       3437       7809       5048       4157       3626       9062       5209       3544       4127 
 # RH4_w6_d0  RH4_w6_d8 RH4_w6_d25 RH4_w6_d75  RH5_w0_d0  RH5_w1_d0  RH5_w1_d8 RH5_w1_d25  RH5_w2_d0  RH5_w2_d8 RH5_w2_d25  RH5_w3_d0  RH5_w3_d8 RH5_w3_d25 RH5_w3_d75 
     # 12080       7391       4731       3758       2234       2027       2070       2028       2166       2332       2683       3059       2933       3648       3396 
 # RH5_w4_d0  RH5_w4_d8 RH5_w4_d25  RH5_w6_d0  RH5_w6_d8 RH5_w6_d25 RH5_w6_d75  RH6_w0_d0  RH6_w1_d0  RH6_w1_d8 RH6_w1_d25  RH6_w2_d0  RH6_w2_d8 RH6_w2_d25  RH6_w3_d0 
      # 3639       3057       3601       4483       3785       6195       1651       2573       2741       2978       3102       3281       2976       3038       2977 
 # RH6_w3_d8 RH6_w3_d25 RH6_w3_d75  RH6_w4_d0  RH6_w4_d8 RH6_w4_d25  RH6_w6_d0  RH6_w6_d8 RH6_w6_d25 RH6_w6_d75 
      # 3498       3640       4175       3865       3467       4526       3286       4221       4714       5062  



# Genome coordinate of the window holding each sample's TK1 peak (the first
# window with the maximum read count inside the TK1 search region).
TK1_max_coord <- apply(
  X = RH_human_TK1_subset[, 5:(ncol(RH_human_TK1_subset) - 1)],
  MARGIN = 2,
  FUN = function(sample_reads) {
    peak_row <- which.max(sample_reads)
    RH_human_TK1_subset[["coord"]][peak_row]
  }
)






# ------------ calculate retention assuming TK1 is 100% retention ----------------

# Change RH_human by normalizing all 115 expts so that TK1 == 1.0

# Copy of RH_human in which each sample column is divided by that sample's
# TK1 peak read count, so the TK1 peak equals 1.0.
RH_human_TK <- RH_human

sample_idx <- 5:(ncol(RH_human_TK) - 1)
for (k in seq_along(sample_idx)) {
  # The k-th sample column pairs with the k-th entry of TK1_max.
  col <- sample_idx[k]
  RH_human_TK[[col]] <- RH_human_TK[[col]] / TK1_max[k]
}

# For TK1 retention, the relative difference in retention between overlapping and non-overlapping windows was negligible (< 1%).


# So as not to overestimate retention, one could exclude sequences within +/- 3.5e6 of TK1, since the average expected fragment length is 7 Mb. But that would be inconsistent with the retention derived from sequence reads, so TK1 reads are kept (i.e. delta_2 <- 0).
# Non-overlapping 1 Mb windows


# Half-width (bp) of the region around each sample's TK1 peak to exclude;
# 0 keeps every window.
delta_2 <- 0

# Non-overlapping 1 Mb windows: every 100th row of the TK1-normalised table.
TK_windows <- RH_human_TK[seq(1, nrow(RH_human_TK), 1e2), ]

# Summarise one sample's normalised reads over the windows that lie at least
# delta_2 away from that sample's TK1 peak coordinate.
retent_outside_TK1 <- function(sample_id, summarise) {
  peak <- TK1_max_coord[sample_id]
  keep <- TK_windows[, "coord"] <= (peak - delta_2) |
    TK_windows[, "coord"] >= (peak + delta_2)
  summarise(TK_windows[keep, sample_id])
}

median_retent_TK1 <- vapply(
  names(TK1_max), retent_outside_TK1, numeric(1),
  summarise = median
)

mean_retent_TK1 <- vapply(
  names(TK1_max), retent_outside_TK1, numeric(1),
  summarise = mean
)





# Overlapping 1 Mb windows

# median_retent_TK1 <- vector()


# for (i in names(TK1_max)) {
	# median_retent_TK1[i] <- median(RH_human_TK[RH_human_TK[,"coord"] <= (TK1_max_coord[i] - delta_2) | RH_human_TK[,"coord"] >= (TK1_max_coord[i] + delta_2),i])
# }



# mean_retent_TK1 <- vector()


# for (i in names(TK1_max)) {
	# mean_retent_TK1[i] <- mean(RH_human_TK[RH_human_TK[,"coord"] <= (TK1_max_coord[i] - delta_2) | RH_human_TK[,"coord"] >= (TK1_max_coord[i] + delta_2),i])
# }



# -------------------------------------------------------
# -------- Correct TK1 retention for revertants ---------
# -------- Decrease TK1 retention by revertant freq -----
# -------------------------------------------------------




# ~~~~~~ Prepare median_retent_TK1 ~~~~~~~~~~~~~~

# Revertant frequency per pool: reverts / colonies for each row of clone.txt,
# averaged within Pool. The file is read and aggregated once and reused for
# both the median and the mean retention tables.
clone <- read.table("clone.txt", header = TRUE, sep = "\t", stringsAsFactors = FALSE)

clone$revert_freq <- clone$reverts / clone$colonies

revert <- aggregate(revert_freq ~ Pool, data = clone, FUN = mean)

# Pool number (1-6) taken from the digit after the "RH" prefix of the sample
# name; names that do not start with RH1..RH6 get 0, so they find no partner
# in `revert` and are dropped by the merge below.
pool_from_sample <- function(sample_ids) {
  pool <- numeric(length(sample_ids))
  has_pool <- grepl("^RH[1-6]", sample_ids)
  pool[has_pool] <- as.numeric(substr(sample_ids[has_pool], 3, 3))
  pool
}

# Build the per-sample table for a sample-named retention vector and scale
# the retention down by the pool's revertant frequency.
# Resulting columns, in order: Pool, sample, retent, revert_freq, corr_retent
# (later code renames columns 2 and 5 by position).
# The Pool column is sized from the data instead of a hard-coded 115 samples.
correct_for_revertants <- function(retent_by_sample) {
  out <- data.frame(
    sample = names(retent_by_sample),
    retent = retent_by_sample,
    stringsAsFactors = FALSE
  )
  row.names(out) <- seq_len(nrow(out))
  out$Pool <- pool_from_sample(out$sample)
  out <- merge(out, revert, by = "Pool")
  out$corr_retent <- out$retent * (1 - out$revert_freq)
  out
}

# ~~~~~~~~~~~ Prepare median_retent_TK1 and mean_retent_TK1 ~~~~~~~~~~~~~~~~
median_retent_TK1 <- correct_for_revertants(median_retent_TK1)
mean_retent_TK1 <- correct_for_revertants(mean_retent_TK1)


# -----------------------------------------------------------------------------------------------------------------------------------
# ---------- Combine seq retent and mean and median TK1 retent to make a retention table --------------------------------------------
# ---------- Retents are corrected, as appropriate, for either human and hamster genome sizes or revertant frequency ----------------
# ---------- Result is retent, the FOUNDATION of all figures and stat tests following -----------------------------------------------
# -----------------------------------------------------------------------------------------------------------------------------------


# Rename by position: column 2 (the sample name) becomes the merge key RH_ID,
# column 5 (the revertant-corrected retention) gets its final name.
names(median_retent_TK1)[c(2, 5)] <- c("RH_ID", "TK1_median_retent")
names(mean_retent_TK1)[c(2, 5)] <- c("RH_ID", "TK1_mean_retent")

# Pairwise joins on RH_ID: sequence-based retention, TK1-based retention,
# and the median / mean read counts.
temp_1 <- merge(median_retent_seq, mean_retent_seq, by = "RH_ID")
temp_2 <- merge(
  median_retent_TK1[, c("RH_ID", "TK1_median_retent")],
  mean_retent_TK1[, c("RH_ID", "TK1_mean_retent")],
  by = "RH_ID"
)
temp3 <- merge(median_human_reads, median_hamster_reads, by = "RH_ID")
temp4 <- merge(mean_human_reads, mean_hamster_reads, by = "RH_ID")

# retent will be the root for all figure panels and stats after figure panels
retent <- merge(temp_1, temp_2, by = "RH_ID")
retent <- merge(retent, temp3, by = "RH_ID")
retent <- merge(retent, temp4, by = "RH_ID")

# Fix the column order: ID, read counts, then the retention estimates.
retent <- retent[, c(
  "RH_ID",
  "median_human_reads", "median_hamster_reads",
  "mean_human_reads", "mean_hamster_reads",
  "median_retent_seq", "mean_retent_seq",
  "TK1_median_retent", "TK1_mean_retent"
)]

dim(retent)
# [1] 115   9

head(retent)
       # RH_ID median_human_reads median_hamster_reads mean_human_reads mean_hamster_reads median_retent_seq mean_retent_seq TK1_median_retent TK1_mean_retent
# 1  RH1_w0_d0                 76                15130         95.79096           15209.42       0.010046266     0.012596264        0.05817028      0.07331825
# 2  RH1_w1_d0                 83                17375        119.44433           17297.70       0.009553957     0.013810431        0.04530578      0.06519902
# 3 RH1_w1_d25                 73                20193        103.19807           20067.49       0.007230228     0.010285101        0.04438762      0.06274954
# 4 RH1_w1_d75                 58                24339         79.19608           23922.75       0.004766013     0.006620983        0.05110846      0.06978603
# 5  RH1_w1_d8                 97                22613        136.34164           22494.54       0.008579136     0.012122196        0.04834808      0.06795729
# 6  RH1_w2_d0                 65                13752         99.07478           13829.42       0.009453170     0.014328120        0.02748013      0.04188597


# Decode pool, week and concentration from RH_ID (e.g. "RH1_w2_d25").
# Rows whose ID matches none of the patterns keep NA in that column.
for (pool_id in c(1, 2, 3, 4, 5, 6)) {
  retent[grep(paste0("^RH", pool_id), retent$RH_ID), "pool"] <- pool_id
}

for (week_id in c(0, 1, 2, 3, 4, 6)) {
  retent[grep(paste0("_w", week_id, "_"), retent$RH_ID), "week"] <- week_id
}

for (conc_id in c(0, 8, 25, 75)) {
  retent[grep(paste0("_d", conc_id), retent$RH_ID), "conc"] <- conc_id
}

# Move the design columns next to RH_ID, then sort by week, conc and pool.
retent <- retent[, c(
  "RH_ID", "pool", "week", "conc",
  "median_human_reads", "median_hamster_reads",
  "mean_human_reads", "mean_hamster_reads",
  "median_retent_seq", "mean_retent_seq",
  "TK1_median_retent", "TK1_mean_retent"
)]

retent <- retent[order(retent$week, retent$conc, retent$pool), ]

dim(retent)
# [1] 115   12

head(retent)
       # RH_ID pool week conc median_human_reads median_hamster_reads mean_human_reads mean_hamster_reads median_retent_seq mean_retent_seq TK1_median_retent
# 1  RH1_w0_d0    1    0    0                 76                15130         95.79096           15209.42        0.01004627      0.01259626        0.05817028
# 21 RH2_w0_d0    2    0    0                194                12372        251.74543           12344.90        0.03136114      0.04078533        0.05881380
# 40 RH3_w0_d0    3    0    0                246                18255        341.26520           18410.09        0.02695152      0.03707372        0.05445742
# 60 RH4_w0_d0    4    0    0                259                11237        337.15985           11269.49        0.04609771      0.05983588        0.05852638
# 80 RH5_w0_d0    5    0    0                155                15082        205.01529           15367.61        0.02055430      0.02668147        0.04964056
# 98 RH6_w0_d0    6    0    0                154                14877        202.76471           14928.61        0.02070310      0.02716458        0.03787156
   # TK1_mean_retent
# 1       0.07331825
# 21      0.07632012
# 40      0.07554644
# 60      0.07618821
# 80      0.06565854
# 98      0.04986374

# Cell label for every pool / conc / week combination (tab-delimited).
cell <- read.table(
  file = "cell_label_info.txt",
  header = TRUE,
  sep = "\t",
  stringsAsFactors = FALSE
)

head(cell)
  # pool conc week cell
# 1    1    0    0    1
# 2    1    0    1    2
# 3    1    0    2    2
# 4    1    0    3    2
# 5    1    0    4    2
# 6    1    0    6    2

dim(cell)
# [1] 115   4

# Join on the shared design columns, then sort by cell; ties within a cell
# are ordered by week, conc and pool.
retent <- merge(retent, cell, by = c("pool", "week", "conc"))
retent <- retent[order(retent$cell, retent$week, retent$conc, retent$pool), ]

retent <- retent[, c(
  "RH_ID", "pool", "week", "conc", "cell",
  "median_human_reads", "median_hamster_reads",
  "mean_human_reads", "mean_hamster_reads",
  "median_retent_seq", "mean_retent_seq",
  "TK1_median_retent", "TK1_mean_retent"
)]


dim(retent)
# [1] 115   13

head(retent)
       # RH_ID pool week conc cell median_human_reads median_hamster_reads mean_human_reads mean_hamster_reads median_retent_seq mean_retent_seq TK1_median_retent
# 1  RH1_w0_d0    1    0    0    1                 76                15130         95.79096           15209.42       0.010046266      0.01259626        0.05817028
# 2  RH1_w1_d0    1    1    0    2                 83                17375        119.44433           17297.70       0.009553957      0.01381043        0.04530578
# 6  RH1_w2_d0    1    2    0    2                 65                13752         99.07478           13829.42       0.009453170      0.01432812        0.02748013
# 9  RH1_w3_d0    1    3    0    2                 57                12570         92.10867           12637.62       0.009069212      0.01457690        0.02253721
# 13 RH1_w4_d0    1    4    0    2                 58                13010        104.06846           13110.79       0.008916218      0.01587524        0.01867507
# 17 RH1_w6_d0    1    6    0    2                 83                18203        168.86474           18193.74       0.009119376      0.01856295        0.01991745
   # TK1_mean_retent
# 1       0.07331825
# 2       0.06519902
# 6       0.04188597
# 9       0.03641882
# 13      0.03350838
# 17      0.04052234


# Append each sample's pool-level revertant frequency (joined on RH_ID), then
# restore the ordering: by cell, ties by week, conc and pool.
retent <- merge(
  retent,
  median_retent_TK1[, c("RH_ID", "revert_freq")],
  by = "RH_ID"
)
retent <- retent[order(retent$cell, retent$week, retent$conc, retent$pool), ]

dim(retent)
# [1] 115   14

head(retent)
       # RH_ID pool week conc cell median_human_reads median_hamster_reads mean_human_reads mean_hamster_reads median_retent_seq mean_retent_seq TK1_median_retent
# 1  RH1_w0_d0    1    0    0    1                 76                15130         95.79096           15209.42       0.010046266      0.01259626        0.05817028
# 2  RH1_w1_d0    1    1    0    2                 83                17375        119.44433           17297.70       0.009553957      0.01381043        0.04530578
# 6  RH1_w2_d0    1    2    0    2                 65                13752         99.07478           13829.42       0.009453170      0.01432812        0.02748013
# 9  RH1_w3_d0    1    3    0    2                 57                12570         92.10867           12637.62       0.009069212      0.01457690        0.02253721
# 13 RH1_w4_d0    1    4    0    2                 58                13010        104.06846           13110.79       0.008916218      0.01587524        0.01867507
# 17 RH1_w6_d0    1    6    0    2                 83                18203        168.86474           18193.74       0.009119376      0.01856295        0.01991745
   # TK1_mean_retent revert_freq
# 1       0.07331825   0.2368978
# 2       0.06519902   0.2368978
# 6       0.04188597   0.2368978
# 9       0.03641882   0.2368978
# 13      0.03350838   0.2368978
# 17      0.04052234   0.2368978

# Row-wise average of the two retention estimates (sequence-read based and
# TK1 based), once for the median variants and once for the mean variants.
retent$mean_of_medians <- rowMeans(
  retent[c("median_retent_seq", "TK1_median_retent")]
)
retent$mean_of_means <- rowMeans(
  retent[c("mean_retent_seq", "TK1_mean_retent")]
)

dim(retent)
# [1] 115  16


head(retent)
       # RH_ID pool week conc cell median_human_reads median_hamster_reads mean_human_reads mean_hamster_reads median_retent_seq mean_retent_seq TK1_median_retent
# 1  RH1_w0_d0    1    0    0    1                 76                15130         95.79096           15209.42       0.010046266      0.01259626        0.05817028
# 2  RH1_w1_d0    1    1    0    2                 83                17375        119.44433           17297.70       0.009553957      0.01381043        0.04530578
# 6  RH1_w2_d0    1    2    0    2                 65                13752         99.07478           13829.42       0.009453170      0.01432812        0.02748013
# 9  RH1_w3_d0    1    3    0    2                 57                12570         92.10867           12637.62       0.009069212      0.01457690        0.02253721
# 13 RH1_w4_d0    1    4    0    2                 58                13010        104.06846           13110.79       0.008916218      0.01587524        0.01867507
# 17 RH1_w6_d0    1    6    0    2                 83                18203        168.86474           18193.74       0.009119376      0.01856295        0.01991745
   # TK1_mean_retent revert_freq mean_of_medians mean_of_means
# 1       0.07331825   0.2368978      0.03410827    0.04295725
# 2       0.06519902   0.2368978      0.02742987    0.03950473
# 6       0.04188597   0.2368978      0.01846665    0.02810705
# 9       0.03641882   0.2368978      0.01580321    0.02549786
# 13      0.03350838   0.2368978      0.01379565    0.02469181
# 17      0.04052234   0.2368978      0.01451841    0.02954265


# Turn TK1_max into a data frame and move its row names into an explicit
# RH_ID column so it can be joined onto retent.
TK1_max <- as.data.frame(TK1_max)
TK1_max$RH_ID <- row.names(TK1_max)
# seq_len() instead of 1:nrow(): stays valid even for a zero-row input
row.names(TK1_max) <- seq_len(nrow(TK1_max))



# Join on RH_ID explicitly rather than on whatever column names happen to be shared.
retent <- merge(retent, TK1_max, by = "RH_ID")

# Sort by cell; within a cell by week, then conc, then pool.
# (Same row order as sorting by week/conc/pool first and then stably by cell.)
retent <- retent[order(retent$cell, retent$week, retent$conc, retent$pool),]

dim(retent)
# [1] 115   17

head(retent)
       # RH_ID pool week conc cell median_human_reads median_hamster_reads mean_human_reads mean_hamster_reads median_retent_seq mean_retent_seq TK1_median_retent
# 1  RH1_w0_d0    1    0    0    1                 76                15130         95.79096           15209.42       0.010046266      0.01259626        0.05817028
# 2  RH1_w1_d0    1    1    0    2                 83                17375        119.44433           17297.70       0.009553957      0.01381043        0.04530578
# 6  RH1_w2_d0    1    2    0    2                 65                13752         99.07478           13829.42       0.009453170      0.01432812        0.02748013
# 9  RH1_w3_d0    1    3    0    2                 57                12570         92.10867           12637.62       0.009069212      0.01457690        0.02253721
# 13 RH1_w4_d0    1    4    0    2                 58                13010        104.06846           13110.79       0.008916218      0.01587524        0.01867507
# 17 RH1_w6_d0    1    6    0    2                 83                18203        168.86474           18193.74       0.009119376      0.01856295        0.01991745
   # TK1_mean_retent revert_freq mean_of_medians mean_of_means TK1_max
# 1       0.07331825   0.2368978      0.03410827    0.04295725     997
# 2       0.06519902   0.2368978      0.02742987    0.03950473    1398
# 6       0.04188597   0.2368978      0.01846665    0.02810705    1805
# 9       0.03641882   0.2368978      0.01580321    0.02549786    1930
# 13      0.03350838   0.2368978      0.01379565    0.02469181    2370
# 17      0.04052234   0.2368978      0.01451841    0.02954265    3180



# ----- if desired, can plot mean_of_medians, mean_of_means stratified by week, conc or pool ---------------



# -------------------------------------------------------------------------------
# ----------- Plots of median sequence retent and TK1 retent -----------------------
# ----------- For plots of mean sequence retent and TK1 retent see below ---------
# ------------- Decided to use median for paper -----------------------------------
# ----------- Plot median sequence retention vs growth time ---------------------
# ---------------------------------------------------------------------------------


# Warnings are generated because week == 0 has only one measurement per pool, so sd cannot be calculated. Can ignore.

# Per (week, pool) summary of the sequence-based median retention
# (group mean stored under the measure's name, plus sd, se, ci).
summary_retent_1 <- summarySE(retent, measurevar="median_retent_seq", groupvars=c("week","pool"))
# Week 0 has one sample per pool, so sd/se/ci are undefined there; set them to 0 so the
# error bar collapses onto the point. Rows and columns are picked by value/name, not position.
summary_retent_1[which(summary_retent_1$week == 0), c("sd","se","ci")] <- 0




# provide common x scale jitter for two overlaid ggplots
# (jitter() is random, so the exact x offsets differ between runs)
summary_retent_1$jitter <- jitter(summary_retent_1$week,1)


# attach each (week, pool) jitter value to the individual samples
retent_1 <- merge(retent,summary_retent_1[c("week","pool","jitter")])

# # replicate week 0, drug 0 nM data point four times, to allow four graphs lines linking week 0 and week 1, instead of only one
# summary_retent_1 <- rbind(summary_retent_1[1,],summary_retent_1[1,],summary_retent_1[1,],summary_retent_1)
# summary_retent_1[1:4,"conc"] <- c(0,8,25,75)

# turn retent freq into percentage
retent_1[,c("median_retent_seq", "mean_retent_seq", "TK1_median_retent", "TK1_mean_retent")] <- 100*retent_1[,c("median_retent_seq", "mean_retent_seq", "TK1_median_retent", "TK1_mean_retent")]
summary_retent_1[,c("median_retent_seq", "sd", "se", "ci")] <- 100*summary_retent_1[,c("median_retent_seq", "sd", "se", "ci")]




# one default ggplot2 colour per pool
n <- length(unique(retent_1$pool))
colores_1 <- gg_color_hue(n)



# aes() uses bare column names so every layer reads from the data frame given in data=,
# not from whatever the global summary_retent_1 / retent_1 hold when the plot is drawn.
p1 <- ggplot() +
	theme2 +
	theme(legend.key=element_blank()) +
	# line through the per-week group values of each pool
	geom_line(
		data=summary_retent_1,
		aes(x=jitter, y=median_retent_seq, colour=as.factor(pool)),
		lwd=0.2,
		show.legend=FALSE
	) +
	# +/- se around each group value
	geom_linerange(
		data=summary_retent_1,
		aes(
			x=jitter,
			ymin=median_retent_seq-se,
			ymax=median_retent_seq+se,
			colour=as.factor(pool)
		),
		lwd=0.2,
		show.legend=FALSE
	) +
	# individual samples as open circles; the only layer shown in the legend
	geom_point(
		data=retent_1,
		aes(x=jitter, y=median_retent_seq, colour=as.factor(pool)),
		shape=1,
		stroke=0.4,
		size=1.0
	) +
	scale_color_manual(
		values=colores_1,
		name ="Pool",
		labels=c(1,2,3,4,5,6)
	) +
	guides(
		colour = guide_legend(override.aes = list(fill=NA,shape=1,size=1),ncol=3,byrow=TRUE)
	) +
	theme(
		legend.position = c(0.75,1.00),
		legend.title.align=0.6
	) +
	scale_x_continuous(breaks = c(0,1,2,3,4,6), labels = c(0,1,2,3,4,6)) +
	# ggtitle("") +
	xlab("Weeks") +
	ylab("Retention via \nalignments (%)") +
	labs(subtitle="Growth")
print(p1)
		
		
		
		
# ----------- Plot TK1 median retention vs growth time ---------------------



# Per (week, pool) summary of the TK1-based median retention
# (group mean stored under the measure's name, plus sd, se, ci).
summary_retent_2 <- summarySE(retent, measurevar="TK1_median_retent", groupvars=c("week","pool"))
# Week 0 has one sample per pool, so sd/se/ci are undefined there; set them to 0 so the
# error bar collapses onto the point. Rows and columns are picked by value/name, not position.
summary_retent_2[which(summary_retent_2$week == 0), c("sd","se","ci")] <- 0


# provide common x scale jitter for two overlaid ggplots
# (jitter() is random, so the exact x offsets differ between runs)
summary_retent_2$jitter <- jitter(summary_retent_2$week,1)


# attach each (week, pool) jitter value to the individual samples
retent_2 <- merge(retent,summary_retent_2[c("week","pool","jitter")])

# replicate week 0, drug 0 nM data point four times, to allow four graphs lines linking week 0 and week 1, instead of only one
# summary_retent_2 <- rbind(summary_retent_2[1,],summary_retent_2[1,],summary_retent_2[1,],summary_retent_2)
# summary_retent_2[1:4,"conc"] <- c(0,8,25,75)

# turn retent freq into percentage
retent_2[,c("median_retent_seq", "mean_retent_seq", "TK1_median_retent", "TK1_mean_retent")] <- 100*retent_2[,c("median_retent_seq", "mean_retent_seq", "TK1_median_retent", "TK1_mean_retent")]
summary_retent_2[,c("TK1_median_retent", "sd", "se", "ci")] <- 100*summary_retent_2[,c("TK1_median_retent", "sd", "se", "ci")]




# one default ggplot2 colour per pool
n <- length(unique(retent_2$pool))
colores_2 <- gg_color_hue(n)



# aes() uses bare column names so every layer reads from the data frame given in data=,
# not from whatever the global summary_retent_2 / retent_2 hold when the plot is drawn.
p2 <- ggplot() +
	theme2 +
	theme(legend.key=element_blank()) +
	# line through the per-week group values of each pool
	geom_line(
		data=summary_retent_2,
		aes(x=jitter, y=TK1_median_retent, colour=as.factor(pool)),
		lwd=0.2,
		show.legend=FALSE
	) +
	# +/- se around each group value
	geom_linerange(
		data=summary_retent_2,
		aes(
			x=jitter,
			ymin=TK1_median_retent-se,
			ymax=TK1_median_retent+se,
			colour=as.factor(pool)
		),
		lwd=0.2,
		show.legend=FALSE
	) +
	# individual samples as open circles; the only layer shown in the legend
	geom_point(
		data=retent_2,
		aes(x=jitter, y=TK1_median_retent, colour=as.factor(pool)),
		shape=1,
		stroke=0.4,
		size=1.0
	) +
	scale_color_manual(
		values=colores_2,
		name ="Pool",
		labels=c(1,2,3,4,5,6)
	) +
	guides(
		colour = guide_legend(override.aes = list(fill=NA,shape=1,size=1),ncol=3,byrow=TRUE)
	) +
	theme(
		legend.position = c(0.75,1.00),
		legend.title.align=0.6
	) +
	scale_x_continuous(breaks = c(0,1,2,3,4,6), labels = c(0,1,2,3,4,6)) +
	# ggtitle("") +
	xlab("Weeks") +
	ylab(expression(atop("Retention via ", italic('TK1')~"(%)"))) +
	labs(subtitle="Growth")
print(p2)



# ----------- Plot sequence retention vs conc ---------------------



# Per (conc, pool) summary of the sequence-based median retention
# (group mean stored under the measure's name, plus sd, se, ci).
summary_retent_3 <- summarySE(retent, measurevar="median_retent_seq", groupvars=c("conc","pool"))



# provide common x scale jitter for two overlaid ggplots
# (jitter() is random, so the exact x offsets differ between runs)
summary_retent_3$jitter <- jitter(summary_retent_3$conc,1)


# attach each (conc, pool) jitter value to the individual samples
retent_3 <- merge(retent,summary_retent_3[c("conc","pool","jitter")])

# replicate week 0, drug 0 nM data point four times, to allow four graphs lines linking week 0 and week 1, instead of only one
# summary_retent_3 <- rbind(summary_retent_3[1,],summary_retent_3[1,],summary_retent_3[1,],summary_retent_3)
# summary_retent_3[1:4,"conc"] <- c(0,8,25,75)

# turn retent freq into percentage
retent_3[,c("median_retent_seq", "mean_retent_seq", "TK1_median_retent", "TK1_mean_retent")] <- 100*retent_3[,c("median_retent_seq", "mean_retent_seq", "TK1_median_retent", "TK1_mean_retent")]
summary_retent_3[,c("median_retent_seq", "sd", "se", "ci")] <- 100*summary_retent_3[,c("median_retent_seq", "sd", "se", "ci")]





# one default ggplot2 colour per pool
n <- length(unique(retent_3$pool))
colores_3 <- gg_color_hue(n)



# aes() uses bare column names so every layer reads from the data frame given in data=,
# not from whatever the global summary_retent_3 / retent_3 hold when the plot is drawn.
p3 <- ggplot() +
	theme2 +
	theme(legend.key=element_blank()) +
	# line through the per-concentration group values of each pool
	geom_line(
		data=summary_retent_3,
		aes(x=jitter, y=median_retent_seq, colour=as.factor(pool)),
		lwd=0.2,
		show.legend=FALSE
	) +
	# +/- se around each group value
	geom_linerange(
		data=summary_retent_3,
		aes(
			x=jitter,
			ymin=median_retent_seq-se,
			ymax=median_retent_seq+se,
			colour=as.factor(pool)
		),
		lwd=0.2,
		show.legend=FALSE
	) +
	# individual samples as open circles; the only layer shown in the legend
	geom_point(
		data=retent_3,
		aes(x=jitter, y=median_retent_seq, colour=as.factor(pool)),
		shape=1,
		stroke=0.4,
		size=1.0
	) +
	scale_color_manual(
		values=colores_3,
		name ="Pool",
		labels=c(1,2,3,4,5,6)
	) +
	guides(
		colour = guide_legend(override.aes = list(fill=NA,shape=1,size=1),ncol=3,byrow=TRUE)
	) +
	theme(
		legend.position = c(0.75,1.00),
		legend.title.align=0.6
	) +
	scale_x_continuous(breaks = c(0,8,25,75), labels = c(0,8,25,75)) +
	# ggtitle("") +
	xlab("Conc (nM)") +
	ylab("Retention via \nalignments (%)") +
	labs(subtitle="Paclitaxel")
print(p3)
		

# ----------- Plot TK1 retention vs conc ---------------------



# Per (conc, pool) summary of the TK1-based median retention
# (group mean stored under the measure's name, plus sd, se, ci).
summary_retent_4 <- summarySE(retent, measurevar="TK1_median_retent", groupvars=c("conc","pool"))



# provide common x scale jitter for two overlaid ggplots
# (jitter() is random, so the exact x offsets differ between runs)
summary_retent_4$jitter <- jitter(summary_retent_4$conc,1)


# attach each (conc, pool) jitter value to the individual samples
retent_4 <- merge(retent,summary_retent_4[c("conc","pool","jitter")])

# replicate week 0, drug 0 nM data point four times, to allow four graphs lines linking week 0 and week 1, instead of only one
# summary_retent_4 <- rbind(summary_retent_4[1,],summary_retent_4[1,],summary_retent_4[1,],summary_retent_4)
# summary_retent_4[1:4,"conc"] <- c(0,8,25,75)

# turn retent freq into percentage
retent_4[,c("median_retent_seq", "mean_retent_seq", "TK1_median_retent", "TK1_mean_retent")] <- 100*retent_4[,c("median_retent_seq", "mean_retent_seq", "TK1_median_retent", "TK1_mean_retent")]
summary_retent_4[,c("TK1_median_retent", "sd", "se", "ci")] <- 100*summary_retent_4[,c("TK1_median_retent", "sd", "se", "ci")]





# one default ggplot2 colour per pool
n <- length(unique(retent_4$pool))
colores_4 <- gg_color_hue(n)


# aes() uses bare column names so every layer reads from the data frame given in data=,
# not from whatever the global summary_retent_4 / retent_4 hold when the plot is drawn.
p4 <- ggplot() +
	theme2 +
	theme(legend.key=element_blank()) +
	# line through the per-concentration group values of each pool
	geom_line(
		data=summary_retent_4,
		aes(x=jitter, y=TK1_median_retent, colour=as.factor(pool)),
		lwd=0.2,
		show.legend=FALSE
	) +
	# +/- se around each group value
	geom_linerange(
		data=summary_retent_4,
		aes(
			x=jitter,
			ymin=TK1_median_retent-se,
			ymax=TK1_median_retent+se,
			colour=as.factor(pool)
		),
		lwd=0.2,
		show.legend=FALSE
	) +
	# individual samples as open circles; the only layer shown in the legend
	geom_point(
		data=retent_4,
		aes(x=jitter, y=TK1_median_retent, colour=as.factor(pool)),
		shape=1,
		stroke=0.4,
		size=1.0
	) +
	scale_color_manual(
		values=colores_4,
		name ="Pool",
		labels=c(1,2,3,4,5,6)
	) +
	guides(
		colour = guide_legend(override.aes = list(fill=NA,shape=1,size=1),ncol=3,byrow=TRUE)
	) +
	theme(
		legend.position = c(0.75,1.00),
		legend.title.align=0.6
	) +
	scale_x_continuous(breaks = c(0,8,25,75), labels = c(0,8,25,75)) +
	# ggtitle("") +
	xlab("Conc (nM)") +
	ylab(expression(atop("Retention via ", italic('TK1')~"(%)"))) +
	labs(subtitle="Paclitaxel")
print(p4)


#------------------Make file --------------------------


# Write the 2 x 2 panel figure (A-D) of the median-based plots to a PDF.
# plot_grid() only returns the figure object; it is drawn on the open device when printed.
# Top-level values are not auto-printed when the script is run via source(), which would
# leave the PDF empty, so print explicitly.
pdf("median_retent_pools_1.pdf",width=7.5,height= 6.67)
print(plot_grid(p1, p2,p3,p4, labels=c("A", "B","C","D"), ncol = 2, nrow = 2, label_size = 16))
dev.off()

#-------------------------------------------------------


# -------------------------------------------------------------------------------
# ----------- Plots of mean sequence retent and TK1 retent -----------------------
# ----------- For plots of median sequence retent and TK1 retent see above ---------
# ------------- Decided to use median for paper -----------------------------------
# ----------- Plot mean sequence retention vs growth time ---------------------
# ---------------------------------------------------------------------------------





# Per (week, pool) summary of the sequence-based mean retention
# (group mean stored under the measure's name, plus sd, se, ci).
summary_retent_1 <- summarySE(retent, measurevar="mean_retent_seq", groupvars=c("week","pool"))
# Week 0 has one sample per pool, so sd/se/ci are undefined there; set them to 0 so the
# error bar collapses onto the point. Rows and columns are picked by value/name, not position.
summary_retent_1[which(summary_retent_1$week == 0), c("sd","se","ci")] <- 0




# provide common x scale jitter for two overlaid ggplots
# (jitter() is random, so the exact x offsets differ between runs)
summary_retent_1$jitter <- jitter(summary_retent_1$week,1)


# attach each (week, pool) jitter value to the individual samples
retent_1 <- merge(retent,summary_retent_1[c("week","pool","jitter")])

# # replicate week 0, drug 0 nM data point four times, to allow four graphs lines linking week 0 and week 1, instead of only one
# summary_retent_1 <- rbind(summary_retent_1[1,],summary_retent_1[1,],summary_retent_1[1,],summary_retent_1)
# summary_retent_1[1:4,"conc"] <- c(0,8,25,75)

# turn retent freq into percentage
retent_1[,c("median_retent_seq", "mean_retent_seq", "TK1_median_retent", "TK1_mean_retent")] <- 100*retent_1[,c("median_retent_seq", "mean_retent_seq", "TK1_median_retent", "TK1_mean_retent")]
summary_retent_1[,c("mean_retent_seq", "sd", "se", "ci")] <- 100*summary_retent_1[,c("mean_retent_seq", "sd", "se", "ci")]





# one default ggplot2 colour per pool
n <- length(unique(retent_1$pool))
colores_1 <- gg_color_hue(n)



# aes() uses bare column names so every layer reads from the data frame given in data=,
# not from whatever the global summary_retent_1 / retent_1 hold when the plot is drawn.
p1 <- ggplot() +
	theme2 +
	theme(legend.key=element_blank()) +
	# line through the per-week group values of each pool
	geom_line(
		data=summary_retent_1,
		aes(x=jitter, y=mean_retent_seq, colour=as.factor(pool)),
		lwd=0.2,
		show.legend=FALSE
	) +
	# +/- se around each group value
	geom_linerange(
		data=summary_retent_1,
		aes(
			x=jitter,
			ymin=mean_retent_seq-se,
			ymax=mean_retent_seq+se,
			colour=as.factor(pool)
		),
		lwd=0.2,
		show.legend=FALSE
	) +
	# individual samples as open circles; the only layer shown in the legend
	geom_point(
		data=retent_1,
		aes(x=jitter, y=mean_retent_seq, colour=as.factor(pool)),
		shape=1,
		stroke=0.4,
		size=1.0
	) +
	scale_color_manual(
		values=colores_1,
		name ="Pool",
		labels=c(1,2,3,4,5,6)
	) +
	guides(
		colour = guide_legend(override.aes = list(fill=NA,shape=1,size=1),ncol=3,byrow=TRUE)
	) +
	theme(
		legend.position = c(0.75,1.00),
		legend.title.align=0.6
	) +
	scale_x_continuous(breaks = c(0,1,2,3,4,6), labels = c(0,1,2,3,4,6)) +
	# ggtitle("") +
	xlab("Weeks") +
	ylab("Retention via \nalignments (%)") +
	labs(subtitle="Growth")
print(p1)
		
		
		
		
# ----------- Plot TK1 mean retention vs growth time ---------------------



# Per (week, pool) summary of the TK1-based mean retention
# (group mean stored under the measure's name, plus sd, se, ci).
summary_retent_2 <- summarySE(retent, measurevar="TK1_mean_retent", groupvars=c("week","pool"))
# Week 0 has one sample per pool, so sd/se/ci are undefined there; set them to 0 so the
# error bar collapses onto the point. Rows and columns are picked by value/name, not position.
summary_retent_2[which(summary_retent_2$week == 0), c("sd","se","ci")] <- 0


# provide common x scale jitter for two overlaid ggplots
# (jitter() is random, so the exact x offsets differ between runs)
summary_retent_2$jitter <- jitter(summary_retent_2$week,1)


# attach each (week, pool) jitter value to the individual samples
retent_2 <- merge(retent,summary_retent_2[c("week","pool","jitter")])

# replicate week 0, drug 0 nM data point four times, to allow four graphs lines linking week 0 and week 1, instead of only one
# summary_retent_2 <- rbind(summary_retent_2[1,],summary_retent_2[1,],summary_retent_2[1,],summary_retent_2)
# summary_retent_2[1:4,"conc"] <- c(0,8,25,75)

# turn retent freq into percentage
retent_2[,c("median_retent_seq", "mean_retent_seq", "TK1_median_retent", "TK1_mean_retent")] <- 100*retent_2[,c("median_retent_seq", "mean_retent_seq", "TK1_median_retent", "TK1_mean_retent")]
summary_retent_2[,c("TK1_mean_retent", "sd", "se", "ci")] <- 100*summary_retent_2[,c("TK1_mean_retent", "sd", "se", "ci")]





# one default ggplot2 colour per pool
n <- length(unique(retent_2$pool))
colores_2 <- gg_color_hue(n)



# aes() uses bare column names so every layer reads from the data frame given in data=,
# not from whatever the global summary_retent_2 / retent_2 hold when the plot is drawn.
p2 <- ggplot() +
	theme2 +
	theme(legend.key=element_blank()) +
	# line through the per-week group values of each pool
	geom_line(
		data=summary_retent_2,
		aes(x=jitter, y=TK1_mean_retent, colour=as.factor(pool)),
		lwd=0.2,
		show.legend=FALSE
	) +
	# +/- se around each group value
	geom_linerange(
		data=summary_retent_2,
		aes(
			x=jitter,
			ymin=TK1_mean_retent-se,
			ymax=TK1_mean_retent+se,
			colour=as.factor(pool)
		),
		lwd=0.2,
		show.legend=FALSE
	) +
	# individual samples as open circles; the only layer shown in the legend
	geom_point(
		data=retent_2,
		aes(x=jitter, y=TK1_mean_retent, colour=as.factor(pool)),
		shape=1,
		stroke=0.4,
		size=1.0
	) +
	scale_color_manual(
		values=colores_2,
		name ="Pool",
		labels=c(1,2,3,4,5,6)
	) +
	guides(
		colour = guide_legend(override.aes = list(fill=NA,shape=1,size=1),ncol=3,byrow=TRUE)
	) +
	theme(
		legend.position = c(0.75,1.00),
		legend.title.align=0.6
	) +
	scale_x_continuous(breaks = c(0,1,2,3,4,6), labels = c(0,1,2,3,4,6)) +
	# ggtitle("") +
	xlab("Weeks") +
	ylab(expression(atop("Retention via ", italic('TK1')~"(%)"))) +
	labs(subtitle="Growth")
print(p2)



# ----------- Plot sequence retention vs conc ---------------------



# Per (conc, pool) summary of the sequence-based mean retention
# (group mean stored under the measure's name, plus sd, se, ci).
summary_retent_3 <- summarySE(retent, measurevar="mean_retent_seq", groupvars=c("conc","pool"))



# provide common x scale jitter for two overlaid ggplots
# (jitter() is random, so the exact x offsets differ between runs)
summary_retent_3$jitter <- jitter(summary_retent_3$conc,1)


# attach each (conc, pool) jitter value to the individual samples
retent_3 <- merge(retent,summary_retent_3[c("conc","pool","jitter")])

# replicate week 0, drug 0 nM data point four times, to allow four graphs lines linking week 0 and week 1, instead of only one
# summary_retent_3 <- rbind(summary_retent_3[1,],summary_retent_3[1,],summary_retent_3[1,],summary_retent_3)
# summary_retent_3[1:4,"conc"] <- c(0,8,25,75)

# turn retent freq into percentage
retent_3[,c("median_retent_seq", "mean_retent_seq", "TK1_median_retent", "TK1_mean_retent")] <- 100*retent_3[,c("median_retent_seq", "mean_retent_seq", "TK1_median_retent", "TK1_mean_retent")]
summary_retent_3[,c("mean_retent_seq", "sd", "se", "ci")] <- 100*summary_retent_3[,c("mean_retent_seq", "sd", "se", "ci")]




# one default ggplot2 colour per pool
n <- length(unique(retent_3$pool))
colores_3 <- gg_color_hue(n)



# aes() uses bare column names so every layer reads from the data frame given in data=,
# not from whatever the global summary_retent_3 / retent_3 hold when the plot is drawn.
p3 <- ggplot() +
	theme2 +
	theme(legend.key=element_blank()) +
	# line through the per-concentration group values of each pool
	geom_line(
		data=summary_retent_3,
		aes(x=jitter, y=mean_retent_seq, colour=as.factor(pool)),
		lwd=0.2,
		show.legend=FALSE
	) +
	# +/- se around each group value
	geom_linerange(
		data=summary_retent_3,
		aes(
			x=jitter,
			ymin=mean_retent_seq-se,
			ymax=mean_retent_seq+se,
			colour=as.factor(pool)
		),
		lwd=0.2,
		show.legend=FALSE
	) +
	# individual samples as open circles; the only layer shown in the legend
	geom_point(
		data=retent_3,
		aes(x=jitter, y=mean_retent_seq, colour=as.factor(pool)),
		shape=1,
		stroke=0.4,
		size=1.0
	) +
	scale_color_manual(
		values=colores_3,
		name ="Pool",
		labels=c(1,2,3,4,5,6)
	) +
	guides(
		colour = guide_legend(override.aes = list(fill=NA,shape=1,size=1),ncol=3,byrow=TRUE)
	) +
	theme(
		legend.position = c(0.75,1.00),
		legend.title.align=0.6
	) +
	scale_x_continuous(breaks = c(0,8,25,75), labels = c(0,8,25,75)) +
	# ggtitle("") +
	xlab("Conc (nM)") +
	ylab("Retention via \nalignments (%)") +
	labs(subtitle="Paclitaxel")
print(p3)
		



# ----------- Plot TK1 retention vs conc ---------------------


# Per (conc, pool) summary of the TK1-based mean retention
# (group mean stored under the measure's name, plus sd, se, ci).
summary_retent_4 <- summarySE(retent, measurevar="TK1_mean_retent", groupvars=c("conc","pool"))



# provide common x scale jitter for two overlaid ggplots
# (jitter() is random, so the exact x offsets differ between runs)
summary_retent_4$jitter <- jitter(summary_retent_4$conc,1)


# attach each (conc, pool) jitter value to the individual samples
retent_4 <- merge(retent,summary_retent_4[c("conc","pool","jitter")])

# replicate week 0, drug 0 nM data point four times, to allow four graphs lines linking week 0 and week 1, instead of only one
# summary_retent_4 <- rbind(summary_retent_4[1,],summary_retent_4[1,],summary_retent_4[1,],summary_retent_4)
# summary_retent_4[1:4,"conc"] <- c(0,8,25,75)

# turn retent freq into percentage
retent_4[,c("median_retent_seq", "mean_retent_seq", "TK1_median_retent", "TK1_mean_retent")] <- 100*retent_4[,c("median_retent_seq", "mean_retent_seq", "TK1_median_retent", "TK1_mean_retent")]
summary_retent_4[,c("TK1_mean_retent", "sd", "se", "ci")] <- 100*summary_retent_4[,c("TK1_mean_retent", "sd", "se", "ci")]




# one default ggplot2 colour per pool
n <- length(unique(retent_4$pool))
colores_4 <- gg_color_hue(n)


# aes() uses bare column names so every layer reads from the data frame given in data=,
# not from whatever the global summary_retent_4 / retent_4 hold when the plot is drawn.
p4 <- ggplot() +
	theme2 +
	theme(legend.key=element_blank()) +
	# line through the per-concentration group values of each pool
	geom_line(
		data=summary_retent_4,
		aes(x=jitter, y=TK1_mean_retent, colour=as.factor(pool)),
		lwd=0.2,
		show.legend=FALSE
	) +
	# +/- se around each group value
	geom_linerange(
		data=summary_retent_4,
		aes(
			x=jitter,
			ymin=TK1_mean_retent-se,
			ymax=TK1_mean_retent+se,
			colour=as.factor(pool)
		),
		lwd=0.2,
		show.legend=FALSE
	) +
	# individual samples as open circles; the only layer shown in the legend
	geom_point(
		data=retent_4,
		aes(x=jitter, y=TK1_mean_retent, colour=as.factor(pool)),
		shape=1,
		stroke=0.4,
		size=1.0
	) +
	scale_color_manual(
		values=colores_4,
		name ="Pool",
		labels=c(1,2,3,4,5,6)
	) +
	guides(
		colour = guide_legend(override.aes = list(fill=NA,shape=1,size=1),ncol=3,byrow=TRUE)
	) +
	theme(
		legend.position = c(0.75,1.00),
		legend.title.align=0.6
	) +
	scale_x_continuous(breaks = c(0,8,25,75), labels = c(0,8,25,75)) +
	# ggtitle("") +
	xlab("Conc (nM)") +
	ylab(expression(atop("Retention via ", italic('TK1')~"(%)"))) +
	labs(subtitle="Paclitaxel")
print(p4)


#------------------Make file --------------------------


# Write the 2 x 2 panel figure (A-D) of the mean-based plots to a PDF.
# plot_grid() only returns the figure object; it is drawn on the open device when printed.
# Top-level values are not auto-printed when the script is run via source(), which would
# leave the PDF empty, so print explicitly.
pdf("mean_retent_pools_1.pdf",width=7.5,height= 6.67)
print(plot_grid(p1, p2,p3,p4, labels=c("A", "B","C","D"), ncol = 2, nrow = 2, label_size = 16))
dev.off()

#-------------------------------------------------------


# ------------------------------------------------------------------------------------------
# ----------------- Retention frequencies of RH pools --------------------------------------
# ------------------------------------------------------------------------------------------

# <<<<<<<<<<<< use in paper >>>>>>>>>>>>>>>>>>>>>>>>>>>

# Show the rows for the RH pools themselves: week 0, 0 nM (all columns).
retent[with(retent, week==0 & conc==0),]
       # RH_ID pool week conc cell median_human_reads median_hamster_reads mean_human_reads mean_hamster_reads median_retent_seq mean_retent_seq TK1_median_retent
# 1  RH1_w0_d0    1    0    0    1                 76                15130         95.79096           15209.42        0.01004627      0.01259626        0.05817028
# 21 RH2_w0_d0    2    0    0    6                194                12372        251.74543           12344.90        0.03136114      0.04078533        0.05881380
# 40 RH3_w0_d0    3    0    0   11                246                18255        341.26520           18410.09        0.02695152      0.03707372        0.05445742
# 60 RH4_w0_d0    4    0    0   16                259                11237        337.15985           11269.49        0.04609771      0.05983588        0.05852638
# 80 RH5_w0_d0    5    0    0   21                155                15082        205.01529           15367.61        0.02055430      0.02668147        0.04964056
# 98 RH6_w0_d0    6    0    0   26                154                14877        202.76471           14928.61        0.02070310      0.02716458        0.03787156
   # TK1_mean_retent revert_freq mean_of_medians mean_of_means TK1_max
# 1       0.07331825   0.2368978      0.03410827    0.04295725     997
# 21      0.07632012   0.2008600      0.04508747    0.05855273    2636
# 40      0.07554644   0.2690309      0.04070447    0.05631008    3302
# 60      0.07618821   0.2938419      0.05231205    0.06801204    3125
# 80      0.06565854   0.2845355      0.03509743    0.04617000    2234
# 98      0.04986374   0.3672498      0.02928733    0.03851416    2573


# Column means over the week 0 / 0 nM rows, for column 6 onwards.
# vapply() works column by column and checks that each result is a single number,
# instead of apply() first coercing the whole data frame to a matrix.
vapply(retent[retent$week==0 & retent$conc==0,6:ncol(retent)],mean,numeric(1))
# median_human_reads median_hamster_reads     mean_human_reads   mean_hamster_reads    median_retent_seq      mean_retent_seq    TK1_median_retent 
        # 1.806667e+02         1.449217e+04         2.389569e+02         1.458835e+04         2.595234e-02         3.402287e-02         5.291333e-02 
     # TK1_mean_retent          revert_freq      mean_of_medians        mean_of_means              TK1_max 
        # 6.948255e-02         2.754026e-01         3.943284e-02         5.175271e-02         2.477833e+03   


# Standard error of the mean of x with NA values left out:
# sqrt(sample variance / number of non-missing observations).
sem <- function(x) {
	n_obs <- sum(!is.na(x))
	sample_var <- var(x, na.rm=TRUE)
	sqrt(sample_var / n_obs)
}

# Column standard errors (sem()) over the week 0 / 0 nM rows, for column 6 onwards.
# vapply() works column by column and checks that each result is a single number,
# instead of apply() first coercing the whole data frame to a matrix.
vapply(retent[retent$week==0 & retent$conc==0,6:ncol(retent)],sem,numeric(1))
  # median_human_reads median_hamster_reads     mean_human_reads   mean_hamster_reads    median_retent_seq      mean_retent_seq    TK1_median_retent 
        # 2.763291e+01         1.002546e+03         3.796171e+01         1.028953e+03         4.985934e-03         6.535497e-03         3.333204e-03 
     # TK1_mean_retent          revert_freq      mean_of_medians        mean_of_means              TK1_max 
        # 4.253322e-03         2.305005e-02         3.413425e-03         4.528287e-03         3.358805e+02    
      
# ^^^^^^^^^^^^^^^^^ use in paper ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^      

      
# ------------------------------------------------------------------------------------------
# ----------------- Retention frequencies of all samples --------------------------------------
# ------------------------------------------------------------------------------------------


# Retention measures summarised over all samples. Columns are selected by name
# (previously positions 10:13 and 15:16) so the result does not depend on column order.
all_sample_retent_cols <- c("median_retent_seq","mean_retent_seq","TK1_median_retent","TK1_mean_retent","mean_of_medians","mean_of_means")

# mean of each measure
vapply(retent[all_sample_retent_cols],mean,numeric(1))
# median_retent_seq   mean_retent_seq TK1_median_retent   TK1_mean_retent   mean_of_medians     mean_of_means 
       # 0.01478064        0.03497492        0.02539782        0.05281520        0.02008923        0.04389506 



# standard error of each measure
vapply(retent[all_sample_retent_cols],sem,numeric(1))
# median_retent_seq   mean_retent_seq TK1_median_retent   TK1_mean_retent   mean_of_medians     mean_of_means 
     # 0.0007281475      0.0018448728      0.0012760644      0.0015047539      0.0008819243      0.0013421823    
     
     
# ------------------------------------------------------------------------------------------
# ----------------- Retention frequencies at 6 weeks --------------------------------------
# ------------------------------------------------------------------------------------------
   

# Since growth significant, compare RH pools to RH samples at 6 wks


# median_retent_seq
# Week 6 samples are first averaged within each pool; the mean and SEM are then
# taken across the per-pool averages.
wk6_seq_by_pool <- aggregate(median_retent_seq ~ pool, data=retent[retent$week==6,], FUN=mean)
mean(wk6_seq_by_pool[["median_retent_seq"]])
# [1] 0.009607601 <<<<<<<<<<< use in paper


sem(wk6_seq_by_pool[["median_retent_seq"]])
# [1] 0.001710241 <<<<<<<<<<< use in paper



# TK1_median_retent
wk6_tk1_by_pool <- aggregate(TK1_median_retent ~ pool, data=retent[retent$week==6,], FUN=mean)
mean(wk6_tk1_by_pool[["TK1_median_retent"]])
# [1] 0.01272263 <<<<<<<<<<< use in paper


sem(wk6_tk1_by_pool[["TK1_median_retent"]])
# [1] 0.002254856 <<<<<<<<<<< use in paper








    

      
      
      
      
# ---------------------------------------------------------------------------------------------
# -------------------------- Retention frequencies RH pools, table for paper ------------------  
# ---------------------------------------------------------------------------------------------

# ---------------- Same table in graph_Human_retent_2.R --------------------------


# Table for the paper: the week 0 / 0 nM row of each pool, then the mean and
# standard error of each numeric column across pools. The row filter and the
# column list are defined once and reused for all three outputs.
rh_pool_table_rows <- retent[retent$week==0 & retent$conc==0,]
rh_pool_table_cols <- c("median_human_reads","median_hamster_reads","TK1_max","revert_freq","median_retent_seq","TK1_median_retent","mean_of_medians")

rh_pool_table_rows[,c("pool",rh_pool_table_cols)]
   # pool median_human_reads median_hamster_reads TK1_max revert_freq median_retent_seq TK1_median_retent mean_of_medians
# 1     1                 76                15130     997   0.2368978        0.01004627        0.05817028      0.03410827
# 21    2                194                12372    2636   0.2008600        0.03136114        0.05881380      0.04508747
# 40    3                246                18255    3302   0.2690309        0.02695152        0.05445742      0.04070447
# 60    4                259                11237    3125   0.2938419        0.04609771        0.05852638      0.05231205
# 80    5                155                15082    2234   0.2845355        0.02055430        0.04964056      0.03509743
# 98    6                154                14877    2573   0.3672498        0.02070310        0.03787156      0.02928733


# vapply() goes column by column (no matrix coercion of the data frame as with apply())
vapply(rh_pool_table_rows[rh_pool_table_cols],mean,numeric(1))
  # median_human_reads median_hamster_reads              TK1_max          revert_freq    median_retent_seq    TK1_median_retent      mean_of_medians 
        # 1.806667e+02         1.449217e+04         2.477833e+03         2.754026e-01         2.595234e-02         5.291333e-02         3.943284e-02 


vapply(rh_pool_table_rows[rh_pool_table_cols],sem,numeric(1))
  # median_human_reads median_hamster_reads              TK1_max          revert_freq    median_retent_seq    TK1_median_retent      mean_of_medians 
        # 2.763291e+01         1.002546e+03         3.358805e+02         2.305005e-02         4.985934e-03         3.333204e-03         3.413425e-03 

                     




# -------------------------------------------------------------------------------------------
# -------- Mixed model to analyze effects week, conc, pool on retention frequency -----------
# -------- only used mean_of_medians retention in paper -------------------------------------
# -------------------------------------------------------------------------------------------		


# Tried to use glht and gam to get p vals at average week and drug together with gam and delta_logLik() for p vals of random effects
# However, no simple way to get ICC for mgcv::gam that I could find. So used lmer. lmer is ok here, because we have continuous output Gaussian and nb not appropriate. Tried glmmTMB but lots of complaints about stability using ML and did not work using REML. lmer seemed more stable.


library(mgcv)
library(lme4)
library(multcomp)
library(sjstats)

# Copy used for the models below, so that rescaling and factor conversion leave retent untouched.
retent_stat <- retent

# turn retent freq into percentage
retent_stat[c("median_retent_seq", "mean_retent_seq", "TK1_median_retent", "TK1_mean_retent", "mean_of_medians", "mean_of_means")] <- lapply(
	retent_stat[c("median_retent_seq", "mean_retent_seq", "TK1_median_retent", "TK1_mean_retent", "mean_of_medians", "mean_of_means")],
	function(freq) 100 * freq
)

# pool and cell are used as grouping terms in the models below; store them as factors
retent_stat[c("pool", "cell")] <- lapply(retent_stat[c("pool", "cell")], as.factor)





# -------- median aligned sequence based retention -----------------


# Linear mixed model fitted by REML: fixed effects week, conc and their interaction;
# random intercepts for pool and for cell nested within pool.
m1 <- lmer(median_retent_seq ~ week * conc + (1|pool/cell), data = retent_stat, REML=TRUE)

# Test each fixed-effect coefficient against 0 (z tests), with no multiplicity
# adjustment of the p values (see "none method" in the transcript below).
summary(glht(m1),test=adjusted("none"))

	 # Simultaneous Tests for General Linear Hypotheses

# Fit: lmer(formula = median_retent_seq ~ week * conc + (1 | pool/cell), 
    # data = retent_stat, REML = TRUE)

# Linear Hypotheses:
                   # Estimate Std. Error z value Pr(>|z|)    
# (Intercept) == 0  2.1928633  0.2396080   9.152  < 2e-16 ***
# week == 0        -0.1837178  0.0281275  -6.532 6.51e-11 ***
# conc == 0        -0.0023045  0.0038249  -0.602    0.547    
# week:conc == 0   -0.0006308  0.0008325  -0.758    0.449    
# ---
# Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
# (Adjusted p values reported -- none method)





# Random effects

# pool alone:
# Likelihood-ratio test of the pool random intercept: model with (1|pool) + (1|cell)
# against the model with (1|cell) only. anova() refits both with ML before comparing
# (see the "refitting model(s)" message in the transcript below).
 anova(lmer(median_retent_seq ~ week * conc + (1|pool) + (1|cell), data = retent_stat, REML=TRUE),lmer(median_retent_seq ~ week * conc + (1|cell), data = retent_stat, REML=TRUE))
# refitting model(s) with ML (instead of REML)
# Data: retent_stat
# Models:
# lmer(median_retent_seq ~ week * conc + (1 | cell), data = retent_stat, REML = TRUE): median_retent_seq ~ week * conc + (1 | cell)
# lmer(median_retent_seq ~ week * conc + (1 | pool) + (1 | cell), data = retent_stat, REML = TRUE): median_retent_seq ~ week * conc + (1 | pool) + (1 | cell)
                                                                                                 # Df    AIC    BIC  logLik deviance  Chisq Chi Df Pr(>Chisq)    
# lmer(median_retent_seq ~ week * conc + (1 | cell), data = retent_stat, REML = TRUE)               6 191.91 208.38 -89.953   179.91                             
# lmer(median_retent_seq ~ week * conc + (1 | pool) + (1 | cell), data = retent_stat, REML = TRUE)  7 181.40 200.61 -83.699   167.40 12.508      1  0.0004053 *** <<<<<<<<<< use
# ---
# Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1





# cell alone:
# Compare the model with both pool and cell random intercepts against the
# model without the cell term (anova() refits both with ML before comparing).
anova(
  lmer(median_retent_seq ~ week * conc + (1 | pool) + (1 | cell), data = retent_stat, REML = TRUE),
  lmer(median_retent_seq ~ week * conc + (1 | pool), data = retent_stat, REML = TRUE)
)
# refitting model(s) with ML (instead of REML)
# Data: retent_stat
# Models:
# lmer(median_retent_seq ~ week * conc + (1 | pool), data = retent_stat, REML = TRUE): median_retent_seq ~ week * conc + (1 | pool)
# lmer(median_retent_seq ~ week * conc + (1 | pool) + (1 | cell), data = retent_stat, REML = TRUE): median_retent_seq ~ week * conc + (1 | pool) + (1 | cell)
                                                                                                 # Df    AIC    BIC  logLik deviance  Chisq Chi Df Pr(>Chisq)  
# lmer(median_retent_seq ~ week * conc + (1 | pool), data = retent_stat, REML = TRUE)               6 182.27 198.74 -85.135   170.27                           
# lmer(median_retent_seq ~ week * conc + (1 | pool) + (1 | cell), data = retent_stat, REML = TRUE)  7 181.40 200.61 -83.699   167.40 2.8729      1    0.09008 <<<<<<<<<<< use
# ---
# Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1



# pool and cell together
# Compare the full mixed model against an ordinary linear model with the same
# fixed effects and no random effects.
# lm() has no REML argument: the previous `REML=TRUE` was forwarded to lm.fit(),
# which disregards it with a warning, so it is dropped here. The fitted lm and
# the test statistics are unaffected; only the lm row label printed by anova()
# no longer contains "REML = TRUE".
anova(
  lmer(median_retent_seq ~ week * conc + (1 | pool / cell), data = retent_stat, REML = TRUE),
  lm(median_retent_seq ~ week * conc, data = retent_stat)
)
# Data: retent_stat
# Models:
# lm(median_retent_seq ~ week * conc, data = retent_stat, REML = TRUE): median_retent_seq ~ week * conc
# lmer(median_retent_seq ~ week * conc + (1 | pool/cell), data = retent_stat, REML = TRUE): median_retent_seq ~ week * conc + (1 | pool/cell)
                                                                                         # Df    AIC    BIC   logLik deviance  Chisq Chi Df Pr(>Chisq)    
# lm(median_retent_seq ~ week * conc, data = retent_stat, REML = TRUE)                      5 239.19 252.92 -114.597   229.19                             
# lmer(median_retent_seq ~ week * conc + (1 | pool/cell), data = retent_stat, REML = TRUE)  7 181.40 200.61  -83.699   167.40 61.796      2  3.812e-14 ***
# ---
# Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1



# icc
# Intraclass correlations from the variance components of m1. The variance
# decomposition is extracted once and reused for all three ratios.

m1_var <- get_variance(m1)
m1_var_pool <- m1_var$var.intercept["pool"]
m1_var_cell <- m1_var$var.intercept["cell:pool"]
# Total variance = pool + cell-within-pool + residual.
m1_var_total <- m1_var_pool + m1_var_cell + m1_var$var.residual

# pool
m1_var_pool / m1_var_total
     # pool 
# 0.4943498 <<<<<<<<<< use in paper




# cell
m1_var_cell / m1_var_total
# cell:pool 
# 0.1753048 <<<<<<<<<< use in paper




# pool and cell together
# (the printed name is "pool" because R keeps the name of the first operand)
(m1_var_pool + m1_var_cell) / m1_var_total
     # pool 
# 0.6696545








# glht for average fixed effects:
# growth = effect of week evaluated at conc = 27; drug = effect of conc
# evaluated at week = 3.2; Ix = the week:conc interaction coefficient.
# The values 27 and 3.2 are hard-coded; presumably they are the average conc
# and week of the design -- TODO confirm.

glht_growth <- glht(m1, linfct = "week + 27*week:conc == 0")
glht_drug <- glht(m1, linfct = "conc + (3.2)*week:conc == 0")
glht_omni <- glht(m1)

# Summarise each hypothesis object once (no multiplicity adjustment) and pull
# the estimates and z statistics from the stored test tables.
growth_test <- summary(glht_growth, test = adjusted("none"))$test
drug_test <- summary(glht_drug, test = adjusted("none"))$test
omni_test <- summary(glht_omni, test = adjusted("none"))$test

growth_stat <- growth_test$tstat
drug_stat <- drug_test$tstat
Ix_stat <- omni_test$tstat["week:conc"]
growth_coef <- growth_test$coefficients
drug_coef <- drug_test$coefficients
Ix_coef <- omni_test$coefficients["week:conc"]

# Estimates, z statistics and two-sided normal p-values, in that order.
ans <- setNames(
  c(
    growth_coef, drug_coef, Ix_coef,
    growth_stat, drug_stat, Ix_stat,
    2 * pnorm(-abs(c(growth_stat, drug_stat, Ix_stat)))
  ),
  c("growth_coef", "drug_coef", "Ix_coef", "growth_Z", "drug_Z", "Ix_Z", "growth_P", "drug_P", "Ix_P")
)

ans
  # growth_coef     drug_coef       Ix_coef      growth_Z        drug_Z          Ix_Z      growth_P        drug_P          Ix_P 
# -2.007499e-01 -4.323076e-03 -6.308155e-04 -8.866058e+00 -1.673049e+00 -7.577123e-01  7.578130e-19  9.431771e-02  4.486233e-01     




# emmeans
# Provides kenward-roger
# lmer.df="asymptotic" agrees with glht exactly

# growth
# Week 1 minus week 0 with conc fixed at 27, using Kenward-Roger degrees of freedom.

summary(
  contrast(
    emmeans(
      m1,
      specs = "week",
      at = list(week = c(0, 1), conc = 27),
      lmer.df = "kenward-roger"
    ),
    list(c(-1, 1))
  )
)
# NOTE: Results may be misleading due to involvement in interactions
 # contrast estimate     SE   df t.ratio p.value
 # c(-1, 1)   -0.201 0.0227 91.2 -8.839  <.0001 <<<<<<<<<<< use in paper
 
 
# Same growth contrast as above; extract the unrounded p-value.
summary(
  contrast(
    emmeans(
      m1,
      specs = "week",
      at = list(week = c(0, 1), conc = 27),
      lmer.df = "kenward-roger"
    ),
    list(c(-1, 1))
  )
)[["p.value"]]
# NOTE: Results may be misleading due to involvement in interactions
# [1] 6.772853e-14 <<<<<<<<<<< use in paper
 
 

# drug
# conc 1 minus conc 0 with week fixed at 3.2, using Kenward-Roger degrees of freedom.
summary(
  contrast(
    emmeans(
      m1,
      specs = "conc",
      at = list(conc = c(0, 1), week = 3.2),
      lmer.df = "kenward-roger"
    ),
    list(c(-1, 1))
  )
)
# NOTE: Results may be misleading due to involvement in interactions
 # contrast estimate      SE   df t.ratio p.value
 # c(-1, 1) -0.00432 0.00259 24.5 -1.667  0.1082  <<<<<<<<<<< use in paper



# Ix
# Interaction contrast over the 2 x 2 grid week = {0, 1} x conc = {0, 1},
# using Kenward-Roger degrees of freedom.
summary(
  contrast(
    emmeans(
      m1,
      specs = c("week", "conc"),
      at = list(week = c(0, 1), conc = c(0, 1)),
      lmer.df = "kenward-roger"
    ),
    list(c(1, -1, -1, 1))
  )
)
 # contrast         estimate       SE   df t.ratio p.value
 # c(1, -1, -1, 1) -0.000631 0.000836 93.2 -0.755  0.4523 <<<<<<<<<<< use in paper







# -------- median TK1 based retention -----------------



# Gaussian mixed model for the median TK1-based retention:
# fixed effects week, conc and their interaction; random intercepts for
# pool and for cell nested within pool. Fitted by REML.
m2 <- lmer(
  TK1_median_retent ~ week * conc + (1 | pool / cell),
  data = retent_stat,
  REML = TRUE
)

# Wald z tests of each fixed-effect coefficient, without multiplicity adjustment.
summary(glht(m2), test = adjusted("none"))

	 # Simultaneous Tests for General Linear Hypotheses

# Fit: lmer(formula = TK1_median_retent ~ week * conc + (1 | pool/cell), 
    # data = retent_stat, REML = TRUE)

# Linear Hypotheses:
                   # Estimate Std. Error z value Pr(>|z|)    
# (Intercept) == 0  4.2683940  0.2690641  15.864   <2e-16 ***
# week == 0        -0.5311610  0.0447807 -11.861   <2e-16 ***
# conc == 0         0.0016776  0.0059952   0.280    0.780    
# week:conc == 0   -0.0004442  0.0013277  -0.335    0.738    
# ---
# Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
# (Adjusted p values reported -- none method)





# Random effects

# pool alone:
# Compare the model with both pool and cell random intercepts against the
# model without the pool term (anova() refits both with ML before comparing).
anova(
  lmer(TK1_median_retent ~ week * conc + (1 | pool) + (1 | cell), data = retent_stat, REML = TRUE),
  lmer(TK1_median_retent ~ week * conc + (1 | cell), data = retent_stat, REML = TRUE)
)
# refitting model(s) with ML (instead of REML)
# Data: retent_stat
# Models:
# lmer(TK1_median_retent ~ week * conc + (1 | cell), data = retent_stat, REML = TRUE): TK1_median_retent ~ week * conc + (1 | cell)
# lmer(TK1_median_retent ~ week * conc + (1 | pool) + (1 | cell), data = retent_stat, REML = TRUE): TK1_median_retent ~ week * conc + (1 | pool) + (1 | cell)
                                                                                                 # Df    AIC    BIC  logLik deviance  Chisq Chi Df Pr(>Chisq)  
# lmer(TK1_median_retent ~ week * conc + (1 | cell), data = retent_stat, REML = TRUE)               6 284.25 300.72 -136.12   272.25                           
# lmer(TK1_median_retent ~ week * conc + (1 | pool) + (1 | cell), data = retent_stat, REML = TRUE)  7 281.89 301.10 -133.94   267.89 4.3595      1     0.0368 *  <<<<<<<<<< use in paper
# ---
# Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1





# cell alone:
# Compare the model with both pool and cell random intercepts against the
# model without the cell term (anova() refits both with ML before comparing).
anova(
  lmer(TK1_median_retent ~ week * conc + (1 | pool) + (1 | cell), data = retent_stat, REML = TRUE),
  lmer(TK1_median_retent ~ week * conc + (1 | pool), data = retent_stat, REML = TRUE)
)
# refitting model(s) with ML (instead of REML)
# Data: retent_stat
# Models:
# lmer(TK1_median_retent ~ week * conc + (1 | pool), data = retent_stat, REML = TRUE): TK1_median_retent ~ week * conc + (1 | pool)
# lmer(TK1_median_retent ~ week * conc + (1 | pool) + (1 | cell), data = retent_stat, REML = TRUE): TK1_median_retent ~ week * conc + (1 | pool) + (1 | cell)
                                                                                                 # Df    AIC    BIC  logLik deviance  Chisq Chi Df Pr(>Chisq)  
# lmer(TK1_median_retent ~ week * conc + (1 | pool), data = retent_stat, REML = TRUE)               6 284.84 301.31 -136.42   272.84                           
# lmer(TK1_median_retent ~ week * conc + (1 | pool) + (1 | cell), data = retent_stat, REML = TRUE)  7 281.89 301.10 -133.94   267.89 4.9485      1    0.02611 *   <<<<<<<<<< use in paper
# ---
# Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1





# pool and cell together
# Compare the full mixed model against an ordinary linear model with the same
# fixed effects and no random effects.
# lm() has no REML argument: the previous `REML=TRUE` was forwarded to lm.fit(),
# which disregards it with a warning, so it is dropped here. The fitted lm and
# the test statistics are unaffected; only the lm row label printed by anova()
# no longer contains "REML = TRUE".
anova(
  lmer(TK1_median_retent ~ week * conc + (1 | pool / cell), data = retent_stat, REML = TRUE),
  lm(TK1_median_retent ~ week * conc, data = retent_stat)
)
# Data: retent_stat
# Models:
# lm(TK1_median_retent ~ week * conc, data = retent_stat, REML = TRUE): TK1_median_retent ~ week * conc
# lmer(TK1_median_retent ~ week * conc + (1 | pool/cell), data = retent_stat, REML = TRUE): TK1_median_retent ~ week * conc + (1 | pool/cell)
                                                                                         # Df    AIC    BIC  logLik deviance Chisq Chi Df Pr(>Chisq)    
# lm(TK1_median_retent ~ week * conc, data = retent_stat, REML = TRUE)                      5 308.49 322.21 -149.24   298.49                            
# lmer(TK1_median_retent ~ week * conc + (1 | pool/cell), data = retent_stat, REML = TRUE)  7 281.89 301.10 -133.94   267.89  30.6      2  2.266e-07 ***






# icc
# Intraclass correlations from the variance components of m2. The variance
# decomposition is extracted once and reused for all three ratios.

m2_var <- get_variance(m2)
m2_var_pool <- m2_var$var.intercept["pool"]
m2_var_cell <- m2_var$var.intercept["cell:pool"]
# Total variance = pool + cell-within-pool + residual.
m2_var_total <- m2_var_pool + m2_var_cell + m2_var$var.residual

# pool
m2_var_pool / m2_var_total
     # pool 
# 0.2641526 <<<<<<<<<< use in paper




# cell
m2_var_cell / m2_var_total
# cell:pool 
# 0.2312443 <<<<<<<<<< use in paper




# pool and cell together
# (the printed name is "pool" because R keeps the name of the first operand)
(m2_var_pool + m2_var_cell) / m2_var_total
     # pool 
# 0.4953968





# glht for average fixed effects:
# growth = effect of week evaluated at conc = 27; drug = effect of conc
# evaluated at week = 3.2; Ix = the week:conc interaction coefficient.
# The values 27 and 3.2 are hard-coded; presumably they are the average conc
# and week of the design -- TODO confirm.

glht_growth <- glht(m2, linfct = "week + 27*week:conc == 0")
glht_drug <- glht(m2, linfct = "conc + (3.2)*week:conc == 0")
glht_omni <- glht(m2)

# Summarise each hypothesis object once (no multiplicity adjustment) and pull
# the estimates and z statistics from the stored test tables.
growth_test <- summary(glht_growth, test = adjusted("none"))$test
drug_test <- summary(glht_drug, test = adjusted("none"))$test
omni_test <- summary(glht_omni, test = adjusted("none"))$test

growth_stat <- growth_test$tstat
drug_stat <- drug_test$tstat
Ix_stat <- omni_test$tstat["week:conc"]
growth_coef <- growth_test$coefficients
drug_coef <- drug_test$coefficients
Ix_coef <- omni_test$coefficients["week:conc"]

# Estimates, z statistics and two-sided normal p-values, in that order.
ans <- setNames(
  c(
    growth_coef, drug_coef, Ix_coef,
    growth_stat, drug_stat, Ix_stat,
    2 * pnorm(-abs(c(growth_stat, drug_stat, Ix_stat)))
  ),
  c("growth_coef", "drug_coef", "Ix_coef", "growth_Z", "drug_Z", "Ix_Z", "growth_P", "drug_P", "Ix_P")
)

ans
  # growth_coef     drug_coef       Ix_coef      growth_Z        drug_Z          Ix_Z      growth_P        drug_P          Ix_P 
# -5.431533e-01  2.562459e-04 -4.441581e-04 -1.503049e+01  6.479945e-02 -3.345224e-01  4.635772e-51  9.483337e-01  7.379854e-01  






# emmeans
# Provides kenward-roger
# lmer.df="asymptotic" agrees with glht exactly

# growth
# Week 1 minus week 0 with conc fixed at 27, using Kenward-Roger degrees of freedom.

summary(
  contrast(
    emmeans(
      m2,
      specs = "week",
      at = list(week = c(0, 1), conc = 27),
      lmer.df = "kenward-roger"
    ),
    list(c(-1, 1))
  )
)
# NOTE: Results may be misleading due to involvement in interactions
 # contrast estimate     SE   df t.ratio p.value
 # c(-1, 1)   -0.543 0.0362 91.5 -14.984 <.0001  <<<<<<<<<<< use in paper
 
 
 
# Same growth contrast as above; extract the unrounded p-value.
summary(
  contrast(
    emmeans(
      m2,
      specs = "week",
      at = list(week = c(0, 1), conc = 27),
      lmer.df = "kenward-roger"
    ),
    list(c(-1, 1))
  )
)[["p.value"]]
# NOTE: Results may be misleading due to involvement in interactions
# [1] 2.322106e-26 <<<<<<<<<<< use in paper
 
 

# drug
# conc 1 minus conc 0 with week fixed at 3.2, using Kenward-Roger degrees of freedom.
summary(
  contrast(
    emmeans(
      m2,
      specs = "conc",
      at = list(conc = c(0, 1), week = 3.2),
      lmer.df = "kenward-roger"
    ),
    list(c(-1, 1))
  )
)
# NOTE: Results may be misleading due to involvement in interactions
 # contrast estimate      SE   df t.ratio p.value
 # c(-1, 1) 0.000256 0.00397 24.7 0.065   0.9491   <<<<<<<<<<< use in paper



# Ix
# Interaction contrast over the 2 x 2 grid week = {0, 1} x conc = {0, 1},
# using Kenward-Roger degrees of freedom.
summary(
  contrast(
    emmeans(
      m2,
      specs = c("week", "conc"),
      at = list(week = c(0, 1), conc = c(0, 1)),
      lmer.df = "kenward-roger"
    ),
    list(c(1, -1, -1, 1))
  )
)
 # contrast         estimate      SE   df t.ratio p.value
 # c(1, -1, -1, 1) -0.000444 0.00133 93.6 -0.333  0.7398 <<<<<<<<<<< use in paper








# ------------------- Recalculate mixed model using negative binomial rather than Gaussian ----------------------------

# Shows that using derived quantity (retention) and Gaussian mixed model more correctly evaluates data than using sequence alignments and negative binomial mixed model.
# Also, for human mitochondria (cf mito_copy_num_1.R), trying to test if estimate is significantly different from zero is impossible using negative binomial.


# From A23_HEK_mito_copy_num_1.R:

# Hamster mito is 16283 bp from RH_PICR_RAW_ordered_fixed_11_29_18.xlsx (this is the actual mito sequence used for alignment, not the updated version on https://www.ncbi.nlm.nih.gov/nuccore/CM010855.1, which is, nicely, also 16283 bp.)

# Hamster genome excluding mitochondria is 2368906908 bp from https://www.ncbi.nlm.nih.gov/assembly/GCA_003668045.1/


# include mitochondria in genome length, since aligned reads include mitochondria:
# (nuclear assembly length plus mitochondrial length, both in bp)
hamster_genome <- local({
  nuclear_bp <- 2368906908
  mito_bp <- 16283
  nuclear_bp + mito_bp
})




# From A23_HEK_mito_copy_num_1.R:

# Human mito is 16569 bp from "RH_pool against_hg38_fixed-11_24_18.xlsx" (this is the actual mito sequence used for alignment, not the updated version on https://mitomap.org//bin/view.pl/MITOMAP/HumanMitoSeq, which is, nicely, also 16569 bp.)

# Human genome excluding mitochondrion is 3088269832 bp from human_chr_lengths_1.R

# Human genome length in bp, including the mitochondrion (16569 bp).
human_genome <- local({
  nuclear_bp <- 3088269832
  mito_bp <- 16569
  nuclear_bp + mito_bp
})

# Divisor used to scale TK1_max in the negative binomial offset below.
# NOTE(review): presumably the length in bp of the region over which TK1_max
# reads are counted -- confirm.
TK1_max_length <- 10^6




# Read the per-sample alignment totals against the human and hamster genomes.
human_align <- read.table(
  "RH_pool_human_total_align.txt",
  header = TRUE,
  sep = "\t",
  stringsAsFactors = FALSE
)
hamster_align <- read.table(
  "RH_pool_hamster_total_align.txt",
  header = TRUE,
  sep = "\t",
  stringsAsFactors = FALSE
)

# Join the species-specific read counts on the sample descriptor columns they
# share, then give the two count columns (positions 6 and 7) short names.
align_keys <- c("RH_ID", "pool", "week", "conc", "cell")
align <- merge(
  human_align[, c(align_keys, "human_aligned_and_hamster_unaligned")],
  hamster_align[, c(align_keys, "hamster_aligned_and_human_unaligned")]
)
colnames(align)[6:7] <- c("human_align", "hamster_align")

# Add the TK1_max column; merge() joins on whatever columns the two tables share.
align <- merge(align, TK1_max)

head(align)
       # RH_ID pool week conc cell human_align hamster_align TK1_max
# 1  RH1_w0_d0    1    0    0    1      298433      37595178     997
# 2  RH1_w1_d0    1    1    0    2      372475      42886773    1398
# 3 RH1_w1_d25    1    1   25    4      321998      49651729    1255
# 4 RH1_w1_d75    1    1   75    5      247321      59267597     866
# 5  RH1_w1_d8    1    1    8    3      425169      55742797    1531
# 6  RH1_w2_d0    1    2    0    2      310062      34210971    1805


# Negative binomial mixed model of human read counts with random effects for
# pool and cell; the offset is the hamster read count scaled by the ratio of
# genome lengths.
#
# s(x, bs = "re") fits one random intercept per level only when x is a factor.
# pool and cell are read from the alignment tables as numbers, and a numeric x
# yields a single random slope on the numeric code instead (the summary pasted
# below reports Ref.df = 1 for both terms, i.e. one coefficient each). The
# grouping columns are therefore converted to factors in the data passed to
# gam(); `align` itself is left unmodified.
# NOTE(review): the summary output pasted below was produced before this
# correction and needs to be regenerated.
# NOTE: this assignment replaces the lmer fit previously stored in m1.
m1 <- gam(
  human_align ~ week * conc + s(pool, bs = "re") + s(cell, bs = "re") +
    offset(log(hamster_align * (human_genome / hamster_genome))),
  data = transform(align, pool = as.factor(pool), cell = as.factor(cell)),
  family = nb,
  method = "REML"
)


summary(m1)

# Family: Negative Binomial(3.58) 
# Link function: log 

# Formula:
# human_align ~ week * conc + s(pool, bs = "re") + s(cell, bs = "re") + 
    # offset(log(hamster_align * (human_genome/hamster_genome)))

# Parametric coefficients:
              # Estimate Std. Error z value Pr(>|z|)    
# (Intercept) -4.7956900  0.1561470 -30.713  < 2e-16 ***
# week         0.0871808  0.0337330   2.584  0.00975 ** 
# conc        -0.0014237  0.0040160  -0.355  0.72295    
# week:conc   -0.0006912  0.0010171  -0.680  0.49676    
# ---
# Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

# Approximate significance of smooth terms:
             # edf Ref.df Chi.sq  p-value    
# s(pool) 0.958004      1  23.84 1.29e-06 ***
# s(cell) 0.001472      1   0.00    0.668    
# ---
# Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

# R-sq.(adj) =  -0.0148   Deviance explained = 21.1%
# -REML = 1648.2  Scale est. = 1         n = 115


# neg binomial mixed mod gives misleading results in this context (eg week should have significant negative coefficient) because reads in aggregate are strongly skewed by CEN peaks and other deviations


# Oddly, using TK1_max gives opposite results to hamster aligned sequences. Seems using derived quantity (retention) better conveys situation accurately.

# Negative binomial mixed model of human read counts with random effects for
# pool and cell; the offset is TK1_max scaled by human_genome / TK1_max_length.
#
# s(x, bs = "re") fits one random intercept per level only when x is a factor.
# pool and cell are read from the alignment tables as numbers, and a numeric x
# yields a single random slope on the numeric code instead (the summary pasted
# below reports Ref.df = 1 for both terms, i.e. one coefficient each). The
# grouping columns are therefore converted to factors in the data passed to
# gam(); `align` itself is left unmodified.
# NOTE(review): the summary output pasted below was produced before this
# correction and needs to be regenerated.
m1 <- gam(
  human_align ~ week * conc + s(pool, bs = "re") + s(cell, bs = "re") +
    offset(log(TK1_max * (human_genome / TK1_max_length))),
  data = transform(align, pool = as.factor(pool), cell = as.factor(cell)),
  family = nb,
  method = "REML"
)

summary(m1)

# Family: Negative Binomial(12.392) 
# Link function: log 

# Formula:
# human_align ~ week * conc + s(pool, bs = "re") + s(cell, bs = "re") + 
    # offset(log(TK1_max * (human_genome/TK1_max_length)))

# Parametric coefficients:
              # Estimate Std. Error z value Pr(>|z|)    
# (Intercept) -2.4903057  0.0634800 -39.230   <2e-16 ***
# week        -0.0402509  0.0181300  -2.220   0.0264 *  
# conc        -0.0007690  0.0021513  -0.357   0.7207    
# week:conc    0.0003020  0.0005459   0.553   0.5801    
# ---
# Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

# Approximate significance of smooth terms:
             # edf Ref.df Chi.sq p-value
# s(pool) 0.002707      1  0.002   0.403
# s(cell) 0.002841      1  0.002   0.461

# R-sq.(adj) =  0.648   Deviance explained = 5.25%
# -REML =   1574  Scale est. = 1         n = 115


# Above results may be more consistent at average conditional values





















































































