library(ggplot2)
library(cowplot)
library(lme4)
library(multcomp)

# for summarySE()
library(Rmisc)

# from https://stackoverflow.com/questions/8197559/emulate-ggplot2-default-color-palette
# function for default ggplot2 colors

# Emulate the default ggplot2 discrete colour palette.
#
# n: number of colours wanted (non-negative whole number).
# Returns a character vector of n hex colour strings whose hues are evenly
# spaced starting at 15 degrees, at luminance 65 and chroma 100.
gg_color_hue <- function(n) {
  # n + 1 hues are generated over 15..375; only the first n are kept, so the
  # end point 375 (same hue as 15) is never returned twice.
  hues <- seq(15, 375, length.out = n + 1)
  # seq_len() instead of 1:n so that n = 0 gives an empty vector, not 1 colour
  hcl(h = hues, l = 65, c = 100)[seq_len(n)]
}

# Shared ggplot2 theme for the figure panels: blank panel background and grid,
# thin black axes, legend hidden by default (individual plots may re-enable it).
theme2 <- theme(
  plot.margin = unit(c(t = 1.3, r = 0.7, b = 1.3, l = 0.7), "cm"),

  # panel: no grid, no background
  panel.grid.major = element_blank(),
  panel.grid.minor = element_blank(),
  panel.background = element_blank(),

  legend.position = "none",

  # axes
  axis.line.x = element_line(colour = "black", size = 0.1),
  axis.line.y = element_line(colour = "black", size = 0.1),
  axis.ticks = element_line(colour = "black", size = 0.1),
  axis.text = element_text(size = 12),  # numbers on tick marks of x and y axes
  axis.title = element_text(size = 14), # titles of x and y axes
  # separates the y axis title from the axis by adding margin space on its right
  axis.title.y = element_text(margin = margin(t = 0, r = 13, b = 0, l = 0)),
  # separates the x axis title from the axis by adding margin space on its top
  axis.title.x = element_text(margin = margin(t = 10, r = 0, b = 0, l = 0)),

  # titles: panel letters ("A", "B") can be given via ggtitle, but plot_grid
  # was used instead because it can shift them further left
  plot.title = element_text(size = 32, face = "bold", hjust = -0.14),
  plot.subtitle = element_text(size = 14, face = "plain", hjust = 0.5), # centred

  # legend geometry (only visible when a plot overrides legend.position)
  legend.margin = margin(t = 15, r = 0, b = 0, l = -5, unit = "pt"),
  legend.box.margin = margin(t = 15, r = -10, b = 0, l = -5, unit = "pt"),
  legend.key.height = unit(0.1, "cm"),
  legend.key.width = unit(0.3, "cm"),
  legend.spacing.y = unit(0.1, 'cm'),
  legend.spacing.x = unit(0.1, 'cm'),
  legend.title = element_text(size = 9),
  legend.text = element_text(size = 8)
  # legend.title.align=0.0
)

	
	
# Shared size constants for points and horizontal lines.
# NOTE(review): neither is referenced in the visible part of this script;
# presumably they are used by later plot sections - confirm.
size_point <- 0.3
size_hline <- 0.1


# --------- Combine human and hamster align files to deduce human retention frequency from bulk seq reads -----------------
# --------- This calculation uses overall alignments, not medians and means of non-overlapping 1 Mb windows ---------------


# Per-sample alignment totals against the human genome (tab-delimited, with header).
human <- read.table(
  file = "RH_pool_human_total_align.txt",
  sep = "\t",
  header = TRUE,
  stringsAsFactors = FALSE
)

# Sanity checks when run interactively: table dimensions and first rows.
dim(human)
# [1] 115   10

head(human)

      # RH_ID pool week conc cell Total_reads human_unaligned human_aligned human_aligned_and_hamster_unaligned
# 1 RH1_w0_d0    1    0    0    1    42000172        41601530        398642                              298433
# 2 RH1_w1_d0    1    1    0    2    47971567        47496634        474933                              372475
# 3 RH1_w2_d0    1    2    0    2    38276132        37777814        498318                              310062
# 4 RH1_w3_d0    1    3    0    2    34816772        34444880        371892                              288187
# 5 RH1_w4_d0    1    4    0    2    36328023        35818437        509586                              326399
# 6 RH1_w6_d0    1    6    0    2    50327669        49691822        635847                              527642
  # human_aligned_and_hamster_aligned
# 1                            100209
# 2                            102458
# 3                            188256
# 4                             83705
# 5                            183187
# 6                            108205


# Per-sample alignment totals against the hamster genome (tab-delimited, with header).
hamster <- read.table(
  file = "RH_pool_hamster_total_align.txt",
  sep = "\t",
  header = TRUE,
  stringsAsFactors = FALSE
)

# Sanity checks when run interactively: table dimensions and first rows.
dim(hamster)
# [1] 115   10

head(hamster)
      # RH_ID pool week conc cell Total_reads hamster_unaligned hamster_aligned hamster_aligned_and_human_unaligned
# 1 RH1_w0_d0    1    0    0    1    42000172           4304785        37695387                            37595178
# 2 RH1_w1_d0    1    1    0    2    47971567           4982337        42989230                            42886773
# 3 RH1_w2_d0    1    2    0    2    38276132           3876901        34399231                            34210971
# 4 RH1_w3_d0    1    3    0    2    34816772           3585610        31231162                            31147459
# 5 RH1_w4_d0    1    4    0    2    36328023           3623841        32704182                            32520994
# 6 RH1_w6_d0    1    6    0    2    50327669           5325221        45002448                            44894243
  # hamster_aligned_and_human_aligned
# 1                            100209
# 2                            102457
# 3                            188260
# 4                             83703
# 5                            183188
# 6                            108205

# Pair, per sample, the reads that align only to hamster with the reads that
# align only to human. RH_ID is the single column the two subsets share, so it
# is the merge key.
Human_retent <- merge(
  hamster[c("RH_ID", "hamster_aligned_and_human_unaligned")],
  human[c("RH_ID", "human_aligned_and_hamster_unaligned")],
  by = "RH_ID"
)


# Retention = human-only reads / hamster-only reads.
# Factor of 2, because human fragments are haploid and A23 is diploid.
Human_retent$human_retent <- with(
  Human_retent,
  2 * human_aligned_and_hamster_unaligned / hamster_aligned_and_human_unaligned
)


dim(Human_retent)
# [1] 115   4


head(Human_retent)
       # RH_ID hamster_aligned_and_human_unaligned human_aligned_and_hamster_unaligned human_retent
# 1  RH1_w0_d0                            37595178                              298433  0.015876132
# 2  RH1_w1_d0                            42886773                              372475  0.017370157
# 3 RH1_w1_d25                            49651729                              321998  0.012970263
# 4 RH1_w1_d75                            59267597                              247321  0.008345909
# 5  RH1_w1_d8                            55742797                              425169  0.015254670
# 6  RH1_w2_d0                            34210971                              310062  0.018126466

# # -------- Correct human retention because different lengths human and hamster genome. ----------
# # -------- Human and hamster genome lengths including mitochondrion from A23_HEK_mito_copy_num_1.R --------------

# Genome lengths used to rescale the read-count ratio
# (per the section header: lengths include the mitochondrion).
hg38 <- 3088286401
picr <- 2368939474

# Scale retention by the hamster/human genome length ratio.
Human_retent[["corr_human_retent"]] <- (picr / hg38) * Human_retent[["human_retent"]]




# --------- Calculate retention of human DNA in RH pools using 1 Mb windows ------------



#----------------- Prepare human retain RH pools ---------------------


# Windowed read counts against the human genome for the RH pools.
# Columns used below: Chromosome, posS, posE, pos; the code further down treats
# column 5 onwards as per-sample columns.
RH_human <- read.table("RH_human_gseq.txt", header = TRUE, sep = "\t", stringsAsFactors = FALSE)

# Rows for the first window (0 to 1e6) of each chromosome:
RH_human_start <- RH_human[RH_human$posS == 0 & RH_human$posE == 1e6, ]

# Get rid of ramp ups and ramp downs: keep only rows whose pos lies exactly
# 1e4 beyond the previous row (the first row of the file gets diff 0 and is dropped).
RH_human <- RH_human[c(0, diff(RH_human$pos)) == 1e4, ]

# Put the chromosome-start rows back in front of the ramp-free rows:
RH_human <- rbind(RH_human_start, RH_human)


# Sort by chromosome (chr1..chr22, chrX, chrY) and position:
chrOrder <- paste("chr", c(1:22, "X", "Y"), sep = "")
RH_human$Chromosome <- factor(RH_human$Chromosome, levels = chrOrder)
RH_human <- RH_human[order(RH_human$Chromosome, RH_human$pos), ]
RH_human$Chromosome <- as.character(RH_human$Chromosome)


# Recode chromosomes as numbers: 1..22, X = 23, Y = 24
RH_human$Chromosome <- gsub('chr', '', RH_human$Chromosome)
RH_human[RH_human$Chromosome == "X", "Chromosome"] <- 23
RH_human[RH_human$Chromosome == "Y", "Chromosome"] <- 24
chrOrder <- c(1:24)
RH_human$Chromosome <- factor(RH_human$Chromosome, levels = chrOrder)
RH_human <- RH_human[order(RH_human$Chromosome, RH_human$pos), ]
# levels are 1:24 in order, so the factor codes equal the chromosome numbers
RH_human$Chromosome <- as.numeric(RH_human$Chromosome)

# Chromosome "size" = largest window position seen on that chromosome
gen_coord <- aggregate(pos ~ Chromosome, FUN = max, data = RH_human)
colnames(gen_coord)[2] <- "chr_size"
gen_coord$Chromosome <- factor(gen_coord$Chromosome, levels = chrOrder)
gen_coord <- gen_coord[order(gen_coord$Chromosome), ]
gen_coord$Chromosome <- as.numeric(gen_coord$Chromosome)

# Genome-wide offset of each chromosome = summed sizes of all chromosomes
# before it. Dropping the last cumulative sum with head(., -1) (instead of the
# former hard-coded [-24]) keeps the lengths matched even when fewer than 24
# chromosomes are present in the data.
gen_coord$coord <- c(0, head(cumsum(gen_coord$chr_size), -1))

# Attach the per-chromosome offset to every row, then restore sort order
RH_human <- merge(RH_human, gen_coord[, c("Chromosome", "coord")])
RH_human$Chromosome <- factor(RH_human$Chromosome, levels = chrOrder)
RH_human <- RH_human[order(RH_human$Chromosome, RH_human$pos), ]
RH_human$Chromosome <- as.numeric(RH_human$Chromosome)

# coord becomes the genome-wide coordinate: position within chromosome + offset
RH_human$coord <- RH_human$pos + RH_human$coord

# Drop chrY (coded 24): there is no chrY sequence in the hamster genome, and
# although chrY reads should contribute to chrX, large segments of chrY are
# non-pseudoautosomal, which would artifactually decrease the genome median.
RH_human <- RH_human[RH_human$Chromosome != 24, ]

#----------------- Prepare hamster retain RH pools ---------------------

# Windowed read counts against the hamster genome for the RH pools.
RH_hamster <- read.table(
  file = "RH_hamster_gseq.txt",
  sep = "\t",
  header = TRUE,
  stringsAsFactors = FALSE
)

# Rows for the first window (0 to 1e6) of each chromosome:
is_first_window <- RH_hamster$posS == 0 & RH_hamster$posE == 1e6
RH_hamster_start <- RH_hamster[is_first_window, ]

# Get rid of ramp ups and ramp downs (though note that hamster has ramp downs,
# not ramp ups): keep only rows whose pos is exactly 1e4 beyond the previous row.
step_from_previous <- c(0, diff(RH_hamster$pos))
RH_hamster <- RH_hamster[step_from_previous == 1e4, ]

# Chromosome-start rows first, then the ramp-free rows:
RH_hamster <- rbind(RH_hamster_start, RH_hamster)


# # get rid of contigs with only one entry:
# RH_hamster <- RH_hamster[!(RH_hamster$Contig_ID %in% aggregate(pos ~ Contig_ID, 
          # data = RH_hamster, 
          # FUN = function(x){NROW(x)})[aggregate(pos ~ Contig_ID, 
          # data = RH_hamster, 
          # FUN = function(x){NROW(x)})$pos==1,"Contig_ID"]),]

# Sort:
# Order rows by chromosome (chr1..chr10, chrX) and then by position. The factor
# round trip turns any chromosome label outside chrOrder into NA (sorted last).
chrOrder <- paste0("chr", c(1:10, "X"))
RH_hamster$Chromosome <- factor(RH_hamster$Chromosome, levels = chrOrder)
hamster_row_order <- order(RH_hamster$Chromosome, RH_hamster$pos)
RH_hamster <- RH_hamster[hamster_row_order, ]
RH_hamster$Chromosome <- as.character(RH_hamster$Chromosome)


# ----------- calculate seq reads retention ------------

# Chose non-overlapping 1 Mb windows because they seemed more conservative. On the other hand, it is much easier to exclude TK1 precisely using overlapping windows (see below). Anyway, decided to go with non-overlapping windows. For seq reads retention, the relative difference in retention between overlapping and non-overlapping windows was negligible, < 1%.

# Per-sample read summaries and sequence-based retention.
#
# Replaces six copy-pasted loops that located their own output slot with
# grep(i, colnames(...)), i.e. by regex-matching the sample ID against the list
# it was iterating over. Results are now built directly, one value per sample,
# named by sample ID.

# Sample columns of RH_human: column 5 up to the column before the last (coord).
retent_sample_ids <- colnames(RH_human)[5:(ncol(RH_human) - 1)]

# Every 100th row of each table (the non-overlapping windows); computed once.
human_windows <- RH_human[seq(1, nrow(RH_human), 1e2), ]
hamster_windows <- RH_hamster[seq(1, nrow(RH_hamster), 1e2), ]

# Hamster window reads for one sample. The hamster column is still located by
# pattern-matching the sample ID against the RH_hamster column names, as before.
# NOTE(review): assumes each ID matches exactly one hamster column - confirm.
hamster_window_reads <- function(id) {
  hamster_windows[, grep(id, colnames(RH_hamster))]
}

median_human_reads <- vapply(
  retent_sample_ids,
  function(id) median(human_windows[, id]),
  numeric(1)
)

median_hamster_reads <- vapply(
  retent_sample_ids,
  function(id) median(hamster_window_reads(id)),
  numeric(1)
)

mean_human_reads <- vapply(
  retent_sample_ids,
  function(id) mean(human_windows[, id]),
  numeric(1)
)

mean_hamster_reads <- vapply(
  retent_sample_ids,
  function(id) mean(hamster_window_reads(id)),
  numeric(1)
)


# Non-overlapping 1 Mb windows. Factor of 2 because human frags haploid, A23 diploid.
median_retent_seq <- vapply(
  retent_sample_ids,
  function(id) 2 * median(human_windows[, id] / median(hamster_window_reads(id))),
  numeric(1)
)

mean_retent_seq <- vapply(
  retent_sample_ids,
  function(id) 2 * mean(human_windows[, id] / mean(hamster_window_reads(id))),
  numeric(1)
)




# # Overlapping 1 Mb windows. Factor of 2 because human frags haploid, A23 diploid.
# median_retent_seq <- numeric()
# for(i in colnames(RH_human[,c(5:(ncol(RH_human)-1))])) {
	# median_retent_seq[grep(i,colnames(RH_human[,c(5:(ncol(RH_human)-1))]))] <- 2*median(RH_human[,i]/(median(RH_hamster[seq(1,nrow(RH_hamster),1e2),c(grep(i,colnames(RH_hamster)))])))
	# names(median_retent_seq)[grep(i,colnames(RH_human[,c(5:(ncol(RH_human)-1))]))] <- i
	# }
	
	
# mean_retent_seq <- numeric()
# for(i in colnames(RH_human[,c(5:(ncol(RH_human)-1))])) {
	# mean_retent_seq[grep(i,colnames(RH_human[,c(5:(ncol(RH_human)-1))]))] <- 2*mean(RH_human[,i]/(mean(RH_hamster[seq(1,nrow(RH_hamster),1e2),c(grep(i,colnames(RH_hamster)))])))
	# names(mean_retent_seq)[grep(i,colnames(RH_human[,c(5:(ncol(RH_human)-1))]))] <- i
	# }


# Convert a named numeric vector into a data frame with two columns: the values
# (column named `value_name`) and RH_ID (the former names); rows numbered 1..n.
named_vector_to_df <- function(x, value_name) {
  out <- data.frame(unname(x), names(x), stringsAsFactors = FALSE)
  colnames(out) <- c(value_name, "RH_ID")
  out
}

# Each per-sample summary becomes a (value, RH_ID) table so they can be merged on RH_ID.
median_human_reads <- named_vector_to_df(median_human_reads, "median_human_reads")
median_hamster_reads <- named_vector_to_df(median_hamster_reads, "median_hamster_reads")
mean_human_reads <- named_vector_to_df(mean_human_reads, "mean_human_reads")
mean_hamster_reads <- named_vector_to_df(mean_hamster_reads, "mean_hamster_reads")
median_retent_seq <- named_vector_to_df(median_retent_seq, "median_retent_seq")
mean_retent_seq <- named_vector_to_df(mean_retent_seq, "mean_retent_seq")



# -------- find middle TK1 -------------

# Gene annotation table; check.names = FALSE keeps column names such as
# "5utrS" exactly as they appear in the file.
gencode_gtf_ensembl_ucsc <- read.delim("gencode_gtf_ensembl_ucsc_v31.txt", header = TRUE, sep = "\t", stringsAsFactors = FALSE, check.names = FALSE)


# Show the TK1 annotation row(s)
gencode_gtf_ensembl_ucsc[gencode_gtf_ensembl_ucsc$geneSymbol == "TK1", ]
     # Chromosome         gene_id           tx_id geneSymbol strand    geneS    geneE geneLength txLength cdsLength    5utrS    5utrE 5utrDiff    3utrS    3utrE 3utrDiff
# 49401      chr17 ENSG00000167900 ENST00000588734        TK1      - 78174091 78187233      13143     1681       804 78186995 78187233      239 78174121 78174758      638
      # exonCount      gene_type                                       gene_description
# 49401         6 protein_coding thymidine kinase 1 [Source:HGNC Symbol;Acc:HGNC:11830]


# Middle of TK1 = mean of gene start and gene end.
# The two coordinates must be passed to mean() as ONE vector: in the former
# mean(geneS, geneE) the second argument was taken as `trim`, so the result
# was simply geneS rather than the midpoint.
TK1_coord <- mean(unlist(
  gencode_gtf_ensembl_ucsc[gencode_gtf_ensembl_ucsc$geneSymbol == "TK1", c("geneS", "geneE")]
))





# ----------- find peak TK1 reads ----------------

# For more accurate search of TK1 peak, restrict search to within delta of TK1, because CEN and TEL becomes larger than TK1 in some samples
# Half-width of the search interval around TK1 (in units of pos)
delta_1 <- 1e6

# Rows on chromosome 17 whose pos lies within delta_1 of the TK1 coordinate
RH_human_TK1_subset <- RH_human[
  RH_human$Chromosome == 17 & abs(RH_human$pos - TK1_coord) <= delta_1,
]

# Per-sample read columns of that subset (column 5 up to the one before coord)
TK1_sample_reads <- RH_human_TK1_subset[, 5:(ncol(RH_human_TK1_subset) - 1)]

# Highest read count near TK1, per sample
TK1_max <- apply(TK1_sample_reads, 2, max)

# Genome coordinate (coord column) of the row where that maximum occurs, per sample
TK1_max_coord <- apply(
  TK1_sample_reads,
  2,
  FUN = function(reads) {
    RH_human_TK1_subset$coord[which.max(reads)]
  }
)






# ------------ calculate retention assuming TK1 is 100% retention ----------------

# Change RH_human by normalizing all 115 expts so that TK1 == 1.0

# Copy of RH_human in which every sample column is divided by that sample's
# TK1 peak height, so the TK1 peak equals 1.0 in each sample.
# names(TK1_max) are the sample column names (columns 5 .. ncol - 1), in order.
RH_human_TK <- RH_human

for (sample_id in names(TK1_max)) {
  RH_human_TK[[sample_id]] <- RH_human_TK[[sample_id]] / TK1_max[[sample_id]]
}

# For TK1 retention, the relative difference in retention between overlapping and non-overlapping windows was negligible, < 1%.



# So as not to overestimate retention, one could exclude seqs +/- 3.5e6 from TK1, since the average expected fragment length is 7 Mb. But that would be inconsistent with the retention derived from sequence reads, so keep the TK1 reads (i.e. delta_2 <- 0).
# Non-overlapping 1 Mb windows


# Half-width of the region around the TK1 peak to exclude; 0 keeps every window.
delta_2 <- 0

# Summarise one sample's TK1-normalised reads over every 100th row of
# RH_human_TK, keeping only rows whose coord is at least delta_2 away from that
# sample's TK1 peak coordinate.
#   id:   sample column name
#   stat: summary function applied to the retained values (median or mean)
TK1_window_stat <- function(id, stat) {
  windows <- RH_human_TK[seq(1, nrow(RH_human_TK), 1e2), ]
  outside_peak <- windows[, "coord"] <= (TK1_max_coord[id] - delta_2) |
    windows[, "coord"] >= (TK1_max_coord[id] + delta_2)
  stat(windows[outside_peak, id])
}

# Named numeric vectors, one value per sample
median_retent_TK1 <- vapply(names(TK1_max), TK1_window_stat, numeric(1), stat = median)

mean_retent_TK1 <- vapply(names(TK1_max), TK1_window_stat, numeric(1), stat = mean)





# Overlapping 1 Mb windows

# median_retent_TK1 <- vector()


# for (i in names(TK1_max)) {
	# median_retent_TK1[i] <- median(RH_human_TK[RH_human_TK[,"coord"] <= (TK1_max_coord[i] - delta_2) | RH_human_TK[,"coord"] >= (TK1_max_coord[i] + delta_2),i])
# }



# mean_retent_TK1 <- vector()


# for (i in names(TK1_max)) {
	# mean_retent_TK1[i] <- mean(RH_human_TK[RH_human_TK[,"coord"] <= (TK1_max_coord[i] - delta_2) | RH_human_TK[,"coord"] >= (TK1_max_coord[i] + delta_2),i])
# }



# -------------------------------------------------------
# -------- Correct TK1 retention for revertants ---------
# -------- Decrease TK1 retention by revertant freq -----
# -------------------------------------------------------


# ~~~~~~ Prepare median_retent_TK1 ~~~~~~~~~~~~~~

# Revertant frequency per clone, averaged within each pool.
# (Read and aggregated once; the same table serves both the median and the
# mean correction.)
clone <- read.table("clone.txt", header = TRUE, sep = "\t", stringsAsFactors = FALSE)

clone$revert_freq <- clone$reverts / clone$colonies

revert <- aggregate(revert_freq ~ Pool, data = clone, FUN = mean)

# Build the revertant-corrected TK1 retention table for one summary vector.
#
#   retent_vec: named numeric vector of TK1 retention, names = sample IDs
#   revert:     data frame with columns Pool and revert_freq
#
# Returns a data frame with columns, in this order:
#   Pool, sample, retent, revert_freq, corr_retent
# where corr_retent = retent * (1 - revert_freq). Samples whose Pool has no
# row in `revert` are dropped by the merge.
correct_TK1_retent <- function(retent_vec, revert) {
  out <- data.frame(sample = names(retent_vec), retent = retent_vec)
  row.names(out) <- seq_len(nrow(out))

  # Pool number from the sample ID prefix (RH1..RH6); 0 if no prefix matches.
  # Sized from the data rather than a hard-coded 115 samples.
  out$Pool <- numeric(nrow(out))
  for (pool_number in 1:6) {
    out[grep(paste0("^RH", pool_number), out$sample), "Pool"] <- pool_number
  }

  # Pool is the only shared column, so it is the merge key
  out <- merge(out, revert)
  out$corr_retent <- out$retent * (1 - out$revert_freq)
  out
}

# ~~~~~~ Prepare median_retent_TK1 ~~~~~~~~~~~~~~
median_retent_TK1 <- correct_TK1_retent(median_retent_TK1, revert)

# ~~~~~~~~~~~ Prepare mean_retent_TK1 ~~~~~~~~~~~~~~~~
mean_retent_TK1 <- correct_TK1_retent(mean_retent_TK1, revert)


# ---------- Combine seq retent and mean and median TK1 retent to make a retention table --------------------------------------------
# ---------- Retents are corrected, as appropriate, for either human and hamster genome sizes or revertant frequency ----------------


# Rename the corrected-retention and sample columns so the TK1 tables can be
# merged with the sequence-based tables on RH_ID.
names(median_retent_TK1)[names(median_retent_TK1) == "corr_retent"] <- "TK1_median_retent"
names(median_retent_TK1)[names(median_retent_TK1) == "sample"] <- "RH_ID"
names(mean_retent_TK1)[names(mean_retent_TK1) == "corr_retent"] <- "TK1_mean_retent"
names(mean_retent_TK1)[names(mean_retent_TK1) == "sample"] <- "RH_ID"

# Pairwise tables: sequence retention, TK1 retention, median reads, mean reads
temp_1 <- merge(median_retent_seq, mean_retent_seq)
temp_2 <- merge(
  median_retent_TK1[, c("RH_ID", "TK1_median_retent")],
  mean_retent_TK1[, c("RH_ID", "TK1_mean_retent")]
)
temp3 <- merge(median_human_reads, median_hamster_reads)
temp4 <- merge(mean_human_reads, mean_hamster_reads)


# retent will be the root for all figure panels: fold the four tables together
# left to right, each step merging on the shared columns
retent <- Reduce(merge, list(temp_1, temp_2, temp3, temp4))

# Fixed column order
retent <- retent[, c(
  "RH_ID",
  "median_human_reads", "median_hamster_reads",
  "mean_human_reads", "mean_hamster_reads",
  "median_retent_seq", "mean_retent_seq",
  "TK1_median_retent", "TK1_mean_retent"
)]

dim(retent)
# [1] 115   9

head(retent)
       # RH_ID median_human_reads median_hamster_reads mean_human_reads mean_hamster_reads median_retent_seq mean_retent_seq TK1_median_retent TK1_mean_retent
# 1  RH1_w0_d0                 76                15130         95.79096           15209.42       0.010046266     0.012596264        0.05817028      0.07331825
# 2  RH1_w1_d0                 83                17375        119.44433           17297.70       0.009553957     0.013810431        0.04530578      0.06519902
# 3 RH1_w1_d25                 73                20193        103.19807           20067.49       0.007230228     0.010285101        0.04438762      0.06274954
# 4 RH1_w1_d75                 58                24339         79.19608           23922.75       0.004766013     0.006620983        0.05110846      0.06978603
# 5  RH1_w1_d8                 97                22613        136.34164           22494.54       0.008579136     0.012122196        0.04834808      0.06795729
# 6  RH1_w2_d0                 65                13752         99.07478           13829.42       0.009453170     0.014328120        0.02748013      0.04188597


# Decode pool, week and drug concentration from the sample ID
# (IDs look like RH1_w0_d0). Rows matching no pattern keep NA.

# pool: ID starts with RH<pool>
for (pool_value in c(1, 2, 3, 4, 5, 6)) {
  retent[grep(paste0("^RH", pool_value), retent$RH_ID), "pool"] <- pool_value
}

# week: ID contains _w<week>_
for (week_value in c(0, 1, 2, 3, 4, 6)) {
  retent[grep(paste0("_w", week_value, "_"), retent$RH_ID), "week"] <- week_value
}

# conc: ID contains _d<conc>
for (conc_value in c(0, 8, 25, 75)) {
  retent[grep(paste0("_d", conc_value), retent$RH_ID), "conc"] <- conc_value
}

# Identifier and design columns first, then the measurements
retent <- retent[, c(
  "RH_ID", "pool", "week", "conc",
  "median_human_reads", "median_hamster_reads",
  "mean_human_reads", "mean_hamster_reads",
  "median_retent_seq", "mean_retent_seq",
  "TK1_median_retent", "TK1_mean_retent"
)]

# Sort by week, then concentration, then pool
retent <- retent[order(retent$week, retent$conc, retent$pool), ]

dim(retent)
# [1] 115   12

head(retent)
       # RH_ID pool week conc median_human_reads median_hamster_reads mean_human_reads mean_hamster_reads median_retent_seq mean_retent_seq TK1_median_retent
# 1  RH1_w0_d0    1    0    0                 76                15130         95.79096           15209.42        0.01004627      0.01259626        0.05817028
# 21 RH2_w0_d0    2    0    0                194                12372        251.74543           12344.90        0.03136114      0.04078533        0.05881380
# 40 RH3_w0_d0    3    0    0                246                18255        341.26520           18410.09        0.02695152      0.03707372        0.05445742
# 60 RH4_w0_d0    4    0    0                259                11237        337.15985           11269.49        0.04609771      0.05983588        0.05852638
# 80 RH5_w0_d0    5    0    0                155                15082        205.01529           15367.61        0.02055430      0.02668147        0.04964056
# 98 RH6_w0_d0    6    0    0                154                14877        202.76471           14928.61        0.02070310      0.02716458        0.03787156
   # TK1_mean_retent
# 1       0.07331825
# 21      0.07632012
# 40      0.07554644
# 60      0.07618821
# 80      0.06565854
# 98      0.04986374

# Cell label for each pool / conc / week combination (tab-delimited, with header).
cell <- read.table(
  file = "cell_label_info.txt",
  sep = "\t",
  header = TRUE,
  stringsAsFactors = FALSE
)

head(cell)
  # pool conc week cell
# 1    1    0    0    1
# 2    1    0    1    2
# 3    1    0    2    2
# 4    1    0    3    2
# 5    1    0    4    2
# 6    1    0    6    2

dim(cell)
# [1] 115   4

# Add the cell label; merge keys are the columns shared by both tables.
retent <- merge(retent, cell)

# Sort by cell, with ties ordered by week, conc, pool. A single multi-key
# order() gives the same row order as sorting by (week, conc, pool) and then
# stably re-sorting by cell.
retent <- retent[order(retent$cell, retent$week, retent$conc, retent$pool), ]

# Put cell next to the other design columns
retent <- retent[, c(
  "RH_ID", "pool", "week", "conc", "cell",
  "median_human_reads", "median_hamster_reads",
  "mean_human_reads", "mean_hamster_reads",
  "median_retent_seq", "mean_retent_seq",
  "TK1_median_retent", "TK1_mean_retent"
)]


dim(retent)
# [1] 115   13

head(retent)
       # RH_ID pool week conc cell median_human_reads median_hamster_reads mean_human_reads mean_hamster_reads median_retent_seq mean_retent_seq TK1_median_retent
# 1  RH1_w0_d0    1    0    0    1                 76                15130         95.79096           15209.42       0.010046266      0.01259626        0.05817028
# 2  RH1_w1_d0    1    1    0    2                 83                17375        119.44433           17297.70       0.009553957      0.01381043        0.04530578
# 6  RH1_w2_d0    1    2    0    2                 65                13752         99.07478           13829.42       0.009453170      0.01432812        0.02748013
# 9  RH1_w3_d0    1    3    0    2                 57                12570         92.10867           12637.62       0.009069212      0.01457690        0.02253721
# 13 RH1_w4_d0    1    4    0    2                 58                13010        104.06846           13110.79       0.008916218      0.01587524        0.01867507
# 17 RH1_w6_d0    1    6    0    2                 83                18203        168.86474           18193.74       0.009119376      0.01856295        0.01991745
   # TK1_mean_retent
# 1       0.07331825
# 2       0.06519902
# 6       0.04188597
# 9       0.03641882
# 13      0.03350838
# 17      0.04052234


# Attach each sample's pool revertant frequency (joined on RH_ID, the only
# shared column), then restore the cell / week / conc / pool row order.
revert_by_sample <- median_retent_TK1[, c("RH_ID", "revert_freq")]
retent <- merge(retent, revert_by_sample)
retent <- retent[order(retent$cell, retent$week, retent$conc, retent$pool), ]

dim(retent)
# [1] 115   14

head(retent)
       # RH_ID pool week conc cell median_human_reads median_hamster_reads mean_human_reads mean_hamster_reads median_retent_seq mean_retent_seq TK1_median_retent
# 1  RH1_w0_d0    1    0    0    1                 76                15130         95.79096           15209.42       0.010046266      0.01259626        0.05817028
# 2  RH1_w1_d0    1    1    0    2                 83                17375        119.44433           17297.70       0.009553957      0.01381043        0.04530578
# 6  RH1_w2_d0    1    2    0    2                 65                13752         99.07478           13829.42       0.009453170      0.01432812        0.02748013
# 9  RH1_w3_d0    1    3    0    2                 57                12570         92.10867           12637.62       0.009069212      0.01457690        0.02253721
# 13 RH1_w4_d0    1    4    0    2                 58                13010        104.06846           13110.79       0.008916218      0.01587524        0.01867507
# 17 RH1_w6_d0    1    6    0    2                 83                18203        168.86474           18193.74       0.009119376      0.01856295        0.01991745
   # TK1_mean_retent revert_freq
# 1       0.07331825   0.2368978
# 2       0.06519902   0.2368978
# 6       0.04188597   0.2368978
# 9       0.03641882   0.2368978
# 13      0.03350838   0.2368978
# 17      0.04052234   0.2368978

# Average the sequence-based and TK1-based retention estimates per sample:
# once for the median-based pair, once for the mean-based pair.
retent[["mean_of_medians"]] <- rowMeans(retent[c("median_retent_seq", "TK1_median_retent")])
retent[["mean_of_means"]] <- rowMeans(retent[c("mean_retent_seq", "TK1_mean_retent")])

dim(retent)
# [1] 115  16


head(retent)
       # RH_ID pool week conc cell median_human_reads median_hamster_reads mean_human_reads mean_hamster_reads median_retent_seq mean_retent_seq TK1_median_retent
# 1  RH1_w0_d0    1    0    0    1                 76                15130         95.79096           15209.42       0.010046266      0.01259626        0.05817028
# 2  RH1_w1_d0    1    1    0    2                 83                17375        119.44433           17297.70       0.009553957      0.01381043        0.04530578
# 6  RH1_w2_d0    1    2    0    2                 65                13752         99.07478           13829.42       0.009453170      0.01432812        0.02748013
# 9  RH1_w3_d0    1    3    0    2                 57                12570         92.10867           12637.62       0.009069212      0.01457690        0.02253721
# 13 RH1_w4_d0    1    4    0    2                 58                13010        104.06846           13110.79       0.008916218      0.01587524        0.01867507
# 17 RH1_w6_d0    1    6    0    2                 83                18203        168.86474           18193.74       0.009119376      0.01856295        0.01991745
   # TK1_mean_retent revert_freq mean_of_medians mean_of_means
# 1       0.07331825   0.2368978      0.03410827    0.04295725
# 2       0.06519902   0.2368978      0.02742987    0.03950473
# 6       0.04188597   0.2368978      0.01846665    0.02810705
# 9       0.03641882   0.2368978      0.01580321    0.02549786
# 13      0.03350838   0.2368978      0.01379565    0.02469181
# 17      0.04052234   0.2368978      0.01451841    0.02954265


# Turn the named TK1 peak-height vector into a (TK1_max, RH_ID) table with
# rows numbered 1..n, so it can be merged on RH_ID.
TK1_max <- data.frame(
  TK1_max = unname(TK1_max),
  RH_ID = names(TK1_max),
  stringsAsFactors = FALSE
)



retent <- merge(retent, TK1_max)

# Row order: by cell, ties by week, conc, pool
retent <- retent[order(retent$cell, retent$week, retent$conc, retent$pool), ]

dim(retent)
# [1] 115   17

head(retent)
       # RH_ID pool week conc cell median_human_reads median_hamster_reads mean_human_reads mean_hamster_reads median_retent_seq mean_retent_seq TK1_median_retent
# 1  RH1_w0_d0    1    0    0    1                 76                15130         95.79096           15209.42       0.010046266      0.01259626        0.05817028
# 2  RH1_w1_d0    1    1    0    2                 83                17375        119.44433           17297.70       0.009553957      0.01381043        0.04530578
# 6  RH1_w2_d0    1    2    0    2                 65                13752         99.07478           13829.42       0.009453170      0.01432812        0.02748013
# 9  RH1_w3_d0    1    3    0    2                 57                12570         92.10867           12637.62       0.009069212      0.01457690        0.02253721
# 13 RH1_w4_d0    1    4    0    2                 58                13010        104.06846           13110.79       0.008916218      0.01587524        0.01867507
# 17 RH1_w6_d0    1    6    0    2                 83                18203        168.86474           18193.74       0.009119376      0.01856295        0.01991745
   # TK1_mean_retent revert_freq mean_of_medians mean_of_means TK1_max
# 1       0.07331825   0.2368978      0.03410827    0.04295725     997
# 2       0.06519902   0.2368978      0.02742987    0.03950473    1398
# 6       0.04188597   0.2368978      0.01846665    0.02810705    1805
# 9       0.03641882   0.2368978      0.01580321    0.02549786    1930
# 13      0.03350838   0.2368978      0.01379565    0.02469181    2370
# 17      0.04052234   0.2368978      0.01451841    0.02954265    3180



# ----- if desired, can plot mean_of_medians, mean_of_means stratified by week, conc or pool ---------------




# -------------------------------------------------------------------------------
# ----------- Plots of median sequence retent and TK1 retent -----------------------
# ----------- For plots of mean sequence retent and TK1 retent see below ---------
# ------------- Decided to use median for paper -----------------------------------
# ----------- Plot median sequence retention vs growth time ---------------------
# ---------------------------------------------------------------------------------



# Group summary (summarySE from Rmisc) of median sequence retention per week x conc.
summary_retent_1 <- summarySE(
  retent,
  measurevar = "median_retent_seq",
  groupvars = c("week", "conc")
)

# One jittered x position per week x conc group, shared by the summary layer
# and the raw-point layer so the two overlaid plots line up.
summary_retent_1$jitter <- jitter(summary_retent_1$week, 1)
retent_1 <- merge(retent, summary_retent_1[c("week", "conc", "jitter")])

# Prepend three extra copies of the first summary row (week 0, 0 nM) and label
# the four copies with conc 0, 8, 25, 75, so that each concentration gets its
# own line linking week 0 to week 1 instead of only one.
summary_retent_1 <- do.call(
  rbind,
  c(rep(list(summary_retent_1[1, ]), 3), list(summary_retent_1))
)
summary_retent_1[1:4, "conc"] <- c(0, 8, 25, 75)

# Express retention frequencies (and their spread statistics) as percentages
for (pct_col in c("median_retent_seq", "mean_retent_seq", "TK1_median_retent", "TK1_mean_retent")) {
  retent_1[[pct_col]] <- 100 * retent_1[[pct_col]]
}
for (pct_col in c("median_retent_seq", "sd", "se", "ci")) {
  summary_retent_1[[pct_col]] <- 100 * summary_retent_1[[pct_col]]
}

# One default ggplot2 colour per concentration
n <- length(unique(retent_1$conc))
colores_1 <- gg_color_hue(n)



p1 <- ggplot() + 
		theme2 + 
		theme(legend.key=element_blank()) +
		geom_line(
			data=summary_retent_1, 
			lwd=0.2,
			aes(
				x=summary_retent_1$jitter, 
				y= median_retent_seq, 
				colour=as.factor(summary_retent_1$conc)),
				show.legend=FALSE
				) + 
			# remove 3 duplicated entries for 0w_d0, so color of error bar corresponds to 0 nM paclitaxel
		geom_linerange(
			data=summary_retent_1[c(1,5:nrow(summary_retent_1)),],
			aes(
				x=jitter, 
				ymin=median_retent_seq-se, 
				ymax=median_retent_seq+se,
				colour=as.factor(conc)
				),
			lwd=0.2,
			show.legend=FALSE
			) +
		geom_point(
			shape=1,
			stroke=0.4,
			data=retent_1, 
			aes(
				x=retent_1$jitter, 
				y=median_retent_seq, 
				colour=as.factor(retent_1$conc)),
				size=1.0
				) +
		scale_color_manual(
			values=colores_1,
			name ="Conc (nM)", 
			labels=c(0,8,25,75)
			) +
	 		guides(
	 			colour = guide_legend(override.aes = list(fill=NA,shape=1,size=1),ncol=2,byrow=TRUE)
	 			) +
	 		theme(
	 			legend.position = c(0.73,1.00), 
	 			legend.title.align=0.7) +
		scale_x_continuous(breaks = c(0,1,2,3,4,6), labels = c(0,1,2,3,4,6)) +
		# ggtitle("") + 
		xlab("Weeks") + 
		ylab("Retention via \nalignments (%)") + 
		labs(subtitle="Growth")
print(p1)
		
		
		
		
# ----------- Plot TK1 median retention vs growth time ---------------------



# Panel: median TK1 retention (%) against weeks of growth, one line per
# paclitaxel concentration. Individual samples are open circles; lines join
# the group means and vertical bars show +/- s.e.m.

# Group N, mean, sd, se and ci for every week x conc combination.
summary_retent_2 <- summarySE(
  retent,
  measurevar = "TK1_median_retent",
  groupvars = c("week", "conc")
)

# One jittered x position per group, shared by the summary layers and the raw
# points so that the overlaid layers line up.
summary_retent_2$jitter <- jitter(summary_retent_2$week, 1)

# Attach each group's jittered x to the individual samples (merge joins on the
# shared columns week and conc).
retent_2 <- merge(retent, summary_retent_2[c("week", "conc", "jitter")])

# Put the first summary row (assumed to be week 0, 0 nM -- TODO confirm the
# summarySE row order) in rows 1-4, once per concentration, so that every
# concentration's line starts at the week 0 point instead of only the 0 nM line.
summary_retent_2 <- rbind(
  summary_retent_2[1, ],
  summary_retent_2[1, ],
  summary_retent_2[1, ],
  summary_retent_2
)
summary_retent_2[1:4, "conc"] <- c(0, 8, 25, 75)

# Turn retention frequencies into percentages.
retent_2[, c("median_retent_seq", "mean_retent_seq", "TK1_median_retent", "TK1_mean_retent")] <-
  100 * retent_2[, c("median_retent_seq", "mean_retent_seq", "TK1_median_retent", "TK1_mean_retent")]
summary_retent_2[, c("TK1_median_retent", "sd", "se", "ci")] <-
  100 * summary_retent_2[, c("TK1_median_retent", "sd", "se", "ci")]

# One default ggplot2 hue per concentration.
n <- length(unique(retent_2$conc))
colores_2 <- gg_color_hue(n)

# Aesthetics name columns of each layer's `data` directly rather than
# `data_frame$column`: the stored plot then no longer depends on the global
# data frames (which are rebuilt later in this script) when it is drawn.
p2 <- ggplot() +
  theme2 +
  theme(legend.key = element_blank()) +
  geom_line(
    data = summary_retent_2,
    lwd = 0.2,
    aes(x = jitter, y = TK1_median_retent, colour = as.factor(conc)),
    show.legend = FALSE
  ) +
  # Rows 2-4 are the duplicated week 0 entries; dropping them leaves a single
  # error bar for that point, coloured as 0 nM paclitaxel.
  geom_linerange(
    data = summary_retent_2[c(1, 5:nrow(summary_retent_2)), ],
    aes(
      x = jitter,
      ymin = TK1_median_retent - se,
      ymax = TK1_median_retent + se,
      colour = as.factor(conc)
    ),
    lwd = 0.2,
    show.legend = FALSE
  ) +
  geom_point(
    shape = 1,
    stroke = 0.4,
    data = retent_2,
    aes(x = jitter, y = TK1_median_retent, colour = as.factor(conc)),
    size = 1.0
  ) +
  scale_color_manual(
    values = colores_2,
    name = "Conc (nM)",
    labels = c(0, 8, 25, 75)
  ) +
  guides(
    colour = guide_legend(override.aes = list(fill = NA, shape = 1, size = 1), ncol = 2, byrow = TRUE)
  ) +
  theme(
    legend.position = c(0.73, 1.00),
    legend.title.align = 0.7
  ) +
  scale_x_continuous(breaks = c(0, 1, 2, 3, 4, 6), labels = c(0, 1, 2, 3, 4, 6)) +
  xlab("Weeks") +
  ylab(expression(atop("Retention via ", italic('TK1')~"(%)"))) +
  labs(subtitle = "Growth")
print(p2)



# ----------- Plot sequence retention vs conc ---------------------



# Panel: median sequence retention (%) against paclitaxel concentration, one
# line per growth week. Individual samples are open circles; lines join the
# group means and vertical bars show +/- s.e.m.

# Group N, mean, sd, se and ci for every conc x week combination.
summary_retent_3 <- summarySE(
  retent,
  measurevar = "median_retent_seq",
  groupvars = c("conc", "week")
)

# One jittered x position per group, shared by the summary layers and the raw
# points so that the overlaid layers line up.
summary_retent_3$jitter <- jitter(summary_retent_3$conc, 1)

# Attach each group's jittered x to the individual samples (merge joins on the
# shared columns week and conc).
retent_3 <- merge(retent, summary_retent_3[c("week", "conc", "jitter")])

# Rows 1-4 hold the first summary row (assumed to be 0 nM, week 0 -- TODO
# confirm the summarySE row order) with conc overwritten by 0, 8, 25, 75.
# NOTE(review): this step is carried over from the growth-time panels. Here x
# comes from `jitter` and colour from `week`, neither of which differs between
# the copies, so the copies coincide on the plot; it is kept so the row layout
# matches the error-bar subset used below.
summary_retent_3 <- rbind(
  summary_retent_3[1, ],
  summary_retent_3[1, ],
  summary_retent_3[1, ],
  summary_retent_3
)
summary_retent_3[1:4, "conc"] <- c(0, 8, 25, 75)

# Turn retention frequencies into percentages.
retent_3[, c("median_retent_seq", "mean_retent_seq", "TK1_median_retent", "TK1_mean_retent")] <-
  100 * retent_3[, c("median_retent_seq", "mean_retent_seq", "TK1_median_retent", "TK1_mean_retent")]
summary_retent_3[, c("median_retent_seq", "sd", "se", "ci")] <-
  100 * summary_retent_3[, c("median_retent_seq", "sd", "se", "ci")]

# One default ggplot2 hue per growth week.
n <- length(unique(retent_3$week))
colores_3 <- gg_color_hue(n)

# Aesthetics name columns of each layer's `data` directly rather than
# `data_frame$column`: the stored plot then no longer depends on the global
# data frames (which are rebuilt later in this script) when it is drawn.
p3 <- ggplot() +
  theme2 +
  theme(legend.key = element_blank()) +
  geom_line(
    data = summary_retent_3,
    lwd = 0.2,
    aes(x = jitter, y = median_retent_seq, colour = as.factor(week)),
    show.legend = FALSE
  ) +
  # Rows 2-4 are the duplicated first-row entries; dropping them leaves a
  # single error bar for that point.
  geom_linerange(
    data = summary_retent_3[c(1, 5:nrow(summary_retent_3)), ],
    aes(
      x = jitter,
      ymin = median_retent_seq - se,
      ymax = median_retent_seq + se,
      colour = as.factor(week)
    ),
    lwd = 0.2,
    show.legend = FALSE
  ) +
  geom_point(
    shape = 1,
    stroke = 0.4,
    data = retent_3,
    aes(x = jitter, y = median_retent_seq, colour = as.factor(week)),
    size = 1.0
  ) +
  scale_color_manual(
    values = colores_3,
    name = "Weeks",
    labels = c(0, 1, 2, 3, 4, 6)
  ) +
  guides(
    colour = guide_legend(override.aes = list(fill = NA, shape = 1, size = 1), ncol = 3, byrow = TRUE)
  ) +
  theme(
    legend.position = c(0.73, 1.00),
    legend.title.align = 0.7
  ) +
  scale_x_continuous(breaks = c(0, 8, 25, 75), labels = c(0, 8, 25, 75)) +
  xlab("Conc (nM)") +
  ylab("Retention via \nalignments (%)") +
  labs(subtitle = "Paclitaxel")
print(p3)
		

# ----------- Plot TK1 retention vs conc ---------------------



# Panel: median TK1 retention (%) against paclitaxel concentration, one line
# per growth week. Individual samples are open circles; lines join the group
# means and vertical bars show +/- s.e.m.

# Group N, mean, sd, se and ci for every conc x week combination.
summary_retent_4 <- summarySE(
  retent,
  measurevar = "TK1_median_retent",
  groupvars = c("conc", "week")
)

# One jittered x position per group, shared by the summary layers and the raw
# points so that the overlaid layers line up.
summary_retent_4$jitter <- jitter(summary_retent_4$conc, 1)

# Attach each group's jittered x to the individual samples (merge joins on the
# shared columns week and conc).
retent_4 <- merge(retent, summary_retent_4[c("week", "conc", "jitter")])

# Rows 1-4 hold the first summary row (assumed to be 0 nM, week 0 -- TODO
# confirm the summarySE row order) with conc overwritten by 0, 8, 25, 75.
# NOTE(review): this step is carried over from the growth-time panels. Here x
# comes from `jitter` and colour from `week`, neither of which differs between
# the copies, so the copies coincide on the plot; it is kept so the row layout
# matches the error-bar subset used below.
summary_retent_4 <- rbind(
  summary_retent_4[1, ],
  summary_retent_4[1, ],
  summary_retent_4[1, ],
  summary_retent_4
)
summary_retent_4[1:4, "conc"] <- c(0, 8, 25, 75)

# Turn retention frequencies into percentages.
retent_4[, c("median_retent_seq", "mean_retent_seq", "TK1_median_retent", "TK1_mean_retent")] <-
  100 * retent_4[, c("median_retent_seq", "mean_retent_seq", "TK1_median_retent", "TK1_mean_retent")]
summary_retent_4[, c("TK1_median_retent", "sd", "se", "ci")] <-
  100 * summary_retent_4[, c("TK1_median_retent", "sd", "se", "ci")]

# One default ggplot2 hue per growth week.
n <- length(unique(retent_4$week))
colores_4 <- gg_color_hue(n)

# Aesthetics name columns of each layer's `data` directly rather than
# `data_frame$column`: the stored plot then no longer depends on the global
# data frames (which are rebuilt later in this script) when it is drawn.
p4 <- ggplot() +
  theme2 +
  theme(legend.key = element_blank()) +
  geom_line(
    data = summary_retent_4,
    lwd = 0.2,
    aes(x = jitter, y = TK1_median_retent, colour = as.factor(week)),
    show.legend = FALSE
  ) +
  # Rows 2-4 are the duplicated first-row entries; dropping them leaves a
  # single error bar for that point.
  geom_linerange(
    data = summary_retent_4[c(1, 5:nrow(summary_retent_4)), ],
    aes(
      x = jitter,
      ymin = TK1_median_retent - se,
      ymax = TK1_median_retent + se,
      colour = as.factor(week)
    ),
    lwd = 0.2,
    show.legend = FALSE
  ) +
  geom_point(
    shape = 1,
    stroke = 0.4,
    data = retent_4,
    aes(x = jitter, y = TK1_median_retent, colour = as.factor(week)),
    size = 1.0
  ) +
  scale_color_manual(
    values = colores_4,
    name = "Weeks",
    labels = c(0, 1, 2, 3, 4, 6)
  ) +
  guides(
    colour = guide_legend(override.aes = list(fill = NA, shape = 1, size = 1), ncol = 3, byrow = TRUE)
  ) +
  theme(
    legend.position = c(0.73, 1.00),
    legend.title.align = 0.7
  ) +
  scale_x_continuous(breaks = c(0, 8, 25, 75), labels = c(0, 8, 25, 75)) +
  xlab("Conc (nM)") +
  ylab(expression(atop("Retention via ", italic('TK1')~"(%)"))) +
  labs(subtitle = "Paclitaxel")
print(p4)


#------------------Make file --------------------------


# Write the four median-retention panels to a 2 x 2 PDF figure.
# The grid is printed explicitly: top-level values are only auto-printed in
# interactive / Rscript runs, so without print() the PDF is empty when this
# file is run through source().
pdf("median_retent_1.pdf", width = 7.5, height = 6.67)
print(plot_grid(p1, p2, p3, p4, labels = c("A", "B", "C", "D"), ncol = 2, nrow = 2, label_size = 16))
dev.off()

#-------------------------------------------------------


# -------------------------------------------------------------------------------
# ----------- Plots of mean sequence retent and TK1 retent -----------------------
# ----------- For plots of median sequence retent and TK1 retent see above ---------
# ------------- Decided to use median for paper -----------------------------------
# ----------- Plot mean sequence retention vs growth time ---------------------
# ---------------------------------------------------------------------------------



# Panel: mean sequence retention (%) against weeks of growth, one line per
# paclitaxel concentration. Individual samples are open circles; lines join
# the group means and vertical bars show +/- s.e.m.
# Reuses (overwrites) the summary_retent_1 / retent_1 / p1 names of the median
# figure.

# Group N, mean, sd, se and ci for every week x conc combination.
summary_retent_1 <- summarySE(
  retent,
  measurevar = "mean_retent_seq",
  groupvars = c("week", "conc")
)

# One jittered x position per group, shared by the summary layers and the raw
# points so that the overlaid layers line up.
summary_retent_1$jitter <- jitter(summary_retent_1$week, 1)

# Attach each group's jittered x to the individual samples (merge joins on the
# shared columns week and conc).
retent_1 <- merge(retent, summary_retent_1[c("week", "conc", "jitter")])

# Put the first summary row (assumed to be week 0, 0 nM -- TODO confirm the
# summarySE row order) in rows 1-4, once per concentration, so that every
# concentration's line starts at the week 0 point instead of only the 0 nM line.
summary_retent_1 <- rbind(
  summary_retent_1[1, ],
  summary_retent_1[1, ],
  summary_retent_1[1, ],
  summary_retent_1
)
summary_retent_1[1:4, "conc"] <- c(0, 8, 25, 75)

# Turn retention frequencies into percentages.
retent_1[, c("median_retent_seq", "mean_retent_seq", "TK1_median_retent", "TK1_mean_retent")] <-
  100 * retent_1[, c("median_retent_seq", "mean_retent_seq", "TK1_median_retent", "TK1_mean_retent")]
summary_retent_1[, c("mean_retent_seq", "sd", "se", "ci")] <-
  100 * summary_retent_1[, c("mean_retent_seq", "sd", "se", "ci")]

# One default ggplot2 hue per concentration.
n <- length(unique(retent_1$conc))
colores_1 <- gg_color_hue(n)

# Aesthetics name columns of each layer's `data` directly rather than
# `data_frame$column`, so the stored plot does not depend on the global data
# frames still holding the same contents when it is drawn.
p1 <- ggplot() +
  theme2 +
  theme(legend.key = element_blank()) +
  geom_line(
    data = summary_retent_1,
    lwd = 0.2,
    aes(x = jitter, y = mean_retent_seq, colour = as.factor(conc)),
    show.legend = FALSE
  ) +
  # Rows 2-4 are the duplicated week 0 entries; dropping them leaves a single
  # error bar for that point, coloured as 0 nM paclitaxel.
  geom_linerange(
    data = summary_retent_1[c(1, 5:nrow(summary_retent_1)), ],
    aes(
      x = jitter,
      ymin = mean_retent_seq - se,
      ymax = mean_retent_seq + se,
      colour = as.factor(conc)
    ),
    lwd = 0.2,
    show.legend = FALSE
  ) +
  geom_point(
    shape = 1,
    stroke = 0.4,
    data = retent_1,
    aes(x = jitter, y = mean_retent_seq, colour = as.factor(conc)),
    size = 1.0
  ) +
  scale_color_manual(
    values = colores_1,
    name = "Conc (nM)",
    labels = c(0, 8, 25, 75)
  ) +
  guides(
    colour = guide_legend(override.aes = list(fill = NA, shape = 1, size = 1), ncol = 2, byrow = TRUE)
  ) +
  theme(
    legend.position = c(0.73, 1.00),
    legend.title.align = 0.7
  ) +
  scale_x_continuous(breaks = c(0, 1, 2, 3, 4, 6), labels = c(0, 1, 2, 3, 4, 6)) +
  xlab("Weeks") +
  ylab("Retention via \nalignments (%)") +
  labs(subtitle = "Growth")
print(p1)
		
		
		
		
# ----------- Plot TK1 mean retention vs growth time ---------------------



# Panel: mean TK1 retention (%) against weeks of growth, one line per
# paclitaxel concentration. Individual samples are open circles; lines join
# the group means and vertical bars show +/- s.e.m.
# Reuses (overwrites) the summary_retent_2 / retent_2 / p2 names of the median
# figure.

# Group N, mean, sd, se and ci for every week x conc combination.
summary_retent_2 <- summarySE(
  retent,
  measurevar = "TK1_mean_retent",
  groupvars = c("week", "conc")
)

# One jittered x position per group, shared by the summary layers and the raw
# points so that the overlaid layers line up.
summary_retent_2$jitter <- jitter(summary_retent_2$week, 1)

# Attach each group's jittered x to the individual samples (merge joins on the
# shared columns week and conc).
retent_2 <- merge(retent, summary_retent_2[c("week", "conc", "jitter")])

# Put the first summary row (assumed to be week 0, 0 nM -- TODO confirm the
# summarySE row order) in rows 1-4, once per concentration, so that every
# concentration's line starts at the week 0 point instead of only the 0 nM line.
summary_retent_2 <- rbind(
  summary_retent_2[1, ],
  summary_retent_2[1, ],
  summary_retent_2[1, ],
  summary_retent_2
)
summary_retent_2[1:4, "conc"] <- c(0, 8, 25, 75)

# Turn retention frequencies into percentages.
retent_2[, c("median_retent_seq", "mean_retent_seq", "TK1_median_retent", "TK1_mean_retent")] <-
  100 * retent_2[, c("median_retent_seq", "mean_retent_seq", "TK1_median_retent", "TK1_mean_retent")]
summary_retent_2[, c("TK1_mean_retent", "sd", "se", "ci")] <-
  100 * summary_retent_2[, c("TK1_mean_retent", "sd", "se", "ci")]

# One default ggplot2 hue per concentration.
n <- length(unique(retent_2$conc))
colores_2 <- gg_color_hue(n)

# Aesthetics name columns of each layer's `data` directly rather than
# `data_frame$column`, so the stored plot does not depend on the global data
# frames still holding the same contents when it is drawn.
p2 <- ggplot() +
  theme2 +
  theme(legend.key = element_blank()) +
  geom_line(
    data = summary_retent_2,
    lwd = 0.2,
    aes(x = jitter, y = TK1_mean_retent, colour = as.factor(conc)),
    show.legend = FALSE
  ) +
  # Rows 2-4 are the duplicated week 0 entries; dropping them leaves a single
  # error bar for that point, coloured as 0 nM paclitaxel.
  geom_linerange(
    data = summary_retent_2[c(1, 5:nrow(summary_retent_2)), ],
    aes(
      x = jitter,
      ymin = TK1_mean_retent - se,
      ymax = TK1_mean_retent + se,
      colour = as.factor(conc)
    ),
    lwd = 0.2,
    show.legend = FALSE
  ) +
  geom_point(
    shape = 1,
    stroke = 0.4,
    data = retent_2,
    aes(x = jitter, y = TK1_mean_retent, colour = as.factor(conc)),
    size = 1.0
  ) +
  scale_color_manual(
    values = colores_2,
    name = "Conc (nM)",
    labels = c(0, 8, 25, 75)
  ) +
  guides(
    colour = guide_legend(override.aes = list(fill = NA, shape = 1, size = 1), ncol = 2, byrow = TRUE)
  ) +
  theme(
    legend.position = c(0.73, 1.00),
    legend.title.align = 0.7
  ) +
  scale_x_continuous(breaks = c(0, 1, 2, 3, 4, 6), labels = c(0, 1, 2, 3, 4, 6)) +
  xlab("Weeks") +
  ylab(expression(atop("Retention via ", italic('TK1')~"(%)"))) +
  labs(subtitle = "Growth")
print(p2)



# ----------- Plot sequence retention vs conc ---------------------



# Panel: mean sequence retention (%) against paclitaxel concentration, one
# line per growth week. Individual samples are open circles; lines join the
# group means and vertical bars show +/- s.e.m.
# Reuses (overwrites) the summary_retent_3 / retent_3 / p3 names of the median
# figure.

# Group N, mean, sd, se and ci for every conc x week combination.
summary_retent_3 <- summarySE(
  retent,
  measurevar = "mean_retent_seq",
  groupvars = c("conc", "week")
)

# One jittered x position per group, shared by the summary layers and the raw
# points so that the overlaid layers line up.
summary_retent_3$jitter <- jitter(summary_retent_3$conc, 1)

# Attach each group's jittered x to the individual samples (merge joins on the
# shared columns week and conc).
retent_3 <- merge(retent, summary_retent_3[c("week", "conc", "jitter")])

# Rows 1-4 hold the first summary row (assumed to be 0 nM, week 0 -- TODO
# confirm the summarySE row order) with conc overwritten by 0, 8, 25, 75.
# NOTE(review): this step is carried over from the growth-time panels. Here x
# comes from `jitter` and colour from `week`, neither of which differs between
# the copies, so the copies coincide on the plot; it is kept so the row layout
# matches the error-bar subset used below.
summary_retent_3 <- rbind(
  summary_retent_3[1, ],
  summary_retent_3[1, ],
  summary_retent_3[1, ],
  summary_retent_3
)
summary_retent_3[1:4, "conc"] <- c(0, 8, 25, 75)

# Turn retention frequencies into percentages.
retent_3[, c("median_retent_seq", "mean_retent_seq", "TK1_median_retent", "TK1_mean_retent")] <-
  100 * retent_3[, c("median_retent_seq", "mean_retent_seq", "TK1_median_retent", "TK1_mean_retent")]
summary_retent_3[, c("mean_retent_seq", "sd", "se", "ci")] <-
  100 * summary_retent_3[, c("mean_retent_seq", "sd", "se", "ci")]

# One default ggplot2 hue per growth week.
n <- length(unique(retent_3$week))
colores_3 <- gg_color_hue(n)

# Aesthetics name columns of each layer's `data` directly rather than
# `data_frame$column`, so the stored plot does not depend on the global data
# frames still holding the same contents when it is drawn.
p3 <- ggplot() +
  theme2 +
  theme(legend.key = element_blank()) +
  geom_line(
    data = summary_retent_3,
    lwd = 0.2,
    aes(x = jitter, y = mean_retent_seq, colour = as.factor(week)),
    show.legend = FALSE
  ) +
  # Rows 2-4 are the duplicated first-row entries; dropping them leaves a
  # single error bar for that point.
  geom_linerange(
    data = summary_retent_3[c(1, 5:nrow(summary_retent_3)), ],
    aes(
      x = jitter,
      ymin = mean_retent_seq - se,
      ymax = mean_retent_seq + se,
      colour = as.factor(week)
    ),
    lwd = 0.2,
    show.legend = FALSE
  ) +
  geom_point(
    shape = 1,
    stroke = 0.4,
    data = retent_3,
    aes(x = jitter, y = mean_retent_seq, colour = as.factor(week)),
    size = 1.0
  ) +
  scale_color_manual(
    values = colores_3,
    name = "Weeks",
    labels = c(0, 1, 2, 3, 4, 6)
  ) +
  guides(
    colour = guide_legend(override.aes = list(fill = NA, shape = 1, size = 1), ncol = 3, byrow = TRUE)
  ) +
  theme(
    legend.position = c(0.73, 1.00),
    legend.title.align = 0.7
  ) +
  scale_x_continuous(breaks = c(0, 8, 25, 75), labels = c(0, 8, 25, 75)) +
  xlab("Conc (nM)") +
  ylab("Retention via \nalignments (%)") +
  labs(subtitle = "Paclitaxel")
print(p3)
		


# ----------- Plot TK1 retention vs conc ---------------------


# Panel: mean TK1 retention (%) against paclitaxel concentration, one line per
# growth week. Individual samples are open circles; lines join the group means
# and vertical bars show +/- s.e.m.
# Reuses (overwrites) the summary_retent_4 / retent_4 / p4 names of the median
# figure.

# Group N, mean, sd, se and ci for every conc x week combination.
summary_retent_4 <- summarySE(
  retent,
  measurevar = "TK1_mean_retent",
  groupvars = c("conc", "week")
)

# One jittered x position per group, shared by the summary layers and the raw
# points so that the overlaid layers line up.
summary_retent_4$jitter <- jitter(summary_retent_4$conc, 1)

# Attach each group's jittered x to the individual samples (merge joins on the
# shared columns week and conc).
retent_4 <- merge(retent, summary_retent_4[c("week", "conc", "jitter")])

# Rows 1-4 hold the first summary row (assumed to be 0 nM, week 0 -- TODO
# confirm the summarySE row order) with conc overwritten by 0, 8, 25, 75.
# NOTE(review): this step is carried over from the growth-time panels. Here x
# comes from `jitter` and colour from `week`, neither of which differs between
# the copies, so the copies coincide on the plot; it is kept so the row layout
# matches the error-bar subset used below.
summary_retent_4 <- rbind(
  summary_retent_4[1, ],
  summary_retent_4[1, ],
  summary_retent_4[1, ],
  summary_retent_4
)
summary_retent_4[1:4, "conc"] <- c(0, 8, 25, 75)

# Turn retention frequencies into percentages.
retent_4[, c("median_retent_seq", "mean_retent_seq", "TK1_median_retent", "TK1_mean_retent")] <-
  100 * retent_4[, c("median_retent_seq", "mean_retent_seq", "TK1_median_retent", "TK1_mean_retent")]
summary_retent_4[, c("TK1_mean_retent", "sd", "se", "ci")] <-
  100 * summary_retent_4[, c("TK1_mean_retent", "sd", "se", "ci")]

# One default ggplot2 hue per growth week.
n <- length(unique(retent_4$week))
colores_4 <- gg_color_hue(n)

# Aesthetics name columns of each layer's `data` directly rather than
# `data_frame$column`, so the stored plot does not depend on the global data
# frames still holding the same contents when it is drawn.
p4 <- ggplot() +
  theme2 +
  theme(legend.key = element_blank()) +
  geom_line(
    data = summary_retent_4,
    lwd = 0.2,
    aes(x = jitter, y = TK1_mean_retent, colour = as.factor(week)),
    show.legend = FALSE
  ) +
  # Rows 2-4 are the duplicated first-row entries; dropping them leaves a
  # single error bar for that point.
  geom_linerange(
    data = summary_retent_4[c(1, 5:nrow(summary_retent_4)), ],
    aes(
      x = jitter,
      ymin = TK1_mean_retent - se,
      ymax = TK1_mean_retent + se,
      colour = as.factor(week)
    ),
    lwd = 0.2,
    show.legend = FALSE
  ) +
  geom_point(
    shape = 1,
    stroke = 0.4,
    data = retent_4,
    aes(x = jitter, y = TK1_mean_retent, colour = as.factor(week)),
    size = 1.0
  ) +
  scale_color_manual(
    values = colores_4,
    name = "Weeks",
    labels = c(0, 1, 2, 3, 4, 6)
  ) +
  guides(
    colour = guide_legend(override.aes = list(fill = NA, shape = 1, size = 1), ncol = 3, byrow = TRUE)
  ) +
  theme(
    legend.position = c(0.73, 1.00),
    legend.title.align = 0.7
  ) +
  scale_x_continuous(breaks = c(0, 8, 25, 75), labels = c(0, 8, 25, 75)) +
  xlab("Conc (nM)") +
  ylab(expression(atop("Retention via ", italic('TK1')~"(%)"))) +
  labs(subtitle = "Paclitaxel")
print(p4)


#------------------Make file --------------------------


# Write the four mean-retention panels to a 2 x 2 PDF figure.
# The grid is printed explicitly: top-level values are only auto-printed in
# interactive / Rscript runs, so without print() the PDF is empty when this
# file is run through source().
pdf("mean_retent_1.pdf", width = 7.5, height = 6.67)
print(plot_grid(p1, p2, p3, p4, labels = c("A", "B", "C", "D"), ncol = 2, nrow = 2, label_size = 14))
dev.off()

#-------------------------------------------------------



# ------------------------------------------------------------------------------------------
# ----------------- Retention frequencies of RH pools --------------------------------------
# ------------------------------------------------------------------------------------------


# <<<<<<<<<<<<< use in paper >>>>>>>>>>>>>>>>>>>>>>>>>

# Show every column for the week 0, 0 nM samples.
retent[with(retent, week == 0 & conc == 0), ]
       # RH_ID pool week conc cell median_human_reads median_hamster_reads mean_human_reads mean_hamster_reads median_retent_seq mean_retent_seq TK1_median_retent
# 1  RH1_w0_d0    1    0    0    1                 76                15130         95.79096           15209.42        0.01004627      0.01259626        0.05817028
# 21 RH2_w0_d0    2    0    0    6                194                12372        251.74543           12344.90        0.03136114      0.04078533        0.05881380
# 40 RH3_w0_d0    3    0    0   11                246                18255        341.26520           18410.09        0.02695152      0.03707372        0.05445742
# 60 RH4_w0_d0    4    0    0   16                259                11237        337.15985           11269.49        0.04609771      0.05983588        0.05852638
# 80 RH5_w0_d0    5    0    0   21                155                15082        205.01529           15367.61        0.02055430      0.02668147        0.04964056
# 98 RH6_w0_d0    6    0    0   26                154                14877        202.76471           14928.61        0.02070310      0.02716458        0.03787156
   # TK1_mean_retent revert_freq mean_of_medians mean_of_means TK1_max
# 1       0.07331825   0.2368978      0.03410827    0.04295725     997
# 21      0.07632012   0.2008600      0.04508747    0.05855273    2636
# 40      0.07554644   0.2690309      0.04070447    0.05631008    3302
# 60      0.07618821   0.2938419      0.05231205    0.06801204    3125
# 80      0.06565854   0.2845355      0.03509743    0.04617000    2234
# 98      0.04986374   0.3672498      0.02928733    0.03851416    2573


# Mean of each column from the 6th onwards over the week 0, 0 nM samples.
vapply(
  retent[retent$week == 0 & retent$conc == 0, 6:ncol(retent)],
  mean,
  numeric(1)
)
# median_human_reads median_hamster_reads     mean_human_reads   mean_hamster_reads    median_retent_seq      mean_retent_seq    TK1_median_retent 
        # 1.806667e+02         1.449217e+04         2.389569e+02         1.458835e+04         2.595234e-02         3.402287e-02         5.291333e-02 
     # TK1_mean_retent          revert_freq      mean_of_medians        mean_of_means              TK1_max 
        # 6.948255e-02         2.754026e-01         3.943284e-02         5.175271e-02         2.477833e+03   <<<<<<<<<<<< use in paper


# Standard error of the mean of `x`, ignoring NA values: the sample variance
# of the non-missing values divided by their count, square-rooted.
sem <- function(x) {
  n_obs <- sum(!is.na(x))
  sample_var <- var(x, na.rm = TRUE)
  sqrt(sample_var / n_obs)
}

# s.e.m. of each column from the 6th onwards over the week 0, 0 nM samples.
vapply(
  retent[retent$week == 0 & retent$conc == 0, 6:ncol(retent)],
  sem,
  numeric(1)
)
  # median_human_reads median_hamster_reads     mean_human_reads   mean_hamster_reads    median_retent_seq      mean_retent_seq    TK1_median_retent 
        # 2.763291e+01         1.002546e+03         3.796171e+01         1.028953e+03         4.985934e-03         6.535497e-03         3.333204e-03 
     # TK1_mean_retent          revert_freq      mean_of_medians        mean_of_means              TK1_max 
        # 4.253322e-03         2.305005e-02         3.413425e-03         4.528287e-03         3.358805e+02   <<<<<<<<<<<< use in paper
    
    
    
# ^^^^^^^^^^^^^^^ use in paper ^^^^^^^^^^^^^^^^^^^^^^
    
 
      
      
# ------------------------------------------------------------------------------------------
# ----- Compare retention frequencies of median_retent_seq and TK1_median_retent -----------
# ------------------------------------------------------------------------------------------


# Two-sample t-test (Welch, per the recorded output below) comparing
# median_retent_seq with TK1_median_retent over the week 0, 0 nM samples.
# The argument expressions are left exactly as written because t.test echoes
# them in the "data:" line of its printed result.
# NOTE(review): both vectors are taken from the same rows of `retent`, so a
# paired test may be the more appropriate comparison -- confirm.
t.test(retent[retent$week==0 & retent$conc==0,"median_retent_seq"],retent[retent$week==0 & retent$conc==0,"TK1_median_retent"])

	# Welch Two Sample t-test <<<<<<<<<<< use in paper

# data:  retent[retent$week == 0 & retent$conc == 0, "median_retent_seq"] and retent[retent$week == 0 & retent$conc == 0, "TK1_median_retent"]
# t = -4.4954, df = 8.7252, p-value = 0.00162
# alternative hypothesis: true difference in means is not equal to 0
# 95 percent confidence interval:
 # -0.04059367 -0.01332832
# sample estimates:
 # mean of x  mean of y 
# 0.02595234 0.05291333 

  
    
# ------------------------------------------------------------------------------------------
# ----------------- Retention frequencies of all samples --------------------------------------
# ------------------------------------------------------------------------------------------


# Mean of the six retention estimates over every sample (all rows of retent).
# Columns are selected by name instead of by position (formerly
# c(10:13,15:16)) so the summary keeps pointing at the same measures if
# columns of `retent` are added or reordered.
apply(
  retent[, c(
    "median_retent_seq", "mean_retent_seq",
    "TK1_median_retent", "TK1_mean_retent",
    "mean_of_medians", "mean_of_means"
  )],
  2,
  mean
)
# median_retent_seq   mean_retent_seq TK1_median_retent   TK1_mean_retent   mean_of_medians     mean_of_means 
       # 0.01478064        0.03497492        0.02539782        0.05281520        0.02008923        0.04389506 



# s.e.m. of the six retention estimates over every sample (all rows of retent).
# Columns are selected by name instead of by position (formerly
# c(10:13,15:16)) so the summary keeps pointing at the same measures if
# columns of `retent` are added or reordered.
apply(
  retent[, c(
    "median_retent_seq", "mean_retent_seq",
    "TK1_median_retent", "TK1_mean_retent",
    "mean_of_medians", "mean_of_means"
  )],
  2,
  sem
)
# median_retent_seq   mean_retent_seq TK1_median_retent   TK1_mean_retent   mean_of_medians     mean_of_means 
     # 0.0007281475      0.0018448728      0.0012760644      0.0015047539      0.0008819243      0.0013421823   
    



      
# ------------------------------------------------------------------------------------------      
# ----------- Calculate retention parameters, resolution, redundancy etc. for RH pools -----
# ------------------------------------------------------------------------------------------    





# ---------------------- Data for table in paper --------------------------------------------


# Per-pool values for the table: week 0, 0 nM rows, table columns only.
retent[
  with(retent, week == 0 & conc == 0),
  c(
    "pool", "median_human_reads", "median_hamster_reads", "TK1_max",
    "revert_freq", "median_retent_seq", "TK1_median_retent", "mean_of_medians"
  )
]
   # pool median_human_reads median_hamster_reads TK1_max revert_freq median_retent_seq TK1_median_retent mean_of_medians
# 1     1                 76                15130     997   0.2368978        0.01004627        0.05817028      0.03410827
# 21    2                194                12372    2636   0.2008600        0.03136114        0.05881380      0.04508747
# 40    3                246                18255    3302   0.2690309        0.02695152        0.05445742      0.04070447
# 60    4                259                11237    3125   0.2938419        0.04609771        0.05852638      0.05231205
# 80    5                155                15082    2234   0.2845355        0.02055430        0.04964056      0.03509743
# 98    6                154                14877    2573   0.3672498        0.02070310        0.03787156      0.02928733


# Column means of the table measures over the week 0, 0 nM rows.
vapply(
  retent[
    retent$week == 0 & retent$conc == 0,
    c(
      "median_human_reads", "median_hamster_reads", "TK1_max", "revert_freq",
      "median_retent_seq", "TK1_median_retent", "mean_of_medians"
    )
  ],
  mean,
  numeric(1)
)
  # median_human_reads median_hamster_reads              TK1_max          revert_freq    median_retent_seq    TK1_median_retent      mean_of_medians 
        # 1.806667e+02         1.449217e+04         2.477833e+03         2.754026e-01         2.595234e-02         5.291333e-02         3.943284e-02 


# Column s.e.m. of the table measures over the week 0, 0 nM rows.
vapply(
  retent[
    retent$week == 0 & retent$conc == 0,
    c(
      "median_human_reads", "median_hamster_reads", "TK1_max", "revert_freq",
      "median_retent_seq", "TK1_median_retent", "mean_of_medians"
    )
  ],
  sem,
  numeric(1)
)
  # median_human_reads median_hamster_reads              TK1_max          revert_freq    median_retent_seq    TK1_median_retent      mean_of_medians 
        # 2.763291e+01         1.002546e+03         3.358805e+02         2.305005e-02         4.985934e-03         3.333204e-03         3.413425e-03 



# ------------------- retent_pool -----------------------------


# Retention estimates for the week 0, 0 nM rows of retent: pool id, the two
# median-based estimates and mean_of_medians.
retent_pool <- retent[
  with(retent, week == 0 & conc == 0),
  c("pool", "median_retent_seq", "TK1_median_retent", "mean_of_medians")
]

retent_pool
   # pool median_retent_seq TK1_median_retent mean_of_medians
# 1     1        0.01004627        0.05817028      0.03410827
# 21    2        0.03136114        0.05881380      0.04508747
# 40    3        0.02695152        0.05445742      0.04070447
# 60    4        0.04609771        0.05852638      0.05231205
# 80    5        0.02055430        0.04964056      0.03509743
# 98    6        0.02070310        0.03787156      0.02928733



# ---------------- mean, var and sem of mean_of_medians ---------------------------

# Mean retention frequency R across the pools.
R <- mean(retent_pool[["mean_of_medians"]])
R
# [1] 0.03943284 <<<<<<<<<< use in paper


# Between-pool variance of the retention frequency.
varR <- var(retent_pool[["mean_of_medians"]])
varR
# [1] 6.990881e-05



# Standard error of R.
sem(retent_pool[["mean_of_medians"]])
# [1] 0.003413425 <<<<<<<<<< use in paper



# Note: get similar estimates using mixed model. Decided to use more apt s.e.m. model above because of simplicity and straightforwardness, and because calculating resolution at 0 wks, 0 conc (i.e. in RH pools only) is most accurate reflection of ultimate resolution despite later fragment losses.

# Long format: one row per pool and method, with the estimate in `retent` and
# the source column name in `method`.
retent_pool_l <- reshape(
  retent_pool,
  varying = c("median_retent_seq", "TK1_median_retent"),
  v.names = "retent",
  timevar = "method",
  times = c("median_retent_seq", "TK1_median_retent"),
  # One row name per output row (two methods per pool); sized from the data
  # rather than drawn from a fixed 1:1000 supply.
  new.row.names = seq_len(2 * nrow(retent_pool)),
  direction = "long"
)

# Intercept-only mixed model with random effects for method and pool; the
# glht summary reports the intercept estimate and its standard error without
# multiplicity adjustment.
summary(
  glht(lmer(retent ~ (1 | method) + (1 | pool), data = retent_pool_l, REML = TRUE)),
  test = adjusted("none")
)

	 # Simultaneous Tests for General Linear Hypotheses

# Fit: lmer(formula = retent ~ (1 | method) + (1 | pool), data = retent_pool_l, 
    # REML = TRUE)

# Linear Hypotheses:
                 # Estimate Std. Error z value Pr(>|z|)   
# (Intercept) == 0  0.03943    0.01368   2.883  0.00394 **
# ---
# Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
# (Adjusted p values reported -- none method)





# --------------- From  clone_sem_1.R,  mean, var clone numb ---------------------

# Clone counts table (tab-separated, with header).
clone <- read.table(
  "clone.txt",
  header = TRUE,
  stringsAsFactors = FALSE,
  sep = "\t"
)

# Mean over pools of the summed RH clone counts (clones per fusion).
N <- mean(aggregate(RH_clones ~ Pool, data = clone, FUN = sum)[["RH_clones"]])
N
# [1] 2621.125

# Variance over pools of the summed RH clone counts.
varN <- var(aggregate(RH_clones ~ Pool, data = clone, FUN = sum)[["RH_clones"]])
varN
# [1] 1168018

# ----------- Freq of retention ~800 higher than for lentivirus --------------------------------------

# Number protein coding genes
# Count of annotation rows whose gene_type is "protein_coding".
num_cr <- length(
  gencode_gtf_ensembl_ucsc$geneSymbol[
    gencode_gtf_ensembl_ucsc$gene_type == "protein_coding"
  ]
)
# [1] 19940


# Count of annotation rows with any other gene_type (non-coding).
num_nc <- length(
  gencode_gtf_ensembl_ucsc$geneSymbol[
    gencode_gtf_ensembl_ucsc$gene_type != "protein_coding"
  ]
)
# [1] 38781

# Ratio of non-coding to coding gene counts:

num_nc / num_cr
# [1] 1.944885



# mean ratio retent vs retent in lentiviral protein library
# Ratio of R to 1 / num_cr, the per-gene frequency assumed here for the
# lentiviral library.
R / (1 / num_cr)
# [1] 786.2908 <<<<<<<<<<<<< do not use in paper 

# s.e.m. of that ratio (s.e.m. of R over the six pools, scaled the same way)
sem(retent_pool[["mean_of_medians"]]) / (1 / num_cr)
# [1] 68.06369 <<<<<<<<<<<<< do not use in paper 

 


# ------------ Number of cells with a gene in 75 cm2 flask with 10^7 cells ---------------

# RH pool: expected number of cells carrying a given gene
R * 10^7
# [1] 394328.4 <<<<<<<<<<<<< use in paper 

# s.e.m. of that count (s.e.m. of R over the six pools, scaled the same way)
sem(retent_pool[["mean_of_medians"]]) * 10^7
# [1] 34134.25 <<<<<<<<<<<<< use in paper


# lentiviral library, taking 2e4 genes:
(1 / 2e4) * 10^7
# [1] 500


# ---------------------- Redundancy --------------------------------------

# Redundancy: mean retention frequency times mean clone number.
mean_R_times_N <- R * N
mean_R_times_N
# [1] 103.3584 <<<<<<<<<<<<< use in paper

# Variance of the product R*N, using the formula from
# https://stats.stackexchange.com/questions/52646/variance-of-product-of-multiple-random-variables
# NOTE(review): that formula presumes R and N are independent -- confirm.
var_RN <- varR * varN + varR * N^2 + varN * R^2
var_RN
# [1] 2378.157

# s.e.m. of the redundancy, taking n = 6
sqrt(var_RN / 6)
# [1] 19.90878 <<<<<<<<<<<<< use in paper

    

# ------------------- Length human genome -------------------------------

# From human_chr_lengths_1.R
# length human genome, excluding mitochondria

# Human genome length (mitochondria excluded), used as G in the breakpoint
# calculation below.
# NOTE(review): value is presumably in base pairs -- confirm against
# human_chr_lengths_1.R.
G <- 3088269832


# ------------------ Number of breakpoints -------------------------------

# assuming av frag size is F and number clones N, no. bkpts == twice number frags:

# # simple estimate frag size
# # frag length = 7e6

# F <- mean(c(4e6,10e6))
# varF <- var(c(4e6,10e6))


# weighted estimate frag size

# Fragment size assigned by gamma value: 10e6 for gamma 3e3, 4e6 for gamma
# 10e3. Rows with any other gamma keep the initial 0.
clone[["frag_size"]] <- 0
clone[clone$gamma == 3e3, "frag_size"] <- 10e6
clone[clone$gamma == 10e3, "frag_size"] <- 4e6

# Total RH clones per pool, attached to every row of that pool.
RH_clones_sum <- aggregate(RH_clones ~ Pool, data = clone, FUN = sum)
names(RH_clones_sum)[2] <- "RH_clones_sum"

clone <- merge(clone, RH_clones_sum, by = "Pool", all.x = TRUE)

# Each row's fragment size weighted by its share of the pool's clones.
clone[["wt_frag_size"]] <- with(clone, frag_size * (RH_clones / RH_clones_sum))

head(clone)
  # Pool gamma dil colonies reverts RH_clones frag_size RH_clones_sum wt_frag_size
# 1    1  3000   5   545.00      95    450.00     1e+07       2196.25    2048947.1
# 2    1  3000  10  1045.00     260    785.00     1e+07       2196.25    3574274.3
# 3    1 10000   5   536.25     135    401.25     4e+06       2196.25     730791.1
# 4    1 10000  10   770.00     210    560.00     4e+06       2196.25    1019920.3
# 5    2  3000   5   842.50     145    697.50     1e+07       3924.25    1777409.7
# 6    2  3000  10  2042.50     380   1662.50     1e+07       3924.25    4236478.3

# Clone-weighted mean fragment size per pool (sum of the weighted sizes).
frag_size <- aggregate(wt_frag_size ~ Pool, data = clone, FUN = sum)

frag_size
  # Pool wt_frag_size
# 1    1      7373933
# 2    2      7608333
# 3    3      7534483
# 4    4      7376915
# 5    5      7206068
# 6    6      7052023


# Mean fragment size across pools, kept as F for the formulas below.
# NOTE: F is also R's built-in shorthand for FALSE, so from here on a bare F
# evaluates to this number; use FALSE explicitly in later code.
F <- mean(frag_size[["wt_frag_size"]])
F
# [1] 7358626 <<<<<<<<<< use in paper

sem(frag_size[["wt_frag_size"]])
# [1] 83890.22 <<<<<<<<<< use in paper


# Between-pool variance of the fragment size.
varF <- var(frag_size[["wt_frag_size"]])




# Average number of breakpoints (two per fragment):
2 * R * G * N / F
# [1] 86754.95 <<<<<<<<<<<<< use in paper


# Variance of the product R * N, using the formula without a covariance term
# (https://stats.stackexchange.com/questions/52646/variance-of-product-of-multiple-random-variables):
var_RN <- varR * varN + varR * N^2 + varN * R^2
var_RN
# [1] 2378.157


# Variance of the ratio (R * N) / F, following
# http://www.stat.cmu.edu/~hseltman/files/ratio.pdf
# local() keeps the intermediate squares out of the workspace.
var_RN_div_F <- local({
  RN_sq <- (R * N)^2
  F_sq <- F^2
  (RN_sq / F_sq) * ((var_RN / RN_sq) + varF / F_sq)
})

var_RN_div_F
# [1] 4.407231e-11


# for sem chose 6 for number, as conservative compromise between n = 12 (median seq and median TK), n = 6 (six pools) and n = 2 (frag lengths)

# Standard error of the number of breakpoints:
2 * G * sqrt(var_RN_div_F / 6)
# [1] 16739.89 <<<<<<<<<<<<< use in paper





# ------------- Expected resolution from one pool -----------------

# Mapping resolution depends on the initial number of breakpoints in the originating RH pools, not on the number of breakpoints remaining after selection during growth and in paclitaxel. The estimate below, which ignores other values of growth time and paclitaxel concentration, is therefore the most accurate.

# Expected resolution = genome length / number of fragments
#   G / (R * G * N / F), which reduces to:

F / (R * N)
# [1] 71195.24 <<<<<<<<<<<<< use in paper

# Variance of the ratio F / (R * N), following
# http://www.stat.cmu.edu/~hseltman/files/ratio.pdf
# local() keeps the intermediate squares out of the workspace.
var_F_div_RN <- local({
  RN_sq <- (R * N)^2
  F_sq <- F^2
  (F_sq / RN_sq) * ((var_RN / RN_sq) + varF / F_sq)
})


# Standard error of the expected resolution, taking n = 6:

sqrt(var_F_div_RN / 6)
# [1] 13737.55 <<<<<<<<<<<<< use in paper





# ------------- Numbers from above changed to fit expected numbers for six pools -----------------



# ---------------- mean, var and sem of mean_of_medians ---------------------------


# Mean retention, R, stays at 0.03943284; its variance is scaled up by six.

varR_six <- 6 * var(retent_pool$mean_of_medians)
varR_six
# [1] 0.0004194528



# --------------- From  clone_sem_1.R,  mean, var clone numb ---------------------

# Both the number of RH clones and its variance are scaled up by six.

# Total RH clones: six times the mean of the per-pool clone totals.
N_six <- 6 * mean(aggregate(RH_clones ~ Pool, data = clone, FUN = sum)[[2]])
N_six
# [1] 15726.75

# Variance of RH clones: six times the variance of the per-pool clone totals.
varN_six <- 6 * var(aggregate(RH_clones ~ Pool, data = clone, FUN = sum)[[2]])
varN_six
# [1] 7008110






# ------------------ Number of breakpoints six pools -------------------------------

# Mean fragment size, F, stays at 7358626 bp; its variance is scaled up by six.

varF_six <- 6 * varF


# Average number of breakpoints, six times the single-pool value:
6 * (2 * R * G * N / F)
# [1] 520529.7 <<<<<<<<<<<<< do not use in paper


# Variance of the product R * N_six, using the same formula as for one pool
# (https://stats.stackexchange.com/questions/52646/variance-of-product-of-multiple-random-variables)
# but with the six-pool variances and clone number:
var_RN_six <- varR_six * varN_six + varR_six * N_six^2 + varN_six * R^2
var_RN_six
# [1] 117580.4


# Variance of the ratio (R * N_six) / F, following
# http://www.stat.cmu.edu/~hseltman/files/ratio.pdf
# local() keeps the intermediate squares out of the workspace.
var_RN_div_F_six <- local({
  RN_six_sq <- (R * N_six)^2
  F_sq <- F^2
  (RN_six_sq / F_sq) * ((var_RN_six / RN_six_sq) + varF_six / F_sq)
})

var_RN_div_F_six
# [1] 2.204638e-09


# for sem chose 6 for number, as conservative (and realistic) compromise between n = 12 (median seq and median TK), n = 6 (six pools) and n = 2 (frag lengths)

# Standard error of the number of breakpoints for six pools:
2 * G * sqrt(var_RN_div_F_six / 6)
# [1] 118396.3 <<<<<<<<<<<<< do not use in paper





# ------------- Expected resolution from six pools -----------------

# Mapping resolution depends on the initial number of breakpoints in the originating RH pools, not on the number of breakpoints remaining after selection during growth and in paclitaxel. The estimate below, which ignores other values of growth time and paclitaxel concentration, is therefore the most accurate.

# Expected resolution = genome length / number of fragments
#   G / (R * G * N / F), which reduces to F / (R * N); here with N_six:

F / (R * N_six)
# [1] 11865.87 <<<<<<<<<<<<< use in paper

# Variance of the ratio F / (R * N_six), following
# http://www.stat.cmu.edu/~hseltman/files/ratio.pdf
# local() keeps the intermediate squares out of the workspace.
var_F_div_RN_six <- local({
  RN_six_sq <- (R * N_six)^2
  F_sq <- F^2
  (F_sq / RN_six_sq) * ((var_RN_six / RN_six_sq) + varF_six / F_sq)
})


# Standard error of the expected resolution for six pools, taking n = 6:

sqrt(var_F_div_RN_six / 6)
# [1] 2698.935 <<<<<<<<<<<<< use in paper

# The expected resolution and its sem for six pools are roughly the single-pool values divided by six


























