# Compare centromere retention in HEK293 cells vs RH pools

# --------- Prepare HEK293 data from our lab -----------------

# Read the HEK293 windowed read counts (tab-delimited with a header row).
# Columns used below: Chromosome, posS, posE, pos, reads.
HEK <- read.table(
  "HEK293_gseq.txt",
  header = TRUE, sep = "\t", stringsAsFactors = FALSE
)

# Set aside the rows at the beginning of each chromosome (posS == 0 and
# posE == 1e6) so they can be added back after the step filter below.
HEK_start <- HEK[HEK$posS == 0 & HEK$posE == 1e6, ]

# Get rid of ramp ups and ramp downs: keep only rows whose `pos` lies exactly
# 1e4 beyond the previous row. The leading 0 pads diff() to one value per row,
# so the very first row of the file never passes this filter.
HEK <- HEK[c(0, diff(HEK$pos)) == 1e4, ]

# Combine HEK without ramps and HEK_start.
HEK <- rbind(HEK_start, HEK)

# Recode chromosomes as numbers through the factor levels: chr1..chr22 become
# 1..22, chrX becomes 23 and chrY becomes 24. Then sort by chromosome and
# position.
chrOrder <- paste0("chr", c(1:22, "X", "Y"))
HEK$Chromosome <- as.numeric(factor(HEK$Chromosome, levels = chrOrder))
HEK <- HEK[order(HEK$Chromosome, HEK$pos), ]

# Transform reads into ratios relative to the mean over all rows
# (chrY is still included at this point).
HEK$read_ratio <- HEK$reads / mean(HEK$reads)

# Chromosome size, taken as the largest window position on each chromosome.
gen_coord <- aggregate(pos ~ Chromosome, FUN = max, data = HEK)
colnames(gen_coord)[2] <- "chr_size"
gen_coord <- gen_coord[order(gen_coord$Chromosome), ]

# Genome-wide offset of each chromosome = summed size of all chromosomes
# before it. as.numeric() avoids integer overflow in cumsum() (the running
# total exceeds 2^31 for a human genome), and indexing by the number of rows
# works for any number of chromosomes rather than exactly 24.
gen_coord$coord <- cumsum(
  c(0, as.numeric(gen_coord$chr_size))
)[seq_len(nrow(gen_coord))]

# Attach the offsets to HEK. merge() puts Chromosome first and does not
# promise the within-chromosome order, so sort again afterwards.
HEK <- merge(HEK, gen_coord[, c("Chromosome", "coord")], by = "Chromosome")
HEK <- HEK[order(HEK$Chromosome, HEK$pos), ]

# Genome coordinate = position within the chromosome + chromosome offset.
HEK$coord <- HEK$pos + HEK$coord

# Get rid of chrY (coded 24).
HEK <- HEK[HEK$Chromosome != 24, ]

dim(HEK)
# [1] 300814      7

head(HEK)
#   Chromosome  posS    posE    pos reads read_ratio  coord
# 1          1     0 1000000 500000 24858   1.954085 500000
# 2          1 10000 1010000 510000 25003   1.965484 510000
# 3          1 20000 1020000 520000 24978   1.963519 520000
# 4          1 30000 1030000 530000 24969   1.962811 530000
# 5          1 40000 1040000 540000 24933   1.959981 540000
# 6          1 50000 1050000 550000 24836   1.952356 550000


#----------------- Prepare human reads retained in RH pools ---------------------

# Read the RH pool human read counts (tab-delimited with a header row).
# Columns used below: Chromosome, posS, posE, pos and RH1_w0_d0..RH6_w0_d0.
RH_human <- read.table(
  "RH_human_gseq.txt",
  header = TRUE, sep = "\t", stringsAsFactors = FALSE
)

# Set aside the rows at the beginning of each chromosome (posS == 0 and
# posE == 1e6) so they can be added back after the step filter below.
RH_human_start <- RH_human[RH_human$posS == 0 & RH_human$posE == 1e6, ]

# Get rid of ramp ups and ramp downs: keep only rows whose `pos` lies exactly
# 1e4 beyond the previous row. The leading 0 pads diff() to one value per row,
# so the very first row of the file never passes this filter.
RH_human <- RH_human[c(0, diff(RH_human$pos)) == 1e4, ]

# Combine RH_human without ramps and RH_human_start.
RH_human <- rbind(RH_human_start, RH_human)

# Recode chromosomes as numbers through the factor levels: chr1..chr22 become
# 1..22, chrX becomes 23 and chrY becomes 24. Then sort by chromosome and
# position.
chrOrder <- paste0("chr", c(1:22, "X", "Y"))
RH_human$Chromosome <- as.numeric(factor(RH_human$Chromosome, levels = chrOrder))
RH_human <- RH_human[order(RH_human$Chromosome, RH_human$pos), ]

# Normalize RH pools using peak copy number at TK1: each week-0 pool is
# divided by its own largest read count on chromosome 17. The result is
# stored in a new "<pool>_ratio" column.
on_chr17 <- RH_human$Chromosome == 17
for (pool in paste0("RH", 1:6, "_w0_d0")) {
  chr17_reads <- RH_human[on_chr17, pool]
  RH_human[[paste0(pool, "_ratio")]] <-
    RH_human[, pool] / chr17_reads[which.max(chr17_reads)]
}

# Chromosome size, taken as the largest window position on each chromosome.
gen_coord <- aggregate(pos ~ Chromosome, FUN = max, data = RH_human)
colnames(gen_coord)[2] <- "chr_size"
gen_coord <- gen_coord[order(gen_coord$Chromosome), ]

# Genome-wide offset of each chromosome = summed size of all chromosomes
# before it. as.numeric() avoids integer overflow in cumsum() (the running
# total exceeds 2^31 for a human genome), and indexing by the number of rows
# works for any number of chromosomes rather than exactly 24.
gen_coord$coord <- cumsum(
  c(0, as.numeric(gen_coord$chr_size))
)[seq_len(nrow(gen_coord))]

# Attach the offsets to RH_human. merge() puts Chromosome first and does not
# promise the within-chromosome order, so sort again afterwards.
RH_human <- merge(
  RH_human, gen_coord[, c("Chromosome", "coord")],
  by = "Chromosome"
)
RH_human <- RH_human[order(RH_human$Chromosome, RH_human$pos), ]

# Genome coordinate = position within the chromosome + chromosome offset.
RH_human$coord <- RH_human$pos + RH_human$coord

# Get rid of chrY (coded 24).
RH_human <- RH_human[RH_human$Chromosome != 24, ]

dim(RH_human)
# [1] 300814    126

RH_human[1:10, 1:10]
#    Chromosome  posS    posE    pos RH1_w0_d0 RH1_w1_d0 RH1_w1_d8 RH1_w1_d25 RH1_w1_d75 RH1_w2_d0
# 1           1     0 1000000 500000        80       112       122         97         66       121
# 2           1 10000 1010000 510000        80       113       123         98         67       122
# 3           1 20000 1020000 520000        75       112       123         92         62       120
# 4           1 30000 1030000 530000        76       110       123         90         62       120
# 5           1 40000 1040000 540000        77       109       121         90         61       120
# 6           1 50000 1050000 550000        78       108       122         89         63       121
# 7           1 60000 1060000 560000        77       106       122         89         62       121
# 8           1 70000 1070000 570000        75       106       119         90         61       120
# 9           1 80000 1080000 580000        77       104       118         89         61       122
# 10          1 90000 1090000 590000        80       102       119         89         62       125

# --------- Read in centromere limits -------------------

# Load the hg38 centromere limits (one row per chromosome).
# cf hg38_centromere_limits_1.R
hg38_cen_limits <- read.table(
  file = "hg38_centromere_limits.txt",
  sep = "\t",
  header = TRUE,
  stringsAsFactors = FALSE
)

# Replace the hg38_cen_limits chromosome names with 1, 2, 3 etc.: the number
# is the name's position in chrOrder, so chrX becomes 23 and chrY becomes 24.
chrOrder <- paste0("chr", c(1:22, "X", "Y"))
hg38_cen_limits$Chromosome <- as.numeric(
  match(as.character(hg38_cen_limits$Chromosome), chrOrder)
)

# Sort by chromosome number.
cen_sort_idx <- order(hg38_cen_limits$Chromosome)
hg38_cen_limits <- hg38_cen_limits[cen_sort_idx, ]

# Get rid of chrY (coded 24).
hg38_cen_limits <- hg38_cen_limits[!(hg38_cen_limits$Chromosome == 24), ]

dim(hg38_cen_limits)
# [1] 23  4

head(hg38_cen_limits)
#   Chromosome      posS      posE       pos
# 1          1 122026459 124932724 123479592
# 2          2  92188145  94090557  93139351
# 3          3  90772458  93655574  92214016
# 4          4  49712061  51743951  50728006
# 5          5  46485900  50059807  48272854
# 6          6  58553888  59829934  59191911


# --------- Ratio mean cen reads vs rest of genome for our HEK293 data and RH pool data---------------

# ~~~~~~~ Prepare HEK293 data ~~~~~~~~~~


# Split the HEK293 windows into a centromere-free set (HEK_no_cen) and a
# centromere-only set (HEK_cen_only).
#
# A window counts as centromeric when it overlaps the centromere interval of
# its own chromosome at all, i.e. window posE > centromere posS and window
# posS < centromere posE. Using posS/posE rather than the window midpoint
# totally cleans centromeres out of HEK_no_cen, at the risk of including some
# euchromatin in HEK_cen_only.

# Row of hg38_cen_limits that holds the limits for each window's chromosome.
# A chromosome with no entry gives NA and its windows are kept as
# non-centromeric.
HEK_cen_row <- match(HEK$Chromosome, hg38_cen_limits$Chromosome)

# One overlap mask shared by both subsets, so they are complementary by
# construction.
HEK_in_cen <- !is.na(HEK_cen_row) &
  HEK$posE > hg38_cen_limits$posS[HEK_cen_row] &
  HEK$posS < hg38_cen_limits$posE[HEK_cen_row]

HEK_no_cen <- HEK[!HEK_in_cen, ]
HEK_cen_only <- HEK[HEK_in_cen, ]

dim(HEK_no_cen)
# [1] 292329      7

dim(HEK_cen_only)
# [1] 8485    7

dim(HEK)
# [1] 300814      7

# Sanity check: the two subsets must partition HEK exactly
# (292329 + 8485 = 300814 in the recorded run).
stopifnot(nrow(HEK_no_cen) + nrow(HEK_cen_only) == nrow(HEK))

# Fraction of windows (rows) that overlap a centromere, computed from the
# data rather than from row counts copied out of an earlier run.
nrow(HEK_cen_only) / nrow(HEK)
# [1] 0.0282068

# ~~~~~~~~~~ Prepare RH pool data ~~~~~~~~~~~~


# Split the RH pool windows into a centromere-free set (RH_human_no_cen) and
# a centromere-only set (RH_human_cen_only).
#
# A window counts as centromeric when it overlaps the centromere interval of
# its own chromosome at all, i.e. window posE > centromere posS and window
# posS < centromere posE. Using posS/posE rather than the window midpoint
# totally cleans centromeres out of RH_human_no_cen, at the risk of including
# some euchromatin in RH_human_cen_only.

# Row of hg38_cen_limits that holds the limits for each window's chromosome.
# A chromosome with no entry gives NA and its windows are kept as
# non-centromeric.
RH_human_cen_row <- match(RH_human$Chromosome, hg38_cen_limits$Chromosome)

# One overlap mask shared by both subsets, so they are complementary by
# construction.
RH_human_in_cen <- !is.na(RH_human_cen_row) &
  RH_human$posE > hg38_cen_limits$posS[RH_human_cen_row] &
  RH_human$posS < hg38_cen_limits$posE[RH_human_cen_row]

RH_human_no_cen <- RH_human[!RH_human_in_cen, ]
RH_human_cen_only <- RH_human[RH_human_in_cen, ]

dim(RH_human_no_cen)
# [1] 292329    126

dim(RH_human_cen_only)
# [1] 8485    126

dim(RH_human)
# [1] 300814    126

# Sanity check: the two subsets must partition RH_human exactly
# (292329 + 8485 = 300814 in the recorded run).
stopifnot(
  nrow(RH_human_no_cen) + nrow(RH_human_cen_only) == nrow(RH_human)
)

# Fraction of windows (rows) that overlap a centromere, computed from the
# data rather than from row counts copied out of an earlier run.
nrow(RH_human_cen_only) / nrow(RH_human)
# [1] 0.0282068

# ~~~~~~~~~~ Compare ratios HEK293 cen to non-cen and RH pool cen to non-cen ~~~~~~~~~~~~~~~~~~~~

# Cen : non-cen ratio for HEK293, using the median read_ratio of the
# centromere-only windows over the median of the centromere-free windows.
HEK_ratio <- median(HEK_cen_only[["read_ratio"]]) /
  median(HEK_no_cen[["read_ratio"]])
# [1] 1.07507

# The same median ratio for each of the six RHn_w0_d0_ratio columns,
# averaged across the pools.
mean(vapply(
  paste0("RH", 1:6, "_w0_d0_ratio"),
  function(pool_col) {
    median(RH_human_cen_only[[pool_col]]) / median(RH_human_no_cen[[pool_col]])
  },
  numeric(1)
))
# [1] 5.495818 <<<<<<<<<<<< use in paper

# Standard error of the mean of `x`, ignoring NA values: the square root of
# the NA-free variance divided by the number of non-missing observations.
sem <- function(x) {
  n_obs <- sum(!is.na(x))
  sample_var <- var(x, na.rm = TRUE)
  sqrt(sample_var / n_obs)
}

# Cen : non-cen median ratio for each of the six RHn_w0_d0_ratio columns.
# Computed once and reused below, so every statistic is guaranteed to use the
# same six values and t.test() prints a readable `data:` line.
RH_ratios <- vapply(
  paste0("RH", 1:6, "_w0_d0_ratio"),
  function(pool_col) {
    median(RH_human_cen_only[, pool_col]) / median(RH_human_no_cen[, pool_col])
  },
  numeric(1),
  USE.NAMES = FALSE
)

# Standard error of the mean across the six pools.
sem(RH_ratios)
# [1] 0.2253278 <<<<<<<<<<<< use in paper

# t test assuming pooled variance (more conservative than assuming HEK_ratio
# is theoretical value of comparison, reasonable to assume pooled variance in
# this setting). HEK_ratio is a single value, so the pooled variance comes
# from the six RH ratios alone (df = 5).
t.test(RH_ratios, HEK_ratio, var.equal = TRUE)

# Numeric results below were recorded from the original run:
#
# 	Two Sample t-test
#
# data:  RH_ratios and HEK_ratio
# t = 7.4154, df = 5, p-value = 0.0007023
# alternative hypothesis: true difference in means is not equal to 0
# 95 percent confidence interval:
#  2.888266 5.953229
# sample estimates:
# mean of x mean of y
#  5.495818  1.075070   <<<<<<<<<<<<<<<<<<<<< use in paper


# Assuming HEK mean is theoretical mean (more liberal than above):
t.test(RH_ratios, mu = HEK_ratio)

# Numeric results below were recorded from the original run:
#
# 	One Sample t-test
#
# data:  RH_ratios
# t = 19.619, df = 5, p-value = 6.352e-06
# alternative hypothesis: true mean is not equal to 1.07507
# 95 percent confidence interval:
#  4.916594 6.075041
# sample estimates:
# mean of x
#  5.495818



































