# Used depthOfCoverage_100000-GS00370-DNA_A01_1110_36-ASM.tsv.


library(ggplot2)
library(cowplot) #used with plot_grid 


# Shared ggplot2 theme for every panel: blank panel (no grid, no background),
# thin black axes and ticks, no legend. Individual plots may override parts of
# it by adding their own theme() call afterwards.
theme2 <- theme(
	# Outer margin around each panel, in cm.
	plot.margin = unit(c(t = 1.2, r = 0.4, b = 1.2, l = 0.4), "cm"),

	# Strip the default grey panel and its grid lines.
	panel.background = element_blank(),
	panel.grid.major = element_blank(),
	panel.grid.minor = element_blank(),

	legend.position = "none",

	# Thin black axis lines and tick marks.
	axis.line.x = element_line(colour = "black", size = 0.1),
	axis.line.y = element_line(colour = "black", size = 0.1),
	axis.ticks = element_line(colour = "black", size = 0.1),

	# Numbers on the tick marks of both axes.
	axis.text = element_text(size = 12),
	# Titles of both axes.
	axis.title = element_text(size = 14),
	# Push the x axis title down by adding margin space above it.
	axis.title.x = element_text(margin = margin(t = 10, r = 0, b = 0, l = 0)),
	# Push the y axis title left by adding margin space to its right.
	axis.title.y = element_text(margin = margin(t = 0, r = 13, b = 0, l = 0)),

	# Title slot could carry "A", "B", ... via ggtitle(); plot_grid() labels are
	# used instead because they can be shifted further left.
	plot.title = element_text(size = 32, face = "bold", hjust = -0.14),
	# Centred subtitle (larger hjust shifts it right).
	plot.subtitle = element_text(size = 14, face = "plain", hjust = 0.5)
)

# Point size used by the genome-wide scatter plots.
size_point <- 0.3
# Line width reserved for horizontal reference lines.
size_hline <- 0.1



# --------- Prepare and plot HEK data from our lab -----------------

# Read the HEK293 sequence reads (tab-delimited with a header row). The code
# below uses the columns Chromosome, posS, posE, pos and reads.
HEK <- read.table("HEK293_gseq.txt", header = TRUE, sep = "\t", stringsAsFactors = FALSE)

# Set aside the first window of each chromosome (posS == 0, posE == 1e6) so it
# can be added back after the ramp filter below.
HEK_start <- HEK[HEK$posS == 0 & HEK$posE == 1e6, ]

# Get rid of ramp ups and ramp downs: keep only rows whose pos is exactly 1e4
# beyond the pos of the previous row.
HEK <- HEK[c(0, diff(HEK$pos)) == 1e4, ]

# Combine HEK without ramps and HEK_start.
HEK <- rbind(HEK_start, HEK)

# Recode chr1..chr22, chrX, chrY as the numbers 1..24 in a single step
# (X = 23, Y = 24). Previously this was done twice; the second pass operated
# on already-numeric values and changed nothing.
HEK$Chromosome <- as.numeric(match(HEK$Chromosome, paste0("chr", c(1:22, "X", "Y"))))
# Any other chromosome name is not supported by the rest of the script.
stopifnot(!anyNA(HEK$Chromosome))

# Sort by chromosome, then position.
HEK <- HEK[order(HEK$Chromosome, HEK$pos), ]

# Later sections use chrOrder as the factor levels for the numeric codes.
chrOrder <- c(1:24)

# Transform reads into ratios to the genome-wide mean.
HEK$read_ratio <- HEK$reads / mean(HEK$reads)

# Compute chromosome size as the largest pos observed on each chromosome.
# HEK$Chromosome is numeric (1..24) here, so plain numeric ordering is used.
gen_coord <- aggregate(pos ~ Chromosome, FUN = max, data = HEK)
colnames(gen_coord)[2] <- "chr_size"
gen_coord <- gen_coord[order(gen_coord$Chromosome), ]

# Work in doubles: a running total of integer sizes over the whole genome
# would exceed R's integer range.
gen_coord$chr_size <- as.numeric(gen_coord$chr_size)

# Genome-wide offset of each chromosome = total size of all chromosomes before
# it. Dropping the last cumulative sum with head(., -1) works for any number
# of chromosomes instead of assuming exactly 24.
gen_coord$coord <- c(0, head(cumsum(gen_coord$chr_size), -1))

# Attach each chromosome's offset to its rows, then restore the sort order
# (merge() reorders rows).
HEK <- merge(HEK, gen_coord[, c("Chromosome", "coord")], by = "Chromosome")
HEK <- HEK[order(HEK$Chromosome, HEK$pos), ]

# Genome-wide coordinate = position within chromosome + chromosome offset.
HEK$coord <- HEK$pos + HEK$coord

# Get rid of chrY (coded as 24).
HEK <- HEK[HEK$Chromosome != 24, ]

# Midpoint between the smallest and largest value of x.
mid <- function(x) {
	(max(x) + min(x)) / 2
}

# One row per chromosome holding the midpoint of its genome-wide coordinates;
# these become the x axis tick positions in ggplot.
chr_mid <- aggregate(coord ~ Chromosome, data = HEK, FUN = mid)
names(chr_mid)[2] <- "mid"

# Order the rows by chromosome using the chrOrder levels, then store the
# chromosome as a number again.
chr_mid[["Chromosome"]] <- factor(chr_mid[["Chromosome"]], levels = chrOrder)
chr_mid <- chr_mid[order(chr_mid[["Chromosome"]]), ]
chr_mid[["Chromosome"]] <- as.numeric(chr_mid[["Chromosome"]])

# Axis breaks sit at the chromosome midpoints.
breaks <- chr_mid[["mid"]]


# Alternating grey / skyblue so neighbouring chromosomes are distinguishable.
# 24 colours are supplied; the plot uses one per chromosome present.
cb1 <- rep(c("grey", "skyblue"), 12)

# One tick label per chromosome 1..22 and X (23 entries, matching the 23
# midpoints in `breaks`); empty strings leave a tick unlabelled.
labels <- as.character(c(1:9, "", 11, "", 13, "", "", 16, "", "", "", 20, "", "", "X"))

# Panel A: genome-wide read ratio for the HEK293 data from this study.
p1 <- ggplot(data = HEK, aes(x = coord, y = read_ratio, color = as.factor(Chromosome))) +
	geom_point(size = size_point, stroke = 0) +
	scale_color_manual(values = cb1) +
	theme2 +
	scale_x_continuous(breaks = breaks, labels = labels) +
	# ggtitle("") +
	xlab("Chromosome") +
	ylab("Copy") +
	labs(subtitle = "HEK293 (this study)") +
	# `limits` spelled out in full rather than relying on partial matching.
	scale_y_continuous(breaks = c(0, 2, 4), limits = c(0, 5))
print(p1)



# ----------- Prepare and plot HEK293 depth of coverage CNV file from EBI ---------------

# From ftp://ftp.sra.ebi.ac.uk/vol1/ERA152/ERA152199/CGGS00370-DNA_A01_1110_36-ASM/GS00370-DNA_A01_1110_36-ASM/GS00370-DNA_A01/ASM/CNV/
# download:
# depthOfCoverage_100000-GS00370-DNA_A01_1110_36-ASM.tsv
# delete header using TextEdit

# Load the public depth-of-coverage table (tab-delimited, with a header row).
cnv <- read.table(
	"depthOfCoverage_100000-GS00370-DNA_A01_1110_36-ASM.tsv",
	header = TRUE,
	sep = "\t",
	stringsAsFactors = FALSE
)

head(cnv)
  # chromosome position uniqueSequenceCoverage weightSumSequenceCoverage gcCorrectedCvg avgNormalizedCoverage
# 1       chr1    50000                  5.990                    86.326         88.697                  70.3
# 2       chr1   357582                  0.117                    48.142         52.112                  75.3
# 3       chr1   561231                 30.290                   573.417        511.212                 179.1
# 4       chr1   661231                  4.815                    54.062         53.199                  67.3
# 5       chr1   761231                 47.343                    83.658         80.528                  63.4
# 6       chr1   861231                 60.103                    64.816         72.498                  61.1

# chrM is not present in this file, so it needs no special handling.

# Turn "chr1", "chr2", ... into plain numbers, with X -> 23 and Y -> 24.
# The Y line is harmless when the file has no chrY rows.
cnv[["chromosome"]] <- gsub("chr", "", cnv[["chromosome"]])
cnv[cnv[["chromosome"]] == "X", "chromosome"] <- 23
cnv[cnv[["chromosome"]] == "Y", "chromosome"] <- 24

# Numeric chromosome order used as factor levels from here on.
# chrOrder<-paste("chr",c(1:22,"X","Y"),sep="")
chrOrder <- 1:24

# Sort by chromosome and position, then keep the chromosome as a number.
cnv[["chromosome"]] <- factor(cnv[["chromosome"]], levels = chrOrder)
cnv <- cnv[with(cnv, order(chromosome, position)), ]
cnv[["chromosome"]] <- as.numeric(cnv[["chromosome"]])



# Keep only the chromosome, the position and the normalised coverage.
cnv <- cnv[, c("chromosome", "position", "avgNormalizedCoverage")]
head(cnv)
  # chromosome position avgNormalizedCoverage
# 1          1    50000                  70.3
# 2          1   357582                  75.3
# 3          1   561231                 179.1
# 4          1   661231                  67.3
# 5          1   761231                  63.4
# 6          1   861231                  61.1

# Use the same column names as the HEK data frame so both can share code.
colnames(cnv) <- c("Chromosome", "pos", "read_ratio")
head(cnv)
  # Chromosome    pos read_ratio
# 1          1  50000       70.3
# 2          1 357582       75.3
# 3          1 561231      179.1
# 4          1 661231       67.3
# 5          1 761231       63.4
# 6          1 861231       61.1

# Express coverage as a ratio to the genome-wide mean.
cnv$read_ratio <- cnv$read_ratio / mean(cnv$read_ratio)

# cnv was already sorted by chromosome and position, with the chromosome
# stored as a number, before the columns were selected; selecting, renaming
# and rescaling do not change row order, so no re-sort is needed here.

# read_ratio now holds ratios to the mean. The printout previously recorded
# at this point (70.3, 75.3, 179.1, ...) showed the values from BEFORE the
# division above and no longer describes the data.
head(cnv)

# Chromosome size = largest pos observed on each chromosome. cnv$Chromosome
# is numeric here, so plain numeric ordering is used.
gen_coord <- aggregate(pos ~ Chromosome, FUN = max, data = cnv)
colnames(gen_coord)[2] <- "chr_size"
gen_coord <- gen_coord[order(gen_coord$Chromosome), ]

# Work in doubles: a running total of integer sizes over the whole genome
# would exceed R's integer range.
gen_coord$chr_size <- as.numeric(gen_coord$chr_size)

# Genome-wide offset of each chromosome = total size of all chromosomes before
# it. head(., -1) drops the final cumulative sum for any number of
# chromosomes instead of assuming exactly 23.
gen_coord$coord <- c(0, head(cumsum(gen_coord$chr_size), -1))

# Attach each chromosome's offset to its rows, then restore the sort order
# (merge() reorders rows).
cnv <- merge(cnv, gen_coord[, c("Chromosome", "coord")], by = "Chromosome")
cnv <- cnv[order(cnv$Chromosome, cnv$pos), ]

# Genome-wide coordinate = position within chromosome + chromosome offset.
cnv$coord <- cnv$pos + cnv$coord


# Alternating grey / skyblue palette, one entry per chromosome.
cb1 <- rep(c("grey", "skyblue"), 12)

# Point size and reference-line width for the plots.
size_point <- 0.3
size_hline <- 0.1

# Tick labels for chromosomes 1..22 and X; empty strings leave a tick blank.
labels <- as.character(c(1:9, "", 11, "", 13, "", "", 16, "", "", "", 20, "", "", "X"))

# Midpoint between the smallest and largest value of x.
mid <- function(x) {
	(max(x) + min(x)) / 2
}

# Midpoint of each chromosome's genome-wide coordinates in the cnv data.
chr_mid <- aggregate(coord ~ Chromosome, data = cnv, FUN = mid)
names(chr_mid)[2] <- "mid"

# Order the rows by chromosome using the chrOrder levels, then store the
# chromosome as a number again.
chr_mid[["Chromosome"]] <- factor(chr_mid[["Chromosome"]], levels = chrOrder)
chr_mid <- chr_mid[order(chr_mid[["Chromosome"]]), ]
chr_mid[["Chromosome"]] <- as.numeric(chr_mid[["Chromosome"]])

# Axis breaks sit at the chromosome midpoints.
breaks <- chr_mid[["mid"]]

# Range of the normalised ratios, used to choose the y axis limits below.
max(cnv$read_ratio)
# [1] 4.345646

min(cnv$read_ratio)
# [1] 0.3513702

# Panel B: genome-wide coverage ratio for the public HEK293 data.
# The y axis stops at 2.1 although the recorded maximum is about 4.35; with
# ggplot2's default out-of-bounds handling, points above the limit are
# dropped from the plot (with a warning).
p2 <- ggplot(data = cnv, aes(x = coord, y = read_ratio, color = as.factor(Chromosome))) +
	geom_point(size = size_point, stroke = 0) +
	scale_color_manual(values = cb1) +
	theme2 +
	scale_x_continuous(breaks = breaks, labels = labels) +
	# ggtitle("") +
	xlab("Chromosome") +
	ylab("Copy") +
	labs(subtitle = "HEK293 (Lin et al., 2014)") +
	# `limits` spelled out in full rather than relying on partial matching.
	scale_y_continuous(breaks = c(0, 1, 2), limits = c(0, 2.1))
print(p2)




# ---------- Scatterplot of copy number for public HEK293 data vs our data ----------------

dim(cnv)
# [1] 28175     4

dim(HEK)
#  [1] 300814      7

# For every value in `targets`, return the index of the closest value in
# `coords`. On ties the lowest index wins, exactly as which.min() would
# choose when scanning `coords` from the start.
#
# targets: numeric vector of query coordinates.
# coords:  numeric vector of reference coordinates.
# Returns an integer vector with one index into `coords` per target.
nearest_coord_index <- function(targets, coords) {
	n_coords <- length(coords)

	# The fast path needs complete, non-decreasing coordinates; otherwise fall
	# back to scanning every reference coordinate for every target.
	if (n_coords == 0L || anyNA(coords) || anyNA(targets) || is.unsorted(coords)) {
		return(vapply(
			targets,
			function(x) which.min(abs(x - coords)),
			integer(1)
		))
	}

	# findInterval() gives the last reference coordinate <= target (0 if none),
	# so the nearest one is either that entry or the one right after it.
	below <- findInterval(targets, coords)
	above <- pmin(below + 1L, n_coords)
	below <- pmax(below, 1L)

	# Strict "<" keeps the lower neighbour when both are equally close.
	use_above <- abs(targets - coords[above]) < abs(targets - coords[below])
	nearest <- below
	nearest[use_above] <- above[use_above]

	# If the chosen coordinate is duplicated, report its first occurrence.
	match(coords[nearest], coords)
}

# Down sample HEK to match the cnv data frame: for each cnv row take the HEK
# row with the closest genome-wide coordinate. This replaces a row-by-row loop
# that took ~1-2 mins and selects the same rows.
#
# NOTE(review): HEK$coord and cnv$coord are each built from the chromosome
# sizes observed in their own data set, so the two coordinate systems are not
# guaranteed to line up exactly (particularly near chromosome boundaries);
# this also presumes both data sets use the same genome build - confirm.
HEK_down <- HEK[nearest_coord_index(cnv$coord, HEK$coord), , drop = FALSE]
# Renumber rows 1..n, as the loop-built data frame was.
rownames(HEK_down) <- NULL

dim(HEK_down)
# [1] 28175     7

# Paired ratios for the scatter plot; the chromosome comes from the HEK row.
combined_cnv <- data.frame(
	Chromosome = HEK_down$Chromosome,
	HEK_ratio = HEK_down$read_ratio,
	cnv_ratio = cnv$read_ratio
)


# Panel C: scatter plot of the public copy ratio against this study's ratio,
# one point per cnv window, coloured by chromosome, with a linear fit.
# (The former `show_guide = FALSE` argument to ggplot() was dropped: ggplot()
# ignores it, and the legend is deliberately shown on the right.)
p3 <- ggplot(data = combined_cnv, aes(x = HEK_ratio, y = cnv_ratio, color = as.factor(Chromosome))) +
	geom_point(size = 0.5, stroke = 0) +
	theme2 +
	# aes(group = 1) fits one regression line across all chromosomes.
	stat_smooth(method = "lm", formula = y ~ x, aes(group = 1), se = TRUE, colour = "pink", size = 0.5, fill = "grey") +
	scale_color_discrete(name = "Chromosomes", labels = c(1:22, "X")) +
	# Overrides of theme2 for this panel: tighter margins, compact legend on
	# the right, and slightly smaller axis titles.
	theme(
		plot.margin = unit(c(t = 0.8, r = 0.4, b = 0.8, l = 0.4), "cm"),
		legend.position = "right",
		legend.title = element_text(size = 9),
		legend.text = element_text(size = 8),
		legend.title.align = 0.7,
		legend.margin = margin(t = 0, r = 0, b = 0, l = -5, unit = "pt"),
		legend.box.margin = margin(t = 0, r = 0, b = 0, l = -5, unit = "pt"),
		legend.key = element_rect(fill = NA),
		legend.key.height = unit(0.2, "cm"),
		legend.key.width = unit(0.3, "cm"),
		legend.spacing.y = unit(0.1, "cm"),
		legend.spacing.x = unit(0.1, "cm"),
		axis.title = element_text(size = 13),
		axis.text = element_text(size = 12)
	) +
	# Two-column legend, filled column by column, with enlarged key points.
	guides(colour = guide_legend(override.aes = list(size = 1), ncol = 2, byrow = FALSE)) +
	# R and P are typed in by hand from the cor.test() of HEK_down$read_ratio
	# against cnv$read_ratio run later in this script; update them if the data
	# change.
	annotate("text", x = 0.23, y = 2.0, label = "italic('R') == 0.52", parse = TRUE, size = 3) +
	annotate("text", x = 0.41, y = 1.87, label = "italic('P') < 2.2%*%10^-16", parse = TRUE, size = 3) +
	# ggtitle("") +
	xlab("HEK293 copy number \n(this study)") +
	ylab("HEK293 copy number \n(Lin et al., 2014)") +
	labs(subtitle = "") +
	# `limits` spelled out in full rather than relying on partial matching.
	scale_x_continuous(breaks = c(0:2), limits = c(0, 2)) +
	scale_y_continuous(breaks = c(0:2), limits = c(0, 2))
print(p3)

# ---------- Corr of depth of coverage copy number for public HEK293 data and our data ----------------
# ---------- Data used in legend of p3 scatterplot above ----------------------

# Pearson correlation between the down-sampled HEK ratios and the public
# ratios; these are the R and P values quoted in the p3 scatter plot.
cor.test(x = HEK_down$read_ratio, y = cnv$read_ratio)

	# Pearson's product-moment correlation <<<<<<<<<< use in paper

# data:  HEK_down$read_ratio and cnv$read_ratio
# t = 101.1, df = 28173, p-value < 2.2e-16
# alternative hypothesis: true correlation is not equal to 0
# 95 percent confidence interval:
 # 0.5073341 0.5244712
# sample estimates:
      # cor 
# 0.5159543 

# The stored p-value prints as 0, while the summary above reports it as
# < 2.2e-16.
cor.test(x = HEK_down$read_ratio, y = cnv$read_ratio)[["p.value"]]
# [1] 0 <<<<<<<<<<<<<< use in paper


#------------------Make file --------------------------


# Draw the three-panel figure (A = p1, B = p2, C = p3 on a 2 x 2 grid) on the
# currently open graphics device. The explicit print() matters: a bare
# plot_grid() call is only drawn by top-level auto-printing, which does not
# happen when this script is run through source(), leaving empty files.
draw_panels <- function() {
	print(plot_grid(p1, p2, p3, labels = c("A", "B", "C"), ncol = 2, nrow = 2, label_size = 16))
}

# Vector PDF.
pdf("Compare_HEK293_CNAs.pdf", width = 7.5, height = 6.67, useDingbats = FALSE)
draw_panels()
dev.off()

# TIFF at 300 dpi.
tiff("Compare_HEK293_CNAs.tif", width = 7.5, height = 6.67, units = "in", res = 300)
draw_panels()
dev.off()

# PNG at 300 dpi.
png("Compare_HEK293_CNAs.png", width = 7.5, height = 6.67, units = "in", res = 300)
draw_panels()
dev.off()

# High-resolution PNG at 1200 dpi.
png("Compare_HEK293_CNAs_hi_res.png", width = 7.5, height = 6.67, units = "in", res = 1200)
draw_panels()
dev.off()





# ------ INFORMATIONAL NOTE: Both our data and data from published paper contain centromere sequences, so no need to correct for these -----------

# For example:

# Centromere limits per chromosome (tab-delimited, with a header row).
hg38_cen_limits <- read.table(
	"hg38_centromere_limits.txt",
	header = TRUE,
	sep = "\t",
	stringsAsFactors = FALSE
)

# Replace the hg38_cen_limits chromosome names with 1, 2, 3 etc.
chrOrder <- paste0("chr", c(1:22, "X", "Y"))
# chrOrder<-c(1:11)
hg38_cen_limits[["Chromosome"]] <- factor(hg38_cen_limits[["Chromosome"]], levels = chrOrder)
hg38_cen_limits <- hg38_cen_limits[order(hg38_cen_limits[["Chromosome"]]), ]
# as.numeric() on the factor returns each level's position in chrOrder, so
# chr1..chr22 become 1..22, chrX becomes 23 and chrY becomes 24.
hg38_cen_limits[["Chromosome"]] <- as.numeric(hg38_cen_limits[["Chromosome"]])

hg38_cen_limits
   # Chromosome      posS      posE       pos
# 1           1 122026459 124932724 123479592
# 2           2  92188145  94090557  93139351
# 3           3  90772458  93655574  92214016
# 4           4  49712061  51743951  50728006
# 5           5  46485900  50059807  48272854
# 6           6  58553888  59829934  59191911
# 7           7  58169653  61528020  59848836
# 8           8  44033744  45877265  44955504
# 9           9  43389635  45518558  44454096
# 10         10  39686682  41593521  40640102
# 11         11  51078348  54425074  52751711
# 12         12  34769407  37185252  35977330
# 13         13  16000000  18051248  17025624
# 14         14  16000000  18173523  17086762
# 15         15  17083673  19725254  18404464
# 16         16  36311158  38265669  37288414
# 17         17  22813679  26616164  24714922
# 18         18  15460899  20861206  18161052
# 19         19  24498980  27190874  25844927
# 20         20  26436232  30038348  28237290
# 21         21  10864560  12915808  11890184
# 22         22  12954788  15054318  14004553
# 23         23  58605579  62412542  60509060
# 24         24  10316944  10544039  10430492


# Let's look at chr1:
i <- 1

# Centromere limits of chromosome i.
cen_i <- hg38_cen_limits[hg38_cen_limits$Chromosome == i, ]

# HEK windows ON CHROMOSOME i that overlap its centromere. The chromosome
# test is essential: positions are per-chromosome, so without it windows in
# the same position range on every other chromosome are counted too.
HEK_cen <- HEK[HEK$Chromosome == i & HEK$posE > cen_i$posS & HEK$posS < cen_i$posE, ]
dim(HEK_cen)

dim(HEK)
# [1] 300814      7

# Fraction of all HEK windows that overlap the centromere of chromosome i.
nrow(HEK_cen) / nrow(HEK)

# cnv windows ON CHROMOSOME i whose position lies inside its centromere.
cnv_cen <- cnv[cnv$Chromosome == i & cnv$pos > cen_i$posS & cnv$pos < cen_i$posE, ]
dim(cnv_cen)

dim(cnv)
# [1] 28175     4

# Fraction of all cnv windows that lie inside the centromere of chromosome i.
nrow(cnv_cen) / nrow(cnv)

# NOTE: the results recorded for an earlier version of this check
# (5083 of 300814 HEK rows = 0.01689748; 347 of 28175 cnv rows = 0.01231588)
# were obtained WITHOUT the chromosome test, i.e. they counted windows in the
# chr1 centromere position range on all chromosomes. Re-run to obtain the
# chr1-only counts.
























