## log2 copy number changes from wk0 to wk6 (at d0) and d0 to d75 (at wk6)


library(ggplot2)
library(cowplot) #used with plot_grid 

#----------------Aesthetics ---------------------------


# Theme shared by every panel: blank panel (no grid, no background),
# no legend, hairline axes and ticks.
theme2 <- theme(
	# panel and legend
	panel.background = element_blank(),
	panel.grid.major = element_blank(),
	panel.grid.minor = element_blank(),
	legend.position = "none",
	plot.margin = unit(c(t = 1.2, r = 0.4, b = 1.2, l = 0.4), "cm"),
	# axis lines and tick marks
	axis.line.x = element_line(colour = "black", size = 0.1),
	axis.line.y = element_line(colour = "black", size = 0.1),
	axis.ticks = element_line(colour = "black", size = 0.1),
	# axis text: tick labels, then axis titles
	axis.text = element_text(size = 12),
	axis.title = element_text(size = 14),
	# 13 pt of margin on the right of the y title pushes it away from the axis
	axis.title.y = element_text(margin = margin(t = 0, r = 13, b = 0, l = 0)),
	# 10 pt of margin above the x title pushes it away from the axis
	axis.title.x = element_text(margin = margin(t = 10, r = 0, b = 0, l = 0)),
	# A title ("A", "B", ...) could be supplied through ggtitle(); the panel
	# letters are added by plot_grid() instead, which can shift them further left.
	plot.title = element_text(size = 32, face = "bold", hjust = -0.14),
	# hjust = 0.5 centres the subtitle; larger values shift it right
	plot.subtitle = element_text(size = 14, face = "plain", hjust = 0.5)
)




# darkest two hues from 3-class PuBuGn in color brewer
# cb1<-rep(c("#1c9099", "#a6bddb"), 12)

# # darkest two hues from 3-class PuBu in color brewer
# cb1<-rep(c("#2b8cbe", "#a6bddb"), 12)


# #attractive pinks, greys
# cb1<-c("#999999", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7","#999999", "#E69F00", "#56B4E9", "#E69F00", "#009E73", "#F0E442", "#0072B2", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7","#999999", "#D55E00", "#CC79A7")

# cb1_rev <- c("#CC79A7", "#D55E00", "#0072B2", "#F0E442", "#009E73", "#56B4E9", "#E69F00","#999999", "#CC79A7", "#D55E00", "#0072B2", "#D55E00", "#F0E442", "#009E73", "#56B4E9", "#0072B2", "#F0E442", "#009E73", "#56B4E9", "#E69F00","#999999", "#CC79A7", "#E69F00","#999999")

# #'4-class RdBu'
# cb2 <- c('#ca0020','#f4a582','#92c5de','#0571b0','#ca0020','#f4a582','#92c5de','#0571b0','#ca0020','#f4a582','#92c5de','#f4a582','#0571b0','#ca0020','#f4a582','#92c5de','#0571b0','#ca0020','#f4a582','#92c5de','#0571b0','#ca0020','#92c5de','#0571b0')

# #'4-class RdYlBu'
# cb3 <- c('#d7191c','#fdae61','#abd9e9','#2c7bb6','#d7191c','#fdae61','#abd9e9','#2c7bb6','#d7191c','#fdae61','#abd9e9','#fdae61','#2c7bb6','#d7191c','#fdae61','#abd9e9','#2c7bb6','#d7191c','#fdae61','#abd9e9','#2c7bb6','#d7191c','#abd9e9','#2c7bb6')
	
	
# Point size handed to geom_point() in all four panels.
size_point <- 0.3
# Presumably the line width for a horizontal reference line; it is not used
# anywhere in the visible part of this script.
size_hline <- 0.1

# If desired, modify balloon code. Probably not a good idea in this context, though.
# balloon_scale <- 0.8 # inflation factor for significant points	
# # scale significant points beginning with 0.8 pt
# size_point <- 0.8*(1 + balloon_scale*(bleed$A23_T_HUM_ratio_norm/max(bleed$A23_T_HUM_ratio_norm, na.rm=TRUE)))



#----------------- Hamster log2 read changes ---------------------


RH_hamster <- read.table(
	"RH_hamster_gseq.txt",
	header = TRUE, sep = "\t", stringsAsFactors = FALSE
)

# Row masks, both evaluated on the table exactly as read:
#  - first window of each chromosome (posS == 0 and posE == 1e6)
#  - windows whose pos is exactly 1e4 beyond the previous row; this drops the
#    ramp ups and ramp downs (author's note: hamster has ramp downs, not ramp ups)
hamster_is_start <- RH_hamster$posS == 0 & RH_hamster$posE == 1e6
hamster_is_step <- c(0, diff(RH_hamster$pos)) == 1e4

# Chromosome-start rows first, then the ramp-free rows.
RH_hamster_start <- RH_hamster[hamster_is_start, ]
RH_hamster <- rbind(RH_hamster_start, RH_hamster[hamster_is_step, ])


# # get rid of contigs with only one entry:
# RH_hamster <- RH_hamster[!(RH_hamster$Contig_ID %in% aggregate(pos ~ Contig_ID, 
          # data = RH_hamster, 
          # FUN = function(x){NROW(x)})[aggregate(pos ~ Contig_ID, 
          # data = RH_hamster, 
          # FUN = function(x){NROW(x)})$pos==1,"Contig_ID"]),]

# Sort by chromosome (chr1 ... chr10, chrX) and then by position. The factor
# only serves as the sort key; the column goes back to character afterwards.
chrOrder <- paste0("chr", c(1:10, "X"))
hamster_chr_key <- factor(RH_hamster$Chromosome, levels = chrOrder)
hamster_row_order <- order(hamster_chr_key, RH_hamster$pos)
RH_hamster <- RH_hamster[hamster_row_order, ]
RH_hamster$Chromosome <- as.character(hamster_chr_key[hamster_row_order])


# Transform reads into mean log2 ratios for the RH pools only.
# Every sample column is multiplied by (number of rows / column total) before
# log2 is taken. The number of rows is the same for every column and so
# cancels in the differences; it is retained here for clarity.

# log2 of one depth-scaled sample column of RH_hamster.
#   sample: column name, e.g. "RH1_w6_d0"
# as.numeric() makes sum() accumulate in double precision, so a column that
# was read as integer cannot overflow to NA; results are otherwise unchanged.
hamster_log2_scaled <- function(sample) {
	reads <- as.numeric(RH_hamster[, sample])
	log2(reads * (length(reads) / sum(reads)))
}

# Row-wise mean, across pools, of log2(scaled after) - log2(scaled before).
#   pools:  pool prefixes, e.g. c("RH1", "RH2")
#   after:  sample suffix of the later / treated condition, e.g. "w6_d0"
#   before: sample suffix of the reference condition, e.g. "w0_d0"
# Returns one value per row of RH_hamster.
hamster_mean_delta <- function(pools, after, before) {
	per_pool <- lapply(pools, function(pool) {
		hamster_log2_scaled(paste0(pool, "_", after)) -
			hamster_log2_scaled(paste0(pool, "_", before))
	})
	rowMeans(do.call(cbind, per_pool))
}

# Week 6 vs week 0, both without drug, all six pools.
RH_hamster$delta_w6_w0 <- hamster_mean_delta(
	paste0("RH", 1:6), after = "w6_d0", before = "w0_d0"
)

# d75 vs d0, both at week 6. Pools RH5 and RH6 are left out because their
# d75 samples are missing.
RH_hamster$delta_d75_d0 <- hamster_mean_delta(
	paste0("RH", 1:4), after = "w6_d75", before = "w6_d0"
)



# Transform chr1 etc. to numbers; X becomes 11.
RH_hamster$Chromosome <- gsub('chr', '', RH_hamster$Chromosome)
RH_hamster[RH_hamster$Chromosome == "X","Chromosome"] <- 11
chrOrder <- c(1:11)
# as.numeric() on the factor returns the level index, which equals the
# chromosome number because the levels are 1..11 in order.
RH_hamster$Chromosome <- as.numeric(factor(RH_hamster$Chromosome, levels = chrOrder))
RH_hamster <- RH_hamster[order(RH_hamster$Chromosome, RH_hamster$pos), ]

# Chromosome size, taken as the largest pos seen on each chromosome.
gen_coord <- aggregate(pos ~ Chromosome, data = RH_hamster, FUN = max)
colnames(gen_coord)[2] <- "chr_size"
gen_coord <- gen_coord[order(gen_coord$Chromosome), ]

# Genome-wide offset of each chromosome = summed size of all chromosomes
# before it. head(..., -1) drops the last cumulative sum whatever the number
# of chromosomes present, instead of assuming there are exactly 11.
gen_coord$coord <- c(0, head(cumsum(gen_coord$chr_size), -1))

# Attach the offsets to every row, restore the sort order (merge() reorders
# rows), then turn the offset into a genome-wide coordinate.
RH_hamster <- merge(RH_hamster, gen_coord[, c("Chromosome", "coord")])
RH_hamster <- RH_hamster[order(RH_hamster$Chromosome, RH_hamster$pos), ]
RH_hamster$coord <- RH_hamster$pos + RH_hamster$coord

# Midpoint of each chromosome on the genome-wide axis, used as x-axis breaks.
mid <- function(x) {
	(max(x) + min(x)) / 2
}
chr_mid <- aggregate(coord ~ Chromosome, data = RH_hamster, FUN = mid)
colnames(chr_mid)[2] <- "mid"
chr_mid <- chr_mid[order(chr_mid$Chromosome), ]

# Define breaks as mid-points of chromosomes
breaks <- chr_mid$mid


# Alternating grey / skyblue, one colour per hamster chromosome (11 in total).
cb1 <- c(rep(c("grey", "skyblue"), 5), "grey")

# One x-axis label per chromosome; the labels for 9 and 10 are left blank.
labels <- as.character(c(1:8, "", "", "X"))

# Build one hamster panel.
#   mapping:  aes() giving x, y and colour
#   subtitle: text shown above the panel
#   y_breaks: tick positions on the y axis
#   y_limits: c(lower, upper) of the y axis
# cb1, breaks, labels, size_point and theme2 are read when the function is
# called, so the panel keeps the hamster values even though the human section
# later reassigns cb1, breaks and labels.
hamster_panel <- function(mapping, subtitle, y_breaks, y_limits) {
	ggplot(data = RH_hamster, mapping) +
		geom_point(size = size_point, stroke = 0) +
		scale_color_manual(values = cb1) +
		theme2 +
		scale_x_continuous(breaks = breaks, labels = labels) +
		xlab("Chromosome") +
		ylab(expression(Delta*log[2]*'('*copy*')')) +
		labs(subtitle = subtitle) +
		# `limits` spelled out in full rather than relying on partial matching
		scale_y_continuous(breaks = y_breaks, limits = y_limits)
}

p1 <- hamster_panel(
	aes(x = coord, y = delta_w6_w0, color = as.factor(Chromosome)),
	subtitle = "Growth at 6 weeks \n hamster reads",
	y_breaks = c(-3, 0, 3),
	y_limits = c(-3.5, 3.5)
)
print(p1)

p3 <- hamster_panel(
	aes(x = coord, y = delta_d75_d0, color = as.factor(Chromosome)),
	subtitle = "Paclitaxel at 6 weeks \n hamster reads",
	y_breaks = c(-6, 0, 6),
	y_limits = c(-6.5, 6.5)
)
print(p3)


#----------------- Human log2 read changes ---------------------

RH_human <- read.table(
	"RH_human_gseq.txt",
	header = TRUE, sep = "\t", stringsAsFactors = FALSE
)

# Row masks, both evaluated on the table exactly as read:
#  - first window of each chromosome (posS == 0 and posE == 1e6)
#  - windows whose pos is exactly 1e4 beyond the previous row; this drops the
#    ramp ups and ramp downs (author's note: human has ramp downs, not ramp ups)
human_is_start <- RH_human$posS == 0 & RH_human$posE == 1e6
human_is_step <- c(0, diff(RH_human$pos)) == 1e4

# Chromosome-start rows first, then the ramp-free rows.
RH_human_start <- RH_human[human_is_start, ]
RH_human <- rbind(RH_human_start, RH_human[human_is_step, ])


# Sort by chromosome (chr1 ... chr22, chrX, chrY) and then by position. The
# factor only serves as the sort key; the column goes back to character.
chrOrder <- paste0("chr", c(1:22, "X", "Y"))
human_chr_key <- factor(RH_human$Chromosome, levels = chrOrder)
human_row_order <- order(human_chr_key, RH_human$pos)
RH_human <- RH_human[human_row_order, ]
RH_human$Chromosome <- as.character(human_chr_key[human_row_order])


# Mean log2 ratios of human reads across the RH pools.
# Each human sample column is multiplied by (number of rows / column total)
# of the RH_hamster column with the same name, i.e. the scaling factor is
# taken from the hamster table, not from RH_human.
# NOTE(review): presumably deliberate, so that human reads are expressed
# relative to hamster depth (later comments in this file describe hamster as
# "normalized to itself") — confirm.

# log2 of one human sample column, scaled by the matching RH_hamster column.
#   sample: column name present in both tables, e.g. "RH1_w6_d0"
# as.numeric() makes sum() accumulate in double precision, so a column that
# was read as integer cannot overflow to NA; results are otherwise unchanged.
human_log2_scaled <- function(sample) {
	depth_ref <- as.numeric(RH_hamster[, sample])
	log2(RH_human[, sample] * (length(depth_ref) / sum(depth_ref)))
}

# Row-wise mean, across pools, of log2(scaled after) - log2(scaled before).
#   pools:  pool prefixes, e.g. c("RH1", "RH2")
#   after:  sample suffix of the later / treated condition, e.g. "w6_d0"
#   before: sample suffix of the reference condition, e.g. "w0_d0"
# Returns one value per row of RH_human. Rows with zero reads give -Inf from
# log2() and can therefore end up as -Inf, Inf or NaN.
human_mean_delta <- function(pools, after, before) {
	per_pool <- lapply(pools, function(pool) {
		human_log2_scaled(paste0(pool, "_", after)) -
			human_log2_scaled(paste0(pool, "_", before))
	})
	rowMeans(do.call(cbind, per_pool))
}

# Week 6 vs week 0, both without drug, all six pools.
RH_human$delta_w6_w0 <- human_mean_delta(
	paste0("RH", 1:6), after = "w6_d0", before = "w0_d0"
)

# d75 vs d0, both at week 6. Pools RH5 and RH6 are left out because their
# d75 samples are missing.
RH_human$delta_d75_d0 <- human_mean_delta(
	paste0("RH", 1:4), after = "w6_d75", before = "w6_d0"
)


# Transform chr1 etc. to numbers; X becomes 23 and Y becomes 24.
RH_human$Chromosome <- gsub('chr', '', RH_human$Chromosome)
RH_human[RH_human$Chromosome == "X","Chromosome"] <- 23
RH_human[RH_human$Chromosome == "Y","Chromosome"] <- 24
chrOrder <- c(1:24)
# as.numeric() on the factor returns the level index, which equals the
# chromosome number because the levels are 1..24 in order.
RH_human$Chromosome <- as.numeric(factor(RH_human$Chromosome, levels = chrOrder))
RH_human <- RH_human[order(RH_human$Chromosome, RH_human$pos), ]

# Chromosome size, taken as the largest pos seen on each chromosome.
gen_coord <- aggregate(pos ~ Chromosome, data = RH_human, FUN = max)
colnames(gen_coord)[2] <- "chr_size"
gen_coord <- gen_coord[order(gen_coord$Chromosome), ]

# Genome-wide offset of each chromosome = summed size of all chromosomes
# before it. head(..., -1) drops the last cumulative sum whatever the number
# of chromosomes present, instead of assuming there are exactly 24.
gen_coord$coord <- c(0, head(cumsum(gen_coord$chr_size), -1))

# Attach the offsets to every row, restore the sort order (merge() reorders
# rows), then turn the offset into a genome-wide coordinate.
RH_human <- merge(RH_human, gen_coord[, c("Chromosome", "coord")])
RH_human <- RH_human[order(RH_human$Chromosome, RH_human$pos), ]
RH_human$coord <- RH_human$pos + RH_human$coord

# get rid of chrY (coded 24). It is the last chromosome, so removing it here
# leaves the offsets of the others untouched.
RH_human <- RH_human[RH_human$Chromosome != 24,]

# Midpoint of each chromosome on the genome-wide axis, used as x-axis breaks.
mid <- function(x) {
	(max(x) + min(x)) / 2
}
chr_mid <- aggregate(coord ~ Chromosome, data = RH_human, FUN = mid)
colnames(chr_mid)[2] <- "mid"
chr_mid <- chr_mid[order(chr_mid$Chromosome), ]

# Define breaks as mid-points of chromosomes
breaks <- chr_mid$mid


# Alternating grey / skyblue, enough colours for 24 chromosome codes.
cb1 <- rep(c("grey", "skyblue"), 12)

# One x-axis label per plotted chromosome (1-22 and X); crowded ones are blank.
labels <- as.character(c(1:9,"",11,"",13,"","",16,"","","",20,"","","X"))

# Build one human panel.
#   data:     rows of RH_human to plot
#   mapping:  aes() giving x, y and colour
#   subtitle: text shown above the panel
#   y_breaks: tick positions on the y axis
#   y_limits: c(lower, upper) of the y axis
# cb1, breaks, labels, size_point and theme2 are read when the function is called.
human_panel <- function(data, mapping, subtitle, y_breaks, y_limits) {
	ggplot(data = data, mapping) +
		geom_point(size = size_point, stroke = 0) +
		scale_color_manual(values = cb1) +
		theme2 +
		scale_x_continuous(breaks = breaks, labels = labels) +
		xlab("Chromosome") +
		ylab(expression(Delta*log[2]*'('*copy*')')) +
		labs(subtitle = subtitle) +
		# `limits` spelled out in full rather than relying on partial matching
		scale_y_continuous(breaks = y_breaks, limits = y_limits)
}

# use !is.infinite to get rid of lurker points at bottom of graph, esp p4
p2 <- human_panel(
	RH_human[!is.infinite(RH_human$delta_w6_w0), ],
	aes(x = coord, y = delta_w6_w0, color = as.factor(Chromosome)),
	subtitle = "Growth at 6 weeks \n human reads",
	y_breaks = c(-3, 0, 3),
	y_limits = c(-3.5, 3.5)
)
print(p2)

p4 <- human_panel(
	RH_human[!is.infinite(RH_human$delta_d75_d0), ],
	aes(x = coord, y = delta_d75_d0, color = as.factor(Chromosome)),
	subtitle = "Paclitaxel at 6 weeks \n human reads",
	y_breaks = c(-6, 0, 6),
	y_limits = c(-6.5, 6.5)
)
print(p4)


#------------------Make files --------------------------


# Render the 2 x 2 figure (panels A-D) on the graphics device that is
# currently open. plot_grid() only builds the figure object; the explicit
# print() is what draws it, so the files are also filled when this script is
# run through source(), where top-level values are not printed automatically.
draw_copy_figure <- function() {
	print(
		plot_grid(
			p1, p2, p3, p4,
			labels = c("A", "B", "C", "D"),
			ncol = 2, nrow = 2, label_size = 16
		)
	)
}

# Vector PDF
pdf("Copy_w6_2.pdf",width=7.5,height=6.67, useDingbats=FALSE)
draw_copy_figure()
dev.off()

# TIFF, 300 dpi
tiff("Copy_w6_2.tif",width=7.5,height=6.67,units="in",res=300)
draw_copy_figure()
dev.off()

# PNG, 300 dpi
png("Copy_w6_2.png",width=7.5,height=6.67,units="in",res=300)
draw_copy_figure()
dev.off()

# PNG, 1200 dpi
png("Copy_w6_hi_res_2.png",width=7.5,height=6.67,units="in",res=1200)
draw_copy_figure()
dev.off()



#-------------------------------------------------------


############################################################################
# ------- Compare hamster and human baselines, growth at week 6 ------------
############################################################################

# Standard error of the mean. Missing values are excluded from both the
# variance and the sample count.
sem <- function(x) {
	sqrt(var(x, na.rm = TRUE) / sum(!is.na(x)))
}

# Uncertainty of a sample standard deviation ("sd of sd"). Two formulae:
#   sd * sqrt(e * (1 - 1/n)^(n - 1) - 1)
#     from https://stats.stackexchange.com/questions/631/standard-deviation-of-standard-deviation
#   sd / sqrt(2 * n)
#     from chapter 2, "Distribution Functions",  S.V. Gupta, Measurement Uncertainties, DOI 10.1007/978-3-642-20989-5 2, © Springer-Verlag Berlin Heidelberg 2012
# In both, n is the number of non-missing values, i.e. exactly the values
# that sd(x, na.rm = TRUE) is computed from. (length(x) would also count
# missing values and so overstate n.)
sd_of_sd_exp <- function(x) {
	n <- sum(!is.na(x))
	sd(x, na.rm = TRUE) * sqrt(exp(1) * (1 - (1 / n))^(n - 1) - 1)
}
sd_of_sd_gupta <- function(x) {
	n <- sum(!is.na(x))
	sd(x, na.rm = TRUE) / sqrt(2 * n)
}

# "sem of sd" as used in this analysis: the sd of sd divided by n.
# NOTE(review): this divides by n rather than sqrt(n) — confirm that is intended.
sem_of_sd_exp <- function(x) {
	sd_of_sd_exp(x) / sum(!is.na(x))
}
sem_of_sd_gupta <- function(x) {
	sd_of_sd_gupta(x) / sum(!is.na(x))
}

# All statistics below use every 100th row of each table.
hamster_growth <- RH_hamster[seq(1, nrow(RH_hamster), 100), ]$delta_w6_w0
human_growth <- RH_human[seq(1, nrow(RH_human), 100), ]$delta_w6_w0


# ------------------ hamster, growth -----------------

mean(hamster_growth)
# [1] -0.004184464 <<<<<<<<<<<<<<<< do not use in paper because hamster normalized to self

sd(hamster_growth)
# [1] 0.07499401 <<<<<<<<<<<<<<<< use in paper when quoting Levene's test

sem(hamster_growth)
# [1] 0.001659176 <<<<<<<<<<<<<<<< do not use in paper because hamster normalized to itself

# sd of sd of hamster. Both formulae gave nearly identical answers:

sd_of_sd_exp(hamster_growth)
# [1] 0.001173382

sd_of_sd_gupta(hamster_growth)
# [1] 0.001173214

# sem of sd of hamster. Both formulae gave nearly identical answers:

sem_of_sd_exp(hamster_growth)
# [1] 5.743426e-07 <<<<<<<<<<<<<<<< use in paper

sem_of_sd_gupta(hamster_growth)
# [1] 5.742606e-07


# Are hamster changes significantly different from zero? Barely:

t.test(hamster_growth, mu = 0)

	# One Sample t-test

# data:  RH_hamster[seq(1, nrow(RH_hamster), 100), ]$delta_w6_w0
# t = -2.522, df = 2042, p-value = 0.01174
# alternative hypothesis: true mean is not equal to 0
# 95 percent confidence interval:
 # -0.0074383177 -0.0009306106
# sample estimates:
   # mean of x 
# -0.004184464 


# ------------------ mean change in human DNA due to growth -----------------

mean(human_growth, na.rm = TRUE)
# [1] -0.7807245 <<<<<<<<<<<<<<<< use in paper

sd(human_growth, na.rm = TRUE)
# [1] 0.7782373 <<<<<<<<<<<<<<<< use in paper

sem(human_growth)
# [1] 0.0144217 <<<<<<<<<<<<<<<< use in paper

# sd of sd of human.
# The four values recorded below were produced when n was length(x), which
# also counted missing values; n is now the non-missing count, so re-run and
# refresh these numbers before quoting them.

sd_of_sd_exp(human_growth)
# [1] 0.01003294   (recorded with the old n)

sd_of_sd_gupta(human_growth)
# [1] 0.01003196   (recorded with the old n)

# sem of sd of human:

sem_of_sd_exp(human_growth)
# [1] 3.334309e-06 <<<<<<<<<<<<<<<< use in paper   (recorded with the old n; refresh)

sem_of_sd_gupta(human_growth)
# [1] 3.333986e-06   (recorded with the old n)


# Human changes are highly significantly different from zero:
t.test(human_growth, mu = 0)

	# One Sample t-test       <<<<<<<<<<<<< use in paper

# data:  RH_human[seq(1, nrow(RH_human), 100), ]$delta_w6_w0
# t = -54.135, df = 2911, p-value < 2.2e-16
# alternative hypothesis: true mean is not equal to 0
# 95 percent confidence interval:
 # -0.8090023 -0.7524468
# sample estimates:
 # mean of x 
# -0.7807245 

# t.test() drops missing values by itself and has no na.rm argument, so none
# is passed.
t.test(human_growth, mu = 0)$p.value
# [1] 0 <<<<<<<<<<<<< use in paper


# Compare human and hamster baseline:

t.test(hamster_growth, human_growth)

	# Welch Two Sample t-test

# data:  RH_hamster[seq(1, nrow(RH_hamster), 100), ]$delta_w6_w0 and RH_human[seq(1, nrow(RH_human), 100), ]$delta_w6_w0
# t = 53.492, df = 2987.8, p-value < 2.2e-16
# alternative hypothesis: true difference in means is not equal to 0
# 95 percent confidence interval:
 # 0.7480761 0.8050040
# sample estimates:
   # mean of x    mean of y 
# -0.004184464 -0.780724527.  <<<<<<<<<<<<<<<<<< do not use this in paper because hamster DNA is normalized to itself

t.test(hamster_growth, human_growth)$p.value
# [1] 0

# ----------- Levene's test hamster vs human ----------------

library(car)

# One row per subsampled value. `group` is built as a factor so that
# leveneTest() does not have to coerce it.
Levene_data <- data.frame(
	copy = c(hamster_growth, human_growth),
	group = factor(c(
		rep("hamster", length(hamster_growth)),
		rep("human", length(human_growth))
	))
)

# using default, center=median. Actually becomes Brown-Forsythe test (more conservative and robust than Levene test), cf https://biostats.w.uib.no/test-for-homogeneity-of-variances-levenes-test/

leveneTest(copy~group,data=Levene_data)
# Levene's Test for Homogeneity of Variance (center = median)
        # Df F value    Pr(>F)    
# group    1  3078.9 < 2.2e-16 *** <<<<<<<<<<< use in paper
      # 4953                      
# ---
# Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

# Exact column name instead of the partially matched `$Pr`.
leveneTest(copy~group,data=Levene_data)[["Pr(>F)"]]
# [1]  0 NA <<<<<<<<<<< use in paper


# using center=mean, this is now actually the Levene test:

leveneTest(copy~group,data=Levene_data,center=mean)
# Levene's Test for Homogeneity of Variance (center = mean)
        # Df F value    Pr(>F)    
# group    1  3208.5 < 2.2e-16 ***
      # 4953                      
# ---
# Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

leveneTest(copy~group,data=Levene_data,center=mean)[["Pr(>F)"]]
# [1] 0 NA


# fligner test, best for non-normal data, or data with outliers cf https://biostats.w.uib.no/test-for-homogeneity-of-variances-levenes-test/

fligner.test(copy~group,data=Levene_data)

	# Fligner-Killeen test of homogeneity of variances

# data:  copy by group
# Fligner-Killeen:med chi-squared = 2264, df = 1, p-value < 2.2e-16

fligner.test(copy~group,data=Levene_data)$p.value
# [1] 0 ~~~~~~~~~~ in theory should use this in paper, since both distributions are significantly non-normal:

shapiro.test(hamster_growth)

	# Shapiro-Wilk normality test

# data:  RH_hamster[seq(1, nrow(RH_hamster), 100), ]$delta_w6_w0
# W = 0.99781, p-value = 0.006626


shapiro.test(human_growth)

	# Shapiro-Wilk normality test

# data:  RH_human[seq(1, nrow(RH_human), 100), ]$delta_w6_w0
# W = 0.98807, p-value = 6.294e-15

# But in fact, both distros look quite normal by eye using 

hist(hamster_growth)

# and 

hist(human_growth)

# so use Brown-Forsythe test, as above.




############################################################################
# -- Compare hamster and human baselines, drug d75 (75 nM) vs d0, both at week 6 ---
############################################################################

# Standard error of the mean. Missing values are excluded from both the
# variance and the sample count.
sem <- function(x) {
	sqrt(var(x, na.rm = TRUE) / sum(!is.na(x)))
}

# Uncertainty of a sample standard deviation ("sd of sd"). Two formulae:
#   sd * sqrt(e * (1 - 1/n)^(n - 1) - 1)
#     from https://stats.stackexchange.com/questions/631/standard-deviation-of-standard-deviation
#   sd / sqrt(2 * n)
#     from chapter 2, "Distribution Functions",  S.V. Gupta, Measurement Uncertainties, DOI 10.1007/978-3-642-20989-5 2, © Springer-Verlag Berlin Heidelberg 2012
# In both, n is the number of non-missing values, i.e. exactly the values
# that sd(x, na.rm = TRUE) is computed from. (length(x) would also count
# missing values and so overstate n.)
sd_of_sd_exp <- function(x) {
	n <- sum(!is.na(x))
	sd(x, na.rm = TRUE) * sqrt(exp(1) * (1 - (1 / n))^(n - 1) - 1)
}
sd_of_sd_gupta <- function(x) {
	n <- sum(!is.na(x))
	sd(x, na.rm = TRUE) / sqrt(2 * n)
}

# "sem of sd" as used in this analysis: the sd of sd divided by n.
# NOTE(review): this divides by n rather than sqrt(n) — confirm that is intended.
sem_of_sd_exp <- function(x) {
	sd_of_sd_exp(x) / sum(!is.na(x))
}
sem_of_sd_gupta <- function(x) {
	sd_of_sd_gupta(x) / sum(!is.na(x))
}

# All statistics below use every 100th row of each table.
hamster_drug <- RH_hamster[seq(1, nrow(RH_hamster), 100), ]$delta_d75_d0

# Human drug values with infinite entries removed. is.infinite() is FALSE for
# NA/NaN, so missing values are still present in this vector.
human_drug_all <- RH_human[seq(1, nrow(RH_human), 100), ]$delta_d75_d0
human_drug <- human_drug_all[!is.infinite(human_drug_all)]

# Human growth values, needed for the growth-vs-drug comparison further down.
human_growth_w6 <- RH_human[seq(1, nrow(RH_human), 100), ]$delta_w6_w0


# ------------------ hamster, drug -----------------

mean(hamster_drug)
# [1] -0.002847477 <<<<<<<<<<<<<<<< do not use in paper because hamster DNA is normalized to itself

sd(hamster_drug)
# [1] 0.1070661 <<<<<<<<<<<<<<<< use in paper

sem(hamster_drug)
# [1] 0.002368743 <<<<<<<<<<<<<<<< do not use in paper because hamster normalized to itself

# sd of sd of hamster. Both formulae gave nearly identical answers:

sd_of_sd_exp(hamster_drug)
# [1] 0.001675193

sd_of_sd_gupta(hamster_drug)
# [1] 0.001674954

# sem of sd of hamster. Both formulae gave nearly identical answers:

sem_of_sd_exp(hamster_drug)
# [1] 8.199674e-07 <<<<<<<<<<<<<<<< use in paper

sem_of_sd_gupta(hamster_drug)
# [1] 8.198504e-07


# Are hamster changes significantly different from zero? no:

t.test(hamster_drug, mu = 0)

	# One Sample t-test

# data:  RH_hamster[seq(1, nrow(RH_hamster), 100), ]$delta_d75_d0
# t = -1.2021, df = 2042, p-value = 0.2295
# alternative hypothesis: true mean is not equal to 0
# 95 percent confidence interval:
 # -0.007492881  0.001797928
# sample estimates:
   # mean of x 
# -0.002847477


# --------------------- changes in human DNA due to paclitaxel --------------------

mean(human_drug, na.rm = TRUE)
# [1] -1.458834 <<<<<<<<<<<<<<<< use in paper

sd(human_drug, na.rm = TRUE)
# [1] 1.206873 <<<<<<<<<<<<<<<< use in paper

sem(human_drug)
# [1] 0.02288142 <<<<<<<<<<<<<<<< use in paper

# sd of sd of human.
# The four values recorded below were produced when n was length(x), which
# also counted missing values; n is now the non-missing count, so re-run and
# refresh these numbers before quoting them.

sd_of_sd_exp(human_drug)
# [1] 0.01590632   (recorded with the old n)

sd_of_sd_gupta(human_drug)
# [1] 0.01590471   (recorded with the old n)

# sem of sd of human:

sem_of_sd_exp(human_drug)
# [1] 5.524946e-06 <<<<<<<<<<<<<<<< use in paper   (recorded with the old n; refresh)

sem_of_sd_gupta(human_drug)
# [1] 5.524386e-06   (recorded with the old n)


# Are human changes significantly different from zero? yes:

t.test(human_drug, mu = 0)

	# One Sample t-test     <<<<<<<<<<<<< use in paper

# data:  RH_human[seq(1, nrow(RH_human), 100), ][!is.infinite(RH_human[seq(1,     nrow(RH_human), 100), ]$delta_d75_d0), "delta_d75_d0"]
# t = -63.756, df = 2781, p-value < 2.2e-16
# alternative hypothesis: true mean is not equal to 0
# 95 percent confidence interval:
 # -1.503700 -1.413968
# sample estimates:
# mean of x 
# -1.458834 

t.test(human_drug, mu = 0)$p.value
# [1] 0 <<<<<<<<<<<<< use in paper

# Is baseline for human downshifted compared to hamster? Yes.

t.test(hamster_drug, human_drug)

	# Welch Two Sample t-test

# data:  RH_hamster[seq(1, nrow(RH_hamster), 100), ]$delta_d75_d0 and RH_human[seq(1, nrow(RH_human), 100), ][!is.infinite(RH_human[seq(1, RH_hamster[seq(1, nrow(RH_hamster), 100), ]$delta_d75_d0 and     nrow(RH_human), 100), ]$delta_d75_d0), "delta_d75_d0"]
# t = 63.294, df = 2840.5, p-value < 2.2e-16
# alternative hypothesis: true difference in means is not equal to 0
# 95 percent confidence interval:
 # 1.410881 1.501092
# sample estimates:
   # mean of x    mean of y 
# -0.002847477 -1.458834113  <<<<<<<<<<<<<<<< do not use in paper because hamster normalized to itself

t.test(hamster_drug, human_drug)$p.value
# [1] 0

# Is the human drug baseline shifted further down than the human growth
# baseline? Yes (recorded means: growth -0.78, drug -1.46).

t.test(human_growth_w6, human_drug)

	# Welch Two Sample t-test

# data:  RH_human[seq(1, nrow(RH_human), 100), ]$delta_w6_w0 and RH_human[seq(1, nrow(RH_human), 100), ][!is.infinite(RH_human[seq(1, RH_human[seq(1, nrow(RH_human), 100), ]$delta_w6_w0 and     nrow(RH_human), 100), ]$delta_d75_d0), "delta_d75_d0"]
# t = 25.071, df = 4718.1, p-value < 2.2e-16
# alternative hypothesis: true difference in means is not equal to 0
# 95 percent confidence interval:
 # 0.6250847 0.7311345
# sample estimates:
 # mean of x  mean of y 
# -0.7807245 -1.4588341 

t.test(human_growth_w6, human_drug)$p.value
# [1] 2.472027e-130



# Levene's test hamster vs human

library(car)

# One row per subsampled value. `group` is built as a factor so that
# leveneTest() does not have to coerce it.
Levene_data <- data.frame(
	copy = c(hamster_drug, human_drug),
	group = factor(c(
		rep("hamster", length(hamster_drug)),
		rep("human", length(human_drug))
	))
)

# Default center = median, i.e. the Brown-Forsythe variant.
leveneTest(copy~group,data=Levene_data)

# Levene's Test for Homogeneity of Variance (center = median)
        # Df F value    Pr(>F)    
# group    1  2758.3 < 2.2e-16 *** <<<<<<<<<<< use in paper
      # 4823                      
# ---
# Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

# Exact column name instead of the partially matched `$Pr`.
leveneTest(copy~group,data=Levene_data)[["Pr(>F)"]]
# [1]  0 NA <<<<<<<<<<< use in paper


# using center=mean, this is now actually the Levene test:

leveneTest(copy~group,data=Levene_data,center=mean)

# Levene's Test for Homogeneity of Variance (center = mean)
        # Df F value    Pr(>F)    
# group    1  2761.4 < 2.2e-16 ***
      # 4823                      
# ---
# Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

leveneTest(copy~group,data=Levene_data,center=mean)[["Pr(>F)"]]
# [1] 0 NA



# fligner test, best for non-normal data, or data with outliers cf https://biostats.w.uib.no/test-for-homogeneity-of-variances-levenes-test/

fligner.test(copy~group,data=Levene_data)

	# Fligner-Killeen test of homogeneity of variances

# data:  copy by group
# Fligner-Killeen:med chi-squared = 2243.2, df = 1, p-value < 2.2e-16

fligner.test(copy~group,data=Levene_data)$p.value
# [1] 0 ~~~~~~~~~~ in theory should use this in paper, since both distributions are significantly non-normal:

shapiro.test(hamster_drug)

	# Shapiro-Wilk normality test

# data:  RH_hamster[seq(1, nrow(RH_hamster), 100), ]$delta_d75_d0
# W = 0.99247, p-value = 9.313e-09

shapiro.test(hamster_drug)$p.value
# [1] 9.313111e-09


shapiro.test(human_drug)

	# Shapiro-Wilk normality test

# data:  RH_human[seq(1, nrow(RH_human), 100), ][!is.infinite(RH_human[seq(1,     nrow(RH_human), 100), ]$delta_d75_d0), "delta_d75_d0"]
# W = 0.99879, p-value = 0.04402

# But in fact, both distros look quite normal by eye using 

hist(hamster_drug)

# and 

hist(human_drug)

# so use Brown-Forsythe test, as above.





























































