library(Rmisc)
library(ggplot2)
library(cowplot) #used with plot_grid 
library(scales)

# --------------- function ---------------------------


# Extract the legend grob from a ggplot so it can be drawn on its own
# (e.g. placed in a plot_grid layout).
#
# NOTE: cowplot is attached above and also provides a function called
# get_legend(); this definition takes precedence in this script.
#
# Args:
#   myggplot: a ggplot object whose legend is actually drawn
#             (i.e. legend.position is not "none").
# Returns:
#   The first non-empty grob of the built plot whose name starts with
#   "guide-box".
# Raises:
#   An error when the plot contains no such grob (legend hidden or absent).
get_legend <- function(myggplot) {
  tmp <- ggplot_gtable(ggplot_build(myggplot))

  # vapply keeps the result a character vector even when a grob has no name
  # (as.character(NULL)[1] is NA), unlike sapply which may return a list.
  grob_names <- vapply(
    tmp$grobs,
    function(x) as.character(x$name)[1],
    character(1)
  )

  # Prefix match covers the plain "guide-box" name and any suffixed variant;
  # which() drops NA names.
  leg <- which(startsWith(grob_names, "guide-box"))

  # Ignore empty placeholder grobs so only a drawn legend is returned.
  is_placeholder <- vapply(
    tmp$grobs[leg], inherits, logical(1), what = "zeroGrob"
  )
  leg <- leg[!is_placeholder]

  if (length(leg) == 0) {
    stop(
      "get_legend(): no legend found in plot ",
      "(is legend.position set to \"none\"?)",
      call. = FALSE
    )
  }

  # If several legend boxes exist, return the first one.
  tmp$grobs[[leg[1]]]
}




#----------------Aesthetics ---------------------------

# base style axes: https://stackoverflow.com/questions/25327694/how-to-tweak-the-extent-to-which-an-axis-is-drawn-in-ggplot2

# Shared base theme for every panel: no grid lines, no panel background,
# no legend, thin black axis lines/ticks and small fonts.
# margin() arguments are given in the order top, right, bottom, left.
theme2 <- local({
  # one line style reused for both axis lines and the tick marks
  thin_black <- element_line(colour = "black", size = 0.1)

  theme(
    plot.margin = unit(c(t = 0.0, r = 0.0, b = 0.0, l = 0.0), "cm"),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.background = element_blank(),
    legend.position = "none",
    axis.line.x = thin_black,
    axis.line.y = thin_black,
    axis.ticks = thin_black,
    # numbers on the tick marks of the x and y axes
    axis.text = element_text(size = 6),
    # titles of the x and y axes
    axis.title = element_text(size = 7),
    # 8 units of space between the y axis title and the axis
    axis.title.y = element_text(margin = margin(t = 0, r = 8, b = 0, l = 0)),
    # 13 units of space between the x axis and its title
    axis.title.x = element_text(margin = margin(t = 13, r = 0, b = 0, l = 0)),
    # centred subtitle with 13 units of space below it
    plot.subtitle = element_text(
      size = 9,
      face = "plain",
      hjust = 0.5,
      margin = margin(t = 0, r = 0, b = 13, l = 0)
    )
  )
})

# Return n colours with evenly spaced hues on the HCL colour wheel
# (hue starting at 15 degrees, luminance 65, chroma 100).
# NOTE(review): presumably intended to mimic ggplot2's default discrete
# palette - confirm.
#
# Args:
#   n: number of colours wanted (non-negative integer).
# Returns:
#   Character vector of n hex colour strings; character(0) when n is 0.
gg_color_hue <- function(n) {
  # n + 1 points because 15 and 375 degrees are the same hue; the last is dropped
  hues <- seq(15, 375, length.out = n + 1)
  # seq_len() rather than 1:n so that n = 0 yields an empty result
  hcl(h = hues, l = 65, c = 100)[seq_len(n)]
}

# Line width of horizontal reference lines; passed as `size` to geom_hline()
# and to its legend override further down in this script.
size_hline <- 0.2


# balloon_scale <- 5 # inflation factor for significant points	
# size_point <- 0.1*(1 + balloon_scale*(logP$log10p_g_avg/max(logP$log10p_g_avg))) # scale significant points


#-----------moffat_BF_crispr (1)----------------------

# petal.txt is from Moffat supp data file mmc3-1.xlsx

# Read the Bayes-factor table, keeping the gene symbol plus the five cell-line
# columns (omit 1st gen crispr BF_a375_GeCKo).
petal <- read.table(
  "petal.txt",
  header = TRUE,
  sep = "\t",
  stringsAsFactors = FALSE
)[, c("Gene", "BF_hct116", "BF_hela", "BF_gbm", "BF_rpe1", "BF_dld1")]

# Gene list used to flag rows of `petal`
g_unique <- read.delim(
  "growth_loci_unique.txt",
  sep = "\t",
  stringsAsFactors = FALSE,
  header = TRUE,
  check.names = FALSE
)

# RH = 1 when the gene symbol occurs in g_unique$geneSymbol, otherwise 0
petal$RH <- as.numeric(petal$Gene %in% g_unique$geneSymbol)


# Wide-format BF column -> display name of the cell line
cell_names_1 <- c(
  BF_hct116 = "HCT116",
  BF_hela = "HeLa",
  BF_gbm = "GBM",
  BF_rpe1 = "RPE-1",
  BF_dld1 = "DLD-1"
)

# Cast to long form: one row per gene x cell line, BF value in column "BF",
# originating column name in column "cell".
# Row names are sized to the exact number of long-form rows instead of relying
# on a hard-coded upper bound.
petal_l <- reshape(
  petal,
  varying = names(cell_names_1),
  v.names = "BF",
  timevar = "cell",
  times = names(cell_names_1),
  new.row.names = seq_len(nrow(petal) * length(cell_names_1)),
  direction = "long"
)

# Replace column names by display names in one lookup
petal_l$cell <- unname(cell_names_1[petal_l$cell])

# Order of the cell lines along the x axis
level_order_1 <- c("DLD-1", "GBM", "RPE-1", "HeLa", "HCT116")

petal_l$cell <- factor(petal_l$cell, levels = level_order_1)
# RH+ ("1") is listed before RH- ("0")
petal_l$RH <- factor(petal_l$RH, levels = c("1", "0"))

labels_1 <- level_order_1

# One colour per RH group, named by its legend label
colores_1 <- gg_color_hue(length(unique(petal_l$RH)))
names(colores_1) <- c("RH+", "RH-")

# Two-sample t.test() (default settings) of BF between RH+ (RH == 1) and
# RH- (RH == 0) genes, one test per cell line, in x-axis order
# (DLD-1, GBM, RPE-1, HeLa, HCT116).
p_vals_1 <- vapply(
  c("BF_dld1", "BF_gbm", "BF_rpe1", "BF_hela", "BF_hct116"),
  function(bf_col) {
    rh_pos <- petal[petal$RH == 1, bf_col]
    rh_neg <- petal[petal$RH == 0, bf_col]
    t.test(rh_pos, rh_neg)$p.value
  },
  numeric(1),
  USE.NAMES = FALSE
)

# Labels for annotate(..., parse = TRUE): each P value is formatted in
# exponent notation with one significant digit and deparsed into a quoted
# string so that parsing yields the text unchanged.
# (An earlier variant prefixed each label with italic(P) == .)
fun_label_1 <- vapply(
  p_vals_1,
  function(p) deparse(formatC(p, format = "e", digits = 0)),
  character(1)
)

# Label x positions: slightly left of each cell line's centre
horiz_pos_1 <- seq_len(5) - 0.3

# Label y position, shared by all cell lines.
# (An earlier variant used, per cell line, the maximum BF of the RH- genes + 25.)
vert_pos_1 <- 60
				



# violin plot
# Panel 1: BF per cell line, split by RH status, drawn as dodged violins with
# the raw points and a narrow box plot on top; P-value labels sit on white
# rectangles near the top of the panel.
# No scale_fill_manual()/guides() are applied here (ggplot2's default fill
# scale is used) and the legend is hidden via legend.position = "none".
p1 <- ggplot(data = petal_l, aes(x = cell, y = BF, fill = RH)) +
  geom_violin(
    width = 0.5,
    position = position_dodge(width = 0.6),
    alpha = 0.1,
    lwd = 0.1
  ) +
  geom_point(
    data = petal_l,
    mapping = aes(x = cell, y = BF),
    shape = 16,
    colour = "grey",
    alpha = 1,
    size = 0.1,
    position = position_jitterdodge(dodge.width = 0.6, jitter.width = 0.0)
  ) +
  geom_boxplot(
    width = 0.1,
    position = position_dodge(0.6),
    alpha = 0.1,
    outlier.shape = NA,
    lwd = 0.2,
    fatten = 2,
    show.legend = FALSE
  ) +
  # white background boxes behind the P-value labels
  annotate(
    "rect",
    xmin = horiz_pos_1 - 0.2,
    xmax = horiz_pos_1 + 0.6,
    ymin = vert_pos_1 - 30,
    ymax = vert_pos_1 + 20,
    fill = "white"
  ) +
  # the P-value labels themselves
  annotate(
    geom = "text",
    x = horiz_pos_1,
    y = vert_pos_1,
    hjust = 0,
    label = fun_label_1,
    parse = TRUE,
    size = 2
  ) +
  theme2 +
  theme(
    plot.margin = unit(c(top = 0.5, right = 0.5, bottom = 0.5, left = 0.5), "cm"),
    plot.subtitle = element_text(
      size = 9, face = "plain", hjust = 0.5, margin = margin(0, 0, 13, 0)
    ),
    axis.title.x = element_blank(),
    legend.position = "none",
    legend.margin = margin(t = 0, r = 0, b = 0, l = 0, unit = "pt"),
    legend.box.margin = margin(t = 0, r = 0, b = 0, l = 0, unit = "pt"),
    legend.key.height = unit(0.3, "cm"),
    legend.key.width = unit(0.3, "cm"),
    legend.spacing.y = unit(0.3, "cm"),
    legend.spacing.x = unit(0.3, "cm"),
    legend.title = element_text(size = 7),
    legend.text = element_text(size = 7),
    legend.title.align = 0.7
  ) +
  scale_x_discrete(
    labels = labels_1,
    expand = expand_scale(add = .6)
  ) +
  scale_y_continuous(
    breaks = c(-150, -50, 50),
    labels = c(-150, -50, 50),
    expand = expand_scale(mult = .05)
  ) +
  # zoom only; data outside the limits is kept for the statistics layers
  coord_cartesian(ylim = c(-170, vert_pos_1)) +
  labs(y = expression(BF), subtitle = "CRISPR")
print(p1)



# saveRDS(p1,"p1")
# rm(list=setdiff(ls(),c("theme2","gg_color_hue","size_hline")))



#-----------moffat_numTKOHits violin plot (2)----------------------

# petal.txt is from Moffat supp data file mmc3-1.xlsx

# Read the same supplementary table again, keeping only the gene symbol and
# the numTKOHits column.
tko <- read.table(
  "petal.txt",
  header = TRUE,
  sep = "\t",
  stringsAsFactors = FALSE
)[, c("Gene", "numTKOHits")]

g_unique <- read.delim(
  "growth_loci_unique.txt",
  sep = "\t",
  stringsAsFactors = FALSE,
  header = TRUE,
  check.names = FALSE
)

# RH = 1 when the gene symbol occurs in g_unique$geneSymbol, otherwise 0;
# RH+ ("1") is listed before RH- ("0").
tko$RH <- factor(
  as.numeric(tko$Gene %in% g_unique$geneSymbol),
  levels = c("1", "0")
)

# The table is already in long form with a single group. For consistency with
# the other plots a constant "dummy" grouping factor is supplied for the x axis.
tko$cell <- factor(rep("numTKOHits", nrow(tko)), levels = c("numTKOHits"))

labels_2 <- levels(tko$cell)

# One colour per RH group, named by its legend label
colores_2 <- gg_color_hue(length(unique(tko$RH)))
names(colores_2) <- c("RH+", "RH-")

# Two-sample t.test() (default settings) of numTKOHits, RH+ versus RH- genes
p_vals_2 <- with(
  tko,
  t.test(numTKOHits[RH == 1], numTKOHits[RH == 0])
)$p.value

# Label for annotate(..., parse = TRUE): exponent notation with one
# significant digit, deparsed into a quoted string.
fun_label_2 <- deparse(formatC(p_vals_2[1], format = "e", digits = 0))

# Label position: left of the group centre, one unit above the largest value
horiz_pos_2 <- 1 - 0.18
vert_pos_2 <- max(tko[["numTKOHits"]], na.rm = TRUE) + 1



# violin plot
# Panel 2: numTKOHits split by RH status, drawn as dodged violins with
# vertically jittered points and a narrow box plot on top. This panel is drawn
# with its legend so that the legend can be extracted afterwards.
p2 <- ggplot(
    data = tko,
    aes(
      x = cell,
      y = numTKOHits,
      fill = RH
    )
  ) +
  geom_violin(
    width = 0.5,
    position = position_dodge(width = 0.6), alpha = 0.1,
    lwd = 0.1
  ) +
  geom_point(
    data = tko,
    mapping = aes(x = cell, y = numTKOHits),
    shape = 16,
    colour = "grey",
    alpha = 1,
    size = 0.1,
    position = position_jitterdodge(
      dodge.width = 0.6,
      jitter.width = 0.0,
      jitter.height = 0.1
    )
  ) +
  geom_boxplot(
    width = 0.1,
    position = position_dodge(0.6), alpha = 0.1,
    outlier.shape = NA,
    lwd = 0.2,
    fatten = 2,
    show.legend = FALSE
  ) +
  scale_fill_manual(
    values = as.vector(colores_2),
    labels = names(colores_2),
    name = NULL
  ) +
  guides(
    shape = FALSE,
    fill = guide_legend(
      # legend keys without point glyphs
      override.aes = list(
        shape = NA,
        size = 0.1
      ),
      ncol = 1,
      byrow = TRUE
    )
  ) +
  # P-value label
  annotate(
    geom = "text",
    x = horiz_pos_2,
    y = vert_pos_2,
    hjust = 0,
    label = fun_label_2,
    parse = TRUE,
    size = 2
  ) +
  theme2 +
  theme(
    plot.margin = unit(c(top = 0.5, right = 0.5, bottom = 0.5, left = 0.5), "cm"),
    # single dummy group on the x axis: hide its title, label and tick
    axis.title.x = element_blank(),
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank(),
    legend.position = c(0, 0),
    legend.margin = margin(t = 0, r = 0, b = 0, l = 0, unit = "pt"),
    legend.box.margin = margin(t = 0, r = 0, b = 0, l = 0, unit = "pt"),
    legend.key.height = unit(0.3, "cm"),
    legend.key.width = unit(0.3, "cm"),
    legend.spacing.y = unit(0.3, "cm"),
    legend.spacing.x = unit(0.3, "cm"),
    legend.title = element_text(size = 7),
    legend.text = element_text(size = 7),
    # last argument: no trailing comma, which would pass an empty argument
    legend.title.align = 0.7
  ) +
  scale_x_discrete(
    expand = expand_scale(add = .6)
  ) +
  scale_y_continuous(
    expand = expand_scale(mult = .05)
  ) +
  ylab(expression(Number)) +
  labs(subtitle = "Hits")
print(p2)

# Keep the legend as a separate grob, then hide it on the panel itself
legend <- get_legend(p2)
p2 <- p2 + theme(legend.position = "none")




# saveRDS(p2,"p2")
# rm(list=setdiff(ls(),c("theme2","gg_color_hue","size_hline")))



# #-----------moffat_BF_hct116_shRNA (3)----------------------

# # petal.txt is from Moffat supp data file mmc3-1.xlsx

# sh <- read.table("petal.txt",header=TRUE,sep="\t",stringsAsFactors=FALSE)
# sh <- sh[,c("Gene","BF_hct116_shRNA")]
# g_unique <- read.delim("growth_loci_unique.txt",sep="\t",stringsAsFactors=FALSE,header=TRUE,check.names=FALSE)
# sh$RH <- numeric(nrow(sh))
# sh[sh$Gene %in% g_unique$geneSymbol,"RH"] <- 1


# # # Cast in long form, for consistency and providing factors to ggplot2, though sh already in long form with only one column
# # sh_l <- reshape(
				# # sh, 
				# # varying = c("BF_hct116_shRNA"), 
				# # v.names = "BF",
				# # timevar = "cell", 
				# # times = c("HCT116"), 
				# # new.row.names = 1:1e6,
				# # direction = "long"
				# # )




# # No need to cast in long form, as already in long form with only one group factor. However, for consistency with other plots provide "dummy" grp factor for plotting.
# sh$cell <- "HCT116"
# sh$cell <- factor(sh$cell,levels=c("HCT116"))


# sh$RH <- factor(sh$RH,levels=c("1","0"))


# labels_3 <- levels(sh$cell)

# colores_3 <- gg_color_hue(length(unique(sh$RH)))
# names(colores_3) <- c("RH+","RH-")

# p_vals_3 <- c(
			# t.test(sh[sh$RH==1,c("BF_hct116_shRNA")],sh[sh$RH==0,c("BF_hct116_shRNA")])$p.value
			# )



# fun_label_3 <- c(
				# deparse(bquote(.(formatC(p_vals_3[1],format="e",digits=0))))
				# )


# horiz_pos_3 <- c(1)-0.2
# # vert_pos_3 <-  c(
				# # max(sh[,c("BF_hct116_shRNA")],na.rm=TRUE) + 10
				# # )
# vert_pos_3 <- 50



# # violin plot
# p3 <- ggplot(
		# data = sh, 
		# aes(
				# x = cell, 
				# y = BF_hct116_shRNA,
				# fill = RH
				# )
			# ) + 
		# geom_violin(
			# width=0.5, 
			# position = position_dodge(width=0.6), alpha=0.1,
			# lwd=0.1
			# ) +
		# geom_point(
			# data= sh,
			# mapping=aes(x=cell,y= BF_hct116_shRNA),
			# shape=16, 
			# colour="grey",
			# alpha=1,
			# size=0.1,
			# position=position_jitterdodge(
						# dodge.width=0.6,
						# jitter.width=0.0,
						# jitter.height=0.1
						# )
			# ) +
		# geom_boxplot(
			# width=0.1,
			# position= position_dodge(0.6), alpha=0.1,
			# outlier.shape=NA,
			# lwd=0.2,
			# fatten=2,
			# show.legend = FALSE#,
			# #notch=TRUE
			# )  +
		# scale_fill_manual(
			# values=as.vector(colores_3),
			# labels=names(colores_3),
			# name=NULL		
			# )+
		# guides(
			# shape=FALSE,
	 		# fill = guide_legend(
				 		# override.aes = list(
				 		# #fill=NA,
				 		# shape=NA,
				 		# size=0.1
				 		# ),
			 		# ncol=1,
			 		# byrow=TRUE
			 		# )
	 		 # ) +
	    # annotate(
		    # "rect",
		    # xmin=horiz_pos_3-0.2,
		    # xmax=horiz_pos_3+0.4, 
		    # ymin=vert_pos_3-10, 
		    # ymax=vert_pos_3+8, 
		    # fill="white"
		    # ) +
 		# annotate(
	 		# geom="text", 
	 		# x=horiz_pos_3, 
	 		# y=vert_pos_3, 
	 		# hjust=0,
	 		# label= fun_label_3, 
	 		# parse=TRUE, 
	 		# size=2
	 		# ) +
 		# theme2 + 
   		# theme(
		 	# plot.margin = unit(c(top=0.5,right=0.5,bottom=0.5,left=0.5), "cm"),
		 	# axis.title.x=element_blank(),
	        # axis.text.x=element_blank(),
	        # axis.ticks.x=element_blank(),
	        # legend.position="none",
			# legend.margin=margin(t=0,r=0,b=0,l=0,unit = "pt"),
		 	# legend.box.margin=margin(t=0,r=0,b=0,l=0,unit = "pt"),
		 	# legend.key.height = unit(0.3, "cm"),
			# legend.key.width = unit(0.3, "cm"),
		 	# legend.spacing.y = unit(0.3, 'cm'),
		 	# legend.spacing.x = unit(0.3, 'cm'),
		 	# legend.title = element_text(size = 7),  
		 	# legend.text = element_text(size = 7),
	 		# legend.title.align=0.7
	 		# # plot.tag = element_text(face="bold")#, 
	 		# # plot.tag = element_text(size=32, face="bold"), 
	 		# # plot.tag.position = c(0.5,0.7)
 			# ) +
		# # labs(tag = "C") +
		# scale_x_discrete(
			# labels = labels, 
			# expand = expand_scale(add = .6)
			# ) +
		# scale_y_continuous(
			# expand = expand_scale(mult = .05)#,
			# #limits=c(-2.2,6.25)
			# ) +
		# coord_cartesian(ylim = c(-75,50)) +
		# xlab("Cell lines") + 
		# ylab(expression(BF)) + 
		# labs(subtitle="shRNA")		
# print(p3)




# # saveRDS(p3,"p3")
# # rm(list=setdiff(ls(),c("theme2","gg_color_hue","size_hline")))



#-------------Moffat_crispr_hct116_fish_ovlap (4)------------

# petal.txt is from Moffat supp data file mmc3-1.xlsx

# Re-read the Bayes-factor table for this section, keeping the gene symbol and
# the five cell-line columns (omit 1st gen crispr BF_a375_GeCKo).
petal <- read.table(
  "petal.txt",
  header = TRUE,
  sep = "\t",
  stringsAsFactors = FALSE
)[, c("Gene", "BF_hct116", "BF_hela", "BF_gbm", "BF_rpe1", "BF_dld1")]

g_unique <- read.delim(
  "growth_loci_unique.txt",
  sep = "\t",
  stringsAsFactors = FALSE,
  header = TRUE,
  check.names = FALSE
)

# RH = 1 when the gene symbol occurs in g_unique$geneSymbol, otherwise 0
petal$RH <- as.numeric(petal$Gene %in% g_unique$geneSymbol)



# Threshold scan: for a grid of BF_hct116 thresholds spanning the observed
# range, build the 2x2 table
#            BF > thr   BF < thr
#   RH == 1   ptl_rh    noptl_rh
#   RH == 0   ptl_norh  noptl_norh
# and record chi-squared and Fisher exact test results for each threshold.
BF_hct116_max <- max(petal$BF_hct116, na.rm = TRUE)
BF_hct116_min <- min(petal$BF_hct116, na.rm = TRUE)

# Step of 1/100 of the range, i.e. nominally 101 thresholds
vec_4 <- seq(BF_hct116_min, BF_hct116_max, (BF_hct116_max - BF_hct116_min) / 100)

# Results are collected in a preallocated numeric matrix (one row per
# threshold) and converted to a data frame once at the end.
overlap_mat_4 <- matrix(
  NA_real_,
  nrow = length(vec_4),
  ncol = 14,
  dimnames = list(
    NULL,
    c(
      "BF_hct116_thres", # col 1
      "ptl_rh",          # col 2
      "noptl_rh",        # col 3
      "ptl_norh",        # col 4
      "noptl_norh",      # col 5
      "obs",             # col 6
      "exp",             # col 7
      "chi_sq",          # col 8
      "chi_df",          # col 9
      "P_chi",           # col 10
      "fish_OR",         # col 11
      "fish_conf_1",     # col 12
      "fish_conf_2",     # col 13
      "P_fish"           # col 14
    )
  )
)

# Rows with a missing BF are excluded from every count
bf_known_4 <- !is.na(petal$BF_hct116)

# Iterate over the thresholds actually generated rather than a fixed 1:101
for (i in seq_along(vec_4)) {
  # NOTE(review): both comparisons are strict, so a gene whose BF equals the
  # threshold exactly (e.g. the minimum at i = 1) is counted in neither column.
  # Kept as in the original analysis - confirm this is intended.
  above_4 <- bf_known_4 & petal$BF_hct116 > vec_4[i]
  below_4 <- bf_known_4 & petal$BF_hct116 < vec_4[i]

  counts_4 <- c(
    sum(above_4 & petal$RH == 1),
    sum(below_4 & petal$RH == 1),
    sum(above_4 & petal$RH == 0),
    sum(below_4 & petal$RH == 0)
  )

  # Build the 2x2 table once and reuse it for both tests
  tab_4 <- matrix(counts_4, 2, 2, byrow = TRUE)
  chi <- chisq.test(tab_4)
  fish <- fisher.test(tab_4)

  overlap_mat_4[i, ] <- c(
    vec_4[i],
    counts_4,
    chi$observed[1, 1],
    chi$expected[1, 1],
    chi$statistic[[1]],
    chi$parameter[[1]],
    chi$p.value,
    fish$estimate[[1]],
    fish$conf.int[[1]],
    fish$conf.int[[2]],
    fish$p.value
  )
}

petal_hct116_RH_overlap <- as.data.frame(overlap_mat_4)

# Benjamini-Hochberg adjustment across the scanned thresholds
petal_hct116_RH_overlap$q_fish <- p.adjust(petal_hct116_RH_overlap$P_fish, method = "BH")

# Largest raw Fisher P value that is still significant at q < 0.05; drawn as
# the "FDR = 0.05" line in the plot of this section.
sig_p_4 <- petal_hct116_RH_overlap$P_fish[which(petal_hct116_RH_overlap$q_fish < 0.05)]
if (length(sig_p_4) > 0) {
  h_line_4 <- max(sig_p_4)
} else {
  # max() of an empty set would give -Inf; use NA so no line is drawn
  warning("No threshold reaches q < 0.05; FDR line is undefined", call. = FALSE)
  h_line_4 <- NA_real_
}


# Panel 4: -log10 Fisher P value as a function of the BF threshold (HCT116),
# with a red horizontal line at the raw P value that corresponds to FDR = 0.05.
p4 <- ggplot() +
  theme2 +
  theme(legend.key = element_blank()) +
  geom_line(
    data = petal_hct116_RH_overlap,
    lwd = 0.2,
    colour = "black",
    show.legend = FALSE,
    aes(
      x = BF_hct116_thres,
      # bare column name: evaluated in the layer's data, not via `df$col`
      y = -log10(P_fish)
    )
  ) +
  geom_hline(
    color = "red",
    size = size_hline,
    aes(
      yintercept = -log10(h_line_4),
      # constant linetype mapping so that the line gets a legend entry
      linetype = "FDR = 0.05"
    )
  ) +
  scale_linetype_manual(
    name = NULL,
    values = c(1),
    guide = guide_legend(
      override.aes = list(
        color = c("red"),
        size = size_hline
      )
    )
  ) +
  theme(
    plot.margin = unit(c(top = 0.5, right = 0.5, bottom = 0.5, left = 0.5), "cm"),
    plot.subtitle = element_text(size = 9, face = "plain", hjust = 0.5, margin = margin(0, 0, 6, 0)),
    # title of the x axis
    axis.title.x = element_text(size = 7, face = "plain", hjust = 0.5, margin = margin(5, 0, 0, 0)),
    legend.position = c(0.6, 0.9),
    legend.margin = margin(t = 0, r = 0, b = 0, l = 0, unit = "pt"),
    legend.box.margin = margin(t = 0, r = 0, b = 0, l = 0, unit = "pt"),
    legend.key.height = unit(0.3, "cm"),
    legend.key.width = unit(0.25, "cm"),
    legend.spacing.y = unit(0.3, "cm"),
    legend.spacing.x = unit(0.25, "cm"),
    legend.title = element_text(size = 6),
    legend.text = element_text(size = 6),
    legend.title.align = 0.7
  ) +
  labs(subtitle = "HCT116") +
  xlab("BF") +
  ylab(expression(-log[10]*italic('P')))
print(p4)


# saveRDS(p4,"p4")
# rm(list=setdiff(ls(),c("theme2","gg_color_hue","size_hline")))



#------------------------- RH growth genes vs Sabbatini: violin plot (5) --------------------------

# crisp.txt is from Sabbatini supp data file aac7041_SM_Table_S3.xlsx


# Read the CRISPR score table and flag genes present in the growth-loci list
crisp <- read.table("crisp.txt", header = TRUE, sep = "\t", stringsAsFactors = FALSE)
g_unique <- read.delim("growth_loci_unique.txt", sep = "\t", stringsAsFactors = FALSE, header = TRUE, check.names = FALSE)
crisp$RH <- numeric(nrow(crisp))
crisp[crisp$Gene %in% g_unique$geneSymbol, "RH"] <- 1

# Score columns, one per cell line
cs_cols_5 <- c("KBM7.CS", "K562.CS", "Jiyoye.CS", "Raji.CS")

# Cast to long form: one row per gene x cell line, score in column "CS",
# originating column name in column "cell".
# Row names are sized to the exact number of long-form rows; a fixed 1:1e5
# would run out once the table has more than 25,000 genes.
crisp_l <- reshape(
  crisp,
  varying = cs_cols_5,
  v.names = "CS",
  timevar = "cell",
  times = cs_cols_5,
  new.row.names = seq_len(nrow(crisp) * length(cs_cols_5)),
  direction = "long"
)

# Strip the trailing ".CS" (a literal dot plus the last two characters)
crisp_l$cell <- gsub("\\...$", "", crisp_l$cell)

# Order of the cell lines along the x axis; RH+ ("1") before RH- ("0")
crisp_l$cell <- factor(crisp_l$cell, levels = c("KBM7", "Raji", "Jiyoye", "K562"))
crisp_l$RH <- factor(crisp_l$RH, levels = c("1", "0"))




  
# summ_crisp <- summarySE(crisp_l, measurevar="CS", groupvars=c("cell","RH"))




# summ_crisp$cell <- factor(summ_crisp$cell,levels=c("Raji","Jiyoye","KBM7","K562"))
# summ_crisp$RH <- factor(summ_crisp$RH,levels=c("1","0"))


# summ_crisp[summ_crisp$RH==1,"RH_label"] <- "RH+"
# summ_crisp[summ_crisp$RH==0,"RH_label"] <- "RH-"


# summ_crisp$RH_label <- factor(summ_crisp$RH_label,levels=c("RH+","RH-"))
# summ_crisp <- summ_crisp[order(summ_crisp$cell,summ_crisp$RH_label),]

# x-axis labels in plotting order
labels_5 <- levels(crisp_l$cell)

# One colour per RH group, named by its legend label
colores_5 <- gg_color_hue(length(unique(crisp_l$RH)))
names(colores_5) <- c("RH+", "RH-")

# Two-sample t.test() (default settings) of the CRISPR score between
# RH+ (RH == 1) and RH- (RH == 0) genes, one test per cell line, in x-axis
# order (KBM7, Raji, Jiyoye, K562).
p_vals_5 <- vapply(
  c("KBM7.CS", "Raji.CS", "Jiyoye.CS", "K562.CS"),
  function(cs_col) {
    rh_pos <- crisp[crisp$RH == 1, cs_col]
    rh_neg <- crisp[crisp$RH == 0, cs_col]
    t.test(rh_pos, rh_neg)$p.value
  },
  numeric(1),
  USE.NAMES = FALSE
)

# Labels for annotate(..., parse = TRUE): exponent notation with one
# significant digit, deparsed into quoted strings.
fun_label_5 <- vapply(
  p_vals_5,
  function(p) deparse(formatC(p, format = "e", digits = 0)),
  character(1)
)

# Label x positions: slightly left of each cell line's centre
horiz_pos_5 <- seq_len(4) - 0.2

# Label y position, shared by all cell lines.
# (An earlier variant used, per cell line, the maximum -CS of the RH- genes + 0.3.)
vert_pos_5 <- 1.6


# violin plot
# Panel 5: negated CRISPR score (-CS) per cell line, split by RH status, drawn
# as dodged violins with the raw points and a narrow box plot on top;
# P-value labels sit on white rectangles near the top of the panel.
p5 <- ggplot(data = crisp_l, aes(x = cell, y = -CS, fill = RH)) +
  geom_violin(
    width = 0.5,
    position = position_dodge(width = 0.6),
    alpha = 0.1,
    lwd = 0.1
  ) +
  geom_point(
    data = crisp_l,
    mapping = aes(x = cell, y = -CS),
    shape = 16,
    colour = "grey",
    alpha = 1,
    size = 0.1,
    position = position_jitterdodge(dodge.width = 0.6, jitter.width = 0.0)
  ) +
  geom_boxplot(
    width = 0.1,
    position = position_dodge(0.6),
    alpha = 0.1,
    outlier.shape = NA,
    lwd = 0.2,
    fatten = 2,
    show.legend = FALSE
  ) +
  scale_fill_manual(
    values = as.vector(colores_5),
    labels = names(colores_5),
    name = NULL
  ) +
  guides(
    shape = FALSE,
    fill = guide_legend(
      # legend keys without point glyphs
      override.aes = list(shape = NA, size = 0.1),
      ncol = 2,
      byrow = TRUE
    )
  ) +
  # white background boxes behind the P-value labels
  annotate(
    "rect",
    xmin = horiz_pos_5 - 0.2,
    xmax = horiz_pos_5 + 0.4,
    ymin = vert_pos_5 - 0.4,
    ymax = vert_pos_5 + 0.2,
    fill = "white"
  ) +
  # the P-value labels themselves
  annotate(
    geom = "text",
    x = horiz_pos_5,
    y = vert_pos_5,
    hjust = 0,
    label = fun_label_5,
    parse = TRUE,
    size = 2
  ) +
  theme2 +
  theme(
    plot.margin = unit(c(top = 0.5, right = 0.5, bottom = 0.5, left = 0.5), "cm"),
    plot.subtitle = element_text(
      size = 9, face = "plain", hjust = 0.5, margin = margin(0, 0, 6, 0)
    ),
    axis.title.x = element_blank(),
    legend.position = "none",
    legend.margin = margin(t = 0, r = 0, b = 0, l = 0, unit = "pt"),
    legend.box.margin = margin(t = 0, r = 0, b = 0, l = 0, unit = "pt"),
    legend.key.height = unit(0.3, "cm"),
    legend.key.width = unit(0.3, "cm"),
    legend.spacing.y = unit(0.3, "cm"),
    legend.spacing.x = unit(0.3, "cm"),
    legend.title = element_text(size = 7),
    legend.text = element_text(size = 7),
    legend.title.align = 0.7
  ) +
  scale_x_discrete(
    labels = labels_5,
    expand = expand_scale(add = .6)
  ) +
  scale_y_continuous(
    expand = expand_scale(mult = .05)
  ) +
  # zoom only; data outside the limits is kept for the statistics layers
  coord_cartesian(ylim = c(-1, vert_pos_5)) +
  labs(
    x = "Cell lines",
    y = expression(-CS),
    subtitle = "CRISPR"
  )
print(p5)


# saveRDS(p5,"p5")
# rm(list=setdiff(ls(),c("theme2","gg_color_hue","size_hline")))



# test barplot
# p5 <- ggplot(
		# data = summ_crisp, 
		# aes(
				# x = cell, 
				# y = -CS,
				# fill = RH
				# )
			# ) + 
		# geom_bar(
			# stat="identity",
			# width=0.7,
			# position = position_dodge()
				# )  +
		# # geom_text(data = summ_crisp, aes(x = cell, y = -CS), label=p_exps, colour = "black", , size = 3, nudge_y=0.1) +
		# theme2 + 
		# scale_fill_manual(
			# values=as.vector(colores_1),
			# labels=names(colores_1),
			# name=NULL		
			# )+
		# guides(
			# # shape=FALSE,
	 		# fill = guide_legend(
				 		# # override.aes = list(
				 		# # fill=NA,
				 		# # shape=NA,
				 		# # size=0.3
				 		# # ),
			 		# ncol=2,
			 		# byrow=TRUE
			 		# )
	 		 # ) +
	    # geom_errorbar(
		    # width=0, # Width of the error cross bars
		    # size=0.1,
            # position= position_dodge(0.65), # adjust so error whiskers in middle of bars
		    # aes(
			    # ymin=-CS-se, 
			    # ymax=-CS+se
			    # )
            # ) +
		# theme(
		 	# plot.margin = unit(c(1.2,1.5,1.2,0.9), "cm"),
		 	# axis.text=element_text(size=6), #numbers on tick marks of x and y axes
	 		# legend.position = c(0.2,1.00), 
 			# axis.title=element_text(size=6), #titles of x and y axes
			# axis.title.y=element_text(margin=margin(t=0,r=7,b=0,l=0), hjust = 0.5), #moves y axis title by adding margin space to bottom, trbl
			# axis.title.x=element_text(margin=margin(t=7,r=0,b=0,l=0), hjust = 0.5),  #moves x axis title by adding margin space to top,trbl
			# # plot.title = element_text(size=32, face="bold", hjust = -0.14), #can provide "A","B", by ggtitle, but used plot_grid wch can shift more left
			# plot.subtitle = element_text(size=7, face="plain", hjust = 0.5), #hjust shifts right
			# legend.margin=margin(t=0,r=0,b=0,l=0,unit = "pt"),
		 	# legend.box.margin=margin(t=0,r=0,b=0,l=0,unit = "pt"),
		 	# legend.key.height = unit(0.05, "cm"),
			# legend.key.width = unit(0.3, "cm"),
		 	# legend.spacing.y = unit(0.01, 'cm'),
		 	# legend.spacing.x = unit(0.01, 'cm'),
		 	# legend.title = element_text(size = 6),  
		 	# legend.text = element_text(size = 5),
	 		# legend.title.align=0.7
 			# ) +
		# scale_x_discrete(labels = labels, expand = expand_scale(add = .6)) +
		# scale_y_continuous(expand = expand_scale(mult = .05)) +
		# xlab("Cell lines") + 
		# ylab(expression(-CS)) + 
		# labs(subtitle="CRISPR") 
# print(p5)



# # test boxplots
# p5 <- ggplot(
		# data = crisp_l, 
		# aes(
				# x = cell, 
				# y = -CS,
				# fill = RH
				# )
			# ) + 
		# geom_boxplot(
				# )  +
		# theme2 + 
		# scale_fill_manual(
			# values=as.vector(colores_1),
			# labels=names(colores_1),
			# name=NULL		
			# )+
		# guides(
			# # shape=FALSE,
	 		# fill = guide_legend(
				 		# # override.aes = list(
				 		# # fill=NA,
				 		# # shape=NA,
				 		# # size=0.3
				 		# # ),
			 		# ncol=2,
			 		# byrow=TRUE
			 		# )
	 		 # ) +
		# theme(
		 	# plot.margin = unit(c(1.2,1.5,1.2,0.9), "cm"),
		 	# axis.text=element_text(size=6), #numbers on tick marks of x and y axes
	 		# legend.position = c(0.2,1.00), 
 			# axis.title=element_text(size=6), #titles of x and y axes
			# axis.title.y=element_text(margin=margin(t=0,r=7,b=0,l=0), hjust = 0.5), #moves y axis title by adding margin space to bottom, trbl
			# axis.title.x=element_text(margin=margin(t=7,r=0,b=0,l=0), hjust = 0.5),  #moves x axis title by adding margin space to top,trbl
			# # plot.title = element_text(size=32, face="bold", hjust = -0.14), #can provide "A","B", by ggtitle, but used plot_grid wch can shift more left
			# plot.subtitle = element_text(size=7, face="plain", hjust = 0.5), #hjust shifts right
			# legend.margin=margin(t=0,r=0,b=0,l=0,unit = "pt"),
		 	# legend.box.margin=margin(t=0,r=0,b=0,l=0,unit = "pt"),
		 	# legend.key.height = unit(0.05, "cm"),
			# legend.key.width = unit(0.3, "cm"),
		 	# legend.spacing.y = unit(0.01, 'cm'),
		 	# legend.spacing.x = unit(0.01, 'cm'),
		 	# legend.title = element_text(size = 6),  
		 	# legend.text = element_text(size = 5),
	 		# legend.title.align=0.7
 			# ) +
		# scale_x_discrete(labels = labels, expand = expand_scale(add = .6)) +
		# scale_y_continuous(expand = expand_scale(mult = .05)) +
		# xlab("Cell lines") + 
		# ylab(expression(-CS)) + 
		# labs(subtitle="CRISPR") 
# print(p5)


# #---------- sabbatini_gene_trap (6) -----------------------


# # gene_trap.txt is from Sabbatini supp data file aac7041_SM_Table_S4.xlsx
# # n.s.

# gt <- read.table("gene_trap.txt",header=TRUE,sep="\t",stringsAsFactors=FALSE)
# gt <- gt[,c("Symbol","GTS")]

# # convert GTS for easier interpretation
# gt$GTS <- 1-gt$GTS

# g_unique <- read.delim("growth_loci_unique.txt",sep="\t",stringsAsFactors=FALSE,header=TRUE,check.names=FALSE)
# gt$RH <- numeric(nrow(gt))
# gt[gt$Symbol %in% g_unique$geneSymbol,"RH"] <- 1



# # No need to cast in long form, as already in long form with only one group factor. However, for consistency with other plots provide "dummy" grp factor for plotting.
# gt$cell <- "KBM7"
# gt$cell <- factor(gt$cell,levels=c("KBM7"))


# gt$RH <- factor(gt$RH,levels=c("1","0"))



# labels_6 <- levels(gt$cell)

# colores_6 <- gg_color_hue(length(unique(gt$RH)))
# names(colores_6) <- c("RH+","RH-")


# p_vals_6 <- c(
			# t.test(gt[gt$RH==1,c("GTS")],gt[gt$RH==0,c("GTS")])$p.value
			# )



# fun_label_6 <- c(
				# deparse(bquote(.(formatC(p_vals_6[1],format="e",digits=0))))
				# )


# horiz_pos_6 <- c(1)-0.2
# vert_pos_6 <-  c(
				# max(gt[,c("GTS")],na.rm=TRUE) + 0.15
				# )



# # violin plot
# p6 <- ggplot(
		# data = gt, 
		# aes(
				# x = cell, 
				# y = GTS,
				# fill = RH
				# )
			# ) + 
		# geom_violin(
			# width=0.5, 
			# position = position_dodge(width=0.6), alpha=0.1,
			# lwd=0.1
			# ) +
		# geom_point(
			# data= gt,
			# mapping=aes(x=cell,y= GTS),
			# shape=16, 
			# colour="grey",
			# alpha=1,
			# size=0.1,
			# position=position_jitterdodge(
						# dodge.width=0.6,
						# jitter.width=0.0,
						# jitter.height=0.1
						# )
			# ) +
		# geom_boxplot(
			# width=0.1,
			# position= position_dodge(0.6), alpha=0.1,
			# outlier.shape=NA,
			# lwd=0.2,
			# fatten=2,
			# show.legend = FALSE#,
			# #notch=TRUE
			# )  +
		# scale_fill_manual(
			# values=as.vector(colores_6),
			# labels=names(colores_6),
			# name=NULL		
			# )+
		# guides(
			# shape=FALSE,
	 		# fill = guide_legend(
				 		# override.aes = list(
				 		# #fill=NA,
				 		# shape=NA,
				 		# size=0.1
				 		# ),
			 		# ncol=1,
			 		# byrow=TRUE
			 		# )
	 		 # ) +
 		# annotate(
	 		# geom="text", 
	 		# x=horiz_pos_6, 
	 		# y=vert_pos_6, 
	 		# hjust=0,
	 		# label= fun_label_6, 
	 		# parse=TRUE, 
	 		# size=2
	 		# ) +
 		# theme2 + 
   		# theme(
		 	# plot.margin = unit(c(top=0.5,right=0.5,bottom=0.5,left=0.5), "cm"),
		 	# plot.subtitle = element_text(size=9, face="plain", hjust = 0.5, margin=margin(0,0,6,0)),
		 	# axis.title.x=element_blank(),
	        # axis.text.x=element_blank(),
	        # axis.ticks.x=element_blank(),
	 		# legend.position="none",
			# legend.margin=margin(t=0,r=0,b=0,l=0,unit = "pt"),
		 	# legend.box.margin=margin(t=0,r=0,b=0,l=0,unit = "pt"),
		 	# legend.key.height = unit(0.3, "cm"),
			# legend.key.width = unit(0.3, "cm"),
		 	# legend.spacing.y = unit(0.3, 'cm'),
		 	# legend.spacing.x = unit(0.3, 'cm'),
		 	# legend.title = element_text(size = 7),  
		 	# legend.text = element_text(size = 7),
	 		# legend.title.align=0.7
 			# ) +
		# scale_x_discrete(
			# labels = labels_6, 
			# expand = expand_scale(add = .6)
			# ) +
		# scale_y_continuous(
			# expand = expand_scale(mult = .05)#,
			# #limits=c(0.20,1.1)
			# ) +
		# # coord_cartesian(ylim = c(-1,1.6)) +
		# xlab("Cell line") + 
		# ylab(expression(1-GTS)) + 
		# labs(subtitle="Gene trap")
# print(p6)


# # saveRDS(p6,"p6")
# # rm(list=setdiff(ls(),c("theme2","gg_color_hue","size_hline")))




#------------- sabbatini_crispr_KBM7_fish_ovlap (7) ------------

# crisp.txt is from Sabbatini supp data file aac7041_SM_Table_S3.xlsx


# Re-read the CRISPR score table for this section and flag genes present in
# the growth-loci list.
crisp <- read.table("crisp.txt", header = TRUE, sep = "\t", stringsAsFactors = FALSE)
g_unique <- read.delim("growth_loci_unique.txt", sep = "\t", stringsAsFactors = FALSE, header = TRUE, check.names = FALSE)
# RH = 1 when the gene symbol occurs in g_unique$geneSymbol, otherwise 0.
# Computed in one vectorised step so that it also works when no gene matches
# (assigning through crisp[rows, ]$RH fails on an empty selection).
crisp$RH <- as.numeric(crisp$Gene %in% g_unique$geneSymbol)

# Empty result table; one row per threshold is filled in by the loop below.
crisp_KBM7.CS_RH_overlap <- data.frame(
  minus_KBM7.CS_thres = numeric(), # col 1
  crisp_rh = numeric(),            # col 2
  nocrisp_rh = numeric(),          # col 3
  crisp_norh = numeric(),          # col 4
  nocrisp_norh = numeric(),        # col 5
  obs = numeric(),                 # col 6
  exp = numeric(),                 # col 7
  chi_sq = numeric(),              # col 8
  chi_df = numeric(),              # col 9
  P_chi = numeric(),               # col 10
  fish_OR = numeric(),             # col 11
  fish_conf_1 = numeric(),         # col 12
  fish_conf_2 = numeric(),         # col 13
  P_fish = numeric()               # col 14
)

# Range of the negated KBM7 score (-KBM7.CS), ignoring missing values
KBM7.CS_max <- max(-crisp$KBM7.CS, na.rm = TRUE)
KBM7.CS_min <- min(-crisp$KBM7.CS, na.rm = TRUE)

# Thresholds on -KBM7.CS: step of 1/100 of the range, i.e. nominally 101 values
vec <- seq(KBM7.CS_min, KBM7.CS_max, (KBM7.CS_max - KBM7.CS_min) / 100)


for(i in 1:101) {

crisp_KBM7.CS_RH_overlap[i,1] <- vec[i]
crisp_KBM7.CS_RH_overlap[i,2] <- dim(crisp[-crisp$KBM7.CS > vec[i] & !is.na(crisp$KBM7.CS) & crisp$RH == 1,])[1]
crisp_KBM7.CS_RH_overlap[i,3] <- dim(crisp[-crisp$KBM7.CS < vec[i] & !is.na(crisp$KBM7.CS) & crisp$RH == 1,])[1]
crisp_KBM7.CS_RH_overlap[i,4] <- dim(crisp[-crisp$KBM7.CS > vec[i] & !is.na(crisp$KBM7.CS) & crisp$RH == 0,])[1]
crisp_KBM7.CS_RH_overlap[i,5] <- dim(crisp[-crisp$KBM7.CS < vec[i] & !is.na(crisp$KBM7.CS) & crisp$RH == 0,])[1]

chi <- chisq.test(matrix(c(crisp_KBM7.CS_RH_overlap[i,2],crisp_KBM7.CS_RH_overlap[i,3],crisp_KBM7.CS_RH_overlap[i,4], crisp_KBM7.CS_RH_overlap[i,5]),2,2,byrow=TRUE))
fish <- fisher.test(matrix(c(crisp_KBM7.CS_RH_overlap[i,2],crisp_KBM7.CS_RH_overlap[i,3],crisp_KBM7.CS_RH_overlap[i,4], crisp_KBM7.CS_RH_overlap[i,5]),2,2,byrow=TRUE))

crisp_KBM7.CS_RH_overlap[i,6] <- chi$observed[1,1]
crisp_KBM7.CS_RH_overlap[i,7] <- chi$expected[1,1]
crisp_KBM7.CS_RH_overlap[i,8] <- chi$statistic
crisp_KBM7.CS_RH_overlap[i,9] <- chi$parameter[[1]]
crisp_KBM7.CS_RH_overlap[i,10] <- chi$p.value
crisp_KBM7.CS_RH_overlap[i,11] <- fish$estimate[[1]]
crisp_KBM7.CS_RH_overlap[i,12] <- fish$conf.int[[1]]
crisp_KBM7.CS_RH_overlap[i,13] <- fish$conf.int[[2]]
crisp_KBM7.CS_RH_overlap[i,14] <- fish$p.value

}

crisp_KBM7.CS_RH_overlap$q_fish <- p.adjust(crisp_KBM7.CS_RH_overlap$P_fish, method = "BH")
h_line_7 <- max(crisp_KBM7.CS_RH_overlap[crisp_KBM7.CS_RH_overlap$q_fish < 0.05,]$P_fish)



# cutoff <- data.frame( x = c(-Inf, Inf), y = -log10(h_line), cutoff = factor(-log10(h_line)) )
# cutoff <- data.frame(yintercept=-log10(h_line), cutoff=factor(-log10(h_line)))



# Panel 7: -log10 Fisher P of the CRISPR/RH overlap against the -CS threshold,
# with a red horizontal line at the FDR = 0.05 cut-off (h_line_7).
p7 <- ggplot() +
  theme2 +
  theme(legend.key=element_blank()) +
  geom_line(
    data=crisp_KBM7.CS_RH_overlap,
    lwd=0.2,
    colour="black",
    show.legend=FALSE,
    aes(
      x=minus_KBM7.CS_thres,
      # Refer to the column by name: `df$col` inside aes() bypasses the
      # layer data and is discouraged by ggplot2.
      y= -log10(P_fish)
    )
  ) +
  geom_hline(
    color = "red",
    size=size_hline,
    aes(
      yintercept = -log10(h_line_7),
      # Constant linetype mapping so the line gets a legend entry.
      linetype = "FDR = 0.05"
    )
  ) +
  scale_linetype_manual(
    name = NULL,
    values = c(1),
    guide = guide_legend(
      override.aes = list(
        color = c("red"),
        size=size_hline
      )
    )
  ) +
  theme(
    plot.margin = unit(c(top=0.5,right=0.5,bottom=0.5,left=0.5), "cm"),
    plot.subtitle = element_text(size=9, face="plain", hjust = 0.5, margin=margin(0,0,6,0)),
    axis.title.x = element_text(size=7, face="plain", hjust = 0.5, margin=margin(5,0,0,0)), # x axis title
    legend.position = c(0.8,1.0),
    legend.margin=margin(t=0,r=0,b=0,l=0,unit = "pt"),
    legend.box.margin=margin(t=0,r=0,b=0,l=0,unit = "pt"),
    legend.key.height = unit(0.3, "cm"),
    legend.key.width = unit(0.25, "cm"),
    legend.spacing.y = unit(0.3, 'cm'),
    legend.spacing.x = unit(0.25, 'cm'),
    legend.title = element_text(size = 6),
    legend.text = element_text(size = 6),
    legend.title.align=0.7
  ) +
  labs(subtitle="KBM7") +
  xlab("-CS") +
  # scale_x_continuous(breaks = c(0,1,2,3,4,6), labels = c(0,1,2,3,4,6)) +
  ylab(expression('-log'[10]*italic('P')))
print(p7)


# saveRDS(p7,"p7")
# rm(list=setdiff(ls(),c("theme2","gg_color_hue","size_hline")))



#---------rh_geo violin plot (8)------------------------

# RH data downloaded from GEO
# Each individual median normalized array posted into excel spreadsheet
# As described in paper, replicates log averaged and means antilogged. No reps for a23_8, so left as one non-averaged obs. A23 not used in analysis, though.


# Load the RH expression array table and reduce it to one log2 value per gene.
rh <- read.table("rh_array.txt",sep="\t",stringsAsFactors=FALSE,header=TRUE)
dim(rh)
# [1] 24353    88

# Many entries have no gene name.
dim(rh[rh$Gene=="",])
# [1] 5263   88

rh <- rh[rh$Gene != "",]
# Gene duplicates remain because of multiple spots on the array, multicopy genes etc:
dim(rh[duplicated(rh$Gene),])
# [1] 2350   88


# log2-transform every column except the first
# (assumes the first column is Gene -- TODO confirm against rh_array.txt).
rh[,-1] <- log2(rh[,-1])

# Collapse the remaining gene duplicates by averaging their log2 values.
rh <- aggregate(.~Gene, rh, mean)


# Get rid of gene names converted to dates
# NOTE(review): relies on those names being exactly the first 22 rows after
# aggregate() -- confirm if the input file changes.
rh <- rh[-c(1:22),]

# Per-gene average over columns 3 to 81.
rh$rh_ave <- rowMeans(rh[,c(3:81)])

g_unique <- read.delim("growth_loci_unique.txt",sep="\t",stringsAsFactors=FALSE,header=TRUE,check.names=FALSE)

# RH = 1 when the gene symbol is listed in g_unique, otherwise 0.
# Vectorised so that an empty overlap does not raise an error, which the
# `rh[rows,]$RH <- 1` form does.
rh$RH <- as.numeric(rh$Gene %in% g_unique$geneSymbol)

# No need to cast in long form, as already in long form with only one group factor. However, for consistency with other plots provide "dummy" grp factor for plotting.
rh$cell <- "RH"
rh$cell <- factor(rh$cell,levels=c("RH"))


# RH+ ("1") is listed first so it is drawn and coloured first.
rh$RH <- factor(rh$RH,levels=c("1","0"))


labels_8 <- levels(rh$cell)


colores_8 <- gg_color_hue(length(unique(rh$RH)))
names(colores_8) <- c("RH+","RH-")


# Welch t-test (t.test default) of rh_ave between RH+ and RH- genes.
p_vals_8 <- c(
  t.test(rh[rh$RH==1,c("rh_ave")],rh[rh$RH==0,c("rh_ave")])$p.value
)


# P value formatted in scientific notation and deparsed to a quoted string,
# because the plot annotation is drawn with parse=TRUE.
fun_label_8 <- c(
  deparse(bquote(.(formatC(p_vals_8[1],format="e",digits=0))))
)


# Position of the P-value label on the plot.
horiz_pos_8 <- c(1)-0.2
# vert_pos_8 <-  c(
				# max(rh[,c("rh_ave")],na.rm=TRUE) + 0.2
				# )
vert_pos_8 <- 0.085


# violin plot
# Panel 8: violin + box plot of rh_ave for RH+ versus RH- genes, with the
# t-test P value printed on a white rectangle near the top of the panel.
p8 <- ggplot(
    data = rh,
    aes(
      x = cell,
      y = rh_ave,
      fill = RH
    )
  ) +
  geom_violin(
    width=0.5,
    position = position_dodge(width=0.6), alpha=0.1,
    lwd=0.1
  ) +
  geom_point(
    data= rh,
    mapping=aes(x=cell,y=rh_ave),
    shape=16,
    colour="grey",
    alpha=1,
    size=0.1,
    # Vertical jitter only; the dodge width matches the violins and boxes.
    position=position_jitterdodge(
      dodge.width=0.6,
      jitter.width=0.0,
      jitter.height=0.1
    )
  ) +
  geom_boxplot(
    width=0.1,
    position= position_dodge(0.6), alpha=0.1,
    outlier.shape=NA,
    lwd=0.2,
    fatten=2,
    show.legend = FALSE#,
    #notch=TRUE
  )  +
  scale_fill_manual(
    values=as.vector(colores_8),
    labels=names(colores_8),
    name=NULL
  )+
  guides(
    shape=FALSE,
    fill = guide_legend(
      override.aes = list(
        #fill=NA,
        shape=NA,
        size=0.1
      ),
      ncol=1,
      byrow=TRUE
    )
  ) +
  # White box behind the P-value label.
  annotate(
    "rect",
    xmin=horiz_pos_8-0.2,
    xmax=horiz_pos_8+0.5,
    ymin=vert_pos_8-0.015,
    ymax=vert_pos_8+0.01,
    fill="white"
  ) +
  annotate(
    geom="text",
    x=horiz_pos_8,
    y=vert_pos_8,
    hjust=0,
    label= fun_label_8,
    parse=TRUE,
    size=2
  ) +
  theme2 +
  theme(
    plot.margin = unit(c(top=0.5,right=0.5,bottom=0.5,left=0.5), "cm"),
    plot.subtitle = element_text(size=9, face="plain", hjust = 0.5, margin=margin(0,0,6,0)),
    axis.title.x=element_blank(),
    axis.text.x=element_blank(),
    axis.ticks.x=element_blank(),
    legend.position="none",
    legend.margin=margin(t=0,r=0,b=0,l=0,unit = "pt"),
    legend.box.margin=margin(t=0,r=0,b=0,l=0,unit = "pt"),
    legend.key.height = unit(0.3, "cm"),
    legend.key.width = unit(0.3, "cm"),
    legend.spacing.y = unit(0.3, 'cm'),
    legend.spacing.x = unit(0.3, 'cm'),
    legend.title = element_text(size = 7),
    legend.text = element_text(size = 7),
    legend.title.align=0.7
  ) +
  scale_x_discrete(
    # Use this panel's own label vector. The bare name `labels` resolves to
    # the base R function labels(), not to the intended tick labels.
    labels = labels_8,
    expand = expand_scale(add = .6)
  ) +
  scale_y_continuous(
    expand = expand_scale(mult = .05)#,
    #limits=c(-2.2,6.25)
  ) +
  xlab("Cell lines") +
  ylab(expression(log[2]*'('*exp*')')) +
  # Zoom without dropping data, so violins and boxes use all points.
  coord_cartesian(ylim = c(-0.04,vert_pos_8)) +
  labs(subtitle="RH exp")
print(p8)


# saveRDS(p8,"p8")
# rm(list=setdiff(ls(),c("theme2","gg_color_hue","size_hline")))


      
#--------- gtex_cr max P tissue violin plot (9) ------------------------


# GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_median_tpm.gct downloaded from GTEx website on 09/10/19


# Load the GTEx median-TPM table; the first two lines of the file are skipped.
gtex <- read.delim("GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_median_tpm.gct",sep="\t",stringsAsFactors=FALSE,header=TRUE,check.names=TRUE,fill=FALSE,skip=2)

colnames(gtex)[1] <- "gene_id"

# Remove the version number in gene_id (e.g. "ENSG00000187634.11" ->
# "ENSG00000187634"). The previous pattern "\\..$" only removed versions that
# are a single character long, so ids with a two-digit version kept their
# suffix and could not match unversioned ids downstream.
# NOTE(review): assumes gene_id in gencode_gtf_ensembl_ucsc_v31.txt and
# ensembl_gene_id in growth_loci_unique.txt are unversioned -- confirm.
gtex$gene_id <- sub("\\.[0-9]+$","",gtex$gene_id)


# gtex$mean <- 2^rowMeans(log2(gtex[,c(3:ncol(gtex))]))


gencode_gtf_ensembl_ucsc <- read.delim("gencode_gtf_ensembl_ucsc_v31.txt",header=TRUE,sep="\t",stringsAsFactors=FALSE,check.names=FALSE)

# Add gene_type; merge() keeps only genes present in both tables.
gtex <- merge(gtex, gencode_gtf_ensembl_ucsc[,c("gene_id","gene_type")])


g_unique <- read.delim("growth_loci_unique.txt",sep="\t",stringsAsFactors=FALSE,header=TRUE,check.names=FALSE)



# RH = 1 when the Ensembl gene id is listed in g_unique, otherwise 0.
gtex$RH <- numeric(nrow(gtex))
gtex[gtex$gene_id %in% g_unique$ensembl_gene_id,"RH"] <- 1


# Convert TPM to log2(TPM+1) for viewing
# gtex[,c(3:55)] <- log2(gtex[,c(3:55)] + 1)
# However, led to strange behavior such as median outside 95% CI using t.test for some tissues.
# So thresholded by discarding TPM < 5 vals, ie keeping TPM >= 5 (see below). Similar to microarray spots not above bkgrd being discarded.


# Find most significant tissue
gtex_mean_diff_cr_CI <- data.frame(
  threshold = numeric(),
  tissue = character(),
  nRHpos = numeric(),
  nRHneg = numeric(),
  median_diff = numeric(),
  mean_diff=numeric(),
  mean_CI_1 = numeric(),
  mean_CI_2 = numeric(),
  P = numeric(),
  stringsAsFactors=FALSE
)


# # Use if want to explore TPM thresholds, eg
# thresh_key <- c(1:20)


thresh_key <- 5 # equiv to ~1 transcript per cell with typical 200,000 transcripts.
# Tissue columns: everything between the first two columns and the trailing
# gene_type and RH columns added above.
i_key <- c(3:(ncol(gtex)-2))


# For every threshold and tissue, compare log2(TPM) of protein-coding RH+ and
# RH- genes, keeping only genes with TPM >= threshold in that tissue.
for(thresh in seq_along(thresh_key)) {

  cat(paste0("\nthresh = ", thresh, "/",length(thresh_key), "\n"))

  for(i in seq_along(i_key)) {

    cat(paste0("\ti = ",i,"/",length(i_key), "\n"))

    # Output row for this threshold/tissue pair and the tissue column used.
    row_id <- (thresh-1)*length(i_key) + i
    col_id <- i_key[i]

    # Subset once per iteration instead of repeating it in every expression.
    above <- gtex[gtex[,col_id] >= thresh_key[thresh],]
    coding <- above$gene_type == "protein_coding"
    tpm_pos <- above[coding & above$RH==1, col_id]
    tpm_neg <- above[coding & above$RH==0, col_id]

    gtex_mean_diff_cr_CI[row_id,"threshold"] <- thresh_key[thresh]
    gtex_mean_diff_cr_CI[row_id,"tissue"] <- colnames(gtex)[col_id]
    gtex_mean_diff_cr_CI[row_id,"nRHpos"] <- length(tpm_pos)
    gtex_mean_diff_cr_CI[row_id,"nRHneg"] <- length(tpm_neg)
    gtex_mean_diff_cr_CI[row_id,"median_diff"] <- median(log2(tpm_pos))-median(log2(tpm_neg))

    # t.test() fails when a group is too small; report the failure and leave
    # the test columns of this row unset rather than stopping the whole scan.
    tryCatch ({

      ans <- t.test(log2(tpm_pos), log2(tpm_neg))

      gtex_mean_diff_cr_CI[row_id,"mean_diff"] <- ans$estimate["mean of x"]-ans$estimate["mean of y"]
      gtex_mean_diff_cr_CI[row_id,"mean_CI_1"] <- ans$conf.int[1]
      gtex_mean_diff_cr_CI[row_id,"mean_CI_2"] <- ans$conf.int[2]
      gtex_mean_diff_cr_CI[row_id,"P"] <- ans$p.value

    }, error = function(e) {cat ("Error on line ", i, ": ", conditionMessage(e),"\n")})

  }
}



# omit pancreas, because insufficient data points for nc genes.
gtex_mean_diff_cr_CI <- gtex_mean_diff_cr_CI[gtex_mean_diff_cr_CI$tissue != "Pancreas",]


# Tissue with the smallest P value.
tissue_cr <- gtex_mean_diff_cr_CI[which.min(gtex_mean_diff_cr_CI$P),"tissue"]

tissue_cr
# [1] "Brain...Substantia.nigra"
# NOTE(review): output above was recorded before the version-suffix fix; re-check.


# Plot data for the selected tissue. The last entry of thresh_key is used,
# which is the value the loop variable `thresh` pointed to after the loop.
thresh_plot <- thresh_key[length(thresh_key)]
cr_pos <- gtex[gtex$gene_type == "protein_coding" & gtex$RH==1, tissue_cr]
cr_neg <- gtex[gtex$gene_type == "protein_coding" & gtex$RH==0, tissue_cr]

gtex2_cr <- rbind(
  data.frame(tpm = log2(cr_pos[cr_pos >= thresh_plot]), RH = factor("1")),
  data.frame(tpm = log2(cr_neg[cr_neg >= thresh_plot]), RH = factor("0"))
)



# No need to cast in long form, as already in long form with only one group factor. However, for consistency with other plots provide "dummy" grp factor for plotting.
gtex2_cr$cell <- "GTEx"
gtex2_cr$cell <- factor(gtex2_cr$cell,levels=c("GTEx"))


gtex2_cr$RH <- factor(gtex2_cr$RH,levels=c("1","0"))


labels_9 <- levels(gtex2_cr$cell)


colores_9 <- gg_color_hue(length(unique(gtex2_cr$RH)))
names(colores_9) <- c("RH+","RH-")


p_vals_9 <- c(
  t.test(gtex2_cr[gtex2_cr$RH==1,"tpm"],gtex2_cr[gtex2_cr$RH==0,"tpm"])$p.value
)


# P value formatted in scientific notation and deparsed to a quoted string,
# because the plot annotation is drawn with parse=TRUE.
fun_label_9 <- c(
  deparse(bquote(.(formatC(p_vals_9[1],format="e",digits=0))))
)


# Position of the P-value label on the plot.
horiz_pos_9 <- c(1)-0.2
# vert_pos_9 <-  c(
				# max(gtex2_cr[,c("log_tissue")],na.rm=TRUE) + 0.2
				# )
vert_pos_9 <- 11



# violin plot
# Panel 9: violin + box plot of log2(TPM) for protein-coding RH+ versus RH-
# genes in the selected GTEx tissue, with the t-test P value on a white box.
p9 <- ggplot(data = gtex2_cr, aes(x = cell, y = tpm, fill = RH)) +
  geom_violin(
    width = 0.5,
    position = position_dodge(width = 0.6),
    alpha = 0.1,
    lwd = 0.1
  ) +
  geom_point(
    data = gtex2_cr,
    mapping = aes(x = cell, y = tpm),
    shape = 16,
    colour = "grey",
    alpha = 1,
    size = 0.1,
    # Vertical jitter only; dodge width matches the violins and boxes.
    position = position_jitterdodge(
      dodge.width = 0.6,
      jitter.width = 0.0,
      jitter.height = 0.1
    )
  ) +
  geom_boxplot(
    width = 0.1,
    position = position_dodge(width = 0.6),
    alpha = 0.1,
    outlier.shape = NA,
    lwd = 0.2,
    fatten = 2,
    show.legend = FALSE
    # notch=TRUE
  ) +
  scale_fill_manual(
    values = as.vector(colores_9),
    labels = names(colores_9),
    name = NULL
  ) +
  guides(
    shape = FALSE,
    fill = guide_legend(
      # fill=NA,
      override.aes = list(shape = NA, size = 0.1),
      ncol = 1,
      byrow = TRUE
    )
  ) +
  # White box behind the P-value label.
  annotate(
    "rect",
    xmin = horiz_pos_9 - 0.2,
    xmax = horiz_pos_9 + 0.4,
    ymin = vert_pos_9 - 1.0,
    ymax = vert_pos_9 + 1,
    fill = "white"
  ) +
  annotate(
    geom = "text",
    x = horiz_pos_9,
    y = vert_pos_9,
    hjust = 0,
    label = fun_label_9,
    parse = TRUE,
    size = 2
  ) +
  theme2 +
  theme(
    plot.margin = unit(c(top=0.5,right=0.5,bottom=0.5,left=0.5), "cm"),
    plot.subtitle = element_text(
      size = 9, face = "plain", hjust = 0.5,
      margin = margin(t = 0, r = 0, b = 6, l = 0)
    ),
    axis.title.x = element_blank(),
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank(),
    legend.position = "none",
    legend.margin = margin(t = 0, r = 0, b = 0, l = 0, unit = "pt"),
    legend.box.margin = margin(t = 0, r = 0, b = 0, l = 0, unit = "pt"),
    legend.key.height = unit(0.3, "cm"),
    legend.key.width = unit(0.3, "cm"),
    legend.spacing.y = unit(0.3, "cm"),
    legend.spacing.x = unit(0.3, "cm"),
    legend.title = element_text(size = 7),
    legend.text = element_text(size = 7),
    legend.title.align = 0.7
  ) +
  scale_x_discrete(
    # labels = labels,
    expand = expand_scale(add = .6)
  ) +
  scale_y_continuous(
    expand = expand_scale(mult = .05)
    # limits=c(-2.2,6.25)
  ) +
  # Zoom without dropping data, so violins and boxes use all points.
  coord_cartesian(ylim = c(2, vert_pos_9)) +
  # x axis title is left at its default (it is blanked by the theme above).
  labs(
    y = expression(log[2]*'('* TPM*')'),
    subtitle = "GTEx cr"
  )
print(p9)


# saveRDS(p9,"p9")
# rm(list=setdiff(ls(),c("theme2","gg_color_hue","size_hline")))



#--------- gtex_nc max P tissue violin plot (10) ------------------------

# GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_median_tpm.gct downloaded from GTEx website on 09/10/19


# Reload the GTEx median-TPM table; the first two lines of the file are skipped.
gtex <- read.delim("GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_median_tpm.gct",sep="\t",stringsAsFactors=FALSE,header=TRUE,check.names=TRUE,fill=FALSE,skip=2)

colnames(gtex)[1] <- "gene_id"

# Remove the version number in gene_id (e.g. "ENSG00000187634.11" ->
# "ENSG00000187634"). The previous pattern "\\..$" only removed versions that
# are a single character long, so ids with a two-digit version kept their
# suffix and could not match unversioned ids downstream.
# NOTE(review): assumes gene_id in gencode_gtf_ensembl_ucsc_v31.txt and
# ensembl_gene_id in growth_loci_unique.txt are unversioned -- confirm.
gtex$gene_id <- sub("\\.[0-9]+$","",gtex$gene_id)


# gtex$mean <- 2^rowMeans(log2(gtex[,c(3:ncol(gtex))]))


gencode_gtf_ensembl_ucsc <- read.delim("gencode_gtf_ensembl_ucsc_v31.txt",header=TRUE,sep="\t",stringsAsFactors=FALSE,check.names=FALSE)

# Add gene_type; merge() keeps only genes present in both tables.
gtex <- merge(gtex, gencode_gtf_ensembl_ucsc[,c("gene_id","gene_type")])


g_unique <- read.delim("growth_loci_unique.txt",sep="\t",stringsAsFactors=FALSE,header=TRUE,check.names=FALSE)



# RH = 1 when the Ensembl gene id is listed in g_unique, otherwise 0.
gtex$RH <- numeric(nrow(gtex))
gtex[gtex$gene_id %in% g_unique$ensembl_gene_id,"RH"] <- 1


# Convert TPM to log2(TPM+1) for viewing
# gtex[,c(3:55)] <- log2(gtex[,c(3:55)] + 1)
# However, led to strange behavior such as median outside 95% CI using t.test for some tissues.
# So thresholded by discarding TPM < 5 vals, ie keeping TPM >= 5 (see below). Similar to microarray spots not above bkgrd being discarded.


# Find most significant tissue
gtex_mean_diff_nc_CI <- data.frame(
  threshold = numeric(),
  tissue = character(),
  nRHpos = numeric(),
  nRHneg = numeric(),
  median_diff = numeric(),
  mean_diff=numeric(),
  mean_CI_1 = numeric(),
  mean_CI_2 = numeric(),
  P = numeric(),
  stringsAsFactors=FALSE
)


# # Use if want to explore TPM thresholds, eg
# thresh_key <- c(1:20)


thresh_key <- 5 # equiv to ~1 transcript per cell with typical 200,000 transcripts.
# Tissue columns: everything between the first two columns and the trailing
# gene_type and RH columns added above.
i_key <- c(3:(ncol(gtex)-2))


# For every threshold and tissue, compare log2(TPM) of non-protein-coding RH+
# and RH- genes, keeping only genes with TPM >= threshold in that tissue.
for(thresh in seq_along(thresh_key)) {

  cat(paste0("\nthresh = ", thresh, "/",length(thresh_key), "\n"))

  for(i in seq_along(i_key)) {

    cat(paste0("\ti = ",i,"/",length(i_key), "\n"))

    # Output row for this threshold/tissue pair and the tissue column used.
    row_id <- (thresh-1)*length(i_key) + i
    col_id <- i_key[i]

    # Subset once per iteration instead of repeating it in every expression.
    above <- gtex[gtex[,col_id] >= thresh_key[thresh],]
    noncoding <- above$gene_type != "protein_coding"
    tpm_pos <- above[noncoding & above$RH==1, col_id]
    tpm_neg <- above[noncoding & above$RH==0, col_id]

    gtex_mean_diff_nc_CI[row_id,"threshold"] <- thresh_key[thresh]
    gtex_mean_diff_nc_CI[row_id,"tissue"] <- colnames(gtex)[col_id]
    gtex_mean_diff_nc_CI[row_id,"nRHpos"] <- length(tpm_pos)
    gtex_mean_diff_nc_CI[row_id,"nRHneg"] <- length(tpm_neg)
    gtex_mean_diff_nc_CI[row_id,"median_diff"] <- median(log2(tpm_pos))-median(log2(tpm_neg))

    # t.test() fails when a group is too small; report the failure and leave
    # the test columns of this row unset rather than stopping the whole scan.
    tryCatch ({

      ans <- t.test(log2(tpm_pos), log2(tpm_neg))

      gtex_mean_diff_nc_CI[row_id,"mean_diff"] <- ans$estimate["mean of x"]-ans$estimate["mean of y"]
      gtex_mean_diff_nc_CI[row_id,"mean_CI_1"] <- ans$conf.int[1]
      gtex_mean_diff_nc_CI[row_id,"mean_CI_2"] <- ans$conf.int[2]
      gtex_mean_diff_nc_CI[row_id,"P"] <- ans$p.value

    }, error = function(e) {cat ("Error on line ", i, ": ", conditionMessage(e),"\n")})

  }
}



# omit pancreas, because insufficient data points for nc genes.
gtex_mean_diff_nc_CI <- gtex_mean_diff_nc_CI[gtex_mean_diff_nc_CI$tissue != "Pancreas",]



# Find most significant nc tissue
# tissue_nc <- gtex_mean_diff_nc_CI[which.min(gtex_mean_diff_nc_CI$P),"tissue"]



# compare most significant cr tissue
tissue_nc <- tissue_cr



tissue_nc
# [1] "Brain...Substantia.nigra"
# NOTE(review): output above was recorded before the version-suffix fix; re-check.


# Plot data for the selected tissue. Indexed by tissue_nc (not tissue_cr) so
# that switching the tissue choice above actually changes the plotted data.
# The last entry of thresh_key is used, which is the value the loop variable
# `thresh` pointed to after the loop.
thresh_plot <- thresh_key[length(thresh_key)]
nc_pos <- gtex[gtex$gene_type != "protein_coding" & gtex$RH==1, tissue_nc]
nc_neg <- gtex[gtex$gene_type != "protein_coding" & gtex$RH==0, tissue_nc]

gtex2_nc <- rbind(
  data.frame(tpm = log2(nc_pos[nc_pos >= thresh_plot]), RH = factor("1")),
  data.frame(tpm = log2(nc_neg[nc_neg >= thresh_plot]), RH = factor("0"))
)



# No need to cast in long form, as already in long form with only one group factor. However, for consistency with other plots provide "dummy" grp factor for plotting.
gtex2_nc$cell <- "GTEx"
gtex2_nc$cell <- factor(gtex2_nc$cell,levels=c("GTEx"))


gtex2_nc$RH <- factor(gtex2_nc$RH,levels=c("1","0"))


labels_10 <- levels(gtex2_nc$cell)


colores_10 <- gg_color_hue(length(unique(gtex2_nc$RH)))
names(colores_10) <- c("RH+","RH-")


p_vals_10 <- c(
  t.test(gtex2_nc[gtex2_nc$RH==1,"tpm"],gtex2_nc[gtex2_nc$RH==0,"tpm"])$p.value
)


# Label is "n.s." when P >= 0.05, otherwise the P value in scientific
# notation deparsed to a quoted string (the annotation uses parse=TRUE).
fun_label_10 <- ifelse(p_vals_10 >= 0.05,
  "n.s.",
  c(deparse(bquote(.(formatC(p_vals_10[1],format="e",digits=0)))))
)


# Position of the P-value label on the plot.
horiz_pos_10 <- c(1)-0.1
# vert_pos_10 <-  c(
				# max(gtex2_nc[,c("log_tissue")],na.rm=TRUE) + 0.2
				# )
vert_pos_10 <- 7



# violin plot
# Panel 10: violin + box plot of log2(TPM) for non-protein-coding RH+ versus
# RH- genes in the selected GTEx tissue, with the P-value label on a white box.
p10 <- ggplot(data = gtex2_nc, aes(x = cell, y = tpm, fill = RH)) +
  geom_violin(
    width = 0.5,
    position = position_dodge(width = 0.6),
    alpha = 0.1,
    lwd = 0.1
  ) +
  geom_point(
    data = gtex2_nc,
    mapping = aes(x = cell, y = tpm),
    shape = 16,
    colour = "grey",
    alpha = 1,
    size = 0.1,
    # Vertical jitter only; dodge width matches the violins and boxes.
    position = position_jitterdodge(
      dodge.width = 0.6,
      jitter.width = 0.0,
      jitter.height = 0.1
    )
  ) +
  geom_boxplot(
    width = 0.1,
    position = position_dodge(width = 0.6),
    alpha = 0.1,
    outlier.shape = NA,
    lwd = 0.2,
    fatten = 2,
    show.legend = FALSE
    # notch=TRUE
  ) +
  scale_fill_manual(
    values = as.vector(colores_10),
    labels = names(colores_10),
    name = NULL
  ) +
  guides(
    shape = FALSE,
    fill = guide_legend(
      # fill=NA,
      override.aes = list(shape = NA, size = 0.1),
      ncol = 1,
      byrow = TRUE
    )
  ) +
  # White box behind the P-value label.
  annotate(
    "rect",
    xmin = horiz_pos_10 - 0.2,
    xmax = horiz_pos_10 + 0.4,
    ymin = vert_pos_10 - 0.5,
    ymax = vert_pos_10 + 1.5,
    fill = "white"
  ) +
  annotate(
    geom = "text",
    x = horiz_pos_10,
    y = vert_pos_10,
    hjust = 0,
    label = fun_label_10,
    parse = TRUE,
    size = 2
  ) +
  theme2 +
  theme(
    plot.margin = unit(c(top=0.5,right=0.5,bottom=0.5,left=0.5), "cm"),
    plot.subtitle = element_text(
      size = 9, face = "plain", hjust = 0.5,
      margin = margin(t = 0, r = 0, b = 6, l = 0)
    ),
    axis.title.x = element_blank(),
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank(),
    legend.position = "none",
    legend.margin = margin(t = 0, r = 0, b = 0, l = 0, unit = "pt"),
    legend.box.margin = margin(t = 0, r = 0, b = 0, l = 0, unit = "pt"),
    legend.key.height = unit(0.3, "cm"),
    legend.key.width = unit(0.3, "cm"),
    legend.spacing.y = unit(0.3, "cm"),
    legend.spacing.x = unit(0.3, "cm"),
    legend.title = element_text(size = 7),
    legend.text = element_text(size = 7),
    legend.title.align = 0.7
  ) +
  scale_x_discrete(
    # labels = labels,
    expand = expand_scale(add = .6)
  ) +
  scale_y_continuous(
    expand = expand_scale(mult = .05)
    # limits=c(-2.2,6.25)
  ) +
  # Zoom without dropping data, so violins and boxes use all points.
  coord_cartesian(ylim = c(2, vert_pos_10)) +
  # x axis title is left at its default (it is blanked by the theme above).
  labs(
    y = expression(log[2]*'('* TPM*')'),
    subtitle = "GTEx nc"
  )
print(p10)


# saveRDS(p10,"p10")
# rm(list=setdiff(ls(),c("theme2","gg_color_hue","size_hline")))





#----------- CRISPRai (10a)----------------------

# crisprai.txt is from Weissman supp data file 1-s2.0-S0092867414011787-mmc3.xlsx

# Load the CRISPRi/a table (first line of the file skipped) and keep columns
# 1, 2 and 4, renamed to Gene, i_gamma and a_gamma.
crisprai <- read.table("crisprai.txt",header=TRUE,sep="\t",stringsAsFactors=FALSE,skip=1,check.names=FALSE)
crisprai <- crisprai[,c(1,2,4)]
colnames(crisprai) <- c("Gene","i_gamma","a_gamma")


crisprai[3257,]
     # Gene a_gamma    i_gamma
# 3257 CTRL     err 0.00700599
# NOTE(review): the recorded header above lists a_gamma before i_gamma, which
# does not match the column order set by colnames() above -- confirm which
# column actually holds the "err" entry.

# get rid of CTRL:
crisprai <- crisprai[crisprai$Gene != "CTRL",]


# i_gamma is converted to numeric here (presumably it was read as character
# because of the "err" entry in the CTRL row -- confirm).
crisprai$i_gamma <- as.numeric(crisprai$i_gamma)


g_unique <- read.delim("growth_loci_unique.txt",sep="\t",stringsAsFactors=FALSE,header=TRUE,check.names=FALSE)


# RH = 1 when the gene symbol is listed in g_unique, otherwise 0.
crisprai$RH <- numeric(nrow(crisprai))
crisprai[crisprai$Gene %in% g_unique$geneSymbol,"RH"] <- 1



# Long form: one row per gene and screen type (a_i = "i" or "a"), with the
# score in `gamma`. Row names are sized from the data (two rows per gene)
# instead of a fixed 1:1e5 pool, so larger inputs cannot run out of names.
crisprai_l <- reshape(crisprai,
  varying = c("i_gamma", "a_gamma"),
  v.names = "gamma",
  timevar = "a_i",
  times = c("i", "a"),
  new.row.names = seq_len(2 * nrow(crisprai)),
  direction = "long")



crisprai_l$a_i <- factor(crisprai_l$a_i, levels=c("i","a"))
crisprai_l$RH <- factor(crisprai_l$RH,levels=c("1","0"))



labels_10a <- levels(crisprai_l$a_i)

colores_10a <- gg_color_hue(length(unique(crisprai$RH)))
names(colores_10a) <- c("RH+","RH-")

# t-tests of gamma between RH+ and RH- genes: [1] for "i", [2] for "a".
p_vals_10a <- c(
  t.test(crisprai_l[crisprai_l$a_i == "i" & crisprai_l$RH==1,c("gamma")], crisprai_l[crisprai_l$a_i == "i" & crisprai_l$RH==0,c("gamma")])$p.value,
  t.test(crisprai_l[crisprai_l$a_i == "a" & crisprai_l$RH==1,c("gamma")], crisprai_l[crisprai_l$a_i == "a" & crisprai_l$RH==0,c("gamma")])$p.value
)



# Labels are "n.s." when P >= 0.05, otherwise the P value in scientific
# notation deparsed to a quoted string (the annotation uses parse=TRUE).
fun_label_10a <- c(
  ifelse(p_vals_10a[1] >= 0.05,
    "n.s.",
    c(deparse(bquote(.(formatC(p_vals_10a[1],format="e",digits=0)))))
  ),
  ifelse(p_vals_10a[2] >= 0.05,
    "n.s.",
    c(deparse(bquote(.(formatC(p_vals_10a[2],format="e",digits=0)))))
  )
)



# Positions of the two P-value labels (one per x category).
horiz_pos_10a <- c(1-0.25,2-0.13)
vert_pos_10a <- c(0.2,0.2)
# vert_pos_10a <-	rep(
					# max(
						# c(
						# max(-crisprai_l[crisprai_l$a_i=="i",c("gamma")]) - 0.3, 
						# max(-crisprai_l[crisprai_l$a_i=="a",c("gamma")]) - 0.3
						# )
					# ),
				# 2
			# )
				
				



# violin plot
# Panel 10a: violin + box plot of -gamma for RH+ versus RH- genes, separately
# for the "i" and "a" screens, with one P-value label per screen.
p10a <- ggplot(data = crisprai_l, aes(x = a_i, y = -gamma, fill = RH)) +
  geom_violin(
    width = 0.5,
    position = position_dodge(width = 0.6),
    alpha = 0.1,
    lwd = 0.1
  ) +
  geom_point(
    data = crisprai_l,
    mapping = aes(x = a_i, y = -gamma),
    shape = 16,
    colour = "grey",
    alpha = 1,
    size = 0.1,
    # Dodged to line up with the violins; no horizontal jitter.
    position = position_jitterdodge(dodge.width = 0.6, jitter.width = 0.0)
  ) +
  geom_boxplot(
    width = 0.1,
    position = position_dodge(width = 0.6),
    alpha = 0.1,
    outlier.shape = NA,
    lwd = 0.2,
    fatten = 2,
    show.legend = FALSE
    # notch=TRUE
  ) +
  scale_fill_manual(
    values = as.vector(colores_10a),
    labels = names(colores_10a),
    name = NULL
  ) +
  guides(
    shape = FALSE,
    fill = guide_legend(
      # fill=NA,
      override.aes = list(shape = NA, size = 0.1),
      ncol = 2,
      byrow = TRUE
    )
  ) +
  # White boxes behind the P-value labels (positions are length-2 vectors).
  annotate(
    "rect",
    xmin = horiz_pos_10a - 0.2,
    xmax = horiz_pos_10a + 0.8,
    ymin = vert_pos_10a - 0.05,
    ymax = vert_pos_10a + 0.05,
    fill = "white"
  ) +
  annotate(
    geom = "text",
    x = horiz_pos_10a,
    y = vert_pos_10a,
    hjust = 0,
    label = fun_label_10a,
    parse = TRUE,
    size = 2
  ) +
  theme2 +
  theme(
    plot.margin = unit(c(top=0.5,right=0.5,bottom=0.5,left=0.5), "cm"),
    plot.subtitle = element_text(
      size = 9, face = "plain", hjust = 0.5,
      margin = margin(t = 0, r = 0, b = 6, l = 0)
    ),
    # x axis title
    axis.title.x = element_text(
      size = 7, face = "plain", hjust = 0.5,
      margin = margin(t = 5, r = 0, b = 0, l = 0)
    ),
    # axis.title.x=element_blank(),
    # axis.text.x=element_blank(),
    # axis.ticks.x=element_blank(),
    legend.position = "none",
    legend.margin = margin(t = 0, r = 0, b = 0, l = 0, unit = "pt"),
    legend.box.margin = margin(t = 0, r = 0, b = 0, l = 0, unit = "pt"),
    legend.key.height = unit(0.3, "cm"),
    legend.key.width = unit(0.3, "cm"),
    legend.spacing.y = unit(0.3, "cm"),
    legend.spacing.x = unit(0.3, "cm"),
    legend.title = element_text(size = 7),
    legend.text = element_text(size = 7),
    legend.title.align = 0.7
  ) +
  scale_x_discrete(
    labels = labels_10a,
    expand = expand_scale(add = .6)
  ) +
  scale_y_continuous(
    expand = expand_scale(mult = .05)
    # trans = log10_trans(),
    # labels=function(x) {x*1e-3},
    # breaks = trans_breaks("log10", function(x) 10^x, n=3),
    # labels = trans_format("log10", math_format(10^.x))
  ) +
  # Zoom without dropping data, so violins and boxes use all points.
  coord_cartesian(ylim = c(-0.2, 0.2)) +
  labs(
    x = "K562",
    y = expression(-gamma),
    subtitle = "CRISPRi/a"
  )
print(p10a)



 
#--------- string_graph (11) ------------------------



# string db v11.0 downloaded 09/11/19 from https://string-db.org/cgi/download.pl?sessionId=i50ty8Z04nR7&species_text=Homo+sapiens
# string is also needed for p12 below.


# STRING protein-protein links (space separated) and the RH growth loci table.
string <- read.table(
  "9606.protein.links.v11.0.txt",
  sep = " ",
  stringsAsFactors = FALSE,
  header = TRUE
)
g_unique <- read.delim(
  "growth_loci_unique.txt",
  sep = "\t",
  stringsAsFactors = FALSE,
  header = TRUE,
  check.names = FALSE
)


# Translate ENSP to ENSG ids
# Downloading translation table from ensembl would take too long: 3 hours for gene1
# # library(biomaRt)
# ensembl <- useDataset("hsapiens_gene_ensembl", mart=useMart("ensembl"))

# # download from ensembl
# gene_tx_1 <- getBM(
				# attributes=c(
						# 'external_gene_name',
						# 'ensembl_gene_id',
						# 'ensembl_transcript_id'
					# ), 
			    # filters = 'ensembl_peptide_id', 
			    # values = string$protein1, 
			    # mart = ensembl
			    # )
			    
			    
			    
# # Can use translation table provided by string db, human.name_2_string.tsv
# names <-read.table("human.name_2_string.tsv",sep="\t",stringsAsFactors=FALSE,header=FALSE)
# colnames(names) <- c("version","geneSymbol","ensembl_pr_id")
			    

# Best solution			    
# To translate used 9606.protein.aliases.v11.0 
# downloaded 09/11/19 from 
# https://string-db.org/cgi/download.pl?sessionId=i50ty8Z04nR7&species_text=Homo+sapiens 




# STRING alias table; the first line of the file is skipped and the three
# columns are named by hand below.
aliases <- read.delim(
  "9606.protein.aliases.v11.0.txt",
  sep = "\t",
  stringsAsFactors = FALSE,
  header = FALSE,
  fill = TRUE,
  skip = 1
)

# colnames provided by "9606.protein.aliases.v11.0.txt", but blanked out by ## symbols
names(aliases) <- c("string_protein_id", "alias", "source")

# Keep only aliases that start with "ENSG".
is_ensg <- grepl("^ENSG", aliases$alias)
aliases <- aliases[is_ensg, ]








# takes a few mins
# Attach the ENSG alias for protein1: after the merge the alias is column 4,
# which is renamed gene1, then the protein1 key (column 1) is dropped.
# takes a few mins
string <- merge(
  string,
  aliases[, c("string_protein_id", "alias")],
  by.x = "protein1",
  by.y = "string_protein_id"
)
names(string)[4] <- "gene1"
string <- string[, -1]


# Same for protein2 -> gene2.
# takes a few mins
string <- merge(
  string,
  aliases[, c("string_protein_id", "alias")],
  by.x = "protein2",
  by.y = "string_protein_id"
)
names(string)[4] <- "gene2"
string <- string[, -1]


# The remaining first column is the link score.
names(string)[1] <- "score"


# Score cut-offs scanned by the (commented-out) loop below, and their indices.
scor <- seq(from = 150, to = 1000, by = 5)
seq <- seq_along(scor)


# # Go to START HERE to save time, otherwise, run below
# # DO NOT DELETE
# # Takes about 10 min

# string_ans <- data.frame(
						# score = numeric(), 
						# mean_rh_plus =  numeric(), 
						# sem_rh_plus = numeric(), 
						# mean_rh_minus =  numeric(), 
						# sem_rh_minus = numeric(), 
						# t = numeric(), 
						# # df = numeric(), 
						# P = numeric()
						# )
						
						
# sem <- function(x) {sqrt(var(x,na.rm=TRUE)/sum(!is.na(x)))}


# # Loop ends in error message, but ok, file still written.

# for(i in seq) {
# print(i)

# string_shave <- string[string$score >= scor[i], ]

# string_agg_1 <- aggregate(rep(1, length(string_shave$gene1)), by=list(string_shave$gene1), sum)
# colnames(string_agg_1)<-c("Gene","edge1")

# string_agg_2 <- aggregate(rep(1, length(string_shave$gene2)), by=list(string_shave$gene2), sum)
# colnames(string_agg_2)<-c("Gene","edge2")

# string_agg_3 <- merge(string_agg_1,string_agg_2)
# string_agg_3$sum_edge<-string_agg_3$edge1+string_agg_3$edge2

# # edge1 == edge2 because string table is completely symmetrical. Therefore use either edge1 or edge2 arbitrarily

# a <- string_agg_3[string_agg_3$Gene %in% g_unique$ensembl_gene_id,"edge1"]
# b <- string_agg_3[!(string_agg_3$Gene %in% g_unique$ensembl_gene_id),"edge1"]
# test <- t.test(a,b)

# string_ans[i,] <- c(
					# scor[i],
					# mean(a),
					# sem(a),
					# mean(b),
					# sem(b),
					# test$statistic,
					# # test$parameter,
					# test$p.value
# )

# }


# string_ans$q <- p.adjust(string_ans$P, method = "BH")

# write.table(string_ans,"string_ans.txt",quote=FALSE,sep="\t",row.names=FALSE)

# START HERE if desired to save time
# Load the threshold scan previously written to string_ans.txt
string_ans <- read.table(
	"string_ans.txt",
	header=TRUE,
	sep="\t",
	stringsAsFactors=FALSE
	)


# Largest P value among rows with q < 0.05; p11 draws it (as -log10) as the "FDR = 0.05" line
h_line_11 <- max(string_ans$P[string_ans$q < 0.05])


# Line plot of -log10(P) against the score threshold in string_ans, with a red
# horizontal line at -log10(h_line_11) labelled "FDR = 0.05" in the legend.
p11 <- ggplot() + 
		theme2 + 
		geom_line(
			data=string_ans, 
			mapping=aes(x=score, y= -log10(P)),
			colour="black",
			lwd=0.2,
			show.legend=FALSE
			) +
		# linetype is mapped to a constant string so that the line gets a legend entry
		geom_hline(
			mapping=aes(yintercept = -log10(h_line_11), linetype = "FDR = 0.05"),
			color = "red", 
			size=size_hline
			) +
		scale_linetype_manual(
			name = NULL, 
			values = c(1), 
			guide = guide_legend(
						override.aes = list(color = c("red"), size=size_hline)
						)
			) +
		# overrides on top of theme2 (which hides the legend by default)
		theme(
			legend.key=element_blank(),
			plot.margin = unit(c(top=0.5,right=0.5,bottom=0.5,left=0.5), "cm"),
			plot.subtitle = element_text(size=9, face="plain", hjust = 0.5, margin=margin(0,0,6,0)),
			axis.title.x = element_text(size=7, face="plain", hjust = 0.5, margin=margin(5,0,0,0)), #title of x axis
			legend.position = c(0.3,0.9), 
			legend.title = element_text(size = 6),  
			legend.title.align=0.7,
			legend.text = element_text(size = 6),
			legend.key.height = unit(0.6, "cm"),
			legend.key.width = unit(0.3, "cm"),
			legend.spacing.x = unit(0.3, 'cm'),
			legend.spacing.y = unit(0.6, 'cm'),
			legend.margin=margin(t=0,r=0,b=0,l=0,unit = "pt"),
			legend.box.margin=margin(t=0,r=0,b=0,l=0,unit = "pt")
			) +
		# scale_x_continuous(breaks = c(0,1,2,3,4,6), labels = c(0,1,2,3,4,6)) + 
		labs(
			subtitle="Protein Ix",
			x="Score",
			y=expression(-log[10]*italic('P'))
			)
print(p11)


# saveRDS(p11,"p11")
# do not delete string files because needed for p12 violin plot below
# rm(list=setdiff(ls(),c("theme2","gg_color_hue","size_hline")))



#---------string_summary (12)------------------------

# use dataframes from code above in string_graph (11)


# Visual inspection of string_ans shows that the score threshold giving the max -logP in the first peak is score = 160, i = 3
scor[3]
# [1] 160



# Index i of the most significant P value (which lies in the 2nd peak)
which.min(string_ans$P)
# [1] 170 # confusingly close to the score 160 above, but this is an index, and it is correct


# STRING combined score threshold at that index
string_ans$score[which.min(string_ans$P)]
# [1] 995
# [1] 995



# Long-format table for p12: one row per gene and threshold, holding
# RH (1 = gene id found in g_unique$ensembl_gene_id, 0 = not found),
# the gene's edge count, and the score threshold used.
# NOTE(review): g_unique must already exist here; in this part of the file it
# is only read in the dup_genes section further down — confirm it is loaded earlier.
string_peaks <- data.frame(RH = numeric(), edges = numeric(), score = numeric())



# i = 3 is the first peak (picked by eye); which.min() gives the most significant threshold
for(i in c(3,which.min(string_ans$P))) {

	print(i)

	# rows of string scoring at or above the i-th threshold
	string_shave <- string[string$score >= scor[i], ]

	# number of rows in which each gene appears as gene1
	string_agg_1 <- aggregate(rep(1, nrow(string_shave)), by=list(string_shave$gene1), sum)
	names(string_agg_1) <- c("Gene","edge1")

	# number of rows in which each gene appears as gene2
	string_agg_2 <- aggregate(rep(1, nrow(string_shave)), by=list(string_shave$gene2), sum)
	names(string_agg_2) <- c("Gene","edge2")

	string_agg_3 <- merge(string_agg_1,string_agg_2)
	string_agg_3$sum_edge <- string_agg_3$edge1 + string_agg_3$edge2

	# edge1 == edge2 because string table is completely symmetrical. Therefore use either edge1 or edge2 arbitrarily

	in_rh <- string_agg_3$Gene %in% g_unique$ensembl_gene_id
	a <- string_agg_3[in_rh,"edge1"]
	b <- string_agg_3[!in_rh,"edge1"]
	RH <- rep(c(1,0), times=c(length(a),length(b)))
	edges <- c(a,b)
	score <- scor[i]

	# the variable names become the column names RH, edges, score
	string_peaks_temp <- cbind(RH,edges,score)
	string_peaks <- rbind(string_peaks,string_peaks_temp)

}




# No need to cast in long form, as already in long form.

# Score levels are taken from the thresholds actually present in string_peaks
# (in the order they were added), not typed in by hand, so they stay in step
# with which.min(string_ans$P) if string_ans changes.
string_peaks$score <- factor(
		string_peaks$score, 
		levels=as.character(unique(string_peaks$score))
		)
string_peaks$RH <- factor(string_peaks$RH,levels=c("1","0"))


labels_12 <- levels(string_peaks$score)

colores_12 <- gg_color_hue(length(unique(string_peaks$RH)))
names(colores_12) <- c("RH+","RH-")

# One t-test (t.test() defaults) of edge counts per score level: RH == 1 vs RH == 0
p_vals_12 <- vapply(
		labels_12,
		function(lv) {
			at_lv <- string_peaks$score == lv
			t.test(
				string_peaks[at_lv & string_peaks$RH==1,c("edges")], 
				string_peaks[at_lv & string_peaks$RH==0,c("edges")]
				)$p.value
			},
		numeric(1),
		USE.NAMES=FALSE
		)
# p_exps <- -ceiling(log10(p_vals))


# P values formatted as e.g. 1e-10 and deparsed to quoted strings, because the
# labels are passed to annotate(..., parse=TRUE)
fun_label_12 <- vapply(
		p_vals_12,
		function(p) deparse(bquote(.(formatC(p,format="e",digits=0)))),
		character(1),
		USE.NAMES=FALSE
		)




# x position of each label: slightly left of the centre of its score level
horiz_pos_12 <- seq_along(labels_12)-0.3
# vert_pos_12 <-  c(
				# max(string_peaks[string_peaks$score==160,c("edges")]) + 0.3, 
				# max(string_peaks[string_peaks$score==980,c("edges")]) + 0.3
				# )
				
				
# y position of the labels (same value as the upper ylim used in p12)
vert_pos_12 <- 10^4.5


# violin plot
# Edge counts per gene (log10 y axis) at each score level in string_peaks,
# split by RH, drawn as violin + points + boxplot, with the t-test P value
# (fun_label_12) printed at the top of each score level.
p12 <- ggplot(
		data = string_peaks, 
		aes(
				x = score, 
				y = edges,
				fill = RH
				)
			) + 
		geom_violin(
			width=0.5, 
			position = position_dodge(width=0.6), alpha=0.1,
			lwd=0.1
			) +
		geom_point(
			data= string_peaks,
			mapping=aes(x=score,y=edges),
			shape=16, 
			colour="grey",
			alpha=1,
			size=0.1,
			position=position_jitterdodge(dodge.width=0.6,jitter.width=0.0)
			) +
		geom_boxplot(
			width=0.1,
			position= position_dodge(0.6), alpha=0.1,
			outlier.shape=NA,
			lwd=0.2,
			fatten=2,
			show.legend = FALSE#,
			#notch=TRUE
			)  +
		scale_fill_manual(
			values=as.vector(colores_12),
			labels=names(colores_12),
			name=NULL		
			)+
		guides(
			shape=FALSE,
	 		fill = guide_legend(
				 		override.aes = list(
				 		#fill=NA,
				 		shape=NA,
				 		size=0.1
				 		),
			 		ncol=2,
			 		byrow=TRUE
			 		)
	 		 ) +
		# White box behind each P-value label. The y axis is log10, so the box is
		# sized with multiplicative factors (as is done for p13); adding or
		# subtracting 10^0.3 / 10^0.2 around 10^4.5 gives a box of practically
		# zero height on this axis.
		annotate(
		    "rect",
		    xmin=horiz_pos_12-0.2,
		    xmax=horiz_pos_12+0.4, 
		    ymin=vert_pos_12*10^-0.3, 
		    ymax=vert_pos_12*10^0.2, 
		    fill="white"
		    ) +
		# P-value labels; parse=TRUE because fun_label_12 holds deparsed (quoted) strings
 		annotate(
	 		geom="text", 
	 		x=horiz_pos_12, 
	 		y=vert_pos_12, 
	 		hjust=0,
	 		label= fun_label_12, 
	 		parse=TRUE, 
	 		size=2
	 		) +
 		theme2 + 
   		theme(
		 	plot.margin = unit(c(top=0.5,right=0.5,bottom=0.5,left=0.5), "cm"),
		 	plot.subtitle = element_text(size=9, face="plain", hjust = 0.5, margin=margin(0,0,6,0)),
		 	axis.title.x = element_text(size=7, face="plain", hjust = 0.5, margin=margin(5,0,0,0)), #titles of x and y axes
		 	# axis.title.x=element_blank(),
	        # axis.text.x=element_blank(),
	        # axis.ticks.x=element_blank(),
	 		legend.position="none",
			legend.margin=margin(t=0,r=0,b=0,l=0,unit = "pt"),
		 	legend.box.margin=margin(t=0,r=0,b=0,l=0,unit = "pt"),
		 	legend.key.height = unit(0.3, "cm"),
			legend.key.width = unit(0.3, "cm"),
		 	legend.spacing.y = unit(0.3, 'cm'),
		 	legend.spacing.x = unit(0.3, 'cm'),
		 	legend.title = element_text(size = 7),  
		 	legend.text = element_text(size = 7),
	 		legend.title.align=0.7
 			) +
		scale_x_discrete(
			labels = labels_12, 
			expand = expand_scale(add = .6)
			) +
		# log10 y axis with tick labels written as powers of ten
		scale_y_continuous(
			expand = expand_scale(mult = .05), 
			trans = log10_trans(), 
			# labels=function(x) {x*1e-3},
			breaks = trans_breaks("log10", function(x) 10^x, n=3),
			labels = trans_format("log10", math_format(10^.x))
			) +
		# zoom only (no data dropped); upper limit equals the label height vert_pos_12
		coord_cartesian(ylim = c(1,10^4.5)) +
		xlab("Score") + 
		ylab(expression(Edges)) + 
		labs(subtitle="Prot Ix")
print(p12)


# saveRDS(p12,"p12")
# rm(list=setdiff(ls(),c("theme2","gg_color_hue","size_hline")))



# # --------- if want to explore landscape dup_genes (13.1) ---------------

# dup2_ans <- data.frame(
						# NB_Genes = numeric(), 
						# mean_rh_plus =  numeric(), 
						# sem_rh_plus = numeric(), 
						# mean_rh_minus =  numeric(), 
						# sem_rh_minus = numeric(), 
						# D = numeric(), 
						# # df = numeric(), 
						# P = numeric()
						# )
						
						
# sem <- function(x) {sqrt(var(x,na.rm=TRUE)/sum(!is.na(x)))}


# scor <- seq(from = 1, to = max(dup2$NB_Genes), by = 1)
# seq <- seq_along(seq(from = 1, to = max(dup2$NB_Genes), by = 1) )



# # Loop ends in error message, file still written.
# # Error message means that loop halts before highest NB_Genes are explored, because gap in calculable values.
# # Needs to be encased in tryCatch to explore highest NB_Genes.

# for(i in seq) {
# print(i)

# dup2_shave <- dup2[dup2$NB_Genes >= scor[i], ]

# a <- dup2_shave[dup2_shave$gene_id %in% g_unique$ensembl_gene_id,"NB_Genes"]
# b <- dup2_shave[!(dup2_shave$gene_id %in% g_unique$ensembl_gene_id),"NB_Genes"]
# test <- ks.test(a,b)

# dup2_ans[i,] <- c(
					# scor[i],
					# mean(a),
					# sem(a),
					# mean(b),
					# sem(b),
					# test$statistic,
					# # test$parameter,
					# test$p.value
# )

# }

# dup2_ans$q <- p.adjust(dup2_ans$P, method = "BH")
# h_line <- max(dup2_ans[dup2_ans$q < 0.05,]$P)

# p13.1 <- ggplot() + 
		# theme2 + 
		# theme(legend.key=element_blank()) +
		# geom_line(
			# data= dup2_ans, 
			# lwd=0.2,
			# colour="black",
			# show.legend=FALSE,
			# aes(
				# x=NB_Genes, 
				# y= -log10(P)
				# )
			# ) +
		# geom_hline(
			# color = "red", 
			# size=size_hline,
			# aes(
				# yintercept = -log10(h_line), 
				# linetype = "FDR = 0.05"
				# )
			# ) +
		# scale_linetype_manual(
			# name = NULL, 
			# values = c(1), 
			# guide = guide_legend(
						# override.aes = list(
											# color = c("red"),
											# size=size_hline
											# )
							# )
			# ) +
		# theme(
		 	# plot.margin = unit(c(1.2,1.5,1.2,0.9), "cm"),
	 		# legend.position = c(0.8,0.9), 
			# legend.margin=margin(t=0,r=0,b=0,l=0,unit = "pt"),
		 	# legend.box.margin=margin(t=0,r=0,b=0,l=0,unit = "pt"),
		 	# legend.key.height = unit(0.6, "cm"),
			# legend.key.width = unit(0.3, "cm"),
		 	# legend.spacing.y = unit(0.6, 'cm'),
		 	# legend.spacing.x = unit(0.3, 'cm'),
		 	# legend.title = element_text(size = 8),  
		 	# legend.text = element_text(size = 8),
	 		# legend.title.align=0.7
 			# ) +
		# labs(subtitle="Gene \nduplicates") +
		# xlab("Paralogs") + 
		# # scale_x_continuous(breaks = c(0,1,2,3,4,6), labels = c(0,1,2,3,4,6)) + 
		# ylab(expression('-log'[10]*italic('P'))) 
# print(p13.1)

      

#--------- dup_genes (13) ------------------------


# Input tables for the duplicated-genes panel
dup <- read.delim(
	"dgd_Hsa_all_v71.tsv",
	header=TRUE,
	sep="\t",
	check.names=FALSE,
	stringsAsFactors=FALSE
	)
g_unique <- read.delim(
	"growth_loci_unique.txt",
	header=TRUE,
	sep="\t",
	check.names=FALSE,
	stringsAsFactors=FALSE
	)
gencode_gtf_ensembl_ucsc <- read.delim(
	"gencode_gtf_ensembl_ucsc_v31.txt",
	header=TRUE,
	sep="\t",
	check.names=FALSE,
	stringsAsFactors=FALSE
	)

# Left join: every row of gencode_gtf_ensembl_ucsc is kept; NB_Genes is NA
# where the gene_id has no matching ENS_ID in dup
dup2 <- merge(
	gencode_gtf_ensembl_ucsc,
	dup[c("NB_Genes","ENS_ID")],
	by.x="gene_id",
	by.y="ENS_ID",
	all.x=TRUE
	)


# Only ~6% genes have paralogs
# The "# [1] ..." lines record the output of a previous run. The ratio and the
# mapped-row count are computed from the tables instead of being typed in, so
# they cannot go stale when the input files change.

dim(gencode_gtf_ensembl_ucsc)
# [1] 58721    19

dim(dup)
# [1] 3543    9

# rows in dup as a fraction of rows in gencode_gtf_ensembl_ucsc
nrow(dup)/nrow(gencode_gtf_ensembl_ucsc)
# [1] 0.06033617

dim(dup2)
# [1] 58721    20

sum(is.na(dup2$NB_Genes))
# [1] 55224

# slight attrition mapping genes from dup to gencode_gtf_ensembl_ucsc
# (rows of dup2 that did receive an NB_Genes value)
sum(!is.na(dup2$NB_Genes))
# [1] 3497



# Genes without an entry in dup have NA in NB_Genes: replace those NAs with 1
dup2$NB_Genes[is.na(dup2$NB_Genes)] <- 1


# RH: 1 if the gene_id is listed in g_unique$ensembl_gene_id, otherwise 0
dup2$RH <- as.numeric(dup2$gene_id %in% g_unique$ensembl_gene_id)



# Data already in long form with only one group factor. For consistency with the other plots, a constant "dummy" grp factor is provided for the x axis.
dup2$cell <- factor(rep("Paralogs", nrow(dup2)), levels=c("Paralogs"))


dup2$RH <- factor(dup2$RH, levels=c("1","0"))


labels_13 <- levels(dup2$cell)


colores_13 <- gg_color_hue(length(unique(dup2$RH)))
names(colores_13) <- c("RH+","RH-")



# Restrict test to duplicated genes, ie rows with NB_Genes != 1
is_dup_13 <- dup2$NB_Genes != 1
p_vals <- t.test(
			dup2$NB_Genes[is_dup_13 & dup2$RH==1], 
			dup2$NB_Genes[is_dup_13 & dup2$RH==0]
			)$p.value



# P value formatted as e.g. 1e-10 and deparsed to a quoted string, because the
# label is passed to annotate(..., parse=TRUE)
fun_label_13 <- deparse(bquote(.(formatC(p_vals[1],format="e",digits=0))))

# x position of the label
horiz_pos_13 <- 1-0.225

# y position of the label: a factor 10^0.2 above the largest NB_Genes
vert_pos_13 <- max(dup2$NB_Genes, na.rm=TRUE) * 10^0.2


# violin plot of the rows with NB_Genes != 1
# NB_Genes (log10 y axis) split by RH, drawn as violin + points + boxplot, with
# the t-test P value (fun_label_13) printed at the top of the panel.

p13 <- ggplot(
		data = dup2[dup2$NB_Genes !=1,], 
		mapping = aes(x = cell, y = NB_Genes, fill = RH)
		) + 
		geom_violin(
			position = position_dodge(width=0.6), 
			width=0.5, 
			alpha=0.1,
			lwd=0.1
			) +
		# points use the plot-level data and x/y/fill mapping
		geom_point(
			position=position_jitterdodge(
						dodge.width=0.6,
						jitter.width=0.0,
						jitter.height=0.1
						),
			shape=16, 
			colour="grey",
			size=0.1,
			alpha=1
			) +
		geom_boxplot(
			position= position_dodge(0.6), 
			width=0.1,
			alpha=0.1,
			lwd=0.2,
			fatten=2,
			outlier.shape=NA,
			show.legend = FALSE#,
			#notch=TRUE
			)  +
		scale_fill_manual(
			name=NULL,
			values=as.vector(colores_13),
			labels=names(colores_13)
			)+
		guides(
			shape=FALSE,
			fill = guide_legend(
					override.aes = list(
						#fill=NA,
						shape=NA,
						size=0.1
						),
					ncol=1,
					byrow=TRUE
					)
			) +
		# white box behind the P-value label; multiplicative offsets because the y axis is log10
		annotate(
			"rect",
			xmin=horiz_pos_13-0.2,
			xmax=horiz_pos_13+0.6, 
			ymin=vert_pos_13*10^-0.2, 
			ymax=vert_pos_13*10^0.1, 
			fill="white"
			) +
		# P-value label; parse=TRUE because fun_label_13 is a deparsed (quoted) string
		annotate(
			geom="text", 
			x=horiz_pos_13, 
			y=vert_pos_13, 
			label= fun_label_13, 
			parse=TRUE, 
			hjust=0,
			size=2
			) +
		theme2 + 
		theme(
			plot.margin = unit(c(top=0.5,right=0.5,bottom=0.5,left=0.5), "cm"),
			plot.subtitle = element_text(size=9, face="plain", hjust = 0.5, margin=margin(0,0,6,0)),
			# single dummy category on the x axis: hide its title, text and ticks
			axis.title.x=element_blank(),
			axis.text.x=element_blank(),
			axis.ticks.x=element_blank(),
			legend.position="none",
			legend.title = element_text(size = 7),  
			legend.title.align=0.7,
			legend.text = element_text(size = 7),
			legend.key.height = unit(0.3, "cm"),
			legend.key.width = unit(0.3, "cm"),
			legend.spacing.x = unit(0.3, 'cm'),
			legend.spacing.y = unit(0.3, 'cm'),
			legend.margin=margin(t=0,r=0,b=0,l=0,unit = "pt"),
			legend.box.margin=margin(t=0,r=0,b=0,l=0,unit = "pt")
			) +
		scale_x_discrete(
			# labels = labels, 
			expand = expand_scale(add = .6)
			) +
		# log10 y axis with tick labels written as powers of ten
		scale_y_continuous(
			trans = log10_trans(), 
			expand = expand_scale(mult = .05),
			# labels=function(x) {x*1e-3},
			breaks = trans_breaks("log10", function(x) 10^x, n=3),
			labels = trans_format("log10", math_format(10^.x))
			) +
		# zoom only; upper limit equals the label height vert_pos_13
		coord_cartesian(ylim = c(1,vert_pos_13)) +
		# xlab("Cell lines") + 
		labs(
			y=expression(Paralogs),
			subtitle="Gene dups"
			)
print(p13)

# saveRDS(p13,"p13")
# rm(list=setdiff(ls(),c("theme2","gg_color_hue","size_hline")))




#------------------Make file --------------------------


# for(i in c(1:14)) {
	# assign(paste0("p",i), readRDS(paste0("p",i)))
	# }
	
	
# Assemble the composite figure on a cowplot canvas. x, y, width and height
# are passed to draw_plot() / draw_plot_label() as fractions of the canvas.
# The outer parentheses make the assignment also display the figure.
(p_comp <- ggdraw() +
	# row 1
	draw_plot(plot = p1, x = 0.0, y = 0.76, width = 0.38, height = 0.25) +
	draw_plot(plot = p2, x = 0.41, y = 0.78, width = 0.2, height = 0.23) +
	# draw_plot(p3, x = 0.69, y = 0.78, width = 0.2, height = 0.23) + 
	draw_plot(plot = p4, x = 0.62, y = 0.75, width = 0.38, height = 0.25) + 

	# row 2, plus the shared legend
	draw_plot(plot = p5, x = 0.0, y = 0.51, width = 0.38, height = 0.25) +
	# draw_plot(p6, x = 0.8, y = 0.54, width = 0.2, height = 0.23) +
	draw_plot(plot = p7, x = 0.41, y = 0.49, width = 0.38, height = 0.25) +
	draw_plot(plot = legend, x = 0.88, y = 0.69, width = 1, height = 1) +

	# row 3
	draw_plot(plot = p10a, x = 0.0, y = 0.24, width = 0.25, height = 0.27) +
	draw_plot(plot = p11, x = 0.29, y = 0.24, width = 0.38, height = 0.25) +
	draw_plot(plot = p12, x = 0.7, y = 0.24, width = 0.25, height = 0.27) +

	# row 4
	draw_plot(plot = p8, x = 0.0, y = 0.0, width = 0.2, height = 0.23) +
	draw_plot(plot = p9, x = 0.25, y = 0.0, width = 0.2, height = 0.23) +
	draw_plot(plot = p10, x = 0.5, y = 0.0, width = 0.2, height = 0.23) +
	draw_plot(plot = p13, x = 0.75, y = 0.0, width = 0.2, height = 0.23) +

	# Panel letters, listed row by row. The 6th entry is an empty label with
	# size 0 (a placeholder that draws nothing).
	draw_plot_label(
		label = c(
				"A", "B", "C", 
				"D", "E", "",  
				"F", "G", "H",
				"I", "J", "K", "L"
				), 
		x = c(
				0.0, 0.41, 0.62, 
				0.0, 0.41, 0.44, 
				0.0, 0.29, 0.7, 
				0.0, 0.25, 0.5, 0.75
				), 
		y = c(rep(1.0, 3), rep(0.75, 3), rep(0.5, 3), rep(0.23, 4)), 
		size = c(rep(14, 5), 0, rep(14, 7))
		)
)


# Write the composite figure to file in each output format (7.5 x 7.5 in).
# print() is called explicitly, as for the other plots in this file: a bare
# `p_comp` is only drawn through top-level auto-printing, so the files would
# come out empty when the script is run with source() or from inside a function.

pdf("crispr_2.pdf",width=7.5,height=7.5,useDingbats=FALSE)
print(p_comp)
dev.off()


tiff("crispr_2.tif",width=7.5,height=7.5,units="in",res=300)
print(p_comp)
dev.off()

png("crispr_2.png", width=7.5, height=7.5,units="in",res=300)
print(p_comp)
dev.off()


# same figure at 1200 dpi
png("crispr_hi_res_2.png", width=7.5, height=7.5,units="in",res=1200)
print(p_comp)
dev.off()




















