library(pROC)
library(ggplot2)
library(cowplot) #used with plot_grid 
library(scales)
library(plotROC)
library("biomaRt")

# ----------------- NOTE -----------------------------------

# "roc.txt" under roc curve images, p13, is archived. Can use archived file if want to save some time and bypass the somewhat slow roc code for p13.
# However, not necessary. Even without using roc.txt, whole thing takes ~20 min.



# --------------- function ---------------------------


#' Extract the legend grob from a ggplot object.
#'
#' Builds the plot, converts it to a gtable and returns the grob whose name is
#' "guide-box".
#'
#' @param myggplot A ggplot object whose legend is actually drawn (a plot with
#'   the legend switched off has no "guide-box" grob).
#' @return The first grob named "guide-box" in the rendered gtable.
get_legend <- function(myggplot) {
  tmp <- ggplot_gtable(ggplot_build(myggplot))
  # vapply (not sapply) always yields a character vector; a grob without a
  # name becomes NA and is simply never matched
  grob_names <- vapply(
    tmp$grobs,
    function(x) as.character(x$name)[1],
    character(1)
  )
  leg <- which(grob_names == "guide-box")
  if (length(leg) == 0) {
    stop("no 'guide-box' grob found: the plot has no visible legend", call. = FALSE)
  }
  # `[[` with a vector of positions would index recursively, so use the first hit
  tmp$grobs[[leg[1]]]
}




#----------------Aesthetics ---------------------------

# base style axes: https://stackoverflow.com/questions/25327694/how-to-tweak-the-extent-to-which-an-axis-is-drawn-in-ggplot2

# Shared base theme added to every panel: blank panel background and grid,
# no legend, thin (0.1) black axis lines and ticks, small axis fonts and a
# centred subtitle. Individual plots override single elements afterwards.
theme2 <- theme(
  # canvas
  panel.background = element_blank(),
  panel.grid.major = element_blank(),
  panel.grid.minor = element_blank(),
  plot.margin = unit(c(t = 0, r = 0, b = 0, l = 0), "cm"),
  legend.position = "none",
  # axis lines and ticks
  axis.line.x = element_line(colour = "black", size = 0.1),
  axis.line.y = element_line(colour = "black", size = 0.1),
  axis.ticks = element_line(colour = "black", size = 0.1),
  # axis text
  axis.text = element_text(size = 6), # numbers on tick marks of x and y axes
  axis.title = element_text(size = 7), # titles of x and y axes
  # margin() takes top, right, bottom, left
  axis.title.x = element_text(margin = margin(t = 13, r = 0, b = 0, l = 0)), # space above the x axis title
  axis.title.y = element_text(margin = margin(t = 0, r = 8, b = 0, l = 0)), # space to the right of the y axis title
  # plot.title = element_text(size=32, face="bold", hjust = -0.14), #can provide "A","B", by ggtitle, but used plot_grid wch can shift more left
  plot.subtitle = element_text(
    size = 9,
    face = "plain",
    hjust = 0.5, # 0.5 centres the subtitle; larger values shift it right
    margin = margin(t = 0, r = 0, b = 13, l = 0)
  )
)

#' Generate `n` evenly spaced HCL colours (luminance 65, chroma 100).
#'
#' The name suggests it is meant to reproduce ggplot2's default discrete hue
#' palette.
#'
#' @param n Number of colours wanted (non-negative integer).
#' @return A character vector of `n` hex colour strings; empty when `n` is 0.
gg_color_hue <- function(n) {
  # n + 1 hues from 15 to 375 degrees; the last one (375 = 15 + 360) repeats
  # the first and is dropped below
  hues <- seq(15, 375, length.out = n + 1)
  # seq_len(n) rather than 1:n, because 1:0 is c(1, 0) and would return one
  # colour when n is 0
  hcl(h = hues, l = 65, c = 100)[seq_len(n)]
}

# Line-width constant; presumably for horizontal reference lines in later plots.
# NOTE(review): not referenced in the sections visible here - confirm it is still used.
size_hline <- 0.2


# balloon_scale <- 5 # inflation factor for significant points	
# size_point <- 0.1*(1 + balloon_scale*(logP$log10p_g_avg/max(logP$log10p_g_avg))) # scale significant points



# #--------- gnomad (14) ------------------------

# download gnomad.v2.1.1.lof_metrics.by_transcript.txt.bgz from gnomad website (https://gnomad.broadinstitute.org/downloads) on 09/12/19
# mv gnomad.v2.1.1.lof_metrics.by_transcript.txt.bgz gnomad.v2.1.1.lof_metrics.by_transcript.txt.gz
# unzipped to give gnomad.v2.1.1.lof_metrics.by_transcript.txt


# gnomad_by_tx <-read.table("gnomad.v2.1.1.lof_metrics.by_transcript.txt",sep="\t",stringsAsFactors=FALSE,header=TRUE)

# dim(gnomad_by_tx)
# # [1] 80950    78



# dim(gnomad_by_tx[gnomad_by_tx$canonical=="true",])
# # [1] 19704    78


# download gnomad.v2.1.1.lof_metrics.by_gene.txt.bgz from gnomad website (https://gnomad.broadinstitute.org/downloads) on 09/12/19
# mv gnomad.v2.1.1.lof_metrics.by_gene.txt.bgz gnomad.v2.1.1.lof_metrics.by_gene.txt.gz
# unzipped to give gnomad.v2.1.1.lof_metrics.by_gene.txt
# gnomad.v2.1.1.lof_metrics.by_gene.txt appears to be gnomad.v2.1.1.lof_metrics.by_transcript.txt subsetted using gnomad_by_tx[gnomad_by_tx$canonical=="true",]


# gnomAD v2.1.1 LoF metrics table (tab-delimited with a header row)
gnomad <- read.table(
  file = "gnomad.v2.1.1.lof_metrics.by_gene.txt",
  header = TRUE,
  sep = "\t",
  stringsAsFactors = FALSE
)

# interactive sanity check of the table size
dim(gnomad)
# [1] 19704    77


# # convert ensembl tx ids to ensembl gene ids, if access to internet
# # DO NOT DELETE

# ensembl = useMart("ensembl",dataset="hsapiens_gene_ensembl")

# translate <- getBM(attributes=c(
				# 'ensembl_transcript_id', 
				# 'ensembl_gene_id'
				# ), 
		      # filters = 'ensembl_transcript_id', 
		      # values = gnomad$transcript, 
		      # mart = ensembl
		      # )
		      

# write.table(translate,"gnomad_tx_gene_id.txt",quote=FALSE,sep="\t",row.names=FALSE,col.names=TRUE)


# if no access to internet, or to save time
# Cached transcript id -> gene id table (written by the biomaRt query above)
translate <- read.table(
  file = "gnomad_tx_gene_id.txt",
  sep = "\t",
  header = TRUE,
  stringsAsFactors = FALSE
)

# a few gnomad tx ids are not in ensembl and are lost
dim(translate)
# [1] 18589     2

# inner join: gnomad rows without a translated transcript id are dropped
gnomad <- merge(
  x = gnomad,
  y = translate,
  by.x = "transcript",
  by.y = "ensembl_transcript_id"
)

dim(gnomad)
# [1] 18589    78

g_unique <- read.delim(
  file = "growth_loci_unique.txt",
  header = TRUE,
  sep = "\t",
  check.names = FALSE,
  stringsAsFactors = FALSE
)

# RH is "1" for genes listed in growth_loci_unique.txt and "0" otherwise;
# level order ("1" first) fixes the order of groups and colours in the plot
gnomad$RH <- factor(
  as.numeric(gnomad$ensembl_gene_id %in% g_unique$ensembl_gene_id),
  levels = c("1", "0")
)

# Already in long form with a single group; a constant "dummy" x factor is
# added only for consistency with the other plots.
gnomad$cell <- factor("LOF", levels = "LOF")

labels_14 <- levels(gnomad$cell)

# one colour per RH group present, labelled for the legend
colores_14 <- setNames(
  gg_color_hue(length(unique(gnomad$RH))),
  c("RH+", "RH-")
)

# Welch two-sample t-test (t.test default) of oe_lof, RH+ versus RH-
p_vals_14 <- t.test(
  x = gnomad[["oe_lof"]][gnomad$RH == 1],
  y = gnomad[["oe_lof"]][gnomad$RH != 1]
)$p.value

# p value in scientific notation with one significant digit; deparse() wraps
# it in quotes so that annotate(parse = TRUE) draws it as literal text
fun_label_14 <- deparse(formatC(p_vals_14[1], format = "e", digits = 0))

# Shift obs_lof by 1 (stated intent: avoid log10(0) = -Inf when graphing).
# NOTE(review): the t-test above and the p14 plot use oe_lof, not obs_lof, so
# this shift affects neither of them - confirm which column was meant. The
# earlier remark about a Wilcoxon P value also does not match the t.test call.
gnomad$obs_lof <- gnomad$obs_lof + 1

# position of the p-value label in data coordinates
horiz_pos_14 <- 1 - 0.2
# alternative: place the label relative to the data maximum
# vert_pos_14 <- max(gnomad[, c("oe_lof")], na.rm = TRUE) + 10
vert_pos_14 <- 10^1


# violin plot of gnomAD LoF observed/expected ratio (oe_lof), RH+ vs RH- genes

# p14: oe_lof of RH+ (RH == 1) versus RH- genes on a log10 y axis, drawn as
# dodged violins with the individual points and a narrow boxplot on top, plus
# the t-test p value as a text label. Data and aesthetics are declared once in
# ggplot() and inherited by the three geoms; annotate() layers bring their own.
p14 <- ggplot(gnomad, aes(x = cell, y = oe_lof, fill = RH)) +
  geom_violin(
    position = position_dodge(width = 0.6),
    width = 0.5,
    alpha = 0.1,
    lwd = 0.1
  ) +
  # points are drawn grey; the inherited fill = RH is what splits them into
  # the two dodge groups
  geom_point(
    position = position_jitterdodge(
      dodge.width = 0.6,
      jitter.width = 0.0,
      jitter.height = 0.1
    ),
    shape = 16,
    colour = "grey",
    alpha = 1,
    size = 0.1
  ) +
  geom_boxplot(
    position = position_dodge(0.6),
    width = 0.1,
    alpha = 0.1,
    outlier.shape = NA, # outliers are already shown by geom_point
    lwd = 0.2,
    fatten = 2,
    colour = "black",
    show.legend = FALSE
    # notch = TRUE (disabled)
  ) +
  # white box behind the p-value label, then the label itself
  annotate(
    "rect",
    xmin = horiz_pos_14 - 0.2,
    xmax = horiz_pos_14 + 0.6,
    ymin = vert_pos_14 * 10^(-0.2),
    ymax = vert_pos_14 * 10^0.2,
    fill = "white"
  ) +
  annotate(
    geom = "text",
    x = horiz_pos_14,
    y = vert_pos_14,
    hjust = 0,
    label = fun_label_14,
    parse = TRUE,
    size = 2
  ) +
  scale_fill_manual(
    name = NULL,
    values = as.vector(colores_14),
    labels = names(colores_14)
  ) +
  scale_x_discrete(
    # labels = labels,
    expand = expand_scale(add = .6)
  ) +
  scale_y_continuous(
    trans = log10_trans(),
    # labels = function(x) {x*1e-3},
    breaks = trans_breaks("log10", function(x) 10^x, n = 3),
    labels = trans_format("log10", math_format(10^.x)),
    expand = expand_scale(mult = .05)
  ) +
  guides(
    shape = FALSE,
    fill = guide_legend(
      override.aes = list(
        # fill = NA,
        shape = NA,
        size = 0.1
      ),
      ncol = 1,
      byrow = TRUE
    )
  ) +
  # zoom only (no data dropped); the upper limit is the label position
  coord_cartesian(ylim = c(10^(-2), vert_pos_14)) +
  # xlab("Cell lines") +
  ylab(expression(obs / exp)) +
  labs(subtitle = "LOF") +
  theme2 +
  theme(
    plot.margin = unit(c(top = 0.5, right = 0.5, bottom = 0.5, left = 0.5), "cm"),
    plot.subtitle = element_text(
      size = 9, face = "plain", hjust = 0.5,
      margin = margin(t = 0, r = 0, b = 6, l = 0)
    ),
    # single dummy x category, so the x axis carries no information
    axis.title.x = element_blank(),
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank(),
    # legend is off for this panel; legend settings kept in line with the other panels
    legend.position = "none",
    legend.margin = margin(t = 0, r = 0, b = 0, l = 0, unit = "pt"),
    legend.box.margin = margin(t = 0, r = 0, b = 0, l = 0, unit = "pt"),
    legend.key.height = unit(0.3, "cm"),
    legend.key.width = unit(0.3, "cm"),
    legend.spacing.x = unit(0.3, "cm"),
    legend.spacing.y = unit(0.3, "cm"),
    legend.title = element_text(size = 7),
    legend.text = element_text(size = 7),
    legend.title.align = 0.7
  )
print(p14)


# saveRDS(p14,"p14")
# rm(list=setdiff(ls(),c("theme2","gg_color_hue","size_hline")))


       
# #---------Phyletic_retention (1)------------------------

# Download homologene.data from homologene web site


# homologene.data has no header row
homol <- read.delim(
  file = "homologene.data",
  header = FALSE,
  sep = "\t",
  check.names = FALSE,
  stringsAsFactors = FALSE
)
g_unique <- read.delim(
  file = "growth_loci_unique.txt",
  header = TRUE,
  sep = "\t",
  check.names = FALSE,
  stringsAsFactors = FALSE
)

# keep columns 1, 2 and 4 and name them group / species / geneSymbol
homol <- setNames(
  homol[, c(1, 2, 4)],
  c("group", "species", "geneSymbol")
)

# number of rows per homology group, used as "no.species"
# NOTE(review): this counts entries, not distinct species; the two differ if a
# group can hold more than one gene from the same species - confirm.
homol_agg <- setNames(
  aggregate(rep(1, length(homol$group)), by = list(homol$group), FUN = sum),
  c("group", "no.species")
)
homol <- merge(x = homol, y = homol_agg, by = "group")

# select human gene symbols
homol <- homol[homol$species == 9606, ]

# RH is "1" for gene symbols listed in growth_loci_unique.txt and "0" otherwise;
# level order ("1" first) fixes the order of groups and colours in the plot
homol$RH <- factor(
  as.numeric(homol$geneSymbol %in% g_unique$geneSymbol),
  levels = c("1", "0")
)

# Already in long form with a single group; a constant "dummy" x factor is
# added only for consistency with the other plots.
homol$cell <- factor("Orthologs", levels = "Orthologs")

labels_1 <- levels(homol$cell)

# one colour per RH group present, labelled for the legend
colores_1 <- setNames(
  gg_color_hue(length(unique(homol$RH))),
  c("RH+", "RH-")
)

# Welch two-sample t-test (t.test default) of no.species, RH+ versus RH-
p_vals_1 <- t.test(
  x = homol[["no.species"]][homol$RH == 1],
  y = homol[["no.species"]][homol$RH != 1]
)$p.value

# p value in scientific notation with one significant digit; deparse() wraps
# it in quotes so that annotate(parse = TRUE) draws it as literal text
fun_label_1 <- deparse(formatC(p_vals_1[1], format = "e", digits = 0))

# position of the p-value label in data coordinates
horiz_pos_1 <- 1 - 0.2
# alternative: place the label relative to the data maximum
# vert_pos_1 <- max(homol[, c("no.species")], na.rm = TRUE) * 2^0.1
vert_pos_1 <- 2^5




# p1: no.species of RH+ (RH == 1) versus RH- human genes on a log2 y axis,
# drawn as dodged violins with the individual points and a narrow boxplot on
# top, plus the t-test p value as a text label. Data and aesthetics are
# declared once in ggplot() and inherited by the three geoms. This panel is
# first drawn WITH a legend so that the legend grob can be extracted below.
p1 <- ggplot(homol, aes(x = cell, y = no.species, fill = RH)) +
  geom_violin(
    position = position_dodge(width = 0.6),
    width = 0.5,
    alpha = 0.1,
    lwd = 0.1
  ) +
  # points are drawn grey; the inherited fill = RH is what splits them into
  # the two dodge groups
  geom_point(
    position = position_jitterdodge(
      dodge.width = 0.6,
      jitter.width = 0.0,
      jitter.height = 0.1
    ),
    shape = 16,
    colour = "grey",
    alpha = 1,
    size = 0.1
  ) +
  geom_boxplot(
    position = position_dodge(0.6),
    width = 0.1,
    alpha = 0.1,
    outlier.shape = NA, # outliers are already shown by geom_point
    lwd = 0.2,
    fatten = 2,
    colour = "black",
    show.legend = FALSE
    # notch = TRUE (disabled)
  ) +
  # white box behind the p-value label, then the label itself
  annotate(
    "rect",
    xmin = horiz_pos_1 - 0.2,
    xmax = horiz_pos_1 + 0.4,
    ymin = vert_pos_1 / 2^0.3,
    ymax = vert_pos_1 * 2^0.3,
    fill = "white"
  ) +
  annotate(
    geom = "text",
    x = horiz_pos_1,
    y = vert_pos_1,
    hjust = 0,
    label = fun_label_1,
    parse = TRUE,
    size = 2
  ) +
  scale_fill_manual(
    name = NULL,
    values = as.vector(colores_1),
    labels = names(colores_1)
  ) +
  scale_x_discrete(
    # labels = labels,
    expand = expand_scale(add = .6)
  ) +
  scale_y_continuous(
    trans = log2_trans(),
    # labels = function(x) {x*1e-3},
    breaks = trans_breaks("log2", function(x) 2^x, n = 3),
    labels = trans_format("log2", math_format(2^.x)),
    expand = expand_scale(mult = .05)
  ) +
  guides(
    shape = FALSE,
    fill = guide_legend(
      override.aes = list(
        # fill = NA,
        shape = NA,
        size = 0.1
      ),
      ncol = 1,
      byrow = TRUE
    )
  ) +
  # zoom only (no data dropped); the upper limit is the label position
  coord_cartesian(ylim = c(2^2, vert_pos_1)) +
  # xlab("Cell lines") +
  ylab(expression(No. ~ species)) +
  labs(subtitle = "Orthologs") +
  theme2 +
  theme(
    plot.margin = unit(c(top = 0.5, right = 0.5, bottom = 0.5, left = 0.5), "cm"),
    plot.subtitle = element_text(
      size = 9, face = "plain", hjust = 0.5,
      margin = margin(t = 0, r = 0, b = 13, l = 0)
    ),
    # single dummy x category, so the x axis carries no information
    axis.title.x = element_blank(),
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank(),
    # legend switched on here (overrides theme2) so get_legend() can find it
    legend.position = c(0, 0),
    legend.margin = margin(t = 0, r = 0, b = 0, l = 0, unit = "pt"),
    legend.box.margin = margin(t = 0, r = 0, b = 0, l = 0, unit = "pt"),
    legend.key.height = unit(0.3, "cm"),
    legend.key.width = unit(0.3, "cm"),
    legend.spacing.x = unit(0.3, "cm"),
    legend.spacing.y = unit(0.3, "cm"),
    legend.title = element_text(size = 7),
    legend.text = element_text(size = 7),
    legend.title.align = 0.7
    # plot.tag = element_text(face="bold"),
    # plot.tag = element_text(size=32, face="bold"),
    # plot.tag.position = c(0.5,0.7)
  )
print(p1)

# keep the legend as a stand-alone grob, then remove it from the panel itself
legend <- get_legend(p1)
p1 <- p1 + theme(legend.position = "none")


       
#---------dN/dS (2)------------------------

# DO NOT DELETE:

# ensembl 97 data downloaded 09/12/19

# library(biomaRt)
# ensembl <- useDataset("hsapiens_gene_ensembl", mart=useMart("ensembl"))


# genes_dn_ds <- getBM(
				# attributes=c(
						# 'external_gene_name',
						# 'ensembl_gene_id',
						# 'ensembl_transcript_id',
						# 'mmusculus_homolog_dn',
						# 'mmusculus_homolog_ds'
					# ), 
			    # # filters = 'ensembl_peptide_id', 
			    # # values = string$protein1, 
			    # mart = ensembl
			    # )
			    
# write.table(genes_dn_ds,"genes_dn_ds.txt",quote=FALSE,sep="\t",row.names=FALSE)
			    
# Cached biomaRt export (see the commented query above)
genes_dn_ds <- read.table(
  file = "genes_dn_ds.txt",
  sep = "\t",
  header = TRUE,
  stringsAsFactors = FALSE
)

# dN/dS ratio per row from the mouse-homolog dn and ds columns
# NOTE(review): a row with ds == 0 and dn > 0 gives Inf, which the aggregate
# below keeps (only NA/NaN rows are dropped) - confirm no such rows exist.
genes_dn_ds$dn_ds <- genes_dn_ds$mmusculus_homolog_dn / genes_dn_ds$mmusculus_homolog_ds

g_unique <- read.delim(
  file = "growth_loci_unique.txt",
  header = TRUE,
  sep = "\t",
  check.names = FALSE,
  stringsAsFactors = FALSE
)

# av dn_ds based on gene id. Also gets rid of the many dn_ds NAs
genes_dn_ds <- aggregate(
  dn_ds ~ ensembl_gene_id,
  data = genes_dn_ds,
  FUN = mean
)

# RH is "1" for genes listed in growth_loci_unique.txt and "0" otherwise;
# level order ("1" first) fixes the order of groups and colours in the plot
genes_dn_ds$RH <- factor(
  as.numeric(genes_dn_ds$ensembl_gene_id %in% g_unique$ensembl_gene_id),
  levels = c("1", "0")
)

# Already in long form with a single group; a constant "dummy" x factor is
# added only for consistency with the other plots.
genes_dn_ds$cell <- factor("dN/dS", levels = "dN/dS")

labels_2 <- levels(genes_dn_ds$cell)

# one colour per RH group present, labelled for the legend
colores_2 <- setNames(
  gg_color_hue(length(unique(genes_dn_ds$RH))),
  c("RH+", "RH-")
)

# Welch two-sample t-test (t.test default) of dn_ds, RH+ versus RH-
p_vals_2 <- t.test(
  x = genes_dn_ds[["dn_ds"]][genes_dn_ds$RH == 1],
  y = genes_dn_ds[["dn_ds"]][genes_dn_ds$RH != 1]
)$p.value

# p value in scientific notation with one significant digit; deparse() wraps
# it in quotes so that annotate(parse = TRUE) draws it as literal text
fun_label_2 <- deparse(formatC(p_vals_2[1], format = "e", digits = 0))

# position of the p-value label in data coordinates
horiz_pos_2 <- 1 - 0.24
# alternative: place the label relative to the data maximum
# vert_pos_2 <- max(genes_dn_ds[, c("dn_ds")], na.rm = TRUE) + 2
vert_pos_2 <- 10^0.5




# p2: per-gene mean dN/dS of RH+ (RH == 1) versus RH- genes on a log10 y axis,
# drawn as dodged violins with the individual points and a narrow boxplot on
# top, plus the t-test p value as a text label. All three geoms inherit data
# and aesthetics from ggplot().
p2 <- ggplot(genes_dn_ds, aes(x = cell, y = dn_ds, fill = RH)) +
  geom_violin(
    position = position_dodge(width = 0.6),
    width = 0.5,
    alpha = 0.1,
    lwd = 0.1
  ) +
  # points are drawn grey; the inherited fill = RH is what splits them into
  # the two dodge groups
  geom_point(
    position = position_jitterdodge(
      dodge.width = 0.6,
      jitter.width = 0.0,
      jitter.height = 0.1
    ),
    shape = 16,
    colour = "grey",
    alpha = 1,
    size = 0.1
  ) +
  geom_boxplot(
    position = position_dodge(0.6),
    width = 0.1,
    alpha = 0.1,
    outlier.shape = NA, # outliers are already shown by geom_point
    lwd = 0.2,
    fatten = 2,
    show.legend = FALSE
    # notch = TRUE (disabled)
  ) +
  # white box behind the p-value label, then the label itself
  annotate(
    "rect",
    xmin = horiz_pos_2 - 0.2,
    xmax = horiz_pos_2 + 0.4,
    ymin = vert_pos_2 * 10^(-0.3),
    ymax = vert_pos_2 * 10^0.3,
    fill = "white"
  ) +
  annotate(
    geom = "text",
    x = horiz_pos_2,
    y = vert_pos_2,
    hjust = 0,
    label = fun_label_2,
    parse = TRUE,
    size = 2
  ) +
  scale_fill_manual(
    name = NULL,
    values = as.vector(colores_2),
    labels = names(colores_2)
  ) +
  scale_x_discrete(
    # labels = labels,
    expand = expand_scale(add = .6)
  ) +
  scale_y_continuous(
    trans = log10_trans(),
    # labels = function(x) {x*1e-3},
    breaks = trans_breaks("log10", function(x) 10^x, n = 3),
    labels = trans_format("log10", math_format(10^.x)),
    expand = expand_scale(mult = .05)
  ) +
  guides(
    shape = FALSE,
    fill = guide_legend(
      override.aes = list(
        # fill = NA,
        shape = NA,
        size = 0.1
      ),
      ncol = 1,
      byrow = TRUE
    )
  ) +
  # zoom only (no data dropped); the upper limit is the label position
  coord_cartesian(ylim = c(10^(-2.5), vert_pos_2)) +
  # xlab("Cell lines") +
  ylab(expression(dN / dS)) +
  labs(subtitle = "Seq diverge") +
  theme2 +
  theme(
    plot.margin = unit(c(top = 0.5, right = 0.5, bottom = 0.5, left = 0.5), "cm"),
    plot.subtitle = element_text(
      size = 9, face = "plain", hjust = 0.5,
      margin = margin(t = 0, r = 0, b = 13, l = 0)
    ),
    # single dummy x category, so the x axis carries no information
    axis.title.x = element_blank(),
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank(),
    # legend is off for this panel; legend settings kept in line with the other panels
    legend.position = "none",
    legend.margin = margin(t = 0, r = 0, b = 0, l = 0, unit = "pt"),
    legend.box.margin = margin(t = 0, r = 0, b = 0, l = 0, unit = "pt"),
    legend.key.height = unit(0.3, "cm"),
    legend.key.width = unit(0.3, "cm"),
    legend.spacing.x = unit(0.3, "cm"),
    legend.spacing.y = unit(0.3, "cm"),
    legend.title = element_text(size = 7),
    legend.text = element_text(size = 7),
    legend.title.align = 0.7
    # plot.tag = element_text(face="bold"),
    # plot.tag = element_text(size=32, face="bold"),
    # plot.tag.position = c(0.5,0.7)
  )
print(p2)




# #---------exonCount_cr_nc together in one graph (4) ------------------------


# g_unique <- read.delim("growth_loci_unique.txt",sep="\t",stringsAsFactors=FALSE,header=TRUE,check.names=FALSE)
# gencode_gtf_ensembl_ucsc <- read.delim("gencode_gtf_ensembl_ucsc_v31.txt",header=TRUE,sep="\t",stringsAsFactors=FALSE,check.names=FALSE)


# gencode_gtf_ensembl_ucsc$RH <- numeric(nrow(gencode_gtf_ensembl_ucsc))
# gencode_gtf_ensembl_ucsc[gencode_gtf_ensembl_ucsc$gene_id %in% g_unique$ensembl_gene_id,"RH"] <- 1


# # no need to reshape because already in long form
# gencode_gtf_ensembl_ucsc$cell = character(nrow(gencode_gtf_ensembl_ucsc))
# gencode_gtf_ensembl_ucsc[gencode_gtf_ensembl_ucsc$gene_type == "protein_coding","cell"] <- "coding"
# gencode_gtf_ensembl_ucsc[gencode_gtf_ensembl_ucsc$gene_type != "protein_coding","cell"] <- "non-coding"


# level_order_4 <- c("coding", "non-coding")


# gencode_gtf_ensembl_ucsc$cell <- factor(gencode_gtf_ensembl_ucsc$cell,levels=level_order_4)
# gencode_gtf_ensembl_ucsc$RH <- factor(gencode_gtf_ensembl_ucsc$RH,levels=c("1","0"))


# labels_4 <- level_order_4

# colores_4 <- gg_color_hue(length(unique(gencode_gtf_ensembl_ucsc$RH)))
# names(colores_4) <- c("RH+","RH-")


# p_vals_4 <- c(

			# t.test(gencode_gtf_ensembl_ucsc[gencode_gtf_ensembl_ucsc$gene_type == "protein_coding" & gencode_gtf_ensembl_ucsc$RH==1,"exonCount"], gencode_gtf_ensembl_ucsc[gencode_gtf_ensembl_ucsc$gene_type == "protein_coding" & gencode_gtf_ensembl_ucsc$RH !=1,"exonCount"])$p.value,
			
			# t.test(gencode_gtf_ensembl_ucsc[gencode_gtf_ensembl_ucsc$gene_type != "protein_coding" & gencode_gtf_ensembl_ucsc$RH==1,"exonCount"], gencode_gtf_ensembl_ucsc[gencode_gtf_ensembl_ucsc$gene_type != "protein_coding" & gencode_gtf_ensembl_ucsc$RH !=1,"exonCount"])$p.value
			
			# )
			
			
			
# fun_label_4 = character(length(p_vals_4))			
			

# for(i in 1:length(p_vals_4))		{	
# if (p_vals_4[i] >= 2.2e-16) {

# fun_label_4[i] <- c(
				# deparse(bquote(.(formatC(p_vals_4[i],format="e",digits=0))))
				# ) } else {
					
# fun_label_4[i] <- c(deparse(bquote(""< .(formatC(2.2e-16,format="e",digits=1)))))
					
				# }

# }



# horiz_pos_4 <- c(1:length(p_vals_4))-c(0.09,0.09)
# vert_pos_4 <-  c(
				# 10^2.1, 10^2.1
				# # max(gencode_gtf_ensembl_ucsc[gencode_gtf_ensembl_ucsc$gene_type == "protein_coding",c("exonCount")],na.rm=TRUE), 
				# # max(gencode_gtf_ensembl_ucsc[gencode_gtf_ensembl_ucsc$gene_type != "protein_coding",c("exonCount")],na.rm=TRUE)
				# )



# # violin plot
# p4 <- ggplot(
		# data = gencode_gtf_ensembl_ucsc, 
		# aes(
				# x = cell, 
				# y = exonCount,
				# fill = RH
				# )
			# ) + 
		# geom_violin(
			# width=0.5, 
			# position = position_dodge(width=0.6), alpha=0.1,
			# lwd=0.1
			# ) +
		# geom_point(
			# data= gencode_gtf_ensembl_ucsc,
			# mapping=aes(x=cell,y=exonCount),
			# shape=16, 
			# colour="grey",
			# alpha=1,
			# size=0.1,
			# position=position_jitterdodge(
						# dodge.width=0.6,
						# jitter.width=0.0,
						# jitter.height=0.0
						# )
			# ) +
		# geom_boxplot(
			# width=0.05,
			# position= position_dodge(0.6), alpha=0.1,
			# outlier.shape=NA,
			# lwd=0.2,
			# fatten=2,
			# show.legend = FALSE#,
			# #notch=TRUE
			# )  +
		# scale_fill_manual(
			# values=as.vector(colores_4),
			# labels=names(colores_4),
			# name=NULL		
			# )+
		# guides(
			# shape=FALSE,
	 		# fill = guide_legend(
				 		# override.aes = list(
				 		# #fill=NA,
				 		# shape=NA,
				 		# size=0.1
				 		# ),
			 		# ncol=1,
			 		# byrow=TRUE
			 		# )
	 		 # ) +
	    # annotate(
		    # "rect",
		    # xmin=horiz_pos_4-0.2,
		    # xmax=horiz_pos_4+0.6, 
		    # ymin=vert_pos_4/10^0.3, 
		    # ymax=vert_pos_4*10^0.1, 
		    # fill="white"
		    # ) +
 		# annotate(
	 		# geom="text", 
	 		# x=horiz_pos_4, 
	 		# y=vert_pos_4, 
	 		# hjust=0,
	 		# label= fun_label_4, 
	 		# parse=TRUE, 
	 		# size=2
	 		# ) +
 		# theme2 + 
   		# theme(
	   		# axis.title.x=element_blank(),
	        # # axis.text.x=element_blank(),
	        # # axis.ticks.x=element_blank(),
		 	# plot.margin = unit(c(1.2,1.5,1.2,0.9), "cm"),
	 		# legend.position = c(0.9,0.95),
			# legend.margin=margin(t=0,r=0,b=0,l=0,unit = "pt"),
		 	# legend.box.margin=margin(t=0,r=0,b=0,l=0,unit = "pt"),
		 	# legend.key.height = unit(0.3, "cm"),
			# legend.key.width = unit(0.3, "cm"),
		 	# legend.spacing.y = unit(0.3, 'cm'),
		 	# legend.spacing.x = unit(0.3, 'cm'),
		 	# legend.title = element_text(size = 7),  
		 	# legend.text = element_text(size = 7),
	 		# legend.title.align=0.7
 			# ) +
		# scale_x_discrete(
			# labels = labels_4, 
			# expand = expand_scale(add = .6)
			# ) +
		# scale_y_continuous(
				# expand = expand_scale(mult = .05), 
				# trans = log10_trans(), 
				# # labels=function(x) {x*1e-3},
				# breaks = trans_breaks("log10", function(x) 10^x),
				# labels = trans_format("log10", math_format(10^.x))#,
				# #limits=c(-2.2,6.25)
			# ) +
		# # xlab("Cell lines") + 
		# ylab(expression(Number)) +
		# coord_cartesian(ylim = c(10^0,vert_pos_4)) +
		# labs(subtitle="Exons")
# print(p4)




#---------exon_cr (4)------------------------



g_unique <- read.delim(
  file = "growth_loci_unique.txt",
  header = TRUE,
  sep = "\t",
  check.names = FALSE,
  stringsAsFactors = FALSE
)
gencode_gtf_ensembl_ucsc <- read.delim(
  file = "gencode_gtf_ensembl_ucsc_v31.txt",
  sep = "\t",
  header = TRUE,
  check.names = FALSE,
  stringsAsFactors = FALSE
)

# RH is "1" for genes listed in growth_loci_unique.txt and "0" otherwise;
# level order ("1" first) fixes the order of groups and colours in the plot
gencode_gtf_ensembl_ucsc$RH <- factor(
  as.numeric(gencode_gtf_ensembl_ucsc$gene_id %in% g_unique$ensembl_gene_id),
  levels = c("1", "0")
)

# Already in long form with a single group; a constant "dummy" x factor is
# added only for consistency with the other plots.
gencode_gtf_ensembl_ucsc$cell <- factor("Exons", levels = "Exons")

labels_4 <- levels(gencode_gtf_ensembl_ucsc$cell)

# one colour per RH group present, labelled for the legend
colores_4 <- setNames(
  gg_color_hue(length(unique(gencode_gtf_ensembl_ucsc$RH))),
  c("RH+", "RH-")
)

# Welch two-sample t-test (t.test default) of exonCount among protein-coding
# genes, RH+ versus RH-
p_vals_4 <- with(
  gencode_gtf_ensembl_ucsc,
  t.test(
    exonCount[gene_type == "protein_coding" & RH == 1],
    exonCount[gene_type == "protein_coding" & RH != 1]
  )$p.value
)

# Label for annotate(parse = TRUE): the p value with one significant digit, or
# the expression `"" < "2.2e-16"` when it falls below 2.2e-16.
fun_label_4 <- if (p_vals_4 >= 2.2e-16) {
  deparse(formatC(p_vals_4[1], format = "e", digits = 0))
} else {
  deparse(bquote("" < .(formatC(2.2e-16, format = "e", digits = 1))))
}

# position of the p-value label: slightly left of the category centre and
# 10^0.1 times the largest protein-coding exonCount
horiz_pos_4 <- 1 - 0.24
vert_pos_4 <- with(
  gencode_gtf_ensembl_ucsc,
  max(exonCount[gene_type == "protein_coding"], na.rm = TRUE)
) * 10^0.1
# vert_pos_4 <- 10^2.7




# p4: exonCount of protein-coding genes, RH+ (RH == 1) versus RH-, on a log10
# y axis, drawn as dodged violins with the individual points and a narrow
# boxplot on top, plus the t-test p value as a text label. All three geoms
# inherit the protein-coding subset and the aesthetics from ggplot().
p4 <- ggplot(
  data = gencode_gtf_ensembl_ucsc[gencode_gtf_ensembl_ucsc$gene_type == "protein_coding", ],
  mapping = aes(x = cell, y = exonCount, fill = RH)
) +
  geom_violin(
    position = position_dodge(width = 0.6),
    width = 0.5,
    alpha = 0.1,
    lwd = 0.1
  ) +
  # points are drawn grey and only dodged (both jitter amounts are 0); the
  # inherited fill = RH is what splits them into the two dodge groups
  geom_point(
    position = position_jitterdodge(
      dodge.width = 0.6,
      jitter.width = 0.0,
      jitter.height = 0.0
    ),
    shape = 16,
    colour = "grey",
    alpha = 1,
    size = 0.1
  ) +
  geom_boxplot(
    position = position_dodge(0.6),
    width = 0.1,
    alpha = 0.1,
    outlier.shape = NA, # outliers are already shown by geom_point
    lwd = 0.2,
    fatten = 2,
    show.legend = FALSE
    # notch = TRUE (disabled)
  ) +
  # white box behind the p-value label, then the label itself
  annotate(
    "rect",
    xmin = horiz_pos_4 - 0.2,
    xmax = horiz_pos_4 + 0.6,
    ymin = vert_pos_4 / 10^0.3,
    ymax = vert_pos_4 * 10^0.1,
    fill = "white"
  ) +
  annotate(
    geom = "text",
    x = horiz_pos_4,
    y = vert_pos_4,
    hjust = 0,
    label = fun_label_4,
    parse = TRUE,
    size = 2
  ) +
  scale_fill_manual(
    name = NULL,
    values = as.vector(colores_4),
    labels = names(colores_4)
  ) +
  scale_x_discrete(
    # labels = labels,
    expand = expand_scale(add = .6)
  ) +
  scale_y_continuous(
    trans = log10_trans(),
    # labels = function(x) {x*1e-3},
    breaks = trans_breaks("log10", function(x) 10^x, n = 4),
    labels = trans_format("log10", math_format(10^.x)),
    expand = expand_scale(mult = .05)
  ) +
  guides(
    shape = FALSE,
    fill = guide_legend(
      override.aes = list(
        # fill = NA,
        shape = NA,
        size = 0.1
      ),
      ncol = 1,
      byrow = TRUE
    )
  ) +
  # zoom only (no data dropped); the upper limit is the label position
  coord_cartesian(ylim = c(10^0, vert_pos_4)) +
  # xlab("Cell lines") +
  ylab(expression(Number)) +
  labs(subtitle = "Exons cr") +
  theme2 +
  theme(
    plot.margin = unit(c(top = 0.5, right = 0.5, bottom = 0.5, left = 0.5), "cm"),
    plot.subtitle = element_text(
      size = 9, face = "plain", hjust = 0.5,
      margin = margin(t = 0, r = 0, b = 13, l = 0)
    ),
    # single dummy x category, so the x axis carries no information
    axis.title.x = element_blank(),
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank(),
    # legend is off for this panel; legend settings kept in line with the other panels
    legend.position = "none",
    legend.margin = margin(t = 0, r = 0, b = 0, l = 0, unit = "pt"),
    legend.box.margin = margin(t = 0, r = 0, b = 0, l = 0, unit = "pt"),
    legend.key.height = unit(0.3, "cm"),
    legend.key.width = unit(0.3, "cm"),
    legend.spacing.x = unit(0.3, "cm"),
    legend.spacing.y = unit(0.3, "cm"),
    legend.title = element_text(size = 7),
    legend.text = element_text(size = 7),
    legend.title.align = 0.7
    # plot.tag = element_text(face="bold"),
    # plot.tag = element_text(size=32, face="bold"),
    # plot.tag.position = c(0.5,0.7)
  )
print(p4)




#---------exon_nc (5)------------------------



g_unique <- read.delim(
  file = "growth_loci_unique.txt",
  header = TRUE,
  sep = "\t",
  check.names = FALSE,
  stringsAsFactors = FALSE
)
gencode_gtf_ensembl_ucsc <- read.delim(
  file = "gencode_gtf_ensembl_ucsc_v31.txt",
  sep = "\t",
  header = TRUE,
  check.names = FALSE,
  stringsAsFactors = FALSE
)

# RH is "1" for genes listed in growth_loci_unique.txt and "0" otherwise;
# level order ("1" first) fixes the order of groups and colours in the plot
gencode_gtf_ensembl_ucsc$RH <- factor(
  as.numeric(gencode_gtf_ensembl_ucsc$gene_id %in% g_unique$ensembl_gene_id),
  levels = c("1", "0")
)

# Already in long form with a single group; a constant "dummy" x factor is
# added only for consistency with the other plots.
gencode_gtf_ensembl_ucsc$cell <- factor("Exons", levels = "Exons")

labels_5 <- levels(gencode_gtf_ensembl_ucsc$cell)

# one colour per RH group present, labelled for the legend
colores_5 <- setNames(
  gg_color_hue(length(unique(gencode_gtf_ensembl_ucsc$RH))),
  c("RH+", "RH-")
)

# Welch two-sample t-test (t.test default) of exonCount among genes that are
# not protein-coding, RH+ versus RH-
p_vals_5 <- with(
  gencode_gtf_ensembl_ucsc,
  t.test(
    exonCount[gene_type != "protein_coding" & RH == 1],
    exonCount[gene_type != "protein_coding" & RH != 1]
  )$p.value
)

# Label for annotate(parse = TRUE): the p value with one significant digit, or
# the expression `"" < "2.2e-16"` when it falls below 2.2e-16.
fun_label_5 <- if (p_vals_5 >= 2.2e-16) {
  deparse(formatC(p_vals_5[1], format = "e", digits = 0))
} else {
  deparse(bquote("" < .(formatC(2.2e-16, format = "e", digits = 1))))
}

# position of the p-value label: slightly left of the category centre and
# 10^0.3 times the largest non-protein-coding exonCount
horiz_pos_5 <- 1 - 0.22
vert_pos_5 <- with(
  gencode_gtf_ensembl_ucsc,
  max(exonCount[gene_type != "protein_coding"], na.rm = TRUE)
) * 10^0.3
				
				


# p5: exonCount of genes that are not protein-coding, RH+ (RH == 1) versus
# RH-, on a log10 y axis, drawn as dodged violins with the individual points
# and a narrow boxplot on top, plus the t-test p value as a text label. All
# three geoms inherit the non-coding subset and the aesthetics from ggplot().
p5 <- ggplot(
  data = gencode_gtf_ensembl_ucsc[gencode_gtf_ensembl_ucsc$gene_type != "protein_coding", ],
  mapping = aes(x = cell, y = exonCount, fill = RH)
) +
  geom_violin(
    position = position_dodge(width = 0.6),
    width = 0.5,
    alpha = 0.1,
    lwd = 0.1
  ) +
  # points are drawn grey and only dodged (both jitter amounts are 0); the
  # inherited fill = RH is what splits them into the two dodge groups
  geom_point(
    position = position_jitterdodge(
      dodge.width = 0.6,
      jitter.width = 0.0,
      jitter.height = 0.0
    ),
    shape = 16,
    colour = "grey",
    alpha = 1,
    size = 0.1
  ) +
  geom_boxplot(
    position = position_dodge(0.6),
    width = 0.1,
    alpha = 0.1,
    outlier.shape = NA, # outliers are already shown by geom_point
    lwd = 0.2,
    fatten = 2,
    show.legend = FALSE
    # notch = TRUE (disabled)
  ) +
  # white box behind the p-value label, then the label itself
  annotate(
    "rect",
    xmin = horiz_pos_5 - 0.2,
    xmax = horiz_pos_5 + 0.6,
    ymin = vert_pos_5 / 10^0.3,
    ymax = vert_pos_5 * 10^0.1,
    fill = "white"
  ) +
  annotate(
    geom = "text",
    x = horiz_pos_5,
    y = vert_pos_5,
    hjust = 0,
    label = fun_label_5,
    parse = TRUE,
    size = 2
  ) +
  # Use this section's own palette (colores_5). The previous code referenced
  # colores_1 from the Orthologs section, a copy-paste slip that made this
  # panel depend on section 1 having been run first.
  scale_fill_manual(
    name = NULL,
    values = as.vector(colores_5),
    labels = names(colores_5)
  ) +
  scale_x_discrete(
    # labels = labels,
    expand = expand_scale(add = .6)
  ) +
  scale_y_continuous(
    trans = log10_trans(),
    # labels = function(x) {x*1e-3},
    breaks = trans_breaks("log10", function(x) 10^x, n = 3),
    labels = trans_format("log10", math_format(10^.x)),
    expand = expand_scale(mult = .05)
  ) +
  guides(
    shape = FALSE,
    fill = guide_legend(
      override.aes = list(
        # fill = NA,
        shape = NA,
        size = 0.1
      ),
      ncol = 1,
      byrow = TRUE
    )
  ) +
  # zoom only (no data dropped); the upper limit is the label position
  coord_cartesian(ylim = c(10^0, vert_pos_5)) +
  # xlab("Cell lines") +
  ylab(expression(Number)) +
  labs(subtitle = "Exons nc") +
  theme2 +
  theme(
    plot.margin = unit(c(top = 0.5, right = 0.5, bottom = 0.5, left = 0.5), "cm"),
    plot.subtitle = element_text(
      size = 9, face = "plain", hjust = 0.5,
      margin = margin(t = 0, r = 0, b = 13, l = 0)
    ),
    # single dummy x category, so the x axis carries no information
    axis.title.x = element_blank(),
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank(),
    # legend is off for this panel; legend settings kept in line with the other panels
    legend.position = "none",
    legend.margin = margin(t = 0, r = 0, b = 0, l = 0, unit = "pt"),
    legend.box.margin = margin(t = 0, r = 0, b = 0, l = 0, unit = "pt"),
    legend.key.height = unit(0.3, "cm"),
    legend.key.width = unit(0.3, "cm"),
    legend.spacing.x = unit(0.3, "cm"),
    legend.spacing.y = unit(0.3, "cm"),
    legend.title = element_text(size = 7),
    legend.text = element_text(size = 7),
    legend.title.align = 0.7
    # plot.tag = element_text(face="bold"),
    # plot.tag = element_text(size=32, face="bold"),
    # plot.tag.position = c(0.5,0.7)
  )
print(p5)



#---------geneLength_cr (6)------------------------


g_unique <- read.delim(
  file = "growth_loci_unique.txt",
  header = TRUE,
  sep = "\t",
  check.names = FALSE,
  stringsAsFactors = FALSE
)
gencode_gtf_ensembl_ucsc <- read.delim(
  file = "gencode_gtf_ensembl_ucsc_v31.txt",
  sep = "\t",
  header = TRUE,
  check.names = FALSE,
  stringsAsFactors = FALSE
)

# RH is "1" for genes listed in growth_loci_unique.txt and "0" otherwise;
# level order ("1" first) fixes the order of groups and colours in the plot
gencode_gtf_ensembl_ucsc$RH <- factor(
  as.numeric(gencode_gtf_ensembl_ucsc$gene_id %in% g_unique$ensembl_gene_id),
  levels = c("1", "0")
)

# Already in long form with a single group; a constant "dummy" x factor is
# added only for consistency with the other plots.
gencode_gtf_ensembl_ucsc$cell <- factor("geneLength_cr", levels = "geneLength_cr")

labels_6 <- levels(gencode_gtf_ensembl_ucsc$cell)

# one colour per RH group present, labelled for the legend
colores_6 <- setNames(
  gg_color_hue(length(unique(gencode_gtf_ensembl_ucsc$RH))),
  c("RH+", "RH-")
)

# Welch two-sample t-test (t.test default) of geneLength among protein-coding
# genes, RH+ versus RH-
p_vals_6 <- with(
  gencode_gtf_ensembl_ucsc,
  t.test(
    geneLength[gene_type == "protein_coding" & RH == 1],
    geneLength[gene_type == "protein_coding" & RH != 1]
  )$p.value
)

# Label for annotate(parse = TRUE): the p value with one significant digit, or
# the expression `"" < "2.2e-16"` when it falls below 2.2e-16.
fun_label_6 <- if (p_vals_6 >= 2.2e-16) {
  deparse(formatC(p_vals_6[1], format = "e", digits = 0))
} else {
  deparse(bquote("" < .(formatC(2.2e-16, format = "e", digits = 1))))
}

# position of the p-value label: left of the category centre and 10^0.6 times
# the largest protein-coding geneLength
horiz_pos_6 <- 1 - 0.4
vert_pos_6 <- with(
  gencode_gtf_ensembl_ucsc,
  max(geneLength[gene_type == "protein_coding"], na.rm = TRUE)
) * 10^0.6




p6 <- ggplot(
		data = gencode_gtf_ensembl_ucsc[gencode_gtf_ensembl_ucsc$gene_type == "protein_coding",], 
		aes(
				x = cell, 
				y = geneLength,
				fill = RH
				)
			) + 
		geom_violin(
			width=0.5, 
			position = position_dodge(width=0.6), alpha=0.1,
			lwd=0.1
			) +
		geom_point(
			data= gencode_gtf_ensembl_ucsc[gencode_gtf_ensembl_ucsc$gene_type == "protein_coding",],
			mapping=aes(x=cell,y= geneLength),
			shape=16, 
			colour="grey",
			alpha=1,
			size=0.1,
			position=position_jitterdodge(
						dodge.width=0.6,
						jitter.width=0.0,
						jitter.height=0.0
						)
			) +
		geom_boxplot(
			width=0.1,
			position= position_dodge(0.6), alpha=0.1,
			outlier.shape=NA,
			lwd=0.2,
			fatten=2,
			show.legend = FALSE#,
			# alpha=0.1,
			#notch=TRUE
			)  +
		scale_fill_manual(
			values=as.vector(colores_6),
			labels=names(colores_6),
			name=NULL		
			)+
		guides(
			shape=FALSE,
	 		fill = guide_legend(
				 		override.aes = list(
				 		#fill=NA,
				 		shape=NA,
				 		size=0.1
				 		),
			 		ncol=1,
			 		byrow=TRUE
			 		)
	 		 ) +
	    annotate(
		    "rect",
		    xmin=horiz_pos_6-0.2,
		    xmax=horiz_pos_6+0.6, 
		    ymin=vert_pos_6/10^0.3, 
		    ymax=vert_pos_6*10^0.1, 
		    fill="white"
		    ) +
 		annotate(
	 		geom="text", 
	 		x=horiz_pos_6, 
	 		y=vert_pos_6, 
	 		hjust=0,
	 		label= fun_label_6, 
	 		parse=TRUE, 
	 		size=2
	 		) +
 		theme2 + 
   		theme(
	   		plot.margin = unit(c(top=0.5,right=0.5,bottom=0.5,left=0.5), "cm"),
		 	plot.subtitle = element_text(size=9, face="plain", hjust = 0.5, margin=margin(0,0,13,0)),
		 	axis.title.x=element_blank(),
	        axis.text.x=element_blank(),
	        axis.ticks.x=element_blank(),
	 		legend.position="none",
			legend.margin=margin(t=0,r=0,b=0,l=0,unit = "pt"),
		 	legend.box.margin=margin(t=0,r=0,b=0,l=0,unit = "pt"),
		 	legend.key.height = unit(0.3, "cm"),
			legend.key.width = unit(0.3, "cm"),
		 	legend.spacing.y = unit(0.3, 'cm'),
		 	legend.spacing.x = unit(0.3, 'cm'),
		 	legend.title = element_text(size = 7),  
		 	legend.text = element_text(size = 7),
	 		legend.title.align=0.7
	 		# plot.tag = element_text(face="bold"), 
	 		# plot.tag = element_text(size=32, face="bold"), 
	 		# plot.tag.position = c(0.5,0.7)
 			) +
		scale_x_discrete(
			# labels = labels, 
			expand = expand_scale(add = .6)
			) +
		scale_y_continuous(
				expand = expand_scale(mult = .05), 
				trans = log10_trans(), 
				# labels=function(x) {x*1e-3},
				breaks = trans_breaks("log10", function(x) 10^x, n=3),
			   labels = trans_format("log10", math_format(10^.x))#,
				#limits=c(-2.2,6.25)
			) +
		# xlab("Cell lines") + 
		ylab(expression(Length~(bp))) +
		coord_cartesian(ylim = c(10^2,vert_pos_6)) +
		labs(subtitle="Genes cr")
print(p6)



#---------geneLength_nc (7)------------------------
# Panel 7: gene length (log10 y axis) of non-protein-coding genes, RH+ versus RH-.
# Both input tables are read fresh from disk at the start of the section.

g_unique <- read.delim("growth_loci_unique.txt", header = TRUE, sep = "\t", check.names = FALSE, stringsAsFactors = FALSE)
gencode_gtf_ensembl_ucsc <- read.delim("gencode_gtf_ensembl_ucsc_v31.txt", sep = "\t", header = TRUE, check.names = FALSE, stringsAsFactors = FALSE)

# RH flag: 1 when the row's gene_id occurs in g_unique$ensembl_gene_id, otherwise 0.
# Stored as a factor with level order "1", "0".
gencode_gtf_ensembl_ucsc$RH <- rep(0, nrow(gencode_gtf_ensembl_ucsc))
gencode_gtf_ensembl_ucsc$RH[gencode_gtf_ensembl_ucsc$gene_id %in% g_unique$ensembl_gene_id] <- 1
gencode_gtf_ensembl_ucsc$RH <- factor(gencode_gtf_ensembl_ucsc$RH, levels = c("1", "0"))

# The table is already long with a single grouping factor; a constant one-level
# factor is supplied as the x variable so the plot has the same shape as the other panels.
# The level is named after this panel (it previously carried the coding panel's
# "geneLength_cr" label, which also ended up in labels_7).
gencode_gtf_ensembl_ucsc$cell <- factor("geneLength_nc", levels = "geneLength_nc")
labels_7 <- levels(gencode_gtf_ensembl_ucsc$cell)

# One fill colour per distinct RH value, named for the legend.
colores_7 <- setNames(
  gg_color_hue(length(unique(gencode_gtf_ensembl_ucsc$RH))),
  c("RH+", "RH-")
)

# t.test() with default settings on geneLength, RH == 1 versus RH != 1,
# restricted to rows whose gene_type is not "protein_coding". Only the p-value is kept.
p_vals_7 <- local({
  tbl <- gencode_gtf_ensembl_ucsc
  noncoding <- tbl$gene_type != "protein_coding"
  t.test(
    tbl[noncoding & tbl$RH == 1, "geneLength"],
    tbl[noncoding & tbl$RH != 1, "geneLength"]
  )$p.value
})

# Plotmath-parsable label: the p-value in scientific notation, or "< 2.2e-16"
# when it falls below that threshold.
fun_label_7 <- if (p_vals_7 >= 2.2e-16) {
  deparse(formatC(p_vals_7[1], format = "e", digits = 0))
} else {
  deparse(bquote("" < .(formatC(2.2e-16, format = "e", digits = 1))))
}

# Label anchor: left of the single x category, 0.7 log10 units above the largest non-coding gene.
horiz_pos_7 <- 1 - 0.4
vert_pos_7 <- max(
  gencode_gtf_ensembl_ucsc[gencode_gtf_ensembl_ucsc$gene_type != "protein_coding", "geneLength"],
  na.rm = TRUE
) * 10^0.7

p7 <- ggplot(
  data = gencode_gtf_ensembl_ucsc[gencode_gtf_ensembl_ucsc$gene_type != "protein_coding", ],
  mapping = aes(x = cell, y = geneLength, fill = RH)
) +
  geom_violin(position = position_dodge(width = 0.6), width = 0.5, alpha = 0.1, lwd = 0.1) +
  # Raw observations in grey; data and aesthetics are inherited from ggplot().
  # Jitter is zero in both directions, so points are only dodged by RH.
  geom_point(
    position = position_jitterdodge(dodge.width = 0.6, jitter.width = 0.0, jitter.height = 0.0),
    shape = 16, colour = "grey", alpha = 1, size = 0.1
  ) +
  geom_boxplot(
    position = position_dodge(0.6), width = 0.1, alpha = 0.1,
    outlier.shape = NA, lwd = 0.2, fatten = 2, show.legend = FALSE
  ) +
  scale_fill_manual(values = unname(colores_7), labels = names(colores_7), name = NULL) +
  guides(
    shape = FALSE,
    fill = guide_legend(override.aes = list(shape = NA, size = 0.1), ncol = 1, byrow = TRUE)
  ) +
  # White box drawn first so the p-value text is not printed over plotted data.
  annotate(
    "rect",
    xmin = horiz_pos_7 - 0.2, xmax = horiz_pos_7 + 0.6,
    ymin = vert_pos_7 / 10^0.3, ymax = vert_pos_7 * 10^0.1,
    fill = "white"
  ) +
  annotate(
    geom = "text",
    x = horiz_pos_7, y = vert_pos_7,
    label = fun_label_7, parse = TRUE,
    hjust = 0, size = 2
  ) +
  theme2 +
  # Panel-specific overrides of theme2. The legend is switched off here;
  # the legend.* sizes are presumably kept for a later legend extraction — confirm.
  theme(
    plot.margin = unit(c(top = 0.5, right = 0.5, bottom = 0.5, left = 0.5), "cm"),
    plot.subtitle = element_text(size = 9, face = "plain", hjust = 0.5, margin = margin(0, 0, 13, 0)),
    axis.title.x = element_blank(),
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank(),
    legend.position = "none",
    legend.margin = margin(t = 0, r = 0, b = 0, l = 0, unit = "pt"),
    legend.box.margin = margin(t = 0, r = 0, b = 0, l = 0, unit = "pt"),
    legend.key.height = unit(0.3, "cm"),
    legend.key.width = unit(0.3, "cm"),
    legend.spacing.y = unit(0.3, "cm"),
    legend.spacing.x = unit(0.3, "cm"),
    legend.title = element_text(size = 7),
    legend.text = element_text(size = 7),
    legend.title.align = 0.7
  ) +
  scale_x_discrete(expand = expand_scale(add = .6)) +
  # log10 y axis with breaks at powers of ten, labelled as 10^k.
  scale_y_continuous(
    trans = log10_trans(),
    breaks = trans_breaks("log10", function(x) 10^x, n = 4),
    labels = trans_format("log10", math_format(10^.x)),
    expand = expand_scale(mult = .05)
  ) +
  # Zoom without dropping data; the upper limit is the label height.
  coord_cartesian(ylim = c(10^1, vert_pos_7)) +
  labs(y = expression(Length~(bp)), subtitle = "Genes nc")
print(p7)




# #---------geneLength_cr_nc together in one graph (8)------------------------


# g_unique <- read.delim("growth_loci_unique.txt",sep="\t",stringsAsFactors=FALSE,header=TRUE,check.names=FALSE)
# gencode_gtf_ensembl_ucsc <- read.delim("gencode_gtf_ensembl_ucsc_v31.txt",header=TRUE,sep="\t",stringsAsFactors=FALSE,check.names=FALSE)


# gencode_gtf_ensembl_ucsc$RH <- numeric(nrow(gencode_gtf_ensembl_ucsc))
# gencode_gtf_ensembl_ucsc[gencode_gtf_ensembl_ucsc$gene_id %in% g_unique$ensembl_gene_id,"RH"] <- 1


# # no need to reshape because already in long form
# gencode_gtf_ensembl_ucsc$cell = character(nrow(gencode_gtf_ensembl_ucsc))
# gencode_gtf_ensembl_ucsc[gencode_gtf_ensembl_ucsc$gene_type == "protein_coding","cell"] <- "coding"
# gencode_gtf_ensembl_ucsc[gencode_gtf_ensembl_ucsc$gene_type != "protein_coding","cell"] <- "non-coding"


# level_order <- c("coding", "non-coding")


# gencode_gtf_ensembl_ucsc$cell <- factor(gencode_gtf_ensembl_ucsc$cell,levels=level_order)
# gencode_gtf_ensembl_ucsc$RH <- factor(gencode_gtf_ensembl_ucsc$RH,levels=c("1","0"))


# labels <- level_order

# colores_1 <- gg_color_hue(length(unique(gencode_gtf_ensembl_ucsc$RH)))
# names(colores_1) <- c("RH+","RH-")


# p_vals <- c(

			# ks.test(gencode_gtf_ensembl_ucsc[gencode_gtf_ensembl_ucsc$gene_type == "protein_coding" & gencode_gtf_ensembl_ucsc$RH==1,"geneLength"], gencode_gtf_ensembl_ucsc[gencode_gtf_ensembl_ucsc$gene_type == "protein_coding" & gencode_gtf_ensembl_ucsc$RH !=1,"geneLength"])$p.value,
			
			# ks.test(gencode_gtf_ensembl_ucsc[gencode_gtf_ensembl_ucsc$gene_type != "protein_coding" & gencode_gtf_ensembl_ucsc$RH==1,"geneLength"], gencode_gtf_ensembl_ucsc[gencode_gtf_ensembl_ucsc$gene_type != "protein_coding" & gencode_gtf_ensembl_ucsc$RH !=1,"geneLength"])$p.value
			
			# )
			
			
			
# fun_label = character(length(p_vals))			
			

# for(i in 1:length(p_vals))		{	
# if (p_vals[i] != 0) {

# fun_label[i] <- c(
				# deparse(bquote(italic(P) == .(formatC(p_vals[i],format="g",digits=1))))
				# ) } else {
					
# fun_label[i] <- c(deparse(bquote(italic(P) < .(formatC(2.2e-16,format="g",digits=2)))))
					
				# }

# }



# horiz_pos <- c(1:length(p_vals))-0.09
# vert_pos <-  c(
				# 5e6, 2.5e6
				# # max(gencode_gtf_ensembl_ucsc[gencode_gtf_ensembl_ucsc$gene_type == "protein_coding",c("geneLength")],na.rm=TRUE), 
				# # max(gencode_gtf_ensembl_ucsc[gencode_gtf_ensembl_ucsc$gene_type != "protein_coding",c("geneLength")],na.rm=TRUE)
				# )



# # violin plot
# p8 <- ggplot(
		# data = gencode_gtf_ensembl_ucsc, 
		# aes(
				# x = cell, 
				# y = geneLength,
				# fill = RH
				# )
			# ) + 
		# geom_violin(
			# width=0.5, 
			# position = position_dodge(width=0.6), alpha=0.1,
			# lwd=0.1
			# ) +
		# geom_point(
			# data= gencode_gtf_ensembl_ucsc,
			# mapping=aes(x=cell,y=geneLength),
			# shape=16, 
			# colour="grey",
			# alpha=1,
			# size=1,
			# position=position_jitterdodge(dodge.width=0.6,jitter.width=0.0)
			# ) +
		# geom_boxplot(
			# width=0.1,
			# position= position_dodge(0.6), alpha=0.1,
			# outlier.shape=NA,
			# lwd=0.2,
			# fatten=2,
			# show.legend = FALSE#,
			# #notch=TRUE
			# )  +
		# scale_fill_manual(
			# values=as.vector(colores_1),
			# labels=names(colores_1),
			# name=NULL		
			# )+
		# guides(
			# shape=FALSE,
	 		# fill = guide_legend(
				 		# override.aes = list(
				 		# #fill=NA,
				 		# shape=NA,
				 		# size=0.1
				 		# ),
			 		# ncol=1,
			 		# byrow=TRUE
			 		# )
	 		 # ) +
 		# annotate(
	 		# geom="text", 
	 		# x=horiz_pos, 
	 		# y=vert_pos, 
	 		# hjust=0,
	 		# label= fun_label, 
	 		# parse=TRUE, 
	 		# size=2.5
	 		# ) +
 		# theme2 + 
   		# theme(
	   		# axis.title.x=element_blank(),
	        # # axis.text.x=element_blank(),
	        # # axis.ticks.x=element_blank(),
		 	# plot.margin = unit(c(1.2,1.5,1.2,0.9), "cm"),
	 		# legend.position = c(0.9,0.95),
			# legend.margin=margin(t=0,r=0,b=0,l=0,unit = "pt"),
		 	# legend.box.margin=margin(t=0,r=0,b=0,l=0,unit = "pt"),
		 	# legend.key.height = unit(0.3, "cm"),
			# legend.key.width = unit(0.3, "cm"),
		 	# legend.spacing.y = unit(0.3, 'cm'),
		 	# legend.spacing.x = unit(0.3, 'cm'),
		 	# legend.title = element_text(size = 7),  
		 	# legend.text = element_text(size = 7),
	 		# legend.title.align=0.7
 			# ) +
		# scale_x_discrete(
			# labels = labels, 
			# expand = expand_scale(add = .6)
			# ) +
		# scale_y_continuous(
				# expand = expand_scale(mult = .05), 
				# trans = log10_trans(), 
				# # labels=function(x) {x*1e-3},
				# breaks = trans_breaks("log10", function(x) 10^x),
			   # labels = trans_format("log10", math_format(10^.x))#,
				# #limits=c(-2.2,6.25)
			# ) +
		# # xlab("Cell lines") + 
		# ylab(expression(Length~(bp))) +
		# labs(subtitle="Genes")
# print(p8)




#---------txLength_cr (8)------------------------
# Panel 8: transcript length (log10 y axis) for protein-coding rows, RH+ versus RH-.
# Both input tables are read fresh from disk at the start of the section.

g_unique <- read.delim("growth_loci_unique.txt", header = TRUE, sep = "\t", check.names = FALSE, stringsAsFactors = FALSE)
gencode_gtf_ensembl_ucsc <- read.delim("gencode_gtf_ensembl_ucsc_v31.txt", sep = "\t", header = TRUE, check.names = FALSE, stringsAsFactors = FALSE)

# RH flag: 1 when the row's tx_id occurs in g_unique$ensembl_tx_id, otherwise 0
# (this panel matches on transcript ids, not gene ids).
# Stored as a factor with level order "1", "0".
gencode_gtf_ensembl_ucsc$RH <- rep(0, nrow(gencode_gtf_ensembl_ucsc))
gencode_gtf_ensembl_ucsc$RH[gencode_gtf_ensembl_ucsc$tx_id %in% g_unique$ensembl_tx_id] <- 1
gencode_gtf_ensembl_ucsc$RH <- factor(gencode_gtf_ensembl_ucsc$RH, levels = c("1", "0"))

# The table is already long with a single grouping factor; a constant one-level
# factor is supplied as the x variable so the plot has the same shape as the other panels.
gencode_gtf_ensembl_ucsc$cell <- factor("txLength_cr", levels = "txLength_cr")
labels_8 <- levels(gencode_gtf_ensembl_ucsc$cell)

# One fill colour per distinct RH value, named for the legend.
colores_8 <- setNames(
  gg_color_hue(length(unique(gencode_gtf_ensembl_ucsc$RH))),
  c("RH+", "RH-")
)

# t.test() with default settings on txLength, RH == 1 versus RH != 1,
# restricted to protein-coding rows. Only the p-value is kept.
p_vals_8 <- local({
  tbl <- gencode_gtf_ensembl_ucsc
  coding <- tbl$gene_type == "protein_coding"
  t.test(
    tbl[coding & tbl$RH == 1, "txLength"],
    tbl[coding & tbl$RH != 1, "txLength"]
  )$p.value
})

# Plotmath-parsable label: the p-value in scientific notation, or "< 2.2e-16"
# when it falls below that threshold.
fun_label_8 <- if (p_vals_8 >= 2.2e-16) {
  deparse(formatC(p_vals_8[1], format = "e", digits = 0))
} else {
  deparse(bquote("" < .(formatC(2.2e-16, format = "e", digits = 1))))
}

# Label anchor. The height is a fixed constant in this panel rather than
# being derived from the data maximum.
horiz_pos_8 <- 1 - 0.4
vert_pos_8 <- 10^5.3

p8 <- ggplot(
  data = gencode_gtf_ensembl_ucsc[gencode_gtf_ensembl_ucsc$gene_type == "protein_coding", ],
  mapping = aes(x = cell, y = txLength, fill = RH)
) +
  geom_violin(position = position_dodge(width = 0.6), width = 0.5, alpha = 0.1, lwd = 0.1) +
  # Raw observations in grey; data and aesthetics are inherited from ggplot().
  # Jitter is zero in both directions, so points are only dodged by RH.
  geom_point(
    position = position_jitterdodge(dodge.width = 0.6, jitter.width = 0.0, jitter.height = 0.0),
    shape = 16, colour = "grey", alpha = 1, size = 0.1
  ) +
  geom_boxplot(
    position = position_dodge(0.6), width = 0.1, alpha = 0.1,
    outlier.shape = NA, lwd = 0.2, fatten = 2, show.legend = FALSE
  ) +
  scale_fill_manual(values = unname(colores_8), labels = names(colores_8), name = NULL) +
  guides(
    shape = FALSE,
    fill = guide_legend(override.aes = list(shape = NA, size = 0.1), ncol = 1, byrow = TRUE)
  ) +
  # White box drawn first so the p-value text is not printed over plotted data.
  annotate(
    "rect",
    xmin = horiz_pos_8 - 0.2, xmax = horiz_pos_8 + 0.6,
    ymin = vert_pos_8 / 10^0.3, ymax = vert_pos_8 * 10^0.1,
    fill = "white"
  ) +
  annotate(
    geom = "text",
    x = horiz_pos_8, y = vert_pos_8,
    label = fun_label_8, parse = TRUE,
    hjust = 0, size = 2
  ) +
  theme2 +
  # Panel-specific overrides of theme2. The legend is switched off here;
  # the legend.* sizes are presumably kept for a later legend extraction — confirm.
  theme(
    plot.margin = unit(c(top = 0.5, right = 0.5, bottom = 0.5, left = 0.5), "cm"),
    plot.subtitle = element_text(size = 9, face = "plain", hjust = 0.5, margin = margin(0, 0, 13, 0)),
    axis.title.x = element_blank(),
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank(),
    legend.position = "none",
    legend.margin = margin(t = 0, r = 0, b = 0, l = 0, unit = "pt"),
    legend.box.margin = margin(t = 0, r = 0, b = 0, l = 0, unit = "pt"),
    legend.key.height = unit(0.3, "cm"),
    legend.key.width = unit(0.3, "cm"),
    legend.spacing.y = unit(0.3, "cm"),
    legend.spacing.x = unit(0.3, "cm"),
    legend.title = element_text(size = 7),
    legend.text = element_text(size = 7),
    legend.title.align = 0.7
  ) +
  scale_x_discrete(expand = expand_scale(add = .6)) +
  # log10 y axis with breaks at powers of ten, labelled as 10^k.
  scale_y_continuous(
    trans = log10_trans(),
    breaks = trans_breaks("log10", function(x) 10^x, n = 4),
    labels = trans_format("log10", math_format(10^.x)),
    expand = expand_scale(mult = .05)
  ) +
  # Zoom without dropping data; the upper limit is the label height.
  coord_cartesian(ylim = c(10^2, vert_pos_8)) +
  labs(y = expression(Length~(bp)), subtitle = "tx cr")
print(p8)



#---------txLength_nc (9)------------------------
# Panel 9: transcript length (log10 y axis) for non-protein-coding rows, RH+ versus RH-.
# Both input tables are read fresh from disk at the start of the section.

g_unique <- read.delim("growth_loci_unique.txt", header = TRUE, sep = "\t", check.names = FALSE, stringsAsFactors = FALSE)
gencode_gtf_ensembl_ucsc <- read.delim("gencode_gtf_ensembl_ucsc_v31.txt", sep = "\t", header = TRUE, check.names = FALSE, stringsAsFactors = FALSE)

# RH flag: 1 when the row's tx_id occurs in g_unique$ensembl_tx_id, otherwise 0
# (this panel matches on transcript ids, not gene ids).
# Stored as a factor with level order "1", "0".
gencode_gtf_ensembl_ucsc$RH <- rep(0, nrow(gencode_gtf_ensembl_ucsc))
gencode_gtf_ensembl_ucsc$RH[gencode_gtf_ensembl_ucsc$tx_id %in% g_unique$ensembl_tx_id] <- 1
gencode_gtf_ensembl_ucsc$RH <- factor(gencode_gtf_ensembl_ucsc$RH, levels = c("1", "0"))

# The table is already long with a single grouping factor; a constant one-level
# factor is supplied as the x variable so the plot has the same shape as the other panels.
gencode_gtf_ensembl_ucsc$cell <- factor("txLength_nc", levels = "txLength_nc")
labels_9 <- levels(gencode_gtf_ensembl_ucsc$cell)

# One fill colour per distinct RH value, named for the legend.
colores_9 <- setNames(
  gg_color_hue(length(unique(gencode_gtf_ensembl_ucsc$RH))),
  c("RH+", "RH-")
)

# t.test() with default settings on txLength, RH == 1 versus RH != 1,
# restricted to rows whose gene_type is not "protein_coding". Only the p-value is kept.
p_vals_9 <- local({
  tbl <- gencode_gtf_ensembl_ucsc
  noncoding <- tbl$gene_type != "protein_coding"
  t.test(
    tbl[noncoding & tbl$RH == 1, "txLength"],
    tbl[noncoding & tbl$RH != 1, "txLength"]
  )$p.value
})

# Plotmath-parsable label: the p-value in scientific notation, or "< 2.2e-16"
# when it falls below that threshold.
fun_label_9 <- if (p_vals_9 >= 2.2e-16) {
  deparse(formatC(p_vals_9[1], format = "e", digits = 0))
} else {
  deparse(bquote("" < .(formatC(2.2e-16, format = "e", digits = 1))))
}

# Label anchor. The height is a fixed constant in this panel rather than
# being derived from the data maximum.
horiz_pos_9 <- 1 - 0.25
vert_pos_9 <- 10^5.3

p9 <- ggplot(
  data = gencode_gtf_ensembl_ucsc[gencode_gtf_ensembl_ucsc$gene_type != "protein_coding", ],
  mapping = aes(x = cell, y = txLength, fill = RH)
) +
  geom_violin(position = position_dodge(width = 0.6), width = 0.5, alpha = 0.1, lwd = 0.1) +
  # Raw observations in grey; data and aesthetics are inherited from ggplot().
  # Jitter is zero in both directions, so points are only dodged by RH.
  geom_point(
    position = position_jitterdodge(dodge.width = 0.6, jitter.width = 0.0, jitter.height = 0.0),
    shape = 16, colour = "grey", alpha = 1, size = 0.1
  ) +
  geom_boxplot(
    position = position_dodge(0.6), width = 0.1, alpha = 0.1,
    outlier.shape = NA, lwd = 0.2, fatten = 2, show.legend = FALSE
  ) +
  scale_fill_manual(values = unname(colores_9), labels = names(colores_9), name = NULL) +
  guides(
    shape = FALSE,
    fill = guide_legend(override.aes = list(shape = NA, size = 0.1), ncol = 1, byrow = TRUE)
  ) +
  # White box drawn first so the p-value text is not printed over plotted data.
  # Its lower edge sits 0.5 log10 units below the label in this panel.
  annotate(
    "rect",
    xmin = horiz_pos_9 - 0.2, xmax = horiz_pos_9 + 0.6,
    ymin = vert_pos_9 / 10^0.5, ymax = vert_pos_9 * 10^0.1,
    fill = "white"
  ) +
  annotate(
    geom = "text",
    x = horiz_pos_9, y = vert_pos_9,
    label = fun_label_9, parse = TRUE,
    hjust = 0, size = 2
  ) +
  theme2 +
  # Panel-specific overrides of theme2. The legend is switched off here;
  # the legend.* sizes are presumably kept for a later legend extraction — confirm.
  theme(
    plot.margin = unit(c(top = 0.5, right = 0.5, bottom = 0.5, left = 0.5), "cm"),
    plot.subtitle = element_text(size = 9, face = "plain", hjust = 0.5, margin = margin(0, 0, 13, 0)),
    axis.title.x = element_blank(),
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank(),
    legend.position = "none",
    legend.margin = margin(t = 0, r = 0, b = 0, l = 0, unit = "pt"),
    legend.box.margin = margin(t = 0, r = 0, b = 0, l = 0, unit = "pt"),
    legend.key.height = unit(0.3, "cm"),
    legend.key.width = unit(0.3, "cm"),
    legend.spacing.y = unit(0.3, "cm"),
    legend.spacing.x = unit(0.3, "cm"),
    legend.title = element_text(size = 7),
    legend.text = element_text(size = 7),
    legend.title.align = 0.7
  ) +
  scale_x_discrete(expand = expand_scale(add = .6)) +
  # log10 y axis with breaks at powers of ten, labelled as 10^k.
  scale_y_continuous(
    trans = log10_trans(),
    breaks = trans_breaks("log10", function(x) 10^x, n = 4),
    labels = trans_format("log10", math_format(10^.x)),
    expand = expand_scale(mult = .05)
  ) +
  # Zoom without dropping data; the upper limit is the label height.
  coord_cartesian(ylim = c(10^1, vert_pos_9)) +
  labs(y = expression(Length~(bp)), subtitle = "tx nc")
print(p9)





# #--------- tx_cr_nc together in one graph (9) ------------------------


# g_unique <- read.delim("growth_loci_unique.txt",sep="\t",stringsAsFactors=FALSE,header=TRUE,check.names=FALSE)
# gencode_gtf_ensembl_ucsc <- read.delim("gencode_gtf_ensembl_ucsc_v31.txt",header=TRUE,sep="\t",stringsAsFactors=FALSE,check.names=FALSE)


# gencode_gtf_ensembl_ucsc$RH <- numeric(nrow(gencode_gtf_ensembl_ucsc))
# gencode_gtf_ensembl_ucsc[gencode_gtf_ensembl_ucsc$gene_id %in% g_unique$ensembl_gene_id,"RH"] <- 1


# # no need to reshape because already in long form
# gencode_gtf_ensembl_ucsc$cell = character(nrow(gencode_gtf_ensembl_ucsc))
# gencode_gtf_ensembl_ucsc[gencode_gtf_ensembl_ucsc$gene_type == "protein_coding","cell"] <- "coding"
# gencode_gtf_ensembl_ucsc[gencode_gtf_ensembl_ucsc$gene_type != "protein_coding","cell"] <- "non-coding"


# level_order <- c("coding", "non-coding")


# gencode_gtf_ensembl_ucsc$cell <- factor(gencode_gtf_ensembl_ucsc$cell,levels=level_order)
# gencode_gtf_ensembl_ucsc$RH <- factor(gencode_gtf_ensembl_ucsc$RH,levels=c("1","0"))


# labels <- level_order

# colores_1 <- gg_color_hue(length(unique(gencode_gtf_ensembl_ucsc$RH)))
# names(colores_1) <- c("RH+","RH-")


# p_vals <- c(

			# ks.test(gencode_gtf_ensembl_ucsc[gencode_gtf_ensembl_ucsc$gene_type == "protein_coding" & gencode_gtf_ensembl_ucsc$RH==1,"txLength"], gencode_gtf_ensembl_ucsc[gencode_gtf_ensembl_ucsc$gene_type == "protein_coding" & gencode_gtf_ensembl_ucsc$RH !=1,"txLength"])$p.value,
			
			# ks.test(gencode_gtf_ensembl_ucsc[gencode_gtf_ensembl_ucsc$gene_type != "protein_coding" & gencode_gtf_ensembl_ucsc$RH==1,"txLength"], gencode_gtf_ensembl_ucsc[gencode_gtf_ensembl_ucsc$gene_type != "protein_coding" & gencode_gtf_ensembl_ucsc$RH !=1,"txLength"])$p.value
			
			# )
			
			
			
# fun_label = character(length(p_vals))			
			

# for(i in 1:length(p_vals))		{	
# if (p_vals[i] != 0) {

# fun_label[i] <- c(
				# deparse(bquote(italic(P) == .(formatC(p_vals[i],format="g",digits=1))))
				# ) } else {
					
# fun_label[i] <- c(deparse(bquote(italic(P) < .(formatC(2.2e-16,format="g",digits=2)))))
					
				# }

# }



# horiz_pos <- c(1:length(p_vals))-0.09
# vert_pos <-  c(
				# 5e5, 5e5
				# # max(gencode_gtf_ensembl_ucsc[gencode_gtf_ensembl_ucsc$gene_type == "protein_coding",c("geneLength")],na.rm=TRUE), 
				# # max(gencode_gtf_ensembl_ucsc[gencode_gtf_ensembl_ucsc$gene_type != "protein_coding",c("geneLength")],na.rm=TRUE)
				# )



# # violin plot
# p9 <- ggplot(
		# data = gencode_gtf_ensembl_ucsc, 
		# aes(
				# x = cell, 
				# y = txLength,
				# fill = RH
				# )
			# ) + 
		# geom_violin(
			# width=0.5, 
			# position = position_dodge(width=0.6), alpha=0.1,
			# lwd=0.1
			# ) +
		# geom_point(
			# data= gencode_gtf_ensembl_ucsc,
			# mapping=aes(x=cell,y= txLength),
			# shape=16, 
			# colour="grey",
			# alpha=1,
			# size=1,
			# position=position_jitterdodge(dodge.width=0.6,jitter.width=0.0)
			# ) +
		# geom_boxplot(
			# width=0.1,
			# position= position_dodge(0.6), alpha=0.1,
			# outlier.shape=NA,
			# lwd=0.2,
			# fatten=2,
			# show.legend = FALSE#,
			# #notch=TRUE
			# )  +
		# scale_fill_manual(
			# values=as.vector(colores_1),
			# labels=names(colores_1),
			# name=NULL		
			# )+
		# guides(
			# shape=FALSE,
	 		# fill = guide_legend(
				 		# override.aes = list(
				 		# #fill=NA,
				 		# shape=NA,
				 		# size=0.1
				 		# ),
			 		# ncol=1,
			 		# byrow=TRUE
			 		# )
	 		 # ) +
 		# annotate(
	 		# geom="text", 
	 		# x=horiz_pos, 
	 		# y=vert_pos, 
	 		# hjust=0,
	 		# label= fun_label, 
	 		# parse=TRUE, 
	 		# size=2.5
	 		# ) +
 		# theme2 + 
   		# theme(
	   		# axis.title.x=element_blank(),
	        # # axis.text.x=element_blank(),
	        # # axis.ticks.x=element_blank(),
		 	# plot.margin = unit(c(1.2,1.5,1.2,0.9), "cm"),
	 		# legend.position = c(0.9,0.95),
			# legend.margin=margin(t=0,r=0,b=0,l=0,unit = "pt"),
		 	# legend.box.margin=margin(t=0,r=0,b=0,l=0,unit = "pt"),
		 	# legend.key.height = unit(0.3, "cm"),
			# legend.key.width = unit(0.3, "cm"),
		 	# legend.spacing.y = unit(0.3, 'cm'),
		 	# legend.spacing.x = unit(0.3, 'cm'),
		 	# legend.title = element_text(size = 7),  
		 	# legend.text = element_text(size = 7),
	 		# legend.title.align=0.7
 			# ) +
		# scale_x_discrete(
			# labels = labels, 
			# expand = expand_scale(add = .6)
			# ) +
		# scale_y_continuous(
				# expand = expand_scale(mult = .05), 
				# trans = log10_trans(), 
				# # labels=function(x) {x*1e-3},
				# breaks = trans_breaks("log10", function(x) 10^x),
			   # labels = trans_format("log10", math_format(10^.x))#,
				# #limits=c(-2.2,6.25)
			# ) +
		# # xlab("Cell lines") + 
		# ylab(expression(Length~(bp))) +
		# labs(subtitle="Tx")
# print(p9)




#--------- cdsLength_cr (10) ------------------------
# Panel 10: CDS length (log10 y axis) for protein-coding rows, RH+ versus RH-.
# Both input tables are read fresh from disk at the start of the section.

g_unique <- read.delim("growth_loci_unique.txt", header = TRUE, sep = "\t", check.names = FALSE, stringsAsFactors = FALSE)
gencode_gtf_ensembl_ucsc <- read.delim("gencode_gtf_ensembl_ucsc_v31.txt", sep = "\t", header = TRUE, check.names = FALSE, stringsAsFactors = FALSE)

# RH flag: 1 when the row's gene_id occurs in g_unique$ensembl_gene_id, otherwise 0.
# Stored as a factor with level order "1", "0".
gencode_gtf_ensembl_ucsc$RH <- rep(0, nrow(gencode_gtf_ensembl_ucsc))
gencode_gtf_ensembl_ucsc$RH[gencode_gtf_ensembl_ucsc$gene_id %in% g_unique$ensembl_gene_id] <- 1
gencode_gtf_ensembl_ucsc$RH <- factor(gencode_gtf_ensembl_ucsc$RH, levels = c("1", "0"))

# The table is already long with a single grouping factor; a constant one-level
# factor is supplied as the x variable so the plot has the same shape as the other panels.
# The level is named after this panel (it previously carried the gene-length
# panel's "geneLength_cr" label, which also ended up in labels_10).
gencode_gtf_ensembl_ucsc$cell <- factor("cdsLength_cr", levels = "cdsLength_cr")
labels_10 <- levels(gencode_gtf_ensembl_ucsc$cell)

# One fill colour per distinct RH value, named for the legend.
colores_10 <- setNames(
  gg_color_hue(length(unique(gencode_gtf_ensembl_ucsc$RH))),
  c("RH+", "RH-")
)

# t.test() with default settings on cdsLength, RH == 1 versus RH != 1,
# restricted to protein-coding rows. Only the p-value is kept.
p_vals_10 <- local({
  tbl <- gencode_gtf_ensembl_ucsc
  coding <- tbl$gene_type == "protein_coding"
  t.test(
    tbl[coding & tbl$RH == 1, "cdsLength"],
    tbl[coding & tbl$RH != 1, "cdsLength"]
  )$p.value
})

# Plotmath-parsable label: the p-value in scientific notation, or "< 2.2e-16"
# when it falls below that threshold. Stored under the panel-suffixed name
# used by every other panel (fun_label_6 ... fun_label_9).
fun_label_10 <- if (p_vals_10 >= 2.2e-16) {
  deparse(formatC(p_vals_10[1], format = "e", digits = 0))
} else {
  deparse(bquote("" < .(formatC(2.2e-16, format = "e", digits = 1))))
}
# Backward-compatible alias: this section used to write the unsuffixed global.
# Remove once no other code is confirmed to read `fun_label`.
fun_label <- fun_label_10

# Label anchor: left of the single x category, at the largest coding cdsLength
# (no extra headroom is added in this panel).
horiz_pos_10 <- 1 - 0.24
vert_pos_10 <- max(
  gencode_gtf_ensembl_ucsc[gencode_gtf_ensembl_ucsc$gene_type == "protein_coding", "cdsLength"],
  na.rm = TRUE
)

p10 <- ggplot(
  data = gencode_gtf_ensembl_ucsc[gencode_gtf_ensembl_ucsc$gene_type == "protein_coding", ],
  mapping = aes(x = cell, y = cdsLength, fill = RH)
) +
  geom_violin(position = position_dodge(width = 0.6), width = 0.5, alpha = 0.1, lwd = 0.1) +
  # Raw observations in grey; data and aesthetics are inherited from ggplot().
  # NOTE(review): unlike the sibling panels, jitter.height is 0.1 here, so points
  # get random vertical jitter — confirm this is intentional.
  geom_point(
    position = position_jitterdodge(dodge.width = 0.6, jitter.width = 0.0, jitter.height = 0.1),
    shape = 16, colour = "grey", alpha = 1, size = 0.1
  ) +
  geom_boxplot(
    position = position_dodge(0.6), width = 0.1, alpha = 0.1,
    outlier.shape = NA, lwd = 0.2, fatten = 2, show.legend = FALSE
  ) +
  scale_fill_manual(values = unname(colores_10), labels = names(colores_10), name = NULL) +
  guides(
    shape = FALSE,
    fill = guide_legend(override.aes = list(shape = NA, size = 0.1), ncol = 1, byrow = TRUE)
  ) +
  # White box drawn first so the p-value text is not printed over plotted data.
  annotate(
    "rect",
    xmin = horiz_pos_10 - 0.2, xmax = horiz_pos_10 + 0.6,
    ymin = vert_pos_10*10^-0.3, ymax = vert_pos_10 * 10^0.1,
    fill = "white"
  ) +
  annotate(
    geom = "text",
    x = horiz_pos_10, y = vert_pos_10,
    label = fun_label_10, parse = TRUE,
    hjust = 0, size = 2
  ) +
  theme2 +
  # Panel-specific overrides of theme2. The legend is switched off here;
  # the legend.* sizes are presumably kept for a later legend extraction — confirm.
  theme(
    plot.margin = unit(c(top = 0.5, right = 0.5, bottom = 0.5, left = 0.5), "cm"),
    plot.subtitle = element_text(size = 9, face = "plain", hjust = 0.5, margin = margin(0, 0, 13, 0)),
    axis.title.x = element_blank(),
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank(),
    legend.position = "none",
    legend.margin = margin(t = 0, r = 0, b = 0, l = 0, unit = "pt"),
    legend.box.margin = margin(t = 0, r = 0, b = 0, l = 0, unit = "pt"),
    legend.key.height = unit(0.3, "cm"),
    legend.key.width = unit(0.3, "cm"),
    legend.spacing.y = unit(0.3, "cm"),
    legend.spacing.x = unit(0.3, "cm"),
    legend.title = element_text(size = 7),
    legend.text = element_text(size = 7),
    legend.title.align = 0.7
  ) +
  scale_x_discrete(expand = expand_scale(add = .6)) +
  # log10 y axis with breaks at powers of ten, labelled as 10^k
  # (default number of breaks in this panel).
  scale_y_continuous(
    trans = log10_trans(),
    breaks = trans_breaks("log10", function(x) 10^x),
    labels = trans_format("log10", math_format(10^.x)),
    expand = expand_scale(mult = .05)
  ) +
  # Zoom without dropping data; the upper limit is the label height.
  coord_cartesian(ylim = c(10^1.5, vert_pos_10)) +
  labs(y = expression(Length~(bp)), subtitle = " cds")
print(p10)





# #--------- 5ut_3ut_ut_cr together (11) ------------------------
# #--------- Do not include -------------------------------------
# # -------- Confounded with geneLength -------------------------


# g_unique <- read.delim("growth_loci_unique.txt",sep="\t",stringsAsFactors=FALSE,header=TRUE,check.names=FALSE)
# gencode_gtf_ensembl_ucsc <- read.delim("gencode_gtf_ensembl_ucsc_v31.txt",header=TRUE,sep="\t",stringsAsFactors=FALSE,check.names=FALSE)



# # All utrs are positively correlated with txLength, so normalize
# gencode2 <- gencode_gtf_ensembl_ucsc
# gencode2$utrDiff <- gencode2$txLength-gencode2$cdsLength
# gencode2$percent.5utr <- gencode2$"5utrDiff"/gencode2$txLength
# gencode2$percent.3utr <- gencode2$"3utrDiff"/gencode2$txLength
# gencode2$percent.utr <- gencode2$utrDiff/gencode2$txLength


# gencode2$RH <- numeric(nrow(gencode2))
# gencode2[gencode2$gene_id %in% g_unique$ensembl_gene_id,"RH"] <- 1
# # ?unnecessary?
# gencode2 <- gencode2[gencode2$gene_type == "protein_coding",]


# gencode3 <- reshape(gencode2, 
  # varying = c("percent.5utr", "percent.3utr",  "percent.utr"), 
  # v.names = "Percent.utr",
  # timevar = "utr", 
  # times = c("percent.5utr", "percent.3utr",  "percent.utr"), 
  # new.row.names = 1:1e6,
  # direction = "long")



# gencode3[gencode3$utr == "percent.5utr","utr"] <- "5' utr"
# gencode3[gencode3$utr == "percent.3utr","utr"] <- "3' utr"
# gencode3[gencode3$utr == "percent.utr","utr"] <- "utr"


# level_order <- c("5' utr", "3' utr", "utr")



# gencode3$utr <- factor(gencode3$utr,levels=level_order)
# gencode3$RH <- factor(gencode3$RH,levels=c("1","0"))



# labels <- level_order

# colores_1 <- gg_color_hue(length(unique(gencode2$RH)))
# names(colores_1) <- c("RH+","RH-")



# # None of the utrs are significant with RH when txLength included in lm:
# summary(lm(gencode2$"5utrDiff"~ txLength+RH,data=gencode2))
# summary(lm(gencode2$"3utrDiff"~ txLength+RH,data=gencode2))
# summary(lm(gencode2$"utrDiff"~ txLength+RH,data=gencode2))

# # # So following p vals are confounded with txLength. Sig p vals for 5utr and 3utr not valid.
# # p_vals <- c(
			# # ks.test(gencode2[gencode2$gene_type == "protein_coding" & gencode2$RH==1,"percent.5utr"], gencode2[gencode2$gene_type == "protein_coding" & gencode2$RH !=1,"percent.5utr"])$p.value,
			# # ks.test(gencode2[gencode2$gene_type == "protein_coding" & gencode2$RH==1,"percent.3utr"], gencode2[gencode2$gene_type == "protein_coding" & gencode2$RH !=1,"percent.3utr"])$p.value,
			# # ks.test(gencode2[gencode2$gene_type == "protein_coding" & gencode2$RH==1,"percent.utr"], gencode2[gencode2$gene_type == "protein_coding" & gencode2$RH !=1,"percent.utr"])$p.value
			# # )
			


# p_vals <- c(
			# summary(lm(gencode2$"5utrDiff"~ txLength+RH,data=gencode2))$coefficients["RH","Pr(>|t|)"],
			# summary(lm(gencode2$"3utrDiff"~ txLength+RH,data=gencode2))$coefficients["RH","Pr(>|t|)"],
			# summary(lm(gencode2$"utrDiff"~ txLength+RH,data=gencode2))$coefficients["RH","Pr(>|t|)"]
			# )	
			




# fun_label = character(length(p_vals))			
			

# for(i in 1:length(p_vals))		{	
# if (p_vals[i] != 0) {

# fun_label[i] <- c(
				# deparse(bquote(italic(P) == .(formatC(p_vals[i],format="g",digits=1))))
				# ) } else {
					
# fun_label[i] <- c(deparse(bquote(italic(P) < .(formatC(2.2e-16,format="g",digits=2)))))
					
				# }

# }



# horiz_pos <- c(1:length(p_vals))-0.09
# vert_pos <-  c( 170, 170, 170
				# # max(gencode2[,c("percent.5utr")],na.rm=TRUE) + 30, 
				# # max(gencode2[,c("percent.3utr")],na.rm=TRUE) + 30,
				# # max(gencode2[,c("percent.utr")],na.rm=TRUE) + 30
				# )



# # violin plot
# p11 <- ggplot(
		# data = gencode3, 
		# aes(
				# x = utr, 
				# y = Percent.utr*1e2, # make %
				# fill = RH
				# )
			# ) + 
		# geom_violin(
			# width=0.5, 
			# position = position_dodge(width=0.6), alpha=0.1,
			# lwd=0.1
			# ) +
		# geom_point(
			# data= gencode3,
			# mapping=aes(x=utr,y= Percent.utr*1e2), # make %
			# shape=16, 
			# colour="grey",
			# alpha=1,
			# size=1,
			# position=position_jitterdodge(dodge.width=0.6,jitter.width=0.0)
			# ) +
		# geom_boxplot(
			# width=0.1,
			# position= position_dodge(0.6), alpha=0.1,
			# outlier.shape=NA,
			# lwd=0.2,
			# fatten=2,
			# show.legend = FALSE#,
			# #notch=TRUE
			# )  +
		# scale_fill_manual(
			# values=as.vector(colores_1),
			# labels=names(colores_1),
			# name=NULL		
			# )+
		# guides(
			# shape=FALSE,
	 		# fill = guide_legend(
				 		# override.aes = list(
				 		# #fill=NA,
				 		# shape=NA,
				 		# size=0.1
				 		# ),
			 		# ncol=2,
			 		# byrow=TRUE
			 		# )
	 		 # ) +
 		# annotate(
	 		# geom="text", 
	 		# x=horiz_pos, 
	 		# y=vert_pos, 
	 		# hjust=0,
	 		# label= fun_label, 
	 		# parse=TRUE, 
	 		# size=2.5
	 		# ) +
 		# theme2 + 
   		# theme(
	   		# axis.title.x=element_blank(),
	        # # axis.text.x=element_blank(),
	        # # axis.ticks.x=element_blank(),
		 	# plot.margin = unit(c(1.2,1.5,1.2,0.9), "cm"),
	 		# legend.position = c(0.8,0.1),
			# legend.margin=margin(t=0,r=0,b=0,l=0,unit = "pt"),
		 	# legend.box.margin=margin(t=0,r=0,b=0,l=0,unit = "pt"),
		 	# legend.key.height = unit(0.3, "cm"),
			# legend.key.width = unit(0.3, "cm"),
		 	# legend.spacing.y = unit(0.3, 'cm'),
		 	# legend.spacing.x = unit(0.3, 'cm'),
		 	# legend.title = element_text(size = 7),  
		 	# legend.text = element_text(size = 7),
	 		# legend.title.align=0.7
 			# ) +
		# scale_x_discrete(
			# labels = labels, 
			# expand = expand_scale(add = .6)
			# ) +
		# scale_y_continuous(
				# expand = expand_scale(mult = .05), 
				# trans = log10_trans(), 
				# # labels=function(x) {x*1e-3},
				# breaks = trans_breaks("log10", function(x) 10^x),
			   # labels = trans_format("log10", math_format(10^.x))#,
				# #limits=c(-2.2,6.25)
			# ) +
		# # xlab("Cell lines") + 
		# ylab(expression(utr~('%'))) +
		# labs(subtitle="utr")
# print(p11)




# #--------------- lentiviral orf (12) ------------------------

# g_unique <- read.delim("growth_loci_unique.txt",sep="\t",stringsAsFactors=FALSE,header=TRUE,check.names=FALSE)

# orf <- read.delim("orf.txt",sep="\t",stringsAsFactors=FALSE,header=TRUE,check.names=TRUE,fill=FALSE,skip=1)
# orf$ave_A_B_all_GFP <- rowMeans(orf[,c("A_Z.all.GFP.","B_Z.all.GFP.")])
# orf$delta_A <- orf[,c("A_.GFP.")]-orf[,c("A_.EdU..of.GFP..1")]
# orf$delta_B <- orf[,c("B_.GFP.")]-orf[,c("B_.EdU..of.GFP..1")]
# orf$ave_delta_A_B <- rowMeans(orf[,c("delta_A","delta_B")])


# orf$RH <- numeric(nrow(orf))
# orf[orf$HUGO.Symbol %in% g_unique$geneSymbol,"RH"] <- 1


# # n.s.
# t.test(orf[orf$RH==1,"ave_A_B_all_GFP"],orf[orf$RH==0,"ave_A_B_all_GFP"])
# t.test(orf[orf$RH==1,"ave_delta_A_B"],orf[orf$RH==0,"ave_delta_A_B"])


# # n.s.
# ks.test(orf[orf$RH==1,"ave_A_B_all_GFP"],orf[orf$RH==0,"ave_A_B_all_GFP"])
# ks.test(orf[orf$RH==1,"ave_delta_A_B"],orf[orf$RH==0,"ave_delta_A_B"])


# # n.s.
# t.test(orf[orf$RH==1,"ave_A_B_all_GFP"],orf[orf$RH==0,"ave_A_B_all_GFP"])
# t.test(orf[orf$RH==1,"ave_delta_A_B"],orf[orf$RH==0,"ave_delta_A_B"])



       
# #---------roc (13)------------------------

# cf roc_explore_1.R and crispr_fig_1.R
# Choose: 
# HeLa from petal (0.6), 
# RH exp (0.56), quite nice, 
# string prot interactions. Pretty nice, score ==160 (AUC = 0.57),  980 (AUC=0.58)
# geneLength_cr_nc. Outstanding rocs. AUC =0.76 CR, 0.66 non-coding



# moffat crisp

# CRISPR Bayes factors: keep the gene column and five screens
# (the first-generation CRISPR screen BF_a375_GeCKo is omitted).
petal <- read.table("petal.txt", header = TRUE, sep = "\t", stringsAsFactors = FALSE)
petal <- petal[, c("Gene", "BF_hct116", "BF_hela", "BF_gbm", "BF_rpe1", "BF_dld1")]

# Flag genes present in the RH growth-loci list (1 = present, 0 = absent).
g_unique <- read.delim("growth_loci_unique.txt", header = TRUE, sep = "\t", stringsAsFactors = FALSE, check.names = FALSE)
petal$RH <- as.numeric(petal$Gene %in% g_unique$geneSymbol)

# Wide -> long: one row per gene and cell line, Bayes factor in column "BF".
petal_l <- reshape(
  petal,
  direction = "long",
  varying = c("BF_hct116", "BF_hela", "BF_gbm", "BF_rpe1", "BF_dld1"),
  v.names = "BF",
  timevar = "cell",
  times = c("BF_hct116", "BF_hela", "BF_gbm", "BF_rpe1", "BF_dld1"),
  new.row.names = 1:1e6
)

# Replace the column-derived identifiers with display names for the cell lines.
petal_l$cell <- unname(
  c(
    BF_hct116 = "HCT116",
    BF_hela = "HeLa",
    BF_gbm = "GBM",
    BF_rpe1 = "RPE-1",
    BF_dld1 = "DLD-1"
  )[petal_l$cell]
)

# ROC input for HeLa only; the Bayes factor is negated to give the ROC score.
roc_petal_hela <- petal_l[petal_l$cell == "HeLa", ]
roc_petal_hela <- roc_petal_hela[, c("Gene", "RH", "cell", "BF")]
names(roc_petal_hela)[4] <- "score"
roc_petal_hela$score <- -roc_petal_hela$score


# rh exp

# RH expression arrays: build the ROC input `roc_rh_exp` (Gene, RH, cell, score).
rh <- read.table("rh_array.txt", sep = "\t", stringsAsFactors = FALSE, header = TRUE)
dim(rh)
# [1] 24353    88

# For some reason, lots of entries with no gene names.
dim(rh[rh$Gene == "", ])
# [1] 5263   88

rh <- rh[rh$Gene != "", ]
# Still gene duplicates because of multiple spots on the array, multicopy genes, etc.:
dim(rh[duplicated(rh$Gene), ])
# [1] 2350   88

# log2-transform every column except the first
# (assumes column 1 is Gene -- TODO confirm against rh_array.txt).
rh[, -1] <- log2(rh[, -1])

# Get rid of the remaining gene duplicates: one row per gene, mean of the log2 values.
rh <- aggregate(. ~ Gene, rh, mean)

# Get rid of gene names converted to dates.
# NOTE(review): assumes these are exactly the first 22 rows after aggregation;
# re-check if rh_array.txt changes.
rh <- rh[-c(1:22), ]

# Average over columns 3 to 81.
rh$rh_ave <- rowMeans(rh[, c(3:81)])

# Flag genes present in the RH growth-loci list. Uses the same
# `df[rows, "RH"] <- 1` idiom as the other ROC inputs: it does not copy the
# row subset and is a no-op (rather than an error) when no gene matches.
g_unique <- read.delim("growth_loci_unique.txt", sep = "\t", stringsAsFactors = FALSE, header = TRUE, check.names = FALSE)
rh$RH <- 0
rh[rh$Gene %in% g_unique$geneSymbol, "RH"] <- 1

# Already one row per gene with a single group, so no reshape is needed.
# A "dummy" group label is supplied for consistency with the other ROC inputs.
rh$cell <- "RH_exp"

# Select the ROC columns in their final order; the mean expression is negated
# to give the ROC score.
roc_rh_exp <- rh[, c("Gene", "RH", "cell", "rh_ave")]
colnames(roc_rh_exp)[4] <- "score"
roc_rh_exp$score <- -roc_rh_exp$score



# string prot interactions

# string prot interactions. Both pretty nice roc curves, score ==160 (AUC = 0.58), looks a bit nicer than 995 (AUC=0.54). Have to change sign of one score edge number, because in different directions vs RH status. score == 995 not Wilcox significant (even though t test significant), because lack of power, so omitted from roc curves.

# cf crispr_fig1_ttest.R

# string db v11.0 downloaded 09/11/19 from https://string-db.org/cgi/download.pl?sessionId=i50ty8Z04nR7&species_text=Homo+sapiens
# string is also needed for p12 below.

# STRING protein-protein links (space-separated, with header) and the RH growth-loci list.
string <- read.table(
  "9606.protein.links.v11.0.txt",
  header = TRUE,
  sep = " ",
  stringsAsFactors = FALSE
)
g_unique <- read.delim(
  "growth_loci_unique.txt",
  header = TRUE,
  sep = "\t",
  stringsAsFactors = FALSE,
  check.names = FALSE
)



# Translate ENSP to ENSG ids
# Downloading translation table from ensembl would take too long: 3 hours for gene1
# # library(biomaRt)
# ensembl <- useDataset("hsapiens_gene_ensembl", mart=useMart("ensembl"))

# # download from ensembl
# gene_tx_1 <- getBM(
				# attributes=c(
						# 'external_gene_name',
						# 'ensembl_gene_id',
						# 'ensembl_transcript_id'
					# ), 
			    # filters = 'ensembl_peptide_id', 
			    # values = string$protein1, 
			    # mart = ensembl
			    # )
			    
			    
			    
# # Can use translation table provided by string db, human.name_2_string.tsv
# names <-read.table("human.name_2_string.tsv",sep="\t",stringsAsFactors=FALSE,header=FALSE)
# colnames(names) <- c("version","geneSymbol","ensembl_pr_id")
			    

# >>>>>>>>>>> Best solution <<<<<<<<<<<<<<<<<<<<<<<<<
# To translate, used 9606.protein.aliases.v11.0 
# downloaded 09/11/19 from 
# https://string-db.org/cgi/download.pl?sessionId=i50ty8Z04nR7&species_text=Homo+sapiens 




# STRING alias table; the first line of the file is skipped and no header is read.
aliases <- read.delim(
  "9606.protein.aliases.v11.0.txt",
  header = FALSE,
  sep = "\t",
  skip = 1,
  fill = TRUE,
  stringsAsFactors = FALSE
)

# The file provides its column names, but they are blanked out by ## symbols.
names(aliases) <- c("string_protein_id", "alias", "source")

# Keep only aliases that start with "ENSG".
aliases <- aliases[grepl("^ENSG", aliases$alias), ]


# Translate both interaction partners to their ENSG alias, one side at a time.
# Each merge takes a few mins. After a merge the alias is column 4; it is
# renamed gene1/gene2 and the protein id used as merge key (column 1) is dropped.
for (side in 1:2) {
  string <- merge(
    string,
    aliases[, c("string_protein_id", "alias")],
    by.x = paste0("protein", side),
    by.y = "string_protein_id"
  )
  colnames(string)[4] <- paste0("gene", side)
  string <- string[, -1]
}


# The remaining first column is renamed "score".
colnames(string)[1] <- "score"


# Candidate score thresholds and their indices.
# Note: the variable `seq` has the same name as the base function seq().
scor <- seq(from = 150, to = 1000, by = 5)
seq <- seq_along(scor)



# # Go to START HERE to save time, otherwise, run below
# # DO NOT DELETE
# # Takes about 5 min

# string_ans <- data.frame(
						# score = numeric(), 
						# mean_rh_plus =  numeric(), 
						# sem_rh_plus = numeric(), 
						# mean_rh_minus =  numeric(), 
						# sem_rh_minus = numeric(), 
						# t = numeric(), 
						# # df = numeric(), 
						# P = numeric()
						# )
						
						
# sem <- function(x) {sqrt(var(x,na.rm=TRUE)/sum(!is.na(x)))}


# # Loop ends in error message, but ok, file still written.
# # Takes about 10 min

# for(i in seq) {
# print(i)

# string_shave <- string[string$score >= scor[i], ]

# string_agg_1 <- aggregate(rep(1, length(string_shave$gene1)), by=list(string_shave$gene1), sum)
# colnames(string_agg_1)<-c("Gene","edge1")

# string_agg_2 <- aggregate(rep(1, length(string_shave$gene2)), by=list(string_shave$gene2), sum)
# colnames(string_agg_2)<-c("Gene","edge2")

# string_agg_3 <- merge(string_agg_1,string_agg_2)
# string_agg_3$sum_edge<-string_agg_3$edge1+string_agg_3$edge2

# # edge1 == edge2 because string table is completely symmetrical. Therefore use either edge1 or edge2 arbitrarily

# a <- string_agg_3[string_agg_3$Gene %in% g_unique$ensembl_gene_id,"edge1"]
# b <- string_agg_3[!(string_agg_3$Gene %in% g_unique$ensembl_gene_id),"edge1"]
# test <- t.test(a,b)

# string_ans[i,] <- c(
					# scor[i],
					# mean(a),
					# sem(a),
					# mean(b),
					# sem(b),
					# test$statistic,
					# # test$parameter,
					# test$p.value
# )

# }


# string_ans$q <- p.adjust(string_ans$P, method = "BH")

# write.table(string_ans,"string_ans.txt",quote=FALSE,sep="\t",row.names=FALSE)

# START HERE if desired to save time
string_ans <- read.table("string_ans.txt", header = TRUE, sep = "\t", stringsAsFactors = FALSE)


# Visual inspection of string_ans shows score threshold giving max -logP in first peak is score = 160, i = 3
first_peak_idx <- 3
scor[first_peak_idx]
# [1] 160


# value of i which gives most significant p val (which is in 2nd peak)
second_peak_idx <- which.min(string_ans$P)
second_peak_idx
# [1] 170 # confusingly, similar to number as above, but it is correct


# String combined score threshold that gives most significant p val in second peak
string_ans[second_peak_idx, "score"]
# [1] 995


# Threshold label written into the peak table for the second peak. It is taken
# from the same index the loop below uses, so the sign flip at the end of this
# block always targets the rows that were actually generated (the original
# compared against the literal "995").
second_peak_score <- scor[second_peak_idx]


# Edge counts per gene at one score threshold, split by RH status.
#   min_score: keep rows of `string` with score >= min_score.
# Returns a data frame with columns RH (1 = gene in g_unique$ensembl_gene_id,
# else 0), edges (number of retained rows in which the gene appears as gene1)
# and score (the threshold). RH = 1 rows come first.
# Reads the globals `string` and `g_unique`.
string_edges_by_rh <- function(min_score) {
  string_shave <- string[string$score >= min_score, ]

  edge1 <- aggregate(rep(1, length(string_shave$gene1)), by = list(string_shave$gene1), sum)
  colnames(edge1) <- c("Gene", "edge1")

  edge2 <- aggregate(rep(1, length(string_shave$gene2)), by = list(string_shave$gene2), sum)
  colnames(edge2) <- c("Gene", "edge2")

  # Inner join on Gene: only genes seen on both sides are kept.
  both <- merge(edge1, edge2)

  # edge1 == edge2 because string table is completely symmetrical.
  # Therefore use either edge1 or edge2 arbitrarily.
  is_rh <- both$Gene %in% g_unique$ensembl_gene_id
  a <- both[is_rh, "edge1"]
  b <- both[!is_rh, "edge1"]

  data.frame(
    RH = c(rep(1, length(a)), rep(0, length(b))),
    edges = c(a, b),
    score = min_score
  )
}


# One table per peak, collected in a list and bound once.
peak_idx <- c(first_peak_idx, second_peak_idx)
peak_tables <- vector("list", length(peak_idx))

for (k in seq_along(peak_idx)) {
  print(peak_idx[k])
  peak_tables[[k]] <- string_edges_by_rh(scor[peak_idx[k]])
}

string_peaks <- do.call(rbind, peak_tables)


############ Not a mistake, although a bit confusing. Column named "score" (the original string score) becomes "cell" for ggplot2 color. Column named "edges" becomes "score" for the roc curve. ###################
colnames(string_peaks) <- c("RH", "score", "cell")


# gene name not necessary for roc curve. Allows rbind with other roc data frames.
string_peaks$Gene <- NA
roc_string <- string_peaks[, c("Gene", "RH", "cell", "score")]

# Negate the edge count for the second-peak threshold: puts roc on top of diag.
is_second_peak <- roc_string$cell == second_peak_score
roc_string[is_second_peak, "score"] <- -roc_string[is_second_peak, "score"]



# geneLength

# Gene-length ROC input, split into protein-coding and all other gene types.
g_unique <- read.delim(
  "growth_loci_unique.txt",
  header = TRUE,
  sep = "\t",
  stringsAsFactors = FALSE,
  check.names = FALSE
)
gencode_gtf_ensembl_ucsc <- read.delim(
  "gencode_gtf_ensembl_ucsc_v31.txt",
  header = TRUE,
  sep = "\t",
  stringsAsFactors = FALSE,
  check.names = FALSE
)


# RH flag: 1 when the gene id is in the growth-loci list, otherwise 0.
gencode_gtf_ensembl_ucsc$RH <- as.numeric(
  gencode_gtf_ensembl_ucsc$gene_id %in% g_unique$ensembl_gene_id
)


# Already in long form, so no reshape; the group label is derived from gene_type.
gencode_gtf_ensembl_ucsc$cell <- character(nrow(gencode_gtf_ensembl_ucsc))
gencode_gtf_ensembl_ucsc[gencode_gtf_ensembl_ucsc$gene_type == "protein_coding", "cell"] <- "geneLength_cr"
gencode_gtf_ensembl_ucsc[gencode_gtf_ensembl_ucsc$gene_type != "protein_coding", "cell"] <- "geneLength_nc"

# Pick the ROC columns directly in their final order; geneLength is the score.
roc_geneLength <- gencode_gtf_ensembl_ucsc[, c("geneSymbol", "RH", "cell", "geneLength")]
colnames(roc_geneLength) <- c("Gene", "RH", "cell", "score")



# ------------------ archive roc.txt -------------------------

# Archive roc. Contains string score == 995.

# # put df together, INCLUDES roc_string score == 995.
# roc <- rbind(roc_petal_hela, roc_rh_exp, roc_string, roc_geneLength)

# write.table(roc,"roc.txt",quote=FALSE,sep="\t",row.names=FALSE)

# -----------------------------------------------------------


# put df together
# omit string score == 995, since n.s. by Wilcox (though significant by t test)
roc <- rbind(roc_petal_hela, roc_rh_exp, roc_string[roc_string$cell == 160, ], roc_geneLength)


# Plot/legend order of the ROC groups.
level_order_13 <- c("geneLength_cr", "geneLength_nc", "HeLa", "160", "RH_exp")

roc$cell <- factor(roc$cell, levels = level_order_13)
roc$RH <- factor(roc$RH, levels = c("1", "0"))


# One colour per plotted group. Sized from level_order_13 so that the palette
# always matches the names assigned below (unique(roc$cell) would also count NA
# if a group label were ever missing from level_order_13).
colores_13 <- gg_color_hue(length(level_order_13))
names(colores_13) <- level_order_13


labels_13 <- c("Gene length, cr", "Gene length, nc", "CRISPR HeLa", "Protein Ix, score = 160", "RH expression")


# AUC per group, formatted to two decimals for the legend.
# Levels and direction are given explicitly: RH == "0" are the controls,
# RH == "1" the cases, and a higher score predicts a case. This is the
# orientation in which the curves are drawn (d = RH as 0/1, m = score), so the
# label is the area under the plotted curve. Without them pROC chooses the
# direction from the group medians and can report the area of the mirrored curve.
auc_13 <- vapply(
  level_order_13,
  function(grp) {
    in_grp <- which(roc$cell == grp)
    fit <- pROC::roc(
      response = roc$RH[in_grp],
      predictor = roc$score[in_grp],
      levels = c("0", "1"),
      direction = "<"
    )
    format(round(as.numeric(pROC::auc(fit)), digits = 2), nsmall = 2)
  },
  character(1),
  USE.NAMES = FALSE
)

# Legend text: "<label> (<AUC>)".
labels_13 <- paste0(labels_13, " (", auc_13, ")")



# ROC panel: one curve per group in `roc`, a grey diagonal for reference, and a
# legend drawn inside the panel with labels taken from labels_13.
p13 <- ggplot() +
  # --- layers ---
  geom_roc(n.cuts = 0) +
  geom_line(
    data = roc,
    mapping = aes(
      d = as.numeric(as.character(RH)),
      m = score,
      color = cell
    ),
    stat = "roc",
    lwd = 0.2,
    show.legend = TRUE
  ) +
  geom_abline(intercept = 0, slope = 1, lwd = 0.2, colour = "grey80") +
  # --- scales and legend ---
  scale_color_manual(values = colores_13, name = NULL, labels = labels_13) +
  scale_x_continuous(breaks = c(0, 0.5, 1), labels = c(0, 0.5, 1)) +
  scale_y_continuous(breaks = c(0, 0.5, 1), labels = c(0, 0.5, 1)) +
  guides(
    colour = guide_legend(
      override.aes = list(fill = NA, shape = 1, size = 0.2),
      ncol = 1,
      byrow = TRUE
    )
  ) +
  # --- labels ---
  labs(x = "FPR", y = expression(TPR), subtitle = "ROC") +
  # --- theme: shared base first, then the panel-specific overrides ---
  theme2 +
  theme(
    legend.key = element_blank(),
    plot.margin = unit(c(top = 0.5, right = 0.5, bottom = 0.5, left = 0.5), "cm"),
    plot.subtitle = element_text(size = 9, face = "plain", hjust = 0.5, margin = margin(0, 0, 13, 0)),
    # axis.title.x=element_blank(),
    # axis.text.x=element_blank(),
    # axis.ticks.x=element_blank(),
    # plot.title = element_text(size=32, face="bold", hjust = -0.14), #can provide "A","B", by ggtitle, but used plot_grid wch can shift more left
    legend.position = c(0.9, 0.3),
    legend.margin = margin(t = 0, r = 0, b = 0, l = 0, unit = "pt"),
    legend.box.margin = margin(t = 0, r = 0, b = 0, l = 0, unit = "pt"),
    legend.key.height = unit(0.05, "cm"),
    legend.key.width = unit(0.3, "cm"),
    legend.spacing.y = unit(0.05, "cm"),
    legend.spacing.x = unit(0.3, "cm"),
    legend.title = element_text(size = 6),
    legend.text = element_text(size = 6),
    legend.title.align = 0.7
  )
print(p13)



#------------------Make file --------------------------


# for(i in c(1:14)) {
	# assign(paste0("p",i), readRDS(paste0("p",i)))
	# }
	
	
# Composite figure. Panels are placed in relative canvas coordinates; all small
# panels share one size (0.2 x 0.23), while the legend and the ROC panel get
# their own geometry. The outer parentheses print the result at top level.
(p_comp <- local({
  # Place one standard-size panel with its lower-left corner at (x, y).
  small_panel <- function(plot, x, y) {
    draw_plot(plot, x = x, y = y, width = 0.2, height = 0.23)
  }

  ggdraw() +
    # top row, followed by the shared legend
    small_panel(p14, 0.0, 0.76) +
    small_panel(p2, 0.22, 0.76) +
    small_panel(p1, 0.44, 0.76) +
    small_panel(p6, 0.66, 0.76) +
    draw_plot(legend, x = 0.92, y = 0.96, width = 1, height = 1) +
    # middle row
    small_panel(p7, 0.0, 0.51) +
    small_panel(p8, 0.22, 0.51) +
    small_panel(p9, 0.44, 0.51) +
    small_panel(p4, 0.66, 0.51) +
    # bottom row, with the larger ROC panel
    small_panel(p5, 0.0, 0.27) +
    small_panel(p10, 0.22, 0.27) +
    draw_plot(p13, x = 0.44, y = 0.10, width = 0.4, height = 0.4) +
    # panel letters; the fifth entry (the legend slot) is an empty label of size 0
    draw_plot_label(
      label = c("A", "B", "C", "D", "", "E", "F", "G", "H", "I", "J", "K"),
      x = c(0.0, 0.22, 0.44, 0.66, 0.92, 0.0, 0.22, 0.44, 0.66, 0.0, 0.22, 0.44),
      y = c(1.0, 1.0, 1.0, 1.0, 1.0, 0.76, 0.76, 0.76, 0.76, 0.51, 0.51, 0.51),
      size = c(14, 14, 14, 14, 0, 14, 14, 14, 14, 14, 14, 14)
    )
}))


# Write the composite figure to PDF, TIFF and PNG (300 dpi and 1200 dpi).
# The plot is drawn with an explicit print(): a bare `p_comp` is only rendered
# by top-level auto-printing, so the files would come out empty when this
# script is run through source().

pdf("roc_3.pdf", width = 7.5, height = 7.5, useDingbats = FALSE)
print(p_comp)
dev.off()


tiff("roc_3.tif", width = 7.5, height = 7.5, units = "in", res = 300)
print(p_comp)
dev.off()



png("roc_3.png", width = 7.5, height = 7.5, units = "in", res = 300)
print(p_comp)
dev.off()



png("roc_hi_res_3.png", width = 7.5, height = 7.5, units = "in", res = 1200)
print(p_comp)
dev.off()














