library(ggplot2)
library(cowplot) #used with plot_grid 
library(scales)

# --------------- functions ---------------------------


# Extract the legend grob (named "guide-box") from a ggplot object.
#
# Args:
#   myggplot: a ggplot object.
# Returns:
#   The first grob of the built plot whose name is "guide-box".
# Raises:
#   An error when the built plot holds no "guide-box" grob, instead of the
#   opaque subscript error the bare `[[` lookup would give.
# NOTE(review): this definition masks cowplot::get_legend(), which is attached
# at the top of the file - confirm that is intended.
get_legend <- function(myggplot) {
  tmp <- ggplot_gtable(ggplot_build(myggplot))
  # vapply() keeps the result a character vector even if a grob has no name.
  grob_names <- vapply(
    tmp$grobs,
    function(x) if (is.null(x$name)) NA_character_ else as.character(x$name)[1],
    character(1)
  )
  leg <- which(grob_names == "guide-box")
  if (length(leg) == 0) {
    stop("no \"guide-box\" grob found in plot", call. = FALSE)
  }
  # Take the first match: `[[` with several indices would index recursively.
  tmp$grobs[[leg[1]]]
}


# Standard error of the mean of x. NA values are excluded from both the
# variance and the observation count.
sem <- function(x) {
  n_obs <- sum(!is.na(x))
  sqrt(var(x, na.rm = TRUE) / n_obs)
}


# Two-sample t-test of a versus b (t.test() defaults), plus per-sample summaries.
#
# Prints the full t.test() report, the unrounded P value, and for each sample
# the mean, SEM (via sem()), SD and number of non-NA observations.
#
# Args:
#   a, b: numeric vectors; NA values are ignored in the summaries.
# Returns:
#   Invisibly, the last printed string ("number in b = ...").
compare <- function(a, b) {
  # Run the test once and reuse it for the report and the exact P value.
  tt <- t.test(a, b)
  print(tt)
  print(paste0("exact P value = ", tt$p.value))

  # Print the four summary lines for one sample; `label` is "a" or "b".
  describe <- function(label, x) {
    print(paste0("mean of ", label, " = ", mean(x, na.rm = TRUE)))
    print(paste0("sem of ", label, " = ", sem(x)))
    print(paste0("sd of ", label, " = ", sd(x, na.rm = TRUE)))
    print(paste0("number in ", label, " = ", sum(!is.na(x))))
  }

  describe("a", a)
  describe("b", b)
}



#----------------Aesthetics ---------------------------

# base style axes: https://stackoverflow.com/questions/25327694/how-to-tweak-the-extent-to-which-an-axis-is-drawn-in-ggplot2

# Shared theme for all panels: no grid or panel background, thin black axes,
# legend hidden unless a plot overrides legend.position.
theme2 <- theme(
  plot.margin = unit(c(top = 1.25, right = 1.0, bottom = 1.25, left = 1.0), "cm"),
  panel.grid.major = element_blank(),
  panel.grid.minor = element_blank(),
  panel.background = element_blank(),
  axis.line.x = element_line(colour = "black", size = 0.1),
  axis.line.y = element_line(colour = "black", size = 0.1),
  axis.ticks = element_line(colour = "black", size = 0.1),
  axis.text = element_text(size = 12), # numbers on tick marks of x and y axes
  axis.title = element_text(size = 14), # titles of x and y axes
  # y axis title: margin space is added on its right-hand side
  axis.title.y = element_text(margin = margin(t = 0, r = 13, b = 0, l = 0)),
  # x axis title: margin space is added on top
  axis.title.x = element_text(margin = margin(t = 10, r = 0, b = 0, l = 0)),
  # plot.title = element_text(size=32, face="bold", hjust = -0.14), #can provide "A","B", by ggtitle, but used plot_grid wch can shift more left
  # subtitle: larger hjust shifts right; margin space is added below
  plot.subtitle = element_text(
    size = 14,
    face = "plain",
    hjust = 0.5,
    margin = margin(t = 0, r = 0, b = 13, l = 0)
  ),
  legend.position = "none",
  legend.margin = margin(t = 0, r = 0, b = 0, l = 0, unit = "pt"),
  legend.box.margin = margin(t = 0, r = 0, b = 0, l = 0, unit = "pt"),
  legend.key.height = unit(0.3, "cm"),
  legend.key.width = unit(0.4, "cm"),
  legend.spacing.y = unit(0.3, "cm"),
  legend.spacing.x = unit(0.25, "cm"),
  legend.title = element_text(size = 11),
  legend.text = element_text(size = 10),
  legend.title.align = 0.7
)
		
		

# Return n colours whose hues are evenly spaced starting at 15 degrees, with
# luminance 65 and chroma 100.
#
# Args:
#   n: number of colours wanted (non-negative integer).
# Returns:
#   Character vector of n colour codes; zero-length when n is 0.
gg_color_hue <- function(n) {
  # n + 1 points over 15..375 so that the wrapped-around last hue is dropped.
  hues <- seq(15, 375, length.out = n + 1)
  # seq_len() rather than 1:n, which would select one colour when n is 0.
  hcl(h = hues, l = 65, c = 100)[seq_len(n)]
}


# Line size passed to the geom_hline() reference lines in the P value plots.
size_hline <- 0.2


# balloon_scale <- 5 # inflation factor for significant points	
# size_point <- 0.1*(1 + balloon_scale*(logP$log10p_g_avg/max(logP$log10p_g_avg))) # scale significant points


# ------------------------ geneRIF PMIDs P vals (1) --------------------------------

# use read.delim cf https://kbroman.org/blog/2017/08/08/eof-within-quoted-string/
# quote mark in a gene description entry most likely introduced during production of gencode_ensembl via biomaRt
# Load the two tab-delimited input tables (header row, strings kept as
# character, column names left exactly as in the files).
g_unique <- read.delim(
  "growth_loci_unique.txt",
  header = TRUE,
  sep = "\t",
  stringsAsFactors = FALSE,
  check.names = FALSE
)
gencode_gtf_ensembl_ucsc <- read.delim(
  "gencode_gtf_ensembl_ucsc_v31.txt",
  header = TRUE,
  sep = "\t",
  stringsAsFactors = FALSE,
  check.names = FALSE
)


# # alternative approach: https://kbroman.org/blog/2017/08/08/eof-within-quoted-string/
# g_unique <- read.table("growth_loci_unique.txt", header=TRUE, sep="\t", stringsAsFactors=FALSE, check.names=FALSE, quote="", fill=FALSE)



# Add rif cols

# Go to https://www.ncbi.nlm.nih.gov/gene/about-generif and click on FTP hyperlink. Download generifs_basic. Biggish file, ~238.2 Mb. If save as generifs_basic.gz, size is 73 Mb.
# Downloaded 09/27/2019
# Read the geneRIF table, keep the human rows and count geneRIF rows per
# Entrez gene id.
# NOTE(review): read.delim() keeps quote = "\"" active; if the file holds
# unbalanced double quotes, rows may be merged on reading - confirm the row
# count against the raw file (quote = "" would disable quoting).
gene_rif <- read.delim(
  "generifs_basic",
  header = TRUE,
  sep = "\t",
  stringsAsFactors = FALSE
)
gene_rif <- gene_rif[gene_rif$X.Tax.ID == 9606, ] # select human genes using 9606 taxa ID
gene_rif <- gene_rif[, c("Gene.ID", "PubMed.ID..PMID..list")]
colnames(gene_rif) <- c("entrezgene_id", "PMID")

# One count of 1 per row, summed within each gene. The column is spelled out in
# full: `gene_rif$entrezgene` only resolved through `$` partial matching.
gene_rif_summary <- aggregate(
  rep(1, nrow(gene_rif)),
  by = list(gene_rif$entrezgene_id),
  FUN = sum
)
colnames(gene_rif_summary) <- c("entrezgene_id", "PMID_entries")


dim(gene_rif_summary)
# [1] 9633    2


# # DO NOT DELETE
# # go to START HERE to save time

# gene_rif_human_ncbi <- unique(gene_rif$entrezgene)


# # Convert NCBI gene_id (entrezgene) to ensembl

# library(biomaRt)
# ensembl <- useDataset("hsapiens_gene_ensembl", mart=useMart("ensembl"))

# # download from ensembl on 09/27/2019
# gene_rif_human_ensembl <- getBM(
							# attributes=c(
								# 'ensembl_gene_id',
								# 'entrezgene_id'
								# ), 
					      # filters = 'entrezgene_id', 
					      # values = gene_rif_human_ncbi, 
					      # mart = ensembl
					      # )

# head(gene_rif_human_ensembl)
  # # ensembl_gene_id entrezgene_id
# # 1 ENSG00000198610          1109
# # 2 ENSG00000149554          1111
# # 3 ENSG00000053254          1112
# # 4 ENSG00000276781          1113
# # 5 ENSG00000100604          1113
# # 6 ENSG00000089199          1114

# # save entrez gene id to ensembl gene id translation table for archival purposes:
# write.table(gene_rif_human_ensembl,"ensembl_entrezgene.txt",quote=FALSE,sep="\t",row.names=FALSE,col.names=TRUE)

# if desired to save time, START HERE

# read 'frozen' translation archive:
# Entrez gene id <-> Ensembl gene id translation table saved earlier.
gene_rif_human_ensembl <- read.table(
  "ensembl_entrezgene.txt",
  header = TRUE,
  sep = "\t",
  stringsAsFactors = FALSE
)

# biomart has increased data frame size and introduced redundancy. That is, multiple ensembl_gene_id for one entrezgene
dim(gene_rif_human_ensembl)
# [1] 10603     2

# But it is OK because will later merge with gencode_gtf_ensembl_ucsc using unique ensembl_gene_id to give accurate results

# Attach the per-gene geneRIF counts; every translation row is kept.
gene_rif_summary <- merge(
  x = gene_rif_human_ensembl,
  y = gene_rif_summary,
  all.x = TRUE
)


dim(gene_rif_summary)
# [1] 10603     3



head(gene_rif_summary)
  # entrezgene_id ensembl_gene_id PMID_entries
# 1           355 ENSG00000026103           51
# 2           356 ENSG00000117560          593
# 3           357 ENSG00000146950            4
# 4           358 ENSG00000240583          201
# 5           359 ENSG00000167580          108
# 6           360 ENSG00000165272          133



# gene_rif_summary_n tables all based on gencode_gtf_ensembl_ucsc, which lacks CEN. 
# Hence, lack of CEN entrez ids in geneRifs does not interfere in any analyses.

# Attach geneRIF counts to every annotated gene; genes without a count get 0.
gene_rif_summary_2 <- merge(
  gencode_gtf_ensembl_ucsc,
  gene_rif_summary,
  by.x = "gene_id",
  by.y = "ensembl_gene_id",
  all.x = TRUE
)
gene_rif_summary_2$PMID_entries[is.na(gene_rif_summary_2$PMID_entries)] <- 0

dim(gencode_gtf_ensembl_ucsc)
# [1] 60603    19


dim(gene_rif_summary_2)
# [1] 60607    21

# Therefore, four duplicated rows. Cure this by keeping, for each gene_id, the highest PMID_entries
gene_rif_summary_2 <- merge(
  aggregate(PMID_entries ~ gene_id, data = gene_rif_summary_2, FUN = max),
  gencode_gtf_ensembl_ucsc,
  all = TRUE
)


dim(gene_rif_summary_2)
# [1] 60603    20

# RH = 1 for genes listed in g_unique, 0 otherwise.
gene_rif_summary_2$RH <- as.numeric(
  gene_rif_summary_2$gene_id %in% g_unique$ensembl_gene_id
)


# compare PMID_entries RH+, RH-, nc and cr

# All gene types: PMID_entries of RH+ (a) versus RH- (b).
compare(
  gene_rif_summary_2$PMID_entries[gene_rif_summary_2$RH == 1],
  gene_rif_summary_2$PMID_entries[gene_rif_summary_2$RH == 0]
)

	# Welch Two Sample t-test

# data:  a and b
# t = 2.5532, df = 940.47, p-value = 0.01083
# alternative hypothesis: true difference in means is not equal to 0
# 95 percent confidence interval:
 # 0.7873408 6.0192297
# sample estimates:
# mean of x mean of y 
 # 9.612865  6.209580 

# [1] "exact P value = 0.010832057766015"
# [1] "mean of a = 9.61286549707602"
# [1] "sem of a = 1.30120335577585"
# [1] "sd of a = 38.0476845285676"
# [1] "number in a = 855"
# [1] "mean of b = 6.20958023699538"
# [1] "sem of b = 0.289283970556315"
# [1] "sd of b = 70.7108496831509"
# [1] "number in b = 59748"



# compare PMID_entries RH+, RH- for protein coding genes

# Protein-coding genes only: PMID_entries of RH+ (a) versus RH- (b).
compare(
  gene_rif_summary_2$PMID_entries[
    gene_rif_summary_2$gene_type == "protein_coding" & gene_rif_summary_2$RH == 1
  ],
  gene_rif_summary_2$PMID_entries[
    gene_rif_summary_2$gene_type == "protein_coding" & gene_rif_summary_2$RH == 0
  ]
)

	# Welch Two Sample t-test

# data:  a and b
# t = -0.13892, df = 562.11, p-value = 0.8896
# alternative hypothesis: true difference in means is not equal to 0
# 95 percent confidence interval:
 # -5.457627  4.736649
# sample estimates:
# mean of x mean of y 
 # 18.59502  18.95551 

# [1] "exact P value = 0.889567027193616"
# [1] "mean of a = 18.5950226244344"
# [1] "sem of a = 2.44206057855476"
# [1] "sd of a = 51.3413835248368"
# [1] "number in a = 442"
# [1] "mean of b = 18.9555111861977"
# [1] "sem of b = 0.877789435302519"
# [1] "sd of b = 122.680298662295"
# [1] "number in b = 19533"


# compare PMID_entries RH+, RH- for non-protein coding genes

# Non-protein-coding genes only: PMID_entries of RH+ (a) versus RH- (b).
compare(
  gene_rif_summary_2$PMID_entries[
    gene_rif_summary_2$gene_type != "protein_coding" & gene_rif_summary_2$RH == 1
  ],
  gene_rif_summary_2$PMID_entries[
    gene_rif_summary_2$gene_type != "protein_coding" & gene_rif_summary_2$RH == 0
  ]
)

	# Welch Two Sample t-test

# data:  a and b
# t = -3.1686, df = 40214, p-value = 0.001533
# alternative hypothesis: true difference in means is not equal to 0
# 95 percent confidence interval:
 # -0.030266398 -0.007132582
# sample estimates:
 # mean of x  mean of y 
# 0.00000000 0.01869949 

# [1] "exact P value = 0.00153265712367836"
# [1] "mean of a = 0"
# [1] "sem of a = 0"
# [1] "sd of a = 0"
# [1] "number in a = 413"
# [1] "mean of b = 0.0186994902399602"
# [1] "sem of b = 0.00590141453102975"
# [1] "sd of b = 1.18345066554436"
# [1] "number in b = 40215"



# nc geneRIFs too rare to get meaningful results, restrict analyses to cr genes.
# Zero PMID observations for nc (see above and below). Stick to cr.

# nc genes with at least one geneRIF. There are no RH+ genes in this subset, so
# t.test() raises "not enough 'x' observations". The error is caught and
# reported as a message so that a sourced run continues past this check.
invisible(tryCatch(
  compare(
    gene_rif_summary_2[gene_rif_summary_2$gene_type != "protein_coding" & gene_rif_summary_2$PMID_entries > 0 & gene_rif_summary_2$RH == 1, "PMID_entries"],
    gene_rif_summary_2[gene_rif_summary_2$gene_type != "protein_coding" & gene_rif_summary_2$PMID_entries > 0 & gene_rif_summary_2$RH == 0, "PMID_entries"]
  ),
  error = function(e) message("compare() skipped: ", conditionMessage(e))
))

# compare() skipped: not enough 'x' observations

length(gene_rif_summary_2[gene_rif_summary_2$gene_type != "protein_coding" & gene_rif_summary_2$PMID_entries > 0 & gene_rif_summary_2$RH == 1, "PMID_entries"])
# [1] 0


length(gene_rif_summary_2[gene_rif_summary_2$gene_type != "protein_coding" & gene_rif_summary_2$PMID_entries > 0 & gene_rif_summary_2$RH == 0, "PMID_entries"])
# [1] 37


# Same comparison with the threshold relaxed to >= 0 entries.
compare(
  gene_rif_summary_2[gene_rif_summary_2$gene_type != "protein_coding" & gene_rif_summary_2$PMID_entries >= 0 & gene_rif_summary_2$RH == 1, "PMID_entries"],
  gene_rif_summary_2[gene_rif_summary_2$gene_type != "protein_coding" & gene_rif_summary_2$PMID_entries >= 0 & gene_rif_summary_2$RH == 0, "PMID_entries"]
)

	# Welch Two Sample t-test

# data:  a and b
# t = -3.1686, df = 40214, p-value = 0.001533
# alternative hypothesis: true difference in means is not equal to 0
# 95 percent confidence interval:
 # -0.030266398 -0.007132582
# sample estimates:
 # mean of x  mean of y 
# 0.00000000 0.01869949 

# [1] "exact P value = 0.00153265712367836"
# [1] "mean of a = 0"
# [1] "sem of a = 0"
# [1] "sd of a = 0"
# [1] "number in a = 413"
# [1] "mean of b = 0.0186994902399602"
# [1] "sem of b = 0.00590141453102975"
# [1] "sd of b = 1.18345066554436"
# [1] "number in b = 40215"




# One row per publication threshold: t-test of PMID_entries between RH+ and RH-
# protein-coding genes that have at least thresh[i] entries, together with the
# mean and SEM of each group.
rif_ans <- data.frame(
  PMID_thresh = numeric(),
  t = numeric(),
  df = numeric(),
  mean_RH_pos_PMID = numeric(),
  sem_RH_pos_PMID = numeric(),
  mean_RH_neg_PMID = numeric(),
  sem_RH_neg_PMID = numeric(),
  P = numeric()
)

thresh <- seq(0, max(gene_rif_summary_2$PMID_entries), 4)

# Threshold-independent part of the row filter, computed once.
rif_is_cr <- gene_rif_summary_2$gene_type == "protein_coding"

for (i in seq_along(thresh)) {
  print(i)
  rif_keep <- rif_is_cr & gene_rif_summary_2$PMID_entries >= thresh[i]
  rif_pos <- gene_rif_summary_2[rif_keep & gene_rif_summary_2$RH == 1, "PMID_entries"]
  rif_neg <- gene_rif_summary_2[rif_keep & gene_rif_summary_2$RH == 0, "PMID_entries"]

  # t.test() raises an error once a group becomes too small. Stop cleanly at
  # that threshold; the rows written for all earlier thresholds are kept.
  rif_tt <- tryCatch(t.test(rif_pos, rif_neg), error = function(e) e)
  if (inherits(rif_tt, "error")) {
    message("Stopping at threshold ", thresh[i], ": ", conditionMessage(rif_tt))
    break
  }
  ans <- rif_tt

  rif_ans[i, "PMID_thresh"] <- thresh[i]
  rif_ans[i, "t"] <- ans$statistic
  rif_ans[i, "df"] <- ans$parameter
  rif_ans[i, "mean_RH_pos_PMID"] <- mean(rif_pos)
  rif_ans[i, "sem_RH_pos_PMID"] <- sem(rif_pos)
  rif_ans[i, "mean_RH_neg_PMID"] <- mean(rif_neg)
  rif_ans[i, "sem_RH_neg_PMID"] <- sem(rif_neg)
  rif_ans[i, "P"] <- ans$p.value
}

# The loop stops at the first threshold where t.test() cannot be computed; the table holds every threshold before that.




# Row with the smallest raw P value.
rif_ans[which.min(rif_ans[["P"]]), ]
   # PMID_thresh         t       df mean_RH_pos_PMID sem_RH_pos_PMID mean_RH_neg_PMID sem_RH_neg_PMID            P
# 92         364 -6.758621 163.5647           397.75        14.68772         896.1867        72.27089 2.335071e-10     <<<<<<<<< use in paper

# Smallest Benjamini-Hochberg adjusted P value.
min(p.adjust(rif_ans[["P"]], method = "BH"))
# [1] 2.321952e-09 <<<<<<<<< use in paper





# # If wish instead to plot cumulative freq diffs (ks.test style) use following and plot cum freqs vs PMID_thresh:

# rif_ans <- data.frame(
					# PMID_thresh = numeric(),
					# cum_RH_pos_PMID = numeric(),
					# cum_RH_neg_PMID = numeric(),
					# cum_diff = numeric()
					# )

# thresh <- seq(0,max(gene_rif_summary_2$PMID_entries),1)

# for (i in c(1:length(thresh))) {
		# print(i)
		# rif_ans[i,"PMID_thresh"] <- thresh[i]
		# rif_ans[i,"cum_RH_pos_PMID"] <- 1-length(gene_rif_summary_2[gene_rif_summary_2$PMID_entries >= thresh[i] & gene_rif_summary_2$gene_type == "protein_coding" & gene_rif_summary_2$RH==1,"PMID_entries"])/length(gene_rif_summary_2[gene_rif_summary_2$PMID_entries >= 0 & gene_rif_summary_2$gene_type == "protein_coding" & gene_rif_summary_2$RH==1,"PMID_entries"])
		# rif_ans[i,"cum_RH_neg_PMID"] <- 1-length(gene_rif_summary_2[gene_rif_summary_2$PMID_entries >= thresh[i] & gene_rif_summary_2$gene_type == "protein_coding" & gene_rif_summary_2$RH==0,"PMID_entries"])/length(gene_rif_summary_2[gene_rif_summary_2$PMID_entries >= 0 & gene_rif_summary_2$gene_type == "protein_coding" & gene_rif_summary_2$RH==0,"PMID_entries"])
		# rif_ans[i,"cum_diff"] <- rif_ans[i,"cum_RH_neg_PMID"] - rif_ans[i,"cum_RH_pos_PMID"]
# }




# Benjamini-Hochberg adjusted P values.
rif_ans$q <- p.adjust(rif_ans$P, method = "BH")
# Largest raw P value whose q is below 0.05. NA when no threshold passes, which
# avoids max() of an empty selection (-Inf plus a warning).
h_line_1 <- if (any(rif_ans$q < 0.05, na.rm = TRUE)) {
  max(rif_ans$P[which(rif_ans$q < 0.05)])
} else {
  NA_real_
}


# -log10(P) against publication threshold, with reference lines at the
# FDR = 0.05 cut-off (h_line_1) and at P = 0.05.
p1 <- ggplot() +
  theme2 +
  theme(legend.key = element_blank()) +
  geom_line(
    data = rif_ans,
    # Refer to the column of the layer data rather than rif_ans$P, so the
    # mapping always follows `data`.
    mapping = aes(
      x = PMID_thresh,
      y = -log10(P)
    ),
    lwd = 0.2,
    colour = "black",
    show.legend = FALSE
  ) +
  geom_hline(
    mapping = aes(
      yintercept = -log10(h_line_1),
      linetype = "FDR = 0.05"
    ),
    color = "red",
    size = size_hline
  ) +
  geom_hline(
    mapping = aes(
      yintercept = -log10(0.05),
      linetype = "P = 0.05"
    ),
    color = "blue",
    size = size_hline
  ) +
  # The linetype legend doubles as the key for the two reference lines; the
  # legend keys are recoloured to match the lines.
  scale_linetype_manual(
    name = NULL,
    values = c("solid", "dashed"),
    guide = guide_legend(
      override.aes = list(
        color = c("red", "blue")
      )
    )
  ) +
  theme(
    legend.position = c(0.25, 0.9)
  ) +
  labs(subtitle = "GeneRIF cr") +
  xlab("Publication threshold") +
  # scale_x_continuous(breaks = c(0,1,2,3,4,6), labels = c(0,1,2,3,4,6)) +
  ylab(expression(-log[10]*italic('P')))
print(p1)






# # ------------------------ geneRIF PMIDs (2) --------------------------------
# uses older geneRIF version, not up to date with 09/27/19 download

# # Only 4 RH+ genes at peak significance, so better to graph all pubs for RH+ and RH- genes at different thresholds


# which.min(rif_ans$P)
# # [1] 89 


# # rif_ans PMID threshold that gives most significant p val in first peak
# rif_ans[which.min(rif_ans$P),"PMID_thresh"]
# # [1] 352




# rif_peak <- data.frame(
					# RH = numeric(),
					# PMID = numeric(),
					# PMID_thresh = numeric()
					# )

# thresh <- seq(0,max(gene_rif_summary_2$PMID_entries),4)

# for (i in c(which.min(rif_ans$P))) {
		# print(i)
		# a <- gene_rif_summary_2[gene_rif_summary_2$PMID_entries >= thresh[i] & gene_rif_summary_2$gene_type == "protein_coding" & gene_rif_summary_2$RH==1,"PMID_entries"]
		# b <- gene_rif_summary_2[gene_rif_summary_2$PMID_entries >= thresh[i] & gene_rif_summary_2$gene_type == "protein_coding" & gene_rif_summary_2$RH==0,"PMID_entries"]
		# RH <- c(rep(1,length(a)),rep(0,length(b)))
		# PMID <- c(a,b)
		# PMID_thresh <- thresh[i]
	
	# # use of rif_peak_temp not strictly necessary because only one peak, 
	# # could simply use:
	# # rif_peak <- cbind(RH,PMID, PMID_thresh)
	
	# # but kept rif_peak_temp for generality in case > 1 peak:
	# rif_peak_temp <- cbind(RH,PMID, PMID_thresh)
	# rif_peak <- rbind(rif_peak, rif_peak_temp)	
	
# }



# ------------------------ geneRIF PMIDs no. pubs (2) --------------------------------

# calculate points
# too many points, clutter up graph irredeemably

# rif_points <- data.frame(
					# PMID_thresh = numeric(),
					# gene_id = numeric(),
					# PMID_entries = numeric(),
					# RH = numeric()
					# )


# # run loop for same number of rows as rif_ans, beyond which t-test fails.

# for (i in c(1:nrow(rif_ans))) {
		# print(i)
		# rif_points_temp <- cbind(PMID_thresh = thresh[i],gene_rif_summary_2[gene_rif_summary_2$PMID_entries >= thresh[i],c("gene_id","PMID_entries","RH")])
		# rif_points <- rbind(rif_points, rif_points_temp)
# }



# Long format: one row per threshold and RH group, holding the group mean
# (No.pubs) and its SEM (No.pubs.sem). Done as two reshapes followed by a
# filter that keeps only rows where mean and SEM belong to the same group.
# Row names are sized from the data instead of a fixed 1:1000, which would
# fail once a reshaped table exceeds 1000 rows.
rif_ans_2 <- reshape(rif_ans,
  varying = c("mean_RH_pos_PMID", "mean_RH_neg_PMID"),
  v.names = "No.pubs",
  timevar = "RH",
  times = c(1, 0),
  new.row.names = seq_len(2 * nrow(rif_ans)),
  direction = "long")

rif_ans_3 <- reshape(rif_ans_2,
  varying = c("sem_RH_pos_PMID", "sem_RH_neg_PMID"),
  v.names = "No.pubs.sem",
  timevar = "RH.sem",
  times = c(1, 0),
  new.row.names = seq_len(2 * nrow(rif_ans_2)),
  direction = "long")

# remove incorrect combos
rif_ans_4 <- rif_ans_3[!(rif_ans_3$RH != rif_ans_3$RH.sem), ]

# One colour per RH level, named by level ("0", "1").
n <- length(unique(rif_ans_4$RH))
colores_2 <- gg_color_hue(n)
names(colores_2) <- levels(factor(rif_ans_4$RH))


# Legend labels in level order: 0 = RH-, 1 = RH+.
labels_2 <- c("RH-", "RH+")




# Mean number of publications (+/- SEM ribbon) of RH+ and RH- genes at each
# publication threshold.
p2 <- ggplot() +
  theme2 +
  theme(legend.key = element_blank()) +
  geom_line(
    data = rif_ans_4,
    mapping = aes(
      x = PMID_thresh,
      y = No.pubs,
      colour = as.factor(RH)
    ),
    lwd = 0.2,
    show.legend = TRUE
  ) +
  geom_ribbon(
    data = rif_ans_4,
    mapping = aes(
      x = PMID_thresh,
      ymin = No.pubs - No.pubs.sem,
      ymax = No.pubs + No.pubs.sem,
      group = as.factor(RH)
    ),
    lwd = 0.2,
    fill = "grey50",
    alpha = 0.3,
    show.legend = FALSE
  ) +
  # geom_point(
    # shape=1,
    # stroke=0.2,
    # data= rif_points,
    # aes(
      # x= PMID_thresh,
      # y= PMID_entries,
      # colour=as.factor(RH)),
      # size=1.0
      # ) +
  # scale_linetype_manual(
    # name = NULL,
    # values = c(1),
    # guide = guide_legend(
          # override.aes = list(
                    # color = c("red"),
                    # size=size_hline
                    # )
            # )
    # ) +
  scale_colour_manual(
    values = colores_2,
    labels = labels_2,
    name = NULL
  ) +
  guides(
    # "none" replaces the deprecated `shape = FALSE` spelling.
    shape = "none",
    fill = guide_legend(
      override.aes = list(
        #fill=NA,
        shape = 1,
        size = NA
      ),
      ncol = 1,
      byrow = TRUE
    )
  ) +
  theme(
    legend.position = c(0.15, 0.9)
  ) +
  # scale_x_discrete(
    # # labels = labels_12,
    # expand = expand_scale(add = .6)
    # ) +
  # scale_y_continuous(
    # expand = expand_scale(mult = .05),
    # trans = log10_trans(),
    # # labels=function(x) {x*1e-3},
    # breaks = trans_breaks("log10", function(x) 10^x, n=3),
    # labels = trans_format("log10", math_format(10^.x))
    # ) +
  # coord_cartesian(ylim = c(1,10^4.5)) +
  xlab("Publication threshold") +
  ylab(expression(Publications >= threshold)) +
  labs(subtitle = "GeneRIF cr")
print(p2)



# ------------ reactome cr logP vals (3) ----------------------


# use read.delim cf https://kbroman.org/blog/2017/08/08/eof-within-quoted-string/
# quote mark in a gene description entry most likely introduced during production of gencode_ensembl via biomaRt
# Reload the same two input tables so this section can be run on its own.
g_unique <- read.delim(
  "growth_loci_unique.txt",
  header = TRUE,
  sep = "\t",
  stringsAsFactors = FALSE,
  check.names = FALSE
)
gencode_gtf_ensembl_ucsc <- read.delim(
  "gencode_gtf_ensembl_ucsc_v31.txt",
  header = TRUE,
  sep = "\t",
  stringsAsFactors = FALSE,
  check.names = FALSE
)


# # alternative approach: https://kbroman.org/blog/2017/08/08/eof-within-quoted-string/
# g_unique <- read.table("growth_loci_unique.txt", header=TRUE, sep="\t", stringsAsFactors=FALSE, check.names=FALSE, quote="", fill=FALSE)




# Add reactome cols
# reactome is like open access version of Incyte db using literature curation of pathways, while string is supposed to be actual PPIs.
# go to https://reactome.org/download-data and click on hyperlink "Human protein-protein interactions" under heading "TAB-delimited format interaction files". Result is "reactome.homo_sapiens.interactions.tab-delimited.txt"
# Downloaded most up-to-date reactome v70 on Sept 27, 2019
# removed hashtag in front of header so can read into R

# Count, for every interactor id, how often it appears as interactor 1 (edge1)
# and as interactor 2 (edge2); reac_edges is the sum of the two counts.
reac <- read.table(
  "reactome.homo_sapiens.interactions.tab-delimited.txt",
  header = TRUE,
  sep = "\t",
  stringsAsFactors = FALSE
)
reac <- reac[, c("Interactor.1.Ensembl.gene.id", "Interactor.2.Ensembl.gene.id")]
colnames(reac) <- c("Gene1", "Gene2")

reac_1 <- aggregate(rep(1, nrow(reac)), by = list(reac$Gene1), FUN = sum)
colnames(reac_1) <- c("ensembl_gene_id", "edge1")
reac_2 <- aggregate(rep(1, nrow(reac)), by = list(reac$Gene2), FUN = sum)
colnames(reac_2) <- c("ensembl_gene_id", "edge2")

# Full outer join; an id seen on one side only gets a zero count on the other.
reac_3 <- merge(reac_1, reac_2, all = TRUE)
reac_3[is.na(reac_3)] <- 0
reac_3$reac_edges <- with(reac_3, edge1 + edge2)
# Strip everything up to and including the last colon from the id.
# NOTE(review): this runs after the counts are aggregated, so ids that differ
# only before the colon would stay as separate rows - confirm none exist.
reac_3$ensembl_gene_id <- gsub(".*:", "", reac_3$ensembl_gene_id)


dim(reac_3)
# [1] 6032    4

# Attach the Reactome edge counts to every annotated gene; genes without any
# Reactome row get zero in all three count columns.
reac_4 <- merge(
  gencode_gtf_ensembl_ucsc,
  reac_3,
  by.x = "gene_id",
  by.y = "ensembl_gene_id",
  all.x = TRUE
)
reac_4$edge1[is.na(reac_4$edge1)] <- 0
reac_4$edge2[is.na(reac_4$edge2)] <- 0
reac_4$reac_edges[is.na(reac_4$reac_edges)] <- 0

dim(gencode_gtf_ensembl_ucsc)
# [1] 60603    19


dim(reac_4)
# [1] 60603    22

# RH = 1 for genes listed in g_unique, 0 otherwise.
reac_4$RH <- as.numeric(reac_4$gene_id %in% g_unique$ensembl_gene_id)


# compare Reactome entries, RH+, RH-, nc and cr

# All gene types: reac_edges of RH+ (a) versus RH- (b).
compare(
  reac_4$reac_edges[reac_4$RH == 1],
  reac_4$reac_edges[reac_4$RH == 0]
)

	# Welch Two Sample t-test

# data:  a and b
# t = 1.6457, df = 891.72, p-value = 0.1002
# alternative hypothesis: true difference in means is not equal to 0
# 95 percent confidence interval:
 # -0.1836793  2.0909129
# sample estimates:
# mean of x mean of y 
 # 3.297076  2.343459 

# [1] "exact P value = 0.100187583375544"
# [1] "mean of a = 3.29707602339181"
# [1] "sem of a = 0.573246795531967"
# [1] "sd of a = 16.7619558746125"
# [1] "number in a = 855"
# [1] "mean of b = 2.34345919528687"
# [1] "sem of b = 0.0847394313712557"
# [1] "sd of b = 20.7132015728541"
# [1] "number in b = 59748"



# compare Reactome entries, RH+, RH- for protein coding genes

# Protein-coding genes only: reac_edges of RH+ (a) versus RH- (b).
compare(
  reac_4$reac_edges[reac_4$gene_type == "protein_coding" & reac_4$RH == 1],
  reac_4$reac_edges[reac_4$gene_type == "protein_coding" & reac_4$RH == 0]
)

	# Welch Two Sample t-test

# data:  a and b
# t = -0.67178, df = 490.73, p-value = 0.502
# alternative hypothesis: true difference in means is not equal to 0
# 95 percent confidence interval:
 # -2.949645  1.446559
# sample estimates:
# mean of x mean of y 
 # 6.377828  7.129371 

# [1] "exact P value = 0.502040602462784"
# [1] "mean of a = 6.37782805429864"
# [1] "sem of a = 1.08922565506498"
# [1] "sd of a = 22.8996580153955"
# [1] "number in a = 442"
# [1] "mean of b = 7.12937080837557"
# [1] "sem of b = 0.255257118806494"
# [1] "sd of b = 35.6748649635608"
# [1] "number in b = 19533"





# compare Reactome entries, RH+, RH- for non-protein coding genes

# Non-protein-coding genes only: reac_edges of RH+ (a) versus RH- (b).
compare(
  reac_4$reac_edges[reac_4$gene_type != "protein_coding" & reac_4$RH == 1],
  reac_4$reac_edges[reac_4$gene_type != "protein_coding" & reac_4$RH == 0]
)


	# Welch Two Sample t-test

# data:  a and b
# t = -2.2817, df = 40214, p-value = 0.02251
# alternative hypothesis: true difference in means is not equal to 0
# 95 percent confidence interval:
 # -0.035085968 -0.002661141
# sample estimates:
 # mean of x  mean of y 
# 0.00000000 0.01887355 

# [1] "exact P value = 0.022509574150891"
# [1] "mean of a = 0"
# [1] "sem of a = 0"
# [1] "sd of a = 0"
# [1] "number in a = 413"
# [1] "mean of b = 0.0188735546437896"
# [1] "sem of b = 0.00827154259369944"
# [1] "sd of b = 1.65874851463518"
# [1] "number in b = 40215"

# Statistically significant (P = 0.02251), but all 413 RH+ values are zero and the mean of the 40215 RH- values is only 0.019. Worse, there are insufficient observations for nc genes when the threshold is > 0 entries (see below). Stick to cr.

# nc genes with at least one Reactome edge. There are no RH+ genes in this
# subset, so t.test() raises "not enough 'x' observations". The error is caught
# and reported as a message so that a sourced run continues past this check.
invisible(tryCatch(
  compare(
    reac_4[reac_4$gene_type != "protein_coding" & reac_4$reac_edges > 0 & reac_4$RH == 1, "reac_edges"],
    reac_4[reac_4$gene_type != "protein_coding" & reac_4$reac_edges > 0 & reac_4$RH == 0, "reac_edges"]
  ),
  error = function(e) message("compare() skipped: ", conditionMessage(e))
))
# compare() skipped: not enough 'x' observations


length(reac_4[reac_4$gene_type != "protein_coding" & reac_4$reac_edges > 0 & reac_4$RH == 0, "reac_edges"])
# [1] 14


length(reac_4[reac_4$gene_type != "protein_coding" & reac_4$reac_edges > 0 & reac_4$RH == 1, "reac_edges"])
# [1] 0











# One row per entry threshold: t-test of reac_edges between RH+ and RH-
# protein-coding genes that have at least thresh_cr_reac[i] edges, together
# with the mean and SEM of each group.
reac_cr_ans <- data.frame(
  reac_edges_thresh_cr = numeric(),
  t = numeric(),
  df = numeric(),
  mean_RH_pos_reac_edges = numeric(),
  sem_RH_pos_reac_edges = numeric(),
  mean_RH_neg_reac_edges = numeric(),
  sem_RH_neg_reac_edges = numeric(),
  P = numeric()
)

thresh_cr_reac <- seq(0, max(reac_4$reac_edges), 2)

# Threshold-independent part of the row filter, computed once.
reac_is_cr <- reac_4$gene_type == "protein_coding"

for (i in seq_along(thresh_cr_reac)) {
  print(i)
  reac_keep <- reac_is_cr & reac_4$reac_edges >= thresh_cr_reac[i]
  reac_pos <- reac_4[reac_keep & reac_4$RH == 1, "reac_edges"]
  reac_neg <- reac_4[reac_keep & reac_4$RH == 0, "reac_edges"]

  # t.test() raises an error once a group becomes too small. Stop cleanly at
  # that threshold; the rows written for all earlier thresholds are kept.
  reac_tt <- tryCatch(t.test(reac_pos, reac_neg), error = function(e) e)
  if (inherits(reac_tt, "error")) {
    message("Stopping at threshold ", thresh_cr_reac[i], ": ", conditionMessage(reac_tt))
    break
  }
  ans <- reac_tt

  reac_cr_ans[i, "reac_edges_thresh_cr"] <- thresh_cr_reac[i]
  reac_cr_ans[i, "t"] <- ans$statistic
  reac_cr_ans[i, "df"] <- ans$parameter
  reac_cr_ans[i, "mean_RH_pos_reac_edges"] <- mean(reac_pos)
  reac_cr_ans[i, "sem_RH_pos_reac_edges"] <- sem(reac_pos)
  reac_cr_ans[i, "mean_RH_neg_reac_edges"] <- mean(reac_neg)
  reac_cr_ans[i, "sem_RH_neg_reac_edges"] <- sem(reac_neg)
  reac_cr_ans[i, "P"] <- ans$p.value
}

# The loop stops at the first threshold where t.test() cannot be computed; the table holds every threshold before that.




# Row with the smallest raw P value.
reac_cr_ans[which.min(reac_cr_ans[["P"]]), ]
    # reac_edges_thresh_cr         t       df mean_RH_pos_reac_edges sem_RH_pos_reac_edges mean_RH_neg_reac_edges sem_RH_neg_reac_edges           P
# 113                  224 -5.367847 4.312459                  248.5                  19.5               399.8191              20.35734 0.004685789 <<<<<<<< use in paper


# Smallest Benjamini-Hochberg adjusted P value.
min(p.adjust(reac_cr_ans[["P"]], method = "BH"))
# [1] 0.08762805 <<<<<<<< use in paper






# Benjamini-Hochberg adjusted P values.
reac_cr_ans$q <- p.adjust(reac_cr_ans$P, method = "BH")
# Largest raw P value whose q is below 0.05. NA when no threshold passes, which
# avoids max() of an empty selection (-Inf plus a warning).
h_line_3 <- if (any(reac_cr_ans$q < 0.05, na.rm = TRUE)) {
  max(reac_cr_ans$P[which(reac_cr_ans$q < 0.05)])
} else {
  NA_real_
}



# -log10(P) against entry threshold, with reference lines at the FDR = 0.05
# cut-off (h_line_3) and at P = 0.05.
p3 <- ggplot() +
  theme2 +
  theme(legend.key = element_blank()) +
  geom_line(
    data = reac_cr_ans,
    # Refer to the column of the layer data rather than reac_cr_ans$P, so the
    # mapping always follows `data`.
    mapping = aes(
      x = reac_edges_thresh_cr,
      y = -log10(P)
    ),
    lwd = 0.2,
    colour = "black",
    show.legend = FALSE
  ) +
  geom_hline(
    mapping = aes(
      yintercept = -log10(h_line_3),
      linetype = "FDR = 0.05"
    ),
    color = "red",
    size = size_hline
  ) +
  geom_hline(
    mapping = aes(
      yintercept = -log10(0.05),
      linetype = "P = 0.05"
    ),
    color = "blue",
    size = size_hline
  ) +
  # The linetype legend doubles as the key for the two reference lines; the
  # legend keys are recoloured to match the lines.
  scale_linetype_manual(
    name = NULL,
    values = c("solid", "dashed"),
    guide = guide_legend(
      override.aes = list(
        color = c("red", "blue")
      )
    )
  ) +
  theme(
    legend.position = c(0.25, 0.95)
  ) +
  labs(subtitle = "Reactome cr") +
  xlab("Entry threshold") +
  # scale_x_continuous(breaks = c(0,1,2,3,4,6), labels = c(0,1,2,3,4,6)) +
  ylab(expression(-log[10]*italic('P')))
print(p3)




# ------------ reactome RH+, RH- cr diffs (4) ----------------------




# Long format: one row per threshold and RH group, holding the group mean
# (No.entries) and its SEM (No.entries.sem). Done as two reshapes followed by a
# filter that keeps only rows where mean and SEM belong to the same group.
# Row names are sized from the data instead of a fixed 1:1000, which would
# fail once a reshaped table exceeds 1000 rows.
reac_cr_ans_2 <- reshape(reac_cr_ans,
  varying = c("mean_RH_pos_reac_edges", "mean_RH_neg_reac_edges"),
  v.names = "No.entries",
  timevar = "RH",
  times = c(1, 0),
  new.row.names = seq_len(2 * nrow(reac_cr_ans)),
  direction = "long")

reac_cr_ans_3 <- reshape(reac_cr_ans_2,
  varying = c("sem_RH_pos_reac_edges", "sem_RH_neg_reac_edges"),
  v.names = "No.entries.sem",
  timevar = "RH.sem",
  times = c(1, 0),
  new.row.names = seq_len(2 * nrow(reac_cr_ans_2)),
  direction = "long")

# remove incorrect combos
reac_cr_ans_4 <- reac_cr_ans_3[!(reac_cr_ans_3$RH != reac_cr_ans_3$RH.sem), ]

# One colour per RH level, named by level ("0", "1").
n <- length(unique(reac_cr_ans_4$RH))
colores_4 <- gg_color_hue(n)
names(colores_4) <- levels(factor(reac_cr_ans_4$RH))


# Legend labels in level order: 0 = RH-, 1 = RH+.
labels_4 <- c("RH-", "RH+")




# Mean number of Reactome entries (+/- SEM ribbon) of RH+ and RH- genes at each
# entry threshold.
p4 <- ggplot() +
  theme2 +
  theme(legend.key = element_blank()) +
  geom_line(
    data = reac_cr_ans_4,
    mapping = aes(
      x = reac_edges_thresh_cr,
      y = No.entries,
      colour = as.factor(RH)
    ),
    lwd = 0.2,
    show.legend = TRUE
  ) +
  geom_ribbon(
    data = reac_cr_ans_4,
    mapping = aes(
      x = reac_edges_thresh_cr,
      ymin = No.entries - No.entries.sem,
      ymax = No.entries + No.entries.sem,
      group = as.factor(RH)
    ),
    lwd = 0.2,
    fill = "grey50",
    alpha = 0.3,
    show.legend = FALSE
  ) +
  # geom_point(
    # shape=1,
    # stroke=0.2,
    # data= reac_cr_points,
    # aes(
      # x= PMID_thresh,
      # y= PMID_entries,
      # colour=as.factor(RH)),
      # size=1.0
      # ) +
  # scale_linetype_manual(
    # name = NULL,
    # values = c(1),
    # guide = guide_legend(
          # override.aes = list(
                    # color = c("red"),
                    # size=size_hline
                    # )
            # )
    # ) +
  scale_colour_manual(
    values = colores_4,
    labels = labels_4,
    name = NULL
  ) +
  guides(
    # "none" replaces the deprecated `shape = FALSE` spelling.
    shape = "none",
    fill = guide_legend(
      override.aes = list(
        #fill=NA,
        shape = 1,
        size = NA
      ),
      ncol = 1,
      byrow = TRUE
    )
  ) +
  theme(
    legend.position = c(0.15, 0.9)
  ) +
  # scale_x_discrete(
    # # labels = labels_12,
    # expand = expand_scale(add = .6)
    # ) +
  # scale_y_continuous(
    # expand = expand_scale(mult = .05),
    # trans = log10_trans(),
    # # labels=function(x) {x*1e-3},
    # breaks = trans_breaks("log10", function(x) 10^x, n=3),
    # labels = trans_format("log10", math_format(10^.x))
    # ) +
  # coord_cartesian(ylim = c(1,10^4.5)) +
  xlab("Entry threshold") +
  ylab(expression(Entries >= threshold)) +
  labs(subtitle = "Reactome cr")
print(p4)



#------------------Make file --------------------------

# png("Hum_seq_cov_montage_1.png",width=7.5,height=10,units="in",res=300)
# plot_grid(p1, p2,p3,p4,p5,p6, labels=c("A", "B","C","D","E","F"), ncol = 2, nrow = 3, label_size = 14)
# dev.off()


# Earlier layout with p2 and p4 only (kept for reference; the four-panel layout below is the one used):
# (p_comp <- plot_grid(p2, p4, labels=c("A", "B"), ncol = 2, nrow = 1, label_size = 14))


# Four-panel montage, shown on the current device and then written to PDF.
p_comp <- plot_grid(
  p2, p1, p4, p3,
  labels = c("A", "B", "C", "D"),
  ncol = 2,
  nrow = 2,
  label_size = 16
)
print(p_comp)


# Explicit print(): a bare `p_comp` is only auto-printed at the interactive top
# level, so under source() the PDF would otherwise come out empty.
pdf("rif_reactome.pdf", width = 7.5, height = 7.2)
print(p_comp)
dev.off()




# -------- How many RH growth cr and nc genes lack both rif and Reactome? ----------------

# Combine the geneRIF and Reactome tables (joined on all columns they share).
rif_reac_1 <- merge(x = gene_rif_summary_2, y = reac_4)

dim(gene_rif_summary_2)
# [1] 60603    21

dim(reac_4)
# [1] 60603    23

dim(rif_reac_1)
# [1] 60603    24


# rif_reac_1 does not include CEN
dim(rif_reac_1[rif_reac_1$RH == 1, ])
# [1] 855  24


dim(rif_reac_1[rif_reac_1$geneSymbol == "CEN", ])
# [1]  0 24

# cr: 148 genes without geneRIF or reactome entries
dim(
  rif_reac_1[
    rif_reac_1$PMID_entries == 0 &
      rif_reac_1$reac_edges == 0 &
      rif_reac_1$gene_type == "protein_coding" &
      rif_reac_1$RH == 1,
  ]
)
# [1] 148  24 <<<<<<<<<<<<< use in paper. Too many to check by hand in PubMed

# cr: total 442 genes
dim(
  rif_reac_1[
    rif_reac_1$gene_type == "protein_coding" & rif_reac_1$RH == 1,
  ]
)
# [1] 442  24 <<<<<<<<<<<<< use in paper


# Alternatively, 442 cr genes:
dim(g_unique[g_unique$gene_type == "protein_coding", ])
# [1] 442  50 <<<<<<<<<<<<< use in paper


# save cr genes with no geneRIF or reactome entries to query against DAVID interpro (DO NOT DELETE):
# write.table(rif_reac_1[rif_reac_1$PMID_entries == 0 & rif_reac_1$reac_edges == 0 & rif_reac_1$gene_type == "protein_coding" & rif_reac_1$RH==1,"geneSymbol"],"growth_unique_no_rif_reac_geneSymbol.txt",quote=FALSE,sep="\t",row.names=FALSE,col.names=FALSE)

# actually used ensembl gene id for DAVID interpro  (DO NOT DELETE):
# write.table(rif_reac_1[rif_reac_1$PMID_entries == 0 & rif_reac_1$reac_edges == 0 & rif_reac_1$gene_type == "protein_coding" & rif_reac_1$RH==1,"gene_id"],"growth_unique_no_rif_reac_ensembl_id.txt",quote=FALSE,sep="\t",row.names=FALSE,col.names=FALSE)


# DAVID interpro results saved in growth_unique_no_rif_reac_Interpro.txt

# work out expected values

# Load the DAVID InterPro results. check.names=FALSE keeps the column names that
# contain spaces ("Pop Hits", "Pop Total", "List Total") exactly as in the file.
Interpro <- read.table("growth_unique_no_rif_reac_Interpro.txt", header = TRUE, sep = "\t", stringsAsFactors = FALSE, check.names = FALSE)

# Calculated expected: (Pop Hits/Pop Total)*(List Total).
Interpro$Expected <- (Interpro[["Pop Hits"]] / Interpro[["Pop Total"]]) * Interpro[["List Total"]]




# MLE ORs
# For consistency with rest of paper, calculate MLE odds ratio from fisher.test in R, not classic OR
#
# One 2 x 2 table is built per term (matrix() fills column-wise):
#
#                  has term              lacks term
#   in list        Count                 List Total - Count
#   not in list    Pop Hits - Count      rest of Pop Total
#
# vapply() over seq_len() replaces the former `for (i in 1:nrow(Interpro))` loop with
# its `Interpro$OR <- 0` pre-fill: `1:nrow()` counts down (1, 0) when the table has
# no rows, whereas this form simply yields an empty OR column.
Interpro$OR <- vapply(
	seq_len(nrow(Interpro)),
	function(i) {
		in_list_with_term <- Interpro[["Count"]][i]
		not_in_list_with_term <- Interpro[["Pop Hits"]][i] - in_list_with_term
		in_list_without_term <- Interpro[["List Total"]][i] - in_list_with_term
		not_in_list_without_term <- Interpro[["Pop Total"]][i] - in_list_with_term - in_list_without_term - not_in_list_with_term

		contingency <- matrix(
			c(in_list_with_term, not_in_list_with_term, in_list_without_term, not_in_list_without_term),
			2, 2
		)

		# [[1]] drops the "odds ratio" name so the column stays a plain numeric vector
		fisher.test(contingency, alternative = "greater")$estimate[[1]]
	},
	numeric(1)
)



# Sort terms by Benjamini-adjusted P value and renumber the rows
Interpro <- Interpro[order(Interpro$Benjamini),]
rownames(Interpro) <- NULL




# # If wish to do EASE OR (jack knife correction) (and EASE P val if desired) do following. However, P vals are close but do not match exactly results from DAVID, for unclear reasons.

# Interpro$OR <- 0

# for(i in c(1:nrow(Interpro))) {
	
	# Interpro$OR[i] <- fisher.test(matrix(c(Interpro$"Count"[i]-1,(Interpro$"Pop Hits"[i]-Interpro$"Count"[i]),(Interpro$"List Total"[i]-Interpro$"Count"[i]),(Interpro$"Pop Total"[i]-(Interpro$"Count"[i])-(Interpro$"List Total"[i]-Interpro$"Count"[i])-(Interpro$"Pop Hits"[i]-Interpro$"Count"[i]))),2,2),alternative="greater")$estimate
	
# }


# P val quoted in Interpro from DAVID is EASE P val from Fisher's test corrected via jack knife procedure.


# DO NOT DELETE
# NOTE: this writes to the same file that is read at the top of this section, so
# running it replaces the DAVID export with the table including Expected and OR.
# write.table(Interpro,"growth_unique_no_rif_reac_Interpro.txt",quote=FALSE,sep="\t",row.names=FALSE)

Interpro
  # Category                                           Term Count        %       PValue
# 1 INTERPRO                            IPR001202:WW domain     6 4.195804 3.296684e-05
# 2 INTERPRO                IPR000195:Rab-GTPase-TBC domain     4 2.797203 7.303309e-03
# 3 INTERPRO       IPR006634:TRAM/LAG1/CLN8 homology domain     3 2.097902 6.089401e-03
# 4 INTERPRO IPR003591:Leucine-rich repeat, typical subtype     5 3.496503 3.575426e-02
                                                                                                 # Genes List Total Pop Hits Pop Total Fold Enrichment  Bonferroni   Benjamini
# 1 ENSG00000151718, ENSG00000102385, ENSG00000163697, ENSG00000176769, ENSG00000151276, ENSG00000169933        130       53     18559       16.161684 0.009612942 0.009612942
# 2                                   ENSG00000146350, ENSG00000152061, ENSG00000108239, ENSG00000274933        130       57     18559       10.018354 0.883250175 0.511251527
# 3                                                    ENSG00000271092, ENSG00000090661, ENSG00000152078        130       17     18559       25.193213 0.832981210 0.591320676
# 4                  ENSG00000240720, ENSG00000165379, ENSG00000108061, ENSG00000130224, ENSG00000163428        130      178     18559        4.010156 0.999976719 0.930537389
          # FDR  Expected        OR
# 1  0.04388612 0.3712485 18.907158
# 2  9.29880733 0.3992672 11.002693
# 3  7.81084122 0.1190797 31.036382
# 4 38.41695556 1.2468344  4.220296



# Only WW domain enriched: odds ratio = 18.907158,  P = 3.296684e-05	BH FDR = 0.009612942, obs = 6, exp = 0.3712485 <<<<<<<<<<<<<<< use in paper



# Row masks over rif_reac_1 used by the nc counts below
growth_rh <- rif_reac_1$RH == 1                                  # RH growth genes
growth_nc <- rif_reac_1$gene_type != "protein_coding"            # nc genes
growth_unannotated <- rif_reac_1$PMID_entries == 0 & rif_reac_1$reac_edges == 0   # neither geneRIF nor reactome entries

# nc: 413 genes without geneRIF or reactome entries
dim(rif_reac_1[growth_unannotated & growth_nc & growth_rh, ])
# [1] 413  24

# nc: total 413 genes. That is, no nc genes with geneRIF or reactome entries.
dim(rif_reac_1[growth_nc & growth_rh, ])
# [1] 413  24 <<<<<<<<<<<< use in paper

# Alternatively, 413 nc non-CEN genes counted directly from g_unique:
dim(g_unique[g_unique$gene_type != "protein_coding" & g_unique$geneSymbol != "CEN", ])
# [1] 413  50 <<<<<<<<<<<< use in paper


# No nc genes with PMID or reac entry.
# !growth_unannotated selects rows where PMID_entries != 0 or reac_edges != 0.

dim(rif_reac_1[!growth_unannotated & growth_nc & growth_rh, ])
# [1]  0 24 <<<<<<<<<<<< use in paper


rif_reac_1[!growth_unannotated & growth_nc & growth_rh, ]
# [1] gene_id          Chromosome       tx_id            geneSymbol       strand           geneS            geneE            geneLength       txLength        
# [10] cdsLength        5utrS            5utrE            5utrDiff         3utrS            3utrE            3utrDiff         exonCount        gene_type       
# [19] gene_description RH               PMID_entries     edge1            edge2            reac_edges      
# <0 rows> (or 0-length row.names) <<<<<<<<<<<<<< use in paper



# -------- How many  RH paclitaxel cr and nc genes lack both rif and Reactome? ----------------


# Load the paclitaxel loci table
d_unique <- read.delim("paclitaxel_loci_unique.txt", header = TRUE, sep = "\t", stringsAsFactors = FALSE, check.names = FALSE)


dim(d_unique)
# [1] 38  50

dim(d_unique[d_unique$geneSymbol != "CEN", ])
# [1] 34 50

# rif_reac_1 does not include CEN, see "How many RH growth cr and nc genes lack both rif and Reactome?" above

# Flag (1/0) the rif_reac_1 genes that appear in the paclitaxel table
rif_reac_1$RH_pac <- as.numeric(rif_reac_1$gene_id %in% d_unique$ensembl_gene_id)

# Row masks over rif_reac_1 used by the counts below
pac_rh <- rif_reac_1$RH_pac == 1                               # paclitaxel genes
pac_cr <- rif_reac_1$gene_type == "protein_coding"             # cr genes
pac_unannotated <- rif_reac_1$PMID_entries == 0 & rif_reac_1$reac_edges == 0   # neither geneRIF nor reactome entries

# cr: 8 genes without geneRIF or reactome entries <<<<<<<<<<<<< use in paper
dim(rif_reac_1[pac_unannotated & pac_cr & pac_rh, ])
# [1]  8 25

# These cr genes without geneRIF or reactome entries are:
rif_reac_1[pac_unannotated & pac_cr & pac_rh, "geneSymbol"]
# [1] "TBC1D12"    "GALNT18"    "JAZF1"      "SEMA3D"     "NEK10"      "PDE4DIP"    "TMEM185B"   "AC020915.5"

# Checked by hand. Only TMEM185B and AC020915.5 have no pubmed entries (though TMEM185B is mentioned as ee3 in PMID: 15525354 DOI: 10.1111/j.1471-4159.2004.02799.x) <<<<<<<<<< use in paper

# paclitaxel cr: total 19 genes <<<<<<<<<<<<< use in paper
dim(rif_reac_1[pac_cr & pac_rh, ])
# [1] 19 25

# Alternatively, paclitaxel cr counted directly from d_unique: total 19 genes <<<<<<<<<<<<< use in paper
dim(d_unique[d_unique$gene_type == "protein_coding", ])
# [1] 19 50





# save cr genes with no geneRIF or reactome entries to query against DAVID interpro (DO NOT DELETE):
 # write.table(rif_reac_1[rif_reac_1$PMID_entries == 0 & rif_reac_1$reac_edges == 0 & rif_reac_1$gene_type == "protein_coding" & rif_reac_1$RH_pac==1,"geneSymbol"],"paclitaxel_unique_no_rif_reac_geneSymbol.txt",quote=FALSE,sep="\t",row.names=FALSE,col.names=FALSE)
 
# DO NOT DELETE
 # write.table(rif_reac_1[rif_reac_1$PMID_entries == 0 & rif_reac_1$reac_edges == 0 & rif_reac_1$gene_type == "protein_coding" & rif_reac_1$RH_pac==1,"gene_id"],"paclitaxel_unique_no_rif_reac_ensembl_id.txt",quote=FALSE,sep="\t",row.names=FALSE,col.names=FALSE)
 
# No domains enriched in DAVID interpro


# nc: 15 genes lacking both geneRIF and reactome entries
dim(rif_reac_1[pac_unannotated & !pac_cr & pac_rh, ])
# [1] 15 25 <<<<<<<<<<<<< use in paper

# nc: total 15 genes, i.e. the same count as above, so NO nc gene has either a reactome or a geneRIF entry
dim(rif_reac_1[!pac_cr & pac_rh, ])
# [1] 15 25 <<<<<<<<<<<<< use in paper


# Alternatively, total paclitaxel nc including CEN: total 19 genes 
dim(d_unique[d_unique$gene_type != "protein_coding", ])
# [1] 19 50 <<<<<<<<<<<<< use in paper


# Alternatively, total paclitaxel nc non-centromeric: total 15 genes 
dim(d_unique[d_unique$gene_type != "protein_coding" & d_unique$geneSymbol != "CEN", ])
# [1] 15 50 <<<<<<<<<<<<< use in paper




# -------- How many  RH Ix cr and nc genes lack both rif and Reactome? ----------------


# Load the Ix loci table
Ix <- read.delim("Ix_loci.txt", header = TRUE, sep = "\t", stringsAsFactors = FALSE, check.names = FALSE)


dim(Ix)
# [1] 62 50

dim(Ix[Ix$geneSymbol != "CEN", ])
# [1] 57 50



# rif_reac_1 does not include CEN, see "How many RH growth cr and nc genes lack both rif and Reactome?" above

# Flag (1/0) the rif_reac_1 genes that appear in the Ix table
rif_reac_1$RH_Ix <- as.numeric(rif_reac_1$gene_id %in% Ix$ensembl_gene_id)

# Row masks over rif_reac_1 used by the counts below
ix_rh <- rif_reac_1$RH_Ix == 1                                # Ix genes
ix_cr <- rif_reac_1$gene_type == "protein_coding"             # cr genes
ix_unannotated <- rif_reac_1$PMID_entries == 0 & rif_reac_1$reac_edges == 0   # neither geneRIF nor reactome entries

# cr: 5 genes without geneRIF or reactome entries <<<<<<<<<<<<< use in paper
dim(rif_reac_1[ix_unannotated & ix_cr & ix_rh, ])
# [1]  5 26

# These cr genes without geneRIF or reactome entries are:
rif_reac_1[ix_unannotated & ix_cr & ix_rh, "geneSymbol"]
# [1] "CDH13"   "FSIP1"   "SEMA3D"  "ALK"     "PDE4DIP"

# Checked by hand. None have no pubmed entries. All have entries <<<<<<<<<< use in paper


# Ix cr: total 26 genes <<<<<<<<<<<<< use in paper
dim(rif_reac_1[ix_cr & ix_rh, ])
# [1] 26 26

# Alternatively, Ix cr counted directly from Ix: total 26 genes <<<<<<<<<<<<< use in paper
dim(Ix[Ix$gene_type == "protein_coding", ])
# [1] 26 50


# save cr genes with no geneRIF or reactome entries to query against DAVID interpro  (DO NOT DELETE):
# write.table(rif_reac_1[rif_reac_1$PMID_entries == 0 & rif_reac_1$reac_edges == 0 & rif_reac_1$gene_type == "protein_coding" & rif_reac_1$RH_Ix==1,"geneSymbol"],"Ix_no_rif_reac_geneSymbol.txt",quote=FALSE,sep="\t",row.names=FALSE,col.names=FALSE)
 
# DO NOT DELETE
# write.table(rif_reac_1[rif_reac_1$PMID_entries == 0 & rif_reac_1$reac_edges == 0 & rif_reac_1$gene_type == "protein_coding" & rif_reac_1$RH_Ix==1,"gene_id"],"Ix_no_rif_reac_ensembl_id.txt",quote=FALSE,sep="\t",row.names=FALSE,col.names=FALSE)

# No domains enriched in DAVID interpro



# nc: 31 genes lacking both geneRIF and reactome entries
dim(rif_reac_1[ix_unannotated & !ix_cr & ix_rh, ])
# [1] 31 26 <<<<<<<<<<<<< use in paper

# nc: total 31 genes, i.e. the same count as above, so NO nc gene has either a reactome or a geneRIF entry
dim(rif_reac_1[!ix_cr & ix_rh, ])
# [1] 31 26 <<<<<<<<<<<<< use in paper


# Alternatively, total Ix nc including CEN: total 36 genes 
dim(Ix[Ix$gene_type != "protein_coding", ])
# [1] 36 50 <<<<<<<<<<<<< use in paper


# Alternatively, total Ix nc non-centromeric: total 31 genes 
dim(Ix[Ix$gene_type != "protein_coding" & Ix$geneSymbol != "CEN", ])
# [1] 31 50 <<<<<<<<<<<<< use in paper

































