library(ggplot2)
library(cowplot)
library(magick)

# ----------- functions -----------------



# Standard error of the mean of x, ignoring missing values:
# sqrt(sample variance / number of non-NA observations).
sem <- function(x) {
	n_obs <- sum(!is.na(x))
	sqrt(var(x, na.rm = TRUE) / n_obs)
}

# Compare two numeric samples a and b.
# Prints the two-sample t-test (t.test() defaults), its unrounded P value,
# and then the mean, SEM, SD and number of non-NA values for each sample.
# Relies on sem() defined in this file.
# Returns (invisibly) the last printed string, "number in b = ...".
compare <- function(a,b) {
	# Run the test once and reuse the result for both the report and the P value.
	tt <- t.test(a,b)
	print(tt)
	
	print(paste0("exact P value = ", tt$p.value))
	
	# Print the descriptive statistics of one sample under the given label.
	describe <- function(x, label) {
		print(paste0("mean of ", label, " = ", mean(x, na.rm = TRUE)))
		print(paste0("sem of ", label, " = ", sem(x)))
		print(paste0("sd of ", label, " = ", sd(x, na.rm = TRUE)))
		print(paste0("number in ", label, " = ", sum(!is.na(x))))
	}
	
	describe(a, "a")
	describe(b, "b")
}


#----------------- Prepare human logP ---------------------


# Load the human -log10 P table and the 95% thresholds (row names taken from
# the first column, values stored in column "thresh").
logP <- read.table(
	"log10P_human.txt",
	header = TRUE, sep = "\t", stringsAsFactors = FALSE
)
human_thresh_95 <- read.table(
	"human_thresh_95.txt",
	header = FALSE, sep = "\t", stringsAsFactors = FALSE,
	row.names = 1, col.names = c("", "thresh")
)

# Sort rows by chromosome (chr1..chr22, chrX) and then by position.
# Chromosome labels not listed in chrOrder become NA through the factor
# round trip and sort last.
chrOrder <- paste0("chr", c(1:22, "X"))
chr_factor <- factor(logP$Chromosome, levels = chrOrder)
row_order <- order(chr_factor, logP$pos)
logP <- logP[row_order, ]
logP$Chromosome <- as.character(chr_factor[row_order])



# # Transform chr1 etc. to numbers
# logP$Chromosome <- gsub('chr', '', logP$Chromosome)
# logP[logP$Chromosome == "X","Chromosome"] <- 23
# chrOrder<-c(1:23)
# logP$Chromosome <- factor(logP$Chromosome, levels=chrOrder)
# logP <- logP[order(logP$Chromosome, logP$pos), ]
# logP$Chromosome <- as.numeric(logP$Chromosome)

# # Compute chromosome size
# gen_coord <- aggregate(pos~Chromosome,FUN=max,data=logP)
# colnames(gen_coord)[2] <- "chr_size"
# gen_coord$Chromosome <-factor(gen_coord$Chromosome, levels=chrOrder)
# gen_coord <- gen_coord[order(gen_coord$Chromosome), ]
# gen_coord$Chromosome <- as.numeric(gen_coord$Chromosome)

# # Use cumsum to make genome coordinates
# gen_coord$coord <- c(0,cumsum(gen_coord$chr_size)[-23])

# # merge genome coordinates with logP
# logP <- merge(logP,gen_coord[,c("Chromosome","coord")])
# logP$Chromosome <-factor(logP$Chromosome, levels=chrOrder)
# logP <- logP[order(logP$Chromosome, logP$pos), ]
# logP$Chromosome <- as.numeric(logP$Chromosome)

# logP$coord <- logP$pos + logP$coord


# # find midpoints of chromosomes for breaks in ggplot
# mid <- function(x) {(max(x)+min(x))/2}
# chr_mid <- aggregate(coord~Chromosome,FUN = mid,data=logP)
# colnames(chr_mid)[2] <- "mid"
# chr_mid$Chromosome <-factor(chr_mid$Chromosome, levels=chrOrder)
# chr_mid <- chr_mid[order(chr_mid$Chromosome), ]
# chr_mid$Chromosome <- as.numeric(chr_mid$Chromosome)

# # Define breaks as mid-points chromosomes
# breaks <- chr_mid$mid


# # attractive grey and skyblue color scheme
# cb1<-rep(c("grey", "skyblue"), 12)


# standard black color scheme: 24 entries, all "black"
cb1 <- rep("black", times = 24)


# ------------------ Olfactory receptors ------------------------

g_unique <- read.delim("growth_loci_unique.txt",header=TRUE,sep="\t",stringsAsFactors=FALSE,check.names=FALSE)

# Flag loci whose gene_description contains "olfactory". The full word is used
# (instead of the looser "olf") so that this section selects the same rows as
# the "olfactory" searches further down in this script.
is_olfactory <- grepl("olfactory", g_unique[,"gene_description"])

sum(is_olfactory)
# [1] 9  (the looser "olf" pattern used previously was recorded as returning 12)

# <<<<<<<<<<< use in paper >>>>>>>>>>>>>>>
# Six olfactory gene clusters on six chromosomes. Only two loci, in clusters on chr12 and 19, are protein coding.
# <<<<<<<<<<< use in paper >>>>>>>>>>>>>>>

g_unique[is_olfactory,c("geneSymbol","Chromosome","pos","conc","log10P","dist","strand","geneLength","exonCount","gene_type","gene_description")]
    # geneSymbol Chromosome       pos conc   log10P   dist strand geneLength exonCount                          gene_type
# 183      OR5H8       chr3  98310000   25 36.41062      0      +       6092         2             polymorphic_pseudogene
# 539    OR4A13P      chr11  55470000  avg 12.68452  -2289      +        942        NA             unprocessed_pseudogene
# 540    OR5D17P      chr11  55750000    8  8.45666   4977      +        981        NA             unprocessed_pseudogene
# 541     OR5W1P      chr11  55900000   25 12.66613   3341      -        938        NA             unprocessed_pseudogene
# 591     OR10A7      chr12  55220000   25 40.14765   1025      +        951         1                     protein_coding
# 592      OR6C6      chr12  55290000    8 27.70777   3988      -       2582         2                     protein_coding
# 640    OR11H5P      chr14  20210000  avg 12.59235      0      +       5893        NA transcribed_unprocessed_pseudogene
# 770 AC005255.1      chr19  14800000   25 21.46135      0      -      46117         6                     protein_coding <<<<<<<<<< use in paper
# 853    OR2AF1P       chrX 131630000   25 22.58025 -10050      +        939        NA             unprocessed_pseudogene
                                                                                          # gene_description
# 183 olfactory receptor family 5 subfamily H member 8 (gene/pseudogene) [Source:HGNC Symbol;Acc:HGNC:14773]
# 539       olfactory receptor family 4 subfamily A member 13 pseudogene [Source:HGNC Symbol;Acc:HGNC:15150]
# 540       olfactory receptor family 5 subfamily D member 17 pseudogene [Source:HGNC Symbol;Acc:HGNC:15284]
# 541        olfactory receptor family 5 subfamily W member 1 pseudogene [Source:HGNC Symbol;Acc:HGNC:15298]
# 591                  olfactory receptor family 10 subfamily A member 7 [Source:HGNC Symbol;Acc:HGNC:15329]
# 592                   olfactory receptor family 6 subfamily C member 6 [Source:HGNC Symbol;Acc:HGNC:31293]
# 640       olfactory receptor family 11 subfamily H member 5 pseudogene [Source:HGNC Symbol;Acc:HGNC:15348]
# 770                          olfactory receptor family 7 subfamily C member 1 [Source:NCBI gene;Acc:26664]
# 853       olfactory receptor family 2 subfamily AF member 1 pseudogene [Source:HGNC Symbol;Acc:HGNC:14719]



# Examine in UCSC genome browser using growth_8nM_log10P.txt, growth_25nM_log10P.txt, growth_avg_log10P.txt.

# ------------------ AC005255.1 (OR7C1) logP  at 25 nM vs Gencode, UCSC genome browser ----------------


# AC005255.1: chr19:14,355,026-15,229,825

# # Place following header at top of the bedGraph file before loading it as a custom track on the UCSC genome browser
# browser position chr19:14,355,026-15,229,825
# track type=bedGraph name="-log10P" description="AC005255.1" visibility=full color=0,0,255 altColor=255,0,0 priority=20

# Custom track settings
# Display mode: full
# Type of graph: points
# Track height: 128 pixels
# Data view scaling: auto-scale to data view
# Always include zero: ON
# Vertical viewing range:  min: 0; max: 1000  (range: 0 to 1000) (greyed out)
# Transform function: Transform data points by: NONE
# Windowing function: maximum
# Smoothing window: OFF
# Negate values: not selected
# Draw y indicator lines: 
# at y = 0.0: ON at y = 9.47618624712419 ON (corresponds to human_thresh_95.txt, for log10p_g_25nM)


# Configure Image page on ucsc genome browser:
# image width:	400	pixels
# label area width:	10	characters	
# text size: 12

# In configuration page of the base position track:
# Title: Growth (25 nM)

# AC005255.1 was known as OR7C1 in gencode v29.
# Back to OR7C1 in gencode v32.



# Panel for the OR7C1 (AC005255.1) locus at 25 nM: read the PDF at density 300
# and draw it at 90% scale with clipping switched off.
p1 <- ggdraw() +
	draw_image(
		magick::image_read_pdf("OR7C1_25nM.pdf", density = 300),
		scale = 0.9
	) +
	coord_cartesian(clip = "off")
# Disabled labels: + draw_label("Paclitaxel", fontface='plain', size=12, x=0.55,y=0.92) + draw_label("D", fontface='bold',x=0.05,y=0.98)


# ----------- Mean distance to closest gene ---------------


# Mean of the dist column. NAs are dropped so that the mean is based on the
# same observations as sem() and t.test() below, which both ignore NAs.
mean(g_unique$dist, na.rm = TRUE)
# [1] -1659.072 <<<<<<<<<<< use in paper

sem(g_unique$dist)
# [1] 1442.327 <<<<<<<<<<< use in paper

# One-sample t-test of dist against a mean of 0 (t.test() default mu).
t.test(g_unique$dist)
	# One Sample t-test

# data:  g_unique$dist
# t = -1.1503, df = 858, p-value = 0.2504
# alternative hypothesis: true mean is not equal to 0
# 95 percent confidence interval:
 # -4489.974  1171.830
# sample estimates:
# mean of x 
# -1659.072  


# ------------ gene deserts (gd) ------------------



# Gene-desert loci: dist greater than 250 kb.
# NOTE(review): dist takes negative values elsewhere in the recorded output
# (e.g. -10050), so loci more than 250 kb away in the negative direction are
# not selected here - confirm that this is intended.
is_gene_desert <- g_unique$dist > 250e3

dim(g_unique[is_gene_desert,])
# [1]  2 50 <<<<<<<<<<<<< use in paper


gene_desert_cols <- c("geneSymbol","Chromosome","pos","conc","log10P","dist","strand","geneLength","exonCount","gene_type","gene_description")
g_unique[is_gene_desert,gene_desert_cols]
   # geneSymbol Chromosome       pos conc   log10P   dist strand geneLength exonCount            gene_type
# 39  AL591888.1       chr1 104730000   25 75.06590 268406      -      33960         5               lncRNA
# 258 AC093853.1       chr4 160220000    8 16.00126 301829      -        483        NA processed_pseudogene
                                                                 # gene_description
# 39                                                               Novel transcript
# 258 v-myb myeloblastosis viral oncogene homolog (avian)-like 2 (MYBL2) pseudogene


# Examine in UCSC genome browser using growth_8nM_log10P.txt, growth_25nM_log10P.txt, growth_avg_log10P.txt.



# ------------------ (2) gd1 logP at 25 nM vs Gencode, UCSC genome browser, chr1 gene desert ----------------

# peak at chr1:104730000

# chr1 gene desert: chr1:103,717,502-107,092,506


# # Place following header at top of gd1_25nM.txt and use in bedGraph format on ucsc genome browser
# browser position chr1:103,717,502-107,092,506
# track type=bedGraph name="-log10P" description="Chr 1 gene desert" visibility=full color=0,0,255 altColor=255,0,0 priority=20

# Custom track settings
# Display mode: full
# Type of graph: points
# Track height: 128 pixels
# Data view scaling: auto-scale to data view
# Always include zero: ON
# Vertical viewing range:  min: 0; max: 1000  (range: 0 to 1000) (greyed out)
# Transform function: Transform data points by: NONE
# Windowing function: mean
# Smoothing window: OFF
# Negate values: not selected
# Draw y indicator lines: 
# at y = 0.0: ON at y = 9.47618624712419 ON (corresponds to human_thresh_95.txt, for log10p_g_25nM)


# Configure Image page on ucsc genome browser:
# image width:	400	pixels
# label area width:	7	characters	
# text size: 12

# In configuration page of the base position track:
# Title: Growth (25 nM)


# Panel for the chr1 gene desert at 25 nM: read the PDF at density 300 and
# draw it at 90% scale with clipping switched off.
p2 <- ggdraw() +
	draw_image(
		magick::image_read_pdf("gd_chr1_25nM.pdf", density = 300),
		scale = 0.9
	) +
	coord_cartesian(clip = "off")
# Disabled labels: + draw_label("Paclitaxel", fontface='plain', size=12, x=0.55,y=0.92) + draw_label("D", fontface='bold',x=0.05,y=0.98)




# ------------------ (3) gd2 logP at 8 nM vs Gencode, UCSC genome browser, chr4 gene desert ----------------


# peak at chr4:160220000

# chr4 gene desert: chr4:158,774,265-162,636,764



# # Place following header at top of the bedGraph file before loading it as a custom track on the UCSC genome browser
# browser position chr4:158,774,265-162,636,764
# track type=bedGraph name="-log10P" description="Chr 4 gene desert" visibility=full color=0,0,255 altColor=255,0,0 priority=20

# Custom track settings
# Display mode: full
# Type of graph: points
# Track height: 128 pixels
# Data view scaling: auto-scale to data view
# Always include zero: ON
# Vertical viewing range:  min: 0; max: 1000  (range: 0 to 1000) (greyed out)
# Transform function: Transform data points by: NONE
# Windowing function: mean
# Smoothing window: OFF
# Negate values: not selected
# Draw y indicator lines: 
# at y = 0.0: ON at y = 6.76934263023539 ON (corresponds to human_thresh_95.txt, for log10p_g_8nM)


# Configure Image page on ucsc genome browser:
# image width:	400	pixels
# label area width:	7	characters	
# text size: 12

# In configuration page of the base position track:
# Title: Growth (8 nM)


# Panel for the chr4 gene desert at 8 nM: read the PDF at density 300 and
# draw it at 90% scale with clipping switched off.
p3 <- ggdraw() +
	draw_image(
		magick::image_read_pdf("gd_chr4_8nM.pdf", density = 300),
		scale = 0.9
	) +
	coord_cartesian(clip = "off")
# Disabled labels: + draw_label("Paclitaxel", fontface='plain', size=12, x=0.55,y=0.92) + draw_label("D", fontface='bold',x=0.05,y=0.98)





# --------------- Combine panels ---------------------

# Top row: p1 centred at half width between two empty spacers, labelled "A".
top_row <- plot_grid(NULL,p1, NULL, ncol=3, rel_widths=c(0.25,0.5,0.25), labels=c("", "A", ""), label_size = 16, align="v")
# Bottom row: p2 and p3 side by side, labelled "B" and "C".
bottom_row <- plot_grid(p2, p3, ncol=2, labels=c("B", "C"), label_size = 16, align="v")


pdf("g_desert_olf_1.pdf", width=7.5, height=6.67, useDingbats = FALSE)
# print() explicitly: top-level values are not auto-printed when this script
# is run through source() with default settings, which would leave the PDF
# without a page.
print(plot_grid(top_row, bottom_row, ncol=1, rel_heights=c(1,1)))
dev.off()



# png("g_desert_olf_1.png",width=7.5,height=10,units="in",res=300)
# plot_grid(top_row, bottom_row, ncol=1, rel_heights=c(1,1))
# dev.off()




# ----------------- hi res ----------------------


# Rebuild the three panels from the same PDFs at density 1200 (instead of 300)
# for the high-resolution figure. Each panel is drawn at 90% scale with
# clipping switched off, exactly as in the standard-resolution version.
read_panel_hi_res <- function(pdf_file) {
	ggdraw() +
		draw_image(magick::image_read_pdf(pdf_file, density = 1200), scale = 0.9) +
		coord_cartesian(clip = "off")
}
# Disabled labels for each panel: + draw_label("Paclitaxel", fontface='plain', size=12, x=0.55,y=0.92) + draw_label("D", fontface='bold',x=0.05,y=0.98)

p1 <- read_panel_hi_res("OR7C1_25nM.pdf")
p2 <- read_panel_hi_res("gd_chr1_25nM.pdf")
p3 <- read_panel_hi_res("gd_chr4_8nM.pdf")


# Top row: p1 centred at half width between two empty spacers, labelled "A".
top_row <- plot_grid(NULL,p1, NULL, ncol=3, rel_widths=c(0.25,0.5,0.25), labels=c("", "A", ""), label_size = 16, align="v")
# Bottom row: p2 and p3 side by side, labelled "B" and "C".
bottom_row <- plot_grid(p2, p3, ncol=2, labels=c("B", "C"), label_size = 16, align="v")


pdf("g_desert_olf_hi_res_1.pdf", width=7.5, height=6.67, useDingbats = FALSE)
# print() explicitly: top-level values are not auto-printed when this script
# is run through source() with default settings, which would leave the PDF
# without a page.
print(plot_grid(top_row, bottom_row, ncol=1, rel_heights=c(1,1)))
dev.off()








# ---------- Examine overlap of olfactory receptor (OR) gene clusters between g_unique and petal ---------------

# Load the petal table and the gencode v31 annotation table (tab-delimited,
# with header; gencode column names are kept verbatim via check.names=FALSE).
petal <- read.table(
	"petal.txt",
	header = TRUE, sep = "\t", stringsAsFactors = FALSE
)
gencode_gtf_ensembl_ucsc <- read.delim(
	"gencode_gtf_ensembl_ucsc_v31.txt",
	header = TRUE, sep = "\t", stringsAsFactors = FALSE, check.names = FALSE
)

dim(petal)
# [1] 17651     9

dim(gencode_gtf_ensembl_ucsc)
# [1] 60603    19

# add petal cols to gencode: left join on gene symbol (gencode "geneSymbol" =
# petal "Gene"); gencode rows without a petal match are kept with NA petal columns
gencode_gtf_ensembl_ucsc <- merge(
	gencode_gtf_ensembl_ucsc, petal,
	by.x = "geneSymbol", by.y = "Gene",
	all.x = TRUE
)

# Sort by chromosome (chr1..chr22, chrX, chrY), then by geneS.
# Chromosome labels not in chrOrder become NA through the factor round trip.
chrOrder <- paste0("chr", c(1:22, "X", "Y"))
gencode_gtf_ensembl_ucsc$Chromosome <- factor(gencode_gtf_ensembl_ucsc$Chromosome, levels = chrOrder)
gencode_sort_idx <- order(gencode_gtf_ensembl_ucsc$Chromosome, gencode_gtf_ensembl_ucsc$geneS)
gencode_gtf_ensembl_ucsc <- gencode_gtf_ensembl_ucsc[gencode_sort_idx, ]
gencode_gtf_ensembl_ucsc$Chromosome <- as.character(gencode_gtf_ensembl_ucsc$Chromosome)

dim(gencode_gtf_ensembl_ucsc)
# [1] 60603    27

# make df with ORs: gencode rows whose gene_description contains "olfactory"
# NOTE(review): the pattern is not restricted to "olfactory receptor"; the
# recorded TKO listing further down includes OMP, which does not look like an
# OR gene symbol - confirm whether such entries should be counted as ORs.
is_olfactory_gene <- grepl("olfactory", gencode_gtf_ensembl_ucsc$gene_description)
gencode_OR <- gencode_gtf_ensembl_ucsc[is_olfactory_gene, ]

# total number OR genes in gencode
dim(gencode_OR)
# [1] 879  27

# get rid of entries with no numTKOHits data, because not tested in petal paper
tested_in_petal <- !is.na(gencode_OR$numTKOHits)
gencode_OR <- gencode_OR[tested_in_petal, ]

# Sort by chromosome (chr1..chr22, chrX, chrY), then by geneS.
chrOrder <- paste0("chr", c(1:22, "X", "Y"))
gencode_OR$Chromosome <- factor(gencode_OR$Chromosome, levels = chrOrder)
or_sort_idx <- order(gencode_OR$Chromosome, gencode_OR$geneS)
gencode_OR <- gencode_OR[or_sort_idx, ]
gencode_OR$Chromosome <- as.character(gencode_OR$Chromosome)

# number OR genes tested in petal data
dim(gencode_OR)
# [1] 298  27 <<<<<<<<<<< use in paper

# number OR genes positive in petal data
dim(gencode_OR[gencode_OR$numTKOHits > 0,])
# [1] 19 27 <<<<<<<<<<< use in paper

# fraction of tested OR genes positive in petal data.
# Computed from the data instead of being typed in by hand: the earlier
# hard-coded 19/300 used 300 as denominator although 298 OR genes were tested.
sum(gencode_OR$numTKOHits > 0) / nrow(gencode_OR)
# [1] 0.06375839


# make dataframe with OR positive genes from petal data
gencode_OR_TKO <- gencode_OR[gencode_OR$numTKOHits > 0,]

dim(gencode_OR_TKO)
# [1] 19 27



# Sort by chromosome (chr1..chr22, chrX, chrY), then by geneS.
chrOrder <- paste0("chr", c(1:22, "X", "Y"))
gencode_OR_TKO$Chromosome <- factor(gencode_OR_TKO$Chromosome, levels = chrOrder)
tko_sort_idx <- order(gencode_OR_TKO$Chromosome, gencode_OR_TKO$geneS)
gencode_OR_TKO <- gencode_OR_TKO[tko_sort_idx, ]
gencode_OR_TKO$Chromosome <- as.character(gencode_OR_TKO$Chromosome)


# Assuming each chromosome has 1 cluster of OR genes:


# chromosomes with OR in gencode
unique(
	gencode_gtf_ensembl_ucsc$Chromosome[
		grepl("olfactory", gencode_gtf_ensembl_ucsc$gene_description)
	]
)
 # [1] "chr1"  "chr2"  "chr3"  "chr4"  "chr5"  "chr6"  "chr7"  "chr8"  "chr9"  "chr10" "chr11" "chr12" "chr13" "chr14" "chr15" "chr16" "chr17" "chr18" "chr19" "chr21"
# [21] "chr22" "chrX" 



# chromosomes tested for OR in petal. Subset of gencode. Also subset of g_unique
unique(gencode_OR[["Chromosome"]])
# [1] "chr1"  "chr2"  "chr3"  "chr5"  "chr6"  "chr7"  "chr9"  "chr10" "chr11" "chr12" "chr14" "chr15" "chr16" "chr17" "chr19" "chrX" 

# chromosomes tested for OR in petal and positive for growth effects
unique(gencode_OR_TKO[["Chromosome"]])
# [1] "chr1"  "chr9"  "chr11" "chr12" "chr14" "chr17" "chr19"





# chromosomes tested for OR in g_unique
unique(g_unique[["Chromosome"]])
# [1] "chr1"  "chr2"  "chr3"  "chr4"  "chr5"  "chr6"  "chr7"  "chr8"  "chr9"  "chr10" "chr11" "chr12" "chr13" "chr14" "chr15" "chr16" "chr17" "chr18" "chr19" "chr20" "chr21" "chr22" "chrX" 

# chromosomes positive for OR growth effects in g_unique
unique(g_unique$Chromosome[grepl("olfactory", g_unique$gene_description)])
# [1] "chr3"  "chr11" "chr12" "chr14" "chr19" "chrX"   


# Superset of possible chromosomes:
# same as unique(gencode_OR$Chromosome), since unique(gencode_OR$Chromosome) is subset of unique(g_unique$Chromosome)
intersect(
	unique(gencode_OR[["Chromosome"]]),
	unique(g_unique[["Chromosome"]])
)
#  [1] "chr1"  "chr2"  "chr3"  "chr5"  "chr6"  "chr7"  "chr9"  "chr10" "chr11" "chr12" "chr14" "chr15" "chr16" "chr17" "chr19" "chrX"

# One row per chromosome present in both gencode_OR and g_unique, with two
# 0/1 indicators:
#   TKO = 1 if the chromosome carries a petal-positive OR gene (gencode_OR_TKO)
#   g   = 1 if the chromosome carries a g_unique locus with an "olfactory" description
# The chromosome vector is computed once and all column lengths are derived
# from it, so data.frame() cannot fail on mismatched lengths if gencode_OR ever
# contains a chromosome that is absent from g_unique.
or_chromosomes <- intersect(unique(gencode_OR$Chromosome),unique(g_unique$Chromosome))

Intersect_table <- data.frame(
					Chromosome=or_chromosomes,
					TKO=rep(0,length(or_chromosomes)),
					g=rep(0,length(or_chromosomes))
					) 
					
Intersect_table[Intersect_table$Chromosome %in% gencode_OR_TKO$Chromosome,"TKO"] <- 1
Intersect_table[Intersect_table$Chromosome %in% g_unique[grep("olfactory",g_unique$gene_description),"Chromosome"],"g"] <- 1

# Fisher's exact test on the TKO x g contingency table of chromosomes.
fisher.test(table(Intersect_table[,c("TKO","g")]))

	# Fisher's Exact Test for Count Data

# data:  table(Intersect_table[, c("TKO", "g")])
# p-value = 0.3024
# alternative hypothesis: true odds ratio is not equal to 1
# 95 percent confidence interval:
  # 0.3618649 72.8305459
# sample estimates:
# odds ratio 
  # 4.193432   


# More detailed examination also confirms very little or no overlap between growth pos OR genes in g_unique and petal at chromosome cluster level:

# List the "olfactory" loci in g_unique. Columns are selected by name (the
# names shown in the recorded output below) rather than by position c(1:6,11),
# so the listing does not silently change if the column order of the input
# file changes.
g_unique[grep("olfactory",g_unique$gene_description),c("Chromosome","posS","posE","pos","conc","log10P","geneSymbol")]
    # Chromosome      posS      posE       pos conc   log10P geneSymbol
# 183       chr3  97810000  98810000  98310000   25 36.41062      OR5H8
# 539      chr11  54970000  55970000  55470000  avg 12.68452    OR4A13P
# 540      chr11  55250000  56250000  55750000    8  8.45666    OR5D17P
# 541      chr11  55400000  56400000  55900000   25 12.66613     OR5W1P
# 591      chr12  54720000  55720000  55220000   25 40.14765     OR10A7
# 592      chr12  54790000  55790000  55290000    8 27.70777      OR6C6
# 640      chr14  19710000  20710000  20210000  avg 12.59235    OR11H5P
# 770      chr19  14300000  15300000  14800000   25 21.46135 AC005255.1
# 853       chrX 131130000 132130000 131630000   25 22.58025    OR2AF1P


# First six columns of the petal-positive OR genes.
gencode_OR_TKO[, seq_len(6)]
      # geneSymbol Chromosome         gene_id           tx_id strand     geneS
# 41881      OR2T6       chr1 ENSG00000198104 ENST00000641644      + 248375746
# 41866      OR2S2       chr9 ENSG00000278889 ENST00000341959      -  35957108
# 41767      OR1J1       chr9 ENSG00000136834 ENST00000259357      - 122476958
# 41775      OR1L8       chr9 ENSG00000171496 ENST00000641027      - 122567117
# 42104     OR52N2      chr11 ENSG00000180988 ENST00000317037      +   5820314
# 41943      OR4C3      chr11 ENSG00000176547 ENST00000319856      +  48324920
# 42186     OR5D18      chr11 ENSG00000186119 ENST00000333976      +  55819607
# 42431      OR8I2      chr11 ENSG00000172154 ENST00000302124      +  56093277
# 41782      OR1S1      chr11 ENSG00000280204 ENST00000641544      +  58212720
# 41956     OR4D11      chr11 ENSG00000176200 ENST00000313253      +  59503576
# 41589        OMP      chr11 ENSG00000254550 ENST00000529803      +  77102840
# 41645     OR10D3      chr11 ENSG00000197309 ENST00000641546      + 124183345
# 42463      OR9K2      chr12 ENSG00000170605 ENST00000641329      +  55126406
# 42263     OR6C76      chr12 ENSG00000185821 ENST00000328314      +  55426254
# 42255     OR6C68      chr12 ENSG00000205327 ENST00000548615      +  55492378
# 42151     OR5AU1      chr14 ENSG00000169327 ENST00000641039      -  21148370
# 41958      OR4D2      chr17 ENSG00000255713 ENST00000641866      +  58166982
# 42397      OR7G3      chr19 ENSG00000170920 ENST00000305444      -   9126012
# 42302      OR7A5      chr19 ENSG00000188269 ENST00000322301      -  14824251

# Or, at the gene level: gene symbols shared between the "olfactory" loci in
# g_unique and the petal-positive OR genes.
intersect(
	g_unique[["geneSymbol"]][grepl("olfactory", g_unique$gene_description)],
	gencode_OR_TKO[["geneSymbol"]]
)
# character(0) <<<<<<<<<<<<<<< use in paper



# Lack of overlap between growth +ve petal OR and RH OR is not significant. Probably due to lack of power.

# overlap
intersect(g_unique[grep("olfactory",g_unique$gene_description),c("geneSymbol")],gencode_OR_TKO[,c("geneSymbol")])
# character(0)


# number growth +ve OR genes in RH g_unique
dim(g_unique[grep("olfactory",g_unique$gene_description),c(1:6,11)])
# [1] 9 7


# number growth +ve OR genes in petal
dim(gencode_OR[gencode_OR$numTKOHits > 0,])
# [1] 19 27


# number OR genes tested in petal data
dim(gencode_OR)
# [1] 298  27



# total number OR genes in gencode (not relevant here, because number OR genes limited by number tested in petal)
# Counted from the full gencode table: gencode_OR has already been reduced to
# the genes tested in petal, so dim(gencode_OR) would report 298, not 879.
dim(gencode_gtf_ensembl_ucsc[grep("olfactory",gencode_gtf_ensembl_ucsc$gene_description),])
# [1] 879  27



# 2x2 table, filled column by column from the counts above:
#   [1,1] = 0   in both lists       [1,2] = 19  petal only
#   [2,1] = 9   RH g_unique only    [2,2] = 270 neither (298 - 9 - 19)
# NOTE(review): this treats the 9 g_unique OR loci as part of the 298 genes
# tested in petal - confirm.
fisher.test(matrix(c(0,9-0,19-0,298-0-9+0-19+0),2,2))
	# Fisher's Exact Test for Count Data

# data:  matrix(c(0, 9 - 0, 19 - 0, 298 - 0 - 9 + 0 - 19 + 0), 2, 2)
# p-value = 1
# alternative hypothesis: true odds ratio is not equal to 1
# 95 percent confidence interval:
 # 0.000000 7.810721
# sample estimates:
# odds ratio 
         # 0 





# ---------- SNPs in chr 1 gene desert ---------------

# Found rs10494021 using GWAS option in ucsc genome browser.

# rs10494021 is at chr1:104741897-104741897, distance from peak marker is:

# rs10494021 position minus peak marker position (chr1:104730000), in bp
104741897 - 104730000
# [1] 11897

# LD estimate from this table of variants in high LD:
# http://uswest.ensembl.org/Homo_sapiens/Variation/HighLD?db=core;r=1:104741397-104742397;v=rs10494021;vdb=variation;vf=502315406#373514_tablePanel

# Linked paper (http://uswest.ensembl.org/Homo_sapiens/Variation/Phenotype?db=core;r=1:104741397-104742397;v=rs10494021;vdb=variation;vf=502315406) shows OR of 1.1399 and P 6.61e-6 for ear infection in GWAS (https://static-content.springer.com/esm/art%3A10.1038%2Fs41467-017-00257-5/MediaObjects/41467_2017_257_MOESM2_ESM.txt).


# ------------- Compare logP vals for gene deserts and other g loci -------------------

# Lower power, because only two RH gene desert loci.

# Group a: gene-desert loci (dist > 250 kb).
# Group b: every other locus not labelled "CEN". The cutoff is written as
# <= 250e3 so that a locus lying exactly at 250 kb falls into group b instead
# of being dropped from both groups (the earlier "< 250e3" excluded it).
compare(g_unique[g_unique$dist > 250e3 ,c("log10P")],g_unique[g_unique$geneSymbol != "CEN" & g_unique$dist <= 250e3,c("log10P")])

	# Welch Two Sample t-test <<<<<<<<<<<<<<<<<<<<<<< use in paper

# data:  a and b
# t = 0.73944, df = 1.0006, p-value = 0.5946
# alternative hypothesis: true difference in means is not equal to 0
# 95 percent confidence interval:
 # -352.9627  396.6433
# sample estimates:
# mean of x mean of y 
 # 45.53358  23.69330 

# [1] "exact P value = 0.59460500816692"
# [1] "mean of a = 45.5335823838496"
# [1] "sem of a = 29.5323214609756"
# [1] "sd of a = 41.7650095384737"
# [1] "number in a = 2"
# [1] "mean of b = 23.6933008162036"
# [1] "sem of b = 0.492583077024476"
# [1] "sd of b = 14.3864619996919"
# [1] "number in b = 853"



























