# ------------------------- data ----------------------------------------

# Load the table of unique growth loci (tab-delimited, with a header row).
# check.names = FALSE keeps the column headers exactly as written in the file.
g_unique <- read.delim(
  file = "growth_loci_unique.txt",
  header = TRUE,
  sep = "\t",
  stringsAsFactors = FALSE,
  check.names = FALSE
)

# Remove CEN because it is not found in DAVID or GO.
# Each output file holds one identifier per line, with no header and no quotes.
write.table(
  g_unique$geneSymbol[g_unique$geneSymbol != "CEN"],
  file = "growth_unique_no_CEN_geneSymbol.txt",
  quote = FALSE,
  sep = "\t",
  row.names = FALSE,
  col.names = FALSE
)

write.table(
  g_unique$ensembl_gene_id[g_unique$geneSymbol != "CEN"],
  file = "growth_unique_no_CEN_ensembl_id.txt",
  quote = FALSE,
  sep = "\t",
  row.names = FALSE,
  col.names = FALSE
)

# Decided to use ensembl ids for DAVID and GO


# -------------- Func annotation chart DAVID ----------------


# Download results of Func annot chart
# called results g_DAVID_Func_Ann_Chart_1.txt

# Read the DAVID chart export (tab-delimited, with a header row).
# check.names = FALSE keeps headers such as "Pop Hits" and "List Total"
# verbatim, spaces included.
# NOTE(review): read.table() by default treats ' and " as quote characters and
# # as a comment character; if any field contains them, rows may be mis-parsed.
# Consider quote = "" and comment.char = "" after checking the file.
g_david <- read.table(
  file = "g_DAVID_Func_Ann_Chart_1.txt",
  header = TRUE,
  sep = "\t",
  stringsAsFactors = FALSE,
  check.names = FALSE
)

# Expected count per term: (Pop Hits / Pop Total) * List Total.
g_david$Expected <-
  (g_david[["Pop Hits"]] / g_david[["Pop Total"]]) * g_david[["List Total"]]

# For consistency with rest of paper, calculate MLE odds ratio from
# fisher.test in R, not classic OR.
#
# One Fisher test is run per chart row. seq_len() makes an empty chart yield an
# empty OR column instead of looping over c(1, 0) and failing.
g_david$OR <- vapply(
  seq_len(nrow(g_david)),
  function(i) {
    count <- g_david[["Count"]][i]
    list_total <- g_david[["List Total"]][i]
    pop_hits <- g_david[["Pop Hits"]][i]
    pop_total <- g_david[["Pop Total"]][i]

    # matrix() fills column-wise, so the 2x2 table is:
    #   column 1: Count,              Pop Hits - Count
    #   column 2: List Total - Count, everything left over in Pop Total
    cells <- c(
      count,
      pop_hits - count,
      list_total - count,
      pop_total - count - (list_total - count) - (pop_hits - count)
    )

    # Only the odds-ratio estimate is kept; its name is dropped so the column
    # is a plain numeric vector, as it was when filled element by element.
    unname(
      fisher.test(matrix(cells, 2, 2), alternative = "greater")$estimate
    )
  },
  numeric(1)
)


# # To compute the EASE OR (jackknife correction) instead (and the EASE P value if desired), do the following. However, the resulting P values are close to, but do not exactly match, the results from DAVID, for unclear reasons.

# g_david$OR <- 0

# for(i in c(1:nrow(g_david))) {
	
	# g_david$OR[i] <- fisher.test(matrix(c(g_david$"Count"[i]-1,(g_david$"Pop Hits"[i]-g_david$"Count"[i]),(g_david$"List Total"[i]-g_david$"Count"[i]),(g_david$"Pop Total"[i]-(g_david$"Count"[i])-(g_david$"List Total"[i]-g_david$"Count"[i])-(g_david$"Pop Hits"[i]-g_david$"Count"[i]))),2,2),alternative="greater")$estimate
	
# }


# P val quoted in g_david from DAVID is EASE P val from Fisher's test corrected via jack knife procedure.


# Rank the chart by Benjamini-adjusted p-value, smallest first, and renumber
# the rows so that row names follow the new order.
g_david <- g_david[order(g_david[["Benjamini"]]), ]
row.names(g_david) <- NULL


# Show the rows with Benjamini < 0.05, leaving out column 6.
g_david[g_david[["Benjamini"]] < 0.05, -6]
         # Category                      Term Count         %       PValue List Total Pop Hits Pop Total Fold Enrichment   Bonferroni    Benjamini          FDR   Expected
# 1     UP_KEYWORDS      Alternative splicing   306 59.765625 1.527775e-15        437    10587     20581        1.361236 5.238032e-13 5.238032e-13 2.109424e-12 224.795637
# 2  UP_SEQ_FEATURE            splice variant   232 45.312500 2.178442e-10        433     7760     20063        1.385269 3.367871e-07 3.367871e-07 3.632784e-07 167.476449
# 3        INTERPRO       IPR001202:WW domain    11  2.148438 2.217137e-07        414       53     18559        9.304029 1.806803e-04 1.806803e-04 3.411612e-04   1.182284
# 4     UP_KEYWORDS               Coiled coil    99 19.335938 9.075335e-06        437     3036     20581        1.535743 3.053730e-03 1.528032e-03 1.234158e-02  64.463923
# 5     UP_KEYWORDS            Phosphoprotein   219 42.773438 1.393768e-05        437     8246     20581        1.250794 4.686019e-03 1.564452e-03 1.895332e-02 175.088771
# 6           SMART                SM00456:WW     9  1.757812 3.694901e-05        266       49     10057        6.944376 8.095983e-03 8.095983e-03 4.702366e-02   1.296013
# 7     UP_KEYWORDS             Cell adhesion    24  4.687500 2.489583e-04        437      479     20581        2.359722 8.048543e-02 1.664185e-02 3.380480e-01  10.170691
# 8     UP_KEYWORDS              Polymorphism   292 57.031250 2.212117e-04        437    12043     20581        1.141913 7.184505e-02 1.846651e-02 3.004248e-01 255.711141
# 9     UP_KEYWORDS Chromosomal rearrangement    18  3.515625 8.238042e-04        437      334     20581        2.538114 2.425036e-01 3.889982e-02 1.114562e+00   7.091881
# 10    UP_KEYWORDS           Cell projection    30  5.859375 7.849619e-04        437      721     20581        1.959616 2.325148e-01 4.314748e-02 1.062270e+00  15.309120
          # OR
# 1   2.240674
# 2   1.855408
# 3  11.756460
# 4   1.715952
# 5   1.516382
# 6   8.531849
# 7   2.514559 <<<<<<<<<<<< use in paper
# 8   1.438300
# 9   2.695215
# 10  2.074977


# Archive results
# NOTE(review): this is the same path the chart was read from, so the
# downloaded DAVID export is replaced by the sorted table that now also carries
# the Expected and OR columns. Confirm that overwriting the input is intended.

write.table(
  g_david,
  file = "g_DAVID_Func_Ann_Chart_1.txt",
  quote = FALSE,
  sep = "\t",
  row.names = FALSE
)






# ------------------ Biological process category GO -----------------------------

# get results from GO using entrez gene ids

# Read the PANTHER biological-process export (tab-delimited). The first six
# lines of the file are skipped; the header row follows them.
# check.names = FALSE keeps headers such as "upload_1 (433)" verbatim.
# NOTE(review): read.table() by default treats ' and " as quote characters and
# # as a comment character; if any field contains them, rows may be mis-parsed.
# Consider quote = "" and comment.char = "" after checking the file.
go <- read.table(
  file = "g_panth_bio_proc_1.txt",
  header = TRUE,
  sep = "\t",
  skip = 6,
  stringsAsFactors = FALSE,
  check.names = FALSE
)


# For consistency with rest of paper, calculate MLE odds ratio from
# fisher.test in R, not classic OR.
# Again, for reasons not understood, Fisher's p vals calculated here are close
# but do not exactly match P vals from GO panther.
#
# One Fisher test is run per GO term. seq_len() makes an empty table yield an
# empty OR column instead of looping over c(1, 0) and failing.
go$OR <- vapply(
  seq_len(nrow(go)),
  function(i) {
    # Totals used for every term; they match the counts embedded in the
    # "upload_1 (433)" and "Homo sapiens - REFLIST (20996)" column headers.
    list_total <- 433
    ref_total <- 20996

    in_list <- go[["upload_1 (433)"]][i]
    in_ref <- go[["Homo sapiens - REFLIST (20996)"]][i]

    # matrix() fills column-wise, so the 2x2 table is:
    #   column 1: in_list,              in_ref - in_list
    #   column 2: list_total - in_list, everything left over in ref_total
    cells <- c(
      in_list,
      in_ref - in_list,
      list_total - in_list,
      ref_total - in_list - (in_ref - in_list) - (list_total - in_list)
    )

    # Only the odds-ratio estimate is kept; its name is dropped so the column
    # is a plain numeric vector, as it was when filled element by element.
    unname(fisher.test(matrix(cells, 2, 2))$estimate)
  },
  numeric(1)
)

# Rank GO terms by the "upload_1 (FDR)" column, smallest first.
go <- go[order(go[["upload_1 (FDR)"]]), ]

# Show the nine top-ranked terms.
head(go, n = 9)
                            # GO biological process complete Homo sapiens - REFLIST (20996) upload_1 (433) upload_1 (expected) upload_1 (over/under)
# 1                       generation of neurons (GO:0048699)                           1541             66               31.78                     +
# 2                                neurogenesis (GO:0022008)                           1640             66               33.82                     +
# 3                  nervous system development (GO:0007399)                           2351             84               48.48                     +
# 4                  regulation of localization (GO:0032879)                           2755             93               56.82                     +
# 5          regulation of synapse organization (GO:0050807)                            220             18                4.54                     +
# 6                         negative chemotaxis (GO:0050919)                             45              9                0.93                     +
# 7 regulation of synapse structure or activity (GO:0050803)                            229             19                4.72                     +
# 8                         biological adhesion (GO:0022610)                            933             43               19.24                     +
# 9                               cell adhesion (GO:0007155)                            927             43               19.12                     +
  # upload_1 (fold Enrichment) upload_1 (raw P-value) upload_1 (FDR)        OR
# 1                       2.08               3.11e-08       0.000494  2.327211
# 2                       1.95               3.36e-07       0.002680  2.169458
# 3                       1.73               7.85e-07       0.003120  1.942477
# 4                       1.64               2.00e-06       0.003530  1.839289
# 5                       3.97               1.84e-06       0.003650  4.371244
# 6                       9.70               1.23e-06       0.003910 12.097536
# 7                       4.02               7.84e-07       0.004160  4.447130
# 8                       2.23               1.83e-06       0.004160  2.436996
# 9                       2.25               1.65e-06       0.004380  2.454278 <<<<<<<<<<< use in paper


# Archive
# The sorted table goes to a new file; g_panth_bio_proc_1.txt is kept because
# it contains valuable header info.

write.table(
  go,
  file = "g_panth_bio_proc_2.txt",
  quote = FALSE,
  sep = "\t",
  row.names = FALSE
)











