#Find growth peaks
# https://stats.stackexchange.com/questions/22974/how-to-find-local-peaks-valleys-in-a-series-of-data

#' Find local maxima that dominate their neighbourhood.
#'
#' A candidate is any position where the series stops rising (the sign of the
#' first difference drops). A candidate at index p is kept only if x[p] is
#' greater than or equal to every value in the m positions before it and the
#' m positions after it (windows are clipped at the ends of the series).
#'
#' @param x Numeric vector to scan.
#' @param m Half-width of the comparison window, in number of points.
#' @return Numeric vector of indices into `x` that are peaks, in increasing
#'   order; `numeric(0)` when there are none.
find_peaks <- function (x, m = 3){
    n <- length(x)
    # shape < 0 marks a switch from rising to flat/falling (or flat to falling)
    shape <- diff(sign(diff(x)))
    candidates <- which(shape < 0)
    pks <- lapply(candidates, function(i) {
        # i indexes `shape`; the corresponding position in `x` is i + 1
        z <- max(i - m + 1, 1)   # left edge of the window, clipped to 1
        w <- min(i + m + 1, n)   # right edge of the window, clipped to n
        if (all(x[c(z:i, (i + 2):w)] <= x[i + 1])) i + 1 else numeric(0)
    })
    pks <- unlist(pks)
    # unlist() of an empty list is NULL; always hand back a vector
    if (is.null(pks)) numeric(0) else pks
}


# Standard error of the mean: square root of (variance / number of non-missing
# observations). Missing values are ignored in both terms.
sem <- function(x) {
  n_obs <- sum(!is.na(x))
  sqrt(var(x, na.rm = TRUE) / n_obs)
}

# Compare two numeric samples and print a report.
#
# Prints the t.test() summary for a versus b, the unrounded p-value, and then
# the mean, SEM, SD and number of non-missing values of each sample.
#
#   a, b: numeric vectors; NAs are ignored in the descriptive statistics.
#
# Called for its printed output; the value of the last print() call is
# returned invisibly.
compare <- function(a,b) {
	# Run the test once and reuse the result for both printed items
	tt <- t.test(a,b)
	print(tt)

	print(paste0("exact P value = ", tt$p.value))

	# Print the descriptive statistics of one sample under its label
	describe <- function(label, v) {
		print(paste0("mean of ", label, " = ", mean(v, na.rm = TRUE)))
		print(paste0("sem of ", label, " = ", sem(v)))
		print(paste0("sd of ", label, " = ", sd(v, na.rm = TRUE)))
		print(paste0("number in ", label, " = ", sum(!is.na(v))))
	}

	describe("a", a)
	describe("b", b)
}



# Tab-separated table with a header row; later code uses its Chromosome, pos
# and log10p_* columns.
logP <- read.table(
  "log10P_human.txt",
  header = TRUE,
  sep = "\t",
  stringsAsFactors = FALSE
)


# Tab-separated, no header: the first field becomes the row name and the
# second is stored in a single column called "thresh". Rows are looked up
# later by log10p_* column name.
human_thresh_95 <- read.table(
  "human_thresh_95.txt",
  header = FALSE,
  sep = "\t",
  row.names = 1,
  col.names = c("", "thresh"),
  stringsAsFactors = FALSE
)


#####################################################################################################
# --------------- Manually id peaks and compare with peak finding algorithm -------------------------
#####################################################################################################


# Scatter plot of log10p_g_0nM against position for chr1, with the chr-wide
# threshold from human_thresh_95 drawn as a red horizontal line.
plot(logP[logP$Chromosome=="chr1",]$pos,logP[logP$Chromosome=="chr1",]$log10p_g_0nM,cex=0.1)
abline(h= human_thresh_95["log10p_g_0nM",],lwd=0.3,col="red")
# Interactive step: peaks are clicked in the plot window. identify() returns
# the indices of the selected points, i.e. row numbers within the chr1 subset
# of logP (not within the whole table).
pts1 <- identify(logP[logP$Chromosome=="chr1",]$pos,logP[logP$Chromosome=="chr1",]$log10p_g_0nM,cex=0.3)


# Indices recorded from the manual session:
pts1
# [1]   966  1394  1402  1435  1949  2582  3292  5221  5936  6515  6809  7094  7635  8069  8444  8497  8766  9279  9827  9995 10376 11134 11708 16009 16112 17012
# [27] 17014 17542 18664 18812 19366 19368 19709 20601 21185 21189 21696 21919 22604 23264 23592 23737 24216 24728 24815




# Chromosome, position and score of each hand-picked point
logP[logP$Chromosome=="chr1",c("Chromosome","pos","log10p_g_0nM")][pts1,]
      # Chromosome       pos log10p_g_0nM
# 966         chr1   9160000    11.167897
# 1394        chr1  13440000     9.383750
# 1402        chr1  13520000     9.585628
# 1435        chr1  13850000     7.882810
# 1949        chr1  18990000    44.740285
# 2582        chr1  25320000     9.173873
# 3292        chr1  32420000    19.743998
# 5221        chr1  51710000    23.247005
# 5936        chr1  58860000    19.249278
# 6515        chr1  64650000    20.658886
# 6809        chr1  67590000    25.563108
# 7094        chr1  70440000    15.025671
# 7635        chr1  75850000    12.418260
# 8069        chr1  80190000    14.804542
# 8444        chr1  83940000    16.858790
# 8497        chr1  84470000    18.124037
# 8766        chr1  87160000    12.619391
# 9279        chr1  92290000     9.713567
# 9827        chr1  97770000    10.550682
# 9995        chr1  99450000    58.157547
# 10376       chr1 103260000    54.414257
# 11134       chr1 110840000    19.115451
# 11708       chr1 116580000    13.109831
# 16009       chr1 159590000    20.713889
# 16112       chr1 160620000    18.222160
# 17012       chr1 169620000    12.332378
# 17014       chr1 169640000    12.574550
# 17542       chr1 174920000    16.669853
# 18664       chr1 186140000     9.555433
# 18812       chr1 187620000    10.963045
# 19366       chr1 193160000    11.985306
# 19368       chr1 193180000    12.673419
# 19709       chr1 196590000     7.440836
# 20601       chr1 205510000    10.532110
# 21185       chr1 211350000    13.058630
# 21189       chr1 211390000    12.597113
# 21696       chr1 216460000    13.265888
# 21919       chr1 218690000    13.832303
# 22604       chr1 225540000    18.847610
# 23264       chr1 232140000     9.476501
# 23592       chr1 235420000    15.442900
# 23737       chr1 236870000    15.614277
# 24216       chr1 241660000    17.484538
# 24728       chr1 246780000    14.654374
# 24815       chr1 247650000    18.929676

# Number of hand-picked points on chr1 (rows) and columns shown
dim(logP[logP$Chromosome=="chr1",c("Chromosome","pos","log10p_g_0nM")][pts1,])
# [1] 45  3

# Same manual procedure for chr2: plot, draw the threshold, then click peaks.
plot(logP[logP$Chromosome=="chr2",]$pos,logP[logP$Chromosome=="chr2",]$log10p_g_0nM,cex=0.1)
abline(h= human_thresh_95["log10p_g_0nM",],lwd=0.3,col="red")
# identify() returns indices within the chr2 subset of logP
pts2 <- identify(logP[logP$Chromosome=="chr2",]$pos,logP[logP$Chromosome=="chr2",]$log10p_g_0nM,cex=0.3)

# Indices recorded from the manual session:
pts2
 # [1]   509   824   882  1272  1988  2132  2326  3558  3945  4358  4661  4944  4958  5646  6252  6628  7252  7825  8789  9757 10016 11239 11459 12557 12562 12965
# [27] 14572 14615 14857 15299 15652 16076 16997 17587 19303 19661 19704 19976 20736 21081 22756



# Chromosome, position and score of each hand-picked point
logP[logP$Chromosome=="chr2",c("Chromosome","pos","log10p_g_0nM")][pts2,]
      # Chromosome       pos log10p_g_0nM
# 25504       chr2   4590000    29.852839
# 25819       chr2   7740000    26.050992
# 25877       chr2   8320000    36.833238
# 26267       chr2  12220000    28.056349
# 26983       chr2  19380000    36.259233
# 27127       chr2  20820000    36.331188
# 27321       chr2  22760000    34.121253
# 28553       chr2  35080000    33.924010
# 28940       chr2  38950000    31.637428
# 29353       chr2  43080000    24.887112
# 29656       chr2  46110000    25.342788
# 29939       chr2  48940000    23.469490
# 29953       chr2  49080000    22.280915
# 30641       chr2  55960000    39.666109
# 31247       chr2  62020000    30.551502
# 31623       chr2  65780000    42.506260
# 32247       chr2  72020000    29.266139
# 32820       chr2  77750000    60.692156
# 33784       chr2  87390000    52.733505
# 34752       chr2  97070000    16.799405
# 35011       chr2  99660000    16.046321
# 36234       chr2 111890000    38.081195
# 36454       chr2 114090000    39.498702
# 37552       chr2 125070000    13.273759
# 37557       chr2 125120000    16.127742
# 37960       chr2 129150000    15.141341
# 39567       chr2 145220000     8.857582
# 39610       chr2 145650000    21.383796
# 39852       chr2 148070000    21.126200
# 40294       chr2 152490000    32.357394
# 40647       chr2 156020000    38.180234
# 41071       chr2 160260000    32.310879
# 41992       chr2 169470000    10.546976
# 42582       chr2 175370000    14.499178
# 44298       chr2 192530000    18.378620
# 44656       chr2 196110000    11.270505
# 44699       chr2 196540000    14.058847
# 44971       chr2 199260000    11.886230
# 45731       chr2 206860000    57.121139
# 46076       chr2 210310000    44.044311
# 47751       chr2 227060000    16.016503



# Number of hand-picked points on chr2 (rows) and columns shown
dim(logP[logP$Chromosome=="chr2",c("Chromosome","pos","log10p_g_0nM")][pts2,])
# [1] 41  3


# Same manual procedure for chr3: plot, draw the threshold, then click peaks.
plot(logP[logP$Chromosome=="chr3",]$pos,logP[logP$Chromosome=="chr3",]$log10p_g_0nM,cex=0.1)
abline(h= human_thresh_95["log10p_g_0nM",],lwd=0.3,col="red")
# identify() returns indices within the chr3 subset of logP
pts3 <- identify(logP[logP$Chromosome=="chr3",]$pos,logP[logP$Chromosome=="chr3",]$log10p_g_0nM,cex=0.3)
pts3


# NOTE(review): pts3 is printed a second time here; the repeat only re-displays the same vector.
pts3
# [1]   241   242   495  1499  1509  1952  2768  2837  2860  3486  3499  4089  5112  5404  5744  6101  6548  6592  7018  7335  7855  7992  8361  8397  9622  9945
# [27] 10523 10648 11191 11196 11715 12084 12321 12727 13558 14074 14192 14470 14945 15108 15583 15806 15885 16137 16152 16484 16777 17248 17285 17587 18194 18219
# [53] 18477



# Number of hand-picked points on chr3 (rows) and columns shown
dim(logP[logP$Chromosome=="chr3",c("Chromosome","pos","log10p_g_0nM")][pts3,])
# [1] 53  3






# subsetting to get rid of NAs not necessary (see below), but more general to keep.

# Count missing values in each of columns 5 to 16 of logP
vapply(logP[, 5:16], function(column) sum(is.na(column)), integer(1))
 # log10p_g_0nM  log10p_g_8nM log10p_g_25nM log10p_g_75nM  log10p_g_avg   log10p_d_w1   log10p_d_w2   log10p_d_w3   log10p_d_w4   log10p_d_w6  log10p_d_avg 
            # 0             0             0             0             0             0             0             0             0             0             0 
# log10p_g_d_Ix 
            # 0 
            
            

##########################################################################################################
# >>> Highest intersection with handpicked loci (and a little bit more conservative) when m = 200 <<<<<<<<
# >>>>>>>>>>>>>>>>>>>>>> Corresponds to spacing of 200*10*1e3 = 2 Mb <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
##########################################################################################################



# Intersect



# Size of the union: |A or B| = |A| + |B| - |A and B|
# Overlap score (Jaccard index) = |A and B| / (|A| + |B| - |A and B|), with A = automated peaks, B = hand-picked peaks



# Overlap between automated peak calls and hand-picked points on one
# chromosome: number of shared loci divided by the number of loci in either
# set. Loci are matched on "<Chromosome>_<pos>" keys.
#   auto_peaks: data frame of automated peaks with columns Chromosome and pos
#   chrom:      chromosome name, e.g. "chr1"
#   picked:     indices into the rows of logP belonging to `chrom`
peak_overlap <- function(auto_peaks, chrom, picked) {
  auto_chr <- auto_peaks[auto_peaks$Chromosome == chrom, ]
  loci_chr <- logP[logP$Chromosome == chrom, ]
  hand_chr <- loci_chr$Chromosome[picked]
  auto_ids <- paste0(auto_chr$Chromosome, "_", auto_chr$pos)
  hand_ids <- paste0(hand_chr, "_", loci_chr$pos[picked])
  n_shared <- length(intersect(auto_ids, hand_ids))
  n_shared / (length(auto_chr$Chromosome) + length(hand_chr) - n_shared)
}

# Window sizes to evaluate; one result row per window size
key <- seq(50, 450, 50)
intersect_ans <- data.frame(
  m = key,
  intersect_1 = NA_real_,
  intersect_2 = NA_real_,
  intersect_3 = NA_real_
)

for (i in seq_along(key)) {

  m <- key[i]

  subset <- !is.na(logP$log10p_g_0nM) # remove NAs
  peaks <- find_peaks(logP[subset, "log10p_g_0nM"], m)
  # Rows at the automated peaks, then only those above the threshold
  g_0nM <- logP[subset, ][peaks, ]
  g_0nM <- g_0nM[
    g_0nM$log10p_g_0nM > human_thresh_95["log10p_g_0nM", ],
    c("Chromosome", "pos", "log10p_g_0nM")
  ]
  colnames(g_0nM)[3] <- "log10P"
  g_0nM$conc <- 0

  intersect_ans[i, "intersect_1"] <- peak_overlap(g_0nM, "chr1", pts1)
  intersect_ans[i, "intersect_2"] <- peak_overlap(g_0nM, "chr2", pts2)
  intersect_ans[i, "intersect_3"] <- peak_overlap(g_0nM, "chr3", pts3)

}

# Mean and SEM of the overlap across the three chromosomes
intersect_ans$mean <- rowMeans(intersect_ans[, c(2:4)])
intersect_ans$sem <- apply(intersect_ans[, c(2:4)], 1, FUN = sem)


intersect_ans
    # m intersect_1 intersect_2 intersect_3      mean        sem
# 1  50   0.4805195   0.4864865   0.4020619 0.4563559 0.02720164
# 2 100   0.5901639   0.6206897   0.5342466 0.5817001 0.02531027
# 3 150   0.6538462   0.7608696   0.5645161 0.6597439 0.05675901
# 4 200   0.6666667   0.7906977   0.5818182 0.6797275 0.06065091 <<<<<<<<<<< use in paper
# 5 250   0.6739130   0.7209302   0.5094340 0.6347591 0.06411563
# 6 300   0.6222222   0.6428571   0.4339623 0.5663472 0.06645996
# 7 350   0.5777778   0.5365854   0.4150943 0.5098192 0.04883236
# 8 400   0.4888889   0.4634146   0.4150943 0.4557993 0.02164027
# 9 450   0.4666667   0.4634146   0.3584906 0.4295240 0.03552910





# --------------- automated peak finder fewer peaks (more conservative) than manual, but not significantly -----------------------------





# Number of automated, above-threshold peaks on chr1-3 for each window size
key <- seq(50, 450, by = 50)
ans <- data.frame(m = key, chr1 = NA_real_, chr2 = NA_real_, chr3 = NA_real_)

for (i in seq_along(key)) {

  m <- key[i]

  subset <- !is.na(logP$log10p_g_0nM) # remove NAs
  peaks <- find_peaks(logP[subset, "log10p_g_0nM"], m)
  # Rows at the automated peaks, then only those above the threshold
  g_0nM <- logP[subset, ][peaks, ]
  g_0nM <- g_0nM[
    g_0nM$log10p_g_0nM > human_thresh_95["log10p_g_0nM", ],
    c("Chromosome", "pos", "log10p_g_0nM")
  ]
  names(g_0nM)[3] <- "log10P"
  g_0nM$conc <- 0

  ans$chr1[i] <- nrow(g_0nM[g_0nM$Chromosome == "chr1", ])
  ans$chr2[i] <- nrow(g_0nM[g_0nM$Chromosome == "chr2", ])
  ans$chr3[i] <- nrow(g_0nM[g_0nM$Chromosome == "chr3", ])

}

# Mean and SEM of the peak counts across the three chromosomes
ans$mean <- rowMeans(ans[, 2:4])
ans$sem <- apply(ans[, 2:4], 1, FUN = sem)


ans
    # m chr1 chr2 chr3     mean       sem
# 1  50   69   69   83 73.66667 4.6666667
# 2 100   52   53   59 54.66667 2.1858128
# 3 150   41   40   44 41.66667 1.2018504
# 4 200   35   36   34 35.00000 0.5773503 <<<<<< corresponds to highest overlap
# 5 250   32   33   27 30.66667 1.8559215
# 6 300   28   28   23 26.33333 1.6666667
# 7 350   26   22   22 23.33333 1.3333333
# 8 400   22   19   22 21.00000 1.0000000
# 9 450   21   19   19 19.66667 0.6666667






# mean handpicked peaks on chrs 1, 2, 3:

# Number of hand-picked points on chr1, chr2 and chr3
manual_counts <- lengths(list(pts1, pts2, pts3))

mean(manual_counts)
# 46.33333


sem(manual_counts)
# [1] 3.527668



# Hand-picked counts versus automated counts at m = 200 (one value per chromosome)
compare(manual_counts, as.numeric(ans[ans$m == 200, c("chr1", "chr2", "chr3")]))

	# Welch Two Sample t-test <<<<<<<<<<< use in paper

# data:  a and b
# t = 3.1705, df = 2.1071, p-value = 0.08107
# alternative hypothesis: true difference in means is not equal to 0
# 95 percent confidence interval:
 # -3.321765 25.988432
# sample estimates:
# mean of x mean of y 
 # 46.33333  35.00000 

# [1] "exact P value = 0.0810748441967471"
# [1] "mean of a = 46.3333333333333"
# [1] "sem of a = 3.52766841475279"
# [1] "sd of a = 6.11010092660779"
# [1] "number in a = 3"
# [1] "mean of b = 35"
# [1] "sem of b = 0.577350269189626"
# [1] "sd of b = 1"
# [1] "number in b = 3"






# ---------------- make g and g_unique tables ------------------------


# Window size used for all peak calls below
m <- 200


# Count missing values in each of columns 5 to 16 of logP. The NA filtering
# below is not needed for this table (all counts are 0) but is kept for generality.

vapply(logP[, 5:16], function(column) sum(is.na(column)), integer(1))
 # log10p_g_0nM  log10p_g_8nM log10p_g_25nM log10p_g_75nM  log10p_g_avg   log10p_d_w1   log10p_d_w2   log10p_d_w3   log10p_d_w4   log10p_d_w6  log10p_d_avg 
            # 0             0             0             0             0             0             0             0             0             0             0 
# log10p_g_d_Ix 
            # 0 


# Automated peaks of one logP column that exceed that column's threshold.
#   column: name of a logP column; the same name is used as the row name
#           looked up in human_thresh_95
#   conc:   label stored in the `conc` column of the result
#   window: window size passed to find_peaks(); defaults to the script-level m
# Returns a data frame with columns Chromosome, pos, log10P and conc.
significant_peaks <- function(column, conc, window = m) {
  not_missing <- !is.na(logP[[column]]) # remove NAs
  scored <- logP[not_missing, ]
  peak_rows <- scored[find_peaks(scored[[column]], window), ]
  out <- peak_rows[
    peak_rows[[column]] > human_thresh_95[column, ],
    c("Chromosome", "pos", column)
  ]
  colnames(out)[3] <- "log10P"
  out$conc <- conc
  out
}

g_0nM <- significant_peaks("log10p_g_0nM", 0)
g_8nM <- significant_peaks("log10p_g_8nM", 8)
g_25nM <- significant_peaks("log10p_g_25nM", 25)
g_75nM <- significant_peaks("log10p_g_75nM", 75)
g_avg <- significant_peaks("log10p_g_avg", "avg")

# Stack the per-condition peak tables into one
g <- rbind(g_0nM, g_8nM, g_25nM, g_75nM, g_avg)



# Four-column table: Chromosome, pos, pos + 1, conc
g.bed <- g[, c("Chromosome", "pos")]
g.bed$pos_plus <- g.bed$pos + 1
g.bed$conc <- g$conc
# Convert to text without scientific notation before writing
g.bed <- format(g.bed, scientific = FALSE)

write.table(
  g.bed,
  "g.bed",
  sep = "\t",
  quote = FALSE,
  row.names = FALSE,
  col.names = FALSE
)


# ----------- Create table genecode_cen.bed from gencode, ensembl and ucsc ----------------------
# ~~~ Go to START HERE (below) if gencode_gtf_ensembl_ucsc already constructed, to save time ~~~~~

# Create gene table from gencode and ensembl websites, supplemented with information from UCSC table browser
# All gencode tables are 1 based counting. ucsc tables are 0-based. I will use 1-based counting in my tables.


# ~~~~~~~ Download latest gencode GTF from gencode website ~~~~~~~~~~~
# ~~~~~~~ Downloaded gencode.v31.annotation.gtf on 08/31/2019 ~~~~~~~~

library(rtracklayer)
# Read the GTF and flatten it to a plain data frame
gencode_gtf <- as.data.frame(
  import("gencode.v31.annotation.gtf"),
  stringsAsFactors = FALSE
)

# Strip every ".<digits>" version suffix from gene_id and transcript_id so the
# ids can later be merged with gencode_ensembl
gencode_gtf[c("gene_id", "transcript_id")] <- lapply(
  gencode_gtf[c("gene_id", "transcript_id")],
  function(ids) gsub("\\.[0-9]+", "", ids)
)




# Keep only the columns needed downstream and give them the names used in the
# rest of the script. transcript_id is dropped because rows of type "gene"
# (selected below) carry no transcript id.
gencode_gtf <- gencode_gtf[, c(
  "seqnames", "start", "end", "width", "strand",
  "type", "gene_id", "gene_type", "gene_name"
)]
names(gencode_gtf) <- c(
  "Chromosome", "geneS", "geneE", "geneLength", "strand",
  "type", "gene_id", "gene_type", "geneSymbol"
)

# Keep gene-level records only (drops exon, CDS, etc.)
gencode_gtf <- gencode_gtf[gencode_gtf$type == "gene", ]

# Convert every factor column to character
fctr.cols <- vapply(gencode_gtf, is.factor, logical(1))
gencode_gtf[fctr.cols] <- lapply(gencode_gtf[fctr.cols], as.character)


# "type" is no longer needed. But remember to correct for ensembl and gencode being 1 based and ucsc being 0 based!!!
gencode_gtf <- gencode_gtf[, setdiff(names(gencode_gtf), "type")]

dim(gencode_gtf)
# [1] 60603     8


# gencode_gtf has only one entry per gene_id, but multiple gene_id and tx_id for some geneSymbols, eg RF00019

length(unique(gencode_gtf$geneSymbol))
# [1] 59050

length(unique(gencode_gtf$gene_id))
# [1] 60603


# ~~~~~~~~~~ gencode v31 from ensembl ~~~~~~~~~~~~~~~

# Supplement info with gencode downloaded from ensembl biomart web site using structure option 

# library(seqinr)
# gencode_ensembl <- read.fasta("martquery_0115051407_592.txt")
# gencode_ensembl <- do.call(rbind,strsplit(names(gencode_ensembl),'\\|'))
# gencode_ensembl <- as.data.frame(gencode_ensembl)

# # replace empty entries with NA:
# is.na(gencode_ensembl) <- do.call(cbind,lapply(gencode_ensembl, FUN = function(x) {x==""}))



# gencode_ensembl <- read.table("martquery_0115213848_796.txt", header=TRUE, stringsAsFactors = FALSE, sep = "\t",quote="")

# Or supplement with gencode info downloaded from ensembl using biomaRt. This is **superior** to above.

# library(biomaRt)
# ensembl <- useDataset("hsapiens_gene_ensembl", mart=useMart("ensembl"))


# gencode_ensembl <- list()

# # download from ensembl. Download by chromosomes, or else ensembl times out.

# for (i in c(1:22,"X","Y","MT")) {

# gencode_ensembl[[i]] <- getBM(
						# attributes=c(
							# 'chromosome_name',
							# 'start_position',
							# 'end_position',
							# 'strand',
							# 'ensembl_gene_id',
							# 'ensembl_transcript_id',
							# 'gene_biotype',
							# 'external_gene_name',
							# 'transcript_length',
							# '5_utr_start',
							# '5_utr_end',
							# '3_utr_start',
							# '3_utr_end', 
							# 'cds_length', 
							# 'description'
						# ), 
			      # filters = 'chromosome_name', 
			      # values = i, 
			      # mart = ensembl
			      # )
			      
# }


# gencode_ensembl <- do.call(rbind, gencode_ensembl)





# chrOrder <- c(1:22,"X","Y","MT")
# gencode_ensembl$chromosome_name <-factor(gencode_ensembl$chromosome_name, levels=chrOrder)
# gencode_ensembl <- gencode_ensembl[order(gencode_ensembl$chromosome_name, gencode_ensembl$start_position), ]
# gencode_ensembl$chromosome_name <- as.character(gencode_ensembl$chromosome_name)






# save 'frozen' version for archival purposes:			      
# write.table(gencode_ensembl, "gencode_ensembl_v31.txt",quote=FALSE,sep="\t",row.names=FALSE,col.names=TRUE)

# Use 'frozen' table from biomaRt. # Use read.delim cf https://kbroman.org/blog/2017/08/08/eof-within-quoted-string/
# quote mark in a gene description entry introduced during production of gencode_ensembl via biomaRt
gencode_ensembl <- read.delim(
  "gencode_ensembl_v31.txt",
  header = TRUE,
  sep = "\t",
  stringsAsFactors = FALSE
)

# # alternative approach: https://kbroman.org/blog/2017/08/08/eof-within-quoted-string/
# gencode_ensembl <- read.table("gencode_ensembl_v31.txt", header=TRUE, sep="\t", stringsAsFactors=FALSE, check.names=FALSE, quote="", fill=FALSE)

# Rename the columns to the names used in the rest of the script
colnames(gencode_ensembl) <- c(
  "Chromosome", "geneS", "geneE", "strand", "gene_id", "tx_id",
  "gene_type", "geneSymbol", "txLength",
  "5utrS", "5utrE", "3utrS", "3utrE",
  "cdsLength", "gene_description"
)

# Chromosome names: "MT" becomes "M", then every name gets a "chr" prefix
gencode_ensembl[gencode_ensembl$Chromosome == "MT", "Chromosome"] <- "M"
gencode_ensembl$Chromosome <- paste0("chr", gencode_ensembl$Chromosome)

# Strand: 1 / -1 become "+" / "-"
gencode_ensembl[gencode_ensembl$strand == 1, "strand"] <- "+"
gencode_ensembl[gencode_ensembl$strand == -1, "strand"] <- "-"

dim(gencode_ensembl)
# [1] 513708     15




# Choose the longest txLength for each gene entry. Use gene_id rather than geneSymbol to preserve different entries for repetitive genes with different gene_id but same geneSymbol, eg RF00019
# aggregate() gives the maximum txLength per gene_id; merge() then joins on the
# columns the two tables share (gene_id and txLength), so only rows of
# gencode_ensembl whose txLength equals the per-gene maximum are kept.
gencode_ensembl_agg <- merge(aggregate(txLength~gene_id,gencode_ensembl,max),gencode_ensembl,all.x=TRUE,sort=FALSE)

dim(gencode_ensembl_agg)
# [1] 106682     15


# Several rows can share the maximum; keep the first row seen for each gene_id
gencode_ensembl_agg <- gencode_ensembl_agg[!duplicated(gencode_ensembl_agg[,c("gene_id")]),]


dim(gencode_ensembl_agg)
# [1] 60558    15


# gencode_gtf is a bit bigger:
dim(gencode_gtf)
# [1] 60603     8



# Pick the longest 5' UTR row for each tx_id. This does not necessarily pick the longest txLength or cdsLength; use gencode_ensembl_agg for that.
gencode_ensembl_5utr <- gencode_ensembl
# UTR length; the + 1 accounts for 1-based inclusive coordinates
gencode_ensembl_5utr[["5utrDiff"]] <- gencode_ensembl_5utr[["5utrE"]] - gencode_ensembl_5utr[["5utrS"]] + 1
# which.max() below needs numbers, so missing lengths are temporarily set to 0
gencode_ensembl_5utr[["5utrDiff"]][is.na(gencode_ensembl_5utr[["5utrDiff"]])] <- 0
# One row per tx_id: the row with the largest 5' UTR
gencode_ensembl_5utr <- do.call(
  rbind,
  lapply(
    split(gencode_ensembl_5utr, gencode_ensembl_5utr[["tx_id"]]),
    function(tx_rows) tx_rows[which.max(tx_rows[["5utrDiff"]]), ]
  )
)
# Restore NA where either UTR coordinate is missing
gencode_ensembl_5utr[
  is.na(gencode_ensembl_5utr[["5utrS"]]) | is.na(gencode_ensembl_5utr[["5utrE"]]),
  "5utrDiff"
] <- NA
rownames(gencode_ensembl_5utr) <- NULL

dim(gencode_ensembl_5utr)
# [1] 226721     16

# Pick the longest 3' UTR row for each tx_id. This does not necessarily pick the longest txLength or cdsLength; use gencode_ensembl_agg for that.
gencode_ensembl_3utr <- gencode_ensembl
# UTR length; the + 1 accounts for 1-based inclusive coordinates
gencode_ensembl_3utr[["3utrDiff"]] <- gencode_ensembl_3utr[["3utrE"]] - gencode_ensembl_3utr[["3utrS"]] + 1
# which.max() below needs numbers, so missing lengths are temporarily set to 0
gencode_ensembl_3utr[["3utrDiff"]][is.na(gencode_ensembl_3utr[["3utrDiff"]])] <- 0
# One row per tx_id: the row with the largest 3' UTR
gencode_ensembl_3utr <- do.call(
  rbind,
  lapply(
    split(gencode_ensembl_3utr, gencode_ensembl_3utr[["tx_id"]]),
    function(tx_rows) tx_rows[which.max(tx_rows[["3utrDiff"]]), ]
  )
)
# Restore NA where either UTR coordinate is missing
gencode_ensembl_3utr[
  is.na(gencode_ensembl_3utr[["3utrS"]]) | is.na(gencode_ensembl_3utr[["3utrE"]]),
  "3utrDiff"
] <- NA
rownames(gencode_ensembl_3utr) <- NULL

dim(gencode_ensembl_3utr)
# [1] 226721     16



# Merge gencode_ensembl_5utr and gencode_ensembl_3utr.
# No `by` is given, so merge() joins on every column name the two inputs share. The 3' UTR coordinates are dropped
# from the 5' table and the 5' UTR coordinates from the 3' table so that they are not part of the join key.
# NB because tx_id is included in the match, the max 5utr and max 3utr must be in the same transcript. That is good, because it selects for tx_id with max 5utr and max 3utr, hence most likely to be bona fide full length transcripts. Note that tx_id refers to the max 5utr and 3utr, not necessarily the txLength.
gencode_ensembl_utr <- merge(gencode_ensembl_5utr[,setdiff(names(gencode_ensembl_5utr), c("3utrS", "3utrE"))],gencode_ensembl_3utr[,setdiff(names(gencode_ensembl_3utr), c("5utrS", "5utrE"))])


dim(gencode_ensembl_utr)
# [1] 226721     17

# # Here is strategy if do not wish to enforce same tx_id for gencode_ensembl_5utr and gencode_ensembl_3utr:
# gencode_ensembl_utr <- gencode_ensembl_utr <- merge(gencode_ensembl_5utr[,setdiff(names(gencode_ensembl_5utr), c("tx_id","3utrS", "3utrE"))],gencode_ensembl_3utr[,setdiff(names(gencode_ensembl_3utr), c("tx_id","5utrS", "5utrE"))])

# dim(gencode_ensembl_utr)
# # [1] 208984     16

# # makes only ~1% difference in dataframe size, so most max 5utr are in same tx as max 3utr
# 208984/206534
# # [1] 1.011862



# Merge gencode_ensembl_agg and gencode_ensembl_utr.
# No `by` is given, so merge() joins on every column name the two selections share (all eleven columns taken from
# gencode_ensembl_agg, including tx_id); the UTR columns come from gencode_ensembl_utr only.
# NB tx_id in gencode_ensembl_agg refers to max txLength. tx_id in gencode_ensembl_utr refers to tx_id with max 5utr and 3utr. No obvious reason for them to match so well but they do.
gencode_ensembl_final <- merge(gencode_ensembl_agg[,c("gene_id","txLength","Chromosome","geneS","geneE","strand","tx_id","gene_type","geneSymbol","cdsLength","gene_description")],gencode_ensembl_utr[,c("Chromosome","geneS","geneE","strand","gene_id","tx_id","gene_type","geneSymbol","txLength","cdsLength","gene_description", "5utrS", "5utrE", "5utrDiff", "3utrS","3utrE", "3utrDiff")])

# Number of rows in gencode_ensembl_final is exactly the same as in gencode_ensembl_agg, even though all.x=TRUE was not used. Hence 100% exact matches of tx_id between the two dataframes!! Hence tx_id does refer to maximum txLength, max 5utr and max 3utr.
dim(gencode_ensembl_agg)
# [1] 60558    15

dim(gencode_ensembl_final)
# [1] 60558    17



# Here is match if do not insist on tx_id matching in gencode_ensembl_agg and gencode_ensembl_utr. Very minimal difference, ~0.049%. However, the very slight increases must represent artifactual expansion based on exact matches of tx_id in two dataframes, as described above. Best to keep tx_id as above.

# gencode_ensembl_final <- merge(gencode_ensembl_agg[,c("gene_id","txLength","Chromosome","geneS","geneE","strand","gene_type","geneSymbol","cdsLength","gene_description")],gencode_ensembl_utr[,c("Chromosome","geneS","geneE","strand","gene_id","gene_type","geneSymbol","txLength","cdsLength","gene_description", "5utrS", "5utrE", "5utrDiff", "3utrS","3utrE", "3utrDiff")])

# dim(gencode_ensembl_final)
# # [1] 58705    16

# 58705/58676
# # [1] 1.000494





# Row counts of the two gene tables before comparing their contents
dim(gencode_gtf)
# [1] 60603     8

dim(gencode_ensembl_final)
# [1] 60558    17


# All gene symbols found in gencode_gtf are also in gencode_ensembl_final
# (zero rows have a symbol that is missing from the other table):

dim(gencode_gtf[gencode_gtf$geneSymbol %in% setdiff(gencode_gtf$geneSymbol,gencode_ensembl_final$geneSymbol),c("geneSymbol","gene_type","Chromosome")])
# [1] 0   3


# All gene symbols found in gencode_ensembl_final are also in gencode_gtf:
dim(gencode_ensembl_final[gencode_ensembl_final$geneSymbol %in% setdiff(gencode_ensembl_final$geneSymbol,gencode_gtf$geneSymbol),c("geneSymbol","gene_type","Chromosome")])
# [1] 0   3

# The three merges below join on all shared column names and are only used to compare row counts.
# include all genes in gencode_gtf, leave some out from gencode_ensembl_final
dim(merge(gencode_gtf, gencode_ensembl_final,all.x=TRUE))
# [1] 60603    18

# include all genes in gencode_ensembl_final, leave some out from gencode_gtf
dim(merge(gencode_gtf, gencode_ensembl_final,all.y=TRUE))
# [1] 60558    18

# include all genes
dim(merge(gencode_gtf, gencode_ensembl_final,all=TRUE))
# [1] 60603    18


# Some genes repeated in gencode_gtf and in gencode_ensembl_final. 
# However, when repeated genes ignored, gencode_gtf and gencode_ensembl_final are same
# Thus extra genes in gencode_gtf are due to repeated genes

# Gene symbols are repeated in gencode_gtf (more rows than unique symbols)
length(gencode_gtf$geneSymbol)
# [1] 60603

length(unique(gencode_gtf$geneSymbol))
# [1] 59050


# Some gene symbols are repeated in gencode_ensembl_final too
length(gencode_ensembl_final$geneSymbol)
# [1] 60558

length(unique(gencode_ensembl_final$geneSymbol))
# [1] 59050

# Extra repeated genes in gencode_gtf confirmed by:

# Symbols duplicated in gencode_ensembl_final but not in gencode_gtf: none
setdiff(gencode_ensembl_final[duplicated(gencode_ensembl_final$geneSymbol),c("geneSymbol")],gencode_gtf[duplicated(gencode_gtf$geneSymbol),c("geneSymbol")])
# character(0)


# Symbols duplicated in gencode_gtf but not in gencode_ensembl_final
setdiff(gencode_gtf[duplicated(gencode_gtf$geneSymbol),c("geneSymbol")],gencode_ensembl_final[duplicated(gencode_ensembl_final$geneSymbol),c("geneSymbol")])
 # [1] "AL954722.1" "PLCXD1"     "GTPBP6"     "LINC00685"  "PPP2R3B"    "AL732314.6" "AL732314.4" "FABP5P13"   "KRT18P53"   "SHOX"       "AL672277.1" "RPL14P5"   
# [13] "CRLF2"      "CSF2RA"     "MIR3690"    "RNA5SP498"  "IL3RA"      "SLC25A6"    "LINC00106"  "ASMTL-AS1"  "ASMTL"      "P2RY8"      "AKAP17A"    "ASMT"      
# [25] "AL683807.1" "AL683807.2" "DHRSX"      "DHRSX-IT1"  "ZBED1"      "MIR6089"    "CD99P1"     "LINC00102"  "CD99"       "SPRY3"      "AMD1P2"     "DPH3P2"    
# [37] "VAMP7"      "ELOCP24"    "TRPC6P"     "IL9R"       "AJ271736.1" "WASIR1"     "WASH6P"     "DDX11L16"  


# Mostly seems to be X and Y repeated genes. For example:
# (gencode_gtf has one chrX row and one chrY row whose gene_id carries a _PAR_Y suffix)
gencode_gtf[gencode_gtf$geneSymbol == "AL954722.1",]
        # Chromosome  geneS  geneE geneLength strand               gene_id              gene_type geneSymbol
# 2779282       chrX 253743 255091       1349      +       ENSG00000228572 unprocessed_pseudogene AL954722.1
# 2871982       chrY 253743 255091       1349      + ENSG00000228572_PAR_Y unprocessed_pseudogene AL954722.1


# The same symbol in gencode_ensembl_final has the chrX row only
gencode_ensembl_final[gencode_ensembl_final$geneSymbol == "AL954722.1",]
              # gene_id txLength Chromosome  geneS  geneE strand           tx_id              gene_type geneSymbol cdsLength            gene_description 5utrS 5utrE
# 28166 ENSG00000228572      259       chrX 253743 255091      + ENST00000431238 unprocessed_pseudogene AL954722.1        NA regucalcin (RGN) pseudogene    NA    NA
      # 5utrDiff 3utrS 3utrE 3utrDiff
# 28166       NA    NA    NA       NA


# And:

gencode_gtf[gencode_gtf$geneSymbol == "WASIR1",]
        # Chromosome     geneS     geneE geneLength strand               gene_id gene_type geneSymbol
# 2871884       chrX 156014623 156016837       2215      -       ENSG00000185203    lncRNA     WASIR1
# 2881646       chrY  57201143  57203357       2215      - ENSG00000185203_PAR_Y    lncRNA     WASIR1



gencode_ensembl_final[gencode_ensembl_final$geneSymbol == "WASIR1",]
              # gene_id txLength Chromosome     geneS     geneE strand           tx_id gene_type geneSymbol cdsLength
# 15914 ENSG00000185203     1054       chrX 156014623 156016837      - ENST00000399966    lncRNA     WASIR1        NA
                                                       # gene_description 5utrS 5utrE 5utrDiff 3utrS 3utrE 3utrDiff
# 15914 WASH and IL9R antisense RNA 1 [Source:HGNC Symbol;Acc:HGNC:38513]    NA    NA       NA    NA    NA       NA


# From https://www.gencodegenes.org/pages/faq.html:
# "What is the difference between GENCODE GTF and Ensembl GTF?
# The gene annotation is the same in both files. The only exception is that the genes which are common to the human chromosome X and Y PAR regions can be found twice in the GENCODE GTF, while they are shown only for chromosome X in the Ensembl file."



# Therefore decided to use genes in gencode_gtf. Also gencode web site says best set of genes for most users (ie high quality!)
# Left join on all shared column names: every gencode_gtf row is kept, and rows
# without a match in gencode_ensembl_final get NA in the added columns.
gencode_gtf_ensembl <- merge(gencode_gtf,gencode_ensembl_final,all.x=TRUE)

dim(gencode_gtf_ensembl)
# [1] 60603    18

# # If wish to keep all genes in both data frames do this instead. Gives same results:
# gencode_gtf_ensembl <- merge(gencode_gtf,gencode_ensembl_final,all=TRUE)



# ~~~~~~~~ ucsc gencode v31 ~~~~~~~~~~~~~~~~~~~~~~~

# ucsc. Only adds exon counts for each gene.
# ucsc tables use 0-based counting. I ignore geneS and geneE in UCSC and will use 1-based counting of ensembl tables.
# On ucsc table browser: "Genes and Gene Predictions", "ALL GENCODE V31"
# output format: "selected fields from primary and related tables"
# Creates ucsc_gencode_v31.txt. Selected fields shown in colnames(), below

# Load the UCSC table browser export. It is read without a header line, so the
# column names are attached here in the order the fields were selected.
gencode_ucsc <- setNames(
  read.table(
    "ucsc_gencode_v31.txt",
    header = FALSE,
    sep = "\t",
    stringsAsFactors = FALSE
  ),
  c(
    "tx_id", "Chromosome", "strand", "geneS", "geneE", "cdsS", "cdsE",
    "exonCount", "exonS", "exonE", "score", "geneSymbol"
  )
)

# Strip the ".<digits>" version suffix from tx_id so it can be merged with
# gencode_gtf_ensembl, whose transcript ids carry no version number.
gencode_ucsc[["tx_id"]] <- gsub(
  pattern = "\\.[0-9]+",
  replacement = "",
  x = gencode_ucsc[["tx_id"]]
)


dim(gencode_ucsc)
# [1] 100040      12

# Left join the exon counts from UCSC onto the gene table. Only three UCSC columns
# are passed in, so the join key is the shared columns among them and exonCount is
# the only column added.
gencode_gtf_ensembl_ucsc <- merge(
  gencode_gtf_ensembl,
  gencode_ucsc[, c("tx_id", "geneSymbol", "exonCount")],
  all.x = TRUE
)

# Because combo of "tx_id" and "geneSymbol" not quite unique for duplicated genes in gencode_ucsc, end up with 28 duplicated chrX gene rows.
# Drop rows that are exact duplicates of an earlier row.
gencode_gtf_ensembl_ucsc <- gencode_gtf_ensembl_ucsc[
  !duplicated(gencode_gtf_ensembl_ucsc), ,
  drop = FALSE
]

# replace empty entries with NA:
gencode_gtf_ensembl_ucsc[gencode_gtf_ensembl_ucsc == ""] <- NA


# Put the columns into their final order.
gencode_gtf_ensembl_ucsc <- gencode_gtf_ensembl_ucsc[, c(
  "Chromosome", "gene_id", "tx_id", "geneSymbol", "strand",
  "geneS", "geneE", "geneLength", "txLength", "cdsLength",
  "5utrS", "5utrE", "5utrDiff", "3utrS", "3utrE", "3utrDiff",
  "exonCount", "gene_type", "gene_description"
)]


# Sort by chromosome (karyotype order, via a temporary factor) and then by gene start.
chrOrder <- paste0("chr", c(1:22, "X", "Y", "M"))
gencode_gtf_ensembl_ucsc$Chromosome <- factor(
  gencode_gtf_ensembl_ucsc$Chromosome,
  levels = chrOrder
)
gencode_gtf_ensembl_ucsc <- gencode_gtf_ensembl_ucsc[
  with(gencode_gtf_ensembl_ucsc, order(Chromosome, geneS)),
]
gencode_gtf_ensembl_ucsc$Chromosome <- as.character(gencode_gtf_ensembl_ucsc$Chromosome)



# remember gencode is 1-based
dim(gencode_gtf_ensembl_ucsc)
# [1] 60603    19


# write.table(gencode_gtf_ensembl_ucsc, "gencode_gtf_ensembl_ucsc_v31.txt",quote=FALSE,sep="\t",row.names=FALSE,col.names=TRUE)



# ~~~~~~~~~~ if desired, can START HERE for gencode_gtf_ensembl_ucsc ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# ~~~~~~~~~~ Also permits use of 'frozen' archived v31 gencode tables ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# ~~~~~~~~~~ Note use of read.delim ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# read.delim() already defaults to header = TRUE and sep = "\t".
# check.names = FALSE keeps column names such as "5utrS" exactly as written in the file.
gencode_gtf_ensembl_ucsc <- read.delim(
  file = "gencode_gtf_ensembl_ucsc_v31.txt",
  stringsAsFactors = FALSE,
  check.names = FALSE
)
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~



# ~~~~~~~~~~~~~~~~~~ cen ~~~~~~~~~~~~~~~~~~~~~~

# Create cen table from UCSC table browser
# "All tracks", "Centromeres" to create hg38_centromere.txt

# Build a one-row-per-chromosome centromere table with the same columns as
# gencode_gtf_ensembl_ucsc, so the two can be stacked with rbind().
hg38_centromere <- read.table(
  "hg38_centromere.txt",
  header = TRUE,
  sep = "\t",
  stringsAsFactors = FALSE
)

# Make file cen.bed using limits of centromeres following hg38_centromere_limits_1.R
# Per chromosome: smallest chromStart and largest chromEnd over all rows.
hg38_centromere_min <- aggregate(chromStart ~ chrom, data = hg38_centromere, FUN = min)
hg38_centromere_max <- aggregate(chromEnd ~ chrom, data = hg38_centromere, FUN = max)

# Joined on the shared "chrom" column.
cen <- merge(hg38_centromere_min, hg38_centromere_max)

colnames(cen) <- c("Chromosome", "geneS", "geneE")

# convert ucsc table to 1-based:
cen$geneS <- cen$geneS + 1

# add 1 to subtraction, because 1-based table
cen$geneLength <- cen$geneE - cen$geneS + 1
cen$gene_type <- "centromere"

# Sort in karyotype order. Fail early on any chromosome name outside chrOrder,
# because the factor conversion below would silently turn it into NA.
chrOrder <- paste0("chr", c(1:22, "X", "Y"))
stopifnot(all(cen$Chromosome %in% chrOrder))
cen$Chromosome <- factor(cen$Chromosome, levels = chrOrder)
cen <- cen[order(cen$Chromosome), ]
cen$Chromosome <- as.character(cen$Chromosome)


# Derive the id from each row's own chromosome ("chr1" -> "cen1", "chrX" -> "cenX")
# instead of assigning a fixed 24-element vector by position, so the label cannot
# get out of step with the row it belongs to.
cen$gene_id <- paste0("cen", sub("^chr", "", cen$Chromosome))
cen$geneSymbol <- "CEN"
cen$gene_description <- "centromere"

# Add other columns to match with gencode_gtf_ensembl_ucsc (filled with NA),
# then put the columns in the same order as that table.
cen[setdiff(names(gencode_gtf_ensembl_ucsc), names(cen))] <- NA
cen <- cen[, colnames(gencode_gtf_ensembl_ucsc)]


dim(cen)
# [1] 24 19



# ~~~~~~~~~~ combine gencode_gtf_ensembl_ucsc, cen ~~~~~~~~~~~~

# Stack the gene table and the centromere table (identical columns) into one table.
gencode_cen <- rbind(gencode_gtf_ensembl_ucsc, cen)

dim(gencode_cen)
# [1] 60627    19


# gene_id is unique identifier compared to geneSymbol, tx_id:
# compare the row count with the number of distinct values in each candidate id column.

dim(gencode_gtf_ensembl_ucsc)
# [1] 60603    19

sum(!duplicated(gencode_gtf_ensembl_ucsc[["geneSymbol"]]))
# [1] 59050

sum(!duplicated(gencode_gtf_ensembl_ucsc[["tx_id"]]))
# [1] 60559

sum(!duplicated(gencode_gtf_ensembl_ucsc[["gene_id"]]))
# [1] 60603


# this bed file is 1-based, although officially should be 0-based. OK because closest-features will use g.bed which is also 1-based.
gencode_cen.bed <- gencode_cen[c("Chromosome", "geneS", "geneE", "gene_id")]


# Written without header, row names or quoting, tab-separated.
write.table(
  gencode_cen.bed,
  file = "gencode_cen.bed",
  quote = FALSE,
  sep = "\t",
  row.names = FALSE,
  col.names = FALSE
)

# ------------------- Following in Unix terminal using bedops --------------
# sort -k1,1 -k2,2n -k3,3n g.bed > g_sort.bed
# sort -k1,1 -k2,2n -k3,3n gencode_cen.bed > gencode_cen_sort.bed
# closest-features --closest --dist g_sort.bed gencode_cen_sort.bed > answer.bed

# ------------------- Back to R --------------

# Read the closest-features output. Each line mixes "|" and tab delimiters, so every
# tab is first converted to "|" and the line is then parsed with one separator.
# read.table(text = ...) opens and closes its own text connection; the previous
# anonymous textConnection() was never closed.
# fill = TRUE pads lines that have fewer fields than the others.
g1 <- read.table(
  text = gsub("\t", "|", readLines("answer.bed"), fixed = TRUE),
  fill = TRUE,
  sep = "|",
  stringsAsFactors = FALSE
)

# Keep columns 1, 2, 4 and 6-9; columns 3 and 5 are dropped.
# Column 5 is the chromosome of the matched feature (first field of gencode_cen.bed).
# Column 3 is presumably the end coordinate of the g.bed element — TODO confirm against g.bed.
g1 <- g1[, c(1, 2, 4, 6:9)]
colnames(g1) <- c("Chromosome", "pos", "conc", "geneS", "geneE", "gene_id", "dist")

# One table with both log10P (from g) and distance to the closest feature (from g1),
# joined on the columns the two tables share.
g2 <- merge(x = g, y = g1)


# For each Chromosome / gene_id / conc combination keep only the row(s) carrying the
# maximum log10P, e.g. cen has two peaks on chr11 at 75 nM; the higher log10P is chosen.
g3 <- merge(
  x = aggregate(log10P ~ Chromosome + gene_id + conc, data = g2, FUN = max),
  y = g2
)

# combine logP, pos, conc, dist with other gene information
g3 <- merge(x = g3, y = gencode_cen)

# combine logP, pos, conc, dist, gene information with logP and coef vals
g3 <- merge(x = g3, y = logP)



# pick out significant coefficient:
# for each row, copy the coefficient from the column that matches the row's conc.
# Rows whose conc matches none of the known levels get NA (previously they were left
# at the initial value 0, which is indistinguishable from a real coefficient of 0).
g3$sig_coef <- local({
  # conc level -> name of the column holding the coefficient for that level
  coef_col_for_conc <- c(
    "0"   = "coef_g_0nM",
    "8"   = "coef_g_8nM",
    "25"  = "coef_g_25nM",
    "75"  = "coef_g_75nM",
    "avg" = "coef_g_avg"
  )
  picked <- rep(NA_real_, nrow(g3))
  for (conc_level in names(coef_col_for_conc)) {
    # which() drops NA comparisons, so missing conc values cannot break the indexing
    hit <- which(g3$conc == conc_level)
    picked[hit] <- g3[[coef_col_for_conc[[conc_level]]]][hit]
  }
  picked
})


# Final column order for the per-locus table:
# locus columns, gene annotation, then all log10p_* and coef_* columns.
g3 <- g3[, c(
  "Chromosome", "posS", "posE", "pos", "conc", "log10P", "sig_coef", "dist",
  "gene_id", "tx_id", "geneSymbol", "strand", "geneS", "geneE",
  "geneLength", "txLength", "cdsLength",
  "5utrS", "5utrE", "5utrDiff", "3utrS", "3utrE", "3utrDiff",
  "exonCount", "gene_type", "gene_description",
  "log10p_g_0nM", "log10p_g_8nM", "log10p_g_25nM", "log10p_g_75nM", "log10p_g_avg",
  "log10p_d_w1", "log10p_d_w2", "log10p_d_w3", "log10p_d_w4", "log10p_d_w6",
  "log10p_d_avg", "log10p_g_d_Ix",
  "coef_g_0nM", "coef_g_8nM", "coef_g_25nM", "coef_g_75nM", "coef_g_avg",
  "coef_d_w1", "coef_d_w2", "coef_d_w3", "coef_d_w4", "coef_d_w6",
  "coef_d_avg", "coef_g_d_Ix"
)]

# Rename gene_id / tx_id (columns 9 and 10 after the selection above).
names(g3)[match(c("gene_id", "tx_id"), names(g3))] <- c("ensembl_gene_id", "ensembl_tx_id")

# sort by chromosome (karyotype order, via a temporary factor) and then by position
g3$Chromosome <- factor(g3$Chromosome, levels = paste0("chr", c(1:22, "X", "Y")))
g3 <- g3[with(g3, order(Chromosome, pos)), ]
g3$Chromosome <- as.character(g3$Chromosome)


dim(g3)
# 1836   50


write.table(g3, file = "growth_loci.txt", quote = FALSE, sep = "\t", row.names = FALSE)

 


# make unique gene table: for each ensembl_gene_id keep the row(s) of g3 that carry
# the gene's maximum log10P.
# NOTE(review): rows tied for the maximum within a gene are all kept by this merge,
# so a gene could appear more than once — confirm that no ties occur in the data.
g3_unique <- merge(
  aggregate(log10P ~ ensembl_gene_id, data = g3, FUN = max),
  g3
)

# Restore the column order used for g3 (merge() moves the key columns to the front).
g3_unique <- g3_unique[, c(
  "Chromosome", "posS", "posE", "pos", "conc", "log10P", "sig_coef", "dist",
  "ensembl_gene_id", "ensembl_tx_id", "geneSymbol", "strand", "geneS", "geneE",
  "geneLength", "txLength", "cdsLength",
  "5utrS", "5utrE", "5utrDiff", "3utrS", "3utrE", "3utrDiff",
  "exonCount", "gene_type", "gene_description",
  "log10p_g_0nM", "log10p_g_8nM", "log10p_g_25nM", "log10p_g_75nM", "log10p_g_avg",
  "log10p_d_w1", "log10p_d_w2", "log10p_d_w3", "log10p_d_w4", "log10p_d_w6",
  "log10p_d_avg", "log10p_g_d_Ix",
  "coef_g_0nM", "coef_g_8nM", "coef_g_25nM", "coef_g_75nM", "coef_g_avg",
  "coef_d_w1", "coef_d_w2", "coef_d_w3", "coef_d_w4", "coef_d_w6",
  "coef_d_avg", "coef_g_d_Ix"
)]

# sort by chromosome (karyotype order, via a temporary factor) and then by position
g3_unique$Chromosome <- factor(
  g3_unique$Chromosome,
  levels = paste0("chr", c(1:22, "X", "Y"))
)
g3_unique <- g3_unique[order(g3_unique$Chromosome, g3_unique$pos), ]
g3_unique$Chromosome <- as.character(g3_unique$Chromosome)

dim(g3_unique)
# [1] 859   50

# Percent of all known genes.
# Computed from the tables rather than typed in, so the value follows the data:
# rows of g3_unique divided by rows of gencode_gtf_ensembl_ucsc (859/60603 when recorded).
nrow(g3_unique) / nrow(gencode_gtf_ensembl_ucsc)
# [1] 0.01417422 <<<<<<<<<<<< use in paper


write.table(g3_unique, "growth_loci_unique.txt", quote = FALSE, row.names = FALSE, sep = "\t")




























