# Most of this adapted from peak_g_1.R
# Complete sampling of all positions in logP

# Load the per-position table; the columns used in this script are
# Chromosome, pos, posS and posE.
logP <- read.table("log10P_human.txt", sep = "\t", stringsAsFactors = FALSE, header = TRUE)

# Build a three-column interval table (Chromosome, pos, pos + 1) with one row
# per position in logP.
g.bed <- logP[, c("Chromosome", "pos")]
g.bed$pos_plus <- g.bed$pos + 1

# Render the coordinates in fixed (non-scientific) notation so that values
# such as 100000 are not written as 1e+05.
# Only the numeric columns are formatted, and with trim = TRUE: calling
# format() on the whole data frame pads every column to a common width, which
# puts trailing blanks after short chromosome names ("chr1 ") and leading
# blanks before short coordinates in the output file.
g.bed$pos <- format(g.bed$pos, scientific = FALSE, trim = TRUE)
g.bed$pos_plus <- format(g.bed$pos_plus, scientific = FALSE, trim = TRUE)

write.table(g.bed, "g.bed", sep = "\t", quote = FALSE, row.names = FALSE, col.names = FALSE)

# ------------------ see peak_g_1.R for construction of gencode_gtf_ensembl_ucsc ---------------------

# Load the gene annotation table. check.names = FALSE keeps the column names
# exactly as they appear in the file; names such as "5utrS" (used further
# down) are not syntactically valid R names and would otherwise be altered.
gencode_gtf_ensembl_ucsc <- read.delim(
  file = "gencode_gtf_ensembl_ucsc_v31.txt",
  header = TRUE,
  sep = "\t",
  check.names = FALSE,
  stringsAsFactors = FALSE
)

# ~~~~~~~~~~~~~~~~~~ cen ~~~~~~~~~~~~~~~~~~~~~~

# Create the centromere table with the UCSC Table Browser:
# select "All tracks", then "Centromeres", and save the output as hg38_centromere.txt

hg38_centromere <- read.table("hg38_centromere.txt", header = TRUE, sep = "\t", stringsAsFactors = FALSE)

# Collapse the centromere records to one interval per chromosome (smallest
# chromStart to largest chromEnd), following hg38_centromere_limits_1.R.
hg38_centromere_min <- aggregate(chromStart ~ chrom, data = hg38_centromere, FUN = min)
hg38_centromere_max <- aggregate(chromEnd ~ chrom, data = hg38_centromere, FUN = max)

cen <- merge(hg38_centromere_min, hg38_centromere_max, by = "chrom")

colnames(cen) <- c("Chromosome", "geneS", "geneE")

# Convert the UCSC table to 1-based starts. 1L keeps the coordinates integer,
# so they are never written out in scientific notation later on.
cen$geneS <- cen$geneS + 1L

# Add 1 to the subtraction, because the table is now 1-based.
cen$geneLength <- cen$geneE - cen$geneS + 1L
cen$gene_type <- "centromere"

# Sort in karyotype order (chr1..chr22, chrX, chrY).
chrOrder <- paste0("chr", c(1:22, "X", "Y"))
# Fail loudly on an unexpected chromosome name instead of carrying it forward.
stopifnot(all(cen$Chromosome %in% chrOrder))
cen <- cen[order(match(cen$Chromosome, chrOrder)), ]

# Derive the identifier from the chromosome name of the same row
# ("chr7" -> "cen7") rather than from row position, so the labels stay
# correct even if a chromosome is absent from the centromere table.
cen$gene_id <- sub("^chr", "cen", cen$Chromosome)
cen$geneSymbol <- "CEN"
cen$gene_description <- "centromere"

# Add the remaining gencode_gtf_ensembl_ucsc columns as NA and put the columns
# in the same order, so the two tables can be row-bound.
cen[setdiff(names(gencode_gtf_ensembl_ucsc), names(cen))] <- NA
cen <- cen[, colnames(gencode_gtf_ensembl_ucsc)]


dim(cen)
# [1] 24 19

# ~~~~~~~~~~ combine gencode_gtf_ensembl_ucsc, cen ~~~~~~~~~~~~

# Stack the centromere rows underneath the gene annotation rows.
gencode_cen <- rbind(gencode_gtf_ensembl_ucsc, cen)

c(nrow(gencode_cen), ncol(gencode_cen))
# [1] 60627    19


# Of geneSymbol, tx_id and gene_id, only gene_id has as many distinct values
# as the annotation table has rows, so it is used as the unique identifier:

c(nrow(gencode_gtf_ensembl_ucsc), ncol(gencode_gtf_ensembl_ucsc))
# [1] 60603    19

length(unique(gencode_gtf_ensembl_ucsc[["geneSymbol"]]))
# [1] 59050

length(unique(gencode_gtf_ensembl_ucsc[["tx_id"]]))
# [1] 60559

length(unique(gencode_gtf_ensembl_ucsc[["gene_id"]]))
# [1] 60603

# This bed file is 1-based, although officially BED should be 0-based. That is
# OK here because closest-features will compare it with g.bed, which is also
# 1-based.
gencode_cen.bed <- gencode_cen[, c("Chromosome", "geneS", "geneE", "gene_id")]

# Write the coordinates in fixed notation. If a coordinate column is stored as
# double, write.table may emit round values in scientific notation (e.g.
# 1e+05), which is not a valid BED coordinate. trim = TRUE avoids padding the
# values with blanks.
gencode_cen.bed$geneS <- format(gencode_cen.bed$geneS, scientific = FALSE, trim = TRUE)
gencode_cen.bed$geneE <- format(gencode_cen.bed$geneE, scientific = FALSE, trim = TRUE)

write.table(gencode_cen.bed, "gencode_cen.bed", sep = "\t", quote = FALSE, row.names = FALSE, col.names = FALSE)



# ------------------- Following in Unix terminal using bedops --------------
# sort -k1,1 -k2,2n -k3,3n g.bed > g_sort.bed
# sort -k1,1 -k2,2n -k3,3n gencode_cen.bed > gencode_cen_sort.bed
# closest-features --closest --dist g_sort.bed gencode_cen_sort.bed > answer.bed

# ------------------- Back to R --------------

# Read answer.bed. Every tab is first replaced by "|" so that each line can be
# split on a single delimiter. The text is passed through the `text` argument
# so that read.table opens and closes its own connection; an explicit
# textConnection() here would never be closed.
g1 <- read.table(
  text = gsub("\t", "|", readLines("answer.bed"), fixed = TRUE),
  fill = TRUE,
  sep = "|",
  stringsAsFactors = FALSE
)

# Keep fields 1, 2 and 5 to 8 and give them the names used in the rest of the
# script.
g1 <- g1[, c(1, 2, 5:8)]
colnames(g1) <- c("Chromosome", "pos", "geneS", "geneE", "gene_id", "dist")

# Supplement each position with its posS and posE from logP
# (joined on the shared columns Chromosome and pos).
g2 <- merge(
  g1,
  logP[, c("Chromosome", "posS", "posE", "pos")],
  by = c("Chromosome", "pos")
)

# Attach the annotation columns, joining on every column the two tables share.
g3 <- merge(g2, gencode_cen, by = intersect(names(g2), names(gencode_cen)))

# Keep the columns in the order wanted for the output table.
g3 <- g3[, c(
  "Chromosome", "posS", "posE", "pos", "dist",
  "gene_id", "tx_id", "geneSymbol", "strand",
  "geneS", "geneE", "geneLength", "txLength", "cdsLength",
  "5utrS", "5utrE", "5utrDiff",
  "3utrS", "3utrE", "3utrDiff",
  "exonCount", "gene_type", "gene_description"
)]

# Rename the two identifier columns (positions 6 and 7 after the selection).
names(g3)[match(c("gene_id", "tx_id"), names(g3))] <- c("ensembl_gene_id", "ensembl_tx_id")

# Sort by chromosome in karyotype order (chr1..chr22, chrX, chrY), then by
# position. The ordering key is computed with match() instead of converting
# the column to a factor and back: the factor round-trip would overwrite any
# chromosome name outside this list with NA in the written table. Rows with
# such names still sort last, as before, but keep their original name.
g3 <- g3[order(match(g3$Chromosome, c(paste0("chr", 1:22), "chrX", "chrY")), g3$pos), ]


dim(g3)
# [1] 305391     23

write.table(g3, "gene_sample.txt", quote = FALSE, row.names = FALSE, sep = "\t")




