library(dplyr)
library(tidyverse)
library(Rsamtools)
library(stringi)
library(parallel)
# Load the target intervals from a tab-separated file that has no header row.
# Because header = FALSE, read.csv() names the columns V1, V2, V3, ...; the
# code below uses V1 as the sequence name and V2 / V3 as the coordinates.
# as_tibble() replaces the deprecated as.tibble(), and FALSE replaces the
# reassignable shorthand F.
refgene_df <- read.csv(
  "/home/users/ayh/Projects/27_A3B/03_sequencing/single_clone_sequencing/WGS/28_genomic_feature/01_refernce/fin/v3/intergenic/refGene.target.unk.filtered.mappable.bed",
  header = FALSE,
  sep = "\t"
) %>%
  as_tibble()


library(GenomicRanges)
# Print the loaded interval table as a quick sanity check.
print(refgene_df)
# Debug aid: uncomment to restrict the run to the first 100 intervals.
#refgene_df<-refgene_df[1:100,]

# Handle to the reference FASTA from which the sequences are extracted.
fasta_file <- FaFile(
  file = "/home/users/ayh/Projects/reference/genome/human/GRCh37/A3B/human_g1k_v37.rtTA.A3B_mcherry_vec.fa"
)

# One range per input row, all placed on the "+" strand.
# The range starts at V2 as given and ends at V3 + 1.
# NOTE(review): if the input is a 0-based, half-open BED file, this is the
# 1-based interval widened by one base on each side -- presumably to provide
# trinucleotide context at the interval edges. Confirm this is intended.
gene_gr <- GRanges(
  seqnames = refgene_df$V1,
  IRanges(
    start = as.numeric(refgene_df$V2),
    end = as.numeric(refgene_df$V3) + 1
  ),
  strand = "+"
)

# Merge overlapping / adjacent ranges once and reuse the result, so the
# ranges that are printed are exactly the ranges that are extracted
# (the merge was previously computed twice).
merged_gr <- reduce(gene_gr)
print(merged_gr)

# Fetch the sequence of every merged range from the reference FASTA.
gene_seq <- getSeq(fasta_file, merged_gr)
#as.data.frame(gene_seq)$x

#print((reduce(refgene_df)%>%as.tibble())$width%>%sum())
# Count TCN sites in the extracted sequences and write the grand total.
#
# Two motifs are counted in every sequence of `gene_seq`:
#   * "TC" followed by A/C/G/T  (TCN as written)
#   * "GA" preceded by A/C/G/T  (NGA, the reverse complement of TCN)
#
# The flanking base is matched with a zero-width look-ahead / look-behind
# instead of being consumed. Regex matches never overlap, so the previous
# three-character patterns "TC[ACGT]" and "[ACGT]GA" skipped any site whose
# first base was the last base of the preceding match: "TCTCA" was counted
# as 1 site instead of 2, and "AGAGA" as 1 instead of 2.
#
# str_count() is vectorised over its input, so the per-sequence mclapply()
# call and the intermediate one-column tibbles are no longer needed.
# NOTE(review): the patterns are upper-case only; lower-case (soft-masked)
# bases in the FASTA would not be counted -- confirm the reference is
# upper-case.
gene_seq_chr <- as.character(gene_seq)
tcn_forward <- str_count(gene_seq_chr, "TC(?=[ACGT])")
tcn_reverse <- str_count(gene_seq_chr, "(?<=[ACGT])GA")

# Per-sequence total over both motifs (integer vector, one entry per range).
TCA_count <- tcn_forward + tcn_reverse

# Grand total over all sequences.
tcn_total <- sum(TCA_count)
#print(tcn_total)

# Write the total as a single bare number: no header, row names or quotes.
write.table(
  tcn_total,
  "protein_coding_genic_len.TCN.txt",
  sep = "\t",
  quote = FALSE,
  col.names = FALSE,
  row.names = FALSE
)
