library(dplyr)
library(tidyverse)
library(Rsamtools)
library(stringi)

# Load the refGene protein-coding annotation as a header-less, tab-separated
# table; columns get the default names V1, V2, ... and are referred to by
# those names below.
refgene_df <- read.csv(
  "/home/users/ayh/Projects/27_A3B/03_sequencing/single_clone_sequencing/WGS/28_genomic_feature/01_refernce/intron/hg19_refGene.protein_coding.txt",
  header = FALSE,
  sep = "\t"
) %>%
  as_tibble() # as.tibble() is deprecated; as_tibble() is the supported name

# Export columns V3, V5 and V6 as a three-column, tab-separated BED file.
# write.table() has no `header` argument (that belongs to read.table());
# the header is suppressed with `col.names = FALSE`. Quoting is disabled so
# the first column is written as bare text rather than "quoted" strings.
write.table(
  dplyr::select(refgene_df, V3, V5, V6),
  "hg19_refGene.protein_coding.bed",
  sep = "\t",
  quote = FALSE,
  col.names = FALSE,
  row.names = FALSE
)

# Remove the "chr" text from the chromosome column, then keep only rows on
# chromosomes 1-22 and X.
refgene_df$V3 <- gsub("chr", "", refgene_df$V3)
refgene_df <- refgene_df[refgene_df$V3 %in% c(as.character(1:22), "X"), ]

# Reference genome FASTA, wrapped as an Rsamtools FaFile so that sequence can
# be pulled from it with getSeq().
fasta_file <- FaFile(
  file = "/home/users/ayh/Projects/reference/genome/human/GRCh37/A3B/human_g1k_v37.rtTA.A3B_mcherry_vec.fa"
)


library(GenomicRanges)


# One plus-strand interval per annotation row, built from V3 (sequence name),
# V5 (start) and V6 (end).
# NOTE(review): start is V5 as-is and end is V6 + 1. If V5/V6 are 0-based
# half-open txStart/txEnd, this pads every interval by one base on each side
# (presumably to keep trinucleotide context at the edges) -- confirm.
gene_gr <- GRanges(
  seqnames = refgene_df$V3,
  ranges = IRanges(
    start = as.numeric(refgene_df$V5),
    end = as.numeric(refgene_df$V6) + 1
  ),
  strand = "+"
)

# Merge overlapping intervals first so that no base is extracted twice, then
# fetch the sequence of each merged interval from the reference FASTA.
gene_seq <- getSeq(fasta_file, reduce(gene_gr))
# Count TCN trinucleotides (T, C, then any of A/C/G/T) in every extracted
# sequence; TCA_count holds one integer count per sequence in gene_seq.
#
# The third base is matched with a lookahead instead of being consumed:
# regex matches cannot overlap, so the pattern "TC[ACGT]" would see only one
# motif in "TCTCA" (TCT) and miss the TCA that starts on its last base.
# Only the motif as written is counted; its reverse complement is not.
TCA_count <- str_count(as.character(gene_seq), "TC(?=[ACGT])")

# Write the grand total as a single bare number (no header, no row name).
write.table(
  sum(TCA_count),
  "protein_coding_genic_len.TCN.txt",
  sep = "\t",
  quote = FALSE,
  col.names = FALSE,
  row.names = FALSE
)
