library(dplyr)
library(tidyverse)
library(Rsamtools)
library(stringi)
library(parallel)
# Read the tab-separated, header-less interval table and give its five
# columns explicit names. as_tibble() replaces the deprecated as.tibble().
refgene_df <- read.csv(
  "/home/users/ayh/Projects/27_A3B/03_sequencing/single_clone_sequencing/WGS/28_genomic_feature/01_refernce/fin/v3/Tx/refGene.target.unk.filtered.mappable.uniq.bed",
  header = FALSE,
  sep = "\t"
) %>%
  as_tibble()
colnames(refgene_df) <- c("chrom", "start", "end", "gene", "gene_dir")

# Keep only the rows whose gene_dir column is "-".
refgene_df <- refgene_df %>%
  filter(gene_dir == "-")

# Replace a start of 0 with 1; every other start is left as-is.
# NOTE(review): presumably because the starts are later used directly as
# 1-based range coordinates, where 0 is not a valid position -- confirm.
refgene_df <- refgene_df %>%
  mutate(start = ifelse(start == 0, 1, start))
#refgene_df<-refgene_df[1:10,]
library(GenomicRanges)
# Handle to the reference FASTA that sequences are pulled from.
fasta_file <- FaFile(
  file = "/home/users/ayh/Projects/reference/genome/human/GRCh37/A3B/human_g1k_v37.rtTA.A3B_mcherry_vec.fa"
)

# One range per row of refgene_df. The start is used unchanged and the end is
# pushed out by one position; every range is placed on the "+" strand
# regardless of gene_dir.
# NOTE(review): the one-base widening looks intended to keep the flanking base
# needed for trinucleotide contexts at the interval edges -- confirm.
interval_ranges <- IRanges(
  start = as.numeric(refgene_df$start),
  end = as.numeric(refgene_df$end) + 1
)
gene_gr <- GRanges(
  seqnames = refgene_df$chrom,
  ranges = interval_ranges,
  strand = "+"
)

# Merge overlapping/adjacent ranges first so that no base is fetched twice,
# then extract one sequence per merged range.
merged_gr <- reduce(gene_gr)
gene_seq <- getSeq(fasta_file, merged_gr)
#as.data.frame(gene_seq)$x
# Count every occurrence of a motif across a character vector of sequences.
#
# seqs  : character vector, one sequence per element.
# motif : regular expression describing the motif (e.g. "TC[ACGT]").
# cores : number of worker processes handed to mclapply().
#
# Returns a single integer: the total number of motif start positions.
#
# The motif is wrapped in a zero-width look-ahead so that occurrences sharing
# bases are all counted. A plain str_count(x, "TC[ACGT]") consumes each match
# and therefore sees only one site in "TCTCA" (TCT, then TCA is skipped
# because its T was already consumed); the look-ahead form sees both.
count_motif_sites <- function(seqs, motif, cores = 24L) {
  lookahead <- paste0("(?=", motif, ")")
  per_seq <- mclapply(
    seqs,
    function(s) str_count(s, lookahead),
    mc.cores = cores
  )

  # mclapply() returns "try-error" objects instead of stopping when a worker
  # fails; surface that explicitly rather than writing a wrong total.
  failed <- vapply(per_seq, inherits, logical(1), what = "try-error")
  if (any(failed)) {
    stop(
      sum(failed), " sequence(s) could not be scanned for motif ", motif,
      call. = FALSE
    )
  }

  # Keep the total as an integer so write.table() prints it in full rather
  # than in scientific notation.
  sum(unlist(per_seq, use.names = FALSE))
}

# Write a single count to `path` as a bare value (no header, no row name).
write_count <- function(count, path) {
  write.table(
    count,
    path,
    sep = "\t",
    quote = FALSE,
    col.names = FALSE,
    row.names = FALSE
  )
}

# Plain character copies of the extracted sequences, shared by both scans.
motif_seqs <- as.character(gene_seq)

# T-C-any-base sites in the extracted sequences.
write_count(count_motif_sites(motif_seqs, "TC[ACGT]"), "Tx.-.TCN.txt")

# any-base-G-A sites in the same sequences.
write_count(count_motif_sites(motif_seqs, "[ACGT]GA"), "Tx.-.NGA.txt")
