library(dplyr)
library(tidyverse)
library(Rsamtools)
library(stringi)
library(parallel)
# Load the filtered 75-mer mappability interval table. The file is read as
# tab-separated text without a header row, so the columns get the default
# names V1, V2, V3, ...; later code uses V1 as the sequence name and V2/V3 as
# the interval start/end.
refgene_df <- read.csv(
  "/home/users/ayh/Projects/27_A3B/03_sequencing/single_clone_sequencing/WGS/28_genomic_feature/01_refernce/fin/v3/wgEncodeCrgMapabilityAlign75mer.filtered.bed",
  header = FALSE,
  sep = "\t"
) %>%
  as_tibble()

# Replace a start of 0 with 1 so that V2 can be used directly as a 1-based
# range start.
# NOTE(review): presumably the file is 0-based BED and the unshifted start is
# kept on purpose as a 1-bp left flank for trinucleotide context -- confirm.
refgene_df <- refgene_df %>%
  mutate(V2 = ifelse(V2 == 0, 1, V2))
library(GenomicRanges)
# Handle to the indexed reference FASTA that the interval sequences are
# fetched from.
fasta_file <- FaFile(
  file = "/home/users/ayh/Projects/reference/genome/human/GRCh37/A3B/human_g1k_v37.rtTA.A3B_mcherry_vec.fa"
)

print("making gr")

# One plus-strand range per table row: V1 is the sequence name, V2 the start
# and V3 + 1 the end.
gene_gr <- with(
  refgene_df,
  GRanges(
    seqnames = V1,
    ranges = IRanges(start = as.numeric(V2), end = as.numeric(V3) + 1),
    strand = "+"
  )
)

# Merge the ranges with reduce() before fetching, so the sequences are pulled
# from the merged set rather than from the individual table rows.
gene_seq <- getSeq(fasta_file, reduce(gene_gr))
#as.data.frame(gene_seq)$x

#if(FALSE){
# Count TCN sites per fetched sequence: "TC" followed by any base as read on
# the reference strand, plus "GA" preceded by any base (the same motif on the
# opposite strand).
#
# The flanking base is matched with a zero-width lookaround instead of being
# consumed. str_count() only counts non-overlapping matches, so a 3-character
# pattern such as "TC[ACGT]" skips any site that shares a base with the
# previous match (e.g. "TCTCA" holds two TCN sites but was counted once).
TCA_count <- mclapply(
  as.data.frame(gene_seq)$x,
  function(x) {
    # Row 1: reference-strand count; row 2: opposite-strand count.
    tibble(value = c(
      str_count(x, "TC(?=[ACGT])"),
      str_count(x, "(?<=[ACGT])GA")
    ))
  },
  mc.cores = 28
)

# mclapply() returns NULL or a try-error for a worker that failed, and rbind()
# silently drops NULLs, which would shrink the total without any error.
if (!all(vapply(TCA_count, is.data.frame, logical(1)))) {
  stop("TCN counting failed for at least one sequence", call. = FALSE)
}

# Sum as double so a genome-scale total cannot overflow R's 32-bit integers,
# and format without scientific notation so the file holds plain digits.
write.table(
  format(
    sum(as.numeric(do.call(rbind, TCA_count)$value)),
    scientific = FALSE
  ),
  "mappable_len.TCN.txt",
  sep = "\t",
  quote = FALSE,
  col.names = FALSE,
  row.names = FALSE
)
#}

#if(FALSE){
print("starting nonTCN counting")

# Count non-TCN cytosine sites per fetched sequence: a C preceded by A/C/G
# (anything but T) and followed by any base, plus a G preceded by any base and
# followed by T/C/G (anything but A), which is the same site on the opposite
# strand.
#
# Both flanking bases are matched with zero-width lookarounds instead of being
# consumed. str_count() only counts non-overlapping matches, so a 3-character
# pattern such as "[ACG]C[ACGT]" skips any site that shares a base with the
# previous match (e.g. "ACCCA" holds three qualifying sites but was counted
# once).
nonTCA_count <- mclapply(
  as.data.frame(gene_seq)$x,
  function(x) {
    # Row 1: reference-strand count; row 2: opposite-strand count.
    tibble(value = c(
      str_count(x, "(?<=[ACG])C(?=[ACGT])"),
      str_count(x, "(?<=[ACGT])G(?=[TCG])")
    ))
  },
  mc.cores = 28
)

# mclapply() returns NULL or a try-error for a worker that failed, and rbind()
# silently drops NULLs, which would shrink the total without any error.
if (!all(vapply(nonTCA_count, is.data.frame, logical(1)))) {
  stop("nonTCN counting failed for at least one sequence", call. = FALSE)
}

# Sum as double so a genome-scale total cannot overflow R's 32-bit integers,
# and format without scientific notation so the file holds plain digits.
write.table(
  format(
    sum(as.numeric(do.call(rbind, nonTCA_count)$value)),
    scientific = FALSE
  ),
  "mappable_len.nonTCN.txt",
  sep = "\t",
  quote = FALSE,
  col.names = FALSE,
  row.names = FALSE
)
#}
# Count NTN sites per fetched sequence: a T flanked by any base on both sides,
# plus an A flanked by any base on both sides (the same site on the opposite
# strand).
#
# Both flanking bases are matched with zero-width lookarounds instead of being
# consumed. str_count() only counts non-overlapping matches, so a 3-character
# pattern such as "[ACGT]T[ACGT]" skips any site that shares a base with the
# previous match (e.g. "ATTTA" holds three qualifying T sites but was counted
# once).
NTN_count <- mclapply(
  as.data.frame(gene_seq)$x,
  function(x) {
    # Row 1: reference-strand count; row 2: opposite-strand count.
    tibble(value = c(
      str_count(x, "(?<=[ACGT])T(?=[ACGT])"),
      str_count(x, "(?<=[ACGT])A(?=[ACGT])")
    ))
  },
  mc.cores = 28
)

# mclapply() returns NULL or a try-error for a worker that failed, and rbind()
# silently drops NULLs, which would shrink the total without any error.
if (!all(vapply(NTN_count, is.data.frame, logical(1)))) {
  stop("NTN counting failed for at least one sequence", call. = FALSE)
}

# Sum as double so a genome-scale total cannot overflow R's 32-bit integers,
# and format without scientific notation so the file holds plain digits.
write.table(
  format(
    sum(as.numeric(do.call(rbind, NTN_count)$value)),
    scientific = FALSE
  ),
  "mappable_len.NTN.txt",
  sep = "\t",
  quote = FALSE,
  col.names = FALSE,
  row.names = FALSE
)
