library(rtracklayer)
library(GenomicRanges)
library(Biostrings)


library(dplyr)
library(tidyverse)
library(parallel)

# --- (0) Command-line arguments and global options ---
# args[1] is the path to the input BED file of regions. It is also used as
# the prefix of the filtered BED written at the end of this section.
args <- commandArgs(trailingOnly = TRUE)
if (length(args) < 1L || is.na(args[1]) || !nzchar(args[1])) {
  stop("Missing argument: path to the input regions BED file", call. = FALSE)
}
# Strongly penalise scientific notation so numbers are written out in full.
options(scipen = 999)


# --- (1) Load BED and Exclude BED ---
# Example input:
#   .../C5_48hr_3/C5_48hr_3.s.indel.bam.processed.F1R2.F2R1.region.v6.000-049.bed
regions <- import(args[1], format = "BED")

# Keep only regions on chromosomes 1-22 and X (names without a "chr" prefix).
allowed_chr <- c(as.character(1:22), "X")
regions <- regions[as.character(seqnames(regions)) %in% allowed_chr]
# Discard ranges whose start or end coordinate is not strictly positive.
regions <- regions[start(regions) > 0 & end(regions) > 0]

exclude <- import("/home/users/ayh/Projects/99_scripts/annotation/botseq/snv/satellite_hg19_rename.simple.bed", format = "BED")

# --- (2) Remove regions that overlap the exclude set ---
# A region is dropped entirely as soon as it overlaps any interval in
# `exclude`; regions are neither trimmed nor merged here.
print("start filtering")
overlaps_excluded <- regions %over% exclude
filtered_regions <- regions[!overlaps_excluded]
print("filtering and merge done")

# Write the surviving regions next to the input as
# "<input>.filtered_regions.bed".
export(filtered_regions, paste(args[1], "filtered_regions.bed", sep = "."), format = "BED")
# --- (3) Load reference genome ---
suppressMessages(library(Rsamtools))
# stringr is no longer needed for the counting below; the library() call is
# kept so that any later code relying on it being attached keeps working.
library(stringr)
fasta_file <- FaFile(file = "/home/users/ayh/Projects/reference/genome/human/GRCh37/A3B/human_g1k_v37.rtTA.A3B_mcherry_vec.fa")

# --- (4) Extract sequences ---
# One sequence per range in `filtered_regions` (defined earlier in the script).
print("start get_seq")
region_seqs <- getSeq(fasta_file, filtered_regions)
print("get_seq done")

# --- (5) Count A, C, G, T ---
# alphabetFrequency(baseOnly = TRUE) yields one row per sequence with columns
# A, C, G, T and "other". Summing the columns gives the totals over all
# regions without first copying every sequence into an R character vector.
# colSums() returns doubles, so very large totals cannot overflow the integer
# range the way sum() over integer counts could.
base_totals <- colSums(alphabetFrequency(region_seqs, baseOnly = TRUE))
A_count <- base_totals[["A"]]
C_count <- base_totals[["C"]]
G_count <- base_totals[["G"]]
T_count <- base_totals[["T"]]

# --- (6) Write the per-bin base counts ---
# Expects `args` and A_count / C_count / G_count / T_count from earlier in
# the script.
fn <- args[1]
#fn<-"/home/users/ayh/Projects/27_A3B/07_revision/botseq/test/C5_48hr_3/C5_48hr_3.s.indel.bam.processed.F1R2.F2R1.region.v6.050-099.bed"
input_name <- basename(fn)

# Sample id: the file name with everything from the literal ".s.indel"
# onwards removed. The dots are escaped so they only match real dots.
id <- sub("\\.s\\.indel.*", "", input_name)

# Bin label: the "NNN-NNN" token directly before the ".bed" extension.
# The pattern is applied to the base name so that a non-matching input can
# never inject directory separators into the output file name.
bin_pattern <- ".*\\.(\\d{3}-\\d{3})\\.bed$"
if (!grepl(bin_pattern, input_name)) {
  warning(
    "No 'NNN-NNN.bed' bin suffix found in '", input_name,
    "'; using the whole file name as the bin label",
    call. = FALSE
  )
}
bin <- sub(bin_pattern, "\\1", input_name)

# One row per base: id, bin label, base letter, total count.
out_df <- data.frame(
  id = id,
  bin_group = bin,
  base = c("A", "C", "G", "T"),
  count = c(A_count, C_count, G_count, T_count)
)

# Output goes to "<id>/<id>.<bin>.count.txt" relative to the working
# directory; create the per-sample directory when it does not exist yet.
if (!dir.exists(id) && !dir.create(id, recursive = TRUE)) {
  stop("Could not create output directory: ", id, call. = FALSE)
}
out_path <- file.path(id, paste(id, bin, "count.txt", sep = "."))

# Tab-separated, no header, no row names, no quoting.
write.table(
  out_df,
  out_path,
  quote = FALSE,
  sep = "\t",
  col.names = FALSE,
  row.names = FALSE
)
