library(dplyr)
library(tidyverse)
library(data.table)
library(parallel)
# Command-line interface: the first trailing argument is the path of the
# tab-separated depth table that the rest of the script processes.
args <- commandArgs(trailingOnly = TRUE)
if (length(args) < 1L) {
  # Fail here with a usable message instead of letting the reader choke on NA.
  stop(
    "Usage: Rscript <script> <depth table, e.g. sample.s.indel.bam.depth.v9.txt>",
    call. = FALSE
  )
}
# A large scipen penalty keeps R from switching to scientific notation when
# numbers are converted to text.
options(scipen = 999)
#df<-read_tsv("/home/users/ayh/Projects/27_A3B/03_sequencing/single_clone_sequencing/WGS/15_bot_seq/A3A/05_depth/test/A3A_1st_C19_48h_3ug/A3A_1st_C19_48h_3ug.s.indel.bam.depth.v8.txt",col_types=cols("CHR1"=col_character(),"CHR2"=col_character(),"F1R2(n,mq)"=col_character(),
#                                    "F2R1(n,mq)"=col_character(),
#                                    "F1R1(n,mq)"=col_character(),
#                                    "F2R2(n,mq)"=col_character()))

#df<-read_tsv("/home/users/ayh/Projects/25_transplantation/06_botseq/210204/05_bam_depth/YHA-cancer-HC06-S16L002-wgs-ILLUMINA.fmarked.realigned.recal.bam.depth.v8.txt",col_types=cols("CHR1"=col_character(),
#                                                                                                                             "CHR2"=col_character(),
#                                                                                                                             "F1R2(n,mq)"=col_character(),
#                                                                                                                             "F2R1(n,mq)"=col_character(),
#                                                                                                                             "F1R1(n,mq)"=col_character(),
#                                                                                                                             "F2R2(n,mq)"=col_character()))


# Read the depth table named on the command line. Both chromosome columns and
# the four "(n,mq)" columns are parsed as text; the types of all remaining
# columns are left to readr's guessing.
text_columns <- c("CHR1", "CHR2",
                  "F1R2(n,mq)", "F2R1(n,mq)", "F1R1(n,mq)", "F2R2(n,mq)")
text_spec <- rep(list(col_character()), length(text_columns))
names(text_spec) <- text_columns
df <- read_tsv(file = args[1], col_types = do.call(cols, text_spec))


#df<-read_tsv("/home/users/ayh/Projects/27_A3B/07_revision/botseq/test/C5_48hr_3/test/C5_48hr_3.s.indel.bam.depth.v9.txt",col_types=cols("CHR1"=col_character(),
#                                    "CHR2"=col_character(),
#                                    "F1R2(n,mq)"=col_character(),
#                                    "F2R1(n,mq)"=col_character(),
#                                    "F1R1(n,mq)"=col_character(),
#                                    "F2R2(n,mq)"=col_character()))



#df
#df
#df<-
# Disabled alternative: the `if(FALSE)` guard means this body never runs.
# It is a data.table version of the "(n,mq)" column split. Its result `dt` is
# not referenced anywhere else in the visible script.
# NOTE(review): unlike tidyr::separate(), `:=` with tstrsplit() adds the new
# columns without removing the original "(n,mq)" columns - confirm before
# re-enabling this path.
if(FALSE){
dt <- as.data.table(df)

# Split all 4 columns using tstrsplit, which is much faster than separate
dt[, c("F1R1_n", "F1R1_mq") := tstrsplit(`F1R1(n,mq)`, ",", fixed = TRUE)]
dt[, c("F1R2_n", "F1R2_mq") := tstrsplit(`F1R2(n,mq)`, ",", fixed = TRUE)]
dt[, c("F2R1_n", "F2R1_mq") := tstrsplit(`F2R1(n,mq)`, ",", fixed = TRUE)]
dt[, c("F2R2_n", "F2R2_mq") := tstrsplit(`F2R2(n,mq)`, ",", fixed = TRUE)]

# Convert CHR2 and the eight freshly split columns to numeric in place.
cols_to_num <- c("CHR2","F1R1_n", "F1R1_mq", "F1R2_n", "F1R2_mq",
                 "F2R1_n", "F2R1_mq", "F2R2_n", "F2R2_mq")
dt[, (cols_to_num) := lapply(.SD, as.numeric), .SDcols = cols_to_num]
}
# Break each "<n>,<mq>" text column into its two comma-separated parts, one
# separate() call per column. separate() replaces the source column with the
# new *_n / *_mq pair.
split_df <- separate(df, `F1R1(n,mq)`, into = c("F1R1_n", "F1R1_mq"), sep = ",")
split_df <- separate(split_df, `F1R2(n,mq)`,
                     into = c("F1R2_n", "F1R2_mq"), sep = ",")
split_df <- separate(split_df, `F2R1(n,mq)`,
                     into = c("F2R1_n", "F2R1_mq"), sep = ",")
split_df <- separate(split_df, `F2R2(n,mq)`,
                     into = c("F2R2_n", "F2R2_mq"), sep = ",")
print("split done")

# Coerce every column except the first one to numeric. This includes CHR2 and
# the split columns, which are text at this point; text that is not a number
# becomes NA.
# NOTE(review): this relies on the first column being the only one that must
# stay non-numeric (presumably CHR1) - confirm against the input layout.
numeric_positions <- seq_along(split_df)[-1]
split_df[numeric_positions] <- lapply(split_df[numeric_positions], as.numeric)
#split_df
#split_df
#sorted_split_df%>%head(10)
### Keep rows where both F1R2 and F2R1 have n >= 3 and mq >= 20
### (presumably read count and mapping quality - confirm with the producer
### of the depth table).
split_3_df <- filter(
  split_df,
  F1R2_n >= 3,
  F1R2_mq >= 20,
  F2R1_n >= 3,
  F2R1_mq >= 20
)

# Add dist = |POS1 - POS4| and keep only rows whose dist is below 10000.
uniq_split_3_df <- mutate(split_3_df, dist = abs(POS1 - POS4))
uniq_split_3_df <- filter(uniq_split_3_df, dist < 10000)
#tmp%>%filter(dist>10000)
#ggplot(tmp,aes(x=dist))+
#  geom_histogram()+
#  scale_y_log10()+
#  scale_x_log10()
#uniq_split_4_df
#uniq_split_4_df%>%filter(POS1>POS2)

#uniq_split_3_df<-uniq_split_3_df%>%filter(CHR1==CHR2)
#uniq_split_3_df<-uniq_split_3_df%>%mutate(dist=POS2-POS1)%>%filter(dist>0)
#uniq_split_3_df%>%mutate(cover_bp=ifelse(POS3>POS2,(POS2-PO1+1+POS4-POS3+1),POS1-POS2+302))
#uniq_split_3_df<-uniq_split_3_df%>%mutate(POS3_2=ifelse(CHR1==CHR2,
#                                                        ifelse(POS3>POS2,
#                                                               POS3,
#                                                               ifelse(POS4!=POS2,
#                                                                      POS2+1,
#                                                                      POS3)),
#                                                        POS3))
#uniq_split_3_df%>%select(CHR1)%>%unique()%>%print(n=24)
#uniq_split_3_df%>%select(CHR2)%>%unique()%>%print(n=24)
# Recode CHR2 and derive adjusted coordinates for the second block.
#
# CHR2: the text "23" is replaced by "X", then "24" by "Y" (CHR2 is coerced to
#   character by gsub()).
# POS3_2: POS2 + 1 when CHR1 equals CHR2, POS3 <= POS2 and POS4 > POS2, i.e.
#   the start is moved to just after POS2; in every other case POS3 is kept.
# POS4_2: POS1 - 1 when CHR1 equals CHR2, POS3 < POS1 and POS2 > POS4;
#   otherwise POS4 is kept.
# The nested ifelse() form is kept on purpose so that an NA in any comparison
# propagates exactly as before.
uniq_split_3_df <- uniq_split_3_df %>%
  mutate(
    CHR2 = gsub("24", "Y", gsub("23", "X", CHR2)),
    POS3_2 = ifelse(
      CHR1 == CHR2,
      ifelse(
        POS3 > POS2,
        POS3,
        ifelse(
          POS4 != POS2,
          ifelse(POS4 < POS2, POS3, POS2 + 1),
          POS3
        )
      ),
      POS3
    ),
    POS4_2 = ifelse(
      (CHR1 == CHR2) & (POS3 < POS1) & (POS2 > POS4),
      POS1 - 1,
      POS4
    )
  )


#uniq_split_3_df
#uniq_split_3_df<-uniq_split_3_df%>%mutate(new_CHR2= (gsub("23","X",CHR2) %>% gsub("24","Y")))
#uniq_split_4_df
#uniq_split_4_df%>%sum(dist,na.rm=TRUE)
#uniq_split_4_df[is.numeric(uniq_split_4_df$dist)]
#sum(uniq_split_4_df$dist)

#sorted_split_4_df
#head(sort_split_4_df,10)
### overlap

#sorted_df$`F1R1(n,mq)`%>%str_split(",")
#uniq_split_3_df%>%filter(POS1==59573112)
# Drop rows in which the two coordinates of either block differ by exactly 1.
uniq_split_3_df <- uniq_split_3_df %>%
  filter(abs(POS1 - POS2) != 1 & abs(POS3 - POS4) != 1)

# Three-column (CHR, POS1, POS2) tables for the first block (A) and the
# adjusted second block (B). B is additionally limited to CHR2 values
# 1-22, X and Y. Neither table is used again in the visible part of the script.
A <- uniq_split_3_df %>%
  filter(POS1 < POS3 | POS4 < POS2 | (POS1 == POS3 & POS4 == POS2)) %>%
  select(CHR = CHR1, POS1, POS2)
B <- uniq_split_3_df %>%
  filter(
    POS1 > POS3 | POS4 > POS2 | (POS1 == POS3 & POS4 == POS2),
    CHR2 %in% c(1:22, "X", "Y")
  ) %>%
  select(CHR = CHR2, POS1 = POS3_2, POS2 = POS4_2)
#A$CHR%>%unique()
#B$CHR%>%unique()
#A%>%nrow()
#B%>%nrow()
#uniq_split_3_df%>%mutate()
#merged_bed<-rbind(A,B)
#merged_bed<-merged_bed%>%filter(POS1>0 & POS2>0)
#merged_bed<-merged_bed%>%arrange(CHR,POS1)%>%unique()
#merged_bed

##edit##
# Prepare intervals

# 1. Define the region for binning
# Row sets handed to the binning step. A_df and B_df use mirrored coordinate
# conditions (both include rows whose blocks coincide exactly); B_df is further
# restricted to CHR2 values 1-22, X and Y.
A_df <- filter(
  uniq_split_3_df,
  POS1 < POS3 | POS4 < POS2 | (POS1 == POS3 & POS4 == POS2)
)
B_df <- filter(
  uniq_split_3_df,
  POS1 > POS3 | POS4 > POS2 | (POS1 == POS3 & POS4 == POS2),
  CHR2 %in% c(1:22, "X", "Y")
)

# Append the columns consumed by the binning step: the chromosome plus the
# start/end of the first block (POS1..POS2) and of the adjusted second block
# (POS3_2..POS4_2).
# NOTE(review): CHR is taken from CHR1 for both tables, although B_df is
# filtered on CHR2 - confirm this is intended for rows where CHR1 != CHR2.
add_bed_columns <- function(blocks) {
  mutate(
    blocks,
    CHR = CHR1,
    POS1_start = POS1,
    POS1_end = POS2,
    POS2_start = POS3_2,
    POS2_end = POS4_2
  )
}

bed_A_df <- add_bed_columns(A_df)
bed_B_df <- add_bed_columns(B_df)


# 2. Split a two-block region into fixed bins of the joined coordinate system.
#
# The blocks [pos1_start, pos1_end] and [pos2_start, pos2_end] are treated as
# laid end to end: joined offsets 0 .. len1 - 1 belong to the first block and
# the second block starts at offset len1. Every fixed offset bin (000-049,
# 050-099, 100-149, 150-199, 200-249, 250-301) is intersected with each block
# and translated back to real coordinates. Joined offsets above 301 are not
# reported.
#
# Args:
#   chr        Chromosome label copied into the CHR column of every row.
#   pos1_start, pos1_end  Inclusive coordinates of the first block.
#   pos2_start, pos2_end  Inclusive coordinates of the second block.
#
# Returns:
#   A tibble with columns CHR, POS1, POS2 (inclusive real coordinates) and bin
#   (the bin label), first-block rows before second-block rows. A tibble with
#   no rows and no columns is returned when the two lengths sum to zero or
#   less, or when no bin overlaps either block.
#
# Errors:
#   Stops when the four coordinates are not scalar or any of them is NA,
#   because the block lengths cannot be computed in that case.
bin_joined_region <- function(chr, pos1_start, pos1_end, pos2_start, pos2_end) {
  coords <- c(pos1_start, pos1_end, pos2_start, pos2_end)
  if (length(coords) != 4L || anyNA(coords)) {
    stop(
      "bin_joined_region() needs four non-missing scalar coordinates",
      call. = FALSE
    )
  }

  len1 <- pos1_end - pos1_start + 1
  len2 <- pos2_end - pos2_start + 1
  if (len1 + len2 <= 0) return(tibble())

  # Fixed bin boundaries (inclusive joined offsets) and their labels.
  bin_lower <- c(0, 50, 100, 150, 200, 250)
  bin_upper <- c(49, 99, 149, 199, 249, 301)
  bin_labels <- sprintf("%03d-%03d", bin_lower, bin_upper)

  # Intersect all bins with one block at once. `offset` is the joined offset
  # at which the block starts, so `bin - offset` is block-relative.
  bins_for_block <- function(start_pos, block_len, offset) {
    rel_first <- pmax(0, bin_lower - offset)
    rel_last <- pmin(block_len - 1, bin_upper - offset)
    overlaps <- rel_first <= rel_last
    if (!any(overlaps)) return(tibble())
    tibble(
      CHR = chr,
      POS1 = start_pos + rel_first[overlaps],
      POS2 = start_pos + rel_last[overlaps],
      bin = bin_labels[overlaps]
    )
  }

  bind_rows(
    bins_for_block(pos1_start, len1, 0),
    bins_for_block(pos2_start, len2, len1)
  )
}

# 3. Apply to all regions
#
# bin_bed_rows() runs bin_joined_region() on every row of `bed_df` in forked
# workers and returns the list of per-row tibbles. Each tibble also carries the
# source row's coordinates (ori_* columns), which are used below to
# de-duplicate rows produced from identical source rows.
#
# Args:
#   bed_df  Table with CHR, POS1_start, POS1_end, POS2_start, POS2_end plus
#           CHR1, POS1, POS2, POS3_2, POS4_2.
#   label   Prefix for the per-row progress line ("A" or "B").
#   cores   Number of worker processes; defaults to the `mc.cores` option and
#           falls back to 10 when that option is unset.
#
# Stops if any worker failed. mclapply() returns such failures as "try-error"
# objects (or NULL for a worker that died), and binding those with the real
# results would silently drop or corrupt rows.
bin_bed_rows <- function(bed_df, label, cores = getOption("mc.cores", 10L)) {
  # seq_len() yields an empty index for a zero-row table, where 1:nrow() would
  # produce c(1, 0).
  pieces <- mclapply(seq_len(nrow(bed_df)), function(x) {
    print(paste0(label, x))
    t_df <- bed_df[x, ]
    bin_joined_region(
      t_df$CHR, t_df$POS1_start, t_df$POS1_end, t_df$POS2_start, t_df$POS2_end
    ) %>%
      mutate(
        ori_CHR = t_df$CHR1,
        ori_POS1 = t_df$POS1,
        ori_POS2 = t_df$POS2,
        ori_POS3 = t_df$POS3_2,
        ori_POS4 = t_df$POS4_2
      )
  }, mc.cores = cores)

  failed <- vapply(
    pieces,
    function(piece) is.null(piece) || inherits(piece, "try-error"),
    logical(1)
  )
  if (any(failed)) {
    stop(
      sum(failed), " of ", length(pieces), " '", label,
      "' rows failed during binning (first failing row: ",
      which(failed)[1], ")",
      call. = FALSE
    )
  }
  pieces
}

print("A start")
A_tmp <- bin_bed_rows(bed_A_df, "A")
print("A done")
print("B start")
B_tmp <- bin_bed_rows(bed_B_df, "B")
print("B done")

# bind_rows() combines the whole list in one pass and returns an empty tibble
# for an empty list, where do.call(rbind, ...) would return NULL.
A_bind_df <- bind_rows(A_tmp)
B_bind_df <- bind_rows(B_tmp)

merge_bind_df <- bind_rows(A_bind_df, B_bind_df)
if (nrow(merge_bind_df) == 0L) {
  # Nothing was binned: keep the expected columns so later steps still work.
  merge_bind_df <- tibble(
    CHR = character(),
    POS1 = numeric(),
    POS2 = numeric(),
    bin = character()
  )
} else {
  # Rows are de-duplicated while the ori_* columns are still present, so only
  # bins that came from identical source rows collapse.
  merge_bind_df <- merge_bind_df %>%
    unique() %>%
    select(-ori_CHR, -ori_POS1, -ori_POS2, -ori_POS3, -ori_POS4)
}

# Write one headerless, tab-separated file per bin, holding every column of
# merge_bind_df except `bin`, into a directory named after the sample id.
#
# The sample id is the input file name with the
# ".s.indel.bam.depth.v<N>.txt" suffix removed. Each output name replaces
# ".depth.v<N>.txt" with ".processed.F1R2.F2R1.region.v6.<bin>.bed".
# The dots are escaped so that they match literal dots only.
fn <- args[1]
id <- gsub("\\.s\\.indel\\.bam\\.depth\\.v[0-9]*\\.txt", "", basename(fn))
bin_levels <- c("000-049", "050-099", "100-149", "150-199", "200-249", "250-301")

# write.table() does not create directories, so make sure the target exists
# before any file is written.
if (!dir.exists(id)) {
  dir.create(id, recursive = TRUE)
}

for (bin_group in bin_levels) {
  bed_name <- gsub(
    "\\.depth\\.v[0-9]*\\.txt",
    paste0(".processed.F1R2.F2R1.region.v6.", bin_group, ".bed"),
    basename(fn)
  )
  bin_df <- merge_bind_df %>% filter(bin == bin_group)
  bin_df %>%
    select(-bin) %>%
    write.table(
      file.path(id, bed_name),
      quote = FALSE,
      sep = "\t",
      col.names = FALSE,
      row.names = FALSE
    )
}
#}

#id="test"
#write.table(merged_bed,"/home/users/ayh/Projects/27_A3B/03_sequencing/single_clone_sequencing/WGS/15_bot_seq/A3A/05_depth/A3A_1st_C19_48h_3ug/test.processed.F1R2.F2R1.region.v5.bed",quote=F,sep="\t",col.names=F,row.names=F)
#write.table(merged_bed,paste(id,gsub(".depth.v[0-9]*.txt",".processed.F1R2.F2R1.region.v6.bed",basename(args[1])),sep="/"),quote=FALSE,sep="\t",col.names = F, row.names = F)

