library(dplyr)
library(tidyverse)
library(data.table)
# Command-line arguments that follow the script name; args[1] is used below
# as the path of the input table.
args <- commandArgs(trailingOnly = TRUE)

# Strongly penalise scientific notation so numbers are formatted in fixed
# notation when converted to text.
options(scipen = 999)
# Assign a cluster label to every row of two parallel position vectors.
#
# Rows are scanned in the order given. Row i (i > 1) opens a new cluster when,
# compared with row i - 1:
#   * both positions changed, or
#   * pos1 is unchanged and pos2 moved by more than `imd_cutoff`, or
#   * pos2 is unchanged and pos1 moved by more than `imd_cutoff`.
# Otherwise row i joins the cluster of row i - 1. Row 1 always opens cluster 1.
#
# Arguments:
#   pos1, pos2  numeric vectors of equal length, in the order in which
#               consecutive distances should be measured.
#   imd_cutoff  numeric distance threshold; only the first element is used.
#
# Returns a character vector the same length as `pos1` holding the cluster
# number of each row ("1", "2", ...); character(0) for empty input.
assign_cluster2 <- function(pos1, pos2, imd_cutoff) {
  imd_cutoff <- imd_cutoff[1]

  # Nothing to label; without this guard the padding below would produce a
  # length-1 result for a length-0 input.
  if (length(pos1) == 0L) {
    return(character(0))
  }

  # Absolute distance to the previous row; the first row has no predecessor
  # and gets distance 0.
  r1 <- c(0, abs(diff(pos1)))
  r2 <- c(0, abs(diff(pos2)))

  is_start <- (r1 != 0 & r2 != 0) |
    (r1 == 0 & r2 > imd_cutoff) |
    (r2 == 0 & r1 > imd_cutoff)

  # Comparisons involving NA positions are undecidable; such rows do not
  # open a new cluster.
  is_start[is.na(is_start)] <- FALSE
  is_start[1] <- TRUE

  # The running count of cluster starts is the cluster number. cumsum() on a
  # logical vector yields integers, so labels never use scientific notation.
  as.character(cumsum(is_start))
}

# Distance threshold passed to assign_cluster2() when clustering rows.
imd_cutoff <- 5


#df<-read_tsv("/home/users/ayh/Projects/25_transplantation/06_botseq/210204/05_bam_depth/test.txt.depth.v8.txt",col_types=cols("CHR1"=col_character(),"CHR2"=col_character(),"F1R2(n,mq)"=col_character(),
#                                    "F2R1(n,mq)"=col_character(),
#                                    "F1R1(n,mq)"=col_character(),
#                                    "F2R2(n,mq)"=col_character()))

#read_tsv("/home/users/ayh/Projects/27_A3B/03_sequencing/single_clone_sequencing/WGS/15_bot_seq/EM_seq/07_depth/240319/test/10pg_A3B_60s.bismark.s.indel.bam.depth.v8.test.txt")%>%
#  as.data.table()
#df<-read_tsv("/home/users/ayh/Projects/27_A3B/03_sequencing/single_clone_sequencing/WGS/15_bot_seq/EM_seq/07_depth/240319/test/10pg_A3B_60s.bismark.s.indel.bam.depth.v8.test.txt",col_types=cols("CHR1"=col_character(),
#df<-read_tsv("/home/users/ayh/Projects/27_A3B/03_sequencing/single_clone_sequencing/WGS/15_bot_seq/EM_seq/07_depth/240319/10pg_A3B_60s.bismark/10pg_A3B_60s.bismark.s.indel.bam.depth.v8.txt",col_types=cols("CHR1"=col_character(),

#                                                                                                                             "CHR2"=col_character(),
#                                                                                                                             "F1R2(n,mq)"=col_character(),
#                                                                                                                             "F2R1(n,mq)"=col_character(),
#                                                                                                                             "F1R1(n,mq)"=col_character(),
#                                                                                                                             "F2R2(n,mq)"=col_character()))
#id=gsub(".s.indel.bam.depth.v[0-9]*.txt","",basename(args[1]))
#print(args[1])
#print(gsub(".depth.v[0-9]*.txt",".processed.F1R2.F2R1.Only_SS_lib.region.cont.v6.bed",basename(args[1])))
#print(id)
#print(paste(id,gsub(".depth.v[0-9]*.txt",".processed.F1R2.F2R1.Only_SS_lib.region.cont.v6.bed",basename(args[1])),sep="/"))
#df<-read_tsv("/home/users/ayh/Projects/27_A3B/03_sequencing/single_clone_sequencing/WGS/15_bot_seq/EM_seq/07_depth/240319/10pg_A3B_60s.bismark/10pg_A3B_60s.bismark.s.indel.bam.depth.v8.txt",col_types=cols("CHR1"=col_character(),
# Read the tab-separated input table named by the first command-line argument.
# Fail early with a usage message when no path was supplied, instead of
# letting read_tsv() fail on a missing value.
if (length(args) < 1 || is.na(args[1])) {
  stop("Usage: Rscript <script.R> <depth_table.tsv>", call. = FALSE)
}

# Both chromosome columns and the four packed "(n,mq)" columns are read as
# text; all other column types are left for readr to guess.
df <- read_tsv(
  args[1],
  col_types = cols(
    "CHR1" = col_character(),
    "CHR2" = col_character(),
    "F1R2(n,mq)" = col_character(),
    "F2R1(n,mq)" = col_character(),
    "F1R1(n,mq)" = col_character(),
    "F2R2(n,mq)" = col_character()
  )
)
#df<-df%>%mutate(CHR2=c(rep(c(22),11),15,12))
#df
#df
#df<-

# Split each packed "<orientation>(n,mq)" text column at the comma into a
# "<orientation>_n" and a "<orientation>_mq" column (added by reference).
dt <- as.data.table(df)
dt[, c("F1R1_n", "F1R1_mq") := tstrsplit(`F1R1(n,mq)`, ",")]
dt[, c("F1R2_n", "F1R2_mq") := tstrsplit(`F1R2(n,mq)`, ",")]
dt[, c("F2R1_n", "F2R1_mq") := tstrsplit(`F2R1(n,mq)`, ",")]
dt[, c("F2R2_n", "F2R2_mq") := tstrsplit(`F2R2(n,mq)`, ",")]

# Drop the packed source columns by name rather than by column position
# (previously columns 7-10), so the result does not depend on the column
# order of the input file.
packed_cols <- c("F1R1(n,mq)", "F1R2(n,mq)", "F2R1(n,mq)", "F2R2(n,mq)")
split_df <- as_tibble(dt[, !packed_cols, with = FALSE])
#df%>%head(n=10000)
#split_df<-df%>%separate(`F1R1(n,mq)`,c("F1R1_n","F1R1_mq"),",") %>%
#split_df<-df%>%head(n=10000)%>%separate(`F1R1(n,mq)`,c("F1R1_n","F1R1_mq"),",") %>%
#  separate(`F1R2(n,mq)`,c("F1R2_n","F1R2_mq"),",") %>%
#  separate(`F2R1(n,mq)`,c("F2R1_n","F2R1_mq"),",") %>%
#  separate(`F2R2(n,mq)`,c("F2R2_n","F2R2_mq"),",")
#split_df
#split_df[-1]
#is.na(split_df)
#split_df
# Convert the position / count / quality columns to numeric. The first column
# is skipped as before; CHR2 is skipped as well because it is read as text and
# coercing it to numeric turns non-numeric chromosome names (e.g. "X", "Y")
# into NA, losing the mate chromosome.
numeric_cols <- setdiff(names(split_df)[-1], "CHR2")
split_df[numeric_cols] <- lapply(split_df[numeric_cols], as.numeric)

##remove cluster

# Label each row and keep only the "only_SS_lib" rows:
#   type      - "DS" when both F1R2_n and F2R1_n are positive,
#               otherwise "only_SS_lib"
#   read_type - "F1R2" when F1R2_n is positive, otherwise "F2R1"
# The result is converted to a data.table for the by-reference update below.
split_df <- split_df %>%
  mutate(
    type = ifelse(F1R2_n > 0 & F2R1_n > 0, "DS", "only_SS_lib"),
    read_type = ifelse(F1R2_n > 0, "F1R2", "F2R1")
  ) %>%
  filter(type == "only_SS_lib") %>%
  as.data.table()



# Cluster rows separately within every (CHR1, read_type) group, using POS1 and
# POS4 as the two position vectors; cluster_id is added by reference.
split_df[
  ,
  cluster_id := assign_cluster2(POS1, POS4, imd_cutoff),
  by = c("CHR1", "read_type")
]

# info  : key identifying a cluster (chromosomes, cluster number, read type)
# info2 : key built from CHR1, POS1 and POS4 of the row
split_df <- split_df %>%
  mutate(
    info = paste(CHR1, CHR2, cluster_id, read_type, sep = "_"),
    info2 = paste(CHR1, POS1, POS4, sep = "_")
  )

# Number of rows sharing each `info` key.
exclude_info <- split_df %>%
  count(info, name = "n")

# Attach the count to every row. Joining on `info` alone, with an explicit key,
# avoids the row fan-out a join through (info, info2) produces when several
# rows share both keys.
split_df <- left_join(split_df, exclude_info, by = "info")

# Keep only rows whose `info` key occurs exactly once, ordered by CHR1, POS1.
split_excl_df <- split_df %>%
  filter(n == 1) %>%
  as_tibble() %>%
  arrange(CHR1, POS1)

# Keep rows supported by exactly one orientation: that orientation needs an
# `_n` value of at least 3 and an `_mq` value of at least 20, while the other
# orientation's `_n` must be 0.
min_n <- 3
min_mq <- 20
split_3_df <- split_excl_df %>%
  filter(
    (F1R2_n >= min_n & F1R2_mq >= min_mq & F2R1_n == 0) |
      (F2R1_n >= min_n & F2R1_mq >= min_mq & F1R2_n == 0)
  )

# Make CHR2 comparable with CHR1 (both as text) for the POS3_2 / POS4_2
# annotation and the CHR1 == CHR2 filter below (sex-chromosome handling
# originally modified by LWH).
#
# A missing CHR2 on a row whose CHR1 is "X" or "Y" is filled from CHR1. This
# keeps the previous behaviour for rows whose CHR2 was lost when the column was
# coerced to numeric upstream. A CHR2 that is known is never overwritten, so a
# pair whose mate lies on a different chromosome is no longer relabelled as a
# same-chromosome X/Y pair.
uniq_split_3_df <- split_3_df %>%
  mutate(
    CHR2 = as.character(CHR2),
    CHR2 = ifelse(is.na(CHR2) & CHR1 %in% c("X", "Y"), CHR1, CHR2)
  )
# Derive adjusted second-interval coordinates.
#   POS3_2: on the same chromosome, when POS3 does not lie beyond POS2 but
#           POS4 does, the start becomes POS2 + 1; in every other case it
#           stays POS3.
#   POS4_2: on the same chromosome, when POS3 < POS1 and POS4 < POS2, the end
#           becomes POS1 - 1; otherwise it stays POS4.
uniq_split_3_df <- uniq_split_3_df %>%
  mutate(
    POS3_2 = ifelse(
      CHR1 == CHR2,
      ifelse(
        POS3 > POS2,
        POS3,
        ifelse(POS4 <= POS2, POS3, POS2 + 1)
      ),
      POS3
    ),
    POS4_2 = ifelse(
      (CHR1 == CHR2) & (POS3 < POS1) & (POS2 > POS4),
      POS1 - 1,
      POS4
    )
  )

# After the POS3_2 / POS4_2 annotation (note by LWH): CHR1 and CHR2 are
# compared as they are in the filter below; no type conversion happens here.
# The bare expression shows the table when the script is run with auto-printing.
uniq_split_3_df

# Drop rows where either interval's end points differ by exactly 1, then keep
# only rows whose two intervals are on the same chromosome.
uniq_split_3_df <- uniq_split_3_df %>%
  filter(
    abs(POS1 - POS2) != 1,
    abs(POS3 - POS4) != 1,
    CHR1 == CHR2
  )

# Restrict to chromosomes 1-22 and X, and to rows where the first interval
# starts before the second, or the second ends before the first, or both
# intervals share start and end. read_type is replaced by a strand symbol
# ("+" when F1R2_n >= 3, otherwise "-"), and placeholder name / score columns
# are added.
uniq_split_3_sample_df <- uniq_split_3_df %>%
  filter(
    CHR1 %in% c(1:22, "X"),
    (POS1 < POS3 | POS4 < POS2) | (POS1 == POS3 & POS4 == POS2)
  ) %>%
  mutate(
    read_type = ifelse(F1R2_n >= 3, "+", "-"),
    name = ".",
    score = "."
  ) %>%
  arrange(CHR1)
#uniq_split_3_sample_df<-uniq_split_3_df%>%filter(((POS1<POS3 | POS4<POS2)) | (POS1==POS3 & POS4==POS2))%>%mutate(read_type=ifelse(F1R2_n>=3,"+","-"))%>%mutate(name=".",score=".")%>%
#  arrange(CHR1)
#uniq_split_3_sample_df<-uniq_split_3_df%>%filter(CHR1%in%c(1:22,"X"))
# Output prefix: the input file's base name with everything from the first
# dot onwards removed.
fn <- args[1]
id <- gsub("\\..*", "", basename(fn))

print("done")
print(uniq_split_3_sample_df)

# dir: the text after the last underscore of `info`.
uniq_split_3_sample_df <- mutate(
  uniq_split_3_sample_df,
  dir = gsub("^.*_", "", info)
)

# Write the full table, with header, as "<id>.sample_tot.txt" in the current
# working directory.
write.table(
  uniq_split_3_sample_df,
  file = paste0(id, ".sample_tot.txt"),
  sep = "\t",
  quote = FALSE,
  row.names = FALSE
)

# Build and write the interval files for one read type.
#
# Rows of `pairs` whose `dir` equals `read_type_label` contribute two
# intervals each: (CHR1, POS1, POS2) and (CHR2, POS3_2, POS4_2). The stacked
# intervals are sorted by CHR and POS1 and written, without header, to
#   <id>.sample_tot.<read_type_label>.bed           coordinates as they are
#   <id>.sample_tot.bedtools.<read_type_label>.bed  start decreased by 1
#
# Returns the stacked interval table invisibly.
write_read_type_beds <- function(pairs, read_type_label, id) {
  selected <- pairs %>% filter(dir == read_type_label)

  # Columns are renamed with dplyr::select(), so the plyr package, which the
  # script never attaches, is no longer required.
  bed <- rbind(
    selected %>% select(CHR = CHR1, POS1, POS2),
    selected %>% select(CHR = CHR2, POS1 = POS3_2, POS2 = POS4_2)
  ) %>%
    arrange(CHR, POS1)

  write.table(
    bed,
    paste0(id, ".sample_tot.", read_type_label, ".bed"),
    sep = "\t",
    quote = FALSE,
    row.names = FALSE,
    col.names = FALSE
  )

  write.table(
    bed %>% mutate(POS1 = POS1 - 1),
    paste0(id, ".sample_tot.bedtools.", read_type_label, ".bed"),
    sep = "\t",
    quote = FALSE,
    row.names = FALSE,
    col.names = FALSE
  )

  invisible(bed)
}

# F1R2
uniq_split_3_sample_df_F1R2_bed <- write_read_type_beds(
  uniq_split_3_sample_df, "F1R2", id
)

# F2R1
uniq_split_3_sample_df_F2R1_bed <- write_read_type_beds(
  uniq_split_3_sample_df, "F2R1", id
)


