## 2022.08.17 last updated
## modified for mm10 by LWH
## 2024.03.25 last updated
## modified to remove misalignments (clusters of records that differ by up to ~5 bp at one end) by ayh

library(data.table)
#' Assign cluster labels to consecutive coordinate pairs
#'
#' Rows are compared with the row immediately before them, in the order
#' given. A row opens a new cluster when, relative to the previous row,
#'   * both coordinates changed, or
#'   * one coordinate is unchanged and the other moved by more than
#'     `imd_cutoff`.
#' A row that is identical to the previous one, or that differs from it by at
#' most `imd_cutoff` in exactly one coordinate, stays in the current cluster.
#' Comparisons that evaluate to `NA` never open a new cluster.
#'
#' @param pos1 Numeric vector with the first coordinate of every row.
#' @param pos2 Numeric vector with the second coordinate; same length as
#'   `pos1`.
#' @param imd_cutoff Largest one-sided shift that still counts as the same
#'   cluster. Only the first element is used.
#' @return Character vector of the same length as `pos1` holding the cluster
#'   labels "1", "2", ... in order of appearance; `character(0)` for empty
#'   input.
assign_cluster2 <- function(pos1, pos2, imd_cutoff) {
  stopifnot(length(pos1) == length(pos2))
  if (length(pos1) == 0L) {
    return(character(0))
  }
  imd_cutoff <- imd_cutoff[1]

  # Absolute step from the previous row; the first row has no predecessor.
  step1 <- c(0, abs(diff(pos1)))
  step2 <- c(0, abs(diff(pos2)))

  opens_cluster <- (step1 != 0 & step2 != 0) |
    (step1 == 0 & step2 > imd_cutoff) |
    (step2 == 0 & step1 > imd_cutoff)
  # Undecidable comparisons (NA coordinates) keep the row in the current
  # cluster, and the first row always opens cluster 1.
  opens_cluster[is.na(opens_cluster)] <- FALSE
  opens_cluster[1] <- TRUE

  # The running count of cluster openings is the cluster number of each row.
  as.character(cumsum(opens_cluster))
}

# Largest one-sided coordinate shift passed to assign_cluster2() as its
# `imd_cutoff` argument.
imd_cutoff <- 5


library(dplyr)
library(tidyverse)

# Positional arguments: args[1] is the input table read with read_tsv(),
# args[2] is the path the resulting BED table is written to.
args <- commandArgs(trailingOnly = TRUE)
if (length(args) < 2) {
  # Fail before any work is done instead of at the final write.table() call.
  stop(
    "Two arguments are required: <input depth table> <output bed file>",
    call. = FALSE
  )
}
# Strongly prefer fixed over scientific notation when numbers are formatted.
options(scipen = 999)
#df<-read_tsv("/home/users/ayh/Projects/25_transplantation/06_botseq/210204/05_bam_depth/test.txt.depth.v8.txt",col_types=cols("CHR1"=col_character(),"CHR2"=col_character(),"F1R2(n,mq)"=col_character(),
#                                    "F2R1(n,mq)"=col_character(),
#                                    "F1R1(n,mq)"=col_character(),
#                                    "F2R2(n,mq)"=col_character()))

#read_tsv("/home/users/ayh/Projects/27_A3B/03_sequencing/single_clone_sequencing/WGS/15_bot_seq/EM_seq/07_depth/240319/test/10pg_A3B_60s.bismark.s.indel.bam.depth.v8.test.txt")%>%
#  as.data.table()
#df<-read_tsv("/home/users/ayh/Projects/27_A3B/03_sequencing/single_clone_sequencing/WGS/15_bot_seq/EM_seq/07_depth/240319/test/10pg_A3B_60s.bismark.s.indel.bam.depth.v8.test.txt",col_types=cols("CHR1"=col_character(),
#df<-read_tsv("/home/users/ayh/Projects/27_A3B/03_sequencing/single_clone_sequencing/WGS/15_bot_seq/EM_seq/07_depth/240319/10pg_A3B_60s.bismark/10pg_A3B_60s.bismark.s.indel.bam.depth.v8.txt",col_types=cols("CHR1"=col_character(),

#                                                                                                                             "CHR2"=col_character(),
#                                                                                                                             "F1R2(n,mq)"=col_character(),
#                                                                                                                             "F2R1(n,mq)"=col_character(),
#                                                                                                                             "F1R1(n,mq)"=col_character(),
#                                                                                                                             "F2R2(n,mq)"=col_character()))
#id=gsub(".s.indel.bam.depth.v[0-9]*.txt","",basename(args[1]))
#print(args[1])
#print(gsub(".depth.v[0-9]*.txt",".processed.F1R2.F2R1.Only_SS_lib.region.cont.v6.bed",basename(args[1])))
#print(id)
#print(paste(id,gsub(".depth.v[0-9]*.txt",".processed.F1R2.F2R1.Only_SS_lib.region.cont.v6.bed",basename(args[1])),sep="/"))
# Read the input table. The chromosome columns and the four "(n,mq)" columns
# are read as text; the remaining column types are guessed by read_tsv().
df <- read_tsv(
  args[1],
  col_types = cols(
    CHR1 = col_character(),
    CHR2 = col_character(),
    `F1R2(n,mq)` = col_character(),
    `F2R1(n,mq)` = col_character(),
    `F1R1(n,mq)` = col_character(),
    `F2R2(n,mq)` = col_character()
  )
)

# Split every comma-joined "(n,mq)" column into its "_n" and "_mq" parts
# (presumably read count and mapping quality -- confirm against the producer
# of the input file).
split_df <- separate(df, `F1R1(n,mq)`, into = c("F1R1_n", "F1R1_mq"), sep = ",")
split_df <- separate(split_df, `F1R2(n,mq)`, into = c("F1R2_n", "F1R2_mq"), sep = ",")
split_df <- separate(split_df, `F2R1(n,mq)`, into = c("F2R1_n", "F2R1_mq"), sep = ",")
split_df <- separate(split_df, `F2R2(n,mq)`, into = c("F2R2_n", "F2R2_mq"), sep = ",")

# Coerce every column except the first one to numeric. Non-numeric text in
# those columns becomes NA.
# NOTE(review): assumes the first column is CHR1, so CHR2 is also coerced
# here -- TODO confirm against the input layout.
numeric_cols <- names(split_df)[-1]
split_df[numeric_cols] <- lapply(split_df[numeric_cols], as.numeric)

##remove cluster

# type: "DS" when both F1R2_n and F2R1_n are positive, "only_SS_lib" otherwise.
# read_type: "F1R2" when F1R2_n is positive, "F2R1" otherwise.
split_df <- split_df %>%
  mutate(
    type = ifelse(F1R2_n > 0 & F2R1_n > 0, "DS", "only_SS_lib"),
    read_type = ifelse(F1R2_n > 0, "F1R2", "F2R1")
  )

# Keep the "only_SS_lib" records and convert to a data.table so that the
# cluster id can be added by reference per group.
split_df <- as.data.table(filter(split_df, type == "only_SS_lib"))

# Cluster on (POS1, POS4) separately for every CHR1 / read_type combination.
# assign_cluster2() compares each row with the one before it, so this
# presumably relies on the input already being position-sorted within a
# chromosome -- TODO confirm.
split_df[
  ,
  cluster_id := assign_cluster2(POS1, POS4, imd_cutoff),
  by = .(CHR1, read_type)
]

# info  : cluster key (CHR1, CHR2, cluster id and read type).
# info2 : coordinate key (CHR1, POS1, POS4).
split_df <- split_df %>%
  mutate(
    info = paste(CHR1, CHR2, cluster_id, read_type, sep = "_"),
    info2 = paste(CHR1, POS1, POS4, sep = "_")
  )

# Number of records per cluster key, one row per `info`.
exclude_info <- split_df %>%
  dplyr::count(info, name = "n")

# Attach the cluster size to every record. Joining on the cluster key alone
# is many-to-one, so records are never duplicated, even when several records
# of one cluster share the same coordinate key.
split_df <- left_join(split_df, exclude_info, by = "info")

# Top-level expression: prints the annotated table when run with Rscript.
split_df

# Keep only records that are alone in their cluster, ordered by CHR1 and POS1.
split_excl_df <- split_df %>%
  filter(n == 1) %>%
  as_tibble() %>%
  arrange(CHR1, POS1)
#split_df
#split_df
#sorted_split_df%>%head(10)
### within each block, drop records supported by fewer than 3 reads (the filter below keeps n >= 3 with mq >= 20)
#split_3_df<-split_df%>%filter(F1R2_n>=3 & F1R2_mq>=20 & F2R1_n>=3 &F2R1_mq>=20)
# Keep records supported by exactly one orientation: at least 3 reads with
# an "_mq" value of at least 20 on one side and no reads on the other side.
split_3_df <- split_excl_df %>%
  filter(
    (F1R2_n >= 3 & F1R2_mq >= 20 & F2R1_n == 0) |
      (F2R1_n >= 3 & F2R1_mq >= 20 & F1R2_n == 0)
  )

# CHR2 is set to 23 / 24 whenever CHR1 is "X" / "Y"; other rows keep their
# CHR2 value. `dist` is the absolute POS1-POS4 distance, and records with a
# distance of 10000 or more are dropped.
# NOTE(review): CHR2 is overwritten from CHR1 alone, so an X/Y record whose
# mate lies on another chromosome would be relabelled -- confirm this is
# intended.
uniq_split_3_df <- split_3_df %>%
  mutate(
    CHR2 = ifelse(CHR1 == "X", 23, ifelse(CHR1 == "Y", 24, CHR2)),
    dist = abs(POS1 - POS4)
  ) %>%
  filter(dist < 10000)
#tmp%>%filter(dist>10000)
#ggplot(tmp,aes(x=dist))+
#  geom_histogram()+
#  scale_y_log10()+
#  scale_x_log10()
#uniq_split_4_df
#uniq_split_4_df%>%filter(POS1>POS2)

#uniq_split_3_df<-uniq_split_3_df%>%filter(CHR1==CHR2)
#uniq_split_3_df<-uniq_split_3_df%>%mutate(dist=POS2-POS1)%>%filter(dist>0)
#uniq_split_3_df%>%mutate(cover_bp=ifelse(POS3>POS2,(POS2-PO1+1+POS4-POS3+1),POS1-POS2+302))
#uniq_split_3_df<-uniq_split_3_df%>%mutate(POS3_2=ifelse(CHR1==CHR2,
#                                                        ifelse(POS3>POS2,
#                                                               POS3,
#                                                               ifelse(POS4!=POS2,
#                                                                      POS2+1,
#                                                                      POS3)),
#                                                        POS3))

# For the POS3_2 / POS4_2 annotation, CHR1 and CHR2 must use the same labels for the sex chromosomes - modified by LWH

#uniq_split_3_df$CHR2
# Express CHR2 as text with "X" / "Y" in place of 23 / 24 so that it can be
# compared with CHR1 below.
uniq_split_3_df <- uniq_split_3_df %>%
  mutate(CHR2 = gsub("24", "Y", gsub("23", "X", as.character(CHR2))))

# Top-level expression: prints the table when run with Rscript.
uniq_split_3_df

# POS3_2 / POS4_2: adjusted copies of POS3 / POS4.
#  - POS3_2 becomes POS2 + 1 when both ends are on the same chromosome,
#    POS3 <= POS2 and POS4 > POS2; otherwise it stays POS3.
#  - POS4_2 becomes POS1 - 1 when both ends are on the same chromosome,
#    POS3 < POS1 and POS2 > POS4; otherwise it stays POS4.
# The nested ifelse() form is kept because it also defines what happens when
# a comparison is NA.
uniq_split_3_df <- uniq_split_3_df %>%
  mutate(
    POS3_2 = ifelse(
      CHR1 == CHR2,
      ifelse(
        POS3 > POS2,
        POS3,
        ifelse(
          POS4 != POS2,
          ifelse(POS4 < POS2, POS3, POS2 + 1),
          POS3
        )
      ),
      POS3
    ),
    POS4_2 = ifelse(
      (CHR1 == CHR2) & (POS3 < POS1) & (POS2 > POS4),
      POS1 - 1,
      POS4
    )
  )

# Back to the numeric encoding (X -> 23, Y -> 24), which the numeric CHR2
# filter further down needs - modified by LWH
uniq_split_3_df <- uniq_split_3_df %>%
  mutate(CHR2 = as.double(gsub("Y", "24", gsub("X", "23", CHR2))))

#uniq_split_3_df$CHR2<-gsub("23","X",uniq_split_3_df$CHR2) # modified, but aren't chromosomes already written correctly? checked out and they weren't - LWH ==> moved to below (CHR2 need to be in double format to be filtered!!!)

#uniq_split_3_df$CHR2<-gsub("24","Y",uniq_split_3_df$CHR2) # modified, but aren't chromosomes already written correctly? checked out and they weren't - LWH ==> moved to below (CHR2 need to be in double format to be filtered!!!)

#uniq_split_3_df
#uniq_split_3_df<-uniq_split_3_df%>%mutate(new_CHR2= (gsub("23","X",CHR2) %>% gsub("24","Y")))
#uniq_split_4_df
#uniq_split_4_df%>%sum(dist,na.rm=TRUE)
#uniq_split_4_df[is.numeric(uniq_split_4_df$dist)]
#sum(uniq_split_4_df$dist)

#sorted_split_4_df
#head(sort_split_4_df,10)
### overlap

#sorted_df$`F1R1(n,mq)`%>%str_split(",")
#uniq_split_3_df%>%filter(POS1==59573112)
# Drop records in which POS1/POS2 or POS3/POS4 are exactly 1 apart.
uniq_split_3_df <- uniq_split_3_df %>%
  filter(abs(POS1 - POS2) != 1 & abs(POS3 - POS4) != 1)

# Column names shared by both tables built below.
bed_columns <- c("CHR", "POS1", "POS2", "name", "score", "read_type")

# A: the CHR1:POS1-POS2 interval of records where POS1 < POS3 or POS4 < POS2,
# or where both intervals coincide. read_type is recoded to a strand sign
# ("+" when F1R2_n >= 3, "-" otherwise); name and score are "." placeholders.
A <- uniq_split_3_df %>%
  filter(POS1 < POS3 | POS4 < POS2 | (POS1 == POS3 & POS4 == POS2)) %>%
  mutate(
    read_type = ifelse(F1R2_n >= 3, "+", "-"),
    name = ".",
    score = "."
  ) %>%
  select(CHR1, POS1, POS2, name, score, read_type)

# B: the CHR2:POS3_2-POS4_2 interval of records where POS1 > POS3 or
# POS4 > POS2, or where both intervals coincide, restricted to CHR2 <= 24
# (rows with NA in CHR2 are dropped by this filter).
B <- uniq_split_3_df %>%
  filter(
    POS1 > POS3 | POS4 > POS2 | (POS1 == POS3 & POS4 == POS2),
    CHR2 <= 24
  ) %>%
  mutate(
    read_type = ifelse(F1R2_n >= 3, "+", "-"),
    name = ".",
    score = "."
  ) %>%
  select(CHR2, POS3_2, POS4_2, name, score, read_type)

# After the numeric filter, turn 23 / 24 back into the letters X / Y
# - modified by LWH
B$CHR2 <- gsub("24", "Y", gsub("23", "X", B$CHR2))

colnames(A) <- bed_columns
colnames(B) <- bed_columns



# Stack both tables, keep rows with positive coordinates, sort by CHR and
# POS1 (CHR is text, so the order is alphabetical) and drop duplicate rows.
merged_bed <- rbind(A, B) %>%
  filter(POS1 > 0 & POS2 > 0) %>%
  arrange(CHR, POS1) %>%
  unique()

# Top-level expression: prints the sorted table when run with Rscript.
# NOTE(review): print(merged_bed) below shows the table a second time, so
# one of the two is probably redundant.
merged_bed %>%
  arrange(CHR, POS1)

print(merged_bed)

# NOTE(review): `id` is not read by any later statement in this script.
id <- gsub(".s.indel.bam.depth.v[0-9]*.txt", "", basename(args[1]))

# Headerless, tab-separated output to the path given as second argument.
write.table(
  merged_bed,
  file = args[2],
  quote = FALSE,
  sep = "\t",
  row.names = FALSE,
  col.names = FALSE
)
