library(dplyr)
library(stringi)
## Input: files named {id}.snp.fin.DS_mut.*
# {id}.snp.exclude.reheader.somatic.merged.dist_out.vcf.v13.info.txt.v4.rasm.vaf_info.v2.txt.readc.read_loc.info.txt.baseq.txt.cont.readc.read_type.out.dist_out.vcf
# {id}.snp.fin.DS_mut.excl_common.vcf
# Collect the per-sample variant tables. The second argument is a regular
# expression matched against the file names in this directory.
vcf_files <- list.files(
  "/home/users/ayh/Projects/27_A3B/07_revision/botseq/test/vcf",
  "vcf",
  full.names = TRUE
)


vcf_files

# NOTE(review): read_tsv() and separate() are not provided by the packages
# attached at the top of this file (dplyr, stringi); presumably readr and
# tidyr are attached elsewhere -- confirm.
#
# For every file: read it, tag each row with the sample id taken from the file
# name, build the substitution type "REF>ALT", keep the columns used below and
# split the "r1,r2" distance column into two columns.
vcf_tmp <- lapply(vcf_files, function(x) {
  read_tsv(x) %>%
    # Sample id = file name up to the literal ".snp" suffix. The dot is escaped
    # so that it cannot match an arbitrary character inside the id.
    mutate(id = sub("\\.snp.*", "", basename(x))) %>%
    mutate(mut_type = paste0(REF, ">", ALT)) %>%
    select(`#CHROM`, POS, REF, ALT, three_bp_cont, sig_cont, mut_type,
           `F1R2_dist(r1,r2)`, id) %>%
    separate(`F1R2_dist(r1,r2)`, c("dist_r1", "dist_r2"), ",")
})


# Stack all samples into one table. separate() yields character columns, so
# the two distances are converted to numeric afterwards.
merge_vcf <- do.call(rbind, vcf_tmp)
merge_vcf$dist_r1 <- as.double(merge_vcf$dist_r1)
merge_vcf$dist_r2 <- as.double(merge_vcf$dist_r2)


## Distance used for binning (adj_dist)
# - dist_r1 present          -> adj_dist = dist_r1 (dist_r2 is ignored, whether
#                               or not it equals dist_r1)
# - dist_r1 missing          -> adj_dist = 151 + dist_r2
#                               (151 is presumably the read length -- confirm)
# - dist_r1, dist_r2 missing -> adj_dist = NA
# This single ifelse() gives the same result as the former nested form
# ifelse(is.na(r1) | is.na(r2), ifelse(is.na(r2), r1, 151 + r2), r1).
merge_vcf <- merge_vcf %>%
  mutate(adj_dist = ifelse(is.na(dist_r1), 151 + dist_r2, dist_r1))

# Inspect the largest distances. This must run after adj_dist has been
# created; sorting by it beforehand fails because the column does not exist.
merge_vcf %>% arrange(desc(adj_dist))

# Density of adj_dist, one panel per substitution type.
merge_vcf %>%
  ggplot(aes(x = adj_dist)) +
  geom_density() +
  facet_wrap(~mut_type)


# Bin edges for adj_dist: steps of 50 up to 250, then one final edge at 303.
breaks <- c(seq(0, 250, by = 50), 303)

# One label per bin, in the same order as the intervals defined by `breaks`.
labels <- c(
  "000-049",
  "050-099",
  "100-149",
  "150-199",
  "200-249",
  "250-301"
)

# Assign every row to its bin. With right = FALSE the intervals are closed on
# the left; include.lowest = TRUE additionally lets a value equal to the last
# edge fall into the final bin. Values outside the edges become NA.
merge_vcf$bin_group <- cut(
  merge_vcf$adj_dist,
  breaks = breaks,
  labels = labels,
  include.lowest = TRUE,
  right = FALSE
)



# Per-sample count tables. The files are read without a header row and get the
# column names id, bin_group, REF and len.
bin_files <- list.files(
  "/home/users/ayh/Projects/27_A3B/07_revision/botseq/test/vcf/depth/",
  "count.txt",
  full.names = TRUE
)
bin_tmp <- lapply(bin_files, function(x) {
  read_tsv(x, col_names = c("id", "bin_group", "REF", "len"))
})

bin_df <- do.call(rbind, bin_tmp)
bin_df %>% select(-id)

# Sanity checks: the sample ids of both tables are printed for comparison.
bin_df$id %>% unique()
merge_vcf$id %>% unique()
bin_df

# Mutation counts per sample / substitution type / reference base / bin.
# The pipeline previously ended in `uniq`, which is not a defined object and
# stopped the script; unique() is the base R function.
merge_vcf %>%
  group_by(id, mut_type, REF, bin_group) %>%
  dplyr::summarise(n = n()) %>%
  unique()

bin_df
merge_vcf$id %>% unique()

# Count mutations per sample / substitution type / reference base / bin and
# add explicit zero rows for every bin in which a given (id, mut_type, REF)
# combination has no mutation.
# The bins are taken from the factor levels of bin_group rather than from
# column positions 4:9 of a widened table, so this also works when some bins
# are never observed or when some rows have no bin (NA).
t_df <- merge_vcf %>%
  group_by(id, mut_type, REF, bin_group) %>%
  dplyr::summarise(n = n()) %>%
  ungroup() %>%
  tidyr::complete(tidyr::nesting(id, mut_type, REF), bin_group,
                  fill = list(n = 0)) %>%
  # Rows without a bin cannot be matched to a count-table bin; drop them
  # after completing so that their (id, mut_type, REF) combination is kept.
  filter(!is.na(bin_group)) %>%
  # bin_df stores bin_group as read from text, so join on character values.
  mutate(bin_group = as.character(bin_group))

# Attach the per-bin `len` from bin_df and compute the raw rate n / len.
rate_df <- left_join(t_df, bin_df, by = c("id", "REF", "bin_group")) %>%
  mutate(mut_rate = n / len)

# Fixed display order of the twelve substitution types.
mut_type_ref <- c("C>A", "C>T", "C>G", "T>A", "T>C", "T>G",
                  "G>T", "G>A", "G>C", "A>T", "A>G", "A>C")
rate_df <- rate_df %>%
  mutate(mut_type = factor(mut_type, levels = c(mut_type_ref)))

# Sample metadata for both experiments. The A3A table gets new_id as a copy of
# id; the A3B table is reduced to the listed columns.
A3A_bot_meta <- mutate(
  read_tsv("/home/users/ayh/Projects/27_A3B/03_sequencing/single_clone_sequencing/WGS/15_bot_seq/A3A/A3A_conditional_bat1/after_id_swap/04_varscan/snv/annotation/DS/metadata.txt"),
  new_id = id
)
A3B_bot_meta <- select(
  read_tsv("/home/users/ayh/Projects/27_A3B/03_sequencing/single_clone_sequencing/WGS/15_bot_seq/A3B/integrate/metadata.txt"),
  id, new_id, time, dose, batch
)

bot_meta <- rbind(A3A_bot_meta, A3B_bot_meta)

# Attach the metadata (joined on the columns the two tables share), then label
# each row with its enzyme and a combined condition string.
rate_df <- left_join(rate_df, bot_meta) %>%
  mutate(
    APOBEC = ifelse(grepl("A3A", new_id), "A3A", "A3B"),
    cond = paste(APOBEC, time, dose, sep = "_")
  )

# Which samples / time points / doses are present.
rate_df %>%
  ungroup() %>%
  select(new_id, time, dose) %>%
  unique()

# Rate with one pseudo-count added to the mutation count.
rate_df <- rate_df %>%
  mutate(cor_rate = (n + 1) / len)

# Number of rows per condition and substitution type.
rate_df %>%
  group_by(cond, mut_type) %>%
  dplyr::summarise(n = n()) %>%
  print(n = 100)

# Distinct condition / substitution type / bin combinations.
rate_df %>%
  select(cond, mut_type, bin_group) %>%
  unique()
# Quick look: pseudo-count rate per bin, one panel per condition and
# substitution type.
rate_df %>%
  ggplot(aes(x = bin_group, y = cor_rate)) +
  geom_point() +
  facet_wrap(~cond + mut_type, ncol = 6)

# Mean rate and mean log10 rate per condition / substitution type / bin.
# log() was previously called without an argument, which is an error; it now
# takes cor_rate with base 10.
rate_df %>%
  group_by(cond, mut_type, bin_group) %>%
  dplyr::summarise(mean.rate = mean(cor_rate),
                   log.mean.rate = mean(log(cor_rate, 10)))



# Per-sample table written as tab-separated text without quoting or row names.
rate_df %>%
  select(new_id, APOBEC, time, dose, cond, bin_group, mut_type, REF, n, len,
         mut_rate) %>%
  write.table("/home/users/ayh/Projects/27_A3B/07_revision/botseq/bot_seq_dist.sample.txt",
              sep = "\t",
              quote = FALSE,
              row.names = FALSE)


# Mean rate (scaled by 1e6) and mean of the log10 scaled rate per
# condition / substitution type / bin.
rate_mean_df <- rate_df %>%
  group_by(cond, mut_type, bin_group) %>%
  dplyr::summarise(mean.rate = mean(cor_rate * 1000000, na.rm = TRUE),
                   log.mean.rate = mean(log(cor_rate * 1000000, 10)))


# Mean, standard deviation, group size and standard error of the scaled rate.
# dplyr::summarise is spelled out, as everywhere else in this script, so the
# grouping is honoured even if another attached package masks summarise().
rate_stat_df <- rate_df %>%
  group_by(cond, mut_type, REF, bin_group) %>%
  dplyr::summarise(mean.rate = mean(cor_rate * 1000000, na.rm = TRUE),
                   sd.rate = sd(cor_rate * 1000000, na.rm = TRUE),
                   n.rate = n(),
                   se = sd.rate / sqrt(n.rate)) %>%
  # 0.434 approximates 1 / ln(10): se expressed on the log10 scale.
  mutate(dse = 0.434 / mean.rate * se)
# Every NA in the table becomes 0 (e.g. sd() of a single observation is NA).
rate_stat_df[is.na(rate_stat_df)] <- 0


# Attach the mean log10 rate. Join only on the grouping columns: both tables
# also contain mean.rate, and without an explicit `by` that floating-point
# column would silently become part of the join key.
rate_fin_df <- left_join(
  rate_stat_df,
  rate_mean_df %>%
    ungroup() %>%
    select(cond, mut_type, bin_group, log.mean.rate),
  by = c("cond", "mut_type", "bin_group")
) %>%
  mutate(upper.log.rate = log.mean.rate + dse,
         lower.log.rate = log.mean.rate - dse) %>%
  ungroup()
rate_fin_df %>% filter(cond != "A3A_0h_100ng/ml")
rate_fin_df$bin_group %>% unique()

# Summary table written as tab-separated text without quoting or row names.
rate_fin_df %>%
  write.table("/home/users/ayh/Projects/27_A3B/07_revision/botseq/bot_seq_dist.summary.txt",
              sep = "\t",
              quote = FALSE,
              row.names = FALSE)
# Mean log10 rate per bin with error bars, one panel per condition and
# substitution type.
p_dist <- rate_fin_df %>%
  ggplot(aes(x = bin_group, y = log.mean.rate)) +
  geom_point() +
  # ymin takes the lower bound and ymax the upper bound; width is a constant,
  # so it is set outside aes().
  geom_errorbar(aes(ymin = lower.log.rate, ymax = upper.log.rate),
                width = 1) +
  theme_bw() +
  xlab("") + ylab("log10(mutation rate per Mb)") +
  facet_wrap(~cond + mut_type, ncol = 6) +
  theme(
    axis.ticks.length = unit(.4, "cm"),
    axis.ticks.y = element_line(size = 2),
    axis.ticks.x = element_blank(),
    legend.position = "none"
  ) +
  # Breaks are positions on the log10 scale; the labels show 10^break, which
  # is positive for negative breaks as well (10^-2 = 0.01, 10^-1 = 0.1).
  scale_y_continuous(name = "log10(mutation rate per Mb)",
                     breaks = c(-2, -1, 0, 1, 2),
                     labels = c("0.01", "0.1", "1", "10", "100")) +
  # Names must equal the bin labels exactly; the second bin is "050-099".
  scale_x_discrete(labels = c("000-049" = "50", "050-099" = "100",
                              "100-149" = "150", "150-199" = "200",
                              "200-249" = "250", "250-301" = "250+"))

# Rows with the highest upper bound and the lowest lower bound.
rate_fin_df %>% arrange(-upper.log.rate)
rate_fin_df %>% arrange(lower.log.rate)
# One PDF per condition: mean log10 rate per bin with error bars, one panel
# per substitution type.
for (condition in rate_fin_df$cond %>% unique()) {
  #condition="A3A_48h_100ng/ml"
  t_df <- rate_fin_df %>% filter(cond == condition)
  p_dist <- t_df %>%
    ggplot(aes(x = bin_group, y = log.mean.rate)) +
    geom_point() +
    # ymin takes the lower bound and ymax the upper bound; width is a
    # constant, so it is set outside aes().
    geom_errorbar(aes(ymin = lower.log.rate, ymax = upper.log.rate),
                  width = 1) +
    theme_bw() +
    xlab("") + ylab("log10(mutation rate per Mb)") +
    facet_wrap(~mut_type, ncol = 6) +
    theme(
      axis.ticks.length = unit(.4, "cm"),
      axis.ticks.y = element_line(size = 2),
      axis.ticks.x = element_line(size = 2),
      legend.position = "none"
    ) +
    # Breaks are positions on the log10 scale; the labels show 10^break,
    # which is positive for the break at -1 as well (10^-1 = 0.1).
    scale_y_continuous(name = "log10(mutation rate per Mb)",
                       breaks = c(-1, 0, 1, 2, 3),
                       labels = c("0.1", "1", "10", "100", "1000"),
                       limits = c(-1.2, 3)) +
    # Names must equal the bin labels exactly; the second bin is "050-099".
    scale_x_discrete(labels = c("000-049" = "50", "050-099" = "100",
                                "100-149" = "150", "150-199" = "200",
                                "200-249" = "250", "250-301" = "250+"))
  #p_dist
  # "/ml" is removed from the condition so it can be used in a file name.
  ggsave(paste0("/home/users/ayh/Projects/27_A3B/07_revision/botseq/mut_rate.dist.", gsub("/ml", "", condition), ".pdf"), p_dist,
         height = 8, width = 10)
}

## C>T vs others ##
# Collapse the substitution types into four groups: "C>T", "G>A", "others"
# (remaining types whose REF is C or T) and "rev_others" (everything else).
group_rate_df <- rate_df %>%
  mutate(group_mut_type = ifelse(mut_type == "C>T", "C>T",
                          ifelse(mut_type == "G>A", "G>A",
                          ifelse(REF == "C" | REF == "T",
                                 "others", "rev_others"))))

# Per sample / condition / bin / group:
#   group_n   = sum of the mutation counts of all types in the group
#   group_len = sum of `len` over the distinct reference bases in the group
#               (unique() keeps one row per REF so each `len` is added once)
# The join key is spelled out; it is the set of columns the two summaries
# share.
group_rate_df <- left_join(
  group_rate_df %>%
    group_by(id, cond, bin_group, group_mut_type) %>%
    dplyr::summarise(group_n = sum(n)) %>%
    ungroup(),
  group_rate_df %>%
    select(id, bin_group, REF, group_mut_type, len, new_id, cond) %>%
    unique() %>%
    group_by(id, bin_group, group_mut_type, new_id, cond) %>%
    dplyr::summarise(group_len = sum(len)) %>%
    ungroup(),
  by = c("id", "cond", "bin_group", "group_mut_type")
) %>%
  mutate(mut_rate = group_n / group_len) %>%
  # Rate with one pseudo-count added to the grouped mutation count.
  mutate(cor_rate = (group_n + 1) / group_len)

# Inspect the grouped table. (This line previously printed
# group_rate_stat_df, which is only created further down, so a fresh run
# stopped here.)
group_rate_df




# Mean rate (scaled by 1e6) and mean of the log10 scaled rate per
# condition / mutation group / bin.
group_rate_mean_df <- group_rate_df %>%
  group_by(cond, group_mut_type, bin_group) %>%
  dplyr::summarise(mean.rate = mean(cor_rate * 1000000, na.rm = TRUE),
                   log.mean.rate = mean(log(cor_rate * 1000000, 10)))



# Mean, standard deviation, group size and standard error of the scaled rate.
# dplyr::summarise is spelled out, as everywhere else in this script, so the
# grouping is honoured even if another attached package masks summarise().
group_rate_stat_df <- group_rate_df %>%
  group_by(cond, group_mut_type, bin_group) %>%
  dplyr::summarise(mean.rate = mean(cor_rate * 1000000, na.rm = TRUE),
                   sd.rate = sd(cor_rate * 1000000, na.rm = TRUE),
                   n.rate = n(),
                   se = sd.rate / sqrt(n.rate)) %>%
  # 0.434 approximates 1 / ln(10): se expressed on the log10 scale.
  mutate(dse = 0.434 / mean.rate * se)
# Every NA in the table becomes 0 (e.g. sd() of a single observation is NA).
group_rate_stat_df[is.na(group_rate_stat_df)] <- 0


# Attach the mean log10 rate. Join only on the grouping columns: both tables
# also contain mean.rate, and without an explicit `by` that floating-point
# column would silently become part of the join key.
group_rate_fin_df <- left_join(
  group_rate_stat_df,
  group_rate_mean_df %>%
    ungroup() %>%
    select(cond, group_mut_type, bin_group, log.mean.rate),
  by = c("cond", "group_mut_type", "bin_group")
) %>%
  mutate(upper.log.rate = log.mean.rate + dse,
         lower.log.rate = log.mean.rate - dse) %>%
  ungroup()

# Fixed panel order for the mutation groups.
group_rate_fin_df$group_mut_type <- factor(
  group_rate_fin_df$group_mut_type,
  levels = c("C>T", "others", "G>A", "rev_others")
)

# One PDF per condition: mean log10 rate per bin with error bars, one panel
# per mutation group.
for (condition in group_rate_fin_df$cond %>% unique()) {
  #condition="A3A_48h_100ng/ml"
  t_df <- group_rate_fin_df %>% filter(cond == condition)
  p_dist <- t_df %>%
    ggplot(aes(x = bin_group, y = log.mean.rate)) +
    geom_point() +
    # ymin takes the lower bound and ymax the upper bound; width is a
    # constant, so it is set outside aes().
    geom_errorbar(aes(ymin = lower.log.rate, ymax = upper.log.rate),
                  width = 1) +
    theme_bw() +
    xlab("") + ylab("log10(mutation rate per Mb)") +
    facet_wrap(~group_mut_type, ncol = 2) +
    theme(
      axis.ticks.length = unit(.4, "cm"),
      axis.ticks.y = element_line(size = 2),
      axis.ticks.x = element_line(size = 2),
      legend.position = "none"
    ) +
    # Breaks are positions on the log10 scale; the labels show 10^break,
    # which is positive for the break at -1 as well (10^-1 = 0.1).
    scale_y_continuous(name = "log10(mutation rate per Mb)",
                       breaks = c(-1, 0, 1, 2, 3),
                       labels = c("0.1", "1", "10", "100", "1000"),
                       limits = c(-1.2, 3)) +
    # Names must equal the bin labels exactly; the second bin is "050-099".
    scale_x_discrete(labels = c("000-049" = "50", "050-099" = "100",
                                "100-149" = "150", "150-199" = "200",
                                "200-249" = "250", "250-301" = "250+"))
  #p_dist
  # "/ml" is removed from the condition so it can be used in a file name.
  ggsave(paste0("/home/users/ayh/Projects/27_A3B/07_revision/botseq/group_mut_rate.dist.", gsub("/ml", "", condition), ".pdf"), p_dist,
         height = 8, width = 10)
}
# Horizontal offset between conditions that share a bin.
dodge_width <- 0.5
group_rate_fin_df$cond %>% unique()
# Turn cond into a factor whose levels follow the order of first appearance.
group_rate_fin_df$cond <- factor(
  group_rate_fin_df$cond,
  levels = group_rate_fin_df$cond %>% unique()
)


# Build the per-enzyme comparison plot: mean log10 rate per bin, one coloured
# and connected series per condition, one panel per mutation group.
#   rate_tbl       summary table with the columns used below
#   cond_pattern   regular expression selecting the conditions to show
#   errorbar_width width of the error-bar caps
#   dodge          dodge width shared by points, lines and error bars
make_group_rate_plot <- function(rate_tbl, cond_pattern, errorbar_width,
                                 dodge) {
  shown <- filter(rate_tbl, grepl(cond_pattern, cond))

  ggplot(
    shown,
    aes(x = bin_group, y = log.mean.rate, color = cond, group = cond)
  ) +
    geom_point(position = position_dodge(width = dodge)) +
    geom_line(position = position_dodge(width = dodge)) +
    geom_errorbar(
      aes(ymin = lower.log.rate, ymax = upper.log.rate),
      width = errorbar_width,
      position = position_dodge(width = dodge)
    ) +
    theme_bw() +
    labs(x = "Bin group", y = "log10(mean rate)", color = "Condition") +
    theme(
      axis.text.x = element_text(angle = 45, hjust = 1),
      panel.grid.minor = element_blank(),
      strip.text = element_text(size = 15)
    ) +
    scale_y_continuous(
      breaks = c(-1, 0, 1, 2, 3),
      labels = c("0.1", "1", "10", "100", "1000"),
      # minor tick positions: 1..9 times each power of ten, on the log scale
      minor_breaks = log10(1:9 %o% 10^(-1:3)),
      limits = c(-1, 3)
    ) +
    annotation_logticks(sides = "l") +
    facet_wrap(~group_mut_type)
}

# A3A conditions (error-bar caps 0.5 wide).
p_A3A_group <- make_group_rate_plot(group_rate_fin_df, "A3A", 0.5,
                                    dodge_width)
p_A3A_group

# A3B conditions (error-bar caps 0.7 wide).
p_A3B_group <- make_group_rate_plot(group_rate_fin_df, "A3B", 0.7,
                                    dodge_width)


p_A3B_group
p_A3A_group
ggsave("/home/users/ayh/Projects/27_A3B/07_revision/botseq/group_mut_rate_dist.A3A.pdf",
       p_A3A_group,
       width = 8, height = 8)
ggsave("/home/users/ayh/Projects/27_A3B/07_revision/botseq/group_mut_rate_dist.A3B.pdf",
       p_A3B_group,
       width = 8, height = 8)

# All conditions in one figure: mean log10 rate per bin, coloured by
# condition, one panel per mutation group.
group_rate_fin_df %>%
  # group = cond is required for geom_line(): on a discrete x axis the default
  # grouping puts every point in its own group, so no line is drawn.
  ggplot(aes(x = bin_group, y = log.mean.rate, col = cond, group = cond)) +
  # Points, error bars and lines use the same dodge so they stay aligned.
  geom_point(position = position_dodge(width = 0.5)) +
  # ymin takes the lower bound and ymax the upper bound; width is a constant,
  # so it is set outside aes().
  geom_errorbar(aes(ymin = lower.log.rate, ymax = upper.log.rate),
                width = 1,
                position = position_dodge(width = 0.5)) +
  geom_line(position = position_dodge(width = 0.5)) +
  theme_bw() +
  xlab("") + ylab("log10(mutation rate per Mb)") +
  facet_wrap(~group_mut_type, ncol = 2) +
  theme(
    axis.ticks.length = unit(.4, "cm"),
    axis.ticks.y = element_line(size = 2),
    axis.ticks.x = element_line(size = 2),
    legend.position = "none"
  ) +
  # Breaks are positions on the log10 scale; the labels show 10^break, which
  # is positive for the break at -1 as well (10^-1 = 0.1).
  scale_y_continuous(name = "log10(mutation rate per Mb)",
                     breaks = c(-1, 0, 1, 2, 3),
                     labels = c("0.1", "1", "10", "100", "1000"),
                     limits = c(-1.2, 3)) +
  # Names must equal the bin labels exactly; the second bin is "050-099".
  scale_x_discrete(labels = c("000-049" = "50", "050-099" = "100",
                              "100-149" = "150", "150-199" = "200",
                              "200-249" = "250", "250-301" = "250+"))
