library(dplyr)
library(tidyverse)
library(ggplot2)
# Collect the full paths of the simulated-distance tables (file names matching
# ".dist.maf") from the simulation output directory.
sim_files <- list.files(
  path = "/home/users/ayh/Projects/27_A3B/03_sequencing/single_clone_sequencing/WGS/25_simulation/jolim_rerun/v2/dist/",
  pattern = ".dist.maf",
  full.names = TRUE
)
# Print the paths that contain "A3A" as a quick sanity check.
grep("A3A", sim_files, value = TRUE)

# Read every simulation table, tag its rows with the run label parsed from the
# file name, keep the columns used downstream and strip the trailing
# "_<digits>" suffix from the sample id.
sim_tmp <- lapply(sim_files, function(path) {
  # Run label: what follows the last "." once ".dist.maf" has been removed.
  run_label <- gsub("^.*\\.", "", gsub(".dist.maf", "", basename(path)))
  maf <- read_tsv(path, show_col_types = FALSE)
  maf <- mutate(maf, run = run_label)
  maf <- select(maf, `#CHROM`, POS1, REF, ALT, id, cont, dist, run)
  mutate(maf, id = gsub("_[0-9]*$", "", id))
})

# Stack all runs. rbind() is kept on purpose: `dist` is still unparsed here
# (it can hold the literal "c"), so its type may differ between files.
sim_df <- do.call(rbind, sim_tmp)
# Drop the rows whose distance is "c", then convert the distance to numeric.
sim_df <- filter(sim_df, dist != "c")
sim_df <- mutate(sim_df, dist = as.numeric(dist))
# Edges of the nine distance bins. With right = FALSE every bin is left-closed:
# [-Inf, 1e3), [1e3, 3e3), ..., [3e6, Inf).
breakpoints <- c(-Inf, 1e3, 3e3, 1e4, 3e4, 1e5, 3e5, 1e6, 3e6, Inf)

# Label each simulated mutation with its distance bin (factor levels 1..9).
sim_df[["group"]] <- cut(
  x = sim_df[["dist"]],
  breaks = breakpoints,
  labels = 1:9,
  right = FALSE
)



# Observed distance tables (file names matching ".txt") live in the
# "original" sub-directory.
real_files <- list.files(
  path = "/home/users/ayh/Projects/27_A3B/03_sequencing/single_clone_sequencing/WGS/25_simulation/jolim_rerun/v2/dist/original",
  pattern = ".txt",
  full.names = TRUE
)

# The observed files have no header row, so column names are supplied here.
# Rows whose distance is "c" or missing are dropped before the distance is
# converted to numeric; the sample id loses its trailing "_<digits>" suffix.
real_tmp <- lapply(real_files, function(path) {
  obs <- read_tsv(
    path,
    col_names = c("tmp", "id", "#CHROM", "POS", "REF", "ALT", "dist"),
    show_col_types = FALSE
  )
  obs <- filter(obs, dist != "c", !is.na(dist))
  obs <- mutate(obs, dist = as.numeric(dist))
  mutate(obs, id = gsub("_[0-9]*$", "", id))
})

# Stack all samples and drop any distance that is NA after the conversion.
real_df <- do.call(rbind, real_tmp)
real_df <- filter(real_df, !is.na(dist))

# Same distance bins as used for the simulated data.
real_df[["group"]] <- cut(
  x = real_df[["dist"]],
  breaks = breakpoints,
  labels = 1:9,
  right = FALSE
)


# Count simulated mutations per distance bin, sample and run, in wide form
# (one column per run) so that bin/run combinations with no mutation show up
# as NA and can be set to zero.
sim_tmp_df <- sim_df %>%
  dplyr::group_by(group, id, run) %>%
  dplyr::summarise(n = n(), .groups = "drop") %>%
  spread(run, n)

sim_tmp_df[is.na(sim_tmp_df)] <- 0

# Back to long form. Every column except the two keys is a run column, so the
# selection no longer assumes that there are exactly 100 runs (was `3:102`).
sim_tmp_df <- sim_tmp_df %>% gather(run, count, -group, -id)

# Per-run frequency of each bin: bin count divided by the total count of that
# sample in that run.
sim_freq_df <- sim_tmp_df %>%
  left_join(
    sim_tmp_df %>%
      filter(!is.na(group)) %>%
      group_by(id, run) %>%
      dplyr::summarise(tot_count = sum(count), .groups = "drop"),
    by = c("id", "run")
  ) %>%
  mutate(freq = count / tot_count)



### Normality check (Shapiro-Wilk) of the simulated frequencies.
### Original author's note: all distributions were normal.
# NOTE(review): each test below uses the frequencies of one sample in one run,
# i.e. it looks across distance bins. The z-test further down compares the
# observed frequency against the spread across runs within one (id, bin) --
# confirm that this is the distribution whose normality was meant to be checked.
shapiro_tmp <- lapply(unique(sim_freq_df$id), function(x) {
  # Run labels are taken from the data instead of being fixed to "1".."100".
  per_run <- lapply(unique(as.character(sim_freq_df$run)), function(y) {
    df <- sim_freq_df %>% filter(id == x & run == y)
    freq <- df$freq[!is.na(df$freq)]
    # shapiro.test() stops on fewer than 3 values or on constant input;
    # report NA for those cases instead of aborting the whole loop.
    pval <- if (length(freq) < 3 || diff(range(freq)) < 1e-10) {
      NA_real_
    } else {
      shapiro.test(freq)$p.value
    }
    tibble(id = x, run = y, pval = pval)
  })
  bind_rows(per_run)
})

shapiro_df <- bind_rows(shapiro_tmp)

# Show which distance bins occur in the simulated frequencies.
unique(sim_freq_df$group)

# Remove any grouping, then replace the bin column by as.numeric() of itself
# (for a factor this is the integer level code).
sim_freq_df <- ungroup(sim_freq_df)
sim_freq_df <- mutate(sim_freq_df, group = as.numeric(group))


# For every sample and distance bin, summarise the simulated frequencies over
# the runs: mean, half-width 1.96 * sd, and the resulting upper / lower bounds.
# The explicit loop over bins 1..9 is kept so that a bin without simulated
# rows still yields a row (with NaN / NA statistics).
sim_ci_tmp <- lapply(unique(sim_freq_df$id), function(x) {
  per_bin <- lapply(1:9, function(y) {
    df <- sim_freq_df %>% filter(id == x & group == y)
    # tibble() replaces data.frame() %>% as.tibble(); as.tibble() is deprecated.
    tibble(id = x, group = y, mean = mean(df$freq), ci = 1.96 * sd(df$freq)) %>%
      mutate(upper = mean + ci, lower = mean - ci)
  })
  bind_rows(per_bin)
})

sim_ci_df <- bind_rows(sim_ci_tmp)



# Observed number (n) and frequency (freq) of mutations per sample and
# distance bin. The table is computed once: the previous version ran the same
# count four times and recovered the zero bins through spread()/gather() with
# fixed column positions (3:11), which breaks when a bin is absent from the
# observed data.
obs_df <- real_df %>%
  dplyr::count(id, group, name = "n") %>%
  # complete() adds the missing id/bin combinations; `group` is a factor, so
  # all of its levels are used, including bins that were never observed.
  tidyr::complete(id, group, fill = list(n = 0L)) %>%
  dplyr::group_by(id) %>%
  dplyr::mutate(tot_count = sum(n)) %>%
  dplyr::ungroup() %>%
  dplyr::mutate(
    freq = n / tot_count,
    # The bin is stored as character, as it was after gather().
    group = as.character(group),
    type = "observed"
  ) %>%
  dplyr::arrange(group, id) %>%
  dplyr::select(id, tot_count, group, freq, type, n)

# The two intermediate tables are still provided under their original names.
obs_df1 <- obs_df %>% dplyr::select(id, tot_count, group, freq, type)
obs_df2 <- obs_df %>% dplyr::select(id, tot_count, group, n, type)


# One table with the expected (simulated) and observed frequency per sample
# and bin. Observed rows carry no interval: ci = 0 and upper = lower = mean.
dist_sum_df <- rbind(
  mutate(sim_ci_df, type = "expected"),
  obs_df %>%
    select(id, group, mean = freq) %>%
    mutate(ci = 0, upper = mean, lower = mean, type = "observed")
)

# Keep the A3A samples only, dropping controls ("Ctrl" / "ctrl") and two
# explicitly excluded ids.
dist_sum_A3A_df <- dist_sum_df %>%
  ungroup() %>%
  filter(
    grepl("A3A", id),
    !grepl("Ctrl", id),
    !grepl("ctrl", id),
    !id %in% c("A3A_1st_C3", "A3A_C3_TP53KO_C3")
  )
  

## p value
# Two-sided z-test per sample and distance bin: the observed frequency is
# compared with the mean and sd of the simulated frequencies over the runs.
sim_p_tmp <- lapply(unique(sim_freq_df$id), function(x) {
  print(x)  # progress output, one line per sample
  per_bin <- lapply(1:9, function(y) {
    # Local names no longer shadow the global `sim_tmp_df`.
    sim_freq <- (sim_freq_df %>% filter(id == x & group == y))$freq
    std_freq <- (obs_df %>% filter(id == x, group == y))$freq
    if (length(std_freq) == 0) {
      # No observed value for this sample / bin: report NA rather than
      # failing on a zero-length column.
      return(tibble(id = x, group = y, pval = NA_real_))
    }
    q <- abs(mean(sim_freq) - std_freq) / sd(sim_freq)
    # The upper tail is requested directly: (1 - pnorm(q)) * 2 rounds to
    # exactly 0 for large q, which loses the ordering of tiny p-values.
    tibble(id = x, group = y, pval = 2 * pnorm(q, mean = 0, sd = 1, lower.tail = FALSE))
  })
  bind_rows(per_bin)
})


sim_p_df <- bind_rows(sim_p_tmp)

# Save the three intermediate tables as tab-separated text (no quoting, no
# row names) ...
write.table(
  sim_freq_df,
  file = "/home/users/ayh/Projects/27_A3B/06_Figure_code/edit_figure/Fig3/dist_sim_freq_df.txt",
  sep = "\t",
  quote = FALSE,
  row.names = FALSE
)
write.table(
  sim_p_df,
  file = "/home/users/ayh/Projects/27_A3B/06_Figure_code/edit_figure/Fig3/dist_sim_p_df.txt",
  sep = "\t",
  quote = FALSE,
  row.names = FALSE
)
write.table(
  obs_df,
  file = "/home/users/ayh/Projects/27_A3B/06_Figure_code/edit_figure/Fig3/dist_obs_df.txt",
  sep = "\t",
  quote = FALSE,
  row.names = FALSE
)

# ... and load them again, so the remaining steps can start from the files.
# Column types are taken from what read_tsv() infers from the text.
sim_freq_df <- read_tsv(file = "/home/users/ayh/Projects/27_A3B/06_Figure_code/edit_figure/Fig3/dist_sim_freq_df.txt")
sim_p_df <- read_tsv(file = "/home/users/ayh/Projects/27_A3B/06_Figure_code/edit_figure/Fig3/dist_sim_p_df.txt")
obs_df <- read_tsv(file = "/home/users/ayh/Projects/27_A3B/06_Figure_code/edit_figure/Fig3/dist_obs_df.txt")



# p-values of the A3A samples (controls and two excluded ids removed) with a
# Benjamini-Hochberg adjustment computed over exactly these rows.
sim_p_A3A_df <- sim_p_df %>%
  ungroup() %>%
  filter(
    grepl("A3A", id),
    !grepl("Ctrl", id),
    !grepl("ctrl", id),
    !id %in% c("A3A_1st_C3", "A3A_C3_TP53KO_C3")
  ) %>%
  mutate(qval = p.adjust(pval, method = "BH"))

# Table of p-value brackets for the per-sample bar plots.
# NOTE(review): the column layout presumably mirrors what add_pvalue()
# consumes; n1 / n2 / statistic / df are fixed placeholder values.
sim_p_A3A_fin_df <- sim_p_A3A_df %>%
  mutate(
    `.y.` = "freq",
    group1 = "observed",
    group2 = "expected",
    n1 = 1,
    n2 = 100,
    statistic = 1,
    df = 99,
    p = pval,
    p.adj = qval,
    p.adj.signif = ifelse(qval >= 0.05, "n.s.",
      ifelse(qval >= 0.005, "*",
        ifelse(qval >= 0.0005, "**",
          ifelse(qval > 0.00005, "***", "****")
        )
      )
    ),
    # Bracket ends sit 0.2 to either side of the bin position. They are
    # derived from the bin itself instead of rep(seq(...), 16), which only
    # worked for exactly 16 samples whose rows were ordered bin 1..9.
    xmin = as.numeric(group) - 0.2,
    xmax = as.numeric(group) + 0.2
  ) %>%
  left_join(
    # Highest upper bound per sample and bin, over both bar types.
    dist_sum_A3A_df %>%
      group_by(id, group) %>%
      dplyr::summarise(max_c = max(upper), .groups = "drop") %>%
      mutate(group = as.numeric(group)),
    by = c("id", "group")
  ) %>%
  # Draw the bracket slightly above the taller of the two bars.
  mutate(y.position = max_c + 0.03) %>%
  select(-max_c)
sim_p_A3A_fin_df$y.position %>% max()


# One PDF per A3A sample: expected vs. observed frequency per distance bin,
# with error bars and p-value brackets.
# add_pvalue() is called through its namespace because library(ggprism) is
# only attached further down in this script.
invisible(lapply(unique(dist_sum_A3A_df$id), function(x) {
  p_dist <- dist_sum_A3A_df %>%
    filter(id == x) %>%
    mutate(group = factor(group, levels = as.character(c(1:9)))) %>%
    ggplot(aes(x = group, y = mean, col = "black")) +
    geom_bar(stat = "identity", position = "dodge", aes(fill = type)) +
    geom_errorbar(aes(fill = type, ymin = lower, ymax = upper), position = position_dodge(0.9), width = 0.2) +
    theme_classic() +
    scale_colour_manual(values = c("black")) +
    ggprism::add_pvalue(
      sim_p_A3A_fin_df %>%
        filter(id == x),
      xmin = "xmin",
      xmax = "xmax",
      tip.length = 0.01
    ) +
    guides(colour = "none", fill = "none") +
    ylim(c(0, 0.8)) +
    ylab("ratio") +
    ggtitle(x)

  ggsave(paste0("/home/users/ayh/Projects/27_A3B/06_Figure_code/edit_figure/Fig3/dist/dist.", x, ".pdf"), p_dist,
    height = 8, width = 10
  )
}))



# Same plot for the single example sample "A3A_1st_C3_3ug-2", with a tighter
# y range (0 to 0.5).
# add_pvalue() is called through its namespace because library(ggprism) is
# only attached further down in this script.
p_dist_sample <- dist_sum_A3A_df %>%
  filter(id == "A3A_1st_C3_3ug-2") %>%
  mutate(group = factor(group, levels = as.character(c(1:9)))) %>%
  ggplot(aes(x = group, y = mean, col = "black")) +
  geom_bar(stat = "identity", position = "dodge", aes(fill = type)) +
  geom_errorbar(aes(fill = type, ymin = lower, ymax = upper), position = position_dodge(0.9), width = 0.2) +
  theme_classic() +
  scale_colour_manual(values = c("black")) +
  ggprism::add_pvalue(
    sim_p_A3A_fin_df %>%
      filter(id == "A3A_1st_C3_3ug-2"),
    xmin = "xmin",
    xmax = "xmax",
    tip.length = 0.01
  ) +
  guides(colour = "none", fill = "none") +
  ylim(c(0, 0.5)) +
  ylab("ratio") +
  ggtitle("A3A_1st_C3_3ug-2")


# Save the per-sample expected / observed summary as tab-separated text.
write.table(
  dist_sum_A3A_df,
  file = "/home/users/ayh/Projects/27_A3B/06_Figure_code/edit_figure/Fig3/dist/cluster_dist.ci.sample.txt",
  sep = "\t",
  quote = FALSE,
  row.names = FALSE
)



# Sample annotation; the `m/d` column is not used.
metadata <- read_tsv("/home/users/ayh/Projects/27_A3B/03_sequencing/single_clone_sequencing/WGS/24_mutect2_strelka2_baseq/merge/clonal/sig/metadata.txt") %>%
  select(-`m/d`)

# Observed / expected ratio per A3A sample and distance bin. The expected
# value is the mean simulated frequency, written as
# obs_freq / tot_freq * n_run. n_run is the number of simulated rows per
# sample and bin and replaces the hard-coded factor 100.
dist_A3A_oe_df <-
  left_join(
    sim_freq_df %>%
      filter(grepl("A3A", id)) %>%
      filter(!grepl("Ctrl", id)) %>%
      filter(!grepl("ctrl", id)) %>%
      ungroup() %>%
      filter(!id %in% c("A3A_1st_C3", "A3A_C3_TP53KO_C3")) %>%
      group_by(id, group) %>%
      dplyr::summarise(tot_freq = sum(freq), n_run = dplyr::n(), .groups = "drop"),
    obs_df %>%
      filter(grepl("A3A", id)) %>%
      filter(!grepl("Ctrl", id)) %>%
      filter(!grepl("ctrl", id)) %>%
      ungroup() %>%
      filter(!id %in% c("A3A_1st_C3", "A3A_C3_TP53KO_C3")) %>%
      select(-n, -tot_count, -type) %>%
      # dplyr::rename() replaces plyr::rename(); plyr is not attached here.
      dplyr::rename(obs_freq = freq),
    by = c("id", "group")
  ) %>%
  mutate(mean_oe_ratio = obs_freq / tot_freq * n_run) %>%
  select(-n_run) %>%
  # Joined on whatever columns the two tables share (the metadata layout is
  # not visible in this script).
  left_join(metadata)

# Condition label: APOBEC construct and dose.
dist_A3A_oe_df <- dist_A3A_oe_df %>% mutate(info = paste(APOBEC, dose, sep = "_"))
dist_A3A_oe_df %>%
  write.table("/home/users/ayh/Projects/27_A3B/06_Figure_code/edit_figure/Fig3/dist/cluster_dist.oe.sample.txt",
    sep = "\t",
    quote = FALSE,
    row.names = FALSE
  )

# Reload the per-sample O/E table from disk.
dist_A3A_oe_df <- read_tsv(
  file = "/home/users/ayh/Projects/27_A3B/06_Figure_code/edit_figure/Fig3/dist/cluster_dist.oe.sample.txt"
)

# Per condition (APOBEC_dose_TP53) and distance bin: mean, sd, n and standard
# error of the O/E ratio, plus a t-based 95% interval around the mean.
dist_A3A_oe_ci_df <- dist_A3A_oe_df %>%
  mutate(info = paste(APOBEC, dose, TP53, sep = "_")) %>%
  group_by(info, group, TP53) %>%
  summarise(
    mean.ratio = mean(mean_oe_ratio, na.rm = TRUE),
    sd.ratio = sd(mean_oe_ratio, na.rm = TRUE),
    n.ratio = n(),
    se = sd.ratio / sqrt(n.ratio)
  ) %>%
  mutate(
    se.ratio = sd.ratio / sqrt(n.ratio),
    lower.ci.ratio = mean.ratio - qt(1 - (0.05 / 2), n.ratio - 1) * se.ratio,
    upper.ci.ratio = mean.ratio + qt(1 - (0.05 / 2), n.ratio - 1) * se.ratio
  )

write.table(
  dist_A3A_oe_ci_df,
  file = "/home/users/ayh/Projects/27_A3B/06_Figure_code/edit_figure/Fig3/dist/cluster_dist.oe.ci.group.txt",
  sep = "\t",
  quote = FALSE,
  row.names = FALSE
)

dist_A3A_oe_ci_df <- read_tsv("/home/users/ayh/Projects/27_A3B/06_Figure_code/edit_figure/Fig3/dist/cluster_dist.oe.ci.group.txt")

# log10 of the mean O/E ratio per condition and bin, with the standard error
# propagated to the log scale: dse = 0.434 / mean * se (0.434 ~ 1 / ln(10)).
# Everything is computed in a single summarise(). The previous version joined
# two separate summaries without a `by`, so the floating-point `mean.ratio`
# became a join key, and only one side used na.rm = TRUE.
dist_A3A_oe_log_ci_df <- dist_A3A_oe_df %>%
  mutate(info = paste(APOBEC, dose, TP53, sep = "_")) %>%
  group_by(info, group, TP53) %>%
  summarise(
    mean.ratio = mean(mean_oe_ratio, na.rm = TRUE),
    log.mean.ratio = log10(mean.ratio),
    sd.ratio = sd(mean_oe_ratio, na.rm = TRUE),
    n.ratio = n(),
    se = sd.ratio / sqrt(n.ratio),
    .groups = "drop"
  ) %>%
  mutate(
    dse = 0.434 / mean.ratio * se,
    upper.log.ratio = log.mean.ratio + dse,
    lower.log.ratio = log.mean.ratio - dse
  )


dist_A3A_oe_log_ci_df

# Fixed display order of the conditions; labels not listed here become NA.
dist_A3A_oe_log_ci_df$info <- factor(dist_A3A_oe_log_ci_df$info, levels = c("A3A_100ng_WT", "A3A_3ug_WT", "A3A_100ng_KO", "A3A_3ug_KO"))
dist_A3A_oe_log_ci_df %>%
  write.table("/home/users/ayh/Projects/27_A3B/06_Figure_code/edit_figure/Fig3/dist/cluster_dist.oe.log.ci.group.txt",
    sep = "\t",
    quote = FALSE,
    row.names = FALSE
  )


# Variant 2: log10 mean ratio with the t-based 95% half-width propagated to
# the log scale (dci = 0.434 / mean * ci).
dist_A3A_oe_log_ci_df2 <- dist_A3A_oe_ci_df %>%
  mutate(
    ci = qt(1 - (0.05 / 2), n.ratio - 1) * se.ratio,
    dci = 0.434 / mean.ratio * ci,
    log.mean.ratio = log10(mean.ratio),
    upper.ci.log.ratio = log.mean.ratio + dci,
    lower.ci.log.ratio = log.mean.ratio - dci
  )

# Variant 3: log10 taken directly of the linear-scale interval bounds.
dist_A3A_oe_log_ci_df3 <- dist_A3A_oe_ci_df %>%
  mutate(
    upper.ci.log.ratio = log10(upper.ci.ratio),
    lower.ci.log.ratio = log10(lower.ci.ratio)
  )


# log10 O/E ratio per distance bin, one dodged line per condition, with error
# bars on the log scale; the grey line marks a ratio of 1 (log10 = 0).
p_dist_ci_group <- ggplot(
  data = dist_A3A_oe_log_ci_df,
  mapping = aes(x = group, y = log.mean.ratio, group = info)
) +
  geom_hline(yintercept = 0, colour = "grey") +
  theme_classic() +
  geom_point(
    aes(colour = info),
    size = 3,
    position = position_dodge(width = .3)
  ) +
  geom_line(
    aes(colour = info),
    position = position_dodge(width = .3)
  ) +
  geom_errorbar(
    aes(ymin = lower.log.ratio, ymax = upper.log.ratio, colour = info),
    position = "dodge",
    width = .3
  ) +
  scale_y_continuous(limits = c(-1, 2.2))


ggsave(
  filename = "/home/users/ayh/Projects/27_A3B/06_Figure_code/edit_figure/Fig3/dist/cluster_dist.ci.group.pdf",
  plot = p_dist_ci_group,
  width = 10,
  height = 8
)

# log10 O/E ratio per distance bin for the single sample "A3A_1st_C3_3ug-2"
# (points joined by a line, no error bars), saved as its own PDF.
p_dist_ci_sample <- dist_A3A_oe_df %>%
  filter(id == "A3A_1st_C3_3ug-2") %>%
  group_by(group) %>%
  summarise(
    mean.ratio = mean(mean_oe_ratio, na.rm = TRUE),
    log.mean.ratio = log10(mean(mean_oe_ratio))
  ) %>%
  mutate(id = "A3A_1st_C3_3ug-2") %>%
  ggplot(aes(x = group, y = log.mean.ratio)) +
  geom_hline(yintercept = 0, colour = "grey") +
  theme_classic() +
  geom_point(aes(group = id), size = 3) +
  geom_line(aes(group = id)) +
  scale_y_continuous(limits = c(-1, 2.2))

ggsave(
  filename = "/home/users/ayh/Projects/27_A3B/06_Figure_code/edit_figure/Fig3/dist/cluster_dist.ci.A3A_1st_C3_3ug-2.pdf",
  plot = p_dist_ci_sample,
  width = 10,
  height = 8
)


# Condition-level log10 O/E plot restricted to the condition that contains the
# example sample "A3A_1st_C3_3ug-2".
# Two defects are fixed here:
#   * the `%>%` after filter() was missing, so the ggplot() chain was a
#     detached expression and was never stored in p_dist_ci_sample;
#   * the filter used `id`, but dist_A3A_oe_log_ci_df is summarised per `info`
#     and has no `id` column.
# NOTE(review): selecting the sample's condition via `info` is the assumed
# intent -- confirm. This overwrites the p_dist_ci_sample object that was
# already saved to PDF above, and the new plot is not saved.
p_dist_ci_sample <- dist_A3A_oe_log_ci_df %>%
  filter(info %in% (dist_A3A_oe_df %>%
    filter(id == "A3A_1st_C3_3ug-2") %>%
    mutate(info = paste(APOBEC, dose, TP53, sep = "_")) %>%
    pull(info))) %>%
  ggplot(aes(x = group, y = log.mean.ratio, group = info)) +
  geom_hline(yintercept = 0, colour = "grey") +
  theme_classic() +
  geom_point(aes(y = log.mean.ratio, group = info, colour = info), size = 3, position = position_dodge(width = .3)) +
  geom_line(aes(y = log.mean.ratio, group = info, colour = info), position = position_dodge(width = .3)) +
  geom_errorbar(aes(ymin = lower.log.ratio, ymax = upper.log.ratio, colour = info), position = "dodge", width = .3) +
  scale_y_continuous(limits = c(-1, 2.2))

# Show the condition-level plot and write it again (same file as above).
p_dist_ci_group
ggsave("/home/users/ayh/Projects/27_A3B/06_Figure_code/edit_figure/Fig3/dist/cluster_dist.ci.group.pdf", p_dist_ci_group,
  width = 10, height = 8
)




# NOTE(review): scratch fragment that does not evaluate as written.
#   * The left_join() ends in `%>%`, so its result is piped into
#     `sim_freq_df`, which is a data frame and not a function call.
#   * `sim_freq_df` as built in this script has no `type` column for
#     spread(type, freq).
#   * The result is not assigned to anything.
# Presumably an abandoned attempt at a per-sample observed/expected ratio;
# confirm, then repair or remove.
left_join(
sim_freq_df%>%select(-count,-tot_count)%>%
  group_by(group,id)%>%
  dplyr::summarise(tot_freq=sum(freq)),
obs_df%>%select(-n,-tot_count,-type)%>%plyr::rename(c("freq"="obs_freq"))%>%unique()
)%>%
  
sim_freq_df%>%select(-count,-tot_count)%>%spread(type,freq)%>%
  mutate(oe_ratio=observed/expected)

# ggprism provides add_pvalue(). NOTE(review): add_pvalue() is already called
# earlier in this script, before this library() call is reached.
library(ggprism)
# Print the distinct bin labels in "grp<bin>" form.
paste0("grp",dist_sum_df$group)%>%unique()
# Bar plot of frequency per bin for expected vs. observed, with p-value
# brackets taken from `p_dist_sum`.
# NOTE(review): `p_dist_sum` is not defined in the visible part of this
# script, and dist_sum_df as built above has a `mean` column but no `freq`
# column -- this looks like code written for an earlier table layout; confirm.
p_cluster<-dist_sum_df%>%
  mutate(group=paste0("grp",group))%>%
  mutate(group=factor(group,levels=paste0("grp",dist_sum_df$group)%>%unique()))%>%
  ggplot(aes(x=group,y=freq,col="black"))+
  geom_bar(stat="identity",position="dodge",aes(fill=type))+
  theme_classic()+
  scale_colour_manual(values=c("black"))+
  guides(colour="none")+
  add_pvalue(p_dist_sum,
             xmin="xmin",
             xmax="xmax",
             tip.length =0.01)+
  guides(colour="none",
         fill="none")
#facet_wrap(~id)
# Display the plot and save it as PDF.
p_cluster
ggsave("/home/users/ayh/Projects/27_A3B/06_Figure_code/edit_figure/Fig3/cluster_dist.pdf",p_cluster,
       width=10,height=8)
p_dist_sum
# Relabel the bins of `dist_sum_A3A_ci_df` as an ordered "grp<bin>" factor.
# NOTE(review): `dist_sum_A3A_ci_df` is only ever updated from itself here;
# its initial definition is not in the visible part of this script.
dist_sum_A3A_ci_df<-dist_sum_A3A_ci_df%>%
  mutate(group=paste0("grp",group))%>%ungroup()%>%
  mutate(group=factor(group,levels=paste0("grp",dist_sum_df$group)%>%unique()))

# Bar plot of the mean frequency per bin with error bars (lower.ci.freq to
# upper.ci.freq) and p-value brackets; the plot is displayed but not stored.
dist_sum_A3A_ci_df%>%
  ggplot(aes(x=group,y=mean.freq,col="black"))+
  geom_bar(stat="identity",position="dodge",aes(fill=type))+
  geom_errorbar(aes(fill=type,ymin=lower.ci.freq,ymax=upper.ci.freq),position=position_dodge(0.9),width=0.2)+
  theme_classic()+
  scale_colour_manual(values=c("black"))+
  guides(colour="none")+
  add_pvalue(p_dist_sum,
             xmin="xmin",
             xmax="xmax",
             tip.length =0.01)+
  guides(colour="none",
         fill="none")

# Wide table with one `expected` and one `observed` column, and their ratio.
# NOTE(review): dist_sum_A3A_df as built in this script has the columns
# id, group, mean, ci, upper, lower, type -- it has no `n`, `tot_count` or
# `freq`, so select() / spread() below would fail on it. Presumably written
# for an earlier layout of that table; confirm.
dist_sum_A3A_oe_df<-dist_sum_A3A_df%>%select(-n,-tot_count)%>%spread(type,freq)%>%
  mutate(oe_ratio=observed/expected)

# Per bin: mean, sd and n of the O/E ratio, its standard error and a t-based
# 95% interval; bins are then relabelled "grp<bin>".
dist_sum_A3A_oe_ci_df<-dist_sum_A3A_oe_df%>%
  group_by(group) %>%
  summarise(mean.ratio = mean(oe_ratio, na.rm = TRUE),
            sd.ratio = sd(oe_ratio, na.rm = TRUE),
            n.ratio = n()) %>%
  mutate(se.ratio = sd.ratio / sqrt(n.ratio),
         lower.ci.ratio = mean.ratio - qt(1 - (0.05 / 2), n.ratio - 1) * se.ratio,
         upper.ci.ratio = mean.ratio + qt(1 - (0.05 / 2), n.ratio - 1) * se.ratio)%>%
  mutate(group=paste0("grp",group))


# Mean O/E ratio per bin as a single line with points and 95% error bars.
# A `+` was missing after geom_point(): the error bars, theme and scale formed
# a separate expression that was never added to the plot (and adding a layer
# to a theme outside of a ggplot object is an error).
dist_sum_A3A_oe_ci_df %>%
  # Constant id so that all bins are joined into one line.
  mutate(id = "x") %>%
  ggplot(aes(x = group, y = mean.ratio, group = id, col = "black")) +
  geom_line() +
  geom_point() +
  #geom_bar(stat="identity",position="dodge",aes(fill=type))+
  geom_errorbar(aes(ymin = lower.ci.ratio, ymax = upper.ci.ratio), position = position_dodge(0.9), width = 0.2) +
  theme_classic() +
  scale_colour_manual(values = c("black")) +
  guides(colour = "none")

# Combine the frequency summary with the O/E ratio summary (joined on the
# columns the two tables share).
# NOTE(review): `dist_sum_A3A_ci_df` and `p_dist_sum` are not defined in the
# visible part of this script.
dist_sum_A3A_ci_merge_df<-left_join(dist_sum_A3A_ci_df,dist_sum_A3A_oe_ci_df)
# Scaling factor between the two y axes: ratios are divided by `coeff` to be
# drawn on the frequency axis, and the secondary axis multiplies back.
coeff<-250
dist_sum_A3A_ci_merge_df$mean.ratio/coeff
# Dual-axis figure: bars with error bars for the frequencies (primary axis),
# points / red line with error bars for the O/E ratio (secondary axis).
p_dist_ci<-dist_sum_A3A_ci_merge_df%>%
  mutate(id="x")%>%
  ggplot(aes(x=group,y=mean.freq,col="black"))+
  geom_bar(stat="identity",position="dodge",aes(fill=type))+
  geom_errorbar(aes(fill=type,ymin=lower.ci.freq,ymax=upper.ci.freq),position=position_dodge(0.9),width=0.2)+
  theme_classic()+
  scale_colour_manual(values=c("black"))+
  guides(colour="none")+
  add_pvalue(p_dist_sum,
             xmin="xmin",
             xmax="xmax",
             tip.length =0.01)+
  guides(colour="none",
         fill="none")+
  geom_point(aes(y=mean.ratio/coeff,group=id),size=3)+
  geom_line(aes(y=mean.ratio/coeff,group=id),colour="red")+
  geom_errorbar(aes(ymin=lower.ci.ratio/coeff,ymax=upper.ci.ratio/coeff),position=position_dodge(0.9),width=0.2)+
  
  scale_y_continuous(
    
    # Features of the first axis
    name = "ratio",
    
    # Add a second axis and specify its features
    sec.axis = sec_axis(~.*coeff, name="observed/expected")
  ) 
# Display the figure and save it as PDF.
p_dist_ci  
ggsave("/home/users/ayh/Projects/27_A3B/06_Figure_code/edit_figure/Fig3/cluster_dist.ci.pdf",p_dist_ci,
       width=10,height=8)

# Mean, sd, n and standard error of TPM for APOBEC3A / APOBEC3B per
# construct, gene, TP53 status and dose; `dse` is the standard error
# propagated to the log10 scale (0.434 / mean * se).
# NOTE(review): `exp_df` is not defined in the visible part of this script.
APOBEC_ci_df<-exp_df%>%
  filter(gene_id=="APOBEC3A"|gene_id=="APOBEC3B")%>%
  group_by(APOBEC,gene_id,TP53,dose) %>%
  summarise(mean.TPM = mean(TPM, na.rm = TRUE),
            sd.TPM = sd(TPM, na.rm = TRUE),
            n.TPM = n(),
            se=sd.TPM/sqrt(n.TPM))%>%
  mutate(dse=0.434/mean.TPM*se)

#dist_sum_A3A_oe_ci_df<-


# Scratch comparison, printed only: per bin, the mean of log10(ratio + 0.05)
# and its back-transform 10^mean, next to the arithmetic mean of the ratio.
dist_sum_A3A_oe_df%>%mutate(oe_ratio=oe_ratio+0.05)%>%
  group_by(group)%>%
  summarise(log_mean_ratio=mean(log10(oe_ratio)))%>%
  mutate(`10^mean_ratio`=10^log_mean_ratio)%>%
  left_join(
    dist_sum_A3A_oe_df%>%
      group_by(group)%>%
      summarise(ori_mean_ratio=mean(oe_ratio))
  )

# Scratch checks, printed only: vectorised 10^x, then the O/E ratios of bin
# "2" (their mean, the sorted values, and the mean of log10(ratio + 0.05)).
a=c(10,20,30)
10^a
(dist_sum_A3A_oe_df%>%arrange(oe_ratio)%>%filter(group=="2")%>%select(oe_ratio))$oe_ratio%>%mean()
(dist_sum_A3A_oe_df%>%arrange(oe_ratio)%>%filter(group=="2")%>%select(oe_ratio))$oe_ratio
(dist_sum_A3A_oe_df%>%arrange(oe_ratio)%>%filter(group=="2")%>%select(oe_ratio)%>%mutate(oe_ratio=oe_ratio+0.05))$oe_ratio%>%log10()%>%mean()


# log10 of the mean O/E ratio per bin, with the standard error propagated to
# the log scale: dse = 0.434 / mean * se (0.434 ~ 1 / ln(10)).
# Everything is computed in a single summarise(). The previous version joined
# two separate summaries without a `by`, so the floating-point `mean.ratio`
# became a join key, and only one side used na.rm = TRUE.
dist_sum_A3A_oe_log_ci_df <- dist_sum_A3A_oe_df %>%
  group_by(group) %>%
  summarise(
    mean.ratio = mean(oe_ratio, na.rm = TRUE),
    log.mean.ratio = log10(mean.ratio),
    sd.ratio = sd(oe_ratio, na.rm = TRUE),
    n.ratio = n(),
    se = sd.ratio / sqrt(n.ratio)
  ) %>%
  mutate(
    dse = 0.434 / mean.ratio * se,
    upper.log.ratio = log.mean.ratio + dse,
    lower.log.ratio = log.mean.ratio - dse,
    group = paste0("grp", group)
  )

dist_sum_A3A_oe_log_ci_df

# Attach the log-ratio summary to the frequency summary (joined on the columns
# the two tables share).
dist_sum_A3A_log_ci_merge_df <- left_join(dist_sum_A3A_ci_df, dist_sum_A3A_oe_log_ci_df)


# log10 O/E ratio per bin: points, red line and error bars on the log scale,
# with a grey reference line at 0 and the y axis limited to [-1, 2].
p_ratio_ci<-dist_sum_A3A_log_ci_merge_df%>%
  mutate(id="x")%>%
  ggplot(aes(x=group,y=mean.freq,col="black"))+
  geom_hline(yintercept=0,colour="grey")+
  #geom_bar(stat="identity",position="dodge",aes(fill=type))+
  #geom_errorbar(aes(fill=type,ymin=lower.ci.freq,ymax=upper.ci.freq),position=position_dodge(0.9),width=0.2)+
  theme_classic()+
  scale_colour_manual(values=c("black"))+
  guides(colour="none")+
  guides(colour="none",
         fill="none")+
  geom_point(aes(y=log.mean.ratio,group=id),size=3)+
  geom_line(aes(y=log.mean.ratio,group=id),colour="red")+
  geom_errorbar(aes(ymin=lower.log.ratio,ymax=upper.log.ratio),position=position_dodge(0.9),width=0.2)+
  scale_y_continuous(lim=c(-1,2))

# NOTE(review): the scale below is a stand-alone expression. The plot chain
# above ends without a trailing `+`, so this scale is built and printed but is
# not part of p_ratio_ci. Presumably a leftover from the dual-axis version;
# confirm and remove.
scale_y_continuous(
  
  # Features of the first axis
  name = "ratio",
  labels=scales::comma,
  # Add a second axis and specify its features
  #sec.axis = sec_axis(~.*coeff, name="observed/e#xpected")
  sec.axis = sec_axis(~ ./coeff, 
                      labels=function(x) scales::comma(round(10^x), accuracy=1),
                      breaks=c(0.001,0.01,0.1,1,2),
                      name="Cumulative Cases"))

# Save the log-ratio figure as PDF.
ggsave("/home/users/ayh/Projects/27_A3B/06_Figure_code/edit_figure/Fig3/cluster_dist.ci.log.pdf",p_ratio_ci,
       width=10,height=8)
#) 


# Scratch: `coeff` is reassigned twice (first 1, then 1.5 * max frequency /
# max ratio); the other two lines only print values.
coeff=1
2/coeff
coeff=1.5*max(dist_sum_A3A_ci_merge_df$mean.freq)/max(dist_sum_A3A_ci_merge_df$mean.ratio)
scales::comma(round(10^dist_sum_A3A_ci_merge_df$mean.freq), accuracy=1)

# log10 O/E ratio per bin with explicit y breaks at -1, 0, 1, 2; the plot is
# displayed but not stored.
# The point and line layers mapped y to `mean.log.ratio`. The summary joined
# into this table names that column `log.mean.ratio` (as used for p_ratio_ci),
# so the name is corrected here.
dist_sum_A3A_log_ci_merge_df %>%
  # Constant id so that all bins are joined into one line.
  mutate(id = "x") %>%
  ggplot(aes(x = group, y = mean.freq, col = "black")) +
  theme_classic() +
  scale_colour_manual(values = c("black")) +
  guides(colour = "none", fill = "none") +
  geom_point(aes(y = log.mean.ratio, group = id), size = 3) +
  geom_line(aes(y = log.mean.ratio, group = id), colour = "red") +
  geom_errorbar(aes(ymin = lower.log.ratio, ymax = upper.log.ratio), position = position_dodge(0.9), width = 0.2) +
  scale_y_continuous(
    breaks = c(-1, 0, 1, 2),
    labels = c(-1, 0, 1, 2),
    limits = c(-1, 2)
  )

#facet_wrap(~id)
# Display p_cluster and write it again (same file as the earlier ggsave()).
p_cluster
ggsave("/home/users/ayh/Projects/27_A3B/06_Figure_code/edit_figure/Fig3/cluster_dist.pdf", p_cluster,
  width = 10, height = 8
)


# Row-wise Fisher tests with BH adjustment, plus the bracket coordinates for
# add_pvalue(): one bracket per bin (9 bins), all at height 0.45.
# NOTE(review): `dist_sum_df2` is not defined in the visible part of this
# script.
p_dist_sum2 <-
  rstatix::row_wise_fisher_test(dist_sum_df2) %>%
  mutate(group1 = "expected", group2 = "observed") %>%
  rstatix::adjust_pvalue(method = "BH") %>%
  mutate(
    xmin = seq(0.8, 8.8, by = 1),
    xmax = seq(1.2, 9.2, by = 1)
  ) %>%
  # Was rep(c(0.45), by = 9): rep() has no `by` argument, so the result was a
  # single 0.45 that mutate() recycled. The constant is now written directly.
  mutate(y.position = 0.45)
# Bar plot of frequency per bin with a line per type and the Fisher brackets.
# NOTE(review): dist_sum_df as built in this script has a `mean` column but no
# `freq` column -- confirm which table layout this plot expects.
p_cluster2 <- dist_sum_df %>%
  mutate(group = paste0("grp", group)) %>%
  mutate(group = factor(group, levels = paste0("grp", dist_sum_df$group) %>% unique())) %>%
  ggplot(aes(x = group, y = freq, col = "black")) +
  geom_bar(stat = "identity", position = "dodge", aes(fill = type)) +
  geom_line(aes(col = type, group = type)) +
  theme_classic() +
  #scale_colour_manual(values=c("black"))+
  guides(colour = "none") +
  add_pvalue(p_dist_sum2,
    xmin = "xmin",
    xmax = "xmax",
    tip.length = 0.01
  ) +
  guides(
    colour = "none",
    fill = "none"
  )
p_cluster2
ggsave("/home/users/ayh/Projects/27_A3B/06_Figure_code/edit_figure/Fig3/cluster_dist.v2.pdf", p_cluster2,
  width = 10, height = 8
)


# Same bar plot as p_cluster with an added line per type; displayed only.
# NOTE(review): maps y to `freq` and takes its brackets from `p_dist_sum`.
# dist_sum_df as built in this script has no `freq` column, and `p_dist_sum`
# is not defined in the visible part of the script -- confirm.
dist_sum_df%>%
  mutate(group=paste0("grp",group))%>%
  mutate(group=factor(group,levels=paste0("grp",dist_sum_df$group)%>%unique()))%>%
  ggplot(aes(x=group,y=freq,col="black"))+
  geom_bar(stat="identity",position="dodge",aes(fill=type))+
  geom_line(aes(col = type, group =type))+
  theme_classic()+
  scale_colour_manual(values=c("black"))+
  guides(colour="none")+
  add_pvalue(p_dist_sum,
             xmin="xmin",
             xmax="xmax",
             tip.length =0.01)+
  guides(colour="none",
         fill="none")

# Distribution of the distances, expected (simulated) vs. observed, as a
# density-scaled histogram with a density curve on a log10 x axis.
rbind(
  # select() renames POS1 to POS directly; this replaces plyr::rename(), which
  # needs the plyr package that this script never attaches.
  sim_df %>% mutate(type = "expected") %>% select(`#CHROM`, POS = POS1, dist, type),
  real_df %>% mutate(type = "observed") %>% select(`#CHROM`, POS, dist, type)
) %>%
  ggplot(aes(x = dist, col = type)) +
  # after_stat(density) replaces the deprecated `..density..` notation.
  geom_histogram(stat = "bin", aes(y = after_stat(density)), position = "dodge", bins = 10) +
  geom_density() +
  scale_x_log10(breaks = c(1, 10, 100, 1000, 10000, 100000, 1000000, 10000000, 100000000))





# Minimal bar plot per bin, coloured by type, with a line drawn through all
# rows as one group (group = 1 is set as a fixed parameter, not mapped).
# NOTE(review): maps y to `freq`, which dist_sum_df as built in this script
# does not contain (it has `mean`) -- confirm.
dist_sum_df%>%
  mutate(group=paste0("grp",group))%>%
  mutate(group=factor(group,levels=paste0("grp",dist_sum_df$group)%>%unique()))%>%
  ggplot(aes(x=group,y=freq,colour=type))+
  geom_bar(stat="identity",position="dodge")+
  geom_line(group=1)

