library(data.table)
library(GenomicRanges)
library(dplyr)
library(tidyverse)
library(g)
# ---- Load the variant table and the reference gene intervals ---------------
# Per-variant table (tab-separated). Columns such as `#CHROM`, POS, REF, ALT,
# id, Func_refGene and Gene_refGene are used further down in this script.
genic_df <- read_tsv(
  "/home/users/ayh/Projects/27_A3B/03_sequencing/single_clone_sequencing/WGS/28_genomic_feature/01_refernce/fin/Tx/A3A.total.gene.raw.rh.vcf.gd"
)

# Headerless tab-separated BED files, so columns get the default names
# V1, V2, ... `ref_df` is the "mappable.uniq" file, `ref_ori_df` the
# unfiltered one.
ref_df <- read.csv(
  "/home/users/ayh/Projects/27_A3B/03_sequencing/single_clone_sequencing/WGS/28_genomic_feature/01_refernce/fin/Tx/refGene.target.unk.filtered.mappable.uniq.bed",
  sep = "\t",
  header = FALSE
) %>%
  as_tibble()

ref_ori_df <- read.csv(
  "/home/users/ayh/Projects/27_A3B/03_sequencing/single_clone_sequencing/WGS/28_genomic_feature/01_refernce/fin/Tx/refGene.target.unk.filtered.bed",
  sep = "\t",
  header = FALSE
) %>%
  as_tibble()

# Keep only the unfiltered intervals whose V4 value also occurs in the
# mappable file.
ref_ori_filter_df <- ref_ori_df %>%
  filter(V4 %in% ref_df$V4)

# Build genomic ranges: V1 = sequence name, V2/V3 = start/end, V5 = strand,
# V4 is carried along as the `gene_id` metadata column.
# NOTE(review): BED starts are normally 0-based while GRanges is 1-based; the
# start is used unshifted here and `start + 1` is applied later, after the
# join -- confirm this is intended.
ref_ori_fil_gr <- GRanges(
  seqnames = ref_ori_filter_df$V1,
  ranges = IRanges(
    start = as.numeric(ref_ori_filter_df$V2),
    end = as.numeric(ref_ori_filter_df$V3)
  ),
  strand = ref_ori_filter_df$V5,
  gene_id = ref_ori_filter_df$V4
)

# Interactive inspection of the merged ranges. The call is namespace-qualified
# because tidyverse (attached after GenomicRanges) puts purrr::reduce() in
# front of the Bioconductor generic, and purrr::reduce() fails without `.f`.
GenomicRanges::reduce(ref_ori_fil_gr)

# ---- 5'UTR intervals, merged per gene --------------------------------------
# Headerless tab-separated BED file; columns are V1 (sequence name),
# V2/V3 (start/end), V4 (gene id) and V5 (strand) as used below.
UTR5_bed_df <- read.csv(
  "/home/users/ayh/Projects/27_A3B/03_sequencing/single_clone_sequencing/WGS/28_genomic_feature/01_refernce/fin/UTR/refGene.target.UTR5.mappable.uniq.bed",
  sep = "\t",
  header = FALSE
) %>%
  as_tibble()

UTR5_gr <- GRanges(
  seqnames = UTR5_bed_df$V1,
  ranges = IRanges(
    start = as.numeric(UTR5_bed_df$V2),
    end = as.numeric(UTR5_bed_df$V3)
  ),
  strand = UTR5_bed_df$V5,
  gene_id = UTR5_bed_df$V4
)

# Merge overlapping/adjacent ranges within each gene. GenomicRanges::reduce is
# spelled out because purrr::reduce (attached later via tidyverse) masks it.
# The per-gene result is computed once and reused for the inspection below.
UTR5_red_by_gene <- GenomicRanges::reduce(split(UTR5_gr, UTR5_gr$gene_id))
UTR5_red <- unlist(UTR5_red_by_gene)

# Interactive spot check of a single gene.
UTR5_red_by_gene$A1CF

# Flatten to a tibble. reduce() drops the metadata columns, so the gene id is
# recovered from the names that unlist() puts on the ranges. `width` is
# overwritten with end - start (one less than the GRanges width).
UTR5_red_df <- UTR5_red %>%
  as_tibble() %>%
  mutate(gene_id = names(UTR5_red)) %>%
  plyr::rename(c("seqnames" = "#CHROM", "gene_id" = "Gene_refGene")) %>%
  mutate(width = end - start)
#UTR5_red_df%>%
# in short:
# ---- Gene intervals, merged per gene ---------------------------------------
# Merge overlapping/adjacent ranges within each gene and flatten back to a
# single GRanges whose names are the gene ids. GenomicRanges::reduce is spelled
# out because purrr::reduce (attached later via tidyverse) masks the generic.
ref_ori_fil_red <- unlist(
  GenomicRanges::reduce(split(ref_ori_fil_gr, ref_ori_fil_gr$gene_id))
)

# Interactive inspection: gene id attached to every reduced range.
names(ref_ori_fil_red)

# Flatten to a tibble. reduce() drops the metadata columns, so the gene id is
# recovered from the range names. Columns are renamed to match the variant
# table (`#CHROM`, `Gene_refGene`), and `width` is overwritten with
# end - start (one less than the GRanges width).
ref_ori_fil_red_df <- ref_ori_fil_red %>%
  as_tibble() %>%
  mutate(gene_id = names(ref_ori_fil_red)) %>%
  plyr::rename(c("seqnames" = "#CHROM", "gene_id" = "Gene_refGene")) %>%
  mutate(width = end - start)

# Interactive inspection: distinct genes that survived the filtering.
ref_ori_fil_red_df$Gene_refGene %>% unique()

# ---- Attach the reduced gene interval to every variant ---------------------
# The join must run before anything reads `genic_ref_merge_df`; previously the
# object was used two statements before it was created.
# left_join() is called without `by`, so it matches on every column name the
# two tables share -- presumably `#CHROM` and `Gene_refGene`; confirm against
# the join message.
# `start + 1` shifts the interval start by one -- presumably converting the
# 0-based BED start to a 1-based coordinate comparable with POS; confirm.
genic_ref_merge_df <- left_join(genic_df, ref_ori_fil_red_df) %>%
  mutate(start = start + 1)

# Fix the display order of the categorical columns. Values outside the listed
# levels become NA.
genic_ref_merge_df$mut_type <- factor(
  genic_ref_merge_df$mut_type,
  levels = c("C>T", "C>G")
)
genic_ref_merge_df$TP53 <- factor(
  genic_ref_merge_df$TP53,
  levels = c("WT", "KO")
)

# ---- Sanity check against the separately annotated 5'UTR variant table -----
# Each variant is identified by a pasted key of `#CHROM`, POS, REF, ALT and id.
UTR_df <- read_tsv("/home/users/ayh/Projects/27_A3B/03_sequencing/single_clone_sequencing/WGS/28_genomic_feature/01_refernce/fin/UTR/A3A.total.UTR5.rh.vcf") %>%
  mutate(info = paste(`#CHROM`, POS, REF, ALT, id))

# Variants labelled UTR5 in the merged table that are missing from UTR_df.
# `info` is added only inside this pipeline so the stored
# `genic_ref_merge_df` keeps the same columns as before.
genic_ref_merge_df %>%
  mutate(info = paste(`#CHROM`, POS, REF, ALT, id)) %>%
  filter(!info %in% UTR_df$info) %>%
  filter(Func_refGene == "UTR5")
# ---- Position of variants within their gene interval (all variants) --------
# De-duplicate variants after dropping `id`, then compute:
#   dist      = POS - start for gene_dir "+", otherwise end - POS
#   dist_perc = dist as a fraction of the interval width
genic_dedup_dist_df <- genic_ref_merge_df %>%
  select(-id) %>%
  unique() %>%
  mutate(dist = ifelse(gene_dir == "+", POS - start, end - POS)) %>%
  mutate(dist_perc = dist / width)

# Relative position, restricted to exonic/UTR variants, split by TP53 status,
# mutation type, functional class and reference base.
genic_dedup_dist_df %>%
  filter(Func_refGene %in% c("exonic", "UTR5", "UTR3")) %>%
  ggplot(aes(x = dist_perc)) +
  geom_histogram(binwidth = 0.005) +
  facet_wrap(~ TP53 + mut_type + Func_refGene + REF, ncol = 6)

# Relative position for all variants, split by TP53 status and mutation type.
genic_dedup_dist_df %>%
  ggplot(aes(x = dist_perc)) +
  geom_histogram(binwidth = 0.005) +
  facet_wrap(~ TP53 + mut_type, ncol = 6)

# Absolute distance on a log10 axis, split by TP53 status.
# `limits` is spelled out in full instead of relying on partial matching.
genic_dedup_dist_df %>%
  ggplot(aes(x = dist)) +
  geom_histogram() +
  facet_wrap(~TP53, ncol = 6) +
  scale_x_log10(limits = c(1, 10000))

# ---- Exploratory views of 5'UTR variants -----------------------------------
# De-duplicated (after dropping `id`) variants annotated as UTR5.
utr5_dedup_df <- genic_ref_merge_df %>%
  select(-id) %>%
  unique() %>%
  filter(Func_refGene %in% c("UTR5"))

# Interactive inspection of the table itself.
utr5_dedup_df

# dist = POS - start for gene_dir "+", otherwise end - POS;
# dist_perc = dist as a fraction of the interval width.
utr5_explore_df <- utr5_dedup_df %>%
  mutate(dist = ifelse(gene_dir == "+", POS - start, end - POS)) %>%
  mutate(dist_perc = dist / width)

# Density of the distance on a log10 axis, one panel per TP53 status.
# `limits` is spelled out in full instead of relying on partial matching.
utr5_explore_df %>%
  ggplot(aes(x = dist)) +
  geom_density() +
  facet_wrap(~TP53) +
  scale_x_log10(limits = c(1, 12000))

# Histogram in 50-unit bins over all samples, with a faint marker at 200.
# (Alternatives tried earlier and left disabled: geom_density(),
# facet_wrap(~TP53), x-limits of c(0, 12511) and c(0, 8000).)
utr5_explore_df %>%
  ggplot(aes(x = dist)) +
  geom_histogram(binwidth = 50) +
  theme_bw() +
  theme(axis.text = element_text(size = 20)) +
  geom_vline(xintercept = 200, col = "red", linetype = "dashed", alpha = 0.3)
  

  #scale_x_log10(limit=c(1,12511))

# ---- Figure panels: 5'UTR distance histograms (wide and narrow x-range) ----
# De-duplicated (after dropping `id`) UTR5 variants with
#   dist      = POS - start for gene_dir "+", otherwise end - POS
#   dist_perc = dist as a fraction of the interval width
utr5_fig_df <- genic_ref_merge_df %>%
  select(-id) %>%
  unique() %>%
  filter(Func_refGene %in% c("UTR5")) %>%
  mutate(dist = ifelse(gene_dir == "+", POS - start, end - POS)) %>%
  mutate(dist_perc = dist / width)

# Styling shared by both panels. Adding a list to a ggplot adds its elements
# in order, so later theme() calls still override earlier ones.
utr5_fig_theme <- list(
  theme_bw(),
  theme(axis.text = element_text(size = 20)),
  theme(
    axis.title.x = element_blank(),
    axis.text.x = element_text(size = 30),
    axis.text.y = element_text(size = 40),
    axis.title.y = element_text(size = 40),
    axis.ticks.length = unit(.4, "cm"),
    axis.ticks = element_line(size = 4)
  )
)

# Wide panel: full x-range, dashed marker at 10000 (the cut-off used for the
# narrow panel).
p1 <- ggplot(utr5_fig_df, aes(x = dist)) +
  geom_histogram(binwidth = 50) +
  geom_vline(xintercept = 10000, col = "red", linetype = "dashed") +
  utr5_fig_theme

# Narrow panel: x-axis limited to 0..10000, dashed marker at 200.
# `limits` is spelled out in full instead of relying on partial matching.
p2 <- ggplot(utr5_fig_df, aes(x = dist)) +
  geom_histogram(binwidth = 50) +
  scale_x_continuous(limits = c(0, 10000)) +
  geom_vline(xintercept = 200, col = "red", linetype = "dashed") +
  utr5_fig_theme

ggsave(
  "/home/users/ayh/Projects/27_A3B/06_Figure_code/edit_figure/Fig4/230620/UTR_dist_total_wide.pdf",
  p1,
  width = 10,
  height = 8
)
ggsave(
  "/home/users/ayh/Projects/27_A3B/06_Figure_code/edit_figure/Fig4/230620/UTR_dist_total_narrow.pdf",
  p2,
  width = 10,
  height = 8
)


# ---- 5'UTR distance, split by TP53 status ----------------------------------
# Show the narrow figure panel in the active graphics device.
p2

# De-duplicated (after dropping `id`) UTR5 variants with
#   dist      = POS - start for gene_dir "+", otherwise end - POS
#   dist_perc = dist as a fraction of the interval width
utr5_tp53_df <- genic_ref_merge_df %>%
  select(-id) %>%
  unique() %>%
  filter(Func_refGene %in% c("UTR5")) %>%
  mutate(dist = ifelse(gene_dir == "+", POS - start, end - POS)) %>%
  mutate(dist_perc = dist / width)

# Histogram in 50-unit bins, x-axis limited to 0..10000, marker at 200.
# `limits` is spelled out in full here and below instead of relying on
# partial argument matching.
utr5_tp53_df %>%
  ggplot(aes(x = dist)) +
  geom_histogram(binwidth = 50) +
  facet_wrap(~TP53) +
  scale_x_continuous(limits = c(0, 10000)) +
  theme_bw() +
  theme(axis.text = element_text(size = 20)) +
  geom_vline(xintercept = 200, col = "red", linetype = "dashed")

# Density on a log10 axis.
utr5_tp53_df %>%
  ggplot(aes(x = dist)) +
  geom_density() +
  facet_wrap(~TP53) +
  scale_x_log10(limits = c(1, 10000))

# Histogram (default bins) on a log10 axis.
utr5_tp53_df %>%
  ggplot(aes(x = dist)) +
  geom_histogram() +
  facet_wrap(~TP53) +
  scale_x_log10(limits = c(1, 10000))

# First 100 rows ordered by TP53 status, then by distance.
utr5_tp53_df %>%
  arrange(TP53, dist) %>%
  select(TP53, dist, width) %>%
  print(n = 100)

# ---- 5'UTR variants in TP53 wild-type samples only -------------------------
# De-duplicated (after dropping `id`) UTR5 variants with TP53 == "WT", plus
# dist = POS - start for gene_dir "+", otherwise end - POS.
utr5_wt_df <- genic_ref_merge_df %>%
  select(-id) %>%
  unique() %>%
  filter(Func_refGene %in% c("UTR5"), TP53 == "WT") %>%
  mutate(dist = ifelse(gene_dir == "+", POS - start, end - POS))

# Histogram in 50-unit bins of the variants that have a defined distance.
utr5_wt_hist_df <- utr5_wt_df %>%
  filter(!is.na(dist)) %>%
  mutate(dist_perc = dist / width)

ggplot(utr5_wt_hist_df, aes(x = dist)) +
  geom_histogram(binwidth = 50) +
  facet_wrap(~TP53, scales = "free_x") +
  scale_x_continuous()

# First 100 rows ordered by distance.
utr5_wt_sorted_df <- arrange(utr5_wt_df, dist)
print(utr5_wt_sorted_df, n = 100)

# ---- Consistency checks on the distance columns ----------------------------
# `dist` and `dist_perc` are not stored on `genic_ref_merge_df` anywhere in
# the visible script, so they are computed once here and reused by both checks
# (the second check previously filtered on a column that did not exist).
#   dist      = POS - start for gene_dir "+", otherwise end - POS
#   dist_perc = dist as a fraction of the interval width
genic_dist_check_df <- genic_ref_merge_df %>%
  mutate(
    dist = ifelse(gene_dir == "+", POS - start, end - POS),
    dist_perc = dist / width
  )

# Variants for which no relative position could be computed.
genic_dist_check_df %>%
  arrange(-dist_perc) %>%
  select(
    `#CHROM`, POS, REF, ALT, Func_refGene, Gene_refGene, gene_dir,
    start, end, width, strand, dist, dist_perc
  ) %>%
  filter(is.na(dist_perc))

# Variants lying exactly on the far boundary of their interval. The exact
# comparison is safe here: a ratio of two equal finite numbers is exactly 1.
genic_dist_check_df %>%
  filter(dist_perc == 1)

# First 100 UTR5 variants ordered by distance. `id` is kept here, so rows are
# only de-duplicated when they are identical in every column.
genic_ref_merge_df %>%
  unique() %>%
  filter(Func_refGene %in% c("UTR5")) %>%
  mutate(dist = ifelse(gene_dir == "+", POS - start, end - POS)) %>%
  dplyr::select(
    `#CHROM`, POS, REF, ALT, id, Func_refGene, Gene_refGene, TP53,
    sig_cont, mut_type, gene_dir, width, dist
  ) %>%
  arrange(dist) %>%
  print(n = 100)


## UTR stat ##
# Summary statistics of 5'/3' UTR span per transcript record.
# Headerless tab-separated table, so columns get the default names V1, V2, ...
# NOTE(review): the column usage below matches the UCSC refGene layout
# (V3 chrom, V4 strand, V5/V6 txStart/txEnd, V7/V8 cdsStart/cdsEnd,
# V13 gene symbol, V14 cdsStartStat) -- confirm against the file.
bed_df <- read.csv(
  "/home/users/ayh/Projects/27_A3B/03_sequencing/single_clone_sequencing/WGS/28_genomic_feature/01_refernce/fin/UTR/hg19_refGene.protein_coding.txt",
  sep = "\t",
  header = FALSE
) %>%
  as_tibble() %>%
  filter(V14 != "unk") %>%
  mutate(V3 = gsub("chr", "", V3)) %>%
  # Keep autosomes and X. "X" has to be part of the vector itself: written as
  # a second argument to as.character() it was ignored and X was dropped.
  filter(V3 %in% c(as.character(1:22), "X")) %>%
  select(V3, V4, V5, V6, V7, V8, V13)

# Strand-aware spans between the transcript and coding boundaries:
#   "+" strand: UTR5 = V7 - V5, UTR3 = V6 - V8
#   otherwise:  UTR5 = V6 - V8, UTR3 = V7 - V5
# These are genomic spans taken from the four boundary columns only.
bed_df %>%
  mutate(
    UTR5 = ifelse(V4 == "+", V7 - V5, V6 - V8),
    UTR3 = ifelse(V4 == "+", V6 - V8, V7 - V5)
  ) %>%
  # group_by(V13) %>%
  dplyr::summarise(
    median_UTR5 = median(UTR5),
    median_UTR3 = median(UTR3),
    mean_UTR5 = mean(UTR5),
    mean_UTR3 = mean(UTR3)
  )
