library(dplyr)
library(tidyverse)
library(stringi)
# Reference lengths used as the denominators of every mutation rate below
# (3' and 5' UTR, judging by the file names). Each file is read without a
# header and its first column (V1) is used directly; the former detour through
# the deprecated as.tibble() added nothing and has been removed.
# NOTE(review): the rest of the script uses each value as a single number --
# confirm that each file holds exactly one line.
UTR3_len <- read.csv(
  "/home/users/ayh/Projects/27_A3B/03_sequencing/single_clone_sequencing/WGS/28_genomic_feature/01_refernce/fin/UTR/protein_coding_UTR3_len.TCN.txt",
  header = FALSE
)$V1
UTR5_len <- read.csv(
  "/home/users/ayh/Projects/27_A3B/03_sequencing/single_clone_sequencing/WGS/28_genomic_feature/01_refernce/fin/UTR/protein_coding_UTR5_len.TCN.txt",
  header = FALSE
)$V1


# Read one tab-separated variant table and prepare it for counting:
#   1. drop exact duplicate rows,
#   2. keep rows whose REF and ALT are each one of A/C/G/T,
#   3. add `mut_type` as "REF>ALT": rows with REF == "C" are used as they are,
#      every other row has both alleles complemented (A<->T, G<->C) first.
# NOTE(review): under this rule a T reference is labelled "A>N" and an A
# reference "T>N"; only C and G references end up as "C>N".
read_utr_snvs <- function(vcf_path) {
  read_tsv(vcf_path) %>%
    unique() %>%
    filter(
      REF %in% c("A", "C", "G", "T") &
        ALT %in% c("A", "C", "G", "T")
    ) %>%
    mutate(
      mut_type = ifelse(
        REF == "C",
        paste(REF, ALT, sep = ">"),
        paste(
          chartr("ATGC", "TACG", REF),
          chartr("ATGC", "TACG", ALT),
          sep = ">"
        )
      )
    )
}

UTR5_df <- read_utr_snvs("/home/users/ayh/Projects/27_A3B/03_sequencing/single_clone_sequencing/WGS/28_genomic_feature/01_refernce/fin/UTR/A3A.total.UTR5.rh.vcf")

UTR3_df <- read_utr_snvs("/home/users/ayh/Projects/27_A3B/03_sequencing/single_clone_sequencing/WGS/28_genomic_feature/01_refernce/fin/UTR/A3A.total.UTR3.rh.vcf")

# Interactive check: rows of the UTR5 table whose Func_refGene is "UTR3ic".
UTR5_df %>%
  filter(Func_refGene == "UTR3ic")
##by sample
###########

# Per-sample mutation counts and rates for each UTR class.
# One row per (id, TP53, class):
#   n     number of rows of that sample in the class table
#   len   reference length of the class (UTR5_len / UTR3_len)
#   rate  n / len
# The grouping left over from summarise() is dropped so that later steps work
# on a plain tibble.
UTR5_UTR3_sample_df <- bind_rows(
  UTR5_df %>%
    group_by(id, TP53) %>%
    dplyr::summarise(n = n()) %>%
    ungroup() %>%
    mutate(len = UTR5_len, rate = n / UTR5_len, class = "UTR5"),
  UTR3_df %>%
    group_by(id, TP53) %>%
    dplyr::summarise(n = n()) %>%
    ungroup() %>%
    mutate(len = UTR3_len, rate = n / UTR3_len, class = "UTR3")
)

# Interactive spot checks for a single sample.
UTR5_df %>%
  filter(id == "A3A_1st_C3_100ng-1") %>%
  nrow()
UTR3_df %>%
  filter(id == "A3A_1st_C3_100ng-1")
UTR5_UTR3_sample_df %>%
  filter(id == "A3A_1st_C3_100ng-1")

# Wide table: counts per class, rates per class, and each class's share of the
# summed rate.
UTR5_UTR3_sample_df %>%
  select(-len, -rate) %>%
  spread(class, n) %>%
  mutate(
    UTR3_rate = UTR3 / UTR3_len,
    UTR5_rate = UTR5 / UTR5_len
  ) %>%
  mutate(
    UTR3_rate_ratio = UTR3_rate / (UTR3_rate + UTR5_rate),
    UTR5_rate_ratio = UTR5_rate / (UTR5_rate + UTR3_rate)
  )

# Same information assembled from two wide tables (counts on the left, rate
# shares on the right).
# The join is restricted to the sample keys: without `by`, left_join() would
# also match on the UTR3/UTR5 columns, which hold counts on the left and rates
# on the right, so no row would ever match and the ratio columns would be NA.
left_join(
  UTR5_UTR3_sample_df %>%
    select(id, TP53, class, n) %>%
    spread(class, n) %>%
    mutate(
      UTR3_rate = UTR3 / UTR3_len,
      UTR5_rate = UTR5 / UTR5_len
    ),
  UTR5_UTR3_sample_df %>%
    select(id, TP53, rate, class) %>%
    spread(class, rate) %>%
    mutate(
      UTR3_ratio = UTR3 / (UTR3 + UTR5),
      UTR5_ratio = UTR5 / (UTR3 + UTR5)
    ) %>%
    select(id, TP53, UTR3_ratio, UTR5_ratio),
  by = c("id", "TP53")
)

# Distribution of per-sample rates by TP53 value, coloured by UTR class.
ggplot(UTR5_UTR3_sample_df, aes(x = TP53, y = rate, col = class)) +
  geom_boxplot()


##by TP53
#########


# Count rows per TP53 value after dropping `id` and `FORMAT` and removing the
# duplicates this creates, then attach the reference length, the rate and the
# class label.
# NOTE(review): the divisor 6 is presumably the number of samples per TP53
# group -- confirm.
count_unique_by_tp53 <- function(utr_df, utr_len, utr_class) {
  utr_df %>%
    select(-id, -FORMAT) %>%
    unique() %>%
    group_by(TP53) %>%
    dplyr::summarise(n = n()) %>%
    mutate(
      len = utr_len,
      rate = n / utr_len / 6,
      class = utr_class
    )
}

UTR5_UTR3_group_df <- rbind(
  count_unique_by_tp53(UTR5_df, UTR5_len, "UTR5"),
  count_unique_by_tp53(UTR3_df, UTR3_len, "UTR3")
)

# Wide table per TP53 value: counts, rates and each class's share of the
# summed rate.
UTR5_UTR3_group_df %>%
  select(TP53, class, n) %>%
  spread(class, n) %>%
  mutate(
    UTR3_rate = UTR3 / UTR3_len / 6,
    UTR5_rate = UTR5 / UTR5_len / 6,
    UTR3_rate_ratio = UTR3_rate / (UTR3_rate + UTR5_rate),
    UTR5_rate_ratio = UTR5_rate / (UTR5_rate + UTR3_rate)
  )




##total
#######

# Count all rows of one UTR table after dropping `id`, `FORMAT` and `TP53` and
# removing the duplicates this creates; returns a one-row table with the
# count, the reference length, the rate and the class label.
# NOTE(review): the divisor 12 is presumably the total number of samples --
# confirm.
count_unique_total <- function(utr_df, utr_len, utr_class) {
  utr_df %>%
    select(-id, -FORMAT, -TP53) %>%
    unique() %>%
    dplyr::summarise(n = n()) %>%
    mutate(
      len = utr_len,
      rate = n / utr_len / 12,
      class = utr_class
    )
}

UTR5_UTR3_tot_df <- rbind(
  count_unique_total(UTR5_df, UTR5_len, "UTR5"),
  count_unique_total(UTR3_df, UTR3_len, "UTR3")
)

# Share of each class in the raw counts.
UTR5_UTR3_tot_df %>%
  select(class, n) %>%
  spread(class, n) %>%
  mutate(
    UTR3_ratio = UTR3 / (UTR3 + UTR5),
    UTR5_ratio = UTR5 / (UTR3 + UTR5)
  )

# Share of each class in the length-normalised rates.
UTR5_UTR3_tot_df %>%
  select(n, class) %>%
  spread(class, n) %>%
  mutate(
    UTR3_rate = UTR3 / UTR3_len / 12,
    UTR5_rate = UTR5 / UTR5_len / 12,
    UTR3_rate_ratio = UTR3_rate / (UTR3_rate + UTR5_rate),
    UTR5_rate_ratio = UTR5_rate / (UTR5_rate + UTR3_rate)
  )



##C>T,C>G, separate##
#####################


##by sample
###########
# Interactive check: rows whose mut_type is the literal string "C>G,CT".
UTR5_df %>%
  filter(mut_type == "C>G,CT")

# Restrict mut_type to the two classes of interest. factor() turns every other
# value into NA, so those rows form their own NA group in the tables below.
UTR5_df$mut_type <- factor(UTR5_df$mut_type, levels = c("C>T", "C>G"))
UTR3_df$mut_type <- factor(UTR3_df$mut_type, levels = c("C>T", "C>G"))

# Per-sample, per-mutation-type counts and rates for each UTR class.
# One row per (id, TP53, mut_type, class); rate = n / reference length.
# The grouping left over from summarise() is dropped.
UTR5_UTR3_sample_df <- bind_rows(
  UTR5_df %>%
    group_by(id, TP53, mut_type) %>%
    dplyr::summarise(n = n()) %>%
    ungroup() %>%
    mutate(len = UTR5_len, rate = n / UTR5_len, class = "UTR5"),
  UTR3_df %>%
    group_by(id, TP53, mut_type) %>%
    dplyr::summarise(n = n()) %>%
    ungroup() %>%
    mutate(len = UTR3_len, rate = n / UTR3_len, class = "UTR3")
)

# C>T versus C>G share of the UTR3 counts, per sample.
# (The rate columns that used to be computed here were dropped again by the
# following select() and never reached the output, so they are omitted.)
UTR5_UTR3_sample_df %>%
  select(-len, -rate) %>%
  spread(class, n) %>%
  select(id, TP53, mut_type, UTR3) %>%
  spread(mut_type, UTR3) %>%
  mutate(
    `C>T_ratio` = `C>T` / (`C>T` + `C>G`),
    `C>G_ratio` = `C>G` / (`C>T` + `C>G`)
  )

# C>T versus C>G share of the UTR5 counts, per sample.
UTR5_UTR3_sample_df %>%
  select(-len, -rate) %>%
  spread(class, n) %>%
  select(id, TP53, mut_type, UTR5) %>%
  spread(mut_type, UTR5) %>%
  mutate(
    `C>T_ratio` = `C>T` / (`C>T` + `C>G`),
    `C>G_ratio` = `C>G` / (`C>T` + `C>G`)
  )

# Counts and rates per class joined with each class's share of the summed
# rate, per sample and mutation type.
# mut_type must stay in the key: without it every (id, TP53, class) has one
# row per mutation type and spread() stops with a duplicate-identifier error.
# The join is restricted to the key columns because the UTR3/UTR5 columns hold
# counts on the left and rates on the right and would never match.
left_join(
  UTR5_UTR3_sample_df %>%
    select(id, TP53, mut_type, class, n) %>%
    spread(class, n) %>%
    mutate(
      UTR3_rate = UTR3 / UTR3_len,
      UTR5_rate = UTR5 / UTR5_len
    ),
  UTR5_UTR3_sample_df %>%
    select(id, TP53, mut_type, rate, class) %>%
    spread(class, rate) %>%
    mutate(
      UTR3_ratio = UTR3 / (UTR3 + UTR5),
      UTR5_ratio = UTR5 / (UTR3 + UTR5)
    ) %>%
    select(id, TP53, mut_type, UTR3_ratio, UTR5_ratio),
  by = c("id", "TP53", "mut_type")
)

# Distribution of the per-sample, per-mutation-type rates by TP53 value,
# coloured by UTR class (mutation types are pooled within each box).
ggplot(UTR5_UTR3_sample_df, aes(x = TP53, y = rate, col = class)) +
  geom_boxplot()


##by TP53
#########


# Counts per (TP53, mut_type) for each UTR class, after dropping `id` and
# `FORMAT` and removing the duplicates this creates.
# The grouping left over from summarise() is dropped.
# NOTE(review): `rate` here is n / length without a further divisor, and it is
# not used by the tables below -- confirm whether a per-group sample count
# should be applied.
UTR5_UTR3_group_df <- bind_rows(
  UTR5_df %>%
    select(-id, -FORMAT) %>%
    unique() %>%
    group_by(TP53, mut_type) %>%
    dplyr::summarise(n = n()) %>%
    ungroup() %>%
    mutate(len = UTR5_len, rate = n / UTR5_len, class = "UTR5"),
  UTR3_df %>%
    select(-id, -FORMAT) %>%
    unique() %>%
    group_by(TP53, mut_type) %>%
    dplyr::summarise(n = n()) %>%
    ungroup() %>%
    mutate(len = UTR3_len, rate = n / UTR3_len, class = "UTR3")
)

# C>T versus C>G share of the UTR3 counts, per TP53 value.
# (Rate and rate-ratio columns used to be computed here and then dropped by
# the following select(); they never reached the output and are omitted.)
UTR5_UTR3_group_df %>%
  select(TP53, mut_type, class, n) %>%
  spread(class, n) %>%
  select(TP53, mut_type, UTR3) %>%
  spread(mut_type, UTR3) %>%
  mutate(
    `C>T_ratio` = `C>T` / (`C>T` + `C>G`),
    `C>G_ratio` = `C>G` / (`C>T` + `C>G`)
  )

# C>T versus C>G share of the UTR5 counts, per TP53 value.
UTR5_UTR3_group_df %>%
  select(TP53, mut_type, class, n) %>%
  spread(class, n) %>%
  select(TP53, mut_type, UTR5) %>%
  spread(mut_type, UTR5) %>%
  mutate(
    `C>T_ratio` = `C>T` / (`C>T` + `C>G`),
    `C>G_ratio` = `C>G` / (`C>T` + `C>G`)
  )




##total
#######

# Counts per mut_type for each UTR class, after dropping `id`, `FORMAT` and
# `TP53` and removing the duplicates this creates.
# NOTE(review): `rate` here is n / length without a further divisor, and it is
# not used by the tables below.
UTR5_UTR3_tot_df <- bind_rows(
  UTR5_df %>%
    select(-id, -FORMAT, -TP53) %>%
    unique() %>%
    group_by(mut_type) %>%
    dplyr::summarise(n = n()) %>%
    mutate(len = UTR5_len, rate = n / UTR5_len, class = "UTR5"),
  UTR3_df %>%
    select(-id, -FORMAT, -TP53) %>%
    unique() %>%
    group_by(mut_type) %>%
    dplyr::summarise(n = n()) %>%
    mutate(len = UTR3_len, rate = n / UTR3_len, class = "UTR3")
)

# C>T versus C>G share of the UTR3 counts.
# (Class-ratio columns used to be computed here and then dropped by the
# following select(); they never reached the output and are omitted.)
UTR5_UTR3_tot_df %>%
  select(mut_type, class, n) %>%
  spread(class, n) %>%
  select(mut_type, UTR3) %>%
  spread(mut_type, UTR3) %>%
  mutate(
    `C>T_ratio` = `C>T` / (`C>T` + `C>G`),
    `C>G_ratio` = `C>G` / (`C>T` + `C>G`)
  )

# C>T versus C>G share of the UTR5 counts.
# (Rate and rate-ratio columns were likewise computed and then dropped.)
UTR5_UTR3_tot_df %>%
  select(mut_type, n, class) %>%
  spread(class, n) %>%
  select(mut_type, UTR5) %>%
  spread(mut_type, UTR5) %>%
  mutate(
    `C>T_ratio` = `C>T` / (`C>T` + `C>G`),
    `C>G_ratio` = `C>G` / (`C>T` + `C>G`)
  )




##`C>T`,`C>G` compare##
#######################


##by sample
###########
# Interactive check: rows whose mut_type equals "C>G,CT".
UTR5_df %>%
  filter(mut_type == "C>G,CT")

# Per-sample, per-mutation-type counts and rates for each UTR class.
# One row per (id, TP53, mut_type, class); rate = n / reference length.
# The grouping left over from summarise() is dropped.
UTR5_UTR3_sample_df <- bind_rows(
  UTR5_df %>%
    group_by(id, TP53, mut_type) %>%
    dplyr::summarise(n = n()) %>%
    ungroup() %>%
    mutate(len = UTR5_len, rate = n / UTR5_len, class = "UTR5"),
  UTR3_df %>%
    group_by(id, TP53, mut_type) %>%
    dplyr::summarise(n = n()) %>%
    ungroup() %>%
    mutate(len = UTR3_len, rate = n / UTR3_len, class = "UTR3")
)

# Wide table per sample and mutation type: counts, rates and each class's
# share of the summed rate. Up to 100 rows are printed.
UTR5_UTR3_sample_df %>%
  select(-len, -rate) %>%
  spread(class, n) %>%
  mutate(
    UTR3_rate = UTR3 / UTR3_len,
    UTR5_rate = UTR5 / UTR5_len
  ) %>%
  mutate(
    UTR3_rate_ratio = UTR3_rate / (UTR3_rate + UTR5_rate),
    UTR5_rate_ratio = UTR5_rate / (UTR5_rate + UTR3_rate)
  ) %>%
  print(n = 100)

# Same information assembled from two wide tables.
# mut_type must stay in the key: without it every (id, TP53, class) has one
# row per mutation type and spread() stops with a duplicate-identifier error.
# The join is restricted to the key columns because the UTR3/UTR5 columns hold
# counts on the left and rates on the right and would never match.
left_join(
  UTR5_UTR3_sample_df %>%
    select(id, TP53, mut_type, class, n) %>%
    spread(class, n) %>%
    mutate(
      UTR3_rate = UTR3 / UTR3_len,
      UTR5_rate = UTR5 / UTR5_len
    ),
  UTR5_UTR3_sample_df %>%
    select(id, TP53, mut_type, rate, class) %>%
    spread(class, rate) %>%
    mutate(
      UTR3_ratio = UTR3 / (UTR3 + UTR5),
      UTR5_ratio = UTR5 / (UTR3 + UTR5)
    ) %>%
    select(id, TP53, mut_type, UTR3_ratio, UTR5_ratio),
  by = c("id", "TP53", "mut_type")
)

# Distribution of the per-sample, per-mutation-type rates by TP53 value,
# coloured by UTR class (mutation types are pooled within each box).
ggplot(UTR5_UTR3_sample_df, aes(x = TP53, y = rate, col = class)) +
  geom_boxplot()


##by TP53
#########


# Count rows per (TP53, mut_type) after dropping `id` and `FORMAT` and
# removing the duplicates this creates, then attach the reference length, the
# rate (n / length) and the class label.
count_unique_by_tp53_mut <- function(utr_df, utr_len, utr_class) {
  utr_df %>%
    select(-id, -FORMAT) %>%
    unique() %>%
    group_by(TP53, mut_type) %>%
    dplyr::summarise(n = n()) %>%
    mutate(
      len = utr_len,
      rate = n / utr_len,
      class = utr_class
    )
}

UTR5_UTR3_group_df <- rbind(
  count_unique_by_tp53_mut(UTR5_df, UTR5_len, "UTR5"),
  count_unique_by_tp53_mut(UTR3_df, UTR3_len, "UTR3")
)

# Wide table per TP53 value and mutation type: counts, rates and each class's
# share of the summed rate.
UTR5_UTR3_group_df %>%
  select(TP53, mut_type, class, n) %>%
  spread(class, n) %>%
  mutate(
    UTR3_rate = UTR3 / UTR3_len,
    UTR5_rate = UTR5 / UTR5_len,
    UTR3_rate_ratio = UTR3_rate / (UTR3_rate + UTR5_rate),
    UTR5_rate_ratio = UTR5_rate / (UTR5_rate + UTR3_rate)
  )




##total
#######

# Count rows per mut_type after dropping `id`, `FORMAT` and `TP53` and
# removing the duplicates this creates, then attach the reference length, the
# rate (n / length) and the class label.
count_unique_by_mut <- function(utr_df, utr_len, utr_class) {
  utr_df %>%
    select(-id, -FORMAT, -TP53) %>%
    unique() %>%
    group_by(mut_type) %>%
    dplyr::summarise(n = n()) %>%
    mutate(
      len = utr_len,
      rate = n / utr_len,
      class = utr_class
    )
}

UTR5_UTR3_tot_df <- rbind(
  count_unique_by_mut(UTR5_df, UTR5_len, "UTR5"),
  count_unique_by_mut(UTR3_df, UTR3_len, "UTR3")
)

# Share of each class in the raw counts, per mutation type.
UTR5_UTR3_tot_df %>%
  select(mut_type, class, n) %>%
  spread(class, n) %>%
  mutate(
    UTR3_ratio = UTR3 / (UTR3 + UTR5),
    UTR5_ratio = UTR5 / (UTR3 + UTR5)
  )

# Share of each class in the length-normalised rates, per mutation type.
UTR5_UTR3_tot_df %>%
  select(mut_type, n, class) %>%
  spread(class, n) %>%
  mutate(
    UTR3_rate = UTR3 / UTR3_len,
    UTR5_rate = UTR5 / UTR5_len,
    UTR3_rate_ratio = UTR3_rate / (UTR3_rate + UTR5_rate),
    UTR5_rate_ratio = UTR5_rate / (UTR5_rate + UTR3_rate)
  )

