library(dplyr)
library(tidyverse)
library(stringi)
# Region lengths used below as denominators for mutation rates.
# Each file is read as a header-less CSV and only its first column (V1) is
# kept. The original wrapped the result in the deprecated `as.tibble()` just
# to extract `$V1`; reading the column directly yields the same vector.
# NOTE(review): downstream code treats both values as scalars
# (e.g. `mutate(len = exon_len)`), so each file presumably holds a single
# number -- confirm.
exon_len <- read.csv(
  "/home/users/ayh/Projects/27_A3B/03_sequencing/single_clone_sequencing/WGS/28_genomic_feature/01_refernce/fin/internal_intron_excl_overlap/protein_coding_exoonic_overlap_excl_len.TCN.txt",
  header = FALSE
)$V1
intron_len <- read.csv(
  "/home/users/ayh/Projects/27_A3B/03_sequencing/single_clone_sequencing/WGS/28_genomic_feature/01_refernce/fin/intron_excl_overlap/protein_coding_intronic_len.excl_overlap.TCN.txt",
  header = FALSE
)$V1


# Read a tab-separated variant table, drop exact duplicate rows, keep only
# single-base substitutions (REF and ALT both one of A/C/G/T) and add a
# strand-normalised `mut_type` label of the form "REF>ALT".
#
# Normalisation: calls whose REF is a pyrimidine (C or T) keep their label;
# calls whose REF is a purine (G or A) are complemented. The previous code
# only kept REF == "C" and complemented everything else, which turned
# T-reference calls into A-reference labels (e.g. T>C became "A>G" while
# A>G became "T>C"), so the two strands of one substitution class were not
# collapsed onto the same label. C/G-reference labels are unchanged.
read_snv_calls <- function(path) {
  read_tsv(path) %>%
    unique() %>%
    filter(
      REF %in% c("A", "C", "G", "T") &
        ALT %in% c("A", "C", "G", "T")
    ) %>%
    mutate(
      mut_type = ifelse(
        REF %in% c("C", "T"),
        paste(REF, ALT, sep = ">"),
        paste(
          chartr("ATGC", "TACG", REF),
          chartr("ATGC", "TACG", ALT),
          sep = ">"
        )
      )
    )
}

intron_df <- read_snv_calls("/home/users/ayh/Projects/27_A3B/03_sequencing/single_clone_sequencing/WGS/28_genomic_feature/01_refernce/fin/internal_intron_excl_overlap/A3A.total.intron.excl_overlap.raw.rh.vcf")

exon_df <- read_snv_calls("/home/users/ayh/Projects/27_A3B/03_sequencing/single_clone_sequencing/WGS/28_genomic_feature/01_refernce/fin/internal_intron_excl_overlap//A3A.total.internal_excl_overlap.exon.raw.rh.vcf")

# Sanity check: list any rows of the intron table annotated as "exonic".
intron_df %>% filter(Func_refGene == "exonic")
##by sample
###########

# Count calls per (id, TP53) in each region class, attach the region length
# and the resulting rate (count / length), then stack both classes into one
# long table with a `class` column ("intron" / "exon").
intron_exon_sample_df <- bind_rows(
  intron_df %>%
    group_by(id, TP53) %>%
    dplyr::summarise(n = n()) %>%
    mutate(len = intron_len, rate = n / intron_len, class = "intron"),
  exon_df %>%
    group_by(id, TP53) %>%
    dplyr::summarise(n = n()) %>%
    mutate(len = exon_len, rate = n / exon_len, class = "exon")
)

# Spot checks for one id: raw intron row count, raw exon rows, summary rows.
nrow(filter(intron_df, id == "A3A_1st_C3_100ng-1"))
filter(exon_df, id == "A3A_1st_C3_100ng-1")
filter(intron_exon_sample_df, id == "A3A_1st_C3_100ng-1")

# Wide view per (id, TP53): exon / intron counts side by side, their rates,
# and each rate's share of the summed exon + intron rate.
intron_exon_sample_df %>%
  select(-len, -rate) %>%
  spread(class, n) %>%
  mutate(
    exon_rate = exon / exon_len,
    intron_rate = intron / intron_len,
    exon_rate_ratio = exon_rate / (exon_rate + intron_rate),
    intron_rate_ratio = intron_rate / (intron_rate + exon_rate)
  )


# Per-sample table combining counts, rates and rate shares.
#
# The left side holds exon / intron counts plus their rates; the right side
# holds each class's share of the summed rate. The join is keyed explicitly
# on (id, TP53): without `by`, left_join() also matched on the `exon` and
# `intron` columns, which contain counts on the left and rates on the right,
# so no row matched and the ratio columns came back as NA. The right side
# therefore only carries the key and ratio columns (its exon / intron rate
# columns duplicate exon_rate / intron_rate). `exon_rato` is spelled
# `exon_ratio` here.
left_join(
  intron_exon_sample_df %>%
    ungroup() %>%
    select(id, TP53, class, n) %>%
    spread(class, n) %>%
    mutate(
      exon_rate = exon / exon_len,
      intron_rate = intron / intron_len
    ),
  intron_exon_sample_df %>%
    ungroup() %>%
    select(id, TP53, rate, class) %>%
    spread(class, rate) %>%
    mutate(
      exon_ratio = exon / (exon + intron),
      intron_ratio = intron / (exon + intron)
    ) %>%
    select(id, TP53, exon_ratio, intron_ratio),
  by = c("id", "TP53")
)

# Distribution of per-sample rates by TP53 value, coloured by region class.
ggplot(intron_exon_sample_df, aes(x = TP53, y = rate, col = class)) +
  geom_boxplot()


##by TP53
#########


# Counts per TP53 value. The `id` and `FORMAT` columns are dropped and the
# rows de-duplicated before counting, so rows that differ only in those two
# columns are counted once.
intron_exon_group_df <- bind_rows(
  intron_df %>%
    select(-id, -FORMAT) %>%
    unique() %>%
    group_by(TP53) %>%
    dplyr::summarise(n = n()) %>%
    mutate(len = intron_len, rate = n / intron_len, class = "intron"),
  exon_df %>%
    select(-id, -FORMAT) %>%
    unique() %>%
    group_by(TP53) %>%
    dplyr::summarise(n = n()) %>%
    mutate(len = exon_len, rate = n / exon_len, class = "exon")
)


# Rates per TP53 value and each class's share of the summed rate.
# NOTE(review): the divisor 6 is presumably the number of samples per TP53
# group -- confirm. It cancels out in the two *_rate_ratio columns.
intron_exon_group_df %>%
  select(TP53, class, n) %>%
  spread(class, n) %>%
  mutate(
    exon_rate = exon / exon_len / 6,
    intron_rate = intron / intron_len / 6,
    exon_rate_ratio = exon_rate / (exon_rate + intron_rate),
    intron_rate_ratio = intron_rate / (intron_rate + exon_rate)
  )




##total
#######

# Overall counts: drop `id`, `FORMAT` and `TP53`, de-duplicate the remaining
# rows, and count what is left in each region class.
intron_exon_tot_df <- bind_rows(
  intron_df %>%
    select(-id, -FORMAT, -TP53) %>%
    unique() %>%
    dplyr::summarise(n = n()) %>%
    mutate(len = intron_len, rate = n / intron_len, class = "intron"),
  exon_df %>%
    select(-id, -FORMAT, -TP53) %>%
    unique() %>%
    dplyr::summarise(n = n()) %>%
    mutate(len = exon_len, rate = n / exon_len, class = "exon")
)

# Share of raw counts falling in each class.
intron_exon_tot_df %>%
  select(class, n) %>%
  spread(class, n) %>%
  mutate(
    exon_ratio = exon / (exon + intron),
    intron_ratio = intron / (exon + intron)
  )

# Length-normalised rates and each class's share of the summed rate.
# NOTE(review): the divisor 12 is presumably the total number of samples --
# confirm. It cancels out in the two *_rate_ratio columns.
intron_exon_tot_df %>%
  select(n, class) %>%
  spread(class, n) %>%
  mutate(
    exon_rate = exon / exon_len / 12,
    intron_rate = intron / intron_len / 12,
    exon_rate_ratio = exon_rate / (exon_rate + intron_rate),
    intron_rate_ratio = intron_rate / (intron_rate + exon_rate)
  )



##C>T,C>G, separate##
#####################


##by sample
###########
# Check for rows labelled with the literal string "C>G,CT".
# NOTE(review): labels are built as "X>Y", so this looks like it can never
# match -- confirm what the check was meant to catch.
filter(intron_df, mut_type == "C>G,CT")

# Restrict `mut_type` to the two classes of interest. Every other label
# becomes NA because it is not among the factor levels.
intron_df <- mutate(
  intron_df,
  mut_type = factor(mut_type, levels = c("C>T", "C>G"))
)
exon_df <- mutate(
  exon_df,
  mut_type = factor(mut_type, levels = c("C>T", "C>G"))
)

# Per-sample counts and rates split by mutation type, for both region
# classes, stacked into one long table.
intron_exon_sample_df <- bind_rows(
  intron_df %>%
    group_by(id, TP53, mut_type) %>%
    dplyr::summarise(n = n()) %>%
    mutate(len = intron_len, rate = n / intron_len, class = "intron"),
  exon_df %>%
    group_by(id, TP53, mut_type) %>%
    dplyr::summarise(n = n()) %>%
    mutate(len = exon_len, rate = n / exon_len, class = "exon")
)


# Per-sample C>T vs C>G split of the exon counts.
# The earlier version also computed exon/intron rates and rate ratios here,
# but the following select() discarded them immediately, so that step is
# removed; the printed result is unchanged.
# Rows whose mut_type is NA (labels outside the factor levels) end up in an
# `<NA>` column and do not enter the two ratios.
intron_exon_sample_df %>%
  select(-len, -rate) %>%
  spread(class, n) %>%
  select(id, TP53, mut_type, exon) %>%
  spread(mut_type, exon) %>%
  mutate(
    `C>T_ratio` = `C>T` / (`C>T` + `C>G`),
    `C>G_ratio` = `C>G` / (`C>T` + `C>G`)
  )


# Same split for the intron counts.
intron_exon_sample_df %>%
  select(-len, -rate) %>%
  spread(class, n) %>%
  select(id, TP53, mut_type, intron) %>%
  spread(mut_type, intron) %>%
  mutate(
    `C>T_ratio` = `C>T` / (`C>T` + `C>G`),
    `C>G_ratio` = `C>G` / (`C>T` + `C>G`)
  )



# Per-sample, per-mutation-type table combining counts, rates and rate
# shares.
#
# Two fixes relative to the previous version:
#  * `mut_type` is kept as a key. The table has one row per
#    (id, TP53, mut_type, class); dropping mut_type before spread() left
#    several rows per (id, TP53, class), which spread() rejects as duplicate
#    identifiers.
#  * The join is keyed explicitly. Without `by`, left_join() also matched on
#    the `exon` / `intron` columns, which hold counts on the left and rates
#    on the right, so the ratio columns could never be filled.
# `exon_rato` is spelled `exon_ratio` here.
left_join(
  intron_exon_sample_df %>%
    ungroup() %>%
    select(id, TP53, mut_type, class, n) %>%
    spread(class, n) %>%
    mutate(
      exon_rate = exon / exon_len,
      intron_rate = intron / intron_len
    ),
  intron_exon_sample_df %>%
    ungroup() %>%
    select(id, TP53, mut_type, rate, class) %>%
    spread(class, rate) %>%
    mutate(
      exon_ratio = exon / (exon + intron),
      intron_ratio = intron / (exon + intron)
    ) %>%
    select(id, TP53, mut_type, exon_ratio, intron_ratio),
  by = c("id", "TP53", "mut_type")
)

# Distribution of per-sample rates by TP53 value, coloured by region class.
# NOTE(review): the rates of all mutation types are pooled in each box --
# confirm that this is intended for this section.
ggplot(intron_exon_sample_df, aes(x = TP53, y = rate, col = class)) +
  geom_boxplot()


##by TP53
#########


# Counts per (TP53, mut_type). `id` and `FORMAT` are dropped and the rows
# de-duplicated first, so rows differing only in those columns count once.
intron_exon_group_df <- bind_rows(
  intron_df %>%
    select(-id, -FORMAT) %>%
    unique() %>%
    group_by(TP53, mut_type) %>%
    dplyr::summarise(n = n()) %>%
    mutate(len = intron_len, rate = n / intron_len, class = "intron"),
  exon_df %>%
    select(-id, -FORMAT) %>%
    unique() %>%
    group_by(TP53, mut_type) %>%
    dplyr::summarise(n = n()) %>%
    mutate(len = exon_len, rate = n / exon_len, class = "exon")
)


# C>T vs C>G split of the exon counts per TP53 value.
# The earlier version computed exon/intron rates and rate ratios before a
# select() that discarded them; that unused step is removed and the printed
# result is unchanged.
# Rows with NA mut_type (labels outside the factor levels) land in an `<NA>`
# column and are not part of the two ratios.
intron_exon_group_df %>%
  select(TP53, mut_type, class, n) %>%
  spread(class, n) %>%
  select(TP53, mut_type, exon) %>%
  spread(mut_type, exon) %>%
  mutate(
    `C>T_ratio` = `C>T` / (`C>T` + `C>G`),
    `C>G_ratio` = `C>G` / (`C>T` + `C>G`)
  )


# Same split for the intron counts.
intron_exon_group_df %>%
  select(TP53, mut_type, class, n) %>%
  spread(class, n) %>%
  select(TP53, mut_type, intron) %>%
  spread(mut_type, intron) %>%
  mutate(
    `C>T_ratio` = `C>T` / (`C>T` + `C>G`),
    `C>G_ratio` = `C>G` / (`C>T` + `C>G`)
  )




##total
#######

# Overall counts per mutation type: drop `id`, `FORMAT` and `TP53`,
# de-duplicate the remaining rows, then count per mut_type in each class.
intron_exon_tot_df <- bind_rows(
  intron_df %>%
    select(-id, -FORMAT, -TP53) %>%
    unique() %>%
    group_by(mut_type) %>%
    dplyr::summarise(n = n()) %>%
    mutate(len = intron_len, rate = n / intron_len, class = "intron"),
  exon_df %>%
    select(-id, -FORMAT, -TP53) %>%
    unique() %>%
    group_by(mut_type) %>%
    dplyr::summarise(n = n()) %>%
    mutate(len = exon_len, rate = n / exon_len, class = "exon")
)

# Overall C>T vs C>G split of the exon counts.
# The earlier version computed count ratios / rates before a select() that
# discarded them; those unused steps are removed and the printed result is
# unchanged.
# Rows with NA mut_type (labels outside the factor levels) land in an `<NA>`
# column and are not part of the two ratios.
intron_exon_tot_df %>%
  select(mut_type, class, n) %>%
  spread(class, n) %>%
  select(mut_type, exon) %>%
  spread(mut_type, exon) %>%
  mutate(
    `C>T_ratio` = `C>T` / (`C>T` + `C>G`),
    `C>G_ratio` = `C>G` / (`C>T` + `C>G`)
  )

# Same split for the intron counts.
intron_exon_tot_df %>%
  select(mut_type, n, class) %>%
  spread(class, n) %>%
  select(mut_type, intron) %>%
  spread(mut_type, intron) %>%
  mutate(
    `C>T_ratio` = `C>T` / (`C>T` + `C>G`),
    `C>G_ratio` = `C>G` / (`C>T` + `C>G`)
  )




##`C>T`,`C>G` compare##
#######################


##by sample
###########
# Check for rows labelled with the literal string "C>G,CT".
# NOTE(review): this looks like it can never match -- confirm what the check
# was meant to catch.
filter(intron_df, mut_type == "C>G,CT")

# Per-sample counts and rates split by mutation type, for both region
# classes, stacked into one long table.
intron_exon_sample_df <- bind_rows(
  intron_df %>%
    group_by(id, TP53, mut_type) %>%
    dplyr::summarise(n = n()) %>%
    mutate(len = intron_len, rate = n / intron_len, class = "intron"),
  exon_df %>%
    group_by(id, TP53, mut_type) %>%
    dplyr::summarise(n = n()) %>%
    mutate(len = exon_len, rate = n / exon_len, class = "exon")
)


# Exon vs intron rates per (id, TP53, mut_type) and each class's share of
# the summed rate; print up to 100 rows.
intron_exon_sample_df %>%
  select(-len, -rate) %>%
  spread(class, n) %>%
  mutate(
    exon_rate = exon / exon_len,
    intron_rate = intron / intron_len,
    exon_rate_ratio = exon_rate / (exon_rate + intron_rate),
    intron_rate_ratio = intron_rate / (intron_rate + exon_rate)
  ) %>%
  print(n = 100)


# Per-sample, per-mutation-type table combining counts, rates and rate
# shares.
#
# Two fixes relative to the previous version:
#  * `mut_type` is kept as a key. The table has one row per
#    (id, TP53, mut_type, class); dropping mut_type before spread() left
#    several rows per (id, TP53, class), which spread() rejects as duplicate
#    identifiers.
#  * The join is keyed explicitly. Without `by`, left_join() also matched on
#    the `exon` / `intron` columns, which hold counts on the left and rates
#    on the right, so the ratio columns could never be filled.
# `exon_rato` is spelled `exon_ratio` here.
left_join(
  intron_exon_sample_df %>%
    ungroup() %>%
    select(id, TP53, mut_type, class, n) %>%
    spread(class, n) %>%
    mutate(
      exon_rate = exon / exon_len,
      intron_rate = intron / intron_len
    ),
  intron_exon_sample_df %>%
    ungroup() %>%
    select(id, TP53, mut_type, rate, class) %>%
    spread(class, rate) %>%
    mutate(
      exon_ratio = exon / (exon + intron),
      intron_ratio = intron / (exon + intron)
    ) %>%
    select(id, TP53, mut_type, exon_ratio, intron_ratio),
  by = c("id", "TP53", "mut_type")
)

# Distribution of per-sample rates by TP53 value, coloured by region class.
# NOTE(review): the rates of all mutation types are pooled in each box --
# confirm that this is intended for this section.
ggplot(intron_exon_sample_df, aes(x = TP53, y = rate, col = class)) +
  geom_boxplot()


##by TP53
#########


# Counts per (TP53, mut_type). `id` and `FORMAT` are dropped and the rows
# de-duplicated first, so rows differing only in those columns count once.
intron_exon_group_df <- bind_rows(
  intron_df %>%
    select(-id, -FORMAT) %>%
    unique() %>%
    group_by(TP53, mut_type) %>%
    dplyr::summarise(n = n()) %>%
    mutate(len = intron_len, rate = n / intron_len, class = "intron"),
  exon_df %>%
    select(-id, -FORMAT) %>%
    unique() %>%
    group_by(TP53, mut_type) %>%
    dplyr::summarise(n = n()) %>%
    mutate(len = exon_len, rate = n / exon_len, class = "exon")
)


# Exon vs intron rates per (TP53, mut_type) and each class's share of the
# summed rate.
# NOTE(review): the divisor 6 is presumably the number of samples per TP53
# group -- confirm. It cancels out in the two *_rate_ratio columns.
intron_exon_group_df %>%
  select(TP53, mut_type, class, n) %>%
  spread(class, n) %>%
  mutate(
    exon_rate = exon / exon_len / 6,
    intron_rate = intron / intron_len / 6,
    exon_rate_ratio = exon_rate / (exon_rate + intron_rate),
    intron_rate_ratio = intron_rate / (intron_rate + exon_rate)
  )




##total
#######

# Overall counts per mutation type: drop `id`, `FORMAT` and `TP53`,
# de-duplicate the remaining rows, then count per mut_type in each class.
intron_exon_tot_df <- bind_rows(
  intron_df %>%
    select(-id, -FORMAT, -TP53) %>%
    unique() %>%
    group_by(mut_type) %>%
    dplyr::summarise(n = n()) %>%
    mutate(len = intron_len, rate = n / intron_len, class = "intron"),
  exon_df %>%
    select(-id, -FORMAT, -TP53) %>%
    unique() %>%
    group_by(mut_type) %>%
    dplyr::summarise(n = n()) %>%
    mutate(len = exon_len, rate = n / exon_len, class = "exon")
)

# Share of raw counts falling in each class, per mutation type.
intron_exon_tot_df %>%
  select(mut_type, class, n) %>%
  spread(class, n) %>%
  mutate(
    exon_ratio = exon / (exon + intron),
    intron_ratio = intron / (exon + intron)
  )

# Length-normalised rates per mutation type and each class's share of the
# summed rate.
# NOTE(review): the divisor 12 is presumably the total number of samples --
# confirm. It cancels out in the two *_rate_ratio columns.
intron_exon_tot_df %>%
  select(mut_type, n, class) %>%
  spread(class, n) %>%
  mutate(
    exon_rate = exon / exon_len / 12,
    intron_rate = intron / intron_len / 12,
    exon_rate_ratio = exon_rate / (exon_rate + intron_rate),
    intron_rate_ratio = intron_rate / (intron_rate + exon_rate)
  )

