library(dplyr)
library(tidyverse)
library(stringi)
library(Rsamtools)
library(parallel)
library(biomaRt)

# Human protein-coding genes ----

# Two handles onto the Ensembl gene mart (human dataset). Both names are kept
# because later code in this script uses each of them: `mart` for the
# interactive getBM() lookups and `ensembl` for the batched conversion.
ensembl <- useMart("ensembl", dataset = "hsapiens_gene_ensembl")
mart <- useMart(
  "ENSEMBL_MART_ENSEMBL",
  dataset = "hsapiens_gene_ensembl",
  host = "www.ensembl.org"
)

# ANNOVAR hg19 refGene table. It is read without a header, so columns are
# V1..Vn; elsewhere in this script V2 is matched against RefSeq mRNA
# accessions, V3 against chromosome names and V13 against gene symbols.
t_df <- read.csv(
  "/home/users/sypark/03_Tools/annovar/humandb/hg19_refGene.txt",
  header = FALSE,
  sep = "\t"
) %>%
  as_tibble()

# Number of distinct V13 values on chr1-22 and chrX.
# The chromosome filter must use "chrX" (not "X") to match the "chr"-prefixed
# names used for the autosomes; dplyr::select is spelled out because biomaRt
# is attached after dplyr and masks `select`.
t_df %>%
  filter(V3 %in% paste0("chr", c(1:22, "X"))) %>%
  dplyr::select(V13) %>%
  unique() %>%
  nrow()


# RefSeq Select transcript table (tab-separated, with a header row).
refseq_select_df <- read_tsv(
  "/home/users/ayh/Projects/27_A3B/03_sequencing/single_clone_sequencing/WGS/28_genomic_feature/01_refernce/intron/refSeq_Select.target.txt"
)

# Strip the trailing ".<digits>" version suffix from `name` so the accessions
# can be matched against the unversioned `refseq_mrna` values from biomaRt.
refseq_select_df <- refseq_select_df %>%
  mutate(name = gsub("\\.[0-9]*$", "", name))

# Coding-gene table. A stray `refseq_select_df$` used to precede this
# statement, which made R parse it as `refseq_select_df$gene_df <- ...` and
# left `gene_df` undefined.
gene_df <- read_tsv(
  "/home/users/ayh/Projects/27_A3B/03_sequencing/single_clone_sequencing/WGS/28_genomic_feature/01_refernce/intron/Homo_sapiens.GRCh37.87.coding.gtf"
)

# Assign consecutive rows to batches of 100 (1, 2, ...). Derived from the row
# count so it works for any table size; for 20,343 rows it yields the same
# 204 batches as the previous hard-coded vector.
gene_df <- gene_df %>%
  mutate(batch = (seq_len(n()) - 1L) %/% 100L + 1L)

# Ensembl ids of the first batch (used for ad-hoc testing).
target <- (gene_df %>% filter(batch == 1))$ensembl_id
# Attributes requested from biomaRt in every lookup of this section.
bm_attrs <- c("refseq_mrna", "ensembl_gene_id", "hgnc_symbol")

# Spot checks: the same mapping queried by symbol, by Ensembl gene id and by
# RefSeq mRNA accession.
getBM(attributes = bm_attrs, filters = "hgnc_symbol",
      values = c("SGIP1"), mart = mart)
getBM(attributes = bm_attrs, filters = "ensembl_gene_id",
      values = c("ENSG00000122432"), mart = mart)
getBM(attributes = bm_attrs, filters = "refseq_mrna",
      values = c("NM_000157"), mart = mart)
# (A bare `NM_000157` stood here; evaluating it raises "object not found",
# so it is kept only as this note.)

# Map every RefSeq Select accession to its Ensembl gene id and HGNC symbol.
ref_sel_bm <- getBM(attributes = bm_attrs, filters = "refseq_mrna",
                    values = refseq_select_df$name, mart = mart)

# Sanity checks on the mapping.
ref_sel_bm$hgnc_symbol %>% unique() %>% length()   # distinct symbols returned
ref_sel_bm$ensembl_gene_id %>% length()            # rows returned
refseq_select_df %>% nrow()                        # accessions queried
ref_sel_bm %>% filter(refseq_mrna == "NM_032291")
# RefSeq Select accessions that biomaRt did not return.
refseq_select_df %>% filter(!name %in% ref_sel_bm$refseq_mrna)

# Expand the mapping: for every HGNC symbol found above, fetch all RefSeq mRNA
# accessions and Ensembl gene ids that biomaRt lists for that symbol.
ref_sel_bm_expand <- getBM(
  attributes = c("refseq_mrna", "ensembl_gene_id", "hgnc_symbol"),
  filters = "hgnc_symbol",
  values = ref_sel_bm$hgnc_symbol,
  mart = mart
)

# An unfinished `ref_sel_bm_expand %>%` used to be piped into the next
# expression, which fails at run time; the two counts are independent.
refseq_select_df$name %>% unique() %>% length()
ref_sel_bm_expand$hgnc_symbol %>% unique() %>% length()

# Distinct refGene V13 values reachable through the expanded mapping, either
# by accession (V2) or by symbol (V13).
t_df %>%
  filter(V2 %in% ref_sel_bm_expand$refseq_mrna |
           V13 %in% ref_sel_bm_expand$hgnc_symbol) %>%
  dplyr::select(V13) %>%
  unique() %>%
  nrow()

ref_sel_bm_expand %>% filter(hgnc_symbol == "ACPP")

# Rows of the expanded mapping that are present in refGene by accession or by
# symbol, followed by the rows that match neither way.
check_df <- ref_sel_bm_expand %>%
  filter((refseq_mrna %in% t_df$V2) | (hgnc_symbol %in% t_df$V13))
check_df %>% filter(hgnc_symbol == "ASNSD1")
ref_sel_bm_expand %>%
  filter(!(hgnc_symbol %in% check_df$hgnc_symbol) &
           !(refseq_mrna %in% check_df$refseq_mrna))

# Individual look-ups used while investigating mismatches.
ref_sel_bm_expand %>% filter(hgnc_symbol == "CYRIB")
t_df %>% filter(V2 == "NM_001134194")

t_df %>% filter(V13 == "ACP3")
t_df %>% filter(V13 == "AHSG")


# Check 1 ----
# Which RefSeq Select entries are covered by the biomaRt mapping, matching by
# accession (`name`) or by gene symbol (`name2`)?

# dplyr::select is spelled out: biomaRt is attached after dplyr and masks
# `select` with the AnnotationDbi generic, which has no data-frame method.
refseq_select_df %>% dplyr::select(name2) %>% unique()

# Covered entries, written once via an explicit 0/1 flag ...
refseq_select_df %>%
  mutate(stat = ifelse((name %in% ref_sel_bm$refseq_mrna) |
                         (name2 %in% ref_sel_bm$hgnc_symbol), 1, 0)) %>%
  filter(stat == 1)
# ... entries matching neither way ...
refseq_select_df %>%
  filter(!(name %in% ref_sel_bm$refseq_mrna) &
           !(name2 %in% ref_sel_bm$hgnc_symbol))
# ... and the covered entries again, as a direct filter.
refseq_select_df %>%
  filter((name %in% ref_sel_bm$refseq_mrna) |
           (name2 %in% ref_sel_bm$hgnc_symbol))

# Pasted output of the "matching neither way" query:
#`#bin` name         chrom strand   txStart     txEnd  cdsStart    cdsEnd exonCount exonStarts exonEnds score name2  cdsStartStat cdsEndStat exonFrames
#1    121 NM_170753    10    -       50723150  50732327  50723378  50725160         2    5.07e15  5.07e15     0 PGBD3  cmpl         cmpl       0,-1,     
#2   1531 NM_001395906 11    +      124082413 124086535 124085461 124086394         3    1.24e26  1.24e26     0 OR8G3P cmpl         cmpl       -1,-1,0,  
#3    739 NM_172194    14    +       20215586  20216528  20215586  20216528         1    2.02e 7  2.02e 7     0 OR4Q3  cmpl         cmpl       0,   

# Manual decision for each of the three unmatched entries:
#t_df%>%filter(V2=="NM_170753") -> include
#t_df%>%filter(V2=="NM_001395906"|V13=="OR8G3P") -> exclude
#t_df%>%filter(V2=="NM_172194"|V13=="OR4Q3") -> include
## Check 2 ----
# For every gene symbol in the expanded biomaRt mapping, count how many of its
# rows are (stat = 1) / are not (stat = 0) present in refGene, matching either
# by accession (V2) or by symbol (V13).
refseq_select_df %>%
  filter(name2 == "ASNSD1")

stat_df <- mutate(
  ref_sel_bm_expand,
  stat = ifelse(refseq_mrna %in% t_df$V2 | hgnc_symbol %in% t_df$V13, 1, 0)
)

# Long table: one row per (symbol, stat) pair with its row count.
stat_sim_df <- dplyr::summarise(
  group_by(stat_df, hgnc_symbol, stat),
  n = n()
)

# Wide table: one row per symbol, one count column per stat value; missing
# combinations become 0.
stat_sim_count_df <- spread(stat_sim_df, stat, n)
stat_sim_count_df[is.na(stat_sim_count_df)] <- 0

# Symbols with no row found in refGene.
stat_sim_count_df %>%
  filter(`1` < 1)
stat_df %>%
  filter(hgnc_symbol == "ASDURF")
# Pasted output of the "no row found" query (first ten symbols):
#hgnc_symbol   `0`   `1`

#1 ADGRD2          2     0
#2 AKR1C8          2     0
#3 ANKRD40CL       2     0
#4 ASDURF          3     0
#5 B3GALT9         2     0
#6 BRD3OS          2     0
#7 C10orf143       2     0
#8 C13orf46        4     0
#9 C17orf113       1     0
#10 C17orf114       2     0
ref_sel_bm_expand %>%
  filter(hgnc_symbol == "ASNSD1")



# Row-level test for "this refGene row belongs to the target gene set".
# A row qualifies when its accession (V2) or its V13 value is found in the
# expanded mapping's `refseq_mrna` column, or when it is one of the manually
# included entries (NM_170753, NM_172194, OR4Q3).
# NOTE(review): `v13 %in% ref_sel_bm_expand$refseq_mrna` compares V13 with
# accessions; earlier checks in this script compare V13 with `hgnc_symbol`.
# The original comparison is kept so the output does not change -- confirm
# which one is intended.
is_target_row <- function(v2, v13) {
  v2 %in% ref_sel_bm_expand$refseq_mrna |
    v13 %in% ref_sel_bm_expand$refseq_mrna |
    v2 == "NM_170753" |
    v2 == "NM_172194" |
    v13 == "OR4Q3"
}

# Chromosomes kept in the final table.
target_chroms <- paste0("chr", c(1:22, "X"))

# Spot check: is KHDRBS2 among the selected V13 values?
# (dplyr::select throughout: biomaRt, attached after dplyr, masks `select`.)
t_df %>%
  filter(is_target_row(V2, V13)) %>%
  dplyr::select(V13) %>%
  unique() %>%
  filter(V13 == "KHDRBS2")

# Per V13 value, count rows that qualify (stat = 1) and rows that do not
# (stat = 0).
t_df_count_df <- t_df %>%
  mutate(stat = ifelse(is_target_row(V2, V13), 1, 0)) %>%
  dplyr::group_by(V13, stat) %>%
  dplyr::summarise(n = n())
t_df_count_df %>% spread(stat, n)

# Wide form: one row per V13 value with columns `0` and `1`; missing
# combinations become 0.
t_df_count_df <- t_df_count_df %>% spread(stat, n)
t_df_count_df[is.na(t_df_count_df)] <- 0

# Random sample of excluded V13 values, after dropping names containing
# "AS1", "LINC" or "LOC", for manual inspection.
t_df_count_df %>%
  filter(`1` < 1) %>%
  filter(!(grepl("AS1", V13))) %>%
  filter(!(grepl("LINC", V13))) %>%
  filter(!grepl("LOC", V13)) %>%
  dplyr::select(V13) %>%
  ungroup() %>%
  sample_n(10)

# V13 values with at least one qualifying row: the final target set.
t_df_target_df <- t_df_count_df %>% filter(`1` >= 1)

# Rows on chr1-22 / chrX, and a random sample of the non-qualifying rows among
# them (again without LINC / LOC names).
t_df %>% filter(V3 %in% target_chroms)
t_df %>%
  filter(V3 %in% target_chroms) %>%
  filter(!is_target_row(V2, V13)) %>%
  dplyr::select(V13) %>%
  filter(!grepl("LINC", V13) & !grepl("LOC", V13)) %>%
  sample_n(10)


# Preview: all refGene rows whose V13 value is in the target set.
t_df %>% filter(V13 %in% t_df_target_df$V13)

# Write the target rows on chr1-22 / chrX as a headerless, unquoted,
# tab-separated file.
# An unfinished `t_df %>%` used to precede write.table(); it passed `t_df` as
# `x` and pushed the filtered table into the `file` argument.
write.table(
  t_df %>%
    filter(V3 %in% paste0("chr", c(1:22, "X"))) %>%
    filter(V13 %in% t_df_target_df$V13),
  "/home/users/ayh/Projects/27_A3B/03_sequencing/single_clone_sequencing/WGS/28_genomic_feature/01_refernce/intron/hg19_refGene.protein_coding.txt",
  row.names = FALSE,
  quote = FALSE,
  col.names = FALSE,
  sep = "\t"
)




# Convert Ensembl gene ids to RefSeq mRNA accessions and HGNC symbols, one
# biomaRt query per batch of `gene_df`, with up to 15 queries in parallel.
# Batch ids are taken from the data instead of a hard-coded 1:204.
batch_ids <- sort(unique(gene_df$batch))

convert_tmp <- mclapply(batch_ids, function(x) {
  print(x)  # progress indicator
  target <- (gene_df %>% filter(batch == x))$ensembl_id
  getBM(
    attributes = c("refseq_mrna", "ensembl_gene_id", "hgnc_symbol"),
    filters = "ensembl_gene_id",
    values = target,
    mart = ensembl
  )
}, mc.cores = 15)

# mclapply() does not raise worker errors: a failed batch comes back as a
# "try-error" object (or NULL if the worker died). Stop here rather than
# silently binding an incomplete result.
batch_failed <- vapply(
  convert_tmp,
  function(res) is.null(res) || inherits(res, "try-error"),
  logical(1)
)
if (any(batch_failed)) {
  stop(
    "biomaRt query failed for batch(es): ",
    paste(batch_ids[batch_failed], collapse = ", "),
    call. = FALSE
  )
}

# One table with all batches stacked.
convert_df <- do.call(rbind, convert_tmp) %>% as_tibble()
convert_df

# refGene intervals in BED layout, read without a header (columns V1..Vn).
# Below, V1 is used as chromosome and V4 as RefSeq accession.
refgene_df <- read.csv(
  "/home/users/ayh/Projects/27_A3B/03_sequencing/single_clone_sequencing/WGS/28_genomic_feature/01_refernce/intron/refGene.bed",
  header = FALSE,
  sep = "\t"
)

convert_df

# Keep intervals whose accession is in the biomaRt conversion table, strip the
# "chr" prefix from the chromosome name and restrict to chromosomes 1-22 and X.
#refgene_df%>%filter(!refgene_df$V4%in%convert_df$refseq_mrna)%>%as.tibble()
refgene_sim_df <- refgene_df %>%
  filter(V4 %in% convert_df$refseq_mrna) %>%
  mutate(V1 = gsub("chr", "", V1)) %>%
  filter(V1 %in% c(1:22, "X"))

# Write symbol / Ensembl gene id / accession for every retained interval.
# dplyr::select is spelled out because biomaRt, attached after dplyr, masks
# `select`. The join can return several rows per interval when an accession
# maps to more than one row of `convert_df`.
write.table(
  left_join(refgene_sim_df, convert_df, by = c("V4" = "refseq_mrna")) %>%
    dplyr::select(hgnc_symbol, ensembl_gene_id, V4),
  "/home/users/ayh/Projects/27_A3B/03_sequencing/single_clone_sequencing/WGS/28_genomic_feature/01_refernce/intron/target_gene_list.txt",
  sep = "\t",
  quote = FALSE,
  row.names = FALSE
)
# Spot checks on the gene table and on the biomaRt conversion table.
gene_df %>% filter(gene_id == "ABCB1")
convert_df %>% filter(hgnc_symbol == "RSPH10B")

# `target_gene_df` was inspected below without ever being created; its only
# definition was a commented-out read of the target gene list written earlier
# in this script, so that read is restored ahead of its first use.
target_gene_df <- read_tsv("/home/users/ayh/Projects/27_A3B/03_sequencing/single_clone_sequencing/WGS/28_genomic_feature/01_refernce/intron/target_gene_list.txt")
target_gene_df
target_gene_df %>% filter(hgnc_symbol == "PQLC2L")

# Handle to the reference FASTA.
fasta_file <- FaFile(file = "/home/users/ayh/Projects/reference/genome/human/GRCh37/A3B/human_g1k_v37.rtTA.A3B_mcherry_vec.fa")


# GRanges(), IRanges(), reduce() and setdiff() on ranges come from
# GenomicRanges; attach it before its first use in this section.
library(GenomicRanges)

# Sequence-length table (name, length), read without a header. Each sequence
# becomes one interval [1, length]; "Y" and "MT" are excluded.
fai_df <- read.csv(
  "/home/users/ayh/Projects/27_A3B/03_sequencing/single_clone_sequencing/WGS/28_genomic_feature/01_refernce/GRCh37.length.txt",
  header = FALSE,
  sep = "\t"
)
colnames(fai_df) <- c("chrom", "end")
fai_df <- fai_df %>%
  mutate(start = 1) %>%
  dplyr::select(chrom, start, end) %>%
  as_tibble()

fai_df <- fai_df %>% filter(!chrom %in% c("Y", "MT"))

# Whole retained genome and the selected gene intervals as ranges. Both sets
# are put on strand "+", so the set operations below compare them on the same
# strand.
tot_len_gr <- GRanges(
  seqnames = fai_df$chrom,
  IRanges(start = as.numeric(fai_df$start), end = as.numeric(fai_df$end)),
  strand = "+"
)

gene_gr <- GRanges(
  seqnames = refgene_sim_df$V1,
  IRanges(start = as.numeric(refgene_sim_df$V2),
          end = as.numeric(refgene_sim_df$V3)),
  strand = "+"
)

# Intergenic space = genome minus the union of the gene intervals.
# A later statement used to overwrite `intergenic_gr` with a number computed
# from an undefined `cds_gr`, which broke the length calculation; it is
# removed.
intergenic_gr <- setdiff(tot_len_gr, reduce(gene_gr))

# Total covered bases: merge overlapping intervals, then sum their widths.
gene_len <- sum(width(reduce(gene_gr)))
intergenic_len <- sum(width(reduce(intergenic_gr)))


# Persist the two totals, each as a bare value in its own file (no header, no
# row names, no quoting).
genic_len_path <- "/home/users/ayh/Projects/27_A3B/03_sequencing/single_clone_sequencing/WGS/28_genomic_feature/01_refernce/intergenic/protein_coding_genic_len.txt"
intergenic_len_path <- "/home/users/ayh/Projects/27_A3B/03_sequencing/single_clone_sequencing/WGS/28_genomic_feature/01_refernce/intergenic/protein_coding_intergenic_len.txt"

# Total length covered by the selected genes.
write.table(
  gene_len,
  genic_len_path,
  sep = "\t",
  quote = F,
  col.names = F,
  row.names = F
)

# Total length of the remaining (intergenic) space.
write.table(
  intergenic_len,
  intergenic_len_path,
  sep = "\t",
  quote = F,
  col.names = F,
  row.names = F
)
