library(tidyverse)
# if (!require("BiocManager", quietly = TRUE))
#   install.packages("BiocManager")
# BiocManager::install("SomaticSignatures")
library(SomaticSignatures)
#library(Rsamtools)
library(VariantAnnotation)
library(deconstructSigs)
library(ggfortify)
library(ggrepel)

# Show how to cite three of the packages loaded above. Each call is a
# top-level expression, so its value is auto-printed when the script is run
# interactively or through Rscript.
citation("SomaticSignatures")
citation("VariantAnnotation")
citation("deconstructSigs")

# Inputs.
# `ref` is passed as the second argument of readVcfAsVRanges().
# NOTE(review): presumably that argument is a genome label and the string is
# not opened as a file - confirm against the VariantAnnotation documentation.
ref <- "Mus_musculus.GRCm39.dna.toplevel.fa"
fasta_path <- "mm39.fa"
vcf_path <- "new_data/vcf_style_dvmp.vcf"

# Fail early with a clear message: a FaFile handle on a missing path would
# otherwise only fail later, when the sequence is first accessed.
missing_inputs <- c(fasta_path, vcf_path)
missing_inputs <- missing_inputs[!file.exists(missing_inputs)]
if (length(missing_inputs) > 0) {
  stop(
    "Input file(s) not found: ", paste(missing_inputs, collapse = ", "),
    call. = FALSE
  )
}

# FASTA handle later handed to mutationContext() as the reference sequence.
genome <- FaFile(fasta_path)

# Variants from the VCF, loaded as a VRanges object.
vcfRange <- readVcfAsVRanges(vcf_path, ref)

# Sequence context of every variant at four window widths (k = 3, 7, 11, 21).
# NOTE(review): per the SomaticSignatures docs, k is the total window width
# including the variant position, and check = TRUE verifies the VCF reference
# allele against the FASTA - confirm for the installed version.
# TRUE is spelled out because `T` is an ordinary variable that can be
# reassigned.
sca_motifs_tri <- mutationContext(vcfRange, genome, k = 3, check = TRUE)
sca_motifs_7 <- mutationContext(vcfRange, genome, k = 7, check = TRUE)
sca_motifs_11 <- mutationContext(vcfRange, genome, k = 11, check = TRUE)
sca_motifs_21 <- mutationContext(vcfRange, genome, k = 21, check = TRUE)

# One row per variant: the character form of the variant's range (ID) plus
# its sequence context at each window width. All four motif objects were
# derived from the same vcfRange, so the columns line up row by row.
context <- data.frame(
  ID = as.character(sca_motifs_tri),
  trinucleotide = as.character(sca_motifs_tri$context),
  context_7 = as.character(sca_motifs_7$context),
  context_11 = as.character(sca_motifs_11$context),
  context_21 = as.character(sca_motifs_21$context)
)

# Print the table, then save it without row names.
context
write.csv(context, "filtering_mutation_context.csv", row.names = FALSE)


# Print the table as it stands before the flag columns are added.
context

# Flag variants flanked by a run of A or T. Each pattern matches a run of
# A/T immediately before the literal "." in the context string, or a run of
# A/T at the very end of the string.
# NOTE(review): the "." presumably marks the mutated position - confirm.

# Strong evidence: runs of 10 in the 21-wide context.
context[["putative_illumina_homopolymer_strong_21"]] <- grepl(
  "^A{10}\\.|^T{10}\\.|T{10}$|A{10}$",
  context[["context_21"]]
)

# Intermediate evidence: runs of 5 in the 11-wide context.
context[["putative_illumina_homopolymer_11"]] <- grepl(
  "^AAAAA\\.|^TTTTT\\.|TTTTT$|AAAAA$",
  context[["context_11"]]
)

# Weak evidence: runs of 3 in the 7-wide context.
context[["putative_illumina_homopolymer_weak_7"]] <- grepl(
  "^AAA\\.|^TTT\\.|TTT$|AAA$",
  context[["context_7"]]
)

# Distinct ID / flag combinations, computed once and reused for both the CSV
# export and the summary below.
homopolymer_flags <- context %>%
  dplyr::select(ID, starts_with("putative")) %>%
  distinct()

write.csv(
  homopolymer_flags,
  file = "filter_homopolymer.csv",
  row.names = FALSE
)

# Number of distinct rows flagged by each homopolymer filter (auto-printed).
homopolymer_flags %>%
  dplyr::select(-ID) %>%
  colSums()


# Variants flagged by the strong (21-wide) homopolymer filter. ID is split at
# ":" into CHROM and POS while the original ID column is kept.
# dplyr::filter is namespace-qualified to match the dplyr::select calls used
# throughout this script.
vars <- context %>%
  dplyr::filter(putative_illumina_homopolymer_strong_21) %>%
  distinct() %>%
  separate(ID, into = c("CHROM", "POS"), sep = ":", remove = FALSE)

# Export only the IDs of the flagged variants.
vars %>%
  dplyr::select(ID) %>%
  write.csv("remove_variants.csv", row.names = FALSE)