library("tidyverse")
library("Rsamtools")

## Strongly favor fixed over scientific notation when numbers are printed.
options(scipen = 999)

###############################
## Analyze Softclipped Bases ##
###############################

## Load Data
## ----------

## Collect the paths (directory included) of every bam file in bams/human.

bam_files <- list.files(
	path = file.path("bams", "human"),
	pattern = ".*\\.bam$",
	full.names = TRUE
)

## Import each bam file as a tibble of softclipped reads.
##
## For every file the result keeps the reads whose cigar starts (flags 99, 0)
## or ends (flags 83, 16) with a softclip of at most 3 bases. Per the SAM flag
## convention these are forward and reverse first-mate/single-end reads.
## Columns added to qname, flag, cigar and seq:
##   seq_mode       - "paired" for flags 99/83, "unpaired" otherwise.
##   total_frags    - reads in the file (halved for paired rows) minus the
##                    softclipped reads discarded for having > 3 clipped bases.
##   softclip_bases - number of softclipped bases on the inspected read end.

bams <- map(bam_files, function(x) {
	## Only the fields used below are read. No flag filter is set, so the same
	## records are returned as with a plain scanBam(x) and n() is unaffected.
	fields <- ScanBamParam(what = c("qname", "flag", "cigar", "seq"))
	bam <- scanBam(x, param = fields) %>% pluck(1)

	## Store the sequences as plain character strings.
	bam$seq <- as.character(bam$seq)

	bam <- bam %>%
		as_tibble(.name_repair = "unique") %>%
		select(qname, flag, cigar, seq) %>%
		mutate(
			seq_mode = ifelse(flag %in% c(99, 83), "paired", "unpaired"),
			## n() counts every read in the file, before any filtering.
			total_frags = ifelse(seq_mode == "paired", n() / 2, n())
		) %>%
		filter(
			## `[0-9]+S` accepts a softclip of any length on both read ends, so
			## long softclips are all counted in n_filter below instead of being
			## silently dropped here for one strand only.
			(flag %in% c(99, 0) & str_detect(cigar, "^[0-9]+S")) |
			(flag %in% c(83, 16) & str_detect(cigar, "[0-9]+S$"))
		) %>%
		mutate(
			softclip_bases = ifelse(
				flag %in% c(99, 0),
				as.numeric(str_extract(cigar, "^[0-9]+")),
				as.numeric(str_replace(str_extract(cigar, "[0-9]+S$"), "S", ""))
			)
		)

	## Reads with more than 3 softclipped bases are removed from the data and
	## subtracted from the fragment total used as denominator downstream.
	n_filter <- sum(bam$softclip_bases > 3)

	bam %>%
		filter(softclip_bases <= 3) %>%
		mutate(total_frags = total_frags - n_filter)
})

## Name each element of bams after its file, without the .bam extension.
names(bams) <- str_replace(basename(bam_files), "\\.bam$", "")

## Softclipped Analysis
## ----------

## Retrieve softclipped bases.

## Reverse complement every DNA string of a character vector.
## rev() on a character vector only reorders its elements, so each string is
## split into its bases before being reversed. chartr() swaps all four bases in
## a single pass; chained replacements (G -> C, then C -> G) undo themselves.
## NA inputs stay NA and any character other than A, C, G, T is left as is.
reverse_complement <- function(x) {
	reversed <- vapply(
		strsplit(x, "", fixed = TRUE),
		function(bases) paste(rev(bases), collapse = ""),
		character(1),
		USE.NAMES = FALSE
	)
	reversed[is.na(x)] <- NA_character_
	chartr("ACGT", "TGCA", reversed)
}

## Adds to every sample:
##   softclip_seq    - the softclipped bases. For flags 99/0 these are the first
##                     softclip_bases bases of seq; for the other flags they are
##                     the last softclip_bases bases of seq, reverse complemented.
##   softclip_3prime - the last base of softclip_seq.
## The samples are then stacked into one tibble with a `sample` id column.
softclipped <- bams %>%
	map(function(bam) {
		mutate(
			bam,
			softclip_seq = ifelse(
				flag %in% c(99, 0),
				str_sub(seq, 1, softclip_bases),
				## + 1 because str_sub() includes both its start and end position.
				reverse_complement(
					str_sub(seq, nchar(seq) - softclip_bases + 1, nchar(seq))
				)
			),
			softclip_3prime = str_sub(softclip_seq, -1, -1)
		)
	}) %>%
	bind_rows(.id = "sample")

## 3' most softclipped base.
## Bar plot, one panel per sample, of the fraction of total fragments whose
## softclipped sequence ends in each base; saved as a 5 x 6 pdf.

p_threeprime <- softclipped %>%
	count(sample, total_frags, softclip_3prime) %>%
	mutate(
		frac_total = n / total_frags,
		softclip_3prime = factor(
			softclip_3prime,
			levels = c("A", "T", "G", "C", "N")
		)
	) %>%
	ggplot(aes(x = softclip_3prime, y = frac_total, fill = softclip_3prime)) +
		geom_col(width = 0.75) +
		scale_fill_viridis_d() +
		facet_wrap(vars(sample), ncol = 3) +
		theme_bw() +
		theme(
			text = element_text(size = 10),
			## The fill legend already identifies the bases.
			axis.text.x = element_blank(),
			axis.ticks.x = element_blank()
		)

ggsave(
	filename = "human_softclip_base.pdf",
	plot = p_threeprime,
	device = cairo_pdf,
	width = 5,
	height = 6
)


## Softclipped sizes.
## Bar plot, one panel per sample, of the fraction of total fragments with each
## number of softclipped bases (up to 3); saved as a 4 x 6 pdf.

p_lengths <- softclipped %>%
	count(sample, total_frags, softclip_bases) %>%
	filter(softclip_bases <= 3) %>%
	mutate(frac_total = n / total_frags) %>%
	ggplot(aes(x = softclip_bases, y = frac_total, fill = softclip_bases)) +
		## The fractions are already computed, so they are drawn as is with
		## geom_col(); geom_histogram(stat = "identity") warns about its
		## ignored binning parameters.
		geom_col(show.legend = FALSE, width = 0.75) +
		facet_wrap(vars(sample), ncol = 3) +
		theme_bw() +
		scale_fill_viridis_c() +
		theme(text = element_text(size = 10))

ggsave(
	filename = "human_softclip_lengths.pdf",
	plot = p_lengths,
	device = cairo_pdf,
	width = 4,
	height = 6
)
