
library("tidyverse")
library("TSRchitect")
library("GenomicFeatures")
library("ChIPseeker")
library("cowplot")

# Penalise scientific notation heavily so numbers are rendered in fixed
# notation when they are converted to text (numeric values are pasted into
# file names and samtools command lines further down).
options(scipen = 999)

# Thread count handed to samtools and TSRchitect.
cores <- 8

# Gene annotation files (GTF), one per organism, kept in a shared directory.
genome_dir <- "genomes"
human_gtf <- file.path(genome_dir, "Homo_sapiens.GRCh38.98.gtf")
yeast_gtf <- file.path(genome_dir, "Saccharomyces_cerevisiae.R64-1-1.98.gtf")

#######################################
## Saturation Analysis of STRIPE-seq ##
#######################################

## Subsample Bams
## ----------

## Get BAM file names.

# Every aligned BAM under results/aligned, with its path relative to the
# working directory.
bam_files <- list.files(
	path = file.path("results", "aligned"),
	pattern = ".*\\.bam$",
	full.names = TRUE
)

## Calculate total number of reads.

# `samtools view -c` prints the number of records in a BAM on stdout; capture
# that line and convert it to a number, yielding one count per file.
read_counts <- map_dbl(bam_files, function(bam) {
	counted <- system2("samtools", paste("view -c", bam), stdout = TRUE)
	as.numeric(counted)
})

## Make table of samples and reads.

# One row per BAM: the sample name is the file name without the aligner
# suffix, kept alongside the file path and its record count.
sample_table <- tibble(
	sample = str_replace(basename(bam_files), "_Aligned\\.out\\.bam$", ""),
	file = bam_files,
	read_counts = read_counts
)

# The sample name encodes five underscore-separated fields; split them into
# their own columns while keeping the full name.
sample_table <- separate(
	sample_table,
	col = sample,
	into = c("technology", "strain_line", "input", "condition", "replicate"),
	sep = "_",
	remove = FALSE
)

# Derive the subsampling plan for each sample.
sample_table <- mutate(
	sample_table,
	organism = ifelse(strain_line == "K562", "Hsapien", "Scerevisiae"),
	# Only BY4741 is treated as single-end.
	seq_mode = ifelse(strain_line == "BY4741", "unpaired", "paired"),
	# A paired-end fragment contributes two records to the BAM.
	fragments = ifelse(seq_mode == "paired", floor(read_counts / 2), read_counts),
	# Step size of the subsampling series: 100k fragments for yeast, 1M otherwise.
	sample_size = ifelse(organism == "Scerevisiae", 100000, 1000000),
	# Number of whole steps that fit into the available fragments.
	n_samples = floor(fragments / sample_size)
)

sample_table <- arrange(sample_table, technology, strain_line, input, condition, replicate)

## Subsample bams.

# Output directory for the subsampled BAMs. Stay quiet when it already exists
# so the script can be re-run to resume an interrupted series.
dir.create("subsampled_bams", showWarnings = FALSE)

# For every sample, write a series of BAMs targeting 1x, 2x, ..., n_samples x
# `sample_size` fragments. samtools subsamples by fraction, so the resulting
# sizes are approximate. Each row of `sample_table` arrives as named arguments.
pwalk(sample_table, function(...) {
	args <- list(...)

	walk(seq_len(args$n_samples), function(x) {
		target_frags <- args$sample_size * x

		# `samtools view -s` expects SEED.FRACTION. The leading "0" of the
		# fraction is swapped for a random integer seed, so every subsample
		# uses a different seed. NOTE: no RNG seed is set in this script, so
		# the draws are not reproducible between runs.
		sample_frac <- (target_frags / args$fragments) %>%
			as.character %>%
			str_replace("^0", as.character(sample(1:1000000, 1)))

		new_file <- file.path("subsampled_bams", paste0(target_frags, "_", args$sample, ".bam"))

		# Skip work that is already done. An exact path test is used rather
		# than a regex search over the directory listing, which would also
		# match the ".bam.bai" index and treat "." as a wildcard.
		if (file.exists(new_file)) {
			return(invisible(NULL))
		}

		if (args$seq_mode == "paired") {
			# Subsample, then name-sort -> fixmate -> coordinate-sort ->
			# markdup, and finally keep properly paired reads (-f 3) while
			# dropping everything matched by the -F 3852 flag mask.
			command <- paste(
				"samtools view -bs", sample_frac, args$file, "|",
				"samtools sort -n -@", cores, "- |",
				"samtools fixmate -m - - |",
				"samtools sort -@", cores, "- |",
				"samtools markdup - - |",
				"samtools view -F 3852 -f 3 -O BAM -@", cores, "-o", new_file
			)
		} else {
			# Single-end: coordinate-sort, subsample, then drop everything
			# matched by the -F 3844 flag mask.
			command <- paste(
				"samtools sort -@", cores, args$file, "|",
				"samtools view -bs", sample_frac, "- |",
				"samtools view -F 3844 -O BAM -@", cores, "-o", new_file
			)
		}

		# A failed pipeline must not leave a BAM behind: it would be indexed
		# here and then treated as finished by the existence check above on
		# the next run.
		status <- system(command)
		if (status != 0) {
			unlink(new_file)
			warning(
				"Subsampling failed (exit status ", status, ") for ", new_file,
				"; output removed.",
				call. = FALSE, immediate. = TRUE
			)
			return(invisible(NULL))
		}

		system2("samtools", paste("index", new_file))
	})
})

## Retrieve TSS Stats
## ----------

## Get bam info.

# Start from the paths of all subsampled BAMs ...
subsampled_bams <- list.files("subsampled_bams", pattern = ".*\\.bam$", full.names = TRUE)

# ... and pair each path with its extension-less file name.
subsampled_bams <- tibble(
	file = subsampled_bams,
	file_name = str_replace(basename(subsampled_bams), "\\.bam", "")
)

# File names are "<fragments>_<sample name>", where the sample name itself
# holds five underscore-separated fields.
subsampled_bams <- separate(
	subsampled_bams,
	col = file_name,
	into = c("subsamples", "technology", "strain_line", "input", "condition", "replicate"),
	sep = "_"
)

subsampled_bams <- mutate(
	subsampled_bams,
	subsamples = as.numeric(subsamples),
	organism = ifelse(strain_line == "K562", "Hsapien", "Scerevisiae"),
	# Sample name = file name without the leading fragment count and ".bam".
	sample_name = str_replace(str_replace(basename(file), "^[0-9]+_", ""), "\\.bam$", "")
)

# One table per sample, each ordered by increasing subsample size.
subsampled_bams <- split(subsampled_bams, subsampled_bams$sample_name)
subsampled_bams <- map(subsampled_bams, function(per_sample) arrange(per_sample, subsamples))

## Load genome annotations.

# Build a transcript database (TxDb) from each organism's GTF; these are used
# as the annotation reference when TSSs are annotated.
human_txdb <- makeTxDbFromGFF(file = human_gtf)
yeast_txdb <- makeTxDbFromGFF(file = yeast_gtf)

## Process TSS info.

# Directory for the annotated TSS tables. Nothing else in this script creates
# it, so make sure it exists before any (expensive) analysis result is written.
dir.create("subsampled_TSSs", showWarnings = FALSE)

# Call and annotate TSSs for every subsampled BAM, one BAM at a time. The
# outer walk iterates over samples, the inner pwalk over that sample's
# subsampled BAMs (each row is passed as named arguments). One annotated table
# per BAM is written to subsampled_TSSs/<bam name>.tsv.
walk(subsampled_bams, ~ pwalk(., function(...) {
	args <- list(...)

	bam_id <- str_replace(basename(args$file), "\\.bam$", "")
	new_file <- paste0(bam_id, ".tsv")
	out_path <- file.path("subsampled_TSSs", new_file)

	# Results already present: skip, so the script can be resumed.
	if (file.exists(out_path)) {
		return(invisible(NULL))
	}

	# A failure in one BAM must not abort the whole series, but it should be
	# reported together with the file it belongs to.
	tryCatch({
		message(paste("...Analyzing:", args$sample_name))

		# Export a one-row sample sheet describing only the current BAM. It is
		# overwritten on every iteration.
		# NOTE(review): presumably TSRchitect resolves `sampleSheet` relative
		# to `inputDir`, which is why it is written there - confirm.
		tibble(
			SAMPLE = bam_id,
			ReplicateID = 1,
			FILE = args$file
		) %>%
			write.table(
				file.path("subsampled_bams", "sample_sheet.tsv"), sep = "\t",
				col.names = TRUE, row.names = FALSE, quote = FALSE
			)

		# Find TSSs. Only BY4741 libraries are single-end.
		paired_mode <- args$strain_line != "BY4741"

		tsrchitect_obj <- loadTSSobj(
			experimentTitle = "BAM Subsampling",
			inputDir = "subsampled_bams",
			n.cores = cores,
			isPairedBAM = paired_mode,
			isPairedBED = FALSE,
			sampleSheet = "sample_sheet.tsv",
			sampleNames = NA,
			replicateIDs = NA
		) %>%
			inputToTSS %>%
			processTSS(
				n.cores = cores,
				tssSet = "all",
				writeTable = FALSE
			)

		# Convert the TSS counts of the (single) sample to a GRanges object of
		# 1-bp ranges, carrying the tag count along as `score`.
		TSSs <- tsrchitect_obj@tssCountData %>%
			pluck(1) %>%
			dplyr::rename(seqnames = seq, start = TSS, score = nTAGs) %>%
			mutate("end" = start) %>%
			makeGRangesFromDataFrame(keep.extra.columns = TRUE)

		# Pick the organism-specific annotation and promoter window
		# (relative to the annotated TSS).
		if (args$organism == "Hsapien") {
			gtf <- human_txdb
			promoter_region <- c(-500, 500)
		} else {
			gtf <- yeast_txdb
			promoter_region <- c(-250, 100)
		}

		# Annotate GRanges and export as table.
		annotated_TSSs <- TSSs %>%
			annotatePeak(.,
				tssRegion = promoter_region,
				TxDb = gtf,
				level = "transcript",
				sameStrand = TRUE
			) %>%
			as_tibble

		write.table(
			annotated_TSSs, out_path,
			sep = "\t", col.names = TRUE, row.names = FALSE, quote = FALSE
		)
	}, error = function(e) {
		warning(
			"TSS analysis failed for ", bam_id, ": ", conditionMessage(e),
			call. = FALSE, immediate. = TRUE
		)
	})
}))

## Genomic Annotation Stats
## ----------

## Load data and get stats.

# Summarise every annotated TSS table into one row of statistics. The result
# is a list of one-row tibbles named after the file (without ".tsv").
#
# Names are derived from the very same file listing that is read, so they can
# never fall out of step with the data when the directory holds other files.
annotated_TSSs <- "subsampled_TSSs" %>%
	list.files(pattern = "\\.tsv$", full.names = TRUE) %>%
	set_names(str_replace(basename(.), "\\.tsv$", "")) %>%
	map(function(x) {
		# Keep only TSSs supported by at least 3 reads.
		filtered <- read.delim(x, header = TRUE, sep = "\t", stringsAsFactors = FALSE) %>%
			as_tibble(.name_repair = "unique") %>%
			filter(score >= 3)

		# Count promoter vs. non-promoter TSSs directly. Unlike reshaping a
		# count table, this always yields both columns, so a file in which one
		# category (or every row) is absent gives 0 / NaN instead of an error.
		# Rows with a missing annotation are counted in neither column.
		filtered %>%
			summarise(
				non_promoter = sum(annotation != "Promoter", na.rm = TRUE),
				promoter = sum(annotation == "Promoter", na.rm = TRUE),
				promoter_frac = promoter / (promoter + non_promoter),
				unique_genes = n_distinct(geneId)
			)
	})

# Stack the per-file statistics into one table. The list names
# ("<fragments>_<sample name>") become the `sample` column, from which the
# fragment count and sample name are extracted before `sample` is split into
# its underscore-separated fields.
annotated_TSSs <- annotated_TSSs %>%
	bind_rows(.id = "sample") %>%
	mutate(
		sampled_frags = str_extract(sample, "^[0-9]+") %>% as.numeric,
		sample_name = str_replace(sample, "^[0-9]+_", "")
	) %>%
	separate(
		col = sample, sep = "_",
		into = c("subsamples", "technology", "strain_line", "input", "condition", "replicate")
	) %>%
	# `subsamples` is still character here, so sorting on it would put
	# "1000000" before "200000"; order on the numeric fragment count instead.
	arrange(technology, strain_line, input, condition, replicate, sampled_frags)

# Save the combined statistics as a tab-separated table with a header row,
# without row names or quoting.
write.table(
	x = annotated_TSSs,
	file = "saturation_values.tsv",
	quote = FALSE,
	sep = "\t",
	row.names = FALSE,
	col.names = TRUE
)

## Prepare data for plotting.

# Add the columns the plot maps onto: numeric x values, a per-library label
# for colour, a coarser technology/input/condition label for point shape, and
# the organism.
anno_plotting <- mutate(
	annotated_TSSs,
	subsamples = as.numeric(subsamples),
	sample = paste(technology, strain_line, input, condition, replicate, sep = "_"),
	sample_group = paste(technology, input, condition),
	organism = ifelse(strain_line == "K562", "Hsapien", "Scerevisiae")
)

# One table per organism, so each organism gets its own panel and scales.
anno_plotting <- group_split(anno_plotting, organism)

## Plot stats.

# One saturation curve panel per organism: unique genes detected against the
# number of subsampled fragments, one line per library.
p <- map(anno_plotting, function(organism_stats) {
	ggplot(organism_stats, aes(x = subsamples, y = unique_genes, color = sample)) +
		geom_line() +
		geom_point(size = 2, aes(shape = sample_group)) +
		scale_color_viridis_d() +
		# The argument is `scales`; it is spelled out in full rather than
		# relying on partial argument matching of `scale`.
		facet_wrap(. ~ organism, ncol = 1, scales = "free") +
		theme_bw()
})

# Stack the per-organism panels vertically into a single figure.
p <- plot_grid(plotlist = p, ncol = 1)

# Write the combined figure to a 14 x 12 PDF through the Cairo PDF device.
ggsave(
	filename = "saturation_plot.pdf",
	plot = p,
	device = cairo_pdf,
	width = 14,
	height = 12
)
