#!/usr/bin/env Rscript

library("tidyverse")

##################################
## Analysis of 5' Mapping Methods
##################################

## Load and Prepare Sample Sheet
## ----------

## Read the tab-delimited sample sheet and split it into a named list holding
## one tibble per value of the `organism` column.
samples <- local({
	sheet <- read.delim(
		"sample_sheet.tsv",
		header = TRUE, sep = "\t", stringsAsFactors = FALSE
	)
	sheet <- as_tibble(sheet)
	split(sheet, sheet$organism)
})

## Remove rRNA and Other Stuff
## ----------

## Download singularity container.

## Pull the GoSTRIPES image from Singularity Hub and save it as gostripes.simg.
system(paste(
	"singularity pull",
	"--name gostripes.simg",
	"shub://BrendelGroup/GoSTRIPES"
))

## Reference rRNA fasta files for each organism.

## Named list mapping an organism label ("human", "yeast") to the path of its
## rRNA reference FASTA inside the genomes directory.
rRNA <- setNames(
	as.list(file.path("genomes", c("Hs_rRNA.fa", "Sc_rRNA.fa"))),
	c("human", "yeast")
)

## Tagdust2 to remove rRNA.

## Run tagdust from the GoSTRIPES container on each sample's reads, passing the
## organism's rRNA FASTA as the -ref reference. Output is written under
## results/dusted_reads with the prefix
## <technology>_<strain_line>_<input>_<condition>_<replicate>; samples that
## already have a file containing that prefix are skipped.

## showWarnings = FALSE keeps reruns quiet when the directory already exists.
dir.create("./results/dusted_reads", recursive = TRUE, showWarnings = FALSE)

walk(samples, ~ pwalk(.x, function(...) {
	args <- list(...)

	## Output prefix, used both for the skip check and for tagdust's -o option.
	new_file <- paste(
		args$technology, args$strain_line, args$input,
		args$condition, args$replicate, sep = "_"
	)

	## Match the prefix literally: fixed() keeps characters such as "." in a
	## sample field from being interpreted as regular-expression syntax.
	already_dusted <- file.path("results", "dusted_reads") %>%
		list.files %>%
		str_detect(fixed(new_file)) %>%
		any

	if (!already_dusted) {
		## Human samples use the human rRNA reference; every other organism
		## falls through to the yeast reference.
		rRNA_file <- if (args$organism == "Hsapien") {
			pluck(rRNA, "human")
		} else {
			pluck(rRNA, "yeast")
		}

		## Only yeast samples with no R2 file are run single-end. `&&` is the
		## scalar, short-circuiting operator appropriate for an `if` condition.
		single_end <- args$organism == "Scerevisiae" && is.na(args$R2)

		reads <- if (single_end) {
			file.path("sequences", args$R1)
		} else {
			paste(
				file.path("sequences", args$R1),
				file.path("sequences", args$R2)
			)
		}

		command <- paste(
			"singularity exec",
			"-eCB /N/dc2/scratch/rpolicas/saturation",
			"-H /N/dc2/scratch/rpolicas/saturation",
			"gostripes.simg",
			"tagdust",
			"-ref", rRNA_file,
			"-fe 3", "-t 8", "-dust 97",
			"-o", file.path("results", "dusted_reads", new_file),
			"-1 R:N",
			reads
		)

		system(command)
	}
}))

## Generate STAR Genome Index
## ----------

## Download genome.

## Table of Ensembl release 98 downloads: for each organism, one annotation
## (GTF) URL followed by one assembly (FASTA) URL.
genomes <- tibble(
	organism = rep(c("human", "yeast"), each = 2),
	type = rep(c("annotation", "assembly"), times = 2),
	files = c(
		"ftp://ftp.ensembl.org/pub/release-98/gtf/homo_sapiens/Homo_sapiens.GRCh38.98.gtf.gz",
		"ftp://ftp.ensembl.org/pub/release-98/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna_sm.primary_assembly.fa.gz",
		"ftp://ftp.ensembl.org/pub/release-98/gtf/saccharomyces_cerevisiae/Saccharomyces_cerevisiae.R64-1-1.98.gtf.gz",
		"ftp://ftp.ensembl.org/pub/release-98/fasta/saccharomyces_cerevisiae/dna/Saccharomyces_cerevisiae.R64-1-1.dna_sm.toplevel.fa.gz"
	)
)

## Fetch each URL into the working directory with wget.
walk(genomes$files, function(url) system(paste("wget", url)))

## Unzip and move files.

## Decompress every *.gz file found in the working directory with gunzip.
walk(
	list.files(".", pattern = ".*\\.gz$", full.names = TRUE),
	function(archive) system(paste("gunzip", archive))
)

dir.create("./genomes")

## Move the decompressed *.fa and *.gtf files from the working directory into
## ./genomes.
walk(
	list.files(".", pattern = "(\\.fa$|\\.gtf$)", full.names = TRUE),
	function(genome_file) system(paste("mv", genome_file, "./genomes"))
)

## Generate STAR genome index.

## Look up the local path of a downloaded genome file.
##
## Finds the rows of the global `genomes` table whose organism and type equal
## the arguments, takes the base name of each URL, strips a trailing ".gz",
## and returns the result prefixed with "./genomes".
##
## organism: organism label to match against genomes$organism.
## type: file type to match against genomes$type.
extract_file_name <- function(organism, type) {
	## which() drops NA comparisons, as a row filter would.
	matching_rows <- which(genomes$organism == organism & genomes$type == type)
	unzipped_names <- str_replace(basename(genomes$files[matching_rows]), "\\.gz$", "")

	file.path(".", "genomes", unzipped_names)
}

## Create one STAR index directory per organism under ./genomes.
walk(
	file.path(".", "genomes", paste0(c("yeast", "human"), "_index")),
	dir.create
)

## Build a STAR genome index once per element of `samples`. The element name
## "Hsapien" selects the human genome files; any other name selects yeast.
walk(names(samples), function(organism_key) {
	org <- if (organism_key == "Hsapien") "human" else "yeast"

	genome_annotation <- extract_file_name(org, "annotation")
	genome_assembly <- extract_file_name(org, "assembly")

	star_command <- paste(
		"STAR",
		"--runThreadN 8",
		"--runMode genomeGenerate",
		"--genomeDir", file.path(".", "genomes", paste0(org, "_index")),
		"--genomeFastaFiles", genome_assembly,
		"--sjdbGTFfile", genome_annotation
	)

	## The yeast run gets an extra --genomeSAindexNbases 10 option.
	if (org == "yeast") {
		star_command <- paste(star_command, "--genomeSAindexNbases 10")
	}

	system(star_command)
})

## Align Reads
## ----------

## Align each sample's dusted reads with STAR. Output goes to results/aligned
## with the file-name prefix
## <technology>_<strain_line>_<input>_<condition>_<replicate>_; samples that
## already have a file containing that prefix are skipped.

## showWarnings = FALSE keeps reruns quiet when the directory already exists.
dir.create(file.path("results", "aligned"), recursive = TRUE, showWarnings = FALSE)

walk(samples, ~ pwalk(.x, function(...) {
	args <- list(...)

	## Sample prefix shared by the skip check, the input FASTQ names and the
	## STAR output prefix. (The field is `technology`; a misspelled name
	## would evaluate to NULL and silently drop it from the prefix.)
	new_file <- paste(
		args$technology, args$strain_line, args$input,
		args$condition, args$replicate, sep = "_"
	)

	## Match the prefix literally rather than as a regular expression.
	already_aligned <- file.path("results", "aligned") %>%
		list.files %>%
		str_detect(fixed(new_file)) %>%
		any

	if (!already_aligned) {
		dusted_dir <- file.path("results", "dusted_reads")

		## K562 and S288C samples are aligned from a READ1/READ2 file pair;
		## all other strains from a single <prefix>.fq file.
		if (args$strain_line %in% c("K562", "S288C")) {
			seq_files <- paste(
				file.path(dusted_dir, paste0(new_file, "_READ1.fq")),
				file.path(dusted_dir, paste0(new_file, "_READ2.fq"))
			)
		} else {
			seq_files <- file.path(dusted_dir, paste0(new_file, ".fq"))
		}

		## K562 uses the human index; every other strain uses the yeast index.
		if (args$strain_line == "K562") {
			genome_dir <- file.path("genomes", "human_index")
		} else {
			genome_dir <- file.path("genomes", "yeast_index")
		}

		star_command <- paste(
			"STAR",
			"--runThreadN 8",
			"--genomeDir", genome_dir,
			"--readFilesIn", seq_files,
			"--outFileNamePrefix",
			file.path("results", "aligned", paste0(new_file, "_")),
			"--outSAMtype BAM Unsorted"
		)

		system(star_command)
	}
}))

## Merge Selected BAM Files
## ----------

## Get name of files to merge.

## Select the K562 samples whose `notes` field starts with "merge".
## `samples` is split by the organism column (this script compares its names
## to "Hsapien"), so it has no "K562" element to pluck; recombine the
## per-organism tibbles and filter on strain_line instead.
to_merge <- samples %>%
	bind_rows() %>%
	filter(strain_line == "K562", str_detect(notes, pattern = "^merge.*"))

## Merge BAM files.

## Build, for every row of `to_merge`, the path of its BAM under
## results/cleaned_bams:
## <strain_line>_<input>_<technology>_r<replicate_ID>_<rowid>.bam
bams <- pmap_chr(to_merge, function(...) {
	row <- list(...)

	stem <- paste(
		row$strain_line, row$input, row$technology,
		paste0("r", row$replicate_ID), row$rowid, sep = "_"
	)

	file.path("results", "cleaned_bams", paste0(stem, ".bam"))
})

## Merge the selected BAMs into a single file with samtools merge.
system(paste(
	"samtools merge",
	file.path("results", "cleaned_bams", "K562_5ug_RAMPAGE_r3_4.5_merged.bam"),
	paste(bams, collapse = " ")
))

## Sort and Index BAMs
## ----------

## Collect every *.bam file in results/cleaned_bams.
bam_files <- list.files(
	path = file.path("results", "cleaned_bams"),
	pattern = ".*\\.bam$",
	full.names = TRUE
)

dir.create(file.path("results", "final_bams"))

## For each BAM, write a sorted copy with the same file name into
## results/final_bams, then index that sorted copy.
walk(bam_files, function(x) {
	sorted_bam <- file.path("results", "final_bams", basename(x))

	system(paste("samtools sort -@ 8 -o", sorted_bam, x))
	system(paste("samtools index", sorted_bam))
})
