#!/usr/bin/env Rscript

library(methods)
library(Biostrings)
library(ShortRead)

#' Build a phrase-by-sample read-count table from FASTQ files.
#'
#' Reads a tab-separated phrase table, then for every FASTQ file counts how
#' many reads match each phrase exactly. The sequence compared against the
#' phrases is the reverse complement of (at most) the first 100 bases of each
#' read, upper-cased. One count column per sample is appended to the phrase
#' table (phrases with no matching read get 0), and the result is written as a
#' tab-separated file.
#'
#' @param fastq_file_names Character vector of FASTQ file paths.
#' @param sample_labels Character vector of column names for the counts, one
#'   per FASTQ file and in the same order.
#' @param phrases_file_name Path to a tab-separated file with a header row and
#'   a `seq` column holding the phrase sequences.
#' @param out_file_name Path of the tab-separated file to write.
#' @return The value of `write.table()` (invisible `NULL`); called for its
#'   side effect of writing `out_file_name`.
create_phrase_matrix <- function(fastq_file_names, sample_labels,
                                 phrases_file_name, out_file_name) {
    # Each FASTQ file needs exactly one label; fail early with a clear message
    # instead of an out-of-bounds subscript in the middle of the run.
    if (length(fastq_file_names) != length(sample_labels)) {
        stop("fastq_file_names and sample_labels must have the same length",
             call. = FALSE)
    }

    df <- read.table(phrases_file_name, header = TRUE, sep = "\t",
                     stringsAsFactors = FALSE)
    df$seq <- toupper(df$seq)

    # seq_along() iterates over the indices of the file-name vector; seq_len()
    # expects a single non-negative number and fails on a character vector.
    for (i in seq_along(fastq_file_names)) {
        sample_label <- sample_labels[[i]]

        # readFastq() loads the file; sread() extracts the read sequences.
        reads <- sread(readFastq(fastq_file_names[[i]]))

        # Take the first 100 bases of every read as plain upper-case strings,
        # then reverse-complement them so they can be compared to the phrases.
        phrase_region <- toupper(substr(as.character(reads), 1, 100))
        phrase_region <- reverseComplement(DNAStringSet(phrase_region))

        # Store the sequences as an ordinary character column so the join key
        # has the same type as df$seq.
        phrase_region_df <- data.frame(
            seq = unname(as.character(phrase_region)),
            stringsAsFactors = FALSE)
        phrase_region_df <- dplyr::summarize(
            dplyr::group_by(phrase_region_df, seq), count = dplyr::n())
        colnames(phrase_region_df) <- c("seq", sample_label)

        df <- dplyr::left_join(df, phrase_region_df, by = "seq")

        # Phrases never seen in this sample come back as NA from the join.
        # Fill with integer 0L so the count column stays integer and large
        # counts are not written in scientific notation.
        df[[sample_label]][is.na(df[[sample_label]])] <- 0L
    }

    write.table(df, file = out_file_name, quote = FALSE, sep = "\t",
                row.names = FALSE)
}

# Command-line entry point. Exactly four trailing arguments are required:
#   1. FASTQ file names, separated by semicolons
#   2. sample labels, separated by semicolons
#   3. phrases file name
#   4. output file name
args <- commandArgs(trailingOnly = TRUE)

# Guard clause: abort unless exactly four arguments were supplied.
if (length(args) != 4) {
    stop("FASTQ file names (semicolon-separated), sample labels (semicolon-separated), phrases file name, and output file name must be specified")
}

# Split the two list-valued arguments on literal semicolons.
fastq_paths <- strsplit(args[[1]], ";", fixed = TRUE)[[1]]
labels <- strsplit(args[[2]], ";", fixed = TRUE)[[1]]

create_phrase_matrix(fastq_paths, labels, args[[3]], args[[4]])
