#!/usr/bin/env python
#
# Copyright (c) 2016 10x Genomics, Inc. All rights reserved.
#
import os
import sys
import subprocess
import docopt
import shutil

# Usage text for this script. It is not only documentation: the string is
# handed to docopt.docopt() further down, which parses it to build the
# command-line interface, so edits to the "Usage:" and "Options:" sections
# change which arguments the script accepts.
__doc__ = '''
Build a Long Ranger-compatible reference folder from a user-supplied
FASTA file. Creates a new folder named after the FASTA file.

Steps include:
  1. Create new reference folder
  2. Copy original FASTA into folder
  3. Index for samtools
  4. Index for pyfasta
  5. Index for bwa

There are additional files you can add manually for improved analysis.
For details, see:
http://support.10xgenomics.com/genome-exome/software/pipelines/latest/advanced/references

The commands below should be preceded by 'longranger':

Usage:
    mkref <fasta_file>
    mkref -h | --help | --version

Arguments:
    fasta_file  Path to FASTA file containing your reference.

Options:
    -h --help   Show this message.
    --version   Show version.
'''


def error(msg):
    """Report a fatal problem and terminate the script.

    The message, followed by a newline, is written to standard error so
    that diagnostics are kept apart from the progress output on standard
    output. The process then exits with status 1.

    msg: text (or any object with a string form) describing the problem.
    Raises SystemExit(1); this function never returns.
    """
    # Formatting through %s accepts non-string objects, as print did.
    sys.stderr.write("%s\n" % (msg,))
    sys.exit(1)

def fixpath(path):
    """Return path as a normalized absolute path.

    A leading "~" is expanded first, then environment variable references,
    and finally the result is made absolute relative to the current
    working directory.
    """
    home_expanded = os.path.expanduser(path)
    vars_expanded = os.path.expandvars(home_expanded)
    return os.path.abspath(vars_expanded)

# Read the FASTA location from the command line and make it absolute
args = docopt.docopt(__doc__, version=" ") # do not remove space
fasta_path = fixpath(args["<fasta_file>"])

# The input has to be an existing regular file
if not os.path.exists(fasta_path):
    error("Input FASTA file does not exist: %s" % fasta_path)
elif not os.path.isfile(fasta_path):
    error("Please provide a FASTA file, not a directory: %s" % fasta_path)

# The reference folder is created in the current directory, so it must be writable
if not os.access(os.getcwd(), os.W_OK):
    error("You do not have write permission in the current working directory.")

# The reference is named after the FASTA file, minus its last extension
basename = os.path.split(os.path.abspath(fasta_path))[1]
ref_name, ext = os.path.splitext(basename)

# Target folder is ./refdata-<reference name>
ref_folder = "refdata-" + ref_name
ref_path = os.path.join(os.getcwd(), ref_folder)

# Never overwrite an existing reference folder
if os.path.exists(ref_path):
    error("Destination reference folder already exists: %s\nPlease delete and start again." % ref_path)

# Create reference folder structure
print "Creating new reference folder at ./refdata-%s/" % ref_name
os.mkdir(ref_path)
os.mkdir(os.path.join(ref_path, "fasta"))
os.mkdir(os.path.join(ref_path, "genes"))
os.mkdir(os.path.join(ref_path, "regions"))
os.mkdir(os.path.join(ref_path, "snps"))

# Write out genome identifier
with open(os.path.join(ref_path, "genome"), "w") as f:
    f.write(ref_name+"\n")

# Copy fasta file into fasta folder as genome.fa
print "Copying original fasta file into reference folder...",
new_fasta = os.path.join(ref_path, "fasta", "genome.fa")
shutil.copy(fasta_path, new_fasta)
os.chmod(new_fasta, 0644)
print "done\n"

print "Generating samtools index...",
subprocess.check_call(['samtools', 'faidx', new_fasta])
print "done.\n"

print "Generating pyfasta indexes...",
import pyfasta
pyf = pyfasta.Fasta(new_fasta)
contigs = len(pyf)
size = sum(len(pyf[contig]) for contig in pyf)
del pyf
print "done."
print "    Number of contigs: %d\n    Total genome size: %d\n" % (contigs, size)

print "Generating bwa index (may take over an hour for a 3Gb genome)..."
subprocess.check_call(['bwa', 'index', new_fasta])
print "...done.\n"

print ">>> Reference successfully created! <<<\n"
print "You can now specify this reference on the command line:"
print "longranger run --reference=%s ..." % ref_path

print "\nNote: to use this reference with GATK, also run this Picard command:"
print "java -jar CreateSequenceDictionary.jar R=./%s O=./%s" % (os.path.join(ref_folder, "fasta", "genome.fa"), os.path.join(ref_folder, "fasta", "genome.dict"))
print "or for new Picard versions:"
print "java -jar /path/to/picard.jar CreateSequenceDictionary R=./%s O=./%s" % (os.path.join(ref_folder, "fasta", "genome.fa"), os.path.join(ref_folder, "fasta", "genome.dict"))

print "\nThere are additional files you can add to improve analysis. For details, see:\nhttp://support.10xgenomics.com/genome-exome/software/pipelines/latest/advanced/references"