#!/usr/bin/env python

"""
Function:
Pipeline to rank the putative BGCs generated by Biosynthetic-SPAdes
Pseudo pipeline:
1.  Parse the Biosynthetic-SPAdes output into fasta format
2.  Run antiSMASH on the fasta files to generate .gbk formatted BGCs
3.  Run the first phase of BiG-SCAPE to generate intermediary files
4a. If not present > generate the intermediary files for antismashdb
4b. Run the second phase of BiG-SCAPE, in which the outputs of 3 [and possibly 4a] are combined
5.  Parse the pairwise distance matrix to select the reference BGC and rank the putative BGCs

Author: Vittorio Tracanna
Date:   05/12/2017
Dependencies: hmmer, biopython, (mafft), munkres.py, antismash

"""

import argparse
import os
import subprocess
import sys

from parse_biosyntheticSPAdes_output import *
from parse_bigscape_network import *

def commandLineParser():
    """
    Parses the command line and holds the default values.

    The default output directory is <script directory>/data/out/, where the script
    directory is taken from sys.argv[0] and resolved to an absolute path.

    :return: dictionary {str(argument name): value} with all the parameters for the wrapper
    """
    # Resolve the script location to an absolute path. Splitting sys.argv[0] on "/"
    # gives an empty prefix when the script is started as a bare "script.py", which
    # made the default output directory "/data/out/" in the filesystem root.
    scriptDir = os.path.dirname(os.path.abspath(sys.argv[0]))
    defaultOutputDir = os.path.join(scriptDir, "data", "out", "")

    parser = argparse.ArgumentParser(description="Main wrapper, runs all the different steps.")
    # General arguments to specify where to put/find files
    parser.add_argument("-i", "--inputDir", type=str, required=True, help="Path to Biosynthetic-SPAdes output")
    parser.add_argument("-o", "--outputDir", type=str, required=False,
                        default=defaultOutputDir,
                        help="Path to the output directory")
    parser.add_argument("-asdb", "--antismashdb", type=str, required=True, help="Path to the antismashdb folder")
    parser.add_argument("-bss", "--biosyntheticSpadesStatistics", type=str, required=True,
                        help="Full path to the biosyntheticSpades [bgc_statistics.txt] output file.")
    parser.add_argument("-v", "--verbose", required=False, action="store_true",
                        default=False, help="Sets the amount of output on screen")
    parser.add_argument("-r", "--rerun", required=False, action="store_true",
                        default=False, help="rerun when new version of antismashdb is available. "
                                            "Prepares the genbank files from antismashDB for the comparison with the queries putative BGCs")

    return vars(parser.parse_args())

def stk2algn(stkFiles):
    """
    Generates fasta files [.algn] of the domains aligned to the hmmProfile.

    For every input path the last three characters are replaced by "algn" to get the
    output path; an output file that already exists is left untouched. Only lines that
    start with an uppercase letter or a digit are treated as sequence lines; from the
    aligned part only uppercase characters and "-" are kept. Blocks belonging to the
    same header are concatenated, and records are written in order of first appearance.

    :param stkFiles: List of paths to stockholm formatted files
    :return: None
    :raises IOError/OSError: if a stockholm file cannot be read; no output file is
        created for it in that case
    """
    for stkFile in stkFiles:
        algnFile = stkFile[:-3] + "algn"
        if os.path.isfile(algnFile):
            continue

        headers = []    # headers in order of first appearance
        chunks = {}     # {header: [aligned block, ...]}
        # Read the whole input before the output is created: opening the output first
        # left an empty .algn behind when the input could not be opened, and that
        # empty file then prevented the alignment from ever being regenerated.
        with open(stkFile, "r") as infile:
            for line in infile:
                if line.startswith("#"):
                    continue
                if not (line[0].isupper() or line[0].isdigit()):
                    continue
                # Split on any whitespace so that trailing blanks or tab separators
                # do not produce an empty alignment column.
                fields = line.split()
                if len(fields) < 2:
                    continue
                header = fields[0]
                algn = "".join(pos for pos in fields[-1] if pos.isupper() or pos == "-")
                if header not in chunks:
                    chunks[header] = []
                    headers.append(header)
                chunks[header].append(algn)

        records = [">{}\n{}".format(header, "".join(chunks[header])) for header in headers]
        with open(algnFile, "w") as outfile:
            outfile.write("\n".join(records))
            outfile.write("\n")
    return

def cluster2group(dirNames):
    """
    Maps every antiSMASH cluster name to the group it belongs to.

    Each directory name is split on "_"; the second field is used as the group and the
    third field as the candidate number. The cluster name is built as
    "<group>_candidate_<candidate>.cluster001".

    :param dirNames: names of the different directories created in antiSmash
    :return: dictionary {str(clusterName): str(groupNumber)}
    """
    template = "{}_candidate_{}.cluster001"
    groupByCluster = {}
    for entry in dirNames:
        tokens = entry.split("_")
        # tokens[0] is the prefix of the directory name and is not used
        group, candidate = tokens[1], tokens[2]
        groupByCluster[template.format(group, candidate)] = group
    return groupByCluster

def _locateFile(rootDir, fileName):
    """Returns the path of the first file called fileName found below rootDir, or None."""
    for dirPath, _dirNames, fileNames in os.walk(rootDir):
        if fileName in fileNames:
            return os.path.join(dirPath, fileName)
    return None


def _ensureDir(path):
    """Creates the directory [and any missing parent] if it does not exist yet."""
    if not os.path.isdir(path):
        os.makedirs(path)


def _runCommand(cmd, echo=False):
    """
    Runs an external command given as a list of arguments, without a shell.
    Failures are reported on screen but do not stop the pipeline.
    :return: exit code of the command, None if it could not be started
    """
    if echo:
        print (" ".join(cmd))
    try:
        returnCode = subprocess.call(cmd)
    except OSError as error:
        print ("WARNING: could not run {}: {}".format(cmd[0], error))
        return None
    if returnCode != 0:
        print ("WARNING: command exited with code {}: {}".format(returnCode, " ".join(cmd)))
    return returnCode


def main():
    """
    Runs the five steps of the pipeline with the parameters given on the command line.
    :return: None
    """
    argOptions = commandLineParser()
    verbose = argOptions["verbose"]
    # os.path.join(path, "") guarantees exactly one trailing separator, so the paths
    # below are valid whether or not the user typed a trailing "/".
    inputDir = os.path.join(argOptions["inputDir"], "")
    candidatesDir = os.path.join(inputDir, "candidates", "")
    antismashDir = os.path.join(candidatesDir, "antismash", "")
    networksDir = os.path.join(candidatesDir, "networks_all", "")
    pythonExe = sys.executable or "python"

    # Step 1 ==> Parse genespades output into a antismash friendly format
    statsFile = argOptions["biosyntheticSpadesStatistics"]
    if os.path.isfile(statsFile):
        findCandidates(statsFile, inputDir)
    else:
        print ('bgc_statistics.txt file not found, trying to locate it')
        # The previous subprocess.check_call("find ...") call was given a shell string
        # without shell=True, so it always raised, and it returned an exit code rather
        # than a path. Search the input directory directly instead.
        located = _locateFile(inputDir, "bgc_statistics.txt")
        if located is not None:
            argOptions["biosyntheticSpadesStatistics"] = located
        # NOTE(review): as before, the located file is only recorded; candidates are
        # still searched per entry of the input directory -- confirm this is intended.
        # Takes all entries in the inputDir folder, avoids the hidden [start with .]
        # ones and the candidates/antismash output folders
        infiles = [inputDir + x for x in os.listdir(inputDir)
                   if not (x.startswith(".") or x in ("candidates", "antismash"))]
        _ensureDir(candidatesDir)
        for genome in infiles:
            if verbose:
                print ("\nParsing {} assembly to find possible BGCs to rank".format(os.path.basename(genome)))
            print ('looking for candidate')
            findCandidates(genome, inputDir)

    # Step 2 ==> Run antismash on the genespades output
    # creates folders to store antiSmash output
    _ensureDir(antismashDir)

    # biosyntheticSPAdes produces files that start with the "BGC" prefix
    fastaFiles = [x for x in os.listdir(candidatesDir) if (x.startswith("BGC") and not x.endswith('.dict'))]
    for fastaFile in fastaFiles:
        # drops the last 6 characters of the file name [assumes a ".fasta" suffix]
        resultDir = os.path.join(antismashDir, fastaFile[:-6], "")
        # runs Antismash on the input file only if its index.html is missing
        if not os.path.isfile(os.path.join(resultDir, "index.html")):
            if verbose:
                print ("running antismash on {}".format(fastaFile))
            _runCommand(["antismash", candidatesDir + fastaFile, "--minimal", "--disable-embl",
                         "--disable-svg", "--disable-xls", "--outputfolder", resultDir], echo=True)

    # Step 3 ==> Run preparatory script on db [if requested with --rerun] and query
    # Creates intermediary files to run pairwise comparison between query and db.
    codeDir = os.path.dirname(os.path.abspath(__file__))
    prepScript = os.path.join(codeDir, "antismashdb_prep_raw.py")

    _ensureDir(argOptions["outputDir"])

    if argOptions["rerun"]:
        # run preprocessing script on the db folder
        processedDbDir = os.path.join(argOptions["antismashdb"], "processed_antismashdb", "")
        _runCommand([pythonExe, prepScript, "-i", argOptions["antismashdb"], "-o", processedDbDir], echo=True)
    # run preprocessing script on the query from antismash [genespades output]
    if not os.path.isfile(candidatesDir + "BGCs.dict"):
        _runCommand([pythonExe, prepScript, "-i", antismashDir, "-o", candidatesDir])

    # Step 4 ==> Run the distance calculation, unless its output folder is already there
    if not os.path.exists(networksDir):
        _runCommand([pythonExe, os.path.join(codeDir, "calculate_distance.py"),
                     "-o", candidatesDir, "-i", candidatesDir, "-db", argOptions["antismashdb"]])

    # Step 5 ==> Parse bigscape output
    if not os.path.isdir(networksDir):
        sys.exit("No networks found in {}, cannot rank the putative BGCs".format(networksDir))
    BGCs_group_ids = cluster2group(os.listdir(antismashDir))
    print ('last step')
    ranksFile = candidatesDir + "ranks.txt"
    # An existing ranks.txt is never overwritten. The check is done once, before the
    # loop: checking inside the loop meant that only the first BGC class was saved.
    writeRanks = not os.path.isfile(ranksFile)
    rankTexts = []
    for BGC in os.listdir(networksDir):
        print ('now processing {} BGCs class'.format(BGC))
        # iterates over all the BGCs classes. ex: NRPS,PKS etc..
        infile = os.path.join(networksDir, BGC, "all{}_c0.80.network".format(BGC))
        if not os.path.isfile(infile):
            continue
        network = load_network(infile)
        DSS = get_DSS(network, BGCs_group_ids)
        reference = find_reference(DSS)
        out_txt = rank_putative_BGC(reference, network, DSS)
        print (out_txt)
        rankTexts.append(out_txt)
    if writeRanks and rankTexts:
        with open(ranksFile, "w") as out_file:
            out_file.write("".join(rankTexts))


if __name__ == "__main__":
    main()
