#!/usr/bin/python

from argparse import ArgumentParser
from Bio import Entrez, SeqIO, Seq


def extract_insdc(links):
    """Return the INSDC nuccore link UIDs from Elink search results.

    Only the first element of ``links`` is inspected, and within it only
    the first ``LinkSetDb`` entry whose ``LinkName`` is
    ``'assembly_nuccore_insdc'``.

    Args:
        links: Parsed Elink results. Expected to be a sequence whose first
            element is a mapping with a ``'LinkSetDb'`` list; every entry
            of that list carries a ``'LinkName'`` and a ``'Link'`` list of
            mappings with an ``'Id'`` key.

    Returns:
        A list with the ``'Id'`` value of every link in the INSDC link
        set, or an empty list when ``links`` is empty or contains no
        INSDC link set.
    """
    # An empty result has no link sets at all; indexing it would raise.
    if not links:
        return []
    # Work only with INSDC accession UIDs: stop at the first matching set.
    for link_set_db in links[0]['LinkSetDb']:
        if link_set_db['LinkName'] == 'assembly_nuccore_insdc':
            return [link['Id'] for link in link_set_db['Link']]
    # There are no INSDC UIDs.
    return []

def get_gi_from_acc(acc):
    """Look up the NCBI GI number for a nucleotide accession.

    Searches the ``nuccore`` database for ``acc`` with Entrez ESearch,
    fetches the hits with EFetch using ``rettype="gi"`` and returns the
    first non-blank line of the response.

    Args:
        acc: Accession to search for; it is converted with ``str()``.

    Returns:
        The first identifier line returned by EFetch, stripped of
        surrounding whitespace.

    Raises:
        IndexError: If the search has no hits or the fetch response
            contains no identifier. IndexError is kept because that is
            what indexing the empty result list raised before.
    """
    Entrez.email = "jensenuk83@gmail.com"

    search_handle = Entrez.esearch(db="nuccore", term=str(acc))
    try:
        results = Entrez.read(search_handle)
    finally:
        # Release the connection even when parsing the response fails.
        search_handle.close()

    uids = results["IdList"]
    if not uids:
        # Fail early instead of sending an EFetch request with no ids.
        raise IndexError("no nuccore record found for accession %r" % (acc,))

    fetch_handle = Entrez.efetch(db="nuccore", id=uids, rettype="gi", retmode="text")
    try:
        gi_lines = [line.strip() for line in fetch_handle]
    finally:
        fetch_handle.close()

    # Skip blank lines so an empty string is never returned as a GI.
    gi_numbers = [line for line in gi_lines if line]
    if not gi_numbers:
        raise IndexError("no GI number returned for accession %r" % (acc,))
    return gi_numbers[0]
    


def add_gi_header_to_seqids(input_file, output_file):
    """Copy a FASTA file, rewriting every ID as ``gi|<GI>|ref|<id>|``.

    Each record's original ID is printed, its GI is looked up with
    ``get_gi_from_acc`` (one lookup per record), and the record is written
    to ``output_file`` with the new ID and the original description.

    Args:
        input_file: Path of the FASTA file to read.
        output_file: Path of the FASTA file to write; it is overwritten.
    """
    # Open the input first so that a missing or unreadable input file does
    # not create or truncate the output file.
    with open(input_file, 'rt') as f_in, open(output_file, 'wt') as f_out:
        for record in SeqIO.parse(f_in, "fasta"):
            print(record.id)
            gi = get_gi_from_acc(record.id)
            new_id = "gi|{}|ref|{}|".format(gi, record.id)
            # NOTE(review): the description is passed through unchanged; if
            # it still begins with the original ID the written header may
            # repeat the accession after the new ID - confirm this is wanted.
            new_record = SeqIO.SeqRecord(record.seq, id=new_id, description=record.description)
            written = SeqIO.write(new_record, f_out, 'fasta')
            # SeqIO.write returns the number of records written.
            if written != 1:
                print('Error while writing sequence:  ' + new_record.id)


def main():
    """Parse the command-line options and rewrite the FASTA headers.

    Reads the required ``-f/--fasta_file`` and ``-o/--output`` options and
    passes them to ``add_gi_header_to_seqids``.
    """
    # The first positional argument of ArgumentParser is ``prog``, so the
    # former ``ArgumentParser("")`` blanked the program name in the usage
    # line. Supplying a description keeps the default program name.
    parser = ArgumentParser(
        description="Prefix the sequence ids of a fasta file with their NCBI GI number.")
    parser.add_argument('-f', '--fasta_file', required=True,
                        help="fasta sequence file containing all sequences for building the kraken2 db")
    # The output carries GI-prefixed ids, not taxon ids.
    parser.add_argument('-o', '--output', required=True,
                        help="fasta file whose sequence ids are rewritten as gi|<GI>|ref|<accession>|")
    args = parser.parse_args()

    add_gi_header_to_seqids(args.fasta_file, args.output)

# Run the command-line interface only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()
