#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Mar  5 14:04:31 2021

@author: christospapadopoulos
"""

from Bio.Blast import NCBIXML
import sys


def read_blast_denovo(blast_file):
    """
    Summarise the top hit of every BLAST record.

    Only the first HSP of the first alignment of each record is used.

    Parameters
    ----------
    blast_file : iterable
        Iterable of BLAST records. Each record must expose ``query``,
        ``query_length`` and ``alignments``; each alignment must expose
        ``hit_def`` and ``hsps``; each HSP must expose ``expect`` and
        ``query`` (the aligned query string, gaps written as '-').

    Returns
    -------
    dict
        Maps each record's ``query`` to a dict with the keys:

        * 'sbjct'    -- ``hit_def`` of the top alignment
        * 'evalue'   -- ``expect`` of its first HSP
        * 'coverage' -- ungapped aligned query length divided by
          ``query_length``, rounded to 2 decimals

        Records without a usable hit get the string placeholders
        'Nan' / 'NaN' for these keys plus an extra 'match_type' key set
        to 'NaN'. If several records share the same ``query``, the last
        one wins.

    Notes
    -----
    An empty ``blast_file`` yields an empty dict. Any exception raised by
    the underlying iterator (other than normal exhaustion) is propagated
    rather than being mistaken for the end of the input.
    """
    blast_denovo = {}

    for record in blast_file:
        # Pick the top hit: first HSP of the first alignment, if any.
        top_alignment = None
        top_hsp = None
        if record.alignments:
            top_alignment = record.alignments[0]
            if top_alignment.hsps:
                top_hsp = top_alignment.hsps[0]

        if top_hsp is None:
            # No hit (or a hit without HSPs): store the placeholders.
            # The exact spellings are kept because downstream code compares
            # 'sbjct' against the literal "Nan".
            blast_denovo[record.query] = {
                'sbjct': 'Nan',
                'evalue': 'Nan',
                'match_type': 'NaN',
                'coverage': 'NaN',
            }
            continue

        # Gap characters do not count towards the covered query length.
        aligned_query_length = len(top_hsp.query.replace('-', ''))
        blast_denovo[record.query] = {
            'sbjct': top_alignment.hit_def,
            'evalue': top_hsp.expect,
            'coverage': round(aligned_query_length / record.query_length, 2),
        }

    return blast_denovo


# Command-line arguments: "-blast <BLAST XML file>" and "-out <output table>".
mega_blast = sys.argv[sys.argv.index("-blast") + 1]
out = sys.argv[sys.argv.index("-out") + 1]

# NCBIXML.parse() is lazy, so the records must be consumed while the input
# file is still open; the context manager then closes the handle.
with open(mega_blast, "r") as Scer_vs_Mega:
    Scer_vs_Mega_records = NCBIXML.parse(Scer_vs_Mega)
    Scer_vs_Mega_denovo = read_blast_denovo(blast_file=Scer_vs_Mega_records)

# Write one tab-separated line per query, numbered from 1.
with open(out, "w") as fw:
    fw.write('No\tScer_gene_name\tSeq_id\tSeq_type\tSeq_MEGA_eval\tSeq_MEGA_cov\n')
    for count, gene_name in enumerate(Scer_vs_Mega_denovo, start=1):
        hit = Scer_vs_Mega_denovo[gene_name]
        sbjct = hit["sbjct"]

        # Classify the hit from its subject description.
        if 'nc_intergenic' in sbjct:
            orf_type = "IGORF"
        elif sbjct == "Nan":
            # Placeholder stored by read_blast_denovo for queries without a hit.
            orf_type = "NaN"
        else:
            orf_type = "CDS"

        # float() also accepts the 'Nan'/'NaN' placeholder strings and turns
        # them into a float nan, so no-hit rows are printed as "nan".
        fw.write("{:<3d}\t{:10s}\t{:<50s}\t{:<10s}\t{}\t{:<5.3f}\n".format(
            count,
            gene_name,
            sbjct,
            orf_type,
            float(hit["evalue"]),
            float(hit["coverage"]),
        ))
        
        
