#!/usr/bin/env python

# Script to annotate ORFs on aligned intergenic regions


import re
from subprocess import Popen, PIPE
import os

# FUNCTIONS##############################################################
def parse_fasta(fasta_file):
    """Parse FASTA-formatted input into a ``{name: sequence}`` dictionary.

    Args:
        fasta_file: either a FASTA-formatted string, or an opened file-like
            object (anything exposing ``readline``) that yields text lines
            when iterated.

    Returns:
        A dict mapping each header (the text after ``>``, with trailing
        whitespace removed) to the concatenation of its sequence lines.
        The dict is empty when the input contains no header line. If the
        same header appears twice, the last record wins.

    Raises:
        Exception: if ``fasta_file`` is neither a string nor file-like.
    """

    # Strings are tested first: a file-like check on "readline" works on
    # both Python 2 and Python 3, whereas the ``file`` builtin only exists
    # on Python 2.
    if isinstance(fasta_file, str):
        lines = fasta_file.split("\n")
    elif hasattr(fasta_file, "readline"):
        lines = fasta_file
    else:
        raise Exception("Argument fasta_file must be a file or a string")

    sequence_dict = {}

    # ``None`` means "no header seen yet"; it lets us tell that state apart
    # from a record whose sequence is simply empty.
    current_name = None
    # Sequence chunks are collected in a list and joined once per record.
    chunks = []

    for line in lines:

        # Skip comment lines and blank lines.
        if line.startswith("#") or not line.strip():
            continue

        if line.startswith(">"):
            # A new record starts: flush the previous one, if any.
            if current_name is not None:
                sequence_dict[current_name] = "".join(chunks)
            current_name = line.rstrip()[1:]
            chunks = []

        elif current_name is not None:
            chunks.append(line.rstrip())

        # Sequence text found before the first header belongs to no record
        # and is ignored instead of being prepended to the first sequence.

    # Flush the last record; nothing is stored when no header was found, so
    # empty input returns {} rather than {"": ""}.
    if current_name is not None:
        sequence_dict[current_name] = "".join(chunks)

    return sequence_dict


def revcomp(seq):
    """Return the reverse complement of a nucleotide sequence.

    Upper- and lower-case IUPAC nucleotide codes are complemented with
    their case preserved, and the gap character "-" is kept as is.

    Args:
        seq: nucleotide sequence (string), possibly containing gaps.

    Returns:
        The reverse-complemented sequence as a string.

    Raises:
        KeyError: if ``seq`` contains a character that is not in the
            complement table.
    """
    complement = {
        "A": "T", "T": "A", "G": "C", "C": "G",
        "a": "t", "t": "a", "g": "c", "c": "g",
        "N": "N", "n": "n",
        "R": "Y", "r": "y", "Y": "R", "y": "r",
        # S (G or C) and W (A or T) are their own complements. S was
        # missing from the table although the ORF search accepts it.
        "S": "S", "s": "s", "W": "W", "w": "w",
        "K": "M", "k": "m", "M": "K", "m": "k",
        "B": "V", "b": "v", "V": "B", "v": "b",
        "D": "H", "d": "h", "H": "D", "h": "d",
        "-": "-",
    }
    return "".join([complement[base] for base in reversed(seq)])



# Start codon ATG (alignment gaps "-" allowed between and after the bases),
# followed, inside a lookahead, by one or more in-frame codons that are not
# a stop, then by a stop codon (TAA, TAG or TGA). Keeping the body in a
# lookahead lets overlapping ORFs be reported. Only upper-case bases match.
# Group 1 is the start codon, group 3 the stop codon.
_ORF_PATTERN = re.compile(
    r"(A-*T-*G-*)(?=(?:(?!(?:T-*A-*A-*|T-*A-*G-*|T-*G-*A-*))"
    r"([ATGCRYSWKMBDHVN]-*){3})+(T-*(?:[GA]-*A|A-*G)))"
)


def _find_orfs(seq):
    """Return the (start, stop) pairs of the ORFs found on ``seq``.

    Coordinates are 1-based and inclusive, from the A of the start codon to
    the last base of the stop codon. When several ATGs share the same stop,
    only the first (most upstream) one is kept.
    """
    orfs = []
    seen_stops = set()
    for match in _ORF_PATTERN.finditer(seq):
        stop = match.end(3)
        # Matches come left to right, so the first ATG seen for a given
        # stop is the most upstream one.
        if stop not in seen_stops:
            seen_stops.add(stop)
            orfs.append((match.start(1) + 1, stop))
    return orfs


def annot_orf(dico_align,synt_name):
    """Annotate ORFs on both strands of every sequence of an alignment.

    Args:
        dico_align: dict mapping sequence names to aligned sequences. Names
            are split on ";"; field 1 is used as the haplotype and field 2
            as the contig, so at least three fields are required.
        synt_name: identifier written in the first column of every line.

    Returns:
        A list of tab-separated, newline-terminated lines with the columns:
        synt_name, haplotype, contig, sequence length, start, stop, strand.
        For each sequence the "+" ORFs come first, then the "-" ORFs.
        Coordinates are 1-based positions on the given (forward, gapped)
        sequence, with start <= stop for both strands.

    Raises:
        IndexError: if a sequence name has fewer than three ";" fields.
        KeyError: raised by revcomp() on an unsupported character.
    """
    final = []

    # Iterating over items() keeps names and sequences paired without
    # rebuilding and indexing the key/value lists at every iteration.
    for name, seqi in dico_align.items():
        lseq = len(seqi)
        fields = name.split(";")
        contig = fields[2]
        haplo = fields[1]
        prefix = synt_name + "\t" + haplo + "\t" + contig + "\t" + str(lseq) + "\t"

        # Forward strand.
        for start, stop in _find_orfs(seqi):
            final.append(prefix + str(start) + "\t" + str(stop) + "\t+\n")

        # Reverse strand: search the reverse complement, then convert the
        # coordinates back to the forward sequence. The ORF stop becomes the
        # lower coordinate, so it is written first.
        for start, stop in _find_orfs(revcomp(seqi)):
            final.append(
                prefix + str(lseq - stop + 1) + "\t" + str(lseq - start + 1) + "\t-\n"
            )

    return final


def annot_to_gff(data_annot, size_min, haplo):
    """Convert tab-separated ORF annotation lines into GFF-like lines.

    Args:
        data_annot: iterable of tab-separated lines. Column 1 is read as the
            scaffold, columns 3 and 4 as start and stop, column 5 as the
            strand (0-based column indices).
        size_min: minimum ORF size (stop - start + 1) to be reported.
        haplo: haplotype name (string) written in the attributes column.

    Returns:
        A list of newline-terminated, tab-separated lines with nine columns.
        ORFs are numbered by their position in ``data_annot`` (starting at
        1), so filtered-out ORFs still consume a number.
    """
    # NOTE(review): these column indices do not match the 7-column lines
    # produced by annot_orf() (contig at 2, start at 4, stop at 5, strand
    # at 6) -- confirm which input format is intended.
    gff_lines = []

    for orf_number, record in enumerate(data_annot, 1):
        fields = record.rstrip().split("\t")
        scaffold, begin, end, strand = fields[1], fields[3], fields[4], fields[5]
        orf_size = int(end) - int(begin) + 1

        # Too short: skip, but the ORF number is already consumed.
        if orf_size < size_min:
            continue

        attributes = (
            "id_orf=orf_" + str(orf_number)
            + ";orf_size=" + str(orf_size)
            + ";haplo=" + haplo
        )
        columns = [scaffold, "1.annotORF.py", "ORF", begin, end, ".", strand, ".", attributes]
        gff_lines.append("\t".join(columns) + "\n")

    return gff_lines


# Analyses #########################################################
#==============================================================

#==============================================================
# Files containing the syntenic regions
#==============================================================

# Directory read for the aligned FASTA files (one file per region).
dir_align="../../02synt_intergenic_fasta/03_allSIDwithrecons/02_aligned_rename/"
# Directory receiving one .orf file per alignment, plus the merged file.
dir_out="../../03synt_intergenic_orf/01_ORF_annotation/"
# Name of the merged file written in dir_out.
merged_name="all_SID_align.orf"
# Number of sequences an alignment must contain to be annotated.
nbhaplo=29

# List the alignment files without going through a shell: hidden entries
# are skipped and names are sorted, as `ls` would do, and an empty directory
# gives an empty list instead of [''].
list_files=sorted(
    name for name in os.listdir(dir_align)
    if not name.startswith(".") and os.path.isfile(dir_align+name)
)

for fasta_file in list_files:

    print(fasta_file)
    file_out=dir_out+fasta_file.replace(".fasta",".orf")
    with open(dir_align+fasta_file) as fasta:
        dico_fasta=parse_fasta(fasta)

    # The identifier is the part of the file name before the first "_".
    SID=fasta_file.split("_")[0]

    # ORF annotation, only for alignments with exactly nbhaplo sequences;
    # other files are skipped and produce no output.
    if len(dico_fasta)==nbhaplo:
        data_annot=annot_orf(dico_fasta,SID)

        # Write the annotation of this alignment to its own file.
        with open(file_out,"w") as out:
            out.write("".join(data_annot))


# Merge all result files into one file. This is done in Python rather than
# with `cat dir/* > dir/merged`, whose glob would also pick up the merged
# file itself when the script is run again.
with open(dir_out+merged_name,"w") as merged:
    for orf_file in sorted(os.listdir(dir_out)):
        if orf_file==merged_name or orf_file.startswith("."):
            continue
        if not os.path.isfile(dir_out+orf_file):
            continue
        with open(dir_out+orf_file) as part:
            merged.write(part.read())

print("END !")
