#!/usr/bin/env python

import os
import re
import sys
from subprocess import Popen, PIPE

# FUNCTIONS##############################################################
def parse_fasta(fasta_file):
    """Parse FASTA-formatted input into a ``{name: sequence}`` dictionary.

    Blank lines and lines starting with "#" are ignored.  A line starting
    with ">" opens a new entry whose name is the rest of that line (with
    trailing whitespace removed); every following line, stripped of
    trailing whitespace, is appended to that entry's sequence.

    Args:
        fasta_file: Either a FASTA-formatted string, or an iterable
            yielding text lines (an opened text file, a list of lines...).

    Returns:
        A dictionary mapping entry names to their full sequence.  If the
        same name appears several times, the last entry wins.  Sequence
        lines that precede any header are stored under the empty name
        "" instead of leaking into the first named entry.  Empty input
        yields an empty dictionary.

    Raises:
        TypeError: If fasta_file is neither a string nor an iterable of
            lines.
    """

    # Strings are split into lines; anything else only has to be iterable
    # line by line.  (The Python 2-only "file" builtin is deliberately not
    # used so that the function also runs on Python 3.)
    if isinstance(fasta_file, str):
        lines = fasta_file.split("\n")
    elif (hasattr(fasta_file, "__iter__")
          and not isinstance(fasta_file, (bytes, bytearray))):
        lines = fasta_file
    else:
        raise TypeError("Argument fasta_file must be a file or a string")

    # Dictionary to receive sequences
    sequence_dict = {}

    # Name of the entry being read (None until the first header is seen)
    # and the chunks of its sequence collected so far
    current_name = None
    current_chunks = []

    for line in lines:

        # Skip comments and empty lines
        if line.startswith("#") or not line.strip():
            continue

        if line.startswith(">"):
            # A new entry starts: store whatever was collected before it
            if current_name is not None or current_chunks:
                sequence_dict[current_name or ""] = "".join(current_chunks)

            # Always restart the sequence so that nothing collected for
            # the previous entry can end up in the new one
            current_name = line.rstrip()[1:]
            current_chunks = []
        else:
            # Sequence line: keep the chunk, joined once at the end
            current_chunks.append(line.rstrip())

    # Store the last entry, unless the input contained nothing at all
    if current_name is not None or current_chunks:
        sequence_dict[current_name or ""] = "".join(current_chunks)

    return sequence_dict





########################################################################
# BEGIN ANALYSIS
########################################################################
# NOTE: this script must be executed from inside the iupred directory,
# because the predictor is invoked through the relative path "./iupred".

# Minimum ORF size (in amino acids) for a sequence to be analysed
min_size=0

# Input and output directories
dir_in="../../../../media/eleonore/Seagate_Ele/sORF_project/analyses_data/02synteny_IDBA/0_Synchro_rec/09_iupred/00fasta_genes/02fasta_gene_prot/"
dir_out="../../../../media/eleonore/Seagate_Ele/sORF_project/analyses_data/02synteny_IDBA/0_Synchro_rec/09_iupred/01_iupred_long/"

# FASTA files to process (file names relative to dir_in)
liste_files=[
    "A03_IDBA_genecons_aa.fasta",
    "D01_IDBA_genecons_aa.fasta",
    "D06_IDBA_genecons_aa.fasta",
    "YPS128_IDBA_genecons_aa.fasta",
]

# Scratch file holding the single sequence handed to iupred; it is
# overwritten for every sequence
testfile="orftest.seq"

for filei in liste_files:

    # Output file name, e.g. "A03_IDBA_genecons_aa.fasta" -> "gene_A03_s0.txt"
    fileo="gene_"+filei.replace("IDBA_genecons_aa.fasta","s"+str(min_size)+".txt")

    # Haplotype identifier: first "_"-separated field of the file name
    haplo=filei.split("/")[-1].split("_")[0]

    # Read the FASTA file into a {name: sequence} dictionary
    with open(dir_in+filei) as seq:
        dico_seq=parse_fasta(seq)

    # For each sequence: write it alone to the scratch file, run iupred on
    # it and append one tab-separated result line to the output file
    with open(dir_out+fileo,"w") as out_results:
        for keyi, sequence in dico_seq.items():

            # Ignore sequences shorter than the requested minimum
            if len(sequence) < min_size:
                continue

            with open(testfile, "w") as out:
                out.write(">"+keyi+"\n")
                out.write(sequence+"\n")

            # Run iupred in "long" mode and capture its standard output as
            # text.  An argument list is used so that no shell is involved.
            process=Popen(["./iupred", testfile, "long"], stdout=PIPE,
                          universal_newlines=True)
            output=process.communicate()[0]

            # Collect the third column of every data line; the first
            # column of the last data line is reported as the size.
            scores=[]
            size=None
            for line in output.splitlines():
                # Skip header lines
                if line.startswith("#"):
                    continue
                fields=line.split()
                # Skip blank or truncated lines instead of failing on them
                if len(fields) < 3:
                    continue
                size=fields[0]
                scores.append(fields[2])

            # Without any data line there is nothing valid to report; do
            # not fall back on values left over from a previous sequence.
            if size is None:
                sys.stderr.write("WARNING: no iupred scores for "+keyi
                                 +" in "+filei+"; record skipped\n")
                continue

            # Merge the scores into one ";"-separated string and write the
            # result line: haplotype, name, size, sequence, scores
            lscores=";".join(scores)
            newline=haplo+"\t"+keyi+"\t"+size+"\t"+sequence+"\t"+lscores+"\n"
            out_results.write(newline)

