#!/usr/bin/env python

import re
import os
from subprocess import Popen, PIPE

# Script to align all synt intergenic regions (24 HC strains + YPS128 + S288C + ancestral reconstructions)

#=======================================================================
#====== BEGIN FUNCTIONS ======================================================

def parse_fasta(fasta_file):

    """Parse FASTA-formatted text into a {name: sequence} dictionary.

    Args:
        fasta_file: A FASTA-formatted string, or an opened file object
            (any iterable yielding lines of text is accepted).

    Returns:
        A dict mapping each header line (leading ">" and trailing
        whitespace removed) to its sequence, i.e. the following
        sequence lines joined together with trailing whitespace
        removed. Lines starting with "#" and blank lines are ignored.
        Input without any header line yields an empty dict. Sequence
        lines found before the first header belong to no record and
        are ignored. If a name occurs twice, the last record wins.

    Raises:
        Exception: If fasta_file is neither a string nor an iterable
            of lines.
    """

    # Byte strings (distinct from str on Python 3 only) are decoded so
    # that they are handled like any other FASTA string.
    if isinstance(fasta_file, bytes) and not isinstance(fasta_file, str):
        fasta_file = fasta_file.decode()

    # The Python 2 ``file`` builtin no longer exists on Python 3, so
    # anything iterable that is not a string is treated as a line source.
    if isinstance(fasta_file, str):
        lines = fasta_file.split("\n")
    elif hasattr(fasta_file, "__iter__"):
        lines = fasta_file
    else:
        raise Exception("Argument fasta_file must be a file or a string")

    # Dictionary to receive sequences
    sequence_dict = {}

    # None means "no header seen yet"; an empty string is a legal
    # (if unusual) record name and must not be confused with it.
    current_name = None
    chunks = []

    for line in lines:

        # Skip comments and empty lines
        if line.startswith("#") or not line.strip():
            continue

        if line.startswith(">"):
            # A new header closes the record currently being read
            if current_name is not None:
                sequence_dict[current_name] = "".join(chunks)
            current_name = line.rstrip()[1:]
            chunks = []

        elif current_name is not None:
            # Sequence line of the current record
            chunks.append(line.rstrip())

    # Store the last record, if the input contained any
    if current_name is not None:
        sequence_dict[current_name] = "".join(chunks)

    return sequence_dict


#=======================================================================
#=======================================================================


def write_fasta(sequence_dict, out_file=None):

    """Write a sequence dictionary in FASTA format.

    Each entry is written as a ">name" line followed by the sequence
    wrapped at 60 characters per line. An empty sequence produces the
    name line followed by an empty line.

    Args:
        sequence_dict: Dict mapping sequence names (str) to sequences.
        out_file: Optional file-like object opened for writing (any
            object with a ``write`` method). When omitted (or falsy),
            the FASTA text is returned instead of being written.

    Returns:
        The FASTA text as a string when no out_file is given,
        otherwise None.

    Raises:
        Exception: If a dictionary key is not a string, or if out_file
            is given but has no ``write`` method.
    """

    if not all(isinstance(key, str) for key in sequence_dict):
        raise Exception("All dictionary keys must be strings")

    # Duck-typed check: the Python 2 ``file`` builtin does not exist on
    # Python 3 and would also reject in-memory buffers such as StringIO.
    if out_file and not hasattr(out_file, "write"):
        raise Exception("If given, argument out_file must be a file object opened in writing mode")

    # Entries collected for the returned string (unused with out_file)
    entries = []

    for name, seq in sequence_dict.items():

        # Split the sequence in lines of 60 characters and join
        # them with newline characters
        seq_lines = "\n".join(re.findall(r".{1,60}", seq)) + "\n"

        # Complete the FASTA entry by adding the name line at the top
        fasta_entry = ">" + name + "\n" + seq_lines

        if out_file:
            out_file.write(fasta_entry)
        else:
            entries.append(fasta_entry)

    # If no out_file was given, return the whole FASTA text
    if not out_file:
        return "".join(entries)


#=======================================================================
#=======================================================================



def muscle_align(sequence_dict, big=False):

    """Align the sequences of a dictionary with MUSCLE.

    The sequences are sent in FASTA format to the standard input of a
    ``muscle`` process and the alignment is read from its standard
    output.

    Args:
        sequence_dict: Dict mapping sequence names to sequences (str).
        big: Specify big=True if sequence size fills the RAM.
            MUSCLE parameters "-maxiters 1 -diags1 -sv" will then be
            used.

    Returns:
        A dict mapping sequence names to aligned sequences, or an
        empty dict if sequence_dict is empty (MUSCLE is not run then).

    Raises:
        Exception: If a dictionary value is not a string.
        RuntimeError: If MUSCLE exits with a non-zero status.
        OSError: If the ``muscle`` executable cannot be started.
    """

    if not all(isinstance(seq, str) for seq in sequence_dict.values()):
        raise Exception("All dictionary values must be strings")

    # Nothing to align: do not spawn a process for an empty input
    if not sequence_dict:
        return {}

    command = ["muscle"]

    if big:
        command.extend(["-maxiters", "1", "-diags1", "-sv"])

    # universal_newlines=True makes the pipes exchange text rather than
    # bytes, so the same str data works on Python 2 and Python 3.
    muscle_pipe = Popen(command, stdin=PIPE, stdout=PIPE, stderr=PIPE,
                        universal_newlines=True)

    # communicate() returns (stdout, stderr)
    alignment, errors = muscle_pipe.communicate(write_fasta(sequence_dict))

    # A failed run must not be parsed as if it were an alignment
    if muscle_pipe.returncode != 0:
        raise RuntimeError("MUSCLE exited with status %d: %s"
                           % (muscle_pipe.returncode, errors.strip()))

    return parse_fasta(alignment)


#====== END FUNCTIONS ======================================================
#=======================================================================
# Step 1: for every reconstruction file, add the reconstructed sequences
# to the matching file of strain sequences and write the merged FASTA.

# Input directories: reconstructions, and not-aligned strain sequences
dir_recons = "../../02synt_intergenic_fasta/02_historian_results/"
dir_all_SID = "../../02synt_intergenic_fasta/00_notaligned/"

# Output directories
dir_allrec_notal = "../../02synt_intergenic_fasta/03_allSIDwithrecons/01_notaligned/"
dir_allrec_al = "../../02synt_intergenic_fasta/03_allSIDwithrecons/02_aligned/"

# List the directory directly instead of parsing the output of `ls`:
# an empty directory then gives an empty list rather than [""].
# Hidden files are skipped, as `ls` does.
list_fastarec = sorted(name for name in os.listdir(dir_recons)
                       if not name.startswith("."))

# Names of the reconstructed sequences expected in each file
reconsnames = ["N1", "N2", "N3"]

for fasta_file in list_fastarec:

    # Skip empty reconstruction files
    if os.stat(os.path.join(dir_recons, fasta_file)).st_size == 0:
        continue

    # Dictionary of the reconstructed sequences
    with open(os.path.join(dir_recons, fasta_file)) as rec:
        dico_rec = parse_fasta(rec)

    # Dictionary of all strain sequences (24 Spar + cer); the file has
    # the same name as the reconstruction file without ".recons"
    fasta_all = fasta_file.replace(".recons", "")

    with open(os.path.join(dir_all_SID, fasta_all)) as allseq:
        dico_allseq = parse_fasta(allseq)

    # The region id is the part of the file name before the first "_"
    sid = fasta_all.split("_")[0]

    # Add the reconstructed sequences to dico_allseq
    for haplorec in reconsnames:
        # Take the sequence and remove the alignment gaps
        newseq = dico_rec[haplorec].replace("-", "").upper()
        lseq = len(newseq)
        # New id: <sid>;<reconstruction name>;<sid>;1-<sequence length>
        newname = sid + ";" + haplorec + ";" + sid + ";1-" + str(lseq)
        dico_allseq[newname] = newseq

    # Write all sequences of the region plus the reconstructions
    fasta_allseq = fasta_all.replace(".fasta", "_all_notal.fasta")

    with open(os.path.join(dir_allrec_notal, fasta_allseq), "w") as out:
        write_fasta(dico_allseq, out)


# Step 2: run the MUSCLE alignment on every file written by step 1.

# Rank of the first file to align, to resume a run without starting
# over. Files are counted from 1, so 0 and 1 both mean "align all".
start_sid = 0

# List the directory directly instead of parsing the output of `ls`
# (hidden files are skipped, as `ls` does).
list_allseq = sorted(name for name in os.listdir(dir_allrec_notal)
                     if not name.startswith("."))

for i, fasta_file in enumerate(list_allseq, 1):

    # Skip the files ranked before the requested starting point
    if i < start_sid:
        continue

    with open(os.path.join(dir_allrec_notal, fasta_file)) as fasta:
        dico_all_notal = parse_fasta(fasta)

    # Progress report; the parenthesised form prints the same text on
    # Python 2 and Python 3.
    print(fasta_file)
    dico_align = muscle_align(dico_all_notal, big=True)

    # "..._all_notal.fasta" becomes "..._all_al.fasta"
    fasta_out = fasta_file.replace("notal", "al")

    with open(os.path.join(dir_allrec_al, fasta_out), "w") as out:
        write_fasta(dico_align, out)
    

# Step 3: merge all regions in one file for RM, renaming the Scer
# sequences with shorter names.
# We will mask non aligned regions and do the correspondence with ORFs
# afterwards.
dir_allrec_notal = "../../02synt_intergenic_fasta/03_allSIDwithrecons/01_notaligned/"

# List the directory directly instead of parsing the output of `ls`
# (hidden files are skipped, as `ls` does).
list_fastarec = sorted(name for name in os.listdir(dir_allrec_notal)
                       if not name.startswith("."))

# Merged dictionary: (possibly shortened) name -> sequence
dico_seq2 = {}

for fasta_file in list_fastarec:

    # Read the not-aligned FASTA file of the region
    with open(os.path.join(dir_allrec_notal, fasta_file)) as fasta:
        dico_seq = parse_fasta(fasta)

    for key, newseq in dico_seq.items():
        # Shorten the Y128 names because they are too long: keep the
        # first "|"-separated field (without ".fsa_tpg") followed by
        # the third field.
        if "Y128" in key and "ordered" in key:
            fields = key.split("|")
            key2 = fields[0].replace(".fsa_tpg", "") + fields[2]
            print(key2)
        else:
            key2 = key

        dico_seq2[key2] = newseq

# Write the merged sequences in a new file
fastaout = "../../02synt_intergenic_fasta/03_allSIDwithrecons/all_SID_notal_rename.fasta"
with open(fastaout, "w") as out:
    write_fasta(dico_seq2, out)

#
