#!/usr/bin/env python

#Script to extract sequences with conserved synteny (one file per SID group),
#align them and write the aligned sequences


import re
import os
from subprocess import Popen, PIPE

#=======================================================================
#====== BEGIN FUNCTIONS ======================================================

def parse_fasta(fasta_file):
    
    """Function to parse FASTA data into a sequence dictionary.
    
    The fasta_file argument must be a FASTA-formatted string or an
    iterable of lines (an opened file object, a list of lines, ...).
    
    Lines starting with "#" and blank lines are ignored. The name of an
    entry is its whole header line without the leading ">" and without
    trailing whitespace. Sequence lines are concatenated after removing
    their trailing whitespace.
    
    Returns a dictionary {name: sequence}. Input without any entry gives
    an empty dictionary. Sequence lines found before the first header
    are stored under the empty name "".
    
    Raises TypeError if fasta_file is neither a string nor an iterable.
    """
    
    if isinstance(fasta_file, str):
        lines = fasta_file.split("\n")
    elif hasattr(fasta_file, "__iter__"):
        lines = fasta_file
    else:
        raise TypeError("Argument fasta_file must be a string or an iterable of lines")
    
    # Dictionary to receive sequences
    sequence_dict = {}
    
    # None means "no header seen yet"; an empty string is a valid
    # (if unusual) entry name and must not be confused with it
    current_name = None
    
    # Chunks of the current sequence, joined once per entry to avoid
    # repeated string concatenation on long sequences
    chunks = []
    
    for line in lines:
        
        # Skip comments and empty lines
        if line.startswith("#") or not line.strip():
            continue
        
        if line.startswith(">"):
            
            # Store the previous entry (if any) before starting a new one
            if current_name is not None or chunks:
                sequence_dict[current_name or ""] = "".join(chunks)
                chunks = []
            
            current_name = line.rstrip()[1:]
        
        else:
            chunks.append(line.rstrip())
    
    # Store the last entry, but do not create one from empty input
    if current_name is not None or chunks:
        sequence_dict[current_name or ""] = "".join(chunks)
    
    return sequence_dict


#=======================================================================
#=======================================================================


def write_fasta(sequence_dict, out_file = None):
    
    """Function to write a sequence dictionary in FASTA format to an
    opened file or a returned string
    
    sequence_dict maps sequence names (strings) to sequences (strings).
    Each sequence is written in lines of at most 60 characters.
    
    If given, out_file must be an object with a write() method (a file
    opened in writing mode, a StringIO, ...). The entries are then
    written to it and nothing is returned. Without out_file, the whole
    FASTA text is returned as a string.
    
    Raises TypeError if a key is not a string or if out_file has no
    write() method.
    """
    
    if not all(isinstance(key, str) for key in sequence_dict):
        raise TypeError("All dictionary keys must be strings")
    
    if out_file is not None and not hasattr(out_file, "write"):
        raise TypeError("If given, argument out_file must be a file object opened in writing mode")
    
    # Entries are collected here only when a string must be returned
    entries = []
    
    for name, seq in sequence_dict.items():
        
        # Split the sequence in lines of 60 characters and join
        # them with newline characters
        seq_lines = "\n".join(re.findall(r".{1,60}", seq)) + "\n"
        
        # Complete the FASTA entry by adding the name line at the top
        fasta_entry = ">" + name + "\n" + seq_lines
        
        if out_file is None:
            entries.append(fasta_entry)
        else:
            out_file.write(fasta_entry)
    
    # If no out_file was given, return the FASTA string
    if out_file is None:
        return "".join(entries)


#=======================================================================
#=======================================================================


def muscle_align(sequence_dict, big = False):
    
    """Function to get the aligned version of a sequence dictionary
    using MUSCLE.
    
    The sequences are sent in FASTA format to the standard input of the
    "muscle" executable and the alignment is read from its standard
    output.
    
    Specify big=True if sequence size fills the RAM.
    MUSCLE parameters " -maxiters 1 -diags1 -sv " will then be used.
    
    Returns a dictionary {name: aligned sequence}. An empty input
    dictionary gives an empty dictionary without running MUSCLE.
    
    Raises TypeError if a sequence is not a string and RuntimeError if
    MUSCLE exits with a non-zero status.
    """
    
    if not all(isinstance(seq, str) for seq in sequence_dict.values()):
        raise TypeError("All dictionary values must be strings")
    
    # Nothing to align: do not start an external process for nothing
    if not sequence_dict:
        return {}
    
    command = ["muscle"]
    
    if big:
        command.extend(["-maxiters", "1", "-diags1", "-sv"])
    
    # universal_newlines makes the pipes exchange text, so the same code
    # works whether the interpreter uses byte or unicode strings
    muscle_pipe = Popen(command, stdin = PIPE, stdout = PIPE, stderr = PIPE,
                        universal_newlines = True)
    
    alignment, errors = muscle_pipe.communicate(write_fasta(sequence_dict))
    
    # A failed run must not be parsed as if it were an alignment
    if muscle_pipe.returncode != 0:
        raise RuntimeError("MUSCLE exited with status %d:\n%s"
                           % (muscle_pipe.returncode, errors))
    
    return parse_fasta(alignment)




def extract_synt(synt,dico_coord, dico_haplo, nb_haplo = 26):
    
    """Function to extract the sequences of a syntenic group in a dictionary
    
    Arguments:
    synt : syntenic group description, ex SID0;ID0-ID1
    dico_coord : coordinates of each orthogroup, one tab-separated
                 column per haplotype, each column being
                 "<haplo>_<...>;<chrom>;<start>-<end>"
    dico_haplo : genomic sequences, keyed by "<haplo>;<chrom>"
    nb_haplo : number of leading coordinate columns to process
    
    For each column, the extracted sequence is the slice of the genomic
    sequence going from the end coordinate of the first orthogroup to
    the start coordinate of the second one minus 2. The haplotype and
    chromosome are taken from the first orthogroup.
    
    Returns a dictionary whose keys are
    "<synt id>;<haplo>;<chrom>;<start>-<stop>" and whose values are the
    extracted sequences.
    """
    
    fields = synt.split(";")
    synt_id = fields[0]
    orthos = fields[1].split("-")
    
    # Split the coordinate rows once instead of once per column
    columns1 = dico_coord[orthos[0]].split("\t")
    columns2 = dico_coord[orthos[1]].split("\t")
    
    dico_synt = {}
    
    for col in range(nb_haplo):
        array1 = columns1[col].split(";")
        array2 = columns2[col].split(";")
        
        haplo = array1[0].split("_")[0]
        chrom_id = array1[1]
        
        # start of the syntenic region = end of gene 1
        start_syn = int(array1[2].split("-")[1])
        # stop of the syntenic region = before the start of gene 2
        stop_syn = int(array2[2].split("-")[0]) - 2
        
        # A negative stop would be read by the slice as an offset from
        # the end of the sequence; clamp it so such a region is empty
        seqi = dico_haplo[haplo + ";" + chrom_id][start_syn:max(stop_syn, 0)]
        
        key_synt = synt_id + ";" + haplo + ";" + chrom_id + ";" + str(start_syn) + "-" + str(stop_syn)
        dico_synt[key_synt] = seqi
    
    return dico_synt


#====== END FUNCTIONS ======================================================
#=======================================================================

#Input files: list of genome FASTA paths (one per line), syntenic groups
#and orthogroup coordinates
link_genomes="../00links/link_fasta.txt"
link_synt="../../../Clustering/05_SPar24HC/syntpairs.cons"
link_coord="../../../Clustering/05_SPar24HC/all.rbh.pairs.cons"


#Directories where the syntenic FASTA files are written
dir_fasta="../../02synt_intergenic_fasta/00_notaligned/"
dir_aligned="../../02synt_intergenic_fasta/aligned/"
dir_aligned4="../../02synt_intergenic_fasta/aligned_4/"

nbhaplo=26
min_seq=100 #min length of a seq (sequences of this length or shorter are rejected)

#Read the genomes and store all their sequences in one dictionary whose
#keys are "<haplotype prefix>;<sequence name>"
dico_haplo={}

#Haplotypes whose prefix is not simply "S" + haplotype name
special_prefix={"YPS128": "Y128", "S288Crename": "S288"}

with open(link_genomes) as link:
    for file_in in link:
        file_in=file_in.rstrip()
        
        #A blank line is not a path: opening it would fail
        if not file_in:
            continue
        
        #The haplotype name is the part of the file name before the first "_"
        haplo=file_in.split("/")[-1].split("_")[0]
        print(haplo)
        
        #file_in="../../../00data_seq/pseudo_IDBA/A03_IDBA.fapseudoscaff.fasta"
        with open(file_in) as genome:
            dico_genome=parse_fasta(genome)
        
        prefix=special_prefix.get(haplo, "S"+haplo)
        
        for key in dico_genome:
            dico_haplo[prefix+";"+key]=dico_genome[key]
    


#Read the syntenic groups (one per line) and store them in a list
with open (link_synt) as synt:
    lsynt=[line.rstrip() for line in synt]


#Read the coordinates and store them in a dictionary:
# key = orthogroup id (first column)
# value = columns 54 to 79, joined with tabs

dico_coord={}
with open(link_coord) as coord:
    for line in coord:
        array=line.rstrip().split("\t")
        
        #Column 54 is the reference: its chromosome names differ, so
        #"chr" is added in front of the chromosome field
        arrayref=array[54].split(";")
        arrayref[1]="chr"+arrayref[1]
        
        selected=[";".join(arrayref)]
        for i in range(55,79+1):
            selected.append(array[i])
        
        dico_coord[array[0]]="\t".join(selected)
        

#Everything is loaded:
#extract the intergenic regions and write one FASTA file per region

sel_4=["SA03", "SD01", "SD06", "Y128"]
for synt in lsynt:
    base_name=synt.replace(";","_")
    synt_out=base_name+".fasta"
    synt_al=base_name+".aligned.fasta"   #only used by the disabled step below
    synt_al4=base_name+".aligned4.fasta"
    
    #Intergenic regions of this syntenic group
    dico_synt=extract_synt(synt,dico_coord, dico_haplo)
    
    #Skip the group unless it has exactly nbhaplo sequences...
    if len(dico_synt) != nbhaplo:
        continue
    
    #...and every one of them is longer than min_seq
    if any(len(seq) <= min_seq for seq in dico_synt.values()):
        continue
    
    #Write the sequences before alignment
    with open(dir_fasta+synt_out, "w") as out_synt:
        write_fasta(dico_synt, out_synt)
    
    #Alignment of all the intergenic regions (currently disabled)
    #dico_align=muscle_align(dico_synt)
    #with open(dir_aligned+synt_al, "w") as out_al:
    #    write_fasta(dico_align, out_al)
    
    #Keep only the strains of the 4 main lineages (for reconstruction);
    #the haplotype is the second ";"-separated field of the key
    dico_synt4=dict(
        (key, seq) for key, seq in dico_synt.items()
        if key.split(";")[1] in sel_4
    )
    
    #Align the selected strains and write the alignment
    dico_align4=muscle_align(dico_synt4)
    
    with open(dir_aligned4+synt_al4, "w") as out_al4:
        write_fasta(dico_align4, out_al4)
