from re import *

import re
import os
from subprocess import Popen, PIPE

# Script to make one multi-FASTA file per gene and take GC content
# NOTE(review): no GC-content computation is visible in this file - confirm.


#=======================================================================
#====== BEGIN FUNCTIONS ======================================================

def parse_fasta(fasta_file):

    """Parse FASTA-formatted input into a {name: sequence} dictionary.

    Arguments:
        fasta_file -- a FASTA-formatted string, or an opened file-like
                      object (any object with a "read" attribute that
                      yields lines when iterated over).

    Returns a dictionary mapping each header line (leading ">" and
    trailing whitespace removed) to its sequence, the sequence lines of
    an entry being concatenated after stripping trailing whitespace.

    Lines starting with "#" and blank lines are skipped. Sequence lines
    found before the first header are ignored. If the same name occurs
    several times, the last entry wins. Empty input gives an empty
    dictionary.

    Raises Exception if fasta_file is neither a string nor file-like.
    """

    # Duck-typing replaces the Python 2 only "file" builtin, so the
    # function also runs under Python 3 and accepts StringIO objects.
    if isinstance(fasta_file, str):
        lines = fasta_file.split("\n")
    elif hasattr(fasta_file, "read"):
        lines = fasta_file
    else:
        raise Exception("Argument fasta_file must be a file or a string")

    # Dictionary to receive sequences
    sequence_dict = {}

    # None means "no header seen yet". A dedicated sentinel (rather than
    # an empty string) keeps an entry with an empty name from being
    # mistaken for the start of the file.
    current_name = None

    # Sequence chunks are collected in a list and joined once per entry
    current_chunks = []

    for line in lines:

        # Skip comments and empty lines
        if line.startswith("#") or not line.strip():
            continue

        if line.startswith(">"):

            # Store the entry that just ended, if there is one
            if current_name is not None:
                sequence_dict[current_name] = "".join(current_chunks)

            # Start a new entry
            current_name = line.rstrip()[1:]
            current_chunks = []

        # Sequence line: only meaningful once a header has been seen
        elif current_name is not None:
            current_chunks.append(line.rstrip())

    # Store the last entry (nothing to store if no header was found)
    if current_name is not None:
        sequence_dict[current_name] = "".join(current_chunks)

    return sequence_dict


#=======================================================================
#=======================================================================


def write_fasta(sequence_dict, out_file = None):

    """Write a sequence dictionary in FASTA format.

    Arguments:
        sequence_dict -- dictionary mapping sequence names (strings) to
                         sequences (strings).
        out_file      -- optional file-like object opened in writing
                         mode (any object with a "write" attribute).

    Each entry is written as a ">name" line followed by the sequence
    wrapped in lines of at most 60 characters.

    If out_file is given, the entries are written to it and None is
    returned. Otherwise the whole FASTA text is returned as a string.

    Raises Exception if a key is not a string, or if out_file is given
    but cannot be written to.
    """

    if not all(isinstance(key, str) for key in sequence_dict):
        raise Exception("All dictionary keys must be strings")

    # "is None" (not truthiness) decides whether a file was given, and
    # duck-typing replaces the Python 2 only "file" builtin.
    if out_file is not None and not hasattr(out_file, "write"):
        raise Exception("If given, argument out_file must be a file object opened in writing mode")

    # Entries are collected here only when a string must be returned
    entries = []

    # items() exists in both Python 2 and Python 3 (iteritems() does not)
    for name, seq in sequence_dict.items():

        # Split the sequence in lines of 60 characters and join
        # them with newline characters
        seq_lines = "\n".join(re.findall(r".{1,60}", seq)) + "\n"

        # Complete the FASTA entry by adding the name line at the top
        fasta_entry = ">" + name + "\n" + seq_lines

        if out_file is None:
            entries.append(fasta_entry)
        else:
            out_file.write(fasta_entry)

    # Without an output file, return the FASTA text
    if out_file is None:
        return "".join(entries)



def muscle_align(sequence_dict, big = False):

    """Align the sequences of a dictionary with MUSCLE.

    Arguments:
        sequence_dict -- dictionary mapping sequence names to sequences
                         (strings).
        big           -- specify big=True if sequence size fills the
                         RAM. The MUSCLE parameters
                         "-maxiters 1 -diags1 -sv" will then be used.

    Returns a dictionary mapping the names found in MUSCLE's output to
    the aligned sequences. An empty input dictionary gives an empty
    dictionary without MUSCLE being run.

    Raises Exception if a value is not a string, and RuntimeError if
    MUSCLE exits with a non-zero status. An error raised by Popen when
    the "muscle" program cannot be started is not caught here.
    """

    # values() exists in both Python 2 and Python 3 (itervalues() does not)
    if not all(isinstance(seq, str) for seq in sequence_dict.values()):
        raise Exception("All dictionary values must be strings")

    # Nothing to align: do not spawn MUSCLE at all
    if not sequence_dict:
        return {}

    # Create a command line object (a pipe through MUSCLE) using the
    # subprocess module
    command = ["muscle"]

    if big:
        command.extend(["-maxiters", "1", "-diags1", "-sv"])

    # universal_newlines=True makes the pipes exchange text, so that
    # Python 3 gets and returns str instead of bytes
    muscle_pipe = Popen(command, stdin = PIPE, stdout = PIPE, stderr = PIPE,
                        universal_newlines = True)

    # Send the dictionary in FASTA format to MUSCLE's stdin and collect
    # both output streams
    alignment, errors = muscle_pipe.communicate(write_fasta(sequence_dict))

    # Report a failed run instead of silently returning an empty or
    # truncated alignment
    if muscle_pipe.returncode != 0:
        raise RuntimeError("MUSCLE failed with exit status %d: %s"
                           % (muscle_pipe.returncode, errors.strip()))

    return parse_fasta(alignment)





#====== END FUNCTIONS ======================================================
#=======================================================================
# Input and output directories
dir_corres = "../../01annot_gene/04gff_annotcons/all_annotations/"
dir_fasta = "../../01annot_gene/02fasta_all/fasta_nuc_coding/"
dir_aligned_fasta = "../../11_selection/02_align_genes_allhaplo/"
dir_notal = "../../11_selection/00_notal_genes"

# Make a dictionary of gene name correspondences:
#   dico_corres[haplo][augustus id] = gene id
# built from the 9th tab-separated column of every annotation file,
# keeping only the lines whose third ";"-separated field is
# "conserved=TRUE".

# os.listdir replaces a shell call to "ls"; hidden files are dropped
# because "ls" does not list them either.
list_gff = sorted(name for name in os.listdir(dir_corres)
                  if not name.startswith("."))
dico_corres = {}
gene_names = []
for filei in list_gff:
    with open(os.path.join(dir_corres, filei)) as gff:
        for line in gff:
            line = line.rstrip()

            # Blank lines and "#" comment lines carry no annotation
            if not line or line.startswith("#"):
                continue

            # The attributes are read from the 9th column; lines
            # without one cannot be used
            columns = line.split("\t")
            if len(columns) < 9:
                continue

            array = columns[8].split(";")
            if len(array) > 2 and array[2] == "conserved=TRUE":
                haplo = array[0].replace("haplo=", "")
                augid = array[1].replace("augustus=", "")
                geneid = array[4].replace("gene_id=", "")

                # Store under the haplo read on this very line, so a
                # file without any conserved gene can neither reuse the
                # haplo of a previous file nor wipe its table
                dico_corres.setdefault(haplo, {})[augid] = geneid
                gene_names.append(geneid)

gene_nameu = set(gene_names)
# Read the FASTA files of the genes

# os.listdir replaces a shell call to "ls"; hidden files are dropped
# because "ls" does not list them either. This listing is not used in
# the rest of this section.
fasta_names = sorted(name for name in os.listdir(dir_fasta)
                     if not name.startswith("."))

# Haplo identifier (as used to index dico_corres) -> prefix of the name
# of the corresponding FASTA file
dico_corres2 = {
    'SC06': 'C06',
    'SC04': 'C04',
    'SC05': 'C05',
    'SC02': 'C02',
    'SC03': 'C03',
    'SC01': 'C01',
    'SA04': 'A04',
    'SA05': 'A05',
    'SA06': 'A06',
    'SA01': 'A01',
    'SA02': 'A02',
    'SA03': 'A03',
    'SD01': 'D01',
    'SD03': 'D03',
    'SD02': 'D02',
    'SD05': 'D05',
    'SD04': 'D04',
    'SD06': 'D06',
    'SB06': 'B06',
    'SB05': 'B05',
    'SB04': 'B04',
    'SB03': 'B03',
    'SB02': 'B02',
    'SB01': 'B01',
}

# Sorted so that the haplos are always processed in the same order
chaplo = sorted(dico_corres2)

# dico_all[haplo] = {FASTA header: sequence} for the file of that haplo
dico_all = {}

for haploi in chaplo:
    fasta_path = os.path.join(dir_fasta,
                              dico_corres2[haploi] + "_IDBA_DNA_coding.fasta")
    with open(fasta_path) as fasta:
        dico_all[haploi] = parse_fasta(fasta)

# Make a new, simpler dictionary for the next step:
#   dico_genes["haplo;gene id"] = sequence
# Only the sequences whose augustus id (second tab-separated field of
# the FASTA header) has a correspondence for that haplo are kept.
dico_genes = {}
for haplo in chaplo:

    # A haplo without any correspondence simply contributes nothing
    corres_haplo = dico_corres.get(haplo, {})

    for key, sequence in dico_all[haplo].items():

        # Headers without a second field carry no augustus id
        fields = key.split("\t")
        if len(fields) < 2:
            continue

        # Skip the sequences without correspondence: storing them would
        # file them under the gene id left over from a previous sequence
        gene_corres = corres_haplo.get(fields[1])
        if gene_corres is None:
            continue

        dico_genes[haplo + ";" + gene_corres] = sequence
	


# Make one aligned multifasta per gene: gather the sequence of the gene
# in every haplo that has it, align them with MUSCLE and write the
# result to "<gene>_align.fasta" in dir_aligned_fasta.

# Sorted so that the genes are always processed in the same order
for gene in sorted(gene_nameu):

    # Unaligned sequences of this gene, keyed by haplo
    gene_notal = {}
    for haplo in chaplo:
        gene_key = haplo + ";" + gene

        # Membership test rather than a magic default string, which
        # could collide with a stored value
        if gene_key in dico_genes:
            gene_notal[haplo] = dico_genes[gene_key]

    # Align
    gene_al = muscle_align(gene_notal)

    align_name = gene + "_align.fasta"
    with open(os.path.join(dir_aligned_fasta, align_name), "w") as out:
        write_fasta(gene_al, out)


