from re import *

import re
import os
from subprocess import Popen, PIPE

# Script to make one multi-FASTA file per gene, align it, and compute GC content


#=======================================================================
#====== BEGIN FUNCTIONS ======================================================

def parse_fasta(fasta_file):
    
    """Parse FASTA-formatted input into a {name: sequence} dictionary.
    
    Lines starting with "#" and blank lines are ignored. A line starting
    with ">" opens a new record whose name is the rest of that line
    (trailing whitespace removed); every following line is appended to
    that record's sequence with trailing whitespace removed.
    
    Arguments:
        fasta_file: a FASTA-formatted string (bytes are also accepted and
            decoded), or an opened file-like object yielding text lines.
    
    Returns:
        A dictionary mapping each record name to its sequence. Empty
        input gives an empty dictionary. Sequence lines found before any
        header are stored under the empty name "".
    
    Raises:
        Exception: if fasta_file is neither a string nor a file-like
            object.
    """
    
    # Pipes hand back bytes on Python 3; decode them so they are treated
    # like any other FASTA string. On Python 2 bytes is str, so this
    # branch is never taken there.
    if isinstance(fasta_file, bytes) and not isinstance(fasta_file, str):
        fasta_file = fasta_file.decode()
    
    if isinstance(fasta_file, str):
        lines = fasta_file.split("\n")
    elif hasattr(fasta_file, "readline"):
        # Duck-typed check: the "file" builtin no longer exists on
        # Python 3, and this also accepts StringIO-like objects.
        lines = fasta_file
    else:
        raise Exception("Argument fasta_file must be a file or a string")
    
    # Dictionary to receive sequences
    sequence_dict = {}
    
    # Name and sequence chunks of the record being read. Chunks are
    # collected in a list and joined once per record to avoid quadratic
    # string concatenation on long sequences.
    current_name = ""
    current_chunks = []
    
    for line in lines:
        
        # Skip comments and empty lines
        if line.startswith("#") or not line.strip():
            continue
        
        if line.startswith(">"):
            
            # Store the record read so far, if any. Checking the chunks
            # as well keeps header-less leading sequence from leaking
            # into the first named record.
            if current_name or current_chunks:
                sequence_dict[current_name] = "".join(current_chunks)
            
            # Always start the new record with an empty sequence
            current_chunks = []
            current_name = line.rstrip()[1:]
        
        else:
            current_chunks.append(line.rstrip())
    
    # Store the last record; nothing is stored for empty input
    if current_name or current_chunks:
        sequence_dict[current_name] = "".join(current_chunks)
    
    return sequence_dict


#=======================================================================
#=======================================================================


def write_fasta(sequence_dict, out_file = None):
    
    """Write a sequence dictionary in FASTA format.
    
    Each record is written as a ">name" line followed by the sequence
    wrapped at 60 characters per line.
    
    Arguments:
        sequence_dict: dictionary mapping names (strings) to sequences.
        out_file: optional file-like object opened in writing mode. When
            omitted (or falsy), the FASTA text is returned instead.
    
    Returns:
        The FASTA text as a string when no out_file is given, otherwise
        None (the records are written to out_file).
    
    Raises:
        Exception: if a dictionary key is not a string, or if out_file is
            given but has no write() method.
    """
    
    if not all(isinstance(key, str) for key in sequence_dict):
        raise Exception("All dictionary keys must be strings")
    
    # Duck-typed check: the "file" builtin no longer exists on Python 3,
    # and this also accepts StringIO-like objects.
    if out_file and not hasattr(out_file, "write"):
        raise Exception("If given, argument out_file must be a file object opened in writing mode")
    
    # Entries are collected and joined once at the end (only used when
    # returning a string) to avoid quadratic string concatenation.
    entries = []
    
    # items() works on both Python 2 and 3, unlike iteritems()
    for name, seq in sequence_dict.items():
        
        # Split the sequence in lines of 60 characters and join
        # them with newline characters
        seq_lines = "\n".join(re.findall(r".{1,60}", seq)) + "\n"
        
        # Complete the FASTA entry by adding the name line at the top
        fasta_entry = ">" + name + "\n" + seq_lines
        
        if out_file:
            out_file.write(fasta_entry)
        else:
            entries.append(fasta_entry)
    
    # Once the writing is completed, if no out_file was given,
    # return the FASTA string
    if not out_file:
        return "".join(entries)
    
    return None



def muscle_align(sequence_dict, big = False):
    
    """Align a sequence dictionary with MUSCLE and return the alignment.
    
    The sequences are sent in FASTA format to the stdin of a "muscle"
    process and the alignment is read back from its stdout.
    
    Arguments:
        sequence_dict: dictionary mapping names to sequences (strings).
        big: specify big=True if sequence size fills the RAM. MUSCLE
            parameters " -maxiters 1 -diags1 -sv " will then be used.
    
    Returns:
        A dictionary mapping names to aligned sequences, as parsed from
        MUSCLE's FASTA output. An empty input dictionary gives an empty
        dictionary without running MUSCLE.
    
    Raises:
        Exception: if a dictionary value is not a string, or if MUSCLE
            exits with a non-zero status.
        OSError: if the "muscle" executable cannot be started.
    """
    
    # values() works on both Python 2 and 3, unlike itervalues()
    if not all(isinstance(seq, str) for seq in sequence_dict.values()):
        raise Exception("All dictionary values must be strings")
    
    # Nothing to align: skip spawning a process whose output would be
    # discarded anyway
    if not sequence_dict:
        return {}
    
    # Build the MUSCLE command line
    command = ["muscle"]
    
    if big:
        command.extend(["-maxiters", "1", "-diags1", "-sv"])
    
    # universal_newlines=True opens the pipes in text mode, so a str can
    # be sent and a str is read back on both Python 2 and Python 3
    muscle_pipe = Popen(command, stdin = PIPE, stdout = PIPE, stderr = PIPE,
                        universal_newlines = True)
    
    # Send the dictionary in FASTA format to MUSCLE's stdin and collect
    # both output streams
    alignment, errors = muscle_pipe.communicate(write_fasta(sequence_dict))
    
    # Fail loudly instead of parsing the (empty) output of a failed run
    if muscle_pipe.returncode != 0:
        raise Exception("MUSCLE failed with exit code %d: %s"
                        % (muscle_pipe.returncode, errors.strip()))
    
    return parse_fasta(alignment)





#====== END FUNCTIONS ======================================================
#=======================================================================

# Input and output locations, relative to the directory the script is
# run from. Note that dir_gc and dir_orf60 are file paths, not
# directories.
dir_fasta="../../09_iupred/00fasta_genes/01fasta_gene_nt/"
dir_aligned_fasta="../../11_selection/02_align_genes/"
dir_gc="../../11_selection/table_genes_gc.txt" 

dir_orf60="../tables_out/table_S2_metaexp_V1.txt"


# Take ORFs in multianalysis (>60 nt) and store them in a dictionary.
# Only rows whose fourth column is "gene" are kept; the third column is
# the key and the eighth column the value.
dico_orf = {}
with open(dir_orf60) as orffile:
    for line in orffile:
        array = line.rstrip().split("\t")
        # Skip blank or truncated rows instead of failing with IndexError
        if len(array) > 7 and array[3] == "gene":
            dico_orf[array[2]] = array[7]

# Make one multifasta dictionary per haplotype. dico_corres maps the
# haplotype label used here to the prefix of its FASTA file name.
dico_corres={"SA03":"A03","SD06":"D06","SD01":"D01","Y128":"YPS128"}

chaplo=["SA03","SD06","SD01", "Y128"]
dico_all = {}
for haplo in chaplo:
    with open(dir_fasta+dico_corres[haplo]+"_IDBA_genecons_nt.fasta") as fasta:
        dico_all[haplo] = parse_fasta(fasta)


# Make one aligned multifasta per ORF
for orf in dico_orf:
    
    # Unaligned sequences of this ORF, keyed by haplotype
    orf_notal = {}
    for haplo in chaplo:
        if orf in dico_all[haplo]:
            orf_notal[haplo] = dico_all[haplo][orf]
    
    # An ORF found in no haplotype has nothing to align; writing an
    # empty alignment file would only break the GC step below
    if not orf_notal:
        continue
    
    # Align
    orf_al = muscle_align(orf_notal)
    
    align_name = orf+"_align.fasta"
    with open(dir_aligned_fasta+align_name,"w") as out:
        write_fasta(orf_al, out)

# Take % of GC.
# List the alignment files without going through a shell: regular,
# non-hidden files of the alignment directory, in sorted order.
list_fasta = sorted(
    os.path.join(dir_aligned_fasta, name)
    for name in os.listdir(dir_aligned_fasta)
    if not name.startswith(".")
    and os.path.isfile(os.path.join(dir_aligned_fasta, name))
)


with open(dir_gc,"w") as out:
    for gene in list_fasta:
        with open(gene) as fasta:
            dico_gene = parse_fasta(fasta)
        
        gene_name = os.path.basename(gene).replace("_align.fasta","")
        for key, seq in dico_gene.items():
            # Count case-insensitively so lowercase bases are not missed
            seq = seq.upper()
            
            # Alignment gaps do not count towards the sequence length
            seq_len = len(seq.replace("-",""))
            
            # GC content is undefined for an empty or all-gap sequence;
            # skip it rather than dividing by zero
            if seq_len == 0:
                continue
            
            gc_perc = float(seq.count("G")+seq.count("C"))/seq_len*100
            
            out.write("\t".join([key,gene_name,str(gc_perc)])+"\n")

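#test
# The lines below are shell command lines for megacc kept as notes.
# They are not Python, so they are commented out to keep the file parsable.

# megacc


# megacc -a distance_mean.mao -d YPR198W_align.fas -o testmega_mean -t hc4_intergenic_phy_phyml_tree.txt
# megacc -a distance_pairwise.mao -d YPR198W_align.fas -o testmega_pair -t hc4_intergenic_phy_phyml_tree.txt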


