from re import *

import re
import os
from subprocess import Popen, PIPE

# Script to make multi-FASTA files with aligned intergenic ORFs +/- 500 kb (for Alex's experiments)


#=======================================================================
#====== BEGIN FUNCTIONS ======================================================

def parse_fasta(fasta_file):
    
    """Function to parse FASTA data into a sequence dictionary.
    
    The fasta_file argument must be a FASTA-formatted string, an opened
    file object, or any other iterable yielding text lines.
    
    Returns a dictionary mapping each sequence name (the header line
    without its leading ">" and trailing whitespace) to the sequence,
    i.e. all the sequence lines of the entry joined together.
    
    Empty lines and lines starting with "#" are ignored. Sequence lines
    found before the first header belong to no entry and are dropped.
    If a name occurs several times, the last entry wins. Input that
    contains no header at all gives an empty dictionary.
    
    Raises TypeError if fasta_file is neither a string nor an iterable
    of lines.
    """
    
    # Strings are checked first: they are split into lines, everything
    # else is iterated line by line. No reference to the Python 2 only
    # "file" type, so this works on both Python 2 and Python 3.
    if isinstance(fasta_file, str):
        fasta = fasta_file.split("\n")
    elif hasattr(fasta_file, "__iter__") and not isinstance(fasta_file, bytes):
        fasta = fasta_file
    else:
        raise TypeError("Argument fasta_file must be a file or a string")
    
    # Dictionary to receive sequences
    sequence_dict = {}
    
    # Name of the entry being read. None (and not "") means "no header
    # seen yet", so that an entry with an empty name is still a real entry.
    current_name = None
    
    # Sequence chunks of the entry being read, joined once per entry
    current_chunks = []
    
    for line in fasta:
        
        # Skip comments and empty lines
        if line.startswith("#") or not line.strip():
            continue
        
        if line.startswith(">"):
            
            # Store the entry that just ended, if there is one
            if current_name is not None:
                sequence_dict[current_name] = "".join(current_chunks)
            
            # Start the new entry
            current_name = line.rstrip()[1:]
            current_chunks = []
        
        # Sequence line: only meaningful once a header has been seen
        elif current_name is not None:
            current_chunks.append(line.rstrip())
    
    # Once all lines have been read, store the last entry (if any)
    if current_name is not None:
        sequence_dict[current_name] = "".join(current_chunks)
    
    return sequence_dict


#=======================================================================
#=======================================================================


def write_fasta(sequence_dict, out_file = None):
    
    """Function to write a sequence dictionary in FASTA format to an
    opened file or a returned string.
    
    sequence_dict maps sequence names (strings) to sequences (strings).
    Each entry is written as a ">name" line followed by the sequence
    wrapped in lines of at most 60 characters.
    
    If given, out_file must be a file-like object opened in writing
    mode (anything with a write() method); the entries are written to
    it and None is returned. Without out_file, the whole FASTA text is
    returned as a string.
    
    Raises TypeError if a dictionary key is not a string or if out_file
    has no write() method.
    """
    
    if not all(isinstance(key, str) for key in sequence_dict):
        raise TypeError("All dictionary keys must be strings")
    
    # Duck-typed check instead of the Python 2 only "file" type
    if out_file is not None and not hasattr(out_file, "write"):
        raise TypeError("If given, argument out_file must be a file object opened in writing mode")
    
    # Entries collected for the returned string (unused with out_file)
    entries = []
    
    # For each item in the sequence dictionary:
    for name, seq in sequence_dict.items():
        
        # Split the sequence in lines of 60 characters and join
        # them with newline characters
        seq_lines = "\n".join(re.findall(r".{1,60}", seq)) + "\n"
        
        # Complete the FASTA entry by adding the name line at the top
        fasta_entry = ">" + name + "\n" + seq_lines
        
        # Add the FASTA entry to the output file or string
        if out_file is not None:
            out_file.write(fasta_entry)
        else:
            entries.append(fasta_entry)
    
    # Once the writing is completed, if no out_file was given,
    # return the FASTA string
    if out_file is None:
        return "".join(entries)




def muscle_align(sequence_dict, big = False):
    
    """Function to get the aligned version of a sequence dictionary
    using MUSCLE.
    
    sequence_dict maps sequence names to sequences (strings). The
    dictionary is sent in FASTA format to the stdin of a "muscle"
    process and whatever MUSCLE prints on stdout is parsed back into a
    dictionary, which is returned. An empty input dictionary gives an
    empty dictionary without running MUSCLE.
    
    Specify big=True if sequence size fills the RAM.
    MUSCLE parameters " -maxiters 1 -diags1 -sv " will then be used.
    
    Raises Exception if a dictionary value is not a string.
    
    NOTE: MUSCLE's exit status and stderr are not inspected; callers
    have to check that the returned dictionary has the expected entries.
    """
    
    if not all(isinstance(seq, str) for seq in sequence_dict.values()):
        raise Exception("All dictionary values must be strings")
    
    # Nothing to align: do not spawn a process whose output would be
    # thrown away anyway
    if not sequence_dict:
        return {}
    
    # Build the command line for the pipe through MUSCLE
    command = ["muscle"]
    
    if big:
        command.extend(["-maxiters", "1", "-diags1", "-sv"])
    
    # universal_newlines=True puts the pipes in text mode, so that
    # strings (not bytes) are exchanged on Python 3 as well as Python 2
    muscle_pipe = Popen(command, stdin = PIPE, stdout = PIPE, stderr = PIPE,
                        universal_newlines = True)
    
    # Get the alignment by sending the dictionary in FASTA format to
    # MUSCLE's stdin. Index 0 gets stdout (1 would be stderr).
    alignment = muscle_pipe.communicate(write_fasta(sequence_dict))[0]
    
    # Return the aligned sequences as a dictionary
    return parse_fasta(alignment)





#====== END FUNCTIONS ======================================================
#=======================================================================

# Workflow:
#   1. keep the ORFs of at least 60 nt
#   2. take the coordinates of the ORFs that are not aligned (stat0or1)
#   3. extract their sequences and store them in a FASTA file
#   4. align the FASTA sequences
# Haplotypes to process
chaplo = [
    "SA03",
    "SD06",
    "SD01",
    "Y128",
]

# Haplotype name -> prefix of its genome FASTA file name
dico_corres = {
    "SA03": "A03",
    "SD06": "D06",
    "SD01": "D01",
    "Y128": "YPS128",
}

# Inputs: per-haplotype ORF coordinate tables and genome sequences
dir_coordorf = "../../03synt_intergenic_orf/03_ORF_cons_spar/02per_haplo_ps/"
dir_genomes = "../../../../00data_seq/pseudo_IDBA/"

# Intermediate files: size-filtered GFF files and the FASTA extracted from them
dir_gff60 = "../../11_selection/00_notal_orf/gff60/"
dir_fastanotal = "../../11_selection/00_notal_orf/fasta_haplo60/"

# Outputs: directory of per-ORF alignments and path of the GC table
# (dir_gc is a file path, not a directory)
dir_orf_aligned = "../../11_selection/01_align_orf/"
dir_gc = "../../11_selection/table_orf_gc.txt"

def _extract_long_orfs(haplo, gff_in, min_size = 60):
    
    """Filter the ORF table of one haplotype by size and extract the
    sequences of the kept ORFs with bedtools.
    
    haplo    -- haplotype name (a key of dico_corres)
    gff_in   -- path of the tab-separated ORF table; columns 4 and 5
                (1-based) are read as start and end coordinates
    min_size -- minimal ORF size (end - start + 1) to keep a line
    
    Writes dir_gff60/<haplo>_pscoord_st01.gff, then runs
    "bedtools getfasta -s -name" on it to write
    dir_fastanotal/<haplo>_pscoord_st01.fasta.
    
    Raises RuntimeError if bedtools exits with a non-zero status, so
    that a missing or stale FASTA file is not silently used afterwards.
    """
    
    gff_out = haplo + "_pscoord_st01.gff"
    
    with open(dir_gff60 + gff_out, "w") as out:
        with open(gff_in) as gff:
            for line in gff:
                line = line.rstrip()
                array = line.split("\t")
                
                # Blank, comment or truncated lines carry no coordinates
                if line.startswith("#") or len(array) < 5:
                    continue
                
                size = int(array[4]) - int(array[3]) + 1
                if size >= min_size:
                    out.write(line + "\n")
    
    genome_file = dir_genomes + dico_corres[haplo] + "_IDBA.fapseudoscaff.fasta"
    fasta_out = dir_fastanotal + gff_out.replace("gff", "fasta")
    
    # Argument list instead of a shell string: no quoting issues with
    # the paths, and the exit status can be checked
    cmd = ["bedtools", "getfasta", "-s", "-name",
           "-fi", genome_file,
           "-bed", dir_gff60 + gff_out,
           "-fo", fasta_out]
    status = Popen(cmd).wait()
    if status != 0:
        raise RuntimeError("bedtools getfasta failed (exit status %d) for %s"
                           % (status, haplo))


for haplo in chaplo:
    _extract_long_orfs(haplo, dir_coordorf + "table_orf_" + haplo + ".gff")

# Y128 is then redone from another ORF table ("another file for cer" in
# the original note), which overwrites the files produced in the loop
haplo = "Y128"
_extract_long_orfs(haplo, "../../../0_Synchro_rec/03synt_intergenic_orf/02_ORF_tables/02per_haplo_ps/table_orf_Y128.gff")

# Take the list of conserved ORFs to analyse: the keys of dico_cons are
# the ORF names (third column of the table), the values are unused
dataorf = "../tables_out/table_S2_metaexp_V1.txt"
dico_cons = {}

with open(dataorf) as orffile:
    for line in orffile:
        fields = line.rstrip().split("\t")
        
        # The original test compared the whole list of fields to the
        # string "orf", which is always False and left dico_cons empty.
        # NOTE(review): the column meant to hold the "orf" tag is not
        # visible here, so any field equal to "orf" selects the line;
        # narrow this to the right column index once confirmed.
        if "orf" in fields and len(fields) > 2:
            dico_cons[fields[2]] = ""


# Read the FASTA file of each haplotype and store it in a dictionary:
# dico_fasta[haplotype][ORF name] = sequence
dico_fasta = {}

for haplo in chaplo:
    with open(dir_fastanotal + haplo + "_pscoord_st01.fasta") as fasta:
        dico_fasta[haplo] = parse_fasta(fasta)


for orf in dico_cons:
    
    # Sequence of this ORF in every haplotype that has it
    dico_orf = {}
    for haplo in chaplo:
        if orf in dico_fasta[haplo]:
            dico_orf[haplo] = dico_fasta[haplo][orf]
    
    # Only alignments holding all the haplotypes are written, so there
    # is no point in running MUSCLE when a haplotype is already missing
    if len(dico_orf) != len(chaplo):
        continue
    
    # Align the ORF
    dico_align = muscle_align(dico_orf)
    
    # Write the aligned file, only if every haplotype came back
    if len(dico_align) == len(chaplo):
        with open(dir_orf_aligned + orf + "_align.fasta", "w") as out_fasta:
            write_fasta(dico_align, out_fasta)
		


# Compute the GC percentage of every sequence of every alignment file
import glob

# List of alignment files. glob instead of "ls" through a shell: an
# empty directory gives an empty list (not [""]), and sub-directories
# are left out.
list_fasta = sorted(path for path in glob.glob(dir_orf_aligned + "*")
                    if os.path.isfile(path))


with open(dir_gc, "w") as out:
    for gene in list_fasta:
        with open(gene) as fasta:
            dico_gene = parse_fasta(fasta)
        
        gene_name = gene.split("/")[-1].replace("_align.fasta", "")
        
        for key, seq in dico_gene.items():
            
            # Count G and C whatever their case, since lowercase bases
            # are counted in the sequence length as well
            seq = seq.upper()
            gc_count = seq.count("G") + seq.count("C")
            
            # Alignment gaps are not part of the sequence length
            ungapped_len = len(seq) - seq.count("-")
            
            # Empty or gap-only sequence: no percentage can be computed
            if ungapped_len:
                gc_perc = str(float(gc_count) / ungapped_len * 100)
            else:
                gc_perc = "NA"
            
            # One line per sequence: sequence name, gene name, GC %
            out.write("\t".join([key, gene_name, gc_perc]) + "\n")


