from re import *

import re
import os
from subprocess import Popen, PIPE

# Script that builds multi-FASTA files of aligned intergenic ORFs, used to check for mutations leading to convergence


#=======================================================================
#====== BEGIN FUNCTIONS ======================================================

def parse_fasta(fasta_file):
    """Parse FASTA-formatted input into a {name: sequence} dictionary.

    The fasta_file argument must be a FASTA-formatted string or an
    opened file object (any object with a ``readline`` method that
    yields text lines when iterated).

    Lines starting with "#" and blank lines are skipped.  A record name
    is the whole header line without the leading ">" and without
    trailing whitespace; the sequence lines that follow are concatenated
    after removing their trailing whitespace.  When a name occurs more
    than once, the last record wins.  Sequence lines found before the
    first header belong to no record and are ignored, so input without
    any header yields an empty dictionary.

    Raises Exception if fasta_file is neither a string nor file-like.
    """

    # Duck-type the file check: the Python 2 ``file`` builtin does not
    # exist on Python 3, where open() returns io classes instead.
    if isinstance(fasta_file, str):
        fasta = fasta_file.split("\n")
    elif hasattr(fasta_file, "readline"):
        fasta = fasta_file
    else:
        raise Exception("Argument fasta_file must be a file or a string")

    # Dictionary to receive sequences
    sequence_dict = {}

    # None means "no header seen yet"; an empty name (a bare ">") is
    # still a valid record key.
    current_name = None

    # Sequence chunks of the current record, joined once per record to
    # avoid repeated string concatenation on large scaffolds.
    chunks = []

    for line in fasta:

        # Skip comments and empty lines
        if line.startswith("#") or not line.strip():
            continue

        if line.startswith(">"):
            # A new header closes the previous record, if any
            if current_name is not None:
                sequence_dict[current_name] = "".join(chunks)
            current_name = line.rstrip()[1:]
            chunks = []
        else:
            chunks.append(line.rstrip())

    # Once all lines have been read, store the last record
    if current_name is not None:
        sequence_dict[current_name] = "".join(chunks)

    return sequence_dict


#=======================================================================
#=======================================================================


def write_fasta(sequence_dict, out_file=None):
    """Write a sequence dictionary in FASTA format.

    sequence_dict maps sequence names (strings) to sequences.  Each
    entry is written as a ">" + name header line followed by the
    sequence wrapped at 60 characters per line; an empty sequence
    produces an empty line after its header.

    If given, out_file must be a file object opened in writing mode
    (any object with a ``write`` method); entries are written to it and
    None is returned.  Without out_file, the whole FASTA text is
    returned as a string.

    Raises Exception if a key is not a string or if out_file cannot be
    written to.
    """

    if not all(isinstance(key, str) for key in sequence_dict):
        raise Exception("All dictionary keys must be strings")

    # Duck-type the file check: the Python 2 ``file`` builtin does not
    # exist on Python 3.
    if out_file is not None and not hasattr(out_file, "write"):
        raise Exception("If given, argument out_file must be a file object opened in writing mode")

    # Entries are collected only when a string has to be returned
    entries = []

    # items() works on both Python 2 and 3 (iteritems() is Python 2 only)
    for name, seq in sequence_dict.items():

        # Split the sequence in lines of 60 characters and join
        # them with newline characters
        seq_lines = "\n".join(re.findall(r".{1,60}", seq)) + "\n"

        # Complete the FASTA entry by adding the name line at the top
        fasta_entry = ">" + name + "\n" + seq_lines

        # Write entries one at a time so that large genomes are not
        # held twice in memory
        if out_file is None:
            entries.append(fasta_entry)
        else:
            out_file.write(fasta_entry)

    if out_file is None:
        return "".join(entries)



# Complement of every supported symbol: the four bases, the IUPAC
# ambiguity codes (S and W are their own complement) and N.
_REVCOMP_TABLE = {
    "A": "T", "T": "A", "G": "C", "C": "G",
    "R": "Y", "Y": "R", "K": "M", "M": "K",
    "B": "V", "V": "B", "D": "H", "H": "D",
    "S": "S", "W": "W", "N": "N",
}
# Lower-case symbols map to lower-case complements so that case is kept.
_REVCOMP_TABLE.update(
    {base.lower(): comp.lower() for base, comp in list(_REVCOMP_TABLE.items())}
)
# Alignment gaps are kept as they are.
_REVCOMP_TABLE["-"] = "-"


def revcomp(seq):
    """Return the reverse complement of a nucleotide sequence.

    seq is a string of bases, IUPAC ambiguity codes, "N" and "-" gap
    characters.  The case of every character is preserved.

    Raises KeyError if seq contains any other character.
    """
    return "".join(_REVCOMP_TABLE[base] for base in reversed(seq))




def _grep_gff(pattern, gff_path):
    """Return the lines of gff_path matched by grep for pattern, right-stripped."""
    # An argument list (no shell) keeps quotes or spaces in the pattern
    # or path from breaking the command; -e keeps a pattern starting
    # with "-" from being read as an option.
    proc = Popen(["grep", "-e", pattern, gff_path], stdout=PIPE,
                 universal_newlines=True)
    return proc.communicate()[0].rstrip()


def align_orf(ORF_name,dico_coordfiles,dico_genomes,ext,dico_orfaligned,dico_start ,dico_cons):
    """Extract one ORF (plus flanks) from every haplotype and align them with mafft.

    For each haplotype the ORF line is looked up with grep in its GFF
    file, the region extended by ext bases on each side is extracted in
    lower case, and the ORF itself is put in upper case when its status
    is "1".  When the status is "0" and another ORF of that haplotype
    shares the same aligned start, that alternative ORF is put in upper
    case instead.  Minus-strand regions are reverse-complemented.  The
    sequences are written to "not_aligned.fasta" in the current
    directory, aligned twice with mafft (FASTA and clustal output under
    the module-level dir_alignorf) and the temporary file is removed.

    Arguments:
    ORF_name        -- name of the ORF, used as grep pattern.
    dico_coordfiles -- haplotype -> GFF file name, relative to the
                       module-level dir_gff.
    dico_genomes    -- haplotype -> {scaffold name: sequence}.
    ext             -- number of flanking bases added on each side.
    dico_orfaligned -- ORF name -> aligned start coordinate string.
    dico_start      -- haplotype + ";" + aligned start -> ORF name.
    dico_cons       -- ORF name -> label used as output file prefix.

    Returns a list with, for every haplotype where an alternative ORF
    was used, the number of times its sequence occurs in the extended
    region (a value above 1 means that several copies were upper-cased).

    NOTE(review): grep matches ORF_name anywhere in a line, so a name
    contained in another one (e.g. ORF_10 / ORF_100) returns several
    lines and only the leading fields of the first are parsed -- confirm
    that the names are unambiguous.
    """
    # Lineage label of every haplotype, written in the FASTA headers
    dico_lineage = {"Y128": "Cer", "SA03": "SpC", "SD06": "SpB", "SD01": "SpA"}

    dico_orfextended = {}
    testcount = []

    for haplo in dico_coordfiles:
        gff_path = dir_gff + dico_coordfiles[haplo]
        alt_name = "alt_orf=False"

        # GFF columns: 0 scaffold, 3 start, 4 end (1-based, inclusive),
        # 6 strand, 8 attributes whose 4th ";"-field is "stat=..."
        array = _grep_gff(ORF_name, gff_path).split("\t")
        stat = array[8].split(";")[3].replace("stat=", "")
        scaf = array[0]
        start = int(array[3])
        stop = int(array[4])
        sens = array[6]

        scaffold_seq = dico_genomes[haplo][scaf]

        # Extended coordinates, clamped to the scaffold: a negative
        # start would be taken as an index from the end of the sequence
        # and return the wrong region.
        startext = max(start - 1 - ext, 0)
        stopext = min(stop + ext, len(scaffold_seq))

        orf_seq = scaffold_seq[start - 1:stop]
        ext_seq = scaffold_seq[startext:stopext].lower()

        if stat == "1":
            # The ORF exists in this haplotype: put it in upper case
            ext_seq = ext_seq.replace(orf_seq.lower(), orf_seq.upper())

        elif stat == "0":
            # Look for another ORF sharing the same aligned start
            coordstart = dico_orfaligned[ORF_name]
            alt_orf = dico_start.get(haplo + ";" + coordstart)

            if alt_orf is not None:
                coordalt = _grep_gff(alt_orf, gff_path)

                # The aligned coordinates are not filtered, so the
                # alternative ORF may be missing from the GFF file
                if coordalt:
                    arrayalt = coordalt.split("\t")
                    scafa = arrayalt[0]
                    starta = int(arrayalt[3])
                    stopa = int(arrayalt[4])
                    alt_seq = dico_genomes[haplo][scafa][starta - 1:stopa]
                    alt_lower = alt_seq.lower()

                    # Count the literal occurrences, i.e. exactly the
                    # copies that replace() upper-cases just below
                    testcount.append(ext_seq.count(alt_lower))
                    ext_seq = ext_seq.replace(alt_lower, alt_seq.upper())
                    alt_name = "alt_orf=True"

        if sens == "-":
            ext_seq = revcomp(ext_seq)

        keyhaplo = (haplo + ";" + dico_lineage[haplo] + ";stat=" + stat
                    + ";orfname=" + ORF_name + ";genomic_coord=" + scaf + ":"
                    + str(startext) + "-" + str(stopext) + "(" + sens + ");"
                    + alt_name)
        dico_orfextended[keyhaplo] = ext_seq

    tmp_fasta = "not_aligned.fasta"
    try:
        with open(tmp_fasta, "w") as out:
            write_fasta(dico_orfextended, out)

        align_prefix = dico_cons[ORF_name] + "_" + ORF_name + "_ext" + str(ext) + "align"

        # The exit status of mafft is not checked.

        # fasta output
        os.system("mafft --quiet --preservecase not_aligned.fasta > "
                  + dir_alignorf + "fasta/" + align_prefix + ".fasta")

        # clustalw output
        os.system("mafft --quiet --clustalout --preservecase not_aligned.fasta > "
                  + dir_alignorf + "clustalw/" + align_prefix + ".clustalw")
    finally:
        # Remove the temporary file even when a step above failed
        if os.path.exists(tmp_fasta):
            os.remove(tmp_fasta)

    return testcount


#====== END FUNCTIONS ======================================================
#=======================================================================




# 1. Take the ORF names.
# 2. Extract their genomic coordinates.
# 3. Write a temporary file with the sequences at the extended coordinates.
# 4. Load the genomic sequences into a dictionary.
# 5. Put the ORF in upper case and the upstream and downstream flanks in lower case.
# 6. Align with mafft.
# 7. Write the FASTA output.

# Also check whether another ORF starts at the same position,
# and add the conservation information.
# Number of flanking bases added on each side of every ORF
ext=0

# Input and output locations (dir_gff and dir_alignorf are read by align_orf)
dir_genomes="../../../../00data_seq/pseudo_IDBA/"
dir_gff="../../03synt_intergenic_orf/03_ORF_cons_spar/02per_haplo_ps/"
dir_coordalign="../../03synt_intergenic_orf/02_ORF_tables/01masking_steps/table_orf_sid_aligned_notRM"

dir_alignorf="../../07_candidates/05_align_convORFs_4/" #aligned  orf

dir_infos="../tables_out/02conservation/conservation_table_spar.txt"

# Read the ORF names, one per line, ignoring blank lines
with open("list_convORFs_BC.txt") as orf_list:
    corf=[name for name in orf_list.read().rstrip().split("\n") if name.strip()]

# Genome FASTA file of every haplotype.
# For a quick test run, restrict this mapping to a subset of haplotypes.
dico_genomefiles={"SA03":"A03_IDBA.fapseudoscaff.fasta",
"SD06":"D06_IDBA.fapseudoscaff.fasta",
"SD01":"D01_IDBA.fapseudoscaff.fasta",
"Y128":"Y128_IDBA.fapseudoscaff.fasta"}

# Read the genomes: haplotype -> {scaffold name: sequence}
dico_genomes={}

for haplo in dico_genomefiles:
    fasta_file=dico_genomefiles[haplo]
    with open(dir_genomes+fasta_file) as fasta:
        dico_genomes[haplo]=parse_fasta(fasta)

# Store the aligned coordinates to find ORFs sharing the same start
dico_start={} #key = haplotype + aligned start, value = orf name
dico_orfaligned={} #key = orf name, value = aligned start

with open(dir_coordalign) as align:
    for line in align:
        line=line.rstrip()
        # A blank line has no columns to read
        if not line:
            continue
        array=line.split("\t")
        haplo=array[1]
        stat=array[2]
        orfi=array[3]

        # Only rows with status 1 are indexed
        if stat!="1":
            continue

        # The start is column 5 on the plus strand and column 6 on the
        # minus strand
        strand=array[7]
        if strand=="+":
            start_col=array[5]
        elif strand=="-":
            start_col=array[6]
        else:
            # Unknown strand: skip the row rather than reuse the
            # coordinates of the previous row
            continue

        coord_orfstart=array[4]+":"+start_col+";"+strand
        dico_start[haplo+";"+coord_orfstart]=orfi
        dico_orfaligned[orfi]=coord_orfstart

# Conservation table: column 0 is the key, column 4 is the value used by
# align_orf as prefix of the output file names
dico_cons={}
with open(dir_infos) as consfile:
    for line in consfile:
        line=line.rstrip()
        if not line:
            continue
        array=line.split("\t")
        dico_cons[array[0]]=array[4]

# GFF file of every haplotype, relative to dir_gff.
# For a quick test run, restrict this mapping to a subset of haplotypes.
dico_coordfiles={"SA03":"table_orf_SA03.gff",
"SD06":"table_orf_SD06.gff",
"SD01":"table_orf_SD01.gff",
"Y128":"table_orf_Y128.gff"}

# Align every ORF and report those where an alternative ORF sequence
# occurs more than once in an extended region
for ORF_name in corf:
    nbhit=align_orf(ORF_name,dico_coordfiles,dico_genomes,ext,dico_orfaligned,dico_start,dico_cons )
    if nbhit and max(nbhit)>1:
        # nbhit is a list: convert it before joining it to the name
        print(str(nbhit)+"\t"+ORF_name)


#check alignments nb of differences
