#!/usr/bin/env python

#Build per-chromosome pseudomolecules from IDBA scaffolds with the ABACAS software
#Run this program from the ABACAS results directory
#e.g.: sORF_project/analyses_data/02synteny_IDBA/ABACAS/ABACAS_refPar/pseudomolecules/

import os
from subprocess import Popen, PIPE
import re 

# FUNCTIONS##############################################################
def parse_fasta(fasta_file):
    """Parse FASTA-formatted input into a ``{name: sequence}`` dictionary.

    Args:
        fasta_file: Either a string holding the whole FASTA content, or an
            opened file-like object (any object exposing ``read`` that
            yields lines when iterated, e.g. the result of ``open()`` or
            an ``io.StringIO``).

    Returns:
        A dict mapping each header line (leading ">" and trailing
        whitespace removed, description kept) to its sequence with all
        line breaks removed. Empty input yields an empty dict. If the same
        name occurs several times, the last record wins.

    Raises:
        TypeError: If ``fasta_file`` is neither a string nor a file-like
            object. (``TypeError`` is an ``Exception`` subclass, so callers
            catching ``Exception`` keep working.)

    Notes:
        Blank lines and lines starting with "#" are skipped. Sequence
        lines found before the first ">" header belong to no record and
        are ignored.
    """
    if isinstance(fasta_file, str):
        lines = fasta_file.split("\n")
    elif hasattr(fasta_file, "read"):
        # Duck-typed check: works for Python 2 ``file`` objects as well as
        # ``io`` / ``StringIO`` handles.
        lines = fasta_file
    else:
        raise TypeError("Argument fasta_file must be a file or a string")

    sequence_dict = {}

    # ``None`` means "no header seen yet"; an empty string is a legal
    # (if unusual) record name and must not be confused with that state.
    current_name = None
    # Sequence chunks are collected and joined once per record, which
    # avoids repeated string concatenation on long scaffolds.
    chunks = []

    for line in lines:
        # Skip comments and empty lines.
        if line.startswith("#") or not line.strip():
            continue

        if line.startswith(">"):
            # Flush the previous record, if any, before starting a new one.
            if current_name is not None:
                sequence_dict[current_name] = "".join(chunks)
            current_name = line.rstrip()[1:]
            # Always start the new record from an empty sequence so that
            # nothing read earlier can leak into it.
            chunks = []
        elif current_name is not None:
            chunks.append(line.rstrip())

    # Flush the last record; nothing to store when no header was seen.
    if current_name is not None:
        sequence_dict[current_name] = "".join(chunks)

    return sequence_dict

#==========================================================================
# ANALYSES #################################################################
#==========================================================================
#directories for YPS128 and ref S288C
#liste_scaff="../../../0_Synchro_rec/00scripts_ps24/00links/link_YPS128_IDBA.txt"
#liste_chr="../../../0_Synchro_rec/00scripts_ps24/00links/links_chr_refcer.txt"


#directories for Spar strains and ref paradoxus CBS432
# List files (one path per line): scaffold FASTA files of the Spar strains
# and the reference chromosome files.
liste_scaff="../../../0_Synchro_rec/00scripts_ps24/00links/link_IDBA24.txt"
liste_chr="../../../0_Synchro_rec/00scripts_ps24/00links/links_chrpar.txt"

# Number of times a scaffold name must occur across the "<name>_chr*.fa.bin"
# files to be written to the "unused" FASTA.
# NOTE(review): presumably the number of reference chromosomes (one .bin
# file per ABACAS run) -- confirm before using a reference with a different
# chromosome count.
N_BIN_OCCURRENCES = 16


def _read_path_list(list_file):
    """Return the non-blank lines of list_file without trailing whitespace."""
    with open(list_file) as handle:
        return [line.rstrip() for line in handle if line.strip()]


def _write_unused_fasta(count_file, sequences, out_file, n_expected):
    """Write the scaffolds counted exactly n_expected times to out_file.

    count_file holds ``uniq -c`` output (a count followed by a scaffold
    name on each line); sequences maps scaffold names to their sequence.
    Entries counted more than n_expected times are reported on stdout and
    not written. Sequences are wrapped at 60 characters per line.
    """
    with open(out_file, "w") as outfasta:
        with open(count_file) as counts:
            for line in counts:
                line = line.rstrip()
                # "uniq -c" pads the count with leading blanks; splitting
                # on any whitespace run yields [count, name].
                fields = line.split(None, 1)
                if len(fields) != 2:
                    # Blank entry (e.g. an empty line in a .bin file).
                    continue
                n_seen = int(fields[0])
                seq_name = fields[1]
                if n_seen > n_expected:
                    print(line+"Error")
                elif n_seen == n_expected:
                    seq = sequences[seq_name]
                    wrapped = "\n".join(re.findall(r".{1,60}", seq)) + "\n"
                    outfasta.write(">"+seq_name+"\n")
                    outfasta.write(wrapped)


# Run ABACAS for each scaffold file against each reference chromosome, then
# merge the resulting pseudomolecules with the leftover scaffolds.
# NOTE: paths are interpolated unquoted into shell command lines; they must
# not contain spaces or shell metacharacters.

# The chromosome list is the same for every scaffold file: read it once.
chr_files = _read_path_list(liste_chr)

for file_scaff in _read_path_list(liste_scaff):
    filename = file_scaff.split("/")[-1]

    # One ABACAS run per reference chromosome; output is appended to
    # abacas.log.
    for file_chr in chr_files:
        cmd_line = "perl ../Program/abacas.1.3.1.pl -b -r "+file_chr+" -q "+file_scaff+" -p nucmer >> abacas.log"
        os.system(cmd_line)

    # Count how many of the .bin files mention each scaffold name.
    cmd_count = "cat "+filename+"_chr*.fa.bin | sort | uniq -c > "+filename+".unused"
    os.system(cmd_count)

    # Dictionary with all scaffold sequences of this file.
    with open(file_scaff) as allseq:
        dico_seq = parse_fasta(allseq)

    # Multi-FASTA with the scaffolds selected from the count file.
    unused_out = filename+".unused.fasta"
    _write_unused_fasta(filename+".unused", dico_seq, unused_out,
                        N_BIN_OCCURRENCES)

    # Merge the per-chromosome pseudomolecules and the unused scaffolds.
    # This is done once per scaffold file, after all names are collected.
    fasta_parts = [filename+"_"+file_chr.split("/")[-1]+".fasta"
                   for file_chr in chr_files]
    cmd_merge = "cat "+" ".join(fasta_parts + [unused_out])+" > "+filename+"pseudoscaff.fasta"

    print(cmd_merge)
    os.system(cmd_merge)




