#!/usr/bin/env python

import os 
import re

# Script to prepare files for masking:
# runs the alignments between the SID sequences and the gene and ncRNA databases.
########################################################################
########################################################################
########################################################################


def parse_blast(file_align):
    """Convert a tabular BLAST output file to a three-column BED-like file.

    Each data line of ``file_align`` is split on tabs; column 2 is used as
    the sequence name and columns 9 and 10 as the coordinates (with the
    default ``-outfmt 6`` layout these are sseqid, sstart and send).  The
    two coordinates are swapped when needed so that the smaller one is
    always written first (hits on the reverse strand).

    The output is written in the same directory as the input.  Its name is
    the input file name with ".blast" replaced by ".bed"; when the file
    name does not contain ".blast", ".bed" is appended instead so that the
    input file is never overwritten.  Only the file name is rewritten,
    never the directory part of the path.

    Blank lines and lines starting with "#" are skipped.

    Raises:
        ValueError: if a data line has fewer than 10 tab-separated columns
            or if a coordinate is not an integer.

    NOTE(review): coordinates are copied unchanged.  BLAST reports 1-based
    inclusive positions whereas BED is 0-based half-open -- confirm that
    the downstream masking step expects the values as written here.
    """

    # Rewrite only the file name so that a directory containing ".blast"
    # is left untouched, and never let the output path equal the input.
    directory, basename = os.path.split(file_align)
    bed_name = basename.replace(".blast", ".bed")
    if bed_name == basename:
        bed_name = basename + ".bed"
    file_out = os.path.join(directory, bed_name)

    with open(file_align) as align:
        with open(file_out, "w") as out_align:
            for line_number, line in enumerate(align, 1):
                line = line.rstrip()

                # Tolerate trailing blank lines and comment headers
                if not line or line.startswith("#"):
                    continue

                array = line.split("\t")
                if len(array) < 10:
                    raise ValueError(
                        "%s, line %d: expected at least 10 tab-separated "
                        "columns, found %d"
                        % (file_align, line_number, len(array))
                    )

                chrom, start, stop = array[1], array[8], array[9]

                # Order start and stop if alignment is on the reverse strand
                if int(start) > int(stop):
                    start, stop = stop, start

                out_align.write(chrom + "\t" + start + "\t" + stop + "\n")


def parse_fasta(fasta_file):

    """Function to parse FASTA data into a sequence dictionary.

    The fasta_file argument must be a FASTA-formatted string or an
    opened file-like object (anything with a ``read`` attribute that can
    be iterated line by line).

    Returns a dictionary mapping each header line (without the leading
    ">" and without trailing whitespace) to its sequence, with all the
    sequence lines of the entry joined together.

    Empty lines and lines starting with "#" are ignored.  Sequence lines
    found before the first ">" header belong to no entry and are ignored
    as well.  Input without any header yields an empty dictionary.  When
    a name occurs several times, the last entry wins.

    Raises Exception if fasta_file is neither a string nor a file-like
    object.
    """

    if isinstance(fasta_file, str):
        fasta = fasta_file.split("\n")
    elif hasattr(fasta_file, "read"):
        # Duck-typed check: accepts regular files as well as StringIO,
        # gzip handles, etc.
        fasta = fasta_file
    else:
        raise Exception("Argument fasta_file must be a file or a string")

    # Dictionary to receive sequences
    sequence_dict = {}

    # Name of the entry being read; None until the first header is seen
    # (an empty string is a legal, if unusual, name so it cannot be used
    # as the "no entry yet" marker)
    current_name = None

    # Sequence chunks of the entry being read, joined once per entry
    current_chunks = []

    for line in fasta:

        # Skip comments and empty lines
        if line.startswith("#") or not line.strip():
            continue

        if line.startswith(">"):

            # Store the previous entry, if there is one
            if current_name is not None:
                sequence_dict[current_name] = "".join(current_chunks)

            # Start a new entry
            current_chunks = []
            current_name = line.rstrip()[1:]

        elif current_name is not None:
            current_chunks.append(line.rstrip())

    # Once all lines have been read, store the last entry
    if current_name is not None:
        sequence_dict[current_name] = "".join(current_chunks)

    return sequence_dict


def write_fasta(sequence_dict, out_file = None):

    """Function to write a sequence dictionary in FASTA format to an
    opened file or a returned string

    sequence_dict maps sequence names (strings) to sequences.  Each entry
    is written as a ">name" line followed by the sequence wrapped in
    lines of at most 60 characters.  An empty sequence produces an empty
    line after its header.

    If given, out_file must be a file-like object opened in writing mode
    (anything with a ``write`` method); the entries are written to it and
    None is returned.  Otherwise the whole FASTA text is returned as a
    string.

    Raises Exception if a dictionary key is not a string or if out_file
    is given but has no ``write`` method.
    """

    if not all(isinstance(key, str) for key in sequence_dict):
        raise Exception("All dictionary keys must be strings")

    if out_file is not None and not hasattr(out_file, "write"):
        raise Exception("If given, argument out_file must be a file object opened in writing mode")

    # FASTA entries collected when no out_file is given; joined once at
    # the end instead of growing a string entry by entry
    entries = []

    # items() rather than iteritems() so that the function runs on both
    # Python 2 and Python 3
    for name, seq in sequence_dict.items():

        # Split the sequence in lines of 60 characters and join
        # them with newline characters
        seq_lines = "\n".join(re.findall(r".{1,60}", seq)) + "\n"

        # Complete the FASTA entry by adding the name line at the top
        fasta_entry = ">" + name + "\n" + seq_lines

        if out_file is not None:
            out_file.write(fasta_entry)
        else:
            entries.append(fasta_entry)

    if out_file is None:
        return "".join(entries)
    return None



########################################################################
#######          ANALYSIS              #################################
########################################################################
# First rename the aligned sequences so that they carry the same cerevisiae
# names as in the RM files.
dir_align="../../02synt_intergenic_fasta/03_allSIDwithrecons/02_aligned/"
dir_re="../../02synt_intergenic_fasta/03_allSIDwithrecons/02_aligned_rename/"

# An alignment file is copied only when it contains exactly this many lines
# with a '>' character (i.e. FASTA headers).
# NOTE(review): 29 is presumably the number of genomes expected in every
# alignment -- confirm before changing.
expected_nbseq=29

# Visible regular files of dir_align, listed without going through a shell
# so that unusual file names cannot break (or inject into) a command line
# and an empty directory simply yields an empty list.
list_sid=sorted(
    name for name in os.listdir(dir_align)
    if not name.startswith(".")
    and os.path.isfile(os.path.join(dir_align, name))
)


for fasta_file in list_sid:
    # Bytes mode: the content is copied through unchanged and files that
    # are not valid text cannot raise a decoding error.
    with open(os.path.join(dir_align, fasta_file), "rb") as fasta:
        fasta_lines=fasta.readlines()

    # Skip binary files.  A NUL byte is used as the marker; this
    # approximates the former `grep '>' | wc -l` test, where grep reports
    # a binary file as a single "matches" line that never counts to 29.
    if any(b"\0" in line for line in fasta_lines):
        continue

    # Keep only alignments holding the expected number of sequences
    test_nbseq=sum(1 for line in fasta_lines if b">" in line)
    if test_nbseq != expected_nbseq:
        continue

    with open(os.path.join(dir_re, fasta_file), "wb") as out:
        for line in fasta_lines:
            newline=line

            # Shorten the Y128 names because they are too long: keep the
            # first '|' field (without ".fsa_tpg") followed by the third.
            if b"Y128" in line and b"ordered" in line:
                fields=line.split(b"|")
                # Headers with fewer than three fields are not in the
                # expected long format and are copied unchanged.
                if len(fields) > 2:
                    # Strip the end of line of the third field so the
                    # header always ends with exactly one newline, even
                    # when that field is the last one of the line.
                    newline=(fields[0].replace(b".fsa_tpg", b"")
                             +fields[2].rstrip(b"\r\n")+b"\n")

            out.write(newline)

# cat all sequences in all_sid_al_re.fasta


#Alignments ############################################################
# Build a nucleotide BLAST database from the SID sequences, align the ncRNA
# and CDS collections against it, and convert every tabular BLAST output
# to a bed file with parse_blast.

#fasta files for alignments
dir_genomes="../../02synt_intergenic_fasta/03_allSIDwithrecons/all_SID_notal_rename.fasta"
dir_ncRNA="../../../../00data_seq/non-coding-db/rna_genomic_rename.fasta"
dir_CDS="../../../../00data_seq/cerevisiae/dbwithpseudo/orf_genomic_withoutdubious.fasta"
dir_ensembl="../../../../00data_seq/ensembl/Fungi/merged_CDS_cer_eub_kud.fasta"
fastaref="../../../../00data_seq/cerevisiae/genomes/S288C.fasta"

# Minimum identity of the reported hits, as a fraction.  blastn's
# -perc_identity option expects a percentage (0-100), so the value is
# multiplied by 100 when the command line is built; passing 0.6 directly
# would request 0.6 % identity and filter nothing.
min_id=0.6
#default evalue = 10


fasta=dir_genomes
dir_db=fasta.replace(".fasta","")


def _run_checked(command):
    """Run a shell command and raise RuntimeError if it exits non-zero."""
    status=os.system(command)
    if status != 0:
        raise RuntimeError("Command failed (status %s): %s" % (status, command))


def _blastn_command(query, out_path):
    """Return the blastn command aligning `query` on the SID database."""
    return ("blastn -query "+query+" -db "+fasta+
            " -out "+out_path+" -outfmt 6 -perc_identity "+str(min_id*100))


#Make db (makeblastdb names the database after the input fasta file, which
#is why the fasta path is given to blastn as -db)
cmd= "makeblastdb -in  "+fasta+" -dbtype nucl"
_run_checked(cmd)

#=================================================================

#Align rRNA
rrna_out="../../02synt_intergenic_fasta/04_mask_RM_db/02_align_db/rRNAonallSID.blast"
cmd_rrna=_blastn_command(dir_ncRNA, rrna_out)
print(cmd_rrna)
_run_checked(cmd_rrna)
#=================================================================

# make bed file from rrna output
parse_blast(rrna_out)


#Align CDS_yeast_gdb
cds_out1="../../02synt_intergenic_fasta/04_mask_RM_db/02_align_db/CDS_ceronallSID.blast"
cmd_cds1=_blastn_command(dir_CDS, cds_out1)
print(cmd_cds1)
_run_checked(cmd_cds1)
#=================================================================
#make bed file from cds output
parse_blast(cds_out1)


#Align CDS_ensembl db
cds_out2="../../02synt_intergenic_fasta/04_mask_RM_db/02_align_db/CDS_EnsonallSID.blast"
cmd_cds2=_blastn_command(dir_ensembl, cds_out2)
print(cmd_cds2)
_run_checked(cmd_cds2)
#=================================================================
#make bed file from cds output
parse_blast(cds_out2)


#merge the CDS outputs in one file
cds_out="../../02synt_intergenic_fasta/04_mask_RM_db/02_align_db/CDSonallSID.blast"
cmdmerge="cat "+cds_out1+" "+cds_out2+" > "+cds_out

_run_checked(cmdmerge)
parse_blast(cds_out)






