import os 
import re

#=======================================================================
#====== BEGIN FUNCTIONS ======================================================

def parse_fasta(fasta_file):
    """Parse FASTA-formatted input into a dictionary of sequences.

    Lines starting with "#" and blank lines are ignored. A line starting
    with ">" opens a new entry; every following sequence line is stripped
    of trailing whitespace and appended to that entry.

    Args:
        fasta_file: A FASTA-formatted string, or an opened file (any
            object exposing ``readline`` that can be iterated line by
            line).

    Returns:
        A dict mapping each entry name (the header line without the
        leading ">" and without trailing whitespace) to its concatenated
        sequence. Input without any header yields an empty dict.

    Raises:
        TypeError: If fasta_file is neither a string nor a file-like
            object.
    """
    # The Python 2-only ``file`` builtin is avoided so the function runs
    # on both Python 2 and Python 3.
    if isinstance(fasta_file, str):
        lines = fasta_file.split("\n")
    elif hasattr(fasta_file, "readline"):
        lines = fasta_file
    else:
        raise TypeError("Argument fasta_file must be a file or a string")

    sequence_dict = {}

    # None means "no header seen yet"; an empty string is a legal (if
    # unusual) entry name and must not be confused with that state.
    current_name = None
    chunks = []

    for line in lines:
        # Skip comments and empty lines
        if line.startswith("#") or not line.strip():
            continue

        if line.startswith(">"):
            # Store the entry that just ended, if there was one
            if current_name is not None:
                sequence_dict[current_name] = "".join(chunks)
            current_name = line.rstrip()[1:]
            chunks = []
        elif current_name is not None:
            # Sequence lines found before the first header belong to no
            # entry and are ignored rather than merged into the next one
            chunks.append(line.rstrip())

    # Store the last entry; nothing is stored when no header was found
    if current_name is not None:
        sequence_dict[current_name] = "".join(chunks)

    return sequence_dict


#====== END FUNCTIONS ======================================================
#=======================================================================

#=======================================================================
#====== BEGIN ANALYSIS ======================================================
#1 convert RepeatMasker (RM) output to a BED file (BLAST output was previously converted to BED format)
#2 convert ORF coordinates from aligned to unaligned
#3 remove ORFs overlapping RM regions or BLAST hits (alignment)
#4 remove ORFs overlapping known annotations in cerevisiae (position)



#1 parse RM output ======================================================

# Write a BED-like file holding the coordinates of the regions masked by RM
rmfile="../../02synt_intergenic_fasta/04_mask_RM_db/01_ReppeatMasker/Results/all_SID_notal.fasta.out"
rm_out="../../02synt_intergenic_fasta/04_mask_RM_db/01_ReppeatMasker/Results/all_SID_notal.fasta.bed"

with open(rmfile) as rm_handle, open(rm_out, "w") as bed_handle:
    for raw_line in rm_handle:
        # Split on any run of whitespace; the re-joined form has no
        # leading blanks, so the regex below tests the first field
        fields = raw_line.split()
        normalized = " ".join(fields)

        # Hit lines are the ones starting with a number; skip the rest
        if re.match(r"^\d", normalized) is None:
            continue

        # Fields 5, 6 and 7 of a hit line are written out as the
        # sequence identifier, start and stop
        bed_handle.write("\t".join((fields[4], fields[5], fields[6])) + "\n")

# The .bed file of masked regions is now written


#2 convert aligned coordinates to not aligned =======================

fasta_file="../../02synt_intergenic_fasta/03_allSIDwithrecons/all_SID_al_re.fasta"
orf_notal="../../03synt_intergenic_orf/02_ORF_tables/01masking_steps/table_orf_sid_NOTaligned_notRM"
orf_al="../../03synt_intergenic_orf/02_ORF_tables/01masking_steps/table_orf_sid_aligned_notRM"

# Read the aligned FASTA file
with open(fasta_file) as fasta:
    dico_alignb = parse_fasta(fasta)

# Two lookups keyed by the shortened name (the first two ";"-separated
# fields of the FASTA header):
#   dico_align  : short name -> aligned sequence
#   dico_corres : short name -> full FASTA header
dico_align = {}
dico_corres = {}
for full_name, aligned_seq in dico_alignb.items():
    name_parts = full_name.split(";")
    short_name = name_parts[0] + ";" + name_parts[1]
    dico_align[short_name] = aligned_seq
    dico_corres[short_name] = full_name

# Read the ORF table and convert its aligned coordinates to unaligned ones
with open(orf_al) as aligned_table, open(orf_notal, "w") as unaligned_table:
    for row in aligned_table:
        cols = row.rstrip().split("\t")
        al_start = int(cols[5])
        al_stop = int(cols[6])
        alignment = dico_align[cols[4] + ";" + cols[1]]

        # Count the gap characters located before the start and before
        # the stop positions, e.g. with
        #   alignment = "AAGCC---GCTGAAT-CGCGCGGG---GTC"
        #   al_start = 12, al_stop = 29
        gaps_before_start = alignment[:al_start - 1].count("-")
        gaps_before_stop = alignment[:al_stop - 1].count("-")

        # Shift both coordinates by the gaps preceding them, and append
        # the difference between the two gap counts as an extra column
        cols[5] = str(al_start - gaps_before_start)
        cols[6] = str(al_stop - gaps_before_stop)
        cols.append(str(gaps_before_stop - gaps_before_start))
        unaligned_table.write("\t".join(cols) + "\n")

# Write the unaligned ORF table as a BED-like file:
# full sequence name, start, stop, ORF identifier
orfbed="../../03synt_intergenic_orf/02_ORF_tables/01masking_steps/orf_sid_NOTaligned.bed"
with open(orf_notal) as unaligned_table, open(orfbed, "w") as bed:
    for row in unaligned_table:
        cols = row.rstrip().split("\t")

        # Recover the full FASTA header from its shortened form
        full_name = dico_corres[cols[4] + ";" + cols[1]]

        bed.write("\t".join((full_name, cols[5], cols[6], cols[3])) + "\n")

# Successive masking steps: each one runs `bedtools subtract -A` on the
# output of the previous step to remove the ORFs overlapping, in turn,
# RepeatMasker regions, rRNA hits and CDS hits.
# NOTE(review): the exit status of os.system is not checked, so a failed
# bedtools call leaves an empty or partial file for the next step.
# The commands are echoed with print(...) rather than the print statement:
# with a single argument both forms print the same text on Python 2, and
# only the call form is valid syntax on Python 3.

# Remove ORFs overlapping RM regions
rm_file="../../02synt_intergenic_fasta/04_mask_RM_db/01_ReppeatMasker/Results/all_SID_notal.fasta.bed"
masked_rm="../../03synt_intergenic_orf/02_ORF_tables/01masking_steps/1.table_orf_sid_NOTaligned_filtRM"

cmd1 = "bedtools subtract -A -a  " + orfbed + " -b " + rm_file + " > " + masked_rm
print(cmd1)
os.system(cmd1)
#=============================================================

# Mask ncRNA
ncrna_file="../../02synt_intergenic_fasta/04_mask_RM_db/02_align_db/rRNAonallSID.bed"
masked_rna="../../03synt_intergenic_orf/02_ORF_tables/01masking_steps/2.table_orf_sid_NOTaligned_filtRM_filtncrna"

cmd2 = "bedtools subtract -A -a  " + masked_rm + " -b " + ncrna_file + " > " + masked_rna
print(cmd2)
os.system(cmd2)

# Mask CDS
cds_file="../../02synt_intergenic_fasta/04_mask_RM_db/02_align_db/CDSonallSID.bed"
masked_CDS="../../03synt_intergenic_orf/02_ORF_tables/01masking_steps/3.table_orf_sid_NOTaligned_filtRM_filtncrna_filtCDS"

cmd3 = "bedtools subtract -A -a  " + masked_rna + " -b " + cds_file + " > " + masked_CDS
print(cmd3)
os.system(cmd3)
#=============================================================

# Split the filtered ORFs in two tables according to the haplotype field
# (second ";"-separated field of the sequence name).
# ORFs whose haplotype is "S288", with chromosome coordinates
table_cer="../../03synt_intergenic_orf/02_ORF_tables/01masking_steps/4.table_orf_ref_chr"
# All other ORFs, copied unchanged
table_nocer="../../03synt_intergenic_orf/02_ORF_tables/01masking_steps/4.table_orf_noref_sid"

with open(masked_CDS) as filtered, open(table_cer, "w") as cer, open(table_nocer, "w") as other:
    for row in filtered:
        row = row.rstrip()
        cols = row.split("\t")
        name_parts = cols[0].split(";")

        if name_parts[1] != "S288":
            other.write(row + "\n")
            continue

        # Fourth name field is a "start-stop" range; only the start is
        # used below, but reading the stop also requires it to be present
        sid_bounds = name_parts[3].split("-")
        sid_start = sid_bounds[0]
        sid_stop = sid_bounds[1]

        # Shift the ORF positions by the start of the range
        chr_start = int(sid_start) + int(cols[1])
        chr_stop = int(sid_start) + int(cols[2])

        cer.write("\t".join((name_parts[2], str(chr_start), str(chr_stop), cols[3])) + "\n")


#4 Last filter: cerevisiae CDS and rRNA (remove overlap with known annotation)
#=======================================================================
dir_gff_ref="../../../../00data_seq/cerevisiae/cutted_gff3/S288C_Genes.gff"
dir_gff_rrna="../../../../00data_seq/cerevisiae/cutted_gff3/S288C_rRNA.gff3"

# Intermediate outputs: after the gene filter, then after the rRNA filter
cer_sup1 = table_cer + "sup1"
cer_sup2 = table_cer + "sup2"

# Chain the two subtractions, each one reading the previous output
for source, mask, target in (
    (table_cer, dir_gff_ref, cer_sup1),
    (cer_sup1, dir_gff_rrna, cer_sup2),
):
    cmd = "bedtools subtract -A -a  " + source + " -b " + mask + " > " + target
    os.system(cmd)

# Merge the filtered table with the table of all other ORFs
cmd = "cat " + cer_sup2 + " " + table_nocer + " > ../../03synt_intergenic_orf/02_ORF_tables/01masking_steps/5.table_orf_allfilt"
os.system(cmd)
# The masked file is now complete

# Convert the masked file to GFF.
# First build a dictionary of GFF lines from the unaligned table written
# before masking, keyed by "<orf id>;<haplotype>".
dico_all = {}
with open(orf_notal) as unaligned_table:
    for row in unaligned_table:
        cols = row.rstrip().split("\t")

        haplo = cols[1]
        stat = cols[2]
        orf_id = cols[3]
        sid = cols[4]
        start = cols[5]
        stop = cols[6]
        strand = cols[7]

        size = int(stop) - int(start) + 1
        nbgap = cols[8]

        attributes = ";".join([
            "id_orf=" + orf_id,
            "orf_size=" + str(size),
            "haplo=" + haplo,
            "stat=" + stat,
            "nbgap=" + nbgap,
        ])
        gff_fields = [sid + ";" + haplo, "1.annotORF.py", "ORF", start, stop, ".", strand, ".", attributes]
        dico_all[orf_id + ";" + haplo] = "\t".join(gff_fields)


orf_filt="../../03synt_intergenic_orf/02_ORF_tables/01masking_steps/5.table_orf_allfilt"
filt_gff=orf_filt+".gff"

# Then write, for every ORF kept after filtering, its stored GFF line
with open(orf_filt) as filtered, open(filt_gff, "w") as gff:
    for row in filtered:
        cols = row.rstrip().split("\t")
        seq_name = cols[0]

        if seq_name.startswith("SID"):
            # Names starting with "SID" carry the haplotype as their
            # second ";"-separated field
            key = cols[3] + ";" + seq_name.split(";")[1]
        else:
            # Every other row is looked up with the "S288" haplotype
            key = cols[3] + ";S288"

        gff.write(dico_all[key] + "\n")


# cat is used to merge the files

# Done!





