import os 
import re

"""Script to replace fasta masked positions by N, and check intergenic 
charcateristics after masking (size length by SID, total length by SID, identity)"""


########################################################################


def parse_fasta(fasta_file):

    """Function to parse FASTA-formatted input into a sequence dictionary.

    The fasta_file argument must be a FASTA-formatted string or an
    opened file object (any iterable yielding text lines is accepted).

    Blank lines and lines starting with "#" are skipped. A line starting
    with ">" opens a new entry whose name is the rest of that line; the
    following lines are concatenated (trailing whitespace removed) to
    form its sequence. Sequence lines found before the first ">" line
    belong to no entry and are ignored.

    Returns a dictionary mapping each entry name to its sequence. If a
    name occurs several times, the last entry wins. Input without any
    ">" line yields an empty dictionary.

    Raises TypeError if fasta_file is neither a string nor iterable.
    """

    # Python 2 has a common base class for str and unicode; Python 3
    # only has str
    try:
        string_types = basestring  # noqa: F821
    except NameError:
        string_types = str

    # Strings must be tested first: on Python 3 they are iterable too
    if isinstance(fasta_file, string_types):
        fasta = fasta_file.split("\n")
    elif hasattr(fasta_file, "__iter__"):
        fasta = fasta_file
    else:
        raise TypeError("Argument fasta_file must be a file or a string")

    # Dictionary to receive sequences
    sequence_dict = {}

    # None means "no header seen yet"; it is distinct from an entry
    # whose name is the empty string
    current_name = None

    # Sequence chunks of the current entry, joined once per entry to
    # avoid repeated string concatenation
    current_chunks = []

    for line in fasta:

        # Skip comments and empty lines
        if line.startswith("#") or not line.strip():
            continue

        if line.startswith(">"):

            # Store the entry that just ended, if there is one
            if current_name is not None:
                sequence_dict[current_name] = "".join(current_chunks)

            # Start the new entry with an empty sequence
            current_name = line.rstrip()[1:]
            current_chunks = []

        # Sequence line: only meaningful once a header has been seen
        elif current_name is not None:
            current_chunks.append(line.rstrip())

    # Once all lines have been read, store the last entry
    if current_name is not None:
        sequence_dict[current_name] = "".join(current_chunks)

    return sequence_dict



def write_fasta(sequence_dict, out_file=None):

    """Function to write a sequence dictionary in FASTA format to an
    opened file or a returned string

    sequence_dict maps entry names (strings) to sequences. Each entry is
    written as a ">name" line followed by the sequence wrapped in lines
    of at most 60 characters. An empty sequence produces the header
    followed by one empty line.

    If given, out_file must be a file object opened in writing mode
    (any object with a write() method is accepted); the entries are
    written to it and None is returned. If out_file is None, the whole
    FASTA text is returned as a string.

    Raises TypeError if a dictionary key is not a string or if out_file
    has no write() method.
    """

    # Python 2 has a common base class for str and unicode; Python 3
    # only has str
    try:
        string_types = basestring  # noqa: F821
    except NameError:
        string_types = str

    if not all(isinstance(key, string_types) for key in sequence_dict):
        raise TypeError("All dictionary keys must be strings")

    to_string = out_file is None

    if not to_string and not hasattr(out_file, "write"):
        raise TypeError("If given, argument out_file must be a file object opened in writing mode")

    # FASTA entries collected when no out_file was given; joined once at
    # the end to avoid repeated string concatenation
    entries = []

    # items() exists on both Python 2 and Python 3 dictionaries
    for name, seq in sequence_dict.items():

        # Split the sequence in lines of 60 characters and join
        # them with newline characters
        seq_lines = "\n".join(re.findall(r".{1,60}", seq)) + "\n"

        # Complete the FASTA entry by adding the name line at the top
        fasta_entry = ">" + name + "\n" + seq_lines

        # Add the FASTA entry to the output file or string
        if to_string:
            entries.append(fasta_entry)
        else:
            out_file.write(fasta_entry)

    # Once the writing is completed, if no out_file was given,
    # return the FASTA string
    if to_string:
        return "".join(entries)
    return None



#=======================================================================
#====== BEGIN ANALYSIS ======================================================

# Take the BED coordinates of the "rm" and database-alignment results and
# make one file with all the coordinates to mask
rm_bed = "../../02synt_intergenic_fasta/04_mask_RM_db/01_ReppeatMasker/Results/all_SID_notal.fasta.bed"
ncrna_file = "../../02synt_intergenic_fasta/04_mask_RM_db/02_align_db/rRNAonallSID.bed"
cds_file = "../../02synt_intergenic_fasta/04_mask_RM_db/02_align_db/CDSonallSID.bed"
dir_masked = "../../02synt_intergenic_fasta/04_mask_RM_db/03_masked_fasta/"

# Concatenate the three BED files
cmd = "cat "+rm_bed+" "+ncrna_file+" "+cds_file+" > "+dir_masked+"rm_nc_cds.bed"
os.system(cmd)

# FASTA file with all the SID sequences that are not aligned
SID_fasta = "../../02synt_intergenic_fasta/03_allSIDwithrecons/all_SID_notal_rename.fasta"

# Run bedtools maskfasta on it with the merged BED file
cmd = "bedtools maskfasta -fi "+SID_fasta+" -bed "+dir_masked+"rm_nc_cds.bed -fo "+\
dir_masked+"all_SID_notal_rename_masked.fasta"

os.system(cmd)


# The FASTA is now masked: read it back
with open(dir_masked+"all_SID_notal_rename_masked.fasta") as fasta:
    dico_fasta = parse_fasta(fasta)


# Write a file with the length of every sequence after masking, and group
# the sequences by SID in the same pass so that the per-SID steps below do
# not have to rescan the whole dictionary.
# Headers are split on ";": field 0 is used as the SID, field 1 as the
# haplotype (a header without ";" raises IndexError).
outtxt = dir_masked+"SID_masked_length.txt"

seqs_by_sid = {}
with open(outtxt, "w") as out:
    for key, masked_seq in dico_fasta.items():
        fields = key.split(";")
        SID = fields[0]
        haplo = fields[1]

        # Length once every "N" character has been removed
        unmasked_length = len(masked_seq.replace("N", ""))
        out.write("\t".join([haplo, SID, str(unmasked_length)])+"\n")

        seqs_by_sid.setdefault(SID, {})[key] = masked_seq

csid = set(seqs_by_sid)

# Process the SIDs in sorted order so that the output is reproducible
sorted_sids = sorted(csid)


# Write one masked fasta file per SID (for alignments)
dir_sid = dir_masked+"masked_fastaSID/"
if not os.path.isdir(dir_sid):
    os.makedirs(dir_sid)

for sid in sorted_sids:
    dir_out = dir_sid+sid+"_masked.fasta"

    with open(dir_out, "w") as out:
        write_fasta(seqs_by_sid[sid], out)


# Write one masked fasta file per SID (for alignments), restricted to the
# four sequenced haplotypes. A file is written for every SID, even when
# none of its sequences belongs to the selected haplotypes.
selhaplo = ["SA03", "SD06", "SD01", "Y128"]

dir_sidred = dir_masked+"masked_fastaSID_red4/"
if not os.path.isdir(dir_sidred):
    os.makedirs(dir_sidred)

for sid in sorted_sids:
    dico_sel = {}
    for key, masked_seq in seqs_by_sid[sid].items():
        if key.split(";")[1] in selhaplo:
            dico_sel[key] = masked_seq

    dir_out = dir_sidred+sid+"_masked4.fasta"

    with open(dir_out, "w") as out:
        write_fasta(dico_sel, out)


def _stat_value(stat_lines, label):
    """Return the text following "<label>: " in the lines containing label."""
    hits = "".join(line for line in stat_lines if label in line)
    return hits.rstrip().replace(label+": ", "")


# Align each reduced SID fasta file with clustalw and extract the highest
# and lowest pairwise identity from its statistics file
dir_stat = dir_masked+"align_sid_stat.txt"
stat_file = "stat.out"
with open(dir_stat, "w") as out:
    for count, sid in enumerate(sorted_sids):
        # Progress report; this call form works on Python 2 and Python 3
        print(sid+"\t"+str(count))

        # Remove the statistics of the previous alignment so that a failed
        # run cannot report the values of another SID
        if os.path.isfile(stat_file):
            os.remove(stat_file)

        cmd = "clustalw -align -infile="+dir_sidred+sid+"_masked4.fasta -type=dna  -stats=stat.out >align.out"
        os.system(cmd)

        # No statistics file (e.g. clustalw failed) gives empty values
        stat_lines = []
        if os.path.isfile(stat_file):
            with open(stat_file) as stats:
                stat_lines = stats.readlines()

        sel_high = _stat_value(stat_lines, "aln pw-id highest")
        sel_low = _stat_value(stat_lines, "aln pw-id lowest")

        newline = "\t".join([sid, sel_high, sel_low])
        out.write(newline+"\n")




