#! /usr/bin/env python
import sys, os, glob, string, subprocess,time, math, re, compiler
# Pre-compiled pattern that splits a SAM record on any run of whitespace
# (tabs between fields, plus the trailing newline).
space = re.compile(r'\s+')
# The former `compiler.parseFile(sys.argv[0])` self-check was dropped: the
# interpreter has already compiled this file by the time the statement runs,
# it parsed whatever script sys.argv[0] names (wrong file when this module is
# imported), and the `compiler` module no longer exists in Python 3.

######################################################################

# Lookup tables between nucleotide letters and small integer codes.
# bti: base letter -> integer code; A/C/G/T are accepted in both cases and
# 'N' (upper-case only) gets the extra code 4.
bti = dict(A=0, C=1, G=2, T=3, a=0, c=1, g=2, t=3, N=4)
# itb: integer code -> upper-case base letter (there is no entry for code 4).
itb = dict(enumerate('ACGT'))

def flagtostring(flag, nbits=10):
	"""Return the low bits of an integer flag as a string of '0'/'1'.

	The string is least-significant-bit first, so character i of the result
	is '1' exactly when bit i (value 2**i) of `flag` is set.

	Parameters:
		flag:  integer flag value (e.g. the FLAG column of a SAM record).
		nbits: number of bits to report; defaults to 10, the width the
		       rest of this file indexes into.

	Returns:
		A string of length `nbits`.
	"""
	# Bit shifts instead of `flag/2`: the latter only works with Python 2
	# integer division and yields garbage once `/` means true division.
	return ''.join('1' if (flag >> i) & 1 else '0' for i in range(nbits))

#flagtostring(int(sys.argv[1])); sys.exit();

def read_fasta(seqfile):
	"""Load every record of a FASTA file into memory.

	Parameters:
		seqfile: path of the FASTA file to read.

	Returns:
		A two-item list [sequences, reflist].
		sequences maps each header (the header line stripped of
		surrounding whitespace and of its leading '>') to a two-item list
		[sequence, basecalls]: the upper-cased sequence string and an
		empty dict that callers fill with per-position base calls.
		reflist holds the headers in the order they appear in the file.

	Raises:
		SystemExit: (status 1, after a message on stderr) if the file
			cannot be opened.
		ValueError: if sequence data appears before the first header.
	"""
	# open() raises instead of returning a false value, so the failure has
	# to be caught here for the "not found" message to ever be shown.
	try:
		File = open(seqfile, 'r')
	except IOError:
		sys.stderr.write('reference sequence file %s not found\n' % seqfile)
		sys.exit(1)

	sequences = {}
	reflist = []
	pieces = {}      # header -> list of sequence lines, joined once at the end
	refname = None
	with File:
		for line in File:
			text = line.strip()
			if text.startswith('>'):
				refname = text.lstrip('>')
				sequences[refname] = ['', {}]    # refsequence and basecalls
				pieces[refname] = []             # a repeated header restarts the record
				reflist.append(refname)
			elif refname is None:
				# Nothing to attach this line to yet: tolerate blank lines,
				# reject real data.
				if text:
					raise ValueError('sequence data before the first header in ' + seqfile)
			else:
				pieces[refname].append(text.upper())

	# One join per record instead of repeated `+=` on a growing string.
	for name in pieces:
		sequences[name][0] = ''.join(pieces[name])
	return [sequences, reflist]
	

def print_pileup(reflist,current,index,start,end):
	"""Write one pileup row to stdout for each reference position in [start, end).

	Parameters:
		reflist: list of reference names; reflist[current] is printed in
		         the first column.
		current: position in reflist of the reference being printed.
		index:   two-item list [refsequence, basecalls] for that reference.
		         basecalls maps a 0-based position to a string made of one
		         leading filler character followed by six characters per
		         read: base, base quality, position-in-read code, mapping
		         quality code, strand ('+'/'-') and mismatch-count code.
		start, end: 0-based half-open range of positions to print.

	Each row holds, space separated: reference name, 1-based position,
	reference base, coverage, bases, base qualities, mapping qualities,
	comma-terminated positions-in-read, and mismatch codes.  The bases,
	qualities and mapping-quality columns always start with '@'.  Positions
	without base calls get a coverage-0 row.  Printed entries are removed
	from basecalls so the memory is released.
	"""
	refseq = index[0]
	calls = index[1]
	for j in range(start, end):
		bases = ['@']; qvalues = ['@']; mvalues = ['@']
		positions = []; mismatches = []
		coverage = 0
		if j in calls:
			T = calls[j]   # all base calls piled up on reference position j
			# Floor division: coverage feeds range() and must stay an int
			# regardless of what `/` means in the running interpreter.
			coverage = (len(T) - 1) // 6
			for k in range(coverage):
				base, qual, pos, mapq, strand, nmis = T[6*k+1:6*k+7]
				qvalues.append(qual)
				positions.append(str(ord(pos) - 33) + ',')
				mvalues.append(mapq)
				mismatches.append(nmis)
				# NOTE(review): ',' marks a '+' strand match and '.' a '-'
				# strand match, the reverse of the samtools convention;
				# kept as is, confirm what downstream tools expect.
				matches = (refseq[j] == base)
				if strand == '+': bases.append(',' if matches else base)
				elif strand == '-' and matches: bases.append('.')
				else: bases.append(base.lower())
			del calls[j]
		fields = (reflist[current], j+1, refseq[j], coverage, ''.join(bases),
			''.join(qvalues), ''.join(mvalues), ''.join(positions), ''.join(mismatches))
		sys.stdout.write(' '.join([str(f) for f in fields]) + '\n')



def samtopileup(readfile,seqfile,readlength):
	"""Convert a SAM file into a per-position pileup written to stdout.

	Parameters:
		readfile:   path of the SAM file.  Records must be grouped by
		            reference in the same order as the FASTA file and
		            sorted by position within each reference, because
		            rows are flushed as soon as a later read is seen.
		seqfile:    path of the reference FASTA file.
		readlength: added to the position-in-read code of second-in-pair
		            reads (FLAG bit 128) so mates can be told apart.

	A record is skipped when its reference name is '*', it is flagged
	unmapped, its CIGAR is '*' or contains an insertion/deletion, it does
	not fit inside the reference, or it has 7 or more mismatches against
	the reference.  Header lines ('@...') and lines with fewer than the
	11 mandatory SAM fields are ignored.

	Progress and the reference lengths are reported on stderr.  Exits with
	status 1 if an input file cannot be opened, the FASTA file is empty, or
	the SAM references are not in FASTA order.
	"""
	MAX_READ_MISMATCHES = 7    # reads with at least this many mismatches are dropped
	MAX_MQ_CODE = 123          # cap for the mapping-quality character code

	[sequences,reflist] = read_fasta(seqfile)
	if not reflist:
		sys.stderr.write('reference sequence file %s contains no sequences\n' % seqfile)
		sys.exit(1)
	for name, entry in sequences.items():
		sys.stderr.write('%s %d\n' % (name, len(entry[0])))

	# open() raises rather than returning a false value, so catch it to get
	# the intended message.
	try:
		File = open(readfile, 'r')
	except IOError:
		sys.stderr.write('samfile %s not found\n' % readfile)
		sys.exit(1)

	# current/index: reference being flushed; last: first position of it
	# that has not been printed yet.
	current = 0; index = sequences[reflist[current]]; last = 0
	reads = 0
	with File:
		for s in File:
			if s.startswith('@'): continue       # SAM header line
			line = space.split(s)
			if len(line) < 11: continue          # blank or truncated record
			if line[2] == '*': continue
			reads += 1
			if reads % 100000 == 0: sys.stderr.write('processed %d\n' % reads)
			flag = int(line[1]); locus = line[2]; start = int(line[3]); mq = int(line[4]); cigarstring = line[5]
			read = line[9]; quality = line[10]
			refsequence = sequences[locus][0]

			if flag == 4: continue
			fs = flagtostring(flag)
			if fs[2] == '1': continue            # unmapped read
			strand = '-' if fs[4] == '1' else '+'
			read12 = 0                           # 1/2 = first/second mate, 0 = unpaired or unknown
			if fs[0] == '1':
				if fs[6] == '1': read12 = 1
				elif fs[7] == '1': read12 = 2

			# NOTE(review): only I, D and '*' are rejected; soft-clipped (S)
			# or spliced (N) reads are still laid down base-for-base from
			# POS, which looks misaligned -- confirm the inputs never
			# contain them.
			if 'I' in cigarstring or 'D' in cigarstring or cigarstring == '*': continue

			first = start - 1                    # 0-based position of the first read base
			if first < 0: continue
			# `>` (not `>=`): a read whose last base is the last reference
			# base still fits.
			if first + len(read) > len(refsequence): continue

			mismatches = sum(1 for i, base in enumerate(read) if base.upper() != refsequence[first+i])
			if mismatches >= MAX_READ_MISMATCHES: continue

			delta = readlength if read12 == 2 else 0
			mqb = chr(min(MAX_MQ_CODE, mq+33))
			nmis = chr(mismatches+48)
			calls = sequences[locus][1]
			for i in range(len(read)):
				# Position-in-read code counts from the 5' end of the read.
				if strand == '+': psb = chr(i+33+1+delta)
				else: psb = chr(len(read)-i+33+delta)
				# '_' is the leading filler print_pileup expects in front
				# of the first six-character base call.
				calls[first+i] = calls.get(first+i, '_') + read[i] + quality[i] + psb + mqb + strand + nmis

			if locus != reflist[current]:
				# Moved on to a later reference: flush the rest of the
				# current one and, in full, every reference in between that
				# received no reads, so `current` always matches `locus`.
				try: target = reflist.index(locus, current+1)
				except ValueError:
					sys.stderr.write('samfile %s is not sorted in reference order: %s seen after %s\n' % (readfile, locus, reflist[current]))
					sys.exit(1)
				while current < target:
					print_pileup(reflist,current,index,last,len(index[0])); last = 0
					current += 1; index = sequences[reflist[current]]
			else:
				# Everything left of this read's start can no longer change.
				print_pileup(reflist,current,index,last,start-1); last = start-1

	# Final lines of the last reference that received reads.  References
	# after it are not printed.
	print_pileup(reflist,current,index,last,len(index[0]))


# Command-line entry point: SAMfile, reference FASTA, read length.
# Guarded so that importing this module does not start a conversion or exit
# the importing program.
if __name__ == '__main__':
	if len(sys.argv) < 4:
		# stdout carries the pileup itself, so the usage text goes to stderr
		# and the exit status reports the failure.
		sys.stderr.write('python sam_pileup.py SAMfile referencesequence.fasta readlength\n')
		sys.exit(1)
	samtopileup(sys.argv[1],sys.argv[2],int(sys.argv[3]))


