import sys
import pysam
import re
from numpy import median
import time

def real_dist_calc(MC_list,current_m,current_i,current_d,current_sh):
	"""Add up CIGAR operation lengths per operation type.

	MC_list is a sequence of (length, operation) entries whose length is
	converted with int(); the lengths are added on top of the running totals
	passed in. "M", "I" and "D" feed their own total, "S" and "H" share the
	clip total, and every other operation is ignored.

	Returns the updated (match, insertion, deletion, clip) totals.
	"""
	totals=[current_m,current_i,current_d,current_sh]
	# Operation letter -> index of the total it contributes to.
	slot_of={"M":0,"I":1,"D":2,"S":3,"H":3}
	for entry in MC_list:
		slot=slot_of.get(entry[1])
		if slot is not None:
			totals[slot]=totals[slot]+int(entry[0])
	return totals[0],totals[1],totals[2],totals[3]


def read_search(t_file,chr1,pos1,read_dic,count_stat):
	"""Group the reads overlapping one position by fragment key.

	Reads are fetched from t_file over the interval (pos1-1, pos1) on chr1.
	A read is skipped when it is not flagged as a proper pair, has no CIGAR,
	has an unmapped mate, or carries no MC (mate CIGAR) tag. Every remaining
	read is handed to read_type(), which files it into read_dic.

	Returns (count_stat, read_dic). count_stat is passed through unchanged;
	nothing in this function sets it.
	"""
	for read in t_file.fetch(chr1,pos1-1,pos1):
		if not read.is_proper_pair or read.cigartuples is None or read.mate_is_unmapped:
			continue
		# The mate's reference length is derived from the MC tag; without it
		# get_tag() would raise KeyError and abort the whole run, so such
		# reads are skipped instead.
		if not read.has_tag("MC"):
			continue
		MC_list=re.findall(r'(\d+)([A-Z]{1})', read.get_tag("MC"))
		current_m,current_i,current_d,current_sh=real_dist_calc(MC_list,0,0,0,0)
		# Matched plus deleted bases of the mate CIGAR.
		rel_dist=current_m+current_d
		# Second-to-last character of the hex flag. For flags below 0x10 this
		# is "x", which read_type() rejects.
		h_digit=hex(read.flag)[-2]
		tmp_read_dic=read_type(read,h_digit,rel_dist,read_dic)
		# read_type() answers "pass" for flag digits it does not recognise.
		if tmp_read_dic!="pass":
			read_dic=tmp_read_dic
	return count_stat,read_dic


# (is_read1, flag digit) -> (key layout, strand label).
#   "own"  : key is (read start, mate start + rel_dist)
#   "mate" : key is (mate start, read end)
# For each pairing of read1/read2 digits (6/9, 5/a, 4/8, 7/b) the two layouts
# describe the same pair of coordinates from either mate's point of view, so
# both mates must carry the same label to end up under one key.
_READ_LAYOUTS={
	(True,"6"):("own","F1R2"),
	(True,"5"):("mate","F2R1"),
	(True,"4"):("own","F1R1"),
	(True,"7"):("mate","F2R2"),
	(False,"a"):("own","F2R1"),
	(False,"9"):("mate","F1R2"),
	# Was labelled "F2R1", which separated this read from its read1 mate
	# (digit "4", "F1R1") although both produce identical coordinates.
	(False,"8"):("mate","F1R1"),
	(False,"b"):("own","F2R2"),
}


def read_type(read,h_digit,rel_dist,read_dic):
	"""File a read into read_dic under its "start<TAB>end<TAB>label" key.

	read      -- object exposing is_read1, reference_start,
	             next_reference_start and reference_end.
	h_digit   -- single hex digit taken from the read's flag by the caller.
	rel_dist  -- offset added to the mate start to obtain the key's end
	             coordinate when the key is built from the read's own start.
	read_dic  -- dict mapping keys to lists of reads; modified in place.

	Returns read_dic, or the string "pass" (leaving read_dic untouched) when
	the combination of is_read1 and h_digit is not one of the known layouts.
	"""
	layout=_READ_LAYOUTS.get((bool(read.is_read1),h_digit))
	if layout is None:
		return "pass"
	anchor,label=layout
	if anchor=="own":
		pos_key=str(read.reference_start)+"\t"+str(read.next_reference_start+rel_dist)+"\t"+label
	else:
		pos_key=str(read.next_reference_start)+"\t"+str(read.reference_end)+"\t"+label
	# setdefault avoids scanning a key list on every read.
	read_dic.setdefault(pos_key,[]).append(read)
	return read_dic

def real_dist_cigar_calc(cigar_list,est_dist,current_m,current_i,current_d,target_del_stat):
	"""Convert a reference offset into an offset that accounts for indels.

	cigar_list holds (operation code, length) pairs using pysam's numbering
	(0=M, 1=I, 2=D, 3=N, 7="=", 8=X). The list is walked until the reference
	bases consumed so far exceed est_dist:

	* M, "=" and X advance the consumed reference length;
	* I adds its length to the inserted-base total;
	* D and N either swallow the target offset, which sets target_del_stat
	  to 1 and stops the walk, or add their length to the removed-base total;
	* every other operation (clips, padding) is ignored.

	current_m, current_i and current_d are the starting totals.

	Returns (est_dist + inserted - removed, target_del_stat).
	"""
	for cigar in cigar_list:
		op=cigar[0]
		length=cigar[1]
		consumed=current_m+current_d
		if consumed>est_dist:
			# The target offset has already been passed.
			break
		if op in (0,7,8):
			current_m=current_m+length
		elif op==1:
			current_i=current_i+length
		elif op in (2,3):
			if consumed+length>est_dist:
				# The target offset lies inside the gap, so the read has no
				# base for it.
				target_del_stat=1
				break
			current_d=current_d+length
	return est_dist+current_i-current_d,target_del_stat
# Strand label -> position of its summary inside an out_dic entry; any other
# label goes to position 3.
_STRAND_SLOT={"F1R1":0,"F1R2":1,"F2R1":2}


def _median_or_na(values):
	"""Return str(median(values)), or "NA" for an empty list."""
	if not values:
		return "NA"
	return str(median(values))


def _min_med_max(values):
	"""Return "min,median,max" of values, or "NA" for an empty list."""
	if not values:
		return "NA"
	return ','.join([str(min(values)),str(median(values)),str(max(values))])


def block_search(sorted_list,read_dic,pos1,alt_nt,var_loca_lt,var_loca_rt):
	"""Summarise, per read group, the reads that support alt_nt at pos1.

	sorted_list -- keys of read_dic ("start<TAB>end<TAB>label") in the order
	               they should be processed.
	read_dic    -- dict mapping each key to its list of reads.
	pos1        -- target position; the offset inside a read is computed as
	               pos1 - read.reference_start - 1.
	alt_nt      -- base compared against the read base at the target offset.
	var_loca_lt, var_loca_rt -- kept for interface compatibility only; they
	               are neither read nor modified.

	For every group the summary is the comma-joined list
	tot_n, ref_n, var_n, median ref MAPQ, median var MAPQ,
	left distance min/median/max, right distance min/median/max,
	median mismatch count of variant reads ("NA" where nothing was observed).
	The three counts are numbers of distinct query names.

	Returns a dict keyed by "start<TAB>end". Each value is a list initialised
	with eight "NA" strings, of which positions 0-3 receive the summaries of
	the F1R1, F1R2, F2R1 and remaining-label groups.
	"""
	out_dic={}
	for block in sorted_list:
		var_mapq=[];ref_mapq=[]
		tot_names=[];ref_names=[];var_names=[]
		var_nm=[]
		left_dists=[];right_dists=[]
		for read in read_dic[block]:
			cigar_list=read.cigartuples
			est_dist=pos1-read.reference_start-1
			rel_dist,target_del_stat=real_dist_cigar_calc(cigar_list,est_dist,0,0,0,0)
			if target_del_stat==1:
				# The target base is deleted in this read.
				continue
			aligned_seq=read.query_alignment_sequence
			# An offset outside the aligned sequence would raise IndexError
			# or, if negative, silently read a base from the other end.
			if aligned_seq is None or not 0<=rel_dist<len(aligned_seq):
				continue
			tot_names.append(read.query_name)

			if aligned_seq[rel_dist]!=alt_nt:
				ref_names.append(read.query_name)
				ref_mapq.append(read.mapping_quality)
				continue

			# Variant-supporting read.
			if read.has_tag('NM'):
				counts=read.get_cigar_stats()[0]
				# Entry 10 minus inserted (1) and deleted (2) bases.
				var_nm.append(counts[10]-(counts[1]+counts[2]))
			var_names.append(read.query_name)
			var_mapq.append(read.mapping_quality)
			rlength=read.infer_query_length()
			first_op=cigar_list[0][0]
			if first_op==0:
				dist=rel_dist
			elif first_op==4 or first_op==5:
				# Leading clip: shift the offset by the clipped length.
				dist=rel_dist+cigar_list[0][1]
			else:
				dist=None
			if dist is not None:
				left_dists.append(dist)
				right_dists.append(rlength-1-dist)

		read_list=[
			str(len(set(tot_names))),
			str(len(set(ref_names))),
			str(len(set(var_names))),
			_median_or_na(ref_mapq),
			_median_or_na(var_mapq),
			_min_med_max(left_dists),
			_min_med_max(right_dists),
			_median_or_na(var_nm),
		]
		fields=block.split('\t')
		pair_key="\t".join(fields[0:2])
		if pair_key not in out_dic:
			out_dic[pair_key]=["NA"]*len(read_list)
		out_dic[pair_key][_STRAND_SLOT.get(fields[2],3)]=','.join(read_list)
	return out_dic

def output_gen(out_dic):
	"""Render out_dic as four tab-terminated columns.

	Keys have the form "start<TAB>end" and are ordered numerically by start,
	then end. Column i is the ";"-joined sequence of element i of every entry
	in that order, followed by a tab; only elements 0-3 are used.
	"""
	def coords(key):
		parts=key.split('\t')
		return (int(parts[0]),int(parts[1]))

	ordered=[out_dic[key] for key in sorted(out_dic.keys(),key=coords)]
	columns=[";".join([entry[slot] for entry in ordered]) for slot in range(0,4)]
	return "".join([column+"\t" for column in columns])

def main():
	"""Annotate every record of a VCF with per-group read statistics.

	sys.argv[1] is the VCF path and sys.argv[2] the BAM path. Leading "##"
	lines are skipped, the next line is copied as the column header with four
	extra columns, and each following record is written to
	<vcf path>.v13.info.txt with the four columns from output_gen() appended.
	Columns 0, 1 and 4 of a record are used as chromosome, position and
	alternative base. The elapsed time is printed at the end.
	"""
	if len(sys.argv)<3:
		sys.exit("usage: %s <vcf> <bam>" % sys.argv[0])
	vcf_path=sys.argv[1]
	bam_path=sys.argv[2]

	stat_fields="(tot_n;ref_n;var_n;ref_mq;var_mq;var_LocaLeftMin;Med;Max;var_LocaRightMin;Med;Max;var_medMismatch)"
	extra_header="\t".join([label+stat_fields for label in ("F1R1","F1R2","F2R1","F2R2")])+"\n"

	# open() works on both Python 2 and 3, and the with-block closes both
	# files even when an error interrupts the run.
	with open(vcf_path) as vcf_file, open(vcf_path+".v13.info.txt","w") as out_file:
		vcf_line=vcf_file.readline().rstrip()
		while vcf_line.startswith("##"):
			vcf_line=vcf_file.readline().rstrip()
		out_file.write(vcf_line+'\t'+extra_header)

		t_file=pysam.AlignmentFile(bam_path,"rb")
		start_time = time.time()
		try:
			for raw_line in vcf_file:
				vcf_line=raw_line.rstrip()
				if not vcf_line:
					# A blank line is skipped rather than ending the run.
					continue
				vcf_info=vcf_line.split('\t')
				chr1=vcf_info[0]
				pos1=int(vcf_info[1])
				alt_nt=vcf_info[4]

				count_stat,read_dic=read_search(t_file,chr1,pos1,{},0)
				if count_stat==1:
					continue
				sorted_list=sorted(read_dic.keys(),key=lambda x :(int(x.split("\t")[0]),int(x.split("\t")[1]),x.split("\t")[2]))
				out_dic=block_search(sorted_list,read_dic,pos1,alt_nt,[],[])
				# output_gen() ends with a tab; drop it before the newline.
				out_file.write(vcf_line+"\t"+(output_gen(out_dic))[:-1]+"\n")
		finally:
			t_file.close()

	print("--- %s seconds ---" % (time.time() - start_time))

if __name__ == "__main__":
    # Run main() only when this file is executed as a script, not when it is imported.
    main()


