import sys, math, SingleLinkage, FastaManager, FileUtility, string, os

try:
	import DatabaseOp
except ImportError:
	print "\nNOTE: DatabaseOp not imported"


class parser:

	def __init__(self, dbtask="", config=""):
		self.dbtask = dbtask
		self.config = config	    

	
	def for_mega(self,bdir,fasta):
		"""Run the three-step pipeline that turns "<fasta>.out" into a MEGA file.

		bdir  -- directory name; it is normalised to end with "/" but is not
		         used by any of the calls below
		fasta -- base name; the BLAST table is expected at "<fasta>.out"

		Steps: parse_table (E-value target, T=1, self hits kept), then
		symmetrify on the parsed table, then mega_score on the symmetric
		scores.  Each step reads the file written by the previous one.
		"""
		# An empty bdir used to raise IndexError on bdir[-1].
		if bdir and not bdir.endswith("/"):
			bdir += "/"

		self.parse_table("%s.out" % fasta,"evalue",T=1,wself=1)
		self.symmetrify("%s.out_T1_evalue.parse"    % fasta)
		self.mega_score("%s.out_T1_evalue_c0h0.sym" % fasta)
    

	def match_fam(self,blast,matrix,target,unknown=0):
		"""Write the best hit of each sequence when its partner is in `target`.

		blast   -- tab-delimited BLAST table; column 0 is the query, column 1
		           the subject, column 2 the percent identity and the
		           second-to-last column the E-value
		matrix  -- file passed to futil.file_to_dict(matrix,1); the returned
		           dict is indexed by partner name and compared with `target`
		target  -- label to keep; also the suffix of the output file
		unknown -- if true, hits are grouped by query (partner = subject),
		           otherwise by subject (partner = query)

		For every grouping key the hit with the highest percent identity is
		kept (the first one wins on ties).  Output "<blast>.<target>" holds
		key, partner, identity (written with %i, so truncated) and E-value.
		Raises KeyError if a partner is absent from the matrix dict.
		"""
		matrix = futil.file_to_dict(matrix,1)

		best = {}
		with open(blast,"r") as inp:
			for inl in inp:
				llist  = inl.split("\t")
				query  = llist[0]
				subj   = llist[1]
				pid    = float(llist[2])
				evalue = llist[-2]

				if unknown:
					key, partner = query, subj
				else:
					key, partner = subj, query

				if key not in best or pid > best[key][1]:
					best[key] = [partner,pid,evalue]

		with open(blast+"."+target,"w") as oup:
			for key in best:
				partner, pid, evalue = best[key]
				if matrix[partner] == target:
					oup.write("%s\t%s\t%i\t%s\n" % (key,partner,pid,evalue))
                            
	def get_scores_for_spc(self,score_file,outbase,per_id=10,homogenize=0):
		"""Convert an E-value table into indexed, normalised pair scores.

		score_file -- tab-delimited lines "id1<TAB>id2<TAB>evalue"; each id
		              must end in "start-end" because the span length is
		              computed from the text around the first "-"
		outbase    -- prefix for the output files; "" means use score_file
		per_id     -- number of score ranks kept per sequence; 0 disables the
		              whole output stage
		homogenize -- if true, the square root of each score is stored

		Files written: "<outbase>_p<per_id>h<homogenize>.log" always, and
		".idx" (name -> index) plus ".spc" (index pairs with score) when
		per_id != 0.  In that case the method ends with sys.exit(0) and does
		not return.  None of the files is closed explicitly.
		"""
		MAX_SCORE = 200
		MIN_SCORE = 0
		
		if outbase == "":
			outbase = score_file

		oup3 = open(outbase+"_p%ih%i.log" % (per_id,homogenize),"w")

		# log header: record the parameters of this run
		oup3.write("ParseBlast.py get_score_for_spc")
		oup3.write(" score_file= %s\n" % score_file)
		oup3.write(" outbase   = %s\n" % outbase)
		oup3.write(" per_id    = %i\n" % per_id)
		oup3.write(" homogenize= %i\n\n" % homogenize)


		# Pass 1: read every line into score_dict["id1_id2"] = [len1,len2,score]
		inp = open(score_file,"r")
		score_dict = {}
		name_dict    = {}
		inline   = inp.readline()
		while inline != "":	     
			llist = inline.split("\t")
			idx1  = llist[0]
			# span length = end - start + 1; start is the text between the
			# first " " (or the beginning, if there is none) and the first "-"
			len1  = int(idx1[idx1.find("-")+1:]) - \
					int(idx1[idx1.find(" ")+1:idx1.find("-")]) + 1
			idx2  = llist[1]
			len2  = int(idx2[idx2.find("-")+1:]) - \
					int(idx2[idx2.find(" ")+1:idx2.find("-")]) + 1

			# name_dict value 0 = not (yet) written to the index file
			if not name_dict.has_key(idx1):
				name_dict[idx1] = 0
			if not name_dict.has_key(idx2):
				name_dict[idx2] = 0					     

			# drop the last character of the field (presumably the newline)
			score = llist[2][:-1]
			if score[0] == "e":
				# values written as "e-NN" get a leading "1" before conversion
				try:
					score = -math.log10(float("1"+score))
				except OverflowError:
					print "OVERFLOW:",idx1,idx2,score
					inline = inp.readline()
					continue
			elif score  == "0.0": 
				score = MAX_SCORE
			else:
				try:
					score = -math.log10(float(score))
				except OverflowError:
					print "OVERFLOW:",idx1,idx2,score
					inline = inp.readline()
					continue
				except ValueError:
					print "ValueError:",idx1,idx2,score
					inline = inp.readline()
					continue

			# negative -log10 values (E-value above 1) are discarded
			if score < MIN_SCORE:
				inline = inp.readline()
				continue

			if homogenize:
				score_dict[idx1+"_"+idx2] = [len1,len2,math.sqrt(score)]
			else:
				score_dict[idx1+"_"+idx2] = [len1,len2,score]
					
			inline = inp.readline()

		# Pass 2: average reciprocal scores and normalise by a self score.
		# tscores[name] = [[partner names],[scores]] (parallel lists)
		tscores = {}
		keys = score_dict.keys()
		for i in keys:
			# NOTE(review): the key is split at the FIRST "_", so ids that
			# themselves contain "_" would be split in the wrong place
			idx1    = i[:i.find("_")]
			idx2    = i[i.find("_")+1:]

			# the entry may already have been deleted as the reverse of an
			# earlier pair (see the del statements below)
			if score_dict.has_key(idx1+"_"+idx2):
				score12 = score_dict[i][2]
			else:
				continue

			# pairs without a reverse hit are ignored
			if score_dict.has_key(idx2+"_"+idx1):
				len1    = score_dict[i][0]
				len2    = score_dict[i][1]
				score21 = score_dict[idx2+"_"+idx1][2]
			else:
				continue

			score = (score12+score21)/2

			# normalise by the self score of the longer span
			# NOTE(review): raises KeyError if that self pair is not in
			# score_dict, and ZeroDivisionError if the self score is 0
			if len1 >= len2:
				selfL = score_dict[idx1+"_"+idx1][2]
			else:
				selfL = score_dict[idx2+"_"+idx2][2]
				
			score = score/selfL

			# both directions are consumed; self pairs are kept because they
			# are still needed as normalisers for later pairs
			if idx1 != idx2:
				del score_dict[i]
				del score_dict[idx2+"_"+idx1]
			
			if score > 1:
				print "Score bigger than 1:",idx1,idx2,score
				score = 1

			if not tscores.has_key(idx1):
				tscores[idx1] = [[idx2],[score]]
			else:
				if not idx2 in tscores[idx1][0]:
					tscores[idx1][0].append(idx2)
					tscores[idx1][1].append(score)

			if not tscores.has_key(idx2):
				tscores[idx2] = [[idx1],[score]]
			else:
				if not idx1 in tscores[idx2][0]:
					tscores[idx2][0].append(idx1)
					tscores[idx2][1].append(score)			  

		del score_dict
	
		# Pass 3 (only when per_id != 0): keep the top ranks and write output
		if per_id != 0:

			print "Get top %i scores...\n" % per_id
			for i in tscores.keys():
				# group partner positions by score value
				# (the local name `map` shadows the builtin inside this loop)
				map = {}
				for j in range(len(tscores[i][1])):
					if not map.has_key(tscores[i][1][j]):
						map[tscores[i][1][j]] = [j]
					else:
						map[tscores[i][1][j]].append(j)


				# distinct score values, highest first
				order = map.keys()
				order.sort()
				order.reverse()

				ilist = []
				slist = []		      
				# ranks 1..per_id are kept; rank 0 (the highest distinct
				# score) is skipped
				# NOTE(review): presumably rank 0 is the self score of 1 - confirm
				for j in range(1,per_id+1):
					if j<len(order):
						
						score   = order[j]
						members = map[score]

						for k in members:
							ilist.append(tscores[i][0][k])
							slist.append(score)
					else:
						break

				tscores[i] = [ilist,slist]


			idx_dict   = {}
			count = 0
			
			oup1 = open(outbase+"_p%ih%i.idx" % (per_id,homogenize),"w")
			oup2 = open(outbase+"_p%ih%i.spc" % (per_id,homogenize),"w")

			pairs = {}
			for i in tscores.keys():
				for j in range(len(tscores[i][0])):

					index = tscores[i][0][j]
					score = tscores[i][1][j]

					# only reciprocal pairs are written: i must also be in
					# the kept list of its partner
					if i in tscores[index][0]:
						
						# assign indices in order of first use
						if not idx_dict.has_key(i):
							idx_dict[i]  = count
							name_dict[i] = 1
							oup1.write("%s\t%i\n" % (i,count))      
							count = count +1

						if not idx_dict.has_key(index):
							idx_dict[index]  = count
							name_dict[index] = 1
							oup1.write("%s\t%i\n" % (index,count))
							count = count+1

						# write each unordered pair once
						if(not pairs.has_key(i+"_"+index) and
						   not pairs.has_key(index+"_"+i)):
							oup2.write("%i\t%i\t%f\n" % (idx_dict[i],
														 idx_dict[index],
														 score))
							pairs[i+"_"+index] = 1
					else:
						continue

			del tscores

			# names that never received an index are logged as singletons
			oup3.write("Singletons:\n")
			count = 0
			for i in name_dict.keys():
				if name_dict[i] == 0:
					oup3.write(" "+i+"\n")
					count = count+1
			oup3.write("Total = %i" % count)
			
			print "Done!\n"
			# terminates the interpreter, so control never returns to the caller
			sys.exit(0)
		

	def get_scores_for_mcl(self,score_file,outbase,cutoff=1,homogenize=0):
		"""Write an index file and an MCL matrix file from a score table.

		score_file -- lines of exactly three tab-separated fields:
		              query, subject, score
		outbase    -- output prefix; "" means use score_file
		cutoff     -- scores below this value are not stored
		homogenize -- if true, the square root of each symmetric score is used

		"<outbase>.idx" lists "index<TAB>name" in order of first appearance.
		"<outbase>.mci" holds the symmetric matrix: a pair is kept only when
		both directions were stored, and its value is the mean of the two.
		Indices without any kept pair get one diagonal entry of 200 (or its
		square root).  A malformed line prints a message and calls
		sys.exit(0).
		"""
		if outbase == "":
			outbase = score_file

		src      = open(score_file,"r")
		idx_out  = open(outbase+".idx","w")
		idx      = {}   # name -> integer index
		raw      = {}   # raw[query][subject] = best stored score
		next_idx = 0
		n_read   = 0

		for line in src:
			if n_read % 100000 == 0:
				print(" %i x 100k" % (n_read/100000))
			n_read += 1
			fields = line.split("\t")
			if len(fields) != 3:
				print("Score format problem: should be [query][subj][-log(e)]")
				print("Quit!")
				sys.exit(0)

			# index query first, then subject
			for name in fields[:2]:
				if name not in idx:
					idx[name] = next_idx
					idx_out.write("%i\t%s\n" % (next_idx,name))
					next_idx += 1

			value = float(fields[2])
			if value >= cutoff:
				row = raw.setdefault(fields[0],{})
				if fields[1] not in row or value > row[fields[1]]:
					row[fields[1]] = value

		dim = next_idx

		src.close()
		idx_out.close()

		sym = {}   # sym[index][index] = symmetric score
		print("Symmetrify scores...")
		n_done = 0
		for a in list(raw):
			if n_done % 1e4 == 0:
				print(" %i x 10k" % (n_done/1e4))
			n_done += 1
			for b in list(raw[a]):
				score = 0

				if b in raw and a in raw[b]:
					forward, backward = raw[a][b], raw[b][a]
					if forward == backward:
						score = forward
					else:
						score = (forward+backward)/2.0

					# consume both directions so the pair is handled once
					del raw[b][a]
					if a != b:
						del raw[a][b]

				if score == 0:
					continue
				if homogenize:
					score = math.sqrt(score)

				ia, ib = idx[a], idx[b]
				sym.setdefault(ia,{}).setdefault(ib,score)
				sym.setdefault(ib,{}).setdefault(ia,score)
		del raw

		print("Generate output file...")
		mci = open(outbase+".mci","w")
		mci.write("(mclheader\nmcltype matrix\ndimensions %ix%i\n)\n" % (dim,dim))
		mci.write("(mclmatrix\nbegin\n")

		for row, cols in sym.items():
			mci.write("%i " % row)
			for col in cols:
				text = str(cols[col])
				# keep at most three characters after the decimal point
				mci.write("%i:%s " % (col,text[:text.find(".")+4]))
			mci.write("$\n")

		# rows without any kept pair get a single diagonal entry
		if homogenize:
			lonely = math.sqrt(200.0)
		else:
			lonely = 200.0
		for k in range(dim):
			if k not in sym:
				mci.write("%i %i:%f $\n" % (k,k,lonely))

		mci.write(")")

		print("Close outputstream...")
		mci.close()
		print("Done!")
				  

	def symmetrify(self,score_file,outbase="",cutoff=0,homogenize=0):
		"""Turn directional similarity scores into symmetric, normalised distances.

		score_file -- tab-delimited lines; field 0 and 1 are the two names and
		              the last field (after self.rmlb) is the score
		outbase    -- output prefix; "" means score_file without its extension
		cutoff     -- every score below this value is raised to it
		homogenize -- 1 squares each non-zero distance, 2 takes its square root

		For a pair (i,j) the similarity is the mean of both directions, or
		the single direction if the reverse is missing; it is 0 when j never
		occurs as a query.  The distance is 1 - similarity/max(self_i,self_j),
		or 1.0 when that self score is 0.  Each pair is written in both
		directions to "<outbase>_c<cutoff>h<homogenize>.sym"; duplicates and
		problems are logged to "<score_file>_sym.log".

		A query without a self score prints a message and calls sys.exit(0).
		"""
		with open(score_file+"_sym.log","w") as oup_log:

			print("Construct score dict...")
			o_scores = {}
			c = 0
			print("Processed lines:")
			oup_log.write("More than one scores:\n")
			with open(score_file,"r") as inp:
				for inline in inp:
					if c % 100000 == 0:
						print(" %ix100k" % (c/100000))
					c += 1

					lnlist = inline.split("\t")
					lnlist[-1] = self.rmlb(lnlist[-1])
					query, subj = lnlist[0], lnlist[1]
					value = float(lnlist[-1])

					row = o_scores.setdefault(query,{})
					if subj in row:
						# duplicate pair: log it and keep the larger score
						oup_log.write(" %s,%s %f -> %s\n" % \
										(query,subj,row[subj],lnlist[-1]))
						if value > row[subj]:
							row[subj] = value
					else:
						row[subj] = value

			print("Apply cutoff...")
			okeys = list(o_scores.keys())
			selfC = 0
			oup_log.write("\nSelf score below cutoff at %i:\n" % cutoff)
			for i in okeys:
				for j in list(o_scores[i].keys()):
					if o_scores[i][j] < cutoff:
						# for log
						if i == j:
							oup_log.write(" %s\t%f\n" % (i,o_scores[i][i]))
							selfC += 1
						o_scores[i][j] = cutoff

			print(" %i self score below cutoff %i" % (selfC,cutoff))

			print("Symmetrify and normalize scores...")
			# s_scores[name] = [[partner names],[distances]] (parallel lists)
			s_scores = {}
			print("Process total %i taxa:" % len(okeys))
			c = 0
			oup_log.write("\nNot in score dict, most likely not in BLAST file:\n")
			for i in okeys:
				if c % 10 == 0:
					print(" %s" % (c+1))
				c += 1

				# i always has a row here because it comes from okeys; only
				# its self score can be missing
				if i not in o_scores[i]:
					print("Self score missing: %s" % i)
					print("Did you parse the table -wself 1? QUIT!")
					sys.exit(0)

				# snapshot: rows gain mirrored entries while we iterate
				for j in list(o_scores[i].keys()):
					score = 0
					if j in o_scores:
						if i in o_scores[j]:
							if o_scores[i][j] == o_scores[j][i]:
								score = o_scores[i][j]
							else:
								score = (o_scores[i][j]+o_scores[j][i])/2.0
						else:
							# one-directional hit: mirror it
							o_scores[j][i] = o_scores[i][j]
							score = o_scores[i][j]

						# The original tested o_scores[i].has_key(j), which is
						# always true, and then failed with KeyError below.
						if j not in o_scores[j]:
							print("Self score missing: %s" % j)
							print("Did you parse the table -wself 1? QUIT!")
							sys.exit(0)
					else:
						# j never occurs as a query: give it a self score
						print(" score dict misses: %s" % ([j],))
						oup_log.write(" %s" % j)
						o_scores[j] = {j:cutoff}

					# normalise by the larger of the two self scores
					denom = o_scores[i][i]
					if o_scores[j][j] > denom:
						denom = o_scores[j][j]
					if denom == 0:
						score = 1.0
					else:
						score = 1.0 - score/denom

					if homogenize > 0 and score != 0:
						if homogenize == 1:
							score = score*score
						elif homogenize == 2:
							score = math.sqrt(score)

					for a, b in ((i,j),(j,i)):
						if a in s_scores:
							if b not in s_scores[a][0]:
								s_scores[a][0].append(b)
								s_scores[a][1].append(score)
						else:
							s_scores[a] = [[b],[score]]

			if outbase == "":
				# splitext leaves names without an extension untouched; the
				# previous rfind(".") slice cut off their last character
				outbase = os.path.splitext(score_file)[0]
			outbase = "%s_c%ih%i" % (outbase,cutoff,homogenize)

			print("Generate output: %s" % (outbase+".sym"))
			with open(outbase+".sym","w") as oup:
				for i in s_scores:
					names, values = s_scores[i]
					for k in range(len(names)):
						oup.write("%s\t%s\t%f\n" % (i,names[k],values[k]))
				print("Closing output stream...")
			print("Done!")
	
	
	def sym2(self,score):
		"""Read a score file into a symmetric nested dict and return it.

		score -- path of a tab-delimited file; field 0 and 1 are the two
		         names, the last field is converted with float()

		The first score seen for a pair, in either direction, is stored for
		both directions; later lines for the same pair are ignored.

		Returns {name: {partner: score}}.  The earlier version built the
		same structure but discarded it, and replaced the whole partner dict
		of an already-known name when a new name was paired with it.  Its
		averaging branch could never run and is therefore not reproduced.
		"""
		S = {}
		with open(score) as inp:
			for inl in inp:
				L = inl.split("\t")
				value = float(L[-1])
				S.setdefault(L[0],{}).setdefault(L[1],value)
				S.setdefault(L[1],{}).setdefault(L[0],value)
		return S
							

	def get_scores_for_neighbor(self,sym_score):
		"""Write a square distance matrix "<sym_score>.neigh" from pair scores.

		sym_score -- tab-delimited lines "name1<TAB>name2<TAB>score"; only
		             the first six characters of the score field are kept

		The output starts with a line holding the number of names, followed
		by one row per name: the name padded to 10 characters, then one
		value per name separated by two spaces.  Pairs without a score get
		"1.0000".  A name longer than 10 characters prints a message and
		calls sys.exit(0).
		"""
		scores = {}
		index  = {}
		print("Generate score dict...")
		c = 0
		with open(sym_score,"r") as inp:
			for inline in inp:
				if c % 100000 == 0:
					print(" %ix100k" % (c/100000))
				c += 1

				llist = inline.split("\t")
				id1, id2 = llist[0], llist[1]
				value = llist[2][:6]

				for name in (id1,id2):
					if name not in index:
						index[name] = 1

				# the later line wins when a pair occurs more than once
				scores.setdefault(id1,{})[id2] = value
				scores.setdefault(id2,{})[id1] = value

		ikeys = list(index.keys())
		with open(sym_score+".neigh","w") as oup:
			oup.write("     %i\n" % len(ikeys))
			print("Write scores...")
			for i in ikeys:
				if len(i)>10:
					print("The identifier is longer than 10 characters.")
					print("Should run index_names\nExit!")
					sys.exit(0)
				row = [scores[i].get(j,"1.0000") for j in ikeys]
				oup.write(i+(10-len(i))*" "+"  ".join(row)+"\n")
		print("Done!")

	def mega_score(self,sym_score):
		"""Write "<sym_score>.meg", an upper-right distance matrix with a MEGA header.

		sym_score -- tab-delimited lines; field 0 and 1 are the two names and
		             the last field (after self.rmlb) is written verbatim

		Names are sorted.  The file holds the header, one "#name" line per
		name, a blank line, then one row per name with the values for all
		names that sort after it, separated by two spaces.  Pairs without a
		score get "1.0000".
		"""
		scores = {}
		index  = {}
		print("Generate score dict...")
		c = 0
		with open(sym_score,"r") as inp:
			for inline in inp:
				if c % 100000 == 0:
					print(" %ix100k" % (c/100000))
				c += 1

				llist = self.rmlb(inline).split("\t")
				id1, id2, value = llist[0], llist[1], llist[-1]

				index[id1] = 1
				index[id2] = 1

				# the later line wins when a pair occurs more than once
				scores.setdefault(id1,{})[id2] = value
				scores.setdefault(id2,{})[id1] = value

		ikeys = sorted(index)
		with open(sym_score+".meg","w") as oup:
			oup.write("#mega\n!Title %s;\n"%sym_score + \
				  "!format DataType=distance DataFormat=upperright;\n\n")

			print("Write taxa...")
			for i in ikeys:
				oup.write("#%s\n" % i)
			oup.write("\n")

			print("Write scores...")
			for n in range(len(ikeys)):
				partners = scores[ikeys[n]]
				row = [partners.get(j,"1.0000") for j in ikeys[n+1:]]
				# the last row has no values and is written as " "
				oup.write(" "+"  ".join(row)+"\n")

			print("Close output stream...")
		print("Done!")

	def score_matrix(self, score):
		"""Write "<score>.matrix", a full tab-delimited matrix of pair scores.

		score -- file with exactly three tab-separated fields per line:
		         id1, id2, score (the score is written back verbatim)

		The first score seen for a pair, in either direction, is used for
		both directions.  Ids are sorted; the header row lists them, and
		every following row starts with an id.  The diagonal is "0" and
		pairs without a score are "1".  Blank lines are skipped; any other
		line without exactly three fields raises ValueError.
		"""
		print("Read scores into a dictionary...")
		D = {}
		with open(score) as inp:
			for inl in inp:
				inl = inl.strip()
				if not inl:
					# e.g. an empty line at the end of the file
					continue
				id1,id2,S = inl.split("\t")
				D.setdefault(id1,{}).setdefault(id2,S)
				D.setdefault(id2,{}).setdefault(id1,S)

		print("Write matrix...")
		ids = sorted(D)
		with open(score+".matrix","w") as oup:
			oup.write("\t%s\n" % "\t".join(ids))
			for i in ids:
				cells = [i]
				for j in ids:
					if i == j:            # Same sequence
						cells.append("0")
					elif j not in D[i]:   # Score not there due to thresholding
						cells.append("1")
					else:
						cells.append(D[i][j])
				oup.write("\t".join(cells)+"\n")

		print("Done!")
	
	def parse_blast_file(self,blastout,T,target):
		"""Extract one value per alignment from a text-format BLAST report.

		blastout -- path of the report
		T        -- threshold: with target "evalue" only E-values below T are
		            kept, with the percent targets only values above T
		target   -- "evalue", "percentID" or "percentSIM"

		Output "<blastout>_T<T>_<target>.parse" has four tab-separated
		fields: query, subject, then E-value and -log10(E-value), or the
		match text and the percentage.  The match text stays "-" for
		"percentSIM" unless an earlier call path set it.
		"""
		src = open(blastout,"r")
		dst = open(blastout+"_T%i_%s.parse" % (T,target),"w")

		query = sbjct = ""
		match = "-"
		wants_percent = target in ("percentID","percentSIM")

		print("Query processed:")
		for line in src:
			# strip the line terminator
			if line[-2:] == "\r\n":
				line = line[:-2]
			elif line[-1] == "\n":
				line = line[:-1]

			if len(line) > 7 and line[:7] == "Query= ":
				query = line[7:]
				print("  %s" % query)

			elif len(line) > 1 and line[0] == ">":
				sbjct = line[1:]

			elif target == "evalue" and len(line) > 6 and line[:6] == " Score":
				evalue = line[line.find("Expect = ")+9:]
				if evalue[0] == "e":
					evalue = "1"+evalue
				if float(evalue) >= T:
					continue
				if evalue == "0.0":
					evalue = "1e-200"
				try:
					convert = -math.log10(float(evalue))
					dst.write("%s\t%s\t%s\t%f\n" % (query,sbjct,evalue,convert))
				except ValueError:
					print("Value can't be transformed: %s" % evalue)

			elif wants_percent and line.find("Identities =") != -1:
				parts = line.split(",")
				if target == "percentID":
					head    = parts[0]
					match   = head[head.find("=")+1:head.find("(")]
					match   = match[match.find(" ")+1:match.rfind(" ")]
					percent = head[head.find("(")+1:head.find("%")]
				else:
					percent = parts[1][parts[1].find("(")+1:parts[1].find("%")]

				if float(percent) <= T:
					continue
				dst.write("%s\t%s\t%s\t%s\n" % (query,sbjct,match,percent))

		src.close()
		dst.close()

	def merge_match(self,blast,feature,mtype="blast_match"):
		"""Merge overlapping subject intervals of a BLAST table.

		blast   -- tab-delimited table; column 1 is the subject and columns
		           8 and 9 are the subject start and end
		feature -- prefix of the generated feature ids
		mtype   -- value written into the third output column

		An interval with start < end has orientation "W", otherwise "C" and
		the coordinates are swapped.  Per subject the intervals are sorted
		by left end and merged while the running right end is greater than
		the next left end; the merged interval keeps the orientation of its
		first member, and differing orientations are reported on stdout.

		Output "<blast>.merged.coord", one line per merged interval:
		subject, "<feature>_<ori>_<left>_<right>", mtype, ori, left, right.
		Subjects appear in order of first occurrence.

		Compared with the earlier version, subjects with a single interval
		are now written, a contained interval no longer shrinks the merged
		right end, and the table is read only once.
		"""
		order = []   # subjects in first-seen order
		spans = {}   # subject -> {left: [right, ori]}
		with open(blast,"r") as inp:
			for inline in inp:
				llist = inline.split("\t")
				subj  = llist[1]
				start, end = int(llist[8]), int(llist[9])
				if start < end:
					sL, sR, ori = start, end, "W"
				else:
					sL, sR, ori = end, start, "C"

				if subj not in spans:
					spans[subj] = {}
					order.append(subj)
				cdict = spans[subj]
				# for one left end keep the interval reaching furthest right
				if sL not in cdict or sR > cdict[sL][0]:
					cdict[sL] = [sR,ori]

		with open(blast+".merged.coord","w") as oup:
			print("Process subj:")
			for subj in order:
				print(" %s" % subj)

				cdict  = spans[subj]
				merged = []   # entries are [ori, left, right]
				for left in sorted(cdict):
					right, ori = cdict[left]
					if merged and merged[-1][2] > left:
						current = merged[-1]
						if current[0] != ori:
							print("  Discrepancy: %s/%s,%s <-> %s/%s,%s" % \
									 (current[0],current[1],current[2],
									  ori,left,right))
						if right > current[2]:
							current[2] = right
					else:
						merged.append([ori,left,right])

				for ori, left, right in merged:
					fid = "%s_%s_%i_%i" % (feature,ori,left,right)
					oup.write("%s\t%s\t%s\t%s\t%s\t%s\n" % \
							  (subj,fid,mtype,ori,left,right))
			
        
	def parse_table(self,blast,target,T,verbose=0,wself=0,lenT="",fasta="",QorS=0):
		"""Filter a tab-delimited BLAST table and write one line per pair.

		blast   -- table path; columns used: 0 query, 1 subject, 2 percent
		           identity, 3 alignment length, 6-9 query/subject start and
		           end, 10 E-value.  Lines starting with "#" are skipped.
		target  -- "evalue", "convert", "percentID", or a comma-separated
		           list of column numbers to copy
		T       -- threshold: -log10(E-value) must exceed it, or the percent
		           identity must be at least T
		verbose -- if true, the four coordinate columns are appended
		wself   -- if true, self hits (query == subject) are written as well
		lenT    -- minimum aligned fraction in percent; "" disables the check
		fasta   -- sequence file for fmanager.get_sizes, needed with lenT
		QorS    -- 0 applies lenT to the query, 1 to the subject

		Only the first qualifying line of each (query, subject) pair is
		used.  With lenT set, pairs where either sequence is shorter than
		100 are dropped.  Output goes to "<blast>_T<T>_<target>.parse".
		"""
		print("Parse blast table:")
		print(" Table name: %s" % blast)
		print(" Target    : %s" % target)
		print(" Threshold : %s" % T)
		print(" Verbose   : %s" % verbose)
		print(" Wself     : %s" % wself)
		print("")

		outname = blast+"_T%i_%s.parse" % (T,target)

		sizes  = {}
		minLen = 100
		if lenT != "":
			lenT = float(lenT)/100.0
			print("lenT      : %s" % lenT)
			print("Fasta     : %s" % fasta)
			print("QorS      : %s" % QorS)
			if fasta == "":
				print("If you set lenT, you need the sequence file!")
				sys.exit(0)
			else:
				sizes = fmanager.get_sizes(fasta,1)

		c = 0
		countW = 0
		# (query, subject) tuples already used; a tuple cannot collide the
		# way the concatenated "%s%s" key did ("ab"+"c" == "a"+"bc")
		seen = set()
		with open(blast,"r") as inp:
			with open(outname,"w") as oup:
				for inline in inp:
					if c%10000 == 0:
						print(" %i x10k" % (c/10000))
					c += 1
					if inline[0] == "#":
						continue

					llist = inline.split("\t")
					pair  = (llist[0],llist[1])

					if lenT != "":
						qMatch = float(llist[7]) - float(llist[6])
						sMatch = float(llist[9]) - float(llist[8])
						qLen   = float(sizes[llist[0]])
						sLen   = float(sizes[llist[1]])
						if qLen < minLen or sLen < minLen:
							continue
						if ((QorS == 0 and qMatch/qLen < lenT) or
							(QorS == 1 and sMatch/sLen < lenT)):
							continue

					if target in ["evalue","convert"]:
						evalue = llist[10]
						if evalue[0] == "e":
							evalue = "1"+evalue
						# any spelling of zero ("0.0", "0", "0e+00") is
						# replaced, not only the literal "0.0"
						if float(evalue) == 0:
							evalue = "1e-200"
						try:
							ef = -math.log10(float(evalue))
						except OverflowError:
							print("Overflow: %s" % evalue)
							sys.exit(0)
						if not (ef > 0 and ef > T) or pair in seen:
							continue
						seen.add(pair)

						if verbose:
							out_str = "%s\t%f\t%s\t%s\t%s\t%s" % \
									 (evalue,ef,llist[6],llist[7],llist[8],llist[9])
						elif target == "convert":
							out_str = "%f" % ef
						else:
							out_str = "%s\t%f" % (evalue,ef)

					elif target == "percentID":
						ident = float(llist[2])
						if ident < T or pair in seen:
							continue
						match = int(float(llist[3])*ident/100)
						seen.add(pair)

						if verbose:
							out_str = "%i/%s\t%f\t%s\t%s\t%s\t%s" % \
									 (match,llist[3],ident,
									  llist[6],llist[7],llist[8],llist[9])
						else:
							out_str = "%i/%s\t%f" % (match,llist[3],ident)
					else:
						# target is a list of column numbers to copy
						out_str = "\t".join([llist[int(j)] for j in target.split(",")])

					if wself or llist[0] != llist[1]:
						oup.write("%s\t%s\t%s\n" % (llist[0],llist[1],out_str))
						countW += 1

		print("Total %i pairs, %i qualified and written" % (c,countW))
		print("Done!")
		
	def parse_table2(self,blast,E,I,H):
		"""List, for every query, the distinct subjects that pass three thresholds.

		blast -- tab-delimited BLAST table; column 0 query, column 1 subject,
		         column 2 percent identity, column 3 alignment length and
		         the second-to-last column the E-value.  Lines starting with
		         "#" are skipped.
		E     -- minimum -log10(E-value); an E-value of 0 counts as 200
		I     -- minimum percent identity
		H     -- minimum alignment length

		Output "<blast>.E<E>I<I>L<H>.parsed": query, number of distinct
		subjects, comma-separated subjects in order of first appearance.
		A line with fewer than two fields prints it and calls sys.exit(0).
		"""
		print("Check inconsistency, E, I defaults are set to zero")

		# query -> [number of subjects, [subjects]]
		qdict = {}
		print("Parse blast output:")
		with open(blast,"r") as inp:
			for inl in inp:
				if inl[0] == "#":
					continue
				L = inl.split("\t")
				try:
					evalue = float(L[-2])
				except IndexError:
					print("Evalue ERR: %s" % (L,))
					sys.exit(0)
				if evalue == 0:
					evalue = 200
				else:
					evalue = -math.log(evalue,10)
				ident  = float(L[2])
				length = int(L[3])

				if evalue >= E and ident >= I and length >= H:
					if L[0] in qdict:
						# Check the subject list itself; testing the outer
						# [count, list] pair never found a duplicate.
						if L[1] not in qdict[L[0]][1]:
							qdict[L[0]][0] += 1
							qdict[L[0]][1].append(L[1])
					else:
						qdict[L[0]] = [1,[L[1]]]

		print("Generate output...")
		with open(blast+".E%iI%iL%i.parsed" % (int(E),int(I),H),"w") as oup:
			for i in qdict:
				oup.write("%s\t%i\t%s\n" % (i,qdict[i][0],",".join(qdict[i][1])))
		print("Done!")


		
	def parse_table3(self,blast,E,I):
		"""Write query, subject and -log10(E-value) for hits passing two thresholds.

		blast -- tab-delimited BLAST table; column 0 query, column 1 subject,
		         column 2 percent identity, second-to-last column E-value
		E     -- minimum -log10(E-value) (inclusive)
		I     -- percent identity that must be exceeded (exclusive)

		Output "<blast>_E<E>_I<I>.out".  Lines whose identity field is not a
		number are reported and skipped.  An E-value of zero, in any
		spelling, is treated as 1e-200.
		"""
		print("Parse blast output:")
		c = 0
		countE = 0
		with open(blast,"r") as inp:
			with open("%s_E%i_I%i.out" % (blast,E,I),"w") as oup:
				for inl in inp:
					if c%10000 == 0:
						print(" %i x10k" % (c/10000))
					c += 1
					L = inl.split("\t")
					evalue = L[-2]
					try:
						ident  = float(L[2])
					except ValueError:
						print("ERR: %s" % (L,))
						continue
					# values written as "e-NN" get a leading "1"
					if evalue[0] == "e":
						evalue = "1"+evalue
					# log10(0) would raise ValueError
					if float(evalue) == 0:
						evalue = "1e-200"
					try:
						ef = -math.log10(float(evalue))
					except OverflowError:
						print("Overflow: %s" % evalue)
						sys.exit(0)

					if ef >= E and ident > float(I):
						oup.write("%s\t%s\t%f\n" % (L[0],L[1],ef))
					else:
						countE += 1

		print("Total %i scores, %i eliminated" % (c,countE))

		print("Done!")
	
	
	def parse_table4(self,blast,E,I,H):
		"""Copy the table lines that pass three thresholds, unchanged.

		blast -- tab-delimited BLAST table; column 2 percent identity,
		         column 3 alignment length, second-to-last column E-value.
		         Lines starting with "#" are skipped.
		E     -- minimum -log10(E-value); an E-value of 0 counts as 400
		I     -- minimum percent identity
		H     -- minimum alignment length

		Output "<blast>.E<E>I<I>L<H>.parse4".  A line with fewer than two
		fields is printed and sys.exit(0) is called.
		"""
		outname = blast+".E%iI%iL%i.parse4" % (int(E),int(I),H)
		print("BLAST file : %s" % blast)
		print("Output file: %s" % outname)

		countT = countQ = 0
		c = 0
		with open(blast) as inp:
			with open(outname,"w") as oup:
				for inl in inp:
					if inl[0] == "#":
						continue
					L = inl.split("\t")
					if c % 1e5 == 0:
						print(" %i x 100k" % (c/1e5))
					c += 1
					try:
						evalue = float(L[-2])
					except IndexError:
						print("%s" % (L,))
						sys.exit(0)
					if evalue == 0:
						evalue = 400
					else:
						evalue = -math.log(evalue,10)
					ident  = float(L[2])
					length = int(L[3])

					if evalue >= E and ident >= I and length >= H:
						oup.write(inl)
						countQ += 1
					countT += 1

		print("Total %i lines, %i qualified" % (countT,countQ))
		print("Done!")
		
		
	def merge(self,score,T):
		"""Unfinished: scan a score file; no merging is implemented yet.

		score -- tab-delimited file whose last field is converted with float()
		T     -- threshold compared with that value; nothing is done with
		         the result of the comparison

		The two output files "<score>_T<T>.merged" and
		"<score>_T<T>.merged.idx" are created and left empty.
		"""
		with open("%s_T%i.merged" % (score,T),"w"):
			pass
		with open("%s_T%i.merged.idx" % (score,T),"w"):
			pass

		print("Merge blast score:")
		c = 0
		with open(score,"r") as inp:
			for inl in inp:
				if c%10000 == 0:
					print(" %i x10k" % (c/10000))
				c += 1
				L = inl.split("\t")
				# TODO: lines with a score >= T are meant to be merged here
				if float(L[-1]) >= T:
					pass
		
		
	def parse_blast_db(self,outbase,T):
		
		qTuple = self.dbtask.select("SELECT count(*) FROM %s" % \
									self.config["table1"])

		print "Parse blast score from %i queries:" % qTuple[0][0]

		if outbase == "":
			outbase = self.config["table1"]
		
		oup = open(outbase+".parse","w")
		
		has_more = 1
		c = 0

		INCREMENT = 100
		
		while has_more:
			print " ",c*INCREMENT+1,"-",(c+1)*INCREMENT

			qTuple = self.dbtask.select(\
				"SELECT data FROM %s WHERE " % self.config["table1"] +\
				"sputnik_ref >= %i AND " % (c*INCREMENT+1) +\
				"sputnik_ref <  %i" % ((c+1)*INCREMENT))

			if qTuple == []:
				has_more = 0
				print " done..."
			else:

				for j in qTuple:
		
					llist = j[0].split("\n")
					query = sbjct = ""
					evalue = 10.0
					got_e = 0
					for k in llist:
						if len(k) > 7 and k[:7] == "Query= ":
							query = k[7:]
						if len(k) > 1 and k[0] == ">":
							sbjct = k[1:]
						if len(k) > 6 and k[:6] == " Score":
							evalue = k[k.find("Expect = ")+9:]
							
							if evalue[0] == "e":
								evalue = "1"+evalue
							ef = float(evalue)
							
							if float(evalue) >= T:
								continue
							if evalue == "0.0":
								evalue = "1e-200"					       
							got_e = 1

						if got_e:
							convert = math.fabs(math.log10(float(evalue)))
							oup.write("%s\t%s\t%f\n" % \
									  (query,sbjct,convert))
							got_e = 0
							
			c = c+1

		oup.close()

	def check_acc(self,acc_file,parsed_file):
		"""Restore the "." or "-" that was removed from accessions in a parsed file.

		acc_file    -- file whose lines starting with ">" carry the correct
		               accessions
		parsed_file -- tab-delimited file whose first field may be an
		               accession with its first "." (or, if it has no ".",
		               its first "-") removed

		Output "<parsed_file>.chk": the same lines, with the first field
		replaced by the correct accession where one is known.

		Accessions containing neither "." nor "-" are skipped; they used to
		be stored under the empty key.  The line terminator is stripped with
		rstrip, so a last line without a newline keeps its final character.
		"""
		print("\nConstructing acc_dict...\n")

		# stripped accession -> correct accession
		acc_dict = {}
		with open(acc_file,"r") as inp:
			for inline in inp:
				if inline[0] != ">":
					continue
				c_acc = inline[1:].rstrip("\r\n")
				if "." in c_acc:
					i_acc = c_acc.replace(".","",1)
				elif "-" in c_acc:
					i_acc = c_acc.replace("-","",1)
				else:
					continue
				acc_dict[i_acc] = c_acc

		print("Checking parsed file...\n")
		count = 0
		with open(parsed_file,"r") as inp:
			with open(parsed_file+".chk","w") as oup:
				for inline in inp:
					# without a tab, find returns -1 and the whole line
					# minus its last character is looked up
					b_point = inline.find("\t")

					acc1 = inline[:b_point]
					if acc1 in acc_dict:
						oup.write(acc_dict[acc1]+inline[b_point:])
					else:
						oup.write(inline)

					count = count+1
					if count % 100000 == 0:
						print(" %ik" % (count/1000))

	def check_missing(self,fasta,blast,qors):
		"""List the sequence names that never occur in one column of a BLAST table.

		fasta -- sequence file passed to fmanager.get_names(fasta,1)
		blast -- tab-delimited BLAST table
		qors  -- index of the column to check

		Output "<blast>.missing.<qors>": one name per line.  Table entries
		that are not among the sequence names are reported on stdout.
		"""
		fnames = fmanager.get_names(fasta,1)
		# name -> 1 once it has been seen in the table
		FD = {}
		for i in fnames:
			FD[i] = 0

		with open(blast) as inp:
			for inl in inp:
				name = inl.split("\t")[qors]
				if name not in FD:
					print("Should not happen: %s" % name)
				else:
					FD[name] = 1

		with open(blast+".missing.%s" % qors,"w") as oup:
			for i in FD:
				if FD[i] == 0:
					oup.write("%s\n" % i)
						

	def get_selected(self,seq_list,score_file):
		"""Copy the score lines that involve at least one listed sequence.

		seq_list   -- file with one name per line (each line is passed
		              through self.rmlb); repeated names are reported
		score_file -- tab-delimited file; a line is kept when field 0 or
		              field 1 is a listed name

		Output "<seq_list>.scores" holds the kept lines unchanged.
		"""
		print("Get scores based on: %s" % seq_list)
		ndict = {}
		with open(seq_list,"r") as inp:
			for inline in inp:
				name = self.rmlb(inline)
				if name not in ndict:
					ndict[name] = 1
				else:
					print("Redundant: %s" % name)

		with open(score_file,"r") as inp:
			with open(seq_list+".scores","w") as oup:
				for inline in inp:
					llist = inline.split("\t")
					if llist[0] in ndict or llist[1] in ndict:
						oup.write(inline)

		print("Done!")
	def extract_cds(self,blast,stype,T,wu,ignorestop=0):
		"""Extract aligned subject sequences from BLAST text output and assemble them.

		Works in four passes:
		 1. Read `blast` and write every HSP's subject sequence (alignment
		    gaps "-" removed; "*" removed unless `ignorestop` is set) to
		    `blast+"_ext.fa"` with the header ">subj_L_R".
		 2. Group the [L,R] coordinates per subject and orientation, drop
		    intervals contained in another interval and sort by left coord.
		 3. Re-read `blast+"_ext.fa"` and attach each sequence to its interval,
		    trimming overlaps or prefixing a "(from-to)" gap marker.
		 4. Write the joined sequences to `blast+"_assm.fa"`, starting a new
		    record whenever a gap marker spans more than `T`.

		@param blast      path of the BLAST text (pairwise alignment) output
		@param stype      if true, coordinates advance by 3 per residue (INC=3),
		                  otherwise by 1
		@param T          largest gap that is still joined in pass 4
		@param wu         WU-BLAST flag; that format is not handled and the
		                  method exits when it is set
		@param ignorestop if true, "*" characters are kept in the sequences
		"""
	
		# coordinate increment per aligned residue
		if stype:
			INC = 3
		else:
			INC = 1
		
		print "Start extract coding sequences"

		inp = open(blast,"r")
		oup = open(blast+"_ext.fa","w")
		inline = inp.readline()
		subj = seq = L = R = ""
		hasL = 0       # set once the left coord of the current HSP is known
		cdict = {}     # subj -> list of [L,R]; both negated when L >= R
		
		countSubj = 0
		while inline != "":
			if inline[:7] == "Query= ":
				print "Query:",inline[7:-1]
				pass
			# a new subject (">") or a new HSP ("Score =") closes the HSP
			# that was being accumulated
			elif inline[0] == ">" or inline.find("Score =") != -1:
				if seq != "":
					if L == '':
						print "Is this WU blast? Need to set the flag!"
						sys.exit()
					# remove alignment gaps
					if seq.find("-") != -1:
						segments = seq.split("-")
						seq = ""
						for i in segments:
							seq = seq + i
					# remove stop symbols unless asked to keep them
					if not ignorestop and seq.find("*") != -1:
						segments = seq.split("*")
						seq = ""
						for i in segments:
							seq = seq + i				   
					
					if not cdict.has_key(subj):
						countSubj = countSubj+1
						
						if int(L)<int(R):
							cdict[subj] = [[int(L),int(R)]]
						else:
							cdict[subj] = [[-int(L),-int(R)]]
					else:
						if int(L)<int(R):
							cdict[subj].append([int(L),int(R)])
						else:
							cdict[subj].append([-int(L),-int(R)])	   
					oup.write(">"+subj+"_"+L+"_"+R+"\n"+seq+"\n")
					
					seq = L = R = ""
					if inline[0] == ">":
						subj = ""										       
					hasL = 0
									
				if inline[0] == ">":
					subj = self.rmlb(inline)[1:]
							
			elif inline.find("Sbjct") != -1:
				# split on single spaces and drop the empty tokens
				llist = inline.split(" ")
				tmp = []
				for j in llist:
					if j != "":
						tmp.append(j)
				llist = tmp				
				
				if not hasL:
					if wu:
						print "NOT DEALT WITH"
						sys.exit(0)
						# unreachable: the method exits on the line above
						for j in llist[1:]:
							if j != "":
								L = j
								break
					else:
						L = llist[1]
					R = self.rmlb(llist[-1])
					hasL = 1
				else:
					R = self.rmlb(llist[-1])   # keep getting R   
				# if the sequence token starts with digits, skip ahead to
				# its first alphabetic character
				alphaIndex = 0
				try:
					if llist[-2][0].isdigit():
						for i in range(len(llist[-2])):
							if llist[-2][i].isalpha():
								alphaIndex = i
								break
				except IndexError:
					pass
				
				seq = seq + llist[-2][alphaIndex:]
								
			inline = inp.readline()	 
		
		# flush the last HSP; this repeats the logic used inside the loop
		# NOTE(review): runs unconditionally, so int(L) raises ValueError when
		# the input contained no "Sbjct" line at all - confirm intended
		if seq.find("-") != -1:
			segments = seq.split("-")
			seq = ""
			for i in segments:
				seq = seq + i
		if not ignorestop and seq.find("*") != -1:
			print "STOP"
			segments = seq.split("*")
			seq = ""
			for i in segments:
				seq = seq + i				   
		
		if not cdict.has_key(subj):
			countSubj = countSubj+1
			
			if int(L)<int(R):
				cdict[subj] = [[int(L),int(R)]]
			else:
				cdict[subj] = [[-int(L),-int(R)]]
		else:
			if int(L)<int(R):
				cdict[subj].append([int(L),int(R)])
			else:
				cdict[subj].append([-int(L),-int(R)])
		
		oup.write(">"+subj+"_"+L+"_"+R+"\n"+seq+"\n")
		
		# split every subject into two keys: subj+"_0" for intervals with
		# abs(L) < abs(R) and subj+"_1" for abs(L) > abs(R); intervals with
		# equal ends land in neither list
		tdict = {}
		for i in cdict.keys():
			countW = 0
			countC = 0
			wlist = []
			clist = []
			for j in range(len(cdict[i])):
				if abs(cdict[i][j][0]) < abs(cdict[i][j][1]):
					countW += 1
					wlist.append(cdict[i][j])
				elif abs(cdict[i][j][0]) > abs(cdict[i][j][1]):
					countC += 1
					clist.append(cdict[i][j])
			if countW > 0:		
				if tdict.has_key(i+"_0"):
					tdict[i+"_0"].append(wlist)
				else:
					tdict[i+"_0"] = wlist
			if countC > 0:
				if tdict.has_key(i+"_1"):		
					tdict[i+"_1"].append(clist)
				else:
					tdict[i+"_1"] = clist
		
		cdict = tdict
		print "\nConsolidate coordinates:"
		for i in cdict.keys():
			
			idxL = idxR = 0
			mdict = {} # idx of element which is eliminated		 
			
			# pairwise comparison: an interval contained in (or equal to)
			# another one is marked for elimination
			for j in range(len(cdict[i])):
				
				if mdict.has_key(j):
					continue
					
				for k in range(j+1,len(cdict[i])):
					
					if mdict.has_key(k):
						continue
					if cdict[i][j][0] < cdict[i][k][0]:      # jL<kL
						if cdict[i][j][1] >= cdict[i][k][1]: # and jR>=kR, j(k)
							mdict[k] = 1
						else:
							pass
					elif cdict[i][j][0] > cdict[i][k][0]:    # jL>kL
						if cdict[i][j][1] <= cdict[i][k][1]: # and jR<=kR, k(j)
							mdict[j] = 1 
							break
						else:
							pass
					else:							         # jL = kL
						if cdict[i][j][1] > cdict[i][k][1]:  # and jR>kR, j(k)
							mdict[k] = 1
						elif cdict[i][j][1]<cdict[i][k][1]:  # and jR<kR, k(j)
							mdict[j] = 1
							break
						else:						    # and jR=kR, k=j
							mdict[k] = 1    
			
			# keep the survivors and order them by their left coordinate
			tlist = []
			for j in range(len(cdict[i])):
				if not mdict.has_key(j):
					tlist.append(cdict[i][j])
			cdict[i] = tlist
			Ldict = {}
			for j in range(len(cdict[i])):
				Ldict[cdict[i][j][0]] = j
			keys = Ldict.keys()
			keys.sort()
			clist = []
			for j in keys:
				clist.append(cdict[i][Ldict[j]])
			
			# third element: the coordinate this interval is expected to
			# start at, i.e. its own start for the first interval and the
			# previous interval's right end + INC for the others
			for j in range(len(clist)):
				if j == 0:
					clist[j].append(clist[j][0])
				else:
					clist[j].append(clist[j-1][1]+INC)		      
			cdict[i] = clist
			
		print "\nAssemble sequences:"
		
		# second pass over the sequences written above
		inp = open(blast+"_ext.fa","r")
		oup = open(blast+"_assm.fa","w")
		inline = inp.readline()
		sdict = {}

		countAssemblee = 0
		while inline != "":
			if inline[0] == ">":			    
				# header is ">subj_L_R"; subj itself may contain "_"
				inline = inline[1:]
				llist  = inline.split("_")
				subj   = ""
				L      = int(llist[-2])
				R      = int(llist[-1])
				for j in llist[:-2]:
					subj = subj + j + "_"
				subj = subj[:-1]
				
				if L < R:
					subj += "_0"
				else:
					subj += "_1"
						
				# NOTE(review): raises KeyError if this subject/orientation
				# has no entry in cdict (e.g. an interval with L == R) - confirm
				for j in range(len(cdict[subj])):
					
					# only intervals that survived consolidation get a sequence
					if(L!= abs(cdict[subj][j][0]) or \
					   R!= abs(cdict[subj][j][1])):
						continue
					
					countAssemblee = countAssemblee + 1
					seq = inp.readline()[:-1]

					# start lies before the expected start: trim the overlap
					if cdict[subj][j][0] < cdict[subj][j][2]:
						seq = seq[(cdict[subj][j][2] - \
								   cdict[subj][j][1] + 1)/INC - 1:]

					# start lies after the expected start: record the gap as
					# a "(from-to)" prefix
					elif cdict[subj][j][0] > cdict[subj][j][2]:
						seq = "(%i-%i)" % (abs(cdict[subj][j][2]),
										   abs(cdict[subj][j][0])) + seq
					elif cdict[subj][j][0] == cdict[subj][j][2]:
						seq = seq
					else:
						seq = seq[:-1]
					# fourth element: the (possibly modified) sequence
					cdict[subj][j].append(seq)
																
			inline = inp.readline()


		print "\nOutput assembled sequences..."
		
		countAssembled = 0
		for i in cdict.keys():
			L = abs(cdict[i][0][0])
			R = abs(cdict[i][0][1])
			seq = ""
			slist = []
			for j in range(len(cdict[i])):
				# a ")" marks a "(from-to)" gap prefix added in the pass above
				idx = cdict[i][j][3].find(")")
				disrupt = 0
				if idx != -1:
					gap = cdict[i][j][3][1:idx].split("-")
					if abs(int(gap[0])-int(gap[1])) > T:
						disrupt = 1
						
				if disrupt:					
					# gap larger than T: close the current assembly and
					# start a new one without the gap marker
					slist.append([i,L,R,seq])
					countAssembled = countAssembled+ 1
					L = abs(cdict[i][j][0])
					R = abs(cdict[i][j][1])
					seq = cdict[i][j][3][idx+1:]
				else:
					seq = seq + cdict[i][j][3]
					R = abs(cdict[i][j][1])
			
			slist.append([i,L,R,seq])
			countAssembled = countAssembled+ 1
			
			for j in slist:
				oup.write(">%s_%i_%i\n%s\n" % (j[0],j[1],j[2],j[3]))
		
		print "\nTotal %i unique subjects"		   % countSubj
		print "      %i sequences to be assembled" % countAssemblee
		print "      %i assembled sequences"	% countAssembled
		print "Done!\n"

	def get_subj(self,blast,desc,style,ttype,T):
		"""Write the non-redundant subject names of a BLAST output to blast+".gi".

		@param blast  path of the BLAST output
		@param desc   if true, names are taken from the "Sequences producing"
		              summary block instead of the hit lines
		@param style  1: ">" lines of the text output, 9: tab-delimited table
		@param ttype  "evalue" tests the second-to-last column <= T, anything
		              else tests the third column >= T (style 9 only)
		@param T      threshold, 0 disables the test
		"""
		print("Get matching subjects:")
		print(" Blast output: %s" % (blast,))
		print(" Desc flag   : %s" % (desc,))
		print(" Style       : %s" % (style,))

		if style != 9 and T != 0:
			print("WARNING: T can't be apply to style 1 at this point!")

		src = open(blast,"r")
		out = open(blast+".gi","w")
		seen = {}
		count = 0
		line = src.readline()
		while line != "":
			if desc:
				if line.find("Sequences producing") != -1:
					src.readline() # rid of the empty line
					line = src.readline()
					# the summary block ends at the first empty line or EOF
					while line not in ["","\n","\r\n"]:
						parts = line.split(" ")
						blank_seen = 0
						for k, tok in enumerate(parts):
							if tok != "":
								continue
							if blank_seen == 0:
								blank_seen = 1
								continue
							# second empty token: the description is
							# everything before the token preceding it
							key = " ".join(parts[:k-1])
							if key in seen:
								seen[key] += 1
							else:
								count += 1
								seen[key] = 1
							break
						line = src.readline()
			elif style == 1:
				if line.find("Query=") != -1:
					print(line[6:-1])
				elif line[0] == ">":
					name = line[1:-1]
					if name not in seen:
						count = count + 1
						seen[name] = 0
						out.write(line[1:])
			elif style == 9:
				if line[0] != "#":
					cols = line.split("\t")
					try:
						if cols[1] not in seen:
							if T != 0:
								if ttype == "evalue":
									passed = float(cols[-2]) <= T
								else:
									passed = float(cols[2]) >= T
								if passed:
									seen[cols[1]] = float(cols[-2])
									count = count + 1
									out.write(cols[1]+"\n")
							else:
								seen[cols[1]] = float(cols[2])
								count = count + 1
								out.write(cols[1]+"\n")
					except IndexError:
						print("ERROR-line format: %s" % ([line],))
			else:
				print("Unknown style type, quit!")
				sys.exit(0)

			line = src.readline()

		# in description mode the names are only written at the very end
		if desc:
			for key in seen.keys():
				out.write(key+"\n")

		print("Total %i non-redundant matching subjects" % count)
		print("Done!")

	def index_names(self,score):
		"""Replace the two name columns of a score file by integer indices.

		Writes score+".mod" ("idx1<TAB>idx2<TAB>last column") and
		score+".name" ("name<TAB>idx"), numbering the names from 0 in order
		of first appearance.

		@param score  path of a tab-delimited file, names in columns 1 and 2
		"""
		print("Start converting names to indices:")
		src      = open(score,"r")
		mod_out  = open(score+".mod","w")
		name_out = open(score+".name","w")

		index_of = {}
		n_names  = 0
		n_lines  = 0
		line = src.readline()
		while line != "":
			if n_lines % 10000 == 0:
				print(" %ix10k" % (n_lines // 10000))
			n_lines += 1
			fields = line.split("\t")
			# first column, then second column
			for col in (0, 1):
				name = fields[col]
				if name not in index_of:
					index_of[name] = n_names
					name_out.write(name + "\t%i\n" % n_names)
					n_names += 1

			# the last column still carries the line break
			mod_out.write("%i\t%i\t%s" %
				(index_of[fields[0]],index_of[fields[1]],fields[-1]))

			line = src.readline()

		print("Total %i score pairs with %i unique indices" % (n_lines,n_names))
		print("Done!")


	def rename(self,fasta,name):
		"""Rename the sequences of a fasta file according to a name table.

		The renamed sequences are written to fasta+".mod". Headers without
		an entry in the table are copied unchanged.

		@param fasta  path of the fasta file
		@param name   path of a tab-delimited table: new name in column 1,
		              current name in column 2
		"""
		# current name -> new name
		ndict = {}
		inp = open(name,"r")
		for inline in inp:
			L = inline.rstrip("\r\n").split("\t")
			if len(L) < 2:
				# blank or malformed line, nothing to map
				continue
			if L[1] in ndict:
				print("Redundant names")
			else:
				ndict[L[1]] = L[0]
		inp.close()

		inp = open(fasta,"r")
		# the output handle was never opened before, so any call failed
		# with a NameError on the first write
		oup = open(fasta+".mod","w")
		countF = 0  # found
		countN = 0  # not found
		for inline in inp:
			if inline[0] == ">":
				header = inline[1:].rstrip("\r\n")
				if header in ndict:
					oup.write(">%s\n" % ndict[header])
					countF = countF+1
				else:
					oup.write(inline)
					countN = countN+1
			else:
				oup.write(inline)
		inp.close()
		oup.close()

		print("Found %s, not found %s" % (countF,countN))
		print("Done!")
	
	def parse_align2(self,blast,flag=1,gap=0):
		"""Convert BLAST pairwise text output into a 12-column table.

		One line per HSP is written to blast+".mod": query, subject,
		percent identity, alignment length, mismatches, gaps, query start,
		query end, subject start, subject end, e-value, score.

		@param blast  path of the BLAST text output
		@param flag   if true, progress messages are printed
		@param gap    not used
		"""
		def fields(line):
			# split on single spaces and drop the empty tokens
			return [tok for tok in line.split(" ") if tok != ""]

		def flush(record,coords):
			# coordinates are the int 0 until a Query:/Sbjct: line was
			# seen, so they have to be converted before joining
			record[6:10] = [str(x) for x in coords]
			oup.write("%s\n" % "\t".join(record))

		inp = open(blast,"r")
		oup = open(blast+".mod","w")
		subj = query = ""

		olist = [""]*12
		pending = 0    # set while olist holds an HSP that is not written yet
		qL = qR = sL = sR = 0
		countL = 0
		if flag:
			print("Start parsing %s" % blast)
		for inl in inp:
			if flag and countL % 100000 == 0:
				print(" %ix100k" % (countL // 100000))
			countL += 1
			inl = inl.rstrip("\r\n")

			if inl.find("Query=") != -1:
				query = inl[inl.find("=")+2:]
				if query == "":
					query = "query"
			elif inl.find(">") != -1:
				subj = inl[1:]
			elif inl.find("Score =") != -1:
				# a new HSP starts, write the previous one
				if pending:
					flush(olist,[qL,qR,sL,sR])

				olist = [""]*12
				olist[0] = query
				olist[1] = subj
				pending = 1
				qL = qR = sL = sR = 0

				ilist = fields(inl)
				# e-values such as "e-20" lack the leading digit
				if ilist[-1][0] == "e":
					ilist[-1] = "1"+ilist[-1]
				olist[10] = ilist[-1]
				olist[11] = ilist[2]

			elif inl.find("Identities =") != -1:
				ilist = fields(inl)
				mismatch = ilist[2].split("/")
				olist[3] = mismatch[1]
				olist[4] = "%i" % (int(mismatch[1]) - int(mismatch[0]))
				olist[2] = ilist[3][ilist[3].find("(")+1:ilist[3].find("%")]
				if ilist[-4] != "Gaps":
					olist[5] = "0"
				else:
					olist[5] = ilist[-2][:ilist[-2].find("/")]

			elif inl.find("Query:") != -1 or inl.find("Sbjct:") != -1:
				ilist = fields(inl)
				# the first row gives the start, every row updates the end
				if ilist[0] == "Query:":
					if qL == 0:
						qL = ilist[1]
					qR = ilist[-1]
				elif ilist[0] == "Sbjct:":
					if sL == 0:
						sL = ilist[1]
					sR = ilist[-1]

		# write the last HSP; without this guard an input without any HSP
		# produced a TypeError (or an empty record)
		if pending:
			flush(olist,[qL,qR,sL,sR])
		inp.close()
		oup.close()

		if flag:
			print("Total %i subject lines. Done!" % countL)
		
	
	def parse_align3(self,blast,seq_id):
		"""Extract the alignments against one subject from BLAST text output.

		For every query the "Query=" line followed by all lines belonging to
		the subject `seq_id` are written to seq_id+".out", one block per
		query, blocks separated by an empty line.

		@param blast   path of the BLAST text output
		@param seq_id  subject name, compared with the ">" lines without ">"
		"""
		print("\nGet %s from %s:" % (seq_id,blast))
		inp = open(blast,"r")
		oup = open(seq_id+".out","w")
		olist = []
		findT = 0      # set while the lines of the wanted subject are read
		count = 0
		inl = inp.readline()
		while inl != "":
			if inl.find("Query=") != -1:
				if olist != []:
					oup.write("%s\n" % "".join(olist))
				# every block starts with its own query line; before, the
				# header was lost for each query that followed a flush
				olist = [inl+"\n"]
				findT = 0

			if inl.find(">") != -1:
				if self.rmlb(inl)[1:] == seq_id:
					findT = 1
					count += 1
					olist.append(inl)
				else:
					findT = 0
			elif findT:
				olist.append(inl)
			inl = inp.readline()

		# the block of the last query was never written before
		if olist != []:
			oup.write("%s\n" % "".join(olist))
		inp.close()
		oup.close()

		print(" found %i fimes." % count)

	def parse_gap(self,blast):
		"""List the alignment gaps of every HSP in a BLAST text output.

		Writes two files:
		  blast+".gap"     - query, subject, query start/end, subject
		                     start/end, alignment length, query gaps,
		                     subject gaps (tab-delimited)
		  blast+".seqpair" - the aligned query and subject strings

		A gap is reported as "pos|from-to": the count of non-gap characters
		seen in the gapped sequence, followed by the range counted in the
		other sequence.

		@param blast  path of the BLAST text (pairwise alignment) output
		"""
		Qtag  = "Query= "
		Stag  = ">"
		Qline = "Query: "
		Sline = "Sbjct: "
		Rtag  = "Score ="

		def gap(q,s):
			# returns the gap lists of the aligned strings q and s
			if len(q) != len(s):
				print("ERR:query and subjt strings different length")
				# these two prints referred to undefined names before and
				# raised a NameError instead of showing the strings
				print(q)
				print(s)
				sys.exit(0)

			qL = []
			sL = []
			qI        = 0 # keep track of query string non-gap count
			sI        = 0 # keep track of subjt string non-gap count
			qgap_size = 0
			sgap_size = 0

			for i in range(len(q)):
				# a gap is reported once the first character after it is seen
				if q[i] != "-":
					if qgap_size != 0:
						qL.append("%i|%i-%i" % (qI,sI-qgap_size+1,sI))
						qgap_size = 0

				if s[i] != "-":
					if sgap_size != 0:
						sL.append("%i|%i-%i" % (sI,qI-sgap_size+1,qI))
						sgap_size = 0

				if q[i] != "-":
					qI += 1
				else:
					qgap_size +=1

				if s[i] != "-":
					sI += 1
				else:
					sgap_size +=1

			return qL,sL

		def emit(query,subjt,qline,sline,qC,sC):
			# writes one HSP to both output files
			qlist,slist = gap(qline,sline)
			oup1.write("%s\t%s\t%s\t%s\t%s\t%s\t%i\t%s\t%s\n" %
				(query,subjt,qC[0],qC[-1],sC[0],sC[-1],len(qline),
				 ",".join(qlist),",".join(slist)))
			oup2.write("%s\t%s\n%s\t%s\n\n" % (query,qline,subjt,sline))

		inp = open(blast,"r")
		oup1= open(blast+".gap","w")
		oup2= open(blast+".seqpair","w")

		print("Find gaps...")
		inl = inp.readline()
		query = subjt = queryNew = subjtNew = qline = sline = ""
		qC = []
		sC = []
		while inl != "":
			inl = self.rmlb(inl)
			if inl.find(Qtag) != -1:
				if query == "":
					query = inl[len(Qtag):]
				queryNew = inl[len(Qtag):]
			elif inl.find(Stag) != -1:
				if subjt == "":
					subjt = inl[1:]
				subjtNew = inl[1:]
			elif inl.find(Rtag) != -1:
				# a new HSP starts, write the one collected so far; the
				# names only advance here because the Query=/">" lines of
				# the next HSP are read before the previous one is written
				if sline != "":
					emit(query,subjt,qline,sline,qC,sC)

					qline = sline = ""
					query = queryNew
					subjt = subjtNew
					qC = []
					sC = []

			elif inl.find(Qline) != -1:
				L = inl.split(" ")
				qline += L[-2]
				qC.append(L[1])
				qC.append(L[-1])
			elif inl.find(Sline) != -1:
				L = inl.split(" ")
				sline += L[-2]
				sC.append(L[1])
				sC.append(L[-1])
			inl = inp.readline()

		# last HSP; skipped when the file had no alignment at all, which
		# used to end in an IndexError
		if qC and sC:
			emit(query,subjt,qline,sline,qC,sC)
		inp.close()
		oup1.close()
		oup2.close()

		print("Done!")


	def parse_align(self,blast,T,wself,format,style,query=""):
		"""Summarize the alignments of a BLAST text output in blast+".log".

		For every HSP with an identity above `T` a "[F] SUBJ:name(id%)" line
		is written (F is "y" for 100% identity, a blank otherwise), followed
		by the alignment rows. The match line of each row is recoded:
		"." for a match character, "X" for a blank and "+" for "+".

		@param blast   path of the BLAST text output
		@param T       percent identity threshold, HSPs need id > T
		@param wself   if false, subjects named like the query are skipped
		@param format  if false, query and subject rows are written together
		               with the recoded match line; otherwise only the
		               recoded match line is written
		@param style   not used in this method
		@param query   initial query name, replaced at each "Query=" line
		"""
		inp = open(blast,"r")
		oup = open(blast+".log","w")

		oup.write("BLAST output:"+blast+"\n")
		oup.write("Threshold   :"+str(T)+"\n")
		oup.write("Include self:"+str(wself)+"\n")
		
		QUERY_STR = "Query="
		QUERY_END = "  Database:"
		QUERY_END2= "Lambda"
		SUBJ_STR  = ">"
		ID_TAG    = " Score ="
		ID_STR    = "("
		ID_END    = "%"

		inline  = inp.readline()
		outlines= []   # output lines of the current query
		subj = "" 
		id   = ""
		w    = 0       # 1 while the rows of a qualified HSP are collected
		skip = 0       # 1 while the hits of a self match are ignored
		
		print "Start parsing..."
		while inline != "":
			
			# remove the line break
			if inline[-2:] == "\r\n":
				inline = inline[:-2]
			elif inline[-1] == "\n":
				inline = inline[:-1]
			
			if inline.find(QUERY_STR) != -1:
				
				# new query: write what was collected for the previous one
				if outlines != []:
					for i in outlines:
						oup.write(i+"\n")
					
					outlines = []
				
				query = inline[inline.find(" ")+1:]
				outlines.append("\n%s\nQuery= %s" % ("-"*40,query))
			elif inline.find(SUBJ_STR) != -1:
				subj = inline[1:]
				if not wself:
					if subj == query:
						skip = 1
					else:
						skip = 0
				w = 0
			
			if not skip:
				if inline.find(ID_TAG) != -1:
					# the percent identity is read from the line that
					# follows the " Score =" line
					inline = inp.readline()	
					if inline[-2:] == "\r\n":
						inline = inline[:-2]
					elif inline[-1] == "\n":
						inline = inline[:-1]
							 
					id = int(inline[inline.find(ID_STR)+1:inline.find(ID_END)])
					if id > T:
						F = " "
						if id == 100:
							F = "y"
						outlines.append("\n[%s] SUBJ:%s(%i" % (F,subj,id) + "%)\n")
						inline = inp.readline()
						w = 1
					else:
						w = 0
				elif(inline.find(QUERY_END) != -1 or
					 inline.find(QUERY_END2)!= -1):
					
					# trailer of a query, nothing to collect
					inline = inp.readline()
					continue
				
				if w == 1:

					if len(inline) > 0:					
						if inline[-2:] == "\r\n":
							inline = inline[:-2]
						elif inline[-1] == "\n":
							inline = inline[:-1]
					if len(inline)>7:			     
						if inline[:6] == "Query:":
							llist   = inline.split(" ")
							# column at which the aligned sequence starts
							aln_idx = inline.find(llist[-2])
							if not format:
								outlines.append(inline[7:])

							# the next line is the match line
							inline = inp.readline()
								
							if inline[-2:] == "\r\n":
								inline = inline[:-2]
							elif inline[-1] == "\n":
								inline = inline[:-1]
							
							inline = inline[aln_idx:]
							convert = ""
							for j in inline:
								if j not in [" ","+"]:
									convert = convert + "."
								elif j == " ":
									convert = convert + "X"
								elif j == "+":
									convert = convert + "+"
							
							if not format:
								outlines.append(" "*(aln_idx-7)+convert)						
							else:
								outlines.append(convert)
							
							# the subject row follows; its line break is not
							# removed here, so the output gets an extra
							# empty line after it
							if not format:
								inline = inp.readline()
								outlines.append(inline[7:])							
							else:
								inp.readline()
			
			inline = inp.readline()
		
		# write the lines of the last query
		if outlines != []:
			for j in outlines:
				oup.write(j+"\n")
					
		print "\nParse_align done!\n"
	
	def parse_align4(self,blast):
		"""Write the aligned sequence pair of every HSP to blast+".align_seq".

		Each HSP gives three lines:
		  #query subject qStart-qEnd|sStart-sEnd evalue
		  aligned query string
		  aligned subject string

		@param blast  path of the BLAST text (pairwise alignment) output
		"""
		def split_fields(text):
			# tokens separated by single spaces, empty ones removed
			return [tok for tok in text.split(" ") if tok != ""]

		def write_pair(rec):
			out.write("#%s %s %s-%s|%s-%s %s\n" %
						(rec[0],rec[1],rec[6],rec[7],rec[8],rec[9],rec[10]))
			out.write("%s\n%s\n" % (rec[12],rec[13]))

		src = open(blast,"r")
		out = open(blast+".align_seq","w")
		query_name = subj_name = ""

		rec = [""]*14
		q_start = q_end = s_start = s_end = 0
		n_pairs = 0
		raw = src.readline()
		while raw != "":

			text = raw.strip()

			if text.find("Query=") != -1:
				query_name = text[text.find("=")+2:]
				if query_name == "":
					query_name = "query"
			elif text.find(">") != -1:
				subj_name = text[1:]
			elif text.find("Score =") != -1:
				# the previous HSP is complete once the next one starts
				if rec[0] != "":
					if n_pairs % 1e3 == 0:
						print(" %i k" % (n_pairs/1e3))
					n_pairs += 1
					rec[6:10] = [q_start,q_end,s_start,s_end]
					write_pair(rec)

				rec = [""]*14
				rec[0] = query_name
				rec[1] = subj_name
				q_start = q_end = s_start = s_end = 0

				toks = split_fields(text)
				# e-values like "e-20" get their leading digit
				if toks[-1][0] == "e":
					toks[-1] = "1"+toks[-1]
				rec[10] = toks[-1]
				rec[11] = toks[2]

			elif text.find("Identities =") != -1:
				toks = split_fields(text)
				ident = toks[2].split("/")
				rec[3] = ident[1]
				rec[4] = "%i" % (int(ident[1]) - int(ident[0]))
				rec[2] = toks[3][toks[3].find("(")+1:toks[3].find("%")]
				if toks[-4] == "Gaps":
					rec[5] = toks[-2][:toks[-2].find("/")]
				else:
					rec[5] = "0"

			elif text.find("Query:") != -1 or text.find("Sbjct:") != -1:
				toks = split_fields(text)
				if toks[0] == "Query:":
					if q_start == 0:
						q_start = toks[1]
					q_end = toks[-1]
					rec[12] += toks[2]
				elif toks[0] == "Sbjct:":
					if s_start == 0:
						s_start = toks[1]
					s_end = toks[-1]
					rec[13] += toks[2]

			raw = src.readline()

		# write the last query
		rec[6:10] = [q_start,q_end,s_start,s_end]
		write_pair(rec)
		n_pairs += 1

		print("Total %i pairs. Done!" % n_pairs)

	def refine(self,log):
		"""Rank the hits of a parsed alignment log and write log+".refined".

		The log is read twice. The first pass collects, for every subject,
		the percent identity, the summed length of the lines below the
		"[..." line and the query. The hits of a subject are then ordered by
		length and, within the same length, by identity (both descending).
		The second pass copies the log and replaces the tag in the brackets:
		  "u"    the subject has a single hit that is longer than lenT
		  number position of the hit in the ranking of its subject
		Hits that are not longer than lenT are left out of the output.

		@param log  path of the log file (lines "Query= ..." and
		            "[x] SUBJ:name(id%)" followed by the alignment rows)
		"""
	
		lenT = 50 # length threshold, set at 50 nt
		inp = open(log,"r")
		oup = open(log+".refined","w")
		inl = inp.readline()
		sdict = {}     # subject -> list of [pid,leng,query]
		print "Start refinement..."
		while inl != "":
			inl = self.rmlb(inl)
		
			if inl.find("Query=") != -1:
				query = inl[len("Query=")+1:]
			elif inl.find("[") != -1:
				subjt = inl[inl.find(":")+1:inl.find("(")]
				# the character before ")" is left out of the number
				pid   = int(inl[inl.find("(")+1:inl.find(")")-1])
				# skip one line, then add up the length of all lines up to
				# the next empty one
				inp.readline()
				leng = 0
				inl = inp.readline()
				inl = self.rmlb(inl)
				while inl != "":
					leng += len(inl)
					inl = inp.readline()
					try:
						inl = self.rmlb(inl)
					except IndexError:
						# presumably raised by rmlb for the empty string
						# returned at the end of the file - TODO confirm
						break
				
				if sdict.has_key(subjt):
					sdict[subjt].append([pid,leng,query])
				else:
					sdict[subjt] = [[pid,leng,query]]
			inl = inp.readline()

		tdict = {}
		print "Rank..."
		for i in sdict:
			qlist = sdict[i]
			lens = {}        # length as key, pid as value
			qs   = {}        # pid and length as key, query as value
			for j in qlist:
				if lens.has_key(j[1]):
					lens[j[1]].append(j[0])
				else:
					lens[j[1]] = [j[0]]
				if qs.has_key("%i %i" % (j[0],j[1])):
					qs["%i %i" % (j[0],j[1])].append(j[2])
				else:
					qs["%i %i" % (j[0],j[1])] = [j[2]]
			# longest first
			lkeys = lens.keys()
			lkeys.sort()
			lkeys.reverse()
			qlist = []
			for j in lkeys:
				if len(lens[j]) > 1:
					# several hits of this length: highest identity first
					lens[j].sort()
					lens[j].reverse()
					for k in lens[j]:
						if len(qs["%i %i" % (k,j)]) > 1:
							for l in qs["%i %i" % (k,j)]:
								qlist.append([k,j,l])
						else:
							qlist.append([k,j,qs["%i %i" % (k,j)][0]])
				else:
					if len(qs["%i %i" % (lens[j][0],j)]) > 1:
						for l in qs["%i %i" % (lens[j][0],j)]:
							qlist.append([lens[j][0],j,l])
					else:
						qlist.append([lens[j][0],j,qs["%i %i" % (lens[j][0],j)][0]])
			tdict[i] = qlist
		sdict = tdict			

		print "Generate output..."
		# second pass; both files are opened again
		inp = open(log,"r")
		oup = open(log+".refined","w")
		inl = inp.readline()
		while inl != "":
			inl = self.rmlb(inl)
			tag = " "
			if inl.find("Query=") != -1:
				query = inl[len("Query=")+1:]
				ostr = inl
				print "",query
			elif inl.find("[") != -1:
				subjt = inl[inl.find(":")+1:inl.find("(")]
				pid   = int(inl[inl.find("(")+1:inl.find(")")-1])
				
				if len(sdict[subjt]) == 1:
					if sdict[subjt][0][1] > lenT:
						tag = "u"
					else:
						tag = "-"
				else:
					# position of the first hit with this identity and query
					c = 0
					for j in sdict[subjt]:
						if pid == j[0] and query == j[2]:
							if j[1] > lenT:
								tag = str(c)
							else:
								tag = "-"
							break
						c += 1
						
				if tag == "-":
					# dropped hit: skip its alignment rows as well
					inp.readline()
					inl = inp.readline()
					inl = self.rmlb(inl)
					while inl != "":
						inl = inp.readline()
						try:
							inl = self.rmlb(inl)
						except IndexError:
							break
				else:
					ostr = "[%s%s" % (tag,inl[2:])
			else:
				ostr = inl
			
			if tag != "-":
				oup.write(ostr+"\n")
			inl = inp.readline()
			
			

	def get_qualified(self,log,fasta,qtag="y",priority=0,):
		"""Cluster the qualified hits of a log and pick one sequence per cluster.

		Hits whose tag (the character after "[") is listed in `qtag` are
		linked to their query and clustered with link.get_relations2.
		log+".cluster" gets one line per sequence: "name<TAB>representative"
		for cluster members and "name<TAB>-" for the representative itself,
		which is the longest sequence of the cluster.

		@param log       path of the log (lines "Query= ..." and "[x] SUBJ:...")
		@param fasta     fasta file the sequence sizes are taken from
		@param qtag      comma-separated list of accepted tags
		@param priority  if true, sequences that never appear as a query
		                 cannot replace the representative
		"""
		print("Start get_qualified:")
		print("     log: %s" % (log,))
		# the "format" line was removed: there is no such parameter and it
		# printed the builtin function
		print("   fasta: %s" % (fasta,))
		print("priority: %s" % (priority,))
		print("    qtag: %s \n" % (qtag,))

		print("Read log file, parse qualified entries...")
		tags  = qtag.split(",")
		qdict = {}     # query -> list of qualified subjects
		query = subj = ""
		inp = open(log,"r")
		for inline in inp:
			if inline.find("Query=") != -1:
				query = inline[inline.find(" ")+1:-1]
				if query in qdict:
					print("This shouldn't happen")
				else:
					qdict[query] = []
			elif inline[0] == "[" and inline[1] in tags:
				subj = inline[inline.find(":")+1:inline.find("(")]
				if subj in qdict[query]:
					continue
				# when both names carry a "_" prefix, only pairs with the
				# same prefix are accepted
				if subj.find("_") != -1 and query.find("_") != -1 and \
				   subj[:subj.find("_")] != query[:query.find("_")]:
					continue
				qdict[query].append(subj)
		inp.close()

		print("Single linkage...")
		clusters = link.get_relations2(qdict)

		print("Get sequence size...")
		fmanager = FastaManager.fasta_manager()
		sdict    = fmanager.get_sizes(fasta,1)

		print("Generate output...")
		oup = open(log+".cluster","w")
		for i in clusters:
			if len(i) > 1:
				max_id = i[0]
				for j in i:
					if priority and j not in qdict:
						continue
					if max_id != j and (sdict[j] > sdict[max_id]):
						max_id = j

				# the members are taken from the cluster itself; they were
				# recovered from a formatted "name(size)" string before,
				# which cut names containing "("
				for j in i:
					if j != max_id:
						oup.write("%s\t%s\n" % (j,max_id))
				oup.write("%s\t-\n" % max_id)
			else:
				oup.write("%s\t-\n" % i[0])
		oup.close()


	def get_qualified4(self,blast,fasta,eT,idenT=95,matchL=150,lengT=0.9,
						qOnly=0):
		"""Copy the lines of a tabular BLAST output that pass all thresholds.

		A line qualifies if query and subject differ, identity >= idenT,
		alignment length >= matchL, alignment length / sequence size >= lengT
		and abs(log10(evalue)) >= eT. The output file is
		blast+"_E<eT>I<idenT>L<matchL>P<lengT*100>Q<qOnly>.qlines".

		@param blast   path of the tab-delimited BLAST output
		@param fasta   fasta file the sequence sizes are taken from
		@param eT      threshold for abs(log10(evalue))
		@param idenT   percent identity threshold
		@param matchL  alignment length threshold
		@param lengT   threshold for alignment length / sequence size
		@param qOnly   1: size of the query, 2: size of the subject,
		               otherwise the smaller of the two sizes
		"""
		print("Get sequence sizes...")
		sizes = fmanager.get_sizes(fasta,1)
		print("Parse blast output:")
		print(" eT    : %s" % (eT,))
		print(" idenT : %s" % (idenT,))
		print(" matchL: %s" % (matchL,))
		print(" lengT : %s" % (lengT,))
		countQ = 0
		c = 0
		inp  = open(blast,"r")
		oupL = open(blast+"_E%iI%iL%iP%iQ%i.qlines" % \
									(eT,idenT,matchL,int(lengT*100),qOnly),"w")
		# a debug print of every input line was removed from this loop
		for inl in inp:
			if c > 0 and c%10000 == 0:
				print(" %i x 10k" % (c // 10000))
			c += 1

			# comment line
			if inl[0] == "#":
				continue

			L = inl.split("\t")
			I = float(L[2])
			M = int(L[3])
			E = L[-2]
			# "e-20" lacks the leading digit, "0.0" has no logarithm
			if E[0] == "e":
				E = "1"+E
			if E == "0.0":
				E = "1e-200"

			try:
				E = math.fabs(math.log10(float(E)))
			except ValueError:
				print("Malformed Evalue: %s" % (E,))
				continue

			if qOnly == 1:
				try:
					N = sizes[L[0]]
				except KeyError:
					print("%s not in the size library" % L[0])
					print(len(L))
					print(L[:1])
					sys.exit(0)
			elif qOnly == 2:
				N = sizes[L[1]]
			else:
				try:
					N = min(sizes[L[0]],sizes[L[1]])
				except KeyError:
					# one size is unknown, use the other one
					if L[0] in sizes:
						N = sizes[L[0]]
					else:
						N = sizes[L[1]]

			if L[0] != L[1] and I >= float(idenT) and M >= matchL and \
				float(M)/float(N) >= lengT and E >= eT:
				oupL.write(inl)
				countQ += 1
		inp.close()
		oupL.close()

		print("%i total, %i qualified" % (c,countQ))
		print("Done!")
		
		
	def get_qualified3(self,blast,fasta,idenT=95,matchL=150,lengT=0.9):
		"""Cluster the sequence pairs of a tabular BLAST output that qualify.

		A line qualifies if query and subject differ, identity >= idenT,
		alignment length >= matchL and alignment length / smaller sequence
		size >= lengT. Qualified lines are copied to blast+".qlines", the
		pairs are clustered with link.get_relations and blast+".cluster"
		gets "name<TAB>representative" for every cluster member and
		"name<TAB>-" for the representative (the longest sequence).

		@param blast   path of the tab-delimited BLAST output
		@param fasta   fasta file the sequence sizes are taken from
		@param idenT   percent identity threshold
		@param matchL  alignment length threshold
		@param lengT   threshold for alignment length / sequence size
		"""
		print("Get sequence sizes...")
		sizes = fmanager.get_sizes(fasta,1)

		qdict = {}     # query -> list of qualified subjects
		print("Parse blast output:")
		print(" idenT : %s" % (idenT,))
		print(" matchL: %s" % (matchL,))
		print(" lengT : %s" % (lengT,))
		countQ = 0
		c = 0
		inp  = open(blast,"r")
		oupL = open(blast+".qlines","w")
		for inl in inp:
			L = inl.split("\t")
			I = float(L[2])
			M = int(L[3])
			try:
				N = min(sizes[L[0]],sizes[L[1]])
			except KeyError:
				# one size is unknown, use the other one
				if L[0] in sizes:
					N = sizes[L[0]]
				else:
					N = sizes[L[1]]

			if c > 0 and c%10000 == 0:
				print(" %i x 10k" % (c // 10000))
			c += 1
			if L[0] != L[1] and I >= float(idenT) and M >= matchL and \
				float(M)/float(N) >= lengT:
				oupL.write(inl)
				if L[0] in qdict:
					if L[1] not in qdict[L[0]]:
						countQ += 1
						qdict[L[0]].append(L[1])
				else:
					countQ += 1
					qdict[L[0]] = [L[1]]
		inp.close()
		oupL.close()
		print(" %s qualified pairs." % (countQ,))

		print("Single linkage...")
		clusters = link.get_relations(qdict,isdict=1)

		print("Generate output...")
		oup = open(blast+".cluster","w")
		oup.write("Thresholds: idenT=%i, matchL=%i, lengT=%f\n" % \
					(idenT,matchL,lengT))
		for i in clusters:
			if len(i) > 1:
				# the first member is the fallback if no size is known;
				# before, max_id was undefined or left over from the
				# previous cluster in that case
				max_id = i[0]
				sized  = [j for j in i if j in sizes]
				if sized:
					max_id = sized[0]
					for j in sized[1:]:
						if sizes[j] > sizes[max_id]:
							max_id = j
				# the members were read from a string that was never
				# filled, so none of them was written
				for j in i:
					if j != max_id:
						oup.write("%s\t%s\n" % (j,max_id))
				oup.write("%s\t-\n" % max_id)
			else:
				oup.write("%s\t-\n" % i[0])
		oup.close()

		print("Done!")

	def get_qualified2(self,log,qtag):
		"""List the query-subject pairs of a log whose tag is in `qtag`.

		Writes "query<TAB>subject<TAB>tag" to log+".qlist", each pair once.

		@param log   path of the log (lines "Query= ..." and "[x] SUBJ:...")
		@param qtag  comma-separated list of accepted tags
		"""
		src = open(log,"r")
		out = open(log+".qlist","w")

		query = ""
		subjt = ""
		seen  = {}     # "query subject" pairs written so far
		raw = src.readline()
		while raw != "":
			# remove the line break
			if raw[-2:] == "\r\n":
				text = raw[:-2]
			elif raw[-1] == "\n":
				text = raw[:-1]
			else:
				text = raw

			if text.find("Query=") != -1:
				query = text[text.find(" ")+1:]
			elif text.find("SUBJ:") != -1 and text[1] in qtag.split(","):
				subjt = text[text.find(":")+1:text.find("(")]
				pair  = "%s %s" % (query,subjt)
				if pair not in seen:
					out.write("%s\t%s\t%s\n" % (query,subjt,text[1]))
					seen[pair] = 1

			raw = src.readline()

		print("Done!")
		
	def get_reciprocal(self,blast,target,fasta,sizeP,wself,oflag=0):
		"""Find the reciprocal best hits in a tabular BLAST output.

		The best partner of every sequence is the one with the highest
		percent identity (target "percentID" or "percentSIM") or with the
		lowest value in column 11 (any other target). Pairs that are each
		other's best partner are written to blast+".recip"; the size test
		is documented in blast+".recip.log".

		@param blast   path of the tab-delimited BLAST output
		@param target  "percentID", "percentSIM" (treated as percentID) or
		               anything else for column 11
		@param fasta   fasta file the sequence sizes are taken from, "" to
		               skip the size test
		@param sizeP   smallest accepted ratio of the shorter match length to
		               the shorter sequence length, must be <= 1
		@param wself   if false, pairs whose names agree up to the first "_"
		               are not written
		@param oflag   if true, "name<TAB>name" is written instead of the
		               BLAST line
		"""
		print("Blast  : %s" % (blast,))
		print("Target : %s" % (target,))
		print("Fasta  : %s" % (fasta,))
		print("sizeP  : %s" % (sizeP,))
		print("wself  : %s" % (wself,))

		if target == "percentSIM":
			print("Can't do percentSIM here, do percentID")
			tidx = 2
		elif target == "percentID":
			tidx = 2
		else:
			tidx = 10
		# identity: the higher the better, column 11: the lower the better.
		# Before, the lower value won in both cases, so the worst hit was
		# taken as the best one for percent identity.
		higher_is_better = (tidx == 2)

		def consider(name,partner,score,match_leng,line):
			# keeps the hit if it is the first or a better one for `name`
			if name in bdict:
				best = bdict[name][1]
				if higher_is_better:
					better = score > best
				else:
					better = score < best
				if not better:
					return
			bdict[name] = [partner,score,match_leng,line]

		bdict = {}     # name -> [best partner,score,match length,line]
		inp = open(blast,"r")
		for inl in inp:
			inl = inl.rstrip("\r\n")
			ilist = inl.split("\t")
			# blank or truncated line
			if len(ilist) <= max(tidx,5):
				continue
			if ilist[0] == ilist[1]:
				continue
			score = float(ilist[tidx])
			match_leng = float(ilist[3])-float(ilist[5])
			consider(ilist[0],ilist[1],score,match_leng,inl)
			consider(ilist[1],ilist[0],score,match_leng,inl)
		inp.close()

		if fasta != "":
			sdict = fmanager.get_sizes(fasta,1)
			if sizeP > 1:
				print("Illegal size threhsold, should <= 1.")
				fasta = ""

		oup = open(blast+".recip","w")
		oup1= open(blast+".recip.log","w")
		oup1.write("Query\tSubjt\tLengthShort\tAlnShort\tWrite\n")
		written = {}   # members of the reciprocal pairs handled so far
		for i in bdict.keys():
			if i in written:
				continue
			ihit = bdict[i][0]
			if bdict[ihit][0] != i:
				# not reciprocal. ihit is not marked here: doing so could
				# hide a reciprocal pair that ihit forms with another name
				continue

			write = 1
			if fasta != "":
				L1 = float(sdict[i])        # length of i
				L2 = float(sdict[ihit])     # length of i's recip hit
				M1 = bdict[i][2]            # match length for i-ihit
				M2 = bdict[ihit][2]         # match length for ihit-i

				shorterL = min(L1,L2)
				shorterM = min(M1,M2)

				if float(shorterM)/float(shorterL) < sizeP:
					write = 0
				oup1.write("%s\t%s\t%f\t%f\t%i\n" % \
								(i,ihit,shorterL,shorterM,write))

			# NOTE(review): for names without "_" find() gives -1, so the
			# names are compared without their last character - confirm
			if(not wself and i[:i.find("_")]==ihit[:ihit.find("_")]):
				write = 0

			if write:
				if oflag:
					oup.write("%s\t%s\n" % (i,ihit))
				else:
					oup.write("%s\n" % bdict[i][-1])

			# both members are done, the pair is written only once
			written[i] = 0
			written[ihit] = 0
		oup.close()
		oup1.close()

		print("Done!")
	
	def match_list(self,blast,name):
		"""Report the names of a list that are not a query in a BLAST table.

		The missing names are written to "<name>-NOT_IN-<blast>".

		@param blast  path of the tab-delimited BLAST output, query in
		              column 1
		@param name   path of the name list, one name per line
		"""
		print("Read name list...")
		names = {}     # name -> 1 if it is a query, otherwise 0
		src = open(name,"r")
		line = src.readline()
		while line != "":
			key = self.rmlb(line)
			if key not in names:
				names[key] = 0
			line = src.readline()

		print("Read blast...")
		queries = {}
		src = open(blast,"r")
		line = src.readline()
		while line != "":
			first = line.split("\t")[0]
			if first not in queries:
				queries[first] = 1
			line = src.readline()

		print("Matching...")
		countIN = 0
		for q in queries.keys():
			if q in names:
				countIN += 1
				names[q] = 1

		out = open("%s-NOT_IN-%s" % (name,blast),"w")
		for key in names:
			if names[key] == 0:
				out.write("%s\n" % key)
		print("Total %i names, %i in query list" % (len(names.keys()),countIN))
		print("Total %i queries, %i in name list" % (len(queries.keys()),countIN))
		print("\nDone!\n")
	
	def match_list2(self,blast,alist):
		"""Append the annotation of the subject to the lines of a BLAST table.

		Lines whose subject (column 2) is a key of the dictionary read from
		`alist` are written to blast+".mod" with the value as extra column.

		@param blast  path of the tab-delimited BLAST output
		@param alist  file passed to futil.file_to_dict(alist,5)
		"""
		adict = futil.file_to_dict(alist,5)
		src   = open(blast,"r")
		out   = open(blast+".mod","w")
		raw   = src.readline()
		while raw != "":
			# NOTE(review): the other methods call self.rmlb, rn_lb may be
			# a typo - verify that the class defines it
			text = self.rn_lb(raw)
			subj = text.split("\t")[1]
			if subj in adict:
				out.write("%s\t%s\n" % (text,adict[subj]))
			raw = src.readline()

		print("Done!")
        
	def log_vs_match(self,log,list):
		"""Re-tag the hits of a log according to a subject-query match list.

		The copy log+".log" gets the tag "[m]" if the subject is listed with
		the current query, "[n]" if it is listed with another query and
		"[-]" if the subject is not listed. All other lines are copied.

		@param log   path of the log (lines "Query= ..." and "[x] SUBJ:...")
		@param list  file passed to futil.file_to_dict, subject as key
		"""
		ldict = futil.file_to_dict(list)

		query = ""
		subjt = ""
		src   = open(log,"r")
		out   = open(log+".log","w")
		raw   = src.readline()
		while raw != "":
			if raw.find("Query=") != -1:
				query = raw[raw.find(" ")+1:-1]
				out.write(raw)
			elif raw.find("[") == -1:
				out.write(raw)
			else:
				subjt = raw[raw.find(":")+1:raw.find("(")]
				if subjt not in ldict:
					# here the rest starts at the first blank, not after "]"
					out.write("[-]%s" % raw[raw.find(" "):])
				elif ldict[subjt] == query:
					out.write("[m]%s" % raw[raw.find("]")+1:])
				else:
					out.write("[n]%s" % raw[raw.find("]")+1:])
			raw = src.readline()
		print("Done!")

	def delete(self,blast,glist):
		
		print "Read gene list..."
		gdict = futil.file_to_dict(glist,0)
		
		inp = open(blast,"r")
		inl = inp.readline()
		oup = open(blast+".mod","w")
		countL = 0
		print "Process line:"
		while inl != "":
			if countL % 1000000 == 0:
				print " %i x 1m" % (countL/1000)
			L   = inl.split("\t")
			if not gdict.has_key(L[0]) and not gdict.has_key(L[1]):	
				oup.write(inl)
				
			countL += 1
			inl = inp.readline()
			
		inp.close()
		oup.close()
		print "Done!"
		
	def threshold(self,blast,T):

		inp = open(blast,"r")
		oup = open(blast+"_T%i.out" % T,"w")
		inl = inp.readline()
		countT = countQ = 0
		while inl != "":
			if countT % 10000 == 0:
				print " %i x 10k" % (countT/10000)
			countT += 1
			llist = inl.split("\t")
			evalue = llist[10]
			if evalue[0] == "e":
				evalue = "1"+evalue
			if evalue == "0.0":
				evalue = "1e-200"
			
			try:
				ef = -math.log10(float(evalue))
			except OverflowError:
				print "Overflow:",evalue
				sys.exit(0)
			if ef > float(T):
				oup.write(inl)
				countQ += 1
				
			inl = inp.readline()
		
		print "Total %i scores, %i qualfied at threshold %i" % (countT,countQ,T)
		
	def fix_blat(self,blat):
		
		inp = open(blat)
		oup = open(blat+".fixed","w")
		inl = inp.readline()
		while inl != "":
			L = inl.split("\t")
			qL = int(L[6])
			qR = int(L[7])
			sL = int(L[8])
			sR = int(L[9])
			
			if L[0][0] == " ":
				L[0] = L[0][1:]
			
			if qL < qR:
				oup.write(inl)
			else:
				oup.write("%s\t%i\t%i\t%i\t%i\t%s" % \
						(string.joinfields(L[:6],"\t"),qR,qL,sR,sL,
						 string.joinfields(L[10:],"\t")))
			inl = inp.readline()
			
		print "Done!"
	
	def chain2(self,blast,gapL=0,fragT=80,chainT=90,debug=0):
		# Placeholder: the body is empty, so calling it does nothing and
		# returns None. The parameter names mirror those of chain().
		pass
	
	def chain(self,blast,fastaQ,fastaS,gapL=0,fragT=80,chainT=90,debug=0):
		
		print "Get query sizes:",fastaQ
		qsize = fmanager.get_sizes(fastaQ,1)
		
		print "Get subject sizes:",fastaS
		ssize = fmanager.get_sizes(fastaS,1)
		
		print "Read file:", blast
		inp    = open(blast)
		inl    = inp.readline()
		bdict  = {}
		count = 0
		while inl != "":
			if count%1e5 == 0:
				print " %i x 100k" % (count/1e5)
			count += 1
			L  = inl.split("\t")
			sL = int(L[8])
			sR = int(L[9])
			qL = int(L[6])
			qR = int(L[7])
			ID = float(L[2])
			if not bdict.has_key(L[0]):
				if sL < sR:
					bdict[L[0]] = {L[1]:{sL:[sR,ID,qL,qR,"F"]}}
				else:
					bdict[L[0]] = {L[1]:{sR:[sL,ID,qL,qR,"R"]}}
			else:
				if not bdict[L[0]].has_key(L[1]):
					if sL < sR:
						bdict[L[0]][L[1]] = {sL:[sR,ID,qL,qR,"F"]}
					else:
						bdict[L[0]][L[1]] = {sR:[sL,ID,qL,qR,"R"]}
				else:
					if sL < sR:
						bdict[L[0]][L[1]][sL] = [sR,ID,qL,qR,"F"]
					else:
						bdict[L[0]][L[1]][sR] = [sL,ID,qL,qR,"R"]		
			inl = inp.readline()
		
		for i in bdict:
			print i
			for j in bdict[i]:
				print "",j
				for k in bdict[i][j]:
					print " ",k,bdict[i][j][k]
				
		print "Iterate query..."
		non    = []
		oup    = open("%s_I%iC%iG%i.chain"   %(blast,fragT,chainT,gapL),"w")
		oup1   = open("%s_I%iC%iG%i.allfrag" %(blast,fragT,chainT,gapL),"w")
		oup.write( "Query\tSubjt\t%ID\tqL\tqR\tsL\tsR\tnChains\tqSize\tsSize\tqCoord\tsCoord\tOri\n")
		oup1.write("Query\tSubjt\t%ID\tqL\tqR\tsL\tsR\tOri\n")
		
		countQ = 0
		countS = 0
		countC = 0
		countF = 0
		qQ     = {}
		for i in bdict:
			if countQ%1000 == 0:
				print " %i k" % (countQ/1000)
			countQ += 1
			for j in bdict[i]:
				countS += 1
				c = bdict[i][j].keys()
				c.sort()

				chains = []
				single = []
				old = 0
				for k in range(len(c)):
					if debug:
						print "single:",single
						print "chains:",chains

					if debug:
						print i,j,c[k],bdict[i][j][c[k]][1],"->",fragT
					if k+1 == len(c):
						if single == [] and bdict[i][j][c[k]][1] >= fragT:
							single = [[c[k],bdict[i][j][c[k]][0],
											bdict[i][j][c[k]][2],
											bdict[i][j][c[k]][3],
											bdict[i][j][c[k]][1],
											bdict[i][j][c[k]][4],]]				
						if single != []:
							chains.append(single)
						break
					
					if bdict[i][j][c[k]][1] < fragT:
						if debug:
							print " ID < fragT"
						
					elif bdict[i][j][c[k+1]][1]<fragT                     or \
					  	 bdict[i][j][c[k+1]][2]-bdict[i][j][c[k]][3]>gapL or \
					  	 c[k+1]-bdict[i][j][c[k]][0]>gapL or \
					  	 bdict[i][j][c[k]][2]>bdict[i][j][c[k+1]][2]:
					  	case = [0,0,0,0]
						if debug:
							print " -> next frag not qualified"
							print "   ",c[k+1],bdict[i][j][c[k+1]]
					  	if bdict[i][j][c[k+1]][1] < fragT:
					  		if debug:
					  			print "  case0",bdict[i][j][c[k+1]][1],fragT
					  		case[0] = 1
					  	if bdict[i][j][c[k+1]][0]-bdict[i][j][c[k]][3]>gapL or \
					  	   bdict[i][j][c[k+1]][0]-bdict[i][j][c[k]][1]>gapL:
					  		if debug:
					  			print "  case1",bdict[i][j][c[k+1]][0] - c[k], gapL
					  		case[1] = 1
					  	if bdict[i][j][c[k]][2] > bdict[i][j][c[k+1]][2]:
					  		if debug:
					  			print "  case3",bdict[i][j][c[k]][2],bdict[i][j][c[k+1]][2]
					  		case[3] = 1
						if single == []:
							single = [[c[k],bdict[i][j][c[k]][0],
											bdict[i][j][c[k]][2],
											bdict[i][j][c[k]][3],
											bdict[i][j][c[k]][1],
											bdict[i][j][c[k]][4]]]
						chains.append(single)
						single = []
					else:
						if single == []:
							single = [[c[k],bdict[i][j][c[k]][0],
											bdict[i][j][c[k]][2],
											bdict[i][j][c[k]][3],
											bdict[i][j][c[k]][1],
											bdict[i][j][c[k]][4]],
								   	  [c[k+1],bdict[i][j][c[k+1]][0],
										  	bdict[i][j][c[k+1]][2],
										  	bdict[i][j][c[k+1]][3],
										  	bdict[i][j][c[k+1]][1],
											bdict[i][j][c[k+1]][4]]]
						else:
							single.append([c[k+1],bdict[i][j][c[k+1]][0],
												  bdict[i][j][c[k+1]][2],
												  bdict[i][j][c[k+1]][3],
												  bdict[i][j][c[k+1]][1],
												  bdict[i][j][c[k+1]][4]])
				
				if debug:
					print "\nCHAINS",
					print chains,"\n"
				
				if len(chains) != 0:
					for k in range(len(chains)):
						
						localsL = localsR = localqL = localqR = -1
						for m in chains[k]:
							if localqL == -1:
								localsL = m[0]
								localsR = m[1]
								localqL = m[2]
								localqR = m[3]
							if localqL > m[2]:
								localsL = m[0]
								localqL = m[2]
							if localqR < m[3]:
								localsR = m[1]
								localqR = m[3]						
						
						cLen = 0
						cMat = 0
						qC   = ""
						sC   = ""
						ori  = ""
						for m in chains[k]:
							oup1.write("%s\t%s\t%f\t%i\t%i\t%i\t%i\t%s\n" % \
											(i,j,m[4],m[2],m[3],m[0],m[1],m[5]))
							cLen += m[3]-m[2]+1
							cMat += (m[3]-m[2]+1)*m[4]/100
							qC   += "%i|%i," % (m[2],m[3])
							sC   += "%i|%i," % (m[0],m[1])
							ori  += m[5] 
						
						try:
							cMat/cLen
						except:
							print "ERR: cMat/cLen:%i/%i" % (cMat,cLen)
							print i,j,chains
							print "EXIT, CHECK!"
							sys.exit(0)
						
						if cMat/cLen*100 >= chainT:
							if i not in qQ:
								qQ[i] = 1
							else:
								qQ[i] += 1
							oup.write("%s\t%s\t%f\t%i\t%i\t%i\t%i\t%i\t%i\t%i\t%s\t%s\t%s\n" % \
										(i,j,cMat/cLen*100,localqL,localqR,localsL,localsR,
										 len(chains[k]),qsize[i],ssize[j],
										 qC[:-1],sC[:-1],ori))
						else:
							countC += 1
				else:
					non.append(i)
					countF += 1
					
		print "Total %i queries"   % countQ
		print "      %i q-s pairs" % countS
		print "      %i q-s with no frag > fragT"   % countF
		print "      %i q-s with no chain > chainT" % countC
		print "      %i qualified queries"          % len(qQ.keys())
		print "      %i q-s with qualified chains"  % (countS-countF-countC)
		print "Done!"

	def pairs(self,blast):
		
		inp = open(blast)
		oup = open(blast+".unique_pairs","w")
		inl = inp.readline()
		pairs = {}
		query = {}
		subjt = {}
		countQ = 0
		countP = 0
		countS = 0
		while inl != "":
			L = inl.split("\t")
			p = "%s\t%s\n" % (L[0],L[1])
			if p not in pairs:
				pairs[p] = 1
				oup.write(p)
				countP += 1			
			if L[0] not in query:
				query[L[0]] = 1
				countQ += 1
			if L[1] not in subjt:
				subjt[L[1]] = 1
				countS += 1
				
			inl = inp.readline()
		
		print "Queries     :",countQ
		print "Subjects    :",countS
		print "Unique pairs:",countP
		
	def get_sp(self,score,sp):
		
		sp  = sp.split(",")
		inp = open(score)
		oup = open(score+".%s" % string.joinfields(sp,"_"),"w")
		inl = inp.readline()
		c = 0
		cQ = 0
		while inl != "":
			c += 1
			L   = inl.split("\t")
			sp1 = L[0][:2]
			sp2 = L[1][:2]
			if sp1 in sp and sp2 in sp:
				cQ += 1
				oup.write(inl)
			inl = inp.readline()
		print "%i scores, %i qualified" % (c,cQ)
	
	def filterout(self,blast,fasta,E,I,P,A):
		
		EV = E
		ID = I
		CV = P
		AL = A
		
		print "Read sequences"
		fm  = FastaManager.fasta_manager()
		ssize = fm.get_sizes(fasta,1)
		
		print "\nParse blast output"
		inp = open(blast)
		oup = open(blast+".E1I40A30C0.5","w")
		inl = inp.readline()
		c = 0
		cQ = 0
		while inl != "":
			L = inl.split("\t")
			if c % 1e5 == 0:
				print " %i x 10k" % (c/1e5)
			c += 1
			if L[0] != L[1]:
				qL = (float(L[-5])-float(L[-6])+1)/ssize[L[0]]
				sL = (float(L[-3])-float(L[-4])+1)/ssize[L[1]]
				if float(L[-2]) < EV and float(L[2]) > ID and int(L[3]) > AL:
					if (qL > sL and sL > CV) or (qL < sL and qL > CV):
						oup.write(inl)
						cQ += 1
			inl = inp.readline()		
		print "Total %i pairs, %i qualified" % (c,cQ)	
	

	def xspecies(self,blast):
	
		print "ASSUMING: 1st 2 char are sp abbrv, only 2 sp"
		inp = open(blast)
		oup = open(blast+".xspecies","w")
		inl = inp.readline()
		sp = {}
		c = 0
		while inl != "":
			if c % 1e5 == 0:
				print " %i x 10k" % (c/1e5)
			c += 1			
			L = inl.split("\t")
			sp1 = L[0][:2]
			sp2 = L[1][:2]
			if sp1 != sp2:
				oup.write(inl)
				if sp1 not in sp:
					sp[sp1] = {L[0]:1}
				else:
					sp[sp1][L[0]] = 1
				if sp2 not in sp:
					sp[sp2] = {L[1]:1}
				else:
					sp[sp2][L[1]] = 1
			inl = inp.readline()
		print "Species:",sp.keys()
		for i in sp:
			print "",i,len(sp[i].keys())
			
		print "Done!"
	

	def gettop(self,blast):
		inp = open(blast)
		oup = open(blast+".top","w")
		inl = inp.readline()
		m   = {}
		while inl != "":
			L = inl.split("\t")

			if L[0] not in m:
				oup.write(inl)
				m[L[0]] = 1
			
			inl = inp.readline()

	def prepare_mcl(self,blast):
		# Placeholder: the body is empty, so calling it does nothing and
		# returns None.
		pass


	def rmlb(self,astr):
		"""Return astr without one trailing line break (CR LF or LF).

		Strings without a trailing line break, including the empty string,
		are returned unchanged.
		"""
		if astr[-2:] == "\r\n":
			return astr[:-2]
		# slice instead of index so that an empty string does not raise
		if astr[-1:] == "\n":
			return astr[:-1]
		return astr

	def help(self):
		print "\npython ParseBlast.py\n"
		print " -f  the code to excecute in ParseBlast:"
		print "     parse_db   - parse blast out stored in psql db"
		print "       REQUIRES: -c, -p, OPTIONAL: -T"
		print "     parse_file - parse blast output file"
		print "       REQUIRES: -blast"
		print "       OPTIONAL: -outbase, -T, -target"
		print "     parse_align- parse the alignment part from blast output"
		print "       REQUIRES: -blast, OPTIONAL: -T,-self,-format"
		print "       -style"
		print "     refine - parse the log file to get info on mutiple subject"
		print "       matches, REQUIRES: log"
		print "     parse_align2-parse the output file from defaut to m = 8"
		print "       REQUIRES: -blast"
		print "     parse_align3-get the blast output of a particular subj."
		print "       need -blast, -target"
		print "     parse_align4-get the squence pairs out. NEED: blast"
		print "     get_qualified - this is an extension for parse_align which"
		print "       will filter modified log file and cluster the"
		print "       qualified entries. REQUIRES:log,fasta,qtag.OPTIONAL:"
		print "       priority"
		print "     get_qualified2- this is another extension for parse_align"
		print "       this will simply get the qualified subj for one entry. No"
		print "       clustering is done. REQUIRES: log, qtag"
		print "     get_qualified3- this is similar to get_qualified. But takes"
		print "       tabular blast output and 3 threshold settings. REQUIRES:"
		print "       blast, fasta, OPTIONAL: I,L,P"
		print "     get_qualified4- similar to get_qualified3 but no clustering"
		print "       REQUIRES: blast, fasta, OPTIONAL: E,I,L,P,Q"
		print "     gettop - get the top match of every query. NEED: blast"
		print "     parse_table- parse the tabular blast output"
		print "       REQUIRES: -blast, -target, OPTIONAL: -T, verbose, wself"
		print "     parse_table2 - parse tabular output based on idenity, length"
		print "       and/or evalue and generate [Q][count][subjs]. REQUIRES: "
		print "       -blast, OPTIONAL: E,I,L"
		print "     parse_table3 - no storage involved, simply take blast out"
		print "       then apply threshold (T) to get -log(e). REQUIRES: blast,"
		print "       OPTIONAL: E,I"
		print "     parse_table4 - parse tabular output based on idenity, length"
		print "       and/or evalue. No modification to the lines. REQUIRES: "
		print "       -blast, OPTIONAL: E,I,L"
		print "     parse_gap  - parse blast out and get gap info. NEED: blast"
		print "     threshold  - apply threshold to a blast output, NEED: blast"
		print "        T"
		print "     symmetrify - symmetrify, normalize, and homogenize scores."
		print "       REQUIRES: -score. OPTIONAL: -cutoff,-outbase,-homog"
		print "     check_acc  - check and correct accessions" 
		print "       REQUIRES: -acc, -score"
		print "     check_missing - check if query OR subj is missing in the"
		print "       Blast output. NEED: blast, fasta, qors"
		print "     spc_score  - get score for non-paramegnetic clustering"
		print "       REQUIRES: -score. OPTIONAL: -per_id,-homog,-o"
		print "     mcl_score  - get score matrix for MCL"
		print "       REQUIRES: -score, OPTIONAL: -cutoff,-outbase,-homog"
		print "     nei_score  - get score matrix for Neighbor in Phylip"
		print "       REQUIRES: -score (modified by symmetrify)"
		print "     mega_score - get score matrix for MEGA"
		print "       REQUIRES: -score (modified by symmetrify)"
		print "     score_matrix - plain old matrix, NEED: score (symmetrified)."
		print "     select     - get scores for a subset of the sequences used"
		print "       in the original BLAST run"
		print "       REQUIRES: -list,-score"
		print "     extract_cds- get the subject sequence out"
		print "       REQUIRES: -blast. OPTIONAL: stype, T, wu, ignorestop"
		print "     get_subj   - get a non-redundant list of matching subj"
		print "       REQUIRES: -blast, OPTIONAL: desc, style, ttype, T"
		print "     index_names- convert sequence names into indices"
		print "       REQUIRES: -score"
		print "     rename     - Change id within the fasta file"
		print "       REQUIRES: -fasta, -name"
		print "     merge_match- merge the subj coords of overlapping entries"
		print "       in a blast output. REQUIRES: -blast, -feature"
		print "     get_reciprocal - get the reciprocal best matches"
		print "       REQUIRES: blast, OPTIONAL: target, fasta, T, wself, P"
		print "     match_list - get the top match for each query sequence and"
		print "       see if the top match is in a list of names. REQUIRES:"
		print "       -blast, -name"
		print "     match_list2 - based on the passed list file, output"
		print "       qualified entries. REQUIRES: -matrix, -blast"
		print "     log_vs_match - compare the log file against the list"
		print "       generated by match_list. New flags are inserted, see"
		print "       code doc for details. REQUIRES: -log, -list"
		print "     match_fam - get subj with match to members of a specified"
		print "       group of genes. REQUIRES: blast,matrix,target. OPTIONAL:"
		print "       unk"
		print "     delete - delete all scores involving any sequence in a list"
		print "       -blast, -list"
		print "     fix_blat - fix the modified blat -out=blast output by"
		print "       parse_align2, NEED: blast (blat-based)"
		print "     chain - link fragments together. NEED: blast,fastaQ,fastaS"
		print "       OPTIONAL: gapL,fragT,chainT,debug"
		print "     pairs - get unique pairs and IDs. NEED: blast"
		print "     get_sp - get scores of particular species, NEED: score,sp"
		print "     filterout - filter blastouput. NEED: blast,fasta. OPTIONAL:"
		print "       E,I,P,A"
		print "     xspecies - get only cross species matches. NEED: blast"  
		print "     for_mega - do blast, parse score for mega. NEED: blast,"
		print "       fasta"
		print "     gettop - get top matches. NEED: blast"
		print " -c    configuration"
		print " -p    specify the block to excecute within config"
		print " -log  log file generated by parse_align"
		print " -blast blast output file, not required if -f parse_db"
		print "       for_mega, this is the blast program dir"
		print " -blast1 for get_reciprocal, first blast output in table format"
		print " -blast2 for get_reciprocal, second output in table format"
		print " -cluster cluster file"
		print " -score score file, required for mcl_score and check_acc"
		print " -list file with a list of seq_id"
		print " -cid  cluster id"
		print " -o    base name for output, default=[blast]"
		print " -acc  name for fasta file with correct accessions"	   
		print " -cutoff cutoff value for the matrix, default = 0"
		print " -ttype threshold type, identity[default], or evalue"
		print " -stype sequence type, protein [1] or nucleotide [0,default]"
		print " -T    -log(E) threshold for parsing blast scores, default 0"
		print "       or it will be compared against percentID or SIM, depend"
		print "       on which is specified in target."
		print "       -> For extract_cds, this is the allowed gap size between"
		print "          stretches"
		print "       -> For get_subj, ID or E threshold, not transformed"
		print "       -> For get_reciprocal, length threshold"
		print " -outbase output base name"
		print " -H    homogenize by taking square [1],square root[2], or no"
		print "	      homogenization at all [0,default]. For e value, use 1."
		print " -per_id  number of top scores to get for SPC, default 10"
		print " -stype type of subj, pep [0] or nucleotide [1,default]"
		print " -ignorestop default no [0]"
		print " -target for parse_file: the kind of score to parse -"
		print "	      evalue[default],percentID or percentSIM"
		print "	      for parse_table can be evalue, percentID, bit, or"
		print "	      token no. separated by ','"
		print " -desc parse matching subj from descriptor lines [1] or from"
		print "	      alignment [0, default]"
		print " -style the BLAST output (-m option) style, 1 (default) or 9"
		print "	      In parse_align, [0, def] means no nt involved, [1] yes"
		print " -name for renaming id within fasta file in [new][old] format"
		print "       for match_list, it is simply a list of names, one column"
		print " -wself include self match [1] or not [0, default]. For"
		print "       get_reciprocal, means match from the same organisms will"
		print "       kept [1] or not [0, default]. If 1, the sequence names"
		print "       HAVE to have sp abbrv followed by an underscore"
		print " -format output align part [0,default] or only the converted"
		print "       match line [1]"
		print " -feature The common feature for the query sequences in generat-"
		print "       ing the blast output"
		print " -fasta For get_qualified. This file is for comparing sizes of"
		print "       cluster members, so make sure the relevant file is used"
		print " -priority for get_qualified. Whether query sequence should take"
		print "       precedance [1] or not [0,default]"
		print " -verbose for parse_table, get qL,qR,sL,sR"
		print " -lenT % length treshold. If this is specified, need BOTH fasta"
		print "       and qors."
		print " -qors for parse_table, based on query [0, default] or subj [1]"
		print "       to apply lenT."
		print " -matrix file with [seq_id][group_designation] or [taxa][seq_id]"
		print "       for match_list2"
		print " -target the target group"
		print " -qtag for get_qualified, specify qualifying tags separated by"
		print "       ','."
		print " -unk  sequences of unknown group are query [1, default] or subj"
		print "       [0]"
		print " -wu   WU format BLAST [1] or not [0, default]"
		print " -E    evalue threshold in -log(e), default -1"
		print " -I    identity threshold in %. Default 95"
		print " -L    length threshold, default 150."
		print " -A    alignment length threshold, default 30"
		print " -P    threshold for the proportion of match length vs sequence"
		print "       length, default 0.9"
		print " -Q    for match length vs. sequence length comparison, use both"
		print "       query & subject [0], query only [1], or subject only [2]"
		print " -fragT fragment %ID lower bound"
		print " -chianT chain %ID lower bound"
		print " -gapL max gap between fragment allowed"
		print " -fastaQ query fasta file"
		print " -fastaS subject fasta file"
		print " -sp   2 letter species abbr. separated by ','"
		print " -debug display debug string [1], default no [0]"
		print ""
		sys.exit(0)

			
#-------------------------------------------------------------------------------
# Function calls
#-------------------------------------------------------------------------------


if __name__ == '__main__':
	configFile = operation = function = blast = accfile = cluster = outbase = \
				 cid = score = list = name = log = feature = fasta = blast1 = \
				 blast2 = matrix = target = qtag = lenT = ttype = fastaQ = \
				 fastaS = sp = qors = ""

	target     = "evalue"
	homogenize = desc = wself = qors = unk = cutoff = format = priority = \
				 verbose = wu = debug = 0
	T	       = 0.0
	per_id     = 10
	stype       = 1
	style      = 1
	futil      = FileUtility.file_util()
	fmanager   = FastaManager.fasta_manager()
	link       = SingleLinkage.single_linkage()
	E          = 0
	I          = 0
	L          = 150
	P          = 0.9
	Q          = 0
	A          = 30
	fragT      = 80
	chainT     = 90
	gapL       = 0
	ignorestop = 0

	parse = parser()

	for i in range(1,len(sys.argv),2):
		if sys.argv[i] == "-c":
			configFile = sys.argv[i+1]
		elif sys.argv[i] == "-p":
			operation  = sys.argv[i+1]
		elif sys.argv[i] == "-o" or sys.argv[i] == "-outbase":
			outbase    = sys.argv[i+1]
		elif sys.argv[i] == "-blast":
			blast     = sys.argv[i+1]
		elif sys.argv[i] == "-acc":
			accfile    = sys.argv[i+1]
		elif sys.argv[i] == "-cluster":
			cluster    = sys.argv[i+1]
		elif sys.argv[i] == "-score":
			score      = sys.argv[i+1]
		elif sys.argv[i] == "-f":
			function   = sys.argv[i+1]      
		elif sys.argv[i] == "-cid":
			cid	= int(sys.argv[i+1])
		elif sys.argv[i] == "-H":
			homogenize = int(sys.argv[i+1])
		elif sys.argv[i] == "-per_id":
			per_id     = int(sys.argv[i+1])
		elif sys.argv[i] == "-cutoff":
			cutoff     = float(sys.argv[i+1])
		elif sys.argv[i] == "-T":
			T	  = float(sys.argv[i+1])
		elif sys.argv[i] == "-list":
			list       = sys.argv[i+1]
		elif sys.argv[i] == "-stype":
			stype       = int(sys.argv[i+1])
		elif sys.argv[i] == "-target":
			target     = sys.argv[i+1]
		elif sys.argv[i] == "-desc":
			desc       = int(sys.argv[i+1])
		elif sys.argv[i] == "-tokens":
			tokens     = sys.argv[i+1]
		elif sys.argv[i] == "-style":
			style      = int(sys.argv[i+1])
		elif sys.argv[i] == "-format":
			format     = int(sys.argv[i+1])
		elif sys.argv[i] == "-name":
			name       = sys.argv[i+1]
		elif sys.argv[i] == "-wself":
			wself      = int(sys.argv[i+1])
		elif sys.argv[i] == "-log":
			log	       = sys.argv[i+1]
		elif sys.argv[i] == "-feature":
			feature    = sys.argv[i+1]
		elif sys.argv[i] == "-fasta":
			fasta      = sys.argv[i+1]
		elif sys.argv[i] == "-blast1":
			blast1     = sys.argv[i+1]
		elif sys.argv[i] == "-blast2":
			blast2     = sys.argv[i+1]
		elif sys.argv[i] == "-priority":
			priority   = sys.argv[i+1]
		elif sys.argv[i] == "-verbose":
			verbose    = int(sys.argv[i+1])
		elif sys.argv[i] == "-qors":
			qors       = int(sys.argv[i+1])
		elif sys.argv[i] == "-lenT":
			lenT       = int(sys.argv[i+1])
		elif sys.argv[i] == "-matrix":
			matrix     = sys.argv[i+1]
		elif sys.argv[i] == "-target":
			target     = sys.argv[i+1]
		elif sys.argv[i] == "-qtag":
			qtag       = sys.argv[i+1]
		elif sys.argv[i] == "-unk":
			unk        = int(sys.argv[i+1])
		elif sys.argv[i] == "-wu":
			wu         = int(sys.argv[i+1])
		elif sys.argv[i] == "-E":
			E          = float(sys.argv[i+1])
		elif sys.argv[i] == "-I":
			I          = float(sys.argv[i+1])
		elif sys.argv[i] == "-L":
			L          = int(sys.argv[i+1])
		elif sys.argv[i] == "-P":
			P          = float(sys.argv[i+1])
		elif sys.argv[i] == "-A":
			A          = int(sys.argv[i+1])
		elif sys.argv[i] == "-Q":
			Q          = int(sys.argv[i+1])
		elif sys.argv[i] == "-ttype":
			ttype      = sys.argv[i+1]
		elif sys.argv[i] == "-gapL":
			gapL       = int(sys.argv[i+1])
		elif sys.argv[i] == "-fragT":
			fragT      = int(sys.argv[i+1])
		elif sys.argv[i] == "-chainT":
			chainT     = int(sys.argv[i+1])
		elif sys.argv[i] == "-fastaQ":
			fastaQ      = sys.argv[i+1]
		elif sys.argv[i] == "-fastaS":
			fastaS      = sys.argv[i+1]
		elif sys.argv[i] == "-ignorestop":
			ignorestop  = int(sys.argv[i+1])
		elif sys.argv[i] == "-sp":
			sp      = sys.argv[i+1]
		elif sys.argv[i] == "-debug":
			debug       = sys.argv[i+1]
		else:
			print "Unknown parameter:",sys.argv[i]
			parse.help()
			
	if function == "parse_db":
		if configFile == "" or operation == "":
			print "\nNeed to define config file and operation\n"
			
			print " -help for help"
			sys.exit(0)
		dbtask = DatabaseOp
		config = dbtask.configConnect(configFile,operation)
		parse  = parser(dbtask,config)
		parse.parse_blast_db(outbase,T)
	elif function == "for_mega":
		if blast == "" or fasta == "":
			print "\nNeed to specify the blast program dir and fasta\n"	  
			print " -help for help"
			sys.exit(0)		    
		parse.for_mega(blast,fasta)
	elif function == "xspecies":		
		if blast == "":
			print "\nNeed to specify the blast output file to be parsed\n"	  
			print " -help for help"
			sys.exit(0)		    
		parse.xspecies(blast)
	elif function == "parse_file":		
		if blast == "":
			print "\nNeed to specify the blast output file to be parsed\n"	  
			print " -help for help"
			sys.exit(0)		    
		parse.parse_blast_file(blast,T,target)
	elif function == "parse_align":		
		if blast == "":
			print "\nNeed to specify the blast output file to be parsed\n"	  
			print " -help for help"
			sys.exit(0)	    
		parse.parse_align(blast,T,wself,format,style)

	elif function == "parse_align4":		
		if blast == "":
			print "\nNeed to specify the blast output\n"	  
			print " -help for help"
			sys.exit(0)	    
		parse.parse_align4(blast)

	elif function == "refine":		
		if log == "":
			print "\nNeed log file.\n"	  
			print " -help for help"
			sys.exit(0)	    
		parse.refine(log)
	elif function == "parse_align2":		
		if blast == "":
			print "\nNeed to specify the blast output file to be parsed\n"	  
			print " -help for help"
			sys.exit(0)	    
		parse.parse_align2(blast)
	elif function == "parse_align3":		
		if blast == "" or target == "":
			print "\nNeed the blast output and target id\n"	  
			print " -help for help"
			sys.exit(0)	    
		parse.parse_align3(blast,target)
	elif function == "get_qualified":		
		if log == "" or fasta == "" or qtag == "":
			print "\nNeed .log, fasta files and qtag\n"
			print " -help for help"
			sys.exit(0)
		parse.get_qualified(log,fasta,qtag,priority)
	elif function == "get_qualified2":		
		if log == "" or qtag == "":
			print "\nNeed parse_align output and qtag\n"
			print " -help for help"
			sys.exit(0)
		parse.get_qualified2(log,qtag)
	elif function == "get_qualified3":		
		if blast == "" or fasta == "":
			print "\nNeed blast output and fasta\n"
			print " -help for help"
			sys.exit(0)
		parse.get_qualified3(blast,fasta,I,L,P)
	elif function == "get_qualified4":		
		if blast == "" or fasta == "":
			print "\nNeed blast output and fasta\n"
			print " -help for help"
			sys.exit(0)
		parse.get_qualified4(blast,fasta,E,I,L,P,Q)
	elif function == "parse_table":		
		if blast == "" or target == "":
			print "\nNeed the blast output and target to be parsed\n"	       
			print " -help for help"
			sys.exit(0)	    
		parse.parse_table(blast,target,T,verbose,wself,lenT,fasta,qors)
	elif function == "parse_table2":
		if blast == "":
			print "\nNeed the blast output\n"	       
			print " -help for help"
			sys.exit(0)	    
		parse.parse_table2(blast,E,I,L)
	elif function == "parse_table3":		
		if blast == "":
			print "\nNeed the blast output\n"	       
			print " -help for help"
			sys.exit(0)	    
		parse.parse_table3(blast,E,I)
	elif function == "parse_table4":		
		if "" in [blast,E,I,L]:
			print "\nNeed the blast, E, I, L\n"	       
			print " -help for help"
			sys.exit(0)	    
		parse.parse_table4(blast,E,I,L)
	elif function == "check_acc":		
		if score == "" or accfile == "":
			print "\nNeed fasta file with correct acc and the parsed file\n"
			print " -help for help"
			sys.exit(0)
		parse.check_acc(accfile,score)
	elif function == "check_missing":		
		if blast == "" or fasta == "" or qors == "":
			print "\nNeed fasta, blast, and qors\n"
			print " -help for help"
			sys.exit(0)
		parse.check_missing(fasta,blast,qors)
	elif function == "symmetrify":		
		if score == "":
			print "\nNeed score file\n"
			print " -help for help"
			sys.exit(0)
		parse.symmetrify(score,outbase,cutoff,homogenize)
	elif function == "spc_score":		
		if score == "":
			print "\nNeed blast ouput, cluster file, and cluster id\n"
			print " -help for help"
			sys.exit(0)
		parse.get_scores_for_spc(score,outbase,per_id,homogenize)
	elif function == "mcl_score":
		if score == "":
			print "\nNeed score file\n"
			print " -help for help"
			sys.exit(0)
		parse.get_scores_for_mcl(score,outbase,cutoff,homogenize)
	elif function == "nei_score":		
		if score == "":
			print "\nNeed symmetrified score file\n"
			print " -help for help"
			sys.exit(0)
		parse.get_scores_for_neighbor(score)
	elif function == "mega_score":		
		if score == "":
			print "\nNeed symmetrified score file\n"
			print " -help for help"
			sys.exit(0)
		parse.mega_score(score)
	elif function == "score_matrix":		
		if score == "":
			print "\nNeed symmetrified score file\n"
			print " -help for help"
			sys.exit(0)
		parse.score_matrix(score)
	elif function == "select":
		if score == "" or list == "":
			print "\nNeed score file and seq_id list\n"
			print " -help for help"
			sys.exit(0)
		parse.get_selected(list,score)
	elif function == "extract_cds":		
		if blast == "":
			print "\nNeed blast output\n"
			print " -help for help"
			sys.exit(0)
		if T == 1.0:
			T = 0
		parse.extract_cds(blast,stype,T,wu,ignorestop)
	elif function == "get_subj":
		
		if blast == "":
			print "\nNeed blast output\n"
			print " -help for help"
			sys.exit(0)
		parse.get_subj(blast,desc,style,ttype,T)

	elif function == "index_names":
		
		if score == "":
			print "\nNeed score file\n"
			print " -help for help"
			sys.exit(0)
		parse.index_names(score)

	elif function == "rename":
		
		if fasta == "" or name == "":
			print "\nNeed fasta and name files\n"
			print " -help for help"
			sys.exit(0)
		parse.rename(fasta,name)	

	elif function == "merge_match":
		
		if blast == "" or feature == "":
			print "\nNeed blast output and feature designation\n"
			print " -help for help"
			sys.exit(0)
		parse.merge_match(blast,feature)	

	elif function == "get_reciprocal":
		
		if blast == "":
			print "\nNeed blast output\n"
			print " -help for help"
			sys.exit(0)
		parse.get_reciprocal(blast,target,fasta,P,wself)	

	elif function == "match_list":
		
		if blast == "" or name == "":
			print "\nNeed two blast output and names\n"
			print " -help for help"
			sys.exit(0)
		parse.match_list(blast,name)	

	# match_list2: requires `blast` and a taxa-name matrix (`matrix`).
	# NOTE(review): the usage message mentions "two blast output", but only a
	# single `blast` value is checked and passed -- confirm against
	# parse.match_list2.
	elif function == "match_list2":

		if blast == "" or matrix == "":
			print "\nNeed two blast output and taxa-name matrix\n"
			print " -help for help"
			sys.exit(0)
		parse.match_list2(blast,matrix)

	# log_vs_match: requires a log file (`log`) and a match list (`list`).
	# `list` here is a script-level variable that shadows the builtin.
	elif function == "log_vs_match":

		if log == "" or list == "":
			print "\nNeed log file and match list\n"
			print " -help for help"
			sys.exit(0)
		parse.log_vs_match(log,list)

	elif function == "match_fam":
		
		if blast == "" or matrix == "" or target == "":
			print "\nNeed blast output, matrix file, and target groupt\n"
			print " -help for help"
			sys.exit(0)
		parse.match_fam(blast,matrix,target,unk)	
	# delete: requires blast output (`blast`) and a list of names (`list`).
	# `list` here is a script-level variable that shadows the builtin.
	elif function == "delete":

		if blast == "" or list == "":
			print "\nNeed blast output and a list of names\n"
			print " -help for help"
			sys.exit(0)
		parse.delete(blast,list)
	# threshold: requires blast output (`blast`) and a threshold (`T`).
	# NOTE(review): T is compared against the empty string here, but the
	# extract_cds branch compares it to 1.0; if T is numeric by this point the
	# `T == ""` test can never be true -- confirm how T is initialised.
	elif function == "threshold":

		if blast == "" or T == "":
			print "\nNeed blast output and threshold\n"
			print " -help for help"
			sys.exit(0)
		parse.threshold(blast,T)
	# parse_gap: requires blast output (`blast`). If it is missing, print a
	# usage hint and exit (note: exit status is 0).
	elif function == "parse_gap":

		if blast == "":
			print "\nNeed blast output\n"
			print " -help for help"
			sys.exit(0)
		parse.parse_gap(blast)
	# fix_blat: requires the `blast` argument (the message calls it "blast
	# output"). If it is missing, print a usage hint and exit (status 0).
	elif function == "fix_blat":
		if blast == "":
			print "\nNeed blast output\n"
			print " -help for help"
			sys.exit(0)
		parse.fix_blat(blast)
	# chain: requires all three of blast, fastaQ and fastaS (the membership
	# test fails if any one of them is the empty string). gapL, fragT, chainT
	# and debug are forwarded without validation.
	elif function == "chain":
		if "" in [blast,fastaQ,fastaS]:
			print "\nNeed blast output, fastaQ, fastaS\n"
			print " -help for help"
			sys.exit(0)
		parse.chain(blast,fastaQ,fastaS,gapL,fragT,chainT,debug)
	# pairs: requires blast output (`blast`). If it is missing, print a usage
	# hint and exit (note: exit status is 0).
	elif function == "pairs":
		if blast == "":
			print "\nNeed blast output\n"
			print " -help for help"
			sys.exit(0)
		parse.pairs(blast)
	# get_sp: requires a score file (`score`) and a species (`sp`). If either
	# is missing, print a usage hint and exit (note: exit status is 0).
	elif function == "get_sp":
		if score == "" or sp == "":
			print "\nNeed score file and species\n"
			print " -help for help"
			sys.exit(0)
		parse.get_sp(score,sp)
	# filterout: requires both `blast` and `fasta` (the membership test fails
	# if either is the empty string). E, I, P and A are forwarded to
	# parse.filterout without validation.
	elif function == "filterout":
		if "" in [blast,fasta]:
			print "\nNeed to specify blast and fasta\n"
			print " -help for help"
			sys.exit(0)
		parse.filterout(blast,fasta,E,I,P,A)
	# 8/10,07
	# (date stamp kept from the original; presumably when this branch was added)
	# gettop: requires blast output (`blast`). If it is missing, print a usage
	# hint and exit (note: exit status is 0).
	elif function == "gettop":
		if blast == "":
			print "\nNeed blast output"
			print " -help for help"
			sys.exit(0)
		parse.gettop(blast)
	# Fallback: `function` matched none of the branches above. Report the
	# unrecognised name and exit (note: exit status is 0 here as well).
	else:
		print "\nNo such function defined: '%s'\n" % function

		print " -help for help"
		sys.exit(0)


