#2020-04-28 written
#2020-04-29 add condition for short inversion; adjust Tallfrag based on PNSC info; option for simple-formatted output
#2020-06-01 add option of NoiseCountRead for BAMs with severe noise.

import sys, argparse
# Command-line interface of the SV filter.
# Every option is declared once in `option_specs` as (flag, add_argument keyword
# arguments) and registered in that order, so the --help listing and the order of
# the parsed namespace follow the table from top to bottom.
parser = argparse.ArgumentParser(
	description = "this is a filtering scripts for annotated SVs.")

option_specs = (
	# input file and distance cut-offs used to classify records
	("input", dict(help="input file")),
	("--ColInfo", dict(type=int, default=12,
		help="Starting number of column which have read information. (default = 12)")),
	("--ShortDist", dict(type=int, default=1000,
		help="Distance defining short-distanced SVs(shSVs) and long-distanced SVs(lgSVs)(bp). (default = 1000)")),
	("--ShortInv", dict(type=int, default=5000,
		help="Distance defining short-distanced inversions(shINVs)(bp). (default = 5000)")),

	# minimum tumor support per SV class
	("--AllFragLong", dict(type=int, default=3,
		help="Minimum required number of variant fragments to pass in lgSVs.(default = 3)")),
	("--SAFragLong", dict(type=int, default=0,
		help="Minimum required number of variant fragments with appropritate SA tag to pass in lgSVs.(default = 0)")),
	("--AllFragShort", dict(type=int, default=3,
		help="Minimum required number of variant fragments to pass in shSVs.(default = 3)")),
	("--SAFragShort", dict(type=int, default=1,
		help="Minimum required number of variant fragments with appropritate SA tag to pass in shSVs.(default = 1)")),
	("--AllFragShINV", dict(type=int, default=5,
		help="Minimum required number of variant fragments to pass in shINVs.(default = 5)")),
	("--SAFragShINV", dict(type=int, default=2,
		help="Minimum required number of variant fragments with appropritate SA tag to pass in shINVs.(default = 2)")),

	# maximum support tolerated in the paired normal / normal panel
	("--NormFragLong", dict(type=int, default=0,
		help="Maximum allowable fragment number in Normal BAM for lgSVs. (default = 0)")),
	("--NormFragShort", dict(type=int, default=5,
		help="Maximum allowable fragment number in Normal BAM for shSVs. (default = 5)")),
	("--PanelSampleLong", dict(type=int, default=0,
		help="Maximum allowable sample number in Normal Panel for lgSVs. (default = 0)")),
	("--PanelSampleShort", dict(type=int, default=5,
		help="Maximum allowable sample number in Normal Panel for shSVs. (default = 5)")),

	# criteria that mark a breakend as problematic
	("--MedMAPQ", dict(type=float, default=40,
		help="Minimum required median MAPQ of variant reads. (default = 40)")),
	("--NoiseBAM", dict(type=str, default="normal",
		help="BAM which is used for background noise evaluation (choices: tumor or normal)(default = normal)")),
	("--NoiseCountRead", dict(type=str, default="one", choices=["one","two"],
		help="Discordant read count for noise evaluation. If this is set to 'two', at least two reads are required to be counted as noise targets. Default is 'one', but you should select 'two' for severely nosied samples not to miss true variants. (choices = 'one' or 'two')(default = one)")),
	("--NoiseTarget", dict(type=int, default=5,
		help="Maximum allowable number of mate sites in noise evaluation. This value should be increased in poor-quality BAM not to miss the true variants. (default = 5)")),
	("--MinVAF", dict(type=float, default=0.03,
		help="Minimum required VAF of variant fragments in both breakends. (default = 0.03)")),
	("--MinPosVar", dict(type=int, default=2,
		help="Minimum required variation of starting position of variant reads.(bp) (default = 2)")),
	("--AllFragProb", dict(type=int, default=5,
		help="Minimum required number of variant fragments when a breakend is Problematic.(default = 5)")),
	("--SAFragProb", dict(type=int, default=1,
		help="Minimum required number of variant fragments with appropriate SA tag when a breakend is Problematic.(default = 1)")),

	# criteria that mark a breakend as a mapping-failure site
	("--FailMedMAPQ", dict(type=int, default=5,
		help="Maximum allowable median MAPQ to define mapping-failure site. (default = 5)")),
	("--FailTarget", dict(type=int, default=50,
		help="Maximum allowable number of mate sites to define mapping-failure site.(default = 50)")),
	("--MaskFail", dict(action="store_true",
		help="Change chromosome and position of mapping-failure site to dot(.).")),
	("--AllFragFail", dict(type=int, default=10,
		help="Minimum required number of variant fragments when a breakend is mapping-failure site.(default = 10)")),
	("--SAFragFail", dict(type=int, default=3,
		help="Minimum required number of variant fragments with appropriate SA tag when a breakend is mapping-failure site.(default = 3)")),

	# output format
	("--SimpleOut", dict(action="store_true",
		help="Generate simple-formatted output: #CHR1, POS1, CHR2, POS2, MH, CT, SVtype, T_all, T_sa, T_ref1, T_ref2, T_vaf1, T_vaf2.")),
)
for option_flag, option_kwargs in option_specs:
	parser.add_argument(option_flag, **option_kwargs)

# Parse the command line once at start-up; the rest of the script reads its
# thresholds and switches from `args`.
args=parser.parse_args()

# Names of the 30 read-information columns, in order, that the '#CHR' header of the
# input must carry starting at column --ColInfo. Kept as a tab-joined string because
# the header check compares it against a tab-joined slice of the header line.
designated_name = '\t'.join((
	'PON_SV;sampleN',
	'tBPinfo',
	'nBPinfo',
	're_chr1',
	're_pos1',
	're_chr2',
	're_pos2',
	'MH',
	'terminal',
	'SVtype',
	'MAPQ',
	'DV',
	'RV',
	'tSA',
	'nSA',
	'Tumor_Ref1;Ref2;AllDiscordantFragments;SplitFragments;SATagFragments;Vaf1;Vaf2',
	'PairNormal_AllFragments;SAFragments;FragCount1;FragCount2',
	'new_mate1',
	'neo_mate1',
	'new_mate2',
	'neo_mate2',
	'Tumor_BP1_Total;Normal;Discor;Chr;clust;Chr2;clust2',
	'Tumor_BP2_Total;Normal;Discor;Chr;clust;Chr2;clust2',
	'Normal_BP1_Total;Normal;Discor;Chr;clust;Chr2;clust2',
	'Normal_BP2_Total;Normal;Discor;Chr;clust;Chr2;clust2',
	'PairNormalSameClip',
	'MAPQ1_min;med;max',
	'MAPQ2_min;med;max',
	'POS1_min;med;max',
	'POS2_min;med;max',
))


# --NoiseBAM is declared without `choices`, so any value other than the two
# supported ones is rejected here.
if not (args.NoiseBAM == 'tumor' or args.NoiseBAM == 'normal'):
	print('FATAL ERROR: Wrong argument in --NoiseBAM. The argument should be tumor or normal.')
	sys.exit(1)

# Snapshot every parsed option as "name=value"; the list is later written to the
# output as the '##Arguments_in_filter' line.
args_dic = vars(args)
out_list = [option + '=' + str(setting) for option, setting in args_dic.items()]



def median_mapq(mq_info):
	"""Return the median from a 'min;med;max' MAPQ field; 'NA' counts as 60."""
	if mq_info == 'NA':
		return 60
	return float(mq_info.split(';')[1])


def position_spread(pos_info):
	"""Return max - min from a 'min;med;max' position field; 'NA' counts as 100."""
	if pos_info == 'NA':
		return 100
	parts = pos_info.split(';')
	return int(parts[2]) - int(parts[0])


def calc_vaf(var_frag, ref_frag):
	"""Return var_frag / (var_frag + ref_frag) rounded to 4 decimals.

	Returns 0.0 when both counts add up to zero. That happens when the PNSC
	adjustment removes every variant fragment at a breakend that has no
	reference fragment, which previously raised ZeroDivisionError.
	"""
	total = var_frag + ref_frag
	if total == 0:
		return 0.0
	return round(var_frag / total, 4)


def classify_sv(svtype, pos1, pos2, short_dist, short_inv):
	"""Return 'longSV', 'shortSV' or 'shortINV' for one record.

	'TRA' is always 'longSV'. An 'INV' whose pos2 - pos1 is below short_inv is
	'shortINV'. Everything else is 'shortSV' when pos2 - pos1 is below
	short_dist and 'longSV' otherwise.
	"""
	if svtype == 'TRA':
		return 'longSV'
	span = pos2 - pos1
	if svtype == 'INV' and span < short_inv:
		return 'shortINV'
	return 'shortSV' if span < short_dist else 'longSV'


def evaluate_record(in_indi, ncol, args):
	"""Apply the filter to one data record.

	in_indi -- the record split on tabs
	ncol    -- 0-based index of the first read-information column
	args    -- object carrying the filter options as attributes

	Returns None when the tumor fragment field is '.' or reports zero variant
	fragments (such records are skipped). Otherwise returns
	(passed, simple_out): passed is a bool, simple_out is the tab-joined
	13-column simple-format line for the record.
	"""
	pon_sampleN = int(in_indi[ncol].split(';')[1])

	chr1 = in_indi[ncol+3]
	pos1 = int(in_indi[ncol+4])
	chr2 = in_indi[ncol+5]
	pos2 = int(in_indi[ncol+6])
	mh = in_indi[ncol+7]
	ct = in_indi[ncol+8]
	svtype = in_indi[ncol+9]

	# Tumor field layout: Ref1;Ref2;AllDiscordantFragments;SplitFragments;SATagFragments;...
	T_frag_info = in_indi[ncol+15]
	if T_frag_info == '.':
		return None
	t_frag = T_frag_info.split(';')
	Tallfrag = int(t_frag[2])
	if Tallfrag == 0:
		return None
	Tspfrag = int(t_frag[3])
	Tsafrag = int(t_frag[4])
	T_BP1_refFrag = int(t_frag[0])
	T_BP2_refFrag = int(t_frag[1])

	# Paired-normal field layout: AllFragments;SAFragments;...
	n_frag = in_indi[ncol+16].split(';')
	Nallfrag = int(n_frag[0])
	Nsafrag = int(n_frag[1])

	# Noise counts: the BAM (normal or tumor) and the cluster column ('clust' or
	# 'clust2') are chosen by --NoiseBAM / --NoiseCountRead. Only the selected
	# columns are parsed.
	bp1_col, bp2_col = {'normal': (23, 24), 'tumor': (21, 22)}[args.NoiseBAM]
	clust_idx = {'one': 4, 'two': 6}[args.NoiseCountRead]
	bp1_noise_num = int(in_indi[ncol+bp1_col].split(';')[clust_idx])
	bp2_noise_num = int(in_indi[ncol+bp2_col].split(';')[clust_idx])

	PNSC = int(in_indi[ncol+25])
	BP1_medMQ = median_mapq(in_indi[ncol+26])
	BP2_medMQ = median_mapq(in_indi[ncol+27])
	BP1_pos_dist = position_spread(in_indi[ncol+28])
	BP2_pos_dist = position_spread(in_indi[ncol+29])

	# When the PairNormalSameClip column is positive, split fragments are dropped
	# from the tumor total and the SA-tag fragments are added instead.
	if PNSC > 0:
		Tallfrag = Tallfrag - Tspfrag + Tsafrag
	Tvaf1 = calc_vaf(Tallfrag, T_BP1_refFrag)
	Tvaf2 = calc_vaf(Tallfrag, T_BP2_refFrag)

	simple_out = '\t'.join([
		chr1, str(pos1), chr2, str(pos2), mh, ct, svtype,
		str(Tallfrag), str(Tsafrag), str(T_BP1_refFrag), str(T_BP2_refFrag),
		str(Tvaf1), str(Tvaf2)])

	svclass = classify_sv(svtype, pos1, pos2, args.ShortDist, args.ShortInv)

	# A breakend is "problematic" on noise, low median MAPQ, low VAF or too little
	# spread in read start positions.
	bp1_problem = (bp1_noise_num > args.NoiseTarget or BP1_medMQ < args.MedMAPQ
		or Tvaf1 < args.MinVAF or BP1_pos_dist < args.MinPosVar)
	bp2_problem = (bp2_noise_num > args.NoiseTarget or BP2_medMQ < args.MedMAPQ
		or Tvaf2 < args.MinVAF or BP2_pos_dist < args.MinPosVar)
	# A breakend is a "mapping-failure site" on extreme noise or very low median MAPQ.
	bp1_failure = bp1_noise_num > args.FailTarget or BP1_medMQ < args.FailMedMAPQ
	bp2_failure = bp2_noise_num > args.FailTarget or BP2_medMQ < args.FailMedMAPQ
	# NOTE(review): --MaskFail is parsed but nothing in this section acts on it.

	# Limits on support in the paired normal / normal panel, per SV class.
	# shortINV used to be missing from this pre-check, so no short inversion could
	# ever pass and its tumor thresholds were dead code.
	# NOTE(review): shortINV reuses the shSV normal/panel limits here because no
	# dedicated options exist; confirm this is the intended pairing.
	normal_limits = {
		'longSV': (args.NormFragLong, args.PanelSampleLong),
		'shortSV': (args.NormFragShort, args.PanelSampleShort),
		'shortINV': (args.NormFragShort, args.PanelSampleShort),
	}
	tumor_limits = {
		'longSV': (args.AllFragLong, args.SAFragLong),
		'shortSV': (args.AllFragShort, args.SAFragShort),
		'shortINV': (args.AllFragShINV, args.SAFragShINV),
	}

	max_norm_frag, max_panel_sample = normal_limits[svclass]
	if not (Nsafrag == 0 and Nallfrag <= max_norm_frag and pon_sampleN <= max_panel_sample):
		return False, simple_out

	if not (bp1_problem or bp2_problem or bp1_failure or bp2_failure):
		# both breakends clean: class-specific thresholds
		min_all, min_sa = tumor_limits[svclass]
	elif bp1_problem != bp2_problem:
		# exactly one problematic breakend
		min_all, min_sa = args.AllFragProb, args.SAFragProb
	elif bp1_failure != bp2_failure:
		# exactly one mapping-failure breakend (checked only after the case above)
		min_all, min_sa = args.AllFragFail, args.SAFragFail
	else:
		return False, simple_out

	return (Tallfrag >= min_all and Tsafrag >= min_sa), simple_out


def filter_lines(lines, args, designated_name, out_list):
	"""Filter an iterable of input lines and return (output_lines, n, m).

	output_lines holds the lines to write, without trailing newlines: one
	'##Arguments_in_filter' line per '#CHR' header seen, then the header (the
	input header, or the simple header with --SimpleOut), then the passing
	records. n is the number of data records read, m the number that passed.

	Blank lines are skipped instead of ending the scan, and other '#' lines are
	dropped. If the '#CHR' header does not carry designated_name starting at
	column --ColInfo, the mismatch is printed and the process exits with
	status 1.
	"""
	ncol = args.ColInfo - 1
	preamble = []
	out_line_list = []
	if args.SimpleOut:
		# No trailing newline here: lines are newline-terminated when written, and
		# an extra one used to leave an empty line under the header.
		out_line_list.append('\t'.join([
			'#CHR1', 'POS1', 'CHR2', 'POS2', 'MH', 'CT', 'SVtype',
			'Tall', 'Tsa', 'Tref1', 'Tref2', 'Tvaf1', 'Tvaf2']))

	n = 0
	m = 0
	for raw_line in lines:
		in_line = raw_line.strip()
		if not in_line:
			continue
		if in_line.startswith('#CHR'):
			preamble.append('##Arguments_in_filter: ' + ','.join(out_list))
			readinfo_colnames = '\t'.join(in_line.split('\t')[ncol:ncol+30])
			if readinfo_colnames != designated_name:
				print('######FATAL ERROR:Column names are wrong')
				print(f'###Input column names:\n{readinfo_colnames}')
				print(f'###Designated column  names:\n{designated_name}')
				sys.exit(1)
			if not args.SimpleOut:
				out_line_list.append(in_line)
			continue
		if in_line.startswith('#'):
			continue

		n += 1
		result = evaluate_record(in_line.split('\t'), ncol, args)
		if result is None:
			continue
		passed, simple_out = result
		if passed:
			m += 1
			out_line_list.append(simple_out if args.SimpleOut else in_line)

	return preamble + out_line_list, n, m


def filter_sv_file(args, designated_name, out_list):
	"""Filter args.input into '<input>.fi' ('<input>.fi_sim' with --SimpleOut).

	Prints the input path, then the number of records read and the number that
	passed. Both files are closed on every exit path. Nothing is written when
	the header check fails or when there are no lines to emit.
	"""
	out_path = args.input + ('.fi_sim' if args.SimpleOut else '.fi')
	with open(args.input) as in_file, open(out_path, 'w') as out_file:
		print(args.input)
		out_lines, n, m = filter_lines(in_file, args, designated_name, out_list)
		out_file.writelines(line + '\n' for line in out_lines)
	print(n)
	print(m)


# Run only when executed as a script, so the helpers above can be imported and
# exercised without touching any file.
if __name__ == '__main__':
	filter_sv_file(args, designated_name, out_list)
			
