#!/usr/bin/perl
# Supplemental File S2. perl scripts used to filter for genotype likelihood and strand bias.
use strict;
use warnings;

### Script to filter a VCF file for calls where the difference between the two
### smallest genotype likelihoods (PL) is 21 or less. For such calls the
### genotype is replaced by "./."; every other field of the cell is kept.
###
### Usage: perl <this script> input.vcf output.vcf

# Calls are kept only when (second-smallest PL - smallest PL) exceeds this value.
my $PL_threshold = 21;

# mask_cell_by_pl($cell, $threshold)
#   $cell      - one whitespace-separated column of a VCF data row
#   $threshold - PL difference at or below which the genotype is masked
# Returns the cell to write to the output:
#   * columns that do not start with a "x/x" genotype are returned unchanged;
#   * variant-site cells (GT:PL:DP:DV:SP:DPR) get GT replaced by "./." when the
#     difference between the two smallest PL values is <= $threshold;
#   * anything else (e.g. invariant-site cells GT:DP:DV:SP:DPR, which carry no
#     PL) is returned unchanged, so no column is ever dropped or duplicated.
sub mask_cell_by_pl {
    my ($cell, $threshold) = @_;

    # Fixed VCF columns (CHROM, POS, ..., FORMAT) do not start with a genotype.
    return $cell if $cell !~ m{\A./.};

    # PL is accepted with any number (>= 2) of comma-separated values, not just
    # the 3/6/10 produced by one to three alternate alleles.
    if ($cell =~ m{\A./.:(\d+(?:,\d+)+):\d+:\d+:\d+:\d+,\d+}) {
        my @sorted_pl = sort { $a <=> $b } split /,/, $1;
        my $pl_diff   = $sorted_pl[1] - $sorted_pl[0];

        return $cell if $pl_diff > $threshold;

        # Replace only the three-character genotype; the rest of the cell
        # (including every DPR value) is carried over verbatim.
        return './.' . substr($cell, 3);
    }

    return $cell;
}

# filter_vcf_by_pl($input, $output)
#   Reads the VCF file $input, copies header rows (starting with '#') as they
#   are, and writes every other row to $output as tab-separated cells after
#   passing each cell through mask_cell_by_pl(). Dies if a file cannot be
#   opened or the output cannot be closed.
sub filter_vcf_by_pl {
    my ($input, $output) = @_;

    open(my $in,  '<', $input)  or die "cant open $input - $!";
    open(my $out, '>', $output) or die "cant write to $output - $!";

    while (my $row = <$in>) {
        if ($row =~ m/^#/) {    # header rows are copied through untouched
            print {$out} $row;
            next;
        }
        my @cells = map { mask_cell_by_pl($_, $PL_threshold) } split ' ', $row;
        # The newline is appended after the join so rows carry no trailing tab.
        print {$out} join("\t", @cells), "\n";
    }

    close $in;
    close $out or die "cant close $output - $!";
    return;
}

# Run only when executed as a script, so the subs above can be loaded on their own.
unless (caller) {
    @ARGV >= 2 or die "usage: $0 input.vcf output.vcf\n";
    filter_vcf_by_pl(@ARGV);    # input and output
}

###script to filter VCF file for calls which show evidence of statistically significant strand bias (SP >= 13, i.e. p <= 0.05).
use strict;
use warnings;

# Strand-bias filter: genotype cells whose SP value is at or above the
# threshold have their genotype replaced by "./."; every other field is kept.
#
# Usage: perl <this script> input.vcf output.vcf
#
# NOTE(review): presumably this is meant to be saved and run as its own script
# file, separate from any other script shipped in the same supplemental file.

# NOTE(review): SP is assumed to be a Phred-scaled strand-bias p-value; under
# that reading 13 ~ -10*log10(0.05), so SP >= 13 corresponds to p <= 0.05.
my $SP_threshold = 13;

# mask_cell_by_sp($cell, $threshold)
#   $cell      - one whitespace-separated column of a VCF data row
#   $threshold - SP value at or above which the genotype is masked
# Returns the cell to write to the output:
#   * columns that do not start with a "x/x" genotype are returned unchanged;
#   * variant-site cells (GT:PL:DP:DV:SP:DPR) and invariant-site cells
#     (GT:DP:DV:SP:DPR) get GT replaced by "./." when SP >= $threshold;
#   * any other genotype cell is returned unchanged, so no column is ever
#     dropped or duplicated.
sub mask_cell_by_sp {
    my ($cell, $threshold) = @_;

    # Fixed VCF columns (CHROM, POS, ..., FORMAT) do not start with a genotype.
    return $cell if $cell !~ m{\A./.};

    my $sp;
    if ($cell =~ m{\A./.:\d+(?:,\d+)+:\d+:\d+:(\d+):\d+,\d+}) {
        $sp = $1;    # variant site: SP is the fifth field
    }
    elsif ($cell =~ m{\A./.:\d+:\d+:(\d+):\d+,\d+}) {
        $sp = $1;    # invariant site: SP is the fourth field
    }
    else {
        return $cell;
    }

    return $cell if $sp < $threshold;

    # Replace only the three-character genotype; the rest of the cell
    # (including every DPR value) is carried over verbatim.
    return './.' . substr($cell, 3);
}

# filter_vcf_by_sp($input, $output)
#   Reads the VCF file $input, copies header rows (starting with '#') as they
#   are, and writes every other row to $output as tab-separated cells after
#   passing each cell through mask_cell_by_sp(). Dies if a file cannot be
#   opened or the output cannot be closed.
sub filter_vcf_by_sp {
    my ($input, $output) = @_;

    open(my $in,  '<', $input)  or die "cant open $input - $!";
    open(my $out, '>', $output) or die "cant write to $output - $!";

    while (my $row = <$in>) {
        if ($row =~ m/^#/) {    # header rows are copied through untouched
            print {$out} $row;
            next;
        }
        my @cells = map { mask_cell_by_sp($_, $SP_threshold) } split ' ', $row;
        # The newline is appended after the join so rows carry no trailing tab.
        print {$out} join("\t", @cells), "\n";
    }

    close $in;
    close $out or die "cant close $output - $!";
    return;
}

# Run only when executed as a script, so the subs above can be loaded on their own.
unless (caller) {
    @ARGV >= 2 or die "usage: $0 input.vcf output.vcf\n";
    filter_vcf_by_sp(@ARGV);    # input and output in the command line
}

### Examples of genotype formats:
## VARIANT site format:
# GT:PL:DP:DV:SP:DPR 1/1:237,57,0:19:19:0:0,19
# GT:PL:DP:DV:SP:DPR 1/1:221,37,0,224,57,231:20:19:0:1,19,0
# GT:PL:DP:DV:SP:DPR 1/2:60,3,0,60,3,60,60,3,60,60:1:1:0:0,1,0,0
# GT:PL:DP:DV:SP:DPR 1/1:142,18,0,142,18,142,142,18,142,142:6:6:0:0,6,0,0
## INVARIANT site format:
# GT:DP:DV:SP:DPR 0/0:24:0:0:24,0 or 0/0:42:1:0:41,1,0 or 0/0:42:1:0:41,1,0,0