## Analysis pipeline to compare Strand-seq with Mostovoy et al.

#perl 1_VCF2SNVlist.pl phasing_final_hybrid_scaffolds2_universal.vcf > Super-Scaffold_52_phased.txt

#/home/daewoooo/Downloads/lastz-distrib-1.03.73/src/lastz_32 chrX_NCBI36.fa Super-Scaffold_52.fa --format=rdotplot --ungapped --notransition --maxwordcount=90% --exact=500 --identity=95 --seed=match15 --ambiguous=iupac --match=1,5 --twins=1..100 > rdotplot

#perl 3_RDPLOT2RANGES.pl rdotplot > rdplot.ranges

#perl 4_CONTIG2GENOME.pl rdplot.ranges Super-Scaffold_52_phased.txt > trans.coord

#perl 5_COMPARE_HAPS.pl chr23_haplo_1 trans.coord chr23_haplo_2 > chr23_haplo_1_comparison
#perl 5_COMPARE_HAPS.pl chr23_haplo_2 trans.coord chr23_haplo_1 > chr23_haplo_2_comparison

####################
# 1_VCF2SNVlist.pl #
####################

#!/usr/bin/perl -w
#
# 1_VCF2SNVlist.pl
#
# Usage: perl 1_VCF2SNVlist.pl <phased.vcf> [scaffold] > <snv_list.txt>
#
# Reads a VCF file and prints one tab-separated row per single-base
# substitution found on the selected scaffold (default: Super-Scaffold_52).
#
# Output columns:
#   Chrom, Pos, Ref, Alt  - copied from the VCF record
#   Gen                   - genotype string (first sub-field of the sample column)
#   Hap1_allele           - base carried by the first allele of the genotype
#   Hap2_allele           - base carried by the second allele of the genotype
#   PhaseBlock            - ninth ':'-separated sub-field of the sample column,
#                           or 0 when that sub-field is absent
#
# Records are skipped when they are header/meta lines, are on another
# scaffold, are not single-base REF/ALT pairs (indels, MNPs, multi-allelic
# sites), or do not carry a diploid numeric genotype such as 0|1 or 0/1.
use strict;
use warnings;
use Data::Dumper;

my $vcf_file = shift;
# Optional second argument; the default keeps the original behaviour.
my $scaffold = @ARGV ? shift : 'Super-Scaffold_52';

die "Usage: $0 <phased.vcf> [scaffold]\n" if !defined $vcf_file;
open my $vcf_fh, "<", $vcf_file or die "Can't read file '$vcf_file': $!";

print "Chrom\tPos\tRef\tAlt\tGen\tHap1_allele\tHap2_allele\tPhaseBlock\n";

while (my $line = <$vcf_fh>) {
	chomp $line;

	# Only lines that START with '#' are header/meta lines; a '#' inside a
	# data record must not cause the record to be dropped.
	next if $line =~ /^#/;
	next if $line !~ /\S/;

	# VCF columns: CHROM POS ID REF ALT QUAL FILTER INFO FORMAT SAMPLE
	my ($chr, $pos, $ref, $alt, $gen) = (split /\t/, $line)[0, 1, 3, 4, 9];

	next if !defined $gen;
	next if $chr ne $scaffold;

	# Keep single-base substitutions only. A plain length test avoids the
	# `my $x = 1 if COND` construct, whose behaviour is undefined in Perl.
	next unless length($ref) == 1 and length($alt) == 1;

	my ($phase, $phaseBlock) = (split /:/, $gen)[0, 8];
	$phaseBlock = 0 if !defined $phaseBlock;

	# Accept phased (0|1) and unphased (0/1) diploid genotypes. Missing
	# (./.) or haploid calls are skipped instead of being compared
	# numerically and silently reported as reference alleles.
	next unless defined $phase and $phase =~ m{^(\d+)[|/](\d+)$};
	my ($allele1, $allele2) = ($1, $2);

	# Allele index 0 is the reference base, anything else the alternative.
	my $hap1_base = $allele1 == 0 ? $ref : $alt;
	my $hap2_base = $allele2 == 0 ? $ref : $alt;

	print join("\t", $chr, $pos, $ref, $alt, $phase, $hap1_base, $hap2_base, $phaseBlock), "\n";
}

close $vcf_fh;


######################
# 3_RDPLOT2RANGES.pl #
######################

#!/usr/bin/perl -w
#
# 3_RDPLOT2RANGES.pl
#
# Usage: perl 3_RDPLOT2RANGES.pl <rdotplot> > <rdplot.ranges>
#
# Converts a dot-plot file into one row per segment. After a single header
# line the input is consumed in groups of three lines: the first two hold
# the tab-separated coordinate pairs of the segment end points, the third
# is read and discarded (presumably the "NA NA" separator written by
# lastz --format=rdotplot; confirm against the lastz version in use).
#
# Output columns: GenomeCoord1, ContigCoord1, GenomeCoord2, ContigCoord2
use strict;
use warnings;
use Data::Dumper;

my $rdplot_file = shift;
die "Usage: $0 <rdotplot>\n" if !defined $rdplot_file;
open my $rdplot_fh, "<", $rdplot_file or die "Can't read file '$rdplot_file': $!";

print "GenomeCoord1\tContigCoord1\tGenomeCoord2\tContigCoord2\n";

# The first line names the two sequences and carries no coordinates.
my $header = <$rdplot_fh>;

while (my $line1 = <$rdplot_fh>) {
	my $line2 = <$rdplot_fh>;
	my $line3 = <$rdplot_fh>;	# third line of the group is intentionally ignored

	# A file that ends in the middle of a group has no second end point;
	# stop instead of printing a half-empty row.
	if (!defined $line2) {
		warn "Incomplete segment record at end of '$rdplot_file', ignored\n";
		last;
	}

	chomp($line1, $line2);

	my ($genPos1, $contPos1) = split /\t/, $line1;
	my ($genPos2, $contPos2) = split /\t/, $line2;

	print "$genPos1\t$contPos1\t$genPos2\t$contPos2\n";
}

close $rdplot_fh;


######################
# 4_CONTIG2GENOME.pl #
######################

#!/usr/bin/perl -w
#
# 4_CONTIG2GENOME.pl
#
# Usage: perl 4_CONTIG2GENOME.pl <rdplot.ranges> <phased_snvs.txt> > <trans.coord>
#
# Translates SNV positions given in contig coordinates into genome
# coordinates, using the aligned segments listed in the ranges file
# (columns: GenomeCoord1, ContigCoord1, GenomeCoord2, ContigCoord2).
#
# For every SNV and every segment whose contig interval contains the SNV
# position, one row is printed (no header line):
#   contig position, genome position, genotype, allele 1, allele 2,
#   direction ('frw' or 'rev'), phase block
# An SNV covered by several segments yields several rows.
use strict;
use warnings;
use Data::Dumper;

my $coord_file = shift;
my $phased_file = shift;
die "Usage: $0 <rdplot.ranges> <phased_snvs.txt>\n"
	if !defined $coord_file or !defined $phased_file;

open my $coord_fh, "<", $coord_file or die "Can't read file '$coord_file': $!";
open my $phased_fh, "<", $phased_file or die "Can't read file '$phased_file': $!";

# Load the segments keyed by GenomeCoord1. As before, when two segments
# share the same GenomeCoord1 the later one replaces the earlier one.
my $header1 = <$coord_fh>;
my %coord;
while (my $line = <$coord_fh>) {
	chomp $line;
	next if $line !~ /\S/;
	my @fields = (split /\t/, $line)[0, 1, 2, 3];
	$coord{$fields[0]} = \@fields;
}
close $coord_fh;

# Parse and order the segments once instead of once per SNV.
my @segments;
foreach my $key (sort {$a <=> $b} keys %coord) {
	my ($GenomeCoord1, $ContigCoord1, $GenomeCoord2, $ContigCoord2) = @{ $coord{$key} };

	# Contig interval as (low, high), independent of orientation.
	# The list form is deliberate: `COND ? $x = $a : $x = $b` parses as
	# `(COND ? ($x = $a) : $x) = $b` and therefore always assigns $b.
	my ($ContigStart, $ContigEnd) = $ContigCoord1 < $ContigCoord2
		? ($ContigCoord1, $ContigCoord2)
		: ($ContigCoord2, $ContigCoord1);

	push @segments, [$GenomeCoord1, $ContigCoord1, $GenomeCoord2, $ContigCoord2, $ContigStart, $ContigEnd];
}

my $header2 = <$phased_fh>;
while (my $line = <$phased_fh>) {
	chomp $line;
	next if $line !~ /\S/;

	my ($pos, $gen, $allele1, $allele2, $phaseBlock) = (split /\t/, $line)[1,4,5,6,7];

	foreach my $segment (@segments) {
		my ($GenomeCoord1, $ContigCoord1, $GenomeCoord2, $ContigCoord2, $ContigStart, $ContigEnd) = @$segment;

		next if $pos < $ContigStart or $pos > $ContigEnd;

		my ($transCoord, $direction);
		if ($ContigCoord1 < $ContigCoord2) {
			# Contig and genome run in the same direction: offset from
			# the first end point.
			$transCoord = ($pos-$ContigCoord1)+$GenomeCoord1;
			$direction = 'frw';
		} else {
			# Contig runs opposite to the genome: offset from the second
			# end point, counted backwards.
			$transCoord = $GenomeCoord2-($pos-$ContigCoord2);
			$direction = 'rev';
		}

		print "$pos\t$transCoord\t$gen\t$allele1\t$allele2\t$direction\t$phaseBlock\n";
	}
}
close $phased_fh;



#####################
# 5_COMPARE_HAPS.pl #
#####################

#!/usr/bin/perl -w
#
# 5_COMPARE_HAPS.pl
#
# Usage: perl 5_COMPARE_HAPS.pl <SS_haplotype> <trans.coord> <SS_opposite_haplotype> > <comparison>
#
# Compares one haplotype file against the two alleles listed per position
# in the translated-coordinate file.
#
# Inputs (tab-separated):
#   SS_haplotype / SS_opposite_haplotype - column 1 position, column 2 base
#   trans.coord - contig pos, genome pos, genotype, allele 1, allele 2,
#                 direction, phase block (keyed here by genome pos)
#
# A position is reported only when it occurs in both the haplotype file and
# trans.coord, the two haplotype files do not carry the same base there, and
# allele 1 differs from allele 2. For 'rev' rows the haplotype base is
# complemented before comparison and printing.
#
# Output columns: ContigPos, GenomePos, SS_hap, Ref1_Ref2, Comparison
# (ref1 | ref2 | None | unphased), PhaseBlock
use strict;
use warnings;
use Data::Dumper;

my $SS_haps = shift;
my $ref_haps = shift;
my $SS_opos = shift;

die "Usage: $0 <SS_haplotype> <trans.coord> <SS_opposite_haplotype>\n"
	if !defined $SS_haps or !defined $ref_haps or !defined $SS_opos;

# Returns a hash mapping the value of the given 0-based column to the whole
# (chomped) line. Later lines replace earlier ones with the same key; lines
# that lack the key column (e.g. blank lines) are skipped.
sub read_keyed_lines {
	my ($file, $key_column) = @_;

	open my $fh, "<", $file or die "Can't read file '$file': $!";
	my %line_for;
	while (my $line = <$fh>) {
		chomp $line;
		my $key = (split /\t/, $line)[$key_column];
		next if !defined $key;
		$line_for{$key} = $line;
	}
	close $fh;

	return %line_for;
}

my %SS = read_keyed_lines($SS_haps, 0);
my %ref = read_keyed_lines($ref_haps, 1);
my %SSopos = read_keyed_lines($SS_opos, 0);

print "ContigPos\tGenomePos\tSS_hap\tRef1_Ref2\tComparison\tPhaseBlock\n";

foreach my $pos (sort {$a <=> $b} keys %SS) {
	next if !exists $ref{$pos};

	my $ss_hap = (split /\t/, $SS{$pos})[1];

	# Skip positions where both haplotype files carry the same base.
	if (exists $SSopos{$pos}) {
		my $ss_opos = (split /\t/, $SSopos{$pos})[1];
		next if $ss_hap eq $ss_opos;
	}

	my ($contigPos, $genomePos, $gen, $ref1, $ref2, $direction, $phaseBlock) = (split /\t/, $ref{$pos});

	# Identical alleles cannot distinguish the two haplotypes.
	next if $ref1 eq $ref2;

	# Rows flagged 'rev' come from a segment where the contig runs opposite
	# to the genome, so the base is complemented to compare like with like.
	if ($direction eq 'rev') {
		$ss_hap =~ tr/ACGTacgt/TGCAtgca/;
	}

	# A '/' in the genotype marks an unphased call and takes precedence
	# over any allele match.
	my $comp = $gen =~ m{/}     ? 'unphased'
	         : $ss_hap eq $ref1 ? 'ref1'
	         : $ss_hap eq $ref2 ? 'ref2'
	         :                    'None';

	print "$contigPos\t$pos\t$ss_hap\t$ref1 $ref2\t$comp\t$phaseBlock\n";
}

