#!/usr/bin/env perl
############################################################
##                                                        ##
##                    VCF_TO_FASTA.PL                     ##
##                                                        ##
##                       Programmed by MASAFUMI NOZAWA    ##
##                       Last Modified on 2019.4.26       ##
##                                                        ##
############################################################

#All five positional arguments are required. A wrong invocation is an error,
#so the usage text goes to STDERR and the exit status is non-zero, letting
#calling shells and pipelines detect the failure.
if(scalar(@ARGV)<5){
	print STDERR "\n";
	print STDERR "This program makes an alternative assembly based on vcf information.\n";
	print STDERR "[Usage]\n";
	print STDERR "perl vcf_to_fasta.pl ref_fasta_file vcfM_file vcfF_file coverage_file(including positions with zero coverage) output_file\n";
	exit(1);
}

#genotype-quality cutoff: a call whose quality value is below this is ignored
our $min_gqual=20;

#depth cutoff: a call whose depth value is below this is ignored
our $min_depth=5;

#report when the run started
our $printtime=scalar(localtime);
print "Script start at $printtime\n";

#Open the output FASTA. The three-argument form treats the file name purely
#as a name (no mode characters or surrounding whitespace are interpreted),
#and the message reports which file failed and why.
open(OUT,'>',$ARGV[4]) or die "Cannot open output file '$ARGV[4]' for writing: $!\n";

#reading the reference genome
#Fills two parallel arrays: $name[$k] is the first whitespace-delimited word
#of the k-th header line (without '>') and $seq[$k] is an array ref holding
#one element per base of that record.
#Sequence lines are appended to the record opened by the most recent header,
#so both single-line and line-wrapped FASTA files are read correctly.
open(my $ref_fh,'<',$ARGV[0]) or die "Cannot open reference FASTA '$ARGV[0]': $!\n";
@name=();
@seq=();
while(my $ref_line=<$ref_fh>){
	#strip LF as well as CRLF line endings
	$ref_line=~s/[\r\n]+\z//;
	if($ref_line=~/^>(\S*)/){
		#a header starts a new, initially empty record
		push(@name,$1);
		push(@seq,[]);
		print("$name[-1]\n");
	}elsif($ref_line ne ''){
		#blank lines are ignored; data before any header cannot be attributed
		die "Sequence data found before the first header in '$ARGV[0]' (line $.)\n" unless @name;
		push(@{$seq[-1]},split(//,$ref_line));
	}
}
close($ref_fh);
$printtime=localtime;
print("Reading the reference genome: Done at $printtime\n");

#change ref nucl with N if coverage is zero
#The coverage file is read as tab-separated lines of
#  sequence_name <TAB> 1-based_position <TAB> depth
#and every position whose depth is 0 is overwritten with 'N' in @seq.

#sequence name -> index in @name/@seq; the first occurrence of a name wins
my %cov_seq_index;
for my $idx (0..$#name){
	$cov_seq_index{$name[$idx]}=$idx unless exists $cov_seq_index{$name[$idx]};
}

open(my $cov_fh,'<',$ARGV[3]) or die "Cannot open coverage file '$ARGV[3]': $!\n";
while(my $cov_line=<$cov_fh>){
	chomp($cov_line);
	next if $cov_line eq '';
	my($cov_name,$cov_pos,$cov_depth)=split(/\t/,$cov_line);
	#a line without three columns would otherwise compare undef==0 as true
	die "Malformed line $. in coverage file '$ARGV[3]': expected name, position and depth separated by tabs\n" unless defined $cov_depth;
	next unless $cov_depth==0;

	my $idx=$cov_seq_index{$cov_name};
	die "Sequence '$cov_name' (line $. of '$ARGV[3]') is not present in the reference\n" unless defined $idx;
	#position 0 would address the last base through index -1, and a position
	#past the end would silently lengthen the sequence
	my $seq_length=scalar(@{$seq[$idx]||[]});
	if(($cov_pos<1)||($cov_pos>$seq_length)){
		die "Position '$cov_pos' (line $. of '$ARGV[3]') is outside sequence '$cov_name' of length $seq_length\n";
	}
	$seq[$idx][$cov_pos-1]='N';
}
close($cov_fh);
$printtime=localtime;
print("Change non covered nucleotides with N: Done at $printtime\n");


$n_rep=0;

#Write one variant into the in-memory reference and count it in $n_rep.
#  $seq_index : index into @seq of the affected sequence
#  $pos       : 1-based POS of the variant
#  $ref       : REF allele
#  $alt       : ALT allele to write
#The whole ALT string is stored in the cell of the first REF base and the
#cells of the remaining REF bases are emptied. An indel therefore changes the
#length of the joined sequence while all other cells keep their reference
#coordinate, so later variants can still be addressed by POS.
sub _apply_alt_allele{
	my($seq_index,$pos,$ref,$alt)=@_;
	$seq[$seq_index][$pos-1]=$alt;
	#empty range when REF is a single base
	for my $cell ($pos..$pos+length($ref)-2){
		$seq[$seq_index][$cell]='';
	}
	$n_rep++;
}

#sequence name -> index in @name/@seq; the first occurrence of a name wins
my %seq_index_of;
for my $idx (0..$#name){
	$seq_index_of{$name[$idx]}=$idx unless exists $seq_index_of{$name[$idx]};
}

#Index the second VCF once by CHROM/POS/REF/ALT instead of re-reading the
#file for every variant of the first VCF. For a coordinate-sorted VCF this
#finds exactly the records the sequential scan with early exit would find.
my %shared_variant;
open(my $vcf2_fh,'<',$ARGV[2]) or die "Cannot open VCF file '$ARGV[2]': $!\n";
while(my $vcf2_line=<$vcf2_fh>){
	chomp($vcf2_line);
	next if $vcf2_line=~/^#/;
	my @col2=split(/\s+/,$vcf2_line);
	next if $#col2<4;
	$shared_variant{join("\t",@col2[0,1,3,4])}=1;
}
close($vcf2_fh);

#Walk the first VCF and apply every variant that passes the depth and
#genotype-quality thresholds and is absent from the second VCF.
open(IN,'<',$ARGV[1]) or die "Cannot open VCF file '$ARGV[1]': $!\n";
while(my $line=<IN>){
	chomp($line);
	next if $line=~/^#/;
	my @col=split(/\s+/,$line);
	#without a sample column there is no genotype to evaluate
	next if $#col<9;
	my($ch,$posi,$refe,$alte)=@col[0,1,3,4];

	#Read the sample sub-fields by their FORMAT key rather than by fixed
	#position; for the layout GT:AD:DP:GQ... both give the same values.
	my @format_key=split(/:/,$col[8]);
	my @sample=split(/:/,$col[9]);
	my %field;
	@field{@format_key}=@sample;
	my($status,$allele_depths,$depth,$gqual)=@field{qw(GT AD DP GQ)};
	next unless defined($status)&&defined($depth)&&defined($gqual);
	next unless ($depth>=$min_depth)&&($gqual>=$min_gqual);

	#skip variants that the second VCF also reports
	next if $shared_variant{join("\t",$ch,$posi,$refe,$alte)};

	my @alts=split(/,/,$alte);
	die "No ALT allele at $ch:$posi in '$ARGV[1]'\n" unless @alts;

	#records on sequences that are not in the reference are ignored
	my $seq_index=$seq_index_of{$ch};
	next unless defined $seq_index;

	#NOTE(review): ALT is written verbatim, so a symbolic allele such as '*'
	#would be inserted literally - confirm such records are filtered upstream.
	if(@alts==1){
		#only unphased heterozygous calls are applied ('1/1' is excluded on
		#purpose); NOTE(review): a phased '0|1' is not matched - confirm
		if($status eq '0/1'){
			_apply_alt_allele($seq_index,$posi,$refe,$alts[0]);
		}
	}else{
		#any genotype that carries a non-reference allele qualifies
		next unless $status=~/[1-9]/;
		#AD lists the REF depth first, so $ad[$k+1] belongs to $alts[$k].
		#The ALT with the highest depth is used; on a tie the earlier one
		#wins, and a missing depth counts as 0. Any number of ALT alleles
		#is accepted.
		my @ad=split(/,/,defined($allele_depths) ? $allele_depths : '');
		my $best=0;
		my $best_depth=defined($ad[1]) ? $ad[1] : 0;
		for my $k (1..$#alts){
			my $k_depth=defined($ad[$k+1]) ? $ad[$k+1] : 0;
			if($k_depth>$best_depth){
				$best=$k;
				$best_depth=$k_depth;
			}
		}
		_apply_alt_allele($seq_index,$posi,$refe,$alts[$best]);
	}
}
$printtime=localtime;
print("Change reference nucleotides with alternative nucleotides: Done at $printtime\n");

#Write every sequence as a two-line FASTA record (header, then all cells of
#the sequence joined into one line).
for my $idx (0..$#name){
	my $sequence=join('',@{$seq[$idx]||[]});
	print(OUT ">$name[$idx]\n$sequence\n") or die "Cannot write to output file '$ARGV[4]': $!\n";
}

close(IN);
#Buffered write errors (e.g. a full disk) only surface at close, so check it
#before reporting success.
close(OUT) or die "Cannot finish writing output file '$ARGV[4]': $!\n";

$printtime=localtime;
print("All Done! at $printtime\n");
print("no of replacements\t");
print("$n_rep\n");
