use strict;
use Cwd qw(abs_path getcwd);
use Getopt::Long;
use Data::Dumper;
use File::Basename qw(basename dirname);
use FindBin qw($Bin $Script);

my $programe_dir=basename($0);
my $path=dirname($0);

my $ver    = "1.0";
my $Writer = "Daih <daih\@biomarker.com.cn>";
my $Data   = "2015/11/6";
my $BEGIN=time();
#######################################################################################

# ------------------------------------------------------------------
# GetOptions
# ------------------------------------------------------------------
my ($gatk_snp_file1,$gatk_snp_file2,$sample_list1,$sample_list2,$gff1,$gff2,$gene_match_file,$cds1,$cds2,$od);
GetOptions(
			"h|?" =>\&help,
			"od:s"=>\$od,
			"g1:s"=>\$gatk_snp_file1,
			"g2:s"=>\$gatk_snp_file2,
			"s1:s"=>\$sample_list1,
			"s2:s"=>\$sample_list2,
			"gff1:s"=>\$gff1,
			"gff2:s"=>\$gff2,
			"cds1:s"=>\$cds1,
			"cds2:s"=>\$cds2,
			"m:s"=>\$gene_match_file,
			) || &help;
&help unless ($gatk_snp_file1 && $gatk_snp_file2 && $sample_list1 && $sample_list2 && $gff1 && $gff2 && $gene_match_file && $cds1 && $cds2);

sub help
{
	print <<"	Usage End.";
    Description:
        Writer  : $Writer
        Data    : $Data
        Version : $ver
        function: ......
    Usage:
	perl XXX.pl -g1 <gatk snp vcf file 1> -g2 <gatk snp vcf file 2> -s1 <sample list file1> -s2 <sample list file2> -gff1 <gff1 file> -gff2 <gff2 file> -cds1 <tc cds file> -cds2 <bsb cds file> -m <gene match file> -od <outfile>
	-g1	SNP information from gatk	<file>
	-g2	SNP information from gatk	<file>
	-s1	sample list1			<file>
	-s2	sample list2			<file>
	-m	gene match file			<file>
	-gff1	gff1 file			<file>
	-gff2	gff2 file			<file>
	-cds1	cds1 file			<file>
	-cds2	cds2 file			<file>
	-od	outdir				<file>
	Usage End.
	exit;
}
# ------------------------------------------------------------------
# GetOptions
# ------------------------------------------------------------------
$od = abs_path ($od);
mkdir $od if (!-d $od);
$gatk_snp_file1 = abs_path($gatk_snp_file1);
$gatk_snp_file2 = abs_path($gatk_snp_file2);
$sample_list1 = abs_path($sample_list1);
$sample_list2 = abs_path($sample_list2);
$gff1 = abs_path($gff1);
$gff2 = abs_path($gff2);
$cds1 = abs_path($cds1);
$cds2 = abs_path($cds2);
$gene_match_file = abs_path($gene_match_file);
###############Time
my $Time_Start;
$Time_Start = sub_format_datetime(localtime(time()));
print "\nStart $programe_dir Time :[$Time_Start]\n\n";
################
my %basechange = ("A"=>"T", "T"=>"A", "C"=>"G", "G"=>"C", "N"=>"N", "\*"=>"\*");
print "Reading Gff files......\n";
my %gff_hash1 = &Gff($gff1);
my %gff_hash2 = &Gff($gff2);
print "Done!\n\n";
print "Reading CDS fasta files......\n";
my %cds_hash1 = &ReadCDS($cds1);
my %cds_hash2 = &ReadCDS($cds2);
print "Done!\n\n";

print "Reading sample list files......\n";
my %sample_list_hash1;
open my $SA1, "<$sample_list1";
while(<$SA1>){
	chomp;
	next if(/^#/ || /^$/);
	$sample_list_hash1{$_} = 1;
}
my %sample_list_hash2;
open my $SA2, "<$sample_list2";
while(<$SA2>){
	chomp;
	next if(/^#/ || /^$/);
	$sample_list_hash2{$_} = 1;
}
print "Done!\n\n";

print "Filter SNP......\n";
my %vcf_hash1 = &SnpFilter($gatk_snp_file1,"TC", \%sample_list_hash1);#print Dumper(\%vcf_hash1);exit;
my %vcf_hash2 = &SnpFilter($gatk_snp_file2, "BSB", \%sample_list_hash2);#print Dumper(\%vcf_hash2);exit;
print "Done!\n\n";

print "Matching started......\n";
open my $MA, "<$gene_match_file";
open my $O, ">$od/AlternativeASE.xls";
my $num=0;
while(<$MA>){
	chomp;
	$num += 1;
	my @pos = split/\t+/;
	print "line$num: $pos[0]\t$pos[3]\n";
	my $len = abs($pos[2]-$pos[1]);
	for(my $i=0;$i<=$len;$i++){
		my $tc_pos = $pos[1]+$i;
		my $bsb_pos = $pos[4]+$i;
		my($tc_genome_chr, $bsb_genome_chr);
		my($tc_genome_pos, $bsb_genome_pos);
		my($tc_gene_orien, $bsb_gene_orien);
		if(exists $gff_hash1{$pos[0]}){
			my @cds_pos = @{$gff_hash1{$pos[0]}};
			$tc_gene_orien = $cds_pos[0][-1];
			$tc_genome_chr = $cds_pos[0][0];
			$tc_genome_pos = &PosChange($tc_pos, $tc_gene_orien, \@cds_pos);
		}else{
			print "Error: Can not find gene $pos[0] in the tc gff file\n";
		}
		if(exists $gff_hash2{$pos[3]}){
			my @cds_pos = @{$gff_hash2{$pos[3]}};
			$bsb_gene_orien = $cds_pos[0][-1];
			$bsb_genome_chr = $cds_pos[0][0];
			$bsb_genome_pos = &PosChange($bsb_pos, $bsb_gene_orien, \@cds_pos);
		}else{
			print "Error: Can not find gene $pos[3] in the bsb gff file\n";
		}
		my (%tc_base, %bsb_base);
		my (%tc_base_genome, %bsb_base_genome);
		if(exists $vcf_hash1{$tc_genome_chr}{$tc_genome_pos}){
			for my $onebase(keys %{$vcf_hash1{$tc_genome_chr}{$tc_genome_pos}}){
				my $dep = $vcf_hash1{$tc_genome_chr}{$tc_genome_pos}{$onebase};
				$onebase = uc($onebase);
				$tc_base_genome{$onebase} = $dep;
				$onebase = $basechange{$onebase} if($tc_gene_orien eq "-");
				$tc_base{$onebase} = $dep;
			}
		}else{
			my $cdsbase = &GetCdsBase($pos[0], $tc_pos, \%cds_hash1);
			$cdsbase = uc($cdsbase);
			$tc_base{$cdsbase} = "CdsBase";
			$cdsbase = $basechange{$cdsbase} if($tc_gene_orien eq "-");
			$tc_base_genome{$cdsbase} = "CdsBase";
		}
		if(exists $vcf_hash2{$bsb_genome_chr}{$bsb_genome_pos}){
			for my $onebase(keys %{$vcf_hash2{$bsb_genome_chr}{$bsb_genome_pos}}){
				my $dep = $vcf_hash2{$bsb_genome_chr}{$bsb_genome_pos}{$onebase};
				$onebase = uc($onebase);
				$bsb_base_genome{$onebase} = $dep;
				$onebase = $basechange{$onebase} if($bsb_gene_orien eq "-");
				$bsb_base{$onebase} = $dep;
			}
		}else{
			my $cdsbase = &GetCdsBase($pos[3], $bsb_pos, \%cds_hash2);
			$cdsbase = uc($cdsbase);
			$bsb_base{$cdsbase} = "CdsBase";
			$cdsbase = $basechange{$cdsbase} if($bsb_gene_orien eq "-");
			$bsb_base_genome{$cdsbase} = "CdsBase";
		}
		my $signal = Judging(\%tc_base, \%bsb_base); 
		if($signal == 1){
			print $O "$tc_genome_chr\t$tc_genome_pos";
			my(@tc_final_base, @tc_final_depth);
			for my $idx(keys %tc_base_genome){
				my $count = $tc_base_genome{$idx};
				push @tc_final_base, $idx;
				push @tc_final_depth, $count;
			}
			my $tc_base_line = join",", @tc_final_base;
			my $tc_depth_line = join",", @tc_final_depth;
			print $O "\t$tc_base_line\t$tc_depth_line\t$pos[0]\t$tc_gene_orien";
			
			print $O "\t$bsb_genome_chr\t$bsb_genome_pos";
			my(@bsb_final_base, @bsb_final_depth);
			for my $idy(keys %bsb_base_genome){
				my $count = $bsb_base_genome{$idy};
				push @bsb_final_base, $idy;
				push @bsb_final_depth, $count;
			}
			my $bsb_base_line = join",", @bsb_final_base;
			my $bsb_depth_line = join",", @bsb_final_depth;
			print $O "\t$bsb_base_line\t$bsb_depth_line\t$pos[3]\t$bsb_gene_orien\n";
		}
		
	}
}
print "Done!\n\n";

sub Judging{
	my($hash1, $hash2) = @_;
	my %tc = %{$hash1};
	my %bsb = %{$hash2};
	my $sig;
	for my $tc_key(keys %tc){
		if(exists $bsb{$tc_key}){
			$sig = 0;
			last;
		}else{
			$sig = 1;
		}
	}
	return($sig);
}

sub GetCdsBase{
	my($geneid, $pos, $hash) = @_;
	my %cds_hash = %{$hash};
	if(exists $cds_hash{$geneid}){
		my $cds_seq = $cds_hash{$geneid};
		my $cdsbase = substr($cds_seq,$pos-1,1);
		return($cdsbase);
	}else{
		print "Error: Can not find the sequence of gene $geneid\n";
	}
}

sub PosChange{
	my($pos, $orien, $array) = @_;
	my @cds_pos = @{$array};
	if($orien eq "+"){
		my $accum = 0;
		for(my $i=0; $i<=$#cds_pos;$i++){
			$accum += abs($cds_pos[$i][2]-$cds_pos[$i][1])+1;
			if($accum >= $pos){
				my $genome_pos = $cds_pos[$i][2]-($accum-$pos);
				return ($genome_pos);
				last;
			}
		}
	}else{
		my $accum = 0;
		for(my $i=0; $i<=$#cds_pos;$i++){
			$accum += abs($cds_pos[$i][1]-$cds_pos[$i][2])+1;
			if($accum >= $pos){
				my $genome_pos = $cds_pos[$i][1]+($accum-$pos);
				return($genome_pos);
				last;
			}
		}
	}
}



sub SnpFilter{
	my($file, $key, $sample_list_hash) = @_;
	my %sample_list_hash = %{$sample_list_hash};
	my %vcf_hash;
	my @col_num;
	open my $VCF, "<$file";
	open my $OD, ">$od/$key.snp.filter.xls";
	print $OD "#CHROM\tPOS\tREF\tALT\tQUAL";
	while(<$VCF>){
		chomp;
		next if(/^##/);
		my @cols = split/\t+/;
		if(/^#/){
			for(my $i=5;$i<$#cols;$i+=3){
				my $id = (split/_/,$cols[$i])[0];
				if(exists $sample_list_hash{$id}){
					push @col_num, $i;
					print $OD "\t$cols[$i]\t$cols[$i+1]\t$cols[$i+2]";
				}
			}
			print $OD "\n";
		}else{
			my $flag = 0;
			my $raw = $cols[$col_num[0]];
			for my $idx(@col_num){
				if($cols[$idx] eq "N"){
					$flag = 0;
					last;
				}elsif($raw ne $cols[$idx]){
					$flag += 1;
				}
			}
			if($flag >=1){
				print $OD "$cols[0]\t$cols[1]\t$cols[2]\t$cols[3]\t$cols[4]";
				for my $idy(@col_num){
					print $OD "\t$cols[$idy]\t$cols[$idy+1]\t$cols[$idy+2]";
					my @base_type = split/,/,$cols[$idy];
					my @base_depth = split/,/,$cols[$idy+1];
					for(my $i=0;$i<=$#base_type;$i++){
						$vcf_hash{$cols[0]}{$cols[1]}{$base_type[$i]}+= $base_depth[$i];
					}
				}
				print $OD "\n";
			}
		
		}
	}
	return(%vcf_hash);
}

sub Gff{
	my $file = shift;
	my %hash;
	open my $IN, "<$file";
	while(<$IN>){
		chomp;
		next if(/^#/ || /^$/);
		next unless(/\tCDS\t/);
		my @aa = split/\t+/;
		my ($geneid) = $_ =~ /Parent=(\w+?);/;
		push @{$hash{$geneid}},[$aa[0], $aa[3], $aa[4], $aa[6]];
	}
	return(%hash);
}

sub ReadCDS{
	my $file = shift;
	my %hash;
	open my $IN, "<$file";
	$/ = ">";
	<$IN>;
	while(<$IN>){
		chomp;
		next if(/^#/ || /^$/);
		my($head, $seq) = split/\n+/,$_,2;
		my $id = (split/\s+/, $head)[0];
		$seq =~ s/\n+//g;
		$hash{$id} = $seq;
	}
	$/ = "\n";
	return(%hash);
}
###############Time
my $Time_End;
$Time_End = sub_format_datetime(localtime(time()));
print "\nEnd $programe_dir Time :[$Time_End]\n\n";
&Runtime($BEGIN);
###############Subs
sub sub_format_datetime #Time calculation subroutine
{
	my($sec, $min, $hour, $day, $mon, $year, $wday, $yday, $isdst) = @_;
	$wday = $yday = $isdst = 0;
	sprintf("%4d-%02d-%02d %02d:%02d:%02d", $year+1900, $mon+1, $day, $hour, $min, $sec);
}

sub Runtime # &Runtime($BEGIN);
{
	my ($t1)=@_;
	my $t=time()-$t1;
	print "Total $programe_dir elapsed time : [",&sub_time($t),"]\n";
}
sub sub_time
{
	my ($T)=@_;chomp $T;
	my $s=0;my $m=0;my $h=0;
	if ($T>=3600) {
		my $h=int ($T/3600);
		my $a=$T%3600;
		if ($a>=60) {
			my $m=int($a/60);
			$s=$a%60;
			$T=$h."h\-".$m."m\-".$s."s";
		}else{
			$T=$h."h-"."0m\-".$a."s";
		}
	}else{
		if ($T>=60) {
			my $m=int($T/60);
			$s=$T%60;
			$T=$m."m\-".$s."s";
		}else{
			$T=$T."s";
		}
	}
	return ($T);
}

sub ABSOLUTE_DIR #$pavfile=&ABSOLUTE_DIR($pavfile);
{
	my $cur_dir=`pwd`;chomp($cur_dir);
	my ($in)=@_;
	my $return="";
	if(-f $in){
		my $dir=dirname($in);
		my $file=basename($in);
		chdir $dir;$dir=`pwd`;chomp $dir;
		$return="$dir/$file";
	}elsif(-d $in){
		chdir $in;$return=`pwd`;chomp $return;
	}else{
		warn "Warning just for file and dir\n";
		exit;
	}
	chdir $cur_dir;
	return $return;
}

