#!/usr/bin/perl

#./calculate_overlap.pl -i /data/projects/nguyen/Schnitzler/2021_02_23/symbio/Hsym.overl.out -g /data/projects/nguyen/Schnitzler/2021_02_23/symbio/Hsym_primary_v1.0.gff3 -f /data/projects/nguyen/Schnitzler/2021_02_23/symbio/Hsym_primary_v1.0.fa  -o /data/projects/nguyen/Schnitzler/2021_02_23/symbio/Hsym.transcript.genemodels.overl

use strict;
use warnings;

use Bio::Perl;
use Data::Dumper;
use Getopt::Std;
use Bio::Range;



# Help text printed (via die) when any of the four required options is missing.
my $usage =
  "\nUSAGE:\n".
  "        $0 -i [bed] -f [fasta] -g [gff3] -o [output]\n".
  "\n".
  " -i: output of overlapTranscripts.pl\n".
  " -f: fasta file\n".
  " -g: .gff3 gene models\n".
  " -o: Output file\n\n";

# Parse the command line: -i, -f, -g and -o each take a value, which
# getopt() stores in %args under the switch letter.
my %args;
getopt('ifgo', \%args);

# Plain assignments: "my $x = ... if COND" has undefined behaviour in Perl
# (see perlsyn), and a missing option simply yields undef here, which the
# check below rejects.
my $outfile     = $args{'o'};
my $overlap     = $args{'i'};
my $gene_models = $args{'g'};
my $fasta       = $args{'f'};

# All four options are mandatory.
die $usage unless ($outfile && $gene_models && $fasta && $overlap);

# Load every FASTA record and index it by display id. Each entry keeps the
# raw sequence string ('seq') and its length ('len').
my @bio_seqs = read_all_sequences($fasta, 'fasta');
my $seq_hash = {};
for my $record (@bio_seqs) {
    my $residues = $record->seq;
    $seq_hash->{ $record->display_id } = {
        'seq' => $residues,
        'len' => length($residues),
    };
}

# Mask every aligned region on its scaffold with the letter 'O', so that the
# overlap of a gene can later be obtained by counting 'O' characters in the
# gene's slice of the scaffold sequence.
#
# Input is tab-separated; lines starting with '#' and blank lines are skipped.
# Columns used: 0 = scaffold, 6 = align_start (1-based), 8 = align_length.
open(my $overlap_fh, '<', $overlap)
    or die "Can't open file for reading: $overlap: $!\n";
while (my $line = <$overlap_fh>) {
    chomp $line;
    #scaffold	gene	gene_start	gene_end	transcript_start	transcript_end	align_start	align_end	align_length
    #HyS0001	HyS0001.6	105743	106975	88378	110798	105743	106975	1233
    #HyS0001	HyS0001.6	105743	106975	105845	106179	105845	106179	335
    #HyS0001	HyS0001.7	107139	108395	88378	110798	107139	108395	1257
    #HyS0001	HyS0001.7	107139	108395	107054	107470	107139	107470	332
    #HyS0001	HyS0001.7	107139	108395	107626	107957	107626	107957	332
    next if $line =~ /^\#/ || $line !~ /\S/;

    my @arr = split("\t", $line);
    my ($scaffold, $aln_start, $aln_len) = @arr[0, 6, 8];

    # Fail with a precise location instead of an opaque substr() error.
    die "Malformed line $. in $overlap: expected 9 tab-separated columns with numeric align_start and align_length\n"
        unless defined $aln_len
            && $aln_start =~ /\A\d+\z/
            && $aln_len   =~ /\A\d+\z/;
    die "Unknown scaffold '$scaffold' at line $. of $overlap (not present in $fasta)\n"
        unless exists $seq_hash->{$scaffold}
            && defined $seq_hash->{$scaffold}->{'seq'};

    my $scaffold_len = length $seq_hash->{$scaffold}->{'seq'};
    die "align_start $aln_start is outside scaffold '$scaffold' (length $scaffold_len) at line $. of $overlap\n"
        if $aln_start < 1 || $aln_start > $scaffold_len;

    # Never mask past the scaffold end: an lvalue substr() that overruns the
    # string would lengthen the sequence with extra 'O' characters.
    my $remaining = $scaffold_len - ($aln_start - 1);
    $aln_len = $remaining if $aln_len > $remaining;

    substr($seq_hash->{$scaffold}->{'seq'}, $aln_start - 1, $aln_len) = 'O' x $aln_len;
}
close $overlap_fh;

# Collect every mRNA feature from the GFF3 file.
#   $genemodels_data->{NAME}{'s'} : 1-based start coordinate (column 4)
#   $genemodels_data->{NAME}{'l'} : feature length, end - start + 1
#   @GENES                        : mRNA names in file order
#   $gene_length                  : summed length of all collected mRNAs
my $genemodels_data = {};
my $gene_length = 0;
my @GENES = ();
open(my $gff_fh, '<', $gene_models)
    or die "Can't open file for reading: $gene_models: $!\n";
while (my $line = <$gff_fh>) {
    chomp $line;
    #HyS0001	AUGUSTUS_PASA	gene	43811	44158	.	-	.	ID=HyS0001.1g;Name=HyS0001.1
    #HyE0001	AUGUSTUS_PASA	mRNA	1777	7329	.	-	.	ID=HyE0001.1;Parent=HyE0001.1g;Name=HyE0001.1
    #HyE5562	manual	mRNA	2150	2887	.	+	.	ID=HyE5562.2;Parent=HyE5562.2g;Name=HyE5562.2|COX2
    my @arr = split("\t", $line);
    next unless defined $arr[2] && $arr[2] eq 'mRNA';

    # Take the Name attribute up to the next ';' or '|', so that a suffix
    # such as "|COX2" and any following attributes are not part of the name.
    # The attribute must start the column or follow a ';' so that keys merely
    # ending in "Name" are not picked up.
    unless (defined $arr[8] && $arr[8] =~ /(?:^|;)Name=([^;|]+)/) {
        warn "Skipping mRNA without a Name attribute at line $. of $gene_models\n";
        next;
    }
    my $name = $1;

    my $len = ($arr[4] - $arr[3]) + 1;
    push @GENES, $name;
    $genemodels_data->{$name}->{'s'} = $arr[3];
    $genemodels_data->{$name}->{'l'} = $len;
    $gene_length += $len;
}
close $gff_fh;

# Write one row per gene (name, length, masked bases, percentage masked)
# followed by the overall overlap rate across all genes.
my $total_overl = 0;
open(my $out_fh, '>', $outfile)
    or die "Can't open file for writing: $outfile: $!\n";
print {$out_fh} "#gene\tgene_length\taln_len\t%aln\n";
foreach my $gene (@GENES) {
    my $len = $genemodels_data->{$gene}->{'l'};

    # The scaffold id is the gene name without its trailing ".<number>".
    # A list-context match leaves $scf undefined when the name does not have
    # that shape, rather than reusing a capture from the previous gene.
    my ($scf) = $gene =~ /(.*)\.(\d+)/;

    my $gene_seq;
    if (defined $scf && exists $seq_hash->{$scf} && defined $seq_hash->{$scf}->{'seq'}) {
        my $sequence = $seq_hash->{$scf}->{'seq'};
        my $offset   = $genemodels_data->{$gene}->{'s'} - 1;
        # Only slice when the gene starts inside the scaffold.
        $gene_seq = substr($sequence, $offset, $len)
            if $offset >= 0 && $offset < length $sequence;
    }
    if (!$gene_seq) {
        # Unknown scaffold or coordinates outside it: report the gene and
        # record it with zero overlap.
        print "ERROR: $gene\n";
        $gene_seq = '';
    }

    # Masked positions were overwritten with 'O'; tr/// counts them in place.
    my $overl = ($gene_seq =~ tr/O//);
    $total_overl += $overl;
    my $gene_percent = $len ? ($overl / $len) * 100 : 0;
    print {$out_fh} "$gene\t", $len, "\t$overl\t", sprintf("%.2f", $gene_percent), "\n";
}

# Guard against a GFF without any mRNA feature (total length 0).
my $overlap_rate = $gene_length ? ($total_overl / $gene_length) * 100 : 0;
print {$out_fh} "OVERLAP RATE: ", sprintf("%.2f", $overlap_rate), "%\n";

# Buffered write errors only surface when the handle is closed.
close $out_fh or die "Can't close output file: $outfile: $!\n";
