#!/usr/bin/perl

########################################################################################
#
#	1.	Average intron length
#	2.	Average length of intergenic regions
#	3.	Average exon length
#	4.	Percentage of genome that is coding and non-coding
#	5.	GC content
#	6.	Average length of predicted transcripts (including 5' and 3' regions)
#	7.	Average protein lengths
#	8.	Average scaffold lengths
#	9. 	Calculate the mode (most frequently occurring value) for each category mentioned above (1-8 => median)
#	10.	Average number of introns per gene and average number of exons per gene
#
# ./generate_stats.pl -f /data/projects/hydractinia/Hsymbio/Assembly/Final_Assembly/modified_with_Mitochondria/Hsym_primary_v1.0.fa -d /data/projects/hydractinia/Hsymbio/Gene_Models/2017_08_15.strand_specific.170711_SONIC_HKNMYBBXX.allpaths.hc12.trinity/gff3_from_augustus_r1.withoutUTR.withoutIso/final/individuals/ -p /data/projects/hydractinia/Hsymbio/Gene_Models/2017_08_15.strand_specific.170711_SONIC_HKNMYBBXX.allpaths.hc12.trinity/gff3_from_augustus_r1.withoutUTR.withoutIso/final/Hsym_primary_v1.0.aa -o /data/projects/hydractinia/SCRIPT_OUTPUT/stats/2021_03_29/symbio/Hsym_stats.out
# 
########################################################################################


use strict;
use warnings;

use Getopt::Std;
use Data::Dumper;
use Bio::Perl;
use Storable ('dclone');


my $usage=
  "\nUSAGE:\n".
  "        $0 -f [fasta] -d [directory] -p [protein] -o [output] \n".
  "\n".
  " -f: Scaffold\n".
  " -p: Proteins\n".
  " -d: Directory containing gene models file\n".
  " -o: Output file\n\n";


my %args;
getopt('fdop', \%args);

my $fasta = $args{'f'} if $args{'f'};
my $protein = $args{'p'} if $args{'p'};
my $dir = $args{'d'} if $args{'d'};
my $output = $args{'o'} if $args{'o'};

die $usage unless ($dir && $output && $fasta && $protein);

my @bio_seqs = read_all_sequences($fasta,'fasta');

my ($mode_scaffold,$mode_protein,$mode_exon,$mode_intron,$mode_intergenics,$mode_coding,$mode_non_coding,$mode_transcript);
my ($scaffold_len,$scaffold_no)=(0,0);
my ($scaffold_hash,$scaffold_noncoding_hash);
foreach my $seq (@bio_seqs){
    my $len = length($seq->seq);
    $scaffold_hash->{$seq->display_id} = $len; 
    $scaffold_len += $len;
    $scaffold_no++;

    $scaffold_noncoding_hash->{$seq->display_id} = 1;

    #mode
    unless (exists $mode_scaffold->{$len}){
	$mode_scaffold->{$len} = 1;
    }
    else{
	$mode_scaffold->{$len} += 1;
    }

}

@bio_seqs = read_all_sequences($protein,'fasta');
my ($protein_len,$protein_no) = (0,0);
foreach my $seq (@bio_seqs){
    my $sequence = $seq->seq;
    $sequence =~ s/\*$//;
    my $prot_len = length($sequence);
    $protein_len += $prot_len;
    $protein_no++;

    #mode
    unless (exists $mode_protein->{$prot_len}){
	$mode_protein->{$prot_len} = 1;
    }
    else{
	$mode_protein->{$prot_len} += 1;
    }

}

opendir (DIR, $dir) or die "Can't open directory for reading: $dir\n";
my @gene_models = grep { /^[^\.]/ && -f $dir.$_} readdir (DIR);




my $data;
my $scf_len=0;
foreach my $contig (@gene_models){
    if ($contig =~ /\.gff3$/){
    	my $file = $dir.$contig;
	$contig =~ s/(.*)\.gff3$/$1/g;
    #$contig = "ML".$contig;
    	delete $scaffold_noncoding_hash->{$contig};

    #print "$contig\n";
   
    	my $result = getData($file);
    
    	$data->{'intron_len'} += $result->{'intron_len'};
    	$data->{'intron_no'} += $result->{'intron_no'};
    	$data->{'intergenics_len'} += $result->{'intergenics_len'};
    	$data->{'intergenics_no'} += $result->{'intergenics_no'};
    	$data->{'exon_len'} += $result->{'exon_len'};
    	$data->{'exon_no'} += $result->{'exon_no'};
   	$data->{'coding'} += $result->{'coding'};
    	$data->{'non_coding'} += $result->{'non_coding'};
    	$data->{'transcript_len'} += $result->{'transcript_len'};
   	$data->{'transcript_no'} += $result->{'transcript_no'};  

    	$scf_len +=  $scaffold_hash->{$contig};
    #print "RESULT = $contig\t$data->{'coding'}\t$data->{'non_coding'}\t$scf_len\n";
    	unless ( ($data->{'coding'} + $data->{'non_coding'}) ==  $scf_len){
	    die "CHECK coding and non-coding $contig\tcoding=$data->{'coding'}\tnon_coding=$data->{'non_coding'}\tscaffold_len=$scf_len\n";
    	}
    #print "Coding = $data->{'coding'}\tNon_coding = $data->{'non_coding'}\tLen = $scf_len\n";
     }   
}

#some scaffolds don't have genes predicted => belong to non_coding part
foreach my $seqname (keys %$scaffold_noncoding_hash){
    my $contig_len = $scaffold_hash->{$seqname};
    $data->{'non_coding'} += $contig_len;
    unless (exists $mode_non_coding->{$contig_len}){
	$mode_non_coding->{$contig_len} = 1;
    }
    else{
	$mode_non_coding->{$contig_len} += 1;
    }
   
}

open (OUT, ">$output") or die "Can't open file for writing: $output\n";
print OUT "#type\tlength\tnumber\taverage\tmode\n";

my ($arvg,$mode);
$arvg = $data->{'intron_len'} / $data->{'intron_no'};
$mode = getMode($mode_intron);
print OUT "Average intron length\t",$data->{'intron_len'},"\t",$data->{'intron_no'},"\t",sprintf("%.2f", $arvg),"\t$mode\n";

$arvg = $data->{'intergenics_len'} / $data->{'intergenics_no'};
$mode = getMode($mode_intergenics);
print OUT "Average length of intergenic regions\t",$data->{'intergenics_len'},"\t",$data->{'intergenics_no'},"\t",sprintf("%.2f", $arvg),"\t$mode\n";

$arvg = $data->{'exon_len'} / $data->{'exon_no'};
$mode = getMode($mode_exon);
print OUT "Average exon length\t",$data->{'exon_len'},"\t",$data->{'exon_no'},"\t",sprintf("%.2f", $arvg),"\t$mode\n";

$arvg = $data->{'transcript_len'} / $data->{'transcript_no'};
$mode = getMode($mode_transcript);
print OUT "Average length of predicted transcript\t",$data->{'transcript_len'},"\t",$data->{'transcript_no'},"\t",sprintf("%.2f", $arvg),"\t$mode\n";

$arvg = $data->{'coding'} / $scaffold_len;
$mode = getMode($mode_coding);
print OUT "Average coding\t",$data->{'coding'},"\t",$scaffold_len,"\t",sprintf("%.2f", $arvg*100),"\t$mode\n";

$arvg = $data->{'non_coding'} / $scaffold_len;
$mode = getMode($mode_non_coding);
print OUT "Average non-coding\t",$data->{'non_coding'},"\t",$scaffold_len,"\t",sprintf("%.2f", $arvg*100),"\t$mode\n";

$arvg = $protein_len / $protein_no;
$mode = getMode($mode_protein);
print OUT "Average protein length\t",$protein_len,"\t",$protein_no,"\t",sprintf("%.2f", $arvg),"\t$mode\n";

$arvg = $scaffold_len / $scaffold_no;
$mode = getMode($mode_scaffold);
print OUT "Average scaffold length\t",$scaffold_len,"\t",$scaffold_no,"\t",sprintf("%.2f", $arvg),"\t$mode\n";

$arvg = $data->{'intron_no'} / $data->{'transcript_no'};
print OUT "Average introns per gene\t",$data->{'intron_no'},"\t",$data->{'transcript_no'},"\t",sprintf("%.2f", $arvg),"\t.\n";

$arvg = $data->{'exon_no'} / $data->{'transcript_no'};
print OUT "Average exons per gene\t",$data->{'exon_no'},"\t",$data->{'transcript_no'},"\t",sprintf("%.2f", $arvg),"\t.\n";

close OUT;

sub getData {
    my ($file) = @_;

    open (FH, $file) or die "Can't open file for reading: $file\n";
    my ($feature,$strand,$gene_strand,$start,$end,@gene_exons,@intergenics,$gene_start,$gene_end) ;
    my ($coding,$non_coding,$gene_coding,$gene_noncoding,$gene_exon) = (0,0,0,0,0);
    my ($transcript_no,$transcript_len,$exon_no,$exon_len,$intron_no,$intron_len,$intergenics_no,$intergenics_len) = (0,0,0,0,0,0,0,0);
    
    my ($contig,$scaffold_start,$scaffold_end);
    while (<FH>){
	chomp ($_);
	#HyS0001	AUGUSTUS_PASA	gene	108440	109512	.	-	.	ID=HyS0001.8g;Name=HyS0001.8
	#HyS0001	AUGUSTUS_PASA	mRNA	108440	109512	.	-	.	ID=HyS0001.8;Parent=HyS0001.8g;Name=HyS0001.8
	#HyS0001	AUGUSTUS_PASA	five_prime_UTR	109388	109512	.	-	.	Parent=HyS0001.8
	#HyS0001	AUGUSTUS_PASA	exon	108440	109512	.	-	.	Parent=HyS0001.8
	#HyS0001	AUGUSTUS_PASA	CDS	108440	109387	.	-	0	Parent=HyS0001.8
	#
	if($_ =~ /(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)/ && $3 ne 'contig'){
	    $contig = $1;
	    $feature = $3;
	    $start = $4;
	    $end = $5;
	    $strand = $7;
	    my $len = ($end - $start) + 1;

	    if($feature eq 'gene'){
		#scaffold
		$scaffold_start = $start unless (defined $scaffold_start);
		$scaffold_end = $end;

		#transcripts
		$transcript_no++;
		$transcript_len += $len;

		#mode
		unless (exists $mode_transcript->{$len}){
		    $mode_transcript->{$len} = 1 unless $len==0;
	 	}
		else{
		    $mode_transcript->{$len} += 1;
		}


		push @intergenics,[$start,$end];

		if(defined $gene_start){
		    my $intron = getIntrons(\@gene_exons,$gene_strand);
		    my $coding_sum = $gene_coding + $intron + $gene_noncoding; #gene_noncoding=UTRs; $gene_coding=CDS 
		    my $transcript_sum = ($gene_end - $gene_start) + 1;
		    unless ( $coding_sum == $transcript_sum){
			die "CHECK scaffold $contig\t$gene_start\t$gene_end\t$coding_sum=$gene_coding:$intron:$gene_noncoding\t$transcript_sum\n";
		    }
		    unless ( ($gene_noncoding + $gene_coding) == $gene_exon){
			die "CHECK exon $contig\t$gene_start\t$gene_end\tcoding=$gene_coding\tnoncoding=$gene_noncoding\texon=$gene_exon\n";
		    }
		    
		    #introns
		    $non_coding += ($intron + $gene_noncoding);
		    $intron_len += $intron;
		    $intron_no += $#gene_exons;

		    #mode
		    unless (exists $mode_intron->{$intron}){
		    	$mode_intron->{$intron} = 1 unless $intron==0;;
	 	    }
		    else{
		    	$mode_intron->{$intron} += 1;
		    }

		    unless (exists $mode_non_coding->{$intron}){
		    	$mode_non_coding->{$intron} = 1 unless $intron==0;;
	 	    }
		    else{
		    	$mode_non_coding->{$intron} += 1;
		    }


		    #reset variables
		    $gene_coding = 0;
		    $gene_noncoding = 0;
		    $gene_exon = 0;
		    @gene_exons = ();

		}

		$gene_start = $start;
		$gene_end = $end;
		$gene_strand = $strand;
	    }	    
	    elsif($feature eq 'exon'){
		#exons
		$exon_no++;
		$exon_len += $len;
		$gene_exon += $len;

		#median
		push @gene_exons, [$start,$end];

		#mode
		unless (exists $mode_exon->{$len}){
		    $mode_exon->{$len} = 1;
	 	}
		else{
		    $mode_exon->{$len} += 1;
		}
	    }
	    elsif($feature eq 'CDS'){
		#median
		$coding += $len; #for the whole scaffold
		$gene_coding += $len; #for a gene within a scaffold

		#mode
		unless (exists $mode_coding->{$len}){
		    $mode_coding->{$len} = 1 unless $len==0;;
	 	}
		else{
		    $mode_coding->{$len} += 1;
		}

	    }
	    elsif($feature eq 'five_prime_UTR' || $feature eq 'three_prime_UTR'){
		#median
		#$non_coding += $len; #for the whole scaffold
		$gene_noncoding += $len; #for a gene within a scaffold

		#mode
		unless (exists $mode_non_coding->{$len}){
		    $mode_non_coding->{$len} = 1 unless $len==0;;
	 	}
		else{
		    $mode_non_coding->{$len} += 1;
		}

	    }
	}
	
    }#while
    close FH;

    #last gene - introns
    my $intron = getIntrons(\@gene_exons,$gene_strand);
    my $coding_sum = $gene_coding + $intron + $gene_noncoding;
    my $transcript_sum = ($gene_end - $gene_start) + 1;
    unless ( $coding_sum == $transcript_sum){
    	die "CHECK scaffold $contig\t$gene_start\t$gene_end\t$coding_sum=$gene_coding:$intron:$gene_noncoding\t$transcript_sum\n";
    }
    unless ( ($gene_noncoding + $gene_coding) == $gene_exon){
	die "CHECK exon $contig\t$gene_start\t$gene_end\tcoding=$gene_coding\tnoncoding=$gene_noncoding\texon=$gene_exon\n";
    }

    $non_coding += ($intron + $gene_noncoding);
    $intron_len += $intron;
    $intron_no += $#gene_exons;

    #mode for last gene
    unless (exists $mode_intron->{$intron}){
	$mode_intron->{$intron} = 1 unless $intron==0;;
    }
    else{
	$mode_intron->{$intron} += 1;
    }

    unless (exists $mode_non_coding->{$intron}){
	$mode_non_coding->{$intron} = 1 unless $intron==0;;
    }
    else{
	$mode_non_coding->{$intron} += 1;
    }

    #intergenics
    #median
    $intergenics_len = getIntrons(\@intergenics,$gene_strand,1);
    $intergenics_no = $#intergenics;
    $non_coding += $intergenics_len;

    #mode
    unless (exists $mode_intergenics->{$intergenics_len}){
	$mode_intergenics->{$intergenics_len} = 1 unless $intergenics_len==0;
    }
    else{
	$mode_intergenics->{$intergenics_len} += 1;
    }

    
    my $data;
    
    $data->{'intron_len'} = $intron_len;
    $data->{'intron_no'} = $intron_no;
    $data->{'intergenics_len'} = $intergenics_len;
    $data->{'intergenics_no'} = $intergenics_no;
    $data->{'exon_len'} = $exon_len;
    $data->{'exon_no'} = $exon_no;
    $data->{'coding'} = $coding;
    my $ncoding = ($scaffold_start - 1) + ($scaffold_hash->{$contig} - $scaffold_end); #non_gene
    $data->{'non_coding'} = $non_coding + $ncoding;
    $data->{'transcript_len'} = $transcript_len;
    $data->{'transcript_no'} = $transcript_no;
    $data->{'scaffold_start'} = $scaffold_start;
    $data->{'scaffold_end'} = $scaffold_end;
    
    #test
    unless ( ($transcript_len + $intergenics_len + $ncoding) ==  $scaffold_hash->{$contig}){
	die "CHECK whole scaffold $contig\tncoding=$ncoding\ttranscript=$transcript_len\tintergenics=$intergenics_len\tscaffold_len=$scaffold_hash->{$contig}\n";
    }
    unless ( ($data->{'coding'} + $data->{'non_coding'}) ==  $scaffold_hash->{$contig}){
	die "CHECK coding $contig\tcoding=$coding\tnon_coding=$non_coding\tncoding=$ncoding\tscaffold_len=$scaffold_hash->{$contig}\n";
    }
    #print "DATA = $contig\t$data->{'coding'}\t$data->{'non_coding'}\t$scaffold_hash->{$contig}\n";
    return $data;  
}

sub getIntrons {
    my ($exons,$strand,$flag) = @_;

    my $arr = dclone($exons);    
    
    if(!$flag && $strand eq '-'){
	@$arr = reverse(@$arr);
    }
   # print Dumper($arr),"\n";
    my ($intron_hash,$prev_end);
    my $intron=0;
    foreach my $coord (@$arr){
	my $start = $coord->[0];
	my $end = $coord->[1];

	if(defined $prev_end){
	    my $len = ($start - $prev_end) - 1;
	    $intron += $len;
	}
	$prev_end = $end;
    }
    #print "Intron = $intron\n";
    return $intron;
}

sub getMode {
    my ($hash) = @_;

    my ($mode,$max_value) = (-1,-1);
    foreach my $key (keys %$hash){
	my $value = $hash->{$key};
	if ($max_value < $value){
	    $max_value = $value;
	    $mode = $key;     
	}
    }
    
    return $mode;
}


