#!/usr/bin/env perl
# --------------------------------------------
# Alex Lomsadze
# GaTech
# Last update 2021
#
# Enrich GFF3 file:
#     * add CDS intron GFF lines
#     * add start and stop codon GFF lines 
#     * add HMM state label to CDS GFF lines: Partial, Single, Initial, Internal or Terminal
#     * add splice site dinucleotides: gt_ag, gc_ag, etc to intron GFF lines
#     * add start and stop sequence to start/stop codon GFF lines
#     * compare calculated features with existing - for some of the features - intron/utr
# To do:
#     * separate intorns in UTRs and CDS   -!
#
# Input file must be in "nice" GFF3 format
# for example, file generated by "gt" code with --nice option
# --------------------------------------------

use strict;
use warnings;

use Getopt::Long;
use Data::Dumper;

my $VERSION = "v7_2023";

# --------------------------------------------
my $in_gff3 = '';          # input GFF3 file
my $out_gff3 = '';         # output GFF3 file

my $verbose = '';          #
my $warnings = '';         #
my $debug = '';            #

my $gseq_in = '';          # input file with genome sequence FASTA
my $tseq_out = '';         # output file for transcriptome FASTA sequence; transcripts are reverse complemented when on reverse strand
my $tgff_out = '';         # output file for transcriptome annotation GFF

my $min_intron = 10;       # minimum length between cds-cds to get label intron; shorter length is labeled as "gap" - most probably these gaps are frame-shifts
my $no_gc = 0;             # skip calculation of sequence GC features
my $adjust_cds = 0;        # move exon CDS borders of incomplete CDS to full codon

my $get_sites = 0;         # output seq around sites as in GeneMark.hmm model

# not supported yet
my $CDSseq_out = '';       # output file for CDS sequence FASTA
my $cds_gene_only = '';    # output only information about protein coding transcripts

# --------------------------------------------
Usage() if ( @ARGV < 1 );
ParseCMD();
CheckBeforeRun();
# --------------------------------------------

# Download genome - optional
my %genome = ();
LoadGenome( $gseq_in, \%genome ) if $gseq_in;

# put all GFF lines from one gene into this array
# genes are proceed one gene at a time
# this array is reused for the next gene
my @record = ();
my %meta = ();

# helper for debug on file level
my %info = ();
$info{"genes_in_input"} = 0;
$info{"CDS_genes_in_input"} = 0;
$info{"annot_introns_in_input"} = 0;
$info{"calc_introns_match_annot"} = 0;
$info{"calc_introns_mismatch_annot"} = 0;
$info{"annot_introns_mismatch_calc"} = 0;

# transcript and CDS sequences - all genes
#my %tseq = ();

print "# starting input GFF3 parsing\n" if $verbose;

open( my $IN, $in_gff3 ) or die "error on open file $in_gff3: $!\n";
open( my $OUT, ">", $out_gff3 ) or die "error on open file $out_gff3: $!\n";

my $TSEQ_OUT;
if ( $tseq_out )
{
	open( $TSEQ_OUT, ">", $tseq_out ) or die "error on open file $tseq_out: $!\n";
}

my $TGFF_OUT;
if ( $tgff_out )
{
	open( $TGFF_OUT, ">", $tgff_out ) or die "error on open file $tgff_out: $!\n";
}

while(my $line = <$IN>)
{
	# skip empty lines
	next if( $line =~ /^\s*$/ );

	# keep GFF3 support lines in output
	if ( $line =~ /^#[^#]/ or $line =~ /^##[^#]/ )
	{
		print $OUT $line;
		next;
	}

	# one full gene was placed into record
	if ( $line =~ /^#{3}\s*$/ )
	{
		if (IsEmpty(\@record))
		{
			@record = ();
			%meta = ();
			next;
		}

		my @new_record = EnrichRecord(\@record, $gseq_in);

		PrintRecord(\@new_record);
		print $OUT $line;

		PrintTseqRecord($TSEQ_OUT) if $tseq_out;
		PrintTgffRecord($TGFF_OUT) if $tgff_out;

		$info{"genes_in_input"} += 1;
		$info{"CDS_genes_in_input"} += 1 if IsCDSgene(\@new_record);
		
		if ( $verbose and( $info{"genes_in_input"} % 1000 == 0 ))
		{
			print "# processed genes and CDS genes: ". $info{"genes_in_input"} ." ". $info{"CDS_genes_in_input"} ."\n";
		}

		@record = ();
		%meta = ();
	}
	else
	{
		AddLineToRecord( $line, \@record );
	}
}

close $TSEQ_OUT if $tseq_out;
close $TGFF_OUT if $tgff_out;
close $OUT;
close $IN;

if ( @record )
	{ die "error, check input GFF3 file format, array must be empty at this point\n"; }

if ($verbose)
{
	print Dumper(\%info);
	print "# done\n";
}

# --------------------------------------------
sub IsEmpty
{
	my $ref = shift;

	my $str_status = 0;

	foreach my $str (@{$ref})
	{
		$str_status = 1 if ($str =~ /\S/);
	}

	$str_status = ($str_status == 1) ? 0 : 1;

	return $str_status;
}
# --------------------------------------------
sub PrintTgffRecord
{
	my $OUT = shift;

	foreach my $tid (keys %meta)
	{
		next if ! $meta{$tid}{'is_cod'};

		my $line = $meta{$tid}{'tid'};
		$line .= "\ttransfer_from_dna";
		$line .= "\tCDS";
		$line .= "\t". ($meta{$tid}{'utr5_length'} + 1);
		$line .= "\t". ($meta{$tid}{'t_length'} - $meta{$tid}{'utr3_length'} );
		$line .= "\t0";
		$line .= "\t+";
		$line .= "\t0";
		$line .= "\tgene_id \"". $meta{$tid}{'gid'} ."\"; transcript_id \"". $meta{$tid}{'tid'} ."\";\n";

		print $OUT $line;
	}
}
# --------------------------------------------
sub GC
{
	my $s = shift;

	my $count_gc = 0;
	my $count_at = 0;

	$count_gc = () = $s =~ m/[gcGC]/g;
	$count_at = () = $s =~ m/[atAT]/g;

	my $gc = 0;

	if ( $count_at + $count_at == 0 )
	{
		print "# warning, no ATCG letters in sequence\n";
	}
	else
	{
		$gc = int( 100 * $count_gc / ($count_gc + $count_at));
	}

	return $gc;
}
# --------------------------------------------
sub AddTagBool
{
	my $bool = shift;
	my $label = shift;
	
	$bool = 0 if (!defined $bool);

	return "\t". $label ."=". $bool;
}
# --------------------------------------------
sub PrintTseqRecord
{
	my $OUT = shift;

	my $letters_per_line = 100;

	foreach my $tid (keys %meta)
	{
		my $defline = ">". $meta{$tid}{'tid'} ."\t". $meta{$tid}{'gid'};

		$defline .= AddTagBool( $meta{$tid}{'GC'}, "GC" );
		$defline .= AddTagBool( $meta{$tid}{'GC_on_genome'}, "GCg" );
		$defline .= AddTagBool( $meta{$tid}{'is_cod'}, "IsCOD" );
		$defline .= AddTagBool( $meta{$tid}{'is_canonical'}, "IsCanonical" );
		$defline .= AddTagBool( $meta{$tid}{'is_complete'}, "IsComplete" );
		$defline .= AddTagBool( $meta{$tid}{'is_pseudo'}, "IsPseudo" );
		$defline .= AddTagBool( $meta{$tid}{'t_length'}, "t_length" );

		if ( $meta{$tid}{'is_cod'} )
		{
			$defline .= AddTagBool( $meta{$tid}{'cod_length'}, "c_length" );
			$defline .= AddTagBool( $meta{$tid}{'utr5_length'}, "5utr" );
			$defline .= AddTagBool( $meta{$tid}{'utr3_length'}, "3utr" );
		}
		else
		{
			$defline .= AddTagBool( -1, "c_length" );
			$defline .= AddTagBool( -1, "5utr" );
			$defline .= AddTagBool( -1, "3utr" );
		}

		$defline .= AddTagBool( $meta{$tid}{'transcript_support_level'}, "transcript_support_level" );
		$defline .= AddTagBool( $meta{$tid}{'level'}, "level" );
		$defline .= AddTagBool( $meta{$tid}{'CCDS'}, "CCDS" );
		$defline .= AddTagBool( $meta{$tid}{'appris_principal_1'}, "appris_principal_1" );
		$defline .= AddTagBool( $meta{$tid}{'basic'}, "basic" );

		print $OUT $defline ."\n";

		my $pos = 0;
		while ( $pos < length( $meta{$tid}{'seq'} ))
		{
			print $OUT substr($meta{$tid}{'seq'}, $pos, $letters_per_line) ."\n";
			$pos += $letters_per_line;
		}
	}
}
# --------------------------------------------
sub CreateTseq
{
	my $ref = shift;
	my $h_genome = shift;
	my $tid = shift;

	my $s = '';
	my $strand = '';

	foreach my $entry ( @{$ref} )
	{
		$strand = $entry->[6];

		if ($warnings)
		{
			if (( $entry->[2] !~ /exon/ )and( $entry->[2] !~ /pseudogenic_exon/ ))
				{ die "error, exon type is expected: $entry->[2]\n"; }
		}

		$s .= substr( $h_genome->{$entry->[0]}, $entry->[3] - 1, $entry->[4] - $entry->[3] + 1 );
	}

	if ( $strand eq '+' )
	{
		;
	}
	elsif ( $strand eq '-' )
	{
		$s = RevComp($s);
	}
	elsif ( $strand eq '.' )
	{
		;
	}

	$meta{$tid}{'seq'} = $s;
	$meta{$tid}{'strand'}  = $strand;

	if (!$no_gc)
	{
		if ( length($s) != 0 )
		{
			$meta{$tid}{'GC'} = GC($s);
			$meta{$tid}{'GC_on_genome'} = GC( substr( $h_genome->{$ref->[0][0]}, $ref->[0][3] -1 , $ref->[-1][4] -  $ref->[0][3] + 1));
		}
		else
		{
			print "warning, transcript length is zero: $tid\n";
		}
	}
}
# --------------------------------------------
sub LoadGenome
{
	my $fname = shift; # source
	my $ref = shift; # destination

	my $seqid = '';

	open( my $IN, $fname ) or die "error on open file $fname: $!\n";
	while(my $line = <$IN>)
	{
		next if ( $line =~ /^#/ );
		next if ( $line =~ /^\s*$/ );

		if ( $line =~ /^\s*>/ )
		{
			if ( $line =~ /^>\s*(\S+)\s+/ )
			{
				$seqid = $1;
				print "# seqid: $seqid\n" if $verbose;
			}
			else
				{ die "error, unexpected defline format found: $line\n"; }
		}
		else
		{
			if ( ! $seqid )
				{ die "error, seqid is missing\n"; }

			$line =~ s/[\s0-9]//g;

			$ref->{$seqid} .= $line;
		}
	}
	close $IN;

	if ($verbose)
	{
		print "# genome sequence from file: $fname\n";
		print "# number of sequences: ". (scalar keys %{$ref}) ."\n";
	}
}
# --------------------------------------------
sub GetLength
{
	my $ref = shift;

	my $len = 0;

	foreach my $entry (@{$ref})
	{
		$len += ($entry->[4] - $entry->[3] + 1)
	}

	return $len;
}
# --------------------------------------------
sub CheckCanonical
{
	my $tid = shift;

	my $is_canonical = 1;

	$is_canonical = 0 if !$meta{$tid}{'is_cod'};

	$is_canonical = 0 if ! exists $meta{$tid}{'has_supported_start'};
	$is_canonical = 0 if ! exists $meta{$tid}{'has_supported_stop'};
	$is_canonical = 0 if ! exists $meta{$tid}{'has_supported_introns'};
	$is_canonical = 0 if ! exists $meta{$tid}{'has_gap'};

	if ( $is_canonical )
	{
		$is_canonical = 0 if !$meta{$tid}{'has_supported_start'};
		$is_canonical = 0 if !$meta{$tid}{'has_supported_stop'};
		$is_canonical = 0 if !$meta{$tid}{'has_supported_introns'};
		$is_canonical = 0 if !$meta{$tid}{'has_gap'};
	}

	$meta{$tid}{'is_canonical'} = $is_canonical;
}
# --------------------------------------------
sub CheckComplete
{
	my $tid = shift;

	my $is_complete = 1;

	$is_complete = 0 if !$meta{$tid}{'is_cod'};

	$is_complete = 0 if ! exists $meta{$tid}{'has_supported_start'};
	$is_complete = 0 if ! exists $meta{$tid}{'has_supported_stop'};
	$is_complete = 0 if ! exists $meta{$tid}{'change_in_phase'};

	if ( $is_complete )
	{
		$is_complete = 0 if !$meta{$tid}{'has_supported_start'};
		$is_complete = 0 if !$meta{$tid}{'has_supported_stop'};
		$is_complete = 0 if $meta{$tid}{'change_in_phase'};
	}

	$meta{$tid}{'is_complete'} = $is_complete;
}
# --------------------------------------------
sub CalcUTRs
{
	my $cds_ref = shift;
	my $exon_ref = shift;
	my $tid = shift;

	my %cds_to_exon_index = ();
	my %exon_to_cds_index = ();

	my $total_exons = scalar @{$exon_ref};
	my $total_cds = scalar @{$cds_ref};

	my $i = 1; # index+1 on exon arr
	my $j = 1; # index+1 on CDS arr

	while( $i < $total_exons + 1 )
	{
		if ( $exon_ref->[$i - 1][4] <  $cds_ref->[0][4] )
		{
			$exon_to_cds_index{$i} = 0;
			$i += 1;
		}
		elsif ( $exon_ref->[$i - 1][4] >= $cds_ref->[0][4] )
		{
			$exon_to_cds_index{$i} = 1;
			$cds_to_exon_index{1} = $i;
			$j += 1;
			$i += 1;
			last
		}
	}

	while( $j < $total_cds + 1)
	{
		if (  $exon_ref->[$i -1][3] == $cds_ref->[$j -1][3] )
		{
			$exon_to_cds_index{$i} = $j;
			$cds_to_exon_index{$j} = $i;
			$i+=1;
			$j+=1;
		}
		else
		{
			$i+=1;
			last;
		}
	}

	while( $i < $total_exons + 1 )
	{
		$exon_to_cds_index{$i} = 0;
		$i += 1;
	}

	my @arr = (sort{$a<=>$b} keys %cds_to_exon_index);

	# ----
	my $utr_L = 0;
	my $utr_R = 0;

	my $first_cds_idx = $arr[0];
	my $last_cds_idx = $arr[-1];
	my $first_exon_match = $cds_to_exon_index{$first_cds_idx};
	my $last_exon_match = $cds_to_exon_index{$last_cds_idx};

	for( my $idx = 1; $idx < $first_exon_match; $idx += 1 )
	{
		$utr_L += ( $exon_ref->[$idx -1][4] - $exon_ref->[$idx -1][3] + 1 );
	}
	$utr_L += ( $cds_ref->[$first_cds_idx -1][3] - $exon_ref->[$first_exon_match -1][3] );

	for( my $idx = $total_exons; $idx > $last_exon_match; $idx -= 1 )
	{
		$utr_R += ( $exon_ref->[$idx -1][4] - $exon_ref->[$idx -1][3] + 1 );
	}
	$utr_R += ( $exon_ref->[$last_exon_match -1][4] - $cds_ref->[$last_cds_idx -1][4] );

	if ( $cds_ref->[0][6] eq '+' )
	{
		$meta{$tid}{'utr5_length'} = $utr_L;
		$meta{$tid}{'utr3_length'} = $utr_R;
	}
	elsif ( $cds_ref->[0][6] eq '-' )
	{
		$meta{$tid}{'utr5_length'} = $utr_R;
		$meta{$tid}{'utr3_length'} = $utr_L;
	}
}
# --------------------------------------------
sub ParsePseudo
{
	my $ref = shift;
	my $tid = shift;

	$meta{$tid}{'is_pseudo'} = 0;

	foreach my $entry (@{$ref})
	{
		if ($entry->[8] =~ /pseudogene/)
		{
			$meta{$tid}{'is_pseudo'} = 1;
		}
	}
}
# --------------------------------------------
sub CheckInFrameStops
{
	my $tid = shift;

	my $utr5  = $meta{$tid}{'utr5_length'};
	my $utr3  = $meta{$tid}{'utr3_length'};
	my $total = $meta{$tid}{'t_length'};
	my $cds   = $meta{$tid}{'cod_length'};

	if ( $utr5 + $utr3 + $cds != $total )
	{
		print Dumper($meta{$tid});
		die "error, in parsing CDS on transcript level\n";
	}

	for( my $pos = $utr5; $pos < $utr5 + $cds - 3; $pos += 3 )
	{
		my $codon = substr( $meta{$tid}{'seq'}, $pos, 3 );
		if ($codon =~ /TAA|TAG|TGA/)
		{
			print "# in frame stop $meta{$tid}{'tid'}\n";
		}
	}
}
# --------------------------------------------
sub AdjustCDS
{
	my $ref = shift;
	my $tid = shift;

	$meta{$tid}{'change_in_phase'} = 0;

	if ( $ref->[0][6] eq '+' )
	{
		if ( $ref->[0][7] != 0 )
		{
			if ( $ref->[0][3] + $ref->[0][7] <= $ref->[0][4] )
			{
				$ref->[0][3] += $ref->[0][7];
				$ref->[0][7] = 0;
			}
			else
			{
				print "# warning, short cds". $meta{$tid}{'tid'} ."\n";
			}
			$meta{$tid}{'change_in_phase'} = 1;
		}
	}
	elsif ( $ref->[-1][6] eq '-' )
	{
		if ( $ref->[-1][7] != 0 )
		{
			if ( $ref->[-1][4] - $ref->[-1][7] >= $ref->[-1][3] )
			{
				$ref->[-1][4] -= $ref->[-1][7];
				$ref->[-1][7] = 0;
			}
			else
			{
				print "# warning, short cds". $meta{$tid}{'tid'} ."\n";
			}
			$meta{$tid}{'change_in_phase'} = 1;
		}
	}

	$meta{$tid}{'cod_length'} = GetLength($ref);

	my $mod = $meta{$tid}{'cod_length'} % 3;

	if ( $mod != 0 )
	{
		if ( $ref->[-1][6] eq '+' )
		{
			if ( $ref->[-1][4] - $mod >= $ref->[-1][3] )
			{
				$ref->[-1][4] -= $mod;
			}
			else
			{
				print "# warning, short cds". $meta{$tid}{'tid'} ."\n";
			}
			$meta{$tid}{'change_in_phase'} = 1;
		}
		elsif ( $ref->[0][6] eq '-' )
		{
			if ( $ref->[0][3] + $mod <= $ref->[0][4] )
			{
				$ref->[0][3] += $mod;
			}
			else
			{
				print "# warning, short cds". $meta{$tid}{'tid'} ."\n";
			}
			$meta{$tid}{'change_in_phase'} = 1;
		}
	}

	$meta{$tid}{'cod_length'} = GetLength($ref);
}
# -------------------------------------------
sub ParseGap
{
	my $ref = shift;
	my $tid = shift;

	$meta{$tid}{'has_gap'} = 0;

	foreach my $entry (@{$ref})
	{
		if ( $entry->[2] eq "gap" )
		{
			$meta{$tid}{'has_gap'} = 1;
		}
	}
}
# -------------------------------------------
sub HasGap
{
	my $ref = shift;
	my $id = shift;
	my $has_gap = 0;

	foreach my $entry (@{$ref})
	{
		if ( $entry->[2] eq "gap" )
		{
			$has_gap = 1;
		}
	}

	return $has_gap;
}
# --------------------------------------------
sub ParseAnnotationQuality
{
	my $ref = shift;
	my $tid = shift;

	foreach my $entry (@{$ref})
	{
		$meta{$tid}{'transcript_support_level'} = 0;
		if ($entry->[8] =~ /;transcript_support_level=(\d+)/)
		{
			$meta{$tid}{'transcript_support_level'} = $1;
		}

		$meta{$tid}{'level'} = 0;
		if ($entry->[8] =~ /;level=(\d+)/)
		{
			$meta{$tid}{'level'} = $1;
		}

		$meta{$tid}{'CCDS'} = 0;
		$meta{$tid}{'appris_principal_1'} = 0;
		$meta{$tid}{'basic'} = 0;

		if ( $entry->[8] =~ /[\t ;]tag=([^;]*)/ )
		{
			my $value = $1;

			$meta{$tid}{'CCDS'} = 1 if ($value =~ /CCDS/);
			$meta{$tid}{'appris_principal_1'} = 1 if ($value =~ /appris_principal_1/);
			$meta{$tid}{'basic'} = 1 if ($value =~ /basic/);
		}
	}
}
# --------------------------------------------
sub EnrichRecord
{
	# ref on array of arrays of GFF3 values from one gene
	my $ref = shift;
	# parse sequence or not
	my $use_seq = shift;
	# return enriched record
	my @new_rec = ();

	# with features
	my @gene = ();
	my @transcripts = ();
	my @exons = ();
	my @cds = ();
	my @introns = ();
	my @start_codon = ();
	my @stop_codon = ();

	# Find "gene" line and put it into @gene
	@gene = GetGene($ref);
	if ( (scalar @gene) == 0 ) { print Dumper($ref); die "error, no gene in record\n"; }
	my $gene_id = GetGeneID(\@gene);

	# prepare mrna lines
	@transcripts = SelectTranscripts($ref);
	if ( (scalar @transcripts) == 0 ) { print Dumper($ref); die "error, no transcript in record\n"; }
	AddCountToAttrSimple(\@transcripts);

	my %mrna = GetMrnaIDs(\@transcripts);
	SplitByMRNA( $ref, \%mrna );

	# strt output array
	push @new_rec, @gene;

	foreach my $key ( keys %mrna )
	{
		@exons = SelectExonsSorted($mrna{$key});
		@cds = SelectCDSsorted($mrna{$key});

		$meta{$key}{'gid'} = $gene_id;
		$meta{$key}{'tid'}= $key;
		$meta{$key}{'t_length'} = GetLength(\@exons);
		$meta{$key}{'cod_length'} = GetLength(\@cds);

		ParseAnnotationQuality($mrna{$key}, $key);
		ParsePseudo($mrna{$key}, $key);
		ParseGap($mrna{$key}, $key);

		foreach my $entry (@exons) { $entry->[8] = "Parent=". $key .";"; }
		foreach my $entry (@cds)   { $entry->[8] = "Parent=". $key .";"; }

		AddCountToAttr(\@exons);
		CreateTseq(\@exons, \%genome, $key) if $tseq_out;

		if ( @cds > 0 )
		{
			$meta{$key}{'is_cod'} = 1;
			
			AdjustCDS(\@cds, $key) if $adjust_cds;
			CalcUTRs( \@cds, \@exons, $key );

			@introns = CreateCdsIntrons(\@cds, $key);
			CompareIntrons( $ref, \@introns ) if $warnings;
			@start_codon = CreateStartCodon(\@cds, $key);
			@stop_codon = CreateStopCodon(\@cds, $key);

			foreach my $entry (@introns)      { $entry->[8] = "Parent=". $key .";"; }
			foreach my $entry (@start_codon)  { $entry->[8] = "Parent=". $key .";"; }
			foreach my $entry (@stop_codon)   { $entry->[8] = "Parent=". $key .";"; }

			if ( $use_seq )
			{
				my $are_splice_sites_supported =  AddSpliceSites( \@introns, \%genome );
				if ( $are_splice_sites_supported )
					{ $meta{$key}{'has_supported_introns'} = 1; }
				else
					{ $meta{$key}{'has_supported_introns'} = 0; }
				
				my $codon = '';

				$codon = AddStartStopSites( \@start_codon, \%genome );
				if ($codon eq 'ATG')
					{ $meta{$key}{'has_supported_start'} = 1; }
				else
					{ $meta{$key}{'has_supported_start'} = 0; }

				$codon = AddStartStopSites( \@stop_codon, \%genome );
				if ($codon =~ /TAA|TAG|TGA/)
					{ $meta{$key}{'has_supported_stop'} = 1; }
				else
					{ $meta{$key}{'has_supported_stop'} = 0; }

				CheckInFrameStops($key) if $tseq_out;
				CheckCanonical($key);
				CheckComplete($key);
			}

			AddLabelsToCDS(\@cds, $key);
			AddCountToAttr(\@cds);
			AddCountToAttr(\@introns);
			AddCountToAttrSemiReverse(\@start_codon, 1);
			AddCountToAttrSemiReverse(\@stop_codon, 0);

			push @new_rec, [ @{$mrna{$key}[0]} ];
			push @new_rec, @exons;
			push @new_rec, @cds;
			push @new_rec, @introns;
			push @new_rec, @start_codon;
			push @new_rec, @stop_codon;
		}
		else
		{
			$meta{$key}{'is_cod'} = 0;
			push @new_rec, [ @{$mrna{$key}[0]} ];
			push @new_rec, @exons;
		}
	}

	return @new_rec;
}
# --------------------------------------------
sub CompareIntrons
{
	my $annot = shift;
	my $calc = shift;

	my %h = ();

	# collect all annotated introns
	foreach my $entry ( @{$annot} )
	{
		if ( $entry->[2] =~ /[Ii]ntron/ )
		{
			my $key = $entry->[0] ."_". $entry->[3] ."_". $entry->[4] ."_". $entry->[6];
			$h{$key} = 0;
		}
	}

	$info{"annot_introns_in_input"} += scalar (keys %h);

	foreach my $entry ( @{$calc} )
	{
		my $key = $entry->[0] ."_". $entry->[3] ."_". $entry->[4] ."_". $entry->[6];

		if ( exists $h{$key} )
		{
			$info{"calc_introns_match_annot"} += 1;
		}
		else
		{
			$info{"calc_introns_mismatch_annot"} += 1;
		}
	}

	$info{"annot_introns_mismatch_calc"} = $info{"annot_introns_in_input"} - $info{"calc_introns_match_annot"};
}
# --------------------------------------------
sub AddStartStopSites
{
	my $ref = shift;
	my $h_genome = shift;

	my $scodon = '';

	foreach my $entry ( @{$ref} )
	{
		if ($warnings)
		{
			if ( $entry->[2] !~ /[Ss]tart_codon/ and $entry->[2] !~ /[Ss]top_codon/ )
				{ die "error, start/stop type is expected: $entry->[2]\n"; }
		}

		my $CODON = '';

		if ( $entry->[4] - $entry->[3] + 1 == 3 )
		{
			$CODON = substr( $h_genome->{$entry->[0]}, $entry->[3] -1, 3 );
			$scodon = $CODON;
		}
		else
		{
			if ( $entry->[4] - $entry->[3] + 1 == 1 )
			{
				$CODON = substr( $h_genome->{$entry->[0]}, $entry->[3] -1, 1 );
				$scodon .= $CODON;
			}
			elsif ( $entry->[4] - $entry->[3] + 1 == 2 )
			{
				$CODON = substr( $h_genome->{$entry->[0]}, $entry->[3] -1, 2 );
				$scodon .= $CODON;
			}
			else
				{die;}
		}

		if ( $entry->[6] eq '+' )
		{
			;
		}
		elsif ( $entry->[6] eq '-' )
		{
			$CODON = RevComp($CODON);
		}
		else
			{ die "error, strand is required for sequence extraction:"; }

		$entry->[8] .= ("site_seq=". $CODON .";");
	}

	if ( $ref->[0][6] eq '-' )
	{
		$scodon = RevComp($scodon);
	}

	return uc($scodon);
}
# --------------------------------------------
sub AddSpliceSites
{
	my $ref = shift;
	my $h_genome = shift;

	my $is_supported = 1;

	foreach my $entry ( @{$ref} )
	{
		if ($warnings)
		{
			if ( $entry->[2] !~ /[Ii]ntron/ and $entry->[2] !~ /gap/ )
				{ die "error, intron type is expected: $entry->[2]\n"; }
		}

		my $DON = '';
		my $ACC = '';

		my $DONsite = '';
		my $ACCsite = '';

		if ( $entry->[6] eq '+' )
		{
			$DON = substr( $h_genome->{$entry->[0]}, $entry->[3] -1, 2 );
			$ACC = substr( $h_genome->{$entry->[0]}, $entry->[4] -2, 2 );

			if ($get_sites)
			{
				$DON = substr( $h_genome->{$entry->[0]}, $entry->[3] -1 -3,  2 +3  +4 );
				$ACC = substr( $h_genome->{$entry->[0]}, $entry->[4] -2 -18, 2 +18 +1 );
			}
		}
		elsif ( $entry->[6] eq '-' )
		{
			$ACC = substr( $h_genome->{$entry->[0]}, $entry->[3] -1, 2 );
			$DON = substr( $h_genome->{$entry->[0]}, $entry->[4] -2, 2 );

			$DON = RevComp($DON);
			$ACC = RevComp($ACC);

			if ($get_sites)
			{
				$ACCsite = substr( $h_genome->{$entry->[0]}, $entry->[3] -1 -1, 2 +18 +1);
				$DONsite = substr( $h_genome->{$entry->[0]}, $entry->[4] -2 -4, 2 +3  +4);

				$DONsite = RevComp($DONsite);
				$ACCsite = RevComp($ACCsite);
			}
		}
		else
			{ die "error, strand is required for sequence extraction:"; }

		$entry->[8] .= ("site_seq=". $DON ."_". $ACC .";");

		if ($get_sites)
		{
			$entry->[8] .= ("site=". $DONsite ."_". $ACCsite .";");
		}

		if ((uc($DON) ne 'GT')and(uc($DON) ne 'GC'))
		{
			$is_supported = 0;
		}

		if (uc($ACC) ne 'AG')
		{
			$is_supported = 0;
		}
	}

	return $is_supported;
}
# --------------------------------------------
sub RevComp
{
	my $s = shift;

	$s = reverse($s);

	$s =~ s/A/1/g;
	$s =~ s/T/A/g;
	$s =~ s/1/T/g;

	$s =~ s/C/2/g;
	$s =~ s/G/C/g;
	$s =~ s/2/G/g;

	$s =~ s/a/3/g;
	$s =~ s/t/a/g;
	$s =~ s/3/t/g;

	$s =~ s/c/4/g;
	$s =~ s/g/c/g;
	$s =~ s/4/g/g;

	return $s;
}
# --------------------------------------------
sub AddCountToAttrSimple
{
	my $ref = shift;

	my $size = scalar @{$ref};

	for( my $i = 0; $i < $size; $i += 1 )
	{
		$ref->[$i][8] .= ";" if ( $ref->[$i][8] !~ m/;\s*$/ );
		$ref->[$i][8] .= "count=". ($i+1) ."_". $size .";";
	}
}
# --------------------------------------------
sub AddCountToAttr
{
	my $ref = shift;

	my $size = scalar @{$ref};

	return if ( $size == 0 );

	if ( $ref->[0][6] eq "+" )
	{
		for( my $i = 0; $i < $size; $i += 1 )
		{
			$ref->[$i][8] .= ";" if ( $ref->[$i][8] !~ m/;$/ );
			$ref->[$i][8] .= "count=". ($i+1) ."_". $size .";";
		}
	}
	elsif ( $ref->[0][6] eq "-" )
	{
		for( my $i = $size -1; $i >= 0; $i -= 1 )
		{
			$ref->[$i][8] .= ";" if ( $ref->[$i][8] !~ m/;$/ );
			$ref->[$i][8] .= "count=". ($i+1) ."_". $size .";";
		}
	}
}
# --------------------------------------------
sub AddCountToAttrSemiReverse
{
	my $ref = shift;
	my $is_start = shift;

	my $size = scalar @{$ref};

	return if ( $size == 0 );

	if ( $size == 1 )
	{
		$ref->[0][8] .= ";" if ( $ref->[0][8] !~ /;$/ );
		$ref->[0][8] .= "count=1_1;";
	}
	elsif ( $size == 2 )
	{
		$ref->[0][8] .= ";" if ( $ref->[0][8] !~ /;$/ );
		$ref->[1][8] .= ";" if ( $ref->[1][8] !~ /;$/ );

		if (( $is_start and ($ref->[0][6] eq "+" )) or ( !$is_start and ($ref->[0][6] eq "-" )))
		{
			$ref->[0][8] .= "count=1_2;";
			$ref->[1][8] .= "count=2_2;";
		}
		elsif (( !$is_start and ($ref->[0][6] eq "+" )) or ( $is_start and ($ref->[0][6] eq "-" )))
		{
			$ref->[0][8] .= "count=2_2;";
			$ref->[1][8] .= "count=1_2;";
		}
	}
}
# --------------------------------------------
sub AddCountToAttrInHashMRNA
{
	my $ref = shift;

	my $size = scalar ( keys %{$ref} );
	my $i = 0;

	foreach my $key ( keys %{$ref} )
	{
		$ref->{$key}[0][8] .= ";" if ( $ref->{$key}[0][8] !~ m/;$/ );
		$ref->{$key}[0][8] .= "count=". ($i+1) ."_". $size .";";

		$i += 1;
	}
}
# --------------------------------------------
sub AddLabelsToCDS
{
	my $ref = shift;
	my $tid = shift;

	my $size = scalar @{$ref};

	if ( $size == 1 )
	{
		$ref->[0][8] .= ";" if ( $ref->[0][8] !~ /;$/ );
		if ( !$meta{$tid}{'has_supported_start'} or !$meta{$tid}{'has_supported_stop'} )
		{
			$ref->[0][8] .= "cds_type=Partial;";
		}
		else
		{
			$ref->[0][8] .= "cds_type=Single;";
		}
	}
	elsif ( $size > 1 )
	{
		if ( $ref->[0][6] eq "+" )
		{
			$ref->[0][8] .= ";" if ( $ref->[0][8] !~ /;$/ );
			if ( ! $meta{$tid}{'has_supported_start'} )
			{
				$ref->[0][8] .= "cds_type=Partial;";
			}
			else
			{
				$ref->[0][8] .= "cds_type=Initial;";
			}

			$ref->[$size -1][8] .= ";" if ( $ref->[$size -1][8] !~ /;$/ );
			if ( ! $meta{$tid}{'has_supported_stop'} )
			{
				$ref->[$size -1][8] .= "cds_type=Partial;";
			}
			else
			{
				$ref->[$size -1][8] .= "cds_type=Terminal;";
			}
		}
		elsif ( $ref->[0][6] eq "-" )
		{
			$ref->[$size -1][8] .= ";" if ( $ref->[$size -1][8] !~ /;$/ );
			if ( ! $meta{$tid}{'has_supported_start'} )
			{
				$ref->[$size -1][8] .= "cds_type=Partial;";
			}
			else
			{
				$ref->[$size -1][8] .= "cds_type=Initial;";
			}

			$ref->[0][8] .= ";" if ( $ref->[0][8] !~ /;$/ );
			if ( ! $meta{$tid}{'has_supported_stop'} )
			{
				$ref->[0][8] .= "cds_type=Partial;";
			}
			else
			{
				$ref->[0][8] .= "cds_type=Terminal;";
			}
		}

		for( my $i = 1; $i < $size -1; $i += 1 )
		{
			$ref->[$i][8] .= ";" if ( $ref->[$i][8] !~ /;$/ );
			$ref->[$i][8] .= "cds_type=Internal;";
		}
	}
}
# --------------------------------------------
sub CreateStopCodon
{
	# ref on array of arrays
	my $ref = shift;
	my $tid = shift;

	# output
	my @arr = ();

	# first CDS from the left
	my @current = @{$ref->[0]};

	if ( $current[6] eq '+' )
	{
		my $idx = (scalar @{$ref}) - 1;

		# last CDS from the left
		@current = @{$ref->[$idx]};

		if ( $current[4] - $current[3] + 1 >= 3 )
		{
			$current[2] = "stop_codon";
			$current[3] = $current[4] - 3 + 1;
			$current[7] = 0;

			push @arr, [ @current ];
		}
		else
		{
			# last CDS from the left

			if ( $current[4] - $current[3] + 1 == 2 )
			{
				$current[2] = "stop_codon";
				$current[3] = $current[4] - 1;
				$current[7] = 2;

				push @arr, [ @current ];

				# before last CDS from left
				@current = @{$ref->[$idx - 1]};

				$current[2] = "stop_codon";
				$current[3] = $current[4];
				$current[7] = 0;

				push @arr, [ @current ];
                        }
			elsif ( $current[4] - $current[3] + 1 == 1 )
			{
				$current[2] = "stop_codon";
				$current[3] = $current[4];
				$current[7] = 1;

				push @arr, [ @current ];

				@current = @{$ref->[$idx - 1]};

				$current[2] = "stop_codon";
				$current[3] = $current[4] - 1;
				$current[7] = 0;

				push @arr, [ @current ];
			}

			if ($warnings)
			{
				print "warning, split stop codon detected: $meta{$tid}{'tid'}\n";
				print Dumper(\@arr) if $debug;
			}
		}
	}
	elsif ( $current[6] eq '-' )
	{
		# first CDS from the left

		if ( $current[4] - $current[3] + 1 >= 3 )
		{
			$current[2] = "stop_codon";
			$current[4] = $current[3] + 3 - 1;
			$current[7] = 0;

			push @arr, [ @current ];
		}
		else
		{
			if ( $current[4] - $current[3] + 1 == 2 )
			{
				$current[2] = "stop_codon";
				$current[4] = $current[3] + 1;
				$current[7] = 2;

				push @arr, [ @current ];

				@current = @{$ref->[1]};

				$current[2] = "stop_codon";
				$current[4] = $current[3];
				$current[7] = 0;

				push @arr, [ @current ];
			}
			elsif ( $current[4] - $current[3] + 1 == 1 )
			{
				$current[2] = "stop_codon";
				$current[4] = $current[3];
				$current[7] = 1;

				push @arr, [ @current ];

				@current = @{$ref->[1]};

				$current[2] = "stop_codon";
				$current[4] = $current[3] + 1;
				$current[7] = 0;

				push @arr, [ @current ];
			}

			if ($warnings)
			{
				print "warning, split stop codon detected: $meta{$tid}{'tid'}\n";
				print Dumper(\@arr) if $debug;
			}
		}
	}

	@arr = sort{ $a->[3] <=> $b->[3] } @arr;

	return @arr;
}
# --------------------------------------------
sub CreateStartCodon
{
	# ref on array of array - values of CDS from one mrna - sorted
	my $ref = shift;
	my $tid = shift;

	# output array of arrays
	my @arr = ();

	# this is first CDS on the left side
	my @current = @{$ref->[0]};

	if ( $current[6] eq '+' )
	{
		# complete start codon
		if ( $current[4] - $current[3] + 1 >= 3 )
		{
			$current[2] = "start_codon";
			$current[4] = $current[3] + 3 - 1;
			$current[7] = 0;

			push @arr, [ @current ];
		}
		else
		{
			if ( @{$ref} < 2 )
				{ die "error, not enough data to position start codon\n"; }

			# to do : check strand

			if ( $current[4] - $current[3] + 1 == 2 )
			{
				$current[2] = "start_codon";
				$current[4] = $current[3] + 1;
				$current[7] = 0;

				push @arr, [ @current ];

				# mode to second CDS from left
				@current = @{$ref->[1]};

				$current[2] = "start_codon";
				$current[4] = $current[3];
				$current[7] = 1;

				push @arr, [ @current ];
			}
			elsif ( $current[4] - $current[3] + 1 == 1 )
			{
				$current[2] = "start_codon";
				$current[4] = $current[3];
				$current[7] = 0;

				push @arr, [ @current ];

				# mode to second CDS from left
				@current = @{$ref->[1]};

				$current[2] = "start_codon";
				$current[4] = $current[3] + 1;
				$current[7] = 2;

				push @arr, [ @current ];
			}

			if ($warnings)
			{
				print "warning, split start codon detected: $meta{$tid}{'tid'}\n";
				print Dumper(\@arr) if $debug;
			}
		}
	}
	elsif ( $current[6] eq '-' )
	{
		my $idx = (scalar @{$ref}) - 1;

		# this is last CDS from the left
		@current = @{$ref->[$idx]};

		if ( $current[4] - $current[3] + 1 >= 3 )
		{
			$current[2] = "start_codon";
			$current[3] = $current[4] - 3 + 1;
			$current[7] = 0;

			push @arr, [ @current ];
		}
		else
		{
			if ( @{$ref} < 2 )
				{ die "error, not enough data to position start codon\n"; }

			if ( $current[4] - $current[3] + 1 == 2 )
			{
				$current[2] = "start_codon";
				$current[3] = $current[4] - 1;
				$current[7] = 0;

				push @arr, [ @current ];

				# before last CDS from left
				@current = @{$ref->[$idx - 1]};

				$current[2] = "start_codon";
				$current[3] = $current[4];
				$current[7] = 1;

				push @arr, [ @current ];
                        }
                        elsif ( $current[4] - $current[3] + 1 == 1 )
                        {
                                $current[2] = "start_codon";
                                $current[3] = $current[4];
                                $current[7] = 0;

                                push @arr, [ @current ];

				# before last CDS from left
                                @current = @{$ref->[$idx - 1]};

                                $current[2] = "start_codon";
                                $current[3] = $current[4] - 1;
                                $current[7] = 2;

                                push @arr, [ @current ];
                        }

			if ($warnings)
			{
				print "warning, split start codon detected: $meta{$tid}{'tid'}\n";
				print Dumper(\@arr) if $debug;
			}
		}
	} 

	@arr = sort{ $a->[3] <=> $b->[3] } @arr;

	return @arr;
}
# --------------------------------------------
sub PrintRecord
{
	my $ref = shift;

	foreach my $entry ( @{$ref} )
	{
		print $OUT   $entry->[0] ."\t". $entry->[1] ."\t". $entry->[2] ."\t". $entry->[3] ."\t". $entry->[4] ."\t". $entry->[5] ."\t". $entry->[6] ."\t". $entry->[7];
		if ( defined $entry->[8] )
		{
			print $OUT "\t". $entry->[8];
		}
		print $OUT  "\n";
	}
}
# --------------------------------------------
sub CreateCdsIntrons
{
	# ref on array of arrys - CDS values from GFF file - one mrna - sorted
	my $ref = shift;
	my $tid = shift;

	my $size = scalar @{$ref};

	# ref on arry of arrays with introns - output
	my @arr = ();

	# two CDS minimum for intron deriviation
	return @arr if ($size < 2);

	my $i = 0;
	my $j = 1;

	while( $j < $size)
	{
		# two exons must be on the same strand

		if ( $ref->[$i][6] eq $ref->[$j][6] )
		{
			my @current = @{$ref->[$i]};

			$current[2] = "intron";
			$current[3] = $ref->[$i][4] + 1;
			$current[4] = $ref->[$j][3] - 1;

			if ( $ref->[$i][6] eq "+" )
			{
				$current[7] = 3 - $ref->[$j][7];
				$current[7] = 0 if ( $current[7] == 3 );
			}
			elsif ( $ref->[$i][6] eq "-" )
			{
				$current[7] = 3 - $ref->[$i][7];
				$current[7] = 0 if ( $current[7] == 3 );
			}

			if ( $current[4] - $current[3] + 1 < $min_intron )
			{
				$current[2] = "gap";

				if ($warnings)
				{
					my $gap_length = $current[4] - $current[3] + 1;
					print "warning, distance between CDS-CDS is below $min_intron $gap_length ". $meta{$tid}{'tid'} ." : intron was replaced by gap label\n";
					print Dumper(\@current) if $debug;
				}
			}

			push @arr, [@current];
		}
		else
		{
			print "warning, oposite strand CDS were detected: intron is not assigned in such cases\n" if $warnings;
		}

		$i += 1;
		$j += 1;
	}

	return @arr;
}
# --------------------------------------------
sub SelectTranscripts
{
	my $ref = shift;
	my @arr = ();

	my %labels = ( "mRNA" => 1, "transcript" => 1);
	$labels{"nc_primary_transcript"} = 1;
       	$labels{"lnc_RNA"} = 1;
	$labels{"unconfirmed_transcript"} = 1;
	$labels{"pseudogene"} = 1;
	$labels{"tRNA"} = 1;
	$labels{"rRNA"} = 1;
	$labels{"ncRNA"} = 1;
	$labels{"snRNA"} = 1;
	$labels{"snoRNA"} = 1;
	$labels{"pre_miRNA"} = 1;
	$labels{"antisense_lncRNA"} = 1;
	$labels{"miRNA_primary_transcript"} = 1;
	$labels{"miRNA"} = 1;
	$labels{"pseudogenic_transcript"} = 1;
	$labels{"ncRNA_gene"} = 1;
	$labels{"scRNA"} = 1;
	$labels{"transcript_region"} = 1;
	$labels{"antisense_RNA"} = 1;

	foreach my $entry (@{$ref})
	{
		if (( exists $labels{ $entry->[2] } )or( $entry->[2] =~ /^\S_gene_segment$/ ))
		{
			push @arr, [ @{$entry} ];
		}
	}

	if ( scalar @arr == 0 )
	{
		print Dumper($ref);
		die "error, lines with transcript info not found\n";
	}

	return @arr;
}
# --------------------------------------------
sub SelectExonsSorted
{
	my $ref = shift;
	my @arr = ();

	foreach my $entry (@{$ref})
	{
		if (( $entry->[2] eq "exon" )or( $entry->[2] eq "pseudogenic_exon" )or( $entry->[2] eq "miRNA_" ))
		{
			push @arr, [ @{$entry} ];
		}
	}

	@arr = sort{ $a->[3] <=> $b->[3] } @arr;

	return @arr;
}
# --------------------------------------------
sub SelectCDSsorted
{
	my $ref = shift;
	my @arr = ();

	foreach my $entry (@{$ref})
	{
		if ( $entry->[2] eq "CDS" )
		{
			if ( $entry->[6] !~ /^[+-]$/ )
				{ die "error, CDS strand value is missing: $entry->[6]\n"; } 

			push @arr, [ @{$entry} ];
		}
	}

	# is it possible to have trans-splicing from different chromosomes?
	
	@arr = sort{ $a->[0] cmp $b->[0] || $a->[3] <=> $b->[3] } @arr;

	if ( $warnings and (@arr > 0))
	{
		my $i = 0;
		my $j = 1;
		my $size = scalar @arr;

		while( $j < $size )
		{
			if ( $arr[$i][4] >= $arr[$j][3] )
			{
				print "warning, two CDS from the same mRNA overlap: $arr[$i][3] .. $arr[$i][4]  $arr[$j][3] .. $arr[$j][4]\n"; 
			}

			$i += 1;
			$j += 1;
		}

		my $strand = $arr[0][6];
		my $seqid = $arr[0][0];

		foreach my $current (@arr)
		{
			if ( $strand ne $current->[6] )
			{
				print "warning, two strands detected in one mRNA: $strand  $current->[6]\n";
				last;
			}	
		}

		foreach my $current (@arr)
		{
			if ( $seqid ne $current->[0] )
			{
				print "warning, two seqid detected in one mRNA: $seqid  $current->[0]\n";
				last;
			}
		}
	}

	return @arr;
}
# --------------------------------------------
sub SplitByMRNA
{
	# ref array of arrays with GGF values of one gene
	my $ref = shift;
	# ref on hash of arrays - with GFF values separated by mrna ID
	my $h_mrna = shift;

	my %labels = ( "mRNA" => 1, "transcript" => 1 );
	$labels{"nc_primary_transcript"} = 1;
	$labels{"lnc_RNA"} = 1;
	$labels{"unconfirmed_transcript"} = 1;
	$labels{"gene"} = 1;
	$labels{"stop_codon_redefined_as_selenocysteine"} = 1;
	$labels{"pseudogene"} = 1;
	$labels{"tRNA"} = 1;
	$labels{"rRNA"} = 1;
	$labels{"ncRNA"} = 1;
	$labels{"snRNA"} = 1;
	$labels{"snoRNA"} = 1;
	$labels{"pre_miRNA"} = 1;
	$labels{"antisense_lncRNA"} = 1;
	$labels{"miRNA_primary_transcript"} = 1;
	$labels{"ncRNA_gene"} = 1;
	$labels{"miRNA"} = 1;
	$labels{"pseudogenic_transcript"} = 1;
	$labels{"scRNA"} = 1;
	$labels{"transcript_region"} = 1;
	$labels{"antisense_RNA"} = 1;
	$labels{"transposable_element_gene"} = 1;

	foreach my $entry (@{$ref})
	{
		next if ( exists $labels{ $entry->[2] } );
		next if ( $entry->[2] =~ /^\S_gene_segment$/ );

		if ( $entry->[8] =~ /Parent=(\S+?);/ or $entry->[8] =~ /Parent=(\S+)/)
		{
			my @parent_set = split( ',', $1 );

			foreach my $value (@parent_set)
			{
				if ( exists $h_mrna->{$value} )
				{
					push @{$h_mrna->{$value}}, [ @{$entry} ];
				}
				else
				{
					print Dumper($ref) if $debug;
					die "error, feature not in mRNA:\n$value\n";
				}
			}
		}
		else
			{ die "error, line without the Parent in:\n$entry->[8]\n"; }
	}
}
# --------------------------------------------
sub GetMrnaIDs
{
	# ref on array of arrays - GFF values one gene
	my $ref = shift;

	# one or many mRNA per record 
	my %mRNA = ();

	my %labels = ( "mRNA" => 1, "transcript" => 1 );
	$labels{"nc_primary_transcript"} = 1;
	$labels{"lnc_RNA"} = 1;
	$labels{"unconfirmed_transcript"} = 1;
       	$labels{"pseudogene"} = 1;
	$labels{"tRNA"} = 1;
	$labels{"rRNA"} = 1;
	$labels{"ncRNA"} = 1;
	$labels{"snRNA"} = 1;
	$labels{"snoRNA"} = 1;
	$labels{"pre_miRNA"} = 1;
	$labels{"antisense_lncRNA"} = 1;
	$labels{"miRNA_primary_transcript"} = 1;
	$labels{"lnc_RNA"} = 1;
	$labels{"miRNA"} = 1;
	$labels{"pseudogenic_transcript"} = 1;
	$labels{"scRNA"} = 1;
	$labels{"transcript_region"} = 1;
	$labels{"antisense_RNA"} = 1;

	foreach my $entry (@{$ref})
	{
		if (( exists $labels{ $entry->[2] } ) or ( $entry->[2] =~ /^\S_gene_segment$/))
		{
			if ( $entry->[8] =~ /ID=(\S+?);/ )
			{
				my $ID = $1;

				if ( ! exists $mRNA{$ID} )
				{
					push @{$mRNA{$ID}}, [ @{$entry} ];
				}
				else
					{ die "error, mRNA ID duplication found: $ID\n"; }
			}
			else
				{ die "error, mRNA ID field not found in: $entry->[8]\n"; }
		}
	}

	if ( scalar (keys %mRNA ) == 0 )
	{
		print Dumper($ref); # if $debug;
		die "error, no mRNA in record\n";
	}

	return %mRNA;
}
# --------------------------------------------
sub GetGene
{
	my $ref = shift;
	
	my @arr = ();

	foreach my $entry (@{$ref})
	{
		if (( $entry->[2] eq "gene" )or( $entry->[2] eq "ncRNA_gene" )or( $entry->[2] eq "pseudogene" )or( $entry->[2] eq "transposable_element_gene" ))
		{
			 push @arr, [@{$entry}];
		}
	}

	return @arr;
}
# --------------------------------------------
sub GetGeneID
{	
	my $ref = shift;

	my $gene_id = '';

	foreach my $entry (@{$ref})
	{
		if ( !$gene_id )
		{
			if ( $entry->[8] =~ /ID=(\S+?);/ ) 
			{
				$gene_id = $1;
			}
			else
				{ die "error, gene ID field not found in: $entry->[8]\n"; }
		}
		else
			{ die "error, gene entry duplication was detected in record: $gene_id\n"; }
	}

	if ( ! $gene_id )
		{ die "error, gene id is missing\n"; }

	return $gene_id;
}
# --------------------------------------------
sub IsCDSgene
{
	# ref on array of arrays of GFF3 values from one gene
	my $ref = shift;

	my $gene_count = 0;
	my $cds_count = 0;

	my $gene_name = '';

	foreach my $entry (@{$ref})
	{
		if ( $entry->[2] eq "CDS" )
		{
			$cds_count += 1;
		}
		elsif ( $entry->[2] eq "gene" )
		{
			$gene_count += 1;
			$gene_name = $entry->[8];
		}
	}

	if ( $gene_count > 1 )
	{
		print Dumper($ref) if $debug;
		die "error, gene label duplication was detected: $gene_name\n";
	}

	if ( $gene_count == 1 and $cds_count > 0 )
	{
		return 1;
	}
	elsif ( $cds_count == 0 )
	{
		return 0;
	}
	else
	{
		die "error, unexpected combination of gene and cds count was detected: $gene_name $gene_count $cds_count\n";
	}
}
# --------------------------------------------
sub CheckForValidGFF
{
	my $str = shift;
	my $ref = shift;

	# seqid 1
	if ( $ref->[0] !~ /^\S+$/ )
		{ die "error, unexpected seqid format found: $ref->[0]\n$str\n"; }

	# start 4
	if ( $ref->[3] !~ /^\d+$/ )
		{ die "error, unexpected start format found:\n$str\n"; }

	# end 5
	if ( $ref->[4] !~ /^\d+$/ )
		{ die "error, unexpected end format found:\n$str\n"; }

	# start <= end
	if ( $ref->[3] > $ref->[4] )
		{ die "error, start is more than end:\n$str\n"; }

	# strand 6
	if ( $ref->[6] !~ /^[+-.]$/ )
		{ die "error, wrong strand value:\n$str\n"; }

	# phase 7
	if ( $ref->[7] !~ /^[.012]$/ )
		 { die "error, wrong phase value:\n$str\n"; }
}
# --------------------------------------------
sub AddLineToRecord
{
	# str - line from GFF3 file
	# ref on array of arrays - put all split lines from one gene here
	my $str = shift;
	my $ref = shift;

	chomp $str;

	my @arr = split( '\t', $str );

	my $size = @arr;

	if ( $size != 8 and $size != 9 )
		{ die "error, unexpected number of TABs found:\n$str\n"; }

	CheckForValidGFF( $str, \@arr ) if $debug;

	push @{$ref}, [@arr];
}
# ------------------------------------------------
sub CheckBeforeRun
{
	die "error, file not found: option --in $in_gff3\n" if( ! -e $in_gff3 );
	die "error, output file name matches input file: $in_gff3 $out_gff3\n" if ( $out_gff3 eq $in_gff3 );

	if ( $gseq_in )
	{
		die "error, file not found: option --fgseq $gseq_in\n" if( ! -e $gseq_in );
		die "error, output file name matches input file: $gseq_in $out_gff3\n" if ( $out_gff3 eq $gseq_in );
	}
}
# ------------------------------------------------
sub ParseCMD
{
	my $opt_results = GetOptions
	(
		'in_gff3=s'     => \$in_gff3,
		'out_gff3=s'    => \$out_gff3,

		'verbose'       => \$verbose,
		'warnings'      => \$warnings,
		'debug'         => \$debug,

		'gseq_in=s'     => \$gseq_in,
		'tseq_out=s'    => \$tseq_out,
		'tgff_out=s'    => \$tgff_out,
		'CDSseq_out=s'  => \$CDSseq_out,

		'min_intron=i'  => \$min_intron,
		'no_gc'         => \$no_gc,
		'cds_gene_only' => \$cds_gene_only,
		'adjust_cds'    => \$adjust_cds,
		'get_sites'     => \$get_sites,
        );

	die "error on command line\n" if( !$opt_results );
	die "error, unexpected argument found on command line\n" if( @ARGV > 0 );

	$verbose = 1 if $debug;
	$warnings = 1 if $debug;
}
# ------------------------------------------------
sub Usage
{
        print qq(
Usage: $0  --in_gff3 [name]  --out_gff3 [name]

  This program takes as input 'nice' GFF3 formatted genome annotation file
  and enriches annotation by adding introns, stop + start codons, CDS types, etc

Optional:
  --gseq_in          input file with genome sequence in FASTA format
  --tseq_out         output transcript sequences into this file
  --tgff_out         output transcript annotation into this file
# --CDSseq_out       output CDS sequences into this file

  --min_intron [$min_intron]  minimum length of intron to calculate from CDS-CDS borders
  --no_gc            skip calculation of sequence GC features
# --cds_gene_only    output only protein coding genes
  --adjust_cds       adjust borders of incomplete CDS to complete codon
  --get_sites        output seq around sites as in GeneMark.hmm model

General:
  --verbose
  --warnings
  --debug

Version: $VERSION

);
	exit 1;
}
# ------------------------------------------------

