#!/usr/bin/perl -w

use POSIX;
use Sort::Naturally;

# Unbuffered STDOUT.
$| = 1;

# Command line: the BED file to annotate (mandatory), followed by an optional
# true/false flag that switches on extraction of the surrounding SV sequence.
($bed, $getseq) = @ARGV[0, 1];

die "$0 bed_file [getseq 0/1 flag]" unless defined $bed;

# Settings that are only filled in for sequence-extraction mode.
$filetag = '';
if ($getseq) {
    ($prefix, $suffix) = (20, 70);      # bases taken before / after each read
    $filetag = ".siRNA_SV_region";      # extra suffix of the annotation file name
    $RM      = "CACCGGAC.GTCCGGTG";     # pattern whose matches are counted per region
}

# Optional whitelist of SV ids: the first tab-separated column of every
# non-blank line is recorded in %clean. The file is only read when it exists.
$cleanIDs = "6456_disinfected.SVs";
#$cleanIDs = "2973_rejected.SVs";
if (-e $cleanIDs) {
    open(CLEANIDSFILE, $cleanIDs) || print "\tcannot read $cleanIDs\n";
    while (my $line = <CLEANIDSFILE>) {
        next unless $line =~ /\w/;
        chomp $line;
        $clean{ (split /\t/, $line)[0] } = 1;
    }
    close CLEANIDSFILE;
}

# Annotation inputs read by the loaders further down.
$gff   = "MASiVEdb.Zea_mays.gff.noBLAST.gff.SS";
$fasta = "MASiVEdb.Zea_mays.gff.noBLAST.fasta.SS";

# When true, the %clean whitelist is bypassed by the matrix and GFF loaders.
$cleanpass = 0;

# Per-SV attributes from the matrix file, keyed by the id found in column
# index 5: the family is the first space-separated word of column index 11;
# columns 13, 12 and 10 are stored unchanged in %ages, %envs and %d2c.
open(MATRIX, "MASiVEdb.Zea_mays.matrix.NEW1") || print "\tcannot read MASiVEdb.Zea_mays.matrix.NEW1\n";
while (my $row = <MATRIX>) {
    next if $row =~ /^#/;               # comment line
    chomp $row;
    my @cell = split /\t/, $row;
    my $id   = $cell[5];
    next unless $cleanpass || $clean{$id};

    my $family = (split / /, $cell[11])[0];
    next unless defined $family;        # rows without a family are ignored

    $fams{$id} = $family;
    $ages{$id} = $cell[13];
    $envs{$id} = $cell[12];
    $d2c{$id}  = $cell[10];
}
close MATRIX;

# In sequence-extraction mode, load the SV sequences into %fasta, keyed by the
# complete header text (everything after the leading '>').
if ($getseq) {
    print STDERR "loading $fasta\n";
    open (FASTA, "$fasta") || print "\tcannot read $fasta\n";
    # Start in the "skip" state so that sequence lines appearing before the
    # first header are ignored rather than stored under an undefined id.
    $skipSV = 1;
    while (<FASTA>) {
        chomp;
        if (/^>/) {
            s/^>//;
            $SV = $_;
            # Same acceptance rule as the matrix and GFF loaders:
            # either the whitelist is bypassed or the id is whitelisted.
            $skipSV = ($cleanpass || defined $clean{$SV}) ? 0 : 1;
            # A repeated header restarts its record (the last one wins).
            $fasta{$SV} = '' unless $skipSV;
        } elsif (!$skipSV && /\S/) {
            # Append instead of overwrite: a record may be wrapped over
            # several lines, and all of them belong to the sequence.
            $fasta{$SV} .= $_;
        }
    }
    close FASTA;
}
# Read element coordinates from the GFF into %coords. For every accepted SV
# (id = first ';'-separated attribute of column index 8) the start, end and
# column-index-5 value are stored under 'SV' for the whole element, under the
# 7th attribute for each long terminal repeat, and under 'insert' for an insert.
print STDERR "loading $gff\n";
open(GFF, "$gff") || print "\tcannot read $gff\n";
while (my $record = <GFF>) {
    next if $record =~ /^#/;            # comment line
    chomp $record;
    my @field = split /\t/, $record;
    my @attr  = split /;/, $field[8];
    my $id    = $attr[0];
    next unless $cleanpass || $clean{$id};

    # Decide under which key this feature is stored; other feature types
    # are not recorded.
    my $feature = $field[2];
    my $part;
    if    ($feature eq 'Sirevirus')            { $part = 'SV'; }
    elsif ($feature eq 'long_terminal_repeat') { $part = $attr[6]; }
    elsif ($feature eq 'insert')               { $part = 'insert'; }
    else                                       { next; }

    @{ $coords{$id}{$part} }{qw(from to len)} = @field[3, 4, 5];
}
close GFF;

# Report how many SVs were loaded; there is nothing to do without any.
print STDERR scalar(keys %coords), " SVs\n";
die unless keys %coords;

# Binning layout: every region of an element is divided into $bins bins and
# the region's offset is added, so that 5prime occupies bins 1-100, internal
# 101-200, 3prime 201-300 and insert 301-400.
$bins = 100;
$bins_offset{'5prime'} = 0;
$bins_offset{internal} = 100;
$bins_offset{'3prime'} = 200;
$bins_offset{insert}   = 300;

# Region ranks; not referenced anywhere else in the visible code.
$regions2sort{'5prime'} = 1;
$regions2sort{internal} = 2;
$regions2sort{'3prime'} = 3;
$regions2sort{insert}   = 4;

# Build %coords2: for every base of every SV, keyed by "chrN position", store
#   sv  - "SV id <tab> family"
#   bin - offset bin of the base within its region
#   run - 1-based index of the base counted in element orientation
# Elements whose id contains -D- are walked from 'from' to 'to', elements
# whose id contains -P- from 'to' down to 'from'; other ids add nothing.
foreach my $sv (sort keys %coords) {
    # Progress indicator: one number per 1000 SVs visited.
    ++$SVs;
    if ($SVs % 1000 == 0) { print STDERR " $SVs"; }
    #if ($SVs > 100) { last; }

    # Missing attributes default to 'na'.
    foreach my $attr (\%groups, \%fams, \%envs, \%d2c) {
        $attr->{$sv} = 'na' unless defined $attr->{$sv};
    }
    # Age rounded to the nearest 0.5; not used further in the visible code.
    $agegroup = 'na';
    if (defined $ages{$sv}) { $agegroup = sprintf("%1.0f", 2 * $ages{$sv}) / 2; }
    else                    { $ages{$sv} = 'na'; }

    # The chromosome comes from the id. Check the match explicitly: reading
    # $1 after a failed match would silently reuse the previous SV's value.
    my ($chr_no) = $sv =~ /Zmay_chr_(\d+)-/;
    unless (defined $chr_no) {
        print STDERR "\n\tcannot parse chromosome from SV id $sv, skipped\n";
        next;
    }
    my $chr = "chr$chr_no";

    # Without the span of the whole element there is nothing to walk over.
    my $span = $coords{$sv};
    unless (exists $span->{SV} && defined $span->{SV}{from} && defined $span->{SV}{to}) {
        print STDERR "\n\tno Sirevirus span for $sv, skipped\n";
        next;
    }

    # Label the bases of the LTRs and of the insert (the insert is applied
    # last so it wins on overlap); the internal length is what remains of
    # the element length after subtracting the annotated parts.
    my %annotation;
    my %len_of = (internal => $span->{SV}{len});
    foreach my $part ('5prime', '3prime', 'insert') {
        next unless exists $span->{$part};
        $annotation{$_} = $part for $span->{$part}{from} .. $span->{$part}{to};
        $len_of{$part} = $span->{$part}{len};
        $len_of{internal} -= $len_of{$part};
    }

    # Positions in element orientation.
    my @positions;
    if    ($sv =~ /-D-/) { @positions = $span->{SV}{from} .. $span->{SV}{to}; }
    elsif ($sv =~ /-P-/) { @positions = reverse($span->{SV}{from} .. $span->{SV}{to}); }
    else                 { next; }

    my %counters;       # bases seen so far, per region
    my $running = 0;    # bases seen so far, whole element
    foreach my $pos (@positions) {
        # Anything not labelled above is internal sequence.
        my $region = exists $annotation{$pos} ? $annotation{$pos} : 'internal';
        ++$running;
        ++$counters{$region};
        my $bin = ceil(($counters{$region} * $bins) / $len_of{$region});

        my $key = "$chr $pos";
        $coords2{$key}{sv}  = "$sv\t$fams{$sv}";
        $coords2{$key}{bin} = $bin + $bins_offset{$region};
        $coords2{$key}{run} = $running;
    }
}
# The per-element coordinates are no longer needed; free the memory.
undef %coords;
print STDERR "\n";

# File-name tag; nothing in the visible code sets it, so the default applies.
unless (defined $SVflt) { $SVflt = "noSVflt"; }

# Annotate every BED line whose first base (in read orientation) falls inside
# an SV. One output line is written per SV covered by the read:
#   column 3, strand relative to the SV, column 5, SV id, family, region,
#   average bin, first position, running index in the SV
# and, in sequence-extraction mode, the surrounding SV sequence (flanks in
# lower case when the full window is available) and its $RM match count.
print STDERR "loading bed $bed\n"; # coords start from zero !
open (CONCHIE_ANNOTATED, ">$bed.$SVflt.mod.anno$filetag") || print "\tcannot create $bed.$SVflt.mod.anno$filetag\n";
select((select(CONCHIE_ANNOTATED), $| = 1)[0]);
open (CONCHIE, "$bed") || print "\tcannot read $bed\n";
while (<CONCHIE>) {
    next unless /\w/;
    chomp;
    my @col = split /\t/;
    my ($chr, $start, $end, $strand) = @col[0, 1, 2, 4];

    # First base of the read. Lines without a usable strand are skipped so
    # that the position of a previous line is never reused.
    my $first;
    if    ($strand eq '+') { $first = $start + 1 + 1; }
    elsif ($strand eq '-') { $first = $end + 1 - 2; } # the -2 is for the error of the - mapping (probably)
    else                   { next; }

    my $hit = $coords2{"$chr $first"};
    next unless defined $hit;

    # Window of $prefix + read length + $suffix bases around the read.
    my $len = $end - ($start + 1) + 1;
    my $siRNA_SV_region = '';
    if ($getseq) {
        my $sv_id  = (split /\t/, $hit->{sv})[0];
        my $offset = $hit->{run} - 1 - $prefix;
        my $want   = $prefix + $len + $suffix;
        # A negative offset would make substr count from the END of the
        # sequence; clamp the window to the start of the element instead.
        if ($offset < 0) { $want += $offset; $offset = 0; }
        if (defined $fasta{$sv_id} && $want > 0 && $offset < length $fasta{$sv_id}) {
            $siRNA_SV_region = substr($fasta{$sv_id}, $offset, $want);
        }
    }

    # Sum and count the bins of all SV bases under the read, per SV.
    my (%bin_sum, %bin_count);
    for my $pos ($start + 2 .. $end + 1) {
        my $covered = $coords2{"$chr $pos"};
        next unless defined $covered;
        $bin_sum{ $covered->{sv} } += $covered->{bin};
        ++$bin_count{ $covered->{sv} };
    }

    foreach my $SVinfo (sort keys %bin_sum) {
        # Report the strand relative to the element: flipped for -P- SVs.
        my $dir2print = $strand;
        if ($SVinfo =~ /-P-/) { $dir2print = $strand eq '+' ? '-' : '+'; }

        my $ave_bin = sprintf "%.0f", $bin_sum{$SVinfo} / $bin_count{$SVinfo};
        my $region  = 'na';     # never carry over the region of another read
        if    ($ave_bin <= 100) { $region = '5prime'; }
        elsif ($ave_bin <= 200) { $region = 'internal'; }
        elsif ($ave_bin <= 300) { $region = '3prime'; }
        elsif ($ave_bin <= 400) { $region = 'insert'; }

        my $siRNA_SV_region2print = '';
        if ($getseq && $siRNA_SV_region ne '') {
            $siRNA_SV_regions{$siRNA_SV_region} = 1;
            my $RMs = () = $siRNA_SV_region =~ /$RM/g;
            if ($siRNA_SV_region =~ /(.{$prefix})(.{$len})(.{$suffix})/) {
                $siRNA_SV_region2print = "\t".lc($1)."$2".lc($3)."\t$RMs";
            } else {
                # Truncated window: print it as is.
                $siRNA_SV_region2print = "\t$siRNA_SV_region\t$RMs";
            }
        }
        print CONCHIE_ANNOTATED "$col[3]\t$dir2print\t$col[5]\t$SVinfo\t$region\t$ave_bin\t$first\t".$hit->{run}."$siRNA_SV_region2print\n";
    }
}
close CONCHIE;
close CONCHIE_ANNOTATED;

# Sequence-extraction mode only: fold every distinct extracted region with
# RNAfold and write a copy of the annotation file with the folding energy
# appended, keeping only the lines whose region has an energy.
SKIP:
if ($getseq) {
    undef %coords2;

    my $regions_file = "$bed.$SVflt.siRNA_SV_regions";
    my $folded_file  = "$regions_file.RNAfolded";
    my $anno_file    = "$bed.$SVflt.mod.anno$filetag";

    open (TMP, ">$regions_file") || print "\tcannot create $regions_file\n";
    foreach my $region (keys %siRNA_SV_regions) { print TMP "$region\n"; }
    close TMP;

    # The file names derive from a command-line argument: single-quote them
    # for the shell so spaces or metacharacters cannot change the command.
    my @shell_arg;
    foreach my $file ($regions_file, $folded_file) {
        (my $quoted = $file) =~ s/'/'\\''/g;
        push @shell_arg, "'$quoted'";
    }
    system("RNAfold --noPS < $shell_arg[0] > $shell_arg[1]");
    if ($? != 0) { print STDERR "\tRNAfold exited with status $?\n"; }

    open (TMP, "$folded_file") || print "\tcannot read $folded_file\n";
    #ACGAGACGAGACGACGAAGAGUUUAU
    #........((((........)))).. ( -1.20)
    #ACCACGUGUACAGAUCAGUAGACUAAUAUGACAG
    #.....((.(((......))).))........... ( -3.00)
    my $seq;
    while (<TMP>) {
        chomp;
        if (/^[A-Za-z]+\s*$/) {
            # Sequence line (any letters, so a leading N is not missed);
            # stored upper-cased as DNA to match the lookup below.
            $seq = uc $_;
            $seq =~ s/\s+$//;
            $seq =~ tr/U/T/;
        } elsif (defined $seq && /\(\s*(-?\d+\.\d+)\)\s*$/) {
            # Energy in the trailing parentheses; also accepts 0.00.
            $fes{$seq} = $1;
            # One energy per sequence: never attach a later energy line
            # to this sequence again.
            undef $seq;
        }
    }
    close TMP;

    open (CONCHIE_ANNOTATED_FE, ">$anno_file.fe") || print "\tcannot create $anno_file.fe\n";
    # Unbuffered output on the file being written.
    select((select(CONCHIE_ANNOTATED_FE), $| = 1)[0]);
    open (CONCHIE_ANNOTATED, "$anno_file") || print "\tcannot read $anno_file\n";
    while (<CONCHIE_ANNOTATED>) {
        next unless /\w/;
        chomp;
        my @col = split /\t/;
        # Column index 9 holds the extracted region in the annotation file.
        next unless defined $col[9];
        my $fe = $fes{ uc($col[9]) };
        if (defined $fe) { print CONCHIE_ANNOTATED_FE "$_\t$fe\n"; }
    }
    close CONCHIE_ANNOTATED;
    close CONCHIE_ANNOTATED_FE;
}

exit;
