#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Mar  5 23:05:56 2021

@author: christospapadopoulos
"""

import sys

def read_multiFASTA(fasta_file):
    '''
    Parse a (multi-)FASTA file into a dictionary.

    Parameters
    ----------
    fasta_file : path of the FASTA file to read.

    Returns
    -------
    dict mapping each record name to its full sequence as one string.
    The record name is the first whitespace-delimited token of the header
    line without the leading '>'. Sequence lines are stripped of
    surrounding whitespace and concatenated in file order. If the same
    name appears in several headers, the last record wins.

    Raises
    ------
    ValueError if sequence data is found before the first header line.
    '''
    chunks = {}
    name = None
    with open(fasta_file, 'r') as fasta:
        for line_number, line in enumerate(fasta, start=1):
            if line.startswith('>'):
                name = line.split()[0][1:]
                # A repeated header restarts the record.
                chunks[name] = []
                continue
            seq = line.strip()
            if not seq:
                # Blank or whitespace-only line: nothing to store.
                continue
            if name is None:
                raise ValueError(
                    "{}: line {}: sequence data found before the first "
                    "'>' header".format(fasta_file, line_number))
            chunks[name].append(seq)
    # Join once per record instead of growing a string line by line.
    return {record: ''.join(parts) for record, parts in chunks.items()}


def ranges(nums):
    '''
    Collapse a series of integers into runs of consecutive values.

    A new run starts whenever a value is more than 1 greater than the
    value before it. The input is used as given (it is neither sorted nor
    de-duplicated here), so callers normally pass sorted values.

    Parameters
    ----------
    nums : iterable of integers (list, tuple, generator, ...).

    Returns
    -------
    list of (first, last) tuples, one per run, both bounds inclusive.
    An empty input gives an empty list.

    Example: ranges([1, 2, 3, 7, 8, 10]) -> [(1, 3), (7, 8), (10, 10)]
    '''
    values = iter(nums)
    try:
        run_start = previous = next(values)
    except StopIteration:
        return []

    runs = []
    for current in values:
        if previous + 1 < current:
            # Gap found: close the current run and open a new one.
            runs.append((run_start, previous))
            run_start = current
        previous = current
    runs.append((run_start, previous))
    return runs


# ---------------------------------------------------------------------------
# Command line:  -genome <multi-FASTA>  -gff <annotation>  -out <FASTA>
# A missing flag makes sys.argv.index() raise ValueError.
# ---------------------------------------------------------------------------
genome_file = sys.argv[sys.argv.index("-genome")+1]
genome = read_multiFASTA(genome_file)
gff_file    = sys.argv[sys.argv.index("-gff")+1]
out         = sys.argv[sys.argv.index("-out")+1]

# Collect the (start, end) coordinates of every annotated feature, grouped by
# the sequence name in the first column. Columns 4 and 5 of a GFF line are
# the 1-based, inclusive start and end of the feature.
dico = {}
with open(gff_file,"r") as f:
    for line in f:
        if line.startswith("##FASTA"):
            # GFF3: everything after this directive is embedded FASTA,
            # not annotation lines.
            break
        if line.startswith("#"):
            continue
        fields = line.split()
        if not fields:
            # Blank line: nothing to parse.
            continue
        dico.setdefault(fields[0], []).append((int(fields[3]), int(fields[4])))

# For every annotated chromosome, find the stretches covered by no feature.
# Regions are stored as 0-based, half-open (start, end) offsets, i.e. exactly
# the values used to slice the chromosome string below.
# Only chromosomes that appear in the GFF are processed; a chromosome named in
# the GFF but absent from the FASTA raises KeyError.
dico_igr = {}
for chrom, features in dico.items():
    chrom_len = len(genome[chrom])
    # Build the set of covered 0-based offsets once, instead of rebuilding a
    # whole-chromosome set for every feature.
    covered = set()
    for feat_start, feat_end in features:
        # 1-based inclusive [start, end] -> 0-based offsets start-1 .. end-1,
        # clamped to the chromosome so bad coordinates cannot bloat the set.
        covered.update(range(max(feat_start - 1, 0), min(feat_end, chrom_len)))
    free = [pos for pos in range(chrom_len) if pos not in covered]
    # ranges() returns inclusive runs; +1 turns the last offset into an
    # exclusive end so that adjacent features yield no (empty) region.
    dico_igr[chrom] = [(first, last + 1) for first, last in ranges(free)]

# Write one FASTA record per intergenic region, named <chrom>_<start>-<end>
# with the same offsets that slice the sequence.
with open(out,"w") as fw:
    for chrom, regions in dico_igr.items():
        chrom_seq = genome[chrom]
        for igr_start, igr_end in regions:
            n = chrom + "_" + str(igr_start) + "-" + str(igr_end)
            fw.write(">{}\n{}\n".format(n, chrom_seq[igr_start:igr_end]))
        
        
    
        
    
    
    
    
    
    
    
    
    
    

        