import re
import pandas as pd
import natsort as ns

# value of the `L` wildcard; reported below as the minimum variant size
min_variant_size = snakemake.wildcards.L

# retrieve species order: only the first line of the file is used.
# The line terminator is removed because this value is later written as a
# single CSV field; an embedded newline would split the output record.
with open(snakemake.input.order) as file_object:
    species_order = file_object.readline().rstrip('\n')

# based on: https://www.vipinajayakumar.com/parsing-text-with-python/#step-3-define-regular-expressions
rx_dict = {
    'n_segments': re.compile(r'Number of segments: (?P<n_segments>\d*)\n'),
    'n_edges': re.compile(r'Number of links: (?P<n_edges>\d*)\n'),
    'total_pangenome_size': re.compile(r'Total segment length: (?P<total_pangenome_size>\d*)\n'),
    'backbone_size': re.compile(r'Sum of rank-0 segment lengths: (?P<backbone_size>\d*)\n')
}

def _parse_line(line):
    """
    Do a regex search against all defined regexes and
    return the key and match result of the first matching regex
    
    """
    for key, rx in rx_dict.items():
        match = rx.search(line)
        if match:
            return key, match
    return None, None

# scan the statistics report line by line and keep the metrics we recognise
with open(snakemake.input.stat) as stat_handle:
    for stat_line in stat_handle:
        field, hit = _parse_line(stat_line)
        if field is None:
            # line does not carry any of the metrics of interest
            continue

        # every pattern names its capture group after its own key
        value = int(hit.group(field))
        if field == 'n_segments':
            n_segments = value
        elif field == 'n_edges':
            n_edges = value
        elif field == 'total_pangenome_size':
            graph_size = value
        elif field == 'backbone_size':
            bb_size = value

# get segment, segment length, genome rank
segment_columns = ['segment', 'length', 'rank']
df_segments = pd.read_table(snakemake.input.segments, names=segment_columns)

# naturally sort segment column: turn it into an ordered categorical whose
# category order is the natural sort of the distinct segment names
natural_order = ns.natsorted(df_segments['segment'].unique())
df_segments['segment'] = pd.Categorical(
    df_segments['segment'],
    categories=natural_order,
    ordered=True,
)

# extract a list of flex segments by taking the middle segments of the bubbles
# (every comma-separated field of a line except the first and the last one),
# then add this information to the main dataframe
flex_segments = []
with open(snakemake.input.bubble) as bubble_handle:
    # iterate over the file directly so every line is treated the same way and
    # a blank line cannot end the scan early
    for bubble_line in bubble_handle:
        flex_segments.extend(bubble_line.rstrip('\n').split(',')[1:-1])

# segments are flagged as core unless they occur in the middle of a bubble
df_segments['core_bool'] = True
df_segments.loc[df_segments['segment'].isin(flex_segments), 'core_bool'] = False

# using the existing information, we can calculate most statistics that we want
# note that core stats include core regions unalignable by query
is_core = df_segments['core_bool'].astype(bool)

linear_size = df_segments.loc[is_core, 'length'].sum()
linear_count = int(is_core.sum())

variable_size = df_segments.loc[~is_core, 'length'].sum()
# the variable count is derived from the segment total of the stat report,
# not from the number of rows in the segment table
variable_count = n_segments - linear_count

# count the number of bubbles (one per line of the bubble file); the context
# manager makes sure the file handle is closed again
with open(snakemake.input.bubble) as bubble_handle:
    n_bubbles = sum(1 for _ in bubble_handle)

# write results as a single comma-separated record
items_to_write = (
    [
        species_order, min_variant_size,
        n_segments, n_edges, n_bubbles,
        bb_size, graph_size,
        linear_size, linear_count, variable_size, variable_count
    ] +
    # describe() starts with the count, which is dropped here
    df_segments['length'].describe().tolist()[1:] # mean, std, q0, q25, q50, q75, q100
)
with open(snakemake.output.out, 'w') as o:
    o.write(','.join([str(x) for x in items_to_write]) + '\n')

# import benchmark results, and add metadata fields
# NOTE(review): columns 0, 5 and 9 are presumably the Snakemake benchmark
# fields for wall-clock seconds, max PSS and CPU time - confirm against the
# file that produces this input
tmp = []  # one dict per benchmark record; converted to a dataframe below
with open(snakemake.input.bmk) as bmk_handle:
    for iteration, record in enumerate(bmk_handle):
        if iteration == 0:
            # the first line is skipped, so iterations are numbered from 1
            continue
        fields = record.strip('\n').split('\t')
        tmp.append({
            'species_order': species_order,
            'min_variant_size': min_variant_size,
            'iteration': iteration,
            'cpu_time': float(fields[9]),
            'wallclock_time': float(fields[0]),
            'max_pss': float(fields[5]),
        })

benchmark_table = pd.DataFrame(tmp)
benchmark_table.to_csv(snakemake.output.bmk, index=False)
