import pandas as pd
from pybedtools import BedTool

# function to read in a BED6 file: chr, start, end, name, age, strand
# where col4 is te_class:te_subclass:te_id:repmask_id
def read_and_parse_BED6(path_to_file, classes_to_remove, size_filter):
    """Read a six-column BED file and split its name column into TE fields.

    Column names are supplied explicitly as chr, start, end, name, age and
    strand. The name column is split on ':' into te_class, te_subclass,
    te_id and repmask_id; rows are then filtered and two te_class labels
    are renamed ('RC' -> 'Helitron', 'SINE?' -> 'SINE').

    Args:
        path_to_file: Path or file-like object handed to ``pd.read_table``.
        classes_to_remove: Collection of te_class values to drop. The check
            runs before the renaming step, so it is compared against the
            labels as they are written in the file.
        size_filter: Rows are kept only when ``end - start`` is strictly
            greater than this value.

    Returns:
        A new DataFrame holding the six BED columns plus the four TE
        columns. Name fields that are absent from a row are left null.

    Raises:
        ValueError: If any name holds more than four ':'-separated fields.
    """
    bed_columns = ['chr', 'start', 'end', 'name', 'age', 'strand']
    te_columns = ['te_class', 'te_subclass', 'te_id', 'repmask_id']
    te_class_renames = {'RC': 'Helitron', 'SINE?': 'SINE'}

    output_df = pd.read_table(
        path_to_file,
        names=bed_columns,
        dtype={
            'chr': str, 'start': int, 'end': int,
            'name': str, 'age': float, 'strand': str,
        },
    )

    # extract info about TE from name column
    te_fields = output_df['name'].str.split(':', expand=True)
    if te_fields.shape[1] > len(te_columns):
        raise ValueError(
            "name column has more than {} ':'-separated fields".format(len(te_columns))
        )
    # expand=True only produces as many columns as the longest name, so an
    # empty file (zero columns) or names that are all short would not line up
    # with the four target columns; pad to exactly four before assigning.
    te_fields = te_fields.reindex(columns=range(len(te_columns)))
    te_fields.columns = te_columns
    output_df[te_columns] = te_fields

    # filter out simple repeats and unwanted entries based on config file
    output_df = output_df.query("(te_class not in @classes_to_remove) & ((end - start)>@size_filter)").copy()

    # harmonise te_class labels in a single pass
    output_df['te_class'] = [te_class_renames.get(x, x) for x in output_df['te_class']]
    return output_df

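# function to read in a BED file intersected with the -wao option
# chr, start, end, name, age, strand, followed by the intersected feature's
# sv_chr, sv_start, sv_end, sv_name, sv_len, sv_strand and the overlap_len
# where col4 is te_class:te_subclass:te_id:repmask_id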
def read_and_parse_BED_wao(path_to_file, classes_to_remove, size_filter):
    """Read a thirteen-column intersect table and split its name column into TE fields.

    Column names are supplied explicitly: chr, start, end, name, age, strand
    for the TE, then sv_chr, sv_start, sv_end, sv_name, sv_len, sv_strand
    for the intersected feature, and finally overlap_len. The name column is
    split on ':' into te_class, te_subclass, te_id and repmask_id; rows are
    then filtered and two te_class labels are renamed ('RC' -> 'Helitron',
    'SINE?' -> 'SINE').

    Args:
        path_to_file: Path or file-like object handed to ``pd.read_table``.
        classes_to_remove: Collection of te_class values to drop. The check
            runs before the renaming step, so it is compared against the
            labels as they are written in the file.
        size_filter: Rows are kept only when ``end - start`` is strictly
            greater than this value.

    Returns:
        A new DataFrame holding the thirteen input columns plus the four TE
        columns. Name fields that are absent from a row are left null.

    Raises:
        ValueError: If any name holds more than four ':'-separated fields.
    """
    wao_columns = [
        'chr', 'start', 'end', 'name', 'age', 'strand',
        'sv_chr', 'sv_start', 'sv_end', 'sv_name', 'sv_len', 'sv_strand',
        'overlap_len',
    ]
    te_columns = ['te_class', 'te_subclass', 'te_id', 'repmask_id']
    te_class_renames = {'RC': 'Helitron', 'SINE?': 'SINE'}

    # NOTE(review): sv_start/sv_end are kept as strings while sv_len and
    # overlap_len are parsed as integers; a non-numeric placeholder in sv_len
    # would make the read fail -- confirm against the files this is fed.
    output_df = pd.read_table(
        path_to_file,
        names=wao_columns,
        dtype={
            'chr': str, 'start': int, 'end': int,
            'name': str, 'age': float, 'strand': str,
            'sv_chr': str, 'sv_start': str, 'sv_end': str,
            'sv_name': str, 'sv_len': int, 'sv_strand': str,
            'overlap_len': int,
        },
    )

    # extract info about TE from name column
    te_fields = output_df['name'].str.split(':', expand=True)
    if te_fields.shape[1] > len(te_columns):
        raise ValueError(
            "name column has more than {} ':'-separated fields".format(len(te_columns))
        )
    # expand=True only produces as many columns as the longest name, so an
    # empty file (zero columns) or names that are all short would not line up
    # with the four target columns; pad to exactly four before assigning.
    te_fields = te_fields.reindex(columns=range(len(te_columns)))
    te_fields.columns = te_columns
    output_df[te_columns] = te_fields

    # filter out simple repeats and unwanted entries based on config file
    output_df = output_df.query("(te_class not in @classes_to_remove) & ((end - start)>@size_filter)").copy()

    # harmonise te_class labels in a single pass
    output_df['te_class'] = [te_class_renames.get(x, x) for x in output_df['te_class']]
    return output_df

# function to perform a Bedtools merge operation to collapse overlapping ranges
# and calculate the total length of these non-overlapping ranges (GenomicRanges::reduce)
def get_non_overlapping_total_length(bed_df):
    """Return the summed length of the merged chr/start/end ranges in bed_df.

    The chr, start and end columns are passed through a BedTool sort and
    merge, and the lengths (end - start) of the resulting ranges are added
    up. A frame without rows gives 0 without touching BedTool.
    """
    # Guard clause: nothing to merge when there are no rows.
    if len(bed_df.index) == 0:
        return 0

    coordinates = bed_df[['chr', 'start', 'end']]
    merged_ranges = BedTool.from_dataframe(coordinates).sort().merge()
    merged_df = merged_ranges.to_dataframe()
    range_lengths = merged_df['end'] - merged_df['start']
    return sum(range_lengths)