#!/usr/bin/env python3
#Aidan Manning 4.2.19
#Small Script to annotate sailor output files with feature and geneID of called editing sites

import argparse
import sys
import os
import pandas as pd
import pybedtools as bedtools

# Column names given to the `intersect` output: the six columns of the site
# BED record followed by the nine columns of the GTF record it was paired with.
SITE_COLUMNS = ['chr', 'pos-1', 'pos', 'coverage', 'conf', 'strand']
GTF_COLUMNS = ['chrGTF', 'source', 'feature', 'pos_1_GTF', 'pos_2_GTF', 'dot',
               'strandGTF', 'dot2', 'info']

# GTF columns that are not needed once the intersection has been made.
UNUSED_GTF_COLUMNS = ['chrGTF', 'source', 'pos_1_GTF', 'pos_2_GTF', 'dot',
                      'strandGTF', 'dot2']

# Rank of each feature type; when a site overlaps several annotations the one
# with the highest rank is kept. Feature types not listed here get no rank
# (NaN) and therefore lose against every listed type.
FEATURE_PRIORITY = {
    'gene': 0,
    'exon': 1,
    'three_prime_utr': 2,
    'five_prime_utr': 3,
    'CDS': 4,
}

# Matches the gene_id attribute of a GTF attribute string, e.g.
# 'gene_id "WBGene00000001"; transcript_id "..."' -> WBGene00000001
GENE_ID_PATTERN = r'\bgene_id\s+"([^"]+)"'


def extract_wb_ids(info):
    """Return the gene_id value found in each GTF attribute string.

    Parameters:
        info: pandas Series holding the GTF attribute column.

    Returns:
        A Series (same index as `info`) with the quoted gene_id value, or an
        empty string for entries that carry no gene_id attribute (such as the
        '.' placeholder written for sites without any overlapping annotation).
    """
    # The attribute is located by name instead of by fixed character offsets,
    # so neither its position in the string nor the ID length matters.
    # astype(str) keeps the .str accessor usable if a value is not a string.
    return info.astype(str).str.extract(GENE_ID_PATTERN, expand=False).fillna('')


def pick_best_feature(sites):
    """Reduce the intersect table to one row per site.

    Parameters:
        sites: DataFrame with the columns SITE_COLUMNS + GTF_COLUMNS, one row
            per (site, overlapping annotation) pair.

    Returns:
        A DataFrame with one row per distinct site (SITE_COLUMNS), keeping the
        annotation whose feature type ranks highest in FEATURE_PRIORITY. The
        helper column 'numerical' (the rank) is still present.
    """
    trimmed = sites.drop(columns=UNUSED_GTF_COLUMNS)
    trimmed['numerical'] = trimmed['feature'].map(FEATURE_PRIORITY)

    # Highest rank first; unranked (NaN) rows are placed last by sort_values.
    # A stable sort keeps rows of equal rank in their incoming order, so the
    # annotation chosen on a tie does not depend on sort internals.
    ranked = trimmed.sort_values(by='numerical', ascending=False, kind='mergesort')

    return ranked.drop_duplicates(subset=SITE_COLUMNS, keep='first')


def annotate_sites(sites, wbgene_to_geneID):
    """Build the final annotation table from the intersect table.

    Parameters:
        sites: DataFrame with the columns SITE_COLUMNS + GTF_COLUMNS.
        wbgene_to_geneID: DataFrame with the columns 'wbID' and 'geneID'.

    Returns:
        A DataFrame with the columns SITE_COLUMNS + ['feature', 'wbID',
        'geneID'], one row per site. 'geneID' is NaN where the wbID has no
        entry in `wbgene_to_geneID` (left merge).
    """
    best = pick_best_feature(sites)

    annotated = best.drop(columns=['info', 'numerical'])
    annotated['wbID'] = extract_wb_ids(best['info'])

    # Sites without an overlapping annotation carry '.' as feature and end up
    # with an empty wbID; both are labelled 'intergenic'.
    # NOTE(review): the replacement is applied to every column, as before;
    # confirm that no site column can legitimately contain '.' or ''.
    annotated = annotated.replace({'.': 'intergenic', '': 'intergenic'})

    # A site whose best annotation is only the gene record itself overlaps
    # none of the ranked sub-features and is reported as 'intron'.
    annotated['feature'] = annotated['feature'].replace('gene', 'intron')

    # Left merge so that sites without a known geneID are kept.
    return annotated.merge(wbgene_to_geneID, how='left', on=['wbID'])


if __name__ == '__main__':

    ap = argparse.ArgumentParser(
        prog=os.path.basename(sys.argv[0]),
        usage=__doc__,
        description='Annotate sailor output files with feature and geneID of called editing sites')
    ap.add_argument('--gtf', required=True, help='GTF file containing annotations; can be any GTF file from wormbase')
    ap.add_argument('--fwd', required=True, help='fwd bed file from sailor output')
    ap.add_argument('--rev', required=True, help='rev bed file from sailor output')
    ap.add_argument('--wb', required=True, help='wb file containing wb to geneID annotations')
    ap.add_argument('--o', required=True, help='path for output file')
    args = ap.parse_args()

    # Read the input files.
    gtf = bedtools.BedTool(args.gtf)
    fwd = bedtools.BedTool(args.fwd)
    rev = bedtools.BedTool(args.rev)
    wbgene_to_geneID = pd.read_csv(args.wb, header=None, names=['wbID', 'geneID'])

    # Concatenate the fwd and rev bed files without merging their intervals.
    merged = bedtools.BedTool.cat(fwd, rev, postmerge=False)

    # Intersect the sites with the GTF using the s and loj options.
    merged_gtf = merged.intersect(gtf, s=True, loj=True)

    # Convert to a pandas dataframe with named columns.
    df = merged_gtf.to_dataframe(header=None, names=SITE_COLUMNS + GTF_COLUMNS)

    # Pick one annotation per site and attach the geneIDs.
    merged_final = annotate_sites(df, wbgene_to_geneID)

    # Write the result to the output csv file.
    merged_final.to_csv(args.o)