#!/usr/bin/env python3
#Aidan Manning 4.2.19
#Small script to compute the percent of variant reads at mpileup-called sites and drop sites overlapping known SNPs

import argparse
import sys
import os
import pandas as pd

# Column layouts of the two header-less, tab-separated input files.
VCF_COLUMNS = ['chr', 'pos', 'id', 'ref', 'alt', 'qual', 'filter', 'info', 'tags', 'taginfo']
SNP_COLUMNS = ['chr', 'pos-1', 'pos', 'strand']

# Final column order of the output table.
OUTPUT_COLUMNS = ['chr', 'pos', 'ref', 'alt', 'coverage', '%variant', '#variant_reads']


def parse_args(argv=None):
    """Parse the command line.

    Args:
        argv: Argument list to parse; ``None`` means ``sys.argv[1:]``.

    Returns:
        The parsed namespace with attributes ``v``, ``snp`` and ``o``.
    """
    ap = argparse.ArgumentParser(
        prog=os.path.basename(sys.argv[0]),
        description='Compute the percentage of variant reads per mpileup site '
                    'and drop sites that overlap known SNPs.',
    )
    ap.add_argument('--v', required=True, help='vcf file converted to csv containing the output of mpileup')
    ap.add_argument('--snp', required=True, help='bed file containing the known SNPs')
    ap.add_argument('--o', required=True, help='path for output file')
    return ap.parse_args(argv)


def load_variants(path) -> pd.DataFrame:
    """Read the tab-separated, header-less VCF body into a DataFrame.

    ``chr`` is forced to ``str`` so that purely numeric contig names are not
    inferred as integers, which would make the later merge with the SNP table
    fail (or silently mismatch) when that table contains names such as ``X``.
    """
    return pd.read_csv(path, header=None, sep='\t', names=VCF_COLUMNS, dtype={'chr': str})


def load_known_snps(path) -> pd.DataFrame:
    """Read the known-SNP BED file and return its unique ``(chr, pos)`` pairs.

    Only the chromosome and the end coordinate (third BED column) are kept;
    the start coordinate and the fourth column are not needed to decide
    whether a site is a known SNP.
    """
    snp = pd.read_csv(path, header=None, sep='\t', names=SNP_COLUMNS, dtype={'chr': str})
    return snp[['chr', 'pos']].drop_duplicates()


def summarise_variants(v: pd.DataFrame) -> pd.DataFrame:
    """Derive coverage, variant read count and percent variant per site.

    Args:
        v: Table with at least the columns ``chr``, ``pos``, ``ref``, ``alt``,
            ``info`` and ``taginfo``.

    Returns:
        A new DataFrame with the columns listed in ``OUTPUT_COLUMNS``, one row
        per input row, sharing the input's index.

    Raises:
        ValueError: If a row has no ``DP=<int>`` entry in ``info`` or its
            sample field cannot be parsed as integers.
    """
    out = v[['chr', 'pos', 'ref', 'alt']].copy()

    # Look DP up by name instead of assuming it is the first INFO entry, so
    # records whose INFO starts with another key (e.g. "INDEL;IDV=..;DP=..")
    # are handled. The (?:^|;) anchor prevents matching keys ending in "DP".
    out['coverage'] = v['info'].str.extract(r'(?:^|;)DP=(\d+)', expand=False).astype(int)

    # The second ':'-separated sample entry holds four comma-separated counts;
    # the last two are summed as the variant-supporting reads.
    counts = v['taginfo'].str.split(':').str[1].str.split(',')
    variant_reads = counts.str[2].astype(int) + counts.str[3].astype(int)

    # Percentage of coverage that supports the variant, rounded to 2 decimals.
    out['%variant'] = (variant_reads / out['coverage'] * 100).round(2)
    out['#variant_reads'] = variant_reads
    return out[OUTPUT_COLUMNS]


def drop_known_snps(variants: pd.DataFrame, snp: pd.DataFrame) -> pd.DataFrame:
    """Return the rows of ``variants`` whose ``(chr, pos)`` is not in ``snp``.

    The match is made on position alone: a site counts as a known SNP no
    matter what the SNP table holds in its other columns. SNP keys are
    de-duplicated first so that the left merge stays one row per variant.

    Args:
        variants: Table with ``chr`` and ``pos`` columns.
        snp: Table of known SNPs with ``chr`` and ``pos`` columns.

    Returns:
        A new DataFrame with the same columns as ``variants``; the index holds
        the positional row numbers of the retained rows.
    """
    known = snp[['chr', 'pos']].drop_duplicates()
    flagged = variants.merge(known, how='left', on=['chr', 'pos'], indicator=True)
    novel = flagged[flagged['_merge'] == 'left_only']
    return novel.drop(columns='_merge')


def main(argv=None):
    """Run the pipeline: load inputs, compute stats, filter SNPs, write CSV."""
    args = parse_args(argv)

    variants = summarise_variants(load_variants(args.v))
    novel = drop_known_snps(variants, load_known_snps(args.snp))

    # The row index is written as the first column, as in earlier outputs of
    # this script, so downstream readers keep working.
    novel.to_csv(args.o)


if __name__ == '__main__':
    main()