#!/usr/bin/env python

"""This program is used to calculate PU score for corresponding RNA regions of introns
Usage: %prog <rnafold_dp_ps_dir> <secstr_feature_file>
"""

import gzip
import os
import re
from collections import defaultdict
import sys
import numpy as np


def open_file(in_file, mode='r'):
    """Open a plain or gzip-compressed file, defaulting to text mode.

    Args:
        in_file: Path of the file to open. A name ending in ``.gz`` is
            opened through ``gzip``; anything else through built-in ``open``.
        mode: File mode. For gzip files, text mode (``'t'``) is added
            unless the caller already asked for binary or text explicitly.

    Returns:
        An open file object; the caller is responsible for closing it.
    """
    # A literal suffix test: the previous regex '.gz$' had an unescaped dot
    # and therefore also matched names such as 'datagz'.
    if in_file.endswith('.gz'):
        # gzip.open defaults to binary; request text mode only when the
        # caller has not chosen one, otherwise 'rb' would become 'rbt',
        # which gzip.open rejects.
        if 'b' not in mode and 't' not in mode:
            mode += 't'
        return gzip.open(in_file, mode=mode)
    return open(in_file, mode=mode)


def calculate_PU(dp_file=''):
    """Compute the per-base unpaired probability (PU) from an RNAfold dot plot.

    The file is expected to contain a ``/sequence { (\\`` header followed by
    one or more sequence lines, each terminated by a backslash, and base-pair
    records of the form ``i j sqrt(p) ubox`` with 1-based positions.

    Args:
        dp_file: Path to a ``*_dp.ps`` dot-plot file.

    Returns:
        A list with one float per sequence position, in sequence order,
        holding ``1 - sum(pair probabilities involving that position)``.
        The list is empty when the file contains no sequence block.

    Raises:
        KeyError: If a ``ubox`` record refers to a position outside the
            sequence (or appears before the sequence block).
        ValueError: If a ``ubox`` record does not have exactly three fields
            before the ``ubox`` keyword.
    """
    paired_prob = {}
    with open(dp_file) as fh:
        for line in fh:
            line = line.strip()
            if line == "/sequence { (\\":
                # The sequence may be wrapped over several lines, each one
                # ending in a backslash; the block ends at the first line
                # without that continuation marker (") } def").
                seq_len = 0
                for seq_line in fh:
                    seq_line = seq_line.strip()
                    if not seq_line.endswith('\\'):
                        break
                    seq_len += len(seq_line) - 1
                # ubox records number bases from 1, so key the table the
                # same way; a 0-based table would add a phantom position
                # that is never paired and always scores PU == 1.
                paired_prob = {pos: 0.0 for pos in range(1, seq_len + 1)}
            elif line.endswith('ubox') and not line.startswith('%'):
                start, end, prob = line.split()[:-1]
                # The stored value is the square root of the pair
                # probability, hence the squaring.
                pair_prob = float(prob) * float(prob)
                paired_prob[int(start)] += pair_prob
                paired_prob[int(end)] += pair_prob
    pu_score = [1 - paired_prob[pos] for pos in sorted(paired_prob)]
    return pu_score


def get_region_PU(dp_dir='', out_file=''):
    """Summarise PU scores of every intron sub-region into a TSV table.

    File names in ``dp_dir`` are expected to look like
    ``chr10_127000307_127008731_+I_3p_141_210_dp.ps``: the first three
    underscore-separated fields plus the first character of the fourth form
    the intron ID (``chr10_127000307_127008731_+``), and the remainder
    (``I_3p_141_210``) names the sub-region.

    For each sub-region two columns are written, ``SecStr.max<sub>`` and
    ``SecStr.avg<sub>``, holding the maximum and mean PU score returned by
    ``calculate_PU``. Combinations without a dot-plot file, or whose file
    yields no scores, are reported on stdout and written as ``NA``.

    Args:
        dp_dir: Directory containing the ``*_dp.ps`` files.
        out_file: Path of the tab-separated output table.
    """
    suffix = '_dp.ps'
    # Exact suffix match; the earlier regexes had an unescaped dot and an
    # unanchored substitution.
    stems = [name[:-len(suffix)] for name in os.listdir(dp_dir)
             if name.endswith(suffix)]

    introns = set()
    subs = set()
    for stem in stems:
        fields = stem.split('_')
        introns.add('_'.join(fields[:3] + [fields[3][0]]))
        subs.add('_'.join([fields[3][1:]] + fields[4:]))
    # Sort so that column and row order do not depend on set iteration
    # order, which varies between interpreter runs.
    intron_list = sorted(introns)
    sub_list = sorted(subs)
    print(sub_list)

    feature2intron2pu = {}
    for sub in sub_list:
        for stat in ('max', 'avg'):
            feature2intron2pu['SecStr.{}{}'.format(stat, sub)] = {}

    for intron in intron_list:
        for sub in sub_list:
            dp_path = os.path.join(
                dp_dir, '{}{}{}'.format(intron, sub, suffix))
            # Introns and sub-regions are collected independently, so a
            # given combination is not guaranteed to exist on disk.
            pu_score = calculate_PU(dp_path) if os.path.isfile(dp_path) else []
            if not pu_score:
                print(intron)
                continue
            feature2intron2pu['SecStr.max' + sub][intron] = max(pu_score)
            feature2intron2pu['SecStr.avg' + sub][intron] = np.average(pu_score)

    features = list(feature2intron2pu)
    with open(out_file, 'w') as output:
        output.write('\t'.join(['Intron_ID'] + features) + '\n')
        for intron in intron_list:
            # Skipped combinations have no entry; emit a placeholder
            # instead of failing with KeyError.
            row = [intron] + [str(feature2intron2pu[feature].get(intron, 'NA'))
                              for feature in features]
            output.write('\t'.join(row) + '\n')


def main():
    """Command-line entry point.

    Reads the dot-plot directory and the output file path from
    ``sys.argv[1]`` and ``sys.argv[2]`` and passes them to
    ``get_region_PU``. When either argument is missing, exits with the
    module docstring as the usage message.
    """
    try:
        rnafold_dp_ps_dir = sys.argv[1]
        secstr_feature_file = sys.argv[2]
    except IndexError:
        # Only a missing argument should trigger the usage message; a bare
        # except would also swallow KeyboardInterrupt and real errors.
        usage = __doc__
        if usage:
            # Expand the optparse-style placeholder used in the docstring.
            prog = os.path.basename(sys.argv[0]) if sys.argv else ''
            usage = usage.replace('%prog', prog)
        sys.exit(usage)

    get_region_PU(rnafold_dp_ps_dir, secstr_feature_file)


# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == '__main__':
    main()
