#!/usr/bin/env python
import os
import sys
import re
import argparse
import random
import collections




# Command-line interface. RawDescriptionHelpFormatter prints the description
# and epilog verbatim, so their layout below is what the user sees.
parser = argparse.ArgumentParser(
    description="""
Description
-----------
    This script counts the reads of a SAM file in genomic bins of fixed size.
    Only reads with a length between 23 and 29 and a mapping quality of at
    least --min-mq are counted. For every bin the raw read count and the
    count scaled to one million of the value given by --miRNA are reported.""",
    formatter_class=argparse.RawDescriptionHelpFormatter,
    epilog="""
Prerequisites
-------------
    python version 3+

Authors
-------
    Robert Kofler 
""",
)

parser.add_argument("--binsize", type=int, required=True, dest="binsize",
                    help="size of the genomic bins (in bases)")
parser.add_argument("--miRNA", type=int, required=True, dest="mirna",
                    help="normalization constant; counts are reported per million "
                         "of this number (e.g. the number of miRNA reads)")
parser.add_argument("--min-mq", type=int, required=True, dest="minmq",
                    help="minimum mapping quality of a read")
parser.add_argument("--sam", type=argparse.FileType('r'), required=True, dest="sam",
                    help="the sam file")
args = parser.parse_args()

# Both values are used as divisors further down; reject zero (which would
# raise ZeroDivisionError) and negative values (which would give meaningless
# bins / normalized counts) with a regular usage error instead.
if args.binsize <= 0:
    parser.error("--binsize must be a positive integer")
if args.mirna <= 0:
    parser.error("--miRNA must be a positive integer")

binsize = args.binsize
mirna = float(args.mirna)
# Inclusive read-length window; reads shorter than minlen or longer than
# maxlen are ignored when counting.
minlen = 23
maxlen = 29


"""
0                                       1   2   3   4   5   
HISEQ:226:C9M93ANXX:3:1101:19614:50461	16	2L	125	25	29M	*	0	0	ATTGGATTGGAAGTGTGTGCGTTCGGTTA	FFFFFFFFFFFFFFFFFFFFFFFBBBBBB	AS:i:0	UQ:i:0	NM:i:0	MD:Z:29	PG:Z:novoalign
HISEQ:226:C9M93ANXX:3:1102:20337:86794	16	2L	125	25	29M	*	0	0	ATTGGATTGGAAGTGTGTGCGTTCGGTTA	/FB/<///<BFBFBBFFFFB<<F/BBBBB	AS:i:0	UQ:i:0	NM:i:0	MD:Z:29	PG:Z:novoalign
"""

# Read counts per bin: ph[reference name][bin index] -> number of reads.
ph = collections.defaultdict(lambda: collections.defaultdict(int))

for line in args.sam:
    # Skip everything that is not an alignment record: SAM header lines
    # ('@HD', '@SQ', ...), '#' comment lines and blank lines.
    if line.startswith(("@", "#")) or not line.strip():
        continue
    fields = line.rstrip("\n").split("\t")
    chrom = fields[2]
    # A '*' reference name means the read has no mapping position, so it
    # cannot be assigned to a bin.
    if chrom == "*":
        continue
    pos = int(fields[3])
    mq = int(fields[4])
    if mq < args.minmq:
        continue
    readlen = len(fields[9])
    if readlen < minlen or readlen > maxlen:
        continue
    # Integer bin index, e.g. pos=112 and binsize=100 -> bin 1.
    # Floor division keeps the computation in exact integer arithmetic.
    ph[chrom][pos // binsize] += 1

# Print one tab-separated line per bin that received at least one read:
# reference name, bin start, bin end (inclusive), raw count, normalized count.
# Bins are sorted by index within each reference.
for chrom, bins in ph.items():
    for bin_index in sorted(bins):
        raw = bins[bin_index]
        # Scale the raw count to one million of the --miRNA value.
        per_million = float(raw) * 1e6 / mirna
        # e.g. bin index 1 with binsize 100 covers positions 100..199
        first = bin_index * binsize
        last = first + binsize - 1
        print("{0}\t{1}\t{2}\t{3}\t{4}".format(chrom, first, last, raw, per_million))