#!/usr/bin/env python
import os
import sys
import inspect
import re
import argparse
import random
import math
import collections
import fileinput
# Matches <length><operation> pairs such as those found in a SAM CIGAR string.
# NOTE(review): not referenced anywhere in the visible script; kept as-is.
prog = re.compile(r"(\d+)([MISDHN])")


parser = argparse.ArgumentParser(description="""           
Description
-----------
Summary statistics
""",formatter_class=argparse.RawDescriptionHelpFormatter,
epilog="""
miRNA: 21-23nt
piRNA: 23-28nt


Authors
-------
    Robert Kofler
""")
# Required options carry no default: argparse never uses it when required=True.
parser.add_argument("--sam", type=argparse.FileType("r"), dest="sam", required=True, help="A sam file")
parser.add_argument("--min-mq", type=int, required=False, dest="minmq", default=0, help="min mapping quality")
parser.add_argument("--max-mm", type=int, required=False, dest="maxmm", default=10, help="max mismatches")
parser.add_argument("--sample-id", type=str, required=True, dest="sid", help="the sample id")


# SAM FLAG bits tested by this script.
_FLAG_UNMAPPED = 0x4
_FLAG_REVERSE = 0x10

# The NM tag (number of mismatches) is anchored to the start of a
# tab-separated field so it cannot match inside a read name or another tag.
_NM_TAG = re.compile(r"\tNM:i:(\d+)")

# Reference-name suffixes that are recognised but not counted.
_IGNORED_SUFFIXES = (
    "_rRNA", "_rRNA;",
    "_tRNA",
    "_snoRNA;", "_snoRNA", "_snRNA;", "_snRNA", "_mRNA",
)

# Output order of the reference classes and of the two strands.
_CLASSES = ("P", "TE", "miRNA")
_STRANDS = ("s", "as")


def _classify(ref):
    """Map a reference name to its class label.

    Returns "P" for the reference "PPI251_te", "TE" for any other name ending
    in "_te", "miRNA" for names ending in "_miRNA", and None for the
    recognised-but-ignored suffixes (rRNA, tRNA, snoRNA, snRNA, mRNA).

    Raises ValueError for any other reference name.
    """
    if ref.endswith("_te"):
        return "P" if ref[:-3] == "PPI251" else "TE"
    if ref.endswith("_miRNA"):
        return "miRNA"
    if ref.endswith(_IGNORED_SUFFIXES):
        return None
    raise ValueError("Unknown sequence end " + ref)


def count_read_lengths(lines, minmq=0, maxmm=10):
    """Tally read lengths per reference class and strand.

    lines  -- iterable of SAM text lines (an open file works)
    minmq  -- reads with a mapping quality below this value are discarded
    maxmm  -- reads whose NM tag exceeds this value are discarded

    Header lines (starting with "@") and blank lines are skipped, as are
    unmapped reads (flag 0x4) and reads without an NM tag.

    Returns a dict: class label -> strand label -> Counter(read length -> n),
    where the strand label is "as" when flag 0x10 is set and "s" otherwise.

    Raises ValueError when a read that passed all filters maps to a reference
    whose name has an unknown suffix.
    """
    counts = dict(
        (cls, dict((strand, collections.Counter()) for strand in _STRANDS))
        for cls in _CLASSES
    )
    # Column layout of an alignment record (0-based):
    #   0 name, 1 flag, 2 reference, 3 position, 4 mapping quality, 5 CIGAR,
    #   6-8 mate information, 9 sequence, 10 qualities, 11+ optional tags, e.g.
    #   r1 16 M14653_te 172 70 23M * 0 0 ATGTC... FFFFF... PG:Z:novoalign AS:i:0 UQ:i:0 NM:i:0 MD:Z:23
    for line in lines:
        # Header and blank lines have no numeric flag column; parsing them
        # as alignment records would crash.
        if line.startswith("@") or not line.strip():
            continue
        fields = line.rstrip("\n").split("\t")

        flag = int(fields[1])
        if flag & _FLAG_UNMAPPED:
            continue
        if int(fields[4]) < minmq:
            continue

        nm = _NM_TAG.search(line)
        if nm is None or int(nm.group(1)) > maxmm:
            continue

        cls = _classify(fields[2])
        if cls is None:
            continue
        strand = "as" if flag & _FLAG_REVERSE else "s"
        counts[cls][strand][len(fields[9])] += 1
    return counts


def format_rows(sid, counts):
    """Render the tallies from count_read_lengths() as tab-separated rows.

    Each row holds: sample id, class, strand, "<class>-<strand>", read length,
    count, percentage. The percentage is relative to all counted reads of the
    class (both strands) and is negated for "as" rows. Rows are ordered by
    class (P, TE, miRNA), then strand (s, as), then ascending read length.
    """
    rows = []
    for cls in _CLASSES:
        per_strand = counts[cls]
        total = sum(sum(per_strand[strand].values()) for strand in _STRANDS)
        for strand in _STRANDS:
            by_length = per_strand[strand]
            for readlen in sorted(by_length):
                n = by_length[readlen]
                # total > 0 here: this loop only runs if the class has reads.
                percent = 100.0 * n / float(total)
                if strand == "as":
                    percent = -percent
                rows.append("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}".format(
                    sid, cls, strand, cls + "-" + strand, readlen, n, percent))
    return rows


def main(argv=None):
    """Parse the command line, summarise the SAM file and print the table."""
    args = parser.parse_args(argv)
    # Close the input once it has been read instead of leaking the handle.
    with args.sam as sam:
        counts = count_read_lengths(sam, args.minmq, args.maxmm)
    for row in format_rows(args.sid, counts):
        print(row)


if __name__ == "__main__":
    main()



