#!/usr/bin/env python
from ntpath import realpath
import os
import sys
import inspect
import re
import argparse
import random
import math
import collections
import fileinput
# CIGAR operation pattern (length followed by operation code).
# NOTE(review): not referenced anywhere in the visible script; kept in case
# other code relies on the name.
prog = re.compile(r"(\d+)([MISDHN])")

# Edit-distance tag (NM:i:<n>) searched for in each alignment line.
NM_TAG = re.compile(r"NM:i:(\d+)")

# Reference name (without the "_te" suffix) whose reads are reported.
P_ELEMENT = "PPI251"
TE_SUFFIX = "_te"
# Inclusive read-length windows used for the two reported classes.
PIRNA_RANGE = (23, 29)
SIRNA_RANGE = (20, 22)
# Reference-name endings that are accepted but not counted.
OTHER_KNOWN_SUFFIXES = (
    "_miRNA",
    "_rRNA", "_rRNA;",
    "_tRNA",
    "_snoRNA;", "_snoRNA", "_snRNA;", "_snRNA",
    "_mRNA",
)


def build_parser():
    """Return the command-line parser of this script."""
    parser = argparse.ArgumentParser(description="""
Description
-----------
Summary statistics
""", formatter_class=argparse.RawDescriptionHelpFormatter,
epilog="""
Abundance of small RNAs mapping to the P-element (PPI251), normalized to a
million piRNA-sized reads (23-29nt) mapping to any TE.
piRNA: 23-29nt
siRNA: 20-22nt


Authors
-------
    Robert Kofler
""")
    parser.add_argument('--sam', type=argparse.FileType('r'), default=None, dest="sam", required=True, help="A sam file")
    parser.add_argument("--min-mq", type=int, required=False, dest="minmq", default=0, help="min mapping quality")
    parser.add_argument("--max-mm", type=int, required=False, dest="maxmm", default=10, help="max mismatches")
    # The option is required, so a default would never be used.
    parser.add_argument("--sample-id", type=str, required=True, dest="sid", help="the sample id")
    return parser


def _in_range(value, bounds):
    """Return True if *value* lies within the inclusive (low, high) *bounds*."""
    low, high = bounds
    return low <= value <= high


def count_p_element_reads(lines, minmq=0, maxmm=10):
    """Count normalizing reads and P-element piRNA/siRNA reads in SAM lines.

    lines -- iterable of SAM text lines (header lines and blank lines are
             skipped)
    minmq -- reads with a mapping quality below this value are discarded
    maxmm -- reads with an NM:i value above this value are discarded; reads
             without an NM:i tag are discarded as well

    Returns a tuple (pirnanorm, ppicount, psicount):
      pirnanorm -- reads of 23-29 nt mapped to any reference ending in "_te"
      ppicount  -- reads of 23-29 nt mapped to PPI251_te
      psicount  -- reads of 20-22 nt mapped to PPI251_te

    Raises Exception if a read passing the filters maps to a reference whose
    name has none of the known endings.
    """
    pirnanorm = 0
    ppicount = 0
    psicount = 0

    for line in lines:
        # Header lines start with '@' and have no alignment columns; parsing
        # them as alignments would fail on the integer conversions below.
        if not line.strip() or line.startswith("@"):
            continue

        fields = line.rstrip("\n").split("\t")

        # discard unmapped
        if int(fields[1]) & 0x004:
            continue

        # discard low mapping quality
        if int(fields[4]) < minmq:
            continue

        # discard reads without mismatch information or with too many mismatches
        nm = NM_TAG.search(line)
        if nm is None or int(nm.group(1)) > maxmm:
            continue

        ref = fields[2]
        readlen = len(fields[9])

        if ref.endswith(TE_SUFFIX):
            # Normalization always uses the 23-29 nt reads of all TEs, so
            # this count is taken before looking at the specific TE.
            if _in_range(readlen, PIRNA_RANGE):
                pirnanorm += 1
            if ref[:-len(TE_SUFFIX)] == P_ELEMENT:
                if _in_range(readlen, PIRNA_RANGE):
                    ppicount += 1
                elif _in_range(readlen, SIRNA_RANGE):
                    psicount += 1
        elif not ref.endswith(OTHER_KNOWN_SUFFIXES):
            raise Exception("Unknown sequence end "+ ref)

    return pirnanorm, ppicount, psicount


def normalize_per_million(count, total):
    """Return *count* scaled to one million of *total*.

    Raises ValueError if *total* is zero, because the normalized value is
    undefined without any normalizing reads.
    """
    if total == 0:
        raise ValueError(
            "Cannot normalize: no TE reads of 23-29 nt passed the filters")
    return 1000000.0*(float(count)/float(total))


def main(argv=None):
    """Parse the command line, process the SAM file and print the two result rows.

    argv -- list of command-line arguments; None means sys.argv[1:]

    Exits with an error message if no normalizing reads were found.
    """
    args = build_parser().parse_args(argv)

    # argparse already opened the file; make sure it is closed again.
    with args.sam as handle:
        pirnanorm, ppicount, psicount = count_p_element_reads(
            handle, args.minmq, args.maxmm)

    try:
        ppinorm = normalize_per_million(ppicount, pirnanorm)
        psinorm = normalize_per_million(psicount, pirnanorm)
    except ValueError as err:
        sys.exit(str(err))

    sid = args.sid
    print("{0}\t{1}\t{2}\t{3}".format(sid, "P", "piRNA", ppinorm))
    print("{0}\t{1}\t{2}\t{3}".format(sid, "P", "siRNA", psinorm))


if __name__ == "__main__":
    main()









