#!/usr/bin/env python
import os
import sys
import inspect
import re
import argparse
import random
import math
import collections
import fileinput
# One CIGAR operation: a run length followed by a one-letter operation code.
# All nine operation codes of the SAM format are recognised.
prog = re.compile(r"(\d+)([MIDNSHP=X])")

# CIGAR operations that advance the position on the reference sequence.
_REF_CONSUMING_OPS = frozenset("MDN=X")


def get_end(start, cig):
    """Return the last reference position covered by an alignment.

    Parameters
    ----------
    start : int
        Leftmost reference position of the alignment (the SAM POS column).
    cig : str
        CIGAR string of the alignment, e.g. "23M" or "10M2D13M".

    Returns
    -------
    int
        start + (number of reference bases spanned) - 1. Only operations
        that consume the reference (M, D, N, = and X) contribute to the
        span; I, S, H and P do not. If no operation can be parsed from
        cig (e.g. "*"), the span is 0 and the result is start - 1.
    """
    alignmentleng = sum(
        int(count)
        for count, cigchar in prog.findall(cig)
        if cigchar in _REF_CONSUMING_OPS
    )
    return start + alignmentleng - 1

def getchrlist(chrstr):
    """Split a comma-separated string of chromosome names into a list.

    A string without any comma yields a one-element list holding the
    string itself.
    """
    # str.split already returns [chrstr] when no separator is present,
    # so no explicit check for a comma is needed.
    return chrstr.split(",")

# Command line interface. The description and epilog are printed verbatim
# (RawDescriptionHelpFormatter keeps their line breaks).
parser = argparse.ArgumentParser(description="""           
Description
-----------
Summary statistics
""",formatter_class=argparse.RawDescriptionHelpFormatter,
epilog="""
miRNA: 21-23nt
piRNA: 23-28nt


Authors
-------
    Robert Kofler
""")

# Required options. They deliberately carry no default value: argparse aborts
# when a required option is missing, so a default could never take effect.
# argparse.FileType('r') opens the given path for reading while parsing.
parser.add_argument("--sam", type=argparse.FileType('r'), dest="sam", required=True, help="A sam file")
parser.add_argument("--sample-id", type=str, required=True, dest="sid", help="the sample id; several ids may be separated by ',' ")

# Optional read filters.
parser.add_argument("--min-mq", type=int, required=False, dest="minmq", default=0, help="min mapping quality")
parser.add_argument("--max-mm", type=int, required=False, dest="maxmm", default=10, help="max mismatches")
parser.add_argument("--min-length", type=int, required=False, dest="minsize", default=23, help="the minimum size")
parser.add_argument("--max-length", type=int, required=False, dest="maxsize", default=29, help="the maximum size")

# Optional binning and normalization settings.
parser.add_argument("--chromosomes", type=str, required=False, dest="chroms", default="X,2L,2R,3L,3R,4", help="the chromosomes")
parser.add_argument("--window-size", type=int, required=False, dest="window", default=1000, help="the window size")
parser.add_argument("--mirna", type=int, required=False, dest="mirna", default=None, help="the number of mirnas, used for normalization, in ppm")

args = parser.parse_args()
# Copy the parsed options into the short module-level names used below.
minmq, maxmm = args.minmq, args.maxmm
pistart, piend = args.minsize, args.maxsize
win = args.window
mirna = args.mirna

# Chromosomes to report: the list keeps the requested output order,
# the set is used for membership tests while reading the SAM file.
chrlist = getchrlist(args.chroms)
chrset = set(chrlist)

# Several sample ids may be given separated by ','; each comma is
# replaced by a tab so that every id ends up in its own output column.
sid = args.sid.replace(",", "\t")

# Read counts per chromosome and window index, missing entries count as 0:
# ps holds reads without SAM flag 0x10, pas holds reads with flag 0x10.
ps = collections.defaultdict(lambda: collections.defaultdict(int))
pas = collections.defaultdict(lambda: collections.defaultdict(int))

# Extracts the number from the NM:i:<number> field of a SAM line.
reo = re.compile(r"NM:i:(\d+)")


          


# Count the reads that pass all filters, per chromosome, window and strand.
#
# Example SAM records (tab separated), column indices as used for a[...]:
# 0   1   2          3    4   5    6  7  8  9                            10                           11
# r1  16  M14653_te  172  70  23M  *  0  0  ATGTCGAGTTTCGTGCCGAATAA      FFFFFFFFFFFFFFFFFFBBBBB      PG:Z:novoalign  AS:i:0  UQ:i:0  NM:i:0  MD:Z:23
# r2  0   M14653_te  240  70  27M  *  0  0  AACAGCTGCGGAATCGCACCGAATGCT  BBBBBFFFFFBFFFFFFFFFFFFFFFF  PG:Z:novoalign  AS:i:0  UQ:i:0  NM:i:0  MD:Z:27
for line in args.sam:
    # skip SAM header lines ("@HD", "@SQ", ...) and blank lines: they have no
    # numeric FLAG column and would abort the run when converted with int()
    if line.startswith("@") or not line.strip():
        continue

    a = line.rstrip("\n").split("\t")

    # discard unmapped
    flag = int(a[1])
    if flag & 0x4:
        continue

    # discard low mapping quality
    mq = int(a[4])
    if mq < minmq:
        continue

    # discard reads without an NM:i: field and reads with too many mismatches
    mo = reo.search(line)
    if mo is None:
        continue
    mm = int(mo.group(1))
    if mm > maxmm:
        continue

    # discard reads on chromosomes that were not requested
    chrom = a[2]
    if chrom not in chrset:
        continue

    # discard reads that are too small or large
    readlen = len(a[9])
    if readlen < pistart or readlen > piend:
        continue

    reverse = bool(flag & 0x10)
    pos = int(a[3])
    if reverse:
        # for reads with flag 0x10 the window is chosen by the alignment end
        pos = get_end(pos, a[5])

    # floor division; identical to int(pos/win) for the integer inputs used here
    binindex = pos // win

    if reverse:
        pas[chrom][binindex] += 1
    else:
        ps[chrom][binindex] += 1


# Print three tab separated lines per occupied window: the count for ps ("s"),
# the count for pas ("as") and the strand bias ("bias"). Chromosomes are
# reported in the order given on the command line, windows in ascending order.
for chrom in chrlist:
    # union of the window indices seen in either counter; building it from
    # the dictionaries directly works on Python 2 and Python 3 alike
    keyset = set(ps[chrom]) | set(pas[chrom])
    for binindex in sorted(keyset):
        binstart = win * binindex
        binend = binstart + win - 1
        sense = ps[chrom][binindex]
        antisense = pas[chrom][binindex]
        if mirna is not None:
            # scale the raw counts by 1000000 / mirna
            sense = sense * 1000000 / float(mirna)
            antisense = antisense * 1000000.0 / float(mirna)

        # strand bias = smaller count / larger count, a value between 0 and 1;
        # "na" is reported if both counts are zero (avoids a division by zero)
        larger = max(sense, antisense)
        strandbias = "na"
        if larger > 0:
            strandbias = float(min(sense, antisense)) / float(larger)

        # print() with a single argument behaves the same on Python 2 and 3
        for label, value in (("s", sense), ("as", antisense), ("bias", strandbias)):
            print("{0}\t{1}\t{2}\t{3}\t{4}\t{5}".format(sid, chrom, binstart, binend, label, value))
          
    

