#!/usr/bin/env python
from ntpath import realpath
import os
import sys
import inspect
import re
import argparse
import random
import math
import collections
import fileinput
prog = re.compile(r"(\d+)([MISDHN])")


# Complement table for nucleotide symbols. Besides A/C/G/T/N it covers the
# IUPAC ambiguity codes, so that a reverse-strand read containing one of them
# does not abort the whole run with a KeyError.
_COMPLEMENT = {
    'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A', 'N': 'N',
    'R': 'Y', 'Y': 'R', 'K': 'M', 'M': 'K', 'S': 'S', 'W': 'W',
    'B': 'V', 'V': 'B', 'D': 'H', 'H': 'D',
}
# Lower-case symbols are complemented case-preservingly.
_COMPLEMENT.update({k.lower(): v.lower() for k, v in list(_COMPLEMENT.items())})


def rev_compl(st):
    """Return the reverse complement of the nucleotide string *st*.

    The input is read back to front and every symbol is replaced by its
    complement. Symbols without a defined complement (for example '=' or
    '.') are copied through unchanged instead of raising ``KeyError``.

    Args:
        st: sequence string; may be empty.

    Returns:
        A new string of the same length as *st*.
    """
    return "".join(_COMPLEMENT.get(n, n) for n in reversed(st))

# Command-line interface.
# RawDescriptionHelpFormatter keeps the line breaks of the description and
# epilog exactly as written when --help is printed.
parser = argparse.ArgumentParser(
    description="""           
Description
-----------
Summary statistics
""",
    formatter_class=argparse.RawDescriptionHelpFormatter,
    epilog="""
Length distribution of piRNAs complementary to the P-element normalized to a million miRNAs.
miRNA: 21-23nt
Sense are written 5' to 3' therefore the 5' end ist first: 
https://de.wikipedia.org/wiki/Nukleins%C3%A4ure-Nomenklatur
https://en.wikipedia.org/wiki/Coding_strand#/media/File:Process_of_DNA_transcription.png

https://omarwagih.github.io/ggseqlogo/

Authors
-------
    Robert Kofler
""",
)
# Required options carry no default: argparse rejects the command line when
# they are missing, so a default value could never be used.
parser.add_argument("--sam", type=argparse.FileType("r"), dest="sam", required=True, help="A sam file")
parser.add_argument("--min-mq", type=int, dest="minmq", default=0, help="min mapping quality")
parser.add_argument("--max-mm", type=int, dest="maxmm", default=10, help="max mismatches")
parser.add_argument("--sample-id", type=str, dest="sid", required=True, help="the sample id")
args = parser.parse_args()

# Module-level shortcuts for the two filter thresholds used while reading the SAM file.
minmq = args.minmq
maxmm = args.maxmm

# things to change
# 1 siRNA and piRNA
# 2 reverse the antisense otherwise it will 

def _zeroed_base_counts():
    """Return a fresh counter with one zeroed slot per counted base."""
    return {'A': 0, 'T': 0, 'C': 0, 'G': 0}


# Per-position base tallies: key = 0-based position in the read, value = the
# counter dict created on first access by the factory above.
# 23-29 nt reads ("piRNA" rows): PPI251 reference / any other *_te reference.
pi_p = collections.defaultdict(_zeroed_base_counts)
pi_te = collections.defaultdict(_zeroed_base_counts)

# 20-22 nt reads ("siRNA" rows): PPI251 reference / any other *_te reference.
si_p = collections.defaultdict(_zeroed_base_counts)
si_te = collections.defaultdict(_zeroed_base_counts)



def print_matrix(col,sid,smtype,te):
    """Print one tab-separated summary row for a per-position base tally.

    The row has six fields: sample id, small-RNA type, target label,
    coverage at position 0 (sum of all base counts there), the base order
    "A,C,T,G", and the flattened count matrix (four counts per position, in
    that base order, positions ascending, comma-separated).

    Args:
        col: mapping position -> {'A': n, 'T': n, 'C': n, 'G': n}. Position 0
            is looked up by indexing, so a defaultdict that has seen no reads
            gains a zeroed entry for position 0 and is reported with
            coverage 0; a plain dict without key 0 raises KeyError.
        sid: sample id (string).
        smtype: small-RNA type label (string).
        te: target label (string).
    """
    cov = str(sum(col[0].values()))  # coverage in first position
    order = "A,C,T,G"
    bases = order.split(",")  # data columns follow the announced order
    cells = []
    for pos in sorted(col.keys()):
        counts = col[pos]
        cells.extend(str(counts[base]) for base in bases)
    print("\t".join([sid, smtype, te, cov, order, ",".join(cells)]))


# NOTE: the triple-quoted string below is a bare expression statement, i.e. it
# has no runtime effect. It holds a disabled reporting routine that printed,
# for each position, one row with the total count and the A:T:C:G frequencies
# (two decimals). Nothing in the visible file calls print_collection.
"""
def print_collection(col,sid,smtype,seas,te):
     for i in sorted(col.keys()):
          basecol=col[i]
          sumi=float(sum(basecol.values()))
          ac,tc,cc,gc=basecol['A'],basecol['T'],basecol['C'],basecol['G']
          a=float(ac)/float(sumi)
          t=float(tc)/float(sumi)
          c=float(cc)/float(sumi)
          g=float(gc)/float(sumi)
          # print("{:10.4f}".format(x)) 
          atcg=[sid,smtype,seas,te,str(i),str(sumi)]
          countstr=[str(i) for i in [ac,tc,cc,gc]]
          freqstr=["{0:.2f}".format(i) for i in [a,t,c,g]]
          #atcg.append(":".join(countstr)) 
          atcg.append(":".join(freqstr))
          print("\t".join(atcg))
"""

def add_read_to_collection(col,read):
    """Tally every base of *read* into the per-position counters of *col*.

    Args:
        col: mapping position -> base counter; each position of the read is
            looked up by indexing, so a defaultdict creates the counter for
            a position on first sight.
        read: sequence string; position 0 is its first character.

    Characters that are not keys of the position's counter (e.g. 'N') are
    not counted; the position's counter is still looked up.
    """
    for position, symbol in enumerate(read):
        tally = col[position]
        if symbol in tally:
            tally[symbol] += 1


# Matches the optional SAM tag NM:i:<n>; the captured number is compared
# against --max-mm below.
reo=re.compile(r"NM:i:(\d+)")

# Reference-name suffixes that are recognised but contribute nothing to the
# tallies. Any other suffix (apart from "_te") is treated as an error.
_UNTALLIED_REF_SUFFIXES = (
    "_miRNA",
    "_rRNA", "_rRNA;",
    "_tRNA",
    "_snoRNA;", "_snoRNA", "_snRNA;", "_snRNA", "_mRNA",
)

for line in args.sam:
    # SAM header lines begin with '@' and blank lines have no columns at
    # all; neither is an alignment record, so skip them instead of failing
    # on the integer conversion of the FLAG column.
    if line.startswith("@") or not line.strip():
        continue

    a = line.rstrip("\n").split("\t")

    # discard unmapped
    flag = int(a[1])
    if flag & 0x004:
        continue

    # discard low mapping quality
    mq = int(a[4])
    if mq < minmq:
        continue

    # discard reads without an NM tag or with too many mismatches
    mo = reo.search(line)
    if mo is None:
        continue
    mm = int(mo.group(1))
    if mm > maxmm:
        continue

    ref = a[2]
    read = a[9]
    readlen = len(read)

    if ref.endswith("_te"):
        is_p_element = ref[:-3] == "PPI251"
        # Pick the tally by read length: 23-29 nt -> piRNA, 20-22 nt -> siRNA;
        # reads of any other length are not counted.
        if 23 <= readlen <= 29:
            target = pi_p if is_p_element else pi_te
        elif 20 <= readlen <= 22:
            target = si_p if is_p_element else si_te
        else:
            target = None
        if target is not None:
            # reverse complement if flag 0x10 is set
            if flag & 0x10:
                read = rev_compl(read)
            add_read_to_collection(target, read)
    elif not ref.endswith(_UNTALLIED_REF_SUFFIXES):
        raise Exception("Unknown sequence end "+ ref)

# One output row per (small-RNA type, target) combination.
print_matrix(pi_p,args.sid,"piRNA","P")
print_matrix(pi_te,args.sid,"piRNA","TE")

print_matrix(si_p,args.sid,"siRNA","P")
print_matrix(si_te,args.sid,"siRNA","TE")









