#!/usr/bin/python3


#
# Version: 20180210
#


# Exclusion switch handed to xrunAnalysis() as 'flag_exclude'.
#   o  true value:  xanalyzeGene() pushes the tissues listed in the
#      exclusion table for the top tissue to the bottom of the ranking
#      before it picks the runner-up tissue
#   o  false value: all tissues are tested without exclusion
gflg_exclude = 1


#
# Run DESeq2 result filter (with some analysis)
#

#
# Output is written to stdout so redirect it to a file
# in order to save it.
#


import sys
import os
import re
import math
import gzip
import textwrap
import pickle
import argparse
import statistics


#
# The following data files are the data sources for the
# gene information. This program is run initially with the
# '-m' command line parameter in order to read these data
# files and make a pickle file.
#
# TPM table read by xreadAdam():
#   o  the header row starts with the token 'gene'; the remaining
#      tokens are the column names (split later on '_' into a
#      tissue name and a time point)
#   o  every other row is a gene name followed by one float per column
#
gfn_gadam  = 'GeneTPM.180123.edit1.tissue_mean.long_gene_names.txt'


#
# Path to directory that has DESeq2 pairwise comparison results.
# Only files whose names end in '.results.txt' are read.
# Notes:
#   o  typical file name 'run.tbx37_T0~tbx37_T3.results.txt'
#   o  typical file contents
#        baseMean log2FoldChange lfcSE stat pvalue padj
#        WBGene00017027_D1044.1_D1044.1_protein-coding 133.206641833608 -4.27576228148998 0.347196075728112 -9.43490583705564 3.9134559404371e-21 5.7919147918469e-17
#        WBGene00001385_F02A9.2_far-1_protein-coding 4009.66229809446 7.04992568860088 0.667463336269483 9.06405694493199 1.25686753867866e-19 9.30081978622211e-16
#
gfn_deseq2 = 'run.all.180125'

#
# Default pickle filename: written in '-m' mode and read in '-a' mode.
# It can be overridden with the '-p' command line parameter.
#
gfn_pickle = 'gene_diffexpress.pickle'


def main():
  """Parse the command line and run the selected mode.

  Modes:
    -m / --make-pickle-file   read the TPM table and the DESeq2 result
                              directory, then write them to a pickle file
    -a / --run-analysis       read the pickle file and print the analysis
                              to stdout
  When both are given, '-m' takes precedence and the analysis is not run.
  The pickle filename is taken from '-p' (default: gfn_pickle).

  Returns:
    0 when a mode was run, 1 when no mode was selected.
  """
  argparser = argparse.ArgumentParser()
  argparser.add_argument( '-a', '--run-analysis',       help='run analysis', action='store_true' )
  argparser.add_argument( '-m', '--make-pickle-file',   help='read data files and make a pickle file', action='store_true' )
  argparser.add_argument( '-p', '--pickle-filename',    help='input pickle filename (default: %s)' % gfn_pickle, default=gfn_pickle )

  args = argparser.parse_args()

  # An explicitly empty name (-p '') falls back to the built-in default.
  fn_pickle = args.pickle_filename or gfn_pickle

  if args.make_pickle_file:
    dadam  = {}
    lhadam = []
    print( 'read Adam\'s expression data next...', file=sys.stderr )
    xreadAdam( gfn_gadam, lhadam, dadam )
    ddeseq2 = {}
    print( 'read DESeq2 data next...', file=sys.stderr )
    xreadDeseq2( gfn_deseq2, ddeseq2 )
    # Record the data sources so the analysis output can report them.
    dfiles = { 'tpm': gfn_gadam, 'deseq2': gfn_deseq2 }
    ddata = { 'lhadam': lhadam, 'dadam': dadam, 'ddeseq2': ddeseq2, 'dfiles': dfiles }
    print( 'write pickle file next...', file=sys.stderr )
    xwritePickle( ddata, fn_pickle )
    return 0

  if args.run_analysis:
    print( 'run analysis', file=sys.stderr )
    xrunAnalysis( fn_pickle, gflg_exclude )
    return 0

  # Neither mode was requested: say so instead of exiting silently.
  argparser.print_usage( file=sys.stderr )
  print( 'nothing to do: specify -m/--make-pickle-file or -a/--run-analysis', file=sys.stderr )
  return 1


def xreadAdam( fn, lhadam, dadam ):
  """Read the whitespace-delimited TPM table 'fn'.

  Parameters:
    fn      path of the TPM table
    lhadam  list that is extended in place with the column names taken
            from the header row (the row whose first token is 'gene')
    dadam   dict that is filled in place: gene name -> list of floats,
            one per column. The gene name is the part of the first token
            before the first '_'. If a gene name occurs more than once,
            the first row wins.

  Blank lines are skipped.

  Returns:
    0

  Raises:
    OSError     if the file cannot be opened
    ValueError  if a value column is not a valid float
  """
  # 'with' guarantees the file is closed even when a row fails to parse.
  with open( fn, 'r' ) as fp:
    for inline in fp:
      toks = inline.split()
      if not toks:
        # blank line: nothing to parse
        continue
      if toks[0] == 'gene':
        lhadam.extend( toks[1:] )
        continue
      gnam = toks[0].split( '_' )[0]
      # setdefault keeps the first row seen for a duplicated gene name
      dadam.setdefault( gnam, [ float( s ) for s in toks[1:] ] )
  return 0


#
#   o  typical file name 'run.tbx37_T0~tbx37_T3.results.txt'
#        baseMean log2FoldChange lfcSE stat pvalue padj
#        WBGene00017027_D1044.1_D1044.1_protein-coding 133.206641833608 -4.27576228148998 0.347196075728112 -9.43490583705564 3.9134559404371e-21 5.7919147918469e-17
#        WBGene00001385_F02A9.2_far-1_protein-coding 4009.66229809446 7.04992568860088 0.667463336269483 9.06405694493199 1.25686753867866e-19 9.30081978622211e-16
#
def xreadDeseq2( fn_deseq2, ddeseq2 ):
  """Read every '*.results.txt' file in directory 'fn_deseq2'.

  Parameters:
    fn_deseq2  directory that holds the DESeq2 pairwise result files
    ddeseq2    dict that is filled in place:
                 gene name -> comparison name -> record
               where the record is a dict with the keys 'baseMean',
               'log2FoldChange', 'padjust' (floats; 'NA' in the file
               becomes nan) and 'fullname' (the unmodified first column).
               The gene name is the part of the first column before the
               first '_'; the comparison name is the part of the file
               name between 'run.' and '.results.txt'.

  The header row (first token 'baseMean') and blank lines are skipped.
  An existing gene/comparison record is never overwritten.

  Returns:
    0

  Exits with status -1 if a '*.results.txt' file name does not match the
  expected 'run.<comparison>.results.txt' form.
  """
  def xfloat( tok ):
    # DESeq2 writes 'NA' where it has no value
    return float( 'nan' ) if tok == 'NA' else float( tok )

  lfres = [ fil for fil in os.listdir( fn_deseq2 ) if re.search( r'\.results\.txt$', fil ) ]
  for fil in lfres:
    # NOTE(review): the dots in this pattern are unescaped, so they match
    # any character; left as is so that the set of accepted file names
    # does not change.
    mobj = re.match( r'^run.(.+).results.txt$', fil )
    if mobj is None:
      print( 'Error: unexpected condition', file=sys.stderr )
      sys.exit( -1 )
    nmcmp = mobj.group( 1 )
    # 'with' guarantees the file is closed even when a row fails to parse.
    with open( os.path.join( fn_deseq2, fil ), 'r' ) as fp:
      for inline in fp:
        stok = inline.split()
        if not stok or stok[0] == 'baseMean':
          # blank line or header row
          continue
        # columns: 0 name, 1 baseMean, 2 log2FoldChange, 6 padj
        drec = { 'baseMean': xfloat( stok[1] ),
                 'log2FoldChange': xfloat( stok[2] ),
                 'padjust': xfloat( stok[6] ),
                 'fullname': stok[0] }
        nmgene = stok[0].split( '_' )[0]
        ddeseq2.setdefault( nmgene, {} ).setdefault( nmcmp, drec )

  return 0


def xwritePickle( object, fn ):
  """Serialize 'object' with pickle into the file 'fn'.

  Parameters:
    object  the value to pickle (the parameter name shadows the builtin
            but is kept so that keyword callers keep working)
    fn      output path; an existing file is overwritten

  Returns:
    0
  """
  # 'with' guarantees the file is closed even when pickle.dump raises.
  with open( fn, 'wb' ) as fp:
    pickle.dump( object, fp )
  return 0


def xreadPickle( fn ):
  """Load and return the object pickled in the file 'fn'.

  NOTE: unpickling can execute arbitrary code, so only load pickle
  files that this program (or another trusted source) has written.

  Parameters:
    fn  path of the pickle file

  Returns:
    the unpickled object
  """
  # 'with' guarantees the file is closed even when pickle.load raises.
  with open( fn, 'rb' ) as fp:
    return pickle.load( fp )


def selsort( a ):
  """Sort key: return the element at index 1 of 'a'."""
  second = a[1]
  return second


def xgetPadjust( nmgene, tissue_1, tissue_2, ddeseq2, dpadjust ):
  """Look up the DESeq2 record for 'nmgene' comparing two tissues.

  The comparison name is built as '<a>~<b>' with the two tissue names in
  ascending string order. When tissue_1 sorts before tissue_2 the stored
  log2FoldChange is negated, otherwise it is used as stored.
  NOTE(review): presumably this expresses the fold change as tissue_1
  relative to tissue_2 -- confirm against the DESeq2 contrast direction.

  Parameters:
    nmgene    gene name (key of ddeseq2)
    tissue_1  first tissue column name
    tissue_2  second tissue column name
    ddeseq2   gene name -> comparison name -> record, as built by
              xreadDeseq2()
    dpadjust  dict that receives the keys 'nmcmp', 'baseMean',
              'log2FoldChange', 'padjust' and 'fullname'; keys that are
              already present are left untouched

  Returns:
    0

  Exits with status -1 (after a message on stderr) when the gene or the
  comparison is missing from ddeseq2.
  """
  if tissue_1 < tissue_2:
    nmcmp = '%s~%s' % ( tissue_1, tissue_2 )
    msign = -1.0
  else:
    nmcmp = '%s~%s' % ( tissue_2, tissue_1 )
    msign = 1.0

  dgene = ddeseq2.get( nmgene )
  if not dgene:
    print( 'missing DESeq2 information for gene \'%s\'' % ( nmgene ), file=sys.stderr )
    sys.exit( -1 )

  drec = dgene.get( nmcmp )
  if not drec:
    print( 'missing DESeq2 information for comparison \'%s\' for gene \'%s\'' % ( nmcmp, nmgene ), file=sys.stderr )
    # Stop here, as for a missing gene, instead of falling through to a
    # KeyError on the lookups below.
    sys.exit( -1 )

  if math.isnan( drec['log2FoldChange'] ):
    log2FoldChange = float( 'nan' )
  else:
    log2FoldChange = msign * drec['log2FoldChange']

  dpadjust.setdefault( 'nmcmp', nmcmp )
  dpadjust.setdefault( 'baseMean', drec['baseMean'] )
  dpadjust.setdefault( 'log2FoldChange', log2FoldChange )
  dpadjust.setdefault( 'padjust', drec['padjust'] )
  dpadjust.setdefault( 'fullname', drec['fullname'] )
  return 0


#
# For each time point, rank the tissues by TPM and compare the
# tissue with the largest value against the runner-up.
#
def xanalyzeGene( nmgene, lhadam, dadam, ddeseq2, flag_exclude, dexclude ):
  """Print one result line per time point for gene 'nmgene'.

  The TPM values of the gene are grouped by time point (the second
  '_'-separated field of each column name) and ranked in descending
  order. The top tissue is compared with the runner-up: the TPM ratio is
  computed and the DESeq2 record for that pair is looked up with
  xgetPadjust().

  Output line (space separated, written to stdout):
    gene  timepoint  TPM-ratio  padjust  top-column  runner-up-column
    top-TPM  runner-up-TPM  baseMean  2**log2FoldChange  fullname
    [' exc:' followed by the excluded column names]

  Parameters:
    nmgene        gene name (key of dadam and ddeseq2)
    lhadam        column names, '<tissue>_<timepoint>'
    dadam         gene name -> list of TPM values, parallel to lhadam
    ddeseq2       DESeq2 records as built by xreadDeseq2()
    flag_exclude  when true, tissues listed in dexclude for the top
                  tissue are given a TPM of -1.0 so that they drop to the
                  bottom of the ranking before the runner-up is chosen
    dexclude      top tissue name -> { tissue name to exclude: true value }

  Returns:
    0
  """
  # group [ column name, TPM ] pairs by time point
  dtpm = {}
  for i, tpm in enumerate( dadam[nmgene] ):
    tmpnt = lhadam[i].split( '_' )[1]
    dtpm.setdefault( tmpnt, [] ).append( [ lhadam[i], tpm ] )

  #  ceh-32  202  tbx-37, cnd-1
  #  cnd-1   340  tbx-37, ceh-32
  #  end-1   801  none
  #  hlh-1   377  none
  #  nhr-25   37  none  + tbx-37
  #  pha-4   178  tbx-37
  #  tbx-37  175  cnd-1, ceh-32, pha-4

  for tmpnt, lrank in dtpm.items():
    lrank.sort( key=selsort, reverse=True )
    texclude = ''
    if flag_exclude:
      nmmax = lrank[0][0].split( '_' )[0]
      dskip = dexclude.get( nmmax ) or {}
      for entry in lrank[1:]:
        if dskip.get( entry[0].split( '_' )[0] ):
          # -1.0 is below any real TPM, so the re-sort moves it last
          entry[1] = -1.0
          texclude += ' ' + entry[0]
      lrank.sort( key=selsort, reverse=True )

    ( nmtop, tpmtop ), ( nmnext, tpmnext ) = lrank[0], lrank[1]
    dpadjust = {}
    xgetPadjust( nmgene, nmtop, nmnext, ddeseq2, dpadjust )
    line = '%s %s %.4f %.4f %s %s %.4f %.4f %.4f %.4f %s' % ( nmgene, tmpnt, _xtpmRatio( tpmtop, tpmnext ), dpadjust['padjust'], nmtop, nmnext, tpmtop, tpmnext, dpadjust['baseMean'], 2**dpadjust['log2FoldChange'], dpadjust['fullname'] )
    if texclude:
      line += ' exc:%s' % ( texclude )
    print( line )

  return 0


def _xtpmRatio( tpm_hi, tpm_lo ):
  """Return tpm_hi / tpm_lo; a zero denominator gives inf (or nan for 0/0)
  instead of raising ZeroDivisionError."""
  if tpm_lo == 0.0:
    return float( 'inf' ) if tpm_hi > 0.0 else float( 'nan' )
  return tpm_hi / tpm_lo


def xrunAnalysis( fn_pickle, flag_exclude ):
  """Load the pickle file and print the analysis of every gene to stdout.

  Three '#' header lines (TPM file, DESeq2 directory, exclude flag) are
  written first, followed by the lines printed by xanalyzeGene() for
  each gene in the TPM table.

  Parameters:
    fn_pickle     path of the pickle file made in '-m' mode
    flag_exclude  passed through to xanalyzeGene()

  Returns:
    0
  """
  ddata = xreadPickle( fn_pickle )
  lhadam, dadam, ddeseq2, dfiles = ( ddata[key] for key in ( 'lhadam', 'dadam', 'ddeseq2', 'dfiles' ) )

  # top tissue -> tissues that are excluded as its runner-up
  dexclude = {
    'ceh32': { 'cnd1': 1, 'tbx37': 1 },
    'cnd1':  { 'ceh32': 1, 'tbx37': 1 },
    'nhr25': { 'tbx37': 1 },
    'pha4':  { 'tbx37': 1 },
    'tbx37': { 'ceh32': 1, 'cnd1': 1, 'pha4': 1 },
  }

  lheader = (
    ( '# File TPM data:    %s', dfiles['tpm'] ),
    ( '# Directory DESeq2: %s', dfiles['deseq2'] ),
    ( '# Exclude flag:     %s', flag_exclude ),
  )
  for fmt, value in lheader:
    print( fmt % ( value, ) )

  for nmgene in dadam:
    xanalyzeGene( nmgene, lhadam, dadam, ddeseq2, flag_exclude, dexclude )
  return 0


if __name__ == '__main__':
  # Hand main()'s return value to the shell as the process exit status
  # instead of discarding it.
  sys.exit( main() )

