#! /usr/bin/env python3.5

# NOTE: this script requires MAFFT to be installed

import os
import subprocess
import tempfile

def readFasta(fname) :
  """Read a FASTA file into a dict mapping record ID to sequence.

  The ID of a record is its header line without the leading '>' and
  without surrounding whitespace. All sequence lines of a record are
  stripped, upper-cased and concatenated. Blank lines are ignored.
  Records without any sequence are omitted, and if an ID occurs more
  than once the last record wins. Sequence lines that appear before the
  first header are stored under the empty ID ''.

  fname -- path of the FASTA file to read.

  Returns a dict {ID: sequence}.
  """
  seqs = {}
  ID = ''
  chunks = []
  # 'with' closes the handle even if reading fails part-way through.
  with open(fname, 'rt') as fh :
    for line in fh :
      line = line.strip()
      if not line :
        # Blank lines (e.g. a trailing newline) carry no data; indexing
        # line[0] on them would raise IndexError.
        continue
      if line.startswith('>') :
        # A new header terminates the previous record.
        if chunks :
          seqs[ID] = ''.join(chunks)
        ID = line[1:].strip()
        chunks = []
      else :
        chunks.append(line.upper())
  # Flush the final record, which has no following header to trigger it.
  if chunks :
    seqs[ID] = ''.join(chunks)
  return seqs

# readFastk assumes four lines per record (header, sequence, '+', one ignored
# line) and that each header has the format "> phenotype other_stuff_here".

def readFastk(filename) :
  """Read a four-lines-per-record "fastk" file.

  Each record consists of, in order:
    1. a header line: its first character is dropped and the rest is
       stripped to form the record ID; the first whitespace-separated
       token of the ID is the phenotype,
    2. a sequence line (stripped and upper-cased),
    3. a separator line that must be exactly '+',
    4. one further line that is ignored.
  Blank lines at the very end of the file are ignored. If an ID occurs
  more than once the last sequence wins.

  filename -- path of the file to read.

  Returns a tuple (ids, seqs, phenotypes): the sorted list of record IDs,
  a dict {ID: sequence}, and the set of phenotype names.

  Raises ValueError if a header is empty or a separator line is not '+'.
  """
  # 'with' closes the handle even when parsing below raises.
  with open(filename, 'rt') as ifh :
    lines = ifh.readlines()
  # A trailing blank line would otherwise be parsed as an (empty) header.
  while lines and not lines[-1].strip() :
    lines.pop()

  phenotypes = set()
  seqs = {}
  header = ''
  for lineno, line in enumerate(lines) :
    field = lineno % 4
    if field == 0 :
      header = line[1:].strip()
      tokens = header.split()
      if not tokens :
        raise ValueError('%s, line %d: empty record header'
                         % (filename, lineno + 1))
      phenotypes.add(tokens[0])
    elif field == 1 :
      seqs[header] = line.strip().upper()
    elif field == 2 :
      # Explicit raise instead of assert: asserts vanish under "python -O".
      if line.strip() != '+' :
        raise ValueError("%s, line %d: expected '+' separator, found %r"
                         % (filename, lineno + 1, line.strip()))
    # field == 3: the fourth line of each record is not used.
  ids = sorted(seqs)
  return ids, seqs, phenotypes

def doAlignment(lines, ofname) :
    """Align sequences with MAFFT and write the alignment to ofname.

    lines  -- strings forming the FASTA input; they are written verbatim
              with writelines(), so each must carry its own newline.
    ofname -- path of the output file. It is created or truncated and
              receives MAFFT's standard output.

    MAFFT's standard error is discarded and its exit status is not
    checked. If the 'mafft' executable cannot be started, the exception
    from subprocess.run propagates to the caller.
    """
    # Use a uniquely named scratch file rather than a fixed 'tmp.fa' in the
    # current directory, which would overwrite and then delete a user's file
    # of that name and collide between concurrent runs.
    # delete=False because the file must be closed before MAFFT reads it.
    tmp = tempfile.NamedTemporaryFile(mode='wt', suffix='.fa', delete=False)
    try :
        with tmp :
            tmp.writelines(lines)
        with open(ofname, 'wb') as aln :
            subprocess.run(['mafft', tmp.name],
                           stderr=subprocess.DEVNULL, stdout=aln)
    finally :
        # Always clean up the scratch file, even if MAFFT could not be run.
        os.remove(tmp.name)

def main(fastk, consensus) :
  """Write one MAFFT alignment per phenotype found in the fastk file.

  For every phenotype returned by readFastk, the consensus sequence named
  '<phenotype>.case.consensus' is taken from the consensus FASTA file and
  aligned (via doAlignment) together with every contig whose header
  starts with that phenotype. Results are written to
  'contigsWithConsensus/<name>.<phenotype>.case.aln.fa', where <name> is
  the fastk file name without directory and extension.

  fastk     -- path of the fastk file with the contigs.
  consensus -- path of the FASTA file with the consensus sequences.

  Raises KeyError if the consensus file has no
  '<phenotype>.case.consensus' record for some phenotype.
  """
  consensuses = readFasta(consensus)
  ids, contigs, phenotypes = readFastk(fastk)

  dirname = 'contigsWithConsensus'
  # exist_ok avoids the check-then-create race of exists() + mkdir().
  os.makedirs(dirname, exist_ok=True)

  # Use only the file name: a directory component in the input path would
  # otherwise redirect the output outside dirname (or into a missing
  # sub-directory), and slicing off six characters is only right for a
  # '.fastk' suffix.
  prefix = os.path.splitext(os.path.basename(fastk))[0]

  # Sorted so that the alignments are produced in a reproducible order.
  for phen in sorted(phenotypes) :
    ofname = os.path.join(dirname, '%s.%s.case.aln.fa' % (prefix, phen))
    consenID = '%s.case.consensus' % phen
    lines = [ '>%s\n' % consenID, '%s\n' % consensuses[consenID] ]
    for header in ids :
      # Compare the first header token, which is how readFastk derives the
      # phenotype; a substring test would also pull 'case2' contigs into
      # the 'case' alignment.
      if header.split()[:1] == [ phen ] :
        lines.append('>%s\n' % header)
        lines.append('%s\n' % contigs[header])
    doAlignment(lines, ofname)

if __name__ == '__main__' :
  # Command-line entry point: two positional arguments, the fastk file and
  # the consensus FASTA file, are handed straight to main().
  from argparse import ArgumentParser

  positionals = (
    ('fastk', 'Expects this format: important25.fastk where 25 is the k-mer number (k = 4, k = 5, etc. You understand.)'),
    ('consensus', 'Fasta file with consensus sequences for phenotype case and control groups.'),
  )
  cli = ArgumentParser()
  for argName, argHelp in positionals :
    cli.add_argument(argName, help=argHelp)
  opts = cli.parse_args()
  main(opts.fastk, opts.consensus)

