#!/usr/bin/env python

# Copyright 2009, 2010 Yann Surget-Groba
#
# multiple-K.py is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# multiple-K.py is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this file. If not, see <http://www.gnu.org/licenses/>.


import getopt
import os
import subprocess
import sys

import Bio
from Bio import SeqIO
from Bio.Blast import NCBIXML

import seqUtils

def usage():
    """Print the command-line synopsis and option summary to standard output."""
    # A single parenthesised string prints identically under Python 2 (where
    # print is a statement) and Python 3 (where it is a function).
    print("""Usage: multiple-K.py -o outfile [-d directory] [-v]
    -o outfile: will be overwritten if exists
    -d directory: directory containing the fasta files to merge (default=current directory)
    -v: optional flag to print run informations""")

###Get command-line arguments
#defaults used when the corresponding option is not given
verbose = False
outfile = False
directory = './'
cd_hitPath = 'cd-hit-est' #change to the full path if cd-hit is not in your $PATH

#an unknown option, or -d/-o without a value, shows the help and aborts
try:
    opts, args = getopt.getopt(sys.argv[1:], "d:o:v")
except getopt.GetoptError:
    usage()
    sys.exit(1)

#if an option is repeated, its last occurrence wins
for flag, value in opts:
    if flag == '-d':
        directory = value
    elif flag == '-o':
        outfile = value
    elif flag == '-v':
        verbose = True

#the output file is the only mandatory argument (an empty name is rejected too)
if not outfile:
    usage()
    sys.exit(1)


###Merge contigs from different k-mers:
#Reads every fasta file found in `directory`, tags each record id with the
#name of the file it came from, and writes all records to '.allSeq.tmp' in
#the current working directory.
if verbose:
    print("1) Merge contigs obtained with different k-mers")
allSeq = []
#When the output file is named *.fa, the files left by a previous run (the
#output itself and cd-hit's cluster reports) also match the '.fa' test below;
#they are results, not input, so they are never read back in.
outBase = os.path.abspath(outfile)
previousOutput = set([outBase, outBase + '.clstr', outBase + '.bak.clstr'])
#browse selected directory for fasta files to merge (name must contain .fa)
for f in os.listdir(directory):
    if '.fa' not in f:
        continue
    fastaPath = os.path.join(directory, f)
    #skip sub-directories (open() would fail on them) and previous results
    if not os.path.isfile(fastaPath):
        continue
    if os.path.abspath(fastaPath) in previousOutput:
        continue
    if verbose:
        print("\tProcessing file %s" % f)
    #the with-block closes the file even if parsing raises
    with open(fastaPath) as handle:
        for s in SeqIO.parse(handle, 'fasta'):
            #add filename to sequence name
            s.id += "|%s" % f
            s.description = ''
            allSeq.append(s)

#write contigs from all k-mers to temporary file
with open('.allSeq.tmp', 'w') as tmpHandle:
    SeqIO.write(allSeq, tmpHandle, 'fasta')

###Run CD-HIT to remove duplicates
#Runs cd-hit-est on '.allSeq.tmp', writing the result to `outfile`, then
#deletes the temporary input and cd-hit's '<outfile>.bak.clstr' file.
#Exits with status 1 if cd-hit cannot be started or reports a failure.
if verbose:
    print("2) Removing duplicates...")
#The command is passed as an argument list (no shell involved), so an output
#name containing spaces or shell metacharacters reaches cd-hit unchanged.
cdHitCmd = [cd_hitPath, '-i', '.allSeq.tmp', '-o', outfile,
            '-c', '1', '-n', '10', '-l', '20', '-r', '1', '-d', '0']
try:
    if verbose:
        status = subprocess.call(cdHitCmd)
    else:
        #hide cd-hit's progress report (stdout only) unless -v was given
        with open(os.devnull, 'w') as devnull:
            status = subprocess.call(cdHitCmd, stdout=devnull)
except OSError as err:
    #raised when the executable is missing or cannot be executed
    sys.stderr.write("Error: unable to run '%s': %s\n" % (cd_hitPath, err))
    sys.exit(1)
finally:
    #the temporary input is removed whether or not cd-hit succeeded
    os.remove('.allSeq.tmp')

if status != 0:
    sys.stderr.write("Error: '%s' exited with status %d\n" % (cd_hitPath, status))
    sys.exit(1)

#only delete the backup cluster file if this cd-hit version produced one
bakClstr = '%s.bak.clstr' % outfile
if os.path.exists(bakClstr):
    os.remove(bakClstr)

