import argparse
import textwrap
import glob
import os
import sys
import pandas
import re

### Usage
# Command-line interface. The description is printed verbatim in --help
# (RawDescriptionHelpFormatter keeps its line breaks).
# NOTE(review): the description says lowercase gene names are converted to
# uppercase, but nothing visible in this script changes the case of the gene
# names it writes -- confirm which is intended and align help text and code.
parser = argparse.ArgumentParser(
    formatter_class=argparse.RawDescriptionHelpFormatter,
    description=textwrap.dedent('''\
		------------------------------------------------------------------------------
		This program extract TPM for each gene across multiple samples (in a ballgown-structured directory).
		"-A gene_abund.tab" needs to be turned on during the StringTie run.

		Note that genenames that are in lowercase will be converted to uppercase.
		
		Author: Dan Sun
		Created: 05/17/2018
		Last modified: 01/01/2019
		------------------------------------------------------------------------------
        '''))

# `choices` makes argparse reject unsupported values at parse time, before any
# input is read or the output file is touched.
parser.add_argument('--measure', default='TPM', choices=['TPM', 'FPKM'],
                    help='Two measures available: "FPKM" or "TPM"', required=False)
parser.add_argument('--useAcc', type=int, default=0, choices=[0, 1, 2],
                    help='0: genename as names; 1: accession as names; 2: output all info', required=False)
parser.add_argument('--abundName', default='gene_abund.tab', help='Gene abundance filename', required=False)
parser.add_argument('--out', help='output filename', required=True)
args = parser.parse_args()

### Natural sort
def natural_sort(l):
    """Return the strings in *l* as a new list sorted in natural order.

    Runs of ASCII digits compare by numeric value and all other text
    compares case-insensitively, so ``'gene2'`` sorts before ``'gene10'``.

    Args:
        l: iterable of strings to sort; it is not modified.

    Returns:
        A new list with the same items in natural order.
    """
    def alphanum_key(key):
        # re.split with a capturing group always yields text, digits, text,
        # ... so the odd positions are exactly the [0-9]+ runs. Deciding by
        # position (rather than str.isdigit) keeps non-ASCII digit characters
        # such as a superscript two as plain text: they can neither break
        # int() nor put an int where other keys hold a str.
        parts = re.split('([0-9]+)', key)
        return [int(part) if idx % 2 else part.lower()
                for idx, part in enumerate(parts)]

    return sorted(l, key=alphanum_key)

### Iterate through the immediate subdirectories of the current directory
### (one per sample) and store abundance data for each gene across samples

# Column index read from each data line for the requested measure.
_MEASURE_COLUMN = {'FPKM': 7, 'TPM': 8}

# Validate the options once, before any file is read or written, so that a bad
# value cannot leave a truncated output file behind.
if args.useAcc not in (0, 1, 2):
    sys.exit('--useAcc only takes in 0/1/2')
if args.measure not in _MEASURE_COLUMN:
    sys.exit('Abundance measure has to be either TPM or FPKM!')
abundCol = _MEASURE_COLUMN[args.measure]

abunds = {}  # gene -> {sample -> abundance value, kept as the original text}
info = {}    # gene -> tab-terminated annotation columns (only for --useAcc 2)
if args.useAcc == 2:
    header = ['GeneId', 'Gene', 'Scaffold', 'Strand', 'Start', 'End']
else:
    header = ['Gene']

# Only the first level of the walk is used: every directory directly under the
# current directory is treated as a sample.
samples = sorted(next(os.walk('.'), ('.', [], []))[1])

for sample in samples:
    path = os.path.join(sample, args.abundName)
    with open(path) as abundHandle:
        next(abundHandle, None)  # skip the column-title line
        for lineNum, line in enumerate(abundHandle, 2):
            line = line.strip("\n")
            if not line.strip():
                continue  # tolerate blank lines (e.g. at the end of the file)
            cols = line.split("\t")
            if len(cols) <= abundCol:
                sys.exit('{0}: line {1} has {2} column(s), expected at least {3}'.format(
                    path, lineNum, len(cols), abundCol + 1))
            # With --useAcc 0 the gene name is used unless it is a placeholder.
            # Be aware that some genes may be duplicated in the genome (same
            # gene name, different gene IDs); a later line then overwrites the
            # value stored for the same name and sample.
            if args.useAcc == 0 and cols[1] not in ('-', '.'):
                gene = cols[1]
            else:
                gene = cols[0]
            if args.useAcc == 2:
                info[gene] = "\t".join(cols[:6]) + "\t"
            abunds.setdefault(gene, {})[sample] = cols[abundCol]

### Output
# The output file is opened only after every input was read successfully, and
# the context manager guarantees it is flushed and closed.
genes = natural_sort(abunds.keys())
with open(args.out, "w") as outHandle:
    outHandle.write('\t'.join(header + samples) + '\n')
    for gene in genes:
        perSample = abunds[gene]
        # Genes that are missing from at least one sample are left out.
        if any(sample not in perSample for sample in samples):
            continue
        values = [perSample[sample] for sample in samples]
        outHandle.write(info.get(gene, '') + '\t'.join(values) + '\n')
