#!/usr/bin/env python3.5

#Edited by Alan Du 06/17/2019 to differentiate case enriched vs. case depleted alleles in the summary

import glob

def readConsensusFromFasta(fname) :
	"""Return the upper-cased sequence of the FASTA record named 'consensus'.

	fname -- path of the FASTA file to read.

	A record is selected when the text after '>' on its header line, with
	surrounding whitespace removed, is exactly 'consensus'. Every sequence
	line belonging to such a record is upper-cased and concatenated. If the
	file holds several records with that name, their sequences are all
	concatenated in file order. Returns '' when no such record exists.
	"""
	header = ''
	chunks = []
	# 'with' guarantees the handle is closed even if reading raises.
	with open(fname, 'rt') as fh :
		for line in fh :
			line = line.strip()
			if not line :
				# Blank lines carry no data; indexing line[0] on one would raise IndexError.
				continue
			if line[0] == '>' :
				header = line[1:].strip()
			elif header == 'consensus' :
				chunks.append(line.upper())
	return ''.join(chunks)

# Uses nucleotide codes from https://www.genome.jp/kegg/catalog/codes1.html

# Every nucleotide code this script can emit: the four bases plus the ambiguity codes.
_possibles = set('AGCTRYNWSMKBHDV')
# For each base X, noX holds the codes that exclude X and hasX holds the remaining codes.
noA = set('GCTYSKB')
hasA = _possibles - noA
noT = set('AGCRSMV')
hasT = _possibles - noT
noC = set('AGTRWKD')
hasC = _possibles - noC
noG = set('ACTYWMH')
hasG = _possibles - noG

def nucsToCode(nucs) :
	"""Collapse a collection of bases into a single nucleotide code.

	nucs -- collection of allele characters; only membership of 'A', 'T',
	        'C' and 'G' is inspected.

	Returns '-' when nucs is empty. Otherwise the candidate codes are
	narrowed, base by base, to those consistent with each base being present
	or absent, and the one remaining code is returned. If anything other than
	exactly one candidate remains, the leftovers are printed and an
	AssertionError is raised.
	"""
	if len(nucs) == 0 :
		return '-'
	candidates = set(_possibles)
	# (base, codes to discard when the base is present, codes to discard when it is absent)
	filters = (
		('A', noA, hasA),
		('T', noT, hasT),
		('C', noC, hasC),
		('G', noG, hasG),
	)
	for base, excluding, including in filters :
		candidates = candidates - (excluding if base in nucs else including)
	remaining = len(candidates)
	if remaining != 1 :
		print(remaining, nucs, candidates)
	assert remaining == 1
	return candidates.pop()

def getSignificantAllele_caseEnrich(ps, pSig) :
	"""Return the nucleotide code for the significant, case-enriched alleles.

	ps   -- mapping of allele -> sequence whose item 0 is the p-value and
	        item 1 the odds ratio.
	pSig -- upper limit for a p-value to count as significant.

	An allele is kept when its p-value is <= pSig and its odds ratio is > 1.
	The kept alleles are passed to nucsToCode, so the result is '-' when
	none qualifies.
	"""
	enriched = set()
	for allele in ps :
		stats = ps[allele]
		pval, odds = stats[0], stats[1]
		if pval <= pSig and odds > 1 :
			enriched.add(allele)
	code = nucsToCode(enriched)
	if enriched :
		# At least one allele qualified, so the code must not be the placeholder.
		assert code != '-'
	return code

def getSignificantAllele_caseDeplete(ps, pSig) :
	"""Return the nucleotide code for the significant, case-depleted alleles.

	ps   -- mapping of allele -> sequence whose item 0 is the p-value and
	        item 1 the odds ratio.
	pSig -- upper limit for a p-value to count as significant.

	An allele is kept when its p-value is <= pSig and its odds ratio is < 1.
	The kept alleles are passed to nucsToCode, so the result is '-' when
	none qualifies.
	"""
	depleted = set()
	for allele in ps :
		stats = ps[allele]
		pval, odds = stats[0], stats[1]
		if pval <= pSig and odds < 1 :
			depleted.add(allele)
	code = nucsToCode(depleted)
	if depleted :
		# At least one allele qualified, so the code must not be the placeholder.
		assert code != '-'
	return code

# CHR	 SNP		 BP   A1	  F_A	  F_U   A2		CHISQ			P		   OR		   SE		  L95		  U95
#  TE   rd0.A		  0	0		0		0	1		   NA		   NA		   NA		   NA		   NA		   NA
#  TE   rd0.T		  0	1   0.2128   0.2925	2		2.434	   0.1187	   0.6539	   0.2737	   0.3824		1.118
#  TE   rd0.C		  0	2		0 0.004717	1	   0.4451	   0.5047			0		  inf			0		  nan
#  TE   rd0.G		  0	2		0  0.02358	1		2.261	   0.1327			0		  inf			0		  nan
#  TE   rd1.A		  0	2  0.04255  0.06604	1	   0.7322	   0.3922	   0.6286	   0.5471	   0.2151		1.837
#  TE   rd1.T		  0	2		0  0.03302	1		 3.19	  0.07409			0		  inf			0		  nan
#  TE   rd1.C		  0	0		0		0	1		   NA		   NA		   NA		   NA		   NA		   NA
#  TE   rd1.G		  0	1   0.2553	0.316	2		1.338	   0.2474		0.742	   0.2586		0.447		1.232
#  TE   rd2.A		  0	2		0   0.0283	1		2.723	  0.09888			0		  inf			0		  nan
#  TE   rd2.T		  0	1   0.1915   0.2217	2	   0.4143	   0.5198	   0.8315		0.287	   0.4737		1.459
#  TE   rd2.C		  0	2		0 0.009434	1	   0.8937	   0.3445			0		  inf			0		  nan
#  TE   rd2.G		  0	2  0.02128  0.01887	1	  0.02358		0.878		 1.13	   0.7989	   0.2362		5.411

def readAssocExpand(assocName, pSig) :
	"""Summarise an 'expand' .assoc file as two nucleotide-code strings.

	assocName -- path of the .assoc file.
	pSig      -- upper limit for a p-value to count as significant.

	Data rows are consumed in consecutive groups of four. The allele of a row
	is the last character of its SNP column (e.g. 'rd0.A' -> 'A'). For every
	complete group one code is appended to each result, obtained from
	getSignificantAllele_caseEnrich and getSignificantAllele_caseDeplete.
	Lines starting with '#', the 'CHR' header line and blank lines are
	skipped. A trailing group with fewer than four rows is ignored.

	Returns (greater_seq, less_seq): the case-enriched and case-depleted
	strings, each with one character per complete group.
	"""
	greater_codes = []
	less_codes = []
	ps = {}
	rows = 0
	# 'with' guarantees the handle is closed even if parsing raises.
	with open(assocName, 'rt') as fh :
		for line in fh :
			if line.startswith('#') :
				continue
			fields = line.split()
			if not fields or fields[0] == 'CHR' :
				# Blank line or column header: nothing to parse.
				continue
			allele = fields[1][-1]
			if fields[8] != 'NA' and fields[9] != 'NA' :
				if fields[3] == '1' and fields[6] == '2' : # Checks to see if the allele is the major allele. If so, take the inverse of the listed odds ratio
					try :
						ps[allele] = [float(fields[8]), 1/float(fields[9])]
					except ZeroDivisionError :
						# A listed odds ratio of 0 cannot be inverted; record a neutral entry instead.
						ps[allele] = [1, 1]
				else : # If allele is minor allele, take the odds ratio
					ps[allele] = [float(fields[8]), float(fields[9])]
			else :
				# Missing p-value or odds ratio: record a neutral entry.
				ps[allele] = [1, 1]
			rows += 1
			if rows % 4 == 0 :
				# A full group of four rows has been collected; emit its codes and start over.
				greater_codes.append(getSignificantAllele_caseEnrich(ps, pSig))
				less_codes.append(getSignificantAllele_caseDeplete(ps, pSig))
				ps = {}
	return ''.join(greater_codes), ''.join(less_codes)

def readAssocMajor(assocName, pSig) :
	"""Summarise a 'major' .assoc file as two allele strings.

	assocName -- path of the .assoc file.
	pSig      -- upper limit for a p-value to count as significant.

	Each data row contributes exactly one character to both results. When
	the row's p-value is <= pSig, the allele goes to the case-enriched string
	if its odds ratio is > 1 and to the case-depleted string otherwise, with
	'-' written to the other string. Rows that are not significant contribute
	'-' to both. Lines starting with '#', the 'CHR' header line and blank
	lines are skipped.

	Returns (greater_seq, less_seq).
	Raises AssertionError when a significant row has no allele (A2 is '0').
	"""
	# Used when the A2 column is '-': maps the A1 base to the code for "any base except A1".
	notMap = {
		'A':'B',
		'C':'D',
		'G':'H',
		'T':'V'
	}
	greater_codes = []
	less_codes = []
	# 'with' guarantees the handle is closed even if parsing raises.
	with open(assocName, 'rt') as fh :
		for line in fh :
			if line.startswith('#') :
				continue
			fields = line.split()
			if not fields or fields[0] == 'CHR' :
				# Blank line or column header: nothing to parse.
				continue
			allele = fields[6] #Major allele
			if fields[9] != 'NA' :
				try : #Odds ratio of major allele, odds ratio calculated by PLINK is for allele 1 (typically minor)
					odds = 1/float(fields[9])
				except ZeroDivisionError :
					odds = 1000000 #Give a ridiculous odds ratio if dividing by 0 (to make it obvious)
				except ValueError : #Occurs when the odds ratio column is not a number
					odds = 1
			else :
				odds = 1
			if allele == '0' :
				allele = '-'
			elif allele == '1' : #if PLINK called the major allele as A1 (which is typically minor allele), take the listed odds ratio
				allele = fields[3]
				try :
					odds = float(fields[9])
				except ValueError :
					odds = 1
			elif allele == '-' :
				allele = notMap[fields[3]]
			pVal = float(fields[8]) if fields[8] != 'NA' else 1
			if pVal <= pSig :
				# A significant row must name an allele; '-' here means the input is inconsistent.
				assert allele != '-'
				if odds > 1 :
					greater_codes.append(allele)
					less_codes.append('-')
				else :
					less_codes.append(allele)
					greater_codes.append('-')
			else :
				greater_codes.append('-')
				less_codes.append('-')
	return ''.join(greater_codes), ''.join(less_codes)

def handleAssoc(assocName, type, phen, pSig) :
	"""Print the enriched and depleted summaries of one .assoc file as FASTA records.

	assocName -- path of the .assoc file.
	type      -- 'expand' (parsed by readAssocExpand) or 'major' (parsed by
	             readAssocMajor). The name shadows the builtin but is kept
	             because it is part of the call interface.
	phen      -- phenotype label used in the record headers.
	pSig      -- upper limit for a p-value to count as significant.

	Raises ValueError for any other type, before anything is read or printed.
	"""
	if type == 'expand' :
		sigSeq_greater, sigSeq_less = readAssocExpand(assocName, pSig)
	elif type == 'major' :
		sigSeq_greater, sigSeq_less = readAssocMajor(assocName, pSig)
	else :
		# Without this branch the prints below would fail with an UnboundLocalError.
		raise ValueError("unknown association type %r (expected 'expand' or 'major')" % (type,))
	print('>%s.%s.P<=%f,cases_enriched' %(phen, type, pSig))
	print(sigSeq_greater)
	print('>%s.%s.P<=%f,cases_depleted' %(phen, type, pSig))
	print(sigSeq_less)

def main(fastaName, pSig) :
	"""Print the consensus sequence, then the summaries of every .assoc file found.

	fastaName -- FASTA file holding the 'consensus' record.
	pSig      -- upper limit for a p-value to count as significant.

	Files matching '*expand*.assoc' and '*major*.assoc' in the current
	directory are processed. Each expand file is printed first, followed
	directly by every major file that carries the same phenotype label.
	"""
	def phenotypeOf(fileName, tag) :
		# Label = text after the tag within the part of the name before the first '.'.
		return fileName.split('.')[0].split(tag)[1]

	consensusSeq = readConsensusFromFasta(fastaName)
	print('>consensus', consensusSeq, sep='\n')
	expandFiles = glob.glob('*expand*.assoc')
	majorFiles = glob.glob('*major*.assoc')
	# The nested scan keeps both summaries of a phenotype next to each other (easier to inspect manually).
	for expandFile in expandFiles :
		phen = phenotypeOf(expandFile, 'expand_')
		handleAssoc(expandFile, 'expand', phen, pSig)
		for majorFile in majorFiles :
			if phenotypeOf(majorFile, 'major_') != phen :
				continue
			handleAssoc(majorFile, 'major', phen, pSig)

if __name__ == '__main__' :
	# Command-line entry point: two positional arguments, the FASTA path and the p-value cutoff.
	import argparse
	cli = argparse.ArgumentParser()
	cli.add_argument('fasta', help='fasta with consensus sequence')
	cli.add_argument('p', type=float, help='upper-limit p-value of significance')
	options = cli.parse_args()
	main(options.fasta, options.p)

