#! /usr/bin/env python

import sys
import argparse
import os

#This function parses the file string, and creates the appropriate file object
def openFile(fileString,chrom):
	"""Parse a track specification and build the track record for it.

	fileString has the form '[!]name[:pad[:pad]]':
	  - a leading '!' marks the track as inverted (logical NOT);
	  - name is the path of a BED file, or 'stdin' to read from sys.stdin;
	  - ':n' pads n bp on both sides, ':-n' pads n bp upstream only, ':+n' pads
	    n bp downstream only, and ':-n:+m' (in either order) combines both.
	chrom maps chromosome names to chromosome lengths.

	Returns a dict holding the open file, the parsed options and the reader
	state, with the first bed element already loaded into 'bed'. A malformed
	specification, a missing or unreadable file, or an empty file is reported
	on stderr and ends the program with exit status 1.
	"""
	#A leading '!' requests the inverted track. startswith() also copes with an empty string.
	invertFile=fileString.startswith('!')
	if invertFile:
		p1File=fileString.lstrip('!')
	else:
		p1File=fileString

	#Everything after the file name is padding, stored as [upstream, downstream].
	splitFileName=p1File.split(':')
	f2File=splitFileName[0]
	padParams=splitFileName[1:]
	paddingError='ERROR: Incorrect padding. If two padding parameters are given, one must start with "+" and one with "-"\n'

	if len(padParams)>2:
		#Report the specification exactly as it was passed to this function.
		sys.stderr.write('ERROR: To many padding parameters: "%s". At most two can be used.\n'%fileString)
		sys.exit(1)
	if '' in padParams:
		sys.stderr.write(paddingError)
		sys.exit(1)

	try:
		if len(padParams)==0:
			padding=[0,0]
			orientedFile=False
		elif len(padParams)==1:
			if padParams[0][0]=='+':
				padding=[0,int(padParams[0])]
				orientedFile=True
			elif padParams[0][0]=='-':
				#int() keeps the sign, so negate to store a positive upstream width.
				padding=[-int(padParams[0]),0]
				orientedFile=True
			else:
				padding=[int(padParams[0]),int(padParams[0])]
				orientedFile=False
		else:
			signs=(padParams[0][0],padParams[1][0])
			if signs==('+','-'):
				padding=[-int(padParams[1]),int(padParams[0])]
			elif signs==('-','+'):
				padding=[-int(padParams[0]),int(padParams[1])]
			else:
				sys.stderr.write(paddingError)
				sys.exit(1)
			orientedFile=True
	except ValueError:
		sys.stderr.write("ERROR: Padding can't be converted to int.\n")
		sys.exit(1)

	#Create structure containing all relevant information about a track:
	newFile={}
	if f2File=='stdin':
		newFile['file']=sys.stdin
	else:
		if not os.path.isfile(f2File):
			sys.stderr.write('ERROR: File "%s" can\'t be found.\n'%f2File)
			sys.exit(1)
		try:
			newFile['file']=open(f2File)
		except IOError:
			#The file exists but cannot be read (e.g. missing permissions).
			sys.stderr.write('ERROR: File "%s" can\'t be opened.\n'%f2File)
			sys.exit(1)

	newFile['fileName']=f2File
	newFile['padding']=padding
	newFile['invert']=invertFile
	newFile['endOfFile']=False
	newFile['noMoreElements']=False
	#Start/end of -1 mark that no element has been produced yet; the chromosome is the first one in sorted order.
	newFile['element']=[findNextChromosome(chrom, ''),-1,-1]
	newFile['orientedFile']=orientedFile
	newFile['sortInfo']=['',-1]
	newFile['bedList']=[]
	newFile['bed']=[]
	newFile['nextBed']=[]

	#Load the first bed element so that callers can start merging right away.
	readBed(newFile,chrom)

	if newFile['bed']==[]:
		sys.stderr.write('ERROR: File %s empty\n'%f2File)
		sys.exit(1)

	return newFile

#Converts a line to a bed-element.
def lineToBed(line):
	"""Convert one tab-separated BED line into [chrom, start, end, orientation].

	start and end are returned as ints. orientation is the sixth column when
	the line has at least six columns and '' otherwise. The program is ended
	with exit status 1 (after a message on stderr) when the line has fewer than
	three columns, when the coordinates are not integers, when start is
	negative or end is not larger than start, or when a sixth column is present
	but is neither '+' nor '-'.
	"""
	data=line.rstrip('\n').split('\t')
	if len(data)<3:
		sys.stderr.write('ERROR: Invalid bed element: %s\n'%line)
		sys.exit(1)

	newChr=data[0]
	try:
		newStart=int(data[1])
		newEnd=int(data[2])
	except ValueError:
		sys.stderr.write('ERROR: Coordinates in bed-file not integers. Last line: %s\n'%line)
		sys.exit(1)

	if newStart<0 or newEnd<=newStart:
		sys.stderr.write('ERROR: Invalid bed coordinate in line %s\n'%line)
		sys.exit(1)

	if len(data)>=6:
		newOrientation=data[5]
		#Compare by value; an identity test on strings only works by accident.
		if newOrientation not in ('+','-'):
			sys.stderr.write('ERROR: Invalid orientation. If given, it must be "+" or "-". Last line: %s\n'%line)
			sys.exit(1)
	else:
		newOrientation=''

	return [newChr, newStart, newEnd, newOrientation]

#Adds padding to a bed element, respecting the boundaries of the chromosome
def padBed(bed, padding,chrom):
	"""Pad a bed element in place and clip it to its chromosome.

	bed is [chrom, start, end, orientation] and padding is a pair of widths.
	Equal widths are applied to both sides regardless of orientation. Unequal
	widths need an oriented element: on '+' padding[0] is subtracted from the
	start and padding[1] added to the end, on '-' the two are swapped. Any
	other orientation is reported on stderr and ends the program with status 1.
	chrom maps chromosome names to lengths. Nothing is returned.
	"""
	beforeStart, afterEnd = padding[0], padding[1]

	#Only asymmetric padding needs to look at the orientation (index 3).
	if beforeStart!=afterEnd and bed[3]!='+':
		if bed[3]=='-':
			beforeStart, afterEnd = afterEnd, beforeStart
		else:
			sys.stderr.write('ERROR: Unoriented bed element, assymetric padding.\n')
			exit(1)

	bed[1]-=beforeStart
	bed[2]+=afterEnd

	#Clip the padded element so that it stays inside the chromosome.
	bed[1]=max(bed[1],0)
	bed[2]=min(bed[2],chrom[bed[0]])
	

#Reads the next element in the bed file. If two elements overlap, they are merged into one.
def readNextElement(f,chrom):
	"""Advance track f to its next element and store it in f['element'].

	For a regular track the element is the cached bed element merged with every
	following bed element that overlaps or touches it. For an inverted track
	the element is the next stretch that is NOT covered by any bed element,
	walking through the chromosomes of chrom in sorted name order.

	f is a track record as built by openFile; chrom maps chromosome names to
	lengths. Returns True when f['element'] holds a new [chrom, start, end]
	element, and False (with f['element'] cleared) when the track is exhausted.
	"""
	#Checking if the last element was read last time. If so, returns false.
	if f['noMoreElements']:
		f['element']=[]
		return False

	if f['invert']:
		return _readNextInvertedElement(f,chrom)
	return _readNextMergedElement(f,chrom)


#Regular track: merges the cached bed element with all following overlapping or touching ones.
def _readNextMergedElement(f,chrom):
	if f['endOfFile']:
		f['noMoreElements']=True
		#readBed clears 'bed' when it flags the end of the file, so there may be nothing left to emit.
		if f['bed']==[]:
			f['element']=[]
			return False
		f['element']=[f['bed'][0], f['bed'][1], f['bed'][2]]
		f['bed']=[]
		return True

	newChr, newStart, newEnd = f['bed'][0], f['bed'][1], f['bed'][2]

	while True:
		readBed(f, chrom)

		if f['bed']==[]:
			#The file is used up; the element built here is the last one of this track.
			f['noMoreElements']=True
			break

		#Stop at the first bed element that does not overlap or touch the merged region.
		if f['bed'][0]!=newChr or f['bed'][1]>newEnd:
			break

		newEnd=max(newEnd, f['bed'][2])

	f['element']=[newChr, newStart, newEnd]
	return True


#Inverted track: produces the next stretch of the genome that no bed element covers.
def _readNextInvertedElement(f,chrom):
	#Step 0: Nothing has been emitted yet (start is still -1).
	if len(f['bed'])>0 and f['element'][1]==-1:
		firstChrom=f['element'][0]
		#Chromosomes sorting before the first bed element are uncovered as a whole.
		#Step 1 then walks on until the chromosome of the first bed element is reached.
		if f['bed'][0]!=firstChrom:
			f['element']=[firstChrom, 0, chrom[firstChrom]]
			return True
		if f['bed'][1]>0:
			f['element']=[firstChrom, 0, f['bed'][1]]
			return True

	#Step 1: The last element went all the way to the end of its chromosome.
	if f['element'][2]==chrom[f['element'][0]]:
		nextChromosome=findNextChromosome(chrom, f['element'][0])

		#Checking so next chromosome exists..
		if nextChromosome=='':
			f['element']=[]
			f['endOfFile']=True
			f['noMoreElements']=True
			return False

		#Step 1a: No bed element on the next chromosome, so the whole chromosome is uncovered.
		if f['bed']==[] or f['bed'][0]!=nextChromosome:
			f['element']=[nextChromosome, 0, chrom[nextChromosome]]
			return True

		#Step 1b: The next bed element leaves a gap at the start of the chromosome.
		if f['bed'][1]!=0:
			f['element']=[nextChromosome, 0, f['bed'][1]]
			return True

	#Step 2: The next element starts where the covered region around 'bed' ends.
	bedChrom, bedEnd = f['bed'][0], f['bed'][2]

	#Step 3: Extend the covered region until a gap is found.
	while True:
		readBed(f, chrom)

		if f['bed']==[]:
			#No bed elements are left: emit what remains after the covered region.
			if bedEnd<chrom[bedChrom]:
				f['element']=[bedChrom, bedEnd, chrom[bedChrom]]
				return True

			nextChrom=findNextChromosome(chrom, bedChrom)
			if nextChrom=='':
				#The covered region ends exactly at the end of the last chromosome.
				f['element']=[]
				f['noMoreElements']=True
				return False

			f['element']=[nextChrom, 0, chrom[nextChrom]]
			#NOTE(review): 'noMoreElements' is deliberately left unset here, even if this is the
			#last chromosome. Step 1 detects the end on the next call; flagging it now would
			#report the track as finished while this element is still waiting to be combined.
			return True

		#The new bed element overlaps or touches the covered region: extend it.
		if f['bed'][0]==bedChrom and f['bed'][1]<=bedEnd:
			bedEnd=max(bedEnd, f['bed'][2])
			continue

		#The covered region ends at the chromosome end and the new bed element starts right
		#at the beginning of the next chromosome: there is no gap in between.
		if bedEnd==chrom[bedChrom] and f['bed'][0]==findNextChromosome(chrom, bedChrom) and f['bed'][1]==0:
			bedChrom, bedEnd = f['bed'][0], f['bed'][2]
			continue

		#The new bed element is not connected: the gap before it (or up to the chromosome end) is the next element.
		if f['bed'][0]==bedChrom:
			f['element']=[bedChrom, bedEnd, f['bed'][1]]
		else:
			f['element']=[bedChrom, bedEnd, chrom[bedChrom]]
		return True
				

#Reads the next bed element in a file, with neat extra features:
#1) Makes sure the file is sorted
#2) Makes sure the elements are appropriate
#3) If asymmetric padding is used, the bed elements can be a bit out of order, even in a sorted file. The function
#	reads in enough lines to handle this.
def readBed(f, chrom):
	"""Make f['bed'] the next bed element of track f, ordered by padded start.

	Three pieces of state are used: f['nextBed'] is the element most recently
	read from the file, f['bedList'] is a window of already-read elements kept
	sorted by start, and f['bed'] is the element currently handed out (the
	first one in f['bedList']). The window is needed because unequal padding
	can change the order of neighbouring elements.

	Returns True when f['bed'] holds an element. Returns False, with f['bed']
	cleared and f['endOfFile'] set, when nothing is left.
	"""
	#Elements whose starts are this close together may swap places once padded.
	paddingAssymetry=abs(f['padding'][0]-f['padding'][1])

	#Reads 'nextBed' if it is empty.
	if len(f['nextBed'])==0:
		readNextBed(f,chrom)

	#The element handed out last time is still in the window. Remove it.
	if len(f['bed'])!=0 and len(f['bedList'])!=0:
		f['bedList'].remove(f['bed'])

	#If the window is empty, refill it from 'nextBed' and read a new 'nextBed'.
	if len(f['bedList'])==0:
		if len(f['nextBed'])==0:
			f['endOfFile']=True
			f['bed']=[]
			return False
		f['bedList'].append(f['nextBed'])
		readNextBed(f,chrom)

	#Pull in every following element on the same chromosome that starts close enough to
	#end up in front of the first window element. The window is re-sorted after each
	#addition because the loop condition looks at its first element.
	while len(f['nextBed'])!=0 and f['bedList'][0][0]==f['nextBed'][0] and f['bedList'][0][1]>=(f['nextBed'][1]-paddingAssymetry):
		f['bedList'].append(f['nextBed'])
		f['bedList'].sort(key=lambda x:x[1])
		readNextBed(f,chrom)

	#Sort in place (sorted() alone would throw the result away) and hand out the first element.
	f['bedList'].sort(key=lambda x:x[1])
	f['bed']=f['bedList'][0]

	return True


#Reads a new element 'nextBed'
def readNextBed(f, chrom):
	"""Read the next line of track f into f['nextBed'] as a padded bed element.

	Blank lines are skipped. The element must be oriented when the track uses
	one-sided padding, must lie on a chromosome listed in chrom, and must not
	sort before the previously read line (compared on the unpadded chromosome
	and start); otherwise a message is written to stderr and the program ends
	with exit status 1.

	Returns True when an element was read, and False (with f['nextBed']
	cleared) at the end of the file.
	"""
	lastBed=f['nextBed']

	#readline() returns '' only at the end of the file; a blank line still carries its
	#newline, so testing the raw line keeps blank lines from being taken for the end.
	while True:
		rawLine=f['file'].readline()
		if rawLine=='':
			f['nextBed']=[]
			return False
		newLine=rawLine.rstrip('\n')
		if newLine.strip()!='':
			break

	f['nextBed']=lineToBed(newLine)
	newSortInfo=newLine.split('\t')[:2]

	#Checking so element is oriented if required
	if f['orientedFile'] and f['nextBed'][3]=='':
		sys.stderr.write('ERROR: Bed element not oriented as required for assymetric paddings.\n')
		sys.exit(1)
	#Checking so chromosome is recognized
	if f['nextBed'][0] not in chrom:
		sys.stderr.write('ERROR: Element in genome(unknown chromosome): "%s\t%d\t%d"\n'%(f['nextBed'][0],f['nextBed'][1],f['nextBed'][2]))
		sys.exit(1)
	padBed(f['nextBed'],f['padding'],chrom)

	#The sort check uses the unpadded coordinates, because one-sided padding can reorder padded elements.
	if len(lastBed)>0:
		if newSortInfo[0]<f['sortInfo'][0] or ( newSortInfo[0]==f['sortInfo'][0] and int(newSortInfo[1])<int(f['sortInfo'][1])):
			sys.stderr.write('ERROR: Bed file %s not sorted.\n'%f['fileName'])
			#Show the previously read (padded) element; diagnostics go to stderr to keep stdout clean.
			sys.stderr.write('%s\n'%(lastBed,))
			sys.exit(1)

	#Remember chromosome and start of this line for the next sort check.
	f['sortInfo']=newSortInfo
	return True

#Finds the next chromosome
def findNextChromosome(chrom_sizes, current):
	"""Return the smallest chromosome name that sorts after current.

	Names are compared as plain strings. Returns '' when no name in
	chrom_sizes sorts after current.
	"""
	later=[name for name in chrom_sizes if name>current]
	if not later:
		return ''
	return min(later)

#Given a list of files, finds the file with element ending first(i.e. the next to be read)
def findFirstEndingElement(fileList):
	"""Return the index of the track whose current element ends first.

	Elements are ordered by chromosome name and then by end coordinate; on a
	tie the earliest track wins. Tracks with an empty element are ignored.
	Returns -1 when no track holds an element.
	"""
	bestChr, bestEnd, bestIndex = '', -1, -1

	for index, track in enumerate(fileList):
		element=track['element']
		if len(element)==0:
			continue
		#Tuple comparison orders by chromosome first and by end second.
		if bestChr=='' or (element[0], element[2])<(bestChr, bestEnd):
			bestChr, bestEnd, bestIndex = element[0], element[2], index

	return bestIndex

#Given a list of files, finds the file with element starting last
def findLastStartingElement(fileList):
	"""Return the index of the track whose current element starts last.

	Elements are ordered by chromosome name and then by start coordinate; on a
	tie the earliest track wins. Tracks with an empty element are ignored.
	Returns -1 when no track holds an element.
	"""
	topChr, topStart, topIndex = '', -1, -1

	for index, track in enumerate(fileList):
		element=track['element']
		if len(element)==0:
			continue
		#Tuple comparison orders by chromosome first and by start second.
		if topChr=='' or (element[0], element[1])>(topChr, topStart):
			topChr, topStart, topIndex = element[0], element[1], index

	return topIndex

#Given a list of files, finds the file with element starting first(i.e. the next to be read). If looking for first in a subset of the files, pass list of indices.
def findFirstStartingElement(fileList, possible=-1):
	"""Return the index of the track whose current element starts first.

	Elements are ordered by chromosome name and then by start coordinate; on a
	tie the earliest candidate wins. possible restricts the search to the given
	track indices; the default -1 searches all tracks. Tracks with an empty
	element are ignored. Returns -1 when no candidate holds an element.
	"""
	firstChr, firstStart, first = '', -1, -1

	if possible==-1:
		possible=range(len(fileList))

	for i in possible:
		element=fileList[i]['element']
		if len(element)==0:
			continue
		#Remember the START of the best element, since that is what later candidates are compared against.
		if first==-1 or (element[0], element[1])<(firstChr, firstStart):
			firstChr, firstStart, first = element[0], element[1], i

	return first

#Bitwise OR between tracks in fileList, returns first element
def bitwiseOR(fileList):
	"""Return the first element of the union of the tracks' current elements.

	Starting from the current element that starts first, every other current
	element that overlaps or touches the growing region is merged into it.
	Returns [chrom, start, end] of the merged region. At least one track must
	hold a non-empty element.
	"""
	#Indices of the tracks not merged yet. This has to be a real list because entries are removed.
	notConnectedYet=list(range(len(fileList)))

	i=findFirstStartingElement(fileList)
	chromName=fileList[i]['element'][0]
	start=fileList[i]['element'][1]
	end=fileList[i]['element'][2]
	notConnectedYet.remove(i)

	while len(notConnectedYet)!=0:
		i=findFirstStartingElement(fileList,notConnectedYet)

		#None of the remaining tracks holds an element.
		if i==-1:
			break

		candidate=fileList[i]['element']
		#The earliest remaining element starts after the region, so nothing else can connect.
		if candidate[0]>chromName or (candidate[0]==chromName and candidate[1]>end):
			break

		end=max(end,candidate[2])
		notConnectedYet.remove(i)

	return [chromName, start, end]
		
#Bitwise AND between tracks in fileList, returns element.
def bitwiseAND(fileList):
	"""Return the intersection of the tracks' current elements.

	The intersection runs from the start of the element that starts last to
	the end of the element that ends first. When those two elements are on
	different chromosomes, or the start is not below the end, a zero-length
	element positioned at the first end is returned instead.
	"""
	lastIndex=findLastStartingElement(fileList)
	firstIndex=findFirstEndingElement(fileList)

	latest=fileList[lastIndex]['element']
	earliest=fileList[firstIndex]['element']

	chromName, cutoff = earliest[0], earliest[2]
	if latest[0]==chromName and latest[1]<cutoff:
		begin=latest[1]
	else:
		#No overlap: collapse to an empty element at the first end.
		begin=cutoff

	return [chromName, begin, cutoff]

#Checks if we have reached the end of the calculation(AND: end of first file, OR: end of last file)
def stopReading(fileList, orStatus):
	"""Tell whether the calculation is finished.

	A track counts as exhausted once its 'noMoreElements' flag is set AND its
	current element has been cleared. The flag alone is not enough: it may be
	set while the track still holds its final element, and that element must
	still be combined with the other tracks.

	With orStatus true (OR mode) the result is True when every track is
	exhausted; otherwise (AND mode) it is True as soon as one track is.
	"""
	exhausted=[track['noMoreElements'] and len(track['element'])==0 for track in fileList]

	#If all files are at the end, return True, otherwise false
	if orStatus:
		return all(exhausted)

	#If program is running in AND mode, read until first file terminates.
	return any(exhausted)

#########################
# Main code starts here #
#########################

#Parsing arguments
parser = argparse.ArgumentParser(description='Performs a base-pair-wise overlap for a number of tracks.')
parser.add_argument('chromPath', metavar='chrom.sizes', help='Chromosome size file.')
parser.add_argument('file', metavar='!f.bed:n', help='Sorted input BED file. Pass \"stdin\" to read from STDIN. Adding \'!\' in front performs logical NOT on full track (writing \'\\!\' may be necessary in BASH). Elements can be padded symmetrically by adding :n after file name. Adding :-n only pads upstream, and :+n pads downstream. The combination :-n:+m is allowed.', nargs='+', default='stdin')
parser.add_argument("--verbose", help="increase output verbosity", action="store_true")
parser.add_argument('--OR', help='Performs base-pair-wise OR instead of AND(default)', action="store_true")
parser.add_argument('--bed', metavar="out.bed", help='Outputs data to out.bed')
args = parser.parse_args()


#Reading chromosome lengths (tab-separated: name, length)
chromosomeLengths={}
if args.verbose:
	sys.stderr.write('Reading chromosome lengths.\n')

chrom_sizes_file=args.chromPath

try:
	with open(chrom_sizes_file) as f_chrom_sizes:
		for line in f_chrom_sizes:
			line=line.rstrip('\n')

			#Blank lines are skipped instead of silently ending the file.
			if line=='':
				continue

			data=line.split('\t')
			if len(data) != 2:
				sys.stderr.write('ERROR: Incorrect number of columns in %s\n'%chrom_sizes_file)
				sys.exit(1)

			#Converting chromosome length string to int
			try:
				chromLength=int(data[1])
			except ValueError:
				sys.stderr.write('ERROR: Coordinate not integer valued in %s\n'%chrom_sizes_file)
				sys.exit(1)

			chromosomeLengths[data[0]]=chromLength

except IOError:
	sys.stderr.write("ERROR: Can't open chromosome size file %s\n"%chrom_sizes_file)
	sys.exit(1)

#With --bed the resulting elements are written out; without it only the covered length is summed up.
if args.bed:
	try:
		if args.bed=='stdout':
			outFile=sys.stdout
		else:
			outFile=open(args.bed,'w')
	except IOError:
		sys.stderr.write("ERROR: Can't open output bed-file %s\n"%args.bed)
		sys.exit(1)
else:
	trackLength=0

#Parsing file information and creating file objects
bedFiles=[]
for f in args.file:
	bedFiles.append(openFile(f,chromosomeLengths))
	#Reads the first element in each file
	readNextElement(bedFiles[-1],chromosomeLengths)

#Finds the first element
if args.OR:
	currentElement=bitwiseOR(bedFiles)
else:
	currentElement=bitwiseAND(bedFiles)

#Looping through files, applies AND or OR
while not stopReading(bedFiles, args.OR):

	#Advance the track whose current element ends first.
	if not readNextElement(bedFiles[findFirstEndingElement(bedFiles)],chromosomeLengths):
		#That track is used up. In AND mode nothing new can overlap. In OR mode the other
		#tracks may still hold elements that have to be emitted, so combine again unless
		#every track is empty.
		if not args.OR or all(len(track['element'])==0 for track in bedFiles):
			continue

	if args.OR:
		nextElement=bitwiseOR(bedFiles)
	else:
		nextElement=bitwiseAND(bedFiles)

	#If the next element overlaps the current, join them, otherwise print the current, and move forward.
	if nextElement[0]==currentElement[0] and nextElement[1]<=currentElement[2]:
		#max() keeps the joined element from shrinking when the next one ends earlier.
		currentElement[2]=max(currentElement[2],nextElement[2])
	else:
		if args.bed:
			#Zero-length elements stand for "no overlap" and are not written.
			if currentElement[1] != currentElement[2]:
				outFile.write("%s\t%d\t%d\n"%(currentElement[0], currentElement[1], currentElement[2]))
		else:
			trackLength+=currentElement[2] - currentElement[1]

		currentElement=nextElement

#The element still being built when the loop ends has not been reported yet.
if args.bed:
	if currentElement[1] != currentElement[2]:
		outFile.write("%s\t%d\t%d\n"%(currentElement[0], currentElement[1], currentElement[2]))
	if outFile is not sys.stdout:
		outFile.close()
else:
	trackLength+=currentElement[2] - currentElement[1]
	#Calculating length of genome
	genomeLength=sum(chromosomeLengths.values())
	if genomeLength>0:
		coverage=100*float(trackLength)/float(genomeLength)
	else:
		#Avoids a division by zero when all chromosome lengths are zero.
		coverage=0.0
	sys.stdout.write("%d\t%d\t%.2f%%\n"%(trackLength, genomeLength, coverage))
