#! /python22/Lib

import string,sys,os,math,re,subprocess,random,cmd,time,statistics
from scipy.stats import fisher_exact

##intersects TSSs with L1 coordinates to obtain TSS profile over monomers, and outputs TSS positions for TSSprofile.py

def initiate(input, bed, meth, rmsk, output, outputTwo):
	"""Assign read-derived TSS positions to L1 elements and write two reports.

	All input files are parsed as whitespace-delimited text.

	Args:
		input: Alignment file (presumably SAM). Lines starting with "@" are
			skipped; fields 0, 2, 3, 4, 5 and 9 are read as read name,
			chromosome, start, mapping quality, CIGAR and sequence.
		bed: L1 annotation. Only rows whose field 3 is "L1MdTFI" or
			"L1MdTFII", whose field 6 is > 1 and whose chromosome name
			(field 0) is shorter than 6 characters are kept. Field 4 is the
			strand and field 7 a bracketed, comma-separated list of
			"label:start-end" monomer entries relative to the element start.
		meth: Table whose rows with field 4 starting with "L1" are indexed
			by "chrom:coord" (field 2 on "+", field 3 otherwise); fields 6
			onward are copied verbatim into the summary report.
		rmsk: Repeat annotation (presumably the UCSC rmsk table layout);
			rows whose field 11 is SINE, LTR or LINE are binned at 10 bp
			resolution and labelled with field 12.
		output: Path that receives one "chrom|position|strand" line per TSS
			that falls inside a retained 5'UTR.
		outputTwo: Path that receives one tab-separated summary line per L1
			element whose UTR span is at least 136 bp.

	Raises:
		KeyError: if a retained L1 element has no matching row in `meth`.
		IndexError / ValueError: on rows with missing or non-numeric fields.
	"""

	# --- repeat annotation: chromosome -> {10 bp bin -> repeat label} ---
	repeatList = {}
	repeatKey = {}
	with open(rmsk) as f:
		for thisline in f:
			data = thisline.split()
			if data[11] in ("SINE", "LTR", "LINE"):
				label = data[12]
				# coordinates are non-negative, so // matches int(x/10)
				firstBin = int(data[6]) // 10
				lastBin = int(data[7]) // 10
				bins = repeatKey.setdefault(data[5], {})
				repeatList[label] = 0
				for binIndex in range(firstBin, lastBin + 1):
					bins[binIndex] = label
	# "N" collects upstream TSSs that hit no annotated repeat bin.
	repeatList["N"] = 0

	# --- per-element extra columns, keyed by "chrom:strand-specific coordinate" ---
	methData = {}
	with open(meth) as f:
		for thisline in f:
			data = thisline.split()
			if data[4][:2] == "L1":
				if data[5] == "+":
					coordinates = data[1] + ":" + data[2]
				else:
					coordinates = data[1] + ":" + data[3]
				methData[coordinates] = data[6:]

	# --- L1 elements: chromosome -> list of records ---
	# Record layout: [start, stop, strand, UTRstart, UTRstop, bodyStart,
	#                 bodyStop, monomerStarts, monomerStops, TSS list, name]
	L1s = {}
	with open(bed) as f:
		for thisline in f:
			data = thisline.split()
			if data[3] not in ("L1MdTFI", "L1MdTFII"):
				continue
			if not ((int(data[6]) > 1) and (len(data[0]) < 6)):
				continue
			chromosome = data[0]
			start = int(data[1])
			stop = int(data[2])
			strand = data[4]
			# Each monomer entry is "label:start-end"; keep the numeric pair.
			monomerSpans = []
			for monomer in data[7][1:-1].split(","):
				bounds = monomer.split(":")[1].split("-")
				monomerSpans.append((int(bounds[0]), int(bounds[1])))
			if strand == "+":
				# UTR runs from the element start to the end of the last monomer.
				UTRstart = start
				UTRstop = start + monomerSpans[-1][1]
				bodyStart = UTRstop
				bodyStop = stop
				monomerStarts = [UTRstart + first for first, last in monomerSpans]
				monomerStops = [UTRstart + last - 1 for first, last in monomerSpans]
			else:
				# UTR runs from the start of the first monomer to the element stop.
				UTRstart = start + monomerSpans[0][0]
				UTRstop = stop
				bodyStart = start
				bodyStop = UTRstart
				monomerStarts = [start + first for first, last in monomerSpans]
				monomerStops = [start + last + 1 for first, last in monomerSpans]
			L1s.setdefault(chromosome, []).append([start, stop, strand, UTRstart, UTRstop, bodyStart, bodyStop, monomerStarts, monomerStops, [], data[3]])

	# --- reads: attach each distinct TSS to the first L1 whose body holds the other read end ---
	used = set()
	with open(input) as f:
		for thisline in f:
			if thisline[0] == "@":
				continue
			data = thisline.split()
			if data[2] == "*":
				continue
			if int(data[4]) <= 0:
				continue
			sequence = data[9]
			chromosome = data[2]
			start = int(data[3])
			stop = start + processCIGAR(data[5])[2]
			# Orientation comes from the adapter pair flanking the read. It is
			# reset for every read so that a read without a recognisable pair
			# is skipped instead of inheriting the previous read's strand.
			strand = None
			if ((sequence[:46] == "TCTAATACGACTCACTATAGGGCAAGCAGTGGTATCAACGCAGAGT") and (sequence[-46:] == "TTGAGCATCAGCAACAGAATACAAGAGATGAAGCTTGGCGTAATCA")):
				strand = "+"
			elif ((sequence[-46:] == "ACTCTGCGTTGATACCACTGCTTGCCCTATAGTGAGTCGTATTAGA") and (sequence[:46] == "TGATTACGCCAAGCTTCATCTCTTGTATTCTGTTGCTGATGCTCAA")):
				strand = "-"
			if strand is None:
				continue
			elements = L1s.get(chromosome)
			if not elements:
				continue
			# "+" reads: TSS is the alignment start and the far end must lie in
			# the L1 body; "-" reads: the roles of start and stop are swapped.
			if strand == "+":
				tss = start
				anchor = stop
			else:
				tss = stop
				anchor = start
			key = chromosome + "|" + str(tss) + "|" + strand
			if key in used:
				continue
			for element in elements:
				if ((element[2] == strand) and (anchor >= element[5]) and (anchor <= element[6])):
					element[9].append(tss)
					used.add(key)
					# the key is now used, so no later element could take it
					break

	# NOTE(review): monomerPositions, positions and repeatList are tallied
	# below but never written out or returned by this function.
	monomerPositions = {}
	for i in range(213):
		monomerPositions[i] = 0
	positions = {}
	for i in range(101):
		positions[i] = 0
	positions[-1] = 0

	with open(outputTwo, "w") as b, open(output, "w") as o:
		for chromosome, elements in L1s.items():
			for element in elements:
				start, stop, strand, UTRstart, UTRstop = element[:5]
				monomerStarts = element[7]
				monomerStops = element[8]
				UTRlength = UTRstop - UTRstart
				if UTRlength < 136:
					continue
				isPlus = (strand == "+")
				UTRcount = 0
				upstreamCount = 0
				for tss in element[9]:
					# Distance from the 5' edge of the UTR in transcript
					# orientation; negative means upstream of the UTR.
					if isPlus:
						offset = tss - UTRstart
					else:
						offset = UTRstop - tss
					position = int(round(100 * float(offset) / float(UTRlength), 0))
					if position >= 100:
						position = 100
						# NOTE(review): only "+" elements emit a TSS line here,
						# and UTRcount is not incremented; kept as found.
						if isPlus:
							o.write(chromosome + "|" + str(tss) + "|+\n")
					elif offset < 0:
						# Covers both "rounded position below 0" and "rounds
						# to 0 but still outside the UTR".
						upstreamCount = upstreamCount + 1
						term = repeatKey.get(chromosome, {}).get(tss // 10, "N")
						repeatList[term] = repeatList[term] + 1
						position = -1
					else:
						UTRcount = UTRcount + 1
						if isPlus:
							o.write(chromosome + "|" + str(tss) + "|+\n")
							monomerPosition = 212 - (monomerStarts[-1] - tss) % 212
						else:
							o.write(chromosome + "|" + str(tss) + "|-\n")
							monomerPosition = 212 - (tss - monomerStops[0] - 1) % 212
						monomerPositions[monomerPosition] = monomerPositions[monomerPosition] + 1
					positions[position] = positions[position] + 1

				# Extra columns are keyed by the element's strand-specific edge.
				if isPlus:
					otherTerms = methData[chromosome + ":" + str(start)]
				else:
					otherTerms = methData[chromosome + ":" + str(stop)]
				fields = [chromosome, str(start), str(stop), strand, str(UTRlength), element[10], str(UTRcount), str(upstreamCount)]
				fields.extend(otherTerms)
				b.write("\t".join(fields) + "\n")

def processCIGAR(input):
	"""Summarise a CIGAR string.

	Args:
		input: CIGAR string such as "5S90M2D10M3S" ("*" yields all zeros).

	Returns:
		A three-item list [leadingSoftClip, trailingSoftClip, referenceLength].
		referenceLength sums every operation that consumes reference bases:
		M, D, N, = and X. Insertions, clips and padding do not contribute.

	Raises:
		ValueError: if a counted operation is not preceded by a number.
	"""

	leadingClip = 0
	trailingClip = 0
	referenceLength = 0
	# Becomes True once any aligned/inserted segment has been seen, so that a
	# soft clip following only a hard clip ("2H5S...") still counts as leading.
	seenAlignment = False
	numberStart = 0
	for index, symbol in enumerate(input):
		if symbol in "0123456789":
			continue
		if symbol == "S":
			if seenAlignment:
				trailingClip = int(input[numberStart:index])
			else:
				leadingClip = int(input[numberStart:index])
		elif symbol in "MDN=X":
			# reference-consuming operations
			referenceLength = referenceLength + int(input[numberStart:index])
			seenAlignment = True
		elif symbol == "I":
			seenAlignment = True
		numberStart = index + 1
	return [leadingClip, trailingClip, referenceLength]

# Script entry point.
#   argv[1]: alignment file to process
#   argv[2]: output path for the TSS position list
#   argv[3]: output path for the per-element summary table
# The three annotation files are fixed names resolved against the current
# working directory.
alignmentPath = sys.argv[1]
l1AnnotationPath = "L1Md.joined.6kbp.notF.reannotated.with_match.monocount.bed"
segmethPath = "mm10.10kbp_L1_5UTR.mESC_data.excl_ambig.segmeth.tsv"
repeatMaskerPath = "rmsk_mm10.txt"
initiate(
	alignmentPath,
	l1AnnotationPath,
	segmethPath,
	repeatMaskerPath,
	sys.argv[2],
	sys.argv[3],
)
sys.exit()