# MIT License

# Copyright (c) 2021 Nelson T. Chuang

# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:

# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.

# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.


#L1_analysis4.py
#----------
#usage:
#----------
# python L1_analysis4.py [-p] [-c] [-v] <input fasta file or folder>

#-p pacbio mode
#-c output canonical positions to "canonical_positions.txt"
#-v	verbose mode

#-----------
#Description:
#-----------
#Pipeline to analyze L1 sequences-primarily for full-length annotation
#Designed to annotate full-length fasta sequences
#
#Dependencies: Requires reference L1 consensus, Mummer/Nucmer, Needleman
#
#Reads folder of fasta files in format: 'chr_position.fa'
#Creates /Mummer and /Needle for global alignments
#Outputs tab-delim of analysis to 'L1_analysis.txt'


#-----------
#Change log
#----------
#1/15/19: added "_truncated" for Ta1ds with 5' truncation of the 1d canonical site
#1/24/18: updating the canonical positions to include printout of new canonical pos
#11/27/17: added output of canonical positions flag
#11/17/17: added the Ta1d-CCA and Ta1d-TCA subfamilies
#11/14/17: added the Ta1d-CA and Ta1d-AT subfamily 
#11/12/2017: changed the Ambig-Ta to distinguish Ambig-Ta0 and Ambig-Ta1nd
#6/21/17: fixed bug with input file having '.' before the .fa
#5/3/17: Need to add back original PB pipeline parser for sequence_amplicons.fa
#-pacbio amplicon_analysis.fastas have multiple barcode sequences
#-pacbio mode will just aggregate the assemblies into default fastas dir
#-fixed bug with ambiguous family assignments that messed up subfam assignment
#-added Ambig-Ta1d which is ACA with 1d at 74
#4/14/17: added argument parsing and allows input of single multi-fasta file
#-will create directories: 'alignment' and 'fastas'
#-still dependent on single fasta files for nucmer
#-given input, it will detect if file or folder
#4/12/17: added strand to output
#4/6/17: included checking if no L1 exists if a submitted site is false
#3/31/17: adjusting it to run on new data, SVU41
#-it will accept .fsa/.fa/.fasta
#12/5/16: reads in folder of individual fasta files.
#-fasta files have to be named their corresponding CHR_POS.fa
#11/26/16: completed the internal deletion checker
#-Extract correct L1 sequence with large gap
#-fixed paths to Needle
#9/29/16:
#-Tweaked Nucmer to detect smaller clusters and account for large gaps
#6/21/16:
#-use Mummer to find start/end to trim and extract flanks
#-use Needleman to align for variant calling and accurate ORFps
#-Created own subfamily caller
#***********************************************************************************

#libraries
from subprocess import Popen,PIPE
import os,sys,re
from glob import glob
import shutil
from Bio import SeqIO,AlignIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Emboss.Applications import NeedleCommandline
import argparse 


#----------------------------------------------------------
#Parse arguments
#----------------------------------------------------------
#Command-line interface: one positional input plus three optional on/off switches
parser=argparse.ArgumentParser()
parser.add_argument(
	"input",
	type=str,
	help="Input fasta file or folder",
)
parser.add_argument(
	"-p", "--pacbio",
	action="store_true",
	help="Input directory is where PacBio assemblies are.",
)
parser.add_argument(
	"-v", "--verbose",
	action="store_true",
	help="display all processing output",
)
parser.add_argument(
	"-c", "--canonical",
	action="store_true",
	help="output the canonical positions",
)
#args.input (str), args.pacbio / args.verbose / args.canonical (bool) are read throughout the script
args=parser.parse_args()

#in future can add verbosity setting for amount of output/status to display

#-----------------------------------------------------------
#Initialization
#-----------------------------------------------------------
#reference L1 consensus handed to both nucmer and needle
ref_cons="/home/nchuang/reference_files/L1Ta.cons.2019.fa"

#working locations and defaults
mydir="alignments" #folder where intermediate processing files are kept for reference
fastas_dir="fastas" #default folder holding one fasta file per site
fasta_ext='fa' #default fasta extension

#site bookkeeping
allsites,mysites,badsites=[],[],[] #all submitted sites / sites aligning to reference L1 / sites with no alignment

#always start from an empty alignments folder
if os.path.exists(mydir):
	shutil.rmtree(mydir)
os.mkdir(mydir)



#Resolve the command-line input into a folder of fasta files (fastas_dir).
#Three input modes are supported:
# 1. a single (multi-)fasta file: every record is written to its own file in fastas_dir
# 2. an existing folder of fasta files: used in place
# 3. -p/--pacbio with a folder: each CHR_POS subfolder's amplicon_analysis.fasta
#    is copied into fastas_dir as CHR_POS.fa
if os.path.isfile(args.input):
	#start from an empty fastas folder
	if os.path.exists(fastas_dir):
		shutil.rmtree(fastas_dir)
	os.mkdir(fastas_dir)
	#one output file per record, named after the record id
	for seq_record in SeqIO.parse(args.input,"fasta"):
		SeqIO.write(seq_record, "{}/{}.fa".format(fastas_dir,seq_record.id), "fasta")

#read in fastas directory if exists already
elif os.path.isdir(args.input) and not args.pacbio:
	fastas_dir = args.input #target dir with sequences
	if fastas_dir.endswith('/'): #remove the end / if they add it
		fastas_dir=fastas_dir[:-1]

#Pacbio mode, aggregate the amplicon_analysis.fastas into one 'fastas' folder
elif args.pacbio and os.path.isdir(args.input):
	pbdir=args.input

	#create fastas_dir for pipeline
	if os.path.exists(fastas_dir):
		shutil.rmtree(fastas_dir)
	os.mkdir(fastas_dir)

	#site directories are the entries whose name matches [0-9XY]*_[0-9]*
	pbsites=[os.path.basename(x) for x in glob("{}/[0-9XY]*_[0-9]*".format(pbdir))]

	#copy every non-empty PacBio result; report the sites that have none
	for site in pbsites:
		pbfile="{}/{}/amplicon_analysis.fasta".format(pbdir,site)
		if (os.path.isfile(pbfile)) and (os.stat(pbfile).st_size!=0):
			shutil.copyfile(pbfile, "{}/{}.fa".format(fastas_dir,site))
		else:
			print("No alignment found for {}.".format(site))

#Input is neither a file nor a directory: exit through argparse's usage error
else:
	#was 'arg.pacbio' (undefined name), which raised NameError instead of showing the message
	if args.pacbio:
		parser.error("Invalid Pacbio directory!")
	else:
		parser.error("Invalid input!")
	
	
print()
print("-"*50)
print("1. Reading available L1 raw sequences...")
print("-"*50)


#collect the site name (file name minus extension) of every fasta file in fastas_dir
for file in os.listdir(fastas_dir):
	if not file.endswith((".fsa",".fasta",".fa")):
		continue #not a fasta file
	#split at the last '.', so site names that themselves contain '.' stay intact;
	#fasta_ext keeps the extension of the most recently seen file for the nucmer input path
	site,_,fasta_ext=file.rpartition('.')
	allsites.append(site)

print("Sequences to be processed: {}".format(len(allsites)))
print()
		
#-----------------------------------------------------------
#Mummer
#-----------------------------------------------------------

print("-"*50)
print("2. Mummer and L1 flank trimming")
print("-"*50)
print("Creating directory /Mummer...",end='')
#start from an empty Mummer folder inside the alignments folder
mummer_dir="{}/Mummer".format(mydir)
if (os.path.exists(mummer_dir)):
	shutil.rmtree(mummer_dir)
os.mkdir(mummer_dir)
print("done")

#run Nucmer for L1 boundaries
print("Running Nucmer to identify L1 boundaries...",end='')

#values passed to nucmer's --maxgap / --mincluster / --minmatch options
maxgap=1000
mincluster=20
minmatch=20


#One nucmer run per site: reference consensus against that site's fasta file.
#Output files are prefixed <mydir>/Mummer/<site>; the <site>.coords file read
#by the next step is expected among them (requested with -o).
for site in allsites:
	proc=Popen(["nucmer","--maxgap={}".format(maxgap),"--mincluster={}".format(mincluster),"-o",
				"--minmatch={}".format(minmatch),"--pref={}/Mummer/{}".format(mydir,site),
				ref_cons, "{}/{}.{}".format(fastas_dir,site,fasta_ext)],
				stdout=PIPE,stderr=PIPE,universal_newlines=True)
	#communicate() drains both pipes while waiting; wait() alone can block forever
	#once the child has filled a pipe buffer
	nucmer_out,nucmer_err=proc.communicate()
	if proc.returncode!=0:
		#keep going (the remaining sites may be fine) but make the failure visible
		print("\nWarning! nucmer exited with status {} for {}: {}".format(
			proc.returncode,site,nucmer_err.strip()),file=sys.stderr)
	
	
#------------------------------------------------------
#create raw pacbio assembly sequences
#------------------------------------------------------
coordlist={}  #per-site coordinates of the L1 within the raw sequence: start, end, length, name, rev

#One alignment row of a *.coords file. Groups 1-2 are read as reference start/end,
#3-4 as query start/end, 5 as the alignment length, 6 as IDY% and 7 as the name of
#the aligned query sequence. Lines that do not match (headers etc.) are skipped.
coord_regex=re.compile(r"(\d+)\s+(\d+)\s+\|\s+(\d+)\s+(\d+).*\d+\s+(\d+)\s+\|\s+(\S+)\s+\|\s\S+\s+(\S+)")

#For every site: pick the longest alignment (ties broken by higher IDY%) as the
#L1 alignment, then widen it with other alignments of the same query sequence
#that lie entirely before or after it on the reference (large internal gaps).
for site in allsites:

	longest=0        #length of the best alignment so far
	most_acc=0.0     #IDY% of the best alignment so far
	L1ref_coord=None #(ref_start,ref_end,name) of the best alignment; None until one is found
	mumlist={}       #every alignment row of this site, keyed by (ref_start,ref_end)

	#'with' closes the coords file even if parsing fails
	with open("{}/Mummer/{}.coords".format(mydir,site),'r') as coord:
		for line in coord:
			match=coord_regex.search(line)
			if match is None:
				continue #not an alignment row

			ref_start=int(match.group(1))
			ref_end=int(match.group(2))
			start=int(match.group(3))
			end=int(match.group(4))
			length=int(match.group(5))
			accuracy_text=match.group(6)
			#compare IDY% numerically; as text "99.50" would rank above "100.00"
			accuracy=float(accuracy_text)
			name=match.group(7)

			#query coordinates run backwards for a reverse-sense alignment
			rev=not (start<end)

			#store every alignment row
			mumlist[(ref_start,ref_end)]={'start':start,'end':end,'length':length,'name':name,'rev':rev}

			#decide whether this row replaces the current best alignment
			if (length>longest):
				take_it=True
			elif (length==longest):
				#same length: take the one with the higher IDY%
				#(this is relevant for finding best consensus after Pacbio assembly)
				print ("Warning! {} - {} has same length ({}) as previous sequence!".format(site,name,length))
				take_it=(accuracy>most_acc)
				if take_it:
					print ("Using newer sequence due to higher IDY%: {}".format(accuracy_text));
			else:
				take_it=False

			if take_it:
				longest=length
				most_acc=accuracy #also updated on a won tie, so later ties compare against the current best
				L1ref_coord=(ref_start,ref_end,name)
				#separate dict from the mumlist entry: start/end are overwritten below
				coordlist[site]={'start':start,'end':end,'length':length,'name':name,'rev':rev}

	#badsite: no alignment row anywhere in the file (decided on the whole file,
	#not on whether the final line happened to be an alignment row)
	if L1ref_coord is None:
		badsites.append(site) #add to badsite list
		print()
		print(">>>{} does not align to reference L1. It will be excluded from analysis.".format(site))
		continue

	#goodsite
	mysites.append(site) #add to goodsite list

	#Internal Deletion Check or stitch mums if there are large indels:
	#search mumlist for alignments adjacent to the L1 alignment
	mumstart,mumend,L1name=L1ref_coord

	for refsite,mum in mumlist.items():
		#only alignments of the same query sequence as the L1 alignment
		if mum['name']!=L1name:
			continue
		#ends before the current reference span: extend the L1 start
		if refsite[1]<mumstart:
			mumstart=refsite[0]
			coordlist[site]['start']=mum['start']
		#starts after the current reference span: extend the L1 end
		elif refsite[0]>mumend:
			mumend=refsite[1]
			coordlist[site]['end']=mum['end']

	#length of the (possibly widened) span in query coordinates, either strand
	coordlist[site]['length']=abs(coordlist[site]['end']-coordlist[site]['start'])+1
	

#extracts the raw PacBio sequence from longest/most accurate assembly (reads amplicon_analysis.fasta)
def get_raw_L1(mydir,mysite,mycoordlist):
	"""Return the raw sequence chosen for *mysite* as a string.

	Reads ``<mydir>/<mysite>.<fasta_ext>`` (``fasta_ext`` is the module-level
	extension) and selects the record whose id equals
	``mycoordlist[mysite]['name']``. The record is reverse-complemented when
	``mycoordlist[mysite]['rev']`` is true. If several records carry that id
	the last one wins. Returns ``None`` when the file does not exist or no
	record id matches.
	"""
	fasta_path="{}/{}.{}".format(mydir,mysite,fasta_ext)
	if not os.path.isfile(fasta_path):
		return None

	picked=None
	for record in SeqIO.parse(fasta_path,"fasta"):
		entry=mycoordlist[mysite]
		if entry['name']!=record.id:
			continue #a different assembly in the same file
		#orient the sequence so the L1 reads in the forward sense
		oriented=record.reverse_complement() if entry['rev'] else record
		picked=str(oriented.seq)

	return picked

#trims the raw PacBio sequence into trimmed L1 and its flanks
def trim_L1(mysite,mycoordlist,myrawlist):
	"""Split the raw sequence of *mysite* into the L1 and its two flanks.

	``mycoordlist[mysite]['start']`` and ``['end']`` are 1-based inclusive
	coordinates; ``myrawlist[mysite]`` is the raw sequence string.
	When start lies beyond end (reverse-sense hit) the coordinates are
	mirrored onto the sequence length first - the raw sequence is presumably
	already reverse-complemented by the caller.

	Returns a dict with 'rawlength' (int) and the strings 'trimmed',
	'5flank' and '3flank'.
	"""
	#1-based inclusive coordinates -> 0-based half-open slice bounds
	lo=mycoordlist[mysite]['start']-1
	hi=mycoordlist[mysite]['end']

	raw=myrawlist[mysite]
	total=len(raw)

	if lo>hi:
		#reverse-sense hit: mirror both bounds onto the reverse-complement axis
		lo,hi=total-lo-1,total-hi+1

	return {
		'rawlength':total,
		'trimmed':raw[lo:hi], #the L1 itself
		'5flank':raw[:lo],    #everything before the L1
		'3flank':raw[hi:],    #everything after the L1
	}
	
	
rawlist={} #dict of sites and raw sequences
trimmedlist={} #dict of sites and trimmed L1 and flanks
for site in mysites:
	rawlist[site]=get_raw_L1(fastas_dir,site,coordlist) #get raw sequence for site
	
	trimmedlist[site]=trim_L1(site,coordlist,rawlist)
	
	#print("Site {}: {}...{}".format(site,trimmedlist[site]['trimmed'][:10],trimmedlist[site]['trimmed'][-10:]))
	#print(trimmedlist[site]['5flank'])
	
print("done\n")


#################################################################
#Find ORF and ORFp
#################################################################

orf_list={} #dict of all orfps by site: orf_list[site]={'orf1p':...,'orf2p':...}
trans_table=1 #translation table for biopython, passed to translate() in getorf1p/getorf2p


#------------------------------------------
#ORF1
#------------------------------------------
def getorf1p(alignment):
	"""Translate ORF1 from the query row of a pairwise alignment.

	``alignment[0]`` is used as the (gapped) reference row and ``alignment[1]``
	as the query row. The ORF1 start column is found by searching the
	reference row for "AGATGGG"; the ATG begins two columns into that motif.

	Returns the protein string translated from the query (gap characters
	removed, translation stops at the first stop codon), or '.' when the motif
	is not present in the reference row or the query does not read ATG at the
	start column.
	"""
	#find reference orf1 start site
	reference=alignment[0].seq.upper()
	motif_at=reference.find("AGATGGG")
	if motif_at<0:
		#find() returns -1 when the motif is absent (e.g. broken up by alignment gaps);
		#adding the frame shift to -1 would silently point at column 1
		return '.'
	orf1_start=motif_at+2      #shift two columns into the motif to land on the ATG

	#check for valid ATG at query
	query=alignment[1].seq
	if str(query[orf1_start:orf1_start+3]).upper()!='ATG':
		return '.'

	#translate orf1 from the start column onwards, ignoring alignment gaps
	record=query[orf1_start:].ungap('-')
	pro=str(record.translate(trans_table, to_stop=True))

	#report the protein from its first methionine
	return pro[pro.find("M"):]

#------------------------------------------
#ORF2
#------------------------------------------

def getorf2p(alignment):
	"""Translate ORF2 from the query row of a pairwise alignment.

	``alignment[0]`` is used as the (gapped) reference row and ``alignment[1]``
	as the query row. The ORF2 start column is found by searching the
	reference row for "AATGACA"; the ATG begins one column into that motif.

	Returns the protein string translated from the query (gap characters
	removed, translation stops at the first stop codon), or '.' when the motif
	is not present in the reference row or the query does not read ATG at the
	start column.
	"""
	#find reference orf2 start site
	reference=alignment[0].seq.upper()
	motif_at=reference.find("AATGACA")
	if motif_at<0:
		#find() returns -1 when the motif is absent (e.g. broken up by alignment gaps);
		#adding the frame shift to -1 would silently point at column 0
		return '.'
	orf2_start=motif_at+1      #shift one column into the motif to land on the ATG

	#check for valid ATG at query
	query=alignment[1].seq
	if str(query[orf2_start:orf2_start+3]).upper()!='ATG':
		return '.'

	#translate orf2 from the start column onwards, ignoring alignment gaps
	record=query[orf2_start:].ungap('-')
	pro=str(record.translate(trans_table, to_stop=True))

	#report the protein from its first methionine
	return pro[pro.find("M"):]

	
########################################################
#Subfamily
########################################################
#call subfamilies only with 100% canonical positions
#pass in alignment type
def getfamily(alignment):
	"""Assign an L1 subfamily to the query row of a pairwise alignment.

	``alignment[0]`` is used as the (gapped) reference row and ``alignment[1]``
	as the query row. Each canonical position is located by searching the
	reference row for a fixed motif; the query base in that alignment column is
	then compared with the base each subfamily expects.

	A subfamily is called only when every one of its canonical positions
	matches. Otherwise the partial ("Ambig-...") profiles are tried in
	reverse-sorted name order, and 'L1Ambig' is returned when none of them
	matches either. "_truncated" is appended to Ta1d-type calls whose deletion
	at position 73 has a gap on either side of it.

	A canonical position whose motif is not found in the reference row is
	treated as matching nothing.

	When ``args.canonical`` is set, the query bases at all canonical positions
	(sorted by position, tab separated, 'NA' for a position that could not be
	located) are written as one line to the module-level ``output_canon`` file.
	"""
	rec=alignment[1].seq.upper()
	reference=alignment[0].seq.upper()

	def locate(motif,offset=0):
		#alignment column of a canonical position, or None when the motif is absent;
		#find() returns -1 in that case, which must not be used as an index
		hit=reference.find(motif)
		return None if hit<0 else hit+offset

	#Boissinot canonical positions
	canon={}
	canon[73]=locate("ACGGGTGATTT",2)
	canon[705]=locate("CCCTGACCCCC")
	canon[1813]=locate("CTGAGAGATTT")
	canon[5532]=locate("TGAGTATAAA")
	canon[5535]=locate("GTATAAATC")
	canon[5926]=locate("ACATTAGTGGG")
	canon[5927]=locate("ACATTAGTGGG",1)
	canon[5928]=locate("ACATTAGTGGG",2)

	#My canonical positions
	#(not implemented yet: 353, 3951, 4371, 5106 [Ta1d-specific] and
	# 424, 2181, 2373, 3981, 4249, 4554, 4895, 5003, 5388)
	canon[1026]=locate("CGAGCTGAG")
	canon[3337]=locate("AGACCAATAA")
	canon[3440]=locate("AGGAGGAACT")

	#complete profiles: expected query base at every canonical position ('-' = deleted)
	subfam={}
	subfam['Pa2']={73:'G',705:'C',1813:'C',5532:'G',5535:'C',5926:'G',5927:'A',5928:'G'}
	subfam['PreTa']={73:'G',705:'C',1813:'C',5532:'G',5535:'C',5926:'A',5927:'C',5928:'G'}
	subfam['Ta0']={73:'G',705:'C',1813:'C',5532:'G',5535:'C',5926:'A',5927:'C',5928:'A'}
	subfam['Ta1nd']={73:'G',705:'C',1813:'C',5532:'T',5535:'G',5926:'A',5927:'C',5928:'A'}
	subfam['Ta1d']={73:'-',705:'T',1813:'T',3337:'A',3440:'A',5532:'T',5535:'G',5926:'A',5927:'C',5928:'A'}
	subfam['Ta1d-CCA']={73:'-',705:'T',1026:'C',1813:'T',3337:'C',3440:'A',5532:'T',5535:'G',5926:'A',5927:'C',5928:'A'}
	subfam['Ta1d-TCA']={73:'-',705:'T',1026:'T',1813:'T',3337:'C',3440:'A',5532:'T',5535:'G',5926:'A',5927:'C',5928:'A'}
	subfam['Ta1d-CAT']={73:'-',705:'T',1026:'C',1813:'T',3337:'A',3440:'T',5532:'T',5535:'G',5926:'A',5927:'C',5928:'A'}

	#Ambiguous-incomplete matches
	ambigfam={}
	ambigfam['Ambig-Pa2']={5926:'G',5927:'A',5928:'G'}
	ambigfam['Ambig-PreTa']={5926:'A',5927:'C',5928:'G'}
	ambigfam['Ambig-Ta0']={73:'G',5532:'G',5535:'C',5926:'A',5927:'C',5928:'A'}
	ambigfam['Ambig-Ta1nd']={73:'G',5926:'A',5927:'C',5928:'A'}
	ambigfam['Ambig-Ta1d']={73:'-',705:'T',1813:'T',5926:'A',5927:'C',5928:'A'}
	ambigfam['Ambig-Ta1d-AT']={73:'-',705:'T',1813:'T',3337:'A',3440:'T',5926:'A',5927:'C',5928:'A'}
	ambigfam['Ambig-Ta1d-CCA']={73:'-',705:'T',1813:'T',1026:'C',3337:'C',3440:'A',5926:'A',5927:'C',5928:'A'}
	ambigfam['Ambig-Ta1d-TCA']={73:'-',705:'T',1813:'T',1026:'T',3337:'C',3440:'A',5926:'A',5927:'C',5928:'A'}

	def base_at(pos):
		#query base at a canonical position, or None when the position could not be located
		col=canon[pos]
		return None if col is None else rec[col]

	def matches(profile):
		#True only when every position of the profile carries the expected base
		return all(base_at(pos)==base for pos,base in profile.items())

	#Check Canonical Positions
	if (args.canonical):
		for pos in sorted(canon):
			base=base_at(pos)
			output_canon.write(('NA' if base is None else base)+'\t')
		output_canon.write('\n')

	#complete profiles first, in the order they are defined above
	family=None
	for fam,profile in subfam.items():
		if matches(profile):
			family=fam
			break

	if family is None:
		family='L1Ambig'
		#Reverse-sorted names put the more specific 'Ambig-Ta1d-*' profiles before 'Ambig-Ta1d'.
		#NOTE(review): this order also checks 'Ambig-Ta1nd' before 'Ambig-Ta0', and every
		#sequence satisfying 'Ambig-Ta0' satisfies 'Ambig-Ta1nd' too, so 'Ambig-Ta0' is
		#never returned - confirm whether that is intended.
		for fam in sorted(ambigfam,reverse=True):
			if matches(ambigfam[fam]):
				family=fam
				break

	#Flags the annotation with "truncated" if the 1d position is due to large 5' deletion/truncation
	if family[:4]=="Ta1d" or family[:10]=='Ambig-Ta1d':
		col73=canon[73]
		if col73 is not None and ((rec[col73-1]=='-') or (rec[col73+1]=='-')):
			family=family+"_truncated"

	return family

#------------------------------------------------------
#Needleman - standardize ORFp coordinates
#------------------------------------------------------
print("-"*50)
print("3. Needleman-Wunsch Alignment")
print("-"*50)

#start from an empty Needle folder inside the alignments folder
needle_dir="{}/Needle".format(mydir)
if os.path.exists(needle_dir):
	shutil.rmtree(needle_dir)
os.mkdir(needle_dir)


#Global alignment of every trimmed L1 against the reference consensus.
#The query is handed to needle inline through the "asis:" prefix (no temporary
#fasta file); the result is written to <mydir>/Needle/<site>.needle
for site in mysites:
	if args.verbose:
		print("Aligning {}... ".format(site),end='')

	needle=mydir+"/Needle/"+site+".needle"
	needle_cline=NeedleCommandline(
		asequence=ref_cons,
		bsequence="asis:"+trimmedlist[site]['trimmed'],
		gapopen=10,
		gapextend=0.5,
		outfile=needle,
	)
	stdout,stderr=needle_cline()

	if args.verbose:
		print("done.")

print("All done.")
print("-"*50)
print("4. ORF1p, ORF2p and Subfamily Detection")
print("-"*50)

#getfamily() writes the canonical bases of each site to the module-level
#output_canon handle; it is only opened when -c/--canonical was given
output_canon=None
if (args.canonical):
	output_canon=open("canonical_positions.txt",'w')
	canon_list=[73,705,1026,1813,3337,3440,5532,5535,5926,5927,5928]
	#header row: every column, including the last, is followed by a tab
	output_canon.write("L1_ID\t")
	for x in canon_list:
		output_canon.write(str(x)+'\t')
	output_canon.write('\n')

try:
	for site in mysites:

		#alignment produced by the Needleman-Wunsch step
		needle=mydir+"/Needle/"+site+".needle"
		align=AlignIO.read(needle,"emboss")

		#get orfps
		orf_list[site]={'orf1p':getorf1p(align),'orf2p':getorf2p(align)}

		#row label for the canonical positions; getfamily() appends the bases and the newline
		if output_canon is not None:
			output_canon.write(str(site)+'\t')

		#get subfamily
		trimmedlist[site].update({'family':getfamily(align)})
finally:
	#close (and flush) the canonical positions file even if a site fails
	if output_canon is not None:
		output_canon.close()



########################################################
#OUTPUT
########################################################
print("-"*50)
print("5. Writing to file: L1_analysis.txt")
print("-"*50)
outfile="L1_analysis.txt"

#One tab-delimited row per site that aligned to the reference L1.
#'with' guarantees the report is flushed and closed, also when a row fails.
with open(outfile,"w") as f:

	#header
	f.write("Site\tStrand\tSubfamily\tRaw size\t5'flank size\t3'flank size\tL1 size\tORF1p size\tORF2p size\t5'flank seq\t3'flank seq\tL1 seq\tORF1p seq\tORF2p seq\n")

	for site in mysites:
		trimmed=trimmedlist[site]
		orfs=orf_list[site]

		fields=[
			site,                                  #site id
			"-" if coordlist[site]['rev'] else "+", #strand
			trimmed['family'],                     #subfamily
			str(trimmed['rawlength']),             #raw PB length
			str(len(trimmed['5flank'])),           #5' flank length
			str(len(trimmed['3flank'])),           #3' flank length
			str(len(trimmed['trimmed'])),          #L1 length
			#NOTE(review): a missing ORF is stored as '.', so its size is reported as 1
			str(len(orfs['orf1p'])),               #ORF1p length
			str(len(orfs['orf2p'])),               #ORF2p length
			trimmed['5flank'],                     #5' flank sequence
			trimmed['3flank'],                     #3' flank sequence
			trimmed['trimmed'],                    #L1 sequence
			orfs['orf1p'],                         #ORF1p sequence
			orfs['orf2p'],                         #ORF2p sequence
		]

		#every field, including the last, is followed by a tab (existing file format)
		f.write("\t".join(fields)+"\t\n")


  
