from Bio import SeqIO
def parse_fasta(fasta_file):
    """Map each record of a FASTA file to its sequence length.

    The key for a record is the second ``|``-separated field of its id, so
    every id is expected to contain at least one ``|``. If two records
    share that field, the later record's length is the one kept.

    Args:
        fasta_file: FASTA input, handed unchanged to ``SeqIO.parse``.

    Returns:
        A dict of ``{second id field: len(record)}``.

    Raises:
        IndexError: If a record id contains no ``|``.
    """
    return {
        record.id.split("|")[1]: len(record)
        for record in SeqIO.parse(fasta_file, "fasta")
    }
def select_iso(splice_file, id_length):
    """Pick the longest isoform from each line of a splice-variant file.

    Every non-blank line of *splice_file* is read as one group of isoform
    ids separated by ``;``. For each group the id with the greatest length
    in *id_length* is kept; ids missing from *id_length* count as length 0
    and ties go to the id listed first.

    The returned ids carry a species prefix built from the second
    ``/``-separated component of the path, which must look like
    ``Genus_species...``: the upper-cased first character of the first
    ``_``-separated piece, the first three characters of the second piece,
    and a ``|`` (``"x/Homo_sapiens.txt"`` gives ``"Hsap|"``).

    Args:
        splice_file: Path of the splice-variant file, shaped like
            ``<dir>/<Genus>_<species>...``.
        id_length: Mapping from isoform id to sequence length.

    Returns:
        A list with one prefixed id per non-blank line, in file order.

    Raises:
        IndexError: If the path does not have the shape described above.
    """
    path_parts = str(splice_file).split("/")
    name_parts = path_parts[1].split("_")
    prefix = name_parts[0][0].upper() + name_parts[1][0:3] + "|"

    longest_isoforms = []
    with open(splice_file) as isoforms:
        for line in isoforms:
            # Drop empty ids (blank line, stray ';') so that a bare prefix
            # such as "Hsap|" is never reported as an isoform.
            genes = [gene for gene in line.rstrip().split(";") if gene]
            if not genes:
                continue
            # max() returns the first of several equally long ids.
            longest = max(genes, key=lambda gene: id_length.get(gene, 0))
            longest_isoforms.append(prefix + longest)
    return longest_isoforms
# Read the list of file names: each line of filenames.txt holds a FASTA path
# and the path of its splice-variant file, separated by a tab.
file_dict = {}
with open('filenames.txt') as filenames:
    for line in filenames:
        # A blank line has no second tab-separated column; skip it instead
        # of failing on the index below.
        if not line.strip():
            continue
        columns = line.split("\t")
        fasta = columns[0]
        splice = columns[1].rstrip()
        file_dict[fasta] = splice

# Create new FASTA files, one per input FASTA, holding only the longest
# isoform selected from each line of the matching splice file.
for fasta, splice in file_dict.items():
    id_length = parse_fasta(fasta)
    longest_iso = select_iso(splice, id_length)
    prot_dict = SeqIO.to_dict(SeqIO.parse(fasta, "fasta"))
    # Selected ids are looked up directly in prot_dict; an id that is not
    # one of its keys raises KeyError.
    new_prot_dict = {gene_id: prot_dict[gene_id] for gene_id in longest_iso}
    # Output name: second "/"-separated component of the FASTA path, cut at
    # its first ".", plus the "_long_iso.fa" suffix. The file is opened by
    # that bare name, i.e. relative to the current working directory.
    out_name = fasta.split("/")[1]
    out_name = out_name.split(".")[0] + "_long_iso.fa"
    print("writing ", out_name)
    with open(out_name, 'w') as outfile:
        SeqIO.write(new_prot_dict.values(), outfile, 'fasta')