#script to pull transcripts from Ensembl that are not present in the Isoseq transcriptome
#input files = Transcript_IDs_in_Isoseq.txt (one column where each line == 1 transcript id; ENSGACT...)
#Ensembl_transcript_lengths_all.txt (4 columns where the length is column 2 and the transcript id is column 1)
#to run script: python3 Pull.Missing.Transcripts.py <transcripts in isoseq> <transcripts in Ensembl> <output: transcripts that are in Ensembl but missing from Isoseq>
#Author: Alice Naftaly, Jan 2021

import sys

#read transcripts in isoseq
#returns list with transcript ids in Isoseq
def read_iso_transcripts():
    """Read the Isoseq transcript ids from the file named by sys.argv[1].

    The file is expected to hold one transcript id per line. Leading and
    trailing whitespace is removed from every id and blank lines are skipped,
    so the ids compare equal to the whitespace-split ids that
    read_Ens_transcripts uses as dictionary keys.

    Returns:
        list of the unique transcript ids found in the file (the order is
        not defined because duplicates are removed through a set).
    """
    iso_transcripts_file = sys.argv[1]
    unique_ids = set()
    with open(iso_transcripts_file, 'r') as iso:
        for line in iso:
            transcript_id = line.strip()
            #skip blank lines so the empty string is never reported as an id
            if transcript_id:
                unique_ids.add(transcript_id)
    return list(unique_ids)


#read transcripts in Ensembl
#returns dictionary with key == transcript id and value == length of transcript
def read_Ens_transcripts():
    """Read the Ensembl transcript lengths from the file named by sys.argv[2].

    Only lines that begin with "ENS" are used; every other line is ignored.
    Each used line is split on whitespace: the first field is the transcript
    id and the second field is converted to an int length. If an id occurs
    more than once, the length from the last occurrence is kept.

    Returns:
        dict mapping transcript id (str) to transcript length (int).
    """
    path = sys.argv[2]
    lengths_by_id = {}
    with open(path, 'r') as handle:
        for row in handle:
            #anything that does not begin with an Ensembl-style id is skipped
            if not row.startswith("ENS"):
                continue
            fields = row.split()
            lengths_by_id[fields[0]] = int(fields[1])
    return lengths_by_id

#remove transcripts in Isoseq from Ensembl dictionary
#returns filtered ensembl transcripts dictionary
def remove_iso_transcripts():
    """Drop every Isoseq transcript id from the Ensembl length dictionary.

    Calls read_iso_transcripts and read_Ens_transcripts, then removes each
    Isoseq id that is also an Ensembl key.

    Returns:
        dict of the remaining Ensembl entries (transcript id -> length),
        i.e. the transcripts that are in Ensembl but not in Isoseq.
    """
    seen_in_isoseq = read_iso_transcripts()
    ensembl_only = read_Ens_transcripts()
    for transcript_id in seen_in_isoseq:
        #pop with a default quietly ignores ids that Ensembl does not list
        ensembl_only.pop(transcript_id, None)
    return ensembl_only

#write filtered ensembl dictionary to file
def write():
    """Write the Ensembl-only transcripts to the file named by sys.argv[3].

    Each entry returned by remove_iso_transcripts becomes one line of the
    form "<transcript id><TAB><length>".
    """
    missing = remove_iso_transcripts()
    out_path = sys.argv[3]
    #NOTE(review): mode 'a' appends, so re-running with the same output path
    #adds to whatever the file already holds - confirm this is intended
    with open(out_path, 'a') as handle:
        handle.writelines(
            "%s\t%s\n" % (transcript_id, length)
            for transcript_id, length in missing.items()
        )

if __name__ == "__main__":
    #run the pipeline only when executed as a script, and fail early with a
    #usage hint instead of an IndexError raised while reading sys.argv later
    if len(sys.argv) < 4:
        sys.exit("usage: python3 Pull.Missing.Transcripts.py <transcripts in isoseq> <transcripts in Ensembl> <output>")
    write()
