#sorting out multiple instances from a sam file (same transcript number) for running through cDNA cupcake
#to run script: python3 SortSam.MultipleTranscripts.py <input sam file> <output sam file>

import sys

#read sam file
#returns dictionary with transcript lines (no header lines) where key == transcript/x and value == line
def read_sam(sam_file=None):
    """Collect one alignment line per transcript from a SAM file.

    Only lines starting with "transcript" are considered, so header lines
    are skipped. A transcript that occurs once is kept unconditionally. A
    transcript that occurs several times keeps only a line whose second
    tab-separated column (the FLAG field in SAM) is "0" or "16"; if several
    lines qualify the last one in the file wins, and if none qualifies the
    transcript is left out of the result.

    Args:
        sam_file: Path of the SAM file to read. Defaults to ``sys.argv[1]``
            so the existing command-line usage keeps working.

    Returns:
        dict mapping the transcript name (first whitespace-separated field)
        to the full, unmodified line, in order of first appearance.
    """
    if sam_file is None:
        sam_file = sys.argv[1]

    # Group every alignment line under its transcript name, in file order.
    grouped = {}
    with open(sam_file, 'r') as sam:
        for line in sam:
            if line.startswith("transcript"):
                grouped.setdefault(line.split()[0], []).append(line)

    filtered_sam_dict = {}
    for transcript, lines in grouped.items():
        if len(lines) == 1:
            # Unique transcripts are kept regardless of their flag.
            filtered_sam_dict[transcript] = lines[0]
            continue
        for candidate in lines:
            # Later matches overwrite earlier ones, so the last one wins.
            if candidate.split("\t")[1] in ("0", "16"):
                filtered_sam_dict[transcript] = candidate
    return filtered_sam_dict

#write new sam file:
def write():
    """Write the filtered SAM file.

    The input path is taken from ``sys.argv[1]`` and the output path from
    ``sys.argv[2]``. Every input line that does not start with "transcript"
    is copied to the output first, in its original order, followed by the
    lines selected by ``read_sam()``.
    """
    sam_file = sys.argv[1]
    output = sys.argv[2]
    sam_dict = read_sam()
    # Open with 'w', not 'a': appending to an existing output would leave
    # stale records in place and put header lines after alignment records.
    with open(sam_file, 'r') as sam, open(output, 'w') as out:
        for line in sam:
            if line.startswith("transcript"):
                continue
            # The final line of the input may lack a newline; terminate it
            # so it cannot fuse with whatever is written next.
            out.write(line if line.endswith("\n") else line + "\n")
        for record in sam_dict.values():
            out.write(record if record.endswith("\n") else record + "\n")
# Script entry point: called at module level, so it also runs if this file is imported.
write()
