#examining normalized read counts across tissues to determine if sex-specific spliced isoforms are expressed in all tissues (from one sex)
#any tissue with less than 10 normalized read counts will be considered as not expressed for that isoform
#   this is an arbitrary cutoff; This is what I used in the ATAC paper.
#will compare the normalized reads for each isoform and report the number of tissues the isoform is expressed in (0,1,2,3,4; 0 means no tissue passed the cutoff). I do expect variation as some tissue specific genes should only show up in one sex
#to run script: python3 Compare.Sex.Spliced.Isoform.Expression.py <normalized read counts for one sex> <output file; format: Isoform \t Number tissues isoform is expressed in \t list of tissues isoform is expressed in>
#Author: Alice Naftaly, Feb 2021

import sys

#read normalized read counts
def read_nrc():
    """Load per-isoform normalized read counts from the file named in sys.argv[1].

    Only lines that begin with a double quote followed by "PB" are parsed;
    every other line (e.g. a header row) is skipped. Each parsed line is
    split on commas: field 0 is the isoform ID and fields 1-4 are the
    brain, liver, pronephros and gonad counts, in that order. Surrounding
    double quotes are stripped from every field that is used.

    Returns:
        dict mapping isoform ID (str) to a list of four floats
        [brain, liver, pronephros, gonad], each rounded to 2 decimals.

    Raises:
        IndexError: a parsed line has fewer than five comma-separated fields.
        ValueError: a count field cannot be converted to float.
    """
    counts_by_isoform = {}
    with open(sys.argv[1], 'r') as handle:
        for raw in handle:
            if not raw.startswith("\"PB"):
                continue
            fields = raw.strip("\n").split(",")
            isoform_id = fields[0].strip("\"")
            #explicit indices (not a slice) so a truncated row still raises IndexError
            tissue_counts = [
                round(float(fields[column].strip("\"")), 2)
                for column in (1, 2, 3, 4)
            ]
            counts_by_isoform[isoform_id] = tissue_counts
    return counts_by_isoform

#sort through normalized read counts
#determine how many tissues each isoform is expressed in
#want to return 2 values where the first is a number 0-4 (for number of tissues expressed in) and the second is a list of the tissues the isoform is expressed in
def sort_nrc():
    """Summarize, for every isoform, which tissues it is expressed in.

    Reads the counts via read_nrc() and treats a tissue as expressed when
    its normalized read count is strictly greater than 10.

    Returns:
        dict mapping isoform ID to [count, tissues], where tissues is the
        list of expressed tissue names (always in brain, liver, pronephros,
        gonad order) and count is the length of that list (0-4).
    """
    #position in this tuple matches the position of the tissue in each count list
    tissue_names = ("brain", "liver", "pronephros", "gonad")
    summary_dict = {}
    for isoform, counts in read_nrc().items():
        #NOTE(review): a count of exactly 10 is NOT counted as expressed here;
        #confirm that matches the intended "less than 10 = not expressed" cutoff
        expressed = [
            tissue
            for position, tissue in enumerate(tissue_names)
            if counts[position] > 10
        ]
        summary_dict[isoform] = [len(expressed), expressed]
    return summary_dict

#write output
def write():
    """Write the per-isoform tissue summary to the file named in sys.argv[2].

    The output is tab separated with one header line followed by one line
    per isoform: isoform ID, number of tissues it is expressed in, and a
    comma-joined list of those tissues (empty when the count is 0).

    Any existing file at the output path is replaced.
    """
    summary = sort_nrc()
    #open with 'w' rather than 'a': the header is written unconditionally, so
    #appending on a re-run would leave a second header and duplicate rows mid-file
    with open(sys.argv[2], 'w') as out:
        out.write("Isoform\tNo.Tissues.Expressed.In\tTissues.List\n")
        for iso, (tissue_count, tissues) in summary.items():
            out.write("%s\t%s\t%s\n" % (iso, tissue_count, ",".join(tissues)))

#only run when executed as a script, so importing this file does not read sys.argv or write output
if __name__ == "__main__":
    write()
