#count number of occurrences for significant GO terms
#reads in significant GO term list and all observed GO terms for a set of genes
#to run script: python3 Count.GO.occcurrences.py <significant GO term files> <all GO terms for set of genes file; observed GO terms> <output; file with GO.Term \t No.Occurrences \n>
#Author: Alice Naftaly, May 2020

import sys


#collect the significant GO term IDs from the file given as the first command-line argument
#only lines that begin with "GO:" are used; the first whitespace-delimited field of each is kept
#returns list of significant GO term IDs in file order (repeated IDs are kept)
def read_sig_go_terms():
    path = sys.argv[1]
    with open(path, 'r') as handle:
        #anything not starting with "GO:" (headers, blank lines, indented lines) is skipped
        return [row.split()[0] for row in handle if row.startswith("GO:")]


#read in all observed GO terms from the file given as the second command-line argument
#only lines that begin with "PB" are used; the second whitespace-delimited field of such a
#line holds a comma-separated list of GO terms
#"PB" lines with no second field are skipped, as are empty entries left by stray commas
#returns dictionary with key == GO term and value == list holding one "1" per occurrence
#(take len() of the list to get the number of occurrences of that GO term)
def read_all_go_terms():
    all_go_terms_file = sys.argv[2]
    all_go_terms_dict = {}
    with open(all_go_terms_file, 'r') as all_go_terms:
        for line in all_go_terms:
            if not line.startswith("PB"):
                continue
            fields = line.split()
            #a line with an identifier but no GO column has nothing to count
            if len(fields) < 2:
                continue
            for go_term in fields[1].split(","):
                #"GO:1,,GO:2" or a trailing comma yields "" which is not a GO term
                if go_term:
                    all_go_terms_dict.setdefault(go_term, []).append("1")
    return all_go_terms_dict

#pull the number of occurrences of each significant GO term from the observed GO terms
#a significant GO term that was never observed gets a count of 0 instead of raising KeyError
#returns dictionary with key == GO term and value == number of occurrences
def count():
    sig_go_terms = read_sig_go_terms()
    all_go_terms = read_all_go_terms()
    #each observed GO term maps to a list with one entry per occurrence, so its length is the count
    return {GO: len(all_go_terms.get(GO, ())) for GO in sig_go_terms}

#write go term occurrences to the tab-delimited file given as the third command-line argument
#the file is opened in append mode, so anything already in it is kept
#the header line is only written when the file is empty (or its size cannot be checked),
#so running the script again on the same output does not drop extra headers between rows
def write():
    go_terms_count = count()
    output = sys.argv[3]
    with open(output, 'a') as out:
        #in append mode the position starts at the end of the file, so 0 means it is empty
        #non-seekable outputs (pipes, terminals) cannot be checked and always get the header
        if not out.seekable() or out.tell() == 0:
            out.write("GO.Term\tCounts\n")
        for GO, single_count in go_terms_count.items():
            out.write("%s\t%s\n" % (GO, single_count))

#only run the pipeline when executed as a script
#stop with a usage message instead of an IndexError traceback when arguments are missing
if __name__ == "__main__":
    if len(sys.argv) < 4:
        sys.exit("usage: python3 %s <significant GO terms file> <observed GO terms file> <output file>" % sys.argv[0])
    write()
