####
# Script takes in a list of Pfam IDs and produces a sifter input file (.pli)
#

# Pfam family name -> list of the UniProt proteins found in that family
families = dict()
# Pfam family ids to process
familylist = list()
# UniProt id -> list of GOA annotations for that protein
uniprots = dict()
# annotation codes treated as experimental
experimental = ["TAS", "IDA", "IMP"]


# Read the list of Pfam IDs from hundred.txt. Ids are whitespace-separated,
# so a line may carry several of them and blank lines contribute nothing.
# The with-block closes the file as soon as the list has been read.
with open("hundred.txt", 'r') as infpf:
    for line in infpf:
        familylist.extend(line.split())

# Parenthesised form prints the same under Python 2 and also parses on Python 3.
print(familylist)

# Scan the Pfam full-alignment file. For every family whose accession is in
# familylist, collect the distinct sequence ids from its "#=GS" lines into
# families[accession], keeping first-seen order. A record ends at a "//" line.
currentPfamily = ''
printout = False            # True while inside a requested family's record
wanted = set(familylist)    # constant-time "is this family requested?" test
seenids = set()             # ids already stored for the current family
with open("../db/Pfam-A.full", 'r') as infpf:
    for line in infpf:
        if line.startswith('#=GF AC'):
            # e.g. "#=GF AC   PF00001.21": keep the accession, drop ".version"
            fields = line.split()
            currentPfamily = fields[2].split('.')[0] if len(fields) > 2 else ''
            if currentPfamily in wanted:
                families[currentPfamily] = []
                seenids = set()
                printout = True
                print(currentPfamily)
        elif printout and line.startswith('#=GS'):
            # "#=GS <id>/<start>-<end> ...": the id is the text before the '/'
            fields = line.split()
            if len(fields) > 1:
                uniprotid = fields[1].split('/')[0]
                # A set lookup replaces the linear scan of the family list,
                # which was quadratic on families with many sequences.
                if uniprotid not in seenids:
                    seenids.add(uniprotid)
                    families[currentPfamily].append(uniprotid)
        elif line.startswith('//'):
            printout = False


# Load GOA annotations into uniprots, keeping only rows that are from a
# database whose name starts with "Uni", carry an evidence code listed in
# `experimental`, have aspect 'F', and are not negated.
# Tab-separated columns used (0-based, per the GAF layout):
#   1 = protein accession, 3 = qualifier, 4 = GO id, 6 = evidence code,
#   8 = aspect (F/P/C), 10 = '|'-separated synonyms.
# Each kept row is stored as [GO id, evidence code] under the accession and
# under the last synonym, which the original code treats as the SwissProt id.
# For a quick run, point this at "../db/gene_association.small" instead.
count = 0   # number of lines read from the association file
with open("../db/gene_association.goa_uniprot", 'r') as infgo:
    for line in infgo:
        count = count + 1
        d = line.strip().split('\t')
        if len(d) <= 10 or not d[0].startswith('Uni'):
            continue
        if d[6] not in experimental or d[8] != 'F':
            continue
        # The qualifier lives in column 3 and may combine flags with '|'
        # (e.g. "NOT|contributes_to"). The old test looked at column 5, the
        # reference column, so negated annotations were never filtered out.
        if 'NOT' in d[3].split('|'):
            continue
        spid = d[10].split('|')[-1]
        # setdefault avoids rebuilding the key list on every input line.
        uniprots.setdefault(d[1], []).append([d[4], d[6]])
        # Skip an empty synonym, and do not store the row twice in the same
        # list when the synonym is identical to the accession.
        if spid and spid != d[1]:
            uniprots.setdefault(spid, []).append([d[4], d[6]])

#print uniprots

#print uniprots

# Write one sifter input file, <family>.pli, per collected family.
# Every protein of the family gets a <Protein> element. When annotations are
# known for it (looked up by the id part before the first '_', else by the
# full id) the element also lists the GO numbers with the "GO:" prefix
# removed and, in the same order, the matching evidence codes in <MOC>.
for fam in families:
    # The with-block guarantees the file is flushed and closed.
    with open(fam + ".pli", 'w') as outf:
        outf.write("<?xml version=\"1.0\"?>\n")
        outf.write("<Family>\n")
        outf.write("  <FamilyID>" + fam + "</FamilyID>\n")
        for uni in families[fam]:
            # split() yields the whole id when there is no '_'; the previous
            # find()-based slice silently dropped the last character then.
            shortid = uni.split('_')[0]
            if shortid in uniprots:
                unip = shortid
            elif uni in uniprots:
                unip = uni
            else:
                unip = None

            outf.write("   <Protein>\n")
            outf.write("    <ProteinName>" + uni + "</ProteinName>\n")
            if unip is None:
                outf.write("    <ProteinNumber>" + shortid + "</ProteinNumber>\n")
            else:
                # NOTE(review): annotated proteins use the full id as
                # ProteinNumber while unannotated ones use the short id; this
                # is kept from the original - confirm it is intended.
                outf.write("    <ProteinNumber>" + uni + "</ProteinNumber>\n")
                annotations = uniprots[unip]
                gonumbers = ", ".join([a[0][3:] for a in annotations])
                evidence = ", ".join([a[1] for a in annotations])
                outf.write("    <GONumber>[" + gonumbers + "]</GONumber>\n")
                outf.write("    <MOC>[" + evidence + "]</MOC>\n")
            outf.write("   </Protein>\n")
        # Proper XML closing tag; this used to be written as "<\Family>".
        outf.write("</Family>\n")


# build trees

