#!/usr/bin/env python

import os 

# Parse the cluster analysis output (produced with silixx).
# Each usable line carries a cluster id in its first whitespace-separated
# field and a sequence id in its second field.

filedir = "../../../Clustering/05_SPar24HC/all.rbh.pairs.silixx"

# dico_clust: cluster id -> sequence ids of the cluster, joined with tabs in
# the order they appear in the file.
dico_clust = {}
# dico_annot: cluster id -> sequence id taken from the last line of that
# cluster whose text contains "S288".
dico_annot = {}

with open(filedir) as clust:
    for line in clust:
        # split() without arguments drops the trailing newline and collapses
        # any run of spaces/tabs, so no separate normalisation step is needed.
        group = line.split()
        # Blank or truncated lines have no (cluster id, sequence id) pair;
        # skip them instead of failing with an IndexError.
        if len(group) < 2:
            continue
        idline = group[0]
        seqid = group[1]

        # Test membership on the dict itself: on Python 2, .keys() would
        # build a full list of keys for every input line.
        if idline in dico_clust:
            dico_clust[idline] = dico_clust[idline] + "\t" + seqid
        else:
            dico_clust[idline] = seqid

        # Annotation dictionary: the match is done on the whole line, not
        # only on the sequence id.
        if "S288" in line:
            dico_annot[idline] = seqid
    

# 0/1 matrix per cluster, with the haplotypes in a fixed column order.
# Earlier output location, kept for reference:
#nameout="../Silix/output/allrbh_delta.count"
nameout = filedir.replace("silixx", "count")
# Size limit compared against the number of sequences of a cluster when the
# output matrices are written.
nb_haplo = 26

# Build the list of haplotypes.
# The first element of dico_clust is assumed to contain all of them.
# next(iter(...)) returns that first value on both Python 2 and Python 3
# (on Python 3, dict.values() is a view and cannot be indexed with [0]);
# the None default covers an empty clustering instead of raising.
first_members = next(iter(dico_clust.values()), None)
hap1 = first_members.split("\t") if first_members else []
# A haplotype name is the part of a sequence id that precedes the first "_".
haplol = [h.split("_")[0] for h in hap1]

# Write one line per retained cluster: "ID" + cluster id, one 0/1 flag per
# haplotype of haplol, then the annotated sequence id (or "NA").
with open(nameout, "w") as out:
    for cluster_id, members in dico_clust.items():
        # Clusters holding more than nb_haplo sequences are not written.
        if len(members.split("\t")) > nb_haplo:
            continue

        # A haplotype is flagged "1" when its name occurs anywhere in the
        # tab-joined member string (plain substring test), "0" otherwise.
        flags = ["1" if hname in members else "0" for hname in haplol]

        # Sequence id stored in dico_annot for this cluster, "NA" if none.
        ref_id = dico_annot.get(cluster_id, "NA")

        fields = ["ID" + cluster_id] + flags + [ref_id]
        out.write("\t".join(fields) + "\n")


# Matrix per cluster holding, for each haplotype, the name of its gene.
# Earlier output location, kept for reference:
#nameout2="../Silix/output/allrbh_delta.gene"
nameout2 = filedir.replace("silixx", "gene")

with open(nameout2, "w") as out:
    # A row has nb_haplo columns as before; it is widened to len(haplol) when
    # haplol is longer, so that the column assignment below cannot raise an
    # IndexError.
    width = max(nb_haplo, len(haplol))

    for key in dico_clust.keys():
        array = dico_clust[key].split("\t")
        # Clusters holding more than nb_haplo sequences are not written
        # (same filter as the one applied when writing the count matrix).
        if len(array) > nb_haplo:
            continue

        # "." marks a haplotype for which no single gene could be reported.
        count = ["."] * width

        for hname in haplol:
            # Sequences of this cluster whose id contains the haplotype name
            # (plain substring test).
            hits = [seq for seq in array if hname in seq]
            # Report the gene only when exactly one sequence matches; zero or
            # several matches leave the "." placeholder. Counting the hits
            # directly replaces the former test on the printed form of the
            # index list, which broke for indexes of three digits or more.
            if len(hits) == 1:
                # index() gives the first column carrying this name.
                count[haplol.index(hname)] = hits[0]

        out.write("\t".join(count) + "\n")
       


# Produce a single file holding both the counts and the gene names by
# handing the two matrices to the external `paste` command; the result is
# redirected to the count file's path with "count" replaced by "cg".
# The exit status returned by os.system is not checked.
combined = nameout.replace("count", "cg")
cmd = " ".join(["paste", nameout, nameout2, ">", combined])
os.system(cmd)
