#!/usr/bin/env python

import re
import pandas as pd
import os

#Script to import the coordinates of each gene in the orthogroups and extract
#syntenic orthogroups
#Coordinates are: chromosome or scaffold; start-stop; strand

#First: create a dictionary entry for each feature (e.g. "SA03_g1") with its coordinates
#Second: fill the orthogroup lines with those coordinates

#Input locations (relative paths, resolved from the current working directory)
dir_aug="../../01annot_gene/01augustus_pseudoscaff/"
refcer_dir="../../../CHROnicle/05_SPar24HC/00rawGenom/S288C/S288C_prot.fasta"

#All clustering tables, inputs and outputs, live in the same directory
clust_dir="../../../Clustering/05_SPar24HC/"
link_clust=clust_dir+"all.rbh.pairs.cg"
link_clust2=clust_dir+"all.rbh.pairs.coord"
link_clustc=clust_dir+"all.rbh.pairs.cons"

#Output file receiving the syntenic pairs
synt_out=clust_dir+"syntpairs.cons"


#If running on the laptop, use these paths instead
#link_clust="../../orthogroups/all.rbh.pairs.cg"
#link_clust2="../../orthogroups/all.rbh.pairs.coord"



#Build a dictionary mapping each gene id (e.g. "SA03_g1") to its coordinates,
#stored as "id;chrom;start-stop;strand"
dico_coord={}

#List the augustus files with os.listdir rather than shelling out to `ls`:
#no shell is involved, and an empty directory gives an empty list instead of
#a single "" entry that open() would fail on. Hidden entries and
#sub-directories are skipped, as the files are expected to sit directly in
#dir_aug. Sorting keeps the processing order deterministic.
augdir=sorted(
    os.path.join(dir_aug, fname)
    for fname in os.listdir(dir_aug)
    if not fname.startswith(".") and os.path.isfile(os.path.join(dir_aug, fname))
)

for augfile in augdir:
    #The haplotype name is the part of the file name before the first "_"
    haplo=os.path.basename(augfile).split("_")[0]
    #YPS128 is abbreviated to "Y128"; every other haplotype gets an "S" prefix
    if haplo=="YPS128":
        haploid="Y128"
    else:
        haploid="S"+haplo

    #Progress trace: one line per haplotype processed
    print(haploid)

    with open(augfile) as augustus:
        for line in augustus:
            line=line.rstrip()
            #Skip comment lines
            if line.startswith("#"):
                continue
            array=line.split("\t")
            #Skip blank/truncated records (fewer than 9 tab-separated columns)
            #and every feature that is not a gene
            if len(array)<9 or array[2]!="gene":
                continue
            chrom=array[0]
            start=array[3]
            stop=array[4]
            sens=array[6]
            #Column 9 holds the gene name; prefix it with the haplotype id
            keyid=haploid+"_"+array[8]
            dico_coord[keyid]=keyid+";"+chrom+";"+start+"-"+stop+";"+sens
                        

#Add the S288C entries to dico_coord, read from the headers of the FASTA file
with open(refcer_dir) as cer:
    for header in cer:
        header=header.rstrip()
        #Only lines starting with ">" are used; everything else is ignored
        if not header.startswith(">"):
            continue
        fields=header.split(" ")
        #Space-separated fields 1 to 5 are stored in the same layout as the
        #other entries of dico_coord: id;chrom;start-stop;strand
        #(assumes the header carries them in that order -- TODO confirm)
        refid="S288_"+fields[1]
        dico_coord[refid]=";".join(
            [refid, fields[2], fields[3]+"-"+fields[4], fields[5]]
        )





#Add coordinates to the orthogroups: each line of link_clust is copied to
#link_clust2 with one coordinate column appended per gene-id column
#(columns 28 to 53), followed by a final "conserved"/"specific" label
with open(link_clust) as clust, open(link_clust2, "w") as out:
    for line in clust:
        line=line.rstrip()
        array=line.split("\t")
        #Work on a copy so the input columns are not modified while appending
        array2=list(array)
        #A group stays "conserved" only if every gene-id column is resolved
        testc="conserved"

        for col in range(28,53+1):
            geneid=array[col]
            #Direct dict membership test (no intermediate list of keys)
            if geneid in dico_coord:
                array2.append(dico_coord[geneid])
            else:
                if geneid != ".":
                    #Id present but without known coordinates: report it.
                    print("ERROR: no coordinates for "+geneid+" in group "+array[0])
                #"." means no gene in this column. In both cases a
                #placeholder is written so every output row keeps the same
                #number of columns, and the group is not labelled conserved.
                array2.append(".")
                testc="specific"
        array2.append(testc)

        out.write("\t".join(array2)+"\n")
                


#Ordering orthologous genes along chromosomes and extracting syntenic coordinates
#
#Select conserved genes: copy to link_clustc the rows of link_clust2 whose
#last column is exactly "conserved". Done in Python rather than through a
#shell `grep`, so the match is on the label column only (not anywhere in the
#line) and an I/O failure raises instead of being silently ignored.
nconserved=0
with open(link_clust2) as coordin, open(link_clustc, "w") as consout:
    for line in coordin:
        if line.rstrip("\n").split("\t")[-1]=="conserved":
            consout.write(line)
            nconserved+=1

#Trace of what was written
print("conserved groups written to "+link_clustc+": "+str(nconserved))


#Column names used to read the conserved-orthogroup table, in file order:
#"id", p0..p25, "gene", g0..g25, coord0..coord25, "group"
block_width=26

colnames=(
    ["id"]
    +["p"+str(k) for k in range(block_width)]
    +["gene"]
    +["g"+str(k) for k in range(block_width)]
    +["coord"+str(k) for k in range(block_width)]
    +["group"]
)


#Read the conserved orthogroups; the file has no header, so names are supplied
tcoord=pd.read_table(link_clustc, names=list(colnames))


#Build tabc: the group id plus, for each coordinate column, three integer
#columns chr_<n>, start_<n>, stop_<n> (n = index of the column in colnames).
#Only indices 55 to 79 (coord1..coord25) are processed.
#NOTE(review): index 54 (coord0) is skipped -- presumably deliberate, confirm.
tabc=pd.DataFrame(tcoord['id'])
for col_idx in range(55,79+1):
    #Index 79 is the YPS128 column, whose chromosome names follow another scheme
    is_yps128=(col_idx==79)
    chrom_ids=[]
    begins=[]
    ends=[]
    for entry in tcoord[colnames[col_idx]]:
        #Each entry is "id;chrom;start-stop;strand"
        fields=entry.split(";")
        if is_yps128:
            #Strip "ordered_chr"/"scaffold_" and anything after the first "."
            label=fields[1].replace("ordered_chr","").replace("scaffold_","")
            label=label.split(".")[0]
        else:
            #Strip "ordered_Spar_"/"scaffold_" to leave the number only
            label=fields[1].replace("ordered_Spar_","").replace("scaffold_","")

        span=fields[2].split("-")
        begin_txt=span[0]
        end_txt=span[1]

        chrom_ids.append(int(label))
        begins.append(int(begin_txt))
        ends.append(int(end_txt))
    suffix=str(col_idx)
    tabc["chr_"+suffix]=list(chrom_ids)
    tabc["start_"+suffix]=list(begins)
    tabc["stop_"+suffix]=list(ends)


#Table with the groups ordered by chromosome, start and stop of column 79
sorted_tabc=tabc.sort_values(by=['chr_79','start_79','stop_79'])

#Group ids and their chromosome, in sorted order
sorted_id=list(sorted_tabc['id'])
sorted_chrom=list(sorted_tabc['chr_79'])
#Chromosome of each group id, used to test whether two neighbours share it
dico_chrom=dict(zip(sorted_id, sorted_chrom))

#Each pair of consecutive groups lying on the same chromosome is recorded as
#"SID<n>;<id1>-<id2>", with n counting the pairs kept so far
synt_pairs=[]
synt_id=0
for gene1, gene2 in zip(sorted_id, sorted_id[1:]):
    if dico_chrom[gene1]!=dico_chrom[gene2]:
        continue
    synt_pairs.append("SID"+str(synt_id)+";"+gene1+"-"+gene2)
    synt_id+=1
    


#Write the syntenic pairs to synt_out, one pair per line
with open(synt_out,"w") as outs:
    for pair in synt_pairs:
        outs.write(pair+"\n")



