# Convert a GFF file to a GTF file for metagene analysis with the plastid package.
# Three lines are written per ORF:
# - exon (ORF plus flanking UTR regions)
# - CDS
# - start codon


def _gtf_line(fields, feature, attributes, start=None, end=None):
    """Return one output line built from the GFF columns in ``fields``.

    The feature column and the last column are replaced by ``feature`` and
    ``attributes``; the start/end columns are replaced only when ``start`` /
    ``end`` are given, otherwise the original text is kept untouched.
    """
    columns = list(fields)
    columns[2] = feature
    if start is not None:
        columns[3] = str(start)
    if end is not None:
        columns[4] = str(end)
    columns[-1] = attributes
    return "\t".join(columns) + "\n"


def gff_to_gtf_genes(gff, gtf, minsize):
    """Convert the records of a GFF file into a GTF file (3 lines per ORF).

    For every retained record the following lines are written, in order:

    1. ``exon``: the record interval extended by 50 nt on the 5' side and
       102 nt on the 3' side (strand-aware);
    2. ``CDS``: the record with its original coordinates;
    3. ``start_codon``: the three bases at the 5' end of the record, with
       additional ``cds_start`` / ``cds_end`` attributes.

    A record is retained when its start coordinate is > 110, its length
    (end - start + 1) is > ``minsize`` and its gene id is not ``"NA"``.
    Retained records whose strand is neither ``+`` nor ``-`` produce no
    output. Blank lines and lines starting with ``#`` are skipped, and
    reading stops at a ``##FASTA`` directive.

    Args:
        gff: Path of the tab-delimited GFF file to read. The gene id is
            taken from the last ``;``-separated item of the last column,
            after removing the text ``gene_id=``.
        gtf: Path of the GTF file to write (opened in ``"w"`` mode, so an
            existing file is overwritten).
        minsize: Records whose length is not strictly greater than this
            value are dropped.

    Raises:
        IndexError: A data line has too few tab-separated columns.
        ValueError: A start or end column is not an integer.
    """
    upstream = 50
    downstream = 102
    with open(gtf, "w") as out, open(gff) as records:
        for raw in records:
            line = raw.rstrip()
            # Everything after ##FASTA is sequence data, not annotation.
            if line.startswith("##FASTA"):
                break
            # Blank lines and comments/directives carry no record; without
            # this guard they crash the column parsing below.
            if not line or line.startswith("#"):
                continue

            array = line.split("\t")
            info = array[-1].split(";")[-1].replace("gene_id=", "")
            start = int(array[3])
            end = int(array[4])
            size = end - start + 1
            if not (start > 110 and size > minsize and info != "NA"):
                continue

            strand = array[6]
            if strand == "+":
                exon_start = start - upstream
                exon_end = end + downstream
                # Start codon: keep the start column, end two bases later.
                codon_start = None
                codon_end = start + 2
                cds_end = size + upstream
            elif strand == "-":
                exon_start = start - downstream
                exon_end = end + upstream
                # Start codon sits at the high-coordinate end of the record.
                codon_start = end - 2
                codon_end = None
                # NOTE(review): this is 2 larger than the forward-strand
                # formula for the same ORF length; kept exactly as in the
                # original code -- confirm the asymmetry is intended.
                cds_end = size + 2 + upstream
            else:
                continue

            attributes = (
                'gene_id "{0}"; transcript_id "{0}"; start_codon "ATG";'
            ).format(info)
            codon_attributes = '{0} cds_start "{1}"; cds_end "{2}";'.format(
                attributes, upstream, cds_end
            )

            out.write(
                _gtf_line(array, "exon", attributes, exon_start, exon_end)
            )
            out.write(_gtf_line(array, "CDS", attributes))
            out.write(
                _gtf_line(
                    array,
                    "start_codon",
                    codon_attributes,
                    codon_start,
                    codon_end,
                )
            )

# ANALYSIS #############################################################
# Input directory (gff files) and output directory (gtf files); both are
# relative paths, so they resolve against the current working directory.
gff_dir = "../../01annot_gene/04gff_annotcons/conserved_synt/"
gtf_dir = "../../04plastid/01_gtf_files/"
minsize = 60

# Haplotypes to parse. dico_haplo maps the label used in the output gtf
# file name to the prefix of the matching input gff file name.
# (chaplo is not referenced by the code below.)
chaplo = ["SA03", "SD01", "SD06", "Y128"]
dico_haplo = {"SA03": "A03", "SD01": "D01", "SD06": "D06", "Y128": "YPS128"}

# One gff -> gtf conversion per haplotype.
for haplo, gff_prefix in dico_haplo.items():
    gff_file = gff_prefix + "_IDBA_aug_cons.gff"
    gtf_file = haplo + "_genes.gtf"
    gff_to_gtf_genes(gff_dir + gff_file, gtf_dir + gtf_file, minsize)


