import os

#Function to extract read coordinates from ORF annotations
########################################################################

def cut_bam(file_coord, orf, haplo, ext, dico_bam, type_lib):
    """Extract the reads overlapping one ORF and turn them into coverage files.

    Steps (every shell command is printed before it is run):
      1. ``grep '<orf>;'`` in ``file_coord`` and, for the records whose third
         attribute is ``haplo=<haplo>``, write the ORF coordinates extended by
         ``ext`` on both sides to a 3-column BED file.
      2. For each such record, ``grep`` the ORF chromosome name in the
         haplotype genome file to build a reduced genome file.
      3. ``bedtools intersect`` each replicate BAM (rep1, rep2) with the BED.
      4. ``bedtools genomecov -dz`` on each replicate, once per strand
         (``_sens.cov`` for ``+``, ``_anti.cov`` for ``-``).

    All outputs are written under ``../../07_candidates/02_ORF_bam/`` and the
    genome file is read from ``../../../../00data_seq/pseudo_IDBA/``, both
    relative to the current working directory.

    Parameters
    ----------
    file_coord : path of the GFF file with the coordinates of all ORFs
        (column 1 = sequence name, 4 = start, 5 = stop, 9 = attributes).
    orf : ORF name, e.g. "ORF_14438".
    haplo : haplotype name, e.g. "SA03"; every "S" is removed from it to build
        the genome file name.
    ext : number of flanking nucleotides added on each side (int).
    dico_bam : dictionary with the keys "<haplo>_rep1" and "<haplo>_rep2"
        giving the BAM file of each replicate.
    type_lib : library label used in the output file names (RPF or TOT).

    Returns None. Raises KeyError when ``dico_bam`` lacks a replicate key.
    Exit statuses of the shell commands are not checked.
    """
    out_dir = "../../07_candidates/02_ORF_bam/"
    prefix = out_dir + orf + "_" + haplo + "_ext" + str(ext)
    coord_out = prefix + ".bed"
    genome_all = ("../../../../00data_seq/pseudo_IDBA/"
                  + haplo.replace("S", "") + "_IDBA.genome")
    genomefile = out_dir + orf + "_" + haplo + ".genome"  # with reduced info

    # extract orf coordinates for all haplotypes
    cmd = "grep '" + orf + ";' " + file_coord
    pipe = os.popen(cmd)
    try:
        list_coord = pipe.read().rstrip().split("\n")
    finally:
        pipe.close()

    # read coord and write the extended orf coordinates of this haplotype
    with open(coord_out, "w") as out:
        for line in list_coord:
            if not line.strip():
                # grep found nothing: the split above yields [""], which has
                # no attribute column to read.
                continue
            array = line.split("\t")
            haploi = array[8].split(";")[2].replace("haplo=", "")
            if haploi != haplo:
                continue

            chrom = array[0]
            # A BED start cannot be negative; clamp ORFs that lie closer than
            # `ext` to the beginning of the sequence.
            start_ext = max(int(array[3]) - ext, 0)
            stop_ext = int(array[4]) + ext
            out.write("\t".join([chrom, str(start_ext), str(stop_ext)]) + "\n")

            # make a reduced genome file per orf
            cmd_genome = "grep '" + chrom + "' " + genome_all + " > " + genomefile
            print(cmd_genome)
            os.system(cmd_genome)

    # extract read coordinates in the orf region from each replicate bam file
    cut_beds = []
    for rep in ("rep1", "rep2"):
        bam = dico_bam[haplo + "_" + rep]
        cut_bed = prefix + "_" + type_lib + "_" + rep + ".bed"
        cmd_cut = ("bedtools intersect -wa  -bed -a " + bam
                   + " -b " + coord_out + " > " + cut_bed)
        print(cmd_cut)
        os.system(cmd_cut)
        cut_beds.append(cut_bed)

    # convert to coverage format, one file per replicate and strand
    for cut_bed in cut_beds:
        for strand, suffix in (("+", "_sens.cov"), ("-", "_anti.cov")):
            cov = cut_bed.replace(".bed", suffix)
            cmd_cov = ("bedtools genomecov -dz -strand " + strand
                       + " -i " + cut_bed + " -g " + genomefile + " > " + cov)
            print(cmd_cov)
            os.system(cmd_cov)
    


#Analysis
########################################################################

# Number of flanking nucleotides added on each side of an ORF
ext = 100

# ORF coordinate tables
_orf_table_dir = "../../03synt_intergenic_orf/02_ORF_tables/"
# genomic coordinates of the spar strains
file_coord = _orf_table_dir + "table_orf_filtcons_genomic.gff"
# coordinates on syntenic regions for the ancestral reconstructions
file_recons = _orf_table_dir + "table_orf_recons_sid.gff"

# RPF data: "<haplotype>_<replicate>" -> BAM file
_rpf_dir = "../../../../04analyses_RPF/02mapping_ps/results_merged/"
dico_bam_rpf = {
    "SA03_rep1": _rpf_dir + "RPF_CC1_allonA03_IDBA.fapseudoscaff_sorted.bam",
    "SA03_rep2": _rpf_dir + "RPF_CC2_allonA03_IDBA.fapseudoscaff_sorted.bam",
    "SD01_rep1": _rpf_dir + "RPF_AA1_allonD01_IDBA.fapseudoscaff_sorted.bam",
    "SD01_rep2": _rpf_dir + "RPF_AA3_allonD01_IDBA.fapseudoscaff_sorted.bam",
    "SD06_rep1": _rpf_dir + "RPF_BB1_allonD06_IDBA.fapseudoscaff_sorted.bam",
    "SD06_rep2": _rpf_dir + "RPF_BB3_allonD06_IDBA.fapseudoscaff_sorted.bam",
    "Y128_rep1": _rpf_dir + "RPF_Cer1_allonYPS128_IDBA.fapseudoscaff_sorted.bam",
    "Y128_rep2": _rpf_dir + "RPF_Cer3_allonYPS128_IDBA.fapseudoscaff_sorted.bam",
}

# Same mapping for the total RNA libraries
_tot_dir = "../../../../04analyses_ARNtot/02mapping_ps/results_merged/"
dico_bam_tot = {
    "SA03_rep1": _tot_dir + "T_CC1allonA03_ps_sorted.bam",
    "SA03_rep2": _tot_dir + "T_CC2allonA03_ps_sorted.bam",
    "SD01_rep1": _tot_dir + "T_AA1allonD01_ps_sorted.bam",
    "SD01_rep2": _tot_dir + "T_AA3allonD01_ps_sorted.bam",
    "SD06_rep1": _tot_dir + "T_BB1allonD06_ps_sorted.bam",
    "SD06_rep2": _tot_dir + "T_BB3allonD06_ps_sorted.bam",
    "Y128_rep1": _tot_dir + "T_Cer1allonYPS128_ps_sorted.bam",
    "Y128_rep2": _tot_dir + "T_Cer3allonYPS128_ps_sorted.bam",
}

# Haplotypes to process
chaplo = ["SA03", "SD06", "SD01", "Y128"]

# List of candidate ORFs (hard-coded).
# 2018 update: see file ../04diff_expression/select_ORF_022018; an earlier,
# now disabled, version read the names with `cut -f1` on that file.
corf = [
    "ORF_14438", "ORF_152062", "ORF_163629", "ORF_187706",
    "ORF_50484", "ORF_62412", "ORF_68573", "ORF_69174",
    "ORF_69270", "ORF_88904", "ORF_95911", "ORF_99584",
]






# Read extraction and coverage computation for every candidate ORF and
# haplotype, for both library types.
# The two cut_bam() calls were commented out, which left the inner loop
# without a body (a syntax error that stopped the whole script from
# parsing). They stay disabled by default; set the switch to True to
# (re)generate the per-ORF BED / coverage files.
run_cut_bam = False

for orf in corf:

    print("###########   "+orf+"   ######################\n\n\n")

    for haplo in chaplo:
        if run_cut_bam:
            cut_bam(file_coord, orf, haplo, ext, dico_bam_rpf, "RPF")
            cut_bam(file_coord, orf, haplo, ext, dico_bam_tot, "TOT")




# Extract the GFF genomic coordinates of all candidates:
# one "<orf>_coord.gff" file per ORF, holding the records of file_coord that
# match "<orf>;" and whose haplotype is in chaplo, with the haplotype name
# appended as an extra column.
for orf in corf:
    dir_out = "../../07_candidates/02_ORF_bam/" + orf + "_coord.gff"
    with open(dir_out, "w") as out:
        cmd = "grep '" + orf + ";' " + file_coord
        pipe = os.popen(cmd)
        try:
            list_coord = pipe.read().rstrip().split("\n")
        finally:
            pipe.close()
        for line in list_coord:
            if not line.strip():
                # grep found nothing: the split above yields [""], which has
                # no attribute column to read.
                continue
            array = line.split("\t")
            # the haplotype is read from the third ';'-separated attribute
            haplo = array[8].split(";")[2].replace("haplo=", "")
            if haplo in chaplo:
                out.write(line + "\t" + haplo + "\n")

# Extract all ORFs located in the region of each candidate:
# for every candidate / haplotype pair, run bedtools intersect (-wa -wb)
# between the per-haplotype ORF table and the extended candidate BED
# (same file name as the one cut_bam() writes), saving the result as
# "<orf>_<haplo>_allORF.gff".
for orf in corf:
    for haplo in chaplo:
        region_bed = "../../07_candidates/02_ORF_bam/%s_%s_ext%s.bed" % (
            orf, haplo, str(ext))
        haplo_table = (
            "../../03synt_intergenic_orf/02_ORF_tables/03per_haplo_stat1_ps/"
            "table_orf_%s_stat1.gff" % haplo)
        overlap_gff = "../../07_candidates/02_ORF_bam/%s_%s_allORF.gff" % (
            orf, haplo)

        os.system("bedtools intersect -wa -wb  -a %s -b %s > %s" % (
            haplo_table, region_bed, overlap_gff))
    
#idem with ancestors ===================================================================

# Extract the GFF coordinates of all candidates from the reconstruction table.
# Haplotype labels kept from file_recons (presumably the ancestral nodes).
haplorec = ["N1", "N2"]

for orf in corf:
    dir_out = "../../07_candidates/02_ORF_bam/" + orf + "_coord_recons.gff"
    with open(dir_out, "w") as out:
        cmd = "grep '" + orf + ";' " + file_recons
        pipe = os.popen(cmd)
        try:
            list_coord = pipe.read().rstrip().split("\n")
        finally:
            pipe.close()
        for line in list_coord:
            if not line.strip():
                # grep found nothing: the split above yields [""], which has
                # no attribute column to read.
                continue
            array = line.split("\t")
            # the haplotype is read from the third ';'-separated attribute
            haplo = array[8].split(";")[2].replace("haplo=", "")
            if haplo not in haplorec:
                continue

            # write a file with all coordinates in one file
            out.write(line + "\t" + haplo + "\n")

            bed_prefix = "../../07_candidates/02_ORF_bam/" + orf + "_" + haplo

            # write an extended bed file for the intersect command; the start
            # is never allowed below 1. The file is opened in "w" mode, so a
            # later record for the same ORF/haplotype replaces an earlier one.
            startext = max(int(array[3]) - ext, 1)
            endext = int(array[4]) + ext
            with open(bed_prefix + "_ext" + str(ext) + ".bed", "w") as out2:
                out2.write("\t".join([array[0], str(startext), str(endext)]) + "\n")

            # write a non-extended bed file holding the syntenic ORF coordinates
            with open(bed_prefix + ".bed", "w") as out3:
                out3.write("\t".join([array[0], array[3], array[4]]) + "\n")


# Extract all ORFs located in the region of each candidate, this time for
# the haplotypes listed in haplorec: bedtools intersect (-wa -wb) between
# the per-haplotype ORF table and the extended candidate BED, saved as
# "<orf>_<haplo>_allORF.gff".
for orf in corf:
    for haplo in haplorec:
        region_bed = "../../07_candidates/02_ORF_bam/%s_%s_ext%s.bed" % (
            orf, haplo, str(ext))
        haplo_table = (
            "../../03synt_intergenic_orf/02_ORF_tables/03per_haplo_stat1_ps/"
            "table_orf_%s_stat1.gff" % haplo)
        overlap_gff = "../../07_candidates/02_ORF_bam/%s_%s_allORF.gff" % (
            orf, haplo)

        os.system("bedtools intersect -wa -wb  -a %s -b %s > %s" % (
            haplo_table, region_bed, overlap_gff))




