# 01_mapBatch.py
#
# Jonathan M. Galazka
#
# A simple modification of the Mirny Lab script, 01_iterative_mapping.py available
# at https://bitbucket.org/mirnylab/hiclib
#
# Requires Mirny Lab scripts available at https://bitbucket.org/mirnylab/hiclib and all of their dependencies.
# 
# Usage: python ./01_mapBatch.py
#
# Each pair of .fastq files (reads 1 and 2, named 'R1.fastq' and 'R2.fastq') should be in a folder named 'dataset1', 'dataset2', etc.,
# and these should be contained in a single folder called 'HiC'.
# Each dataset folder should also contain an empty folder named 'tmp'.
# Directory structure should look like this: /Volumes/HD/HiC/dataset1/tmp, /Volumes/HD/HiC/dataset2/tmp, etc.
#
# Output will be a mapped_reads.hdf5 file. Additional files in the 'tmp' folder can be deleted after running.


import os
import logging
from hiclib import mapping
from mirnylib import h5dict, genome

logging.basicConfig(level=logging.DEBUG)

# ---------------------------------------------------------------------------
# Configuration: edit these values to match the local installation.
# ---------------------------------------------------------------------------

datasets = ['dataset1', 'dataset2']  # Name of dataset folders

# Folder that holds one sub-folder per dataset.
HIC_ROOT = '/Volumes/HD/HiC/'

# Folder passed to genome.Genome().
GENOME_DIR = '/Users/galazkaj/genome/chromosomes/'

# bowtie2 executable and index used for every mapping run.
BOWTIE_PATH = '/Users/galazkaj/bowtie2-2.1.0/bowtie2'
BOWTIE_INDEX_PATH = '/Users/galazkaj/bowtie2-2.1.0/indexes/nc12_fixed'


def _map_reads(dataset_dir, temp_dir, read_name):
    """Iteratively map one read file of a dataset and return the SAM path.

    The input is '<dataset_dir>/<read_name>.fastq' and the output is written
    to '<temp_dir>/<read_name>.sam'.  Both reads of a pair go through this
    function so they are always mapped with identical settings.

    Optional keyword arguments of mapping.iterative_mapping that this script
    leaves disabled:
        max_reads_per_chunk=10000000   (optional, on low-memory machines)
        bash_reader='../../bin/sra/bin/fastq-dump -Z'
    See the hiclib documentation for the meaning of the individual options.
    """
    fastq_path = os.path.join(dataset_dir, read_name + '.fastq')
    out_sam_path = os.path.join(temp_dir, read_name + '.sam')

    mapping.iterative_mapping(
        bowtie_path=BOWTIE_PATH,
        bowtie_index_path=BOWTIE_INDEX_PATH,
        fastq_path=fastq_path,
        out_sam_path=out_sam_path,
        min_seq_len=20,
        len_step=5,
        seq_start=0,
        seq_end=50,
        nthreads=4,
        temp_dir=temp_dir,
        bowtie_flags='--very-sensitive')

    return out_sam_path


def _process_dataset(dataset, genome_db):
    """Map both read files of one dataset and write its mapped_reads.hdf5.

    dataset   -- name of the dataset folder inside HIC_ROOT.
    genome_db -- genome.Genome instance shared by all datasets.
    """
    dataset_dir = os.path.join(HIC_ROOT, dataset)
    # The empty last component keeps the trailing separator ('.../tmp/'),
    # so hiclib receives exactly the same temp_dir string as before.
    temp_dir = os.path.join(dataset_dir, 'tmp', '')

    sam_path_1 = _map_reads(dataset_dir, temp_dir, 'R1')
    sam_path_2 = _map_reads(dataset_dir, temp_dir, 'R2')

    mapped_reads = h5dict.h5dict(
        os.path.join(dataset_dir, 'mapped_reads.hdf5'))

    # The SAM paths written by the two mapping runs double as the basenames
    # that parse_sam reads from.
    mapping.parse_sam(
        sam_basename1=sam_path_1,
        sam_basename2=sam_path_2,
        out_dict=mapped_reads,
        genome_db=genome_db)


# The genome does not depend on the dataset, so build it once up front
# instead of once per loop iteration.
genome_db = genome.Genome(GENOME_DIR, readChrms=['#'])

for dataset in datasets:
    _process_dataset(dataset, genome_db)