# -*- coding: utf-8 -*-
# python2.7
'''
Log:
Released on March 18, 2022
Modified on Aug 20, 2022:
    change to import another functions
    add option to use bowtie2 build reference
'''
'''
Requirements:
(1) Based on python2.7; need module pandas, pathos
(2) bowtie2; command 'bowtie2' should be executable in $PATH, otherwise need to specify the path.

Purpose:
Deconvolute the Methyl-SNP-seq Reads with Calibration:
(1) convert the bisulfite converted Non-methyl C base in Read1 back to C if R1=T and R2=C
    Set the base quality of R1 bases that are mismatching the R2 bases to a low base quality score based on the comparison to reference genome. 
    To get a better accuracy, only Reads/Bases with BAQ>=30 and MAPQ>=20 are used for Calibration.
(2) save cytosine methylation status in a report
This step generates a fastq file name_DeconvolutedRead.fq and a methylation report name.Deconvolution.5mC

Usage:
$python DeconvolutionWithCalibration \
    --Read1 TestSeq_R1_val_1.fq --Read2 TestSeq_hairpin_R2_val_2.fq \
    --name TestSeq --bowtie2_reference /mnt/home/yan/Bo/AccurateSeq/hg38_lambda_XP12_T4_pUC19_combine/GCA_000001405.15_GRCh38_no_alt_spike \
    --reference /mnt/home/yan/Bo/AccurateSeq/hg38_lambda_XP12_T4_pUC19_combine/GCA_000001405.15_GRCh38_no_alt_spike.fa \
    --vcf /mnt/home/yan/Bo/AccurateSeq/NA12878_WGS/variant_bowtie2/HG001_WGS_gatktmp/HG001_WGS_filtered.vcf \
    --smp 4 --percent 0.02 \
    --path_to_bowtie2 /mnt/home/ettwiller/yan/exe/miniconda2/envs/my-bowtie2-env/bin/bowtie2 \
    --path_to_samtools /mnt/home/ettwiller/yan/exe/miniconda2/envs/my-bowtie2-env/bin/samtools \
    --path_to_bedtools /mnt/home/ettwiller/yan/exe/miniconda2/envs/my-bowtie2-env/bin/bedtools

--Read1, Read2:
    illumina adapter and hairpin adapter removed Read1 and Read2 fastq or fastq.gz files
    
--percent: Default 0.05
    percent to downsample the Read1 and Read2 for Base Calibration analysis

--reference: 
    reference genome.fa used for bedtools getfasta
--bowtie2_reference: Optional
    bowtie2 reference used for mapping, which is generated by bowtie2-build based on reference genome.fa. 
    e.g. /mnt/home/yan/Bo/AccurateSeq/hg38_lambda_XP12_T4_pUC19_combine/GCA_000001405.15_GRCh38_no_alt_spike
    If provided, skip the reference build step.
    Else the bowtie2 reference will be built based on reference which may take time.

--name: name of output files, e.g. TestSeq
    output files are saved in the current working directory:
    TestSeq.BaseCalibration.table
    TestSeq.BaseCalibration.probability
    TestSeq.Deconvolution.5mC: methylation report, cytosine position in read is 0-indexed.
    TestSeq_DeconvolutedRead.fq: Deconvolution for Read1 with BaseCalibration

--vcf: Optional
    A vcf file containing known SNP positions.
    If provided, the positions in this vcf file are ignored from base calibration.
    Several variant files can be merged into one using picard.jar MergeVcfs function.

--smp: Optional
    number of threads used for bowtie2 mapping, default 1
    It is used as -p option for bowtie2.

--dir: Optional
    full path directory to save the output files
    if not provided, the output files are saved at current working directory.

--path_to_bowtie2, --path_to_bedtools, --path_to_samtools: Optional
    use this option to specify a path to bowtie2/bedtools/samtools executable, e.g.
    /usr/bin/bowtie2
    Else it is assumed that bowtie2/bedtools/samtools is executable in the PATH.

Note:
(1) the bowtie2-build step is time consuming for human genome, so provide the premade bowtie2 reference if possible
(2) The current version uses CompareBase.py for cycle-dependent error rate estimation; this could be improved by using samtools mpileup in a future version.
(3) This step generates large temporary files so make sure there is enough space.
Size of temporary files:
    input: Read1.fq.gz 26G, which contains 850 million 100bp reads, will generate an uncompressed Read1.fq 165G, 
    downsample with 0.02: TestSeq.downsample.R1.fq and TestSeq.downsample.R2.fq 3.3G, which contains 16 million reads
    mapping sam: TestSeq_deconvoluted_R1.sam 4.3G
    base calibration file: 
    TestSeq_deconvoluted_R1.compareBase.txt 24G, TestSeq_R2.compareBase.txt 28G, TestSeq.BaseCalibration.probability
    This step will also generate temp files having the same size as the compareBase.txt file.
Timing of Starting with an uncompressed fastq file pair having 850 million reads:
    Step 2 Downsample step: 45min
    Step 3 conversion of Read1: 7min
    Step 4 bowtie2 mapping with -p 4: 1h20min
    Step 5 Base Calibration in the presence of a vcf file: 2h35min
    Step 6 Deconvolution with base calibration with multiple threads 8 nodes: 2h21min
'''

import datetime
import os, sys
import argparse
from subprocess import check_call

script_dir = os.path.dirname( __file__ )
sys.path.append(os.path.join(script_dir, 'src'))

try:
    import FastqHandle_pairend
    import DeconvolutionConversion
    import CompareBase
    import BaseCalibration
    import DeconvolutionCalibration_v2
    import toolpath
    from getpath import GetFilePath, CreatePrefix, CheckFormat
except Exception as e: 
    print e
    quit()

__version__ = '2022.08.20'

def Arg(argv=None):
    '''
    Parse the command line options of the deconvolution pipeline.

    argv: optional list of argument strings to parse.
        When None (default), argparse reads sys.argv[1:], which is the
        behavior of the former zero-argument call Arg().

    Return: argparse.Namespace with the attributes
        Read1, Read2, name: required strings
        percent: float, default 0.05
        reference, bowtie2_reference, vcf, dir: strings or None when not given
        smp: int, default 1
        pathBowtie2, pathBedtools, pathSamtools: strings, default to the bare
            tool names 'bowtie2', 'bedtools', 'samtools'
    '''
    parser = argparse.ArgumentParser()
    parser.add_argument('--Read1', help='hairpin removed Read1 file', dest='Read1', required=True)
    parser.add_argument('--Read2', help='hairpin removed Read2 file', dest='Read2', required=True)
    parser.add_argument('--name', help='name suffix used for output', dest='name', required=True)
    parser.add_argument('--percent', help='fastq downsample percent used for BaseCalibration analysis', dest='percent', default=0.05, type=float)
    parser.add_argument('--reference', help='reference fa file for mapping', dest='reference', required=False)
    parser.add_argument('--bowtie2_reference', help='bowtie2 reference for mapping', dest='bowtie2_reference', required=False)
    parser.add_argument('--vcf', help='vcf file showing snp', dest='vcf', required=False, default=None)
    parser.add_argument('--dir', help='dir to save the output file', dest='dir', required=False, default=None)
    parser.add_argument('--smp', help='number of thread used for bowtie2 mapping', dest='smp', type=int, default=1)

    # optional explicit locations of the external tools
    parser.add_argument('--path_to_bowtie2', help='specify a path to bowtie2', dest='pathBowtie2', default='bowtie2', required=False)
    parser.add_argument('--path_to_bedtools', help='specify a path to bedtools', dest='pathBedtools', default='bedtools', required=False)
    parser.add_argument('--path_to_samtools', help='specify a path to samtools', dest='pathSamtools', default='samtools', required=False)

    return parser.parse_args(argv)

def _gunzip_to(source, destination):
    '''
    Decompress the gzip file source into the file destination.
    gunzip is called with an argument list (no shell), so paths containing
    spaces or shell metacharacters are handled safely.
    '''
    with open(destination, 'wb') as handle:
        check_call(['gunzip', '-c', source], stdout=handle)


def _compare_base_args(sam, fastq, output, reference, vcf, pathSamtools, pathBedtools):
    '''
    Build the option dict handed to CompareBase.main for one sam/fastq pair.
    percent 1, BAQ 30 and MAPQ 20 are the fixed settings used by this pipeline.
    '''
    return {'input_bam': sam, 'percent': 1, 'REF': reference, 'input_fastq': fastq,
            'vcf': vcf, 'output': output, 'BAQ': 30, 'MAPQ': 20,
            'pathSamtools': pathSamtools, 'pathBedtools': pathBedtools}


def main(Read1, Read2, percent, name, dir, bowtie2_reference, reference, smp, vcf, pathBowtie2, pathSamtools, pathBedtools):
    '''
    Run several deconvolution steps.

    Read1, Read2: input fastq files; decompressed with gunzip when CheckFormat() is true for them
    percent: value passed to the Downsample step used for base calibration
    name: name used to build all intermediate and output file names
    dir: directory receiving the final output files; current working directory when empty
    bowtie2_reference: bowtie2 index prefix; when empty the index is built from reference
    reference: reference genome fasta, passed to CompareBase as REF
    smp: value of the bowtie2 -p option
    vcf: optional vcf file passed to CompareBase
    pathBowtie2, pathSamtools, pathBedtools: executables of the external tools

    All intermediate files are written into a temporary directory created in the
    current working directory; it is removed after the output files were moved to dir.
    The process exits with status 1 if no reference is given or if bowtie2 fails.
    '''
    from subprocess import CalledProcessError

    # test samtools, bedtools, bowtie2
    toolpath.init(pathSamtools, pathBedtools, pathBowtie2)
    toolpath.Tools()
    print('')

    # get the full path
    if not reference and not bowtie2_reference:
        print('error: Need to provide either reference genome or bowtie2 build reference.')
        sys.exit(1)
    if reference:
        reference = GetFilePath(reference)
        if bowtie2_reference:
            # The working directory changes below, so a relative index prefix must be
            # made absolute here too. abspath is used because the prefix is not a file itself.
            bowtie2_reference = os.path.abspath(bowtie2_reference)
    else:
        # NOTE(review): without --reference, REF handed to CompareBase in Step5 is None;
        # confirm whether CompareBase can work without the fasta file.
        bowtie2_reference = GetFilePath(bowtie2_reference)

    if vcf:
        vcf = GetFilePath(vcf)
    Read1 = GetFilePath(Read1)
    Read2 = GetFilePath(Read2)

    prefix = CreatePrefix()

    cwd = os.getcwd() # current working dir

    if not dir: # use CWD to save output dir if not specified
        dir = cwd
    else:
        # resolve before changing directory, and make sure it exists so that the
        # final 'mv' moves files INTO it instead of renaming them to it
        dir = os.path.abspath(dir)
        if not os.path.isdir(dir):
            os.makedirs(dir)

    print('Perform Methyl-SNP-seq Read Deconvolution with calibrarion using verion: {}.'.format(__version__))
    print('Start at {}\n'.format(datetime.datetime.now()))

    # fall back to a name-specific directory if the default one is already present
    tempdir = os.path.join(cwd, 'DeconvolutionDir{}'.format(prefix))
    if os.path.exists(tempdir):
        tempdir = '{}_{}'.format(tempdir, name)
    print('Generate a tmp dir {}\n'.format(tempdir))
    os.mkdir(tempdir)
    os.chdir(tempdir)

    # decompress the input reads, do this once at the beginning to save time
    print('Step1: Check the format of input Read files and decompress if inpput is compressed.')
    if CheckFormat(Read1):
        uncompressed = os.path.join(tempdir, '{}.uncompressed.R1.fq'.format(name))
        _gunzip_to(Read1, uncompressed)
        Read1 = uncompressed
        print('Decompress the Read1 file into: {}'.format(Read1))
    if CheckFormat(Read2):
        uncompressed = os.path.join(tempdir, '{}.uncompressed.R2.fq'.format(name))
        _gunzip_to(Read2, uncompressed)
        Read2 = uncompressed
        print('Decompress the Read2 file into: {}'.format(Read2))
    print('Step finished at {}\n'.format(datetime.datetime.now()))
    # output in tempdir: name.uncompressed.R1.fq and name.uncompressed.R2.fq

    # intermediate file names shared by the following steps
    downsample_R1 = '{}.downsample.R1.fq'.format(name)
    downsample_R2 = '{}.downsample.R2.fq'.format(name)
    deconvolution_R1 = '{}_Deconvolution_R1.fq'.format(name)
    sam_R1 = '{}_deconvoluted_R1.sam'.format(name)
    sam_R2 = '{}_R2.sam'.format(name)
    compare_R1 = '{}_deconvoluted_R1.compareBase.txt'.format(name)
    compare_R2 = '{}_R2.compareBase.txt'.format(name)

    # downsample input Read1 and Read2 for calibration
    print('Step2: Downsample input reads for calibration.')
    Fastqfile = FastqHandle_pairend.Fastq([Read1, Read2], [downsample_R1, downsample_R2])
    Fastqfile.Downsample(percent)
    Fastqfile.EndStep(True) # output: name.downsample.R1.fq, name.downsample.R2.fq
    print('Step finished at {}\n'.format(datetime.datetime.now()))

    # Convertion of Read1 to generate Deconvolution_R1
    print('Step3: Convert downsampled Read1.')
    DeconvolutionConversion.Deconvolute(downsample_R1, downsample_R2, name) # output name_Deconvolution_R1.fq, which is downsampled Deconvolution_R1
    print('Step finished at {}\n'.format(datetime.datetime.now()))

    # bowtie2 mapping for downsampled Deconvolution_R1 and R2, add --quiet to turn off bowtie2 print
    # Every command is run directly with check_call so that a failing bowtie2 or
    # bowtie2-build call is detected; stdout is flushed first to keep the log in order.
    print('Step4: Bowtie2 mapping.')
    try:
        if not bowtie2_reference:
            pathbuild = os.path.join(os.path.dirname(pathBowtie2), os.path.basename(pathBowtie2).replace('bowtie2', 'bowtie2-build')) # bowtie2-build path
            print('Build bowtie2 reference.')
            print('')
            sys.stdout.flush()
            check_call([pathbuild, reference, 'bowtie2_reference', '--quiet']) # bowtie2-build reference name
            index = 'bowtie2_reference' # index built inside tempdir
        else:
            index = bowtie2_reference
        for fastq, sam in [(deconvolution_R1, sam_R1), (downsample_R2, sam_R2)]:
            sys.stdout.flush()
            check_call([pathBowtie2, '-p', str(smp), '-x', index, '-U', fastq, '-S', sam, '--quiet'])
            print('')
        # output name_R2.sam name_deconvoluted_R1.sam
    except (CalledProcessError, OSError) as error:
        print("bowtie2 mapping error.")
        sys.stderr.write('{}\n'.format(error))
        sys.exit(1)
    print('Step finished at {}\n'.format(datetime.datetime.now()))

    # Compare the Deconvolution_R1, Read2 base with the reference for base calibration with BAQ>=30 and MAPQ>=20
    print('Step5: Count the probablity for Baes Calibration.')
    CompareBase.main(_compare_base_args(sam_R1, deconvolution_R1, compare_R1, reference, vcf, pathSamtools, pathBedtools)) # output name_deconvoluted_R1.compareBase.txt
    print('')
    CompareBase.main(_compare_base_args(sam_R2, downsample_R2, compare_R2, reference, vcf, pathSamtools, pathBedtools)) # output name_R2.compareBase.txt
    print('')
    BaseCalibration.main(compare_R1, compare_R2, name)
    print('Step finished at {}\n'.format(datetime.datetime.now()))
    # output name.BaseCalibration.probability

    # Deconvolution including Base Calibration
    print('Step6: Deconvolute with Base Calibration.')
    dic_probability = DeconvolutionCalibration_v2.BaseCalibrationDict('{}.BaseCalibration.probability'.format(name)).dic_probability
    DeconvolutionCalibration_v2.main(Read1, Read2, dic_probability, name)
    print('Step finished at {}\n'.format(datetime.datetime.now()))

    # move the output files
    # NOTE(review): the file header documents the fastq output as name_DeconvolutedRead.fq,
    # while name.DeconvolutedRead.fq is moved here; confirm the name written by DeconvolutionCalibration_v2.
    print("Step7: The following output files are saved at: {}".format(dir))
    for item in ['{}.DeconvolutedRead.fq'.format(name), '{}.Deconvolution.5mC'.format(name), '{}.BaseCalibration.table'.format(name), '{}.BaseCalibration.probability'.format(name)]:
        print(item)
        sys.stdout.flush()
        check_call(['mv', item, dir], shell=False)

    # go back to the starting directory; a spawned 'cd' cannot change the cwd of this process
    os.chdir(cwd)
    print('Finished at {}'.format(datetime.datetime.now()))

    # remove exactly the temporary directory created above (absolute path)
    check_call(['rm', '-r', tempdir], shell=False)
   
##-------mainbody
if __name__ == '__main__':

    # check the required python files in <script_dir>/src:
    path = os.path.join(script_dir, 'src')
    pythonls = ['FastqHandle_pairend.py', 'DeconvolutionConversion.py', 'CompareBase.py', 'BaseCalibration.py', 'DeconvolutionCalibration_v2.py', 'getpath.py', 'toolpath.py']

    # report every missing file, then stop with a non-zero status so the failure is detectable
    missing = [item for item in pythonls if not os.path.exists(os.path.join(path, item))]
    for item in missing:
        print("{} can not be found.".format(item))
    if missing:
        sys.exit(1)

    # parse the command line and run the pipeline
    args = Arg()
    main(args.Read1, args.Read2, args.percent, args.name, args.dir, args.bowtie2_reference, \
        args.reference, args.smp, args.vcf, args.pathBowtie2, args.pathSamtools, args.pathBedtools)
    