#!/bin/bash
# NOTE: this script uses arrays, which POSIX sh does not provide, so it must
# be run with bash.

#########################################
##   Recalibration Using Known dbSNP   ##
#########################################

## **Software required**
# gatk v4.2.0.0 (https://github.com/broadinstitute/gatk)
# samtools v1.13 (https://github.com/samtools/samtools)
# gatk and samtools were executed on the NIG Supercomputer system (https://sc.ddbj.nig.ac.jp)
#
# Each variable below holds a complete multi-word command line
# ("singularity exec <image> <tool>"). It must therefore be expanded UNQUOTED
# at the call site so that the shell splits it into separate words.
samtools="singularity exec /usr/local/biotools/s/samtools:1.13--h8c37831_0 samtools" # path to samtools (in NIG Supercomputer)
gatk="singularity exec /usr/local/biotools/g/gatk4:4.2.0.0--0 gatk" # path to gatk4 (in NIG Supercomputer)

## **Prepare dbSNP files**
## mm10
# dbSNP was downloaded from the Roland Rad lab's MoCaSeq container (https://github.com/roland-rad-lab/MoCaSeq)
# Chromosome names were changed from 1, 2, 3, ... to chr1, chr2, chr3, ... with the following sed commands:
# ("sed 's/##contig=<ID=/##contig=<ID=chr/g' | sed 's/^[1-9XYM]/chr&/g'")
## mm39
# dbSNP was downloaded from the MGP ftp site (https://www.mousegenomes.org/snps-indels/)
# Only the strains used in MGP_v5_snp_and_indels.exclude_wild.vcf.gz were extracted from the VCFs with bcftools

# Known-sites VCF for mm10 (SNPs and indels in a single file)
snp_file_mm10='path_to_dbSNP_dir/MGP_v5_snp_and_indels.exclude_wild.vcf.gz'
# Known-sites VCFs for mm39 (SNPs and indels in separate files); uncomment to use
# snp_file_mm39="path_to_dbSNP_dir/MGP_v8_snps_exclude_wild.rsID.vcf.gz"
# indel_file_mm39="path_to_dbSNP_dir/MGP_v8_indels_exclude_wild.rsID.vcf.gz"

## **Reference genome**
# mm10 build (active)
reference_genome='path_to_reference_genome/mm10.fa'
# mm39 build; uncomment to use
# reference_genome="path_to_reference_genome/mm39.fa"

# Thread count handed to "samtools index -@"
threads='6'

# Samples to process. Each entry is "<sample name> <tissue> [<tissue> ...]".
list=("RY1114 H T6 T7" "RY1115 H T2 T6")

#######################################
# Run base quality score recalibration for one sample/tissue pair:
#   1. BaseRecalibrator on the marked BAM  -> pre-recalibration table
#   2. ApplyBQSR                           -> recalibrated BAM
#   3. BaseRecalibrator on the new BAM     -> post-recalibration table
#   4. samtools index on the recalibrated BAM
# Globals:   gatk, samtools, reference_genome, snp_file_mm10, threads (read)
# Arguments: $1 - sample name (also the sample's top-level directory)
#            $2 - tissue label
# Returns:   0 on success; 1 as soon as any step fails, so that later steps
#            never run against missing or stale intermediate files
#######################################
recalibrate_sample() {
    local name=$1
    local tissue=$2
    local prefix="${name}_${tissue}"
    local bam_dir="${name}/results/bam"
    local qc_dir="${name}/results/QC"
    local input_bam="${name}/temp/${prefix}.cleaned.sorted.readgroups.marked.bam"
    local output_bam="${bam_dir}/${prefix}.bam"
    local pre_table="${qc_dir}/${prefix}.GATK4.pre.recal.table"
    local post_table="${qc_dir}/${prefix}.GATK4.post.recal.table"

    # Both output directories must exist before the tools write into them
    # (the recalibration tables go to results/QC, not results/bam).
    mkdir -p "$bam_dir" "$qc_dir" || return 1

    # $gatk and $samtools are intentionally unquoted below: each holds a
    # multi-word command line that has to be split into separate words.

    # Base recalibration on the input BAM (pre-recalibration table)
    # shellcheck disable=SC2086
    $gatk --java-options "-Xmx16g" BaseRecalibrator \
        -R "$reference_genome" \
        -I "$input_bam" \
        --known-sites "$snp_file_mm10" \
        --use-original-qualities \
        -O "$pre_table" || return 1

    # Apply BQSR using the pre-recalibration table
    # shellcheck disable=SC2086
    $gatk --java-options "-Xmx16g" ApplyBQSR \
        -R "$reference_genome" \
        -I "$input_bam" \
        -O "$output_bam" \
        -bqsr "$pre_table" || return 1

    # Base recalibration on the recalibrated BAM (post-recalibration table)
    # shellcheck disable=SC2086
    $gatk --java-options "-Xmx16g" BaseRecalibrator \
        -R "$reference_genome" \
        -I "$output_bam" \
        --known-sites "$snp_file_mm10" \
        --use-original-qualities \
        -O "$post_table" || return 1

    # shellcheck disable=SC2086
    $samtools index -@ "$threads" "$output_bam" || return 1
}

for item in "${list[@]}"; do
    # Split the entry on whitespace without exposing it to glob expansion.
    read -r -a arr <<< "$item"
    name=${arr[0]}
    tissues=("${arr[@]:1}")  # Everything after the sample name is a tissue

    for tissue in "${tissues[@]}"; do
        # A failure is reported and the loop moves on to the next tissue.
        if ! recalibrate_sample "$name" "$tissue"; then
            printf 'ERROR: recalibration failed for sample %s, tissue %s\n' \
                "$name" "$tissue" >&2
        fi
    done
done

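## For mm39 base recalibration, pass the following --known-sites options to BaseRecalibrator
## in place of "--known-sites $snp_file_mm10":
#--known-sites $snp_file_mm39
#--known-sites $indel_file_mm39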