# Steps in SNP calling from gDNA-Seq data
# Workflow based on: https://www.broadinstitute.org/gatk/guide/bp_step.php


#############################################################
# Map and mark duplicates									#
# https://www.broadinstitute.org/gatk/guide/article?id=2799 #
#############################################################

# http://gatkforums.broadinstitute.org/discussion/1317/collected-faqs-about-bam-files


#######
# 1. prepare bwa index
# Reference assembly; its FASTA headers were truncated to remove the notes.
genome=AQPM00000000_NN.fasta
bwa index "$genome"

#######
# 2. Generate a SAM file containing aligned reads

# map_pair ID SM LB PREFIX
#   Aligns PREFIX_R1_001.fq and PREFIX_R2_001.fq against $genome with
#   `bwa mem -M`, attaching a read group built from ID, SM and LB (platform
#   and platform unit are the same for every library), and writes the
#   alignments to PREFIX_001_AQPM00000000_GATK.sam.
map_pair() {
    bwa mem -M \
        -R "@RG\tID:${1}\tSM:${2}\tPL:illumina\tLB:${3}\tPU:unit1" \
        "$genome" "${4}_R1_001.fq" "${4}_R2_001.fq" \
        > "${4}_001_AQPM00000000_GATK.sam"
}

#        ID      SM     LB       FASTQ prefix
map_pair female1 female libFem1  female1_S5
map_pair female2 female libFem2  female2_S4
map_pair female3 female libFem3  female3_S6
map_pair male1   male   libMale1 male1_S2
map_pair male2   male   libMale2 male2_S3
map_pair male3   male   libMale3 male3_S1


#############
# 3. Convert to BAM, sort and mark duplicates

# Sort reads

# Coordinate-sort every sample's SAM with Picard SortSam, writing a BAM
# (<prefix>_001_AQPM00000000_GATK_sorted.bam) next to the input.
for prefix in female1_S5 female2_S4 female3_S6 male1_S2 male2_S3 male3_S1; do
    java -Xmx8g -jar -Djava.io.tmpdir=./tmp ./picard-tools-1.133/picard.jar SortSam \
        INPUT="${prefix}_001_AQPM00000000_GATK.sam" \
        OUTPUT="${prefix}_001_AQPM00000000_GATK_sorted.bam" \
        SORT_ORDER=coordinate
done


# Mark duplicate reads

# Run Picard MarkDuplicates on each sorted BAM; every sample gets a
# *_dedup.bam plus a *_dedup_metrics.txt metrics file.
for prefix in female1_S5 female2_S4 female3_S6 male1_S2 male2_S3 male3_S1; do
    stem="${prefix}_001_AQPM00000000_GATK"
    java -Xmx8g -jar -Djava.io.tmpdir=./tmp ./picard-tools-1.133/picard.jar MarkDuplicates \
        MAX_FILE_HANDLES_FOR_READ_ENDS_MAP=4000 \
        INPUT="${stem}_sorted.bam" \
        OUTPUT="${stem}_dedup.bam" \
        METRICS_FILE="${stem}_dedup_metrics.txt"
done


# Index the BAM files

# Build a BAM index for each duplicate-marked BAM with Picard BuildBamIndex.
for prefix in female1_S5 female2_S4 female3_S6 male1_S2 male2_S3 male3_S1; do
    java -Xmx8g -jar -Djava.io.tmpdir=./tmp ./picard-tools-1.133/picard.jar BuildBamIndex \
        INPUT="${prefix}_001_AQPM00000000_GATK_dedup.bam"
done



#############################################################
# INDEL realignment											#
# https://www.broadinstitute.org/gatk/guide/article?id=2800 #
#############################################################

# 1. Create a target list of intervals to be realigned

# For each sample, have GATK RealignerTargetCreator scan the duplicate-marked
# BAM and write the intervals to realign to *_realignment_targets.list.
for prefix in female1_S5 female2_S4 female3_S6 male1_S2 male2_S3 male3_S1; do
    stem="${prefix}_001_AQPM00000000_GATK"
    java -Xmx8g -jar -Djava.io.tmpdir=./tmp ./GenomeAnalysisTK-3.4-0/GenomeAnalysisTK.jar \
        -T RealignerTargetCreator \
        -R AQPM00000000_NN.fasta \
        -I "${stem}_dedup.bam" \
        -o "${stem}_realignment_targets.list"
done


# 2. Perform realignment of the target intervals

# Realign each sample's duplicate-marked BAM over its own target interval list
# with GATK IndelRealigner, producing *_realigned_reads.bam.
for prefix in female1_S5 female2_S4 female3_S6 male1_S2 male2_S3 male3_S1; do
    stem="${prefix}_001_AQPM00000000_GATK"
    java -Xmx8g -jar -Djava.io.tmpdir=./tmp ./GenomeAnalysisTK-3.4-0/GenomeAnalysisTK.jar \
        -T IndelRealigner \
        -R AQPM00000000_NN.fasta \
        -I "${stem}_dedup.bam" \
        -targetIntervals "${stem}_realignment_targets.list" \
        -o "${stem}_realigned_reads.bam"
done



##########################################################################################
# Base Recalibration																	 #
# https://www.broadinstitute.org/gatk/guide/article?id=2801 							 #
# http://gatkforums.broadinstitute.org/discussion/44/base-quality-score-recalibration-bqsr 
##########################################################################################


############
# Call SNPs naively based on initial alignment    

# First-pass variant calls: run HaplotypeCaller separately on each sample's
# realigned BAM (soft-clipped bases ignored, call/emit confidence threshold
# 20.0) and write *_VariantCall1.vcf.
for prefix in female1_S5 female2_S4 female3_S6 male1_S2 male2_S3 male3_S1; do
    stem="${prefix}_001_AQPM00000000_GATK"
    java -Xmx8g -jar -Djava.io.tmpdir=./tmp ./GenomeAnalysisTK-3.4-0/GenomeAnalysisTK.jar \
        -T HaplotypeCaller \
        -R AQPM00000000_NN.fasta \
        -I "${stem}_realigned_reads.bam" \
        -dontUseSoftClippedBases \
        -stand_call_conf 20.0 -stand_emit_conf 20.0 \
        -o "${stem}_VariantCall1.vcf"
done



##################
# Filter highest confidence SNPs and INDELs
# to use for base recalibration
# Only example for Female 1 sample is shown
#  http://gatkforums.broadinstitute.org/discussion/2806/howto-apply-hard-filters-to-a-call-set

# Use the following filters
#http://gatkforums.broadinstitute.org/discussion/3225/you-are-unable-to-use-vqsr-recalibration-to-filter-variants

#For SNPs:
#QD < 2.0
#MQ < 40.0
#FS > 60.0
#SOR > 4.0
#MQRankSum < -12.5
#ReadPosRankSum < -8.0

#For indels:
#QD < 2.0
#ReadPosRankSum < -20.0
#InbreedingCoeff < -0.8 <-- The InbreedingCoeff statistic is a population-level calculation that is only available with 10 or more samples. If you have fewer samples you will need to omit that particular filter statement.
#FS > 200.0
#SOR > 10.0

# Split the round-1 female1 call set into SNPs and indels and apply the
# hard-filter thresholds listed above to each subset separately.
# NOTE(review): VariantFiltration only annotates the FILTER column; failing
# records stay in the output VCF. If downstream steps must see passing sites
# only, add a SelectVariants --excludeFiltered step - confirm against the
# BaseRecalibrator documentation whether -knownSites honours FILTER.

# Extract SNPs
java -Xmx8g -jar -Djava.io.tmpdir=./tmp ./GenomeAnalysisTK-3.4-0/GenomeAnalysisTK.jar -T SelectVariants -R AQPM00000000_NN.fasta -V female1_S5_001_AQPM00000000_GATK_VariantCall1.vcf -selectType SNP -o female1_S5_001_AQPM00000000_GATK_VariantCall_SNPs_v1.vcf 

# Filter SNPs (records failing the expression are tagged "SNP_v1")
java -Xmx8g -jar -Djava.io.tmpdir=./tmp ./GenomeAnalysisTK-3.4-0/GenomeAnalysisTK.jar -T VariantFiltration -R AQPM00000000_NN.fasta -V female1_S5_001_AQPM00000000_GATK_VariantCall_SNPs_v1.vcf -window 35 -cluster 3 -filterName "SNP_v1" --filterExpression "QD < 2.0 || MQ < 40.0 ||FS > 60.0 || SOR > 4.0 || MQRankSum < -12.5 || ReadPosRankSum < -8.0" -o female1_S5_001_AQPM00000000_GATK_VariantFilter_SNPs_v1.vcf 

# Extract indels    
java -Xmx8g -jar -Djava.io.tmpdir=./tmp ./GenomeAnalysisTK-3.4-0/GenomeAnalysisTK.jar -T SelectVariants -R AQPM00000000_NN.fasta -V female1_S5_001_AQPM00000000_GATK_VariantCall1.vcf -selectType INDEL -o female1_S5_001_AQPM00000000_GATK_VariantCall_INDEL_v1.vcf 

# Filter indels (tagged "INDEL_v1" so failing indels are not labelled with the SNP filter name)
java -Xmx8g -jar -Djava.io.tmpdir=./tmp ./GenomeAnalysisTK-3.4-0/GenomeAnalysisTK.jar -T VariantFiltration -R AQPM00000000_NN.fasta -V female1_S5_001_AQPM00000000_GATK_VariantCall_INDEL_v1.vcf -window 35 -cluster 3 -filterName "INDEL_v1" --filterExpression "QD < 2.0 || ReadPosRankSum < -20.0 || FS > 200.0 || SOR > 10.0" -o female1_S5_001_AQPM00000000_GATK_VariantFilter_INDEL_v1.vcf 



###################
# Use high quality variants to perform base recalibration (BQSR)

# Round 1 of base recalibration for the female1 sample. The filtered SNP and
# indel VCFs from the first call set are supplied as the known sites, and the
# realigned BAM is the input.
genome=AQPM00000000_NN.fasta
stem=female1_S5_001_AQPM00000000_GATK
gatk_jar=./GenomeAnalysisTK-3.4-0/GenomeAnalysisTK.jar

# 1. Analyze patterns of covariation in the sequence dataset
java -Xmx8g -jar -Djava.io.tmpdir=./tmp "$gatk_jar" \
    -T BaseRecalibrator \
    -R "$genome" \
    -I "${stem}_realigned_reads.bam" \
    -knownSites "${stem}_VariantFilter_SNPs_v1.vcf" \
    -knownSites "${stem}_VariantFilter_INDEL_v1.vcf" \
    -o "${stem}_recal_data_v1.table"

# 2. Second pass (with the step-1 table supplied via -BQSR) to analyze the
#    covariation remaining after recalibration
java -Xmx8g -jar -Djava.io.tmpdir=./tmp "$gatk_jar" \
    -T BaseRecalibrator \
    -R "$genome" \
    -I "${stem}_realigned_reads.bam" \
    -knownSites "${stem}_VariantFilter_SNPs_v1.vcf" \
    -knownSites "${stem}_VariantFilter_INDEL_v1.vcf" \
    -BQSR "${stem}_recal_data_v1.table" \
    -o "${stem}_post_recal_v1.table"

# 3. Generate before/after plots from the two tables
java -Xmx8g -jar -Djava.io.tmpdir=./tmp "$gatk_jar" \
    -T AnalyzeCovariates \
    -R "$genome" \
    -before "${stem}_recal_data_v1.table" \
    -after "${stem}_post_recal_v1.table" \
    -plots "${stem}_recal_plots_v1.pdf"

# 4. Apply the step-1 recalibration table to the reads
java -Xmx8g -jar -Djava.io.tmpdir=./tmp "$gatk_jar" \
    -T PrintReads \
    -R "$genome" \
    -I "${stem}_realigned_reads.bam" \
    -BQSR "${stem}_recal_data_v1.table" \
    -o "${stem}_recal_reads_v1.bam"


#########
# Use re-calibrated reads to re-call variants

# Round 2 for female1: call variants on the round-1 recalibrated BAM.
java -Xmx8g -jar -Djava.io.tmpdir=./tmp ./GenomeAnalysisTK-3.4-0/GenomeAnalysisTK.jar -T HaplotypeCaller -R AQPM00000000_NN.fasta -I female1_S5_001_AQPM00000000_GATK_recal_reads_v1.bam -dontUseSoftClippedBases -stand_call_conf 20.0 -stand_emit_conf 20.0 -o female1_S5_001_AQPM00000000_GATK_VariantCall2.vcf 


# Extract high quality SNPs and INDELs to use for another round of base recalibration
# Use same filters as above
# NOTE(review): VariantFiltration only annotates the FILTER column; failing
# records stay in the output VCF. Confirm whether the consumer of these files
# needs a SelectVariants --excludeFiltered step.

# Extract SNPs
java -Xmx8g -jar -Djava.io.tmpdir=./tmp ./GenomeAnalysisTK-3.4-0/GenomeAnalysisTK.jar -T SelectVariants -R AQPM00000000_NN.fasta -V female1_S5_001_AQPM00000000_GATK_VariantCall2.vcf -selectType SNP -o female1_S5_001_AQPM00000000_GATK_VariantCall_SNPs_v2.vcf 

# Filter SNPs (records failing the expression are tagged "SNP_v1")
java -Xmx8g -jar -Djava.io.tmpdir=./tmp ./GenomeAnalysisTK-3.4-0/GenomeAnalysisTK.jar -T VariantFiltration -R AQPM00000000_NN.fasta -V female1_S5_001_AQPM00000000_GATK_VariantCall_SNPs_v2.vcf -window 35 -cluster 3 -filterName "SNP_v1" --filterExpression "QD < 2.0 || MQ < 40.0 ||FS > 60.0 || SOR > 4.0 || MQRankSum < -12.5 || ReadPosRankSum < -8.0" -o female1_S5_001_AQPM00000000_GATK_VariantFilter_SNPs_v2.vcf 

# Extract indels    
java -Xmx8g -jar -Djava.io.tmpdir=./tmp ./GenomeAnalysisTK-3.4-0/GenomeAnalysisTK.jar -T SelectVariants -R AQPM00000000_NN.fasta -V female1_S5_001_AQPM00000000_GATK_VariantCall2.vcf -selectType INDEL -o female1_S5_001_AQPM00000000_GATK_VariantCall_INDEL_v2.vcf 

# Filter indels (tagged "INDEL_v1" so failing indels are not labelled with the SNP filter name)
java -Xmx8g -jar -Djava.io.tmpdir=./tmp ./GenomeAnalysisTK-3.4-0/GenomeAnalysisTK.jar -T VariantFiltration -R AQPM00000000_NN.fasta -V female1_S5_001_AQPM00000000_GATK_VariantCall_INDEL_v2.vcf -window 35 -cluster 3 -filterName "INDEL_v1" --filterExpression "QD < 2.0 || ReadPosRankSum < -20.0 || FS > 200.0 || SOR > 10.0" -o female1_S5_001_AQPM00000000_GATK_VariantFilter_INDEL_v2.vcf 


# Use high quality variants to perform base recalibration (BQSR)

# Round 2 of base recalibration for female1: same four steps as round 1, but
# reading the round-1 recalibrated BAM and using the v2 filtered call sets as
# the known sites.

# gatk3 ARGS...: run the GATK 3.4 jar with the memory/tmpdir settings used
# throughout this workflow.
gatk3() {
    java -Xmx8g -jar -Djava.io.tmpdir=./tmp ./GenomeAnalysisTK-3.4-0/GenomeAnalysisTK.jar "$@"
}

sample=female1_S5_001_AQPM00000000_GATK
reads_in="${sample}_recal_reads_v1.bam"
snp_sites="${sample}_VariantFilter_SNPs_v2.vcf"
indel_sites="${sample}_VariantFilter_INDEL_v2.vcf"
table_before="${sample}_recal_data_v2.table"
table_after="${sample}_post_recal_v2.table"

# 1. Analyze patterns of covariation in the sequence dataset
gatk3 -T BaseRecalibrator -R AQPM00000000_NN.fasta -I "$reads_in" \
    -knownSites "$snp_sites" -knownSites "$indel_sites" \
    -o "$table_before"

# 2. Do a second pass to analyze covariation remaining after recalibration
gatk3 -T BaseRecalibrator -R AQPM00000000_NN.fasta -I "$reads_in" \
    -knownSites "$snp_sites" -knownSites "$indel_sites" \
    -BQSR "$table_before" \
    -o "$table_after"

# 3. Generate before/after plots
gatk3 -T AnalyzeCovariates -R AQPM00000000_NN.fasta \
    -before "$table_before" -after "$table_after" \
    -plots "${sample}_recal_plots_v2.pdf"

# 4. Apply the recalibration to sequence data
gatk3 -T PrintReads -R AQPM00000000_NN.fasta -I "$reads_in" \
    -BQSR "$table_before" \
    -o "${sample}_recal_reads_v2.bam"



###############
# Do third round of base recalibration
# Run calling, filtering and recalibration in a single script

####
# Use re-calibrated reads to re-call variants

# Round 3 for female1: call variants on the round-2 recalibrated BAM.
java -Xmx8g -jar -Djava.io.tmpdir=./tmp ./GenomeAnalysisTK-3.4-0/GenomeAnalysisTK.jar -T HaplotypeCaller -R AQPM00000000_NN.fasta -I female1_S5_001_AQPM00000000_GATK_recal_reads_v2.bam -dontUseSoftClippedBases -stand_call_conf 20.0 -stand_emit_conf 20.0 -o female1_S5_001_AQPM00000000_GATK_VariantCall3.vcf 


####
# Extract high quality SNPs and INDELs to use for another round of base recalibration
# Use same filters as above
# NOTE(review): VariantFiltration only annotates the FILTER column; failing
# records stay in the output VCF. Confirm whether the consumer of these files
# needs a SelectVariants --excludeFiltered step.

# Extract SNPs
java -Xmx8g -jar -Djava.io.tmpdir=./tmp ./GenomeAnalysisTK-3.4-0/GenomeAnalysisTK.jar -T SelectVariants -R AQPM00000000_NN.fasta -V female1_S5_001_AQPM00000000_GATK_VariantCall3.vcf -selectType SNP -o female1_S5_001_AQPM00000000_GATK_VariantCall_SNPs_v3.vcf 

# Filter SNPs (records failing the expression are tagged "SNP_v1")
java -Xmx8g -jar -Djava.io.tmpdir=./tmp ./GenomeAnalysisTK-3.4-0/GenomeAnalysisTK.jar -T VariantFiltration -R AQPM00000000_NN.fasta -V female1_S5_001_AQPM00000000_GATK_VariantCall_SNPs_v3.vcf -window 35 -cluster 3 -filterName "SNP_v1" --filterExpression "QD < 2.0 || MQ < 40.0 ||FS > 60.0 || SOR > 4.0 || MQRankSum < -12.5 || ReadPosRankSum < -8.0" -o female1_S5_001_AQPM00000000_GATK_VariantFilter_SNPs_v3.vcf 

# Extract indels    
java -Xmx8g -jar -Djava.io.tmpdir=./tmp ./GenomeAnalysisTK-3.4-0/GenomeAnalysisTK.jar -T SelectVariants -R AQPM00000000_NN.fasta -V female1_S5_001_AQPM00000000_GATK_VariantCall3.vcf -selectType INDEL -o female1_S5_001_AQPM00000000_GATK_VariantCall_INDEL_v3.vcf 

# Filter indels (tagged "INDEL_v1" so failing indels are not labelled with the SNP filter name)
java -Xmx8g -jar -Djava.io.tmpdir=./tmp ./GenomeAnalysisTK-3.4-0/GenomeAnalysisTK.jar -T VariantFiltration -R AQPM00000000_NN.fasta -V female1_S5_001_AQPM00000000_GATK_VariantCall_INDEL_v3.vcf -window 35 -cluster 3 -filterName "INDEL_v1" --filterExpression "QD < 2.0 || ReadPosRankSum < -20.0 || FS > 200.0 || SOR > 10.0" -o female1_S5_001_AQPM00000000_GATK_VariantFilter_INDEL_v3.vcf 


#####
# Use high quality variants to perform base recalibration (BQSR)

# Round 3 of base recalibration for female1: input is the round-2
# recalibrated BAM, known sites are the v3 filtered SNP and indel call sets.
fasta=AQPM00000000_NN.fasta
tag=female1_S5_001_AQPM00000000_GATK
jar=./GenomeAnalysisTK-3.4-0/GenomeAnalysisTK.jar

# 1. Analyze patterns of covariation in the sequence dataset
java -Xmx8g -jar -Djava.io.tmpdir=./tmp "$jar" -T BaseRecalibrator \
    -R "$fasta" \
    -I "${tag}_recal_reads_v2.bam" \
    -knownSites "${tag}_VariantFilter_SNPs_v3.vcf" \
    -knownSites "${tag}_VariantFilter_INDEL_v3.vcf" \
    -o "${tag}_recal_data_v3.table"

# 2. Do a second pass to analyze covariation remaining after recalibration
java -Xmx8g -jar -Djava.io.tmpdir=./tmp "$jar" -T BaseRecalibrator \
    -R "$fasta" \
    -I "${tag}_recal_reads_v2.bam" \
    -knownSites "${tag}_VariantFilter_SNPs_v3.vcf" \
    -knownSites "${tag}_VariantFilter_INDEL_v3.vcf" \
    -BQSR "${tag}_recal_data_v3.table" \
    -o "${tag}_post_recal_v3.table"

# 3. Generate before/after plots; stdout and stderr of this step are captured
#    in *_recal_plots_v3.out and *_recal_plots_v3.err
java -Xmx8g -jar -Djava.io.tmpdir=./tmp "$jar" -T AnalyzeCovariates \
    -R "$fasta" \
    -before "${tag}_recal_data_v3.table" \
    -after "${tag}_post_recal_v3.table" \
    -plots "${tag}_recal_plots_v3.pdf" \
    > "${tag}_recal_plots_v3.out" \
    2> "${tag}_recal_plots_v3.err"

# 4. Apply the recalibration to sequence data
java -Xmx8g -jar -Djava.io.tmpdir=./tmp "$jar" -T PrintReads \
    -R "$fasta" \
    -I "${tag}_recal_reads_v2.bam" \
    -BQSR "${tag}_recal_data_v3.table" \
    -o "${tag}_recal_reads_v3.bam"



########
# Final variant calling and filtering on *recal_reads_v3.bam
# Every path used below is relative (./tmp, ./GenomeAnalysisTK-3.4-0, the
# reference and the BAM/VCF files), so the commands must run from the project
# directory. GATK_WORKDIR overrides the original hard-coded location; abort if
# the directory cannot be entered so the commands never run from an
# unintended place.
cd "${GATK_WORKDIR:-/project/meisel/users/rpmeisel}" || exit 1

# Variant Calling (female1, round-3 recalibrated reads)
java -Xmx8g -jar -Djava.io.tmpdir=./tmp ./GenomeAnalysisTK-3.4-0/GenomeAnalysisTK.jar -T HaplotypeCaller -R AQPM00000000_NN.fasta -I female1_S5_001_AQPM00000000_GATK_recal_reads_v3.bam -dontUseSoftClippedBases -stand_call_conf 20.0 -stand_emit_conf 20.0 -o female1_S5_001_AQPM00000000_GATK_VariantCall_final.vcf 

# Variant Filtering: two named filters, "FS" (FS > 30.0) and "QD" (QD < 2.0),
# plus the cluster filter configured by -window 35 -cluster 3
java -Xmx8g -jar -Djava.io.tmpdir=./tmp ./GenomeAnalysisTK-3.4-0/GenomeAnalysisTK.jar -T VariantFiltration -R AQPM00000000_NN.fasta -V female1_S5_001_AQPM00000000_GATK_VariantCall_final.vcf -window 35 -cluster 3 -filterName FS -filter "FS > 30.0" -filterName QD -filter "QD < 2.0" -o female1_S5_001_AQPM00000000_GATK_VariantFilter_final.vcf 

# Output records for all sites and samples (for joint genotyping):
java -Xmx8g -jar -Djava.io.tmpdir=./tmp ./GenomeAnalysisTK-3.4-0/GenomeAnalysisTK.jar -T HaplotypeCaller -R AQPM00000000_NN.fasta -I female1_S5_001_AQPM00000000_GATK_recal_reads_v3.bam -o female1_S5_001_AQPM00000000_GATK_VariantCall_final.g.vcf -ERC GVCF 


########
# Variant calling and filtering using all bam files from each sex to produce a single vcf

# Variant Calling
# Per-sex calling: the three round-3 recalibrated BAMs of a sex are passed
# together to one HaplotypeCaller run, giving one VCF (and one GVCF) per sex.
gatk_jar=./GenomeAnalysisTK-3.4-0/GenomeAnalysisTK.jar
genome=AQPM00000000_NN.fasta
bam_tail=_001_AQPM00000000_GATK_recal_reads_v3.bam

# call_pooled SEX PREFIX1 PREFIX2 PREFIX3
#   Variant calling on the three BAMs; writes SEX_AQPM00000000_GATK_VariantCall_final.vcf.
call_pooled() {
    java -Xmx8g -jar -Djava.io.tmpdir=./tmp "$gatk_jar" -T HaplotypeCaller -R "$genome" \
        -I "${2}${bam_tail}" -I "${3}${bam_tail}" -I "${4}${bam_tail}" \
        -dontUseSoftClippedBases -stand_call_conf 20.0 -stand_emit_conf 20.0 \
        -o "${1}_AQPM00000000_GATK_VariantCall_final.vcf"
}

# filter_pooled SEX
#   Applies the "FS" (FS > 30.0) and "QD" (QD < 2.0) filters to the per-sex VCF.
filter_pooled() {
    java -Xmx8g -jar -Djava.io.tmpdir=./tmp "$gatk_jar" -T VariantFiltration -R "$genome" \
        -V "${1}_AQPM00000000_GATK_VariantCall_final.vcf" \
        -window 35 -cluster 3 \
        -filterName FS -filter "FS > 30.0" \
        -filterName QD -filter "QD < 2.0" \
        -o "${1}_AQPM00000000_GATK_VariantFilter_final.vcf"
}

# gvcf_pooled SEX PREFIX1 PREFIX2 PREFIX3
#   Same inputs as call_pooled, run in -ERC GVCF mode to emit records for all
#   sites (input to joint genotyping).
gvcf_pooled() {
    java -Xmx8g -jar -Djava.io.tmpdir=./tmp "$gatk_jar" -T HaplotypeCaller -R "$genome" \
        -I "${2}${bam_tail}" -I "${3}${bam_tail}" -I "${4}${bam_tail}" \
        -o "${1}_AQPM00000000_GATK_VariantCall_final.g.vcf" \
        -ERC GVCF
}

# Variant Calling
call_pooled female female1_S5 female2_S4 female3_S6
call_pooled male   male1_S2   male2_S3   male3_S1

# Variant Filtering
filter_pooled female
filter_pooled male

# Output records for all sites and samples (for joint genotyping):
gvcf_pooled female female1_S5 female2_S4 female3_S6
gvcf_pooled male   male1_S2   male2_S3   male3_S1



######
# Joint genotyping

# Joint genotyping of the female and male GVCFs, followed by the same FS/QD
# hard filters used for the per-sex call sets.
genome=AQPM00000000_NN.fasta
gatk_jar=./GenomeAnalysisTK-3.4-0/GenomeAnalysisTK.jar

# Genotype: combine both per-sex GVCFs into JointGenotype.vcf
java -Xmx8g -jar -Djava.io.tmpdir=./tmp "$gatk_jar" \
    -T GenotypeGVCFs \
    -R "$genome" \
    -V female_AQPM00000000_GATK_VariantCall_final.g.vcf \
    -V male_AQPM00000000_GATK_VariantCall_final.g.vcf \
    -o JointGenotype.vcf

# Filter variants from Joint Genotyping
java -Xmx8g -jar -Djava.io.tmpdir=./tmp "$gatk_jar" \
    -T VariantFiltration \
    -R "$genome" \
    -V JointGenotype.vcf \
    -window 35 -cluster 3 \
    -filterName FS -filter "FS > 30.0" \
    -filterName QD -filter "QD < 2.0" \
    -o JointGenotype_Filtered.vcf



