#!/bin/bash

# Create the working directory (no error if it already exists, e.g. on a
# re-run) and switch into it. All intermediate and final files below are
# written with relative names, and the cleanup at the end deletes relative
# paths, so stop immediately if the directory cannot be entered.
mkdir -p ~/mydirectory/filter

cd ~/mydirectory/filter || {
  echo "cannot change into ~/mydirectory/filter" >&2
  exit 1
}

# add path to your input directory
# Shell assignments take no leading "$" on the variable name, and "~" is not
# expanded inside double quotes, so the home directory is spelled as $HOME.
INFILE="$HOME/mydirectory/inputdata/genotypedAllSites_merged.g.vcf"
REFERENCE="$HOME/mydirectory/inputdata/GCF_001858045.2_O_niloticus_UMD_NMBU_genomic_edit.fna"

# load necessary software modules
# Versions are pinned; each module is loaded separately, in the listed order.
for tool in gatk/4.1.4.1 vcftools/0.1.16 bcftools/1.10.2 htslib/1.10.2; do
  module load "$tool"
done


###########################################################################
##access initial file
###########################################################################
# Count the records of the (uncompressed) input VCF: -v "#" discards every
# line containing "#" (the header lines) and -c prints how many lines remain.
# The path is quoted so that an empty or space-containing value can never
# make grep fall back to reading standard input.
SNPS_IN=$(grep -v "#" -c "$INFILE")
# ">" starts a fresh summary file; all later steps append to it.
echo "initial number SNPs $SNPS_IN" > OverViewSNPsremoval_AllSites.txt


###########################################################################
##apply hard quality filters per site
###########################################################################
echo "start applying hard filters"
# Step 1a: tag sites matching any of the expressions below with the given
# filter name. The last option line deliberately has no trailing backslash,
# so the command does not depend on the following line being blank.
gatk VariantFiltration -R "$REFERENCE" \
  -O SelectedVariantsFilteredAllSites_tmp1.vcf.gz \
  -V "$INFILE" \
  --filter-name "LowMQ" --filter-expression "MQ < 40.0" \
  --filter-name "StrandBias" --filter-expression "FS > 40.0" \
  --filter-name "QualityByDepth" --filter-expression "QD < 2.0" \
  --filter-name "LowDepth" --filter-expression "DP < 20" \
  --filter-name "HighDepth" --filter-expression "DP > 200" \
  --filter-name "StrandOddsRatio" --filter-expression "SOR > 9.0" \
  --filter-name "MQRankSum" --filter-expression "MQRankSum < -12.5" \
  --filter-name "ReadPosRankSum" --filter-expression "ReadPosRankSum < -8.0"

# Step 1b: drop the sites tagged above and exclude INDEL-type records.
gatk SelectVariants -R "$REFERENCE" \
  -V SelectedVariantsFilteredAllSites_tmp1.vcf.gz \
  -O SelectedVariantsFilteredAllSites_tmp2.vcf.gz \
  --exclude-filtered --select-type-to-exclude INDEL
# Number of non-header lines left after step 1.
SNPS_RET=$(zgrep -c  -v "#" SelectedVariantsFilteredAllSites_tmp2.vcf.gz)

echo "done applying hard filters"
echo "retained SNPs step 1: $SNPS_RET" >> OverViewSNPsremoval_AllSites.txt
echo "retained SNPs step 1: $SNPS_RET"


###########################################################################
##apply hard quality filters per genotype
##filter individual genotypes and set to missing if below these filters
###########################################################################
echo "start applying genotype filters"
# -e selects genotypes with FMT/DP<3 or FMT/GQ<20; -S . sets those genotypes
# to missing (".") rather than removing the record. Output is bgzipped VCF.
bcftools filter -S . -e 'FMT/DP<3 | FMT/GQ<20' -O z \
  -o SelectedVariantsFilteredAllSites_tmp3.vcf.gz \
  SelectedVariantsFilteredAllSites_tmp2.vcf.gz
# Number of non-header lines left after step 2.
SNPS_RET=$(zgrep -c -v "#" SelectedVariantsFilteredAllSites_tmp3.vcf.gz)

echo "done applying genotype filters"
echo "retained SNPs step 2: $SNPS_RET" >> OverViewSNPsremoval_AllSites.txt
# Also report on stdout, as every other step of this script does.
echo "retained SNPs step 2: $SNPS_RET"

###########################################################################
##drop alternate alleles that are no longer seen after the genotype filter
###########################################################################
echo "start applying allele trimming"
trim_in=SelectedVariantsFilteredAllSites_tmp3.vcf.gz
trim_out=SelectedVariantsFilteredAllSites_tmp4.vcf.gz
fill_out=SelectedVariantsFilteredAllSites_tmp5.vcf.gz

# Trim ALT alleles, then re-compress with bgzip.
bcftools view --trim-alt-alleles "$trim_in" \
  | bgzip -c > "$trim_out"

# Pass the trimmed records through fill-an-ac and re-compress.
zcat "$trim_out" \
  | fill-an-ac \
  | bgzip -c > "$fill_out"

# Number of non-header lines left after step 3.
SNPS_RET=$(zgrep -c -v "#" "$fill_out")
echo "done applying allele trimming and filling"
# tee -a appends the line to the summary file and prints it to stdout.
echo "retained SNPs step 3: $SNPS_RET" | tee -a OverViewSNPsremoval_AllSites.txt

###########################################################################
##apply hard filter for missingness
###########################################################################
echo "start applying missigness filters"
miss_in=SelectedVariantsFilteredAllSites_tmp5.vcf.gz
miss_out=SelectedVariantsFilteredAllSites_tmp6.vcf.gz

# --max-missing-count 0: keep only sites with no missing genotype.
# INFO fields are carried over (--recode-INFO-all); output is bgzipped.
vcftools --gzvcf "$miss_in" --max-missing-count 0 \
  --recode --recode-INFO-all --stdout \
  | bgzip -c > "$miss_out"

# Number of non-header lines left after step 4.
SNPS_RET=$(zgrep -c -v "#" "$miss_out")
# tee -a appends the line to the summary file and prints it to stdout.
echo "retained SNPs step 4: $SNPS_RET" | tee -a OverViewSNPsremoval_AllSites.txt
echo "done applying missigness filters"

###########################################################################
##rewrite genotypes that were set to missing so they are missing diploid
###########################################################################
echo "start formating missing genotypes"
fmt_in=SelectedVariantsFilteredAllSites_tmp6.vcf.gz
fmt_out=SelectedVariantsFilteredAllSites_tmp7.vcf.gz

# Replace every "<whitespace>.:" with "<tab>./.:" on each line, i.e. turn a
# bare "." at the start of a colon-separated field into "./.".
zcat "$fmt_in" \
  | perl -pe "s/\s\.:/\t.\/.:/g" \
  | bgzip -c > "$fmt_out"
echo "done formating missing genotypes"

###########################################################################
##apply mask
###########################################################################
echo "start applying mapping mask"
# Remove every site that falls inside an interval of the mask BED file.
# --recode-INFO-all keeps the INFO column, as in the other vcftools steps;
# without it vcftools drops INFO here and the "keep INFO" flag of the
# following UNPLACED step would have nothing left to keep.
vcftools --gzvcf SelectedVariantsFilteredAllSites_tmp7.vcf.gz \
  --exclude-bed ~/mydirectory/MappabilityMaskOrenil2/all_mask_100_90.bed \
  --recode --recode-INFO-all --stdout \
  | bgzip -c > BenithosAllSites.final.vcf.gz
# Number of non-header lines left after step 5.
SNPS_RET=$(zgrep -c -v "#" BenithosAllSites.final.vcf.gz)
echo "retained SNPs step 5: $SNPS_RET" >> OverViewSNPsremoval_AllSites.txt
echo "retained SNPs step 5: $SNPS_RET"
echo "done applying mapping mask"

###########################################################################
##drop records on the UNPLACED sequence
###########################################################################
echo "start removing UNPLACED"
masked_vcf=BenithosAllSites.final.vcf.gz
placed_vcf=BenithosAllSites.final.NoUNP.vcf.gz

# --not-chr UNPLACED excludes the chromosome named UNPLACED; INFO fields
# are carried over and the result is bgzipped.
vcftools --gzvcf "$masked_vcf" --not-chr UNPLACED \
  --recode --recode-INFO-all --stdout \
  | bgzip -c > "$placed_vcf"

# Number of non-header lines in the final UNPLACED-free file.
SNPS_RET=$(zgrep -c -v "#" "$placed_vcf")
echo "done removing UNPLACED"
# tee -a appends the line to the summary file and prints it to stdout.
echo "retained SNPs final no UNPLACED: $SNPS_RET" | tee -a OverViewSNPsremoval_AllSites.txt

# Remove the intermediate VCFs of steps 1-7 in a single call. GATK normally
# writes a tabix index (.tbi) next to the bgzipped VCFs it produces (tmp1 and
# tmp2 here), so those are removed as well. -f keeps the cleanup quiet when a
# file is absent (e.g. index creation disabled or an earlier step failed);
# "--" ends option parsing before the file names.
rm -f -- \
  SelectedVariantsFilteredAllSites_tmp{1..7}.vcf.gz \
  SelectedVariantsFilteredAllSites_tmp{1,2}.vcf.gz.tbi