#!/bin/bash

# Input paths: the VCF to be filtered and the reference sequence handed to
# GATK via -R. Adjust both to your own directories.
# Note: shell assignments take no leading "$", and "$HOME" is used instead
# of "~" because a tilde inside double quotes is not expanded.
INFILE="$HOME/mydirectory/inputdata/genotyped_merged.g.vcf"
REFERENCE="$HOME/mydirectory/inputdata/GCF_001858045.2_O_niloticus_UMD_NMBU_genomic_edit.fna"


# Load the command-line tools used below (gatk, vcftools, bcftools, and
# htslib for bgzip).
module load gatk/4.1.4.1
module load vcftools
module load bcftools
module load htslib/1.10.2

# Every intermediate and final file is written relative to this directory.
# Abort if it cannot be entered so that nothing ends up in whatever
# directory the script happened to be started from.
cd ~/mydirectory/filter || {
  echo "cannot change into ~/mydirectory/filter" >&2
  exit 1
}
###########################################################################
##assess initial file
###########################################################################
# Count the lines of the input VCF that contain no "#" (i.e. the records)
# and start a fresh summary file; each later step appends its own count.
# "$INFILE" is quoted so a path with spaces or glob characters stays one word.
SNPS_IN=$(grep -c -v "#" -- "$INFILE")
echo "initial number SNPs $SNPS_IN" > OverViewSNPsremoval_VariantsOnly.txt

###########################################################################
##apply hard quality filters per site
###########################################################################
echo "start applying hard filters"
# Step 1a: tag each site that matches one of the expressions below with the
# corresponding filter name. The last option line deliberately carries no
# trailing backslash, so the command ends here regardless of what follows.
gatk VariantFiltration -R "$REFERENCE" \
  -O SelectedVariantsFiltered_tmp1.vcf.gz \
  -V "$INFILE" \
  --filter-name "LowMQ" --filter-expression "MQ < 40.0" \
  --filter-name "StrandBias" --filter-expression "FS > 40.0" \
  --filter-name "QualityByDepth" --filter-expression "QD < 2.0" \
  --filter-name "LowDepth" --filter-expression "DP < 20" \
  --filter-name "HighDepth" --filter-expression "DP > 200" \
  --filter-name "StrandOddsRatio" --filter-expression "SOR > 9.0" \
  --filter-name "MQRankSum" --filter-expression "MQRankSum < -12.5" \
  --filter-name "ReadPosRankSum" --filter-expression "ReadPosRankSum < -8.0"

# Step 1b: keep only SNP records that were not tagged by any filter above.
gatk SelectVariants -R "$REFERENCE" \
  -V SelectedVariantsFiltered_tmp1.vcf.gz \
  --exclude-filtered \
  --select-type-to-include SNP \
  -O SelectedVariantsFiltered_tmp2.vcf.gz

# Number of records left after the site-level filters.
SNPS_RET=$(zgrep -c  -v "#" SelectedVariantsFiltered_tmp2.vcf.gz)

echo "done applying hard filters"
echo "retained SNPs step 1: $SNPS_RET" >> OverViewSNPsremoval_VariantsOnly.txt
echo "retained SNPs step 1: $SNPS_RET"

###########################################################################
##per-genotype hard quality filters
##a genotype matching the exclusion expression is set to missing (".")
###########################################################################
echo "start applying genotype filters"
geno_filter_in=SelectedVariantsFiltered_tmp2.vcf.gz
geno_filter_out=SelectedVariantsFiltered_tmp3.vcf.gz
# Exclusion expression: per-sample depth below 3 or genotype quality below 20.
bcftools filter \
  --set-GTs . \
  --exclude 'FMT/DP<3 | FMT/GQ<20' \
  --output-type z \
  --output "$geno_filter_out" \
  "$geno_filter_in"
SNPS_RET=$(zgrep -c -v "#" "$geno_filter_out")

echo "done applying genotype filters"
echo "retained SNPs step 2: $SNPS_RET" >> OverViewSNPsremoval_VariantsOnly.txt
echo "retained SNPs step 2: $SNPS_RET"
###########################################################################
## after the genotype masking above:
##   1. trim ALT alleles that are no longer seen in any genotype
##   2. recompute the AN/AC fields with fill-an-ac
##   3. drop invariant sites (AC==0, or AC==AN i.e. everyone is alt)
###########################################################################
echo "start applying allele trimming, filling and filtering"
trim_in=SelectedVariantsFiltered_tmp3.vcf.gz
trimmed=SelectedVariantsFiltered_tmp4.vcf.gz
refilled=SelectedVariantsFiltered_tmp5.vcf.gz
variant_only=SelectedVariantsFiltered_tmp6.vcf.gz

bcftools view --trim-alt-alleles -O z -o "$trimmed" "$trim_in"

zcat "$trimmed" \
  | fill-an-ac \
  | bgzip -c > "$refilled"

bcftools view -e 'AC==0 || AC==AN' -O z -o "$variant_only" "$refilled"

SNPS_RET=$(zgrep -c -v "#" "$variant_only")
echo "done applying allele trimming, filling and filtering"
echo "retained SNPs step 3: $SNPS_RET" >> OverViewSNPsremoval_VariantsOnly.txt
echo "retained SNPs step 3: $SNPS_RET"

###########################################################################
##apply hard filter for missingness
###########################################################################
echo "start applying missigness filters"
# --max-missing-count 0 removes every site that has at least one missing
# genotype; --recode-INFO-all carries the INFO fields over to the output.
vcftools --gzvcf SelectedVariantsFiltered_tmp6.vcf.gz \
  --max-missing-count 0 \
  --recode --recode-INFO-all --stdout \
  | bgzip -c > SelectedVariantsFiltered_tmp7.vcf.gz
SNPS_RET=$(zgrep -c -v "#" SelectedVariantsFiltered_tmp7.vcf.gz)
echo "done applying missigness filters"
echo "retained SNPs step 4: $SNPS_RET" >> OverViewSNPsremoval_VariantsOnly.txt
# Also report the count on stdout, as the other numbered steps do.
echo "retained SNPs step 4: $SNPS_RET"

###########################################################################
##rewrite haploid-looking missing genotypes (".:") as diploid ("./.:")
###########################################################################
echo "start formating missing genotypes"
# The substitution replaces "<whitespace>.:" with "<tab>./.:" on every line.
zcat SelectedVariantsFiltered_tmp7.vcf.gz \
  | perl -pe 's/\s\.:/\t.\/.:/g' \
  | bgzip -c > SelectedVariantsFiltered_tmp8.vcf.gz
echo "done formating missing genotypes"

###########################################################################
##mapping mask: drop sites inside the regions listed in the BED file
###########################################################################
echo "start applying mapping mask"
mask_bed=/share/pool/CompGenomVert/RefGenomes/Orenil/all_mask_100_90.bed
masked_out=SelectedVariantsFiltered_tmp9.vcf.gz
# NOTE(review): --recode is used here without --recode-INFO-all, unlike the
# missingness step; per the vcftools manual this drops the INFO fields from
# the output. Confirm that this is intended.
vcftools \
  --gzvcf SelectedVariantsFiltered_tmp8.vcf.gz \
  --exclude-bed "$mask_bed" \
  --recode --stdout \
  | bgzip -c > "$masked_out"
SNPS_RET=$(zgrep -c -v "#" "$masked_out")

echo "done applying mapping mask"
echo "retained SNPs step 5: $SNPS_RET" >> OverViewSNPsremoval_VariantsOnly.txt
echo "retained SNPs step 5: $SNPS_RET"

###########################################################################
###next steps only on VariantsOnlyFile
###########################################################################

###########################################################################
##biallelic sites only (exactly two alleles)
###########################################################################
echo "start keeping only biallelic SNPs"
biallelic_out=SelectedVariantsFiltered_tmp10.vcf.gz
# NOTE(review): as in the mask step, --recode is used without
# --recode-INFO-all; confirm that dropping INFO fields is intended.
vcftools \
  --gzvcf SelectedVariantsFiltered_tmp9.vcf.gz \
  --min-alleles 2 \
  --max-alleles 2 \
  --recode --stdout \
  | bgzip -c > "$biallelic_out"
SNPS_RET=$(zgrep -c -v "#" "$biallelic_out")

echo "done keeping only biallelic SNPs"
echo "retained SNPs step 6: $SNPS_RET" >> OverViewSNPsremoval_VariantsOnly.txt
echo "retained SNPs step 6: $SNPS_RET"

###########################################################################
##remove spanning deletions
###########################################################################
echo "start removing spanning deletions"
# Exclude records whose ALT allele is "*" and let bcftools write the
# bgzip-compressed result itself (-O z), as the earlier bcftools steps do.
# Writing an uncompressed file and piping into a bgzip that reads that same
# file would start both programs at once, so bgzip could compress a file
# that is still empty or only partly written.
bcftools view -e 'ALT="*"' -O z -o BenithosSNPsOnly.final.vcf.gz \
  SelectedVariantsFiltered_tmp10.vcf.gz
SNPS_RET=$(zgrep -c -v "#" BenithosSNPsOnly.final.vcf.gz)

echo "done removing spanning deletions"
echo "retained SNPs final: $SNPS_RET" >> OverViewSNPsremoval_VariantsOnly.txt
# Also report the count on stdout, as the numbered steps do.
echo "retained SNPs final: $SNPS_RET"

###########################################################################
##drop every record on the chromosome named UNPLACED
###########################################################################
echo "start removing UNPLACED"
final_no_unplaced=BenithosSNPsOnly.final.NoUNP.vcf.gz
vcftools \
  --gzvcf BenithosSNPsOnly.final.vcf.gz \
  --not-chr UNPLACED \
  --recode --recode-INFO-all --stdout \
  | bgzip -c > "$final_no_unplaced"
SNPS_RET=$(zgrep -c -v "#" "$final_no_unplaced")

echo "done removing UNPLACED"
echo "retained SNPs final no UNPLACED: $SNPS_RET" >> OverViewSNPsremoval_VariantsOnly.txt
echo "retained SNPs final no UNPLACED: $SNPS_RET"

# Remove intermediate files, one rm per file.
# NOTE(review): tmp7 and tmp10 are not in this list and therefore stay on
# disk; confirm whether they are kept on purpose.
for tmp_step in 1 2 3 4 5 6 8 9; do
  rm SelectedVariantsFiltered_tmp${tmp_step}.vcf.gz
done


