#!/bin/bash
###############ensemble pipeline########################################################
#
# The interpreter line above has to be the very first line of the file;
# anywhere else it is just a comment.

# Path to the reference genome FASTA used by every step; fill in before running.
reference=

########PART I Alignment

#step1: map reads to reference

#1) index the reference
# Three index files are derived from the same FASTA: the BWA index, the
# samtools FASTA index and the sequence dictionary (${reference}.dict).
sentieon bwa index "${reference}"
samtools faidx "${reference}"
java -jar CreateSequenceDictionary.jar REFERENCE="${reference}" OUTPUT="${reference}.dict"

#2) clean fastq file

# Sample name prefix shared by all per-sample files below; fill in before running.
sample=

# Run fastp on the paired-end reads (R1/R2) with 24 threads and write the
# cleaned pair next to the inputs as ${sample}_cleaned_R{1,2}.fastq.gz.
fastp \
    -w 24 \
    -i ${sample}_R1.fastq.gz \
    -I ${sample}_R2.fastq.gz \
    -o ${sample}_cleaned_R1.fastq.gz \
    -O ${sample}_cleaned_R2.fastq.gz

#3) mapping to reference

# Read-group ID and sequencing platform written into the @RG header line;
# fill in before running.
id=
pl=

# Align the fastp-cleaned read pair and pipe the SAM stream into the sorter.
#  - The read-group string is double-quoted so ${id}, ${sample} and ${pl} are
#    expanded; the \t sequences stay literal and are left for bwa to interpret.
#  - The inputs are the files the fastp step writes (${sample}_cleaned_R*.fastq.gz).
#  - "-i -" (ASCII hyphen) makes the sorter read from standard input.
#  - If bwa fails, the text 'error' is injected into the pipe, presumably so the
#    sorter receives invalid SAM and fails too instead of writing a truncated BAM.
(sentieon bwa mem -M -R "@RG\tID:${id}\tSM:${sample}\tPL:${pl}" \
   -K 100000000 \
   -t 24 "${reference}" \
   "${sample}_cleaned_R1.fastq.gz" \
   "${sample}_cleaned_R2.fastq.gz" || echo -n 'error') \
| sentieon util sort -r "${reference}" -o "${sample}_sorted.bam" -t 24 --sam2bam -i -


#step2 Calculate data metrics:

# All metric tables and plots are written below ./metrics.
mkdir -p ./metrics

# Collect GC bias, mean quality by cycle, quality distribution, insert size
# and alignment statistics in a single pass over the sorted BAM.
# The reference is taken from ${reference}, like every other step, instead of
# a machine-specific absolute path.
sentieon driver -t 24 -r "${reference}" -i "${sample}_sorted.bam" \
    --algo GCBias --summary "./metrics/${sample}_GC_SUMMARY.txt" "./metrics/${sample}_GC_METRIC.txt" \
    --algo MeanQualityByCycle "./metrics/${sample}_MQ_METRIC.txt" \
    --algo QualDistribution "./metrics/${sample}_QD_METRIC.txt" \
    --algo InsertSizeMetricAlgo "./metrics/${sample}_IS_METRIC.txt" \
    --algo AlignmentStat "./metrics/${sample}_ALN_METRIC.txt"

# Render one PDF per metric table (AlignmentStat has no plot).
sentieon plot GCBias -o "./metrics/${sample}_GC_METRIC.pdf" "./metrics/${sample}_GC_METRIC.txt"
sentieon plot MeanQualityByCycle -o "./metrics/${sample}_MQ_METRIC.pdf" "./metrics/${sample}_MQ_METRIC.txt"
sentieon plot QualDistribution -o "./metrics/${sample}_QD_METRIC.pdf" "./metrics/${sample}_QD_METRIC.txt"
sentieon plot InsertSizeMetricAlgo -o "./metrics/${sample}_IS_METRIC.pdf" "./metrics/${sample}_IS_METRIC.txt"

#step3 Remove or mark duplicates:

# Set rmdup to any non-empty value (e.g. rmdup=1) to pass --rmdup to the Dedup
# algo and remove duplicate reads; leave it empty to run Dedup without it.
# (The usage placeholder "[--rmdup]" must not be passed to the driver literally.)
rmdup=

mkdir -p ./deduped_bam

# Pass 1: collect duplicate score information from the sorted BAM.
sentieon driver -t 24 -i "${sample}_sorted.bam" \
  --algo LocusCollector --fun score_info "./deduped_bam/${sample}_SCORE.gz"

# Pass 2: deduplicate using the score file and write the metrics and the
# deduplicated BAM. ${rmdup:+--rmdup} is intentionally unquoted so that it
# expands to nothing when rmdup is empty.
sentieon driver -t 24 -i "${sample}_sorted.bam" \
  --algo Dedup ${rmdup:+--rmdup} --score_info "./deduped_bam/${sample}_SCORE.gz" \
  --metrics "./deduped_bam/${sample}_DEDUP_METRIC.txt" "./deduped_bam/${sample}_sorted_deduped.bam"


#step4 Indel realignment

# Optional VCF of known sites handed to the Realigner with -k; leave empty to
# run without one. (The usage placeholder "[-k KNOWN_SITES]" must not be
# passed to the driver literally.)
known_sites=

mkdir -p ./realigned_bam

# NOTE(review): the Dedup step writes its BAM under ./deduped_bam/, while this
# step reads ${sample}_sorted_deduped.bam from the current directory — confirm
# the intended working directory.
# ${known_sites:+...} expands to the two words "-k <file>" only when
# known_sites is non-empty.
sentieon driver -t 24 -r "${reference}" \
  -i "${sample}_sorted_deduped.bam" \
  --algo Realigner \
  ${known_sites:+-k "${known_sites}"} \
  "./realigned_bam/${sample}_sorted_deduped_realigned.bam"


#step5 base quality score recalibration (BQSR)

mkdir -p ./BQSR

# NOTE(review): the Realigner step writes its BAM under ./realigned_bam/, while
# this step reads ${sample}_sorted_deduped_realigned.bam from the current
# directory — confirm the intended working directory.

# Pass 1: build the recalibration table. The input is the same realigned BAM
# that pass 2 recalibrates (previously an undefined $i).
sentieon driver -t 24 -r "${reference}" \
  -i "${sample}_sorted_deduped_realigned.bam" \
  --algo QualCal "./BQSR/${sample}_RECAL_DATA.TABLE"

# Pass 2: apply the table (-q), compute the post-recalibration table and write
# the recalibrated BAM.
sentieon driver -t 24 -r "${reference}" -i "${sample}_sorted_deduped_realigned.bam" \
  -q "./BQSR/${sample}_RECAL_DATA.TABLE" \
  --algo QualCal "./BQSR/${sample}_RECAL_DATA.TABLE.POST" \
  --algo ReadWriter \
  "./BQSR/${sample}_sorted_deduped_realigned_recalibrated.bam"

# Compare the before/after tables and write the result as CSV.
sentieon driver -t 24 --algo QualCal --plot \
  --before "./BQSR/${sample}_RECAL_DATA.TABLE" \
  --after "./BQSR/${sample}_RECAL_DATA.TABLE.POST" \
  "./BQSR/${sample}_RECAL_RESULT.CSV"

# Plot the CSV written just above. The name must match exactly: a single "$"
# (a doubled "$$" is the shell's PID) and the same upper-case .CSV extension.
sentieon plot QualCal -o "./BQSR/${sample}_BQSR.pdf" "./BQSR/${sample}_RECAL_RESULT.CSV"

#########PART II Variants calling

#####SNP:

# Call variants for one sample from its recalibrated BAM with the Genotyper
# algo; the output name ends in "_SNP_vcf.gz", which is the suffix the merge
# step globs for.
# NOTE(review): the BQSR step writes the recalibrated BAM under ./BQSR/, while
# it is read from the current directory here — confirm the working directory.
sentieon driver \
  -t 24 \
  -r ${reference} \
  -i ${sample}_sorted_deduped_realigned_recalibrated.bam \
  --algo Genotyper ./variants/snp_indel/${sample}_SNP_vcf.gz



#merge the per-individual VCFs found in the current directory into one file
bcftools merge \
  -m all \
  -Oz -o merged_SNP.vcf.gz \
  *_SNP_vcf.gz

#keep biallelic SNPs only (exactly two alleles), dropping multiallelic sites
bcftools view \
  -m 2 -M 2 \
  -v snps \
  -Oz -o merged_SNP_biallelic.vcf.gz \
  merged_SNP.vcf.gz

#filter SNP

# Index the merged biallelic VCF for GATK.
gatk IndexFeatureFile -F merged_SNP_biallelic.vcf.gz

# Annotate records with hard-filter flags; nothing is removed here. Records
# matching the first expression are tagged SNP_HARD_FAIL, records with
# SOR > 3.0 are tagged SOR_FAIL.
gatk VariantFiltration \
   -R ${reference} \
   -V merged_SNP_biallelic.vcf.gz \
   -O merged_SNP_biallelic_qcfilter06232021.vcf.gz \
   --filter-expression "QD < 2.0 || MQ < 40.0 || MQRankSum < -12.5 || ReadPosRankSum < -8.0 || FS > 60.0" \
   --filter-name "SNP_HARD_FAIL" \
   --filter-expression "SOR > 3.0" \
   --filter-name "SOR_FAIL"

# Stop with a failure status when VariantFiltration produced no output.
# A bare "exit" would return the status of the preceding echo (success), and
# the diagnostic belongs on stderr.
if [ ! -f ./merged_SNP_biallelic_qcfilter06232021.vcf.gz ]; then
   echo "filtered file does not exist, aborting." >&2
   exit 1
fi

# The filtered file exists from here on, so no second existence test is needed.
# Drop every record that carries a filter flag.
gatk SelectVariants \
  -R "${reference}" \
  -V merged_SNP_biallelic_qcfilter06232021.vcf.gz \
  -O merged_SNP_biallelic_qcfilter06232021_excluded.vcf.gz \
  --exclude-filtered


#filtration by site quality (--minQ 30) and genotype depth (--minDP 30);
#vcftools writes the result to <out>.recode.vcf with all INFO fields kept
vcftools \
   --gzvcf merged_SNP_biallelic_qcfilter06232021_excluded.vcf.gz \
   --out merged_SNP_biallelic_qcfilter06232021_excluded_minQ30_minDP30 \
   --minQ 30 --minDP 30 \
   --recode --recode-INFO-all


#######SV:

#step1 SV calling using different callers

#sentieon

# First pass: DNAscope with --var_type bnd writes an intermediate VCF.
sentieon driver \
  -t 24 \
  -r ${reference} \
  -i ${sample}_sorted_deduped_realigned_recalibrated.bam \
  --algo DNAscope --var_type bnd \
  ./variants/sv_sentieon/${sample}_TMP_VARIANT.vcf

# Second pass: SVSolver turns the intermediate VCF into the final SV calls.
sentieon driver \
  -t 24 \
  -r ${reference} \
  --algo SVSolver -v ./variants/sv_sentieon/${sample}_TMP_VARIANT.vcf \
  ./variants/sv_sentieon/${sample}_sentieon.vcf

#delly

# Call SVs for one sample and write them as BCF.
# NOTE(review): Delly's usage text lists the BAM as a positional argument and
# -q as a mapping-quality threshold rather than a thread count — confirm that
# "-i <bam>" and "-q 24" are accepted by the installed version.
delly call -q 24 -g ${reference} \
  -i ${sample}_sorted_deduped_realigned_recalibrated.bam \
  -o ./variants/sv_delly/${sample}_DELLY.bcf

#smoove

# Call and genotype SVs for one sample with a single process; results are
# written into ./variants/sv_smoove/.
smoove call \
  --name ${sample} --fasta ${reference} \
  --processes 1 \
  --outdir ./variants/sv_smoove/ \
  --genotype \
  ${sample}_sorted_deduped_realigned_recalibrated.bam

#manta

# Two stages: configManta.py creates a per-sample run directory containing
# runWorkflow.py, which is executed only if the configuration succeeded.
~/manta-1.6.0.release_src/bin/configManta.py \
  --bam ${sample}_sorted_deduped_realigned_recalibrated.bam \
  --referenceFasta ${reference} \
  --runDir ./variants/sv_manta/${sample} &&
  ./variants/sv_manta/${sample}/runWorkflow.py

#cnvnator

# Load the reads of chromosomes Chr01..Chr10 from the BAM into ${sample}.root
# and, if that succeeds, call CNVs with a bin size of 1000.
# NOTE(review): CNVnator's documented workflow has -his, -stat and -partition
# steps between -tree and -call, and -call prints to stdout — confirm that
# these are handled elsewhere.
cnvnator -root ${sample}.root \
  -tree ${sample}_sorted_deduped_realigned_recalibrated.bam \
  -chrom $(seq -f 'Chr0%g' 1 9) Chr10 &&
  cnvnator -root ${sample}.root -call 1000


#step2 For each sample, merge the VCF files from the different callers into one

# The file list names this sample's per-caller VCFs. Per SURVIVOR's usage text
# the numeric arguments are, in order: max breakpoint distance (1000), minimum
# number of supporting callers (2), require same type (1), require same strand
# (1), the disabled distance-estimation flag (0) and minimum SV size (30).
SURVIVOR merge ./sv_integrated/${sample}_sentieon_cnvnator_delly_manta_smoove.filelist \
  1000 2 1 1 0 30 \
  ./sv_integrated/${sample}_SV_fusion.vcf

#step3 rebuild genotyped fusion vcf for each sample

# pick_genotype_field KEY1 KEY2 KEY3 KEY4 KEY5 [FIELD...]
#
# Chooses, among the per-caller sample FIELDs of one SV record, the one that
# carries the record's identifier. The five KEYs are tried from most to least
# specific, each as a literal substring of every FIELD:
#   - exactly one FIELD contains the key -> print that FIELD
#   - no FIELD contains the key          -> try the next key
#   - several FIELDs contain the key     -> print the "mul_*" label of that level
# After the last key, anything other than a single hit prints "no_chr_pos".
# Empty keys never match. Exactly one line is printed per call.
# The label texts are kept as they were in the nested if/elif chain this
# replaces.
pick_genotype_field() {
    local key1="${1-}" key2="${2-}" key3="${3-}" key4="${4-}" key5="${5-}"
    local level key label field hit hits
    # Drop the keys so that "$@" holds only the candidate fields.
    if [ "$#" -ge 5 ]; then shift 5; else shift "$#"; fi

    for level in 1 2 3 4 5; do
        case "$level" in
            1) key=$key1; label=mul_svtype_id_chr_pos_end ;;
            2) key=$key2; label=mul_id_chr_pos_end ;;
            3) key=$key3; label=mul_chr_pos_end ;;
            4) key=$key4; label=mul_chr_end ;;
            5) key=$key5; label=no_chr_pos ;;
        esac

        hits=0
        hit=
        if [ -n "$key" ]; then
            for field in "$@"; do
                case "$field" in
                    # The quoted key is matched literally, so "." is not a wildcard.
                    *"$key"*) hits=$((hits + 1)); hit=$field ;;
                esac
            done
        fi

        if [ "$hits" -eq 1 ]; then
            printf '%s\n' "$hit"
            return 0
        fi
        if [ "$hits" -gt 1 ] || [ "$level" -eq 5 ]; then
            printf '%s\n' "$label"
            return 0
        fi
    done
}

# For every fusion VCF in the current directory, collapse the five per-caller
# sample columns (10-14) into one genotype column named after the sample.
# Intermediate files are created in the current directory and removed only
# when the genotyped VCF was written.
for i in *.vcf; do
    # An unmatched glob stays literal; skip it instead of processing "*.vcf".
    [ -e "$i" ] || continue

    h=${i%_SV_fusion.vcf}   # sample name, becomes the genotype column header
    j=${i%.vcf}             # prefix of this file's outputs
    gt_file="${j}_gt.txt"

    # Split the VCF into meta header, fixed columns 1-9 (including the #CHROM
    # line) and the per-caller sample columns of the records.
    grep "^##" "$i" > header
    grep -v "^##" "$i" | cut -f1-9 > vcf.info
    grep -v "^##" "$i" | tail -n +2 | cut -f10-14 > ft.txt

    # Build the five identifier keys per record, from most to least specific.
    bcftools query -f"%SVTYPE@%ID#%CHROM\_%POS-%CHR2\_%END\n" "$i" | sed "s/:/_/g" | sed -e "s/#/:NA:NA:/g" -e "s/@/:/g" > svtype_id_chr_pos_end.txt
    bcftools query -f"%SVTYPE@%ID#%CHROM\_%POS\n" "$i" | sed "s/:/_/g" | sed -e "s/#/:NA:NA:/g" -e "s/@/:/g" > svtype_id_chr_pos.txt
    bcftools query -f"%ID#%CHROM\_%POS\n" "$i" | sed "s/:/_/g" | sed -e "s/#/:NA:NA:/g" | sed "s/^\.:NA:NA://g" > id_chr_pos.txt
    bcftools query -f"%CHROM\_%POS\n" "$i" | sed "s/:/_/g" > chr_pos.txt
    bcftools query -f"%POS\n" "$i" > pos.txt
    paste -d "\t" svtype_id_chr_pos_end.txt svtype_id_chr_pos.txt id_chr_pos.txt chr_pos.txt pos.txt ft.txt > identifier_ft.txt

    # Start from an empty file so that a re-run does not append to stale
    # results and fail the line-count check below.
    : > "$gt_file"
    # Each line holds the five keys followed by up to five sample fields,
    # split on whitespace; anything beyond ten columns is ignored.
    while read -r key1 key2 key3 key4 key5 ft1 ft2 ft3 ft4 ft5 _; do
        pick_genotype_field "$key1" "$key2" "$key3" "$key4" "$key5" \
            "$ft1" "$ft2" "$ft3" "$ft4" "$ft5"
    done < identifier_ft.txt >> "$gt_file"

    # Assemble the output only when every record received exactly one line.
    if [ "$(grep -c -v '^$' "$gt_file")" -eq "$(grep -c -v '^$' identifier_ft.txt)" ]; then
        sed -i "1i$h" "$gt_file"
        paste -d "\t" vcf.info "$gt_file" > vcf.info.gt
        mkdir -p ./extracted_genotyped_ind
        cat header vcf.info.gt > "./extracted_genotyped_ind/${j}_genotyped.vcf" && rm header vcf.info ft.txt svtype_id_chr_pos_end.txt svtype_id_chr_pos.txt id_chr_pos.txt chr_pos.txt pos.txt identifier_ft.txt vcf.info.gt
    else
        # Record the file that could not be rebuilt and keep its intermediates.
        echo "$j" >> extracted_genotyped_ind.log
    fi
done


#step4 merge the genotyped fusion vcf files

# Merge the per-sample genotyped VCFs listed in filelist.txt into
# BAP_363_SV_merged.vcf with Jasmine, using 24 threads and ${reference} as the
# genome file.
# NOTE(review): Jasmine's documented arguments are key=value pairs (e.g.
# max_dist=100) and its type-normalization flag is spelled --normalize_type;
# "JASMINE_DIST 100" and "--normalized_type" do not look like documented
# options — confirm against the installed version.
jasmine JASMINE_DIST 100 \
   --centroid_merging \
   --clique_merging \
   --normalized_type \
   --output_genotypes \
   threads=24 \
   genome_file=${reference} \
   file_list=filelist.txt \
   out_file=BAP_363_SV_merged.vcf

# Second invocation on the same output file: post-processing only, with the
# --dup_to_ins flag set.
jasmine --dup_to_ins --postprocess_only out_file=BAP_363_SV_merged.vcf

#step5 Identify reference genome regions prone to producing false SV calls:
#SVs called in the BTX623 sample itself, and any SVs within 2.5 kb of them,
#are removed from the cohort call set.
#(PI564163 is presumably the BTX623 accession — confirm.)

# 1) Pull out the PI564163 column on its own.
vcftools --vcf BAP_363_SV_merged.vcf \
         --indv PI564163 \
         --recode --recode-INFO-all \
         --out PI564163_sv

# 2) Keep CHROM/POS/GT of the sites where that sample carries an alternate
#    allele (0/1, 1/1 or 1/2).
bcftools query -f"%CHROM\t%POS\t[%GT]\n" PI564163_sv.recode.vcf \
  | grep -e "0/1" -e "1/1" -e "1/2" \
  > PI564163_sv_sites.txt

# 3) Turn each site into a region spanning 2500 bp on either side.
awk '{print $1"\t"$2-2500"\t"$2+2500}' PI564163_sv_sites.txt > PI564163_sv_sites_extend2.5kb.txt

# 4) Drop the PI564163 column from the cohort VCF (363 -> 362 samples).
vcftools --vcf BAP_363_SV_merged.vcf \
         --remove-indv PI564163 \
         --recode --recode-INFO-all \
         --out BAP_362_SV_merged

# 5) Filter the cohort VCF against the region file. Per SURVIVOR's usage text
#    the numeric arguments are min SV size (30), max SV size (1000000), min
#    allele frequency (0.001) and min read support (-1 = disabled).
SURVIVOR filter \
  BAP_362_SV_merged.recode.vcf \
  PI564163_sv_sites_extend2.5kb.txt \
  30 1000000 0.001 -1 \
  BAP_362_SV_merged_excludeBTX623SVsExt2.5k.vcf

##step6 exclude the ungenotyped SVs

# Per-site allele frequencies; vcftools writes them to <out>.frq.
vcftools --freq \
         --vcf BAP_362_SV_merged_excludeBTX623SVsExt2.5k.vcf \
         --out BAP_362_SV_merged_excludeBTX623SVsExt2.5k

# Sites whose fourth .frq column is 0 (N_CHR in vcftools' output, i.e. no
# genotyped chromosomes) are treated as ungenotyped.
awk '{if ($4 == 0)print}' BAP_362_SV_merged_excludeBTX623SVsExt2.5k.frq > ungenotyped_sites.txt

# Write a new VCF without those positions, keeping all INFO fields.
vcftools --vcf BAP_362_SV_merged_excludeBTX623SVsExt2.5k.vcf \
         --exclude-positions ungenotyped_sites.txt \
         --recode --recode-INFO-all \
         --out BAP_362_SV_merged_excludeBTX623SVsExt2.5k_deUngenotyped
