#!/bin/bash

#####################################
##   Postprocessing of VCF files   ##
#####################################

######### Step 1. Preparation #########
# Software required
# SnpSift (https://github.com/pcingola/SnpSift)
# SnpEff (https://github.com/pcingola/SnpEff)

# dbSNP-style variant file used to annotate known variants.
# mm10: a single file holding both SNPs and indels.
snp_file_mm10='path_to_dbSNP_dir/MGP_v5_snp_and_indels.exclude_wild.vcf.gz'
# mm39: SNPs and indels are provided as two separate files.
# snp_file_mm39="path_to_dbSNP_dir/MGP_v8_snps_exclude_wild.rsID.vcf.gz"
# indel_file_mm39="path_to_dbSNP_dir/MGP_v8_indels_exclude_wild.rsID.vcf.gz"

# Genome database name handed to SnpEff.
# The database itself was fetched beforehand with the SnpEff download command.
reference_genome='GRCm38.75' # mm10
#reference_genome="GRCm39.105" # mm39

######### Step 2. Integration and Annotation #########
# Verify the headers of VCF files and re-order if necessary (normal, tumor)
#for file in *.vcf; do
#    file_name="${file%.vcf}"
#    echo $file_name $(grep "#CHROM" $file)
#done

# Integrate the "##" meta lines of all input VCF files, without duplicates,
# and start tmp1.vcf with them followed by the 13-column header line.
#   list   - sample IDs; the input of each sample is "<ID>_H.m2.filt.vcf"
#   header - column header; holds literal "\t" escapes that are expanded
#            when the line is printed
list=('RY1114T6' 'RY1114T7' 'RY1115T2' 'RY1115T6')

# The first line of every input is skipped (tail -n +2) and a single
# ##fileformat line is written up front instead.
# The whole loop is redirected with ">" so that a temp_comments file left
# behind by an aborted run is overwritten rather than appended to.
for item in "${list[@]}"; do
    file="${item}_H.m2.filt.vcf"
    tail -n +2 "$file" | grep '^##'
done > temp_comments

{
    echo "##fileformat=VCFv4.2"
    sort -u temp_comments
} > original_comments

header="#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tnormal\ttumor\tsample\tmutID"
{ cat original_comments; printf '%b\n' "$header"; } > tmp1.vcf

# Append the data rows of every input VCF to tmp1.vcf, adding two columns:
#   column 12 = sample ID, column 13 = mutation ID (CHROM_POS_REF>ALT).
# Rows are dropped when they are header lines, when the chromosome name
# contains "_" (chrUn_*, chrN_*_random) or when ALT lists several alleles.
# The file name is quoted so that an ID with whitespace or glob characters
# still reaches awk as a single argument.
for item in "${list[@]}"; do
    file="${item}_H.m2.filt.vcf"
    awk -v item="${item}" '
        BEGIN { OFS = "\t" }
        /^#/ || $1 ~ /_/ || $5 ~ /,/ { next }
        {
            $12 = item
            $13 = $1 "_" $2 "_" $4 ">" $5
            print
        }
    ' "$file"
done >> tmp1.vcf

# Annotate genomic variants using a database.
#   tmp1.vcf -> SnpSift annotate (variant database) -> tmp2.vcf
#   tmp2.vcf -> SnpEff -canon (genome database)     -> tmp3.vcf
# Each tool is checked: if one fails, the script stops here instead of
# running the remaining steps on an empty or truncated intermediate file.
if ! SnpSift annotate "$snp_file_mm10" tmp1.vcf > tmp2.vcf; then
    echo "ERROR: SnpSift annotate failed on tmp1.vcf" >&2
    exit 1
fi
#SnpSift annotate "$snp_file_mm39" tmp1.vcf > tmp1_2.vcf
#SnpSift annotate "$indel_file_mm39" tmp1_2.vcf > tmp2.vcf
if ! SnpEff -canon "$reference_genome" tmp2.vcf > tmp3.vcf; then
    echo "ERROR: SnpEff failed on tmp2.vcf" >&2
    exit 1
fi

# Add new comments and columns: start integrated.vcf with
#   1. the "##" meta lines of the annotated file (tmp3.vcf),
#   2. a description of every column of the final table,
#   3. the 26-column header line.
grep '^##' tmp3.vcf > new_comments

# Names of the integrated input files, one per sample ID in "list".
input_files=()
for item in "${list[@]}"; do
    input_files+=("${item}_H.m2.filt.vcf")
done

# Columns 18-20 are documented in the order in which the data rows and the
# header line carry them: consequence, impact, gene.
cat >> new_comments <<EOF
## Integrated files are from ${input_files[*]}
## column1-11: VCF format
## column12: name of the sample
## column13: mutation ID (Chr_POS_REF>ALT)
## column14-15: depth of normal REF, ALT
## column16-17: depth of tumor REF, ALT
## column18: consequence of mutation
## column19: impact of mutation
## column20: affected gene
## column21: hard filter (normal and tumor depth >= 10, normal ALT depth = 0, tumor ALT depth >= 3)
## column22: mutation type
## column23: change in single base pair
## column24: change in Watson-Crick base pair
## column25: number of mutations with the same mutID
## column26: pass all filters
EOF

# additional_header starts with "\t", so it is glued directly to header;
# a separating blank would end up inside the "mutID" column name.
additional_header="\tnRef\tnAlt\ttRef\ttAlt\tconseq\timpact\tgene\thardFilter\ttype\tchange\tchangeWC\tcount\tpassALL"
{ cat new_comments; printf '%b\n' "${header}${additional_header}"; } > integrated.vcf

# Pull read depths and the first annotation out of each data row of tmp3.vcf
# and write the result to tmp4.vcf.
#   Stage 1 stores raw sub-fields in scratch columns 14-16:
#     14 = 2nd ":"-separated item of column 10 (normal sample)
#     15 = 2nd ":"-separated item of column 11 (tumor sample)
#     16 = everything after "ANN=" in column 8 (INFO)
#   Stage 2 splits them into columns 17-23:
#     17,18 = the two ","-separated values of column 14
#     19,20 = the two ","-separated values of column 15
#     21-23 = "|"-separated items 2, 3 and 4 of column 16
#   Finally cut drops the scratch columns, so the new values land in 14-20.
# Any missing value is written as "NA".
awk '
    function or_na(value) { return (value == "") ? "NA" : value }
    BEGIN { OFS = "\t" }
    /^#/ { next }
    {
        split($10, normal_fmt, ":")
        split($11, tumor_fmt, ":")
        split($8, info_parts, "ANN=")
        $14 = or_na(normal_fmt[2])
        $15 = or_na(tumor_fmt[2])
        $16 = or_na(info_parts[2])
        print
    }
' tmp3.vcf |
    awk '
        function or_na(value) { return (value == "") ? "NA" : value }
        BEGIN { OFS = "\t" }
        {
            split($14, normal_ad, ",")
            split($15, tumor_ad, ",")
            split($16, ann, "|")
            $17 = or_na(normal_ad[1])
            $18 = or_na(normal_ad[2])
            $19 = or_na(tumor_ad[1])
            $20 = or_na(tumor_ad[2])
            $21 = or_na(ann[2])
            $22 = or_na(ann[3])
            $23 = or_na(ann[4])
            print
        }
    ' |
    cut -f 1-13,17- > tmp4.vcf

######### Step 3. Tag for filtering #########
# Hard filter, written to column 21 ("PASS" or "."):
#   normal depth (col 14 + col 15) >= 10, tumor depth (col 16 + col 17) >= 10,
#   tumor ALT depth (col 17) >= 3, normal ALT depth (col 15) = 0
awk '
    BEGIN { OFS = "\t" }
    {
        normal_depth = $14 + $15
        tumor_depth = $16 + $17
        $21 = (normal_depth >= 10 && tumor_depth >= 10 && $17 >= 3 && $15 == 0) ? "PASS" : "."
        print
    }
' tmp4.vcf > tmp5.vcf

# Classify every row of tmp5.vcf and write the result to tmp6.vcf.
#   Stage 1: column 22 = "snv" when REF and ALT lengths add up to 2,
#            otherwise "indel"; column 23 = "REF>ALT" for snv, "." for indel.
#   Stage 2: column 24 = Watson-Crick base pair change for the twelve
#            single-base changes; any other value of column 23 is copied.
awk '
    BEGIN { OFS = "\t" }
    {
        is_snv = (length($4) + length($5) == 2)
        $22 = is_snv ? "snv" : "indel"
        $23 = is_snv ? ($4 ">" $5) : "."
        print
    }
' tmp5.vcf |
    awk '
        BEGIN {
            OFS = "\t"
            # Each base change and its reverse complement share one label.
            pair["T>A"] = pair["A>T"] = "A:T>T:A"
            pair["T>C"] = pair["A>G"] = "A:T>G:C"
            pair["T>G"] = pair["A>C"] = "A:T>C:G"
            pair["C>A"] = pair["G>T"] = "G:C>T:A"
            pair["C>G"] = pair["G>C"] = "G:C>C:G"
            pair["C>T"] = pair["G>A"] = "G:C>A:T"
        }
        {
            $24 = ($23 in pair) ? pair[$23] : $23
            print
        }
    ' > tmp6.vcf

# Count the number of mutID: tmp6.vcf is read twice.
#   Pass 1 (NR == FNR) tallies how often each mutID (column 13) occurs.
#   Pass 2 prints every row with that tally appended as column 25.
# The output is written with ">" so that a tmp7.vcf left over from an
# aborted run is replaced instead of being extended with duplicate rows.
awk '
    BEGIN { OFS = "\t" }
    NR == FNR { count[$13]++; next }
    { print $0, count[$13] }
' tmp6.vcf tmp6.vcf > tmp7.vcf

# Final tag in column 26: "PASS" when every criterion below holds, else ".".
# 1. Chr = 1-19X (chromosome name contains neither M nor Y)
# 2. ID = . (not dbSNP)
# 3. FILTER = PASS (FilterMutectCall PASS)
# 4. hardFilter = PASS (Normal and tumor depth >= 10, tumor ALT >= 3, normal ALT = 0)
# 5. count = 1 (not shared with other tumors)
# Rows are appended to integrated.vcf, below the header written earlier.

awk '
    BEGIN { OFS = "\t" }
    {
        kept_chrom = ($1 !~ /[MY]/)
        not_in_dbsnp = ($3 == ".")
        caller_pass = ($7 == "PASS")
        hard_pass = ($21 == "PASS")
        private_to_tumor = ($25 == 1)
        $26 = (kept_chrom && not_in_dbsnp && caller_pass && hard_pass && private_to_tumor) ? "PASS" : "."
        print
    }
' tmp7.vcf >> integrated.vcf

######### Step 4. Filtering and Splitting #########
# Keep only what passed every filter and write it to total_pass.vcf:
#   - header/comment lines, except the column descriptions (lines that
#     start with "#" and contain "column"),
#   - data rows whose column 26 is "PASS".
# cut then keeps columns 1-13 plus column 26.
awk '
    /^#/ && !/column/ { print; next }
    $26 == "PASS"
' integrated.vcf | cut -f 1-13,26 > total_pass.vcf

# Split into individual tumors: for every sample ID in "list" write
# "<ID>.pass.vcf" with a ##fileformat line, the #CHROM header line of
# total_pass.vcf and the rows whose column 12 equals the sample ID.
# Both parts go through one quoted redirect, so the file is created in a
# single step and the name stays intact whatever characters the ID holds.
for item in "${list[@]}"; do
    {
        echo "##fileformat=VCFv4.2"
        awk -v item="${item}" '/^#CHROM/ || $12 == item' total_pass.vcf
    } > "${item}.pass.vcf"
done

# Remove the intermediate files of this pipeline.
# The files are named one by one: a glob such as "tmp*" or "*comments" would
# also delete unrelated files that happen to sit in the working directory.
# tmp1_2.vcf only exists when the mm39 annotation variant is used; -f keeps
# rm quiet about files that are not there.
rm -f -- tmp1.vcf tmp1_2.vcf tmp2.vcf tmp3.vcf tmp4.vcf tmp5.vcf tmp6.vcf tmp7.vcf
rm -f -- original_comments temp_comments new_comments