#!/bin/bash

#SBATCH -q batch
#SBATCH --job-name=extract_sequencing_counts
#SBATCH --ntasks=1
#SBATCH --nodes=1
#SBATCH --time=72:00:00
#SBATCH --mem=150G


###################
# Extract Read Group Info
###################
# Builds two globals used by the rest of the script:
#   RG2LB        associative array: read group ID -> library name (LB)
#   unique_libs  indexed array: sorted, de-duplicated library names
#
# Environment read here:
#   CONTAINER  directory holding samtools_v1.9-4-deb_cv1.sif
#   CRAM       alignment file whose header is inspected
declare -A RG2LB

#######################################
# Parse SAM header text and record the ID -> LB mapping of every @RG line.
# Globals:   RG2LB (written)
# Arguments: none
# Inputs:    SAM header lines on stdin
# Returns:   0
#######################################
parse_rg_header() {
    local line field rg lb
    local -a fields

    while IFS= read -r line || [[ -n $line ]]; do
        # Only read-group records are of interest.
        [[ $line == '@RG'$'\t'* ]] || continue

        # Header records are tab-separated TAG:VALUE fields. Splitting on
        # tabs and matching the tag at the START of a field avoids picking
        # up "ID:" / "LB:" substrings inside other values (e.g.
        # "SM:SAMPLEID:x" or a DS: description) and keeps values that
        # contain spaces intact.
        rg=''
        lb=''
        IFS=$'\t' read -r -a fields <<< "$line"
        for field in "${fields[@]:1}"; do
            case $field in
                ID:*) rg=${field#ID:} ;;
                LB:*) lb=${field#LB:} ;;
            esac
        done

        # Read groups without a library cannot be attributed; skip them.
        if [[ -n $rg && -n $lb ]]; then
            RG2LB["$rg"]=$lb
        fi
    done
}

parse_rg_header < <(
    singularity exec "$CONTAINER/samtools_v1.9-4-deb_cv1.sif" samtools view -H "$CRAM"
)

# Get all unique library names. mapfile keeps one name per array element, so
# names are neither word-split nor glob-expanded.
mapfile -t unique_libs < <(
    for lb in "${RG2LB[@]}"; do
        printf '%s\n' "$lb"
    done | sort | uniq
)

if (( ${#unique_libs[@]} == 0 )); then
    printf 'warning: no @RG header line with both ID and LB found in %s\n' \
        "${CRAM:-<CRAM is unset>}" >&2
fi

###################
# Write Output
###################
# Writes a tab-separated table to $OUT with one row per region in $BED:
#   sample, chr, pos (the region start as given), then one read count per
#   library in unique_libs order.
#
# Environment read here:
#   CONTAINER  directory holding samtools_v1.9-4-deb_cv1.sif
#   CRAM       alignment file to query
#   BED        whitespace-separated regions: chr start end [extra columns]
#   SAMPLE     value written in the first column of every row
#   OUT        output file (overwritten)
# Globals read: RG2LB (read group ID -> library), unique_libs (column order)

#######################################
# Print the header line of the table.
# Globals:   unique_libs (read)
# Outputs:   header line on stdout
#######################################
print_header() {
    local lib
    # printf '%s' prints library names literally (echo -e would interpret
    # backslashes) and builds the header the same way as the data rows.
    printf 'sample\tchr\tpos'
    for lib in "${unique_libs[@]}"; do
        printf '\t%s' "$lib"
    done
    printf '\n'
}

#######################################
# Count RG:Z: tags in SAM records.
# Inputs:    SAM alignment records on stdin
# Outputs:   "<read group ID><TAB><count>" lines on stdout, unordered
#######################################
tally_read_groups() {
    # SAM records are tab-separated and optional tags start at column 12.
    # The ID is everything after the 5-character "RG:Z:" prefix, so IDs that
    # themselves contain ':' or spaces are kept whole.
    awk -F'\t' '
        {
            for (i = 12; i <= NF; i++) {
                if ($i ~ /^RG:Z:/) {
                    id = substr($i, 6)
                    if (id != "") seen[id]++
                }
            }
        }
        END {
            for (id in seen) printf "%s\t%d\n", id, seen[id]
        }
    '
}

#######################################
# Print one table row with the per-library read counts of a region.
# Globals:   CONTAINER, CRAM, SAMPLE, RG2LB, unique_libs (all read)
# Arguments: $1 chromosome, $2 start, $3 end
# Outputs:   one row on stdout; a warning on stderr if samtools fails
#######################################
print_region_row() {
    local chr=$1 start=$2 end=$3
    # NOTE(review): BED intervals are 0-based half-open while samtools
    # regions are 1-based inclusive; start/end are passed through unchanged,
    # exactly as before - confirm that this is intended.
    local region="${chr}:${start}-${end}"
    local -A lib_counts=()
    local lib rg n lb tallies status=0

    for lib in "${unique_libs[@]}"; do
        lib_counts["$lib"]=0
    done

    # samtools' own stderr stays suppressed as before, but a failing query is
    # reported instead of silently yielding a row of zeros. stdin is detached
    # so the container command can never consume the BED stream feeding the
    # caller's read loop.
    tallies=$(
        set -o pipefail
        singularity exec "$CONTAINER/samtools_v1.9-4-deb_cv1.sif" \
            samtools view "$CRAM" "$region" 2>/dev/null </dev/null |
            tally_read_groups
    ) || status=$?
    if (( status != 0 )); then
        printf 'warning: samtools view failed (status %d) for region %s\n' \
            "$status" "$region" >&2
    fi

    # Add each read group's count to its library; read groups that are not
    # in RG2LB are ignored.
    while IFS=$'\t' read -r rg n; do
        [[ -n $rg && -n $n ]] || continue
        lb=${RG2LB[$rg]-}
        if [[ -n $lb ]]; then
            lib_counts["$lb"]=$(( ${lib_counts["$lb"]:-0} + n ))
        fi
    done <<< "$tallies"

    # Print row with sample name
    printf '%s\t%s\t%s' "$SAMPLE" "$chr" "$start"
    for lib in "${unique_libs[@]}"; do
        printf '\t%s' "${lib_counts[$lib]}"
    done
    printf '\n'
}

{
    print_header

    # The trailing "_" absorbs any BED columns beyond the third so they do not
    # end up in "end"; the "|| [[ -n $chr ]]" keeps a final line that lacks a
    # trailing newline.
    while read -r chr start end _ || [[ -n $chr ]]; do
        # Skip blank lines, comments and BED track/browser header lines.
        case $chr in
            '' | '#'* | track | browser) continue ;;
        esac
        print_region_row "$chr" "$start" "$end"
    done < "$BED"
} > "$OUT"
