#!/bin/bash

# Ensure that your machine has at least 30-40 free cores for efficient processing of fastq and bam files.
# If needed, adjust thread parameters or remove them.

# Work from the directory that holds the fastq files and the reference fasta files;
# stop immediately when it cannot be entered.
if ! cd /storage/XXXXX/XXXXXXX/; then
    exit 1
fi

# Start from an empty scratch directory: remove whatever a previous run left behind
rm -rf processing_tmp
if ! mkdir processing_tmp; then
    exit 1
fi

# Stage the reads and the references in the scratch directory.
# NOTE(review): the '*.fa*' pattern also matches '*.fastq.gz' names, so every fastq
# file appears twice on the cp command line — confirm this overlap is intended.
if ! cp *.fastq.gz *.fa* processing_tmp/; then
    exit 1
fi

# Every later step runs inside the scratch directory
if ! cd processing_tmp; then
    exit 1
fi

# Make the Cutadapt 4.1 installation usable: its Python 3.9 package directories go
# into PYTHONPATH and its bin directory is put at the front of PATH.
PYTHONPATH=/programs/cutadapt-4.1/lib/python3.9/site-packages:/programs/cutadapt-4.1/lib64/python3.9/site-packages
PATH=/programs/cutadapt-4.1/bin:$PATH
export PYTHONPATH PATH

# Trim the 3' adapter (-a) from every fastq file with Cutadapt and keep only reads that
# are 18-30 nt long after trimming (-m 18 -M 30). Ten files are processed at a time,
# each Cutadapt process using 4 cores (-j 4).
# File names are handed to parallel directly from the shell glob (:::) instead of being
# parsed out of `ls` output.
# {.} removes only the last extension, so sample.fastq.gz produces
# sample.fastq_trimmed.fastq.gz.
parallel -j 10 'cutadapt -j 4 -a AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC -m 18 -M 30 {} -o {.}_trimmed.fastq.gz' ::: *.fastq.gz

# Run FastQC on the trimmed fastq files, ten FastQC processes at a time (-t 4 each).
# --noextract keeps each report as a .zip archive; -q suppresses progress messages.
# File names come straight from the shell glob (:::) rather than from `ls` output.
parallel -j 10 'fastqc {} -t 4 --noextract -q' ::: *trimmed.fastq.gz

# Collect the zipped FastQC reports in a directory next to processing_tmp.
# -p is required for re-runs: this directory sits outside processing_tmp, so it is not
# removed by the cleanup at the top of the script and may already exist.
mkdir -p ../fastqc_data_postTrimming
mv *.zip ../fastqc_data_postTrimming

# Shorten the fastq file names: keep the "_"-separated fields 1, 2, 3 and 6 of the
# current name and append ".fastq.gz" to the result.
# NOTE(review): the extension is appended to whatever field 6 contains — confirm the
# resulting names are the ones the later globs in this script expect.
for fq in *.fastq.gz; do
    short_name=$(cut -d "_" -f1,2,3,6 <<< "$fq")
    mv "$fq" "${short_name}.fastq.gz"
done

# Build Bowtie2 index for rRNA reference
bowtie2-build -f dmel_refGenes_rRNA_compiled_dedupe.fa dmel_refGenes_rRNA_compiled

# Map the trimmed reads to the rRNA reference, ten samples at a time, 4 threads each.
#   -N 1 -L 15 -k 1 : one mismatch allowed in a 15-nt seed, report at most one alignment
#   -q -U <file>    : unpaired FASTQ input
#   --un-gz <file>  : reads that did NOT align to rRNA, gzip-compressed (input of the
#                     sncRNA step)
#   -S <file>       : SAM output. bowtie2's -S takes the file name as its argument, and
#                     -o is --offrate (an integer), so the SAM name must follow -S.
# Output names: <input without .gz>.nonrRNA.fastq.gz and <input without .gz>.rRNA.sam
parallel -j 10 'bowtie2 -p 4 -N 1 -L 15 -k 1 -x dmel_refGenes_rRNA_compiled -q -U {} --un-gz {.}.nonrRNA.fastq.gz -S {.}.rRNA.sam' ::: *trimmed.fastq.gz

# Move rRNA mapping files to a separate directory
mkdir -p rRNA_quant
mv *rRNA.sam rRNA_quant/

# Convert the rRNA SAM files to BAM, sort, and index.
# This has to happen inside rRNA_quant: the SAM files were just moved there, so a
# '*.sam' glob in the processing directory would match nothing at this point.
# The subshell keeps the working directory of the rest of the script unchanged.
# The sort step globs '*rRNA.bam' so it never picks up an already sorted '*.sort.bam'.
(
    cd rRNA_quant || exit 1
    parallel -j 5 'samtools view -bS -@ 10 {} >{.}.bam' ::: *.sam
    parallel -j 5 'samtools sort -@ 10 {} >{.}.sort.bam' ::: *rRNA.bam
    parallel -j 5 'samtools index -@ 10 {}' ::: *sort.bam
)

# Environment for deepTools 3.5.1: a UTF-8 locale, its bin directory at the front of
# PATH, and its Python 3.6 package directories in PYTHONPATH.
# PYTHONPATH is assigned afresh here; it does not keep any value set earlier.
LC_ALL=en_US.utf-8
LANG=en_US.utf-8
PATH=/programs/deeptools-3.5.1/bin:$PATH
PYTHONPATH=/programs/deeptools-3.5.1/lib64/python3.6/site-packages:/programs/deeptools-3.5.1/lib/python3.6/site-packages/
export LC_ALL LANG PATH PYTHONPATH

# Build Bowtie2 index for sncRNA reference
bowtie2-build -f dmel_sncRNA_compiledV7.fa dmel_all_sncRNA_compiled

# Map the rRNA-filtered reads (files ending in rRNA.fastq.gz) to the sncRNA reference,
# ten samples at a time, 4 threads each.
#   -q -U <file>   : unpaired FASTQ input
#   --un-gz <file> : reads that did NOT align to any sncRNA, gzip-compressed
#   -S <file>      : SAM output. bowtie2's -S takes the file name as its argument, and
#                    -o is --offrate (an integer), so the SAM name must follow -S.
# No nohup here: placed in front of `ls` it only covered the file listing, not the
# mapping jobs. Run the whole script under nohup if hang-up protection is needed.
parallel -j 10 'bowtie2 -p 4 -N 1 -L 12 -k 1 -x dmel_all_sncRNA_compiled -q -U {} --un-gz {.}.NONsncRNA.fastq.gz -S {.}.sncRNA.sam' ::: *rRNA.fastq.gz

# Move the sncRNA SAM files to a separate directory.
# The sncRNA mapping step writes *.sncRNA.sam files; nothing in this script has
# written a BAM file into the processing directory by this point, so the SAM files
# are what has to be moved. This also keeps them out of the later '*.sam' conversions.
mkdir -p sncRNA_mapping
mv *sncRNA.sam sncRNA_mapping/

# Filter piRNA-sized small RNAs (23-30nt) from the reads that aligned to neither
# rRNA nor sncRNA.
# PYTHONPATH is set for this one command: the exported PYTHONPATH was re-pointed at the
# deepTools (Python 3.6) packages earlier in the script, so Cutadapt is given its own
# Python 3.9 package directories again here.
# NOTE(review): assumes Cutadapt needs these directories on PYTHONPATH, as the setup at
# the top of the script implies — confirm for this installation.
# No nohup here: in front of `ls` it only covered the file listing, not Cutadapt.
PYTHONPATH=/programs/cutadapt-4.1/lib/python3.9/site-packages:/programs/cutadapt-4.1/lib64/python3.9/site-packages \
    parallel -j 10 'cutadapt -m 23 -M 30 {} -o {.}_piRNA.fastq.gz' ::: *NONsncRNA.fastq.gz

# Map piRNAs to TE references with bowtie (version 1).
# The reads are decompressed first and bowtie is given the plain .fastq files.
gunzip *piRNA.fastq.gz

# The index base name is "Dmel_repmod2_TE_filtered_library" (no .fasta); the same base
# name must be passed to bowtie -x below.
bowtie-build Dmel_repmod2_TE_filtered_library.fasta Dmel_repmod2_TE_filtered_library

# Two mapping passes, both with at most 1 mismatch in a 12-nt seed (-n 1 -l 12):
#   -a -m 1 : keep only reads with a single reportable alignment  -> *_TEfam_uniq.sam
#   -a -m 3 : keep reads with up to three reportable alignments   -> *_TEfam_multi.sam
# In bowtie 1, -S is a plain switch (SAM output) and -o means --offrate, so the output
# file is given as the last positional argument, after the reads file.
parallel -j 10 'bowtie -S -p 4 -n 1 -l 12 -a -m 1 -x Dmel_repmod2_TE_filtered_library -q {} {.}_TEfam_uniq.sam' ::: *piRNA.fastq
parallel -j 10 'bowtie -S -p 4 -n 1 -l 12 -a -m 3 -x Dmel_repmod2_TE_filtered_library -q {} {.}_TEfam_multi.sam' ::: *piRNA.fastq

# Convert the TE alignments to BAM, sort, and index. The globs are limited to this
# step's own files so that other SAM/BAM files in the directory are not reprocessed
# and sorted BAMs are not fed back into the sort.
parallel -j 5 'samtools view -bS -@ 10 {} >{.}.bam' ::: *_TEfam_uniq.sam *_TEfam_multi.sam
parallel -j 5 'samtools sort -@ 10 {} >{.}.sort.bam' ::: *_TEfam_uniq.bam *_TEfam_multi.bam
parallel -j 5 'samtools index -@ 10 {}' ::: *_TEfam_uniq.sort.bam *_TEfam_multi.sort.bam

# Map piRNAs from each strain to respective genome assemblies.
#
# One bowtie index per <strain>_genome.fa, named <strain>_genome ({.} drops ".fa").
# The mapping below uses bowtie (version 1), so the index is built with bowtie-build,
# the same builder used for the TE library, rather than bowtie2-build.
parallel -j 10 'bowtie-build --threads 4 -f {} {.}' ::: *genome.fa

# For each replicate, map every <strain>_ovary_<rep>_piRNA.fastq to <strain>_genome,
# keeping only reads with a single reportable alignment (-a -m 1).
# The reads are plain .fastq at this point (they were gunzipped before the TE mapping).
# In bowtie 1, -S is a plain switch; the SAM file is the last positional argument.
# NOTE(review): relies on reads being named <strain>_ovary_<rep>_piRNA.fastq — confirm
# that the earlier renaming/filtering steps produce exactly these names.
for rep in rep1 rep2; do
    strains=()
    for reads in *_ovary_"${rep}"_piRNA.fastq; do
        [[ -e "$reads" ]] || continue   # glob matched nothing
        strains+=("${reads%_ovary_${rep}_piRNA.fastq}")
    done
    if (( ${#strains[@]} == 0 )); then
        printf 'warning: no *_ovary_%s_piRNA.fastq files found; skipping %s\n' "$rep" "$rep" >&2
        continue
    fi
    parallel -j 8 "bowtie -S -p 5 -n 1 -l 12 -a -m 1 -x {}_genome -q {}_ovary_${rep}_piRNA.fastq {}_ovary_${rep}_piRNA_genome_uniq.sam" ::: "${strains[@]}"
done

# Convert the genome alignments to BAM, sort, and index. The globs are limited to this
# step's own files so the TE-family SAM/BAM files are not converted and sorted again.
parallel -j 5 'samtools view -bS -@ 10 {} >{.}.bam' ::: *_genome_uniq.sam
parallel -j 5 'samtools sort -@ 10 {} >{.}.sort.bam' ::: *_genome_uniq.bam
parallel -j 5 'samtools index -@ 10 {}' ::: *_genome_uniq.sort.bam

# Get piRNA coverage in 500bp bookended bins for IGV visualization.
# Each sorted BAM yields two bedGraph tracks, one per strand:
#   *_500bp.FW.bedGraph : --filterRNAstrand forward
#   *_500bp.RV.bedGraph : --filterRNAstrand reverse, multiplied by -1 (--scaleFactor -1)
#                         so the reverse strand is drawn below the axis
# File names come straight from the shell glob (:::) rather than from `ls` output.
parallel -j 10 'bamCoverage -bs 500 -b {} -of bedgraph -p 4 -o {.}_500bp.FW.bedGraph --filterRNAstrand forward' ::: *sort.bam
parallel -j 10 'bamCoverage -bs 500 -b {} -of bedgraph -p 4 -o {.}_500bp.RV.bedGraph --scaleFactor -1 --filterRNAstrand reverse' ::: *sort.bam
