# nathan dot lazar at gmail dot com

# Directory /u0/dbase/nl/OOCYTES/ contains scripts and input files
# On exacloud scripts and files are in /home/exacloud/lustre1/users/lazar/OOCYTES

# Append the pipeline's tool directories to PATH, in this order.
# Entries are joined with a bare ':' -- the previous "$PATH/:" form glued a
# stray '/' onto whatever entry happened to be last. ${PATH:+...} avoids
# producing a leading empty entry (which means "current directory") if PATH
# starts out empty.
for tool_dir in \
  /home/exacloud/lustre1/users/lazar/bin \
  /home/exacloud/lustre1/users/lazar/bin/bedtools2/bin \
  /opt/installed \
  /home/exacloud/lustre1/users/lazar/bin/FastQC \
  /home/exacloud/lustre1/users/lazar/bin/fastx
do
  PATH=${PATH:+$PATH:}$tool_dir
done
unset tool_dir
export PATH

# Software versions:
# fastqc: v0.10.1
# Trimmomatic v0.35  # Note: needs to be in a specific directory; see step 3)
# FastUniq           # Note: needs to be in a specific directory; see step 5)
# fastx: v0.0.13
# BWA-mem: 0.7.9a
# samtools: 0.1.19-44428cd

##############################################################################################################
# 1) Collect all raw read files to process

# RAW_READS should have all fastq.gz files (351 total)
# Controls: 18 xx + 18 xy
# First batch: 50 + 14 + 96 (rh)
# Second batch: 155 (151204_Sheep_PicoPlex)

##############################################################################################################
# 2) Run fastqc on the raw reads
# Run FastQC on every raw read file; reports are written to FASTQC/.
# -p keeps a re-run from failing when the directory already exists.
mkdir -p FASTQC
fastqc --outdir FASTQC --threads 6 RAW_READS/*.fq.gz

# condor_submit fastqc.submit

# Look at the number of reads filtered, read length, etc. before trimming
# Write to summary.txt
#
# The work happens in a subshell: if `cd` fails, the `rm` commands below can
# never run in the project root, and the caller's working directory is left
# untouched either way.
(
  cd FASTQC || exit 1

  # Unpack each report archive (globbing instead of parsing `ls` output).
  for archive in *.zip; do
    [ -e "$archive" ] || continue   # the glob matched nothing
    unzip "$archive"
  done

  # Collect the metrics of interest from every unpacked report.
  {
    grep "Total Sequences" */fastqc_data.txt
    grep "Sequence length" */fastqc_data.txt
    grep "Total Deduplicated Percentage" */fastqc_data.txt
    grep "^%GC" */fastqc_data.txt
  } > summary.txt

  # Remove *.html files and every entry whose name ends in "fastqc"
  # (presumably the directories unzip just created -- confirm). Names ending
  # in .zip and summary.txt do not match either pattern, so they are kept.
  rm -f -- *.html
  rm -rf -- *fastqc
)

##############################################################################################################
# 3) Trim low quality bases w/ Trimmomatic

# Trim paired end reads
# Trimmomatic install directory (jar plus adapters/). The single-end loop
# further down reads $dir as well, so the name must stay as it is.
dir=/home/exacloud/lustre1/users/lazar/bin/Trimmomatic-0.35

# Every output below goes to TRIMMED/, which nothing earlier in this file
# creates.
mkdir -p TRIMMED

for F1 in RAW_READS/*R1*.fq.gz; do
  [ -e "$F1" ] || continue   # the glob matched nothing

  # Sample name: the file name with the R1 suffix (either naming scheme) and
  # the directory removed.
  name=${F1/_R1_001.fq.gz/}
  name=${name/.R1.fq.gz}
  name=$(basename "$name")

  # Derive the mate from the R1 *suffix* so that an "R1" inside the sample
  # name is left alone; fall back to first-occurrence replacement for any
  # other naming scheme.
  case $F1 in
    *_R1_001.fq.gz) F2=${F1%_R1_001.fq.gz}_R2_001.fq.gz ;;
    *.R1.fq.gz)     F2=${F1%.R1.fq.gz}.R2.fq.gz ;;
    *)              F2=${F1/R1/R2} ;;
  esac

  if [ ! -e "$F2" ]; then
    printf 'Skipping %s: mate file %s not found\n' "$F1" "$F2" >&2
    continue
  fi

  # Paired-end mode: for each mate, one file of reads that kept their partner
  # (.trim) and one of reads that lost it (.unpaired).
  java -jar "$dir/trimmomatic-0.35.jar" \
    PE -threads 6 -phred33 "$F1" "$F2" \
    "TRIMMED/$name.R1.trim.fq.gz" "TRIMMED/$name.R1.unpaired.fq.gz" \
    "TRIMMED/$name.R2.trim.fq.gz" "TRIMMED/$name.R2.unpaired.fq.gz" \
    "ILLUMINACLIP:$dir/adapters/TruSeq3-PE.fa:2:30:10" LEADING:5 TRAILING:5 \
    SLIDINGWINDOW:4:15 MINLEN:36
done

# condor_submit trim_se.submit
# condor_submit trim_pe.submit

# Trim single end reads
# Outputs go to TRIMMED/; make sure it exists even if this loop is run on its
# own.
mkdir -p TRIMMED

# Single-end inputs are the regular files under RAW_READS/ whose path carries
# no R1/R2 tag. $dir (the Trimmomatic install directory) is set in the
# paired-end section.
for F1 in RAW_READS/*; do
  [ -f "$F1" ] || continue          # skips directories and an unmatched glob
  # The earlier filter, R[1|2], was a bracket expression that also matched
  # a literal "R|"; R[12] matches exactly R1 or R2.
  [[ $F1 =~ R[12] ]] && continue

  # NOTE(review): neither suffix below occurs in a file that passed the R1/R2
  # filter, so $name keeps its full extension and the output ends up as
  # "<file>.fq.gz.trim.fq.gz". Left unchanged because later steps may rely on
  # these names -- confirm before simplifying.
  name=${F1/_R1_001.fq.gz/}
  name=${name/.R1.fq.gz}
  name=$(basename "$name")

  java -jar "$dir/trimmomatic-0.35.jar" \
    SE -threads 6 -phred33 "$F1" \
    "TRIMMED/$name.trim.fq.gz" \
    "ILLUMINACLIP:$dir/adapters/TruSeq3-SE.fa:2:30:10" LEADING:5 TRAILING:5 \
    SLIDINGWINDOW:4:15 MINLEN:36
done

##############################################################################################################
# 4) Run fastqc on the trimmed reads

# Run FastQC on every trimmed read file; reports are written to FASTQC_TRIM/.
# -p keeps a re-run from failing when the directory already exists.
mkdir -p FASTQC_TRIM
fastqc --outdir FASTQC_TRIM --threads 6 TRIMMED/*.fq.gz

# condor_submit fastqc_trim.submit

# Look at the number of reads, read length, etc. after trimming
# Write to summary.txt
#
# The work happens in a subshell: if `cd` fails, the `rm` commands below can
# never run in the project root, and the caller's working directory is left
# untouched either way.
(
  cd FASTQC_TRIM || exit 1

  # Unpack each report archive (globbing instead of parsing `ls` output).
  for archive in *.zip; do
    [ -e "$archive" ] || continue   # the glob matched nothing
    unzip "$archive"
  done

  # Collect the metrics of interest from every unpacked report.
  {
    grep "Total Sequences" */fastqc_data.txt
    grep "Sequence length" */fastqc_data.txt
    grep "Total Deduplicated Percentage" */fastqc_data.txt
    grep "^%GC" */fastqc_data.txt
  } > summary.txt

  # Remove *.html files and every entry whose name ends in "fastqc"
  # (presumably the directories unzip just created -- confirm). Names ending
  # in .zip and summary.txt do not match either pattern, so they are kept.
  rm -f -- *.html
  rm -rf -- *fastqc
)

##############################################################################################################
# 5) Deduplicate (a.k.a. collapse) reads that are exact copies before mapping
#    with fastx for single-end reads and fastuniq for paired-end reads

# Collapse single end reads
mkdir -p COLLAPSED

# Single-end inputs are the trimmed files whose path carries no R1/R2 tag.
for f in TRIMMED/*.trim.fq.gz; do
  [ -e "$f" ] || continue           # the glob matched nothing
  [[ $f =~ R[12] ]] && continue     # R[12], not R[1|2], which also matched "R|"

  # Output stem: the file name with the first ".fq.gz" and the directory
  # removed.
  name=${f/.fq.gz/}
  name=$(basename "$name")
  echo "$name"

  collapsed=COLLAPSED/$name.uniq.fa
  # Compress the file that was actually written. The previous gzip targeted
  # "$name.trim.collapse.fa", which this loop never creates. -f lets a re-run
  # replace an older .gz.
  if gunzip -c "$f" | fastx_collapser -Q33 > "$collapsed"; then
    gzip -f "$collapsed"
  else
    printf 'fastx_collapser failed for %s; %s left uncompressed\n' \
      "$f" "$collapsed" >&2
  fi
done

# condor_submit collapse_se.submit

# Collapse paired-end reads
# Outputs and the per-pair list files go to COLLAPSED/.
mkdir -p COLLAPSED

for F1 in TRIMMED/*R1*trim.fq.gz; do
  [ -e "$F1" ] || continue   # the glob matched nothing

  # Derive the mate from the R1 suffix so that an "R1" inside the sample name
  # is left alone; fall back to first-occurrence replacement otherwise.
  case $F1 in
    *.R1.trim.fq.gz) F2=${F1%.R1.trim.fq.gz}.R2.trim.fq.gz ;;
    *)               F2=${F1/R1/R2} ;;
  esac

  # Output stems: file names with the first ".fq.gz" and the directory removed.
  name1=${F1/.fq.gz/}; name1=$(basename "$name1")
  name2=${F2/.fq.gz/}; name2=$(basename "$name2")

  # The list file below points fastuniq at the uncompressed reads, so both
  # mates are decompressed in place (the .gz files in TRIMMED/ are replaced
  # by plain .fq files). Skip the pair if either step fails rather than
  # running fastuniq on missing input.
  if ! gunzip "$F1" || ! gunzip "$F2"; then
    printf 'Skipping %s: could not decompress it or its mate %s\n' \
      "$F1" "$F2" >&2
    continue
  fi

  # Two-line input list: R1 path, then R2 path, each without the trailing .gz.
  printf '%s\n' "${F1%.gz}" "${F2%.gz}" > "COLLAPSED/$name1.txt"

  fastuniq -i "COLLAPSED/$name1.txt" \
    -o "COLLAPSED/$name1.uniq.fq" -p "COLLAPSED/$name2.uniq.fq"
done

# condor_submit collapse_pe.submit

# Clean up
# Delete the .txt files in COLLAPSED/, then compress whatever uncompressed
# FASTQ (.fq) and FASTA (.fa) files are left there.
rm COLLAPSED/*.txt
for ext in fq fa
do
  gzip COLLAPSED/*.$ext
done

##############################################################################################################
# 6) Map with BWA-MEM

# Note: The genome must first be indexed w/ bwa index

# Map single end reads
mkdir -p MAPPED

# Single-end collapsing writes FASTA (COLLAPSED/<name>.uniq.fa, gzipped to
# .fa.gz), so a *.fq.gz glob alone never sees those files: every *.fq.gz in
# COLLAPSED/ that the collapse step produces carries an R1/R2 tag and is
# filtered out. Both extensions are scanned; anything tagged R1/R2 belongs to
# the paired-end loop.
# NOTE(review): run_bwa_mem_se.sh is not visible from this file -- confirm it
# accepts gzipped FASTA and derives its output names sensibly from *.fa.gz.
for f in COLLAPSED/*.fq.gz COLLAPSED/*.fa.gz; do
  [ -e "$f" ] || continue           # a glob matched nothing
  [[ $f =~ R[12] ]] && continue     # R[12], not R[1|2], which also matched "R|"
  ./run_bwa_mem_se.sh "$f"
done

# condor_submit map_se.submit

# Map paired-end reads: every collapsed file whose path contains "R1" is
# handed to the paired-end helper (how the helper finds the mate is not
# visible from this file).
for read1 in $(ls COLLAPSED/*.fq.gz | grep R1)
do
  ./run_bwa_mem_pe.sh "$read1"
done

# condor_submit map_pe.submit

# Per the notes kept with this pipeline, the mapping helper runs bwa-mem and
# filters for quality of 30, leaving for each input:
#  <read_name>.mapped_reads      # Read counts at each step
#  <read_name>.q30.sort.bam
#  <read_name>.q30.sort.bam.bai

# Summarize the reads mapped, unique positions, etc.
# (output file: MAPPED/read_stats.txt)
./summarize.sh

#############################################################################################################
# 7) Convert bam files to bed files
# Write a .bed next to every quality-filtered, sorted BAM in MAPPED/,
# printing the input and output path of each conversion.
for bam in $(ls MAPPED/*.q30.sort.bam)
do
  bed=${bam/.bam/.bed}   # first ".bam" in the path becomes ".bed"
  printf '%s\n' "$bam" "$bed"
  ~/bedtools2/bin/bamToBed -i "$bam" > "$bed"
done

# condor_submit bamToBed.submit

# Remove leading zeros in the chromosome names (i.e. chr01 needs to be chr1)
#######################################
# Rewrite BED files in place so that a chromosome name at the start of a line
# loses one leading zero (chr01 -> chr1).
# The pattern is anchored to the line start and requires a digit after the
# zero, so later columns that happen to contain "chr0" are left alone and a
# bare "chr0" is not turned into "chr".
# Arguments: paths of the BED files to rewrite; anything that is not a
#            regular file (including an unmatched glob) is skipped.
# Outputs:   a message on stderr for every file sed could not process.
#######################################
strip_chr_leading_zero() {
  local bed scratch
  for bed in "$@"; do
    [ -f "$bed" ] || continue
    # Per-file scratch name next to the target, instead of a shared "tmp" in
    # the working directory.
    scratch=$bed.tmp.$$
    # Replace the original only when sed succeeded, so a failure cannot
    # leave a truncated BED file behind.
    if sed 's/^chr0\([0-9]\)/chr\1/' "$bed" > "$scratch"; then
      mv -- "$scratch" "$bed"
    else
      rm -f -- "$scratch"
      printf 'sed failed for %s; file left unchanged\n' "$bed" >&2
    fi
  done
}

strip_chr_leading_zero MAPPED/*.bed

#############################################################################################################
# 9) Summarize mapping statistics w/ R
#    (no step 8 appears between steps 7 and 9 in this file)
#    get_stats.R is looked up relative to the current working directory; what
#    it reads and writes is defined in that script, not here.

Rscript get_stats.R