# Pipeline for variable Non-Overlapping Window CBS Methods (VNOWC)
# Steps to map reads and create copy-number estimates for each sample. We use known 
# euploid samples to make variable-sized bins with a fixed expected number of reads
# in each bin. Then for each sample we count the number of reads in each bin and
# compare to what would be expected given the total number of reads.
# Circular binary segmentation using the R package DNAcopy calls putative
# changes in copy number which are adjusted to whole numbers. 
# A base copy number of 2 is assumed, but set to 1 if most bins are empty. 

# nathan dot lazar at gmail dot com

# Append the pipeline's tool directories to the search path, in lookup order.
# Entries are joined with a bare ':' so that no stray '/' is glued onto the
# preceding entry, and an empty starting PATH does not gain a leading ':'
# (which the shell would treat as "current directory").
for tool_dir in \
  /home/exacloud/lustre1/users/lazar/bin \
  /home/exacloud/lustre1/users/lazar/bin/bedtools2/bin \
  /opt/installed \
  /home/exacloud/lustre1/users/lazar/bin/FastQC \
  /home/exacloud/lustre1/users/lazar/bin/fastx \
  /home/exacloud/lustre1/users/lazar/bin/FastUniq/source; do
  PATH=${PATH:+$PATH:}$tool_dir
done
unset tool_dir
export PATH

# Set the number of parallel threads to use
cores=20

# Software versions:
# fastqc: v0.10.1
# Trimmomatic v0.35  # Note: .jar file needs to be in specific directory see step 3)
# fastx: v0.0.13
# BWA-mem: 0.7.9a
# samtools: 0.1.19-44428cd
# Bedtools: v2.25.0  # I couldn't add this to the path for some reason; it is called by full path in step 8)
# fastuniq:          # I couldn't find a version

##############################################################################################################
# 0.1) Construct bins using fibroblast samples
#   See: FIBROBLASTS/PIPELINE_bins

# 0.2) Collect all raw read files to process

# The directory 'fastq' should have all .fq.gz files in either PE or SE for 
# paired-end or single-end files

##############################################################################################################
# 2) Run fastqc on the raw reads
# One FastQC output directory per library layout (SE and PE), each filled
# from the matching fastq/<layout> directory of raw reads.
mkdir FASTQC
for layout in SE PE; do
  mkdir "FASTQC/$layout"
  fastqc --outdir "FASTQC/$layout" --threads "$cores" fastq/"$layout"/*.fq.gz
done

# condor_submit HTC/fastqc.submit

# Look at the number of reads filtered, read length, etc. before trimming
# Write to FASTQC/summary.txt
#
# The whole step runs in a subshell: if FASTQC cannot be entered the step is
# skipped, instead of running the rm calls against the wrong tree and then
# leaving the rest of the pipeline one directory too high after `cd ..`.
(
  cd FASTQC || { echo "cannot enter FASTQC; skipping raw-read summary" >&2; exit 1; }

  for f in */*.zip; do
    [[ -e $f ]] || continue   # glob matched nothing
    # Unpack next to the archive (SE/ or PE/) so the report ends up at
    # <layout>/<sample>_fastqc/, which is the depth the grep and rm patterns
    # below expect. -o overwrites without prompting so batch runs do not
    # block on an already-extracted report.
    unzip -o "$f" -d "${f%/*}"
  done

  grep "Total Sequences" */*/fastqc_data.txt > summary.txt
  grep "Sequence length" */*/fastqc_data.txt >> summary.txt
  grep "Total Deduplicated Percentage" */*/fastqc_data.txt >> summary.txt
  grep "^%GC" */*/fastqc_data.txt >> summary.txt

  # Drop the bulky per-sample reports; the .zip archives are kept.
  rm */*.html
  rm -r */*fastqc
)

##############################################################################################################
# 3) Trim low quality bases w/ Trimmomatic

mkdir TRIM_SE
mkdir TRIM_PE

# Trimmomatic install directory; the .jar and the adapters/ folder are read
# from here. It must be assigned before the first java call below, otherwise
# the jar path collapses to "/trimmomatic-0.35.jar".
dir=/home/exacloud/lustre1/users/lazar/bin/Trimmomatic-0.35

# Trim single end reads
for F1 in fastq/SE/*; do
  [[ -e $F1 ]] || continue            # directory empty or missing
  [[ $F1 == *R[12]* ]] && continue    # skip anything that looks like an R1/R2 mate file
  name=${F1/_R1_001.fq.gz/}; name=${name/.R1.fq.gz}; name=$(basename "$name")
  java -jar "$dir/trimmomatic-0.35.jar" \
    SE -threads "$cores" -phred33 "$F1" \
    "TRIM_SE/$name.trim.fq.gz" \
    "ILLUMINACLIP:$dir/adapters/TruSeq3-SE.fa:2:30:10" LEADING:5 TRAILING:5 \
    SLIDINGWINDOW:4:15 MINLEN:36
done

# Trim paired end reads
dir=/home/exacloud/lustre1/users/lazar/bin/Trimmomatic-0.35
mkdir -p TRIM_PE

# Print the sample name for an R1 fastq path: the file name with its
# directory and its R1 suffix removed. Both the .fq.gz and .fastq.gz
# spellings are handled, because the loop below selects *.fq.gz files.
_pe_sample_name() {
  local name=${1##*/}
  case $name in
    *_R1_001.fq.gz)    name=${name%_R1_001.fq.gz} ;;
    *.R1.fq.gz)        name=${name%.R1.fq.gz} ;;
    *_R1_001.fastq.gz) name=${name%_R1_001.fastq.gz} ;;
    *.R1.fastq.gz)     name=${name%.R1.fastq.gz} ;;
  esac
  printf '%s\n' "$name"
}

for F1 in fastq/PE/*R1*.fq.gz; do
  [[ -e $F1 ]] || continue   # glob matched nothing
  name=$(_pe_sample_name "$F1")
  F2=${F1/R1/R2}             # mate file: first "R1" in the path becomes "R2"
  # Outputs per sample: paired and unpaired reads for each mate.
  java -jar "$dir/trimmomatic-0.35.jar" \
    PE -threads "$cores" -phred33 "$F1" "$F2" \
    "TRIM_PE/$name.R1.trim.fq.gz" "TRIM_PE/$name.R1.unpaired.fq.gz" \
    "TRIM_PE/$name.R2.trim.fq.gz" "TRIM_PE/$name.R2.unpaired.fq.gz" \
    "ILLUMINACLIP:$dir/adapters/TruSeq3-PE.fa:2:30:10" LEADING:5 TRAILING:5 \
    SLIDINGWINDOW:4:15 MINLEN:36
done

# condor_submit trim_se.submit
# condor_submit trim_pe.submit

##############################################################################################################
# 4) Run fastqc on the trimmed reads

# QC of the trimmed reads: single-end and paired-end outputs share one
# report directory, with one fastqc invocation per trimmed-read directory.
mkdir FASTQC_TRIM
for trimmed_dir in TRIM_SE TRIM_PE; do
  fastqc --outdir FASTQC_TRIM --threads "$cores" "$trimmed_dir"/*.fq.gz
done

# condor_submit fastqc_trim.submit

# Look at the number of reads, read length, etc. after trimming
# Write to FASTQC_TRIM/summary.txt
#
# The whole step runs in a subshell: if FASTQC_TRIM cannot be entered the
# step is skipped, instead of running `rm *.html` / `rm -r *fastqc` in the
# pipeline root and then moving the rest of the pipeline up a directory.
(
  cd FASTQC_TRIM || { echo "cannot enter FASTQC_TRIM; skipping trimmed-read summary" >&2; exit 1; }

  for f in *.zip; do
    [[ -e $f ]] || continue   # glob matched nothing
    # -o: overwrite without prompting so batch runs do not block on an
    # already-extracted report.
    unzip -o "$f"
  done

  grep "Total Sequences" */fastqc_data.txt > summary.txt
  grep "Sequence length" */fastqc_data.txt >> summary.txt
  grep "Total Deduplicated Percentage" */fastqc_data.txt >> summary.txt
  grep "^%GC" */fastqc_data.txt >> summary.txt

  # Drop the bulky per-sample reports; the .zip archives are kept.
  rm *.html
  rm -r *fastqc
)

##############################################################################################################
# 5) Deduplicate (a.k.a. collapse) reads that are exact copies before mapping
#    with fastx for single-end reads and fastuniq for paired-end reads

# Collapse single end reads
# Each trimmed file is decompressed on the fly, collapsed with
# fastx_collapser, written as FASTA and then compressed.
mkdir COLLAPSED_SE
for f in TRIM_SE/*.trim.fq.gz; do
  [[ -e $f ]] || continue            # glob matched nothing
  [[ $f == *R[12]* ]] && continue    # skip anything that looks like an R1/R2 mate file
  name=${f/.fq.gz/}; name=$(basename "$name"); echo "$name"
  collapsed=COLLAPSED_SE/$name.uniq.fa
  # Compress exactly the file that was just written, and only if the
  # collapser succeeded.
  gunzip -c "$f" | fastx_collapser -Q33 > "$collapsed" &&
    gzip "$collapsed"
done

# condor_submit collapse_se.submit

# Collapse paired-end reads
# For every trimmed R1 file (TRIM_PE/<sample>.R1.trim.fq.gz) the R1/R2 pair
# is decompressed in place, both uncompressed paths are written to a list
# file, and fastuniq is run on that list to produce one .uniq.fq per mate.
mkdir -p COLLAPSED_PE
for F1 in TRIM_PE/*.R1.trim.fq.gz; do
  [[ -e $F1 ]] || continue   # glob matched nothing
  F2=${F1%.R1.trim.fq.gz}.R2.trim.fq.gz
  name1=$(basename "${F1%.fq.gz}")      # <sample>.R1.trim
  name2=${name1%.R1.trim}.R2.trim       # <sample>.R2.trim
  # gunzip replaces the .gz files in TRIM_PE with their uncompressed form.
  gunzip "$F1"
  gunzip "$F2"
  echo "${F1%.gz}"  > "COLLAPSED_PE/$name1.txt"
  echo "${F2%.gz}" >> "COLLAPSED_PE/$name1.txt"
  fastuniq -i "COLLAPSED_PE/$name1.txt" \
    -o "COLLAPSED_PE/$name1.uniq.fq" -p "COLLAPSED_PE/$name2.uniq.fq"
done

# condor_submit collapse_pe.submit

# Clean up
# Remove the fastuniq list files and compress whatever collapsed output is
# still uncompressed. Patterns that match nothing are skipped quietly.
rm -f COLLAPSED_SE/*.txt COLLAPSED_PE/*.txt
for f in COLLAPSED_SE/*.fq COLLAPSED_PE/*.fq COLLAPSED_SE/*.fa COLLAPSED_PE/*.fa; do
  [[ -e $f ]] || continue
  gzip "$f"
done

##############################################################################################################
# 6) Map with BWA-MEM

# Note: The genome must first be indexed w/ bwa index

# Map single-end reads
# Runs ./run_bwa_mem_se.sh once per collapsed single-end file. Both .fa.gz
# and .fq.gz are accepted: the single-end collapse step writes FASTA
# (.uniq.fa, later gzipped), so a .fq.gz-only pattern would find nothing.
mkdir -p MAPPED/SE
for f in COLLAPSED_SE/*.f[aq].gz; do
  [[ -e $f ]] || continue            # glob matched nothing
  [[ $f == *R[12]* ]] && continue    # skip anything that looks like an R1/R2 mate file
  ./run_bwa_mem_se.sh "$f"
done

# condor_submit map_se.submit

# Map paired-end reads
# Runs ./run_bwa_mem_pe.sh once per R1 file; only the R1 path is passed.
# NOTE(review): presumably the script derives the R2 path itself — confirm.
mkdir -p MAPPED/PE
for F1 in COLLAPSED_PE/*.fq.gz; do
  [[ -e $F1 ]] || continue       # glob matched nothing
  [[ $F1 == *R1* ]] || continue  # R2 files are not passed directly
  ./run_bwa_mem_pe.sh "$F1"
done

# condor_submit map_pe.submit

# The run_bwa_mem_se.sh / run_bwa_mem_pe.sh scripts run bwa-mem and filter for quality of 30
# Outputs are:
#  <read_name>.mapped_reads   # Read counts at each step
#  <read_name>.q30.sort.bam
#  <read_name>.q30.sort.bam.bai

# Summarize the reads mapped, unique positions, etc.
# output is file read_stats.txt
# NOTE(review): summarize.sh is called with no arguments and is not shown
# here, so which directories it reads cannot be checked from this file —
# confirm it matches the MAPPED/SE and MAPPED/PE layout used above.
./summarize.sh  ## UPDATE THIS?

#############################################################################################################
# 8. Convert bam files to bed files for viewing
# Write a .bed file next to every quality-filtered BAM, single-end files
# first and then paired-end. bamToBed is called by full path rather than
# through PATH.
for f in MAPPED/SE/*.q30.sort.bam MAPPED/PE/*.q30.sort.bam; do
  [[ -e $f ]] || continue   # pattern matched nothing
  out=${f%.bam}.bed         # swap only the trailing extension
  echo "$f"; echo "$out"
  ~/bedtools2/bin/bamToBed -i "$f" > "$out"
done

# condor_submit bamToBed.submit

#############################################################################################################
# 9. Summarize mapping statistics w/ R

# NOTE(review): get_stats.R is run with no arguments; its inputs and outputs
# are not visible from this file — confirm they match the current layout.
Rscript get_stats.R ## UPDATE THIS?

#############################################################################################################
# 10. Get counts and ratios in each bin for the 4 bin sizes

# Single-end samples: for each bin size, run count_bin_reads.R on every BAM
# in MAPPED/SE with the matching fibroblast-derived bins file, writing
# COUNTS/SE/<bins>/<sample>.counts.
for bins in 500 1000 2000 4000; do
  # No earlier step in this file creates the per-bin-size output directory.
  mkdir -p "COUNTS/SE/$bins"
  for f in MAPPED/SE/*.bam; do
    [[ -e $f ]] || continue   # glob matched nothing
    out=$(basename "$f"); out=${out/.q30.sort.bam/.counts}
    Rscript count_bin_reads.R \
      "FIBROBLASTS/BINS/SE/xx-euploid-1_all_merge_${bins}.bins" "$f" \
      "COUNTS/SE/$bins/$out"
  done
done

# Paired-end samples: for each bin size, run count_bin_reads.R on every BAM
# in MAPPED/PE with the matching fibroblast-derived bins file, writing
# COUNTS/PE/<bins>/<sample>.counts.
for bins in 500 1000 2000 4000; do
  # No earlier step in this file creates the per-bin-size output directory.
  mkdir -p "COUNTS/PE/$bins"
  for f in MAPPED/PE/*.bam; do
    [[ -e $f ]] || continue   # glob matched nothing
    out=$(basename "$f"); out=${out/.q30.sort.bam/.counts}
    Rscript count_bin_reads.R \
      "FIBROBLASTS/BINS/PE/xx-euploid-1_all_${bins}.bins" "$f" \
      "COUNTS/PE/$bins/$out"
  done
done

# condor_submit HTC/count_se.submit
# condor_submit HTC/count_pe.submit

#############################################################################################################
# 11. Get copy number calls and make plots for all samples 

# Run get_copy_number.R on every .counts file (all layouts, all bin sizes).
# The glob is expanded by the shell itself, so paths are passed through
# intact; an unmatched pattern is skipped.
for f in COUNTS/*/*/*.counts; do
  [[ -e $f ]] || continue
  Rscript get_copy_number.R "$f"
done

# condor_submit HTC/get_cn.submit

# Run plot_copy_number.R on every .cn file (all layouts, all bin sizes).
# The glob is expanded by the shell itself, so paths are passed through
# intact; an unmatched pattern is skipped.
for f in COUNTS/*/*/*.cn; do
  [[ -e $f ]] || continue
  Rscript plot_copy_number.R "$f"
done

# condor_submit HTC/plot_cn.submit

# Make plots of raw counts for the 4000 bins for all samples
# Run plot_raw.R on every .cn file from the 4000 bin size, for both layouts.
# The glob is expanded by the shell itself, so paths are passed through
# intact; an unmatched pattern is skipped.
for f in COUNTS/*/4000/*.cn; do
  [[ -e $f ]] || continue
  Rscript plot_raw.R "$f"
done

# condor_submit HTC/plot_raw.submit

# Make whole genome plots for 16 samples at a time
# Will produce mulit_<num>.png files in the given directory
# plot_many.R is run once per counts directory: paired-end first, then
# single-end, each across the four bin sizes.
for layout in PE SE; do
  for bins in 500 1000 2000 4000; do
    Rscript plot_many.R "COUNTS/$layout/$bins/"
  done
done

# condor_submit HTC/plot_many.submit

##################################################################
# 12. Count the number of reads mapped to chrM for all samples
# 13. Count the number of reads mapped to chrY for all samples
#
# For each chromosome, one "<sample> <count>" line per BAM is appended to
# chrM_counts.txt / chrY_counts.txt.

# Read SAM records on stdin and print how many have reference name $1.
# Only column 3 (the reference name of the alignment) is compared, so a
# record is not counted merely because the chromosome name appears in
# another column such as the mate reference, the read name or a tag.
_count_reads_on() {
  awk -F'\t' -v chrom="$1" '$3 == chrom { n++ } END { print n + 0 }'
}

for chrom in chrM chrY; do
  touch "${chrom}_counts.txt"
  for f in Geoff_mapped/*/*.bam; do
    [[ -e $f ]] || continue   # glob matched nothing
    name=${f/.q30.sort.bam/}
    name=$(basename "$name")
    count=$(/opt/installed/samtools view "$f" | _count_reads_on "$chrom")
    echo "$name $count" >> "${chrom}_counts.txt"
  done
done
