# nathan dot lazar at gmail dot com

# Directory /u0/dbase/nl/OOCYTES/ contains scripts and input files
# on exacloud: /home/exacloud/lustre1/users/lazar/OOCYTES/

# Software versions:
# fastqc: v0.11.3
# cutadapt: v1.2.1
# BWA-mem: 0.7.12-r1044
# samtools: v1.3
#   with rmdup from version 0.1.9

#############################################################################################################
# 6. Process non-control reads for mapping

# Create symbolic links to raw data.
# The link target directory has to exist first; -p keeps re-runs harmless.
mkdir -p RAW_READS
for src in /home/exacloud/lustre1/adey_lab/share/151204_Sheep_PicoPlex/split/*; do
  [ -e "$src" ] || continue   # glob matched nothing
  ln -s "$src" RAW_READS/
done

# Run FASTQC on raw reads.
# fastqc does not create its --outdir, so make sure it is there.
mkdir -p FASTQC
for f in RAW_READS/*; do
  [ -e "$f" ] || continue
  echo "$f"
  FastQC/fastqc --outdir FASTQC --noextract --threads 24 "$f"
done

# Prep reads for mapping.
# The two trailing 5s are handed to prep_reads_se.sh unchanged; their meaning
# is defined in that script.
for f in RAW_READS/*.fq.gz; do
  [ -e "$f" ] || continue
  ./prep_reads_se.sh "$f" 5 5
done

# Output is a file that ends with .clean.fastq.gz (plus a .report.gz);
# move these to PROCESSED_READS.
mkdir -p PROCESSED_READS
for f in RAW_READS/*.clean.fastq.gz RAW_READS/*.report.gz; do
  [ -e "$f" ] || continue     # nothing produced for this pattern
  mv -- "$f" PROCESSED_READS/
done

#############################################################################################################
# 7. Map new samples to repeat masked genome using BWA-mem

# Map each cleaned read file against the repeat-masked genome.
# run_bwa_mem_se.sh receives: genome FASTA, read file, output prefix.
mkdir -p MAPPED_Batch2
for f in PROCESSED_READS/*.clean.fastq.gz; do
  [ -e "$f" ] || continue     # glob matched nothing
  echo "$f"
  # Sample name = file name without the .1.clean.fastq.gz suffix.
  base=$(basename "${f%.1.clean.fastq.gz}")
  echo "MAPPED_Batch2/$base"
  # Pass the read file being iterated ($f) and write under MAPPED_Batch2,
  # the directory created above and read by the later counting steps.
  ./run_bwa_mem_se.sh RheMac7_repmasked/macam.fa.masked "$f" "MAPPED_Batch2/$base"
done

#########################################################################################
# 8. Count reads in bins for the non-reference samples

# This script reports the bin loci, raw counts per bin, percentage of reads in each bin,
# expected percentage of reads in each bin and the ratio of the observed percentage of 
# reads in each bin to the expected percentage of reads in each bin. 
# Note: These ratios aren't used in plotting & copy number determination, I changed the
# pipeline for that a bit in the following scripts.

# Count reads per 200-read bin for every filtered, de-duplicated BAM.
for f in MAPPED_Batch2/*.q30.rmdup.bam; do
  [ -e "$f" ] || continue     # glob matched nothing
  echo "$f"
  # Output prefix: BAM path minus the .q30.rmdup.bam suffix.
  # NOTE(review): unlike the loop below, this keeps any ".sort" in the prefix;
  # left unchanged so existing output names stay the same - confirm intended.
  name=${f%.q30.rmdup.bam}
  echo "$name"
  Rscript count_bin_reads.R xx-bins_200.txt "$f" "$name.bin_200_counts.txt"
done

# For samples with less than 200,000 reads, use bins of 1000 reads
for f in MAPPED_Batch2/*.q30.rmdup.bam; do
  [ -e "$f" ] || continue
  echo "$f"
  name=${f%.sort.q30.rmdup.bam}
  echo "$name"

  # The read count is taken from the last line of <name>.mapped_reads.
  counts_file=$name.mapped_reads
  if [ ! -r "$counts_file" ]; then
    echo "WARNING: $counts_file not readable; skipping $f" >&2
    continue
  fi
  # Keep the last line only when it is a single field, so the numeric test
  # below never sees an empty or multi-word value.
  reads=$(tail -n 1 "$counts_file" | awk 'NF == 1 { print $1 }')
  echo "$reads"
  case $reads in
    '' | *[!0-9]*)
      echo "WARNING: no numeric read count in $counts_file; skipping $f" >&2
      continue
      ;;
  esac

  if [ "$reads" -lt 200000 ]; then
    Rscript count_bin_reads.R xx-bins_1000.txt "$f" "$name.bin_1000_counts.txt"
  fi
done

# 8 a. Determine "bad bins".
# A bin is "bad" if its average ratio across all samples is more than 3 standard
# deviations from the mean of the middle 99 percentile of bins for that chromosome.
# The glob is quoted on purpose: get_bad_bins.R is handed the pattern itself,
# not a shell-expanded file list. One output file is written per bin size.
for bin_size in 200 1000; do
  Rscript get_bad_bins.R "MAPPED_Batch2/*${bin_size}_counts.txt" \
    > "MAPPED_Batch2/bad_bins_${bin_size}.txt"
done

## HERE ##

#########################################################################################
# 9. Generate plots of observed vs. expected read counts in each bin

# Plot every 200-read-bin count table.
for f in MAPPED_Batch2/*_200_counts.txt; do
  [ -e "$f" ] || continue     # glob matched nothing
  echo "$f"
  Rscript plot_copy_number.R "$f"
done

# Collect the *_plots entries found in MAPPED_Batch2.
# The destination must exist beforehand; otherwise mv either fails or renames
# the first plots directory to the destination name instead of moving it inside.
mkdir -p CN_PLOTS/200_read_bins
for d in MAPPED_Batch2/*_plots; do
  [ -e "$d" ] || continue
  mv -- "$d" CN_PLOTS/200_read_bins/
done

# For samples with less than 200k reads, use the 1000-read bins
for f in MAPPED_Batch2/*_1000_counts.txt; do
  [ -e "$f" ] || continue
  echo "$f"
  Rscript plot_copy_number.R "$f"
done

# The 200-bin plots were moved away above, so whatever matches *_plots now
# comes from the 1000-bin run.
mkdir -p CN_PLOTS/1000_read_bins
for d in MAPPED_Batch2/*_plots; do
  [ -e "$d" ] || continue
  mv -- "$d" CN_PLOTS/1000_read_bins
done

# Plots are placed in directories CN_PLOTS/200_read_bins/<name>_plots or
# CN_PLOTS/1000_read_bins/<name>_plots

# for f in `ls CN_PLOTS/1000_read_bins/*_plots/*_plots/*.png`; do echo $f; 
#   new=`echo $f | cut -d'/' -f1,2,4,5`; echo $new; mv $f $new; done

# Make a table of read counts
#############################################################################

# Build all_read_counts.txt: one tab-separated row per *.mapped_reads file.
# The header names four columns while each row carries five fields (the
# leading sample name is unlabeled).
# NOTE(review): presumably deliberate so that R's read.table uses the first
# column as row names - confirm before adding a label.
printf 'reads\tmapped\tpassQ30\tunique\n' > all_read_counts.txt
for f in MAPPED_Batch2/*.mapped_reads; do
  [ -e "$f" ] || continue     # glob matched nothing
  sample=$(basename "$f" .mapped_reads)
  # Row label: sample name without the first _L001 tag.
  name=${sample/_L001/}

  # NOTE(review): the prep step above globs RAW_READS/*.fq.gz, but this expects
  # <sample>_R1_001.fastq.gz - verify the raw file naming.
  raw=RAW_READS/${sample}_R1_001.fastq.gz
  # Line count divided by 4 = number of FASTQ records (four lines per read).
  tot_reads=$(( $(zcat "$raw" | wc -l) / 4 ))

  # grep -a1: treat the file as text and print one line of context around the
  # label; tail keeps the last printed line, i.e. the line following the label,
  # where the count is expected.
  mapped=$(grep -a1 "All reads:" "$f" | tail -n 1)
  passQ=$(grep -a1 "Passing Q 30 filter:" "$f" | tail -n 1)
  unique=$(grep -a1 "After samtools rmdup:" "$f" | tail -n 1)

  printf '%s\t%s\t%s\t%s\t%s\n' "$name" "$tot_reads" "$mapped" "$passQ" "$unique"
done >> all_read_counts.txt
