#!/bin/bash

# First argument is input file (fastq), second argument is output file root,
# third argument is length of labeling BC (usually 9 or 12)

## This is the final version of the script used to demultiplex samples using in-line BCs

# Copy of the positional parameters:
#   ARGS[0] = input file (fastq), ARGS[1] = output file root, ARGS[2] = labeling BC length
ARGS=("$@")

# Check for correct number of arguments
if [[ $# -ne 3 ]]; then
	# printf is used instead of echo: bash's builtin echo prints "\n" literally
	# unless called with -e. Diagnostics are sent to stderr.
	printf '%s\n\n' "For script 'Illumina_de-multiplex-code_FitCons.sh'" >&2
	printf '%s\n' "Usage: [input file] [output file root] [BC length]" >&2
	exit 1
fi

BCLENGTH=${ARGS[2]}

# BCLENGTH is spliced into a sed repetition bound (\{N\}) later in the script,
# so reject anything that is not a positive integer before doing any work.
if ! [[ $BCLENGTH =~ ^[1-9][0-9]*$ ]]; then
	printf 'Error: BC length must be a positive integer, got: %s\n' "$BCLENGTH" >&2
	exit 1
fi

# Pulls lines from fastq file that contain sequence (a run of 50 or more
# A/G/C/T/N characters) and puts them in a temporary file.
# mktemp creates a uniquely named file in the current directory, so two runs
# started within the same second cannot clobber each other's temporary file.
TMPFILE=$(mktemp tmp.XXXXXXXXXX) || {
	echo "Error: could not create temporary file" >&2
	exit 1
}
# Remove the temporary file on every exit path, including early failures.
trap 'rm -f -- "$TMPFILE"' EXIT

# zgrep exits 0 when lines matched, 1 when nothing matched and 2 on trouble
# (e.g. unreadable input); only the last case is treated as fatal.
zgrep_status=0
zgrep -e "[AGCTN]\{50,\}" "${ARGS[0]}" > "$TMPFILE" || zgrep_status=$?
if (( zgrep_status > 1 )); then
	echo "Error: could not read sequences from input file: ${ARGS[0]}" >&2
	exit 1
fi

# Multiplex BCs present in the sample; each entry is paired by position with
# the label at the same index in BCNUM.
## BCs used for LTR18A TE-CRE K562 library. Each BC is inserted into a sed pattern,
## so sed wildcards can be used where needed for matching
## (see: http://www.unix.com/shell-programming-and-scripting/31583-wildcards-sed.html)
MPBC=(
	"AACCTCA"
	"TCTAAGC"
	"CTGTCAT"
	"GGAGGTG"
	"GCTCGAT"
)

# Labels appended to the output file root, one per multiplex BC (output naming system)
BCNUM=(
	"bioRep1"
	"bioRep2"
	"bioRep3"
	"none1"
	"none2"
)

# MIDDLE is the sequence between the multiplex BC and the labeling BC
# (currently the restriction site right before the BC)
MIDDLE="GGATCCG"
#MIDDLE='CCGGGC' # used to be restriction site right after multiplex BC (XmaI 5' to 3': C^CCGGG; overhang = CCGGG + additional "C" before bc)
#MIDDLE='AATTCC' #Used for pGL-Pou Agilent array (AATTCc = over hang + constant before bc, EcoRI G^AATTC)


#######################################
# Extracts the labeling BCs belonging to one multiplex BC and counts them.
# A line contributes a labeling BC only if it contains the multiplex BC
# (anywhere in the line), followed after any number of bases by the middle
# sequence, followed immediately by exactly <BC length> A/C/G/T bases.
# Arguments:
#   $1 - multiplex BC (inserted into a sed pattern)
#   $2 - middle sequence located directly before the labeling BC
#   $3 - labeling BC length
#   $4 - input file with one sequence per line
#   $5 - output file for the extracted labeling BCs (one per matching line)
# Outputs:
#   Writes $5, and "${5}_counts" holding the `uniq -c` counts of the sorted BCs.
# Returns:
#   Non-zero if sed fails; otherwise the status of the sort | uniq pipeline.
#######################################
extract_labeling_bcs() {
	local mpbc=$1 middle=$2 bclength=$3 seqfile=$4 outfile=$5

	sed -n "s/.*${mpbc}.*${middle}\([ACGT]\{${bclength}\}\).*/\1/p" "$seqfile" > "$outfile" || return

	# Sorting is necessary for uniq, which only collapses adjacent duplicates
	sort "$outfile" | uniq -c > "${outfile}_counts"
}

# MPBC and BCNUM are paired by index; a length mismatch would give some
# output file an empty label (or leave a label unused).
if (( ${#MPBC[@]} != ${#BCNUM[@]} )); then
	echo "Error: MPBC and BCNUM must have the same number of entries" >&2
	exit 1
fi

# Loops through each multiplex BC used in the sample
for (( k = 0; k < ${#MPBC[@]}; k++ ))
do
	echo "Processing BC ${BCNUM[k]}"
	CURRENTFILE="${ARGS[1]}_${BCNUM[k]}" #naming output file based on input + BCNUM

	# All expansions are quoted so an output root containing spaces works
	extract_labeling_bcs "${MPBC[k]}" "$MIDDLE" "$BCLENGTH" "$TMPFILE" "$CURRENTFILE" || {
		echo "Error: failed to process BC ${BCNUM[k]}" >&2
		exit 1
	}
done

# Removes the temporary file holding the full sequence lines (the per-BC
# output files and their _counts files are kept).
# Quoted and guarded with -- so the name is passed to rm as a single literal
# operand, never word-split, glob-expanded or parsed as an option.
rm -- "$TMPFILE"

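## outputs: for each multiplex BC, <output file root>_<BCNUM label> (extracted labeling BCs, one per line)
## and <output file root>_<BCNUM label>_counts (unique labeling BCs with their counts),
## e.g. <root>_bioRep1 and <root>_bioRep1_counts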

