#!/bin/bash

# Abort early when CNVER_FOLDER is unset or empty: every helper executable
# referenced later in this script is addressed relative to it.
if [ -z "${CNVER_FOLDER}" ]; then
	# Report on stderr and exit non-zero so that callers can detect the
	# failure (a bare `exit` would hand back echo's status, i.e. 0).
	echo "Please set CNVER_FOLDER environment variable to the correct path" >&2
	exit 1
fi

# Exactly eight positional arguments are required.  Print the usage line and
# stop; continuing would run the rest of the script with empty parameters.
if [ $# -ne 8 ]; then
	echo "$0 mapping_files_list contig_breaks output_dir read_len mean_insert_size stdev_insert_size min_mps_per_cluster contig_name_file" >&2
	exit 1
fi

########################  run specific options ########################  
# 
# You should change these to whatever is appropriate for you.
# Otherwise the algorithm will not run properly
#
########################################################################   

# The maximum memory allowed for the unix sort command (handed to it via -S).
# The trailing "b" makes sort read the number as a byte count (about 3 GB);
# without a suffix sort interprets the value in units of 1024 bytes, which
# would request roughly 2.8 TiB instead.
MEM_LIM=3000000000b

# $1: the file containing a list of the mapping files, one file per line
RMAPPING_FILES=$1

# $2: the contig break file.  Link edges close to these breaks are screened.
CONTIG_BREAK_FILE=$2

# $3: the directory where the clusters and link edges are stored
OUTPUT_DIR=$3

# $4: length of reads
READ_LEN=$4

# $5: the mean insert size of the dataset
MEAN_INSERT_SIZE=$5

# $6: the standard deviation of the insert size of the dataset
STDEV_INSERT_SIZE=$6

# $7: the number of matepair mappings required to use a cluster as a link edge
MIN_MPS_PER_CLUSTER=$7

# $8: the file listing the contig names, one name per line
CONTIG_NAME_FILE=$8

########################  algorithm parameters ########################  
# 
# These are further options to play around with, but the defaults should work.
#
########################################################################   

# Concordance window: mapped distances from mean - 3*stdev up to
# mean + 3*stdev are considered concordant by the algorithm.
MIN_CONCORDANT_MAPPED_DIST=$(expr ${MEAN_INSERT_SIZE} - 3 \* ${STDEV_INSERT_SIZE})
MAX_CONCORDANT_MAPPED_DIST=$(expr ${MEAN_INSERT_SIZE} + 3 \* ${STDEV_INSERT_SIZE})

# Links with at least one endpoint closer than this (mean + 3*stdev) to a
# contig break are discarded.
TOLERANCE_AROUND_CONTIG_BREAKS=$(expr ${MEAN_INSERT_SIZE} + 3 \* ${STDEV_INSERT_SIZE})

# Upper bound on the mapped distance of a link.  Clusters spanning more than
# this are discarded.
LINK_LEN_CUTOFF=10000000

# The chromosomes for which to create the clusters and links.  You can change this if you are only working with one chromosome.
#CHROMOSOMES="chr1 chr2 chr3 chr4 chr5 chr6 chr7 chr8 chr9 chr10 chr11 chr12 chr13 chr14 chr15 chr16 chr17 chr18 chr19 chr20 chr21 chr22" 

# Number of standard deviations allowed in the width of a cluster
# (the allowed width is mean + base_len_factor).
BASE_LEN_FACTOR=1

# The difference between the main coordinates must be within
# MD_JOIN_TOLERANCE * STDEV from the difference in the sec coord.
MD_JOIN_TOLERANCE=6



########################  hardcoded values ########################  
# 
# These don't need to change unless debugging.
#
########################################################################   

# Helper executables, all located under ${CNVER_FOLDER}/cluster.
CLUSTER_MATEPAIRS_EXEC="${CNVER_FOLDER}/cluster/cluster_matepairs"
SCREEN_CONTIG_BREAKS_EXEC="${CNVER_FOLDER}/cluster/screen_contig_breaks"
CONCORDANCY_ANALYSIS_EXEC="${CNVER_FOLDER}/cluster/concordancy_analysis"

# Path prefixes of the per-contig cluster and link files.
CLUSTER_FILE="${OUTPUT_DIR}/clusters"
LINK_FILE="${OUTPUT_DIR}/links"

# Options shared by cluster_matepairs and screen_contig_breaks.
GENERAL_OPTS="--mean=${MEAN_INSERT_SIZE} --stdev=${STDEV_INSERT_SIZE}"

# cluster_matepairs: column layout of its input plus the clustering knobs.
# The trailing blank after the last option is part of the value.
CLUSTER_MATEPAIRS_OPTS="${GENERAL_OPTS}"
CLUSTER_MATEPAIRS_OPTS+=" --colID=0 --colDist=1 --colChr=2"
CLUSTER_MATEPAIRS_OPTS+=" --colLeft=3 --colRight=4 --colTemplate=5"
CLUSTER_MATEPAIRS_OPTS+=" --baseLenFactor=${BASE_LEN_FACTOR}"
CLUSTER_MATEPAIRS_OPTS+=" --mdJoinTolerance=${MD_JOIN_TOLERANCE} "

# screen_contig_breaks: tolerance around the breaks and the breaks file.
SCREEN_CONTIG_BREAKS_OPTS="${GENERAL_OPTS}"
SCREEN_CONTIG_BREAKS_OPTS+=" --tolerance=${TOLERANCE_AROUND_CONTIG_BREAKS}"
SCREEN_CONTIG_BREAKS_OPTS+=" --breaksFile=${CONTIG_BREAK_FILE}"



########################################################################   
########################  actual code ########################  
########################################################################   

# Prepare the output tree and remove command lists left over from an earlier
# run (the generators in this script append to them, so stale copies would
# accumulate commands).
#
# ${OUTPUT_DIR:?...} aborts the script when OUTPUT_DIR is empty; unguarded,
# these commands would act on /indices, /conc_script, ... at the filesystem
# root.  Quoting keeps a path with blanks from being split into several
# arguments for rm -rf.
mkdir -p -- "${OUTPUT_DIR:?output directory (argument 3) is not set}/indices" || exit 1
for script_name in conc_script sort_script link_script indexmaps_script indexclus_script; do
	rm -rf -- "${OUTPUT_DIR:?}/${script_name}"
done

# Queue one concordancy_analysis command per mapping file in conc_script and
# collect the resulting .disc paths, blank separated, in MMAPPING_FILES.
# Entries of the list file are split on whitespace, so mapping file paths must
# not contain blanks.
MMAPPING_FILES=
for RMAPPINGS in $(cat ${RMAPPING_FILES}); do
	MMAPPINGS=${OUTPUT_DIR}/$(basename ${RMAPPINGS}).disc
	conc_cmd="cat ${RMAPPINGS} | ${CONCORDANCY_ANALYSIS_EXEC} -f 'NA' -b 'NA' -r${READ_LEN} -d${LINK_LEN_CUTOFF} -l${MIN_CONCORDANT_MAPPED_DIST} -u${MAX_CONCORDANT_MAPPED_DIST} -o${MMAPPINGS} - "
	echo "${conc_cmd}" >> ${OUTPUT_DIR}/conc_script
	MMAPPING_FILES+=" ${MMAPPINGS}"
done

# Build map indices: queue one idx_build command in indexmaps_script for every
# (mapping file, contig) pair.  The data file is <name>.disc.<contig> in the
# output directory and its index goes to indices/<name>.disc.<contig>.idx.
for RMAPPINGS in $(cat ${RMAPPING_FILES}); do
	# Same for every contig, so computed once per mapping file.
	disc_name=$(basename ${RMAPPINGS}).disc
	while read CONTIG; do
		MMAPPINGS=${OUTPUT_DIR}/${disc_name}.${CONTIG}
		INDEX=${OUTPUT_DIR}/indices/${disc_name}.${CONTIG}.idx
		echo "${CNVER_FOLDER}/cluster/idx_build ${INDEX} ${MMAPPINGS} " >> ${OUTPUT_DIR}/indexmaps_script
	done < ${CONTIG_NAME_FILE}
done


# Sort keys indexed by cluster type: types 0 and 2 order by column 4 then 5,
# types 1 and 3 by column 5 then 4.
SORT_COL=(
	"-k4n,4 -k5n,5"
	"-k5n,5 -k4n,4"
	"-k4n,4 -k5n,5"
	"-k5n,5 -k4n,4"
)

# For every contig, queue four clustering pipelines (one per type) in
# sort_script and one link-extraction pipeline in link_script.
while read CONTIG; do
	# Columns of the per-contig .disc files:
	#   dist(1) chr(2) left(3) right(4) template(5) type(6) lstrand(7) rstrand(8)
	# Columns after the generated awk stage has prepended an id:
	#   id (1) dist(2) chr(3) left(4) right(5) template(6) type(7) lstrand(8) rstrand(9)

	# Per-contig input files: every entry of MMAPPING_FILES with ".<contig>"
	# appended.  The list does not depend on the type, so it is built once.
	FILES=$(echo ${MMAPPING_FILES} | tr ' ' '\n' | awk -v contig=${CONTIG} '{ print $1"."contig }' | tr '\n' ' ')

	for TYPE in 0 1 2 3; do
		awk_stage="awk -v type=${TYPE} -v findex=-1 '{  if (FNR == 1) findex++; \$5 = findex \"_\" \$5; if (\$6 == type) print  findex \"_\" FNR-1 \"_\" \$2,  \$0}' $FILES"
		sort_stage="sort ${SORT_COL[$TYPE]} -k2n,2 -S${MEM_LIM}"
		cluster_stage="${CLUSTER_MATEPAIRS_EXEC} ${CLUSTER_MATEPAIRS_OPTS} --type=$TYPE > ${CLUSTER_FILE}.${CONTIG}.t$TYPE"
		echo "${awk_stage} | ${sort_stage} | ${cluster_stage}" >> ${OUTPUT_DIR}/sort_script
	done

	# Links: EDGE lines of all four cluster files, screened against the contig
	# breaks, then filtered on column 5 (>= min_mps) and column 8 (< maxlen).
	grep_stage="grep -h EDGE ${CLUSTER_FILE}.${CONTIG}.t?"
	screen_stage="${SCREEN_CONTIG_BREAKS_EXEC} ${SCREEN_CONTIG_BREAKS_OPTS}"
	filter_stage="awk -v min_mps=${MIN_MPS_PER_CLUSTER} -v maxlen=${LINK_LEN_CUTOFF} '{ if ((\$5 >= min_mps) && (\$8 < maxlen)) print \$0 }' > ${LINK_FILE}.${CONTIG}"
	echo "${grep_stage} | ${screen_stage} | ${filter_stage}" >> ${OUTPUT_DIR}/link_script
done < ${CONTIG_NAME_FILE}

# Build cluster indices: queue one idx_build command in indexclus_script for
# each contig and each of the four cluster types.  The cluster file
# <CLUSTER_FILE>.<contig>.t<type> is the data, and the index is written next to
# it in the output directory as clusters.<contig>.t<type>.idx.
while read CONTIG; do
	for TYPE in {0..3}; do
		DATA=${CLUSTER_FILE}.${CONTIG}.t${TYPE}
		INDEX=${OUTPUT_DIR}/clusters.${CONTIG}.t${TYPE}.idx
		echo "${CNVER_FOLDER}/cluster/idx_build ${INDEX} ${DATA} delim HEADER1" >> ${OUTPUT_DIR}/indexclus_script
	done
done < ${CONTIG_NAME_FILE}

# Stop here: at this point the script has only written the command lists
# (conc_script, sort_script, link_script, indexmaps_script, indexclus_script)
# into the output directory; it has not run any of them.
# NOTE(review): because of this unconditional exit, the five lines below are
# never executed.  Presumably the generated scripts are meant to be run
# separately, in the order listed - confirm before removing or re-enabling
# these lines.
exit

${OUTPUT_DIR}/conc_script
${OUTPUT_DIR}/sort_script
${OUTPUT_DIR}/link_script
${OUTPUT_DIR}/indexmaps_script
${OUTPUT_DIR}/indexclus_script
