#!/bin/bash

# Mapping pipeline that relates the norg regions of a query genome to a reference genome
# (e.g. with IRGSP1.0 as the reference genome).
# The following files must be provided:
# - the norg merge5000 bed file of the query, read from 03.2.merge5000/<query_name>.bed
# - the fasta index (.fai) of the query genome, next to its fasta file (the script reads <query_fasta>.fai)
# - the reference genome fasta file
# - the query genome fasta file

# Use command line arguments
#### Required variables ####
## -r) $ref_fasta Reference genome fasta file
## -q) $query_fasta Query genome fasta file
## -a) $ref_name Reference genome name
## -b) $query_name Query genome name
## -m) $mummerdir Path to the mummer software bin directory, recommend using mummer4
##		Default location is $HOME/biosoft/mummer4.0.0beta2/bin
## -s) $step The step to run
##				step1: Only run bedtools slop and getfasta
##				step2: Only run slop nucmer alignment
##				step3: Only run steps after slop nucmer alignment
##				all: Run all steps

# Without a (non-empty) first argument there is nothing to do: show the
# one-line usage summary and stop.
[ -n "$1" ] || {
	echo "Help: $0 -r ref_fasta -q query_fasta -a ref_name -b query_name -m mummerdir -s step1|step2|step3|all"
	exit
}

# Parse command line arguments.
# Every recognised option takes exactly one value, which is stored in the
# global variable of the same meaning and echoed back so the run log records
# the settings:
#   -r ref_fasta   -q query_fasta   -a ref_name   -b query_name
#   -m mummerdir   -s step
# "--" ends option parsing; any other word only triggers a warning.
# The loop is driven by the number of remaining arguments rather than by "$1"
# being non-empty, so an empty-string argument is reported like any other
# stray word instead of silently ending parsing and dropping later options.
while [ "$#" -gt 0 ]
do
	case "$1" in
		-r) ref_fasta="$2"
			echo "ref_fasta = $ref_fasta"
			shift;;
		-q) query_fasta="$2"
			echo "query_fasta = $query_fasta"
			shift;;
		-a) ref_name="$2"
			echo "ref_name = $ref_name"
			shift;;
		-b) query_name="$2"
			echo "query_name = $query_name"
			shift;;
		-m) mummerdir="$2"
			echo "mummerdir = $mummerdir"
			shift;;
		-s) step="$2"
			echo "step: $step"
			shift;;
		--) shift
			break;;
		*)	echo "Warning: $1 is not an option!";;
	esac
	# Drop the option word itself (its value, if any, was shifted above).
	shift
done

# Set default step if not provided.
# The expansion is quoted so an empty or whitespace-containing value is
# tested as one word instead of breaking the test expression.
if [ -z "$step" ]
then
	step="all"
	echo "default step is 'all'"
fi

# Echo the steps that will be executed.  An unknown step aborts the script
# with a non-zero status so that callers can detect the failure.
case "$step" in
	all)
		echo "All step: bedtools, nucmer, determine norg type"
		;;
	step1)
		echo "bedtools slop & getfasta only"
		;;
	step2)
		echo "nucmer only"
		;;
	step3)
		echo "steps after step1 and 2"
		;;
	*)
		echo "ERROR: no such step!"
		exit 1
		;;
esac

# Check for required inputs based on the selected step.
# The reference fasta and the reference name may be omitted for step1 only
# (run_step1 works on the query files alone); the query fasta and the query
# name are always required.  Every fatal case exits with a non-zero status.
if [ -z "$ref_fasta" ]
then
	echo "No reference fasta file!"
	if [ "$step" != "step1" ]
	then
		exit 1
	fi
fi

if [ -z "$query_fasta" ]
then
	echo "No query genome fasta file!"
	exit 1
fi

if [ -z "$ref_name" ]
then
	echo "No reference name!"
	# The closing bracket must be its own word, otherwise the test itself
	# fails and the script carries on without a reference name.
	if [ "$step" != "step1" ]
	then
		exit 1
	fi
fi

if [ -z "$query_name" ]
then
	echo "No query name!"
	exit 1
fi

# Set default mummer directory if not provided.
if [ -z "$mummerdir" ]
then
	mummerdir=$HOME/biosoft/mummer4.0.0beta2/bin
	echo "default mummerdir = $mummerdir"
fi
# Put the chosen MUMmer directory first on PATH whether it is the default or
# was given with -m, so that any MUMmer tool invoked by bare name later in
# the script resolves to this installation as well.
export PATH="$mummerdir:$PATH"


#########pipeline#########

### Directory structure ###
# slop300/ holds the renamed and padded query intervals and their sequences;
# files produced for one reference/query pair go under <ref_name>/<query_name>/.
# "mkdir -p" creates the intermediate <ref_name> directory as well, and the
# quoting keeps names containing spaces as single paths.
# The per-reference directory is skipped when no reference name was given
# (allowed for step1), because an empty ref_name would turn the path into
# "/<query_name>" at the filesystem root.
mkdir -p -- slop300
if [ -n "$ref_name" ]
then
	mkdir -p -- "$ref_name/$query_name"
fi

# Inputs and outputs of step1 (bedtools slop 300 && bedtools getfasta).
# step2/step3 do not need to run that step again when its outputs exist.
ori_query_norg_bed="03.2.merge5000/${query_name}.bed"
rename_query_norg_bed="slop300/${query_name}.named.bed"
slop300_query_norg_bed="slop300/${query_name}.slop300.bed"
slop300_query_norg_fa="slop300/${query_name}.slop300.fa"
query_fai="${query_fasta}.fai"

# Step 1: prepare the padded query sequences that step2 aligns.
#   1. scripts/rename_mergebed.pl rewrites the merge5000 bed into
#      $rename_query_norg_bed (presumably giving every interval a name;
#      the script is not part of this file).
#   2. bedtools slop -b 300 pads every interval by 300 bp on both sides,
#      using the query .fai as the genome file.
#   3. bedtools getfasta -name extracts the padded sequences.
# Globals read: ori_query_norg_bed, rename_query_norg_bed,
#               slop300_query_norg_bed, slop300_query_norg_fa,
#               query_fai, query_fasta
# Returns: 0 on success; 1 when the input bed is missing or any command
#          fails, so that "run_step1 && run_step2" stops instead of
#          continuing with missing or stale step1 outputs.
function run_step1(){
	echo "step1 start..."
	if [ ! -e "$ori_query_norg_bed" ]
	then
		echo "ERROR: no such file: $ori_query_norg_bed !"
		return 1
	fi
	perl scripts/rename_mergebed.pl "$ori_query_norg_bed" > "$rename_query_norg_bed" || return 1
	bedtools slop -b 300 -i "$rename_query_norg_bed" -g "$query_fai" > "$slop300_query_norg_bed" || return 1
	bedtools getfasta -fi "$query_fasta" -bed "$slop300_query_norg_bed" -name -fo "$slop300_query_norg_fa" || return 1
	echo "step1 end"
}

# Output names of the step2 alignment.  All files share the prefix
#   <ref_name>/<query_name>/<query_name>.slop300.<ref_name>
# and differ only in their extension.
nucmer_prefix="${ref_name}/${query_name}/${query_name}.slop300.${ref_name}"
step2_ori_delta="${nucmer_prefix}.delta"
step2_ori_coords="${nucmer_prefix}.coords"
step2_q_coords="${nucmer_prefix}.q.coords"

# Step 2: align the padded query sequences to the reference with nucmer.
#   nucmer -p <prefix>        writes <prefix>.delta
#   show-coords -rclT -o      tabulates the raw alignments  -> $step2_ori_coords
#   delta-filter -q           filters the delta by query    -> <prefix>.q.delta
#   show-coords -rclT -o      tabulates the filtered set    -> $step2_q_coords
# Globals read: mummerdir, nucmer_prefix, ref_fasta, slop300_query_norg_fa,
#               step2_ori_delta, step2_ori_coords, step2_q_coords
# Exits the script with status 1 when the step1 fasta is missing.
# Returns: 0 on success, 1 as soon as one of the MUMmer commands fails, so
#          later steps are not run on incomplete alignment files.
function run_step2(){
	echo "step2 start..."
	if [ ! -e "$slop300_query_norg_fa" ]
	then
		echo "ERROR: no such file: $slop300_query_norg_fa!"
		exit 1
	fi
	"$mummerdir/nucmer" -p "$nucmer_prefix" "$ref_fasta" "$slop300_query_norg_fa" || return 1
	"$mummerdir/show-coords" -rclT -o "$step2_ori_delta" > "$step2_ori_coords" || return 1
	"$mummerdir/delta-filter" -q "$step2_ori_delta" > "${nucmer_prefix}.q.delta" || return 1
	"$mummerdir/show-coords" -rclT -o "${nucmer_prefix}.q.delta" > "$step2_q_coords" || return 1
	echo "step2 end"
}

# Directory that receives the step3 type assignment files.
# The path is quoted so names with spaces stay one directory, and nothing is
# created when ref_name is empty, since the path would then start at the
# filesystem root ("/<query_name>/check_type").
step3_dir="$ref_name/$query_name/check_type"
if [ -n "$ref_name" ]
then
	mkdir -p -- "$step3_dir"
fi

# Step 3: sort the query norgs into mapping types using the step2 alignment
# and prepare a second alignment for everything left over.
#   type1 - rows of the query-filtered coords file that show-coords -o marks
#           with CONTAINS; written as "<col12> <col1> <col2> <col13> 0"
#           (with show-coords -rclT these should be reference tag, reference
#           start/end and query tag — confirm against the MUMmer manual)
#   type2 - show-tiling -a without a -g option
#   type3 - show-tiling -a -g 5000
#   type4 - show-tiling -a -g 10000
#   type5 - show-tiling -a -g 20000
#   type6 - lines of the renamed query bed matching none of the type1-5
#           names: padded by 100 bp, extracted, aligned again with nucmer and
#           summarised with show-coords and show-tiling -a -g 20000
# For type2-5 the tiling lines that match a name already assigned to an
# earlier type are dropped before scripts/get_tiling_length.pl turns the rest
# into a bed file; column 4 of that bed becomes the type's name list.
# NOTE(review): the name lists are passed to "grep -v -f", i.e. they act as
# unanchored regular expressions.  A name that is a prefix of another one
# would exclude both, and an empty line in a name list excludes every line —
# confirm the names produced upstream cannot collide.
# The MUMmer tools are called through $mummerdir, like in run_step2, so the
# same installation is used whether or not it is on PATH.
# Globals read: mummerdir, step3_dir, step2_ori_delta, step2_ori_coords,
#               step2_q_coords, rename_query_norg_bed, query_fai, query_fasta,
#               ref_fasta, ref_name
# Globals written: the type*_ variables naming the files of each type
# Exits the script with status 1 when a step2 output file is missing.
function run_step3(){
	echo "step3 start..."
	if [ ! -e "$step2_ori_delta" ] || [ ! -e "$step2_ori_coords" ] || [ ! -e "$step2_q_coords" ]
	then
		echo "ERROR: no enough step2 files!"
		exit 1
	fi

	#type1 coords [CONTAINS]
	echo "Type1"
	type1_prefix=$step3_dir/type1
	type1_name=${type1_prefix}.name
	type1_bed=${type1_prefix}.bed
	awk '/CONTAINS/ {print $12"\t"$1"\t"$2"\t"$13"\t0"}' "$step2_q_coords" > "$type1_bed"
	awk '/CONTAINS/ {print $13}' "$step2_q_coords" > "$type1_name"
	echo "Type1 end"

	echo "Type2"
	type2_prefix=${step3_dir}/type2
	type2_tiling=${step2_ori_delta}.type2.tiling
	"$mummerdir/show-tiling" -a "$step2_ori_delta" > "$type2_tiling"
	type2_name=${type2_prefix}.name
	type2_bed=${type2_prefix}.bed
	grep -v -f "$type1_name" "$type2_tiling" | perl scripts/get_tiling_length.pl > "$type2_bed"
	awk '{print $4}' "$type2_bed" > "$type2_name"
	echo "Type2 end"

	echo "Type3"
	type3_prefix=${step3_dir}/type3
	type3_tiling=${step2_ori_delta}.type3.tiling
	"$mummerdir/show-tiling" -a -g 5000 "$step2_ori_delta" > "$type3_tiling"
	type3_name=${type3_prefix}.name
	type3_bed=${type3_prefix}.bed
	cat "$type1_name" "$type2_name" > "${step3_dir}/type1-2.name"
	grep -v -f "${step3_dir}/type1-2.name" "$type3_tiling" | perl scripts/get_tiling_length.pl > "$type3_bed"
	awk '{print $4}' "$type3_bed" > "$type3_name"
	echo "Type3 end"

	echo "Type4"
	type4_prefix=${step3_dir}/type4
	type4_tiling=${step2_ori_delta}.type4.tiling
	"$mummerdir/show-tiling" -a -g 10000 "$step2_ori_delta" > "$type4_tiling"
	type4_name=${type4_prefix}.name
	type4_bed=${type4_prefix}.bed
	cat "$type1_name" "$type2_name" "$type3_name" > "${step3_dir}/type1-3.name"
	grep -v -f "${step3_dir}/type1-3.name" "$type4_tiling" | perl scripts/get_tiling_length.pl > "$type4_bed"
	awk '{print $4}' "$type4_bed" > "$type4_name"
	echo "Type4 end"

	echo "Type5"
	type5_prefix=${step3_dir}/type5
	type5_tiling=${step2_ori_delta}.type5.tiling
	"$mummerdir/show-tiling" -a -g 20000 "$step2_ori_delta" > "$type5_tiling"
	type5_name=${type5_prefix}.name
	type5_bed=${type5_prefix}.bed
	cat "$type1_name" "$type2_name" "$type3_name" "$type4_name" > "${step3_dir}/type1-4.name"
	grep -v -f "${step3_dir}/type1-4.name" "$type5_tiling" | perl scripts/get_tiling_length.pl > "$type5_bed"
	awk '{print $4}' "$type5_bed" > "$type5_name"
	echo "Type5 end"

	echo "prepare Type6 files"
	type6_dir=${step3_dir}/type6
	mkdir -p -- "$type6_dir"
	# Unlike the intermediate lists, the combined type1-5 list is de-duplicated.
	sort -u "$type1_name" "$type2_name" "$type3_name" "$type4_name" "$type5_name" > "${step3_dir}/type1-5.name"
	type6_ori_bed=${type6_dir}/type6.bed
	grep -v -f "${step3_dir}/type1-5.name" "$rename_query_norg_bed" > "$type6_ori_bed"
	type6_slop100_bed=${type6_dir}/type6.slop100.bed
	type6_slop100_fa=${type6_dir}/type6.slop100.fa
	bedtools slop -b 100 -i "$type6_ori_bed" -g "$query_fai" > "$type6_slop100_bed"
	bedtools getfasta -fi "$query_fasta" -bed "$type6_slop100_bed" -fo "$type6_slop100_fa"
	type6_nucmer_prefix=${type6_dir}/type6.slop100.$ref_name
	type6_delta=${type6_nucmer_prefix}.delta
	type6_coords=${type6_nucmer_prefix}.coords
	type6_tiling=${type6_nucmer_prefix}.tiling
	"$mummerdir/nucmer" -p "$type6_nucmer_prefix" "$ref_fasta" "$type6_slop100_fa"
	"$mummerdir/show-coords" -rclT -o "$type6_delta" > "$type6_coords"
	"$mummerdir/show-tiling" -a -g 20000 "$type6_delta" > "$type6_tiling"
	echo "Finished"
}

# Run the requested step(s).
#   step2 / step3 - reuse the step1 outputs when all three exist, otherwise
#                   re-create them with run_step1 first; then run only the
#                   requested step
#   step1         - run_step1 only
#   anything else - treated as "all": run_step1, run_step2 and run_step3 in
#                   order, stopping at the first one that fails
# Paths are quoted and tested one by one so that file names containing
# spaces are checked correctly instead of breaking the test expression.
case "$step" in
	step2|step3)
		# Check whether the step1 output files exist.
		if [ -e "$rename_query_norg_bed" ] && [ -e "$slop300_query_norg_bed" ] && [ -e "$slop300_query_norg_fa" ]
		then
			echo "step1 files checked!"
			echo "skip step1"
		else
			echo "Warning: loss step1 files, still run step1."
			# Without step1 outputs the later steps cannot work; stop if
			# run_step1 reports a failure.
			run_step1 || exit 1
		fi

		if [ "$step" = "step2" ]
		then
			run_step2
			# Plain exit: the script ends with run_step2's status.
			exit
		else
			run_step3
		fi
		;;
	step1)
		run_step1
		;;
	*)
		# Not step1/step2/step3, so it is "all".
		run_step1 && run_step2 && run_step3
		;;
esac

