#!/usr/bin/bash

# Per-run settings read by the steps below; edit them before running.
#   genome_build      - handed to the helper scripts as their build argument
#   is_patient_female - above 0: also build the chromosome 23 (X) R script
#   alleles_yes       - above 0: run vcf_prelist.allele.py instead of
#                       vcf_prelist.number.py
#   scripts_loc       - directory that holds the helper scripts
genome_build="38"
is_patient_female="0"
alleles_yes="0"
scripts_loc="/mnt/ix1/Resources/scripts/megahaplotyping_scripts"

# 0:  create temp directory for sorting etc. as needed
#   -p: a rerun in the same directory must not fail because loc_temp is
#   already there.
mkdir -p loc_temp

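# 1:  Run 10X longranger on the sequenced data (done separately; no command in this script runs it).  Outputs are individual vcf files, one per sample, which step 2 picks up from the current directory.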

# 2:  Filter vcfs to include SNVs only.
#   Each <name>vcf in the current directory is run through snvs_only.pl, only
#   lines containing PASS are kept, and the result goes to <name>filt.vcf.
for i in *vcf; do
  # An unmatched glob stays literal; report it instead of writing a file
  # that is literally named "*filt.vcf".
  if [ ! -e "$i" ]; then
    echo "step 2: no files match *vcf" >&2
    continue
  fi
  # Files written by steps 2-5 of an earlier run also end in "vcf"; skip
  # them so a rerun does not filter (and later merge) its own outputs.
  case "$i" in
    *filt.vcf | *num.vcf | *simp.vcf | merged.vcf) continue ;;
  esac
  "${scripts_loc}/snvs_only.pl" "$i" | grep PASS > "$(basename -- "$i" vcf)filt.vcf"
done

# 3:  Modify to allow combination.  Make chroms. numeric, filter contigs, sort.
#   Strips a leading "chr", renames a leading X to 23, drops lines starting
#   with GL, NC or Y, then sorts numerically on columns 1 and 2 (loc_temp is
#   given to sort for its temporary files).  <name>vcf becomes <name>num.vcf.
for i in *filt.vcf; do
  # An unmatched glob stays literal; report it instead of writing a file
  # that is literally named "*filt.num.vcf".
  if [ ! -e "$i" ]; then
    echo "step 3: no files match *filt.vcf" >&2
    continue
  fi
  sed -e 's/^chr//' -e 's/^X/23/' "$i" \
    | grep -v -e '^GL' -e '^NC' -e '^Y' \
    | sort -k 1,1 -k 2,2 -n -T loc_temp > "$(basename -- "$i" vcf)num.vcf"
done

# 4:  Simplify genotype information of vcf.
#   Each <name>vcf from step 3 is passed to simplify_vcf_genotype.pl and its
#   stdout is written to <name>simp.vcf.
for i in *filt.num.vcf; do
  # An unmatched glob stays literal; report it instead of writing a file
  # that is literally named "*filt.num.simp.vcf".
  if [ ! -e "$i" ]; then
    echo "step 4: no files match *filt.num.vcf" >&2
    continue
  fi
  "${scripts_loc}/simplify_vcf_genotype.pl" "$i" > "$(basename -- "$i" vcf)simp.vcf"
done

# 5:  Merge the simplified vcfs (two or three files) into merged.vcf.
#     Only positions with calls in all vcfs are retained.
simp_vcfs=( *simp.vcf )
${scripts_loc}/integrate_vcfs.py "${simp_vcfs[@]}" \
  > merged.vcf

# 6:  Convert the merged vcf into a format that is simpler to read when
#     building haplotype blocks.  This produces a file which ends in
#     'phased_basic' (read by step 7 as merged.phased_basic).
python3 ${scripts_loc}/vcf_to_basic_barcode_info.py merged.vcf

# 7:  Generate haplotype blocks.
python3 ${scripts_loc}/basic_to_haplotype_blocks.double.py --size 2 merged.phased_basic \
  > merged.blocks.txt

# 8:  Create R scripts that will produce pictures of haplotype blocks and
#     densities, one make_rscr.double.pl call per autosome (1 through 22).
for chrom in {1..22}; do
  ${scripts_loc}/make_rscr.double.pl --build ${genome_build} merged.blocks.txt $chrom 100
done
#   Chromosome 23 (X) is only added when the patient is flagged as female.
if ! [ ${is_patient_female} -gt 0 ]; then
  echo "no X chrom processed"
else
  ${scripts_loc}/make_rscr.double.pl --build ${genome_build} merged.blocks.txt 23 100
fi

# 9:  Run R scripts:
#   Every *.r file in the current directory is fed to R on stdin.  The
#   combined stdout of all runs goes to r_output and the combined stderr
#   (including the warning below) goes to r_errout.
for i in *.r; do
  # An unmatched glob stays literal; say so plainly rather than letting the
  # input redirect fail on a file called "*.r".
  if [ ! -e "$i" ]; then
    echo "step 9: no files match *.r" >&2
    continue
  fi
  # Quoted so a script name containing whitespace is not an ambiguous redirect.
  R --save < "$i"
done 1> r_output 2> r_errout

# 10:  Produce list of which arms are imbalanced, and then filter, and create bed version
#   r_output (from step 9) is parsed into ttest_pvals.txt, sorted numerically
#   on column 1.
"${scripts_loc}/get_t_test_vals.pair.pl" r_output | sort -k 1,1 -n > ttest_pvals.txt

"${scripts_loc}/filter_t_test_vals.pl" --filt 0.001 ttest_pvals.txt > ttest_filtered.txt
# Overwrite rather than append: with ">>" a rerun stacked the new intervals
# on top of the previous run's, and step 12 intersects against this file.
"${scripts_loc}/ttest_to_bed_file.py" ttest_filtered.txt "${genome_build}" > ttest_filtered.bed

# 11:  Find imbalanced regions in each chromosome and list them:
#   Each <name>txt matching *thresh*txt is passed to
#   aneuploid_regions_from_thresh_info.py; its stdout goes to <name>grp.txt.
for i in *thresh*txt; do
  # An unmatched glob stays literal; report it instead of writing a file
  # that is literally named "*thresh*grp.txt".
  if [ ! -e "$i" ]; then
    echo "step 11: no files match *thresh*txt" >&2
    continue
  fi
  # This step's own outputs also match *thresh*txt; skip them so a rerun
  # does not produce <name>grp.grp.txt files that step 12 would then read.
  case "$i" in
    *grp.txt) continue ;;
  esac
  "${scripts_loc}/aneuploid_regions_from_thresh_info.py" "$i" > "$(basename -- "$i" txt)grp.txt"
done

# 12:  Find imbalanced regions where large contiguous blocks occur and
#      convert them to bed format so intersectBed can be used.
#   Keep rows whose column 3 minus column 2 exceeds 1000000, sort numerically
#   on columns 1 and 2, then emit column 1, column 2 minus one, column 3.
cat *tum*grp.txt \
  | awk '$3 - $2 > 1000000' \
  | sort -k 1,1 -k 2,2 -n \
  | awk 'BEGIN { OFS = "\t" } { print $1, $2 - 1, $3 }' \
  > passing_threshold_regions.bed
# Keep only regions passing the ttest (i.e. within called arms).
intersectBed -a passing_threshold_regions.bed -b ttest_filtered.bed \
  > passing_and_called_regions.bed
# Back to txt (1-based) by adding one to column 2, to avoid confusion hereafter.
awk 'BEGIN { OFS = "\t" } { print $1, $2 + 1, $3 }' passing_and_called_regions.bed \
  > passing_and_called_regions.txt
# Attach arm info to the passing-and-called regions so the next command can collapse them.
${scripts_loc}/identify_arm.py passing_and_called_regions.txt ${genome_build} \
  > passing_and_called_regions.arms.txt
# Sum by arm.
${scripts_loc}/sum_megablocks_by_arm.py passing_and_called_regions.arms.txt ${genome_build} \
  > passing_and_called_regions.coll.txt

# 13:  Relate the haplotype proportion on each arm to that of the genome.
#   r_output is parsed by get_barcode_cnv_vals_num.pair.pl and the result is
#   sorted numerically on column 1.
${scripts_loc}/get_barcode_cnv_vals_num.pair.pl r_output \
  | sort -k 1,1 -n \
  > haplotype_cnv_vals.txt

# 14:  Generate mega-haplotypes.
#   a: Get major/minor info for each haplotype block.
"${scripts_loc}/blocks_to_list.py" merged.blocks.txt > merged.blocks.lists

#   b: Convert vcf data into list format.  This produces a file ending in 'lists', whether output gives alleles or genotype numbers
#	(the latter is for partial masking to reduce risk of identification):
if [ "${alleles_yes}" -gt 0 ]; then
  "${scripts_loc}/vcf_prelist.allele.py" merged.vcf
else
  "${scripts_loc}/vcf_prelist.number.py" merged.vcf
fi
# Remove a stray file named "0" if one is present.  -f keeps this quiet (and
# successful) when there is none, which a bare "rm 0" was not.
# NOTE(review): nothing visible in this script writes a file called "0" -
# presumably it came from an older "> 0" comparison or from a helper script;
# confirm, and drop this line if it is no longer needed.
rm -f -- 0

#   c: Build haplotypes for each large block from the two files made in a and b.
${scripts_loc}/haplotype_from_list_and_block_info.py merged.blocks.lists merged.lists 2 \
  > tum.haplotyped_across_tumor

#   d: Turn the haplotype file into bed (column 2 minus one becomes the start,
#      column 2 the end) so it can be intersected with the passing regions.
awk 'BEGIN { OFS = "\t" } { print $1, $2 - 1, $2, $3, $4 }' tum.haplotyped_across_tumor \
  > tum.haplotyped_across_tumor.bed

#   e: Intersect so that only positions in regions called as aneuploid remain.
intersectBed -a tum.haplotyped_across_tumor.bed -b passing_and_called_regions.bed \
  > tum.haplotyped_across_tumor.passing.bed

#   f: Convert the bed file back to the basic position + haplotype layout
#      (drops the bed start column).
awk 'BEGIN { OFS = "\t" } { print $1, $3, $4, $5 }' tum.haplotyped_across_tumor.passing.bed \
  > tum.haplotyped_across_tumor.passing.txt

