# Script to parse RepeatMasker outputs into more human-readable, IGV-compatible files
# for Malawi haplochromines.

# Root directory holding one sub-directory per sample.
OUTDIR="${HOME}/code/malawi_transposon/cloud/out/repmask/otherspecies_repbase_aug22/"

# Samples to process and their sizes. The two arrays are paired by index:
# SIZES[n] belongs to SAMPLES[n], so keep them in the same order.
# NOTE(review): the sizes are presumably genome lengths in bp -- confirm.
SAMPLES=(
    aulStu5.0
    mayZeb2.0_ensembl
    rhaChi1.0
    troMau2.0
    copChr1.0
    otoArg1.0
    rhaChi2.0
)
SIZES=(
    889966833
    957485262
    901598841
    912692380
    855116986
    871152569
    846032830
)

# Location of the helper python scripts; MODULE_DIR is not set in this file and
# has to be provided by the calling environment.
SUBMODULE_DIR="${MODULE_DIR}/repeatmasker/"

# For every sample: turn the raw RepeatMasker table into the .simple,
# .bedready.txt and .bedviz(.filtered).bed files, then draw the landscape plots.
# Reads SAMPLES, SIZES, OUTDIR and SUBMODULE_DIR; all outputs are written inside
# $OUTDIR/<sample>/.
for i in "${!SAMPLES[@]}"
do
    # get sample information (SAMPLES and SIZES are paired by index)
    SAMPLE=${SAMPLES[i]}
    SIZE=${SIZES[i]:-}

    # Without a size the plot script would receive its arguments shifted by one.
    if [ -z "$SIZE" ]; then
        echo "no entry in SIZES for $SAMPLE, skipping" >&2
        continue
    fi

    # Every path below is relative to the sample directory. If we cannot enter
    # it, skip the sample rather than write its files into whatever directory
    # we happen to be in (e.g. the previous sample's).
    if ! cd "$OUTDIR/$SAMPLE/"; then
        echo "cannot enter $OUTDIR/$SAMPLE/, skipping $SAMPLE" >&2
        continue
    fi
    # -p: do not complain when the script is re-run and fig/ already exists
    mkdir -p fig/

    echo "processing... $SAMPLE"

    # NOTE(review): an earlier comment here said that certain species need to
    # have chromosomes renamed; nothing in this loop renames anything -- confirm
    # whether that happens inside the parse scripts or is still to be done.
    echo "repmask to simple"
    python "$SUBMODULE_DIR/parse_repmask_to_simple.py" < "raw_output/$SAMPLE.ref.fa.out" | \
    python "$SUBMODULE_DIR/parse_repeatfamily_column.py" > "$SAMPLE.simple"

    echo "simple to bedready"
    python "$SUBMODULE_DIR/parse_simple_to_bedready.py" < "$SAMPLE.simple" > "$SAMPLE.bedready.txt"

    echo "bedready to viz"
    python "$SUBMODULE_DIR/parse_bedready_for_viz.py" < "$SAMPLE.bedready.txt" > "$SAMPLE.bedviz.bed"

    # Drop simple repeats / low-complexity lines and keep only rows whose 5th
    # field is <= 11 (NOTE(review): meaning of that column is defined by
    # parse_bedready_for_viz.py -- confirm).
    echo "filter viz"
    grep -E -v "Simple_repeat|Low_complexity" < "$SAMPLE.bedviz.bed" | awk '$5<=11' > "$SAMPLE.bedviz_filtered.bed"

    echo "landscape plots"
    python "$SUBMODULE_DIR/plot_repmask_landscape.py" "$SAMPLE.bedready.txt" "$SIZE" fig/

    echo

done
