#!/bin/bash -x
#
# Builds a tab-separated table that joins gene coordinates with the
# sorbitol-treated expression values and flags the genes listed in a
# gene-group file.

# Abort on unset variables, on a failing pipeline stage and on a failing
# command; echo every command as it is executed.
set -o nounset -o pipefail -o errexit -o xtrace

# Base name of the result files (".txt" / ".xls" are appended later).
outfile='geneCoordinates-expressionFoldChanges-SorbitolTreatedData'

# Input files: expression table, gene coordinates, gene-group ID list.
suppTable1='SuppTable1_newIDs.txt'
geneCoordinates='../../geneAnnotations/extractedGeneCoordinates.txt'
geneGroup='sexualGenes-clusters1-2-3.txt'

# Pull the sorbitol-treated expression values out of the supplementary table,
# keyed by the ID in column 2. Columns used:
#	col  6  Early ring S (0hr)		col  8  Late ring S (12hr)
#	col 10  Early trophozoite S (18hr)	col 12  Late trophozoite S (24hr)
#	col 14  Early schizont S (30hr)		col 16  Late schizont S (36hr)
#	col 18  Merozoite	col 42  Gametocyte	col 44  Sporozoite
# The first line of the table and rows whose ID is "#N/A" are skipped. The
# last output column is the sum of the nine selected values. Output is
# tab-separated and sorted on the ID so that join can consume it.
awk -v FS='\t' -v OFS='\t' '
	NR > 1 && $2 != "#N/A" {
		total = $6 + $8 + $10 + $12 + $14 + $16 + $18 + $42 + $44
		print $2, $6, $8, $10, $12, $14, $16, $18, $42, $44, total
	}
' < "$suppTable1" | sort -k1,1 > exprs.tmp


# Join the gene coordinates (first line dropped) with the expression values
# on the gene ID, i.e. the first blank-separated field of both files.
# If an ID yields several joined rows, keep only the row with the highest
# total expression over all stages.
#
# The total is the last field of every joined row (it is the last column of
# exprs.tmp), so it is read as $NF instead of a fixed column number.
# The selection is done in awk rather than with `sort -n | sort -u` because:
#   * which of several equal-key lines `sort -u` keeps is not specified by
#     POSIX, so the "highest total wins" rule would depend on the sort
#     implementation;
#   * `sort -n` does not understand exponent notation (e.g. 1.2e+06), which
#     awk emits for large non-integer totals; awk compares them numerically.
# The final sort restores ID order, which the later join calls require.
awk 'NR > 1' < "$geneCoordinates" | sort -k1,1 > allGenes.tmp
join allGenes.tmp exprs.tmp \
	| awk '
		{
			total = $NF + 0
			if (!($1 in best) || total > best[$1]) {
				best[$1] = total
				row[$1] = $0
			}
		}
		END { for (id in row) print row[id] }
	' \
	| sort -k1,1 > joined.tmp

# Split the joined rows by membership in the gene group so that they can be
# marked with 1 (in the group) or 0 (all others) as an additional column:
#   overlap.tmp     joined rows whose ID is listed in $geneGroup
#   nonOverlap.tmp  every other joined row
#
# Only the first field of each non-empty gene-group line is used, and
# `sort -u` removes repeated IDs so a gene listed twice cannot produce
# duplicate output rows.
# `join -v 2` selects the rows of joined.tmp that have no partner in the gene
# group. Unlike counting IDs with `uniq -u`, this does not lose genes whose ID
# occurs more than once in the coordinates file.
awk 'NF { print $1 }' < "$geneGroup" | sort -u > geneGroup.tmp
join geneGroup.tmp joined.tmp > overlap.tmp
join -v 2 geneGroup.tmp joined.tmp > nonOverlap.tmp

# Write the final table to $outfile.txt: the header line first, then the
# data rows in sorted order.
# Rows from overlap.tmp get a trailing 1 (IsSexualGene), rows from
# nonOverlap.tmp a trailing 0. awk re-emits every blank-separated field
# followed by a tab, so the data rows are tab-separated like the header.
# Only the data rows are piped through sort: sorting the finished file would
# move the header to wherever "new_ID" collates among the gene IDs.
{
	printf 'new_ID\tchromosome\tgeneStart\tgeneEnd\tstrand\tmid10KbWindow\tgeneAliasORfamily\tgeneAnnot\tEarlyRingS\tLateRingS\tEarlyTrophozoiteS\tLateTrophozoiteS\tEarlySchizontS\tLateSchizontS\tMerozoite\tGametocyte\tSporozoite\tTotalExpression\tIsSexualGene\n'
	awk '{
		for (i = 1; i <= NF; i++) printf "%s\t", $i
		print (FILENAME == "overlap.tmp" ? 1 : 0)
	}' overlap.tmp nonOverlap.tmp | sort
} > "$outfile.txt"

# Provide the same table under an .xls name as well. This is a plain copy of
# the tab-separated text file, not a converted workbook.
cp -- "$outfile.txt" "$outfile.xls"

# Remove only the intermediate files this script creates. A bare `*.tmp`
# glob with -r would also delete unrelated .tmp files or directories that
# happen to live in the working directory. -f ignores names that do not exist.
rm -f -- exprs.tmp allGenes.tmp joined.tmp geneGroup.tmp overlap.tmp \
	uniques.tmp nonOverlap.tmp "$outfile.tmp"

exit

