#!/bin/bash
#

#The script should be run with arguments:	<TSRsetFile>
#

#Fields in the yeast (annotated) TSRsetCombined.txt file (tab-separated, with
#one header line):
#
# 1-4	seq, start, end strand
# 5-6	nTSSs, nTAGs
# 7-11	tsrPeak, tsrWdth, tsrTrq, tsrSI, tsrMSI
# 12	featureID
#
#Usage: <script> <TSRsetFile> [GFF3File]
#
# TSRsetFile  annotated TSR table as described above.
# GFF3File    optional; defaults to every file matching ../HsGENOME/*gff3.
#
#Intermediate lists are written to tmp_<basename of TSRsetFile>_* in the
#current directory; the summary goes to stdout, diagnostics to stderr.
#
#Strict mode (set -e / pipefail) is deliberately not enabled: grep exits with
#status 1 when it selects no line, which is a legitimate "count is 0" outcome
#for the pipelines below.

# Print a one-line synopsis to stderr.
usage() {
  printf 'Usage: %s <TSRsetFile> [GFF3File]\n' "${0##*/}" >&2
}

# Print the number of lines read from stdin as a bare integer (no padding,
# unlike some wc implementations). Input always arrives on stdin so that odd
# file names are never interpreted by awk as options or assignments.
count_lines() {
  awk 'END { print NR }'
}

# annotated_feature_ids TSRFILE
# Print column 12 (featureID) of every TSR row that carries an annotation, one
# line per TSR. The header cell ("featureID") and unannotated rows ("NA") are
# matched as whole values, so an identifier that merely starts with "NA" is
# still treated as an annotation.
annotated_feature_ids() {
  cut -f12 -- "$1" | grep -Ev '^(featureID|NA)$'
}

# unannotated_tsr_count TSRFILE
# Print the number of TSR rows whose featureID is exactly "NA".
unannotated_tsr_count() {
  cut -f12 -- "$1" | grep -c '^NA$'
}

# Read GFF3 attribute columns on stdin and print the sorted, unique values of
# a leading "ID=transcript:..." attribute (e.g. "transcript:XYZ").
leading_transcript_ids() {
  grep '^ID=transcript:' | cut -d';' -f1 | cut -d'=' -f2 | sort -u
}

# mrna_ids GFF3FILE...
# Print the sorted, unique transcript IDs of all records containing a
# tab-delimited "mRNA" field.
mrna_ids() {
  cat -- "$@" | grep $'\tmRNA\t' | cut -f9- | leading_transcript_ids
}

# protein_coding_ids GFF3FILE...
# Print the sorted, unique transcript IDs of all records whose attributes
# contain biotype=protein_coding.
protein_coding_ids() {
  cat -- "$@" | cut -f9- | grep 'biotype=protein_coding' | leading_transcript_ids
}

# tsr_counts_per_mrna TSRFILE
# Print "<featureID><TAB><number of TSRs>" for every annotated featureID,
# highest count first. Counting is done in a single pass on the exact value of
# column 12, so an ID that is a substring of another ID (YAL016C versus
# YAL016C-A) is neither double-counted nor reported twice.
tsr_counts_per_mrna() {
  annotated_feature_ids "$1" \
    | awk '{ n[$0]++ } END { for (id in n) print id "\t" n[id] }' \
    | sort -t $'\t' -k2,2 -nr
}

# main TSRFILE [GFF3FILE]
# Produce the full report. Returns 2 on a usage error, 1 when an input file
# cannot be read, 0 otherwise.
main() {
  if (( $# < 1 )); then
    usage
    return 2
  fi

  local inputfile=$1
  local -a gff3_files
  if (( $# >= 2 )); then
    gff3_files=("$2")
  else
    # Without nullglob an unmatched pattern stays literal and is caught by the
    # readability check below.
    gff3_files=(../HsGENOME/*gff3)
  fi

  if [[ ! -r "$inputfile" ]]; then
    printf 'Error: cannot read TSR set file: %s\n' "$inputfile" >&2
    return 1
  fi
  if [[ ! -r "${gff3_files[0]}" ]]; then
    printf 'Error: cannot read GFF3 annotation: %s\n' "${gff3_files[0]}" >&2
    return 1
  fi

  # Use only the file name so the temporary files land in the current
  # directory even when the input is given with a directory component.
  local prefix="tmp_${inputfile##*/}"
  local m n c

  #Number of TSRs associated with mRNA annotations:
  #
  m=$(annotated_feature_ids "$inputfile" | count_lines)

  #Number of TSRs not associated with mRNA annotations:
  #
  n=$(unannotated_tsr_count "$inputfile")

  #Check: number of data rows (everything after the header line); should equal
  #the sum of the two counts above.
  #
  c=$(tail -n +2 < "$inputfile" | count_lines)

  printf 'Number of TSRs     associated with mRNA annotations:\t %s\n' "$m"
  printf 'Number of TSRs not associated with mRNA annotations:\t %s\n' "$n"
  printf "Check: wc -l on 'inputfile' minus 1 (header)       :\t%s\n" "$c"
  printf '\n'

  #Number of mRNA annotations:
  #
  mrna_ids "${gff3_files[@]}" > "${prefix}_all_mRNAs"
  m=$(count_lines < "${prefix}_all_mRNAs")
  printf 'Number of mRNA annotations                         :\t%s\n' "$m"

  #Number of protein_coding mRNA annotations:
  #
  protein_coding_ids "${gff3_files[@]}" > "${prefix}_all_pcd_mRNAs"
  m=$(count_lines < "${prefix}_all_pcd_mRNAs")
  printf 'Number of protein_coding mRNA annotations          :\t%s\n' "$m"

  #Number of mRNA annotations associated with at least one TSR:
  #
  annotated_feature_ids "$inputfile" | sort -u > "${prefix}_tsr_mRNAs"
  m=$(count_lines < "${prefix}_tsr_mRNAs")
  printf 'Number of mRNA annotations associated with at least one TSR:\t %s\n' "$m"

  #Number of mRNA annotations not associated with any TSR: lines that occur
  #only in the list of all mRNAs (both lists are sorted by the same sort).
  #
  comm -23 -- "${prefix}_all_mRNAs" "${prefix}_tsr_mRNAs" > "${prefix}_noTSR_mRNAs"
  n=$(count_lines < "${prefix}_noTSR_mRNAs")
  printf 'Number of mRNA annotations not associated with any TSR     :\t %s\n' "$n"

  #Genes associated with multiple TSRs:
  #
  tsr_counts_per_mrna "$inputfile" > "${prefix}_mRNANBRtsr"

  awk -F'\t' '$2 == 2' < "${prefix}_mRNANBRtsr" > "${prefix}_mRNA_2TSR"
  m=$(count_lines < "${prefix}_mRNA_2TSR")
  printf 'Number of mRNA annotations associated with exactly   2 TSRs:\t %s\t(file: %s)\n' \
    "$m" "${prefix}_mRNA_2TSR"

  awk -F'\t' '$2 == 3' < "${prefix}_mRNANBRtsr" > "${prefix}_mRNA_3TSR"
  m=$(count_lines < "${prefix}_mRNA_3TSR")
  printf 'Number of mRNA annotations associated with exactly   3 TSRs:\t %s\t(file: %s)\n' \
    "$m" "${prefix}_mRNA_3TSR"

  # The "_4TSR" file holds every mRNA with more than three TSRs.
  awk -F'\t' '$2 > 3' < "${prefix}_mRNANBRtsr" > "${prefix}_mRNA_4TSR"
  m=$(count_lines < "${prefix}_mRNA_4TSR")
  printf 'Number of mRNA annotations associated with more than 3 TSRs:\t %s\t(file: %s)\n' \
    "$m" "${prefix}_mRNA_4TSR"
}

main "$@"
