#!/bin/bash
#

# Summarise how the TSRs listed in <TSRsetFile> relate to the nuclear
# protein-coding gene annotations found in ../ScGENOME/*gff3.
#
# Usage: <this script> <TSRsetFile>
#
# Fields in the yeast (annotated) TSR *.txt files (tab-separated):
#
# 1-4	seq, start, end, strand
# 5-6	nTSSs, nTAGs
# 7-11	tsrPeak, tsrWdth, tsrTrq, tsrSI, tsrMSI
# 12	featureID
#
# Result files are written to the current directory and are all named
# tmp_<basename of TSRsetFile>_<suffix>.
#
# NOTE: 'set -e' / 'pipefail' are deliberately not enabled: grep exits with
# status 1 when it selects no lines, which is a normal outcome here.

#######################################
# Print the featureID column (field 12) of a TSR table, without the header
# line and without TSRs whose featureID starts with "NA".
# Arguments: $1 - path to a tab-separated TSR table
# Outputs:   one featureID per TSR row, on stdout
#######################################
annotated_feature_ids() {
  cut -f12 -- "$1" | grep -E -v "^featureID|^NA"
}

#######################################
# Print the rows of a "featureID<TAB>count" table whose count satisfies an
# awk comparison.
# Arguments: $1 - awk condition on field 2 (e.g. '$2 == 2')
#            $2 - path to the table
# Outputs:   the selected rows, on stdout
#######################################
select_by_tsr_count() {
  awk -F'\t' "$1" "$2"
}

#######################################
# Entry point: compute and print all statistics for one TSR set file.
# Arguments: $1 - path to the annotated TSR set file
# Outputs:   the report on stdout; diagnostics on stderr; tmp_* result files
#            in the current directory
# Returns:   0 on success, 1 if the TSR file is unreadable, 2 on usage error
#######################################
main() {
  if [[ $# -lt 1 || -z "$1" ]]; then
    printf 'Usage: %s <TSRsetFile>\n' "${0##*/}" >&2
    return 2
  fi

  local tsr_file=$1
  if [[ ! -r "$tsr_file" ]]; then
    printf '%s: cannot read TSR set file: %s\n' "${0##*/}" "$tsr_file" >&2
    return 1
  fi

  # Only the basename goes into the output names, so that a TSR file given
  # with a directory component still yields writable paths in the cwd.
  local prefix="tmp_${tsr_file##*/}"
  local inputfile="${prefix}_nuclear_tsr.txt"
  local all_genes="${prefix}_all_pcd_nuclear_genes"
  local tsr_genes="${prefix}_tsr_pcd_nuclear_genes"
  local no_tsr_genes="${prefix}_noTSR_pcd_nuclear_genes"
  local gene_counts="${prefix}_pcd_nuclear_geneNBRtsr"
  local genes_2tsr="${prefix}_pcd_nuclear_gene2tsr"
  local genes_3tsr="${prefix}_pcd_nuclear_gene3tsr"
  local genes_4tsr="${prefix}_pcd_nuclear_gene4tsr"
  local -a gff3_files=( ../ScGENOME/*gff3 )
  local m n c

  # Keep nuclear TSRs only: drop every row that mentions "Mito" anywhere.
  grep -v "Mito" -- "$tsr_file" > "$inputfile"

  # Number of TSRs associated with _nuclear_ protein_coding gene annotations:
  m=$(annotated_feature_ids "$inputfile" | wc -l)

  # Number of TSRs not associated with _nuclear_ protein_coding gene
  # annotations:
  n=$(cut -f12 "$inputfile" | grep "^NA" | wc -l)

  # Check: all rows except the header; should equal m + n.
  c=$(tail -n +2 "$inputfile" | wc -l)

  printf "Number of TSRs     associated with _nuclear_ protein_coding gene annotations:\t %s\n" "$m"
  printf "Number of TSRs not associated with _nuclear_ protein_coding gene annotations:\t %s\n" "$n"
  printf "Check: wc -l on 'inputfile' minus 1 (header)                              :\t%s\n" "$c"
  printf "\n"

  # Number of _nuclear_ protein_coding gene annotations: take the first
  # attribute (ID=...) of every non-Mito protein_coding gene record and keep
  # what follows the '='.
  if [[ -e "${gff3_files[0]}" ]]; then
    grep -h -v "^Mito" -- "${gff3_files[@]}" \
      | cut -f9- \
      | grep "biotype=protein_coding" \
      | grep "^ID=gene:" \
      | cut -d";" -f1 \
      | cut -d"=" -f2 \
      | sort -u > "$all_genes"
  else
    # Without a match the glob stays literal; continue with an empty gene
    # list so the TSR-only statistics are still reported.
    printf '%s: warning: no annotation file matches ../ScGENOME/*gff3\n' \
      "${0##*/}" >&2
    : > "$all_genes"
  fi
  m=$(wc -l < "$all_genes")
  printf "Number of _nuclear_ protein_coding gene annotations                                 :\t%s\n" "$m"

  # Number of _nuclear_ protein_coding gene annotations associated with at
  # least one TSR:
  annotated_feature_ids "$inputfile" | sort -u > "$tsr_genes"
  m=$(wc -l < "$tsr_genes")
  printf "Number of _nuclear_ protein_coding gene annotations associated with at least one TSR:\t %s\n" "$m"

  # Annotated genes that never show up as a featureID. Both lists come out of
  # 'sort -u' in the same locale, which is what comm requires.
  comm -23 "$all_genes" "$tsr_genes" > "$no_tsr_genes"
  n=$(wc -l < "$no_tsr_genes")
  printf "Number of _nuclear_ protein_coding gene annotations not associated with any TSR     :\t %s\n" "$n"

  # Genes associated with multiple TSRs: count how often each featureID
  # occurs, in one pass. Exact matching on the ID keeps a gene whose name is a
  # prefix of another (YAL001C / YAL001C-A) from being counted or listed
  # twice.
  annotated_feature_ids "$inputfile" \
    | awk '{ hits[$0]++ } END { for (id in hits) print id "\t" hits[id] }' \
    | sort -t $'\t' -k2,2 -nr > "$gene_counts"

  select_by_tsr_count '$2 == 2' "$gene_counts" > "$genes_2tsr"
  m=$(wc -l < "$genes_2tsr")
  printf "Number of _nuclear_ protein_coding gene annotations associated with exactly   2 TSRs:\t %s\t(file: %s)\n" "$m" "$genes_2tsr"

  select_by_tsr_count '$2 == 3' "$gene_counts" > "$genes_3tsr"
  m=$(wc -l < "$genes_3tsr")
  printf "Number of _nuclear_ protein_coding gene annotations associated with exactly   3 TSRs:\t %s\t(file: %s)\n" "$m" "$genes_3tsr"

  select_by_tsr_count '$2 > 3' "$gene_counts" > "$genes_4tsr"
  m=$(wc -l < "$genes_4tsr")
  printf "Number of _nuclear_ protein_coding gene annotations associated with more than 3 TSRs:\t %s\t(file: %s)\n" "$m" "$genes_4tsr"
}

main "$@"
