#!/bin/bash
# Introductory help text, one output line per argument; the last line
# announces the miRBase download step that follows.
printf '%s\n' \
	'' \
	'quantify is a tool to create expression profiles of non-coding RNAs in particular miRNAs.' \
	'It is well designed to handle single cell data from hundreds of cells with or without umis.' \
	'As input it needs reference files depending on what is to be quantified. ' \
	'' \
	'E.g. quantifying mature miRNAs will require a fasta file with mature miRNA sequences and one with the hairpin sequences.' \
	'' \
	'These can be easily obtained from miRBase with the included mirbase.pl script by just typing' \
	'' \
	'## download mirbase21 and put files into ~/mirbase/21/'
# The download itself only runs when a non-empty first argument was given.
if [[ -n $1 ]]; then
	perl mirbase.pl 21
fi
# Announce, and (when a non-empty first argument was given) run, the
# extraction of the mouse (mmu) entries from the downloaded miRBase files.
printf "
## extract miRNAs of your species of interest
"
if [[ -n $1 ]]; then
	# Write the species subsets next to the downloaded miRBase files: the
	# quantify calls further down in this script read hairpin_mmu.fa and
	# mature_mmu.fa from $HOME/mirbase/21/, not from the current directory.
	mirbase_dir="$HOME/mirbase/21"
	extract_miRNAs.pl "$mirbase_dir/hairpin.fa.gz" mmu > "$mirbase_dir/hairpin_mmu.fa"
	extract_miRNAs.pl "$mirbase_dir/mature.fa.gz" mmu > "$mirbase_dir/mature_mmu.fa"
fi
# Describe the naming convention expected for the collapsed reads file and
# announce the quantify run.
# A quoted here-document is used so the text is printed verbatim; the former
# `printf"` (no space after printf) was parsed as one unknown command name and
# printed nothing but a "command not found" error.
cat <<'EOF'

## reads from your sequencing run should be in a fasta file in which reads are collapsed and which must follow the name convention
>STRING_INT_xINT

STRING should optimally describe your sample, optimally it is a short abbreviation of 3 letters 
INT    is a running number giving your read a unique id in combination with STRING
xINT   designates the number of times the sequence has been seen in your data

For instance, the id >liv_23_x100 would indicate a sequence from a sample called liv and which had been sequenced 100 times indicated by _x100.
If your id looks like >liv_23_x100_u1000 than the _x100 must indicate the number of UMIs that have been seen for the sequence and the _u1000
would indicate the number of times the sequence hase been sequenced. Thus 900 of your 1000 reads are PCR duplicates and 100 have been indeed
captured from your RNA sample. 

## now run quantify to get expression values for input reference files 
EOF
# Run quantify on the mouse miRBase references only when a non-empty first
# argument was given.
if [[ -n $1 ]]; then
	PRES="$HOME/mirbase/21/hairpin_mmu.fa"
	MAT="$HOME/mirbase/21/mature_mmu.fa"
	READS=all_fused.fa
	TAG=all_uniq
	# Quoted so a $HOME containing spaces does not split the arguments.
	quantify -p "$PRES" -m "$MAT" -r "$READS" -y "$TAG" -t Mouse
fi
# Print the overview of quantify's optional switches and announce the tRNA
# example.
# A quoted here-document is used so the text is printed verbatim; the former
# `printf"` (no space after printf) was parsed as one unknown command name and
# printed nothing but a "command not found" error.
# The first -C usage example previously misspelled the command as "qunatify".
cat <<'EOF'

## optimizations
-T -> Number of threads to use for mapping
-d -> dont create pdfs
-j -> dont create an output.mrd file
-F -> dont run RNAfold for structure prediction
-A -> dont include structure in pdfs
-G -> pdfs will only contain the coverage plot but not showing how every single read aligned 

## experimental
-N -> dont do read file conversion, will save lots of time but if your input is not properly formatted then this may have unpredictable consequences


## html modifications
-S -> sequences will not be given in the html output
-B -> 5p/3p expression values are put in the column instead of of separate columns  

## other options
-R -> dont calculate RPMs ,e.g. when using UMIs data does not need to be normalized by RPM
-Z -> only output RPMs in output
-W -> reads are weighed by the number of times they map to your reference
-X -> only works when W is given and will output also the number of weighed reads!
-U -> use only uniquely mapping reads

-C -> will allow to use either a config file as input for your reads given as -r but also to given multiple commata separated files as option -r
#quantify -C -r file1.fa,file2.fa
#quantify -C -r config.txt
## config.txt layout
## liv1       file1.fa
## liv2       file2.fa

-w -> will consider the whole precursor as the region to match => no mature file needed, useful if someone wants to count simply all reads mapping to a referene sequence 
-> this can be used to quantify miRNA precursors or lincRNAs or tRNAs or ribosomal RNAs or ....

## To qunatify tRNAs from mouse 
## download tRNAs
EOF
# Fetch the mm10 tRNA archive only when a non-empty first argument was given.
if [[ -n $1 ]]; then
	wget http://gtrnadb.ucsc.edu/genomes/eukaryota/Mmusc10/mm10-tRNAs.tar.gz
fi
# Announce the unpacking step: a blank line followed by the heading.
printf '%s\n' '' '## extract them '
# Unpack the downloaded tRNA archive only when a non-empty first argument
# was given.
if [[ -n $1 ]]; then
	tar -x -v -v -z -f mm10-tRNAs.tar.gz
fi
# Explain how the tRNA 5p/3p arms are defined for this example.
# A quoted here-document is used so the text is printed verbatim; the former
# `printf"` (no space after printf) was parsed as one unknown command name and
# printed nothing but a "command not found" error.
cat <<'EOF'

## make annotation for 3p end and 5p, 5p is defined here for being everything before the anticodon and 3p everything after the anticodon. This is an arbitrary and nonofficial division of a tRNA by the author
EOF
# Only when a non-empty first argument was given: run make_trna_file.pl on
# the extracted tRNA file, then concatenate the .3p.fa and .5p.fa files into
# the single "mature" fasta used by the quantify call below.
if [[ -n $1 ]]; then
	perl make_trna_file.pl mm10-tRNAs.ss.sort mmu
	cat mm10-tRNAs.ss.sort.3p.fa mm10-tRNAs.ss.sort.5p.fa > mm10-tRNAs.ss.sort.mature.fa
fi
# Announce the tRNA quantification step.
# A quoted here-document is used so the text is printed verbatim; the former
# `printf"` (no space after printf) was parsed as one unknown command name and
# printed nothing but a "command not found" error.
cat <<'EOF'

## run quantify
EOF
# Run quantify against the tRNA reference files only when a non-empty first
# argument was given.
if [[ -n $1 ]]; then
	PRES=mm10-tRNAs.ss.sort.reformat.fa
	MAT=mm10-tRNAs.ss.sort.mature.fa

	quantify -p "$PRES" -m "$MAT" -r all_fused.fa -F -N -T 4 -S -t mmu -y tRNA -R -A -G -W -B -U
fi

# Announce the per-cell untemplated-nucleotide analysis.
# A quoted here-document is used for two reasons: the former `printf"` (no
# space after printf) was parsed as an unknown command, and the text ends in
# "0.05%", which printf would reject as an invalid format conversion.
cat <<'EOF'

## get untemplated nucleotides and more info from your qunatified data
## now run the quantifier on it


## the -A option does not output RPM, we normalize it in R because reads per million mapped reads
## does not really make sense here with a capture efficiency of 0.05%
EOF
# This stage is gated on the SECOND positional argument being non-empty.
if [[ -n $2 ]]; then
	TS1="_uniq_mappers"

	# Quantify each of the 48 cells separately, timing every run.
	for i in {1..48}; do
		echo "cell $i"
		time quantify.pl -p "$HOME/mirbase/21/hairpin_mmu.fa" -m "$HOME/mirbase/21/mature_mmu.fa" -P -r "cell_${i}_collapsed.fa" -y "cell_${i}${TS1}" -W -U -A -t Mouse
	done

	# Collect the miRBase.mrd files whose path contains the run tag.
	find . -name "miRBase.mrd" | grep -F -- "$TS1" > "all_miRBase${TS1}.mrd"

	## parse the quantifier files
	# The list is read line by line on fd 3 so paths are not word-split and the
	# commands inside the loop keep the script's own stdin.
	while IFS= read -r mrd <&3; do
		[[ -n $mrd ]] || continue
		d=$(dirname "$mrd")
		perl untemplated_nt.pl -f "$mrd" > "${d}/untemplated.csv" 2> "${d}/untemplated_verbose"
	done 3< "all_miRBase${TS1}.mrd"

	## now make the images for all cells regarding the bias
	find . -name "untemplated.csv" | grep -F -- "$TS1" > "all_untemplated${TS1}.csv"

	while IFS= read -r csv <&3; do
		[[ -n $csv ]] || continue
		d=$(dirname "$csv")
		# The cell label is the first "cell_<digits>" found in the path.
		if [[ $csv =~ cell_[0-9]+ ]]; then
			./make_barplots.R "$csv" "$d" "${BASH_REMATCH[0]}" "$TS1"
		else
			# Without a label the plot script would get shifted arguments.
			printf 'skipping %s: no cell_<N> label in path\n' "$csv" >&2
		fi
	done 3< "all_untemplated${TS1}.csv"

	## adjust html files here
	for i in {1..48}; do
		perl add_ut.pl -t "cell_${i}${TS1}" > "cell_${i}${TS1}.html"
	done

fi
