#!/usr/bin/env nextflow

//Pipeline to retrieve rodent kidney public data 
// author Jeremy Ganofsky

// Output directory of the pipeline: used as the publishDir target of the merger and
// trie_kidney processes, as the FastQC output directory (-o) in qualite, and as the
// root under which the per-species folders are created.
// Can be overridden on the command line with --out <dir>.
params.out="/scratch/jganofsk/testpipeline/"
//srrs = Channel.fromPath(params.list).splitCsv(header: true)


// Queries the NCBI SRA database for rodent Illumina RNAseq samples, excluding mouse and rat. Outputs the list of matching runs (SRR) to the file query_results.csv

process chercherncbi {
    // Forward the task's stdout to the Nextflow console.
    echo true

    output:
    file "query_results.csv" into result

    // The script is a single-quoted block, so nothing is interpolated by Nextflow.
    // The search term is passed to wget as one single-quoted argument: it contains
    // spaces, brackets and double quotes that must not be interpreted by the shell.
    script:
    '''
    wget --output-document=./query_results.csv 'http://trace.ncbi.nlm.nih.gov/Traces/sra/sra.cgi?save=efetch&db=sra&rettype=runinfo&term=Illumina[All Fields] AND "rodents"[orgn] NOT ("Mus musculus"[Organism]) NOT ("Rattus"[Organism])'
    '''
}


// Finds the metadata associated with each SRR listed in query_results.csv and writes it to info_SRR_total.csv. Uses the python script parsequerrysrr.py

process merger {
    // Forward the task's stdout (the line counts below) to the Nextflow console.
    echo true
    // Copy the declared outputs into the pipeline output directory.
    publishDir "${params.out}", mode: 'copy'

    input:
    file runinfo from result

    output:
    file 'info_SRR_total.csv' into merger
    // Every .txt file left in the task directory (the script below reads
    // erreurparse.txt and attributvide.txt) is collected here.
    file "*.txt" into agarder

    // NOTE(review): parsequerrysrr.py receives no argument; presumably it reads the
    // staged query_results.csv from the task directory -- confirm against the script.
    script:
    """
    echo "number of lines input"
    wc -l ${runinfo}
    python $HOME/ropipe/src/parsequerrysrr.py
    echo "number of lines output"
    wc -l info_SRR_total.csv
    echo "problem parsing xml"
    wc -l erreurparse.txt
    echo "problem no attribute"
    wc -l attributvide.txt
    """
}

// Lists the SRR runs that have both kidney and Illumina in their metadata. Uses the python script trie_kidney.py. Outputs mergertrierkidney.csv

process trie_kidney {
    // Forward the task's stdout to the Nextflow console.
    echo true
    // Copy the filtered table into the pipeline output directory.
    publishDir "${params.out}", mode: 'copy'

    input:
    file metadata from merger

    output:
    // The same file is sent to two channels: one is split row by row for the
    // downloads, the other is handed to separer.
    file 'mergertrierkidney.csv' into mergerkidney, mergerkidney2

    script:
    """
    python $HOME/ropipe/src/trie_kidney.py ${metadata}
    """
}

// Parses the CSV file emitted on mergerkidney and emits one item per data row into
// the resultsplit channel. With header: true the first line provides the column
// names, so each row is accessible by column name (e.g. x.Run in telecharge).

resultsplit=mergerkidney.splitCsv(header: true)



// Downloads the first 500000 reads of each sample file, excluding the very first 10000. 

process telecharge {
    // Label each task with the accession of the run it downloads.
    // (The former `tag {${x.Run}}` was not a valid interpolation: outside of a
    // string, `${...}` is parsed by Groovy as a call to a method named `$`.)
    tag "${x.Run}"
    // Forward the task's stdout to the Nextflow console.
    echo true

    input:
    // One row of mergertrierkidney.csv; the Run column holds the SRR accession.
    val x from resultsplit

    output:
    // Everything left in the task directory: the FASTQ file(s) written by
    // fastq-dump, or the empty placeholder created below.
    file "*" into fastq

    // The \$ escapes keep $ac, $si and $ri literal so that they reach fastq-dump
    // as defline placeholders instead of being interpolated by Nextflow.
    // -N / -X restrict the dump to spot ids 10000 through 510000.
    script:
    """
    fastq-dump --split-files   --defline-seq  '@\$ac_\$si/\$ri'   --defline-qual "+"  -N 10000 -X 510000  ${x.Run} 
    echo  ${x.Run} 
    # If no ${x.Run}_1.fastq was produced, create an empty placeholder so that the
    # "*" output still matches a file.
    if [ ! -f ${x.Run}_1.fastq ]
    then
        touch ${x.Run}.fastq
    fi
    """
}


// Check the quality of the first 500000 reads for each sample

process qualite {
    // Label each task with the name(s) of the FASTQ file(s) it checks.
    // (The former `tag{$x}` referenced a non-existent variable literally named
    // `$x`; interpolation only happens inside a double-quoted string.)
    tag "$x"
    // Forward the task's stdout to the Nextflow console.
    echo true

    input:
    // FASTQ file(s) of one run, as emitted by telecharge.
    file x from fastq

    // $x is intentionally left unquoted: it may expand to several file names.
    script:
    """
    # FastQC does not create its output directory, so make sure it exists.
    mkdir -p "${params.out}"
    fastqc  -f fastq -q $x  -o "${params.out}"
    # The reads are no longer needed once the report has been written.
    rm $x
    """
}



//splits the samples into different species. Uses the mergerkidney2 produced by trie_kidney, and the python script separe.py

process separer {
    input:
    file kidney_csv from mergerkidney2

    output:
    // mode flatten: each matching .csv file is emitted as its own item instead of
    // one list holding all of them.
    file "*.csv" into csvfich mode flatten

    script:
    """
    python $HOME/ropipe/src/separe.py  ${kidney_csv}
    """
}




// Puts the FastQC files corresponding to each species into a species-specific folder and launches MultiQC on them

process trie{
echo true
input:
file x from csvfich
script:
"""
export     MODULEPATH='/applis/PSMN/debian9/Modules/all/Compiler/GCC/7.2.0:/applis/PSMN/debian9/easybuild/modules/all:/applis/PSMN/debian9/Modules/all/Compiler:/applis/PSMN/debian9/Modules/all/Core:/applis/PSMN/debian9/Modules/all/MPI:/applis/PSMN/debian9/Modules/all/Generic:/applis/PSMN/debian9/Modules/all/Local:/etc/lmod/modules:/usr/share/lmod/lmod/modulefiles'
source  /usr/share/lmod/lmod/init/bash
echo \$MODULEPATH
module avail
module load Python/2.7.13
name=${x.getBaseName()}
echo $x
echo \$name
name=${x.getBaseName()}
echo $x
echo \$name
mkdir -p "$params.out/\$name"
for srr in \$(cat $x | cut -d, -f1 )
do
mv  $params.out/\$srr* $params.out/\$name
done
mv $x $params.out/\$name
cd $params.out/\$name
multiqc  .
"""
