 // output folder
params.out = "out_ropipe_analyse_cDNAcomplettest" // default used when --out is not specified

// outgroup taxon handed to raxml-ng through --outgroup in the tree processes
params.outgroup = ""

// input directories: cDNA fasta files (*.fa) and assemblies (*.fasta)
params.cDNA = ""
params.assemblage = ""

// directories with precomputed blastx results (*.out); left empty, blastx is run
params.blast_cDNA = ""
params.blast_assemblage = ""

// directories with precomputed TransDecoder results (*.cds, *.pep); left empty, TransDecoder is run
params.transdecoder_cDNA = ""
params.transdecoder_assemblage = ""

// channels are filled with assemblies or with ensembl cDNAs

// One [species, fasta] tuple per input file; the species id is the simple name of the file
// (file name without any extension).
cDNA = Channel
    .fromPath("${params.cDNA}/*.fa")
    .map { fasta -> tuple(fasta.getSimpleName(), fasta) }
assemblage = Channel
    .fromPath("${params.assemblage}/*.fasta")
    .map { fasta -> tuple(fasta.getSimpleName(), fasta) }

// File names of the inputs, used for logging and for the species count below.
// `name` never contains the directory, so nothing has to be stripped from it. The former
// replaceAll() applied params.cDNA as a regular expression to BOTH lists (copy-paste on the
// assembly list), which garbled the printed names when that parameter was empty or
// contained regex metacharacters such as '.'.
list_ass = file("${params.assemblage}/*.fasta", glob: true).collect { x -> x.name }
println "assemblage: $list_ass"

list_cDNA = file("${params.cDNA}/*.fa", glob: true).collect { x -> x.name }

println "cDNA: $list_cDNA"

// Total number of input files; the "only_complete" processes keep an alignment only
// when it holds exactly this many sequences.
nb_sp = list_ass.size() + list_cDNA.size()
println "species number: $nb_sp"

// Number of replicates; each replicate n in 1..TreeReplicates gets its own
// sub-sampled concatenation and its own tree
TreeReplicates = 10
println "TreeReplicates: ${TreeReplicates}"


// If TransDecoder results for the cDNA are already provided in a directory, skip the TransDecoder step
if (params.transdecoder_cDNA == "") {

    println "No given directory with transdecoder  cDNA  results.\n-> I will run them."

    //#####################
    //# transdecoder_cDNA #
    //#####################

    // Runs TransDecoder.LongOrfs and TransDecoder.Predict on one cDNA fasta, then
    // writes <espece>.cds through remove_dup_sequences_in_fasta.py and <espece>.pep
    // by extracting the ids of list_cds.txt with seqtk (list_cds.txt is presumably
    // written by remove_dup_sequences_in_fasta.py -- confirm).
    // Both output files are mandatory and no placeholder is created here.
    process transdecoder_cDNA {
        label 'small_mem'
        tag { espece }
        echo false
        //errorStrategy 'retry'
        //maxRetries 5
        publishDir "$params.out/1_preparation/transdecoder_cDNA/${espece}", mode: 'copy'

        input:
        set val(espece), file(assemblage) from cDNA

        output:
        set val(espece), file("${espece}.pep") into transdecoder_cDNA_pep
        set val(espece), file("${espece}.cds") into (transdecoder_cDNA_cds, transdecoder_cDNA_cds4concat)

        script:
"""
echo "TransDecoder ${espece}"
#touch ${espece}.cds
#touch ${espece}.pep
TransDecoder.LongOrfs -t $assemblage -m 80 >> ${espece}.transdecoder_longorfs.out || echo "error ${espece}"
TransDecoder.Predict -t  $assemblage  --single_best_only --retain_long_orfs_length 200  > ${espece}.transdecoder_Predict.out || echo "error ${espece}"



if [ -s ${espece}*.transdecoder.cds ]
then
    #renomer les cds
    ./remove_dup_sequences_in_fasta.py ${espece}*.transdecoder.cds ${espece}.cds
    wc -l *cds

    #renomer les pep
    seqtk subseq  ${espece}*.transdecoder.pep  list_cds.txt  > ${espece}.pep
fi
"""
    }
} else {
    println("Directory with transdecoder cDNA results: ${params.transdecoder_cDNA} \n-> I will use them")

    // the *.cds files of the given directory feed two channels, the *.pep files a third one
    Channel
        .fromPath("${params.transdecoder_cDNA}/*.cds")
        .map { f -> tuple(f.getSimpleName(), f) }
        .into { transdecoder_cDNA_cds; transdecoder_cDNA_cds4concat }
    transdecoder_cDNA_pep = Channel
        .fromPath("${params.transdecoder_cDNA}/*.pep")
        .map { f -> tuple(f.getSimpleName(), f) }
}
//###################################################




// internal use: TransDecoder on the in-house assemblies, unless a directory of results is given
if (params.transdecoder_assemblage == "") {

    println "No given directory with transdecoder  assemblage  results.\n-> I will run them."

    //###########################
    //# transdecoder_assemblage #
    //###########################

    // Same steps as for the cDNA, applied to one assembly fasta. Empty <espece>.cds and
    // <espece>.pep files are created first, so both declared outputs exist even when
    // TransDecoder yields no usable CDS.
    process transdecoder_assemblage {
        label 'small_mem'
        tag { espece }
        echo false
        //errorStrategy 'retry'
        //maxRetries 5
        publishDir "$params.out/1_preparation/transdecoder_assemblage/${espece}", mode: 'copy'

        input:
        set val(espece), file(assemblage) from assemblage

        output:
        set val(espece), file("${espece}.pep") into transdecoder_ass_pep
        set val(espece), file("${espece}.cds") into (transdecoder_ass_cds, transdecoder_ass_cds4concat)

        script:
"""
echo "TransDecoder ${espece}"
touch ${espece}.cds
touch ${espece}.pep
TransDecoder.LongOrfs -t $assemblage -m 80 >> ${espece}.transdecoder_longorfs.out || echo "error ${espece}"
TransDecoder.Predict -t  $assemblage  --single_best_only --retain_long_orfs_length 200  > ${espece}.transdecoder_Predict.out || echo "error ${espece}"

if [ -s ${espece}*.transdecoder.cds ]
then
    #renomer les cds
    ./remove_dup_sequences_in_fasta.py ${espece}*.transdecoder.cds ${espece}.cds
    wc -l *cds

    #renomer les pep
    seqtk subseq  ${espece}*.transdecoder.pep  list_cds.txt  > ${espece}.pep
fi
"""
    }
} else {
    println("Directory with transdecoder assemblage results: ${params.transdecoder_assemblage} \n-> I will use them")

    // the *.cds files of the given directory feed two channels, the *.pep files a third one
    Channel
        .fromPath("${params.transdecoder_assemblage}/*.cds")
        .map { f -> tuple(f.getSimpleName(), f) }
        .into { transdecoder_ass_cds; transdecoder_ass_cds4concat }

    transdecoder_ass_pep = Channel
        .fromPath("${params.transdecoder_assemblage}/*.pep")
        .map { f -> tuple(f.getSimpleName(), f) }
}
//###################################################

        

// peptides and CDS of both sources in one channel each: cDNA items first, assembly items after
concat_transdecoder_PEP = transdecoder_cDNA_pep.concat(transdecoder_ass_pep)
concat_transdecoder_CDS = transdecoder_cDNA_cds4concat.concat(transdecoder_ass_cds4concat)


if (params.blast_cDNA == "") {
    println "No given directory with blast  cDNA results.\n-> I will run them."

    //#####################
    //# blast2eggnog_cDNA #
    //#####################

    // blastx of <espece>.cds against /db/conca_eggnog_rongeurs.fa, one best hit per query
    // (-max_hsps 1 -max_target_seqs 1, e-value 1e-6), written to <espece>_blastx.out.
    // When the query file is empty nothing is written and only a message is printed.
    process blast2eggnog_cDNA {
        tag { espece }
        echo false
        publishDir "$params.out/1_preparation/blastx_eggnog_cDNA", mode: 'copy'

        input:
        // blastx is run on the cds produced by transdecoder
        set val(espece), file(cDNA) from transdecoder_cDNA_cds

        output:
        set val(espece), file("*") into blastx_cDNA

        script:
        """
		  ls
          echo Sp: $espece
		  echo Db: `ls /db/conca_eggnog_rongeurs.fa`
    
          query=${espece}.cds
     
    if [ -s \$query ]
    then
        echo query: \$query
        blastx -query \$query -db /db/conca_eggnog_rongeurs.fa  -out "${espece}"_blastx.out -outfmt "6 qseqid sseqid evalue bitscore length pident qstart qend sstart send qlen" -max_hsps 1 -max_target_seqs 1 -evalue 0.000001
        wc -l "${espece}"_blastx.out
    else
        echo query: No query! ERROR
    fi
        """
    }
    //#####################

} else {
    println("Directory with blast cDNA results: ${params.blast_cDNA} \n-> I will use them")

    // species id = simple name of the *.out file with its first "_blastx" removed
    blastx_cDNA = Channel
        .fromPath("${params.blast_cDNA}/*.out")
        .map { f -> tuple(f.getSimpleName().replaceFirst(/_blastx/, ""), f) }
}



if (params.blast_assemblage == "") {
    println "No given directory with blast assemblage results.\n-> I will run them."

    //###########################
    //# blast2eggnog_assemblage #
    //###########################

    // blastx of <espece>.cds (assembly side) against /db/conca_eggnog_rongeurs.fa,
    // one best hit per query, written to <espece>_blastx.out.
    // When the query file is empty nothing is written and only a message is printed.
    process blast2eggnog_ass {
        tag { espece }
        echo false
        publishDir "$params.out/1_preparation/blastx_eggnog_assemblage", mode: 'copy'

        input:
        // blastx is run on the cds produced by transdecoder
        set val(espece), file(cDNA) from transdecoder_ass_cds

        output:
        set val(espece), file("*") into blastx_ass

        script:
        """
        
          echo Sp: $espece
		  echo Db: `ls /db/conca_eggnog_rongeurs.fa`
    
          query=${espece}.cds
     
    if [ -s \$query ]
    then
        echo query: \$query
        blastx -query \$query -db /db/conca_eggnog_rongeurs.fa  -out "${espece}"_blastx.out -outfmt "6 qseqid sseqid evalue bitscore length pident qstart qend sstart send qlen" -max_hsps 1 -max_target_seqs 1 -evalue 0.000001
        wc -l "${espece}"_blastx.out
    else
        echo query: No query! ERROR
    fi
        """
    }
    //#####################

} else {
    println("Directory with blast assemblage results: ${params.blast_assemblage} \n-> I will use them")

    // species id = simple name of the *.out file with its first "_blastx" removed
    blastx_ass = Channel
        .fromPath("${params.blast_assemblage}/*.out")
        .map { f -> tuple(f.getSimpleName().replaceFirst(/_blastx/, ""), f) }
}




// blastx results of both sources in a single channel: assembly items first, cDNA items after
conca_blastx = blastx_ass.concat(blastx_cDNA)

//#####################
//# keeponepercluster #
//#####################

// Runs keeponepercluster_eggnog5.R on the blastx table of one species and emits
// <espece>.tsv_output_R. The staged input file is passed directly instead of being
// re-discovered with `ls *.out`, and the unused `filename` shell variable is gone.
process keeponepercluster {
    label 'small_mem'
    //errorStrategy 'retry'
    //maxRetries 5
    tag { espece }
    echo false
    publishDir "$params.out/1_preparation/one_contig_per_cluster", mode: 'copy'

    input:
        set val(espece), file(x) from conca_blastx

    output:
        set val(espece), file("*tsv_output_R") into onecluster

    script:
"""
./keeponepercluster_eggnog5.R -i "${x}" -o "${espece}.tsv_output_R" -r /db/9989_members_OneToOne_without_ortho.tsv -b /db/table_biomart_mus.tsv
"""
}
//###############################################################




//#############################
//# split_cluster per letter (internal usage) to increase speed#
//#############################

// character classes used to split each species table: one class per letter (both cases) plus digits
alphabet=["aA","bB","cC","dD","eE","fF","gG","hH","iI","jJ","kK","lL","mM","nN","oO","pP","qQ","rR","sS","tT","uU","vV","wW","xX","yY","zZ","0-9"]

// For every (species table, character class) pair, writes <espece>.<lettre>.tsv_output_R
// holding the first line of the table followed by the rows whose 3rd field starts with
// a character of the class.
process splitcluster {
    label 'small_mem'
    //errorStrategy 'retry'
    //maxRetries 5
    tag { "${lettre} ${espece}" }
    //echo false
    publishDir "$params.out/1_preparation/letter_split", mode: 'copy'

    input:
        set val(espece), file(x) from onecluster
        each lettre from alphabet

    output:
        set val(lettre), val(espece), file("*.${lettre}.tsv_output_R") optional true into onecluster_split

    script:
"""
out=${espece}.${lettre}.tsv_output_R

# Data rows only (NR>1): the first line is written separately as the header below, so it
# must not be selected a second time when its own 3rd field matches the class.
awk 'NR>1 && \$3~/^[${lettre}]/{print}' $x > out_tmp || exit 0 # echo "no mgi for ${espece} beginning by ${lettre}"

head out_tmp

head -n 1 $x  > header

head header 
cat header out_tmp > \$out

#rm header
#rm out_tmp

"""
}
//###############################################################


// regroup by character class and keep [class, list of per-species tables]; species names are dropped
onecluster_per_letter = onecluster_split
    .groupTuple()
    .map { grp -> [grp[0], grp[2]] }



// Collects protein sequences that have been associated to each eggnog family
// remove families with less than 3 species and sequences smaller than 70% of median length

//##################
//# trimming_fasta #
//##################

// For one character class, stages the per-species tables of that class together with
// every peptide file, then runs creertsvpourfasta.py and trimming_fasta.py.
// The fasta files found in fasta_pour_alignement_<letter>_min3/ are sent to alignment.
// Exit codes 0, 2 and 3 all count as success.
process trimming_fasta {
    label 'small_mem'
    tag { "${letter}" }
    echo false
    validExitStatus 0,2,3
    //errorStrategy 'retry'
    //maxRetries 5
    publishDir "$params.out/1_preparation/fasta_to_align_min3_sp", mode: 'copy'

    input:
    set val(letter), file(x) from onecluster_per_letter
    file y from concat_transdecoder_PEP.collect()

    output:
    file("*") optional true into al4pub
    file("fasta_pour_alignement_${letter}_min3/*.fasta") optional true into fasta_a_aligner4conv

    script:
"""

./creertsvpourfasta.py
mkdir -p fasta_pour_alignement

./trimming_fasta.py --attribut_tsv attribut_names.txt --outputR_dir .  --pep_dir . -o fasta_pour_alignement_${letter} -l $letter
"""
}
//##########################################################





//#############
//# conca_cds #
//#############
// Stages every CDS file (cDNA and assemblies) in one task directory and runs
// rename_cds.py, which is expected to leave conca_cds.fa behind.
process concacds {
    label 'small_mem'
    echo false
    publishDir "${params.out}/1_preparation/conca_cds", mode: 'copy'

    input:
    file x from concat_transdecoder_CDS.collect()

    output:
    file("conca_cds.fa") into conca_cds

    script:
"""


./rename_cds.py



"""
}

//####################################################################



//################
//# aligner4conv #
//################

// alignment of protein sequences with MAFFT
// Aligns one protein fasta with MAFFT (--localpair, 1000 iterations) and emits
// [gene, <gene>_alignement.aa.fasta], where gene is the base name of the input file.
// The output file name is built from the same getBaseName() value as the emitted key:
// the former `sed s/.fasta//` was unanchored with an unescaped dot, so a name holding
// "fasta" elsewhere would have produced a file name that no longer matched the key.
process aligner4conv {
    label 'small_mem'
    tag { "${fasta.getBaseName()}" }
    publishDir "${params.out}/2_mafft_alignment/aa_alignment", mode: 'copy'
    echo false

    input:
      //  file fasta from fasta_a_aligner4conv_notempty.flatten()
      file fasta from fasta_a_aligner4conv.flatten()

    output:
      set val("${fasta.getBaseName()}"), file("*.aa.fasta") into (ali4revtrans, ali4renamePCOC, ali4hmmclean)

    script:
"""
mafft --localpair --maxiterate 1000  $fasta > "${fasta.getBaseName()}_alignement.aa.fasta"
"""
}

//#######################################################

// back translate in nucleic sequences

//############
//# revtrans #
//############

// Back-translates one protein alignment: the sequence ids of the alignment are pulled
// from conca_cds.fa with seqtk, then back_translate_ali.py writes <name>.nt.fasta.
// The nucleotide alignment is an optional output.
process revtrans {
    label 'small_mem'
    echo false
    //errorStrategy 'retry'
    //maxRetries 5
    publishDir "${params.out}/2_mafft_alignment/nt_alignment_notclean_min3_sp", mode: 'copy'

    input:
    file x from conca_cds.collect()
    set val(gene), file(z) from ali4revtrans // .collect()

    output:
    set val(gene), file("*.nt.fasta") optional true into (ali4renamediff, alint4hmmaa2nt)

    script:
"""
#./rename_cds.py
for a in \$(ls *.aa.fasta )
do
    name=\$(basename  \$a | sed s/.aa.fasta//)
    echo \$name
    grep ">"  \$a | sed "s/>//" > seq_name.txt
    time seqtk subseq conca_cds.fa seq_name.txt > gene_cds.fa
    ./back_translate_ali.py --aa \$a --nt gene_cds.fa --output \${name}.nt.fasta --debug
done
"""
}
//############################################################################


// clean protein alignements

//###############
//# hmmcleaner  #
//###############


// Cleans one protein alignment with HmmCleaner.pl, then runs remove_bad_seq.py (last
// argument 0.5) to write <gene>.hmmcleaner.removed_bad_seq.aa.fasta. When that file is
// absent, the log and score files are deleted so no log is emitted for the gene.
// Exit codes 0 and 2 both count as success.
process hmmcleaner {
    label 'small_mem'
    tag { "${gene}" }
    echo false
    validExitStatus 0,2
    publishDir "${params.out}/4_mafft_alignment_hmmcleaner/aa_alignment_hmmcleaner_and_bad_seq_removed_3min_sp", pattern: "*.hmmcleaner.removed_bad_seq.aa.fasta", mode: 'copy'
    publishDir "${params.out}/4_mafft_alignment_hmmcleaner/aa_alignment_hmmcleaner_3min_sp", pattern: "*.aa_hmm.fasta", mode: 'copy'

    input:
    set val(gene), file(x) from ali4hmmclean

    output:
    set val(gene), file("*.removed_bad_seq.aa.fasta") optional true into (hmmclean_out_aa_3min_sp, hmmclean_out_aa_3min_sp_4allsp)
    set val(gene), file("*.log") optional true into hmmclean_AA_out_log
    file("*") into hmmclean_aa2save

    script:
"""

HmmCleaner.pl $x 

./remove_bad_seq.py $x ${gene}*.aa_hmm.fasta ${gene}.hmmcleaner.removed_bad_seq.aa.fasta 0.5

if [ ! -f ${gene}.hmmcleaner.removed_bad_seq.aa.fasta ]
 then
    rm  *log
    rm  *score
fi

"""
}

//#######################################################






// HmmCleaner logs followed by the raw nucleotide alignments, then regrouped per gene
concat_ali_aa_nt = hmmclean_AA_out_log.concat(alint4hmmaa2nt)
ali_aa_nt = concat_ali_aa_nt.groupTuple()



//#####################
//# hmmcleaner_AA2NT  #
//#####################


// Transfers the HmmCleaner masking of a protein alignment onto its nucleotide alignment
// (transferCleaner.pl), replaces '?', '*' and spaces by '-' in sequence lines, then runs
// remove_bad_seq.py (last argument 0.5).
// Each grouped item may hold the log, the nucleotide alignment, or both: the nucleotide
// alignment is an optional output of revtrans, so both files are checked before
// transferCleaner.pl is called. A gene missing either one is skipped with a message
// instead of failing the task.
// Exit codes 0 and 2 both count as success.
process hmmcleaner_aa2nt{
    label 'small_mem'
    publishDir "${params.out}/4_mafft_alignment_hmmcleaner/nt_alignment_hmmcleaner_and_bad_seq_removed_3min_sp", pattern : "*.hmmcleaner.removed_bad_seq.nt.fasta", mode: 'copy'
    publishDir "${params.out}/4_mafft_alignment_hmmcleaner/nt_alignment_hmmcleaner_3min_sp", pattern : "*.nt.hmm_clean.fasta", mode: 'copy'

    echo false 
    validExitStatus 0,2


    input:
      set val(gene) , file (x) from ali_aa_nt

    output:
         set val(gene)  , file ("*.hmmcleaner.removed_bad_seq.nt.fasta") optional true into (hmmclean_out_nt_3min_sp, hmmclean_out_nt_3min_sp_4allsp)
         file ("*")  optional true into hmmclean_nt2save

    script:
"""
if [ ! -e ${gene}_alignement.aa_hmm.log ]
then
	echo "pas de fichier log pour ${gene} " 	
elif [ ! -e ${gene}_alignement.nt.fasta ]
then
	echo "pas d'alignement nt pour ${gene} "
else
	transferCleaner.pl  ${gene}_alignement.nt.fasta -log=${gene}_alignement.aa_hmm.log  -delchar "?"
	cat ${gene}_alignement.nt_cleaned.ali | sed -e  '/^>/! s/[?\\* ]/-/g'  | grep -v "#" > ${gene}.nt.hmm_clean.fasta
	./remove_bad_seq.py ${gene}_alignement.nt.fasta ${gene}.nt.hmm_clean.fasta ${gene}.hmmcleaner.removed_bad_seq.nt.fasta 0.5

	echo "transferCleaner.pl  ${gene}"
fi
"""
}

//#######################################################




//##############################
//# renomer_AA_not_clean_conv : rename with gene name (internal use)  #
//##############################

// Passes one uncleaned protein alignment through renamed4ali.sh and stores the result as
// <name>.renamed.aa.fasta, <name> being the input file name without ".aa.fasta".
// A failed task is retried up to 5 times.
process renomer_AA_not_clean_conv {
    label 'small_mem'
    tag { gene } //tag{"${gene}"}  //tag{"${fasta.getName().replaceAll(/_alignement.aa.fasta/, "")}
    echo false
    errorStrategy 'retry'
    maxRetries 5
    publishDir "${params.out}/3_mafft_alignment_notcleaned_renamed/aa_alignment_notcleaned_3min_sp_renamed", mode: 'copy'

    input:
    set val(gene), file(fasta) from ali4renamePCOC

    output:
    set val(gene), file("*.aa.fasta") into ali4PCOC

    script:
"""
#rename sequence using the 2nd par of the sequence name
name=\$(basename $fasta)
name=\${name/.aa.fasta}

./renamed4ali.sh $fasta > "\$name".renamed.aa.fasta
"""
}
//#################################################


//##############################
//# renomer_nt_not_clean_conv : rename with gene name (internal use) #
//##############################


// Passes one uncleaned nucleotide alignment through renamed4ali.sh and stores the result
// as <name>.renamed.nt.fasta, <name> being the input file name without ".nt.fasta".
// A failed task is retried up to 5 times.
process renomer_nt_not_clean_conv {
    label 'small_mem'
    tag { gene } //tag{"${fasta.getName()}"} //tag{"${fasta.getName().replaceAll(/_alignement.nt.fasta/, "")}"}
    echo false
    errorStrategy 'retry'
    maxRetries 5
    publishDir "${params.out}/3_mafft_alignment_notcleaned_renamed/nt_alignment_notcleaned_3min_sp_renamed", mode: 'copy'

    input:
    set val(gene), file(fasta) from ali4renamediff // .flatten()

    output:
    set val(gene), file("*") into ali4diff

    script:
"""
#rename sequence using the 2nd par of the sequence name
name=\$(basename $fasta)
name=\${name/.nt.fasta}

./renamed4ali.sh $fasta > "\$name".renamed.nt.fasta
"""
}
//#################################################


// Thresholds handed to removegap.py as its second argument; every cleaned alignment is
// processed once per value.
// NOTE(review): 0.5 breaks the otherwise regular 0 / 0.1 / 0.15 / 0.2 progression --
// confirm it is not a typo for 0.05.
seuil_gap = [0, 0.5, 0.1, 0.15, 0.2]

//#################################
//# renomer_aa_clean_conv 3min_sp  : remove sites with a large proportion of gaps #
//#################################

// For each cleaned protein alignment and each threshold n of seuil_gap: runs
// removegap.py, renames the sequences with renamed4ali.sh and drops the result when it
// holds nothing but header lines. The n == 0.1 alignment is also copied to
// <gene>.renamed.0.1.cleaned.aa.fasta for the transition-table step.
// Every threshold publishes a file named <gene>.aa.gap_prop.csv, so the gap_prop
// directory carries the _${n} suffix, as the alignment directory does; without it the
// five thresholds overwrote each other's csv.
// A failed task is retried up to 5 times.
process renomer_aa_clean_conv{
    label 'small_mem'
    errorStrategy 'retry'
    maxRetries 5
    echo false
    tag{gene} 
    publishDir "${params.out}/5_mafft_alignment_hmmcleaner_renamed/aa_alignment_hmmcleaner_3min_sp_renamed_onlygap_removed_seuil_${n}", pattern: '*.renamed.cleaned.aa.fasta', mode: 'copy'
    publishDir "${params.out}/5_mafft_alignment_hmmcleaner_renamed/gap_prop_aa_alignment_hmmcleaner_3min_sp_renamed_onlygap_removed_seuil_${n}", pattern: '*.gap_prop.csv', mode: 'copy'

	input:
		set val (gene) , file (fasta) from hmmclean_out_aa_3min_sp
		each n from seuil_gap

    output:
		set val (gene) , file ("*.renamed.cleaned.aa.fasta" ) optional true into (hmmclean_out_aa_3min_sp_witout_only_gap)
		set val (gene) , file ("*.gap_prop.csv" )  optional true into aa_ali_3min_sp_gap_prop
		// consumed only by a process that is currently commented out
		set val (n) ,  val (gene) , file ("*.renamed.0.1.cleaned.aa.fasta" ) optional true into aliconv4trimalsim
		// consumed by script_table_transision
	    file ("*.renamed.0.1.cleaned.aa.fasta" ) optional true into aliconv4tabletrnas

    script:
"""
python removegap.py $fasta ${n} ${gene}.net_gap.aa.fasta ${gene}.aa.gap_prop.csv
./renamed4ali.sh ${gene}.net_gap.aa.fasta  > ${gene}.renamed.cleaned.aa.fasta


if [ -f ${gene}.renamed.cleaned.aa.fasta ] 
then
	nb_chev=\$(grep -c ">" ${gene}.renamed.cleaned.aa.fasta )
	nb_ligne=\$( wc -l  ${gene}.renamed.cleaned.aa.fasta  | awk '{ print \$1 }')
	if [ \$nb_chev  -eq \$nb_ligne ] 
	then 
		rm ${gene}.renamed.cleaned.aa.fasta
		rm ${gene}.net_gap.aa.fasta ${gene}.aa.gap_prop.csv
	fi	
fi	
	if [ "$n" == "0.1" ]  &&   [ -f ${gene}.renamed.cleaned.aa.fasta ]
	then 
	cat ${gene}.renamed.cleaned.aa.fasta > ${gene}.renamed.0.1.cleaned.aa.fasta 
	fi

 """
}
//#################################################


//###########################
//# script_table_transision #
//###########################

// Stages every threshold-0.1 protein alignment in one task directory and runs
// script_table_domi.py, which is expected to write transition_table.tsv.
process script_table_transision {
    label 'small_mem'
    echo false
    publishDir "${params.out}/8_table_transision", mode: 'copy'

    input:
    file(fasta) from aliconv4tabletrnas.collect()

    output:
    file("transition_table.tsv") into table

    script:
"""


python script_table_domi.py 



"""
}

//#################################################


/*
//##################
//#   score_t_coffee#
//##################
process score_t_coffee {
    label 'small_mem'
    publishDir "${params.out}/7_t_coffee_TCS_${n}", mode: 'copy'
    echo false
    errorStrategy 'retry'
    maxRetries 25

    input:
     set val (n) , val(gene),  file (x) from aliconv4trimalsim

    output:
      file ("*") into score_simi_trimal

"""
 fasta=\$(ls *.fasta)
t_coffee -infile \$fasta  -evaluate -output=score_ascii,aln,score_html
"""
}
//#######################################################
*/




//#################################
//# renomer_nt_clean_conv 3min_sp #
//#################################

// For each cleaned nucleotide alignment and each threshold n of seuil_gap: runs
// removegap.py, renames the sequences with renamed4ali.sh and drops the result when it
// holds nothing but header lines.
// In that last case every file written by the script is deleted, so the catch-all
// output must be optional as well; as a mandatory output it made the task fail (and be
// retried in vain) for each alignment left without sequence data.
// A failed task is retried up to 5 times.
process renomer_nt_clean_conv{
    label 'small_mem'
    errorStrategy 'retry'
    maxRetries 5
    echo false
    tag{gene}
    publishDir "${params.out}/5_mafft_alignment_hmmcleaner_renamed/nt_alignment_hmmcleaner_3min_sp_renamed_onlygap_removed_seuil_${n}", pattern: '*.renamed.cleaned.nt.fasta', mode: 'copy'
    publishDir "${params.out}/5_mafft_alignment_hmmcleaner_renamed/gap_prop_nt_alignment_hmmcleaner_3min_sp_renamed_onlygap_removed_seuil_${n}", pattern: '*.gap_prop.csv', mode: 'copy'
    

    input:
		set val (gene) , file (fasta ) from hmmclean_out_nt_3min_sp
		each n from seuil_gap

    output:
        set val (gene) , file ("*.renamed.cleaned.nt.fasta" ) optional true into hmmclean_out_nt_3min_sp_witout_only_gap
		file ("*") optional true into hmmclean_out_all

    script:
"""
python removegap.py $fasta ${n} ${gene}.net_gap.nt.fasta ${gene}.gap_prop.csv
./renamed4ali.sh ${gene}.net_gap.nt.fasta > ${gene}.renamed.cleaned.nt.fasta



if [ -f  ${gene}.renamed.cleaned.nt.fasta  ] 
then
	nb_chev=\$(grep -c ">" ${gene}.renamed.cleaned.nt.fasta )
	nb_ligne=\$( wc -l  ${gene}.renamed.cleaned.nt.fasta | awk '{ print \$1 }')
	if [ \$nb_chev  -eq \$nb_ligne ] 
		then 
		rm ${gene}.renamed.cleaned.nt.fasta
		rm ${gene}.net_gap.nt.fasta ${gene}.gap_prop.csv
		fi 
	

fi 
"""
}
//#################################################





//#######################################
//# renomer_nt_clean_conv only complete  : retains only sites without gaps for phylogenetic analysis#
//#######################################

// Keeps a cleaned nucleotide alignment only when it holds exactly nb_sp sequences; it is
// then passed through removegap.py (threshold 0) and renamed4ali.sh. A result made of
// header lines only is deleted, hence the optional output.
// A failed task is retried up to 5 times.
process renomer_nt_clean_conv_only_complete {
    label 'small_mem'
    tag { gene }
    echo false
    errorStrategy 'retry'
    maxRetries 5
    publishDir "${params.out}/5_mafft_alignment_hmmcleaner_renamed/nt_alignment_hmmcleaner_all_sp_renamed_04gap_removed", pattern: '*.renamed.cleaned.allsp.nt.fasta', mode: 'copy'

    input:
    set val(gene), file(fasta) from hmmclean_out_nt_3min_sp_4allsp

    output:
    file("*.renamed.cleaned.allsp.nt.fasta") optional true into (ali_nt_renomer4tree, ali_nt_renomer4conca_tree)

    script:
"""
nb_seq=\$(grep -c ">" $fasta )
if [  \$(grep -c ">" $fasta ) -eq $nb_sp ]  
then
    echo "${gene} complete NT ali, nb seq (\$nb_seq) == nb sp ($nb_sp)"
    
	python removegap.py $fasta 0 ${gene}.nt.net_gap.fasta ${gene}.gap_prop.csv
	./renamed4ali.sh ${gene}.nt.net_gap.fasta > ${gene}.renamed.cleaned.allsp.nt.fasta
else
    echo "${gene} not complete NT ali, nb seq (\$nb_seq) != nb sp ($nb_sp)"
fi

if [ -f ${gene}.renamed.cleaned.allsp.nt.fasta ] 
then
	nb_chev=\$(grep -c ">" ${gene}.renamed.cleaned.allsp.nt.fasta )
	nb_ligne=\$( wc -l  ${gene}.renamed.cleaned.allsp.nt.fasta | awk '{ print \$1 }' )
	if [  \$nb_chev -eq \$nb_ligne ] 
	then 
		rm ${gene}.renamed.cleaned.allsp.nt.fasta
		rm ${gene}.gap_prop.csv
	fi

fi 

"""
}
//#################################################


//#######################################
//# renomer_aa_clean_conv only complete #
//#######################################

// Keeps a cleaned protein alignment only when it holds exactly nb_sp sequences; it is
// then passed through removegap.py (threshold 0) and renamed4ali.sh. A result made of
// header lines only is deleted, hence the optional output.
// A failed task is retried up to 5 times.
process renomer_aa_clean_conv_only_complete {
    label 'small_mem'
    tag { gene }
    echo false
    errorStrategy 'retry'
    maxRetries 5
    publishDir "${params.out}/5_mafft_alignment_hmmcleaner_renamed/aa_alignment_hmmcleaner_all_sp_renamed_04gap_removed", pattern: '*.renamed.cleaned.allsp.aa.fasta', mode: 'copy'

    input:
    set val(gene), file(fasta) from hmmclean_out_aa_3min_sp_4allsp

    output:
    file("*.renamed.cleaned.allsp.aa.fasta") optional true into (ali_aa_renomer4tree, ali_aa_renomer4conca_tree)
    //file ("*") into out_ali_nt

    script:
"""
nb_seq=\$(grep -c ">" $fasta )

if [  \$(grep -c ">" $fasta ) -eq $nb_sp ]  
then
    echo "${gene} complete AA ali, nb seq (\$nb_seq) == nb sp ($nb_sp)"
    
	python removegap.py $fasta 0 ${gene}.aa.net_gap.fasta ${gene}.gap_prop.csv
	./renamed4ali.sh ${gene}.aa.net_gap.fasta > ${gene}.renamed.cleaned.allsp.aa.fasta
else
    echo "${gene} not complete AA ali, nb seq (\$nb_seq) != nb sp ($nb_sp)"
fi


if [ -f ${gene}.renamed.cleaned.allsp.aa.fasta ] 
then
	nb_che=\$(grep -c ">" ${gene}.renamed.cleaned.allsp.aa.fasta )
	nb_ligne=\$( wc -l  ${gene}.renamed.cleaned.allsp.aa.fasta | awk '{ print \$1 }' )
	if [ \$nb_che  -eq \$nb_ligne ] 
	then 
		rm ${gene}.renamed.cleaned.allsp.aa.fasta
		rm ${gene}.aa.net_gap.fasta ${gene}.gap_prop.csv
	fi

fi 
"""
}
//#################################################




//##################
//# concatall4test #
//##################

// Concatenates every complete nucleotide alignment with catfasta2phyml.pl.
// The same concatenation is emitted once per replicate number n, so it can later be
// grouped with the tree carrying the same n.
process concatall4test {
    label 'small_mem'
    echo false
    //errorStrategy 'retry'
    //maxRetries 5
    publishDir "$params.out/6_species_tree/concatenat_all_nt_alignment_hmmcleaner_all_sp_renamed_04gap_removed/", mode: 'copy'

    input:
    file x from ali_nt_renomer4conca_tree.collect()
    each n from 1..TreeReplicates

    output:
    set val(n), file("concatenat_all_ali.renamed.cleaned.allsp.nt.fasta") into concatenat_nt_all_ali

    script:
"""
./catfasta2phyml.pl -c -f *.renamed.cleaned.allsp.nt.fasta > concatenat_all_ali.renamed.cleaned.allsp.nt.fasta
"""
}
//###########################################


//###############
//# concat4tree #
//###############

// Builds, for each replicate n, the concatenation cat_ali_<n>.fasta by running
// subseq_ali.py over the complete nucleotide alignments staged in the task directory.
// A failing replicate is ignored rather than stopping the run. The strategy is declared
// with the directive call syntax used by every other process of this file; the former
// `errorStrategy="ignore"` was an assignment, not a directive call.
process concat4tree {
    label 'small_mem'
    errorStrategy 'ignore'
    //errorStrategy 'retry'
    //maxRetries 5
    tag{"${n}"}
    publishDir "$params.out/6_species_tree/random_concatenat_nt_4_sp_tree/rep_${n}/", mode: 'copy'
    echo false

    input :
      each n from 1..TreeReplicates
      file x from ali_nt_renomer4tree.collect()

    output:
      set val(n) , file ("cat_ali_${n}.fasta") into concatene_n
      file '*' into  sub_ali_keep

    script:
"""
# script python qui fais des tirage aléatoire dans les alignements et les renome avec subset dedans
# premier agument le nom du fichier avec nom des site conserve et ensuite le nombre de site a conserve par alimgnement
# puis le nb de genes et enfin le fichier de sortie

python subseq_ali.py site_garder.txt 200 500 cat_ali_${n}.fasta



#python subseq_ali.py site_garder.txt 500 200 cat_ali_${n}.fasta  

"""
}
//###########################################


//###############
//# make_a_tree #
//###############

// Infers one tree per replicate with raxml-ng: the alignment is first parsed (GTR+G) to
// obtain the binary MSA and the recommended thread count, which is capped by the number
// of processors of the host and by 16, then `raxml-ng --all` is run on the binary MSA.
// params.outgroup defaults to an empty string; --outgroup is only added when a taxon was
// given, since an empty value left raxml-ng with an option lacking its argument.
process make_a_tree {
    //errorStrategy 'retry'
    //maxRetries 5
    tag{"${n}"}
    publishDir "$params.out/6_species_tree/rooted_species_tree_from_random_concatenat_nt/rep_${n}/", mode: 'copy'
    echo false

    input:
      set val(n), file  (x) from concatene_n

    output:
      set val(n), file ("*bestTree") into rooted_tree,rooted_tree4test
      file ("*") into info_rooted_tree

    script:
    def outgroup_opt = params.outgroup ? "--outgroup ${params.outgroup}" : ""
"""
# first parse msa to know optimal configuration (cpus) ang get a binary alignment
raxml-ng --parse -msa $x --model GTR+G > optimal_config.txt

max_cpus=`grep -c ^processor /proc/cpuinfo`
max_cpus=\$(( 16 > \$max_cpus ? \$max_cpus : 16 )) # min between max_cpus and 16
echo \$max_cpus

opt_cpus=`grep "Recommended number of threads" optimal_config.txt | cut -f 2 -d ":"`
opt_cpus2=\$(( \$opt_cpus > \$max_cpus ? \$max_cpus : \$opt_cpus )) # min between opt_cpu and 16

# Then run raxml with th optimal cpu number (which must be inferior to the cpu_max) and the binary msa
raxml-ng  --seed 1234   --blopt nr_safe  --threads \$opt_cpus2 --all  -msa "$x".raxml.rba --prefix species_tree_rep_${n} ${outgroup_opt} # --model GTR+G specified above
"""
}


//##########################################################################################################


// Full concatenations first, replicate trees after, then grouped on the replicate number:
// each item becomes [n, [concatenation, tree]] with the file lists flattened.
conca_conca_all_tree = concatenat_nt_all_ali.concat(rooted_tree4test)
conca_and_tree = conca_conca_all_tree
    .groupTuple()
    .flatMap { grp -> [[grp[0], grp[1].flatten().toList()]] }


//#############
//# test_tree : to estimate likelihood of the complete dataset given trees built from random subsets #
//#############

// Evaluates (raxml-ng --evaluate, GTR+G, 4 threads) the tree of replicate n on the
// concatenation of all complete alignments. x[0] is the concatenation and x[1] the tree,
// the order in which the two channels were concatenated before grouping.
// params.outgroup defaults to an empty string; --outgroup is only added when a taxon was
// given, since an empty value left raxml-ng with an option lacking its argument.
process test_tree {
    publishDir "$params.out/6_species_tree/L_sp_tree_build_from_random_concatenat_with_complete_concatenat/rep_${n}/", mode: 'copy'
    echo false

    input:
	 set val(n), file (x) from conca_and_tree

    output:
      file ("*") into test_tree

    script:
    def outgroup_opt = params.outgroup ? "--outgroup ${params.outgroup}" : ""
"""
# x[0] -> conca
# x[1] -> tree_rep_n
raxml-ng --seed 1234 --threads 4 --tree ${x[1]} --blopt nr_safe --evaluate -msa ${x[0]} --model GTR+G --prefix test_tree_rep${n} ${outgroup_opt}
"""
}


//##########################################################################################################


//###########
//# add_num #
//###########
// Runs pcoc_num_tree.py on the tree of replicate n and writes
// species_tree_<n>_with_num_4_pcoc.pdf. An Xvfb server is started in the background on
// display :1 and DISPLAY is pointed at it before the call.
process add_num {
    label 'small_mem'
    tag { "${n}" }
    echo false
    publishDir "$params.out/6_species_tree/species_tree_with_num_on_nodes/rep_${n}/", mode: 'copy'

    input:
    set val(n), file(x) from rooted_tree

    output:
    set val(n), file("*") into numeroted_tree

    script:
"""
Xvfb :1 -screen 0 1024x768x16 &
export DISPLAY=:1.0
pcoc_num_tree.py -t $x -o species_tree_${n}_with_num_4_pcoc.pdf
"""
}



