#
# Title : Intergenic ORFs as elementary structural modules of de novo gene birth and protein evolution
# Author: Chris Papadopoulos, Isabelle Callebaut, Jean-Christophe Gelly, Isabelle Hatin, Olivier Namy, Maxime Renard, Olivier Lespinet, Anne Lopes
# 

# This pipeline was generated in order to reconstruct the ancestral sequences of de novo 
# genes of S.cerevisiae. The aim is to detect non genic regions on the genome of the 
# neighboring species of S.cerevisiae which correspond to de novo genes in S.cerevisiae. 
# To do so we use blast in order to detect homologous regions. First we search the CDS 
# sequences with blastp, then the Intergenic regions with tblastn and finally all the 
# ORFs detected stop-to-stop with size more than 12 nucleotides. Like this we were able to
# detect anchors on the genome (based on sequence homology) and extract the genomic non
# coding sequence for every de novo gene.
#
# In the second part of the script we work on every de novo gene separately. We generate 
# a nucleotide multiple sequence alignment (MSA) of each de novo gene with its noncoding 
# partners in the neighboring species using MACSE. The MSA was used by PhyML in order to 
# generate one phylogenetic tree per de novo gene. Then the ancestral nucleotide sequence of 
# every de novo gene was reconstructed using PRANK. Finally, the reconstructed ancestral 
# nucleotide sequence was translated into ancIGORFs (3 possible reading frames - stop to stop)
# and the ancIGORFs which gave birth to the de novo gene were identified by homology using 
# the Lalign tool. 

# This protocol IS NOT a script that can be launched from a terminal as it is. It is an 
# indicative, step-by-step code of the procedure, the software and the parameters used 
# for the ancestral reconstruction and identification. 

# Softwares needed for this pipeline: 
# 1. Blast
# 2. ORFtrack and ORFget (from package ORFmine : https://github.com/i2bc/ORFmine)
# 3. pyHCA (https://github.com/T-B-F/pyHCA)   
# 4. macse
# 5. seqret
# 6. PhyML
# 7. PRANK
# 8. LALIGN

# The absolute path my_path of the directory containing the input files must be modified 
my_path='/Users/christospapadopoulos/Desktop/Reconstruct'
# Create the working directory under my_path: every later step writes to
# ${my_path}/intermediate, whatever the current directory is.
# -p keeps the command safe to re-run.
mkdir -p "${my_path}/intermediate"

# ========== #
#    CDS     #
# ========== #

# 1. We detect which of the 171 de novo genes of S.cerevisiae have homologs with other 
#    annotated CDS in the neighboring species. 
#    We launch the blast of the de novo genes against the CDS of each species
#    and then we create a table of the first hit using a homemade script (Detect_IGORFs_on_denovo.py)
#
# Output directories for the raw blast XML and for the first-hit tables.
# -p: do not fail when they already exist (second species, re-run).
mkdir -p "${my_path}/intermediate/denovo_vs_CDS_xml"
mkdir -p "${my_path}/intermediate/denovo_vs_CDS_tab"
# For each species we launch the following commands (${sp} is the name of the species ie. Spar or Smik etc)

# blastp of the de novo proteins against the annotated CDS proteins of ${sp}
# (-outfmt 5 = XML output, read by Detect_IGORFs_on_denovo.py below).
blastp \
  -query "${my_path}/inputs/denovo.pfasta" \
  -subject "${my_path}/inputs/CDS_protein_fasta/${sp}_CDS.pfasta" \
  -out "${my_path}/intermediate/denovo_vs_CDS_xml/ScerDENOVO-vs-${sp}CDS.xml" \
  -outfmt 5

# Build the table of the first hit from the blast XML.
python "${my_path}/scripts/Detect_IGORFs_on_denovo.py" \
  -blast "${my_path}/intermediate/denovo_vs_CDS_xml/ScerDENOVO-vs-${sp}CDS.xml" \
  -out "${my_path}/intermediate/denovo_vs_CDS_tab/Scer-vs-${sp}CDS.tab"

# For each species we generated one file ScerDENOVO-vs-${sp}CDS.xml  and one file Scer-vs-${sp}CDS.tab
# stored in the directory ${my_path}/intermediate/denovo_vs_CDS_xml and 
# ${my_path}/intermediate/denovo_vs_CDS_tab respectively.

# ============ #
#  End of CDS  #
# ============ #

# ========== #
#    IGR     #
# ========== #

# 2. We detect which of the 171 de novo genes of S.cerevisiae have homologs 
#    with intergenic regions. 
#    We launch a homemade script for extraction of the IGRs (Extract_IGR.py)
#    We launch tblastn of the de novo genes against the intergenic regions
#    and then we create a table of the first hit using a homemade script (Detect_IGORFs_on_denovo.py)
#
# Output directories for the extracted IGRs, the raw tblastn XML and the
# first-hit tables. -p: do not fail when they already exist.
mkdir -p "${my_path}/intermediate/IGR_nucleotide_fasta"
mkdir -p "${my_path}/intermediate/denovo_vs_IGR_xml"
mkdir -p "${my_path}/intermediate/denovo_vs_IGR_tab"
# For each species we launch the following commands (${sp} is the name of the species Spar or Smik etc)

# Extract the intergenic regions (IGR) of ${sp} from its genome and GFF annotation.
python "${my_path}/scripts/Extract_IGR.py" \
  -genome "${my_path}/inputs/genomes/${sp}.fas" \
  -gff "${my_path}/inputs/annotations/${sp}.gff" \
  -out "${my_path}/intermediate/IGR_nucleotide_fasta/${sp}_IGR.fasta"

# tblastn: protein queries (de novo genes) against the nucleotide IGRs, XML output.
tblastn \
  -query "${my_path}/inputs/denovo.pfasta" \
  -subject "${my_path}/intermediate/IGR_nucleotide_fasta/${sp}_IGR.fasta" \
  -out "${my_path}/intermediate/denovo_vs_IGR_xml/ScerDENOVO-vs-${sp}IGR.xml" \
  -outfmt 5

# Build the table of the first hit from the tblastn XML.
python "${my_path}/scripts/Detect_IGORFs_on_denovo.py" \
  -blast "${my_path}/intermediate/denovo_vs_IGR_xml/ScerDENOVO-vs-${sp}IGR.xml" \
  -out "${my_path}/intermediate/denovo_vs_IGR_tab/Scer-vs-${sp}IGR.tab"

# For each species we generated one file ${sp}_IGR.fasta, one file ScerDENOVO-vs-${sp}IGR.xml  
# and one file Scer-vs-${sp}IGR.tab stored in the directory IGR_nucleotide_fasta,
# denovo_vs_IGR_xml and denovo_vs_IGR_tab respectively. 

# /!\ /!\ /!\
# In the Scer-vs-${sp}IGR.tab file, when there is a hit with an intergenic region you see "CDS". 
# However, it is not a hit with a CDS; it is a hit with an IGR. A hit with a CDS is the NaN. 
# To correct it, we MUST replace all the CDS labels with IGR labels.
# The replacement is done in place on every Scer-vs-S*IGR.tab table and touches
# every occurrence of the string "CDS".
# NOTE: `sed -i ''` is the BSD/macOS syntax; GNU sed expects `sed -i` without the
# empty argument.
sed -i '' "s|CDS|IGR|g" "${my_path}/intermediate/denovo_vs_IGR_tab/"Scer-vs-S*IGR.tab

# ============ #
#  End of IGR  #
# ============ #

# =============================== #
#    ALL ORFs MORE THAN 12 NT     #
# =============================== #
# 
#  This step needs to download and install the ORFtrack tool from the ORFmine package
#  ORFmine can be freely downloaded from: https://github.com/i2bc/ORFmine
#
#  First we track all the ORFs in the genome of all the species with more than 12 nt. 
#  This will permit us to detect small anchors of our de novo genes in the genome of
#  each one of the species. 
# -p: do not fail when the directory already exists (second species, re-run).
mkdir -p "${my_path}/intermediate/all_ORFs_more_than_12nt"
#  For each species we launch the following commands (${sp} is the name of the species Spar or Smik etc)

# Annotate all the ORFs of the genome, with the ORF length threshold set to 12.
orftrack \
  -fna "${my_path}/inputs/genomes/${sp}.fas" \
  -gff "${my_path}/inputs/annotations/${sp}.gff" \
  -orf_len 12
# The mapping_orf_${sp}.gff file is picked up from the current directory and
# stored, renamed, with the other intermediate files.
mv "mapping_orf_${sp}.gff" \
  "${my_path}/intermediate/all_ORFs_more_than_12nt/mapping_orf_${sp}_70cov_12size.gff"

# Extract the sequences of the annotated ORFs; -o is the output prefix
# (${sp}SUPERMEGA.pfasta and ${sp}SUPERMEGA.nfasta are the files used afterwards).
orfget \
  -fna "${my_path}/inputs/genomes/${sp}.fas" \
  -gff "${my_path}/intermediate/all_ORFs_more_than_12nt/mapping_orf_${sp}_70cov_12size.gff" \
  -o "${my_path}/intermediate/all_ORFs_more_than_12nt/${sp}SUPERMEGA" \
  -type both

# For each species we generated one file mapping_orf_${sp}_70cov_12size.gff, one file
# ${sp}SUPERMEGA.pfasta and one file ${sp}SUPERMEGA.nfasta all stored in the 
# all_ORFs_more_than_12nt directory

#  Then we blast the 171 de novo genes of S.cerevisiae against the small ORFs of the
#  other species in order to detect the small anchors in the genome of each one of the 
#  species. These anchors will permit us to detect the genomic region of each de novo 
#  gene which does not have a homolog CDS or Intergenic Region. 
#
# Output directories for the raw blast XML and for the first-hit tables.
# -p: do not fail when they already exist (second species, re-run).
mkdir -p "${my_path}/intermediate/denovo_vs_all_ORFs_more_than_12nt_xml"
mkdir -p "${my_path}/intermediate/denovo_vs_all_ORFs_more_than_12nt_tab"
#  For each species we launch the following command (${sp} is the name of the species Spar or Smik etc)
# The blastp-short task is used because the subject ORFs can be very short.
blastp \
  -query "${my_path}/inputs/denovo.pfasta" \
  -subject "${my_path}/intermediate/all_ORFs_more_than_12nt/${sp}SUPERMEGA.pfasta" \
  -out "${my_path}/intermediate/denovo_vs_all_ORFs_more_than_12nt_xml/ScerDENOVO-vs-${sp}SUPERMEGA.xml" \
  -outfmt 5 \
  -task "blastp-short"
#  Then we create a table of the first hit using a homemade script (Detect_IGORFs_on_denovo.py)
python "${my_path}/scripts/Detect_IGORFs_on_denovo.py" \
  -blast "${my_path}/intermediate/denovo_vs_all_ORFs_more_than_12nt_xml/ScerDENOVO-vs-${sp}SUPERMEGA.xml" \
  -out "${my_path}/intermediate/denovo_vs_all_ORFs_more_than_12nt_tab/Scer-vs-${sp}.tab"

# For each species we generated one file ScerDENOVO-vs-${sp}SUPERMEGA.xml and 
# one file Scer-vs-${sp}.tab stored in the directory 
# ${my_path}/intermediate/denovo_vs_all_ORFs_more_than_12nt_xml and 
# ${my_path}/intermediate/denovo_vs_all_ORFs_more_than_12nt_tab respectively.

# ================================== #
#  End of ALL ORFs MORE THAN 12 NT   #
# ================================== #

# At the end of these three steps we have generated 3 types of TAB files. 
# These TAB files contain the first hit of the blast for each one of the de novo genes 
# 	a. Scer de novo genes VS other species CDS (Scer-vs-${sp}CDS.tab) 
# 	b. Scer de novo genes VS other species Intergenic Regions (Scer-vs-${sp}IGR.tab)
#	c. Scer de novo genes VS other species all ORFs more than 12 nt (Scer-vs-${sp}.tab)
#
# Now we can pass to the Ancestral Reconstruction procedure. 

# ====================== #
#     RECONSTRUCTION     #
# ====================== #

# 1. First we run a homemade script made with R. 
#    This script takes as input the 3 types of tables generated (CDS - IGR - IGORFs-12-nt)
#    and identifies which of the de novo genes have at least three homologous sequences
#    in the neighboring species (e-val < 0.01). Among the three homologous sequences, 
#    at least one had to be noncoding in order the de novo gene to be kept.
#    This step outputs a file named TO_KEEP.tab and is a table containing the de novo 
#    genes which will be used for the reconstruction. 
# Select the de novo genes to keep; the script receives the intermediate
# directory (which holds the three kinds of TAB files) as its only argument.
Rscript "${my_path}/scripts/Create_table_to_keep.R" "${my_path}/intermediate"

# 2. Then we have to extract the nucleotide sequence for each one of the de novo genes
#    from every species in which blast identified a homologous sequence (CDS, IGR or IGORF-12nt)
# -p: do not fail when the directory already exists.
mkdir -p "${my_path}/intermediate/FASTA_to_be_aligned"
python "${my_path}/scripts/Extract_sequences.py" -path "${my_path}"

# This script generates one fasta file ${name}.toali and one newick file ${name}.newick
# per de novo gene. Each fasta file contains the nucleotide sequences of the genomic
# region homologous with the de novo gene as identified with blast (CDS,IGR or IGORF-12nt). 
# In addition it generates a table file called New_relationships_list.tab which gives information
# about the de novo genes kept and their neighboring species. 

# 3. We make nucleotide sequences alignments for each de novo gene with MACSE for each set 
#    of nucleotide sequences generated by the previous step. 
# Align every ${name}.toali set of sequences with MACSE. The resulting
# *_NT.toali (nucleotide) and *_AA.toali (amino acid) alignments are expected
# in the same directory as the input, where the commands below pick them up.
# NOTE: macse_v2.05.jar is looked up in the current directory.
for i in "${my_path}/intermediate/FASTA_to_be_aligned/"*.toali; do
  # Skip the literal pattern when no file matches the glob.
  [ -e "${i}" ] || continue
  java -jar macse_v2.05.jar -prog alignSequences -seq "${i}"
done
# Replace every '!' of the nucleotide alignments with the gap character '-'
# (presumably the frameshift marker written by MACSE - confirm in its documentation).
# NOTE: `sed -i ''` is the BSD/macOS syntax.
sed -i '' 's|!|-|g' "${my_path}/intermediate/FASTA_to_be_aligned/"*_NT.toali
# Store the alignments apart from the unaligned ${name}.toali inputs.
# -p: do not fail when the directories already exist.
mkdir -p "${my_path}/intermediate/FASTA_to_be_aligned/macse_results_NT"
mkdir -p "${my_path}/intermediate/FASTA_to_be_aligned/macse_results_AA"
mv "${my_path}/intermediate/FASTA_to_be_aligned/"*_NT.toali \
  "${my_path}/intermediate/FASTA_to_be_aligned/macse_results_NT"
mv "${my_path}/intermediate/FASTA_to_be_aligned/"*_AA.toali \
  "${my_path}/intermediate/FASTA_to_be_aligned/macse_results_AA"

# For each de novo gene we generated one alignment nucleotide file ${name}_NT.toali and 
# one alignment amino acid file ${name}_AA.toali located in 
# ${my_path}/intermediate/FASTA_to_be_aligned/macse_results_NT and 
# ${my_path}/intermediate/FASTA_to_be_aligned/macse_results_AA respectively

# 4. We transform the nucleotide alignments for each de novo gene generated by the 
#    previous step into PHYLIP files using the tool seqret
# -p: do not fail when the directory already exists.
mkdir -p "${my_path}/intermediate/FASTA_to_be_aligned/PHYLIP"
for i in "${my_path}/intermediate/FASTA_to_be_aligned/macse_results_NT/"*_NT.toali; do
  # Skip the literal pattern when no file matches the glob.
  [ -e "${i}" ] || continue
  # File name without directory and without extension, i.e. <gene>_NT
  name=${i##*/}
  name=${name%.*}
  seqret \
    -sequence "FASTA::${my_path}/intermediate/FASTA_to_be_aligned/macse_results_NT/${name}.toali" \
    -outseq "PHYLIP::${my_path}/intermediate/FASTA_to_be_aligned/PHYLIP/${name}.phylip"
done

# For each de novo gene we generated one alignment PHYLIP file ${name}_NT.phylip

# 5. We construct one phylogenetic tree per de novo gene by the alignment of the previous 
#    step using phyml
# The loop runs over the ${name}.toali files of FASTA_to_be_aligned only to
# recover the gene names; PhyML itself reads PHYLIP/${name}_NT.phylip.
for i in "${my_path}/intermediate/FASTA_to_be_aligned/"*.toali; do
  # Skip the literal pattern when no file matches the glob.
  [ -e "${i}" ] || continue
  # File name without directory and without extension, i.e. the gene name
  name=${i##*/}
  name=${name%.*}
  # -d nt: nucleotide data; -u: starting tree (${name}.newick);
  # -b 0: no bootstrap; -c 4: four rate categories.
  # See the PhyML manual for -v e, -o lr, -a e and -f e.
  phyml \
    -i "${my_path}/intermediate/FASTA_to_be_aligned/PHYLIP/${name}_NT.phylip" \
    -d nt -v e -o lr -c 4 -a e -b 0 -f e \
    -u "${my_path}/intermediate/FASTA_to_be_aligned/${name}.newick"
done
# For each de novo gene we generated one tree file ${name}_NT.phylip_phyml_tree.txt
# located in the directory ${my_path}/intermediate/FASTA_to_be_aligned/PHYLIP


# 6. We reconstruct the ancestral sequences for every sequence in the alignment file 
#    using PRANK. As input we give the nucleotide alignment file as generated by MACSE in 
#    step 3 and the gene specific phylogenetic tree as generated in step 5. 

# The loop runs over the ${name}.toali files of FASTA_to_be_aligned only to
# recover the gene names; PRANK reads the MACSE nucleotide alignment (-d) and
# the PhyML tree (-t) and writes its results with the prefix given by -o.
for i in "${my_path}/intermediate/FASTA_to_be_aligned/"*.toali; do
  # Skip the literal pattern when no file matches the glob.
  [ -e "${i}" ] || continue
  # File name without directory and without extension, i.e. the gene name
  name=${i##*/}
  name=${name%.*}
  prank \
    -d="${my_path}/intermediate/FASTA_to_be_aligned/macse_results_NT/${name}_NT.toali" \
    -t="${my_path}/intermediate/FASTA_to_be_aligned/PHYLIP/${name}_NT.phylip_phyml_tree.txt" \
    -o="${my_path}/intermediate/FASTA_to_be_aligned/${name}" \
    -once -showanc
done
# Gather the PRANK outputs in a dedicated directory.
# -p: do not fail when the directory already exists.
mkdir -p "${my_path}/intermediate/FASTA_to_be_aligned/PRANK"
mv "${my_path}/intermediate/FASTA_to_be_aligned/"*.best.anc.dnd \
  "${my_path}/intermediate/FASTA_to_be_aligned/PRANK"
mv "${my_path}/intermediate/FASTA_to_be_aligned/"*.best.anc.fas \
  "${my_path}/intermediate/FASTA_to_be_aligned/PRANK"
mv "${my_path}/intermediate/FASTA_to_be_aligned/"*.best.fas \
  "${my_path}/intermediate/FASTA_to_be_aligned/PRANK"

# For each de novo gene we generated one fasta file ${gene}.best.anc.fas which contains 
# the ancestral nucleotide sequence reconstructed for every branch of the tree. 

# 7. Once we have reconstructed the ancestral sequence of the de novo gene we translate it
#    into the 3 reading frames and detect the ancIGORFs that match with the de novo gene 
#    protein sequence. In this way we are able to detect which ancIGORF (or combination
#    of ancIGORFs) gave birth to each de novo gene. 
# 
#    In order to do that we launch a homemade script Extract_ancestors_fragments.py. The 
#    script uses LALIGN in order to detect the homology of the ancIGORFs and the de novo 
#    gene. For that reason, the code of script must be adjusted and the path of LALIGN 
#    should be given inside the script at the line indicating: 
#               "Let's launch the Lalign and the HCA for the Fragments"
#    In addition the script uses also the pyHCA package in order to calculate the HCA score
#    of the ancIGORFs. pyHCA should be prior installed and can be found at the following link:
#    https://github.com/T-B-F/pyHCA
    
# -p: do not fail when the directory already exists.
mkdir -p "${my_path}/AncFragments"
# The commands below use paths relative to this directory (Fragments_HCA.tmp,
# *.NONfrags). Stop if the cd fails, so that the final rm cannot run in
# another directory.
cd "${my_path}/AncFragments" || exit 1
for i in "${my_path}/intermediate/FASTA_to_be_aligned/"*.toali; do
  # Skip the literal pattern when no file matches the glob.
  [ -e "${i}" ] || continue
  # File name without directory and without extension, i.e. the gene name
  name=${i##*/}
  name=${name%.*}
  # The standard output of every gene is appended to Fragments_HCA.tmp:
  # remove a stale Fragments_HCA.tmp before running this step again.
  python3 "${my_path}/scripts/Extract_ancestors_fragments.py" \
    -fasta "${my_path}/inputs/denovo_fastas/${name}.pfasta" \
    -anc_list "${my_path}/intermediate/New_relationships_list.tab" \
    -type anc \
    -ancs "${my_path}/intermediate/FASTA_to_be_aligned/PRANK/${name}.best.anc.fas" \
    -newick "${my_path}/intermediate/FASTA_to_be_aligned/${name}.newick" \
    >> Fragments_HCA.tmp
done
# Remove the *.NONfrags files left in AncFragments.
rm -fr -- ./*.NONfrags

# At the end of this step there was generated a directory named ${my_path}/AncFragments
# In this directory there is one file ${name}.frags and one file ${name}.frags_ali per
# de novo gene. The ${name}.frags is a fasta file containing the aminoacid sequence of 
# the ancIGORFs that gave birth to the de novo gene. The ${name}.frags_ali file contains
# an alignment of each ancIGORF with the de novo gene as detected by Lalign. 

# A posterior analysis of all the ancIGORFs and de novo genes reconstruction showed us 
# that one gene named YMR151W was not properly reconstructed and in fact it was a weird 
# false case so we decided to delete it from our analysis. We do this with the following 
# command: 
# The paths below are relative to my_path: stop if the cd fails.
cd "${my_path}" || exit 1
# Keep every line that does not mention YMR151W.
grep -v YMR151W AncFragments/Fragments_HCA.tmp > AncFragments_HCA.tab

# We have generated the file AncFragments_HCA.tab. This file contains all the information 
# needed for every ancIGORF that participated in the birth of a de novo gene. 
# The columns of the file are: 
# De novo gene name ; ancIGORF name ; Ancestor species ; Localization on the sequence ; 
# HCA score of de novo gene ;  HCA score of ancIGORF ; coverage of the de novo gene by this ancIGORF ;  
# coverage of the ancIGORF by this de novo gene ; HCA barcode of de novo gene; HCA barcode of ancIGORF; 
# ancIGORF aminoacid sequence

# We generate the nucleotide FASTA of the AncIGORFs
# Stop if the cd fails, so that `rm -fr YMR151W*` cannot run in another directory.
cd "${my_path}/AncFragments" || exit 1
# Discard every file of the gene YMR151W, which is excluded from the analysis.
rm -fr -- ./YMR151W*
# Concatenate, for every ${name}.frags_nt file, each header line (">") and the
# line that follows it.
for i in *.frags_nt; do
  # Skip the literal pattern when no file matches the glob.
  [ -e "${i}" ] || continue
  grep -A1 ">" "${i}"
done > ../AncFragments.nfasta
cd "${my_path}" || exit 1
# Remove the "--" strings (presumably the group separators printed by grep -A1;
# note that every occurrence of "--" is removed, not only separator lines),
# then delete the lines left empty.
# NOTE: `sed -i ''` is the BSD/macOS syntax.
sed -i '' "s|--||g" AncFragments.nfasta
sed -i '' '/^[[:space:]]*$/d' AncFragments.nfasta


# We also count the ORF triggers that gave birth to the de novo gene (FrameShift (FS) / Stop codon substitution (CSS)). 
# To do so we use the homemade script mutational_events.py found in the scripts directory

# The results are appended to mutational_events.txt in the current directory:
# remove a stale mutational_events.txt before running this step again.
for file in "${my_path}/AncFragments/"*.frags_ali; do
  # Skip the literal pattern when no file matches the glob.
  [ -e "${file}" ] || continue
  python3 "${my_path}/scripts/mutational_events.py" -ali "${file}" >> mutational_events.txt
done

# This will generate a file called mutational_events.txt counting for each de novo gene the FS and CSS.





