#
# This text file contains concatenated Perl scripts, as used for data compilation and analysis of
# the paper 'Phylogenomic analysis reveals bees and wasps (Hymenoptera) at the base of the radiation
# of Holometabolous insects' by Joel Savard et al.. 
#
# To run them, you first need to cut them were indicated, save them as independent files, and then 
# run them under Perl with the appropriate command line arguments. 
#
# The relevant extract from the 'MJL_dataUtils' package is at the end of this file; please save this 
# in the same directory as any script that needs it (mostly for convenient input of FASTA format 
# files).
#
# For questions, please contact the last author at M.J.Lercher@bath.ac.uk.
#

#
#======= CUT HERE AND SAVE AS SEPARATE SCRIPT FILE =================================================
#
####################################################################################################
#                                                                                                  #
# Extracts non-redundant sequences from a fasta file                                               #
#                                                                                                  #
####################################################################################################
#!/usr/bin/perl
use strict ;
use MJL_dataUtils ;

(@ARGV == 1) || die "usage: $0 fastaFile\n" ;
my $inFile = shift ;

my $outFile = $inFile ;
$outFile =~ s/\.\w+$// ;
$outFile .= 'NR0.fa' ;

my @seqs = readFasta ($inFile) ;
my $n = scalar @seqs ;

# restrict to non-redundant sequences:
my @seqsNR = nonRedundantSeqS (@seqs) ;
my $nNR = scalar @seqsNR ;

print "compacting $n sequences to $nNR non-redundant sequences...\n" ;

# write to file:
open (OUT, ">$outFile") || die ;
foreach my $seq (@seqsNR) {
	print OUT ">$seq->[0]   $seq->[2]\n$seq->[1]\n\n" ;
}
close OUT ;


#:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
=head1 NAME
nonRedundantSeqS

=SYNOPSIS
remove (partial) duplicated sequences from set of sequences
redundant means one is contained in the other
Argument:	array of sequence pointers (name, seq, desc)
Returns:	array of non-redundant sequence pointers (name, seq, desc)

=cut
#.............................................................................

sub nonRedundantSeqS (@) {
	my @seqs = @_ ;

	my $counter = my $percentage = 0 ;
	for (my $i1=0; $i1<@seqs-1; $i1++) {
		my $seq1 = $seqs[$i1][1] ;
		for (my $i2=$i1+1; $i2<@seqs; $i2++) {
			my $seq2 = $seqs[$i2][1] ;
			if (index ($seq1, $seq2) >= 0) {
				$seqs[$i2][2] = 'redundant' ;
			} elsif (index ($seq2, $seq1) >= 0) {
				$seqs[$i1][2] = 'redundant' ;
			}
		}
		if (int($counter++ / scalar (@seqs) * 100) > $percentage) {
			printf STDERR "\r%3d \%", $percentage ;
			$percentage = int($counter++ / scalar (@seqs) * 100) ;
		}
	}
	print STDERR "\n" ;
	# remove redundant proteins and write non-redundant set to file:
	return grep ($_->[2] ne 'redundant', @seqs) ;
}

#
#======= CUT HERE AND SAVE AS SEPARATE SCRIPT FILE =================================================
#
###################################################################################################
#                                                                                                 #
# Convert cDNA/ESTs to AA by blast against Drosophila proteins                                    #
#                                                                                                 #
###################################################################################################

#!/usr/bin/perl
use strict ;

# file & species names:
my @newSpeciesS			= qw/Hc / ;
my @newSpeciesName		= qw/H_coagulata / ;
my @newSpeciesFasta		= qw/H_coagulata.fasta / ;

# parameters specific to the project / machine:
my $fileBase 	= 'Lm2006' ;

# blast programs and database for Drosophila:
my $blastxExec	= '/usr/bin/blastall -a 4 -p blastx -e 1 -v 1 -b 1 -I T' ;
my $blastpExec	= '/usr/bin/blastall -a 4 -p blastp -e 1 -v 1 -b 1 -I T' ;
# my $blastxExec	= '\biotools\blast\blastall -a 4 -p blastx -e 1 -v 1 -b 1 -I T' ;
# my $blastpExec	= '\biotools\blast\blastall -a 4 -p blastp -e 1 -v 1 -b 1 -I T' ;
my $Dm			= 'D_melanogasterNR' ;

###################################################################################################
for (my $i=0; $i<@newSpeciesName; $i++) {
	# file names:
	my $newSpeciesAA	= $newSpeciesName[$i] . '.fa' ;
	my $blastNewspeciesDm  = $newSpeciesS[$i] . 'Dm.bls' ;
	my $blastNewspeciesDmS = $newSpeciesS[$i] . 'DmBLAST.tab' ;

	# blast Newspecies ESTs against Dm:
	print "\nblasting $newSpeciesName[$i] ESTs against Dm...\n" ;
	`$blastxExec -i $newSpeciesFasta[$i] -o $blastNewspeciesDm -d $Dm` unless (-f $blastNewspeciesDm) ;
	processBLAST ($blastNewspeciesDm, $blastNewspeciesDmS)    unless (-f $blastNewspeciesDmS) ;

	#find longest open reading frame in the Dm frame:
	print "finding longest ORF...\n" ;
	translateEST ($blastNewspeciesDmS, $newSpeciesFasta[$i], $newSpeciesAA) unless (-f $newSpeciesAA) ;
}

print "\nPlease call nonRedundant.pl and then Trib_BLAST11.pl\n" ;

###################################################################################################
sub readColumn {
	my $c = $_[1] ;
	open (IN, $_[0]) || die "couldn't open $_[0]" ;
	<IN> ;
	my @o = () ;
	while (<IN>) {
		chomp ;
		my @l = split ("\t", $_) ;
		push @o, $l[$c] ;
	}
	close IN ;
	return @o ;
}

###################################################################################################

sub translateEST {
	my ($blastS, $est, $estAA) = @_ ;

	# read frames from Dm blast:
	open (IN, $blastS) || die ;
	<IN> ;
	my %frame = () ;
	while (<IN>) {
		my @l = split ("\t", $_) ;
		$frame{$l[0]} = $l[1] ; # frame in BLAST := first nucleotide of codon (starting from 1)
	}
	close IN ;

	open (OUTtranslate, ">$estAA") || die ;

	# read all EST sequences and translate each in its frame, finding longest ORF:
	open (IN, $est) || die ;
	my $seqName = my $seq = '' ;
	while (<IN>) {
		if (/^>/) {
			# process previous sequence:
			my $frame = $frame{$seqName} ;
			if ($seqName && $frame && ($frame ne '-')) {
				my $protein = longestORF ($seq, $frame) ;
				print OUTtranslate ">$seqName frame$frame{$seqName}\n$protein\n" ;
			}
			# get new sequence name:
			($seqName) = /^>(?:gi\|)?([^\s\|]+)/ ; ##### THIS LINE CAUSED PROBLEMS WITH RAW DATA BEFORE
			$seq = '' ;
		} else {
			$seq .= $_ ;
		}	
	}
	my $frame = $frame{$seqName} ;
	if ($seqName && $frame && ($frame ne '-')) {
		my $protein = longestORF ($seq, $frame) ;
		##### if you want to modify the sequence name, do it here (although it won't match the blast .tab file anymore):
		print OUTtranslate ">$seqName  frame$frame\n$protein\n" ;
	}
	close IN ;
	close translateOUT ;
}

sub longestORF {
	my ($seq, $frame) = @_ ;

	my ($prot) = translate ($seq, $frame) ;
	my @aa = split ('', $prot) ;
	my $lengthLongest = 0 ;
	my $longestPept = '' ;
	while (@aa) {
		my $pept = '' ;
		while (@aa && (($a = shift(@aa)) ne '*')) {
			$pept .= $a ;
		}
		$pept =~ s/^X+|X+$//g ; # remove leading / trailing unknown aa
		if (length($pept) > $lengthLongest) {
			$lengthLongest = length($pept) ;
			$longestPept = $pept ;
		}
	}

	return ($longestPept) ;
}

#:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
=head1 NAME
getFastaList

=head1 SYNOPSIS
read Fasta file and write those sequences specified in a list to a second file
- this is much faster than extracting sequences from a BLAST db.
Returns		number of read sequences
Arguments	hash of arguments:
		inFile		=> Fasta input file name, 
		outFile		=> Fasta output file name, 
		accessions 	=> pointer to list of accessions,
		ignoreVersion 	=> 0,1 [=0] ignore version tail (.3 in NM_005203.3)
		parseID		=> [=0] set to regexp string to parse ID (e.g., '[^\d]*(\d+)')

=cut
#.............................................................................

sub getFastaList {
	my %args = (
		outFile => 'getFasta.fa' ,
		ignoreVersion => 0 ,
		parseID => 0 ,
		verbose => 0 ,
		@_ 
	) ;
	my $ignoreVersion = $args{ignoreVersion} ;
	my $parseID = $args{parseID} ;
	my $verbose = $args{verbose} ;

	my %get = () ;
	foreach my $acc (@{$args{accessions}}) {
		$get{$acc} = 1 ;
	}
	my $total = my $toDo = @{$args{accessions}} ;
	$verbose && printf ("searching %d sequences:\n", $toDo) ;

	open (IN, $args{inFile}) || die "couldn't open $args{inFile}" ;
	open (OUT, ">$args{outFile}") || die "couldn't open $args{outFile}" ;
	
	my $header = '' ;
	while ($header = <IN>) {
		last if $header =~ /^>/ ;
	}

	my $seq = '' ;
	$verbose && (my $t0 = time () - 1e-6) ;
	while (1) {
		my $line = <IN> ;
		unless (($line =~ /^>/) || eof IN) {
			$seq .= $line ;
		} else {
			my ($name, $desc) = ($header =~ /^>(\S*)\s+(.*)/) ;
			foreach my $acc (split /\|/, $name) {
				($acc =~ s/\..*$//) if $ignoreVersion ;
				(($acc) = ($acc =~ /$parseID/)) if $parseID ;
				if ($get{$acc}) {
					print OUT $header, $seq ;
					$toDo-- ;
					delete $get{$acc} ;
					$verbose && !($toDo % 100) && printf (STDERR "\r%6d seqs. to go - %5d seq./sec.      ", $toDo, ($total - $toDo) / (time () - $t0)) ;
					last ;
				}
			}
			last if (eof (IN) || !$toDo) ;
			$header = $line ;
			$seq = '' ;
		}
	}
	close IN ;
	close OUT ;

	# write accession we didn't find to log file:
	$verbose && print ("\nmissed $toDo sequences - see missedSeqs.log") ;
	open (OUT, ">missedSeqs.log") || die ;
	print OUT join ("\n", keys %get) ;
	close OUT ;

	return ($total - $toDo) ;
}

sub translate {
	my ($seq, $frame) = @_ ;

###################################################################################################
# set up the translation table:
	my %translation = (
			TTT => 'F' ,	TCT => 'S' ,	TAT => 'Y' ,	TGT => 'C' ,
			TTC => 'F' , 	TCC => 'S' ,	TAC => 'Y' ,	TGC => 'C' ,
			TTA => 'L' ,	TCA => 'S' ,	TAA => '*' ,	TGA => '*' ,
			TTG => 'L' ,	TCG => 'S' ,	TAG => '*' ,	TGG => 'W' ,

			CTT => 'L' ,	CCT => 'P' ,	CAT => 'H' ,	CGT => 'R' ,
			CTC => 'L' ,	CCC => 'P' ,	CAC => 'H' ,	CGC => 'R' ,
			CTA => 'L' ,	CCA => 'P' ,	CAA => 'Q' ,	CGA => 'R' ,
			CTG => 'L' ,	CCG => 'P' ,	CAG => 'Q' ,	CGG => 'R' ,

			ATT => 'I' ,	ACT => 'T' ,	AAT => 'N' ,	AGT => 'S' ,
			ATC => 'I' ,	ACC => 'T' ,	AAC => 'N' ,	AGC => 'S' ,
			ATA => 'I' ,	ACA => 'T' ,	AAA => 'K' ,	AGA => 'R' ,
			ATG => 'M' ,	ACG => 'T' ,	AAG => 'K' ,	AGG => 'R' ,

			GTT => 'V' ,	GCT => 'A' ,	GAT => 'D' ,	GGT => 'G' ,
			GTC => 'V' ,	GCC => 'A' ,	GAC => 'D' ,	GGC => 'G' ,
			GTA => 'V' ,	GCA => 'A' ,	GAA => 'E' ,	GGA => 'G' ,
			GTG => 'V' ,	GCG => 'A' ,	GAG => 'E' ,	GGG => 'G'
		) ;
###################################################################################################

	$seq =~ s/\s//g ;

	# convert CDS to upper case and replace U with T:
	$seq = "\U$seq" ;
	$seq =~ tr/U/T/ ;

	# use complementary sequence if frame<0:
	if ($frame < 0) {
		$seq = reverse ($seq) ;
		$seq =~ tr/ATGC/TACG/ ;
		$frame = -$frame ;
	}

	# set frame:
	$seq = substr ($seq, $frame - 1) ;

	# translate:
	my $peptide = '' ;
	while (my $codon = substr($seq, 0, 3, "")) {
		$peptide .= $translation{$codon} || 'X' ;
	}
	return $peptide ;
}	

###################################################################################################
#                                                                                                 #
# Extract info from blast output files                                                            #
#                                                                                                 #
###################################################################################################

sub processBLAST {
	my $inFile = shift ;
	my $outFile = shift ;

	open (IN, $inFile) || die "couldn't open $inFile" ;
	open (OUT, ">$outFile") || die "couldn't open $outFile" ;

	print OUT "query	frame	hit	hitFull	Expect	identities	total	startQuery	endQuery\n" ;

	my $n = my $noHit = 0 ;

	while (<IN>) {
		last if /^Query=/ ;
	}
	my $x = $_ ;
	while (<IN>) {
		$x .= $_ ;
		next unless /^Query=/ ;
		$n++ ;
		print OUT process($x, $noHit) ;
		$x = $_ ;
	}
	close IN ;
	$n++ ;
	print OUT process($x, $noHit) ;
	close OUT ;
	return ($n, $noHit) ;
}

###################################################################################################
sub process {
	my $x = shift ;
	my $noHit = shift ;

	# extract info for this query:
	(my $query)	= ($x =~ /^Query=\s*(?:gi\|)?([^\s\|]+)/) ;
	(my $hit)		= ($x =~ /\n>([^\n]+)\n/) ;
	(my $expect)	= ($x =~ /\n Score =\s*[\d\.]+ bits.*Expect(?:\(2\))? =\s*([\d\.e\+\-]+)\s/) ;
	my ($identities, $total) = ($x =~ /\n Identities =\s*(\d+)\/(\d+) /) ;
	(my $frame) 	= ($x =~ /\n Frame =\s*([\+\-\d]+)\s*\n/) ;
	(my $start) 	= ($x =~ /(?:Frame =\s*[\+\-\d]+\s*|\)\n)\nQuery: (\d+) /) ;
	(my $end) 		= ($x =~ /\nQuery:\s+\d+\s+\S+\s+(\d+)\s[^:]*:[^:]*(Subset of the database|Score)/s) ;
	my $acc = '' ;
	if ($hit) {
		($acc) 	= ($hit =~ /^(?:gi\|)?([\w\-\.]+)/) ;
		$acc ||= '-' ;
		($expect = 1 . $expect) if ($expect && $expect =~ /^e/) ;
	} else {
		$noHit++ ;
		$acc = $hit = '-' ;
	}
	$frame	||= '-' ;
	$expect	||= '-' ;
	$total	||= '-' ;
	$start	||= '-' ;
	$end	||= '-' ;
	$identities	||= '-' ;
	return "$query	$frame	$acc	$hit	$expect	$identities	$total	$start	$end\n" ;
}

#
#======= CUT HERE AND SAVE AS SEPARATE SCRIPT FILE ================================================
#
###################################################################################################
#                                                                                                 #
# Does the blast against smallest dataset to select subsets                                       #
#                                                                                                 #
###################################################################################################

#!/usr/bin/perl
use strict ;

my $fileBase 	= 'Lm2006' ;	# could be also Nassonia... (should be smallest data set)
my $LocustaAA	= "L_migratoriaNR.fa" ;		# amino acids from blast against fly

my @species = qw/Dm Ag Bm Tc Am Am2 Nv Ng Ap Hc Da Bi Ra/ ;
my @speciesL = ('D_melanogasterNR', 'A_gambiaeNR', 'B_moriNR', 'T_castaneumNR', 'A_melliferaNR', 'A_mellifera2NR', 'N_vitripennisNR', 'N_giraultiNR', 'A_pisumNR', 'H_coagulataNR', 'D_magnaNR', 'B_microplusNR', 'R_appendiculatusNR') ;

###################################################################################################

my $blastpExec  = '/usr/bin/blastall -p blastp -a 4 -e 1 -v 1 -b 1 -I T' ;

###################################################################################################

# blast Locusta EST AAs ($LocustaAA.fa) against other species:
for (my $i=0; $i<@species; $i++) {
	my $s = $species[$i] ;				# e.g., Dm
	my $sL = $speciesL[$i] ;			# e.g., D_melanogasterNR
	my $fileName = "$fileBase${s}Acc" ;	# we'll write those that match Locusta in here
	next if (-f "$fileName.fa") ;		# skip if we've already got everything
	print "\nblasting Lm AAs against $sL...\n" ;
	my $blastFile = "${fileBase}Lm$s.bls" ;		# raw blast output file
	my $blastFileS = "${fileBase}Lm$s.tab" ;	# processed blast output table with main info
	`$blastpExec -i $LocustaAA -o $blastFile -d $sL` unless (-f $blastFile) ;
	print `processBLAST.pl $blastFile $blastFileS`  unless (-f $blastFileS) ;

	# write those sequences that matched Locusta to accession & fasta file:
	my @acc = readColumn($blastFileS, 2) ;
	open (OUT, ">$fileName.txt") || die ;
	print OUT join("\n", @acc) ;
	close OUT ;
	getFastaList (inFile => "$sL.fa", outFile => "$fileName.fa", accessions => \@acc) ;
}

print "\nPlease call Lm_reciprocalBLAST12.pl\n" ;

###################################################################################################
sub readColumn {
	my $c = $_[1] ;
	open (IN, $_[0]) || die "couldn't open $_[0]" ;
	<IN> ;
	my @o = () ;
	while (<IN>) {
		chomp ;
		my @l = split ("\t", $_) ;
		push @o, $l[$c] ;
	}
	close IN ;
	return @o ;
}

###################################################################################################

sub translateLm {
	my ($blastLmDmS, $Locusta, $LocustaAA) = @_ ;

	# read frames from Dm blast:
	open (IN, $blastLmDmS) || die ;
	<IN> ;
	my %frame = () ;
	while (<IN>) {
		my @l = split ("\t", $_) ;
		$frame{$l[0]} = $l[1] ; # frame in BLAST := first nucleotide of codon (starting from 1)
	}
	close IN ;

	open (OUT, ">$LocustaAA") || die ;

	# read all Locusta sequences and translate each in its frame, finding longest ORF:
	open (IN, $Locusta) || die ;
	my $seqName = my $seq = '' ;
	while (<IN>) {
		if (/^>/) {
			# process previous sequence:
			if ($seqName && ($frame{$seqName} ne '-')) {
				my $protein = longestORF ($seq, $frame{$seqName}) ;
				print OUT ">$seqName\n$protein\n" ;
			}
			# get new sequence name:
			($seqName) = /^>([\w\.]+)/ ;
			$seq = '' ;
		} else {
			$seq .= $_ ;
		}	
	}
	if ($seqName && ($frame{$seqName} ne '-')) {
		my $protein = longestORF ($seq, $frame{$seqName}) ;
		print OUT ">$seqName\n$protein\n" ;
	}
	close IN ;
	close OUT ;
}

sub longestORF {
	my ($seq, $frame) = @_ ;

	my ($prot) = translate ($seq, $frame) ;
	my @aa = split ('', $prot) ;
	my $lengthLongest = 0 ;
	my $longestPept = '' ;
	while (@aa) {
		my $pept = '' ;
		while (@aa && (($a = shift(@aa)) ne '*')) {
			$pept .= $a ;
		}
		$pept =~ s/^X+|X+$//g ; # remove leading / trailing unknown aa
		if (length($pept) > $lengthLongest) {
			$lengthLongest = length($pept) ;
			$longestPept = $pept ;
		}
	}

	return ($longestPept) ;
}

sub translate {
	my ($seq, $frame) = @_ ;

	my $tmpFile = "tmp.fa" ;
	open (OUT2, ">$tmpFile") || die ;
	print OUT2 ">tmp\n$seq" ;
	close OUT2 ;

	my $file = `\\martin\\perl\\sequences\\translate.pl $tmpFile $frame 1` ;
	open (IN2, $file) || die ;
	<IN2> ;
	my $protein = join ("", <IN2>) ;
	$protein =~ s/\s//g ;
	close IN2 ;
	return $protein ;
}	

#:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
=head1 NAME
getFastaList

=head1 SYNOPSIS
read Fasta file and write those sequences specified in a list to a second file
- this is much faster than extracting sequences from a BLAST db.
Returns		number of read sequences
Arguments	hash of arguments:
		inFile		=> Fasta input file name, 
		outFile		=> Fasta output file name, 
		accessions 	=> pointer to list of accessions,
		ignoreVersion 	=> 0,1 [=0] ignore version tail (.3 in NM_005203.3)
		parseID		=> [=0] set to regexp string to parse ID (e.g., '[^\d]*(\d+)')

=cut
#.............................................................................

sub getFastaList {
	my %args = (
		outFile => 'getFasta.fa' ,
		ignoreVersion => 0 ,
		parseID => 0 ,
		verbose => 0 ,
		@_ 
	) ;
	my $ignoreVersion = $args{ignoreVersion} ;
	my $parseID = $args{parseID} ;
	my $verbose = $args{verbose} ;

	my %get = () ;
	foreach my $acc (@{$args{accessions}}) {
		$get{$acc} = 1 ;
	}
	my $total = my $toDo = @{$args{accessions}} ;
	$verbose && printf ("searching %d sequences:\n", $toDo) ;

	open (IN, $args{inFile}) || die "couldn't open $args{inFile}" ;
	open (OUT, ">$args{outFile}") || die "couldn't open $args{outFile}" ;
	
	my $header = '' ;
	while ($header = <IN>) {
		last if $header =~ /^>/ ;
	}

	my $seq = '' ;
	$verbose && (my $t0 = time () - 1e-6) ;
	while (1) {
		my $line = <IN> ;
		unless (($line =~ /^>/) || eof IN) {
			$seq .= $line ;
		} else {
			my ($name, $desc) = ($header =~ /^>(\S*)\s+(.*)/) ;
			foreach my $acc (split /\|/, $name) {
				($acc =~ s/\..*$//) if $ignoreVersion ;
				(($acc) = ($acc =~ /$parseID/)) if $parseID ;
				if ($get{$acc}) {
					print OUT $header, $seq ;
					$toDo-- ;
					delete $get{$acc} ;
					$verbose && !($toDo % 100) && printf (STDERR "\r%6d seqs. to go - %5d seq./sec.      ", $toDo, ($total - $toDo) / (time () - $t0)) ;
					last ;
				}
			}
			last if (eof (IN) || !$toDo) ;
			$header = $line ;
			$seq = '' ;
		}
	}
	close IN ;
	close OUT ;

	# write accession we didn't find to log file:
	$verbose && print ("\nmissed $toDo sequences - see missedSeqs.log") ;
	open (OUT, ">missedSeqs.log") || die ;
	print OUT join ("\n", keys %get) ;
	close OUT ;

	return ($total - $toDo) ;
}

#
#======= CUT HERE AND SAVE AS SEPARATE SCRIPT FILE ================================================
#
###################################################################################################
#                                                                                                 #
# This program tests for full orthology by reciprocal blast                                       #
#                                                                                                 #
###################################################################################################

#!/usr/bin/perl
use strict ;

my $eCutoff		= 1e-25	; # only accept hits below this e-value cutoff - should be 1 for the first run!!!!

my $base 		= 'Lm2006' ;
my $execPath	= '/usr/bin/blastall' ;
my $outFile2	= $base . "Reciprocals$eCutoff.tab" ;

my @species  = ('Dm', 'Ag', 'Bm', 'Tc', 'Am', 'Am2', 'Nv', 'Ng', 'Ap', 'Hc', 'Da') ;	# excluding Lm
my %fullName = (	# these are the names of the BLAST databases
		'Dm' => 'D_melanogasterNR' ,
		'Ag' => 'A_gambiaeNR' ,
		'Bm' => 'B_moriNR' ,
		'Tc' => 'T_castaneumNR' ,
		'Am' => 'A_melliferaNR' ,
		'Am2' => 'A_mellifera2NR' ,
		'Nv' => 'N_vitripennisNR' ,
		'Ng' => 'N_giraultiNR' ,
		'Lm' => 'L_migratoriaNR' ,
		'Ap' => 'A_pisumNR' ,
		'Hc' => 'H_coagulataNR' ,
		'Da' => 'D_magnaNR' ) ;

my $blastExec 	=  "$execPath -p blastp -a 4 -v 1 -b 1 -I T -e $eCutoff " ;

my @species1 = @species ; # or: my @species1 = @species[0..2]

###################################################################################################
# take Locusta hits in each species from files, and then blast everybody's sequences against everybody else:
my %hits = () ;
foreach my $s1 (@species1) {
	# make lookup table query->hit for Locusta:
	%{$hits{Lm}{$s1}} = readBlast("${base}Lm$s1.tab") ;
	# blast against all:
	my $query 	  = "$base${s1}Acc.fa" ;
	foreach my $s2 ('Lm', @species) {
		next if ($s1 eq $s2) ;
		my $outFile  = "$base$s1$s2.bls" ;
		my $outFileS = "$base$s1$s2.tab" ;
		print "blasting $fullName{$s1} -> $fullName{$s2} ...\n" ;
		`$blastExec -i $query -o $outFile -d $fullName{$s2}` unless ((-f $outFile) || (-f $outFileS)) ; #skip if done before...
		processBLAST ($outFile, $outFileS) unless (-f $outFileS) ;
		# make lookup table query->hit (check for reciprocal  best hits with this later):
		%{$hits{$s1}{$s2}} = readBlast($outFileS) ;
	}
}

###################################################################################################
# check for reciprocal  best hits (all pairwise comparisons):
open (OUT, ">$outFile2") || die ;
print OUT "L_migratoria" ;
push @species, 'Lm' ;
foreach my $s (@species) {
	print OUT "\t$fullName{$s}" ;
}
foreach my $s1 (0..$#species) {
	foreach my $s2 ($s1+1 .. $#species) {
		print OUT "\t$species[$s1]$species[$s2]" ;
	}
}
print OUT "\n" ;

foreach my $t (keys %{$hits{Lm}{Dm}}) {
	next unless $hits{Lm}{Dm}{$t} ;
	$hits{Lm}{Lm}{$t} = $t ;	# to find back the Locusta gene itself
	print OUT $t ;
	foreach my $i1 (0..$#species) {
		my $h1 = $hits{Lm}{$species[$i1]}{$t} || 0 ;
		print OUT "\t$h1" ;
	}

	# check for pairwise reciprocity:
	foreach my $i1 (0..$#species) {
		my $s1 = $species[$i1] ;
		my $h1 = $hits{Lm}{$s1}{$t} ;
		foreach my $i2 ($i1+1 .. $#species) {
			my $s2 = $species[$i2] ;
			my $h2 = $hits{Lm}{$s2}{$t} ;
			my $s1_s2 = $h1 && $hits{$s1}{$s2}{$h1} ;
			my $s2_s1 = $h2 && $hits{$s2}{$s1}{$h2} ;
			my $rec = ($h1 && $h2 && ($s1_s2 eq $h2) && ($s2_s1 eq $h1)) || 0 ;
			print OUT "\t$rec" ;
		}
	}
	print OUT "\n" ;
}
close OUT ;

###################################################################################################

sub readBlast {
	my $file = $_[0] ;
	my %hash ;
	$hash{'-'} = 0 ;
	open (IN, $file) || die "couldn't open $file" ;
	<IN> ;
	while (<IN>) {
		chomp ;
		my @l = split ("\t", $_) ;
		next if ($eCutoff && ($l[4] > $eCutoff)) ;	## new line to exclude distant hits
		$hash{$l[0]} = ($l[2] ne '-') ? $l[2] : 0 ;
	}
	close IN ;
	return %hash ;
}

###################################################################################################
#                                                                                                 #
# Extract info from blast output files                                                            #
#                                                                                                 #
###################################################################################################

sub processBLAST {
	my $inFile = shift ;
	my $outFile = shift ;

	open (IN, $inFile) || die "couldn't open $inFile" ;
	open (OUT, ">$outFile") || die "couldn't open $outFile" ;

	print OUT "query	frame	hit	hitFull	Expect	identities	total	startQuery	endQuery\n" ;

	my $n = my $noHit = 0 ;

	while (<IN>) {
		last if /^Query=/ ;
	}
	my $x = $_ ;
	while (<IN>) {
		$x .= $_ ;
		next unless /^Query=/ ;
		$n++ ;
		print OUT process($x, $noHit) ;
		$x = $_ ;
	}
	close IN ;
	$n++ ;
	print OUT process($x, $noHit) ;
	close OUT ;
	return ($n, $noHit) ;
}

###################################################################################################
sub process {
	my $x = shift ;
	my $noHit = shift ;

	# extract info for this query:
	(my $query)	= ($x =~ /^Query=\s*(?:gi\|)?([^\s\|]+)/) ;
	(my $hit)		= ($x =~ /\n>([^\n]+)\n/) ;
	(my $expect)	= ($x =~ /\n Score =\s*[\d\.]+ bits.*Expect(?:\(2\))? =\s*([\d\.e\+\-]+)\s/) ;
	my ($identities, $total) = ($x =~ /\n Identities =\s*(\d+)\/(\d+) /) ;
	(my $frame) 	= ($x =~ /\n Frame =\s*([\+\-\d]+)\s*\n/) ;
	(my $start) 	= ($x =~ /(?:Frame =\s*[\+\-\d]+\s*|\)\n)\nQuery: (\d+) /) ;
	(my $end) 		= ($x =~ /\nQuery:\s+\d+\s+\S+\s+(\d+)\s[^:]*:[^:]*(Subset of the database|Score)/s) ;
	my $acc = '' ;
	if ($hit) {
		($acc) 	= ($hit =~ /^(?:gi\|)?([\w\-\.]+)/) ;
		$acc ||= '-' ;
		($expect = 1 . $expect) if ($expect && $expect =~ /^e/) ;
	} else {
		$noHit++ ;
		$acc = $hit = '-' ;
	}
	$frame	||= '-' ;
	$expect	||= '-' ;
	$total	||= '-' ;
	$start	||= '-' ;
	$end	||= '-' ;
	$identities	||= '-' ;
	return "$query	$frame	$acc	$hit	$expect	$identities	$total	$start	$end\n" ;
}

#
#======= CUT HERE AND SAVE AS SEPARATE SCRIPT FILE ================================================
#
###################################################################################################
#                                                                                                 #
# Align amino acid sequences using MUSCLE with default settings                                   #
#                                                                                                 #
###################################################################################################

#!/usr/bin/perl
use strict ;
use MJL_dataUtils ;

my $seqDir = '/home/jsavard/Phylo2006/aa_families/fasta' ;
my $ext = 'fa' ;
my $executable = '/home/jsavard/phylogeny/muscle_V3.51/muscle' ;
my $speciesNames = 1 ;	# set to 1 to replace sequence IDs by species names (else 0)
my $shortNames = 1 ;	# set to 1 to use only first sequence ID as file name (else 0)

$ARGV[0] || die "usage: alignMUSCLE.pl inFile [-seqDir $seqDir]
	inFile contains sequence names to be aligned\n" ;

my $inFile = shift @ARGV ;
while (my $arg = shift @ARGV) {
	if ($arg =~ /^-s/) {
		$seqDir = shift @ARGV ;
	}
}
my $fileBase = $inFile ;
$fileBase =~ s/\.\w+$// ;
my $alignDir = $fileBase ;
mkdir ($alignDir) ;
my $logFile = $fileBase . '_MUSCLE.log' ;
open (LOG, ">>$logFile") || die ;
# $fileBase =~ s/.*\\// ;	# Windows: remove leading directories from $filebase (derived from $inFile)
$fileBase =~ s/.*\/// ;	# Linux: remove leading directories from $filebase (derived from $inFile)	##MJL##

open (IN, "$inFile") || die ;
chomp (my $header = <IN>) ;	# read header and remove \n
my @species = split ("\t", $header) ;	# get species from header (= names for directories)

my $t0 = my $t1 = time() ;	# set time to 0
my $inFASTA = 'tmp_fasta.tmp' ;
while (<IN>) {
	# get gene identifiers from the list file:
	chomp ;
	next unless $_ ;	# skip emtpy lines
	my @gene = split ("\t", $_) ;	# get gene names
	my $outFASTA = $shortNames ? "${fileBase}_$gene[0].fa" : join ("_", @gene) . '.fa' ;	# output family file

	unless (-f $outFASTA) { # skip if we've done this earlier:
		# concatenate the FASTA sequences for all genes into one file:
		unlink $inFASTA ;	# delete file - start with a clean slate
		$outFASTA =~ s/\|//g ;	# remove any '|' in file name
		$outFASTA = "$alignDir/$outFASTA" ;	###MJL### there was a \\ here for windows
		$outFASTA =~ s/\s+//g ;		###MJL### should really be unnecessary: remove \n and other blanks from file name
		my $concatSeq = "" ;
		foreach my $i (0..$#species) {
			my $file = "$seqDir/$species[$i]/$gene[$i].$ext" ;
			$file =~ s/\|//g ;
			if (my @seq = readFasta($file)) {
				my $name = $speciesNames ? $species[$i] : $seq[0][0] ;
				$seq[0][1] =~ s/\*/X/g ;	# MUSCLE doesn't like *
				$concatSeq .= ">$name\n$seq[0][1]\n\n" ;
			} else {
				warn "can't find $file\n" ;
				$outFASTA .= "_incomplete" ;	###MJL###
				print LOG "$outFASTA	$file	not found!\n" ;
			}
		}
		open (INfasta, ">$inFASTA") || die ;
		print INfasta $concatSeq ;
		close INfasta ;
		# align this concatenated FASTA file:
		print "\n\n\n>>>>>>>>> aligning $outFASTA: <<<<<<<<<\n" ; 
		`$executable < $inFASTA > $outFASTA` ;
	}

	$t1 = time() ;
	printf "\naligned $outFASTA in %dsec\n\n", $t1 - $t0 ;
	$t0 = $t1 ;
}
close IN ;
close LOG ;

#
#======= CUT HERE AND SAVE AS SEPARATE PACKAGE FILE "MJL_dataUtils.pm" ============================
#
###################################################################################################
#                                                                                                 #
# Package with data utilities (using BioPerl)                                                     #
#                                                                                                 #
###################################################################################################

package MJL_dataUtils ;
require Exporter ;
our @ISA = ("Exporter") ;

our @EXPORT = qw(longestORF parse_displayID readFasta getFastaList) ;

use strict ;

use Bio::Seq ;
use Bio::SeqIO ;
use File::Basename ;

#:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
=head1 NAME
longestORF

=head1 SYNOPSIS
get longest open reading frame for peptide containing STOP codons
Returns		longestPeptide(string), start(int)
Arguments	peptide containing STOP codons (string)

=cut
#.............................................................................

sub longestORF ($) {
	my $peptide = shift ;

	my @pepts = split (/\*/, $peptide) ;
	my $longestPept = '' ;
	foreach my $pept (@pepts) {
		$pept =~ s/^X+|X+$//g ; # remove leading / trailing unknown aa		
		if (length($pept) > length($longestPept)) {
			$longestPept = $pept ;
		}
	}
	my $start = index ($peptide, $longestPept) + 1 ;
	return ($longestPept, $start) ;
}


#:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
=head1 NAME
parse_displayID

=head1 SYNOPSIS
parse sequence ID from fasta headers
Returns		id
Arguments	seq object (with display_id in Fasta format)

=cut
#.............................................................................

sub parse_displayID ($) {
	return ($_[0]->display_id =~ /^((?:gi\|)?[^\|,]+)/) ? $1 : $_[0]->display_id ;
}

#:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
=head1 NAME
readFasta

=head1 SYNOPSIS
read Fasta file and return a list of all sequences (without BioPerl)
Returns		array of pointers to (name, sequence, description) for all sequences
Arguments	Fasta file name
			[verbose=0]

=cut
#.............................................................................

sub readFasta {
	my $file = shift ;
	my $verbose = shift ;

	unless (open (IN, $file)) {
		warn "couldn't open Fasta file $file for reading" ;
		return () ;
	}
	
	my $header = '' ;
	while ($header = <IN>) {
		last if $header =~ /^>/ ;
	}

	my $seq = '' ;
	my @sequences = () ;
	my $counter = 0 ;
	while (1) {
		my $line = <IN> ;
		unless (($line =~ /^>/) || !$line) {
			$seq .= $line ;
			next ;
		}
		if (($line =~ /^>/) || eof IN) {
			my ($name, $desc) = ($header =~ /^>(\S*)\s+(.*)/) ;
			$seq =~ s/\s//g ;
			my @sequence = ($name, $seq, $desc) ;
			push @sequences, \@sequence ;
			$verbose && !($counter++ % 100) && printf STDERR "\r%6d seqs. read", $counter ;

			last if eof IN ;
			$header = $line ;
			$seq = '' ;
		}
	}
	close IN ;
	$verbose && printf (STDERR "\r%6d seqs. read", $counter) ;

	return @sequences ;
}

#:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
=head1 NAME
getFastaList

=head1 SYNOPSIS
read Fasta file and write those sequences specified in a list to a second file
- this is much faster than extracting sequences from a BLAST db.
Returns		number of read sequences
Arguments	hash of arguments:
		inFile		=> Fasta input file name, 
		outFile		=> Fasta output file name, 
		accessions 	=> pointer to list of accessions,
		ignoreVersion 	=> 0,1 [=0] ignore version tail (.3 in NM_005203.3)
		parseID		=> [=0] set to regexp string to parse ID (e.g., '[^\d]*(\d+)')

=cut
#.............................................................................

sub getFastaList {
	my %args = (
		outFile => 'getFasta.fa' ,
		ignoreVersion => 0 ,
		parseID => 0 ,
		verbose => 0 ,
		@_ 
	) ;
	my $ignoreVersion = $args{ignoreVersion} ;
	my $parseID = $args{parseID} ;
	my $verbose = $args{verbose} ;

	my %get = () ;
	foreach my $acc (@{$args{accessions}}) {
		$get{$acc} = 1 ;
	}
	my $total = my $toDo = @{$args{accessions}} ;
	$verbose && printf ("searching %d sequences:\n", $toDo) ;

	open (IN, $args{inFile}) || die "couldn't open $args{inFile}" ;
	open (OUT, ">$args{outFile}") || die "couldn't open $args{outFile}" ;
	
	my $header = '' ;
	while ($header = <IN>) {
		last if $header =~ /^>/ ;
	}

	my $seq = '' ;
	$verbose && (my $t0 = time () - 1e-6) ;
	while (1) {
		my $line = <IN> ;
		unless (($line =~ /^>/) || eof IN) {
			$seq .= $line ;
		} else {
			my ($name, $desc) = ($header =~ /^>(\S*)\s+(.*)/) ;
			foreach my $acc (split /\|/, $name) {
				($acc =~ s/\..*$//) if $ignoreVersion ;
				(($acc) = ($acc =~ /$parseID/)) if $parseID ;
				if ($get{$acc}) {
					print OUT $header, $seq ;
					$toDo-- ;
					delete $get{$acc} ;
					$verbose && !($toDo % 100) && printf (STDERR "\r%6d seqs. to go - %5d seq./sec.      ", $toDo, ($total - $toDo) / (time () - $t0)) ;
					last ;
				}
			}
			last if (eof (IN) || !$toDo) ;
			$header = $line ;
			$seq = '' ;
		}
	}
	close IN ;
	close OUT ;

	# write accession we didn't find to log file:
	$verbose && print ("\nmissed $toDo sequences - see missedSeqs.log") ;
	open (OUT, ">missedSeqs.log") || die ;
	print OUT join ("\n", keys %get) ;
	close OUT ;

	return ($total - $toDo) ;
}