# # This text file contains concatenated Perl scripts, as used for data compilation and analysis of # the paper 'Phylogenomic analysis reveals bees and wasps (Hymenoptera) at the base of the radiation # of Holometabolous insects' by Joel Savard et al.. # # To run them, you first need to cut them were indicated, save them as independent files, and then # run them under Perl with the appropriate command line arguments. # # The relevant extract from the 'MJL_dataUtils' package is at the end of this file; please save this # in the same directory as any script that needs it (mostly for convenient input of FASTA format # files). # # For questions, please contact the last author at M.J.Lercher@bath.ac.uk. # # #======= CUT HERE AND SAVE AS SEPARATE SCRIPT FILE ================================================= # #################################################################################################### # # # Extracts non-redundant sequences from a fasta file # # # #################################################################################################### #!/usr/bin/perl use strict ; use MJL_dataUtils ; (@ARGV == 1) || die "usage: $0 fastaFile\n" ; my $inFile = shift ; my $outFile = $inFile ; $outFile =~ s/\.\w+$// ; $outFile .= 'NR0.fa' ; my @seqs = readFasta ($inFile) ; my $n = scalar @seqs ; # restrict to non-redundant sequences: my @seqsNR = nonRedundantSeqS (@seqs) ; my $nNR = scalar @seqsNR ; print "compacting $n sequences to $nNR non-redundant sequences...\n" ; # write to file: open (OUT, ">$outFile") || die ; foreach my $seq (@seqsNR) { print OUT ">$seq->[0] $seq->[2]\n$seq->[1]\n\n" ; } close OUT ; #::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: =head1 NAME nonRedundantSeqS =SYNOPSIS remove (partial) duplicated sequences from set of sequences redundant means one is contained in the other Argument: array of sequence pointers (name, seq, desc) Returns: array of non-redundant sequence pointers (name, seq, desc) =cut #............................................................................. sub nonRedundantSeqS (@) { my @seqs = @_ ; my $counter = my $percentage = 0 ; for (my $i1=0; $i1<@seqs-1; $i1++) { my $seq1 = $seqs[$i1][1] ; for (my $i2=$i1+1; $i2<@seqs; $i2++) { my $seq2 = $seqs[$i2][1] ; if (index ($seq1, $seq2) >= 0) { $seqs[$i2][2] = 'redundant' ; } elsif (index ($seq2, $seq1) >= 0) { $seqs[$i1][2] = 'redundant' ; } } if (int($counter++ / scalar (@seqs) * 100) > $percentage) { printf STDERR "\r%3d \%", $percentage ; $percentage = int($counter++ / scalar (@seqs) * 100) ; } } print STDERR "\n" ; # remove redundant proteins and write non-redundant set to file: return grep ($_->[2] ne 'redundant', @seqs) ; } # #======= CUT HERE AND SAVE AS SEPARATE SCRIPT FILE ================================================= # ################################################################################################### # # # Convert cDNA/ESTs to AA by blast against Drosophila proteins # # # ################################################################################################### #!/usr/bin/perl use strict ; # file & species names: my @newSpeciesS = qw/Hc / ; my @newSpeciesName = qw/H_coagulata / ; my @newSpeciesFasta = qw/H_coagulata.fasta / ; # parameters specific to the project / machine: my $fileBase = 'Lm2006' ; # blast programs and database for Drosophila: my $blastxExec = '/usr/bin/blastall -a 4 -p blastx -e 1 -v 1 -b 1 -I T' ; my $blastpExec = '/usr/bin/blastall -a 4 -p blastp -e 1 -v 1 -b 1 -I T' ; # my $blastxExec = '\biotools\blast\blastall -a 4 -p blastx -e 1 -v 1 -b 1 -I T' ; # my $blastpExec = '\biotools\blast\blastall -a 4 -p blastp -e 1 -v 1 -b 1 -I T' ; my $Dm = 'D_melanogasterNR' ; ################################################################################################### for (my $i=0; $i<@newSpeciesName; $i++) { # file names: my $newSpeciesAA = $newSpeciesName[$i] . '.fa' ; my $blastNewspeciesDm = $newSpeciesS[$i] . 'Dm.bls' ; my $blastNewspeciesDmS = $newSpeciesS[$i] . 'DmBLAST.tab' ; # blast Newspecies ESTs against Dm: print "\nblasting $newSpeciesName[$i] ESTs against Dm...\n" ; `$blastxExec -i $newSpeciesFasta[$i] -o $blastNewspeciesDm -d $Dm` unless (-f $blastNewspeciesDm) ; processBLAST ($blastNewspeciesDm, $blastNewspeciesDmS) unless (-f $blastNewspeciesDmS) ; #find longest open reading frame in the Dm frame: print "finding longest ORF...\n" ; translateEST ($blastNewspeciesDmS, $newSpeciesFasta[$i], $newSpeciesAA) unless (-f $newSpeciesAA) ; } print "\nPlease call nonRedundant.pl and then Trib_BLAST11.pl\n" ; ################################################################################################### sub readColumn { my $c = $_[1] ; open (IN, $_[0]) || die "couldn't open $_[0]" ; ; my @o = () ; while () { chomp ; my @l = split ("\t", $_) ; push @o, $l[$c] ; } close IN ; return @o ; } ################################################################################################### sub translateEST { my ($blastS, $est, $estAA) = @_ ; # read frames from Dm blast: open (IN, $blastS) || die ; ; my %frame = () ; while () { my @l = split ("\t", $_) ; $frame{$l[0]} = $l[1] ; # frame in BLAST := first nucleotide of codon (starting from 1) } close IN ; open (OUTtranslate, ">$estAA") || die ; # read all EST sequences and translate each in its frame, finding longest ORF: open (IN, $est) || die ; my $seqName = my $seq = '' ; while () { if (/^>/) { # process previous sequence: my $frame = $frame{$seqName} ; if ($seqName && $frame && ($frame ne '-')) { my $protein = longestORF ($seq, $frame) ; print OUTtranslate ">$seqName frame$frame{$seqName}\n$protein\n" ; } # get new sequence name: ($seqName) = /^>(?:gi\|)?([^\s\|]+)/ ; ##### THIS LINE CAUSED PROBLEMS WITH RAW DATA BEFORE $seq = '' ; } else { $seq .= $_ ; } } my $frame = $frame{$seqName} ; if ($seqName && $frame && ($frame ne '-')) { my $protein = longestORF ($seq, $frame) ; ##### if you want to modify the sequence name, do it here (although it won't match the blast .tab file anymore): print OUTtranslate ">$seqName frame$frame\n$protein\n" ; } close IN ; close translateOUT ; } sub longestORF { my ($seq, $frame) = @_ ; my ($prot) = translate ($seq, $frame) ; my @aa = split ('', $prot) ; my $lengthLongest = 0 ; my $longestPept = '' ; while (@aa) { my $pept = '' ; while (@aa && (($a = shift(@aa)) ne '*')) { $pept .= $a ; } $pept =~ s/^X+|X+$//g ; # remove leading / trailing unknown aa if (length($pept) > $lengthLongest) { $lengthLongest = length($pept) ; $longestPept = $pept ; } } return ($longestPept) ; } #::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: =head1 NAME getFastaList =head1 SYNOPSIS read Fasta file and write those sequences specified in a list to a second file - this is much faster than extracting sequences from a BLAST db. Returns number of read sequences Arguments hash of arguments: inFile => Fasta input file name, outFile => Fasta output file name, accessions => pointer to list of accessions, ignoreVersion => 0,1 [=0] ignore version tail (.3 in NM_005203.3) parseID => [=0] set to regexp string to parse ID (e.g., '[^\d]*(\d+)') =cut #............................................................................. sub getFastaList { my %args = ( outFile => 'getFasta.fa' , ignoreVersion => 0 , parseID => 0 , verbose => 0 , @_ ) ; my $ignoreVersion = $args{ignoreVersion} ; my $parseID = $args{parseID} ; my $verbose = $args{verbose} ; my %get = () ; foreach my $acc (@{$args{accessions}}) { $get{$acc} = 1 ; } my $total = my $toDo = @{$args{accessions}} ; $verbose && printf ("searching %d sequences:\n", $toDo) ; open (IN, $args{inFile}) || die "couldn't open $args{inFile}" ; open (OUT, ">$args{outFile}") || die "couldn't open $args{outFile}" ; my $header = '' ; while ($header = ) { last if $header =~ /^>/ ; } my $seq = '' ; $verbose && (my $t0 = time () - 1e-6) ; while (1) { my $line = ; unless (($line =~ /^>/) || eof IN) { $seq .= $line ; } else { my ($name, $desc) = ($header =~ /^>(\S*)\s+(.*)/) ; foreach my $acc (split /\|/, $name) { ($acc =~ s/\..*$//) if $ignoreVersion ; (($acc) = ($acc =~ /$parseID/)) if $parseID ; if ($get{$acc}) { print OUT $header, $seq ; $toDo-- ; delete $get{$acc} ; $verbose && !($toDo % 100) && printf (STDERR "\r%6d seqs. to go - %5d seq./sec. ", $toDo, ($total - $toDo) / (time () - $t0)) ; last ; } } last if (eof (IN) || !$toDo) ; $header = $line ; $seq = '' ; } } close IN ; close OUT ; # write accession we didn't find to log file: $verbose && print ("\nmissed $toDo sequences - see missedSeqs.log") ; open (OUT, ">missedSeqs.log") || die ; print OUT join ("\n", keys %get) ; close OUT ; return ($total - $toDo) ; } sub translate { my ($seq, $frame) = @_ ; ################################################################################################### # set up the translation table: my %translation = ( TTT => 'F' , TCT => 'S' , TAT => 'Y' , TGT => 'C' , TTC => 'F' , TCC => 'S' , TAC => 'Y' , TGC => 'C' , TTA => 'L' , TCA => 'S' , TAA => '*' , TGA => '*' , TTG => 'L' , TCG => 'S' , TAG => '*' , TGG => 'W' , CTT => 'L' , CCT => 'P' , CAT => 'H' , CGT => 'R' , CTC => 'L' , CCC => 'P' , CAC => 'H' , CGC => 'R' , CTA => 'L' , CCA => 'P' , CAA => 'Q' , CGA => 'R' , CTG => 'L' , CCG => 'P' , CAG => 'Q' , CGG => 'R' , ATT => 'I' , ACT => 'T' , AAT => 'N' , AGT => 'S' , ATC => 'I' , ACC => 'T' , AAC => 'N' , AGC => 'S' , ATA => 'I' , ACA => 'T' , AAA => 'K' , AGA => 'R' , ATG => 'M' , ACG => 'T' , AAG => 'K' , AGG => 'R' , GTT => 'V' , GCT => 'A' , GAT => 'D' , GGT => 'G' , GTC => 'V' , GCC => 'A' , GAC => 'D' , GGC => 'G' , GTA => 'V' , GCA => 'A' , GAA => 'E' , GGA => 'G' , GTG => 'V' , GCG => 'A' , GAG => 'E' , GGG => 'G' ) ; ################################################################################################### $seq =~ s/\s//g ; # convert CDS to upper case and replace U with T: $seq = "\U$seq" ; $seq =~ tr/U/T/ ; # use complementary sequence if frame<0: if ($frame < 0) { $seq = reverse ($seq) ; $seq =~ tr/ATGC/TACG/ ; $frame = -$frame ; } # set frame: $seq = substr ($seq, $frame - 1) ; # translate: my $peptide = '' ; while (my $codon = substr($seq, 0, 3, "")) { $peptide .= $translation{$codon} || 'X' ; } return $peptide ; } ################################################################################################### # # # Extract info from blast output files # # # ################################################################################################### sub processBLAST { my $inFile = shift ; my $outFile = shift ; open (IN, $inFile) || die "couldn't open $inFile" ; open (OUT, ">$outFile") || die "couldn't open $outFile" ; print OUT "query frame hit hitFull Expect identities total startQuery endQuery\n" ; my $n = my $noHit = 0 ; while () { last if /^Query=/ ; } my $x = $_ ; while () { $x .= $_ ; next unless /^Query=/ ; $n++ ; print OUT process($x, $noHit) ; $x = $_ ; } close IN ; $n++ ; print OUT process($x, $noHit) ; close OUT ; return ($n, $noHit) ; } ################################################################################################### sub process { my $x = shift ; my $noHit = shift ; # extract info for this query: (my $query) = ($x =~ /^Query=\s*(?:gi\|)?([^\s\|]+)/) ; (my $hit) = ($x =~ /\n>([^\n]+)\n/) ; (my $expect) = ($x =~ /\n Score =\s*[\d\.]+ bits.*Expect(?:\(2\))? =\s*([\d\.e\+\-]+)\s/) ; my ($identities, $total) = ($x =~ /\n Identities =\s*(\d+)\/(\d+) /) ; (my $frame) = ($x =~ /\n Frame =\s*([\+\-\d]+)\s*\n/) ; (my $start) = ($x =~ /(?:Frame =\s*[\+\-\d]+\s*|\)\n)\nQuery: (\d+) /) ; (my $end) = ($x =~ /\nQuery:\s+\d+\s+\S+\s+(\d+)\s[^:]*:[^:]*(Subset of the database|Score)/s) ; my $acc = '' ; if ($hit) { ($acc) = ($hit =~ /^(?:gi\|)?([\w\-\.]+)/) ; $acc ||= '-' ; ($expect = 1 . $expect) if ($expect && $expect =~ /^e/) ; } else { $noHit++ ; $acc = $hit = '-' ; } $frame ||= '-' ; $expect ||= '-' ; $total ||= '-' ; $start ||= '-' ; $end ||= '-' ; $identities ||= '-' ; return "$query $frame $acc $hit $expect $identities $total $start $end\n" ; } # #======= CUT HERE AND SAVE AS SEPARATE SCRIPT FILE ================================================ # ################################################################################################### # # # Does the blast against smallest dataset to select subsets # # # ################################################################################################### #!/usr/bin/perl use strict ; my $fileBase = 'Lm2006' ; # could be also Nassonia... (should be smallest data set) my $LocustaAA = "L_migratoriaNR.fa" ; # amino acids from blast against fly my @species = qw/Dm Ag Bm Tc Am Am2 Nv Ng Ap Hc Da Bi Ra/ ; my @speciesL = ('D_melanogasterNR', 'A_gambiaeNR', 'B_moriNR', 'T_castaneumNR', 'A_melliferaNR', 'A_mellifera2NR', 'N_vitripennisNR', 'N_giraultiNR', 'A_pisumNR', 'H_coagulataNR', 'D_magnaNR', 'B_microplusNR', 'R_appendiculatusNR') ; ################################################################################################### my $blastpExec = '/usr/bin/blastall -p blastp -a 4 -e 1 -v 1 -b 1 -I T' ; ################################################################################################### # blast Locusta EST AAs ($LocustaAA.fa) against other species: for (my $i=0; $i<@species; $i++) { my $s = $species[$i] ; # e.g., Dm my $sL = $speciesL[$i] ; # e.g., D_melanogasterNR my $fileName = "$fileBase${s}Acc" ; # we'll write those that match Locusta in here next if (-f "$fileName.fa") ; # skip if we've already got everything print "\nblasting Lm AAs against $sL...\n" ; my $blastFile = "${fileBase}Lm$s.bls" ; # raw blast output file my $blastFileS = "${fileBase}Lm$s.tab" ; # processed blast output table with main info `$blastpExec -i $LocustaAA -o $blastFile -d $sL` unless (-f $blastFile) ; print `processBLAST.pl $blastFile $blastFileS` unless (-f $blastFileS) ; # write those sequences that matched Locusta to accession & fasta file: my @acc = readColumn($blastFileS, 2) ; open (OUT, ">$fileName.txt") || die ; print OUT join("\n", @acc) ; close OUT ; getFastaList (inFile => "$sL.fa", outFile => "$fileName.fa", accessions => \@acc) ; } print "\nPlease call Lm_reciprocalBLAST12.pl\n" ; ################################################################################################### sub readColumn { my $c = $_[1] ; open (IN, $_[0]) || die "couldn't open $_[0]" ; ; my @o = () ; while () { chomp ; my @l = split ("\t", $_) ; push @o, $l[$c] ; } close IN ; return @o ; } ################################################################################################### sub translateLm { my ($blastLmDmS, $Locusta, $LocustaAA) = @_ ; # read frames from Dm blast: open (IN, $blastLmDmS) || die ; ; my %frame = () ; while () { my @l = split ("\t", $_) ; $frame{$l[0]} = $l[1] ; # frame in BLAST := first nucleotide of codon (starting from 1) } close IN ; open (OUT, ">$LocustaAA") || die ; # read all Locusta sequences and translate each in its frame, finding longest ORF: open (IN, $Locusta) || die ; my $seqName = my $seq = '' ; while () { if (/^>/) { # process previous sequence: if ($seqName && ($frame{$seqName} ne '-')) { my $protein = longestORF ($seq, $frame{$seqName}) ; print OUT ">$seqName\n$protein\n" ; } # get new sequence name: ($seqName) = /^>([\w\.]+)/ ; $seq = '' ; } else { $seq .= $_ ; } } if ($seqName && ($frame{$seqName} ne '-')) { my $protein = longestORF ($seq, $frame{$seqName}) ; print OUT ">$seqName\n$protein\n" ; } close IN ; close OUT ; } sub longestORF { my ($seq, $frame) = @_ ; my ($prot) = translate ($seq, $frame) ; my @aa = split ('', $prot) ; my $lengthLongest = 0 ; my $longestPept = '' ; while (@aa) { my $pept = '' ; while (@aa && (($a = shift(@aa)) ne '*')) { $pept .= $a ; } $pept =~ s/^X+|X+$//g ; # remove leading / trailing unknown aa if (length($pept) > $lengthLongest) { $lengthLongest = length($pept) ; $longestPept = $pept ; } } return ($longestPept) ; } sub translate { my ($seq, $frame) = @_ ; my $tmpFile = "tmp.fa" ; open (OUT2, ">$tmpFile") || die ; print OUT2 ">tmp\n$seq" ; close OUT2 ; my $file = `\\martin\\perl\\sequences\\translate.pl $tmpFile $frame 1` ; open (IN2, $file) || die ; ; my $protein = join ("", ) ; $protein =~ s/\s//g ; close IN2 ; return $protein ; } #::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: =head1 NAME getFastaList =head1 SYNOPSIS read Fasta file and write those sequences specified in a list to a second file - this is much faster than extracting sequences from a BLAST db. Returns number of read sequences Arguments hash of arguments: inFile => Fasta input file name, outFile => Fasta output file name, accessions => pointer to list of accessions, ignoreVersion => 0,1 [=0] ignore version tail (.3 in NM_005203.3) parseID => [=0] set to regexp string to parse ID (e.g., '[^\d]*(\d+)') =cut #............................................................................. sub getFastaList { my %args = ( outFile => 'getFasta.fa' , ignoreVersion => 0 , parseID => 0 , verbose => 0 , @_ ) ; my $ignoreVersion = $args{ignoreVersion} ; my $parseID = $args{parseID} ; my $verbose = $args{verbose} ; my %get = () ; foreach my $acc (@{$args{accessions}}) { $get{$acc} = 1 ; } my $total = my $toDo = @{$args{accessions}} ; $verbose && printf ("searching %d sequences:\n", $toDo) ; open (IN, $args{inFile}) || die "couldn't open $args{inFile}" ; open (OUT, ">$args{outFile}") || die "couldn't open $args{outFile}" ; my $header = '' ; while ($header = ) { last if $header =~ /^>/ ; } my $seq = '' ; $verbose && (my $t0 = time () - 1e-6) ; while (1) { my $line = ; unless (($line =~ /^>/) || eof IN) { $seq .= $line ; } else { my ($name, $desc) = ($header =~ /^>(\S*)\s+(.*)/) ; foreach my $acc (split /\|/, $name) { ($acc =~ s/\..*$//) if $ignoreVersion ; (($acc) = ($acc =~ /$parseID/)) if $parseID ; if ($get{$acc}) { print OUT $header, $seq ; $toDo-- ; delete $get{$acc} ; $verbose && !($toDo % 100) && printf (STDERR "\r%6d seqs. to go - %5d seq./sec. ", $toDo, ($total - $toDo) / (time () - $t0)) ; last ; } } last if (eof (IN) || !$toDo) ; $header = $line ; $seq = '' ; } } close IN ; close OUT ; # write accession we didn't find to log file: $verbose && print ("\nmissed $toDo sequences - see missedSeqs.log") ; open (OUT, ">missedSeqs.log") || die ; print OUT join ("\n", keys %get) ; close OUT ; return ($total - $toDo) ; } # #======= CUT HERE AND SAVE AS SEPARATE SCRIPT FILE ================================================ # ################################################################################################### # # # This program tests for full orthology by reciprocal blast # # # ################################################################################################### #!/usr/bin/perl use strict ; my $eCutoff = 1e-25 ; # only accept hits below this e-value cutoff - should be 1 for the first run!!!! my $base = 'Lm2006' ; my $execPath = '/usr/bin/blastall' ; my $outFile2 = $base . "Reciprocals$eCutoff.tab" ; my @species = ('Dm', 'Ag', 'Bm', 'Tc', 'Am', 'Am2', 'Nv', 'Ng', 'Ap', 'Hc', 'Da') ; # excluding Lm my %fullName = ( # these are the names of the BLAST databases 'Dm' => 'D_melanogasterNR' , 'Ag' => 'A_gambiaeNR' , 'Bm' => 'B_moriNR' , 'Tc' => 'T_castaneumNR' , 'Am' => 'A_melliferaNR' , 'Am2' => 'A_mellifera2NR' , 'Nv' => 'N_vitripennisNR' , 'Ng' => 'N_giraultiNR' , 'Lm' => 'L_migratoriaNR' , 'Ap' => 'A_pisumNR' , 'Hc' => 'H_coagulataNR' , 'Da' => 'D_magnaNR' ) ; my $blastExec = "$execPath -p blastp -a 4 -v 1 -b 1 -I T -e $eCutoff " ; my @species1 = @species ; # or: my @species1 = @species[0..2] ################################################################################################### # take Locusta hits in each species from files, and then blast everybody's sequences against everybody else: my %hits = () ; foreach my $s1 (@species1) { # make lookup table query->hit for Locusta: %{$hits{Lm}{$s1}} = readBlast("${base}Lm$s1.tab") ; # blast against all: my $query = "$base${s1}Acc.fa" ; foreach my $s2 ('Lm', @species) { next if ($s1 eq $s2) ; my $outFile = "$base$s1$s2.bls" ; my $outFileS = "$base$s1$s2.tab" ; print "blasting $fullName{$s1} -> $fullName{$s2} ...\n" ; `$blastExec -i $query -o $outFile -d $fullName{$s2}` unless ((-f $outFile) || (-f $outFileS)) ; #skip if done before... processBLAST ($outFile, $outFileS) unless (-f $outFileS) ; # make lookup table query->hit (check for reciprocal best hits with this later): %{$hits{$s1}{$s2}} = readBlast($outFileS) ; } } ################################################################################################### # check for reciprocal best hits (all pairwise comparisons): open (OUT, ">$outFile2") || die ; print OUT "L_migratoria" ; push @species, 'Lm' ; foreach my $s (@species) { print OUT "\t$fullName{$s}" ; } foreach my $s1 (0..$#species) { foreach my $s2 ($s1+1 .. $#species) { print OUT "\t$species[$s1]$species[$s2]" ; } } print OUT "\n" ; foreach my $t (keys %{$hits{Lm}{Dm}}) { next unless $hits{Lm}{Dm}{$t} ; $hits{Lm}{Lm}{$t} = $t ; # to find back the Locusta gene itself print OUT $t ; foreach my $i1 (0..$#species) { my $h1 = $hits{Lm}{$species[$i1]}{$t} || 0 ; print OUT "\t$h1" ; } # check for pairwise reciprocity: foreach my $i1 (0..$#species) { my $s1 = $species[$i1] ; my $h1 = $hits{Lm}{$s1}{$t} ; foreach my $i2 ($i1+1 .. $#species) { my $s2 = $species[$i2] ; my $h2 = $hits{Lm}{$s2}{$t} ; my $s1_s2 = $h1 && $hits{$s1}{$s2}{$h1} ; my $s2_s1 = $h2 && $hits{$s2}{$s1}{$h2} ; my $rec = ($h1 && $h2 && ($s1_s2 eq $h2) && ($s2_s1 eq $h1)) || 0 ; print OUT "\t$rec" ; } } print OUT "\n" ; } close OUT ; ################################################################################################### sub readBlast { my $file = $_[0] ; my %hash ; $hash{'-'} = 0 ; open (IN, $file) || die "couldn't open $file" ; ; while () { chomp ; my @l = split ("\t", $_) ; next if ($eCutoff && ($l[4] > $eCutoff)) ; ## new line to exclude distant hits $hash{$l[0]} = ($l[2] ne '-') ? $l[2] : 0 ; } close IN ; return %hash ; } ################################################################################################### # # # Extract info from blast output files # # # ################################################################################################### sub processBLAST { my $inFile = shift ; my $outFile = shift ; open (IN, $inFile) || die "couldn't open $inFile" ; open (OUT, ">$outFile") || die "couldn't open $outFile" ; print OUT "query frame hit hitFull Expect identities total startQuery endQuery\n" ; my $n = my $noHit = 0 ; while () { last if /^Query=/ ; } my $x = $_ ; while () { $x .= $_ ; next unless /^Query=/ ; $n++ ; print OUT process($x, $noHit) ; $x = $_ ; } close IN ; $n++ ; print OUT process($x, $noHit) ; close OUT ; return ($n, $noHit) ; } ################################################################################################### sub process { my $x = shift ; my $noHit = shift ; # extract info for this query: (my $query) = ($x =~ /^Query=\s*(?:gi\|)?([^\s\|]+)/) ; (my $hit) = ($x =~ /\n>([^\n]+)\n/) ; (my $expect) = ($x =~ /\n Score =\s*[\d\.]+ bits.*Expect(?:\(2\))? =\s*([\d\.e\+\-]+)\s/) ; my ($identities, $total) = ($x =~ /\n Identities =\s*(\d+)\/(\d+) /) ; (my $frame) = ($x =~ /\n Frame =\s*([\+\-\d]+)\s*\n/) ; (my $start) = ($x =~ /(?:Frame =\s*[\+\-\d]+\s*|\)\n)\nQuery: (\d+) /) ; (my $end) = ($x =~ /\nQuery:\s+\d+\s+\S+\s+(\d+)\s[^:]*:[^:]*(Subset of the database|Score)/s) ; my $acc = '' ; if ($hit) { ($acc) = ($hit =~ /^(?:gi\|)?([\w\-\.]+)/) ; $acc ||= '-' ; ($expect = 1 . $expect) if ($expect && $expect =~ /^e/) ; } else { $noHit++ ; $acc = $hit = '-' ; } $frame ||= '-' ; $expect ||= '-' ; $total ||= '-' ; $start ||= '-' ; $end ||= '-' ; $identities ||= '-' ; return "$query $frame $acc $hit $expect $identities $total $start $end\n" ; } # #======= CUT HERE AND SAVE AS SEPARATE SCRIPT FILE ================================================ # ################################################################################################### # # # Align amino acid sequences using MUSCLE with default settings # # # ################################################################################################### #!/usr/bin/perl use strict ; use MJL_dataUtils ; my $seqDir = '/home/jsavard/Phylo2006/aa_families/fasta' ; my $ext = 'fa' ; my $executable = '/home/jsavard/phylogeny/muscle_V3.51/muscle' ; my $speciesNames = 1 ; # set to 1 to replace sequence IDs by species names (else 0) my $shortNames = 1 ; # set to 1 to use only first sequence ID as file name (else 0) $ARGV[0] || die "usage: alignMUSCLE.pl inFile [-seqDir $seqDir] inFile contains sequence names to be aligned\n" ; my $inFile = shift @ARGV ; while (my $arg = shift @ARGV) { if ($arg =~ /^-s/) { $seqDir = shift @ARGV ; } } my $fileBase = $inFile ; $fileBase =~ s/\.\w+$// ; my $alignDir = $fileBase ; mkdir ($alignDir) ; my $logFile = $fileBase . '_MUSCLE.log' ; open (LOG, ">>$logFile") || die ; # $fileBase =~ s/.*\\// ; # Windows: remove leading directories from $filebase (derived from $inFile) $fileBase =~ s/.*\/// ; # Linux: remove leading directories from $filebase (derived from $inFile) ##MJL## open (IN, "$inFile") || die ; chomp (my $header = ) ; # read header and remove \n my @species = split ("\t", $header) ; # get species from header (= names for directories) my $t0 = my $t1 = time() ; # set time to 0 my $inFASTA = 'tmp_fasta.tmp' ; while () { # get gene identifiers from the list file: chomp ; next unless $_ ; # skip emtpy lines my @gene = split ("\t", $_) ; # get gene names my $outFASTA = $shortNames ? "${fileBase}_$gene[0].fa" : join ("_", @gene) . '.fa' ; # output family file unless (-f $outFASTA) { # skip if we've done this earlier: # concatenate the FASTA sequences for all genes into one file: unlink $inFASTA ; # delete file - start with a clean slate $outFASTA =~ s/\|//g ; # remove any '|' in file name $outFASTA = "$alignDir/$outFASTA" ; ###MJL### there was a \\ here for windows $outFASTA =~ s/\s+//g ; ###MJL### should really be unnecessary: remove \n and other blanks from file name my $concatSeq = "" ; foreach my $i (0..$#species) { my $file = "$seqDir/$species[$i]/$gene[$i].$ext" ; $file =~ s/\|//g ; if (my @seq = readFasta($file)) { my $name = $speciesNames ? $species[$i] : $seq[0][0] ; $seq[0][1] =~ s/\*/X/g ; # MUSCLE doesn't like * $concatSeq .= ">$name\n$seq[0][1]\n\n" ; } else { warn "can't find $file\n" ; $outFASTA .= "_incomplete" ; ###MJL### print LOG "$outFASTA $file not found!\n" ; } } open (INfasta, ">$inFASTA") || die ; print INfasta $concatSeq ; close INfasta ; # align this concatenated FASTA file: print "\n\n\n>>>>>>>>> aligning $outFASTA: <<<<<<<<<\n" ; `$executable < $inFASTA > $outFASTA` ; } $t1 = time() ; printf "\naligned $outFASTA in %dsec\n\n", $t1 - $t0 ; $t0 = $t1 ; } close IN ; close LOG ; # #======= CUT HERE AND SAVE AS SEPARATE PACKAGE FILE "MJL_dataUtils.pm" ============================ # ################################################################################################### # # # Package with data utilities (using BioPerl) # # # ################################################################################################### package MJL_dataUtils ; require Exporter ; our @ISA = ("Exporter") ; our @EXPORT = qw(longestORF parse_displayID readFasta getFastaList) ; use strict ; use Bio::Seq ; use Bio::SeqIO ; use File::Basename ; #::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: =head1 NAME longestORF =head1 SYNOPSIS get longest open reading frame for peptide containing STOP codons Returns longestPeptide(string), start(int) Arguments peptide containing STOP codons (string) =cut #............................................................................. sub longestORF ($) { my $peptide = shift ; my @pepts = split (/\*/, $peptide) ; my $longestPept = '' ; foreach my $pept (@pepts) { $pept =~ s/^X+|X+$//g ; # remove leading / trailing unknown aa if (length($pept) > length($longestPept)) { $longestPept = $pept ; } } my $start = index ($peptide, $longestPept) + 1 ; return ($longestPept, $start) ; } #::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: =head1 NAME parse_displayID =head1 SYNOPSIS parse sequence ID from fasta headers Returns id Arguments seq object (with display_id in Fasta format) =cut #............................................................................. sub parse_displayID ($) { return ($_[0]->display_id =~ /^((?:gi\|)?[^\|,]+)/) ? $1 : $_[0]->display_id ; } #::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: =head1 NAME readFasta =head1 SYNOPSIS read Fasta file and return a list of all sequences (without BioPerl) Returns array of pointers to (name, sequence, description) for all sequences Arguments Fasta file name [verbose=0] =cut #............................................................................. sub readFasta { my $file = shift ; my $verbose = shift ; unless (open (IN, $file)) { warn "couldn't open Fasta file $file for reading" ; return () ; } my $header = '' ; while ($header = ) { last if $header =~ /^>/ ; } my $seq = '' ; my @sequences = () ; my $counter = 0 ; while (1) { my $line = ; unless (($line =~ /^>/) || !$line) { $seq .= $line ; next ; } if (($line =~ /^>/) || eof IN) { my ($name, $desc) = ($header =~ /^>(\S*)\s+(.*)/) ; $seq =~ s/\s//g ; my @sequence = ($name, $seq, $desc) ; push @sequences, \@sequence ; $verbose && !($counter++ % 100) && printf STDERR "\r%6d seqs. read", $counter ; last if eof IN ; $header = $line ; $seq = '' ; } } close IN ; $verbose && printf (STDERR "\r%6d seqs. read", $counter) ; return @sequences ; } #::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: =head1 NAME getFastaList =head1 SYNOPSIS read Fasta file and write those sequences specified in a list to a second file - this is much faster than extracting sequences from a BLAST db. Returns number of read sequences Arguments hash of arguments: inFile => Fasta input file name, outFile => Fasta output file name, accessions => pointer to list of accessions, ignoreVersion => 0,1 [=0] ignore version tail (.3 in NM_005203.3) parseID => [=0] set to regexp string to parse ID (e.g., '[^\d]*(\d+)') =cut #............................................................................. sub getFastaList { my %args = ( outFile => 'getFasta.fa' , ignoreVersion => 0 , parseID => 0 , verbose => 0 , @_ ) ; my $ignoreVersion = $args{ignoreVersion} ; my $parseID = $args{parseID} ; my $verbose = $args{verbose} ; my %get = () ; foreach my $acc (@{$args{accessions}}) { $get{$acc} = 1 ; } my $total = my $toDo = @{$args{accessions}} ; $verbose && printf ("searching %d sequences:\n", $toDo) ; open (IN, $args{inFile}) || die "couldn't open $args{inFile}" ; open (OUT, ">$args{outFile}") || die "couldn't open $args{outFile}" ; my $header = '' ; while ($header = ) { last if $header =~ /^>/ ; } my $seq = '' ; $verbose && (my $t0 = time () - 1e-6) ; while (1) { my $line = ; unless (($line =~ /^>/) || eof IN) { $seq .= $line ; } else { my ($name, $desc) = ($header =~ /^>(\S*)\s+(.*)/) ; foreach my $acc (split /\|/, $name) { ($acc =~ s/\..*$//) if $ignoreVersion ; (($acc) = ($acc =~ /$parseID/)) if $parseID ; if ($get{$acc}) { print OUT $header, $seq ; $toDo-- ; delete $get{$acc} ; $verbose && !($toDo % 100) && printf (STDERR "\r%6d seqs. to go - %5d seq./sec. ", $toDo, ($total - $toDo) / (time () - $t0)) ; last ; } } last if (eof (IN) || !$toDo) ; $header = $line ; $seq = '' ; } } close IN ; close OUT ; # write accession we didn't find to log file: $verbose && print ("\nmissed $toDo sequences - see missedSeqs.log") ; open (OUT, ">missedSeqs.log") || die ; print OUT join ("\n", keys %get) ; close OUT ; return ($total - $toDo) ; }