[Bioperl-l] 65% speedup for Bio::SeqIO::fasta::next_seq test

Tim Bunce Tim.Bunce@pobox.com
Mon, 30 Sep 2002 14:39:57 +0100


The appended patch over the latest repository version introduces
caching of valid alphabets into Bio::SeqIO::alphabet.  This saves
the overhead of using Bio::PrimarySeq::new to validate an alphabet.

The code path for a typical Bio::SeqIO::fasta::next_seq call is reduced from:

    Bio::SeqIO::fasta::next_seq
       Bio::Root::IO::_readline
          File::Temp::BEGIN
       Bio::SeqIO::alphabet
       Bio::SeqIO::sequence_factory
       Bio::Seq::SeqFastaSpeedFactory::create
          Bio::PrimarySeq::_guess_alphabet
             Bio::PrimarySeq::seq
                Bio::Root::RootI::_rearrange
             Bio::PrimarySeq::alphabet
       Bio::Seq::alphabet
          Bio::Seq::primary_seq
          Bio::PrimarySeq::alphabet
       Bio::SeqIO::alphabet
          Bio::PrimarySeq::new
             Bio::Root::Root::BEGIN
                Bio::Root::Root::verbose
             Bio::Root::RootI::_rearrange
             Bio::PrimarySeq::alphabet
          Bio::Root::RootI::DESTROY
             Bio::Root::Root::_cleanup_methods
                UNIVERSAL::isa
    Bio::Seq::DESTROY
    Bio::Root::RootI::DESTROY
       Bio::Root::Root::_cleanup_methods
          UNIVERSAL::isa

to just:
    
    Bio::SeqIO::fasta::next_seq
       Bio::Root::IO::_readline
          File::Temp::BEGIN
       Bio::SeqIO::alphabet
       Bio::SeqIO::sequence_factory
       Bio::Seq::SeqFastaSpeedFactory::create
          Bio::PrimarySeq::_guess_alphabet
             Bio::PrimarySeq::seq
                Bio::Root::RootI::_rearrange
             Bio::PrimarySeq::alphabet
       Bio::Seq::alphabet
          Bio::Seq::primary_seq
          Bio::PrimarySeq::alphabet
       Bio::SeqIO::alphabet
    Bio::Seq::DESTROY
    Bio::Root::RootI::DESTROY
       Bio::Root::Root::_cleanup_methods
          UNIVERSAL::isa

The patch also fine-tunes the verbose and alphabet methods.

Overall the sp_protein.lseg test data is now read 65% faster.

Tim.

p.s. The File::Temp::BEGIN shown by Devel::DProf is actually
Bio::Root::IO::_fh which has reused the CODE address originally
used by File::Temp::BEGIN before it was discarded.


diff -r -du bioperl/core/Bio/Root/Root.pm bioperl-core-timbo/Bio/Root/Root.pm
--- bioperl/core/Bio/Root/Root.pm	Mon Sep 30 11:26:36 2002
+++ bioperl-core-timbo/Bio/Root/Root.pm	Mon Sep 30 13:26:34 2002
@@ -227,13 +227,13 @@
 sub verbose {
    my ($self,$value) = @_;
    # allow one to set global verbosity flag
-   if( $DEBUG ) { return $DEBUG }
+   return $DEBUG  if $DEBUG;
+   return $VERBOSITY unless ref $self;
    
-   if(ref($self) && (defined $value || ! defined $self->{'_root_verbose'}) ) {
-       $value = 0 unless defined $value;
-       $self->{'_root_verbose'} = $value;
-   }
-   return (ref($self) ? $self->{'_root_verbose'} : $VERBOSITY);
+    if (defined $value || ! defined $self->{'_root_verbose'}) {
+       $self->{'_root_verbose'} = $value || 0;
+    }
+    return $self->{'_root_verbose'};
 }
 
 sub _register_for_cleanup {
diff -r -du bioperl/core/Bio/Seq.pm bioperl-core-timbo/Bio/Seq.pm
--- bioperl/core/Bio/Seq.pm	Mon Sep 30 11:25:24 2002
+++ bioperl-core-timbo/Bio/Seq.pm	Mon Sep 30 13:56:01 2002
@@ -790,10 +790,8 @@
 =cut
 
 sub alphabet {
-   my ($self,$value) = @_;
-   if( defined $value ) {
-       return $self->primary_seq->alphabet($value);
-   }
+   my $self = shift;
+   return $self->primary_seq->alphabet(shift) if @_ && defined $_[0];
    return $self->primary_seq->alphabet();
 }
 
diff -r -du bioperl/core/Bio/SeqIO.pm bioperl-core-timbo/Bio/SeqIO.pm
--- bioperl/core/Bio/SeqIO.pm	Mon Aug 12 09:40:04 2002
+++ bioperl-core-timbo/Bio/SeqIO.pm	Mon Sep 30 14:05:07 2002
@@ -321,6 +321,8 @@
     eval { require Bio::SeqIO::staden::read; };
 }
 
+my %valid_alphabet_cache;
+
 =head2 new
 
  Title   : new
@@ -483,16 +485,21 @@
    my ($self, $value) = @_;
 
    if ( defined $value) {
-       # instead of hard-coding the allowed values once more, we check by
-       # creating a dummy sequence object
-       eval {
-	   require Bio::PrimarySeq;
-	   my $seq = Bio::PrimarySeq->new('-alphabet' => $value);
-       };
-       if($@) {
-	   $self->throw("Invalid alphabet: $value\n. See Bio::PrimarySeq for allowed values.");
+       $value = lc $value;
+       unless ($valid_alphabet_cache{$value}) {
+	   # instead of hard-coding the allowed values once more, we check by
+	   # creating a dummy sequence object
+	   eval {
+	       require Bio::PrimarySeq;
+	       my $seq = Bio::PrimarySeq->new('-alphabet' => $value);
+		
+	   };
+	   if ($@) {
+	       $self->throw("Invalid alphabet: $value\n. See Bio::PrimarySeq for allowed values.");
+	   }
+	   $valid_alphabet_cache{$value} = 1;
        }
-       $self->{'alphabet'} = "\L$value";
+       $self->{'alphabet'} = $value;
    }
    return $self->{'alphabet'};
 }