[Bioperl-l] Testing Bioperl objects for equality: attempt 2

Peter van Heusden pvh at egenetics.com
Mon May 3 06:49:56 EDT 2004


Ok, so recently I posted my diffs to the Bioperl source which added 
equals() and diff() methods to Bioperl object. Hilmar objected, citing 
two main objections:

1) The 'hash_key()' methods I added to various objects were 
inappopriately named.
2) Equality is not an objective measure, so a single 'equals' method 
didn't make sense.

To address (2), I've pulled the equals code out of the Bioperl objects, 
and created a new interface, ComparatorI, which can test a certain set 
of Bioperl objects for equality. And I've created a SampleComparator.pm 
module implementing my version of equality.

I've attached the ComparatorI.pm and SampleComparator.pm modules to this 
message, as well as a small patch to this morning's Bioperl which is 
necessary for them to work (it still includes the misnamed hash_key() 
stuff that will be fixed in the future).

I know the name Comparator is clumsy, but its just an interim name. And 
equals() should be renamed is_equal(). But I just wanted to get 
something out there for comment before going further. Here's some sample 
code:

$comp = Bio::SampleComparator();

if ($comp->equals($seq1, $seq2)) {
   print "they are equal\n";
}

$differences = $comp->diff($seq1, $seq2);

Obviously the SampleComparator.pm module would need to keep up-to-date 
with respect to the classes that it can compare, but I don't see any way 
around those kind of dependencies.

Peter
-------------- next part --------------
use strict;
package Bio::SampleComparator;

use Bio::ComparatorI;
use vars qw(@ISA);

@ISA = qw(Bio::ComparatorI);

sub set_supported_classes {
  my $self = shift;

  @{$self->{'supported_classes'}} = qw(
				    Bio::AnnotationI
				    Bio::LocationI
				    Bio::PrimarySeq
				    Bio::Seq
				    Bio::SeqFeatureI
				    Bio::Species
				  );

}

sub diff {
  my $self = shift;
  my $obj1 = shift;
  my $obj2 = shift;
  my $fastmatch = shift;

  return [] if $obj1 == $obj2;

  my @diffs = ();

  (ref($obj1) ne ref($obj2))  && push(@diffs, 'class') &&
    $fastmatch && return undef;

  my $diff_method = $self->can_compare($obj1);

  if (defined($diff_method)) {
    return $self->$diff_method($obj1, $obj2, $fastmatch);
  } else {
    $self->throw("trying to compare unsupported object type: " . ref($obj1));
  }
}


sub diff_BioAnnotationI {
  my $self = shift;
  my $obj1 = shift;
  my $obj2 = shift;
  my $fastmatch = shift;

  my @diffs = ();

  my $obj1_hash = $obj1->hash_tree();
  my $obj2_hash = $obj2->hash_tree();
  foreach my $key (keys(%{$obj1->hash_tree()})) {
    ($obj1_hash->{$key} ne $obj2_hash->{$key}) && push(@diffs, $key) &&
      $fastmatch && return \@diffs;
  }

  return \@diffs;
}


sub diff_BioLocationI {
  my $self = shift;
  my $obj1 = shift;
  my $obj2 = shift;
  my $fastmatch = shift;

  my @diffs = ();
  my $simple_location = ($obj1->max_start() == $obj1->min_start() && 
    $obj1->max_start() == $obj1->min_start()) ? 1 : 0;

  my @intvals = $simple_location ? qw(start end) : 
    qw(min_start max_start min_end max_end);
  push(@intvals, 'is_remote');
 
  foreach my $method (@intvals) {
    ($obj1->$method() != $obj2->$method()) && push(@diffs, $method) &&
      $fastmatch && return \@diffs;
  }

  foreach my $method (qw(start_pos_type end_pos_type location_type)) {
    ($obj1->$method() ne $obj2->$method()) && push(@diffs, $method) &&
      $fastmatch && return \@diffs;
  }

  #compare the classes of the coordinate policy objects
  (ref($obj1->coordinate_policy()) ne ref($obj2->coordinate_policy())) &&
    push(@diffs, 'coordinate_policy') && $fastmatch && return \@diffs;

  return \@diffs;
}

sub diff_BioPrimarySeq {
  my $self = shift;
  my $obj1 = shift;
  my $obj2 = shift;
  my $fastmatch = shift;

  my @diffs = ();

  foreach my $method (qw(seq display_id accession_number alphabet desc)) {
    ($obj1->$method() ne $obj2->$method()) && push(@diffs, $method) &&
      $fastmatch && return \@diffs;
  }
  # primary_id returns a stringified object id if no primary id is set
  # we really don't want to use that for equality comparisons
  if ($obj1->primary_id() ne "$self") {
    ($obj1->primary_id() eq $obj2->primary_id()) && 
      push(@diffs, 'primary_id') && $fastmatch && return \@diffs;
  }
  ($obj1->is_circular() != $obj2->is_circular()) && 
    push(@diffs, 'is_circular') && $fastmatch && return \@diffs;
  
  return \@diffs;
}

sub diff_BioSeq {
  my $self = shift;
  my $obj1 = shift;
  my $obj2 = shift;
  my $fastmatch = shift;

  my @diffs = ();


  # a Seq is made out of a:
  # PrimarySeq primary_seq
  # AnnotationCollectionI annotation->get_Annotations
  # SeqFeatureI get_all_SeqFeatures 
  # Species species

  my $priseq_diff = $self->diff($obj1->primary_seq(), $obj2->primary_seq(), $fastmatch);
  if ($#{$priseq_diff} != -1) {
    push(@diffs, 'primary_seq', $priseq_diff);
    return \@diffs if $fastmatch;
  }
  my $species_diff = $self->diff($obj1->species(), $obj2->species(), $fastmatch);
  if ($#{$species_diff} != -1) {
    push(@diffs, 'species', $species_diff);
    return \@diffs if $fastmatch;
  }

  my @obj1_annotations = sort {$a->hash_key() cmp $b->hash_key()} $obj1->annotation->get_Annotations();
  my @obj2_annotations = sort {$a->hash_key() cmp $b->hash_key()} $obj2->annotation->get_Annotations();
  ($#obj1_annotations != $#obj2_annotations) && 
    push(@diffs, 'num_annotations') && $fastmatch && return \@diffs;
  foreach my $i (0..$#obj1_annotations) {
    my $anno_diffs = $self->diff($obj1_annotations[$i], $obj2_annotations[$i], $fastmatch);
    if ($#{$anno_diffs} != -1) {
      my $subclass = (split(/::/,ref($obj1_annotations[$i])))[2];
      push(@diffs, "annotation::${subclass}_" . $i, $anno_diffs);
      return \@diffs if $fastmatch;
    }
  }

  my @obj1_features = sort {$a->hash_key() cmp $b->hash_key()} $obj1->get_SeqFeatures();
  my @obj2_features = sort {$a->hash_key() cmp $b->hash_key()} $obj2->get_SeqFeatures();
  ($#obj1_features != $#obj2_features) && push(@diffs, 'num_features') &&
    $fastmatch && return \@diffs;
  for my $i (0..$#obj2_features) {
    my $feature_diffs = $self->diff($obj1_features[$i], $obj2_features[$i], $fastmatch);
    if ($#{$feature_diffs} != -1) {
      push(@diffs, 'feature_' . $i, $feature_diffs);
      return \@diffs if $fastmatch;
    }
  }

  return \@diffs;
}

sub diff_BioSeqFeatureI {
  my $self = shift;
  my $obj1 = shift;
  my $obj2 = shift;
  my $fastmatch = shift;

  return [] if $obj1 == $obj2;

  my @diffs = ();
  (ref($obj1) ne ref($obj2)) && push(@diffs, 'class') && $fastmatch &&
    return \@diffs;

  foreach my $method (qw(display_name primary_tag source_tag seq_id)) {
    ($obj1->$method() ne $obj2->$method()) && push(@diffs, $method) &&
      $fastmatch && return \@diffs;
  }

  my @obj1_tags = $obj1->get_all_tags();
  my @obj2_tags = $obj2->get_all_tags();
  ($#obj1_tags != $#obj2_tags) && push(@diffs, 'num_tags') &&
    $fastmatch && return \@diffs;
  for my $tag (@obj1_tags) {
    my @obj1_values = sort $obj1->get_tag_values($tag);
    my @obj2_values = sort $obj2->get_tag_values($tag);

    ($#obj1_values != $#obj2_values) && push(@diffs, $tag . '_num_values') &&
      $fastmatch && return \@diffs;
    for my $i (0..$#obj1_values) {
      ($obj1_values[$i] ne $obj2_values[$i]) && 
	push(@diffs, $tag . '_value_' . $i) && $fastmatch &&
	return \@diffs;
    }
  }

  my $location_diffs = $self->diff($obj1->location(), $obj2->location());
  if ($#{$location_diffs} != -1) {
    push(@diffs, 'location', $location_diffs);
  }

  my $equals = 1;
  my @obj1_subfeatures = sort { $a->hash_key() cmp $b->hash_key() } $obj1->get_SeqFeatures();
  my @obj2_subfeatures = sort { $a->hash_key() cmp $b->hash_key() } $obj1->get_SeqFeatures();
  ($#obj1_subfeatures != $#obj2_subfeatures) && 
    push(@diffs, 'num_subfeatures') && $fastmatch && return \@diffs;
  for my $i (0..$#obj1_subfeatures) {
    my $sub_diffs = $self->diff($obj1_subfeatures[$i], $obj2_subfeatures[$i], $fastmatch);
    if (len(@{$sub_diffs}) != 0) {
      push(@diffs, 'subfeature_' . $i, $sub_diffs);
      return \@diffs if $fastmatch;
    }
  }

  return \@diffs;
}

sub diff_BioSpecies {
  my $self = shift;
  my $obj1 = shift;
  my $obj2 = shift;
  my $fastmatch = shift;

  return [] if $obj1 == $obj2;

  my @diffs = ();

  ref($obj1) ne ref($obj2) && push(@diffs, 'class') && $fastmatch &&
    return \@diffs;

  for my $method (qw(common_name variant organelle species genus sub_species ncbi_taxid division)) {
    $obj1->$method() ne $obj2->$method() && push(@diffs, $method) &&
      $fastmatch && return \@diffs;
  }
  my @obj1_classification = $obj1->classification();
  my @obj2_classification = $obj2->classification();
  my $same_classification = 1;
  for my $i (0..$#obj1_classification) {
    $same_classification = 0 if $obj1_classification[$i] ne $obj2_classification[$i];
  }
  $same_classification == 0 && push(@diffs, 'classification') && 
    $fastmatch && return \@diffs;

  return \@diffs;
}

1;
-------------- next part --------------
package Bio::ComparatorI;
use vars(qw(@ISA));
use strict;
use Bio::Root::Root;

@ISA = qw(Bio::Root::Root);

my $comparatorClassVar;

sub new {
  my $class = shift;

  my $self = bless {}, $class;

  $self->set_supported_classes();
  $self->{'diff_methods'} = {};
  foreach my $supported_class (@{$self->{'supported_classes'}}) {
    my $compacted_name = join('',split(/::/,$supported_class));
    $self->{'diff_methods'}->{$supported_class} = 'diff_' . $compacted_name;
  }

  return $self;
}

=pod
  Title   : equals
  Usage   : $obj->equals($obj, $other_obj)
  Function: Tests if two objects are equal, and returns true if they
            are, and false otherwise.
  Example : 
  Returns : boolean
  Args    : $other_obj (same type as $obj)
=cut

sub equals {
  my $self = shift;
  my $obj1 = shift;
  my $obj2 = shift;

  return ($#{($self->diff($obj1, $obj2, 0))} == -1);
}

=pod
  Title   : diff
  Usage   : $obj->diff($other_obj, [$fastmatch])
  Function: Returns a list of the attributes where $self doesn't match
            $other_obj. If the $fastmatch flag is set to true, only
	    returns the first difference. In the case of complex objects,
	    differences between two components are reflected as an 
	    attribute name followed by a listref listing the attributes
	    of the component object that don't match. Thus, the listref 
	    returned by this method should strictly be interpreted as
	    a recursive data structure.

	    For example, this might be the listref returned by diff()
	    on two Bio::Seq objects: [ 'primaryseq', [ 'display_id' ]]
  Example : 
  Returns : listref of string and arrayref
  Args    : $other_obj (Bio::ComparableI subclass)
            $fastmatch (int) 
=cut

sub diff {
  my $self = shift;

  $self->throw_not_implemented();
}

sub set_supported_classes {
  my $self = shift;

  $self->throw_not_implemented();
}

sub can_compare {
  my $self = shift;
  my $obj1 = shift;

  my $class = ref($obj1);
  my $diff_method;
  if (!defined($self->{'diff_methods'}->{$class})) {
    for my $supported_class (@{$self->{'supported_classes'}}) {
      if ($obj1->isa($supported_class)) {
	$diff_method = $self->{'diff_methods'}->{$class} = 
	  $self->{'diff_methods'}->{$supported_class};
	last;
      }	      
    }
  } else {
    $diff_method = $self->{'diff_methods'}->{$class};
  }

  return $diff_method;
}

1;
-------------- next part --------------
? ComparatorI.pm
? SampleComparator.pm
? Root/.Exception.pm.swp
Index: AnnotationI.pm
===================================================================
RCS file: /home/repository/bioperl/bioperl-live/Bio/AnnotationI.pm,v
retrieving revision 1.7
diff -u -3 -p -r1.7 AnnotationI.pm
--- AnnotationI.pm	2002/10/22 07:38:24	1.7
+++ AnnotationI.pm	2004/05/03 09:48:14
@@ -177,4 +177,13 @@ sub tagname{
     shift->throw_not_implemented();
 }
 
+sub hash_key {
+  my $self = shift;
+  
+  my $hash_tree = $self->hash_tree();
+  my @keys = sort keys(%{$hash_tree});
+  my $string = join(':', map { $hash_tree->{$_} } @keys);
+  return $string;
+}
+
 1;
Index: SeqFeatureI.pm
===================================================================
RCS file: /home/repository/bioperl/bioperl-live/Bio/SeqFeatureI.pm,v
retrieving revision 1.55
diff -u -3 -p -r1.55 SeqFeatureI.pm
--- SeqFeatureI.pm	2004/03/13 01:30:02	1.55
+++ SeqFeatureI.pm	2004/05/03 09:48:45
@@ -368,6 +368,26 @@ sub gff_string{
    return $formatter->gff_string($self);
 }
 
+=head2 hash_key
+
+ Title   : hash_key
+ Usage   : $key = hash_key();
+ Function: Provides a string which can be used as the key for a hash
+           table. This string is composed out of attributes of this
+           object in such a way that if two SeqFeatureI objects have the
+           same hash_key(), they are equivalent.
+
+ Returns : string
+ Args    : None
+
+=cut
+
+sub hash_key {
+  my $self = shift;
+
+  return join(':', $self->primary_tag(), $self->display_name(), $self->start(), $self->end());
+}
+
 my $static_gff_formatter = undef;
 
 =head2 _static_gff_formatter
Index: Annotation/Comment.pm
===================================================================
RCS file: /home/repository/bioperl/bioperl-live/Bio/Annotation/Comment.pm,v
retrieving revision 1.10
diff -u -3 -p -r1.10 Comment.pm
--- Annotation/Comment.pm	2004/05/03 00:28:20	1.10
+++ Annotation/Comment.pm	2004/05/03 09:48:46
@@ -116,6 +116,8 @@ sub hash_tree{
    
    my $h = {};
    $h->{'text'} = $self->text;
+
+   return $h;
 }
 
 =head2 tagname
Index: Annotation/SimpleValue.pm
===================================================================
RCS file: /home/repository/bioperl/bioperl-live/Bio/Annotation/SimpleValue.pm,v
retrieving revision 1.10
diff -u -3 -p -r1.10 SimpleValue.pm
--- Annotation/SimpleValue.pm	2003/03/10 21:53:56	1.10
+++ Annotation/SimpleValue.pm	2004/05/03 09:49:37
@@ -142,6 +142,8 @@ sub hash_tree{
    
    my $h = {};
    $h->{'value'} = $self->value;
+
+   return $h;
 }
 
 =head2 tagname


More information about the Bioperl-l mailing list