[Bioperl-l] Re: Testing Bioperl objects for equality: attempt 2

Hilmar Lapp hlapp at gmx.net
Sun May 23 18:51:08 EDT 2004


Peter, I fixed the two bugs from your diffs. Other than that, I don't 
see your modules in the repository. Did you not want to commit them yet?

	-hilmar

On Monday, May 3, 2004, at 03:49  AM, Peter van Heusden wrote:

> Ok, so recently I posted my diffs to the Bioperl source which added
> equals() and diff() methods to Bioperl object. Hilmar objected, citing
> two main objections:
>
> 1) The 'hash_key()' methods I added to various objects were
> inappopriately named.
> 2) Equality is not an objective measure, so a single 'equals' method
> didn't make sense.
>
> To address (2), I've pulled the equals code out of the Bioperl objects,
> and created a new interface, ComparatorI, which can test a certain set
> of Bioperl objects for equality. And I've created a SampleComparator.pm
> module implementing my version of equality.
>
> I've attached the ComparatorI.pm and SampleComparator.pm modules to 
> this
> message, as well as a small patch to this morning's Bioperl which is
> necessary for them to work (it still includes the misnamed hash_key()
> stuff that will be fixed in the future).
>
> I know the name Comparator is clumsy, but its just an interim name. And
> equals() should be renamed is_equal(). But I just wanted to get
> something out there for comment before going further. Here's some 
> sample
> code:
>
> $comp = Bio::SampleComparator();
>
> if ($comp->equals($seq1, $seq2)) {
>    print "they are equal\n";
> }
>
> $differences = $comp->diff($seq1, $seq2);
>
> Obviously the SampleComparator.pm module would need to keep up-to-date
> with respect to the classes that it can compare, but I don't see any 
> way
> around those kind of dependencies.
>
> Peter
>
> use strict;
> package Bio::SampleComparator;
>
> use Bio::ComparatorI;
> use vars qw(@ISA);
>
> @ISA = qw(Bio::ComparatorI);
>
> sub set_supported_classes {
>   my $self = shift;
>
>   @{$self->{'supported_classes'}} = qw(
> 				    Bio::AnnotationI
> 				    Bio::LocationI
> 				    Bio::PrimarySeq
> 				    Bio::Seq
> 				    Bio::SeqFeatureI
> 				    Bio::Species
> 				  );
>
> }
>
> sub diff {
>   my $self = shift;
>   my $obj1 = shift;
>   my $obj2 = shift;
>   my $fastmatch = shift;
>
>   return [] if $obj1 == $obj2;
>
>   my @diffs = ();
>
>   (ref($obj1) ne ref($obj2))  && push(@diffs, 'class') &&
>     $fastmatch && return undef;
>
>   my $diff_method = $self->can_compare($obj1);
>
>   if (defined($diff_method)) {
>     return $self->$diff_method($obj1, $obj2, $fastmatch);
>   } else {
>     $self->throw("trying to compare unsupported object type: " . 
> ref($obj1));
>   }
> }
>
>
> sub diff_BioAnnotationI {
>   my $self = shift;
>   my $obj1 = shift;
>   my $obj2 = shift;
>   my $fastmatch = shift;
>
>   my @diffs = ();
>
>   my $obj1_hash = $obj1->hash_tree();
>   my $obj2_hash = $obj2->hash_tree();
>   foreach my $key (keys(%{$obj1->hash_tree()})) {
>     ($obj1_hash->{$key} ne $obj2_hash->{$key}) && push(@diffs, $key) &&
>       $fastmatch && return \@diffs;
>   }
>
>   return \@diffs;
> }
>
>
> sub diff_BioLocationI {
>   my $self = shift;
>   my $obj1 = shift;
>   my $obj2 = shift;
>   my $fastmatch = shift;
>
>   my @diffs = ();
>   my $simple_location = ($obj1->max_start() == $obj1->min_start() &&
>     $obj1->max_start() == $obj1->min_start()) ? 1 : 0;
>
>   my @intvals = $simple_location ? qw(start end) :
>     qw(min_start max_start min_end max_end);
>   push(@intvals, 'is_remote');
>
>   foreach my $method (@intvals) {
>     ($obj1->$method() != $obj2->$method()) && push(@diffs, $method) &&
>       $fastmatch && return \@diffs;
>   }
>
>   foreach my $method (qw(start_pos_type end_pos_type location_type)) {
>     ($obj1->$method() ne $obj2->$method()) && push(@diffs, $method) &&
>       $fastmatch && return \@diffs;
>   }
>
>   #compare the classes of the coordinate policy objects
>   (ref($obj1->coordinate_policy()) ne ref($obj2->coordinate_policy())) 
> &&
>     push(@diffs, 'coordinate_policy') && $fastmatch && return \@diffs;
>
>   return \@diffs;
> }
>
> sub diff_BioPrimarySeq {
>   my $self = shift;
>   my $obj1 = shift;
>   my $obj2 = shift;
>   my $fastmatch = shift;
>
>   my @diffs = ();
>
>   foreach my $method (qw(seq display_id accession_number alphabet 
> desc)) {
>     ($obj1->$method() ne $obj2->$method()) && push(@diffs, $method) &&
>       $fastmatch && return \@diffs;
>   }
>   # primary_id returns a stringified object id if no primary id is set
>   # we really don't want to use that for equality comparisons
>   if ($obj1->primary_id() ne "$self") {
>     ($obj1->primary_id() eq $obj2->primary_id()) &&
>       push(@diffs, 'primary_id') && $fastmatch && return \@diffs;
>   }
>   ($obj1->is_circular() != $obj2->is_circular()) &&
>     push(@diffs, 'is_circular') && $fastmatch && return \@diffs;
>
>   return \@diffs;
> }
>
> sub diff_BioSeq {
>   my $self = shift;
>   my $obj1 = shift;
>   my $obj2 = shift;
>   my $fastmatch = shift;
>
>   my @diffs = ();
>
>
>   # a Seq is made out of a:
>   # PrimarySeq primary_seq
>   # AnnotationCollectionI annotation->get_Annotations
>   # SeqFeatureI get_all_SeqFeatures
>   # Species species
>
>   my $priseq_diff = $self->diff($obj1->primary_seq(), 
> $obj2->primary_seq(), $fastmatch);
>   if ($#{$priseq_diff} != -1) {
>     push(@diffs, 'primary_seq', $priseq_diff);
>     return \@diffs if $fastmatch;
>   }
>   my $species_diff = $self->diff($obj1->species(), $obj2->species(), 
> $fastmatch);
>   if ($#{$species_diff} != -1) {
>     push(@diffs, 'species', $species_diff);
>     return \@diffs if $fastmatch;
>   }
>
>   my @obj1_annotations = sort {$a->hash_key() cmp $b->hash_key()} 
> $obj1->annotation->get_Annotations();
>   my @obj2_annotations = sort {$a->hash_key() cmp $b->hash_key()} 
> $obj2->annotation->get_Annotations();
>   ($#obj1_annotations != $#obj2_annotations) &&
>     push(@diffs, 'num_annotations') && $fastmatch && return \@diffs;
>   foreach my $i (0..$#obj1_annotations) {
>     my $anno_diffs = $self->diff($obj1_annotations[$i], 
> $obj2_annotations[$i], $fastmatch);
>     if ($#{$anno_diffs} != -1) {
>       my $subclass = (split(/::/,ref($obj1_annotations[$i])))[2];
>       push(@diffs, "annotation::${subclass}_" . $i, $anno_diffs);
>       return \@diffs if $fastmatch;
>     }
>   }
>
>   my @obj1_features = sort {$a->hash_key() cmp $b->hash_key()} 
> $obj1->get_SeqFeatures();
>   my @obj2_features = sort {$a->hash_key() cmp $b->hash_key()} 
> $obj2->get_SeqFeatures();
>   ($#obj1_features != $#obj2_features) && push(@diffs, 'num_features') 
> &&
>     $fastmatch && return \@diffs;
>   for my $i (0..$#obj2_features) {
>     my $feature_diffs = $self->diff($obj1_features[$i], 
> $obj2_features[$i], $fastmatch);
>     if ($#{$feature_diffs} != -1) {
>       push(@diffs, 'feature_' . $i, $feature_diffs);
>       return \@diffs if $fastmatch;
>     }
>   }
>
>   return \@diffs;
> }
>
> sub diff_BioSeqFeatureI {
>   my $self = shift;
>   my $obj1 = shift;
>   my $obj2 = shift;
>   my $fastmatch = shift;
>
>   return [] if $obj1 == $obj2;
>
>   my @diffs = ();
>   (ref($obj1) ne ref($obj2)) && push(@diffs, 'class') && $fastmatch &&
>     return \@diffs;
>
>   foreach my $method (qw(display_name primary_tag source_tag seq_id)) {
>     ($obj1->$method() ne $obj2->$method()) && push(@diffs, $method) &&
>       $fastmatch && return \@diffs;
>   }
>
>   my @obj1_tags = $obj1->get_all_tags();
>   my @obj2_tags = $obj2->get_all_tags();
>   ($#obj1_tags != $#obj2_tags) && push(@diffs, 'num_tags') &&
>     $fastmatch && return \@diffs;
>   for my $tag (@obj1_tags) {
>     my @obj1_values = sort $obj1->get_tag_values($tag);
>     my @obj2_values = sort $obj2->get_tag_values($tag);
>
>     ($#obj1_values != $#obj2_values) && push(@diffs, $tag . 
> '_num_values') &&
>       $fastmatch && return \@diffs;
>     for my $i (0..$#obj1_values) {
>       ($obj1_values[$i] ne $obj2_values[$i]) &&
> 	push(@diffs, $tag . '_value_' . $i) && $fastmatch &&
> 	return \@diffs;
>     }
>   }
>
>   my $location_diffs = $self->diff($obj1->location(), 
> $obj2->location());
>   if ($#{$location_diffs} != -1) {
>     push(@diffs, 'location', $location_diffs);
>   }
>
>   my $equals = 1;
>   my @obj1_subfeatures = sort { $a->hash_key() cmp $b->hash_key() } 
> $obj1->get_SeqFeatures();
>   my @obj2_subfeatures = sort { $a->hash_key() cmp $b->hash_key() } 
> $obj1->get_SeqFeatures();
>   ($#obj1_subfeatures != $#obj2_subfeatures) &&
>     push(@diffs, 'num_subfeatures') && $fastmatch && return \@diffs;
>   for my $i (0..$#obj1_subfeatures) {
>     my $sub_diffs = $self->diff($obj1_subfeatures[$i], 
> $obj2_subfeatures[$i], $fastmatch);
>     if (len(@{$sub_diffs}) != 0) {
>       push(@diffs, 'subfeature_' . $i, $sub_diffs);
>       return \@diffs if $fastmatch;
>     }
>   }
>
>   return \@diffs;
> }
>
> sub diff_BioSpecies {
>   my $self = shift;
>   my $obj1 = shift;
>   my $obj2 = shift;
>   my $fastmatch = shift;
>
>   return [] if $obj1 == $obj2;
>
>   my @diffs = ();
>
>   ref($obj1) ne ref($obj2) && push(@diffs, 'class') && $fastmatch &&
>     return \@diffs;
>
>   for my $method (qw(common_name variant organelle species genus 
> sub_species ncbi_taxid division)) {
>     $obj1->$method() ne $obj2->$method() && push(@diffs, $method) &&
>       $fastmatch && return \@diffs;
>   }
>   my @obj1_classification = $obj1->classification();
>   my @obj2_classification = $obj2->classification();
>   my $same_classification = 1;
>   for my $i (0..$#obj1_classification) {
>     $same_classification = 0 if $obj1_classification[$i] ne 
> $obj2_classification[$i];
>   }
>   $same_classification == 0 && push(@diffs, 'classification') &&
>     $fastmatch && return \@diffs;
>
>   return \@diffs;
> }
>
> 1;
> package Bio::ComparatorI;
> use vars(qw(@ISA));
> use strict;
> use Bio::Root::Root;
>
> @ISA = qw(Bio::Root::Root);
>
> my $comparatorClassVar;
>
> sub new {
>   my $class = shift;
>
>   my $self = bless {}, $class;
>
>   $self->set_supported_classes();
>   $self->{'diff_methods'} = {};
>   foreach my $supported_class (@{$self->{'supported_classes'}}) {
>     my $compacted_name = join('',split(/::/,$supported_class));
>     $self->{'diff_methods'}->{$supported_class} = 'diff_' . 
> $compacted_name;
>   }
>
>   return $self;
> }
>
> =pod
>   Title   : equals
>   Usage   : $obj->equals($obj, $other_obj)
>   Function: Tests if two objects are equal, and returns true if they
>             are, and false otherwise.
>   Example :
>   Returns : boolean
>   Args    : $other_obj (same type as $obj)
> =cut
>
> sub equals {
>   my $self = shift;
>   my $obj1 = shift;
>   my $obj2 = shift;
>
>   return ($#{($self->diff($obj1, $obj2, 0))} == -1);
> }
>
> =pod
>   Title   : diff
>   Usage   : $obj->diff($other_obj, [$fastmatch])
>   Function: Returns a list of the attributes where $self doesn't match
>             $other_obj. If the $fastmatch flag is set to true, only
> 	    returns the first difference. In the case of complex objects,
> 	    differences between two components are reflected as an
> 	    attribute name followed by a listref listing the attributes
> 	    of the component object that don't match. Thus, the listref
> 	    returned by this method should strictly be interpreted as
> 	    a recursive data structure.
>
> 	    For example, this might be the listref returned by diff()
> 	    on two Bio::Seq objects: [ 'primaryseq', [ 'display_id' ]]
>   Example :
>   Returns : listref of string and arrayref
>   Args    : $other_obj (Bio::ComparableI subclass)
>             $fastmatch (int)
> =cut
>
> sub diff {
>   my $self = shift;
>
>   $self->throw_not_implemented();
> }
>
> sub set_supported_classes {
>   my $self = shift;
>
>   $self->throw_not_implemented();
> }
>
> sub can_compare {
>   my $self = shift;
>   my $obj1 = shift;
>
>   my $class = ref($obj1);
>   my $diff_method;
>   if (!defined($self->{'diff_methods'}->{$class})) {
>     for my $supported_class (@{$self->{'supported_classes'}}) {
>       if ($obj1->isa($supported_class)) {
> 	$diff_method = $self->{'diff_methods'}->{$class} =
> 	  $self->{'diff_methods'}->{$supported_class};
> 	last;
>       }	
>     }
>   } else {
>     $diff_method = $self->{'diff_methods'}->{$class};
>   }
>
>   return $diff_method;
> }
>
> 1;
> ? ComparatorI.pm
> ? SampleComparator.pm
> ? Root/.Exception.pm.swp
> Index: AnnotationI.pm
> ===================================================================
> RCS file: /home/repository/bioperl/bioperl-live/Bio/AnnotationI.pm,v
> retrieving revision 1.7
> diff -u -3 -p -r1.7 AnnotationI.pm
> --- AnnotationI.pm	2002/10/22 07:38:24	1.7
> +++ AnnotationI.pm	2004/05/03 09:48:14
> @@ -177,4 +177,13 @@ sub tagname{
>      shift->throw_not_implemented();
>  }
>
> +sub hash_key {
> +  my $self = shift;
> +
> +  my $hash_tree = $self->hash_tree();
> +  my @keys = sort keys(%{$hash_tree});
> +  my $string = join(':', map { $hash_tree->{$_} } @keys);
> +  return $string;
> +}
> +
>  1;
> Index: SeqFeatureI.pm
> ===================================================================
> RCS file: /home/repository/bioperl/bioperl-live/Bio/SeqFeatureI.pm,v
> retrieving revision 1.55
> diff -u -3 -p -r1.55 SeqFeatureI.pm
> --- SeqFeatureI.pm	2004/03/13 01:30:02	1.55
> +++ SeqFeatureI.pm	2004/05/03 09:48:45
> @@ -368,6 +368,26 @@ sub gff_string{
>     return $formatter->gff_string($self);
>  }
>
> +=head2 hash_key
> +
> + Title   : hash_key
> + Usage   : $key = hash_key();
> + Function: Provides a string which can be used as the key for a hash
> +           table. This string is composed out of attributes of this
> +           object in such a way that if two SeqFeatureI objects have 
> the
> +           same hash_key(), they are equivalent.
> +
> + Returns : string
> + Args    : None
> +
> +=cut
> +
> +sub hash_key {
> +  my $self = shift;
> +
> +  return join(':', $self->primary_tag(), $self->display_name(), 
> $self->start(), $self->end());
> +}
> +
>  my $static_gff_formatter = undef;
>
>  =head2 _static_gff_formatter
> Index: Annotation/Comment.pm
> ===================================================================
> RCS file: 
> /home/repository/bioperl/bioperl-live/Bio/Annotation/Comment.pm,v
> retrieving revision 1.10
> diff -u -3 -p -r1.10 Comment.pm
> --- Annotation/Comment.pm	2004/05/03 00:28:20	1.10
> +++ Annotation/Comment.pm	2004/05/03 09:48:46
> @@ -116,6 +116,8 @@ sub hash_tree{
>
>     my $h = {};
>     $h->{'text'} = $self->text;
> +
> +   return $h;
>  }
>
>  =head2 tagname
> Index: Annotation/SimpleValue.pm
> ===================================================================
> RCS file: 
> /home/repository/bioperl/bioperl-live/Bio/Annotation/SimpleValue.pm,v
> retrieving revision 1.10
> diff -u -3 -p -r1.10 SimpleValue.pm
> --- Annotation/SimpleValue.pm	2003/03/10 21:53:56	1.10
> +++ Annotation/SimpleValue.pm	2004/05/03 09:49:37
> @@ -142,6 +142,8 @@ sub hash_tree{
>
>     my $h = {};
>     $h->{'value'} = $self->value;
> +
> +   return $h;
>  }
>
>  =head2 tagname
>
-- 
-------------------------------------------------------------
Hilmar Lapp                            email: lapp at gnf.org
GNF, San Diego, Ca. 92121              phone: +1-858-812-1757
-------------------------------------------------------------





More information about the Bioperl-l mailing list