[Bioperl-l] Testing Bioperl objects for equality: attempt 2
Peter van Heusden
pvh at egenetics.com
Mon May 3 06:49:56 EDT 2004
Ok, so recently I posted my diffs to the Bioperl source which added
equals() and diff() methods to Bioperl object. Hilmar objected, citing
two main objections:
1) The 'hash_key()' methods I added to various objects were
inappopriately named.
2) Equality is not an objective measure, so a single 'equals' method
didn't make sense.
To address (2), I've pulled the equals code out of the Bioperl objects,
and created a new interface, ComparatorI, which can test a certain set
of Bioperl objects for equality. And I've created a SampleComparator.pm
module implementing my version of equality.
I've attached the ComparatorI.pm and SampleComparator.pm modules to this
message, as well as a small patch to this morning's Bioperl which is
necessary for them to work (it still includes the misnamed hash_key()
stuff that will be fixed in the future).
I know the name Comparator is clumsy, but its just an interim name. And
equals() should be renamed is_equal(). But I just wanted to get
something out there for comment before going further. Here's some sample
code:
$comp = Bio::SampleComparator();
if ($comp->equals($seq1, $seq2)) {
print "they are equal\n";
}
$differences = $comp->diff($seq1, $seq2);
Obviously the SampleComparator.pm module would need to keep up-to-date
with respect to the classes that it can compare, but I don't see any way
around those kind of dependencies.
Peter
-------------- next part --------------
use strict;
package Bio::SampleComparator;
use Bio::ComparatorI;
use vars qw(@ISA);
@ISA = qw(Bio::ComparatorI);
sub set_supported_classes {
my $self = shift;
@{$self->{'supported_classes'}} = qw(
Bio::AnnotationI
Bio::LocationI
Bio::PrimarySeq
Bio::Seq
Bio::SeqFeatureI
Bio::Species
);
}
sub diff {
my $self = shift;
my $obj1 = shift;
my $obj2 = shift;
my $fastmatch = shift;
return [] if $obj1 == $obj2;
my @diffs = ();
(ref($obj1) ne ref($obj2)) && push(@diffs, 'class') &&
$fastmatch && return undef;
my $diff_method = $self->can_compare($obj1);
if (defined($diff_method)) {
return $self->$diff_method($obj1, $obj2, $fastmatch);
} else {
$self->throw("trying to compare unsupported object type: " . ref($obj1));
}
}
sub diff_BioAnnotationI {
my $self = shift;
my $obj1 = shift;
my $obj2 = shift;
my $fastmatch = shift;
my @diffs = ();
my $obj1_hash = $obj1->hash_tree();
my $obj2_hash = $obj2->hash_tree();
foreach my $key (keys(%{$obj1->hash_tree()})) {
($obj1_hash->{$key} ne $obj2_hash->{$key}) && push(@diffs, $key) &&
$fastmatch && return \@diffs;
}
return \@diffs;
}
sub diff_BioLocationI {
my $self = shift;
my $obj1 = shift;
my $obj2 = shift;
my $fastmatch = shift;
my @diffs = ();
my $simple_location = ($obj1->max_start() == $obj1->min_start() &&
$obj1->max_start() == $obj1->min_start()) ? 1 : 0;
my @intvals = $simple_location ? qw(start end) :
qw(min_start max_start min_end max_end);
push(@intvals, 'is_remote');
foreach my $method (@intvals) {
($obj1->$method() != $obj2->$method()) && push(@diffs, $method) &&
$fastmatch && return \@diffs;
}
foreach my $method (qw(start_pos_type end_pos_type location_type)) {
($obj1->$method() ne $obj2->$method()) && push(@diffs, $method) &&
$fastmatch && return \@diffs;
}
#compare the classes of the coordinate policy objects
(ref($obj1->coordinate_policy()) ne ref($obj2->coordinate_policy())) &&
push(@diffs, 'coordinate_policy') && $fastmatch && return \@diffs;
return \@diffs;
}
sub diff_BioPrimarySeq {
my $self = shift;
my $obj1 = shift;
my $obj2 = shift;
my $fastmatch = shift;
my @diffs = ();
foreach my $method (qw(seq display_id accession_number alphabet desc)) {
($obj1->$method() ne $obj2->$method()) && push(@diffs, $method) &&
$fastmatch && return \@diffs;
}
# primary_id returns a stringified object id if no primary id is set
# we really don't want to use that for equality comparisons
if ($obj1->primary_id() ne "$self") {
($obj1->primary_id() eq $obj2->primary_id()) &&
push(@diffs, 'primary_id') && $fastmatch && return \@diffs;
}
($obj1->is_circular() != $obj2->is_circular()) &&
push(@diffs, 'is_circular') && $fastmatch && return \@diffs;
return \@diffs;
}
sub diff_BioSeq {
my $self = shift;
my $obj1 = shift;
my $obj2 = shift;
my $fastmatch = shift;
my @diffs = ();
# a Seq is made out of a:
# PrimarySeq primary_seq
# AnnotationCollectionI annotation->get_Annotations
# SeqFeatureI get_all_SeqFeatures
# Species species
my $priseq_diff = $self->diff($obj1->primary_seq(), $obj2->primary_seq(), $fastmatch);
if ($#{$priseq_diff} != -1) {
push(@diffs, 'primary_seq', $priseq_diff);
return \@diffs if $fastmatch;
}
my $species_diff = $self->diff($obj1->species(), $obj2->species(), $fastmatch);
if ($#{$species_diff} != -1) {
push(@diffs, 'species', $species_diff);
return \@diffs if $fastmatch;
}
my @obj1_annotations = sort {$a->hash_key() cmp $b->hash_key()} $obj1->annotation->get_Annotations();
my @obj2_annotations = sort {$a->hash_key() cmp $b->hash_key()} $obj2->annotation->get_Annotations();
($#obj1_annotations != $#obj2_annotations) &&
push(@diffs, 'num_annotations') && $fastmatch && return \@diffs;
foreach my $i (0..$#obj1_annotations) {
my $anno_diffs = $self->diff($obj1_annotations[$i], $obj2_annotations[$i], $fastmatch);
if ($#{$anno_diffs} != -1) {
my $subclass = (split(/::/,ref($obj1_annotations[$i])))[2];
push(@diffs, "annotation::${subclass}_" . $i, $anno_diffs);
return \@diffs if $fastmatch;
}
}
my @obj1_features = sort {$a->hash_key() cmp $b->hash_key()} $obj1->get_SeqFeatures();
my @obj2_features = sort {$a->hash_key() cmp $b->hash_key()} $obj2->get_SeqFeatures();
($#obj1_features != $#obj2_features) && push(@diffs, 'num_features') &&
$fastmatch && return \@diffs;
for my $i (0..$#obj2_features) {
my $feature_diffs = $self->diff($obj1_features[$i], $obj2_features[$i], $fastmatch);
if ($#{$feature_diffs} != -1) {
push(@diffs, 'feature_' . $i, $feature_diffs);
return \@diffs if $fastmatch;
}
}
return \@diffs;
}
sub diff_BioSeqFeatureI {
my $self = shift;
my $obj1 = shift;
my $obj2 = shift;
my $fastmatch = shift;
return [] if $obj1 == $obj2;
my @diffs = ();
(ref($obj1) ne ref($obj2)) && push(@diffs, 'class') && $fastmatch &&
return \@diffs;
foreach my $method (qw(display_name primary_tag source_tag seq_id)) {
($obj1->$method() ne $obj2->$method()) && push(@diffs, $method) &&
$fastmatch && return \@diffs;
}
my @obj1_tags = $obj1->get_all_tags();
my @obj2_tags = $obj2->get_all_tags();
($#obj1_tags != $#obj2_tags) && push(@diffs, 'num_tags') &&
$fastmatch && return \@diffs;
for my $tag (@obj1_tags) {
my @obj1_values = sort $obj1->get_tag_values($tag);
my @obj2_values = sort $obj2->get_tag_values($tag);
($#obj1_values != $#obj2_values) && push(@diffs, $tag . '_num_values') &&
$fastmatch && return \@diffs;
for my $i (0..$#obj1_values) {
($obj1_values[$i] ne $obj2_values[$i]) &&
push(@diffs, $tag . '_value_' . $i) && $fastmatch &&
return \@diffs;
}
}
my $location_diffs = $self->diff($obj1->location(), $obj2->location());
if ($#{$location_diffs} != -1) {
push(@diffs, 'location', $location_diffs);
}
my $equals = 1;
my @obj1_subfeatures = sort { $a->hash_key() cmp $b->hash_key() } $obj1->get_SeqFeatures();
my @obj2_subfeatures = sort { $a->hash_key() cmp $b->hash_key() } $obj1->get_SeqFeatures();
($#obj1_subfeatures != $#obj2_subfeatures) &&
push(@diffs, 'num_subfeatures') && $fastmatch && return \@diffs;
for my $i (0..$#obj1_subfeatures) {
my $sub_diffs = $self->diff($obj1_subfeatures[$i], $obj2_subfeatures[$i], $fastmatch);
if (len(@{$sub_diffs}) != 0) {
push(@diffs, 'subfeature_' . $i, $sub_diffs);
return \@diffs if $fastmatch;
}
}
return \@diffs;
}
sub diff_BioSpecies {
my $self = shift;
my $obj1 = shift;
my $obj2 = shift;
my $fastmatch = shift;
return [] if $obj1 == $obj2;
my @diffs = ();
ref($obj1) ne ref($obj2) && push(@diffs, 'class') && $fastmatch &&
return \@diffs;
for my $method (qw(common_name variant organelle species genus sub_species ncbi_taxid division)) {
$obj1->$method() ne $obj2->$method() && push(@diffs, $method) &&
$fastmatch && return \@diffs;
}
my @obj1_classification = $obj1->classification();
my @obj2_classification = $obj2->classification();
my $same_classification = 1;
for my $i (0..$#obj1_classification) {
$same_classification = 0 if $obj1_classification[$i] ne $obj2_classification[$i];
}
$same_classification == 0 && push(@diffs, 'classification') &&
$fastmatch && return \@diffs;
return \@diffs;
}
1;
-------------- next part --------------
package Bio::ComparatorI;
use vars(qw(@ISA));
use strict;
use Bio::Root::Root;
@ISA = qw(Bio::Root::Root);
my $comparatorClassVar;
sub new {
my $class = shift;
my $self = bless {}, $class;
$self->set_supported_classes();
$self->{'diff_methods'} = {};
foreach my $supported_class (@{$self->{'supported_classes'}}) {
my $compacted_name = join('',split(/::/,$supported_class));
$self->{'diff_methods'}->{$supported_class} = 'diff_' . $compacted_name;
}
return $self;
}
=pod
Title : equals
Usage : $obj->equals($obj, $other_obj)
Function: Tests if two objects are equal, and returns true if they
are, and false otherwise.
Example :
Returns : boolean
Args : $other_obj (same type as $obj)
=cut
sub equals {
my $self = shift;
my $obj1 = shift;
my $obj2 = shift;
return ($#{($self->diff($obj1, $obj2, 0))} == -1);
}
=pod
Title : diff
Usage : $obj->diff($other_obj, [$fastmatch])
Function: Returns a list of the attributes where $self doesn't match
$other_obj. If the $fastmatch flag is set to true, only
returns the first difference. In the case of complex objects,
differences between two components are reflected as an
attribute name followed by a listref listing the attributes
of the component object that don't match. Thus, the listref
returned by this method should strictly be interpreted as
a recursive data structure.
For example, this might be the listref returned by diff()
on two Bio::Seq objects: [ 'primaryseq', [ 'display_id' ]]
Example :
Returns : listref of string and arrayref
Args : $other_obj (Bio::ComparableI subclass)
$fastmatch (int)
=cut
sub diff {
my $self = shift;
$self->throw_not_implemented();
}
sub set_supported_classes {
my $self = shift;
$self->throw_not_implemented();
}
sub can_compare {
my $self = shift;
my $obj1 = shift;
my $class = ref($obj1);
my $diff_method;
if (!defined($self->{'diff_methods'}->{$class})) {
for my $supported_class (@{$self->{'supported_classes'}}) {
if ($obj1->isa($supported_class)) {
$diff_method = $self->{'diff_methods'}->{$class} =
$self->{'diff_methods'}->{$supported_class};
last;
}
}
} else {
$diff_method = $self->{'diff_methods'}->{$class};
}
return $diff_method;
}
1;
-------------- next part --------------
? ComparatorI.pm
? SampleComparator.pm
? Root/.Exception.pm.swp
Index: AnnotationI.pm
===================================================================
RCS file: /home/repository/bioperl/bioperl-live/Bio/AnnotationI.pm,v
retrieving revision 1.7
diff -u -3 -p -r1.7 AnnotationI.pm
--- AnnotationI.pm 2002/10/22 07:38:24 1.7
+++ AnnotationI.pm 2004/05/03 09:48:14
@@ -177,4 +177,13 @@ sub tagname{
shift->throw_not_implemented();
}
+sub hash_key {
+ my $self = shift;
+
+ my $hash_tree = $self->hash_tree();
+ my @keys = sort keys(%{$hash_tree});
+ my $string = join(':', map { $hash_tree->{$_} } @keys);
+ return $string;
+}
+
1;
Index: SeqFeatureI.pm
===================================================================
RCS file: /home/repository/bioperl/bioperl-live/Bio/SeqFeatureI.pm,v
retrieving revision 1.55
diff -u -3 -p -r1.55 SeqFeatureI.pm
--- SeqFeatureI.pm 2004/03/13 01:30:02 1.55
+++ SeqFeatureI.pm 2004/05/03 09:48:45
@@ -368,6 +368,26 @@ sub gff_string{
return $formatter->gff_string($self);
}
+=head2 hash_key
+
+ Title : hash_key
+ Usage : $key = hash_key();
+ Function: Provides a string which can be used as the key for a hash
+ table. This string is composed out of attributes of this
+ object in such a way that if two SeqFeatureI objects have the
+ same hash_key(), they are equivalent.
+
+ Returns : string
+ Args : None
+
+=cut
+
+sub hash_key {
+ my $self = shift;
+
+ return join(':', $self->primary_tag(), $self->display_name(), $self->start(), $self->end());
+}
+
my $static_gff_formatter = undef;
=head2 _static_gff_formatter
Index: Annotation/Comment.pm
===================================================================
RCS file: /home/repository/bioperl/bioperl-live/Bio/Annotation/Comment.pm,v
retrieving revision 1.10
diff -u -3 -p -r1.10 Comment.pm
--- Annotation/Comment.pm 2004/05/03 00:28:20 1.10
+++ Annotation/Comment.pm 2004/05/03 09:48:46
@@ -116,6 +116,8 @@ sub hash_tree{
my $h = {};
$h->{'text'} = $self->text;
+
+ return $h;
}
=head2 tagname
Index: Annotation/SimpleValue.pm
===================================================================
RCS file: /home/repository/bioperl/bioperl-live/Bio/Annotation/SimpleValue.pm,v
retrieving revision 1.10
diff -u -3 -p -r1.10 SimpleValue.pm
--- Annotation/SimpleValue.pm 2003/03/10 21:53:56 1.10
+++ Annotation/SimpleValue.pm 2004/05/03 09:49:37
@@ -142,6 +142,8 @@ sub hash_tree{
my $h = {};
$h->{'value'} = $self->value;
+
+ return $h;
}
=head2 tagname
More information about the Bioperl-l
mailing list