[Bioperl-l] Substitution models

Jason Stajich jason.stajich at duke.edu
Fri Mar 3 15:41:33 UTC 2006


Great - this can probably handle a better object representation of  
rates we are parsing out of PAML too, not sure the same level of  
detail is available there.

I think that Bio::Phylo is already used by another perl project or  
I'd suggest we start that directory, but we'll have to start to put  
some more of that stuff into a good namespace.  The simple summary  
stats distance models are implemented in Bio::Align::DNAStatistics  
since they are pairwise distances from alignments but we may want to  
unify more of this into a Phylo/MolEvol namespace at some point.  I'm  
happy for someone else to spearhead if they are interested....

-jason
On Mar 3, 2006, at 2:10 AM, Rutger Vos wrote:

> Hi all,
>
> unless I've missed it, there doesn't seem to be an object for  
> nucleotide substitution models. I'd like to implement one. I've  
> attached an example implementation (which needs further methods,  
> tests and pod, obviously). I'd like to solicit the input of the  
> bioperl-architecture gurus on whether the general design of this  
> sample is sane.
>
> The idea is that you would use this to serialize model descriptions  
> between e.g. MrBayes and Paup (which describe the same concepts,  
> but with slightly different syntax).
>
> Best wishes,
>
> Rutger
>
> -- 
> ++++++++++++++++++++++++++++++++++++++++++++++++++++
> Rutger Vos, PhD. candidate
> Department of Biological Sciences
> Simon Fraser University
> 8888 University Drive
> Burnaby, BC, V5A1S6
> Phone: 604-291-5625 Fax: 604-291-3496
> Personal site: http://www.sfu.ca/~rvosa
> FAB* lab: http://www.sfu.ca/~fabstar
> Bio::Phylo: http://search.cpan.org/~rvosa/Bio-Phylo/
> ++++++++++++++++++++++++++++++++++++++++++++++++++++
>
>
> package NucSubstModel;
> use strict;
> use warnings;
> use List::Util qw(sum);
> use Bio::Root::Root;
> use Bio::Tools::IUPAC;
> use vars qw(@ISA);
> @ISA = qw(Bio::Root::Root);
>
> sub new {
>     my $class = shift;
>     my $self = {
>         'matrix' => [
>         # TO: A     C     G     T
>             [ 0.25, 0.25, 0.25, 0.25 ], # FROM: A
>             [ 0.25, 0.25, 0.25, 0.25 ], # FROM: C
>             [ 0.25, 0.25, 0.25, 0.25 ], # FROM: G
>             [ 0.25, 0.25, 0.25, 0.25 ], # FROM: T
>         ],
>         'transl' => {
>             'A'  => { 'id' => 0, 'freq' => 0.25 }, # ids are for  
> matrix lookup
>             'C'  => { 'id' => 1, 'freq' => 0.25 },
>             'G'  => { 'id' => 2, 'freq' => 0.25 },
>             'T'  => { 'id' => 3, 'freq' => 0.25 },
>         },
>         'gshape' => 0, # gamma shap parameter alpha
>         'pinvar' => 0, # proportion of invariant sites
>         'mu'     => 1, # mutation rate paramter mu
>     };
>     bless $self, $class;
>     return $self;
> }
>
> # gets/sets nucleotide frequencies, one at a time. Adjusts matrix.
> # example: $model->freq( 'A' ) returns 0.25
> # example: $model->freq( 'N' ) returns 1
> # example: $model->freq( 'A' => 0.5 ) scales C, G, T downward.
> # does not adjust when ambiguous symbols used, e.g.
> # $model->freq( 'N' => 0.5 ) doesn't work
> sub freq {
>     my ( $self, $nuc, $freq ) = ( $_[0], uc $_[1], $_[2] );
>     if ( exists $Bio::Tools::IUPAC::IUB{$nuc} ) {
>         if ( exists $self->{'transl'}->{$nuc} ) {
>             if ( $freq and $freq > 0 and $freq < 1 ) {
>                 my %tmp = ( $nuc => 1 );
>                 my $diff = $self->{'transl'}->{$nuc}->{'freq'} -  
> $freq;
>                 $self->{'transl'}->{$nuc}->{'freq'} = $freq;
>                 for ( 'A', 'C', 'G', 'T' ) {
>                     next if $tmp{$_};
>                     $self->{'transl'}->{$_}->{'freq'} += ( $diff /  
> 3 );
>                 }
>             }
>             elsif ( $freq and ( $freq <= 0 or $freq >= 1 ) ) {
>                 $self->throw( "Frequency must be between 0 and 1" );
>             }
>             return $self->{'transl'}->{$nuc}->{'freq'};
>         }
>         else {
>             my $disambiguated_freq = 0;
>             foreach ( @{ $Bio::Tools::IUPAC::IUB{$nuc} } ) {
>                 $disambiguated_freq += $self->{'transl'}->{$_}-> 
> {'freq'};
>             }
>             return $disambiguated_freq;
>         }
>     }
>     else {
>         $self->throw( "Nucleotide \"$nuc\" does not exist" );
>     }
> }
>
> # gets/sets substitution probabilities.
> # example: $model->prob( 'A' => 'C' ) returns 0.25
> # example: $model->prob( 'A' => 'N' ) returns 1
> # example: $model->prob( 'A' => 'C', 0.5 ) scales others accordingly
> # does not adjust when ambiguous symbols used, e.g.
> # $model->prob( 'A' => 'N', 0.5 ) doesn't work
> sub prob {
>     my ( $self, $from, $to, $p ) = ( $_[0], uc $_[1], uc $_[2], $_ 
> [3] );
>     if ( exists $self->{'transl'}{$from} and exists $self-> 
> {'transl'}{$to} ) {
>         my ( $i, $j ) = ( $self->{'transl'}{$from}{'id'}, $self-> 
> {'transl'}{$to}{'id'} );
>         if ( $p and $p > 0 and $p < 1 ) {
>             my $mat = $self->{'matrix'};
>             my $scale = $mat->[$i][$j] - $p;
>             $mat->[$i][$j] = $p;
>             for my $f ( 0 .. 3 ) {
>                 for my $t ( 0 .. 3 ) {
>                     next if $f == $i and $t == $j;
>                     $mat->[$f][$t] += ( $scale / 3 ) if $f == $i or  
> $t == $j;
>                 }
>             }
>             for my $f ( 0 .. 3 ) {
>                 next if $f == $i;
>                 my $rowscale = 1 - sum @{ $mat->[$f] };
>                 for my $t ( 0 .. 3 ) {
>                     next if $t == $j;
>                     $mat->[$f][$t] += ( $rowscale / 3 );
>                 }
>             }
>         }
>         elsif ( $p and ( $p <= 0 or $p >= 1 ) ) {
>             $self->throw( "Probability must be between 0 and 1" );
>         }
>         return $self->{'matrix'}[$i][$j];
>     }
>     else {
>         my $disambiguated_p = 0;
>         if ( exists $Bio::Tools::IUPAC::IUB{$from} and exists  
> $Bio::Tools::IUPAC::IUB{$to} ) {
>             my $from_array = $Bio::Tools::IUPAC::IUB{$from};
>             my $to_array = $Bio::Tools::IUPAC::IUB{$to};
>             foreach my $from_sym ( @{ $from_array } ) {
>                 my $i = $self->{'transl'}{$from_sym}{'id'};
>                 foreach my $to_sym ( @{ $to_array } ) {
>                     my $j = $self->{'transl'}{$to_sym}{'id'};
>                     $disambiguated_p += $self->{'matrix'}[$i][$j]
>                 }
>             }
>             return $disambiguated_p / scalar @{ $from_array };
>         }
>         else {
>             $self->throw( "Nucleotide \"$from\" and/or \"$to\" not  
> in IUPAC::IUB" );
>         }
>     }
> }
>
> # gets/sets mutation rate.
> # example: $model->mu() returns 1;
> # example: $model->mu( 0.5 ) doubles diagonal, scales others  
> accordingly
> sub mu {
>     my ( $self, $mu ) = @_;
>     if ( $mu ) {
>         $mu = 1 / $mu;
>         my $scale = $mu - $self->{'mu'};
>         my $mat = $self->{'matrix'};
>         for my $i ( 0 .. $#{ $mat } ) {
>             for my $j ( 0 .. $#{ $mat->[$i] } ) {
>                 if ( $i == $j ) {
>                     $mat->[$i][$j] = $mat->[$i][$j] + $mat->[$i] 
> [$j] * $scale;
>                 }
>                 else {
>                     $mat->[$i][$j] = $mat->[$i][$j] - ( $mat->[$i] 
> [$j] * $scale ) / 3;
>                 }
>             }
>         }
>         $self->{'mu'} = $mu;
>     }
>     return $self->{'mu'};
> }
>
> # gets/sets gamma shape parameter alpha
> sub gshape {
>     my ( $self, $gshape ) = @_;
>     $self->{'gshape'} = $gshape if $gshape;
>     return $self->{'gshape'};
> }
>
> # gets/sets proportion of invariant sites
> sub pinvar {
>     my ( $self, $pinvar ) = @_;
>     $self->{'pinvar'} = $pinvar if $pinvar;
>     return $self->{'pinvar'};
> }
>
> # other methods: raw transition matrix input, raw frequency input,  
> export to
> # and import from MrBayes/Paup/whatever model syntax, also needs  
> pod and tests
>
> 1;_______________________________________________
> Bioperl-l mailing list
> Bioperl-l at lists.open-bio.org
> http://lists.open-bio.org/mailman/listinfo/bioperl-l

--
Jason Stajich
Duke University
http://www.duke.edu/~jes12





More information about the Bioperl-l mailing list