[Bioperl-l] Substitution models
Jason Stajich
jason.stajich at duke.edu
Fri Mar 3 15:41:33 UTC 2006
Great - this can probably handle a better object representation of
rates we are parsing out of PAML too, not sure the same level of
detail is available there.
I think that Bio::Phylo is already used by another perl project or
I'd suggest we start that directory, but we'll have to start to put
some more of that stuff into a good namespace. The simple summary
stats distance models are implemented in Bio::Align::DNAStatistics
since they are pairwise distances from alignments but we may want to
unify more of this into a Phylo/MolEvol namespace at some point. I'm
happy for someone else to spearhead if they are interested....
-jason
On Mar 3, 2006, at 2:10 AM, Rutger Vos wrote:
> Hi all,
>
> unless I've missed it, there doesn't seem to be an object for
> nucleotide substitution models. I'd like to implement one. I've
> attached an example implementation (which needs further methods,
> tests and pod, obviously). I'd like to solicit the input of the
> bioperl-architecture gurus on whether the general design of this
> sample is sane.
>
> The idea is that you would use this to serialize model descriptions
> between e.g. MrBayes and Paup (which describe the same concepts,
> but with slightly different syntax).
>
> Best wishes,
>
> Rutger
>
> --
> ++++++++++++++++++++++++++++++++++++++++++++++++++++
> Rutger Vos, PhD. candidate
> Department of Biological Sciences
> Simon Fraser University
> 8888 University Drive
> Burnaby, BC, V5A1S6
> Phone: 604-291-5625 Fax: 604-291-3496
> Personal site: http://www.sfu.ca/~rvosa
> FAB* lab: http://www.sfu.ca/~fabstar
> Bio::Phylo: http://search.cpan.org/~rvosa/Bio-Phylo/
> ++++++++++++++++++++++++++++++++++++++++++++++++++++
>
>
> package NucSubstModel;
> use strict;
> use warnings;
> use List::Util qw(sum);
> use Bio::Root::Root;
> use Bio::Tools::IUPAC;
> use vars qw(@ISA);
> @ISA = qw(Bio::Root::Root);
>
> sub new {
> my $class = shift;
> my $self = {
> 'matrix' => [
> # TO: A C G T
> [ 0.25, 0.25, 0.25, 0.25 ], # FROM: A
> [ 0.25, 0.25, 0.25, 0.25 ], # FROM: C
> [ 0.25, 0.25, 0.25, 0.25 ], # FROM: G
> [ 0.25, 0.25, 0.25, 0.25 ], # FROM: T
> ],
> 'transl' => {
> 'A' => { 'id' => 0, 'freq' => 0.25 }, # ids are for
> matrix lookup
> 'C' => { 'id' => 1, 'freq' => 0.25 },
> 'G' => { 'id' => 2, 'freq' => 0.25 },
> 'T' => { 'id' => 3, 'freq' => 0.25 },
> },
> 'gshape' => 0, # gamma shap parameter alpha
> 'pinvar' => 0, # proportion of invariant sites
> 'mu' => 1, # mutation rate paramter mu
> };
> bless $self, $class;
> return $self;
> }
>
> # gets/sets nucleotide frequencies, one at a time. Adjusts matrix.
> # example: $model->freq( 'A' ) returns 0.25
> # example: $model->freq( 'N' ) returns 1
> # example: $model->freq( 'A' => 0.5 ) scales C, G, T downward.
> # does not adjust when ambiguous symbols used, e.g.
> # $model->freq( 'N' => 0.5 ) doesn't work
> sub freq {
> my ( $self, $nuc, $freq ) = ( $_[0], uc $_[1], $_[2] );
> if ( exists $Bio::Tools::IUPAC::IUB{$nuc} ) {
> if ( exists $self->{'transl'}->{$nuc} ) {
> if ( $freq and $freq > 0 and $freq < 1 ) {
> my %tmp = ( $nuc => 1 );
> my $diff = $self->{'transl'}->{$nuc}->{'freq'} -
> $freq;
> $self->{'transl'}->{$nuc}->{'freq'} = $freq;
> for ( 'A', 'C', 'G', 'T' ) {
> next if $tmp{$_};
> $self->{'transl'}->{$_}->{'freq'} += ( $diff /
> 3 );
> }
> }
> elsif ( $freq and ( $freq <= 0 or $freq >= 1 ) ) {
> $self->throw( "Frequency must be between 0 and 1" );
> }
> return $self->{'transl'}->{$nuc}->{'freq'};
> }
> else {
> my $disambiguated_freq = 0;
> foreach ( @{ $Bio::Tools::IUPAC::IUB{$nuc} } ) {
> $disambiguated_freq += $self->{'transl'}->{$_}->
> {'freq'};
> }
> return $disambiguated_freq;
> }
> }
> else {
> $self->throw( "Nucleotide \"$nuc\" does not exist" );
> }
> }
>
> # gets/sets substitution probabilities.
> # example: $model->prob( 'A' => 'C' ) returns 0.25
> # example: $model->prob( 'A' => 'N' ) returns 1
> # example: $model->prob( 'A' => 'C', 0.5 ) scales others accordingly
> # does not adjust when ambiguous symbols used, e.g.
> # $model->prob( 'A' => 'N', 0.5 ) doesn't work
> sub prob {
> my ( $self, $from, $to, $p ) = ( $_[0], uc $_[1], uc $_[2], $_
> [3] );
> if ( exists $self->{'transl'}{$from} and exists $self->
> {'transl'}{$to} ) {
> my ( $i, $j ) = ( $self->{'transl'}{$from}{'id'}, $self->
> {'transl'}{$to}{'id'} );
> if ( $p and $p > 0 and $p < 1 ) {
> my $mat = $self->{'matrix'};
> my $scale = $mat->[$i][$j] - $p;
> $mat->[$i][$j] = $p;
> for my $f ( 0 .. 3 ) {
> for my $t ( 0 .. 3 ) {
> next if $f == $i and $t == $j;
> $mat->[$f][$t] += ( $scale / 3 ) if $f == $i or
> $t == $j;
> }
> }
> for my $f ( 0 .. 3 ) {
> next if $f == $i;
> my $rowscale = 1 - sum @{ $mat->[$f] };
> for my $t ( 0 .. 3 ) {
> next if $t == $j;
> $mat->[$f][$t] += ( $rowscale / 3 );
> }
> }
> }
> elsif ( $p and ( $p <= 0 or $p >= 1 ) ) {
> $self->throw( "Probability must be between 0 and 1" );
> }
> return $self->{'matrix'}[$i][$j];
> }
> else {
> my $disambiguated_p = 0;
> if ( exists $Bio::Tools::IUPAC::IUB{$from} and exists
> $Bio::Tools::IUPAC::IUB{$to} ) {
> my $from_array = $Bio::Tools::IUPAC::IUB{$from};
> my $to_array = $Bio::Tools::IUPAC::IUB{$to};
> foreach my $from_sym ( @{ $from_array } ) {
> my $i = $self->{'transl'}{$from_sym}{'id'};
> foreach my $to_sym ( @{ $to_array } ) {
> my $j = $self->{'transl'}{$to_sym}{'id'};
> $disambiguated_p += $self->{'matrix'}[$i][$j]
> }
> }
> return $disambiguated_p / scalar @{ $from_array };
> }
> else {
> $self->throw( "Nucleotide \"$from\" and/or \"$to\" not
> in IUPAC::IUB" );
> }
> }
> }
>
> # gets/sets mutation rate.
> # example: $model->mu() returns 1;
> # example: $model->mu( 0.5 ) doubles diagonal, scales others
> accordingly
> sub mu {
> my ( $self, $mu ) = @_;
> if ( $mu ) {
> $mu = 1 / $mu;
> my $scale = $mu - $self->{'mu'};
> my $mat = $self->{'matrix'};
> for my $i ( 0 .. $#{ $mat } ) {
> for my $j ( 0 .. $#{ $mat->[$i] } ) {
> if ( $i == $j ) {
> $mat->[$i][$j] = $mat->[$i][$j] + $mat->[$i]
> [$j] * $scale;
> }
> else {
> $mat->[$i][$j] = $mat->[$i][$j] - ( $mat->[$i]
> [$j] * $scale ) / 3;
> }
> }
> }
> $self->{'mu'} = $mu;
> }
> return $self->{'mu'};
> }
>
> # gets/sets gamma shape parameter alpha
> sub gshape {
> my ( $self, $gshape ) = @_;
> $self->{'gshape'} = $gshape if $gshape;
> return $self->{'gshape'};
> }
>
> # gets/sets proportion of invariant sites
> sub pinvar {
> my ( $self, $pinvar ) = @_;
> $self->{'pinvar'} = $pinvar if $pinvar;
> return $self->{'pinvar'};
> }
>
> # other methods: raw transition matrix input, raw frequency input,
> export to
> # and import from MrBayes/Paup/whatever model syntax, also needs
> pod and tests
>
> 1;_______________________________________________
> Bioperl-l mailing list
> Bioperl-l at lists.open-bio.org
> http://lists.open-bio.org/mailman/listinfo/bioperl-l
--
Jason Stajich
Duke University
http://www.duke.edu/~jes12
More information about the Bioperl-l
mailing list