[Bioperl-l] EST Alignment questions

Jamie Hatfield jamie@genome.arizona.edu
Fri, 1 Nov 2002 13:17:13 -0700


> From: Steven Lembark
> > I should probably just extend it to do what I want, but I'm 
> not quite
> > *that* familiar with bioperl and ooperl yet.
> 
> What you are trying to do is not all that complicated,
> mainly just deriving a class from the original BioPerl
> object that inserts the gaps (e.g., $myobj->withgaps($seq)
> or something similar).

Something along these lines??  (main changes are in add_seq)

This is the true alignment, so the real sequence and gaps are given and
GappedAlign does the rest.

cons      AAGGCCTGTA
est1    GCAA   C GTAAC
est2         GCC GTATG

=====================BEGIN GappedAlign.pm==============================
=head1 NAME

GappedAlign - Multiple (gapped) alignments held as a set of sequences

=head1 SYNOPSIS

  use Bio::GappedAlign;
  use Bio::AlignIO;
  use strict;

  my $seq;
  my $Align = new Bio::GappedAlign();

  $seq = new Bio::LocatableSeq( -seq => 'AAGGCCTGTA', -id => 'cons',
                                -start => 1, -end => 10);
  $Align->add_seq(-seq => $seq);

  $seq = new Bio::LocatableSeq( -seq => 'GCAACGTAAC', -id => 'est1',
                                -start => 1, -end => 12);
  $Align->add_seq(-seq => $seq, -offset => -2, -gaps => "4 4 4 5");

  $seq = new Bio::LocatableSeq( -seq => 'GCCGTATG', -id => 'est2',
                                -start => 1, -end => 8);
  $Align->add_seq(-seq => $seq, -offset => 3,  -gaps => "3");

  my $out   = Bio::AlignIO->new(-fh => \*STDOUT, -format => 'clustalw');
  $out->write_aln($Align);
  
=cut

# Let the code begin...

package Bio::GappedAlign;
use vars qw(@ISA);
use strict;

use Bio::Root::Root;
use Bio::SimpleAlign;

@ISA = qw(Bio::SimpleAlign);

=head2 new

 Title     : new
 Usage     : my $aln = new Bio::GappedAlign();
 Function  : Creates a new simple align object
 Returns   : Bio::GappedAlign
 Args      : -source => string representing the source program 
                        where this alignment came from

See L<Bio::SimpleAlign> for more information

=cut

sub new {
  my($class,@args) = @_;
  my $self = $class->SUPER::new(@args);

  $self->{'_far_left'} = 0;

  return $self;
}

=head2 add_seq

 Title     : add_seq
 Usage     : $myalign->add_seq();
 Function  : Adds another sequence to the alignment. *Does not* align
             it - just adds it to the hashes.
 Returns   : nothing
 Args      : -seq    => a Bio::LocatableSeq object
             -offset => integer offset from "origin"
	     -gaps   => space seperated string of integer positions
	       where there is are gaps
             -order  => which row in the alignment to put the seq in
(optional)

See L<Bio::LocatableSeq> for more information

=cut

sub add_seq {
    my ($self,@args) = @_;
    my ($seq, $offset, $gaps, $order) = 
         $self->_rearrange([qw(SEQ OFFSET GAPS ORDER)], @args);
    my $gap = $self->gap_char;
    my $s;

    $offset || ($offset = 0);
    $gaps   || ($gaps   = "");

    if( !ref $seq || ! $seq->isa('Bio::LocatableSeq') ) {
	$self->throw("Unable to process non locatable sequences [",
ref($seq), "]");
    }

    $s = $seq->seq;
    map { substr($s, $_, 0) = $gap; }
       sort {$b<=>$a} split(/ /, $gaps);

    if ($offset < $self->{'_far_left'}) {
      ## if we add a seq that is farther left than everybody else, 
      ##  we need to add gaps to other sequences at their start

      ## if you always start adding with the leftmost sequence, you
      ## don't have to worry about this additional overhead.

      my $odiff = $self->{'_far_left'} - $offset;
      my $other;

      foreach $other ( $self->each_seq() ) {
        $other->seq(($gap x $odiff) . $other->seq);
      }

      $self->{'_far_left'} = $offset;
    }
    elsif ($offset > $self->{'_far_left'}) {
      ## we've already seen a sequence farther left than this, so pad
this one

      my $odiff = $offset - $self->{'_far_left'};

      $s = ($gap x $odiff) . $s;

    }

    $seq->seq($s);

    ## let SimpleAlign do the rest of the work

    $self->SUPER::add_seq($seq, $order);
}

1;
======================END GappedAlign.pm===============================

----------------------------------------------------------------------
Jamie Hatfield                              Room 541H, Marley Building
Systems Programmer                          University of Arizona
Arizona Genomics Computational              Tucson, AZ  85721
  Laboratory (AGCoL)                        (520) 626-9598