[Bioperl-l] modification in Bio::SeqIO::kegg.pm
Brian Osborne
osborne1 at optonline.net
Fri Nov 11 16:42:35 EST 2005
Guido,
Which new fields?
Brian O.
On 11/10/05 3:04 AM, "Guido Dieterich" <Guido.Dieterich at gbf.de> wrote:
> Hi Folks,
>
> I have found some errors in the kegg.pm module.
>
> I have also added some new code to read out more FIELDS in a KEGG
> entry ...
>
> Yours
>
> Guido
>
>
> #### MY CODE #####
>
>
>
> # $Id: kegg.pm,v 1.10 2004/12/07 13:03:45 heikki Exp $
> #
> # BioPerl module for Bio::SeqIO::kegg
> #
> # Cared for by Allen Day <allenday at ucla.edu>
> #
> # Copyright Allen Day
> #
> # You may distribute this module under the same terms as perl itself
>
> # POD documentation - main docs before the code
>
> =head1 NAME
>
> Bio::SeqIO::kegg - KEGG sequence input/output stream
>
> =head1 SYNOPSIS
>
> #It is probably best not to use this object directly, but
> #rather go through the SeqIO handler system. Go:
>
> $stream = Bio::SeqIO->new(-file => $filename, -format => 'KEGG');
>
> while ( my $seq = $stream->next_seq() ) {
> # do something with $seq
> }
>
> =head1 DESCRIPTION
>
> This class transforms KEGG gene records into Bio::Seq objects.
>
> =head2 Mapping of record properties to object properties
>
> This section is supposed to document which sections and properties of
> a KEGG databank record end up where in the Bioperl object model. It
> is far from complete and presently focuses only on those mappings
> which may be non-obvious. $seq in the text refers to the
> Bio::Seq::RichSeqI implementing object returned by the parser for each
> record.
>
> =over 4
>
> =item 'ENTRY'
>
> $seq->primary_id
>
> =item 'NAME'
>
> $seq->display_id
>
> =item 'DEFINITION'
>
> $seq->annotation->get_Annotations('description');
>
> =item 'ORTHOLOG'
>
> grep {$_->database eq 'KO'} $seq->annotation->get_Annotations('dblink')
>
> =item 'CLASS'
>
> grep {$_->database eq 'PATH'}
> $seq->annotation->get_Annotations('dblink')
>
> =item 'POSITION'
>
> FIXME, NOT IMPLEMENTED
>
> =item 'PATHWAY'
>
> for my $pathway ( $seq->annotation->get_Annotations('pathway') ) {
>
> }
>
> =item 'DBLINKS'
>
> $seq->annotation->get_Annotations('dblink')
>
> =item 'CODON_USAGE'
>
> FIXME, NOT IMPLEMENTED
>
> =item 'AASEQ'
>
> $seq->translate->seq
>
> =item 'NTSEQ'
>
> $seq-E<gt>seq
>
> =back
>
> =head1 FEEDBACK
>
> =head2 Mailing Lists
>
> User feedback is an integral part of the evolution of this
> and other Bioperl modules. Send your comments and suggestions preferably
> to one of the Bioperl mailing lists.
> Your participation is much appreciated.
>
> bioperl-l at bioperl.org - General discussion
> http://www.bioperl.org/MailList.shtml - About the mailing lists
>
> =head2 Reporting Bugs
>
> Report bugs to the Bioperl bug tracking system to help us keep track
> the bugs and their resolution.
> Bug reports can be submitted via email or the web:
>
> http://bugzilla.bioperl.org/
>
> =head1 AUTHOR - Allen Day
>
> Email allenday at ucla.edu
>
> =head1 APPENDIX
>
> The rest of the documentation details each of the object
> methods. Internal methods are usually preceded with a _
>
> =cut
>
> # Let the code begin...
>
> package Bio::SeqIO::kegg;
> use vars qw(@ISA);
> use strict;
>
> use Bio::SeqIO;
> use Bio::SeqFeature::Generic;
> use Bio::Species;
> use Bio::Seq::SeqFactory;
> use Bio::Annotation::Collection;
> use Bio::Annotation::Comment;
> use Bio::Annotation::DBLink;
>
> @ISA = qw(Bio::SeqIO);
>
> sub _initialize {
> my($self, at args) = @_;
>
> $self->SUPER::_initialize(@args);
> # hash for functions for decoding keys.
> $self->{'_func_ftunit_hash'} = {};
> if( ! defined $self->sequence_factory ) {
> $self->sequence_factory(new Bio::Seq::SeqFactory
> (-verbose => $self->verbose(),
> -type => 'Bio::Seq::RichSeq'));
> }
> }
>
> =head2 next_seq
>
> Title : next_seq
> Usage : $seq = $stream->next_seq()
> Function: returns the next sequence in the stream
> Returns : Bio::Seq::RichSeq object
> Args :
>
> =cut
>
> sub next_seq {
> my ($self, at args) = @_;
> my $builder = $self->sequence_builder();
> my $seq;
> my %params;
>
> my $buffer;
> my (@acc, @features);
> my ($display_id, $annotation);
> my $species;
>
> # initialize; we may come here because of starting over
> @features = ();
> $annotation = undef;
> @acc = ();
> $species = undef;
> %params = (-verbose => $self->verbose); # reset hash
> local($/) = "///\n";
>
> $buffer = $self->_readline();
>
> return undef if( !defined $buffer ); # end of file
> $buffer =~ /^ENTRY/ ||
> $self->throw("KEGG stream with bad ENTRY line. Not KEGG in my book. Got
> '$buffer'");
>
> my %FIELDS;
> my @chunks = split /\n(?=\S)/, $buffer;
>
> foreach my $chunk (@chunks){
> my($key) = $chunk =~ /^(\S+)/;
> $FIELDS{$key} = $chunk;
> }
>
> # my($entry_id,$entry_seqtype,$entry_species) = $FIELDS{ENTRY}
> =~ /^ENTRY\s+(\d+)\s+(\S+)\s+(\S+)\s*$/;
> # changing to split method to get entry_ids that include sequence
> version like Whatever.1
> my(undef,$entry_id,$entry_seqtype,$entry_species) = split(' ',
> $FIELDS{ENTRY});
>
> my($name);
> if ($FIELDS{NAME}) {
> ($name) = $FIELDS{NAME} =~ /^NAME\s+(.+)$/;
> }
>
> my($definition) = $FIELDS{DEFINITION} =~ /^DEFINITION\s+(.+)$/s;
> $definition =~ s/\s+/ /gs;
>
> my($aa_length,$aa_seq) = $FIELDS{AASEQ} =~ /^AASEQ\s+(\d+)\n(.+)$/s;
> $aa_seq =~ s/\s+//g;
> my($nt_length,$nt_seq) = $FIELDS{NTSEQ} =~ /^NTSEQ\s+(\d+)\n(.+)$/s;
> $nt_seq =~ s/\s+//g;
>
> $annotation = Bio::Annotation::Collection->new();
>
> $annotation->add_Annotation('description',Bio::Annotation::Comment->new(-text
> => $definition));
> $annotation->add_Annotation('aa_seq',
> Bio::Annotation::Comment->new(-text => $aa_seq)); ################# NEW
>
> my($ortholog_db,$ortholog_id,$ortholog_desc);
> if ($FIELDS{ORTHOLOG}) {
> ($ortholog_db,$ortholog_id,$ortholog_desc) = $FIELDS{ORTHOLOG}
> =~ /^ORTHOLOG\s+(\S+):\s+(\S+)\s+(.*?)$/;
>
> $annotation->add_Annotation('dblink',Bio::Annotation::DBLink->new(-database =>
> $ortholog_db,
>
> -primary_id => $ortholog_id,
>
> -comment => $ortholog_desc)
> );
> }
> ################ NEW
> if($FIELDS{MOTIF}){
> $FIELDS{MOTIF} =~ s/^MOTIF\s+//;
> while($FIELDS{MOTIF} =~/\s*?(\S+):\s+(.+?)$/mg){
> my $db = $1;
> my $ids = $2;
> foreach my $id (split(/\s+/, $ids)){
>
> $annotation->add_Annotation('dblink',Bio::Annotation::DBLink->new(-database
> =>$db,
>
> -primary_id => $id,
>
> -comment => "")
> );
> }
> }
> }
>
>
> if($FIELDS{PATHWAY}){
> $FIELDS{PATHWAY} =~ s/^PATHWAY\s+//;
> while($FIELDS{PATHWAY} =~ /\s*PATH:\s+(.+)$/mg){
> $annotation->add_Annotation('pathway',
> Bio::Annotation::Comment->new(-text => "$1"));
> }
> }
> ####################################
>
> if ($FIELDS{CLASS}) {
> $FIELDS{CLASS} =~ s/^CLASS\s+//;
> $FIELDS{'CLASS'} =~ s/\n//g;
> while($FIELDS{CLASS} =~ /(.*?)\[(\S+):(\S+)\]/g){
> my ($pathway,$db,$id) = ($1,$2,$3);
> $pathway =~ s/\s+/ /g;
> $pathway =~ s/\s$//g;
> $pathway =~ s/^\s+//;
> $annotation->add_Annotation('pathway',
> Bio::Annotation::Comment->new(-text => $pathway));
>
> $annotation->add_Annotation('dblink',Bio::Annotation::DBLink->new(-database =>
> $db, -primary_id => $id));
> }
> }
>
>
> if($FIELDS{DBLINKS}) {
> $FIELDS{DBLINKS} =~ s/^DBLINKS/ /;
> #print $FIELDS{DBLINKS}, "\n";
> while($FIELDS{DBLINKS} =~ /\s+(\S+):\s+(\S+)\n?/gs){ ### modified
> #print "\$1 $1 \$2 $2\n";
>
> $annotation->add_Annotation('dblink',Bio::Annotation::DBLink->new(-database =>
> $1, -primary_id => $2)) if $1;
> }
> }
>
> $params{'-alphabet'} = 'dna';
> $params{'-seq'} = $nt_seq;
> $params{'-display_id'} = $name;
> $params{'-accession_number'} = $entry_id;
> $params{'-species'} = Bio::Species->new(-common_name =>
> $entry_species);
> $params{'-annotation'} = $annotation;
>
> $builder->add_slot_value(%params);
> $seq = $builder->make_object();
>
> return $seq;
> }
>
> 1;
>
> _______________________________________________
> Bioperl-l mailing list
> Bioperl-l at portal.open-bio.org
> http://portal.open-bio.org/mailman/listinfo/bioperl-l
More information about the Bioperl-l
mailing list