[BioRuby-cvs] bioruby/lib/bio/io pubmed.rb, 1.12, 1.13 fetch.rb, 1.4, 1.5

Thu Mar 16 17:29:07 UTC 2006

Update of /home/repository/bioruby/bioruby/lib/bio/io
In directory pub.open-bio.org:/tmp/cvs-serv7087

Modified Files:
	pubmed.rb fetch.rb 
Log Message:
* Added documentation to pubmed.rb and fetch.rb
* For fetch.rb: replaced 'net/http' with 'open-uri' to allow people behind a proxy to use this class.

Index: pubmed.rb
===================================================================
RCS file: /home/repository/bioruby/bioruby/lib/bio/io/pubmed.rb,v
retrieving revision 1.12
retrieving revision 1.13
diff -C2 -d -r1.12 -r1.13
*** pubmed.rb	8 Sep 2005 01:22:12 -0000	1.12
--- pubmed.rb	16 Mar 2006 17:29:05 -0000	1.13
***************
*** 3,6 ****
--- 3,7 ----
  #
  #   Copyright (C) 2001 KATAYAMA Toshiaki <k at bioruby.org>
+ #                 2006 Jan Aerts <jan.aerts at bbsrc.ac.uk>
  #
  #  This library is free software; you can redistribute it and/or
***************
*** 26,61 ****
  module Bio

    class PubMed

!     def self.query(id)
!       host = "www.ncbi.nlm.nih.gov"
!       path = "/entrez/query.fcgi?tool=bioruby&cmd=Text&dopt=MEDLINE&db=PubMed&uid="
! 
!       http = Net::HTTP.new(host)
!       response, = http.get(path + id.to_s)
!       result = response.body
!       if result =~ /#{id}\s+Error/
!         raise( result )
!       else
!         result = result.gsub("\r", "\n").squeeze("\n").gsub(/<\/?pre>/, '')
!         return result
!       end
!     end
! 
!     def self.pmfetch(id)
!       host = "www.ncbi.nlm.nih.gov"
!       path = "/entrez/utils/pmfetch.fcgi?tool=bioruby&mode=text&report=medline&db=PubMed&id="
! 
!       http = Net::HTTP.new(host)
!       response, = http.get(path + id.to_s)
!       result = response.body
!       if result =~ /#{id}\s+Error/
!         raise( result )
!       else
!         result = result.gsub("\r", "\n").squeeze("\n").gsub(/<\/?pre>/, '')
!         return result
!       end
!     end
! 
      def self.search(str)
        host = "www.ncbi.nlm.nih.gov"
--- 27,85 ----
  module Bio

+   # = DESCRIPTION
+   # The Bio::PubMed class provides several ways to retrieve bibliographic
+   # information from the PubMed database at
+   # http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?db=PubMed. Basically, two
+   # types of queries are possible:
+   # * searching for PubMed IDs given a query string:
+   #   * Bio::PubMed.search
+   #   * Bio::PubMed.esearch
+   # * retrieving the MEDLINE text (i.e. authors, journal, abstract, ...) given a PubMed ID
+   #   * Bio::PubMed.query
+   #   * Bio::PubMed.pmfetch
+   #   * Bio::PubMed.efetch
+   #
+   # The different methods within the same group are interchangeable and should
+   # return the same result.
+   # 
+   # Additional information about the MEDLINE format and PubMed programmable
+   # APIs can be found on the following websites:
+   # * Overview: http://www.ncbi.nlm.nih.gov/entrez/query/static/overview.html
+   # * How to link: http://www.ncbi.nlm.nih.gov/entrez/query/static/linking.html
+   # * MEDLINE format: http://www.ncbi.nlm.nih.gov/entrez/query/static/help/pmhelp.html#MEDLINEDisplayFormat
+   # * Search field descriptions and tags: http://www.ncbi.nlm.nih.gov/entrez/query/static/help/pmhelp.html#SearchFieldDescriptionsandTags
+   # * Entrez utilities index: http://www.ncbi.nlm.nih.gov/entrez/utils/utils_index.html
+   # * PmFetch CGI help: http://www.ncbi.nlm.nih.gov/entrez/utils/pmfetch_help.html
+   # * E-Utilities CGI help: http://eutils.ncbi.nlm.nih.gov/entrez/query/static/eutils_help.html
+   #
+   # = USAGE
+   #  require 'bio'
+   #
+   #  # If you don't know the pubmed ID:
+   #  Bio::PubMed.search("(genome AND analysis) OR bioinformatics").each do |x|
+   #    p x
+   #  end
+   #  Bio::PubMed.esearch("(genome AND analysis) OR bioinformatics").each do |x|
+   #    p x
+   #  end
+   #  
+   #  # To retrieve the MEDLINE entry for a given PubMed ID:
+   #  puts Bio::PubMed.query("10592173")
+   #  puts Bio::PubMed.pmfetch("10592173")
+   #  puts Bio::PubMed.efetch("10592173", "14693808")
+   #  # This can be converted into a Bio::MEDLINE object:
+   #  manuscript = Bio::PubMed.query("10592173")
+   #  medline = Bio::MEDLINE.new(manuscript)
+   #  
+   # = REMARK
+   # This class cannot currently be used from behind a proxy server. This
+   # will be solved in the near future.
    class PubMed

!     # Searches the PubMed database for the given keywords using entrez query
!     # and returns an array of PubMed IDs.
!     # ---
!     # *Arguments*:
!     # * _str_: query string (required)
!     # *Returns*:: array of PubMed IDs
      def self.search(str)
        host = "www.ncbi.nlm.nih.gov"
***************
*** 70,73 ****
--- 94,115 ----
      end

+     # Search the PubMed database by given keywords using E-Utils and returns 
+     # an array of PubMed IDs.
+     # 
+     # For information on the possible arguments, see
+     # http://eutils.ncbi.nlm.nih.gov/entrez/query/static/esearch_help.html#PubMed
+     # ---
+     # *Arguments*:
+     # * _str_: query string (required)
+     # * _hash_: optional hash of E-Utils parameters; recognised String keys:
+     #   * 'field'
+     #   * 'reldate'
+     #   * 'mindate'
+     #   * 'maxdate'
+     #   * 'datetype'
+     #   * 'retstart'
+     #   * 'retmax' (default 100)
+     #   * 'retmode'
+     #   * 'rettype'
+     # *Returns*:: array of PubMed IDs
      def self.esearch(str, hash = {})
        hash['retmax'] = 100 unless hash['retmax']
***************
*** 88,91 ****
--- 130,184 ----
      end

+     # Retrieve PubMed entry by PMID and returns MEDLINE formatted string using
+     # entrez query.
+     # ---
+     # *Arguments*:
+     # * _id_: PubMed ID (required)
+     # *Returns*:: MEDLINE formatted String
+     def self.query(id)
+       host = "www.ncbi.nlm.nih.gov"
+       path = "/entrez/query.fcgi?tool=bioruby&cmd=Text&dopt=MEDLINE&db=PubMed&uid="
+ 
+       http = Net::HTTP.new(host)
+       response, = http.get(path + id.to_s)
+       result = response.body
+       if result =~ /#{id}\s+Error/
+         raise( result )
+       else
+         result = result.gsub("\r", "\n").squeeze("\n").gsub(/<\/?pre>/, '')
+         return result
+       end
+     end
+ 
+     # Retrieve PubMed entry by PMID and returns MEDLINE formatted string using
+     # entrez pmfetch.
+     # ---
+     # *Arguments*:
+     # * _id_: PubMed ID (required)
+     # *Returns*:: MEDLINE formatted String
+     def self.pmfetch(id)
+       host = "www.ncbi.nlm.nih.gov"
+       path = "/entrez/utils/pmfetch.fcgi?tool=bioruby&mode=text&report=medline&db=PubMed&id="
+ 
+       http = Net::HTTP.new(host)
+       response, = http.get(path + id.to_s)
+       result = response.body
+       if result =~ /#{id}\s+Error/
+         raise( result )
+       else
+         result = result.gsub("\r", "\n").squeeze("\n").gsub(/<\/?pre>/, '')
+         return result
+       end
+     end
+ 
+     # Retrieves PubMed entries by PMID using entrez efetch and returns them as
+     # MEDLINE formatted strings. Multiple PubMed IDs can be provided:
+     #   Bio::PubMed.efetch(123)
+     #   Bio::PubMed.efetch(123,456,789)
+     #   Bio::PubMed.efetch([123,456,789])
+     # ---
+     # *Arguments*:
+     # * _ids_: list of PubMed IDs (required)
+     # *Returns*:: array of MEDLINE formatted Strings (empty array if no IDs are given)
      def self.efetch(*ids)
        return [] if ids.empty?
***************
*** 125,189 ****

  end
- 
- =begin
- 
- = Bio::PubMed
- 
- These class methods access NCBI/PubMed database via HTTP.
- 
- --- Bio::PubMed.esearch(str, options)
- 
-       Search keywords in PubMed by E-Utils and returns an array of PubMed IDs.
-       Options can be a hash containing keys include 'field', 'reldate',
-       'mindate', 'maxdate', 'datetype', 'retstart', 'retmax', 'retmode',
-       and 'rettype' as specified in the following URL:
- 
-         ((<URL:http://eutils.ncbi.nlm.nih.gov/entrez/query/static/esearch_help.html#PubMed>))
- 
-      Default 'retmax' is 100.
- 
- --- Bio::PubMed.efetch(pmids)
- 
-       Returns an array of MEDLINE records.  A list of PubMed IDs can be
-       supplied as following:
- 
-         Bio::PubMed.efetch(123)
-         Bio::PubMed.efetch(123,456,789)
-         Bio::PubMed.efetch([123,456,789])
- 
- --- Bio::PubMed.query(pmid)
- 
-       Retrieve PubMed entry by PMID and returns MEDLINE format string (can
-       be parsed by the Bio::MEDLINE and can be converted into Bio::Reference
-       object).
- 
- --- Bio::PubMed.pmfetch(pmid)
- 
-       Just another query method (by pmfetch).
- 
- --- Bio::PubMed.search(str)
- 
-       Search the PubMed database by given keywords and returns the list of
-       matched records in MEDLINE format.
- 
- 
- = For more informations
- 
- * Overview
-   * ((<URL:http://www.ncbi.nlm.nih.gov/entrez/query/static/overview.html>))
- * How to link
-   * ((<URL:http://www.ncbi.nlm.nih.gov/entrez/query/static/linking.html>))
- * MEDLINE format
-   * ((<URL:http://www.ncbi.nlm.nih.gov/entrez/query/static/help/pmhelp.html#MEDLINEDisplayFormat>))
- * Search field descriptions and tags
-   * ((<URL:http://www.ncbi.nlm.nih.gov/entrez/query/static/help/pmhelp.html#SearchFieldDescriptionsandTags>))
- * Entrez utilities index
-   * ((<URL:http://www.ncbi.nlm.nih.gov/entrez/utils/utils_index.html>))
- * PmFetch CGI help
-   * ((<URL:http://www.ncbi.nlm.nih.gov/entrez/utils/pmfetch_help.html>))
- * E-Utilities CGI help
-   * ((<URL:http://eutils.ncbi.nlm.nih.gov/entrez/query/static/eutils_help.html>))
- 
- =end
- 
- 
--- 218,219 ----

Index: fetch.rb
===================================================================
RCS file: /home/repository/bioruby/bioruby/lib/bio/io/fetch.rb,v
retrieving revision 1.4
retrieving revision 1.5
diff -C2 -d -r1.4 -r1.5
*** fetch.rb	18 Dec 2005 15:58:42 -0000	1.4
--- fetch.rb	16 Mar 2006 17:29:05 -0000	1.5
***************
*** 1,12 ****
  #
! # = bio/io/biofetch.rb - BioFetch access module
! #
! # Copyright::   Copyright (C) 2002, 2005
! #               Toshiaki Katayama <k at bioruby.org>
! # License::     LGPL
  #
! # $Id$
  #
- #--
  #
  #  This library is free software; you can redistribute it and/or
--- 1,10 ----
  #
! # bio/io/fetch.rb - BioFetch access module
  #
! #  Copyright (C) 2002, 2005 Toshiaki Katayama <k at bioruby.org>
! #               2006 Jan Aerts <jan.aerts at bbsrc.ac.uk>
! #
! #  License: LGPL
  #
  #
  #  This library is free software; you can redistribute it and/or
***************
*** 24,95 ****
  #  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307  USA
  #
! #++
  #

  require 'uri'
! require 'net/http'

  module Bio

! class Fetch
! 
!   # Create a new Bio::Fetch server object.
!   # Use Bio::Fetch.new('http://www.ebi.ac.uk/cgi-bin/dbfetch') to connect
!   # to EBI BioFetch server.
!   def initialize(url = 'http://bioruby.org/cgi-bin/biofetch.rb')
!     schema, user, @host, @port, reg, @path, = URI.split(url)
!   end
! 
!   # Set default database to dbname (prepare for get_by_id).
!   attr_accessor :database
! 
!   # Get raw database entry by id (mainly used by Bio::Registry).
!   def get_by_id(id)
!     fetch(@database, id)
!   end
! 
!   # Fetch a database entry as specified by database (db), entry id (id),
!   # 'raw' text or 'html' (style), and format.  When using BioRuby's
!   # BioFetch server, value for the format should not be set.
!   def fetch(db, id, style = 'raw', format = nil)
!     data = [ "db=#{db}", "id=#{id}", "style=#{style}" ]
!     data.push("format=#{format}") if format
!     data = data.join('&')
! 
!     responce, result = Net::HTTP.new(@host, @port).post(@path, data)
!     return result
!   end
! 
!   # Short cut for using BioRuby's BioFetch server.  You can fetch an entry
!   # without creating instance of BioFetch server.
!   def self.query(*args)
!     self.new.fetch(*args)
!   end

!   # What databases are available?
!   def databases
!     query = "info=dbs"
!     responce, result = Net::HTTP.new(@host, @port).post(@path, query)
!     return result
!   end

!   # What formats does the database X have?
!   def formats(database = @database)
!     if database
!       query = "info=formats;db=#{database}"
!       responce, result = Net::HTTP.new(@host, @port).post(@path, query)
        return result
      end
    end

-   # How many entries can be retrieved simultaneously?
-   def maxids
-     query = "info=maxids"
-     responce, result = Net::HTTP.new(@host, @port).post(@path, query)
-     return result
-   end
- 
- end
- 
  end # module Bio

--- 22,183 ----
  #  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307  USA
  #
! #  $Id$
  #

  require 'uri'
! require 'open-uri'

  module Bio
+   # = DESCRIPTION
+   # The Bio::Fetch class provides an interface to dbfetch servers. Given
+   # a database name and an accession number, these servers return the nucleic
+   # or amino acid sequence for that accession number in that database.
+   #
+   # Possible dbfetch servers include:
+   # * http://bioruby.org/cgi-bin/biofetch.rb (default)
+   # * http://www.ebi.ac.uk/cgi-bin/dbfetch
+   #
+   # If you're behind a proxy server, be sure to set your HTTP_PROXY
+   # environment variable accordingly.
+   #
+   # = USAGE
+   #  require 'bio'
+   #
+   #  # Retrieve the sequence of accession number M33388 from the EMBL
+   #  # database.
+   #  server = Bio::Fetch.new()  #uses default server
+   #  puts server.fetch('embl','M33388')
+   #  
+   #  # Do the same thing without creating a Bio::Fetch object. This method always
+   #  # uses the default dbfetch server: http://bioruby.org/cgi-bin/biofetch.rb
+   #  puts Bio::Fetch.query('embl','M33388')
+   #
+   #  # To know what databases are available on the bioruby dbfetch server:
+   #  server = Bio::Fetch.new()
+   #  puts server.databases
+   #
+   #  # Some databases provide their data in different formats (e.g. 'fasta',
+   #  # 'genbank' or 'embl'). To check which formats are supported by a given
+   #  # database:
+   #  puts server.formats('embl')
+   #
+   class Fetch
+   
+     # Create a new Bio::Fetch server object that can subsequently be queried
+     # using the Bio::Fetch#fetch method
+     # ---
+     # *Arguments*:
+     # * _url_: URL of dbfetch server (default = 'http://bioruby.org/cgi-bin/biofetch.rb')
+     # *Returns*:: Bio::Fetch object
+     def initialize(url = 'http://bioruby.org/cgi-bin/biofetch.rb')
+       @url = url
+       schema, user, @host, @port, reg, @path, = URI.split(@url)
+     end
+   
+     # The default database to query
+     #--
+     # This will be used by the get_by_id method
+     #++
+     attr_accessor :database
+   
+     # Get raw database entry by id. This method lets the Bio::Registry class
+     # use Bio::Fetch objects and should probably not be used directly.
+     def get_by_id(id)
+       fetch(@database, id)
+     end
+   
+     # Fetch a database entry as specified by database (db), entry id (id),
+     # 'raw' text or 'html' (style), and format.  When using BioRuby's
+     # BioFetch server, value for the format should not be set.
+     # Examples:
+     #   server = Bio::Fetch.new('http://www.ebi.ac.uk/cgi-bin/dbfetch')
+     #   puts server.fetch('embl','M33388','raw','fasta')
+     #   puts server.fetch('refseq','NM_12345','html','embl')
+     # ---
+     # *Arguments*:
+     # * _db_: name of database to query (see Bio::Fetch#databases to get list of supported databases)
+     # * _id_: single ID or ID list separated by commas or white space
+     # * _style_: [raw|html] (default = 'raw')
+     # * _format_: name of output format (see Bio::Fetch#formats)
+     def fetch(db, id, style = 'raw', format = nil)
+       query = [ "db=#{db}", "id=#{id}", "style=#{style}" ]
+       query.push("format=#{format}") if format
+       query = query.join('&')
+   
+       result = open(@url + '?' + query).readlines.join('')
+       return result
+     end
+   
+     # Shortcut for using BioRuby's BioFetch server. You can fetch an entry
+     # without creating an instance of BioFetch server. This method uses the 
+     # default dbfetch server, which is http://bioruby.org/cgi-bin/biofetch.rb
+     # 
+     # Example:
+     #   puts Bio::Fetch.query('refseq','NM_12345')
+     #
+     # ---
+     # *Arguments*:
+     # * _db_: name of database to query (see Bio::Fetch#databases to get list of supported databases)
+     # * _id_: single ID or ID list separated by commas or white space
+     # * _style_: [raw|html] (default = 'raw')
+     # * _format_: name of output format (see Bio::Fetch#formats)
+     def self.query(*args)
+       self.new.fetch(*args)
+     end
+   
+     # Using this method, the user can ask a dbfetch server what databases
+     # it supports. This would normally be the first step you'd take when
+     # you use a dbfetch server for the first time.
+     # Example:
+     #  server = Bio::Fetch.new()
+     #  puts server.databases # returns "aa aax bl cpd dgenes dr ec eg emb ..."
+     #
+     # This method only works for the bioruby dbfetch server. For a list
+     # of databases available from the EBI, see the EBI website at 
+     # http://www.ebi.ac.uk/cgi-bin/dbfetch/
+     # ---
+     # *Returns*:: String containing the database names, as returned by the server
+     def databases
+       query = "info=dbs"

!       result = open(@url + '?' + query).readlines.join('')
!       return result
!     end
!   
!     # Lists the formats that are available for a given database. Like the
!     # Bio::Fetch#databases method, this method is only available on 
!     # the bioruby dbfetch server.
!     # Example:
!     #  server = Bio::Fetch.new()
!     #  puts server.formats('embl') # returns "default fasta"
!     # ---
!     # *Arguments*:
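!     # * _database_: name of database you want the supported formats for (default = the database attribute)
!     # *Returns*:: String containing the formats, as returned by the server; nil if no database is given or set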
!     def formats(database = @database)
!       if database
!         query = "info=formats;db=#{database}"

!         result = open(@url + '?' + query).readlines.join('')
!         return result
!       end
!     end
!   
!     # A dbfetch server will only return entries up to a given maximum number.
!     # This method retrieves that number from the server. As for the databases
!     # and formats methods, the maxids method only works for the bioruby
!     # dbfetch server.
!     # ---
!     # *Arguments*: none
!     # *Returns*:: String containing the server's response (not converted to a number)
!     def maxids
!       query = "info=maxids"

!       result = open(@url + '?' + query).readlines.join('')
        return result
      end
+   
    end

  end # module Bio

***************
*** 98,113 ****
  if __FILE__ == $0

- # bfserv = Bio::Fetch.new('http://www.ebi.ac.uk:80/cgi-bin/dbfetch')
-   bfserv = Bio::Fetch.new('http://www.ebi.ac.uk/cgi-bin/dbfetch')
    puts "# test 1"
!   puts bfserv.fetch('embl', 'J00231', 'raw')
    puts "# test 2"
!   puts bfserv.fetch('embl', 'J00231', 'html')
! 
    puts "# test 3"
!   puts Bio::Fetch.query('genbank', 'J00231')
    puts "# test 4"
    puts Bio::Fetch.query('genbank', 'J00231', 'raw', 'fasta')
! 
  end

--- 186,204 ----
  if __FILE__ == $0

    puts "# test 1"
!   br_server = Bio::Fetch.new()
!   puts br_server.databases
!   puts br_server.formats('embl')
!   puts br_server.maxids
!   ebi_server = Bio::Fetch.new('http://www.ebi.ac.uk/cgi-bin/dbfetch')
    puts "# test 2"
!   puts ebi_server.fetch('embl', 'J00231', 'raw')
    puts "# test 3"
!   puts ebi_server.fetch('embl', 'J00231', 'html')
    puts "# test 4"
+   puts Bio::Fetch.query('genbank', 'J00231')
+   puts "# test 5"
    puts Bio::Fetch.query('genbank', 'J00231', 'raw', 'fasta')
!  
  end