[BioRuby-cvs] bioruby/lib/bio/io pubmed.rb, 1.12, 1.13 fetch.rb, 1.4, 1.5
Jan Aerts
aerts at pub.open-bio.org
Thu Mar 16 17:29:07 UTC 2006
Update of /home/repository/bioruby/bioruby/lib/bio/io
In directory pub.open-bio.org:/tmp/cvs-serv7087
Modified Files:
pubmed.rb fetch.rb
Log Message:
* Added documentation to pubmed.rb and fetch.rb
* For fetch.rb: replaced 'net/http' with 'open-uri' to allow people behind a proxy to use this class.
Index: pubmed.rb
===================================================================
RCS file: /home/repository/bioruby/bioruby/lib/bio/io/pubmed.rb,v
retrieving revision 1.12
retrieving revision 1.13
diff -C2 -d -r1.12 -r1.13
*** pubmed.rb 8 Sep 2005 01:22:12 -0000 1.12
--- pubmed.rb 16 Mar 2006 17:29:05 -0000 1.13
***************
*** 3,6 ****
--- 3,7 ----
#
# Copyright (C) 2001 KATAYAMA Toshiaki <k at bioruby.org>
+ # 2006 Jan Aerts <jan.aerts at bbsrc.ac.uk>
#
# This library is free software; you can redistribute it and/or
***************
*** 26,61 ****
module Bio
class PubMed
! def self.query(id)
! host = "www.ncbi.nlm.nih.gov"
! path = "/entrez/query.fcgi?tool=bioruby&cmd=Text&dopt=MEDLINE&db=PubMed&uid="
!
! http = Net::HTTP.new(host)
! response, = http.get(path + id.to_s)
! result = response.body
! if result =~ /#{id}\s+Error/
! raise( result )
! else
! result = result.gsub("\r", "\n").squeeze("\n").gsub(/<\/?pre>/, '')
! return result
! end
! end
!
! def self.pmfetch(id)
! host = "www.ncbi.nlm.nih.gov"
! path = "/entrez/utils/pmfetch.fcgi?tool=bioruby&mode=text&report=medline&db=PubMed&id="
!
! http = Net::HTTP.new(host)
! response, = http.get(path + id.to_s)
! result = response.body
! if result =~ /#{id}\s+Error/
! raise( result )
! else
! result = result.gsub("\r", "\n").squeeze("\n").gsub(/<\/?pre>/, '')
! return result
! end
! end
!
def self.search(str)
host = "www.ncbi.nlm.nih.gov"
--- 27,85 ----
module Bio
+ # = DESCRIPTION
+ # The Bio::PubMed class provides several ways to retrieve bibliographic
+ # information from the PubMed database at
+ # http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?db=PubMed. Basically, two
+ # types of queries are possible:
+ # * searching for PubMed IDs given a query string:
+ # * Bio::PubMed.search
+ # * Bio::PubMed.esearch
+ # * retrieving the MEDLINE text (i.e. authors, journal, abstract, ...) given a PubMed ID:
+ # * Bio::PubMed.query
+ # * Bio::PubMed.pmfetch
+ # * Bio::PubMed.efetch
+ #
+ # The different methods within the same group are interchangeable and should
+ # return the same result.
+ #
+ # Additional information about the MEDLINE format and PubMed programmable
+ # APIs can be found on the following websites:
+ # * Overview: http://www.ncbi.nlm.nih.gov/entrez/query/static/overview.html
+ # * How to link: http://www.ncbi.nlm.nih.gov/entrez/query/static/linking.html
+ # * MEDLINE format: http://www.ncbi.nlm.nih.gov/entrez/query/static/help/pmhelp.html#MEDLINEDisplayFormat
+ # * Search field descriptions and tags: http://www.ncbi.nlm.nih.gov/entrez/query/static/help/pmhelp.html#SearchFieldDescriptionsandTags
+ # * Entrez utilities index: http://www.ncbi.nlm.nih.gov/entrez/utils/utils_index.html
+ # * PmFetch CGI help: http://www.ncbi.nlm.nih.gov/entrez/utils/pmfetch_help.html
+ # * E-Utilities CGI help: http://eutils.ncbi.nlm.nih.gov/entrez/query/static/eutils_help.html
+ #
+ # = USAGE
+ # require 'bio'
+ #
+ # # If you don't know the pubmed ID:
+ # Bio::PubMed.search("(genome AND analysis) OR bioinformatics").each do |x|
+ # p x
+ # end
+ # Bio::PubMed.esearch("(genome AND analysis) OR bioinformatics").each do |x|
+ # p x
+ # end
+ #
+ # # To retrieve the MEDLINE entry for a given PubMed ID:
+ # puts Bio::PubMed.query("10592173")
+ # puts Bio::PubMed.pmfetch("10592173")
+ # puts Bio::PubMed.efetch("10592173", "14693808")
+ # # This can be converted into a Bio::MEDLINE object:
+ # manuscript = Bio::PubMed.query("10592173")
+ # medline = Bio::MEDLINE.new(manuscript)
+ #
+ # = REMARK
+ # This class can not be used at the moment if you're behind a proxy server. This will be solved in the near future.
class PubMed
! # Search the PubMed database by given keywords using entrez query and returns
! # an array of PubMed IDs.
! # ---
! # *Arguments*:
! # * _str_: query string (required)
! # *Returns*:: array of PubMed IDs
def self.search(str)
host = "www.ncbi.nlm.nih.gov"
***************
*** 70,73 ****
--- 94,115 ----
end
+ # Search the PubMed database by given keywords using E-Utils and returns
+ # an array of PubMed IDs.
+ #
+ # For information on the possible arguments, see
+ # http://eutils.ncbi.nlm.nih.gov/entrez/query/static/esearch_help.html#PubMed
+ # ---
+ # *Arguments*:
+ # * _str_: query string (required)
+ # * _field_
+ # * _reldate_
+ # * _mindate_
+ # * _maxdate_
+ # * _datetype_
+ # * _retstart_
+ # * _retmax_ (default 100)
+ # * _retmode_
+ # * _rettype_
+ # *Returns*:: array of PubMed IDs
def self.esearch(str, hash = {})
hash['retmax'] = 100 unless hash['retmax']
***************
*** 88,91 ****
--- 130,184 ----
end
+ # Retrieve PubMed entry by PMID and returns MEDLINE formatted string using
+ # entrez query.
+ # ---
+ # *Arguments*:
+ # * _id_: PubMed ID (required)
+ # *Returns*:: MEDLINE formatted String
+ def self.query(id)
+ host = "www.ncbi.nlm.nih.gov"
+ path = "/entrez/query.fcgi?tool=bioruby&cmd=Text&dopt=MEDLINE&db=PubMed&uid="
+
+ http = Net::HTTP.new(host)
+ response, = http.get(path + id.to_s)
+ result = response.body
+ if result =~ /#{id}\s+Error/
+ raise( result )
+ else
+ result = result.gsub("\r", "\n").squeeze("\n").gsub(/<\/?pre>/, '')
+ return result
+ end
+ end
+
+ # Retrieve PubMed entry by PMID and returns MEDLINE formatted string using
+ # entrez pmfetch.
+ # ---
+ # *Arguments*:
+ # * _id_: PubMed ID (required)
+ # *Returns*:: MEDLINE formatted String
+ def self.pmfetch(id)
+ host = "www.ncbi.nlm.nih.gov"
+ path = "/entrez/utils/pmfetch.fcgi?tool=bioruby&mode=text&report=medline&db=PubMed&id="
+
+ http = Net::HTTP.new(host)
+ response, = http.get(path + id.to_s)
+ result = response.body
+ if result =~ /#{id}\s+Error/
+ raise( result )
+ else
+ result = result.gsub("\r", "\n").squeeze("\n").gsub(/<\/?pre>/, '')
+ return result
+ end
+ end
+
+ # Retrieve PubMed entries by PMID using entrez efetch and return them as
+ # an array of MEDLINE formatted strings. Multiple PubMed IDs can be provided:
+ # Bio::PubMed.efetch(123)
+ # Bio::PubMed.efetch(123,456,789)
+ # Bio::PubMed.efetch([123,456,789])
+ # ---
+ # *Arguments*:
+ # * _ids_: list of PubMed IDs (required)
+ # *Returns*:: array of MEDLINE formatted Strings
def self.efetch(*ids)
return [] if ids.empty?
***************
*** 125,189 ****
end
-
- =begin
-
- = Bio::PubMed
-
- These class methods access NCBI/PubMed database via HTTP.
-
- --- Bio::PubMed.esearch(str, options)
-
- Search keywords in PubMed by E-Utils and returns an array of PubMed IDs.
- Options can be a hash containing keys include 'field', 'reldate',
- 'mindate', 'maxdate', 'datetype', 'retstart', 'retmax', 'retmode',
- and 'rettype' as specified in the following URL:
-
- ((<URL:http://eutils.ncbi.nlm.nih.gov/entrez/query/static/esearch_help.html#PubMed>))
-
- Default 'retmax' is 100.
-
- --- Bio::PubMed.efetch(pmids)
-
- Returns an array of MEDLINE records. A list of PubMed IDs can be
- supplied as following:
-
- Bio::PubMed.efetch(123)
- Bio::PubMed.efetch(123,456,789)
- Bio::PubMed.efetch([123,456,789])
-
- --- Bio::PubMed.query(pmid)
-
- Retrieve PubMed entry by PMID and returns MEDLINE format string (can
- be parsed by the Bio::MEDLINE and can be converted into Bio::Reference
- object).
-
- --- Bio::PubMed.pmfetch(pmid)
-
- Just another query method (by pmfetch).
-
- --- Bio::PubMed.search(str)
-
- Search the PubMed database by given keywords and returns the list of
- matched records in MEDLINE format.
-
-
- = For more informations
-
- * Overview
- * ((<URL:http://www.ncbi.nlm.nih.gov/entrez/query/static/overview.html>))
- * How to link
- * ((<URL:http://www.ncbi.nlm.nih.gov/entrez/query/static/linking.html>))
- * MEDLINE format
- * ((<URL:http://www.ncbi.nlm.nih.gov/entrez/query/static/help/pmhelp.html#MEDLINEDisplayFormat>))
- * Search field descriptions and tags
- * ((<URL:http://www.ncbi.nlm.nih.gov/entrez/query/static/help/pmhelp.html#SearchFieldDescriptionsandTags>))
- * Entrez utilities index
- * ((<URL:http://www.ncbi.nlm.nih.gov/entrez/utils/utils_index.html>))
- * PmFetch CGI help
- * ((<URL:http://www.ncbi.nlm.nih.gov/entrez/utils/pmfetch_help.html>))
- * E-Utilities CGI help
- * ((<URL:http://eutils.ncbi.nlm.nih.gov/entrez/query/static/eutils_help.html>))
-
- =end
-
-
--- 218,219 ----
Index: fetch.rb
===================================================================
RCS file: /home/repository/bioruby/bioruby/lib/bio/io/fetch.rb,v
retrieving revision 1.4
retrieving revision 1.5
diff -C2 -d -r1.4 -r1.5
*** fetch.rb 18 Dec 2005 15:58:42 -0000 1.4
--- fetch.rb 16 Mar 2006 17:29:05 -0000 1.5
***************
*** 1,12 ****
#
! # = bio/io/biofetch.rb - BioFetch access module
! #
! # Copyright:: Copyright (C) 2002, 2005
! # Toshiaki Katayama <k at bioruby.org>
! # License:: LGPL
#
! # $Id$
#
- #--
#
# This library is free software; you can redistribute it and/or
--- 1,10 ----
#
! # bio/io/biofetch.rb - BioFetch access module
#
! # Copyright (C) 2002, 2005 Toshiaki Katayama <k at bioruby.org>
! # 2006 Jan Aerts <jan.aerts at bbsrc.ac.uk>
!
! # License: LGPL
#
#
# This library is free software; you can redistribute it and/or
***************
*** 24,95 ****
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
#
! #++
#
require 'uri'
! require 'net/http'
module Bio
! class Fetch
!
! # Create a new Bio::Fetch server object.
! # Use Bio::Fetch.new('http://www.ebi.ac.uk/cgi-bin/dbfetch') to connect
! # to EBI BioFetch server.
! def initialize(url = 'http://bioruby.org/cgi-bin/biofetch.rb')
! schema, user, @host, @port, reg, @path, = URI.split(url)
! end
!
! # Set default database to dbname (prepare for get_by_id).
! attr_accessor :database
!
! # Get raw database entry by id (mainly used by Bio::Registry).
! def get_by_id(id)
! fetch(@database, id)
! end
!
! # Fetch a database entry as specified by database (db), entry id (id),
! # 'raw' text or 'html' (style), and format. When using BioRuby's
! # BioFetch server, value for the format should not be set.
! def fetch(db, id, style = 'raw', format = nil)
! data = [ "db=#{db}", "id=#{id}", "style=#{style}" ]
! data.push("format=#{format}") if format
! data = data.join('&')
!
! responce, result = Net::HTTP.new(@host, @port).post(@path, data)
! return result
! end
!
! # Short cut for using BioRuby's BioFetch server. You can fetch an entry
! # without creating instance of BioFetch server.
! def self.query(*args)
! self.new.fetch(*args)
! end
! # What databases are available?
! def databases
! query = "info=dbs"
! responce, result = Net::HTTP.new(@host, @port).post(@path, query)
! return result
! end
! # What formats does the database X have?
! def formats(database = @database)
! if database
! query = "info=formats;db=#{database}"
! responce, result = Net::HTTP.new(@host, @port).post(@path, query)
return result
end
end
- # How many entries can be retrieved simultaneously?
- def maxids
- query = "info=maxids"
- responce, result = Net::HTTP.new(@host, @port).post(@path, query)
- return result
- end
-
- end
-
end # module Bio
--- 22,183 ----
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
#
! # $Id$
#
require 'uri'
! require 'open-uri'
module Bio
+ # = DESCRIPTION
+ # The Bio::Fetch class provides an interface to dbfetch servers. Given
+ # a database name and an accession number, these servers return the nucleic
+ # or amino acid sequence for that accession number in that database.
+ #
+ # Possible dbfetch servers include:
+ # * http://bioruby.org/cgi-bin/biofetch.rb (default)
+ # * http://www.ebi.ac.uk/cgi-bin/dbfetch
+ #
+ # If you're behind a proxy server, be sure to set your HTTP_PROXY
+ # environment variable accordingly.
+ #
+ # = USAGE
+ # require 'bio'
+ #
+ # # Retrieve the sequence of accession number M33388 from the EMBL
+ # # database.
+ # server = Bio::Fetch.new() #uses default server
+ # puts server.fetch('embl','M33388')
+ #
+ # # Do the same thing without creating a Bio::Fetch object. This method always
+ # # uses the default dbfetch server: http://bioruby.org/cgi-bin/biofetch.rb
+ # puts Bio::Fetch.query('embl','M33388')
+ #
+ # # To know what databases are available on the bioruby dbfetch server:
+ # server = Bio::Fetch.new()
+ # puts server.databases
+ #
+ # # Some databases provide their data in different formats (e.g. 'fasta',
+ # # 'genbank' or 'embl'). To check which formats are supported by a given
+ # # database:
+ # puts server.formats('embl')
+ #
+ class Fetch
+
+ # Create a new Bio::Fetch server object that can subsequently be queried
+ # using the Bio::Fetch#fetch method
+ # ---
+ # *Arguments*:
+ # * _url_: URL of dbfetch server (default = 'http://bioruby.org/cgi-bin/biofetch.rb')
+ # *Returns*:: Bio::Fetch object
+ def initialize(url = 'http://bioruby.org/cgi-bin/biofetch.rb')
+ @url = url
+ schema, user, @host, @port, reg, @path, = URI.split(@url)
+ end
+
+ # The default database to query
+ #--
+ # This will be used by the get_by_id method
+ #++
+ attr_accessor :database
+
+ # Get raw database entry by id. This method lets the Bio::Registry class
+ # use Bio::Fetch objects and should probably not be used directly.
+ def get_by_id(id)
+ fetch(@database, id)
+ end
+
+ # Fetch a database entry as specified by database (db), entry id (id),
+ # 'raw' text or 'html' (style), and format. When using BioRuby's
+ # BioFetch server, value for the format should not be set.
+ # Examples:
+ # server = Bio::Fetch.new('http://www.ebi.ac.uk/cgi-bin/dbfetch')
+ # puts server.fetch('embl','M33388','raw','fasta')
+ # puts server.fetch('refseq','NM_12345','html','embl')
+ # ---
+ # *Arguments*:
+ # * _db_: name of database to query (see Bio::Fetch#databases to get list of supported databases)
+ # * _id_: single ID or ID list separated by commas or white space
+ # * _style_: [raw|html] (default = 'raw')
+ # * _format_: name of output format (see Bio::Fetch#formats)
+ def fetch(db, id, style = 'raw', format = nil)
+ query = [ "db=#{db}", "id=#{id}", "style=#{style}" ]
+ query.push("format=#{format}") if format
+ query = query.join('&')
+
+ result = open(@url + '?' + query).readlines.join('')
+ return result
+ end
+
+ # Shortcut for using BioRuby's BioFetch server. You can fetch an entry
+ # without creating an instance of BioFetch server. This method uses the
+ # default dbfetch server, which is http://bioruby.org/cgi-bin/biofetch.rb
+ #
+ # Example:
+ # puts Bio::Fetch.query('refseq','NM_12345')
+ #
+ # ---
+ # *Arguments*:
+ # * _database_: name of database to query (see Bio::Fetch#databases to get list of supported databases)
+ # * _id_: single ID or ID list separated by commas or white space
+ # * _style_: [raw|html] (default = 'raw')
+ # * _format_: name of output format (see Bio::Fetch#formats)
+ def self.query(*args)
+ self.new.fetch(*args)
+ end
+
+ # Using this method, the user can ask a dbfetch server what databases
+ # it supports. This would normally be the first step you'd take when
+ # you use a dbfetch server for the first time.
+ # Example:
+ # server = Bio::Fetch.new()
+ # puts server.databases # returns "aa aax bl cpd dgenes dr ec eg emb ..."
+ #
+ # This method only works for the bioruby dbfetch server. For a list
+ # of databases available from the EBI, see the EBI website at
+ # http://www.ebi.ac.uk/cgi-bin/dbfetch/
+ # ---
+ # *Returns*:: String listing the database names
+ def databases
+ query = "info=dbs"
! result = open(@url + '?' + query).readlines.join('')
! return result
! end
!
! # Lists the formats that are available for a given database. Like the
! # Bio::Fetch#databases method, this method is only available on
! # the bioruby dbfetch server.
! # Example:
! # server = Bio::Fetch.new()
! # puts server.formats('embl') # returns "default fasta"
! # ---
! # *Arguments*:
! # * _database_: name of database you want the supported formats for
! # *Returns*:: String listing the formats (nil if no database is given)
! def formats(database = @database)
! if database
! query = "info=formats;db=#{database}"
! result = open(@url + '?' + query).readlines.join('')
! return result
! end
! end
!
! # A dbfetch server will only return entries up to a given maximum number.
! # This method retrieves that number from the server. As for the databases
! # and formats methods, the maxids method only works for the bioruby
! # dbfetch server.
! # ---
! # *Arguments*: none
! # *Returns*:: String containing the number, as returned by the server
! def maxids
! query = "info=maxids"
! result = open(@url + '?' + query).readlines.join('')
return result
end
+
end
end # module Bio
***************
*** 98,113 ****
if __FILE__ == $0
- # bfserv = Bio::Fetch.new('http://www.ebi.ac.uk:80/cgi-bin/dbfetch')
- bfserv = Bio::Fetch.new('http://www.ebi.ac.uk/cgi-bin/dbfetch')
puts "# test 1"
! puts bfserv.fetch('embl', 'J00231', 'raw')
puts "# test 2"
! puts bfserv.fetch('embl', 'J00231', 'html')
!
puts "# test 3"
! puts Bio::Fetch.query('genbank', 'J00231')
puts "# test 4"
puts Bio::Fetch.query('genbank', 'J00231', 'raw', 'fasta')
!
end
--- 186,204 ----
if __FILE__ == $0
puts "# test 1"
! br_server = Bio::Fetch.new()
! puts br_server.databases
! puts br_server.formats('embl')
! puts br_server.maxids
! ebi_server = Bio::Fetch.new('http://www.ebi.ac.uk/cgi-bin/dbfetch')
puts "# test 2"
! puts ebi_server.fetch('embl', 'J00231', 'raw')
puts "# test 3"
! puts ebi_server.fetch('embl', 'J00231', 'html')
puts "# test 4"
+ puts Bio::Fetch.query('genbank', 'J00231')
+ puts "# test 5"
puts Bio::Fetch.query('genbank', 'J00231', 'raw', 'fasta')
!
end
More information about the bioruby-cvs
mailing list