[BioRuby-cvs] bioruby/lib/bio/io pubmed.rb,1.17,1.18

Sat Nov 10 08:21:56 UTC 2007

Update of /home/repository/bioruby/bioruby/lib/bio/io
In directory dev.open-bio.org:/tmp/cvs-serv21448

Modified Files:
	pubmed.rb 
Log Message:
* search and query are fixed to use the new NCBI URI (the previous fix was
  wrong and insufficient).
* esearch is enhanced to accept hash['rettype'] == "count", as suggested
  by Kaustubh Patil.


Index: pubmed.rb
===================================================================
RCS file: /home/repository/bioruby/bioruby/lib/bio/io/pubmed.rb,v
retrieving revision 1.17
retrieving revision 1.18
diff -C2 -d -r1.17 -r1.18
*** pubmed.rb	4 Nov 2007 11:50:59 -0000	1.17
--- pubmed.rb	10 Nov 2007 08:21:54 -0000	1.18
***************
*** 19,34 ****
  # The Bio::PubMed class provides several ways to retrieve bibliographic
  # information from the PubMed database at
! # http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?db=PubMed. Basically, two
! # types of queries are possible:
  #
  # * searching for PubMed IDs given a query string:
! #   * Bio::PubMed#search
! #   * Bio::PubMed#esearch
  #
  # * retrieving the MEDLINE text (i.e. authors, journal, abstract, ...)
  #   given a PubMed ID
! #   * Bio::PubMed#query
! #   * Bio::PubMed#pmfetch
! #   * Bio::PubMed#efetch
  #
  # The different methods within the same group are interchangeable and should
--- 19,35 ----
  # The Bio::PubMed class provides several ways to retrieve bibliographic
  # information from the PubMed database at
! #   http://www.ncbi.nlm.nih.gov/sites/entrez?db=PubMed
! #
! # Basically, two types of queries are possible:
  #
  # * searching for PubMed IDs given a query string:
! #   * Bio::PubMed#esearch  (recommended)
! #   * Bio::PubMed#search   (only retrieves top 20 hits)
  #
  # * retrieving the MEDLINE text (i.e. authors, journal, abstract, ...)
  #   given a PubMed ID
! #   * Bio::PubMed#efetch   (recommended)
! #   * Bio::PubMed#query    (unstable: breaks when the HTML design changes)
! #   * Bio::PubMed#pmfetch  (still working but could be obsoleted by NCBI)
  #
  # The different methods within the same group are interchangeable and should
***************
*** 38,48 ****
  # APIs can be found on the following websites:
  #
! # * Overview: http://www.ncbi.nlm.nih.gov/entrez/query/static/overview.html
! # * How to link: http://www.ncbi.nlm.nih.gov/entrez/query/static/linking.html
! # * MEDLINE format: http://www.ncbi.nlm.nih.gov/entrez/query/static/help/pmhelp.html#MEDLINEDisplayFormat
! # * Search field descriptions and tags: http://www.ncbi.nlm.nih.gov/entrez/query/static/help/pmhelp.html#SearchFieldDescriptionsandTags
! # * Entrez utilities index: http://www.ncbi.nlm.nih.gov/entrez/utils/utils_index.html
! # * PmFetch CGI help: http://www.ncbi.nlm.nih.gov/entrez/utils/pmfetch_help.html
! # * E-Utilities CGI help: http://eutils.ncbi.nlm.nih.gov/entrez/query/static/eutils_help.html
  #
  # == Usage
--- 39,50 ----
  # APIs can be found on the following websites:
  #
! # * PubMed Overview:
! #     http://www.ncbi.nlm.nih.gov/entrez/query/static/overview.html
! # * PubMed help:
! #     http://www.ncbi.nlm.nih.gov/entrez/query/static/help/pmhelp.html
! # * Entrez utilities index:
! #      http://www.ncbi.nlm.nih.gov/entrez/utils/utils_index.html
! # * How to link:
! #     http://www.ncbi.nlm.nih.gov/books/bv.fcgi?rid=helplinks.chapter.linkshelp
  #
  # == Usage
***************
*** 51,89 ****
  #
  #   # If you don't know the pubmed ID:
! #   Bio::PubMed.search("(genome AND analysis) OR bioinformatics)").each do |x|
  #     p x
  #   end
! #   Bio::PubMed.esearch("(genome AND analysis) OR bioinformatics)").each do |x|
  #     p x
  #   end
  #   
  #   # To retrieve the MEDLINE entry for a given PubMed ID:
  #   puts Bio::PubMed.query("10592173")
  #   puts Bio::PubMed.pmfetch("10592173")
! #   puts Bio::PubMed.efetch("10592173", "14693808")
  #   # This can be converted into a Bio::MEDLINE object:
  #   manuscript = Bio::PubMed.query("10592173")
! #   medline = Bio::MEDLINE(manuscript)
  #  
  class PubMed
  
-   # Search the PubMed database by given keywords using entrez query and returns
-   # an array of PubMed IDs.
-   # ---
-   # *Arguments*:
-   # * _id_: query string (required)
-   # *Returns*:: array of PubMed IDs
-   def self.search(str)
-     host = 'www.ncbi.nlm.nih.gov'
-     path = "sites/entrez?tool=bioruby&cmd=Text&dopt=MEDLINE&db=PubMed&uid="
- 
-     http = Bio::Command.new_http(host)
-     response, = http.get(path + CGI.escape(str))
-     result = response.body
-     result = result.gsub("\r", "\n").squeeze("\n")
-     result = result.scan(/<pre>(.*?)<\/pre>/m).flatten
-     return result
-   end
- 
    # Search the PubMed database by given keywords using E-Utils and returns 
    # an array of PubMed IDs.
--- 53,75 ----
  #
  #   # If you don't know the pubmed ID:
! #   Bio::PubMed.esearch("(genome AND analysis) OR bioinformatics)").each do |x|
  #     p x
  #   end
! #
! #   Bio::PubMed.search("(genome AND analysis) OR bioinformatics)").each do |x|
  #     p x
  #   end
  #   
  #   # To retrieve the MEDLINE entry for a given PubMed ID:
+ #   puts Bio::PubMed.efetch("10592173", "14693808")
  #   puts Bio::PubMed.query("10592173")
  #   puts Bio::PubMed.pmfetch("10592173")
! #
  #   # This can be converted into a Bio::MEDLINE object:
  #   manuscript = Bio::PubMed.query("10592173")
! #   medline = Bio::MEDLINE.new(manuscript)
  #  
  class PubMed
  
    # Search the PubMed database by given keywords using E-Utils and returns 
    # an array of PubMed IDs.
***************
*** 103,107 ****
    # * _retmode_
    # * _rettype_
!   # *Returns*:: array of PubMed IDs
    def self.esearch(str, hash = {})
      hash['retmax'] = 100 unless hash['retmax']
--- 89,93 ----
    # * _retmode_
    # * _rettype_
!   # *Returns*:: array of PubMed IDs or a number of results
    def self.esearch(str, hash = {})
      hash['retmax'] = 100 unless hash['retmax']
***************
*** 118,122 ****
      response, = http.get(path + CGI.escape(str))
      result = response.body
!     result = result.scan(/<Id>(.*?)<\/Id>/m).flatten
      return result
    end
--- 104,154 ----
      response, = http.get(path + CGI.escape(str))
      result = response.body
!     if hash['rettype'] == 'count'
!       result = result.scan(/<Count>(.*?)<\/Count>/m).flatten.first.to_i
!     else
!       result = result.scan(/<Id>(.*?)<\/Id>/m).flatten
!     end
!     return result
!   end
! 
!   # Retrieve PubMed entry by PMID and returns MEDLINE formatted string using
!   # entrez efetch. Multiple PubMed IDs can be provided:
!   #   Bio::PubMed.efetch(123)
!   #   Bio::PubMed.efetch(123,456,789)
!   #   Bio::PubMed.efetch([123,456,789])
!   # ---
!   # *Arguments*:
!   # * _ids_: list of PubMed IDs (required)
!   # *Returns*:: MEDLINE formatted String
!   def self.efetch(*ids)
!     return [] if ids.empty?
! 
!     host = "eutils.ncbi.nlm.nih.gov"
!     path = "/entrez/eutils/efetch.fcgi?tool=bioruby&db=pubmed&retmode=text&rettype=medline&id="
! 
!     list = ids.join(",")
! 
!     http = Bio::Command.new_http(host)
!     response, = http.get(path + list)
!     result = response.body
!     result = result.split(/\n\n+/)
!     return result
!   end
! 
!   # Search the PubMed database by given keywords using entrez query and returns
!   # an array of PubMed IDs. Caution: this method returns the first 20 hits only.
!   # Instead, use of the 'esearch' method is strongly recommended.
!   # ---
!   # *Arguments*:
!   # * _id_: query string (required)
!   # *Returns*:: array of PubMed IDs
!   def self.search(str)
!     host = "www.ncbi.nlm.nih.gov"
!     path = "/sites/entrez?tool=bioruby&cmd=Search&doptcmdl=Brief&db=PubMed&term="
! 
!     http = Bio::Command.new_http(host)
!     response, = http.get(path + CGI.escape(str))
!     result = response.body
!     result = result.scan(/value="(\d+)" id="UidCheckBox"/m).flatten
      return result
    end
***************
*** 128,143 ****
    # * _id_: PubMed ID (required)
    # *Returns*:: MEDLINE formatted String
!   def self.query(id)
      host = "www.ncbi.nlm.nih.gov"
!     path = "/entrez/query.fcgi?tool=bioruby&cmd=Text&dopt=MEDLINE&db=PubMed&uid="
  
      http = Bio::Command.new_http(host)
!     response, = http.get(path + id.to_s)
      result = response.body
!     if result =~ /#{id}\s+Error/
        raise( result )
      else
!       result = result.gsub("\r", "\n").squeeze("\n").gsub(/<\/?pre>/, '')
!       return result
      end
    end
--- 160,183 ----
    # * _id_: PubMed ID (required)
    # *Returns*:: MEDLINE formatted String
!   def self.query(*ids)
      host = "www.ncbi.nlm.nih.gov"
!     path = "/sites/entrez?tool=bioruby&cmd=Text&dopt=MEDLINE&db=PubMed&uid="
! 
!     list = ids.join(",")
  
      http = Bio::Command.new_http(host)
!     response, = http.get(path + list)
      result = response.body
!     result = result.scan(/<pre>\s*(.*?)<\/pre>/m).flatten
! 
!     if result =~ /id:.*Error occurred/
!       # id: xxxxx Error occurred: Article does not exist
        raise( result )
      else
!       if ids.size > 1
!         return result
!       else
!         return result.first
!       end
      end
    end
***************
*** 164,191 ****
    end
  
-   # Retrieve PubMed entry by PMID and returns MEDLINE formatted string using
-   # entrez efetch. Multiple PubMed IDs can be provided:
-   #   Bio::PubMed.efetch(123)
-   #   Bio::PubMed.efetch(123,456,789)
-   #   Bio::PubMed.efetch([123,456,789])
-   # ---
-   # *Arguments*:
-   # * _ids_: list of PubMed IDs (required)
-   # *Returns*:: MEDLINE formatted String
-   def self.efetch(*ids)
-     return [] if ids.empty?
- 
-     host = "eutils.ncbi.nlm.nih.gov"
-     path = "/entrez/eutils/efetch.fcgi?tool=bioruby&db=pubmed&retmode=text&rettype=medline&id="
- 
-     ids = ids.join(",")
- 
-     http = Bio::Command.new_http(host)
-     response, = http.get(path + ids)
-     result = response.body
-     result = result.split(/\n\n+/)
-     return result
-   end
- 
  end # PubMed
  
--- 204,207 ----
***************
*** 195,211 ****
  if __FILE__ == $0
  
!   puts Bio::PubMed.query("10592173")
!   puts "--- ---"
!   puts Bio::PubMed.pmfetch("10592173")
!   puts "--- ---"
!   Bio::PubMed.search("(genome AND analysis) OR bioinformatics)").each do |x|
!     p x
!   end
!   puts "--- ---"
    Bio::PubMed.esearch("(genome AND analysis) OR bioinformatics)").each do |x|
      p x
    end
!   puts "--- ---"
    puts Bio::PubMed.efetch("10592173", "14693808")
  
  end
--- 211,233 ----
  if __FILE__ == $0
  
!   puts "--- Search PubMed by E-Utils ---"
    Bio::PubMed.esearch("(genome AND analysis) OR bioinformatics)").each do |x|
      p x
    end
! 
!   puts "--- Retrieve PubMed entry by E-Utils ---"
    puts Bio::PubMed.efetch("10592173", "14693808")
  
+   puts "--- Search PubMed by Entrez CGI ---"
+   Bio::PubMed.search("(genome AND analysis) OR bioinformatics)").each do |x|
+     p x
+   end
+ 
+   puts "--- Retrieve PubMed entry by Entrez CGI ---"
+   puts Bio::PubMed.query("10592173")
+ 
+ 
+   puts "--- Retrieve PubMed entry by PMfetch ---"
+   puts Bio::PubMed.pmfetch("10592173")
+ 
  end