[BioRuby] [PATCH] GO annotations fixes and improvements

Ralf Stephan ralf at ark.in-berlin.de
Tue Aug 3 06:58:16 UTC 2010


Hello,

Seeing that the file bio/db/go.rb is seven years old,
I have fixed and improved the GO annotation
parsing (now GAF1, GAF2, Phenote) and output
(GAF1, GAF2) for inclusion in the next BioRuby version.

0001-Fix-parsing-of-GAF-1.0-files-preliminary-adaptation.patch
0002-Add-parsing-and-output-of-GAF-2.0-files.patch
0003-Add-documentation-copyright.patch
0004-Add-Phenote-GOA-file-format-parsing-GAF1-output.patch

I hope you will accept the patch set. Enjoy,

ralf

>From 05b435e0e3f791d0fae38a5d76cbc522835bf085 Mon Sep 17 00:00:00 2001
From: R. Stephan <ralf at ark.in-berlin.de>
Date: Mon, 2 Aug 2010 19:43:58 +0200
Subject: [PATCH] Fix parsing of GAF 1.0 files, preliminary adaptations

---
 lib/bio/db/go.rb |   42 ++++++++++++++++++++++++++++--------------
 1 files changed, 28 insertions(+), 14 deletions(-)

diff --git a/lib/bio/db/go.rb b/lib/bio/db/go.rb
index 6b5d539..a8d3f47 100644
--- a/lib/bio/db/go.rb
+++ b/lib/bio/db/go.rb
@@ -186,6 +186,18 @@ class GO
   #    p [entry.entry_id, entry.evidence, entry.goid]
   #  end
   #
+  class ArrayOrString
+    def initialize(arg)
+      @var = arg
+    end
+    def join(char)
+      if @var.instance_of? String
+        then return @var
+        else return @var.join(char)
+      end
+    end
+  end
+
   class GeneAssociation # < Bio::DB
 
     # Delimiter
@@ -253,30 +265,34 @@ class GO
     
     # 
     attr_reader :assigned_by 
-    
+
     alias entry_id db_object_id
 
 
-    # Parsing an entry (in a line) in the gene_association flatfile.  
-    def initialize(entry) 
-      tmp = entry.chomp.split(/\t/)
+    # Assign fields of an entry (in a line).  
+    def assign(tmp) 
       @db                = tmp[0] 
       @db_object_id      = tmp[1]
       @db_object_symbol  = tmp[2]
       @qualifier         = tmp[3]  # 
       @goid              = tmp[4]
-      @db_reference      = tmp[5].split(/\|/)  #
+      @db_reference      = ArrayOrString.new(tmp[5].split(/\|/))  #
       @evidence          = tmp[6]
-      @with              = tmp[7].split(/\|/)  # 
+      @with              = ArrayOrString.new(tmp[7].split(/\|/))  # 
       @aspect            = tmp[8]
       @db_object_name    = tmp[9]  #
-      @db_object_synonym = tmp[10].split(/\|/) #
+      @db_object_synonym = ArrayOrString.new(tmp[10].split(/\|/)) #
       @db_object_type    = tmp[11]
       @taxon             = tmp[12] # taxon:4932
       @date              = tmp[13] # 20010118
       @assigned_by       = tmp[14] 
     end
 
+    # Parsing an entry (in a line) in the gene_association flatfile.  
+    def initialize(entry) 
+      tmp = entry.chomp.split(/\t/)
+      self.assign(tmp)
+    end
 
     # Returns GO_ID in /\d{7}/ format. Giving not nil arg, returns 
     # /GO:\d{7}/ style.
@@ -293,17 +309,15 @@ class GO
 
     # Bio::GO::GeneAssociation#to_str -> a line of gene_association file.
     def to_str
-      return [@db, @db_object_id, @db_object_symbol, @quialifier, @goid, 
-              @qualifier.join("|"), @evidence, @with.join("|"), @aspect,
+      return [@db, @db_object_id, @db_object_symbol, @qualifier, @goid, 
+              @db_reference.join("|"), @evidence, @with.join("|"), @aspect,
               @db_object_name, @db_object_synonym.join("|"), @db_object_type,
               @taxon, @date, @assigned_by].join("\t")
     end
 
   end # class GeneAssociation   
 
-
-
-  # = Container class for files in geneontology.org/go/external2go/*2go.
+# = Container class for files in geneontology.org/go/external2go/*2go.
   #
   # The line syntax is: 
   #
@@ -402,8 +416,8 @@ class GO
     end
 
   end # class External2go
-  
-end # class GO
+
+end
 
 end # module Bio
 
-- 
1.5.5

>From 1dbca2952239c4028a89a507d1badd5935c9e477 Mon Sep 17 00:00:00 2001
From: R. Stephan <ralf at ark.in-berlin.de>
Date: Mon, 2 Aug 2010 20:12:36 +0200
Subject: [PATCH] Add parsing and output of GAF 2.0 files

---
 lib/bio/db/go.rb |   32 ++++++++++++++++++++++++++++++++
 1 files changed, 32 insertions(+), 0 deletions(-)

diff --git a/lib/bio/db/go.rb b/lib/bio/db/go.rb
index a8d3f47..affbe66 100644
--- a/lib/bio/db/go.rb
+++ b/lib/bio/db/go.rb
@@ -266,6 +266,11 @@ class GO
     # 
     attr_reader :assigned_by 
 
+    attr_reader :annotation_extension
+
+    attr_reader :gene_product_form_id
+    
+
     alias entry_id db_object_id
 
 
@@ -286,6 +291,8 @@ class GO
       @taxon             = tmp[12] # taxon:4932
       @date              = tmp[13] # 20010118
       @assigned_by       = tmp[14] 
+      @annotation_extension = tmp[15]
+      @gene_product_form_id = tmp[16]
     end
 
     # Parsing an entry (in a line) in the gene_association flatfile.  
@@ -317,6 +324,31 @@ class GO
 
   end # class GeneAssociation   
 
+  class GeneAssociation2 < GeneAssociation
+
+    # Iterator through all entries
+    def self.parser(str)
+      if block_given?
+        str.each_line(DELIMITER) {|line|
+          next if /^!/ =~ line
+          yield GeneAssociation2.new(line)
+        }
+      else
+        galist = []
+        str.each_line(DELIMITER) {|line|
+          next if /^!/ =~ line
+          galist << GeneAssociation2.new(line)
+        }
+        return galist
+      end
+    end
+
+    # Bio::GO::GeneAssociation#to_str -> a line of gene_association file.
+    def to_str
+      return [super.to_str, @annotation_extension, @gene_product_form_id].join("\t")
+    end
+  end
+
 # = Container class for files in geneontology.org/go/external2go/*2go.
   #
   # The line syntax is: 
-- 
1.5.5

>From 4370b2bf3dc53f49334f9fb3948dc2fb584b75e5 Mon Sep 17 00:00:00 2001
From: R. Stephan <ralf at ark.in-berlin.de>
Date: Mon, 2 Aug 2010 20:28:45 +0200
Subject: [PATCH] Add documentation, copyright

---
 bin/bioruby         |   47 ------
 bin/br_biofetch.rb  |   47 ------
 bin/br_bioflat.rb   |  293 -----------------------------------
 bin/br_biogetseq.rb |   45 ------
 bin/br_pmfetch.rb   |  422 ---------------------------------------------------
 lib/bio/db/go.rb    |   21 +++-
 6 files changed, 18 insertions(+), 857 deletions(-)
 delete mode 100755 bin/bioruby
 delete mode 100755 bin/br_biofetch.rb
 delete mode 100755 bin/br_bioflat.rb
 delete mode 100755 bin/br_biogetseq.rb
 delete mode 100755 bin/br_pmfetch.rb

diff --git a/bin/bioruby b/bin/bioruby
deleted file mode 100755
index 9980af8..0000000
--- a/bin/bioruby
+++ /dev/null
@@ -1,47 +0,0 @@
-#!/usr/bin/env ruby
-#
-# = BioRuby shell - command line interface for the BioRuby library
-#
-# Copyright::   Copyright (C) 2005, 2006, 2007
-#               Toshiaki Katayama <k at bioruby.org>
-# License::     The Ruby License
-#
-# $Id:$
-#
-
-begin
-  require 'rubygems'
-  gem 'bio', '>= 1.1.0'
-rescue LoadError
-  require 'bio'
-end
-require 'bio/shell'
-
-# required to run commands (getseq, ls etc.)
-include Bio::Shell
-
-# setup command line options, working directory, and irb configurations
-Bio::Shell::Setup.new
-
-# loading workspace and command history
-Bio::Shell.load_session
-
-# sets default email address for Entrez eUtils.
-Bio::NCBI.default_email ||= 'staff at bioruby.org'
-
-# main loop
-if Bio::Shell.cache[:rails]
-  Bio::Shell.cache[:rails].join
-else
-  Signal.trap("SIGINT") do
-    Bio::Shell.cache[:irb].signal_handle
-  end
-
-  catch(:IRB_EXIT) do
-    Bio::Shell.cache[:irb].eval_input
-  end
-end
-
-# saving workspace, command history and configuration before exit
-Bio::Shell.save_session
-
diff --git a/bin/br_biofetch.rb b/bin/br_biofetch.rb
deleted file mode 100755
index 40319cf..0000000
--- a/bin/br_biofetch.rb
+++ /dev/null
@@ -1,47 +0,0 @@
-#!/usr/bin/env ruby
-#
-# = biofetch - BioFetch client
-#
-# Copyright::   Copyright (C) 2002
-#               Toshiaki Katayama <k at bioruby.org>
-# License::     The Ruby License
-#
-# $Id: br_biofetch.rb,v 1.4 2007/04/05 23:35:39 trevor Exp $
-#
-
-require 'bio/io/fetch'
-
-def usage
-  default_url = 'http://bioruby.org/cgi-bin/biofetch.rb'
-  another_url = 'http://www.ebi.ac.uk/cgi-bin/dbfetch'
-  puts "#{$0} [-s[erver] #{another_url}] db id [style] [format]"
-  puts "  server : URL of the BioFetch CGI (default is #{default_url})"
-  puts "      db : database name (embl, genbank, etc.)"
-  puts "      id : entry id"
-  puts "   style : 'raw' or 'html' (default is 'raw')"
-  puts "  format : change the output format ('default', 'fasta', etc.)"
-end
-
-if ARGV.empty? or ARGV[0] =~ /^--?h/
-  usage
-  exit 1
-end
-
-case ARGV[0]
-when /^--?s/				# User specified server
-  ARGV.shift
-  serv = Bio::Fetch.new(ARGV.shift)
-  puts serv.fetch(*ARGV)
-when /^--?e/				# EBI server
-  ARGV.shift
-  serv = Bio::Fetch.new('http://www.ebi.ac.uk/cgi-bin/dbfetch')
-  puts serv.fetch(*ARGV)
-when /^--?r/				# BioRuby server
-  ARGV.shift
-  serv = Bio::Fetch.new('http://bioruby.org/cgi-bin/biofetch.rb')
-  puts serv.fetch(*ARGV)
-else					# Default server
-  puts Bio::Fetch.query(*ARGV)
-end
-
-
diff --git a/bin/br_bioflat.rb b/bin/br_bioflat.rb
deleted file mode 100755
index 279da9b..0000000
--- a/bin/br_bioflat.rb
+++ /dev/null
@@ -1,293 +0,0 @@
-#!/usr/bin/env ruby
-# 
-# = bioflat - OBDA flat file indexer (executable)
-# 
-# Copyright::   Copyright (C) 2002
-#               Naohisa Goto <ng at bioruby.org>
-# License::     The Ruby License
-#
-# $Id: br_bioflat.rb,v 1.17 2007/04/05 23:35:39 trevor Exp $ 
-# 
-
-require 'bio'
-
-def usage
-  print <<EOM
-Search:
-  #{$0} [--search] [options...] [DIR/]DBNAME KEYWORDS
-or
-  #{$0} [--search] --location DIR --dbname DBNAME [options...] KEYWORDS
-
-Search options:
-  --namespace NAME       set serch namespace to NAME
-  (or --name NAME)         You can set this option many times to specify
-                           more than one namespace.
-
-Create index:
-  #{$0} --create --location DIR --dbname DBNAME [--format <genbank|embl|fasta>] [options...] [--files] FILES
-Update index:
-  #{$0} --update --location DIR --dbname DBNAME [options...] [--files] FILES
-
-Create index options:
-  --primary=UNIQUE       set primary namespece to UNIQUE
-                           Default primary/secondary namespaces depend on
-                           each format of flatfiles.
-  --secondary=KEY        set secondary namespaces.
-                           You may use this option many times to specify
-                           more than one namespace.
-  --add-secondary=KEY    add secondary namespaces to default specification.
-                           You can use this option many times.
-
-Options only valid for --create (or --update) --type flat:
-  --sort=/path/to/sort   use external sort program (e.g. /usr/bin/sort)
-  --sort=BUILTIN         use builtin sort routine
-                         (default: /usr/bin/sort or BUILTIN)
-  --env=/path/to/env     use env program to run sort (default: /usr/bin/env)
-  --env-arg=XXXXXX       argument given to the env program (default: LC_ALL=C)
-                         (multiple --env-arg=XXXXXX can be specified)
-
-Options only valid for --update:
-  --renew                re-read all flatfiles and update whole index
-
-Backward compatibility:
-  --makeindex DIR/DBNAME
-      same as --create --type flat --location DIR --dbname DBNAME
-  --makeindexBDB DIR/DBNAME
-      same as --create --type bdb  --location DIR --dbname DBNAME
-  --format=CLASS
-      instead of genbank|embl|fasta, specifing a class name is allowed
-
-Show namespaces:
-  #{$0} --show-namespaces [--location DIR --dbname DBNAME] [DIR/DBNAME]
-or
-  #{$0} --show-namespaces [--format=CLASS]
-or
-  #{$0} --show-namespaces --files file
-
-EOM
-
-end
-
-
-def do_index(mode = :create)
-  case ARGV[0]
-  when /^\-\-?make/
-    dbpath = ARGV[1]
-    args = ARGV[2..-1]
-    is_bdb = nil
-  when /^\-\-?make.*bdb/i
-    dbname = ARGV[1]
-    args = ARGV[2..-1]
-    is_bdb = Bio::FlatFileIndex::MAGIC_BDB
-  when /^\-\-create/, /^\-\-update/
-    args = ARGV[1..-1]
-  else
-    usage
-  end
-
-  options = {}
-
-  while args.first =~ /^\-/
-    case x = args.shift
-
-    # OBDA stuff
-
-    when /^\-\-?format$/
-      args.shift
-      format = nil		# throw this f*ckin' mess for auto detect :)
-    when /^\-\-?location/
-      location = args.shift.chomp('/')
-    when /^\-\-?dbname/
-      dbname = args.shift
-    when /^\-\-?(index)?type/
-      indextype = args.shift
-      case indextype
-      when /bdb/
-	is_bdb = Bio::FlatFileIndex::MAGIC_BDB
-      when /flat/
-	is_bdb = nil
-      else
-	usage
-      end
-
-    # BioRuby extension
-
-    when /^\-\-?files/i
-      break
-
-    when /^\-\-?format\=(.*)/i
-      format = $1
-
-    when /^\-\-?sort\=(.*)/i
-      options['sort_program'] = $1
-      options['onmemory'] = nil
-    when /^\-\-?no\-?te?mp/i
-      options['onmemory'] = true
-
-    when /^\-\-?env\=(.*)/i
-      options['env_program'] = $1
-
-    when /^\-\-?env-arg(?:ument)?\=(.*)/i
-      options['env_program_arguments'] ||= []
-      options['env_program_arguments'].push $1
-
-    when /^\-\-?primary.*\=(.*)/i
-      options['primary_namespace'] = $1
-
-    when /^\-\-?add-secondary.*\=(.*)/i
-      unless options['additional_secondary_namespaces'] then
-	options['additional_secondary_namespaces'] = []
-      end
-      options['additional_secondary_namespaces'] << $1 if $1.length > 0
-
-    when /^\-\-?secondary.*\=(.*)/i
-      unless options['secondary_namespaces'] then
-	options['secondary_namespaces'] = []
-      end
-      options['secondary_namespaces'] << $1 if $1.length > 0
-
-    when /^\-\-?renew/
-      options['renew'] = true
-
-    else
-      $stderr.print "Warning: ignoring invalid option #{x.inspect}\n"
-    end
-  end
-
-  dbpath = File.join(location, dbname) unless dbpath
-  if mode == :update then
-    Bio::FlatFileIndex::update_index(dbpath, format, options, *args)
-  else
-    Bio::FlatFileIndex::makeindex(is_bdb, dbpath, format, options, *args)
-  end
-end
-
-
-def do_search
-  dbname = nil
-  location = nil
-  names = []
-  while x = ARGV.shift
-    case x
-    when /\A\-\-?search/i
-      #do nothing
-    when /\A\-\-?location/i
-      location = ARGV.shift.to_s.chomp('/')
-    when /\A\-\-?dbname/i
-      dbname = ARGV.shift
-    when /\A\-\-?name(?:space)?(?:\=(.+))?/i
-      if $1 then
-	names << $1
-      elsif x = ARGV.shift
-	names << x
-      end
-    else
-      ARGV.unshift x
-      break
-    end
-  end
-  dbname = ARGV.shift unless dbname
-  dbname = File.join(location, dbname) unless location.to_s.empty?
-  db = Bio::FlatFileIndex.open(dbname)
-  ARGV.each do |key|
-    $stderr.print "Searching for \'#{key}\'...\n"
-    #r = db.search(key)
-    #$stderr.print "OK, #{r.size} entry found\n"
-    #if r.size > 0 then
-    #  print r
-    #end
-    begin
-      if names.empty? then
-	r = db.include?(key)
-      else
-	r = db.include_in_namespaces?(key, *names)
-      end
-    rescue RuntimeError
-      $stderr.print "ERROR: #{$!}\n"
-      next
-    end
-    r = [] unless r
-    $stderr.print "OK, #{r.size} entry found\n"
-    r.each do |i|
-      print db.search_primary(i)
-    end
-  end
-  db.close
-end
-
-
-def do_show_namespaces
-  dbname = nil
-  location = nil
-  files = nil
-  format = nil
-  names = []
-  while x = ARGV.shift
-    case x
-    when /\A\-\-?(show\-)?name(space)?s/i
-      #do nothing
-    when /\A\-\-?location/i
-      location = ARGV.shift.to_s.chomp('/')
-    when /\A\-\-?dbname/i
-      dbname = ARGV.shift
-    when /\A\-\-?format(?:\=(.+))?/i
-      if $1 then
-	format = $1
-      elsif x = ARGV.shift
-	format = x
-      end
-    when /\A\-\-?files/i
-      files = ARGV
-      break
-    else
-      ARGV.unshift x
-      break
-    end
-  end
-  if files then
-    k = nil
-    files.each do |x|
-      k = Bio::FlatFile.autodetect_file(x)
-      break if k
-    end
-    if k then
-      $stderr.print "Format: #{k.to_s}\n"
-      format = k
-    else
-      $stderr.print "ERROR: couldn't determine file format\n"
-      return
-    end
-  end
-  $stderr.print "Namespaces: (first line: primary namespace)\n"
-  if format then
-    parser = Bio::FlatFileIndex::Indexer::Parser.new(format)
-    print parser.primary.name, "\n"
-    puts parser.secondary.keys
-  else
-    dbname = ARGV.shift unless dbname
-    dbname = File.join(location, dbname) unless location.to_s.empty?
-    db = Bio::FlatFileIndex.open(dbname)
-    puts db.namespaces
-    db.close
-  end
-end
-
-if ARGV.size > 1
-  case ARGV[0]
-  when /--make/, /--create/
-    Bio::FlatFileIndex::DEBUG.out = true
-    do_index
-  when /--update/
-    Bio::FlatFileIndex::DEBUG.out = true
-    do_index(:update)
-  when /\A\-\-?(show\-)?name(space)?s/i
-    do_show_namespaces
-  when /--search/
-    do_search
-  else #default is search
-    do_search
-  end
-else
-  usage
-end
-
diff --git a/bin/br_biogetseq.rb b/bin/br_biogetseq.rb
deleted file mode 100755
index 76c94de..0000000
--- a/bin/br_biogetseq.rb
+++ /dev/null
@@ -1,45 +0,0 @@
-#!/usr/bin/env ruby
-# 
-# = biogetseq - OBDA sequence data retrieval (executable)
-# 
-# Copyright::   Copyright (C) 2003
-#               Toshiaki Katayama <k at bioruby.org>
-# License::     The Ruby License
-#
-# $Id: br_biogetseq.rb,v 1.4 2007/04/05 23:35:39 trevor Exp $
-# 
-
-require 'bio'
-
-def usage
-  print <<END
-  #{$0} --dbname <dbname> [--namespace <namespace>] entry_id [entry_id]
-END
-  exit 1
-end
-
-if ARGV.size < 3
-  usage
-end
-
-while ARGV.first =~ /^-/
-  case ARGV.shift
-  when /^\-\-format/
-    ARGV.shift
-    raise NotImplementedError
-  when /^\-\-dbname/
-    dbname = ARGV.shift
-  when /^\-\-namespace/
-    namespace = ARGV.shift
-  end
-end
-
-reg = Bio::Registry.new
-db = reg.get_database(dbname)
-if namespace
-  db['namespace'] = namespace
-end
-ARGV.each do |entry|
-  puts db.get_by_id(entry)
-end
-
diff --git a/bin/br_pmfetch.rb b/bin/br_pmfetch.rb
deleted file mode 100755
index eb0f4ed..0000000
--- a/bin/br_pmfetch.rb
+++ /dev/null
@@ -1,422 +0,0 @@
-#!/usr/bin/env ruby
-#
-# = pmfetch - PubMed client
-#
-# Copyright::   Copyright (C) 2004, 2005
-#               Toshiaki Katayama <k at bioruby.org>
-# License::     The Ruby License
-#
-# $Id:$
-#
-
-require 'bio'
-
-PROG_VER  = "Powered by BioRuby #{Bio::BIORUBY_VERSION_ID}"
-PROG_NAME = File.basename($0)
-
-
-require 'getoptlong'
-
-
-### formatting
-
-class String
-  def fill(fill_column = 80, prefix = '', separater = ' ')
-    prefix = ' ' * prefix if prefix.is_a?(Integer)
-    maxlen = fill_column - prefix.length
-    raise "prefix is longer than fill_column" if maxlen <= 0
-
-    cursor = pos = 0
-    lines = []
-    while cursor < self.length
-      line = self[cursor, maxlen]
-      pos = line.rindex(separater)
-      pos = nil if line.length < maxlen
-      if pos
-        len = pos + separater.length
-        lines << self[cursor, len]
-        cursor += len
-      else
-        lines << self[cursor, maxlen]
-        cursor += maxlen
-      end
-    end
-    return lines.join("\n#{prefix}")
-  end
-end
-
-
-module Bio
-  class Reference
-    def report
-      if (num = @authors.size) > 10
-        authors = "#{@authors[0]} et al. (#{num} authors)"
-      elsif num > 4
-        sep = ',' * (num - 1)
-        authors = "#{@authors[0]}#{sep} #{@authors[-1]}"
-      else
-        authors = authors_join(' & ')
-      end
-      journal = "#{@journal} #{@year} #{@volume}(#{@issue}):#{@pages}"
-
-      indent = 8
-      prefix = ' ' * indent
-      [
-        "#{@pages[/\d+/]}".ljust(indent) + "#{@title}".fill(78, indent),
-        authors,
-        "#{journal} [PMID:#{@pubmed}]",
-      ].join("\n#{prefix}")
-    end
-  end
-end
-
-
-class PMFetch
-
-  class Examples < StandardError; end
-  class Version < StandardError; end
-  class Usage < StandardError; end
-
-  ### default options
-
-  def initialize
-    @format = 'rd'
-    @search_opts = {
-      'retmax' => 20,
-    }
-    @query = nil
-    @query_opts = []
-    @pmid_list_only = false
-
-    pmfetch
-  end
-
-
-  ### main
-
-  def pmfetch
-    begin
-      set_options
-      parse_options
-      check_query
-    rescue PMFetch::Examples
-      puts examples
-      exit
-    rescue PMFetch::Version
-      puts version
-      exit
-    rescue PMFetch::Usage
-      puts usage
-      exit
-    rescue GetoptLong::MissingArgument, GetoptLong::InvalidOption
-      puts usage
-      exit
-    end
-
-    list = pm_esearch
-
-    if list.empty?
-      ;
-    elsif @pmid_list_only
-      puts list
-    else
-      pm_efetch(list)
-    end
-  end
-
-
-  ### help
-
-  def usage
-%Q[
-Usage: #{PROG_NAME} [options...] "query string"
-    or #{PROG_NAME} --query "query string" [other options...] 
-
-Options:
- -q  --query "genome AND virus"  Query string for PubMed search
- -t  --title "mobile elements"   Title of the article to search
- -j  --journal "genome res"      Journal title to search
- -v  --volume #                  Journal volume to search
- -i  --issue #                   Journal issue to search
- -p  --page #                    First page number of the article to search
- -a  --author "Altschul SF"      Author name to search
- -m  --mesh "SARS virus"         MeSH term to search
- -f  --format bibtex             Summary output format
-     --pmidlist                  Output only a list of PubMed IDs
- -n  --retmax #                  Number of articles to retrieve at the maximum
- -N  --retstart #                Starting number of the articles to retrieve
- -s  --sort pub+date             Sort method for the summary output
-     --reldate #                 Search articles published within recent # days
-     --mindate YYYY/MM/DD        Search articles published after the date
-     --maxdate YYYY/MM/DD        Search articles published before the date
-     --help                      Output this help, then exit
-     --examples                  Output examples, then exit
-     --version                   Output version number, then exit
-
-Formats:
- endnote, medline, bibitem, bibtex, report, rd,
- nature, science, genome_res, genome_biol, nar, current, trends, cell
-
-Sort:
- author, journal, pub+date, page
-
-See the following pages for the PubMed search options:
- http://www.ncbi.nlm.nih.gov/entrez/query/static/help/pmhelp.html
- http://www.ncbi.nlm.nih.gov/entrez/query/static/esearch_help.html
-
-#{version}
-
-]
-  end
-
-  def version
-    PROG_VER
-  end
-
-  def examples
-    DATA.read.gsub('PMFetch', PROG_NAME)
-  end
-
-
-  private
-
-
-  ### options
-
-  def set_options
-    @parser = GetoptLong.new
-
-    @parser.set_options(
-	[ '--query',	'-q',	GetoptLong::REQUIRED_ARGUMENT ],
-	[ '--title',	'-t',	GetoptLong::REQUIRED_ARGUMENT ],
-	[ '--journal',	'-j',	GetoptLong::REQUIRED_ARGUMENT ],
-	[ '--volume',	'-v',	GetoptLong::REQUIRED_ARGUMENT ],
-	[ '--issue',	'-i',	GetoptLong::REQUIRED_ARGUMENT ],
-	[ '--page',	'-p',	GetoptLong::REQUIRED_ARGUMENT ],
-	[ '--author',	'-a',	GetoptLong::REQUIRED_ARGUMENT ],
-	[ '--mesh',	'-m',	GetoptLong::REQUIRED_ARGUMENT ],
-	[ '--format',	'-f',	GetoptLong::REQUIRED_ARGUMENT ],
-	[ '--pmidlist',		GetoptLong::NO_ARGUMENT ],
-	[ '--retmax',	'-n',	GetoptLong::REQUIRED_ARGUMENT ],
-	[ '--retstart', '-N',	GetoptLong::REQUIRED_ARGUMENT ],
-	[ '--sort',	'-s',	GetoptLong::REQUIRED_ARGUMENT ],
-	[ '--reldate',		GetoptLong::REQUIRED_ARGUMENT ],
-	[ '--mindate',		GetoptLong::REQUIRED_ARGUMENT ],
-	[ '--maxdate',		GetoptLong::REQUIRED_ARGUMENT ],
-	[ '--examples',		GetoptLong::NO_ARGUMENT ],
-	[ '--help',		GetoptLong::NO_ARGUMENT ],
-	[ '--version',		GetoptLong::NO_ARGUMENT ]
-    )
-  end
-
-  def parse_options
-    @parser.each_option do |optname, optarg|
-      case optname
-      when /--query/
-        @query = optarg
-      when /--title/
-        @query_opts << "#{optarg}[ti]"
-      when /--journal/
-        @query_opts << "#{optarg}[ta]"
-      when /--volume/
-        @query_opts << "#{optarg}[vi]"
-      when /--issue/
-        @query_opts << "#{optarg}[ip]"
-      when /--page/
-        @query_opts << "#{optarg}[pg]"
-      when /--author/
-        @query_opts << "#{optarg}[au]"
-      when /--mesh/
-        @query_opts << "#{optarg}[mh]"
-      when /--format/
-        @format = optarg
-      when /--pmidlist/
-        @pmid_list_only = true
-      when /--examples/
-        raise PMFetch::Examples
-      when /--help/
-        raise PMFetch::Usage
-      when /--version/
-        raise PMFetch::Version
-      when /--sort/
-        @sort = optarg
-        @search_opts["sort"] = @sort unless @sort == "page"
-      else
-        optname.delete!('-')
-        @search_opts[optname] = optarg
-      end
-    end
-  end
-
-
-  ### check query
-
-  def check_query
-    p @query if $DEBUG
-    @query ||= ARGV.join(" ") unless ARGV.empty?
-
-    p @query if $DEBUG
-    @query_str = [ @query, @query_opts ].flatten.compact.join(" AND ")
-
-    p @query_str if $DEBUG
-    if @query_str.empty?
-      raise PMFetch::Usage
-    end
-  end
-
-
-  ### search
-
-  def pm_esearch
-    return Bio::PubMed.esearch(@query_str, @search_opts)
-  end
-
-  def pm_efetch(list)
-    entries = Bio::PubMed.efetch(list)
-
-    if @format == 'medline'
-      medline_format(entries)
-    else
-      entries = parse_entries(entries)
-      if @sort == 'page'
-        entries = sort_entries(entries)
-      end
-      if @format == 'report'
-        report_format(entries)
-      else
-        other_format(entries)
-      end
-    end
-  end
-
-
-  ### output
-
-  def medline_format(entries)
-    entries.each do |entry|
-      puts entry
-      puts '//'
-    end
-  end
-
-  def parse_entries(entries)
-    entries.map { |entry| Bio::MEDLINE.new(entry) }
-  end
-
-  def sort_entries(entries)
-    if RUBY_VERSION > "1.8.0"
-       entries.sort_by { |x|
-         [ x.journal, x.volume.to_i, x.issue.to_i, x.pages.to_i ]
-       }
-    else
-      entries.map { |x|
-        [ x.journal, x.volume.to_i, x.issue.to_i, x.pages.to_i, x ]
-      }.sort { |a, b|
-        a[0..3] <=> b[0..3]
-      }.map { |y|
-        y.pop
-      }
-    end
-  end
-
-  def report_format(entries)
-    entries.each do |entry|
-      puts entry.reference.report
-      puts
-    end
-  end
-
-  def other_format(entries)
-    entries.each do |entry|
-      puts entry.reference.format(@format)
-      puts
-    end
-  end
-
-end
-
-
-PMFetch.new
-
-
-__END__
-
-= Examples : PubMed search
-
-These four lines will do the same job.
-
-  % PMFetch transcription factor
-  % PMFetch "transcription factor"
-  % PMFetch --query "transcription factor"
-  % PMFetch -q "transcription factor"
-
-
-Retrieve max 100 artiecles (20 is a NCBI's default) at a time, use --retmax as
-
-  % PMFetch -q "transcription factor" --retmax 100
-
-and, to retrieve next 100 articles, use --retstart as
-
-  % PMFetch -q "transcription factor" --retmax 100 --retstart 100
-
-
-You can narrow the search target for an issue of the journal.
-
-  % PMFetch --journal development --volume 131 --issue 3  transcription factor
-
-
-Short options are also available.
-
-  % PMFetch -j development -v 131 -i 3  transcription factor
-
-
-Search articles indexed in PubMed within these 90 days.
-
-  % PMFetch -q "transcription factor" --reldate 90
-
-
-Search articles indexed in PubMed during the period of 2001/04/01 to 2001/08/31
-
-  % PMFetch -q "transcription factor" --mindate 2001/04/01 --maxdate 2001/08/31
-
-
-Output format can be changed by --format option.
-
-  % PMFetch -q "transcription factor" -j development -v 131 -i 3 -f report
-  % PMFetch -q "transcription factor" -j development -v 131 -i 3 -f rd
-  % PMFetch -q "transcription factor" -j development -v 131 -i 3 -f endnote
-  % PMFetch -q "transcription factor" -j development -v 131 -i 3 -f medline
-  % PMFetch -q "transcription factor" -j development -v 131 -i 3 -f bibitem
-  % PMFetch -q "transcription factor" -j development -v 131 -i 3 -f bibtex
-  % PMFetch -q "transcription factor" -j development -v 131 -i 3 -f nature
-  % PMFetch -q "transcription factor" -j development -v 131 -i 3 -f science
-
-
-Generate title listings for the journal report meeting (don't forget
-to inclease the number of --retmax for fetching all titles).
-
-  % PMFetch -f report -j development -v 131 -i 3 -n 100
-
-
-Search by author name.
-
-  % PMFetch -a "Karlin S"
-  % PMFetch -a "Koonin EV"
-
-
-Search by MeSH term.
-
-  % PMFetch -m "computational biology"
-  % PMFetch -m "SARS virus"
-
-
-Search by PubMed ID (PMID).
-
-  % PMFetch 12345
-
-
-Output PMID only.
-
-  % PMFetch --pmidlist tardigrada
-
-
diff --git a/lib/bio/db/go.rb b/lib/bio/db/go.rb
index affbe66..62f78ba 100644
--- a/lib/bio/db/go.rb
+++ b/lib/bio/db/go.rb
@@ -1,8 +1,9 @@
 #
 # = bio/db/go.rb - Classes for Gene Ontology
 #
-# Copyright::   Copyright (C) 2003 
+# Copyright::   Copyright (C) 2003, 2010 
 #               Mitsuteru C. Nakao <n at bioruby.org>
+#               R. Stephan <ralf at ark.in-berlin.de>
 # License::     The Ruby License
 #
 #  $Id:$
@@ -174,8 +175,8 @@ class GO
   # = Bio::GO::GeneAssociation
   # $CVSROOT/go/gene-associations/gene_association.*
   #
-  # Data parser for the gene_association go annotation.
-  # See also the file format http://www.geneontology.org/doc/GO.annotation.html#file
+  # Data parser for the gene_association go annotation 1.0.
+  # See also the file format http://www.geneontology.org/GO.format.gaf-1_0.shtml
   #
   # == Example
   #
@@ -324,6 +325,20 @@ class GO
 
   end # class GeneAssociation   
 
+  # = Bio::GO::GeneAssociation2
+  #
+  # Data parser for the gene_association go annotation 2.0.
+  # See also the file format http://www.geneontology.org/GO.format.gaf-2_0.shtml
+  #
+  # == Example
+  #
+  #  mgi_data = File.open('gene_association.mgi').read
+  #  mgi = Bio::GO::GeneAssociation2.parser(mgi_data)
+  #
+  #  Bio::GO::GeneAssociation2.parser(mgi_data) do |entry|
+  #    p [entry.entry_id, entry.evidence, entry.goid]
+  #  end
+  #
   class GeneAssociation2 < GeneAssociation
 
     # Iterator through all entries
-- 
1.5.5

>From c6729520a9faf985975fb7f5b93128cdbe31b0e8 Mon Sep 17 00:00:00 2001
From: R. Stephan <ralf at ark.in-berlin.de>
Date: Tue, 3 Aug 2010 08:47:31 +0200
Subject: [PATCH] Add Phenote GOA file format parsing, GAF1 output

---
 lib/bio/db/go.rb |   61 +++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 files changed, 60 insertions(+), 1 deletions(-)

diff --git a/lib/bio/db/go.rb b/lib/bio/db/go.rb
index 62f78ba..b265c7e 100644
--- a/lib/bio/db/go.rb
+++ b/lib/bio/db/go.rb
@@ -358,12 +358,71 @@ class GO
       end
     end
 
-    # Bio::GO::GeneAssociation#to_str -> a line of gene_association file.
+    # Bio::GO::GeneAssociation2#to_str -> a line of gene_association file.
     def to_str
       return [super.to_str, @annotation_extension, @gene_product_form_id].join("\t")
     end
   end
 
+  # = Bio::GO::Phenote_GOA
+  #
+  # Data parser for the Phenote file format which is similar to GAF1.
+  # We serialize to GAF1 format (to_str).
+  # See http://www.phenote.org
+  # See also the file format http://www.geneontology.org/GO.format.gaf-1_0.shtml
+  #
+  # == Example
+  #
+  #  mgi_data = File.open('gene_association.mgi').read
+  #  mgi = Bio::GO::Phenote_GOA.parser(mgi_data)
+  #
+  #  Bio::GO::Phenote_GOA.parser(mgi_data) do |entry|
+  #    p entry.to_str
+  #  end
+  
+  class Phenote_GOA < GeneAssociation
+
+    # Returns an Array of entries parsed from a Phenote file.
+    # Block is acceptable.  
+    def self.parser(str)
+      if block_given?
+        str.each_line(DELIMITER) {|line|
+          next if /^DB\t/ =~ line
+          yield Phenote_GOA.new(line)
+        }
+      else
+        galist = []
+        str.each_line(DELIMITER) {|line|
+          next if /^DB\t/ =~ line
+          galist << Phenote_GOA.new(line)
+        }
+        return galist
+      end
+    end
+
+    # Assign fields of an entry (in a line) in Phenote format.  
+    def assign(tmp) 
+      @db                = tmp[0] 
+      @db_object_id      = tmp[1]
+      @db_object_symbol  = tmp[2]
+      @qualifier         = tmp[3]  # 
+      @goid              = tmp[4]
+      # We ignore Phenote's tmp[5]
+      @db_reference      = ArrayOrString.new(tmp[6].split(/\|/))  #
+      @evidence          = tmp[7]
+      @with              = ArrayOrString.new(tmp[8].split(/\|/))  # 
+      @aspect            = tmp[9]
+      @db_object_name    = tmp[10]  #
+      @db_object_synonym = ArrayOrString.new(tmp[11].split(/\|/)) #
+      @db_object_type    = tmp[12]
+      @taxon             = tmp[13] # taxon:4932
+      @date              = tmp[14] # 20010118
+      @assigned_by       = tmp[15] 
+      # We ignore Phenote's tmp[16-18]
+    end
+  end
+
+  #
 # = Container class for files in geneontology.org/go/external2go/*2go.
   #
   # The line syntax is: 
-- 
1.5.5



Ralf Stephan
http://www.ark.in-berlin.de
pub   1024D/C5114CB2 2009-06-07 [expires: 2011-06-06]
      Key fingerprint = 76AE 0D21 C06C CBF9 24F8  7835 1809 DE97 C511 4CB2








More information about the BioRuby mailing list