[BioRuby-cvs] bioruby/lib/bio/db newick.rb,1.7,1.8
Naohisa Goto
ngoto at dev.open-bio.org
Wed Dec 12 16:06:24 UTC 2007
Update of /home/repository/bioruby/bioruby/lib/bio/db
In directory dev.open-bio.org:/tmp/cvs-serv19312/lib/bio/db
Modified Files:
newick.rb
Log Message:
* lib/bio/db/newick.rb:
Changed to be compliant with Gary Olsen's Interpretation of
the "Newick's 8:45" Tree Format Standard. In addition, RDoc is improved.
* test/unit/bio/db/test_newick.rb
More tests are added.
* ChangeLog
ChangeLog entries for 12/Dec/2007 are added.
Index: newick.rb
===================================================================
RCS file: /home/repository/bioruby/bioruby/lib/bio/db/newick.rb,v
retrieving revision 1.7
retrieving revision 1.8
diff -C2 -d -r1.7 -r1.8
*** newick.rb 5 Apr 2007 23:35:40 -0000 1.7
--- newick.rb 12 Dec 2007 16:06:22 -0000 1.8
***************
*** 9,13 ****
--- 9,23 ----
# $Id$
#
+ # == Description
+ #
+ # This file contains parser and formatter of Newick and NHX.
+ #
+ # == References
+ #
+ # * http://evolution.genetics.washington.edu/phylip/newick_doc.html
+ # * http://www.phylosoft.org/forester/NHX.html
+ #
+ require 'strscan'
require 'bio/tree'
***************
*** 19,22 ****
--- 29,33 ----
#+++
+ # default options
DEFAULT_OPTIONS =
{ :indent => ' ' }
***************
*** 33,40 ****
private :__get_option
# formats leaf
def __to_newick_format_leaf(node, edge, options)
! label = get_node_name(node).to_s
dist = get_edge_distance_string(edge)
--- 44,67 ----
private :__get_option
+
+ # formats Newick label (unquoted_label or quoted_label)
+ def __to_newick_format_label(str, options)
+ if __get_option(:parser, options) == :naive then
+ return str.to_s
+ end
+ str = str.to_s
+ if /([\(\)\,\:\[\]\_\'\x00-\x1f\x7f])/ =~ str then
+ # quoted_label
+ return "\'" + str.gsub(/\'/, "\'\'") + "\'"
+ end
+ # unquoted_label
+ return str.gsub(/ /, '_')
+ end
+ private :__to_newick_format_label
+
# formats leaf
def __to_newick_format_leaf(node, edge, options)
! label = __to_newick_format_label(get_node_name(node), options)
dist = get_edge_distance_string(edge)
***************
*** 63,67 ****
def __to_newick_format_leaf_NHX(node, edge, options)
! label = get_node_name(node).to_s
dist = get_edge_distance_string(edge)
--- 90,94 ----
def __to_newick_format_leaf_NHX(node, edge, options)
! label = __to_newick_format_label(get_node_name(node), options)
dist = get_edge_distance_string(edge)
***************
*** 166,174 ****
# If block is given, the order of the node is sorted
# (as the same manner as Enumerable#sort).
! # Description about options.
! # :indent : indent string; set false to disable (default: ' ')
! # :bootstrap_style : :disabled disables bootstrap representations
! # :traditional traditional style
! # :molphy Molphy style (default)
def output_newick(options = {}, &block) #:yields: node1, node2
root = @root
--- 193,204 ----
# If block is given, the order of the node is sorted
# (as the same manner as Enumerable#sort).
! #
! # Available options:
! # <tt>:indent</tt>::
! # indent string; set false to disable (default: ' ')
! # <tt>:bootstrap_style</tt>::
! # <tt>:disabled</tt> disables bootstrap representations.
! # <tt>:traditional</tt> for traditional style.
! # <tt>:molphy</tt> for Molphy style (default).
def output_newick(options = {}, &block) #:yields: node1, node2
root = @root
***************
*** 186,191 ****
# If block is given, the order of the node is sorted
# (as the same manner as Enumerable#sort).
! # Description about options.
! # :indent : indent string; set false to disable (default: ' ')
def output_nhx(options = {}, &block) #:yields: node1, node2
root = @root
--- 216,224 ----
# If block is given, the order of the node is sorted
# (as the same manner as Enumerable#sort).
! #
! # Available options:
! # <tt>:indent</tt>::
! # indent string; set false to disable (default: ' ')
! #
def output_nhx(options = {}, &block) #:yields: node1, node2
root = @root
***************
*** 258,268 ****
# _options_ for parsing can be set.
#
! # Note: molphy-style bootstrap values may be parsed, even if
! # the options[:bootstrap_style] is set to :traditional or :disabled.
! # Note: By default, if all of the internal node's names are numeric
# and there are no NHX and no molphy-style boostrap values,
# the names of internal nodes are regarded as bootstrap values.
! # options[:bootstrap_style] = :disabled or :molphy to disable the feature
! # (or at least one NHX tag exists).
def initialize(str, options = nil)
str = str.sub(/\;(.*)/m, ';')
--- 291,316 ----
# _options_ for parsing can be set.
#
! # Available options:
! # <tt>:bootstrap_style</tt>::
! # <tt>:traditional</tt> for traditional bootstrap style,
! # <tt>:molphy</tt> for molphy style,
! # <tt>:disabled</tt> to ignore bootstrap strings.
! # For details of default actions, please read the notes below.
! # <tt>:parser</tt>::
! # <tt>:naive</tt> for using naive parser, compatible with
! # BioRuby 1.1.0, which ignores quoted strings and
! # do not convert underscores to spaces.
! #
! # Notes for bootstrap style:
! # Molphy-style bootstrap values may always be parsed, even if
! # the <tt>options[:bootstrap_style]</tt> is set to
! # <tt>:traditional</tt> or <tt>:disabled</tt>.
! #
! # Note for default or traditional bootstrap style:
! # By default, if all of the internal node's names are numeric
# and there are no NHX and no molphy-style boostrap values,
# the names of internal nodes are regarded as bootstrap values.
! # <tt>options[:bootstrap_style] = :disabled</tt> or <tt>:molphy</tt>
! # to disable the feature (or at least one NHX tag exists).
def initialize(str, options = nil)
str = str.sub(/\;(.*)/m, ';')
***************
*** 309,354 ****
# Parses newick formatted leaf (or internal node) name.
! def __parse_newick_leaf(str, node, edge, options)
! case str
! when /(.*)\:(.*)\[(.*)\]/
! node.name = $1
! edge.distance_string = $2 if $2 and !($2.strip.empty?)
! # bracketted string into bstr
! bstr = $3
! when /(.*)\[(.*)\]/
! node.name = $1
! # bracketted string into bstr
! bstr = $2
! when /(.*)\:(.*)/
! node.name = $1
! edge.distance_string = $2 if $2 and !($2.strip.empty?)
! else
! node.name = str
end
! # determines NHX or Molphy-style bootstrap
! if bstr and !(bstr.strip.empty?)
case __get_option(:original_format, options)
when :nhx
# regarded as NHX string which might be broken
! __parse_nhx(bstr, node, edge)
when :traditional
# simply ignored
else
! case bstr
when /\A\&\&NHX/
# NHX string
# force to set NHX mode
@options[:original_format] = :nhx
! __parse_nhx(bstr, node, edge)
else
# Molphy-style boostrap values
# let molphy mode if nothing determined
@options[:original_format] ||= :molphy
node.bootstrap_string = bstr
! end #case bstr
end
end
# returns true
true
--- 357,410 ----
# Parses newick formatted leaf (or internal node) name.
! def __parse_newick_leaf(leaf_tokens, node, edge, options)
! t = leaf_tokens.shift
! if !t.kind_of?(Symbol) then
! node.name = t
! t = leaf_tokens.shift
end
! if t == :':' then
! t = leaf_tokens.shift
! if !t.kind_of?(Symbol) then
! edge.distance_string = t if t and !(t.strip.empty?)
! t = leaf_tokens.shift
! end
! end
!
! if t == :'[' then
! btokens = leaf_tokens
case __get_option(:original_format, options)
when :nhx
# regarded as NHX string which might be broken
! __parse_nhx(btokens, node, edge)
when :traditional
# simply ignored
else
! case btokens[0].to_s.strip
! when ''
! # not automatically determined
when /\A\&\&NHX/
# NHX string
# force to set NHX mode
@options[:original_format] = :nhx
! __parse_nhx(btokens, node, edge)
else
# Molphy-style boostrap values
# let molphy mode if nothing determined
@options[:original_format] ||= :molphy
+ bstr = ''
+ while t = btokens.shift and t != :']'
+ bstr.concat t.to_s
+ end
node.bootstrap_string = bstr
! end #case btokens[0]
end
end
+ if !btokens and !leaf_tokens.empty? then
+ # syntax error?
+ end
+ node.name ||= '' # compatibility for older BioRuby
+
# returns true
true
***************
*** 356,363 ****
# Parses NHX (New Hampshire eXtended) string
! def __parse_nhx(bstr, node, edge)
! a = bstr.split(/\:/)
! a.shift if a[0] == '&&NHX'
! a.each do |str|
tag, val = str.split(/\=/, 2)
case tag
--- 412,420 ----
# Parses NHX (New Hampshire eXtended) string
! def __parse_nhx(btokens, node, edge)
! btokens.shift if btokens[0] == '&&NHX'
! btokens.each do |str|
! break if str == :']'
! next if str.kind_of?(Symbol)
tag, val = str.split(/\=/, 2)
case tag
***************
*** 392,395 ****
--- 449,543 ----
end
+ # splits string to tokens
+ def __parse_newick_tokenize(str, options)
+ str = str.chop if str[-1..-1] == ';'
+ # http://evolution.genetics.washington.edu/phylip/newick_doc.html
+ # quoted_label ==> ' string_of_printing_characters '
+ # single quote in quoted_label is '' (two single quotes)
+ #
+
+ if __get_option(:parser, options) == :naive then
+ ary = str.split(/([\(\)\,\:\[\]])/)
+ ary.collect! { |x| x.strip!; x.empty? ? nil : x }
+ ary.compact!
+ ary.collect! do |x|
+ if /\A([\(\)\,\:\[\]])\z/ =~ x then
+ x.intern
+ else
+ x
+ end
+ end
+ return ary
+ end
+
+ tokens = []
+ ss = StringScanner.new(str)
+
+ while !(ss.eos?)
+ if ss.scan(/\s+/) then
+ # do nothing
+
+ elsif ss.scan(/[\(\)\,\:\[\]]/) then
+ # '(' or ')' or ',' or ':' or '[' or ']'
+ t = ss.matched
+ tokens.push t.intern
+
+ elsif ss.scan(/\'/) then
+ # quoted_label
+ t = ''
+ while true
+ if ss.scan(/([^\']*)\'/) then
+ t.concat ss[1]
+ if ss.scan(/\'/) then
+ # single quote in quoted_label
+ t.concat ss.matched
+ else
+ break
+ end
+ else
+ # incomplete quoted_label?
+ break
+ end
+ end #while true
+ unless ss.match?(/\s*[\(\)\,\:\[\]]/) or ss.match?(/\s*\z/) then
+ # label continues? (illegal, but try to rescue)
+ if ss.scan(/[^\(\)\,\:\[\]]+/) then
+ t.concat ss.matched.lstrip
+ end
+ end
+ tokens.push t
+
+ elsif ss.scan(/[^\(\)\,\:\[\]]+/) then
+ # unquoted_label
+ t = ss.matched.strip
+ t.gsub!(/[\r\n]/, '')
+ # unquoted underscore should be converted to blank
+ t.gsub!(/\_/, ' ')
+ tokens.push t unless t.empty?
+
+ else
+ # unquoted_label in end of string
+ t = ss.rest.strip
+ t.gsub!(/[\r\n]/, '')
+ # unquoted underscore should be converted to blank
+ t.gsub!(/\_/, ' ')
+ tokens.push t unless t.empty?
+ ss.terminate
+
+ end
+ end #while !(ss.eos?)
+
+ tokens
+ end
+
+ # get tokens for a leaf
+ def __parse_newick_get_tokens_for_leaf(ary)
+ r = []
+ while t = ary[0] and t != :',' and t != :')' and t != :'('
+ r.push ary.shift
+ end
+ r
+ end
+
# Parses newick formatted string.
def __parse_newick(str, options = {})
***************
*** 402,409 ****
node_stack = []
# preparation of tokens
! str = str.chop if str[-1..-1] == ';'
! ary = str.split(/([\(\)\,])/)
! ary.collect! { |x| x.strip!; x.empty? ? nil : x }
! ary.compact!
previous_token = nil
# main loop
--- 550,554 ----
node_stack = []
# preparation of tokens
! ary = __parse_newick_tokenize(str, options)
previous_token = nil
# main loop
***************
*** 411,416 ****
#p token
case token
! when ','
! if previous_token == ',' or previous_token == '(' then
# there is a leaf whose name is empty.
ary.unshift(token)
--- 556,561 ----
#p token
case token
! when :','
! if previous_token == :',' or previous_token == :'(' then
# there is a leaf whose name is empty.
ary.unshift(token)
***************
*** 418,422 ****
token = nil
end
! when '('
node = Node.new
nodes << node
--- 563,567 ----
token = nil
end
! when :'('
node = Node.new
nodes << node
***************
*** 424,429 ****
node_stack.push(cur_node)
cur_node = node
! when ')'
! if previous_token == ',' or previous_token == '(' then
# there is a leaf whose name is empty.
ary.unshift(token)
--- 569,574 ----
node_stack.push(cur_node)
cur_node = node
! when :')'
! if previous_token == :',' or previous_token == :'(' then
# there is a leaf whose name is empty.
ary.unshift(token)
***************
*** 432,439 ****
else
edge = Edge.new
! next_token = ary[0]
! if next_token and next_token != ',' and next_token != ')' then
! __parse_newick_leaf(next_token, cur_node, edge, options)
! ary.shift
end
parent = node_stack.pop
--- 577,584 ----
else
edge = Edge.new
! leaf_tokens = __parse_newick_get_tokens_for_leaf(ary)
! token = nil
! if leaf_tokens.size > 0 then
! __parse_newick_leaf(leaf_tokens, cur_node, edge, options)
end
parent = node_stack.pop
***************
*** 445,449 ****
leaf = Node.new
edge = Edge.new
! __parse_newick_leaf(token, leaf, edge, options)
nodes << leaf
edges << Bio::Relation.new(cur_node, leaf, edge)
--- 590,597 ----
leaf = Node.new
edge = Edge.new
! ary.unshift(token)
! leaf_tokens = __parse_newick_get_tokens_for_leaf(ary)
! token = nil
! __parse_newick_leaf(leaf_tokens, leaf, edge, options)
nodes << leaf
edges << Bio::Relation.new(cur_node, leaf, edge)
More information about the bioruby-cvs
mailing list