[BioRuby-cvs] bioruby/lib/bio/db newick.rb,1.7,1.8

Naohisa Goto ngoto at dev.open-bio.org
Wed Dec 12 16:06:24 UTC 2007


Update of /home/repository/bioruby/bioruby/lib/bio/db
In directory dev.open-bio.org:/tmp/cvs-serv19312/lib/bio/db

Modified Files:
	newick.rb 
Log Message:
* lib/bio/db/newick.rb:
  Changed to be compliant with the Gary Olsen's Interpretation of
  the "Newick's 8:45" Tree Format Standard. In addtion, RDoc is improved.
* test/unit/bio/db/test_newick.rb
  More tests are added.
* ChangeLog
  ChangeLog in 12/Dec/2007 are added.


Index: newick.rb
===================================================================
RCS file: /home/repository/bioruby/bioruby/lib/bio/db/newick.rb,v
retrieving revision 1.7
retrieving revision 1.8
diff -C2 -d -r1.7 -r1.8
*** newick.rb	5 Apr 2007 23:35:40 -0000	1.7
--- newick.rb	12 Dec 2007 16:06:22 -0000	1.8
***************
*** 9,13 ****
--- 9,23 ----
  # $Id$
  #
+ # == Description
+ #
+ # This file contains parser and formatter of Newick and NHX.
+ #
+ # == References
+ #
+ # * http://evolution.genetics.washington.edu/phylip/newick_doc.html
+ # * http://www.phylosoft.org/forester/NHX.html
+ #
  
+ require 'strscan'
  require 'bio/tree'
  
***************
*** 19,22 ****
--- 29,33 ----
      #+++
  
+     # default options
      DEFAULT_OPTIONS =
        { :indent => '  ' }
***************
*** 33,40 ****
      private :__get_option
  
      # formats leaf
      def __to_newick_format_leaf(node, edge, options)
  
!       label = get_node_name(node).to_s
  
        dist = get_edge_distance_string(edge)
--- 44,67 ----
      private :__get_option
  
+ 
+     # formats Newick label (unquoted_label or quoted_label)
+     def __to_newick_format_label(str, options)
+       if __get_option(:parser, options) == :naive then
+         return str.to_s
+       end
+       str = str.to_s
+       if /([\(\)\,\:\[\]\_\'\x00-\x1f\x7f])/ =~ str then
+         # quoted_label
+         return "\'" + str.gsub(/\'/, "\'\'") + "\'"
+       end
+       # unquoted_label
+       return str.gsub(/ /, '_')
+     end
+     private :__to_newick_format_label
+ 
      # formats leaf
      def __to_newick_format_leaf(node, edge, options)
  
!       label = __to_newick_format_label(get_node_name(node), options)
  
        dist = get_edge_distance_string(edge)
***************
*** 63,67 ****
      def __to_newick_format_leaf_NHX(node, edge, options)
  
!       label = get_node_name(node).to_s
  
        dist = get_edge_distance_string(edge)
--- 90,94 ----
      def __to_newick_format_leaf_NHX(node, edge, options)
  
!       label = __to_newick_format_label(get_node_name(node), options)
  
        dist = get_edge_distance_string(edge)
***************
*** 166,174 ****
      # If block is given, the order of the node is sorted
      # (as the same manner as Enumerable#sort).
!     # Description about options.
!     #  :indent : indent string; set false to disable (default: '  ')
!     #  :bootstrap_style : :disabled    disables bootstrap representations
!     #                     :traditional traditional style
!     #                     :molphy      Molphy style (default)
      def output_newick(options = {}, &block) #:yields: node1, node2
        root = @root
--- 193,204 ----
      # If block is given, the order of the node is sorted
      # (as the same manner as Enumerable#sort).
!     #
!     # Available options:
!     # <tt>:indent</tt>::
!     #     indent string; set false to disable (default: '  ')
!     # <tt>:bootstrap_style</tt>::
!     #     <tt>:disabled</tt> disables bootstrap representations.
!     #     <tt>:traditional</tt> for traditional style.
!     #     <tt>:molphy</tt> for Molphy style (default).
      def output_newick(options = {}, &block) #:yields: node1, node2
        root = @root
***************
*** 186,191 ****
      # If block is given, the order of the node is sorted
      # (as the same manner as Enumerable#sort).
!     # Description about options.
!     #  :indent        : indent string; set false to disable (default: '  ')
      def output_nhx(options = {}, &block) #:yields: node1, node2
        root = @root
--- 216,224 ----
      # If block is given, the order of the node is sorted
      # (as the same manner as Enumerable#sort).
!     #
!     # Available options:
!     # <tt>:indent</tt>::
!     #     indent string; set false to disable (default: '  ')
!     #
      def output_nhx(options = {}, &block) #:yields: node1, node2
        root = @root
***************
*** 258,268 ****
      # _options_ for parsing can be set.
      #
!     # Note: molphy-style bootstrap values may be parsed, even if
!     # the options[:bootstrap_style] is set to :traditional or :disabled.
!     # Note: By default, if all of the internal node's names are numeric
      # and there are no NHX and no molphy-style boostrap values,
      # the names of internal nodes are regarded as bootstrap values.
!     # options[:bootstrap_style] = :disabled or :molphy to disable the feature
!     # (or at least one NHX tag exists).
      def initialize(str, options = nil)
        str = str.sub(/\;(.*)/m, ';')
--- 291,316 ----
      # _options_ for parsing can be set.
      #
!     # Available options:
!     # <tt>:bootstrap_style</tt>::
!     #     <tt>:traditional</tt> for traditional bootstrap style,
!     #     <tt>:molphy</tt> for molphy style,
!     #     <tt>:disabled</tt> to ignore bootstrap strings.
!     #     For details of default actions, please read the notes below.
!     # <tt>:parser</tt>::
!     #     <tt>:naive</tt> for using naive parser, compatible with
!     #     BioRuby 1.1.0, which ignores quoted strings and
!     #     do not convert underscores to spaces. 
!     #
!     # Notes for bootstrap style:
!     # Molphy-style bootstrap values may always be parsed, even if
!     # the <tt>options[:bootstrap_style]</tt> is set to
!     # <tt>:traditional</tt> or <tt>:disabled</tt>.
!     #
!     # Note for default or traditional bootstrap style:
!     # By default, if all of the internal node's names are numeric
      # and there are no NHX and no molphy-style boostrap values,
      # the names of internal nodes are regarded as bootstrap values.
!     # <tt>options[:bootstrap_style] = :disabled</tt> or <tt>:molphy</tt>
!     # to disable the feature (or at least one NHX tag exists).
      def initialize(str, options = nil)
        str = str.sub(/\;(.*)/m, ';')
***************
*** 309,354 ****
  
      # Parses newick formatted leaf (or internal node) name.
!     def __parse_newick_leaf(str, node, edge, options)
!       case str
!       when /(.*)\:(.*)\[(.*)\]/
!         node.name = $1
!         edge.distance_string = $2 if $2 and !($2.strip.empty?)
!         # bracketted string into bstr
!         bstr = $3
!       when /(.*)\[(.*)\]/
!         node.name = $1
!         # bracketted string into bstr
!         bstr = $2
!       when /(.*)\:(.*)/
!         node.name = $1
!         edge.distance_string = $2 if $2 and !($2.strip.empty?)
!       else
!         node.name = str
        end
  
!       # determines NHX or Molphy-style bootstrap
!       if bstr and !(bstr.strip.empty?)
          case __get_option(:original_format, options)
          when :nhx
            # regarded as NHX string which might be broken
!           __parse_nhx(bstr, node, edge)
          when :traditional
            # simply ignored
          else
!           case bstr
            when /\A\&\&NHX/
              # NHX string
              # force to set NHX mode
              @options[:original_format] = :nhx
!             __parse_nhx(bstr, node, edge)
            else
              # Molphy-style boostrap values
              # let molphy mode if nothing determined
              @options[:original_format] ||= :molphy
              node.bootstrap_string = bstr
!           end #case bstr
          end
        end
  
        # returns true
        true
--- 357,410 ----
  
      # Parses newick formatted leaf (or internal node) name.
!     def __parse_newick_leaf(leaf_tokens, node, edge, options)
!       t = leaf_tokens.shift
!       if !t.kind_of?(Symbol) then
!         node.name = t
!         t = leaf_tokens.shift
        end
  
!       if t == :':' then
!         t = leaf_tokens.shift
!         if !t.kind_of?(Symbol) then
!           edge.distance_string = t if t and !(t.strip.empty?)
!           t = leaf_tokens.shift
!         end
!       end
! 
!       if t == :'[' then
!         btokens = leaf_tokens
          case __get_option(:original_format, options)
          when :nhx
            # regarded as NHX string which might be broken
!           __parse_nhx(btokens, node, edge)
          when :traditional
            # simply ignored
          else
!           case btokens[0].to_s.strip
!           when ''
!             # not automatically determined
            when /\A\&\&NHX/
              # NHX string
              # force to set NHX mode
              @options[:original_format] = :nhx
!             __parse_nhx(btokens, node, edge)
            else
              # Molphy-style boostrap values
              # let molphy mode if nothing determined
              @options[:original_format] ||= :molphy
+             bstr = ''
+             while t = btokens.shift and t != :']'
+               bstr.concat t.to_s
+             end
              node.bootstrap_string = bstr
!           end #case btokens[0]
          end
        end
  
+       if !btokens and !leaf_tokens.empty? then
+         # syntax error?
+       end
+       node.name ||= '' # compatibility for older BioRuby
+ 
        # returns true
        true
***************
*** 356,363 ****
  
      # Parses NHX (New Hampshire eXtended) string
!     def __parse_nhx(bstr, node, edge)
!       a = bstr.split(/\:/)
!       a.shift if a[0] == '&&NHX'
!       a.each do |str|
          tag, val = str.split(/\=/, 2)
          case tag
--- 412,420 ----
  
      # Parses NHX (New Hampshire eXtended) string
!     def __parse_nhx(btokens, node, edge)
!       btokens.shift if btokens[0] == '&&NHX'
!       btokens.each do |str|
!         break if str == :']'
!         next if str.kind_of?(Symbol)
          tag, val = str.split(/\=/, 2)
          case tag
***************
*** 392,395 ****
--- 449,543 ----
      end
  
+     # splits string to tokens
+     def __parse_newick_tokenize(str, options)
+       str = str.chop if str[-1..-1] == ';'
+       # http://evolution.genetics.washington.edu/phylip/newick_doc.html
+       # quoted_label ==> ' string_of_printing_characters '
+       # single quote in quoted_label is '' (two single quotes)
+       #
+ 
+       if __get_option(:parser, options) == :naive then
+         ary = str.split(/([\(\)\,\:\[\]])/)
+         ary.collect! { |x| x.strip!; x.empty? ? nil : x }
+         ary.compact!
+         ary.collect! do |x|
+           if /\A([\(\)\,\:\[\]])\z/ =~ x then
+             x.intern
+           else
+             x
+           end
+         end
+         return ary
+       end
+ 
+       tokens = []
+       ss = StringScanner.new(str)
+ 
+       while !(ss.eos?)
+         if ss.scan(/\s+/) then
+           # do nothing
+ 
+         elsif ss.scan(/[\(\)\,\:\[\]]/) then
+           # '(' or ')' or ',' or ':' or '[' or ']'
+           t = ss.matched
+           tokens.push t.intern
+ 
+         elsif ss.scan(/\'/) then
+           # quoted_label
+           t = ''
+           while true
+             if ss.scan(/([^\']*)\'/) then
+               t.concat ss[1]
+               if  ss.scan(/\'/) then
+                 # single quote in quoted_label
+                 t.concat ss.matched
+               else
+                 break
+               end
+             else
+               # incomplete quoted_label?
+               break
+             end
+           end #while true
+           unless ss.match?(/\s*[\(\)\,\:\[\]]/) or ss.match?(/\s*\z/) then
+             # label continues? (illegal, but try to rescue)
+             if ss.scan(/[^\(\)\,\:\[\]]+/) then
+               t.concat ss.matched.lstrip
+             end
+           end
+           tokens.push t
+ 
+         elsif ss.scan(/[^\(\)\,\:\[\]]+/) then
+           # unquoted_label
+           t = ss.matched.strip
+           t.gsub!(/[\r\n]/, '')
+           # unquoted underscore should be converted to blank
+           t.gsub!(/\_/, ' ')
+           tokens.push t unless t.empty?
+ 
+         else
+           # unquoted_label in end of string
+           t = ss.rest.strip
+           t.gsub!(/[\r\n]/, '')
+           # unquoted underscore should be converted to blank
+           t.gsub!(/\_/, ' ')
+           tokens.push t unless t.empty?
+           ss.terminate
+ 
+         end
+       end #while !(ss.eos?)
+ 
+       tokens
+     end
+ 
+     # get tokens for a leaf
+     def __parse_newick_get_tokens_for_leaf(ary)
+       r = []
+       while t = ary[0] and t != :',' and t != :')' and t != :'('
+         r.push ary.shift
+       end
+       r
+     end
+ 
      # Parses newick formatted string.
      def __parse_newick(str, options = {})
***************
*** 402,409 ****
        node_stack = []
        # preparation of tokens
!       str = str.chop if str[-1..-1] == ';'
!       ary = str.split(/([\(\)\,])/)
!       ary.collect! { |x| x.strip!; x.empty? ? nil : x }
!       ary.compact!
        previous_token = nil
        # main loop
--- 550,554 ----
        node_stack = []
        # preparation of tokens
!       ary = __parse_newick_tokenize(str, options)
        previous_token = nil
        # main loop
***************
*** 411,416 ****
          #p token
          case token
!         when ','
!           if previous_token == ',' or previous_token == '(' then
              # there is a leaf whose name is empty.
              ary.unshift(token)
--- 556,561 ----
          #p token
          case token
!         when :','
!           if previous_token == :',' or previous_token == :'(' then
              # there is a leaf whose name is empty.
              ary.unshift(token)
***************
*** 418,422 ****
              token = nil
            end
!         when '('
            node = Node.new
            nodes << node
--- 563,567 ----
              token = nil
            end
!         when :'('
            node = Node.new
            nodes << node
***************
*** 424,429 ****
            node_stack.push(cur_node)
            cur_node = node
!         when ')'
!           if previous_token == ',' or previous_token == '(' then
              # there is a leaf whose name is empty.
              ary.unshift(token)
--- 569,574 ----
            node_stack.push(cur_node)
            cur_node = node
!         when :')'
!           if previous_token == :',' or previous_token == :'(' then
              # there is a leaf whose name is empty.
              ary.unshift(token)
***************
*** 432,439 ****
            else
              edge = Edge.new
!             next_token = ary[0]
!             if next_token and next_token != ',' and next_token != ')' then
!               __parse_newick_leaf(next_token, cur_node, edge, options)
!               ary.shift
              end
              parent = node_stack.pop
--- 577,584 ----
            else
              edge = Edge.new
!             leaf_tokens = __parse_newick_get_tokens_for_leaf(ary)
!             token = nil
!             if leaf_tokens.size > 0 then
!               __parse_newick_leaf(leaf_tokens, cur_node, edge, options)
              end
              parent = node_stack.pop
***************
*** 445,449 ****
            leaf = Node.new
            edge = Edge.new
!           __parse_newick_leaf(token, leaf, edge, options)
            nodes << leaf
            edges << Bio::Relation.new(cur_node, leaf, edge)
--- 590,597 ----
            leaf = Node.new
            edge = Edge.new
!           ary.unshift(token)
!           leaf_tokens = __parse_newick_get_tokens_for_leaf(ary)
!           token = nil
!           __parse_newick_leaf(leaf_tokens, leaf, edge, options)
            nodes << leaf
            edges << Bio::Relation.new(cur_node, leaf, edge)




More information about the bioruby-cvs mailing list