[BioRuby-cvs] bioruby/lib/bio phylogenetictree.rb,NONE,1.1

Naohisa Goto ngoto at dev.open-bio.org
Thu Oct 5 13:38:24 UTC 2006


Update of /home/repository/bioruby/bioruby/lib/bio
In directory dev.open-bio.org:/tmp/cvs-serv12262/lib/bio

Added Files:
	phylogenetictree.rb 
Log Message:
* lib/bio/phylogenetictree.rb: Bio::PhylogeneticTree is phylogenetic tree
  data structure class.
* lib/bio/db/newick.rb: Bio::Newick is the Newick Standard (aka.
  New Hampshire Format) phylogenetic tree parser. Some methods for
  formatting Newick output also exists in this file.


--- NEW FILE: phylogenetictree.rb ---
#
# = bio/phylogenetictree.rb - phylogenetic tree data structure class
#
# Copyright::   Copyright (C) 2006
#               Naohisa Goto <ng at bioruby.org>
# License::     Ruby's
#
# $Id: phylogenetictree.rb,v 1.1 2006/10/05 13:38:21 ngoto Exp $
#

require 'matrix'
require 'bio/pathway'

module Bio

  # This is the class for phylogenetic tree.
  # It stores a phylogenetic tree.
  # 
  # Internally, it is based on Bio::Pathway class.
  # However, users cannot handle Bio::Pathway object directly.
  #
  # This is alpha version. Incompatible changes may be made frequently.
  class PhylogeneticTree

    # Error when there are no path between specified nodes
    class NoPathError < RuntimeError; end

    # Error when the two nodes are not adjacent.
    class NotAdjacentNodesError < RuntimeError; end

    # Edge object of each node.
    # By default, the object doesn't contain any node information.
    class Edge

      # creates a new edge.
      def initialize(distance = nil)
        if distance.kind_of?(Numeric)
          self.distance = distance
        elsif distance
          self.distance_string = distance
        end
      end

      # evolutionary distance
      attr_reader :distance

      # evolutionary distance represented as a string
      attr_reader :distance_string

      # set evolutionary distance value
      def distance=(num)
        @distance = num
        @distance_string = (num ? num.to_s : num)
      end

      # set evolutionary distance value from a string
      def distance_string=(str)
        if str.to_s.strip.empty?
          @distance = nil
          @distance_string = str
        else
          @distance = str.to_f
          @distance_string = str
        end
      end

      # visualization of this object
      def inspect
        "<Edge distance=#{@distance.inspect}>"
      end

      # string representation of this object
      def to_s
        @distance_string.to_s
      end
    end #class Edge

    # Gets distance value from the given edge.
    # Returns float or any other numeric value or nil.
    def get_edge_distance(edge)
      begin
        dist = edge.distance
      rescue NoMethodError
        dist = edge
      end
      dist
    end

    # Gets distance string from the given edge.
    # Returns a string or nil.
    def get_edge_distance_string(edge)
      begin
        dist = edge.distance_string
      rescue NoMethodError
        dist = (edge ? edge.to_s : nil)
      end
      dist
    end

    # Returns edge1 + edge2
    def get_edge_merged(edge1, edge2)
      dist1 = get_edge_distance(edge1)
      dist2 = get_edge_distance(edge2)
      if dist1 and dist2 then
        Edge.new(dist1 + dist2)
      elsif dist1 then
        Edge.new(dist1)
      elsif dist2 then
        Edge.new(dist2)
      else
        Edge.new
      end
    end

    # Node object.
    class Node

      # Creates a new node.
      def initialize(name = nil)
        @name = name if name
      end

      # name of the node
      attr_accessor :name

      # bootstrap value
      attr_reader :bootstrap

      # bootstrap value as a string
      attr_reader :bootstrap_string

      # sets a bootstrap value
      def bootstrap=(num)
        @bootstrap_string = (num ? num.to_s : num)
        @bootstrap = num
      end

      # sets a bootstrap value from a string
      def bootstrap_string=(str)
        if str.to_s.strip.empty?
          @bootstrap = nil
          @bootstrap_string = str
        else
          i = str.to_i
          f = str.to_f
          @bootstrap = (i == f ? i : f)
          @bootstrap_string = str
        end
      end

      # visualization of this object
      def inspect
        if @name and !@name.empty? then
          str = "(Node:#{@name.inspect}"
        else
          str = sprintf('(Node:%x', (self.__id__ << 1) & 0xffffffff)
        end
        str += " bootstrap=#{@bootstrap.inspect}" if @bootstrap
        str += ")"
        str
      end

      # string representation of this object
      def to_s
        @name.to_s
      end
    end #class Node

    # Gets node name
    def get_node_name(node)
      begin
        node.name
      rescue NoMethodError
        node.to_s
      end
    end

    def get_node_bootstrap(node)
      begin
        node.bootstrap
      rescue NoMethodError
        nil
      end
    end

    def get_node_bootstrap_string(node)
      begin
        node.bootstrap_string
      rescue NoMethodError
        nil
      end
    end

    # Creates a new phylogenetic tree.
    # When no arguments are given, it creates a new empty tree.
    # When a PhylogeneticTree object is given, it copies the tree.
    # Note that the  new tree shares Node and Edge objects
    # with the given tree.
    def initialize(tree = nil)
      # creates an undirected adjacency list graph
      @pathway = Bio::Pathway.new([], true)
      @root = nil
      @options = {}
      self.concat(tree) if tree
    end

    # root node of this tree
    # (even if unrooted tree, it is used by some methods)
    attr_accessor :root

    # tree options; mainly used for tree output
    attr_accessor :options

    # Clears all nodes and edges.
    # Returns self.
    # Note that options and root are also cleared.
    def clear
      initialize
      self
    end

    # Returns all nodes as an array.
    def nodes
      @pathway.graph.keys
    end

    # Number of nodes.
    def number_of_nodes
      @pathway.nodes
    end

    # Iterates over each node of this tree.
    def each_node(&x) #:yields: node
      @pathway.graph.each_key(&x)
      self
    end

    # Iterates over each edges of this tree.
    def each_edge #:yields: source, target, edge
      @pathway.relations.each do |rel|
        yield rel.node[0], rel.node[1], rel.relation
      end
      self
    end

    # Returns all edges an array of [ node0, node1, edge ]
    def edges
      @pathway.relations.collect do |rel|
        [ rel.node[0], rel.node[1], rel.relation ]
      end
    end

    # Returns number of edges in the tree.
    def number_of_edges
      @pathway.relations.size
    end

    # Returns an array of adjacent nodes of the given node.
    def adjacent_nodes(node)
      h = @pathway.graph[node]
      h ? h.keys : []
    end

    # Returns all connected edges with adjacent nodes.
    # Returns an array of the array [ source, target, edge ].
    #
    # The reason why the method name is "out_edges" is that
    # it comes from the Boost Graph Library.
    def out_edges(source)
      h = @pathway.graph[source]
      if h
        h.collect { |key, val| [ source, key, val ] }
      else
        []
      end
    end

    # Iterates over each connected edges of the given node.
    # Returns self.
    #
    # The reason why the method name is "each_out_edge" is that
    # it comes from the Boost Graph Library.
    def each_out_edge(source) #:yields: source, target, edge
      h = @pathway.graph[source]
      h.each { |key, val| yield source, key, val } if h
      self
    end

    # Returns number of edges in the given node.
    #
    # The reason why the method name is "out_degree" is that
    # it comes from the Boost Graph Library.
    def out_degree(source)
      h = @pathway.graph[source]
      h ? h.size : 0
    end

    # Returns an edge from source to target.
    # If source and target are not adjacent nodes, returns nil.
    def get_edge(source, target)
      h = @pathway.graph[source]
      h ? h[target] : nil
    end

    # Adds a new edge to the tree.
    # Returns the newly added edge.
    # If the edge already exists, it is overwritten with new one.
    def add_edge(source, target, edge = Edge.new)
      @pathway.append(Bio::Relation.new(source, target, edge))
      edge
    end

    # Adds a node to the tree.
    # Returns self.
    # If the node already exists, it does nothing.
    def add_node(node)
      @pathway.graph[node] ||= {}
      self
    end

    # If the node exists, returns true.
    # Otherwise, returns false.
    def include?(node)
      @pathway.graph[node] ? true : false
    end

    # Removes all edges connected with the node.
    # Returns self.
    # If the node does not exist, raises IndexError.
    def clear_node(node)
      unless self.include?(node)
        raise IndexError, 'the node does not exist'
      end
      @pathway.relations.delete_if do |rel|
        rel.node.include?(node)
      end
      @pathway.graph[node].each_key do |k|
        @pathway.graph[k].delete(node)
      end
      self
    end

    # Removes the given node from the tree.
    # All edges connected with the node are also removed.
    # Returns self.
    # If the node does not exist, raises IndexError.
    def remove_node(node)
      self.clear_node(node)
      @pathway.graph.delete(node)
      self
    end

    # Removes each node if the block returns not nil.
    # All edges connected with the removed nodes are also removed.
    # Returns self.
    def remove_node_if
      all = self.nodes
      all.each do |node|
        if yield node then
          self.clear_node(node)
          @pathway.graph.delete(node)
        end
      end
      self
    end

    # Removes an edge between source and target.
    #---
    # If two or more edges exists between source and target,
    # all of them are removed.
    #+++
    def remove_edge(source, target)
      fwd = [ source, target ]
      rev = [ target, source ]
      @pathway.relations.delete_if do |rel|
        rel.node == fwd or rel.node == rev
      end
      h = @pathway.graph[source]
      h.delete(target) if h
      h = @pathway.graph[target]
      h.delete(source) if h
      self
    end

    # Removes each edge if the block returns not nil.
    # Returns self.
    def remove_edge_if #:yields: source, target, edge
      removed_rel = []
      @pathway.relations.delete_if do |rel|
        if yield rel.node[0], rel.node[1], edge then
          removed_rel << rel
          true
        end
      end
      removed_rel.each do |rel|
        source = rel[0]
        target = rel[1]
        h = @pathway.graph[source]
        h.delete(target) if h
        h = @pathway.graph[target]
        h.delete(source) if h
      end
      self
    end

    # Replaces each node by each block's return value.
    # Returns self.
    def collect_node! #:yields: node
      tr = {}
      self.each_node do |node|
        tr[node] = yield node
      end
      # replaces nodes in @pathway.relations
      @pathway.relations.each do |rel|
        rel.node.collect! { |node| tr[node] }
      end
      # re-generates @pathway from relations
      @pathway.to_list
      # adds orphan nodes
      tr.each_value do |newnode|
        @pathway.graph[newnode] ||= {}
      end
      self
    end

    # Replaces each edge by each block's return value.
    # Returns self.
    def collect_edge! #:yields: source, target, edge
      @pathway.relations.each do |rel|
        newedge = yield rel.node[0], rel.node[1], rel.relation
        rel.relation = newedge
        @pathway.append(rel, false)
      end
      self
    end

    # Gets the sub-tree consisted of given nodes.
    # _nodes_ must be an array of nodes.
    # Nodes that do not exist in the original tree are ignored.
    # Returns a PhylogeneticTree object.
    # Note that the sub-tree shares Node and Edge objects
    # with the original tree.
    def subtree(nodes)
      nodes = nodes.find_all do |x|
        @pathway.graph[x]
      end
      return self.class.new if nodes.empty?
      # creates subtree
      new_tree = self.class.new
      nodes.each do |x|
        new_tree.add_node(x)
      end
      self.each_edge do |node1, node2, edge|
        if new_tree.include?(node1) and new_tree.include?(node2) then
          new_tree.add_edge(node1, node2, edge)
        end
      end
      return new_tree
    end

    # Gets the sub-tree consisted of given nodes and
    # all internal nodes connected between given nodes.
    # _nodes_ must be an array of nodes.
    # Nodes that do not exist in the original tree are ignored.
    # Returns a PhylogeneticTree object.
    # The result is unspecified for cyclic trees.
    # Note that the sub-tree shares Node and Edge objects
    # with the original tree.
    def subtree_with_all_paths(nodes)
      hash = {}
      nodes.each { |x| hash[x] = true }
      nodes.each_index do |i|
        node1 = nodes[i]
        (0...i).each do |j|
          node2 = nodes[j]
          unless node1 == node2 then
            begin
              path = self.path(node1, node2)
            rescue IndexError, NoPathError
              path = []
            end
            path.each { |x| hash[x] = true }
          end
        end
      end
      self.subtree(hash.keys)
    end

    # Concatenates the other tree.
    # If the same edge exists, the edge in _other_ is used.
    # Returns self.
    # The result is unspecified if _other_ isn't a PhylogeneticTree object.
    # Note that the Node and Edge objects in the _other_ tree are
    # shared in the concatinated tree.
    def concat(other)
      #raise TypeError unless other.kind_of?(self.class)
      other.each_node do |node|
        self.add_node(node)
      end
      other.each_edge do |node1, node2, edge|
        self.add_edge(node1, node2, edge)
      end
      self
    end

    # Gets path from node1 to node2.
    # Retruns an array of nodes, including node1 and node2.
    # If node1 and/or node2 do not exist, IndexError is raised.
    # If node1 and node2 are not connected, NoPathError is raised.
    # The result is unspecified for cyclic trees.
    def path(node1, node2)
      raise IndexError, 'node1 not found' unless @pathway.graph[node1]
      raise IndexError, 'node2 not found' unless @pathway.graph[node2]
      return [ node1 ] if node1 == node2
      step, path = @pathway.bfs_shortest_path(node1, node2)
      unless path[0] == node1 and path[-1] == node2 then
        raise NoPathError, 'node1 and node2 are not connected'
      end
      path
    end

    # Iterates over each edge from node1 to node2.
    # The result is unspecified for cyclic trees.
    def each_edge_in_path(node1, node2)
      path = self.path(node1, node2)
      source = path.shift
      path.each do |target|
        edge = self.get_edge(source, target)
        yield source, target, edge
        source = target
      end
      self
    end

    # Returns distance between node1 and node2.
    # It would raise error if the edges didn't contain distance values.
    # The result is unspecified for cyclic trees.
    def distance(node1, node2)
      distance = 0
      self.each_edge_in_path(node1, node2) do |source, target, edge|
        distance += get_edge_distance(edge)
      end
      distance
    end

    # Gets the parent node of the _node_.
    # If _root_ isn't specified or _root_ is <code>nil</code>, @root is used.
    # Returns an <code>Node</code> object or nil.
    # The result is unspecified for cyclic trees.
    def parent(node, root = nil)
      root ||= @root
      self.path(root, node)[-2]
    end

    # Gets the adjacent children nodes of the _node_.
    # If _root_ isn't specified or _root_ is <code>nil</code>, @root is used.
    # Returns an array of <code>Node</code>s.
    # The result is unspecified for cyclic trees.
    def children(node, root = nil)
      root ||= @root
      path = self.path(root, node)
      result = self.adjacent_nodes(node)
      result -= path
      result
    end

    # Gets all descendent nodes of the _node_.
    # If _root_ isn't specified or _root_ is <code>nil</code>, @root is used.
    # Returns an array of <code>Node</code>s.
    # The result is unspecified for cyclic trees.
    def descendents(node, root = nil)
      root ||= @root
      distance, route = @pathway.breadth_first_search(root)
      d = distance[node]
      result = []
      distance.each do |key, val|
        if val > d then
          x = key
          while x = route[x]
            if x == node then
              result << key
              break
            end
            break if distance[x] <= d
          end
        end
      end
      result
    end

    # If _node_ is nil, returns an array of 
    # all leaves (nodes connected with one edge).
    # Otherwise, gets all descendent leaf nodes of the _node_.
    # If _root_ isn't specified or _root_ is <code>nil</code>, @root is used.
    # Returns an array of <code>Node</code>s.
    # The result is unspecified for cyclic trees.
    def leaves(node = nil, root = nil)
      unless node then
        nodes = []
        self.each_node do |x|
          nodes << x if self.out_degree(x) == 1
        end
        return nodes
      else
        root ||= @root
        self.descendents(node, root).find_all do |x|
          self.adjacent_nodes(x).size == 1
        end
      end
    end

    # Gets all ancestral nodes of the _node_.
    # If _root_ isn't specified or _root_ is <code>nil</code>, @root is used.
    # Returns an array of <code>Node</code>s.
    # The result is unspecified for cyclic trees.
    def ancestors(node, root = nil)
      root ||= @root
      (self.path(root, node) - [ node ]).reverse
    end

    # Gets the lowest common ancestor of the two nodes.
    # If _root_ isn't specified or _root_ is <code>nil</code>, @root is used.
    # Returns a <code>Node</code> object or nil.
    # The result is unspecified for cyclic trees.
    def lowest_common_ancestor(node1, node2, root = nil)
      root ||= @root
      distance, route = @pathway.breadth_first_search(root)
      x = node1; r1 = []
      begin; r1 << x; end while x = route[x]
      x = node2; r2 = []
      begin; r2 << x; end while x = route[x]
      return (r1 & r2).first
    end


    # Calculates distance matrix of given nodes.
    # If _nodes_ is nil, or is ommited, it acts the same as
    # tree.distance_matrix(tree.leaves).
    # Returns a matrix object.
    # The result is unspecified for cyclic trees.
    # Note 1: The diagonal values of the matrix are 0.
    # Note 2: If the distance cannot be calculated, nil will be set.
    def distance_matrix(nodes = nil)
      nodes ||= self.leaves
      matrix = []
      nodes.each_index do |i|
        row = []
        nodes.each_index do |j|
          if i == j then
            distance = 0
          elsif r = matrix[j] and val = r[i] then
            distance = val
          else
            distance = (self.distance(nodes[i], nodes[j]) rescue nil)
          end
          row << distance
        end
        matrix << row
      end
      Matrix.rows(matrix, false)
    end

    # Shows the adjacency matrix representation of the tree.
    # It shows matrix only for given nodes.
    # If _nodes_ is nil or is ommitted,
    # it acts the same as tree.adjacency_matrix(tree.nodes).
    # If a block is given, for each edge,
    # it yields _source_, _target_, and _edge_, and
    # uses the returned value of the block.
    # Without blocks, it uses edge.
    # Returns a matrix object.
    def adjacency_matrix(nodes = nil,
                         default_value = nil,
                         diagonal_value = nil) #:yields: source, target, edge
      nodes ||= self.nodes
      size = nodes.size
      hash = {}
      nodes.each_with_index { |x, i| hash[x] = i }
      # prepares an matrix
      matrix = Array.new(size, nil)
      matrix.collect! { |x| Array.new(size, default_value) }
      (0...size).each { |i| matrix[i][i] = diagonal_value }
      # fills the matrix from each edge
      self.each_edge do |source, target, edge|
        i_source = hash[source]
        i_target = hash[target]
        if i_source and i_target then
          val = block_given? ? (yield source, target, edge) : edge
          matrix[i_source][i_target] = val
          matrix[i_target][i_source] = val
        end
      end
      Matrix.rows(matrix, false)
    end

    # Removes all nodes that are not branches nor leaves.
    # That is, removes nodes connected with exactly two edges.
    # For each removed node, two adjacent edges are merged and
    # a new edge are created.
    # Returns removed nodes.
    # Note that orphan nodes are still kept unchanged.
    def remove_nonsense_nodes
      hash = {}
      self.each_node do |node|
        hash[node] = true if @pathway.graph[node].size == 2
      end
      hash.each_key do |node|
        adjs = @pathway.graph[node].keys
        edges = @pathway.graph[node].values
        new_edge = get_edge_merged(edges[0], edges[1])
        @pathway.graph[adjs[0]].delete(node)
        @pathway.graph[adjs[1]].delete(node)
        @pathway.graph.delete(node)
        @pathway.append(Bio::Relation.new(adjs[0], adjs[1], new_edge))
      end
      #@pathway.to_relations
      @pathway.relations.reject! do |rel|
        hash[rel.node[0]] or hash[rel.node[1]]
      end
      return hash.keys
    end

    # Insert a new node between adjacent nodes node1 and node2.
    # The old edge between node1 and node2 are changed to the edge
    # between new_node and node2.
    # The edge between node1 and new_node is newly created.
    #
    # If new_distance is specified, the distance between
    # node1 and new_node is set to new_distance, and
    # distance between new_node and node2 is set to
    # <code>tree.get_edge(node1, node2).distance - new_distance</code>.
    #
    # Returns self.
    # If node1 and node2 are not adjacent, raises NotAdjacentNodesError.
    #
    # If new_node already exists in the tree, the tree would become
    # circular. In addition, if the edge between new_node and
    # node1 (or node2) already exists, it will be erased.
    def insert_node(node1, node2, new_node, new_distance = nil)
      unless edge = self.get_edge(node1, node2) then
        raise NotAdjacentNodesError, 'node1 and node2 are not adjacent.'
      end
      new_edge = Edge.new(new_distance)
      self.remove_edge(node1, node2)
      self.add_edge(node1, new_node, new_edge)
      if new_distance and old_distance = get_edge_distance(edge) then
        old_distance -= new_distance
        begin
          edge.distance = old_distance
        rescue NoMethodError
          edge = old_distance
        end
      end
      self.add_edge(new_node, node2, edge)
      self
    end
  end #class PhylogeneticTree
end #module Bio

#---
# temporary added
#+++
require 'bio/db/newick'





More information about the bioruby-cvs mailing list