[BioRuby-cvs] bioruby/lib/bio/db/kegg taxonomy.rb,NONE,1.1

Katayama Toshiaki k at dev.open-bio.org
Mon Jul 9 08:48:06 UTC 2007


Update of /home/repository/bioruby/bioruby/lib/bio/db/kegg
In directory dev.open-bio.org:/tmp/cvs-serv29795

Added Files:
	taxonomy.rb 
Log Message:
* Bio::KEGG::Taxonomy class which treats taxonomyc tree structure of the
  KEGG taxonomy file available at ftp.genome.jp/pub/kegg/genes/taxonomy
  which is a replacement of the previously keggtab file (Bio::KEGG::Keggtab).


--- NEW FILE: taxonomy.rb ---
#
# = bio/db/kegg/taxonomy.rb - KEGG taxonomy parser class
#
# Copyright::  Copyright (C) 2007 Toshiaki Katayama <k at bioruby.org>
# License::    The Ruby License
#
#  $Id: taxonomy.rb,v 1.1 2007/07/09 08:48:03 k Exp $
#

module Bio
class KEGG

# == Description
#
# Parse the KEGG 'taxonomy' file which describes taxonomic classification
# of organisms.
#
# == References
#
# The KEGG 'taxonomy' file is available at
#
# * ftp://ftp.genome.jp/pub/kegg/genes/taxonomy
#
class Taxonomy

  def initialize(filename, orgs = [])
    @tree = Hash.new
    @path = Array.new
    @leaves = Hash.new

    # ¥ë¡¼¥È¥Î¡¼¥É¤ò Genes ¤È¤¹¤ë
    @root = 'Genes'

    hier = Array.new
    level = 0
    label = nil

    File.open(filename).each do |line|
      next if line.strip.empty?

      # ¥¿¥¯¥½¥Î¥ß¡¼³¬ÁØ¹Ô - # ¤Î¸Ä¿ô¤Ç³¬Áؤò¥¤¥ó¥Ç¥ó¥È¤¹¤ë½èÍý
      if line[/^#/]
	level = line[/^#+/].length
	label = line[/[A-z].*/]
	hier[level] = sanitize(label)

      # À¸Êª¼ï¥ê¥¹¥È¹Ô - À¸Êª¼ï¥³¡¼¥É¤È¥¹¥È¥ì¥¤¥ó°ã¤¤¤ò¤Þ¤È¤á¤ë½èÍý
      else
	tax, org, name, desc = line.chomp.split("\t")
        if orgs.nil? or orgs.empty? or orgs.include?(org)
          species, strain, = name.split('_')
          # (0) species ̾¤¬Ä¾Á°¤Î¹Ô¤Î¤â¤Î¤ÈƱ¤¸¾ì¹ç¡¢¤½¤Î¥°¥ë¡¼¥×¤ËÄɲÃ
          #  Gamma/enterobacteria ¤Ê¤É³ç¤ê¤¬Âç¤Þ¤«¤Ç¼ï¿ô¤Î¿¤¤¥°¥ë¡¼¥×¤ò
          #  Ʊ¤¸¼ï̾¡Ê¥¹¥È¥ì¥¤¥ó°ã¤¤¡Ë¤´¤È¤Ë¥µ¥Ö¥°¥ë¡¼¥×²½¤¹¤ë¤Î¤¬ÌÜŪ
          #   ex. E.coli, H.influenzae ¤Ê¤É
          # ¥È¥ê¥Ã¥­¡¼¤ÊÉôʬ¡§
          #  ¤â¤· species ̾¤¬¾å°ÌÃæ´Ö¥Î¡¼¥É¡Ê### ¹Ô¤Ê¤É¡Ë¤ÈƱ¤¸¡Ê´û½Ð¡Ë¤Ç¤¢¤ì¤Ð
          #  Tree ¤ò Hash ¤Ç»ý¤Ä»ÅÍͤȥ³¥ó¥Õ¥ê¥¯¥È¤¹¤ë¤Î¤ÇÊÌ̾¤¬É¬Í×
          # (1) species ̾¤¬·ÏÅý¤Î°Û¤Ê¤ë¾å°ÌÃæ´Ö¥Î¡¼¥É̾¤ÈƱ¤¸¾ì¹ç
          #   ¢ª ¤È¤ê¤¢¤¨¤º species ̾¤Ë _sp ¤ò¤Ä¤±¤Æ¥³¥ó¥Õ¥ê¥¯¥È¤òÈò¤±¤ë (1-1)
          #      ¤¹¤Ç¤Ë _sp ¤â»È¤ï¤ì¤Æ¤¤¤ë¾ì¹ç¤Ï strain ̾¤ò»È¤¦ (1-2)
          #   ex. Bacteria/Proteobacteria/Beta/T.denitrificans/tbd ¤È
          #       Bacteria/Proteobacteria/Epsilon/T.denitrificans_ATCC33889/tdn
          #    -> Bacteria/Proteobacteria/Beta/T.denitrificans/tbd ¤È
          #       Bacteria/Proteobacteria/Epsilon/T.denitrificans_sp/tdn
          # (2) species ̾¤¬¾åµ­Ãæ´Ö¥Î¡¼¥É̾¤ÈƱ¤¸¾ì¹ç
          #   ¢ª ¤È¤ê¤¢¤¨¤º species ̾¤Ë _sp ¤ò¤Ä¤±¤Æ¥³¥ó¥Õ¥ê¥¯¥È¤òÈò¤±¤ë
          #   ex. Bacteria/Cyanobacgteria/Cyanobacteria_CYA/cya
          #       Bacteria/Cyanobacgteria/Cyanobacteria_CYB/cya
          #       Bacteria/Proteobacteria/Magnetococcus/Magnetococcus_MC1/mgm
          #    -> Bacteria/Cyanobacgteria/Cyanobacteria_sp/cya
          #       Bacteria/Cyanobacgteria/Cyanobacteria_sp/cya
          #       Bacteria/Proteobacteria/Magnetococcus/Magnetococcus_sp/mgm
          sp_group = "#{species}_sp"
          if @tree[species]
            if hier[level+1] == species
              # case (0)
            else
              # case (1-1)
              species = sp_group
              # case (1-2)
              if @tree[sp_group] and hier[level+1] != species
                species = name
              end
            end
          else
            if hier[level] == species
              # case (2)
              species = sp_group
            end
          end
          # hier ¤Ï [nil, Eukaryotes, Fungi, Ascomycetes, Saccharomycetes] ¤Ë
          # species ¤È org ¤Î [S_cerevisiae, sce] ¤ò²Ã¤¨¤¿·Á¼°
          hier[level+1] = species
          #hier[level+1] = sanitize(species)
          hier[level+2] = org
          ary = hier[1, level+2]
          warn ary.inspect if $DEBUG
          add_to_tree(ary)
          add_to_leaves(ary)
          add_to_path(ary)
        end
      end
    end
    return tree
  end

  attr_reader :tree
  attr_reader :path
  attr_reader :leaves
  attr_accessor :root

  def organisms(group)
    @leaves[group]
  end

  # root ¥Î¡¼¥É¤Î²¼¤Ë [node, subnode, subsubnode, ..., leaf] ¤Ê¥Ñ¥¹¤òÄɲÃ
  # ³ÆÃæ´Ö¥Î¡¼¥É¤¬»ÒÍ×ÁǤò¥Ï¥Ã¥·¥å¤ÇÊÝ»ý
  def add_to_tree(ary)
    parent = @root
    ary.each do |node|
      @tree[parent] ||= Hash.new
      @tree[parent][node] = nil
      parent = node
    end
  end

  # ³ÆÃæ´Ö¥Î¡¼¥É¤ËÂбþ¤¹¤ë¥ê¡¼¥Õ¤Î¥ê¥¹¥È¤òÊÝ»ý
  def add_to_leaves(ary)
    leaf = ary.last
    ary.each do |node|
      @leaves[node] ||= Array.new
      @leaves[node] << leaf
    end
  end

  # ³ÆÃæ´Ö¥Î¡¼¥É¤Þ¤Ç¤Î¥Ñ¥¹¤òÊÝ»ý
  def add_to_path(ary)
    @path << ary
  end

  # ¿Æ¥Î¡¼¥É¤«¤é¸«¤Æ»Ò¥Î¡¼¥É¤¬Â¹¥Î¡¼¥É¤ò£±¤Ä¤·¤«»ý¤Ã¤Æ¤¤¤Ê¤¤¾ì¹ç¡¢
  # ¹¥Î¡¼¥É¤Î»Ò¶¡¡Ê¤Ò¹¡Ë¤ò¡¢»Ò¥Î¡¼¥É¤Î»Ò¡Ê¹¡Ë¤È¤¹¤ë
  #
  # ex.
  #  Plants / Monocotyledons / grass family / osa --> Plants / Monocotyledons / osa
  #
  def compact(node = root)
    # »Ò¥Î¡¼¥É¤¬¤¢¤ê
    if subnodes = @tree[node]
      # ¤½¤ì¤¾¤ì¤Î»Ò¥Î¡¼¥É¤Ë¤Ä¤¤¤Æ
      subnodes.keys.each do |subnode|
        # ¹¥Î¡¼¥É¤ò¼èÆÀ
        if subsubnodes = @tree[subnode]
          # ¹¥Î¡¼¥É¤Î¿ô¤¬ 1 ¤Ä¤Î¾ì¹ç
          if subsubnodes.keys.size == 1
            # ¹¥Î¡¼¥É¤Î̾Á°¤ò¼èÆÀ
            subsubnode = subsubnodes.keys.first
            # ¹¥Î¡¼¥É¤Î»Ò¶¡¤ò¼èÆÀ
            if subsubsubnodes = @tree[subsubnode]
              # ¹¥Î¡¼¥É¤Î»Ò¶¡¤ò»Ò¥Î¡¼¥É¤Î»Ò¶¡¤Ë¤¹¤²¤«¤¨
              @tree[subnode] = subsubsubnodes
              # ¹¥Î¡¼¥É¤òºï½ü
              @tree[subnode].delete(subsubnode)
              warn "--- compact: #{subsubnode} is replaced by #{subsubsubnodes}" if $DEBUG
              # ¿·¤·¤¤Â¹¥Î¡¼¥É¤Ç¤â compact ¤¬É¬Íפ«¤â¤·¤ì¤Ê¤¤¤¿¤á·«¤êÊÖ¤¹
              retry
            end
          end
        end
        # »Ò¥Î¡¼¥É¤ò¿Æ¥Î¡¼¥É¤È¤·¤ÆºÆµ¢
        compact(subnode)
      end
    end
  end

  # ¥ê¡¼¥Õ¥Î¡¼¥É¤¬£±¤Ä¤Î¾ì¹ç¡¢¿Æ¥Î¡¼¥É¤ò¥ê¡¼¥Õ¥Î¡¼¥É¤Ë¤¹¤²¤«¤¨¤ë
  #
  # ex.
  #  Plants / Monocotyledons / osa --> Plants / osa
  #
  def reduce(node = root)
    # »Ò¥Î¡¼¥É¤¬¤¢¤ê
    if subnodes = @tree[node]
      # ¤½¤ì¤¾¤ì¤Î»Ò¥Î¡¼¥É¤Ë¤Ä¤¤¤Æ
      subnodes.keys.each do |subnode|
        # ¹¥Î¡¼¥É¤ò¼èÆÀ
        if subsubnodes = @tree[subnode]
          # ¹¥Î¡¼¥É¤Î¿ô¤¬ 1 ¤Ä¤Î¾ì¹ç
          if subsubnodes.keys.size == 1
            # ¹¥Î¡¼¥É¤Î̾Á°¤ò¼èÆÀ
            subsubnode = subsubnodes.keys.first
            # ¹¥Î¡¼¥É¤¬¥ê¡¼¥Õ¤Î¾ì¹ç
            unless @tree[subsubnode]
              # ¹¥Î¡¼¥É¤ò»Ò¥Î¡¼¥É¤Ë¤¹¤²¤«¤¨
              @tree[node].update(subsubnodes)
              # »Ò¥Î¡¼¥É¤òºï½ü
              @tree[node].delete(subnode)
              warn "--- reduce: #{subnode} is replaced by #{subsubnode}" if $DEBUG
            end
          end
        end
        # »Ò¥Î¡¼¥É¤ò¿Æ¥Î¡¼¥É¤È¤·¤ÆºÆµ¢
        reduce(subnode)
      end
    end
  end

  # Í¿¤¨¤é¤ì¤¿¥Î¡¼¥É¤È¡¢»Ò¥Î¡¼¥É¤Î¥ê¥¹¥È¡ÊHash¡Ë¤ò¤¦¤±¤È¤ê¡¢
  # »Ò¥Î¡¼¥É¤Ë¤Ä¤¤¤Æ¥¤¥Æ¥ì¡¼¥·¥ç¥ó¤¹¤ë
  def dfs(parent, &block)
    if children = @tree[parent]
      yield parent, children
      children.keys.each do |child|
        dfs(child, &block)
      end
    end
  end

  # ¸½ºß¤Î³¬Áؤο¼¤µ¤â¥¤¥Æ¥ì¡¼¥·¥ç¥ó¤ËÅϤ¹
  def dfs_with_level(parent, &block)
    @level ||= 0
    if children = @tree[parent]
      yield parent, children, @level
      @level += 1
      children.keys.each do |child|
        dfs_with_level(child, &block)
      end
      @level -= 1
    end
  end

  # ¥Ä¥ê¡¼¹½Â¤¤ò¥¢¥¹¥­¡¼¥¢¡¼¥È¤Çɽ¼¨¤¹¤ë
  def to_s
    result = "#{@root}\n"
    @tree[@root].keys.each do |node|
      result += subtree(node, "  ")
    end
    return result
  end

  private

  # ¾åµ­ to_s ÍѤβ¼ÀÁ¤±¥á¥½¥Ã¥É
  def subtree(node, indent)
    result = "#{indent}+- #{node}\n"
    indent += "  "
    @tree[node].keys.each do |child|
      if @tree[child]
        result += subtree(child, indent)
      else
        result += "#{indent}+- #{child}\n"
      end
    end
    return result
  end

  def sanitize(str)
    str.gsub(/[^A-z0-9]/, '_')
  end

end # Taxonomy

end # KEGG
end # Bio



if __FILE__ == $0

  # Usage:
  # % wget ftp://ftp.genome.jp/pub/kegg/genes/taxonomy
  # % ruby taxonomy.rb taxonomy | less -S

  taxonomy = ARGV.shift
  org_list = ARGV.shift || nil

  if org_list
    orgs = File.readlines(org_list).map{|x| x.strip}
  else
    orgs = nil
  end

  tree = Bio::KEGG::Taxonomy.new(taxonomy, orgs)

  puts ">>> tree - original"
  puts tree

  puts ">>> tree - after compact"
  tree.compact
  puts tree

  puts ">>> tree - after reduce"
  tree.reduce
  puts tree

  puts ">>> path - sorted"
  tree.path.sort.each do |path|
    puts path.join("/")
  end

  puts ">>> group : orgs"
  tree.dfs(tree.root) do |parent, children|
    if orgs = tree.organisms(parent)
      puts "#{parent.ljust(30)} (#{orgs.size})\t#{orgs.join(', ')}"
    end
  end

  puts ">>> group : subgroups"
  tree.dfs_with_level(tree.root) do |parent, children, level|
    subgroups = children.keys.sort
    indent = " " * level
    label  = "#{indent} #{level} #{parent}"
    puts "#{label.ljust(35)}\t#{subgroups.join(', ')}"
  end

end




More information about the bioruby-cvs mailing list