[BioRuby-cvs] bioruby/lib/bio/db/kegg taxonomy.rb,NONE,1.1
Katayama Toshiaki
k at dev.open-bio.org
Mon Jul 9 08:48:06 UTC 2007
Update of /home/repository/bioruby/bioruby/lib/bio/db/kegg
In directory dev.open-bio.org:/tmp/cvs-serv29795
Added Files:
taxonomy.rb
Log Message:
* Bio::KEGG::Taxonomy class which treats taxonomyc tree structure of the
KEGG taxonomy file available at ftp.genome.jp/pub/kegg/genes/taxonomy
which is a replacement of the previously keggtab file (Bio::KEGG::Keggtab).
--- NEW FILE: taxonomy.rb ---
#
# = bio/db/kegg/taxonomy.rb - KEGG taxonomy parser class
#
# Copyright:: Copyright (C) 2007 Toshiaki Katayama <k at bioruby.org>
# License:: The Ruby License
#
# $Id: taxonomy.rb,v 1.1 2007/07/09 08:48:03 k Exp $
#
module Bio
class KEGG
# == Description
#
# Parse the KEGG 'taxonomy' file which describes taxonomic classification
# of organisms.
#
# == References
#
# The KEGG 'taxonomy' file is available at
#
# * ftp://ftp.genome.jp/pub/kegg/genes/taxonomy
#
class Taxonomy
def initialize(filename, orgs = [])
@tree = Hash.new
@path = Array.new
@leaves = Hash.new
# ¥ë¡¼¥È¥Î¡¼¥É¤ò Genes ¤È¤¹¤ë
@root = 'Genes'
hier = Array.new
level = 0
label = nil
File.open(filename).each do |line|
next if line.strip.empty?
# ¥¿¥¯¥½¥Î¥ß¡¼³¬ÁØ¹Ô - # ¤Î¸Ä¿ô¤Ç³¬Áؤò¥¤¥ó¥Ç¥ó¥È¤¹¤ë½èÍý
if line[/^#/]
level = line[/^#+/].length
label = line[/[A-z].*/]
hier[level] = sanitize(label)
# À¸Êª¼ï¥ê¥¹¥È¹Ô - À¸Êª¼ï¥³¡¼¥É¤È¥¹¥È¥ì¥¤¥ó°ã¤¤¤ò¤Þ¤È¤á¤ë½èÍý
else
tax, org, name, desc = line.chomp.split("\t")
if orgs.nil? or orgs.empty? or orgs.include?(org)
species, strain, = name.split('_')
# (0) species ̾¤¬Ä¾Á°¤Î¹Ô¤Î¤â¤Î¤ÈƱ¤¸¾ì¹ç¡¢¤½¤Î¥°¥ë¡¼¥×¤ËÄɲÃ
# Gamma/enterobacteria ¤Ê¤É³ç¤ê¤¬Âç¤Þ¤«¤Ç¼ï¿ô¤Î¿¤¤¥°¥ë¡¼¥×¤ò
# Ʊ¤¸¼ï̾¡Ê¥¹¥È¥ì¥¤¥ó°ã¤¤¡Ë¤´¤È¤Ë¥µ¥Ö¥°¥ë¡¼¥×²½¤¹¤ë¤Î¤¬ÌÜŪ
# ex. E.coli, H.influenzae ¤Ê¤É
# ¥È¥ê¥Ã¥¡¼¤ÊÉôʬ¡§
# ¤â¤· species ̾¤¬¾å°ÌÃæ´Ö¥Î¡¼¥É¡Ê### ¹Ô¤Ê¤É¡Ë¤ÈƱ¤¸¡Ê´û½Ð¡Ë¤Ç¤¢¤ì¤Ð
# Tree ¤ò Hash ¤Ç»ý¤Ä»ÅÍͤȥ³¥ó¥Õ¥ê¥¯¥È¤¹¤ë¤Î¤ÇÊÌ̾¤¬É¬Í×
# (1) species ̾¤¬·ÏÅý¤Î°Û¤Ê¤ë¾å°ÌÃæ´Ö¥Î¡¼¥É̾¤ÈƱ¤¸¾ì¹ç
# ¢ª ¤È¤ê¤¢¤¨¤º species ̾¤Ë _sp ¤ò¤Ä¤±¤Æ¥³¥ó¥Õ¥ê¥¯¥È¤òÈò¤±¤ë (1-1)
# ¤¹¤Ç¤Ë _sp ¤â»È¤ï¤ì¤Æ¤¤¤ë¾ì¹ç¤Ï strain ̾¤ò»È¤¦ (1-2)
# ex. Bacteria/Proteobacteria/Beta/T.denitrificans/tbd ¤È
# Bacteria/Proteobacteria/Epsilon/T.denitrificans_ATCC33889/tdn
# -> Bacteria/Proteobacteria/Beta/T.denitrificans/tbd ¤È
# Bacteria/Proteobacteria/Epsilon/T.denitrificans_sp/tdn
# (2) species ̾¤¬¾åµÃæ´Ö¥Î¡¼¥É̾¤ÈƱ¤¸¾ì¹ç
# ¢ª ¤È¤ê¤¢¤¨¤º species ̾¤Ë _sp ¤ò¤Ä¤±¤Æ¥³¥ó¥Õ¥ê¥¯¥È¤òÈò¤±¤ë
# ex. Bacteria/Cyanobacgteria/Cyanobacteria_CYA/cya
# Bacteria/Cyanobacgteria/Cyanobacteria_CYB/cya
# Bacteria/Proteobacteria/Magnetococcus/Magnetococcus_MC1/mgm
# -> Bacteria/Cyanobacgteria/Cyanobacteria_sp/cya
# Bacteria/Cyanobacgteria/Cyanobacteria_sp/cya
# Bacteria/Proteobacteria/Magnetococcus/Magnetococcus_sp/mgm
sp_group = "#{species}_sp"
if @tree[species]
if hier[level+1] == species
# case (0)
else
# case (1-1)
species = sp_group
# case (1-2)
if @tree[sp_group] and hier[level+1] != species
species = name
end
end
else
if hier[level] == species
# case (2)
species = sp_group
end
end
# hier ¤Ï [nil, Eukaryotes, Fungi, Ascomycetes, Saccharomycetes] ¤Ë
# species ¤È org ¤Î [S_cerevisiae, sce] ¤ò²Ã¤¨¤¿·Á¼°
hier[level+1] = species
#hier[level+1] = sanitize(species)
hier[level+2] = org
ary = hier[1, level+2]
warn ary.inspect if $DEBUG
add_to_tree(ary)
add_to_leaves(ary)
add_to_path(ary)
end
end
end
return tree
end
attr_reader :tree
attr_reader :path
attr_reader :leaves
attr_accessor :root
def organisms(group)
@leaves[group]
end
# root ¥Î¡¼¥É¤Î²¼¤Ë [node, subnode, subsubnode, ..., leaf] ¤Ê¥Ñ¥¹¤òÄɲÃ
# ³ÆÃæ´Ö¥Î¡¼¥É¤¬»ÒÍ×ÁǤò¥Ï¥Ã¥·¥å¤ÇÊÝ»ý
def add_to_tree(ary)
parent = @root
ary.each do |node|
@tree[parent] ||= Hash.new
@tree[parent][node] = nil
parent = node
end
end
# ³ÆÃæ´Ö¥Î¡¼¥É¤ËÂбþ¤¹¤ë¥ê¡¼¥Õ¤Î¥ê¥¹¥È¤òÊÝ»ý
def add_to_leaves(ary)
leaf = ary.last
ary.each do |node|
@leaves[node] ||= Array.new
@leaves[node] << leaf
end
end
# ³ÆÃæ´Ö¥Î¡¼¥É¤Þ¤Ç¤Î¥Ñ¥¹¤òÊÝ»ý
def add_to_path(ary)
@path << ary
end
# ¿Æ¥Î¡¼¥É¤«¤é¸«¤Æ»Ò¥Î¡¼¥É¤¬Â¹¥Î¡¼¥É¤ò£±¤Ä¤·¤«»ý¤Ã¤Æ¤¤¤Ê¤¤¾ì¹ç¡¢
# ¹¥Î¡¼¥É¤Î»Ò¶¡¡Ê¤Ò¹¡Ë¤ò¡¢»Ò¥Î¡¼¥É¤Î»Ò¡Ê¹¡Ë¤È¤¹¤ë
#
# ex.
# Plants / Monocotyledons / grass family / osa --> Plants / Monocotyledons / osa
#
def compact(node = root)
# »Ò¥Î¡¼¥É¤¬¤¢¤ê
if subnodes = @tree[node]
# ¤½¤ì¤¾¤ì¤Î»Ò¥Î¡¼¥É¤Ë¤Ä¤¤¤Æ
subnodes.keys.each do |subnode|
# ¹¥Î¡¼¥É¤ò¼èÆÀ
if subsubnodes = @tree[subnode]
# ¹¥Î¡¼¥É¤Î¿ô¤¬ 1 ¤Ä¤Î¾ì¹ç
if subsubnodes.keys.size == 1
# ¹¥Î¡¼¥É¤Î̾Á°¤ò¼èÆÀ
subsubnode = subsubnodes.keys.first
# ¹¥Î¡¼¥É¤Î»Ò¶¡¤ò¼èÆÀ
if subsubsubnodes = @tree[subsubnode]
# ¹¥Î¡¼¥É¤Î»Ò¶¡¤ò»Ò¥Î¡¼¥É¤Î»Ò¶¡¤Ë¤¹¤²¤«¤¨
@tree[subnode] = subsubsubnodes
# ¹¥Î¡¼¥É¤òºï½ü
@tree[subnode].delete(subsubnode)
warn "--- compact: #{subsubnode} is replaced by #{subsubsubnodes}" if $DEBUG
# ¿·¤·¤¤Â¹¥Î¡¼¥É¤Ç¤â compact ¤¬É¬Íפ«¤â¤·¤ì¤Ê¤¤¤¿¤á·«¤êÊÖ¤¹
retry
end
end
end
# »Ò¥Î¡¼¥É¤ò¿Æ¥Î¡¼¥É¤È¤·¤ÆºÆµ¢
compact(subnode)
end
end
end
# ¥ê¡¼¥Õ¥Î¡¼¥É¤¬£±¤Ä¤Î¾ì¹ç¡¢¿Æ¥Î¡¼¥É¤ò¥ê¡¼¥Õ¥Î¡¼¥É¤Ë¤¹¤²¤«¤¨¤ë
#
# ex.
# Plants / Monocotyledons / osa --> Plants / osa
#
def reduce(node = root)
# »Ò¥Î¡¼¥É¤¬¤¢¤ê
if subnodes = @tree[node]
# ¤½¤ì¤¾¤ì¤Î»Ò¥Î¡¼¥É¤Ë¤Ä¤¤¤Æ
subnodes.keys.each do |subnode|
# ¹¥Î¡¼¥É¤ò¼èÆÀ
if subsubnodes = @tree[subnode]
# ¹¥Î¡¼¥É¤Î¿ô¤¬ 1 ¤Ä¤Î¾ì¹ç
if subsubnodes.keys.size == 1
# ¹¥Î¡¼¥É¤Î̾Á°¤ò¼èÆÀ
subsubnode = subsubnodes.keys.first
# ¹¥Î¡¼¥É¤¬¥ê¡¼¥Õ¤Î¾ì¹ç
unless @tree[subsubnode]
# ¹¥Î¡¼¥É¤ò»Ò¥Î¡¼¥É¤Ë¤¹¤²¤«¤¨
@tree[node].update(subsubnodes)
# »Ò¥Î¡¼¥É¤òºï½ü
@tree[node].delete(subnode)
warn "--- reduce: #{subnode} is replaced by #{subsubnode}" if $DEBUG
end
end
end
# »Ò¥Î¡¼¥É¤ò¿Æ¥Î¡¼¥É¤È¤·¤ÆºÆµ¢
reduce(subnode)
end
end
end
# Í¿¤¨¤é¤ì¤¿¥Î¡¼¥É¤È¡¢»Ò¥Î¡¼¥É¤Î¥ê¥¹¥È¡ÊHash¡Ë¤ò¤¦¤±¤È¤ê¡¢
# »Ò¥Î¡¼¥É¤Ë¤Ä¤¤¤Æ¥¤¥Æ¥ì¡¼¥·¥ç¥ó¤¹¤ë
def dfs(parent, &block)
if children = @tree[parent]
yield parent, children
children.keys.each do |child|
dfs(child, &block)
end
end
end
# ¸½ºß¤Î³¬Áؤο¼¤µ¤â¥¤¥Æ¥ì¡¼¥·¥ç¥ó¤ËÅϤ¹
def dfs_with_level(parent, &block)
@level ||= 0
if children = @tree[parent]
yield parent, children, @level
@level += 1
children.keys.each do |child|
dfs_with_level(child, &block)
end
@level -= 1
end
end
# ¥Ä¥ê¡¼¹½Â¤¤ò¥¢¥¹¥¡¼¥¢¡¼¥È¤Çɽ¼¨¤¹¤ë
def to_s
result = "#{@root}\n"
@tree[@root].keys.each do |node|
result += subtree(node, " ")
end
return result
end
private
# ¾åµ to_s ÍѤβ¼ÀÁ¤±¥á¥½¥Ã¥É
def subtree(node, indent)
result = "#{indent}+- #{node}\n"
indent += " "
@tree[node].keys.each do |child|
if @tree[child]
result += subtree(child, indent)
else
result += "#{indent}+- #{child}\n"
end
end
return result
end
def sanitize(str)
str.gsub(/[^A-z0-9]/, '_')
end
end # Taxonomy
end # KEGG
end # Bio
if __FILE__ == $0
# Usage:
# % wget ftp://ftp.genome.jp/pub/kegg/genes/taxonomy
# % ruby taxonomy.rb taxonomy | less -S
taxonomy = ARGV.shift
org_list = ARGV.shift || nil
if org_list
orgs = File.readlines(org_list).map{|x| x.strip}
else
orgs = nil
end
tree = Bio::KEGG::Taxonomy.new(taxonomy, orgs)
puts ">>> tree - original"
puts tree
puts ">>> tree - after compact"
tree.compact
puts tree
puts ">>> tree - after reduce"
tree.reduce
puts tree
puts ">>> path - sorted"
tree.path.sort.each do |path|
puts path.join("/")
end
puts ">>> group : orgs"
tree.dfs(tree.root) do |parent, children|
if orgs = tree.organisms(parent)
puts "#{parent.ljust(30)} (#{orgs.size})\t#{orgs.join(', ')}"
end
end
puts ">>> group : subgroups"
tree.dfs_with_level(tree.root) do |parent, children, level|
subgroups = children.keys.sort
indent = " " * level
label = "#{indent} #{level} #{parent}"
puts "#{label.ljust(35)}\t#{subgroups.join(', ')}"
end
end
More information about the bioruby-cvs
mailing list