[BioRuby-cvs] bioruby/lib/bio/db soft.rb,NONE,1.1

Fri Feb 2 06:13:12 UTC 2007

Update of /home/repository/bioruby/bioruby/lib/bio/db
In directory dev.open-bio.org:/tmp/cvs-serv12663/lib/bio/db

Added Files:
	soft.rb 
Log Message:
Addition of Bio::SOFT for reading SOFT formatted files in NCBI GEO

--- NEW FILE: soft.rb ---
#
# bio/db/soft.rb - Interface for SOFT formatted files
#
# Author::    Trevor Wennblom  <mailto:trevor at corevx.com>
# Copyright:: Copyright (c) 2007 Midwinter Laboratories, LLC (http://midwinterlabs.com)
# License::   Distributed under the same terms as Ruby
#
#  $Id: soft.rb,v 1.1 2007/02/02 06:13:10 trevor Exp $
#

module Bio #:nodoc:

#
# bio/db/soft.rb - Interface for SOFT formatted files
#
# Author::    Trevor Wennblom  <mailto:trevor at corevx.com>
# Copyright:: Copyright (c) 2007 Midwinter Laboratories, LLC (http://midwinterlabs.com)
# License::   Distributed under the same terms as Ruby
#
#
# = Description
# 
# "SOFT (Simple Omnibus in Text Format) is a compact, simple, line-based, 
# ASCII text format that incorporates experimental data and metadata."
# -- <em>GEO, National Center for Biotechnology Information</em>
#
# The Bio::SOFT module reads SOFT Series or Platform formatted files that 
# contain information
# describing one database, one series, one platform, and many samples (GEO
# accessions).  The data from the file can then be viewed with Ruby methods.
#
# Bio::SOFT also supports the reading of SOFT DataSet files which contain
# one database, one dataset, and many subsets.
#
# Format specification is located here:
# * http://www.ncbi.nlm.nih.gov/projects/geo/info/soft2.html#SOFTformat
#
# SOFT data files may be directly downloaded here:
# * ftp://ftp.ncbi.nih.gov/pub/geo/DATA/SOFT
#
# NCBI's Gene Expression Omnibus (GEO) is here:
# * http://www.ncbi.nlm.nih.gov/geo
#
# = Usage
# 
# If an attribute has more than one value then the values are stored in an
# Array of String objects.  Otherwise the attribute is stored as a String.
#
# The platform and each sample may contain a table of data.  A dataset from a
# DataSet file may also contain a table.
#
# Attributes are dynamically created based on the data in the file.
# Predefined keys have not been created in advance due to the variability of
# SOFT files in-the-wild.
#
# Keys are generally stored as Symbols.  Keys for samples and table
# headings may alternatively be accessed with Strings.
# The names of samples (geo accessions) are case sensitive.  Table headers 
# are case insensitive.
#
#   require 'bio'
#
#   lines = IO.readlines('GSE3457_family.soft') 
#   soft = Bio::SOFT.new(lines)
#   
#   soft.platform[:geo_accession]             # => "GPL2092"
#   soft.platform[:organism]                  # => "Populus"
#   soft.platform[:contributor]               # => ["Jingyi,,Li", "Olga,,Shevchenko", "Steve,H,Strauss", "Amy,M,Brunner"]
#   soft.platform[:data_row_count]            # => "240"
#   soft.platform.keys.sort {|a,b| a.to_s <=> b.to_s}[0..2] # => [:contact_address, :contact_city, :contact_country]
#   soft.platform[:"contact_zip/postal_code"] # => "97331"
#   soft.platform[:table].header              # => ["ID", "GB_ACC", "SPOT_ID", "Function/Family", "ORGANISM", "SEQUENCE"]
#   soft.platform[:table].header_description  # => {"ORGANISM"=>"sequence sources", "SEQUENCE"=>"oligo sequence used", "Function/Family"=>"gene functions and family", "ID"=>"", "SPOT_ID"=>"", "GB_ACC"=>"Gene bank accession number"}
#   soft.platform[:table].rows.size           # => 240
#   soft.platform[:table].rows[5]             # => ["A039P68U", "AI163321", "", "TF, flowering protein CONSTANS", "P. tremula x P. tremuloides", "AGAAAATTCGATATACTGTCCGTAAAGAGGTAGCACTTAGAATGCAACGGAATAAAGGGCAGTTCACCTC"]
#   soft.platform[:table].rows[5][4]          # => "P. tremula x P. tremuloides"
#   soft.platform[:table].rows[5][:organism]  # => "P. tremula x P. tremuloides"
#   soft.platform[:table].rows[5]['ORGANISM'] # => "P. tremula x P. tremuloides"
#   
#   soft.series[:geo_accession]               # => "GSE3457"
#   soft.series[:contributor]                 # => ["Jingyi,,Li", "Olga,,Shevchenko", "Ove,,Nilsson", "Steve,H,Strauss", "Amy,M,Brunner"]
#   soft.series[:platform_id]                 # => "GPL2092"
#   soft.series[:sample_id].size              # => 74
#   soft.series[:sample_id][0..4]             # => ["GSM77557", "GSM77558", "GSM77559", "GSM77560", "GSM77561"]
#   
#   soft.database[:name]                      # => "Gene Expression Omnibus (GEO)"
#   soft.database[:ref]                       # => "Nucleic Acids Res. 2005 Jan 1;33 Database Issue:D562-6"
#   soft.database[:institute]                 # => "NCBI NLM NIH"
#   
#   soft.samples.size                         # => 74
#   soft.samples[:GSM77600][:series_id]       # => "GSE3457"
#   soft.samples['GSM77600'][:series_id]      # => "GSE3457"
#   soft.samples[:GSM77600][:platform_id]     # => "GPL2092"
#   soft.samples[:GSM77600][:type]            # => "RNA"
#   soft.samples[:GSM77600][:title]           # => "jst2b2"
#   soft.samples[:GSM77600][:table].header    # => ["ID_REF", "VALUE"]
#   soft.samples[:GSM77600][:table].header_description # => {"ID_REF"=>"", "VALUE"=>"normalized signal intensities"}
#   soft.samples[:GSM77600][:table].rows.size # => 217
#   soft.samples[:GSM77600][:table].rows[5]   # => ["A039P68U", "8.19"]
#   soft.samples[:GSM77600][:table].rows[5][0]        # => "A039P68U"
#   soft.samples[:GSM77600][:table].rows[5][:id_ref]  # => "A039P68U"
#   soft.samples[:GSM77600][:table].rows[5]['ID_REF'] # => "A039P68U"
#
#   
#   lines = IO.readlines('GDS100.soft') 
#   soft = Bio::SOFT.new(lines)
#   
#   soft.database[:name]                      # => "Gene Expression Omnibus (GEO)"
#   soft.database[:ref]                       # => "Nucleic Acids Res. 2005 Jan 1;33 Database Issue:D562-6"
#   soft.database[:institute]                 # => "NCBI NLM NIH"
#   
#   soft.subsets.size                         # => 8
#   soft.subsets.keys                         # => ["GDS100_1", "GDS100_2", "GDS100_3", "GDS100_4", "GDS100_5", "GDS100_6", "GDS100_7", "GDS100_8"]
#   soft.subsets[:GDS100_7]                   # => {:dataset_id=>"GDS100", :type=>"time", :sample_id=>"GSM548,GSM543", :description=>"60 minute"}
#   soft.subsets['GDS100_7'][:sample_id]      # => "GSM548,GSM543"
#   soft.subsets[:GDS100_7][:sample_id]       # => "GSM548,GSM543"
#   soft.subsets[:GDS100_7][:dataset_id]      # => "GDS100"
#   
#   soft.dataset[:order]                      # => "none"
#   soft.dataset[:sample_organism]            # => "Escherichia coli"
#   soft.dataset[:table].header               # => ["ID_REF", "IDENTIFIER", "GSM549", "GSM542", "GSM543", "GSM547", "GSM544", "GSM545", "GSM546", "GSM548"]
#   soft.dataset[:table].rows.size            # => 5764
#   soft.dataset[:table].rows[5]              # => ["6", "EMPTY", "0.097", "0.217", "0.242", "0.067", "0.104", "0.162", "0.104", "0.154"]
#   soft.dataset[:table].rows[5][4]           # => "0.242"
#   soft.dataset[:table].rows[5][:gsm549]     # => "0.097"
#   soft.dataset[:table].rows[5][:GSM549]     # => "0.097"
#   soft.dataset[:table].rows[5]['GSM549']    # => "0.097"
# 
class SOFT
  attr_accessor :database
  attr_accessor :series, :platform, :samples
  attr_accessor :dataset, :subsets
  
  # The first character of a line determines how the line is interpreted.
  LINE_TYPE_ENTITY_INDICATOR = '^'
  LINE_TYPE_ENTITY_ATTRIBUTE = '!'
  LINE_TYPE_TABLE_HEADER = '#'
  # data table row defined by absence of line type character
  
  TABLE_COLUMN_DELIMITER = "\t"
  
  # Entity indicators that are allowed to contain a data table.
  TABLE_ENTITIES = ['SAMPLE', 'PLATFORM', 'DATASET'].freeze
  
  # Constructor
  #
  # ---
  # *Arguments*
  # * +lines+: (_optional_) contents of a SOFT formatted file, either as an
  #   Array of lines (any object responding to +each_with_index+) or as a
  #   single String.  When omitted or +nil+ an empty object is created.
  #   The objects passed in are not modified.
  # *Returns*:: Bio::SOFT
  def initialize(lines=nil)
    @database = Database.new
    
    @series = Series.new
    @platform = Platform.new
    @samples = Samples.new
    
    @dataset = Dataset.new
    @subsets = Subsets.new
    
    process(lines)
  end
  
  # Classes for Platform and Series files
  
  # Hash keyed by String names; a Symbol passed to #[] is converted to a
  # String before the lookup.
  class Samples < Hash #:nodoc:
    def [](x)
      x = x.to_s if x.kind_of?( Symbol )
      super(x)
    end
  end
  
  class Entity < Hash #:nodoc:
  end

  class Sample < Entity #:nodoc:
  end
  
  class Platform < Entity #:nodoc:
  end
  
  class Series < Entity #:nodoc:
  end
  
  # Classes for DataSet files
  
  class Subsets < Samples #:nodoc:
  end
  
  class Subset < Entity #:nodoc:
  end
  
  class Dataset < Entity #:nodoc:
  end
  
  # Classes important for all types

  class Database < Entity #:nodoc:
  end
  
  # A data table: column descriptions (taken from '#' lines), one header row
  # and any number of data rows.
  class Table #:nodoc:
    attr_accessor :header
    attr_accessor :header_description
    attr_accessor :rows
    
    class Header < Array #:nodoc:
      # @column_index contains column name => numerical index of column
      attr_accessor :column_index
      
      def initialize
        super()
        @column_index = {}
      end
    end
    
    class Row < Array #:nodoc:
      attr_accessor :header_object
      
      # * +n+: Array of cell values for this row
      # * +header_object+: the Table::Header used to resolve column names
      def initialize( n, header_object=nil )
        @header_object = header_object
        super(n)
      end
      
      # Returns a cell either by numerical position or by column name.
      # Column names may be given as a String or a Symbol and are matched
      # case-insensitively against the header.
      #
      # Raises IndexError when the name is not a known column.  When the row
      # has no header object a note is written to $stderr and the
      # NoMethodError is re-raised.
      def [](x)
        # Integer rather than Fixnum: Fixnum no longer exists in current Ruby.
        return super(x) if x.kind_of?( Integer )
        
        name = x.to_s.downcase.to_sym
        begin
          position = @header_object.column_index[name]
        rescue NoMethodError
          unless @header_object
            $stderr.puts "Table::Row @header_object undefined!"
          end
          raise
        end
        
        unless position.kind_of?( Integer )
          raise IndexError, "#{name.inspect} is not a valid index.  Contents of @header_object.column_index: #{@header_object.column_index.inspect}"
        end
        super(position)
      end
    end
    
    def initialize()
      @header_description = {}
      @header = Header.new
      @rows = []
    end
    
    # Parses +line+ as the header row and records the position of every
    # column under its downcased Symbol name.
    def add_header( line )
      raise "Can only define one header" unless @header.empty?
      # concat keeps @header a Header object (beware of clobbering this into an Array)
      @header.concat( parse_row( line ) )
      @header.each_with_index do |key, i|
        @header.column_index[key.downcase.to_sym] = i
      end
    end
    
    # Parses +line+ as a data row tied to the current header.
    def add_row( line )
      @rows << Row.new( parse_row( line ), @header )
    end
    
    # The first line handed to the table is the header, all later ones rows.
    def add_header_or_row( line )
      @header.empty? ? add_header( line ) : add_row( line )
    end
    
    protected
    def parse_row( line )
      line.split( TABLE_COLUMN_DELIMITER )
    end
  end
  
  #########
  protected
  #########
  
  # Reads the SOFT content line by line and fills the entity objects.
  #
  # +lines+ may be +nil+ (nothing is read), a String (split on newlines) or
  # any object responding to +each_with_index+.  Raises RuntimeError with the
  # 1-based line number when a line cannot be interpreted.
  def process(lines)
    return if lines.nil?
    lines = lines.split( /\n/ ) if lines.kind_of?( String )
    
    current_indicator = nil
    current_class_accessor = nil
    in_table = false
    
    lines.each_with_index do |raw_line, line_number|
      # Work on a stripped copy so the caller's (possibly frozen) strings
      # are left untouched.
      line = raw_line.to_s.strip
      next if line.empty?
      
      case line[0, 1]
      when LINE_TYPE_ENTITY_INDICATOR
        current_indicator, value = split_label_value_in( line[1..-1] )
        # A new entity never starts inside the previous entity's table.
        in_table = false
        current_class_accessor = entity_for( current_indicator, value )
        if current_class_accessor.nil?
          custom_raise( line_number, error_msg(40, line) )
        end
        
      when LINE_TYPE_ENTITY_ATTRIBUTE
        custom_raise( line_number, error_msg(30) ) if current_indicator.nil?
        
        # Handle lines such as '!platform_table_begin' and '!platform_table_end'
        if in_table
          if line =~ %r{table_begin}
            next
          elsif line =~ %r{table_end}
            in_table = false
            next
          end
        end
        
        key, value = split_label_value_in( line, true )
        store_attribute( current_class_accessor, key.to_sym, value )
        
      when LINE_TYPE_TABLE_HEADER
        unless TABLE_ENTITIES.include?( current_indicator )
          custom_raise( line_number, error_msg(20, current_indicator.inspect) )
        end
        
        in_table = true   # may be redundant, computationally not worth checking

        # We only expect one table per platform, sample or dataset
        current_class_accessor[:table] ||= Table.new
        key, value = split_label_value_in( line )
        # key[1..-1] -- Remove first character which is the LINE_TYPE_TABLE_HEADER
        current_class_accessor[:table].header_description[ key[1..-1] ] = value
        
      else
        # Type: No line type - should be a row in a table.
        if current_indicator.nil? || !in_table
          custom_raise( line_number, error_msg(10) )
        end
        current_class_accessor[:table].add_header_or_row( line )
      end
    end
  end
  
  # Returns the object that collects the attributes of the entity announced
  # by an entity indicator line, or +nil+ for an unknown indicator.  A new
  # Sample or Subset is created and registered under +name+.
  def entity_for( indicator, name )
    case indicator
    when 'DATABASE' then @database
    when 'DATASET'  then @dataset
    when 'PLATFORM' then @platform
    when 'SERIES'   then @series
    when 'SAMPLE'   then @samples[name] = Sample.new
    when 'SUBSET'   then @subsets[name] = Subset.new
    end
  end
  
  # Stores +value+ under +key+.  A key that occurs more than once has all of
  # its values collected in an Array, in order of appearance.
  def store_attribute( entity, key, value )
    if entity.include?( key )
      entity[key] = [ entity[key] ] unless entity[key].kind_of?( Array )
      entity[key] << value
    else
      entity[key] = value
    end
  end
  
  # Returns the text for error number +i+ (10, 20, 30 or 40).  +extra_info+
  # is worked into the messages 20 and 40.  Raises IndexError for any other
  # number.
  def error_msg( i, extra_info=nil )
    case i
    when 10
      x = ["Lines without line-type characters are rows in a table, but",
        "a line containing an entity indicator such as",
        "\"#{LINE_TYPE_ENTITY_INDICATOR}SAMPLE\",",
        "\"#{LINE_TYPE_ENTITY_INDICATOR}PLATFORM\",",
        "or \"#{LINE_TYPE_ENTITY_INDICATOR}DATASET\" has not been",
        "previously encountered or it does not appear that this line is",
        "in a table."]
    when 20
      # tables are allowed inside samples, platforms and datasets
      x = ["Tables are only allowed inside SAMPLE, PLATFORM, and DATASET.",
        "Current table information found inside #{extra_info}."]
    when 30
      x = ["Entity attribute line (\"#{LINE_TYPE_ENTITY_ATTRIBUTE}\")",
        "found before entity indicator line (\"#{LINE_TYPE_ENTITY_INDICATOR}\")"]
    when 40
      x = ["Unknown entity indicator.  Must be DATABASE, SAMPLE, PLATFORM,",
        "SERIES, DATASET, or SUBSET."]
      x << "Line content: #{extra_info.inspect}." if extra_info
    else
      raise IndexError, "Unknown error message requested."
    end
    
    x.join(" ")
  end
  
  # Raises a RuntimeError whose message starts with the 1-based line number.
  def custom_raise( line_number_with_0_based_indexing, msg )
    raise ["Error processing input line: #{line_number_with_0_based_indexing+1}",
      msg].join("\t")
  end
  
  # Splits +line+ at the first '=' (surrounding whitespace is dropped) and
  # returns [key, value].  With +shift_key+ everything up to and including
  # the first underscore is removed from the key, turning for example
  # "!Sample_title" into "title".
  #
  # Raises RuntimeError when the line has no '=' or, with +shift_key+, when
  # the key has no underscore.
  def split_label_value_in( line, shift_key=false )
    separator = %r{\s*=\s*}.match( line.to_s )
    key, value = separator.pre_match, separator.post_match if separator
    
    if shift_key and key
      underscore = key.index( '_' )
      key = underscore ? key[(underscore + 1)..-1] : nil
    end
    
    if key.nil? or value.nil?
      raise "Unable to split line into a label and a value: #{line.inspect}"
    end
    
    [key, value]
  end

end # SOFT
end # Bio