[BioRuby-cvs] bioruby/lib/bio/appl/gcg msf.rb, NONE, 1.1 seq.rb, NONE, 1.1

Thu Dec 14 19:52:55 UTC 2006

Update of /home/repository/bioruby/bioruby/lib/bio/appl/gcg
In directory dev.open-bio.org:/tmp/cvs-serv15819/lib/bio/appl/gcg

Added Files:
	msf.rb seq.rb 
Log Message:
New files/classes: Bio::GCG::Msf in lib/bio/appl/gcg/msf.rb for GCG msf 
multiple sequence alignment format parser, and Bio::GCG::Seq in 
lib/bio/appl/gcg/seq.rb for GCG sequence format parser.

Autoload of the classes (in bio.rb) and file format autodetection
(in flatfile.rb) are also supported.

Bio::Alignment::Output#output_msf, #output(:msf, ...) are added
to generate msf formatted string from multiple alignment object.

--- NEW FILE: msf.rb ---
#
# = bio/appl/gcg/msf.rb - GCG multiple sequence alignment (.msf) parser class
#
# Copyright::   Copyright (C) 2003, 2006
#               Naohisa Goto <ng at bioruby.org>
# License::     Ruby's
#
# $Id: msf.rb,v 1.1 2006/12/14 19:52:53 ngoto Exp $
#
# = About Bio::GCG::Msf
#
# Please refer document of Bio::GCG::Msf.
#

#---
# (depends on autoload)
#require 'bio/appl/gcg/seq'
#+++

module Bio
  module GCG

    # The msf is a multiple sequence alignment format developed by Wisconsin.
    # Bio::GCG::Msf is a msf format parser.
    class Msf #< DB

      # delimiter used by Bio::FlatFile
      DELIMITER = RS = nil

      # Creates a new Msf object.
      def initialize(str)
        str = str.sub(/\A[\r\n]+/, '')
        if /^\!\![A-Z]+\_MULTIPLE\_ALIGNMNENT/ =~ str[/.*/] then
          @heading = str[/.*/] # '!!NA_MULTIPLE_ALIGNMENT 1.0' or like this
          str.sub!(/.*/, '')
        end
        str.sub!(/.*\.\.$/m, '')
        @description = $&.to_s.sub(/^.*\.\.$/, '').to_s
        d = $&.to_s
        if m = /(.+)\s+MSF\:\s+(\d+)\s+Type\:\s+(\w)\s+(.+)\s+(Comp)?Check\:\s+(\d+)/.match(d) then
          @entry_id = m[1].to_s.strip
          @length   = (m[2] ? m[2].to_i : nil)
          @seq_type = m[3]
          @date     = m[4].to_s.strip
          @checksum = (m[6] ? m[6].to_i : nil)
        end

        str.sub!(/.*\/\/$/m, '')
        a = $&.to_s.split(/^/)
        @seq_info = []
        a.each do |x|
          if /Name\: / =~ x then
            s = {}
            x.scan(/(\S+)\: +(\S*)/) { |y| s[$1] = $2 }
            @seq_info << s
          end
        end

        @data = str
        @description.sub!(/\A(\r\n|\r|\n)/, '')
        @align = nil
      end

      # description
      attr_reader :description

      # ID of the alignment
      attr_reader :entry_id

      # alignment length
      attr_reader :length

      # sequence type ("N" for DNA/RNA or "P" for protein)
      attr_reader :seq_type

      # date
      attr_reader :date

      # checksum
      attr_reader :checksum

      # heading
      # ('!!NA_MULTIPLE_ALIGNMENT 1.0' or whatever like this)
      attr_reader :heading

      #---
      ## data (internally used, will be obsoleted)
      #attr_reader :data
      #
      ## seq. info. (internally used, will be obsoleted)
      #attr_reader :seq_info
      #+++

      # symbol comparison table
      def symbol_comparison_table
        unless defined?(@symbol_comparison_table)
          /Symbol comparison table\: +(\S+)/ =~ @description
          @symbol_comparison_table = $1
        end
        @symbol_comparison_table
      end

      # gap weight
      def gap_weight
        unless defined?(@gap_weight)
          /GapWeight\: +(\S+)/ =~ @description
          @gap_weight = $1
        end
        @gap_weight
      end

      # gap length weight
      def gap_length_weight
        unless defined?(@gap_length_weight)
          /GapLengthWeight\: +(\S+)/ =~ @description
          @gap_length_weight = $1
        end
        @gap_length_weight
      end

      # CompCheck field
      def compcheck
        unless defined?(@compcheck)
          if /CompCheck\: +(\d+)/ =~ @description then
            @compcheck = $1.to_i
          else
            @compcheck = nil
          end
        end
        @compcheck
      end

      # parsing
      def do_parse
        return if @align
        a = @data.strip.split(/\n\n/)
        @seq_data = Array.new(@seq_info.size)
        @seq_data.collect! { |x| Array.new }
        a.each do |x|
          b = x.split(/\n/)
          nw = 0
          if b.size > @seq_info.size then
            if /^ +/ =~ b.shift.to_s
              nw = $&.to_s.length
            end
          end
          if nw > 0 then
            b.each_with_index { |y, i| y[0, nw] = ''; @seq_data[i] << y }
          else
            b.each_with_index { |y, i|
              @seq_data[i] << y.strip.split(/  +/, 2)[1].to_s
            }
          end
        end

        case seq_type
        when 'P', 'p'
          k = Bio::Sequence::AA
        when 'N', 'n'
          k = Bio::Sequence::NA
        else
          k = Bio::Sequence::Generic
        end
        @seq_data.collect! do |x|
          y = x.join('')
          y.gsub!(/[\s\d]+/, '')
          k.new(y)
        end

        aln = Bio::Alignment.new
        @seq_data.each_with_index do |x, i|
          aln.store(@seq_info[i]['Name'], x)
        end
        @align = aln
      end
      private :do_parse

      # returns Bio::Alignment object.
      def alignment
        do_parse
        @align
      end

      # gets seq data (used internally) (will be obsoleted)
      def seq_data
        do_parse
        @seq_data
      end

      # validates checksum
      def validate_checksum
        do_parse
        valid = true
        total = 0
        @seq_data.each_with_index do |x, i|
          sum = Bio::GCG::Seq.calc_checksum(x)
          if sum != @seq_info[i]['Check'].to_i
            valid = false
            break
          end
          total += sum
        end
        return false unless valid
        if @checksum != 0 # "Check:" field of BioPerl is always 0
          valid = ((total % 10000) == @checksum)
        end
        valid
      end

    end #class Msf
  end #module GCG
end # module Bio

--- NEW FILE: seq.rb ---
#
# = bio/appl/gcg/seq.rb - GCG sequence file format class (.seq/.pep file)
#
# Copyright::   Copyright (C) 2003, 2006
#               Naohisa Goto <ng at bioruby.org>
# License::     Ruby's
#
# $Id: seq.rb,v 1.1 2006/12/14 19:52:53 ngoto Exp $
#
# = About Bio::GCG::Msf
#
# Please refer document of Bio::GCG::Msf.
#

module Bio
  module GCG

    # 
    # = Bio::GCG::Seq
    # 
    # This is GCG sequence file format (.seq or .pep) parser class.
    #
    # = References
    #
    # * Information about GCG Wisconsin Package(R)
    # http://www.accelrys.com/products/gcg_wisconsin_package .
    # * EMBOSS sequence formats
    # http://www.hgmp.mrc.ac.uk/Software/EMBOSS/Themes/SequenceFormats.html
    # * BioPerl document
    # http://docs.bioperl.org/releases/bioperl-1.2.3/Bio/SeqIO/gcg.html
    class Seq #< DB

      # delimiter used by Bio::FlatFile
      DELIMITER = RS = nil

      # Creates new instance of this class.
      # str must be a GCG seq formatted string.
      def initialize(str)
        @heading = str[/.*/] # '!!NA_SEQUENCE 1.0' or like this
        str = str.sub(/.*/, '')
        str.sub!(/.*\.\.$/m, '')
        @definition = $&.to_s.sub(/^.*\.\.$/, '').to_s
        desc = $&.to_s
        if m = /(.+)\s+Length\:\s+(\d+)\s+(.+)\s+Type\:\s+(\w)\s+Check\:\s+(\d+)/.match(desc) then
          @entry_id = m[1].to_s.strip
          @length   = (m[2] ? m[2].to_i : nil)
          @date     = m[3].to_s.strip
          @seq_type = m[4]
          @checksum = (m[5] ? m[5].to_i : nil)
        end
        @data = str
        @seq = nil
        @definition.strip!
      end

      # ID field.
      attr_reader :entry_id

      # Description field.
      attr_reader :definition

      # "Length:" field.
      # Note that sometimes this might differ from real sequence length.
      attr_reader :length

      # Date field of this entry.
      attr_reader :date

      # "Type:" field, which indicates sequence type.
      # "N" means nucleic acid sequence, "P" means protein sequence.
      attr_reader :seq_type

      # "Check:" field, which indicates checksum of current sequence.
      attr_reader :checksum

      # heading
      # ('!!NA_SEQUENCE 1.0' or whatever like this)
      attr_reader :heading

      #---
      ## data (internally used, will be obsoleted)
      #attr_reader :data
      #+++

      # Sequence data.
      # The class of the sequence is Bio::Sequence::NA, Bio::Sequence::AA
      # or Bio::Sequence::Generic, according to the sequence type.
      def seq
        unless @seq then
          case @seq_type
          when 'N', 'n'
            k = Bio::Sequence::NA
          when 'P', 'p'
            k = Bio::Sequence::AA
          else
            k = Bio::Sequence
          end
          @seq = k.new(@data.tr('^-a-zA-Z.~', ''))
        end
        @seq
      end

      # If you know the sequence is AA, use this method.
      # Returns a Bio::Sequence::AA object.
      #
      # If you call naseq for protein sequence,
      # or aaseq for nucleic sequence, RuntimeError will be raised.
      def aaseq
        if seq.is_a?(Bio::Sequence::AA) then
          @seq
        else
          raise 'seq_type != \'P\''
        end
      end

      # If you know the sequence is NA, use this method.
      # Returens a Bio::Sequence::NA object.
      #
      # If you call naseq for protein sequence,
      # or aaseq for nucleic sequence, RuntimeError will be raised.
      def naseq
        if seq.is_a?(Bio::Sequence::NA) then
          @seq
        else
          raise 'seq_type != \'N\''
        end
      end

      # Validates checksum.
      # If validation succeeds, returns true.
      # Otherwise, returns false.
      def validate_checksum
        checksum == self.class.calc_checksum(seq)
      end

      #---
      # class methods
      #+++

      # Calculates checksum from given string.
      def self.calc_checksum(str)
        # Reference: Bio::SeqIO::gcg of BioPerl-1.2.3
        idx = 0
        sum = 0
        str.upcase.tr('^A-Z.~', '').each_byte do |c|
          idx += 1
          sum += idx * c
          idx = 0 if idx >= 57
        end
        (sum % 10000)
      end

      # Creates a new GCG sequence format text.
      # Parameters can be omitted.
      #
      # Examples:
      #  Bio::GCG::Seq.to_gcg(:definition=>'H.sapiens DNA',
      #                       :seq_type=>'N', :entry_id=>'gi-1234567',
      #                       :seq=>seq, :date=>date)
      #
      def self.to_gcg(hash)
        seq = hash[:seq]
        if seq.is_a?(Bio::Sequence::NA) then
          seq_type = 'N'
        elsif seq.is_a?(Bio::Sequence::AA) then
          seq_type = 'P'
        else
          seq_type = (hash[:seq_type] or 'P')
        end
        if seq_type == 'N' then
          head = '!!NA_SEQUENCE 1.0'
        else
          head = '!!AA_SEQUENCE 1.0'
        end
        date = (hash[:date] or Time.now.strftime('%B %d, %Y %H:%M'))
        entry_id = hash[:entry_id].to_s.strip
        len = seq.length
        checksum = self.calc_checksum(seq)
        definition = hash[:definition].to_s.strip
        seq = seq.upcase.gsub(/.{1,50}/, "\\0\n")
        seq.gsub!(/.{10}/, "\\0 ")
        w = len.to_s.size + 1
        i = 1
        seq.gsub!(/^/) { |x| s = sprintf("\n%*d ", w, i); i += 50; s }

        [ head, "\n", definition, "\n\n",
          "#{entry_id}  Length: #{len}  #{date}  " \
          "Type: #{seq_type}  Check: #{checksum}  ..\n",
          seq, "\n" ].join('')
      end

    end #class Seq
  end #module GCG
end #module Bio