[BioRuby-cvs] bioruby/lib/bio/sequence aa.rb, NONE, 1.1 common.rb, NONE, 1.1 compat.rb, NONE, 1.1 format.rb, NONE, 1.1 na.rb, NONE, 1.1

Sun Jan 22 23:13:38 EST 2006

Update of /home/repository/bioruby/bioruby/lib/bio/sequence
In directory pub.open-bio.org:/tmp/cvs-serv16731/sequence

Added Files:
	aa.rb common.rb compat.rb format.rb na.rb 
Log Message:
* refactored to store annotations in Bio::Sequence class
* common methods are separated into Bio::Sequence::Common module
* Bio::Sequence no longer inherits String
* Bio::Sequence::NA and AA inherits String and include Bio::Sequence::Common
* lib/bio/sequence.rb is a container for rich sequence
* lib/bio/sequence/common.rb contains Bio::Sequence::Common module
* lib/bio/sequence/na.rb defines Bio::Sequence::NA class
* lib/bio/sequence/aa.rb defines Bio::Sequence::AA class
* lib/bio/sequence/format.rb is for sequence format converter (define output formats)
* lib/bio/sequence/compat.rb is just for backward compatibility

--- NEW FILE: compat.rb ---
# only for backward compatibility, use Bio::Sequence#output(:fasta) instead

module Bio

class Sequence

  def to_s
    String.new(@seq)
  end
  alias to_str to_s

  module Common

  # Output the FASTA format string of the sequence.  The 1st argument is
  # used as the comment string.  If the 2nd option is given, the output
  # sequence will be folded.
  def to_fasta(header = '', width = nil)
    ">#{header}\n" +
    if width
      self.to_s.gsub(Regexp.new(".{1,#{width}}"), "\\0\n")
    else
      self.to_s + "\n"
    end
  end

end # Common

class NA

  def self.randomize(*arg, &block)
    self.new('').randomize(*arg, &block)
  end

end # NA

class AA

  def self.randomize(*arg, &block)
    self.new('').randomize(*arg, &block)
  end

end # AA

end # Sequence

end # Bio

--- NEW FILE: na.rb ---
module Bio

class Sequence

# Nucleic Acid sequence

class NA < String

  include Bio::Sequence::Common

  # Generate a nucleic acid sequence object from a string.
  def initialize(str)
    super
    self.downcase!
    self.tr!(" \t\n\r",'')
  end

  # This method depends on Locations class, see bio/location.rb
  def splicing(position)
    mRNA = super
    if mRNA.rna?
      mRNA.tr!('t', 'u')
    else
      mRNA.tr!('u', 't')
    end
    mRNA
  end

  # Returns complement sequence without reversing ("atgc" -> "tacg")
  def forward_complement
    s = self.class.new(self)
    s.forward_complement!
    s
  end

  # Convert to complement sequence without reversing ("atgc" -> "tacg")
  def forward_complement!
    if self.rna?
      self.tr!('augcrymkdhvbswn', 'uacgyrkmhdbvswn')
    else
      self.tr!('atgcrymkdhvbswn', 'tacgyrkmhdbvswn')
    end
    self
  end

  # Returns reverse complement sequence ("atgc" -> "gcat")
  def reverse_complement
    s = self.class.new(self)
    s.reverse_complement!
    s
  end

  # Convert to reverse complement sequence ("atgc" -> "gcat")
  def reverse_complement!
    self.reverse!
    self.forward_complement!
  end

  # Aliases for short
  alias complement reverse_complement
  alias complement! reverse_complement!

  # Translate into the amino acid sequence from the given frame and the
  # selected codon table.  The table also can be a Bio::CodonTable object.
  # The 'unknown' character is used for invalid/unknown codon (can be
  # used for 'nnn' and/or gap translation in practice).
  #
  # Frame can be 1, 2 or 3 for the forward strand and -1, -2 or -3
  # (4, 5 or 6 is also accepted) for the reverse strand.
  def translate(frame = 1, table = 1, unknown = 'X')
    if table.is_a?(Bio::CodonTable)
      ct = table
    else
      ct = Bio::CodonTable[table]
    end
    naseq = self.dna
    case frame
    when 1, 2, 3
      from = frame - 1
    when 4, 5, 6
      from = frame - 4
      naseq.complement!
    when -1, -2, -3
      from = -1 - frame
      naseq.complement!
    else
      from = 0
    end
    nalen = naseq.length - from
    nalen -= nalen % 3
    aaseq = naseq[from, nalen].gsub(/.{3}/) {|codon| ct[codon] or unknown}
    return Bio::Sequence::AA.new(aaseq)
  end

  # Returns counts of the each codon in the sequence by Hash.
  def codon_usage
    hash = Hash.new(0)
    self.window_search(3, 3) do |codon|
      hash[codon] += 1
    end
    return hash
  end

  # Calculate the ratio of GC / ATGC bases in percent.
  def gc_percent
    count = self.composition
    at = count['a'] + count['t'] + count['u']
    gc = count['g'] + count['c']
    gc = 100 * gc / (at + gc)
    return gc
  end

  # Show abnormal bases other than 'atgcu'.
  def illegal_bases
    self.scan(/[^atgcu]/).sort.uniq
  end

  # Estimate the weight of this biological string molecule.
  # NucleicAcid is defined in bio/data/na.rb
  def molecular_weight
    if self.rna?
      NucleicAcid.weight(self, true)
    else
      NucleicAcid.weight(self)
    end
  end

  # Convert the universal code string into the regular expression.
  def to_re
    if self.rna?
      NucleicAcid.to_re(self.dna, true)
    else
      NucleicAcid.to_re(self)
    end
  end

  # Convert the self string into the list of the names of the each base.
  def names
    array = []
    self.each_byte do |x|
      array.push(NucleicAcid.names[x.chr.upcase])
    end
    return array
  end

  # Output a DNA string by substituting 'u' to 't'.
  def dna
    self.tr('u', 't')
  end

  def dna!
    self.tr!('u', 't')
  end

  # Output a RNA string by substituting 't' to 'u'.
  def rna
    self.tr('t', 'u')
  end

  def rna!
    self.tr!('t', 'u')
  end

  def rna?
    self.index('u')
  end
  protected :rna?

  def pikachu
    self.dna.tr("atgc", "pika") # joke, of course :-)
  end

end # NA

end # Sequence

end # Bio

--- NEW FILE: format.rb ---
# porting from N. Goto's feature-output.rb on BioRuby list.

module Bio

class Sequence

  # Output the FASTA format string of the sequence.  The 1st argument is
  # used in the comment line.  If the 2nd argument (integer) is given,
  # the output sequence will be folded.
  def format_fasta(header = nil, width = nil)
    header ||= "#{@entry_id} #{@definition}"

    ">#{header}\n" +
    if width
      @seq.to_s.gsub(Regexp.new(".{1,#{width}}"), "\\0\n")
    else
      @seq.to_s + "\n"
    end
  end

  def format_genbank
    prefix = ' ' * 5
    indent = prefix + ' ' * 16
    fwidth = 79 - indent.length

    format_features(prefix, indent, fwidth)
  end

  def format_embl
    prefix = 'FT   '
    indent = prefix + ' ' * 16
    fwidth = 80 - indent.length

    format_features(prefix, indent, fwidth)
  end

  private

  def format_features(prefix, indent, width)
    result = ''
    @features.each do |feature|
      result << prefix + sprintf("%-16s", feature.feature)

      position = feature.position
      #position = feature.locations.to_s

      head = ''
      wrap(position, width).each_line do |line|
        result << head << line
        head = indent
      end

      result << format_qualifiers(feature.qualifiers, width)
    end
    return result
  end

  def format_qualifiers(qualifiers, indent, width)
    qualifiers.each do |qualifier|
      q = qualifier.qualifier
      v = qualifier.value.to_s

      if v == true
        lines = wrap('/' + q, width)
      elsif q == 'translation'
        lines = fold('/' + q + '=' + val, width)
      else
        if v[/\D/]
          #v.delete!("\x00-\x1f\x7f-\xff")
          v.gsub!(/"/, '""')
          v = '"' + v + '"'
        end
        lines = wrap('/' + q + '=' + val, width)
      end

      return lines.gsub(/^/, indent)
    end
  end

  def fold(str, width)
    str.gsub(Regexp.new("(.{1,#{width}})"), "\\1\n")
  end

  def wrap(str, width)
    result = []
    left = str.dup
    while left and left.length > width
      line = nil
      width.downto(1) do |i|
        if left[i..i] == ' ' or /[,;]/ =~ left[(i-1)..(i-1)]  then
          line = left[0..(i-1)].sub(/ +\z/, '')
          left = left[i..-1].sub(/\A +/, '')
          break
        end
      end
      if line.nil? then
        line = left[0..(width-1)]
        left = left[width..-1]
      end
      result << line
    end
    result << left if left
    return result.join("\n")
  end

end # Sequence

end # Bio

--- NEW FILE: common.rb ---
module Bio

class Sequence

module Common

  def to_s
    String.new(self)
  end
  alias to_str to_s

  # Force self to re-initialize for clean up (remove white spaces,
  # case unification).
  def seq
    self.class.new(self)
  end

  # Similar to the 'seq' method, but changes the self object destructively.
  def normalize!
    initialize(self)
    self
  end
  alias seq! normalize!

  def <<(*arg)
    super(self.class.new(*arg))
  end
  alias concat <<

  def +(*arg)
    self.class.new(super(*arg))
  end

  # Returns the subsequence of the self string.
  def subseq(s = 1, e = self.length)
    return nil if s < 1 or e < 1
    s -= 1
    e -= 1
    self[s..e]
  end

  # This method iterates on sub string with specified length 'window_size'.
  # By specifing 'step_size', codon sized shifting or spliting genome
  # sequence with ovelapping each end can easily be yielded.
  #
  # The remainder sequence at the terminal end will be returned.
  #
  # Example:
  #   # prints average GC% on each 100bp
  #   seq.window_search(100) do |subseq|
  #     puts subseq.gc
  #   end
  #   # prints every translated peptide (length 5aa) in the same frame
  #   seq.window_search(15, 3) do |subseq|
  #     puts subseq.translate
  #   end
  #   # split genome sequence by 10000bp with 1000bp overlap in fasta format
  #   i = 1
  #   remainder = seq.window_search(10000, 9000) do |subseq|
  #     puts subseq.to_fasta("segment #{i}", 60)
  #     i += 1
  #   end
  #   puts remainder.to_fasta("segment #{i}", 60)
  #
  def window_search(window_size, step_size = 1)
    i = 0
    0.step(self.length - window_size, step_size) do |i| 
      yield self[i, window_size]                        
    end                          
    return self[i + window_size .. -1] 
  end

  # This method receive a hash of residues/bases to the particular values,
  # and sum up the value along with the self sequence.  Especially useful
  # to use with the window_search method and amino acid indices etc.
  def total(hash)
    hash.default = 0.0 unless hash.default
    sum = 0.0
    self.each_byte do |x|
      begin
        sum += hash[x.chr]
      end
    end
    return sum
  end

  # Returns a hash of the occurrence counts for each residue or base.
  def composition
    count = Hash.new(0)
    self.scan(/./) do |x|
      count[x] += 1
    end
    return count
  end

  # Returns a randomized sequence keeping its composition by default.
  # The argument is required when generating a random sequence from the empty
  # sequence (used by the class methods NA.randomize, AA.randomize).
  # If the block is given, yields for each random residue/base.
  def randomize(hash = nil)
    length = self.length
    if hash
      count = hash.clone
      count.each_value {|x| length += x}
    else
      count = self.composition
    end

    seq = ''
    tmp = {}
    length.times do 
      count.each do |k, v|
        tmp[k] = v * rand
      end
      max = tmp.max {|a, b| a[1] <=> b[1]}
      count[max.first] -= 1

      if block_given?
        yield max.first
      else
        seq += max.first
      end
    end
    return self.class.new(seq)
  end

  # Generate a new random sequence with the given frequency of bases
  # or residues.  The sequence length is determined by the sum of each
  # base/residue occurences.
  def self.randomize(*arg, &block)
    self.new('').randomize(*arg, &block)
  end

  # Receive a GenBank style position string and convert it to the Locations
  # objects to splice the sequence itself.  See also: bio/location.rb
  #
  # This method depends on Locations class, see bio/location.rb
  def splicing(position)
    unless position.is_a?(Locations) then
      position = Locations.new(position)
    end
    s = ''
    position.each do |location|
      if location.sequence
        s << location.sequence
      else
        exon = self.subseq(location.from, location.to)
        begin
          exon.complement! if location.strand < 0
        rescue NameError
        end
        s << exon
      end
    end
    return self.class.new(s)
  end

end # Common

end # Sequence

end # Bio

--- NEW FILE: aa.rb ---
module Bio

class Sequence

# Amino Acid sequence

class AA < String

  include Bio::Sequence::Common

  # Generate a amino acid sequence object from a string.
  def initialize(str)
    super
    self.upcase!
    self.tr!(" \t\n\r",'')
  end

  # Estimate the weight of this protein.
  # AminoAcid is defined in bio/data/aa.rb
  def molecular_weight
    AminoAcid.weight(self)
  end

  def to_re
    AminoAcid.to_re(self)
  end

  # Generate the list of the names of the each residue along with the
  # sequence (3 letters code).
  def codes
    array = []
    self.each_byte do |x|
      array.push(AminoAcid.names[x.chr])
    end
    return array
  end

  # Similar to codes but returns long names.
  def names
    self.codes.map do |x|
      AminoAcid.names[x]
    end
  end

end # AA

end # Sequence

end # Bio