[BioRuby-cvs] bioruby/lib/bio/sequence aa.rb, 1.2, 1.3 common.rb, 1.2, 1.3 compat.rb, 1.2, 1.3 format.rb, 1.2, 1.3 generic.rb, 1.3, 1.4 na.rb, 1.2, 1.3
Katayama Toshiaki
k at dev.open-bio.org
Sun Mar 26 02:28:01 UTC 2006
Update of /home/repository/bioruby/bioruby/lib/bio/sequence
In directory dev.open-bio.org:/tmp/cvs-serv28853/sequence
Modified Files:
aa.rb common.rb compat.rb format.rb generic.rb na.rb
Log Message:
* comprehensive documentations contributed by Ryan Raaum and Jan Aerts are added.
* bug fixes in sequence.rb contributed by Ryan Raaum
* Added 'U' and 'u' to the bases counted towards the nucleic acid total in Bio::Sequence#guess. (Without this, RNA sequences were "guessed" to be Amino Acid sequences).
* Changed the arguments for method_missing in Bio::Sequence from (*arg) to (sym, *args, &block). With this argument set, blocks will be properly passed through to the encapsulated object.
Index: compat.rb
===================================================================
RCS file: /home/repository/bioruby/bioruby/lib/bio/sequence/compat.rb,v
retrieving revision 1.2
retrieving revision 1.3
diff -C2 -d -r1.2 -r1.3
*** compat.rb 6 Feb 2006 14:18:03 -0000 1.2
--- compat.rb 26 Mar 2006 02:27:59 -0000 1.3
***************
*** 3,7 ****
#
# Copyright:: Copyright (C) 2006
! # Toshiaki Katayama <k at bioruby.org>
# License:: Ruby's
#
--- 3,8 ----
#
# Copyright:: Copyright (C) 2006
! # Toshiaki Katayama <k at bioruby.org>,
! # Ryan Raaum <ryan at raaum.org>
# License:: Ruby's
#
***************
*** 18,21 ****
--- 19,33 ----
autoload :AA, 'bio/sequence/aa'
+ # Return sequence as
+ # String[http://corelib.rubyonrails.org/classes/String.html].
+ # The original sequence is unchanged.
+ #
+ # s = Bio::Sequence.new('atgc')
+ # puts s.to_s #=> 'atgc'
+ # puts s.to_s.class #=> String
+ # puts s #=> 'atgc'
+ # puts s.class #=> Bio::Sequence
+ # ---
+ # *Returns*:: String object
def to_s
String.new(@seq)
***************
*** 26,32 ****
--- 38,51 ----
module Common
+ # *DEPRECATED* Do not use! Use Bio::Sequence#output instead.
+ #
# Output the FASTA format string of the sequence. The 1st argument is
# used as the comment string. If the 2nd option is given, the output
# sequence will be folded.
+ # ---
+ # *Arguments*:
+ # * (optional) _header_: String object
+ # * (optional) _width_: Fixnum object (default nil)
+ # *Returns*:: String
def to_fasta(header = '', width = nil)
warn "Bio::Sequence#to_fasta is obsolete. Use Bio::Sequence#output(:fasta) instead" if $DEBUG
***************
*** 44,52 ****
class NA
def self.randomize(*arg, &block)
self.new('').randomize(*arg, &block)
end
! def pikachu
self.dna.tr("atgc", "pika") # joke, of course :-)
end
--- 63,89 ----
class NA
+ # Generate a new random sequence with the given frequency of bases.
+ # The sequence length is determined by their cumulative sum.
+ # (See also Bio::Sequence::Common#randomize which creates a new
+ # randomized sequence object using the base composition of an existing
+ # sequence instance).
+ #
+ # counts = {'a'=>1,'c'=>2,'g'=>3,'t'=>4}
+ # puts Bio::Sequence::NA.randomize(counts) #=> "ggcttgttac" (for example)
+ #
+ # You may also feed the output of randomize into a block
+ #
+ # actual_counts = {'a'=>0, 'c'=>0, 'g'=>0, 't'=>0}
+ # Bio::Sequence::NA.randomize(counts) {|x| actual_counts[x] += 1}
+ # actual_counts #=> {"a"=>1, "c"=>2, "g"=>3, "t"=>4}
+ # ---
+ # *Arguments*:
+ # * (optional) _hash_: Hash object
+ # *Returns*:: Bio::Sequence::NA object
def self.randomize(*arg, &block)
self.new('').randomize(*arg, &block)
end
! def pikachu #:nodoc:
self.dna.tr("atgc", "pika") # joke, of course :-)
end
***************
*** 57,60 ****
--- 94,115 ----
class AA
+ # Generate a new random sequence with the given frequency of bases.
+ # The sequence length is determined by their cumulative sum.
+ # (See also Bio::Sequence::Common#randomize which creates a new
+ # randomized sequence object using the base composition of an existing
+ # sequence instance).
+ #
+ # counts = {'R'=>1,'L'=>2,'E'=>3,'A'=>4}
+ # puts Bio::Sequence::AA.randomize(counts) #=> "AAEAELALRE" (for example)
+ #
+ # You may also feed the output of randomize into a block
+ #
+ # actual_counts = {'R'=>0,'L'=>0,'E'=>0,'A'=>0}
+ # Bio::Sequence::AA.randomize(counts) {|x| actual_counts[x] += 1}
+ # actual_counts #=> {"A"=>4, "L"=>2, "E"=>3, "R"=>1}
+ # ---
+ # *Arguments*:
+ # * (optional) _hash_: Hash object
+ # *Returns*:: Bio::Sequence::AA object
def self.randomize(*arg, &block)
self.new('').randomize(*arg, &block)
Index: common.rb
===================================================================
RCS file: /home/repository/bioruby/bioruby/lib/bio/sequence/common.rb,v
retrieving revision 1.2
retrieving revision 1.3
diff -C2 -d -r1.2 -r1.3
*** common.rb 6 Feb 2006 14:16:17 -0000 1.2
--- common.rb 26 Mar 2006 02:27:59 -0000 1.3
***************
*** 3,7 ****
#
# Copyright:: Copyright (C) 2006
! # Toshiaki Katayama <k at bioruby.org>
# License:: Ruby's
#
--- 3,8 ----
#
# Copyright:: Copyright (C) 2006
! # Toshiaki Katayama <k at bioruby.org>,
! # Ryan Raaum <ryan at raaum.org>
# License:: Ruby's
#
***************
*** 15,22 ****
class Sequence
! # This module provides common methods for biological sequence classes
! # which must inherit String.
module Common
def to_s
String.new(self)
--- 16,53 ----
class Sequence
! # = DESCRIPTION
! # Bio::Sequence::Common is a
! # Mixin[http://www.rubycentral.com/book/tut_modules.html]
! # implementing methods common to
! # Bio::Sequence::AA and Bio::Sequence::NA. All of these methods
! # are available to either Amino Acid or Nucleic Acid sequences, and
! # by encapsulation are also available to Bio::Sequence objects.
! #
! # = USAGE
! #
! # # Create a sequence
! # dna = Bio::Sequence.auto('atgcatgcatgc')
! #
! # # Splice out a subsequence using a Genbank-style location string
! # puts dna.splice('complement(1..4)')
! #
! # # What is the base composition?
! # puts dna.composition
! #
! # # Create a random sequence with the composition of a current sequence
! # puts dna.randomize
module Common
+ # Return sequence as
+ # String[http://corelib.rubyonrails.org/classes/String.html].
+ # The original sequence is unchanged.
+ #
+ # s = Bio::Sequence::NA.new('atgc')
+ # puts s.to_s #=> 'atgc'
+ # puts s.to_s.class #=> String
+ # puts s #=> 'atgc'
+ # puts s.class #=> Bio::Sequence::NA
+ # ---
+ # *Returns*:: String object
def to_s
String.new(self)
***************
*** 24,34 ****
alias to_str to_s
! # Force self to re-initialize for clean up (remove white spaces,
! # case unification).
def seq
self.class.new(self)
end
! # Similar to the 'seq' method, but changes the self object destructively.
def normalize!
initialize(self)
--- 55,79 ----
alias to_str to_s
! # Create a new sequence based on the current sequence.
! # The original sequence is unchanged.
! #
! # s = Bio::Sequence::NA.new('atgc')
! # s2 = s.seq
! # puts s2 #=> 'atgc'
! # ---
! # *Returns*:: new Bio::Sequence::NA/AA object
def seq
self.class.new(self)
end
! # Normalize the current sequence, removing all whitespace and
! # transforming all positions to uppercase if the sequence is AA or
! # transforming all positions to lowercase if the sequence is NA.
! # The original sequence is modified.
! #
! # s = Bio::Sequence::NA.new('atgc')
! # s.normalize!
! # ---
! # *Returns*:: current Bio::Sequence::NA/AA object (modified)
def normalize!
initialize(self)
***************
*** 37,40 ****
--- 82,95 ----
alias seq! normalize!
+ # Add new data to the end of the current sequence.
+ # The original sequence is modified.
+ #
+ # s = Bio::Sequence::NA.new('atgc')
+ # s << 'atgc'
+ # puts s #=> "atgcatgc"
+ # s << s
+ # puts s #=> "atgcatgcatgcatgc"
+ # ---
+ # *Returns*:: current Bio::Sequence::NA/AA object (modified)
def <<(*arg)
super(self.class.new(*arg))
***************
*** 42,50 ****
alias concat <<
def +(*arg)
self.class.new(super(*arg))
end
! # Returns the subsequence of the self string.
def subseq(s = 1, e = self.length)
raise "Error: start/end position must be a positive integer" unless s > 0 and e > 0
--- 97,141 ----
alias concat <<
+ # Create a new sequence by adding to an existing sequence.
+ # The existing sequence is not modified.
+ #
+ # s = Bio::Sequence::NA.new('atgc')
+ # s2 = s + 'atgc'
+ # puts s2 #=> "atgcatgc"
+ # puts s #=> "atgc"
+ #
+ # The new sequence is of the same class as the existing sequence if
+ # the new data was added to an existing sequence,
+ #
+ # puts s2.class == s.class #=> true
+ #
+ # but if an existing sequence is added to a String, the result is a String
+ #
+ # s3 = 'atgc' + s
+ # puts s3.class #=> String
+ # ---
+ # *Returns*:: new Bio::Sequence::NA/AA *or* String object
def +(*arg)
self.class.new(super(*arg))
end
! # Returns a new sequence containing the subsequence identified by the
! # start and end numbers given as parameters. *Important:* Biological
! # sequence numbering conventions (one-based) rather than ruby's
! # (zero-based) numbering conventions are used.
! #
! # s = Bio::Sequence::NA.new('atggaatga')
! # puts s.subseq(1,3) #=> "atg"
! #
! # Start defaults to 1 and end defaults to the entire existing string, so
! # subseq called without any parameters simply returns a new sequence
! # identical to the existing sequence.
! #
! # puts s.subseq #=> "atggaatga"
! # ---
! # *Arguments*:
! # * (optional) _s_(start): Integer (default 1)
! # * (optional) _e_(end): Integer (default current sequence length)
! # *Returns*:: new Bio::Sequence::NA/AA object
def subseq(s = 1, e = self.length)
raise "Error: start/end position must be a positive integer" unless s > 0 and e > 0
***************
*** 54,80 ****
end
! # This method iterates on sub string with specified length 'window_size'.
! # By specifing 'step_size', codon sized shifting or spliting genome
! # sequence with ovelapping each end can easily be yielded.
#
! # The remainder sequence at the terminal end will be returned.
#
! # Example:
! # # prints average GC% on each 100bp
! # seq.window_search(100) do |subseq|
# puts subseq.gc
# end
! # # prints every translated peptide (length 5aa) in the same frame
! # seq.window_search(15, 3) do |subseq|
# puts subseq.translate
# end
! # # split genome sequence by 10000bp with 1000bp overlap in fasta format
# i = 1
! # remainder = seq.window_search(10000, 9000) do |subseq|
# puts subseq.to_fasta("segment #{i}", 60)
# i += 1
# end
# puts remainder.to_fasta("segment #{i}", 60)
! #
def window_search(window_size, step_size = 1)
i = 0
--- 145,177 ----
end
! # This method steps through a sequence in steps of 'step_size' by
! # subsequences of 'window_size'. Typically used with a block.
! # Any remaining sequence at the terminal end will be returned.
#
! # Prints average GC% on each 100bp
#
! # s.window_search(100) do |subseq|
# puts subseq.gc
# end
! #
! # Prints every translated peptide (length 5aa) in the same frame
! #
! # s.window_search(15, 3) do |subseq|
# puts subseq.translate
# end
! #
! # Split genome sequence by 10000bp with 1000bp overlap in fasta format
! #
# i = 1
! # remainder = s.window_search(10000, 9000) do |subseq|
# puts subseq.to_fasta("segment #{i}", 60)
# i += 1
# end
# puts remainder.to_fasta("segment #{i}", 60)
! # ---
! # *Arguments*:
! # * (required) _window_size_: Fixnum
! # * (optional) _step_size_: Fixnum (default 1)
! # *Returns*:: new Bio::Sequence::NA/AA object
def window_search(window_size, step_size = 1)
i = 0
***************
*** 85,91 ****
end
! # This method receive a hash of residues/bases to the particular values,
! # and sum up the value along with the self sequence. Especially useful
! # to use with the window_search method and amino acid indices etc.
def total(hash)
hash.default = 0.0 unless hash.default
--- 182,195 ----
end
! # Returns a float total value for the sequence given a hash of
! # base or residue values,
! #
! # values = {'a' => 0.1, 't' => 0.2, 'g' => 0.3, 'c' => 0.4}
! # s = Bio::Sequence::NA.new('atgc')
! # puts s.total(values) #=> 1.0
! # ---
! # *Arguments*:
! # * (required) _hash_: Hash object
! # *Returns*:: Float object
def total(hash)
hash.default = 0.0 unless hash.default
***************
*** 100,103 ****
--- 204,212 ----
# Returns a hash of the occurrence counts for each residue or base.
+ #
+ # s = Bio::Sequence::NA.new('atgc')
+ # puts s.composition #=> {"a"=>1, "c"=>1, "g"=>1, "t"=>1}
+ # ---
+ # *Returns*:: Hash object
def composition
count = Hash.new(0)
***************
*** 108,118 ****
end
! # Returns a randomized sequence keeping its composition by default.
! # The argument is required when generating a random sequence from the empty
! # sequence (used by the class methods NA.randomize, AA.randomize).
! # If the block is given, yields for each random residue/base.
def randomize(hash = nil)
length = self.length
if hash
count = hash.clone
count.each_value {|x| length += x}
--- 217,244 ----
end
! # Returns a randomized sequence. The default is to retain the same
! # base/residue composition as the original. If a hash of base/residue
! # counts is given, the new sequence will be based on that hash
! # composition. If a block is given, each new randomly selected
! # position will be passed into the block. In all cases, the
! # original sequence is not modified.
! #
! # s = Bio::Sequence::NA.new('atgc')
! # puts s.randomize #=> "tcag" (for example)
! #
! # new_composition = {'a' => 2, 't' => 2}
! # puts s.randomize(new_composition) #=> "ttaa" (for example)
! #
! # count = 0
! # s.randomize { |x| count += 1 }
! # puts count #=> 4
! # ---
! # *Arguments*:
! # * (optional) _hash_: Hash object
! # *Returns*:: new Bio::Sequence::NA/AA object
def randomize(hash = nil)
length = self.length
if hash
+ length = 0
count = hash.clone
count.each_value {|x| length += x}
***************
*** 139,151 ****
end
! # Generate a new random sequence with the given frequency of bases
! # or residues. The sequence length is determined by the sum of each
! # base/residue occurences.
def self.randomize(*arg, &block)
self.new('').randomize(*arg, &block)
end
! # Receive a GenBank style position string and convert it to the Locations
! # objects to splice the sequence itself. See also: bio/location.rb
def splice(position)
unless position.is_a?(Locations) then
--- 265,305 ----
end
! # Generate a new random sequence with the given frequency of bases.
! # The sequence length is determined by their cumulative sum.
! # (See also Bio::Sequence::Common#randomize which creates a new
! # randomized sequence object using the base composition of an existing
! # sequence instance).
! #
! # counts = {'R'=>1,'L'=>2,'E'=>3,'A'=>4}
! # puts Bio::Sequence::AA.randomize(counts) #=> "AAEAELALRE" (for example)
! #
! # You may also feed the output of randomize into a block
! #
! # actual_counts = {'R'=>0,'L'=>0,'E'=>0,'A'=>0}
! # Bio::Sequence::AA.randomize(counts) {|x| actual_counts[x] += 1}
! # actual_counts #=> {"A"=>4, "L"=>2, "E"=>3, "R"=>1}
! # ---
! # *Arguments*:
! # * (optional) _hash_: Hash object
! # *Returns*:: Bio::Sequence::NA/AA object
def self.randomize(*arg, &block)
self.new('').randomize(*arg, &block)
end
! # Return a new sequence extracted from the original using a GenBank style
! # position string. See also documentation for the Bio::Location class.
! #
! # s = Bio::Sequence::NA.new('atgcatgcatgcatgc')
! # puts s.splice('1..3') #=> "atg"
! # puts s.splice('join(1..3,8..10)') #=> "atgcat"
! # puts s.splice('complement(1..3)') #=> "cat"
! # puts s.splice('complement(join(1..3,8..10))') #=> "atgcat"
! #
! # Note that 'complement'ed Genbank position strings will have no
! # effect on Bio::Sequence::AA objects.
! # ---
! # *Arguments*:
! # * (required) _position_: String *or* Bio::Locations object
! # *Returns*:: Bio::Sequence::NA/AA object
def splice(position)
unless position.is_a?(Locations) then
Index: format.rb
===================================================================
RCS file: /home/repository/bioruby/bioruby/lib/bio/sequence/format.rb,v
retrieving revision 1.2
retrieving revision 1.3
diff -C2 -d -r1.2 -r1.3
*** format.rb 6 Feb 2006 14:20:35 -0000 1.2
--- format.rb 26 Mar 2006 02:27:59 -0000 1.3
***************
*** 4,8 ****
# Copyright:: Copyright (C) 2006
# Toshiaki Katayama <k at bioruby.org>,
! # Naohisa Goto <ng at bioruby.org>
# License:: Ruby's
#
--- 4,9 ----
# Copyright:: Copyright (C) 2006
# Toshiaki Katayama <k at bioruby.org>,
! # Naohisa Goto <ng at bioruby.org>,
! # Ryan Raaum <ryan at raaum.org>
# License:: Ruby's
#
***************
*** 21,29 ****
class Sequence
module Format
! # Output the FASTA format string of the sequence. The 1st argument is
! # used in the comment line. If the 2nd argument (integer) is given,
! # the output sequence will be folded.
def format_fasta(header = nil, width = nil)
header ||= "#{@entry_id} #{@definition}"
--- 22,56 ----
class Sequence
+ # = DESCRIPTION
+ # A Mixin[http://www.rubycentral.com/book/tut_modules.html]
+ # of methods used by Bio::Sequence#output to output sequences in
+ # common bioinformatic formats. These are not called in isolation.
+ #
+ # = USAGE
+ # # Given a Bio::Sequence object,
+ # puts s.output(:fasta)
+ # puts s.output(:genbank)
+ # puts s.output(:embl)
module Format
! # INTERNAL USE ONLY, YOU SHOULD NOT CALL THIS METHOD. (And in any
! # case, it would be difficult to successfully call this method outside
! # its expected context).
! #
! # Output the FASTA format string of the sequence.
! #
! # UNFORTUNATELY, the current implementation of Bio::Sequence is incapable of
! # using either the header or width arguments. So something needs to be
! # changed...
! #
! # Currently, this method is used in Bio::Sequence#output like so,
! #
! # s = Bio::Sequence.new('atgc')
! # puts s.output(:fasta) #=> "> \natgc\n"
! # ---
! # *Arguments*:
! # * (optional) _header_: String (default nil)
! # * (optional) _width_: Fixnum (default nil)
! # *Returns*:: String object
def format_fasta(header = nil, width = nil)
header ||= "#{@entry_id} #{@definition}"
***************
*** 37,44 ****
end
! def format_gff
raise NotImplementedError
end
def format_genbank
prefix = ' ' * 5
--- 64,83 ----
end
! # Not yet implemented :)
! # Remove the nodoc command after implementation!
! # ---
! # *Returns*:: String object
! def format_gff #:nodoc:
raise NotImplementedError
end
+ # INTERNAL USE ONLY, YOU SHOULD NOT CALL THIS METHOD. (And in any
+ # case, it would be difficult to successfully call this method outside
+ # its expected context).
+ #
+ # Output the Genbank format string of the sequence.
+ # Used in Bio::Sequence#output.
+ # ---
+ # *Returns*:: String object
def format_genbank
prefix = ' ' * 5
***************
*** 49,52 ****
--- 88,99 ----
end
+ # INTERNAL USE ONLY, YOU SHOULD NOT CALL THIS METHOD. (And in any
+ # case, it would be difficult to successfully call this method outside
+ # its expected context).
+ #
+ # Output the EMBL format string of the sequence.
+ # Used in Bio::Sequence#output.
+ # ---
+ # *Returns*:: String object
def format_embl
prefix = 'FT '
Index: aa.rb
===================================================================
RCS file: /home/repository/bioruby/bioruby/lib/bio/sequence/aa.rb,v
retrieving revision 1.2
retrieving revision 1.3
diff -C2 -d -r1.2 -r1.3
*** aa.rb 6 Feb 2006 14:11:31 -0000 1.2
--- aa.rb 26 Mar 2006 02:27:59 -0000 1.3
***************
*** 3,7 ****
#
# Copyright:: Copyright (C) 2006
! # Toshiaki Katayama <k at bioruby.org>
# License:: Ruby's
#
--- 3,8 ----
#
# Copyright:: Copyright (C) 2006
! # Toshiaki Katayama <k at bioruby.org>,
! # Ryan Raaum <ryan at raaum.org>
# License:: Ruby's
#
***************
*** 17,27 ****
class Sequence
!
! # Amino Acid sequence
class AA < String
include Bio::Sequence::Common
! # Generate a amino acid sequence object from a string.
def initialize(str)
super
--- 18,61 ----
class Sequence
! # = DESCRIPTION
! # Bio::Sequence::AA represents a bare Amino Acid sequence in bioruby.
! #
! # = USAGE
! # # Create an Amino Acid sequence.
! # aa = Bio::Sequence::AA.new('ACDEFGHIKLMNPQRSTVWYU')
! #
! # # What are the three-letter codes for all the residues?
! # puts aa.codes
! #
! # # What are the names of all the residues?
! # puts aa.names
! #
! # # What is the molecular weight of this peptide?
! # puts aa.molecular_weight
class AA < String
include Bio::Sequence::Common
! # Generate an amino acid sequence object from a string.
! #
! # s = Bio::Sequence::AA.new("RRLEHTFVFLRNFSLMLLRY")
! #
! # or maybe (if you have an amino acid sequence in a file)
! #
! # s = Bio::Sequence::AA.new(File.open('aa.txt').read)
! #
! # Amino Acid sequences are *always* all uppercase in bioruby
! #
! # s = Bio::Sequence::AA.new("rrLeHtfV")
! # puts s #=> "RRLEHTFV"
! #
! # Whitespace is stripped from the sequence
! #
! # s = Bio::Sequence::AA.new("RRL\nELA\tRG\r RL")
! # puts s #=> "RRLELARGRL"
! # ---
! # *Arguments*:
! # * (required) _str_: String
! # *Returns*:: Bio::Sequence::AA object
def initialize(str)
super
***************
*** 31,45 ****
! # Estimate the weight of this protein.
def molecular_weight
Bio::AminoAcid.weight(self)
end
def to_re
Bio::AminoAcid.to_re(self)
end
! # Generate the list of the names of the each residue along with the
! # sequence (3 letters code).
def codes
array = []
--- 65,98 ----
! # Estimate molecular weight based on
! # Fasman1976[http://www.genome.ad.jp/dbget-bin/www_bget?aaindex+FASG760101]
! #
! # s = Bio::Sequence::AA.new("RRLE")
! # puts s.molecular_weight #=> 572.655
! # ---
! # *Returns*:: Float object
def molecular_weight
Bio::AminoAcid.weight(self)
end
+ # Create a ruby regular expression instance
+ # (Regexp[http://corelib.rubyonrails.org/classes/Regexp.html])
+ #
+ # s = Bio::Sequence::AA.new("RRLE")
+ # puts s.to_re #=> /RRLE/
+ # ---
+ # *Returns*:: Regexp object
def to_re
Bio::AminoAcid.to_re(self)
end
! # Generate the list of the names of each residue along with the
! # sequence (3 letters code). Codes used in bioruby are found in the
! # Bio::AminoAcid::NAMES hash.
! #
! # s = Bio::Sequence::AA.new("RRLE")
! # puts s.codes #=> ["Arg", "Arg", "Leu", "Glu"]
! # ---
! # *Returns*:: Array object
def codes
array = []
***************
*** 50,54 ****
end
! # Similar to codes but returns long names.
def names
self.codes.map do |x|
--- 103,115 ----
end
! # Generate the list of the names of each residue along with the
! # sequence (full name). Names used in bioruby are found in the
! # Bio::AminoAcid::NAMES hash.
! #
! # s = Bio::Sequence::AA.new("RRLE")
! # puts s.names
! # #=> ["arginine", "arginine", "leucine", "glutamic acid"]
! # ---
! # *Returns*:: Array object
def names
self.codes.map do |x|
Index: generic.rb
===================================================================
RCS file: /home/repository/bioruby/bioruby/lib/bio/sequence/generic.rb,v
retrieving revision 1.3
retrieving revision 1.4
diff -C2 -d -r1.3 -r1.4
*** generic.rb 6 Feb 2006 14:26:04 -0000 1.3
--- generic.rb 26 Mar 2006 02:27:59 -0000 1.4
***************
*** 14,18 ****
class Sequence
! class Generic < String
include Bio::Sequence::Common
--- 14,18 ----
class Sequence
! class Generic < String #:nodoc:
include Bio::Sequence::Common
Index: na.rb
===================================================================
RCS file: /home/repository/bioruby/bioruby/lib/bio/sequence/na.rb,v
retrieving revision 1.2
retrieving revision 1.3
diff -C2 -d -r1.2 -r1.3
*** na.rb 6 Feb 2006 14:13:52 -0000 1.2
--- na.rb 26 Mar 2006 02:27:59 -0000 1.3
***************
*** 3,7 ****
#
# Copyright:: Copyright (C) 2006
! # Toshiaki Katayama <k at bioruby.org>
# License:: Ruby's
#
--- 3,8 ----
#
# Copyright:: Copyright (C) 2006
! # Toshiaki Katayama <k at bioruby.org>,
! # Ryan Raaum <ryan at raaum.org>
# License:: Ruby's
#
***************
*** 19,28 ****
! # Nucleic Acid sequence
class NA < String
include Bio::Sequence::Common
! # Generate a nucleic acid sequence object from a string.
def initialize(str)
super
--- 20,78 ----
! # = DESCRIPTION
! # Bio::Sequence::NA represents a bare Nucleic Acid sequence in bioruby.
! #
! # = USAGE
! # # Create a Nucleic Acid sequence.
! # dna = Bio::Sequence.auto('atgcatgcATGCATGCAAAA')
! # rna = Bio::Sequence.auto('augcaugcaugcaugcaaaa')
! #
! # # What are the names of all the bases?
! # puts dna.names
! # puts rna.names
! #
! # # What is the GC percentage?
! # puts dna.gc_percent
! # puts rna.gc_percent
! #
! # # What is the molecular weight?
! # puts dna.molecular_weight
! # puts rna.molecular_weight
! #
! # # What is the reverse complement?
! # puts dna.reverse_complement
! # puts dna.complement
! #
! # # Is this sequence DNA or RNA?
! # puts dna.rna?
! #
! # # Translate my sequence (see method docs for many options)
! # puts dna.translate
! # puts rna.translate
class NA < String
include Bio::Sequence::Common
! # Generate a nucleic acid sequence object from a string.
! #
! # s = Bio::Sequence::NA.new("aagcttggaccgttgaagt")
! #
! # or maybe (if you have a nucleic acid sequence in a file)
! #
! # s = Bio::Sequence::NA.new(File.open('dna.txt').read)
! #
! # Nucleic Acid sequences are *always* all lowercase in bioruby
! #
! # s = Bio::Sequence::NA.new("AAGcTtGG")
! # puts s #=> "aagcttgg"
! #
! # Whitespace is stripped from the sequence
! #
! # s = Bio::Sequence::NA.new("atg\nggg\ttt\r gc")
! # puts s #=> "atggggttgc"
! # ---
! # *Arguments*:
! # * (required) _str_: String
! # *Returns*:: Bio::Sequence::NA object
def initialize(str)
super
***************
*** 31,36 ****
end
! # This method depends on Locations class, see bio/location.rb
! def splicing(position)
mRNA = super
if mRNA.rna?
--- 81,86 ----
end
! # Alias of Bio::Sequence::Common splice method, documented there.
! def splicing(position) #:nodoc:
mRNA = super
if mRNA.rna?
***************
*** 42,46 ****
end
! # Returns complement sequence without reversing ("atgc" -> "tacg")
def forward_complement
s = self.class.new(self)
--- 92,103 ----
end
! # Returns a new complementary sequence object (without reversing).
! # The original sequence object is not modified.
! #
! # s = Bio::Sequence::NA.new('atgc')
! # puts s.forward_complement #=> 'tacg'
! # puts s #=> 'atgc'
! # ---
! # *Returns*:: new Bio::Sequence::NA object
def forward_complement
s = self.class.new(self)
***************
*** 49,53 ****
end
! # Convert to complement sequence without reversing ("atgc" -> "tacg")
def forward_complement!
if self.rna?
--- 106,117 ----
end
! # Converts the current sequence into its complement (without reversing).
! # The original sequence object is modified.
! #
! # s = Bio::Sequence::NA.new('atgc')
! # puts s.forward_complement! #=> 'tacg'
! # puts s #=> 'tacg'
! # ---
! # *Returns*:: current Bio::Sequence::NA object (modified)
def forward_complement!
if self.rna?
***************
*** 59,63 ****
end
! # Returns reverse complement sequence ("atgc" -> "gcat")
def reverse_complement
s = self.class.new(self)
--- 123,134 ----
end
! # Returns a new sequence object with the reverse complement
! # sequence to the original. The original sequence is not modified.
! #
! # s = Bio::Sequence::NA.new('atgc')
! # puts s.reverse_complement #=> 'gcat'
! # puts s #=> 'atgc'
! # ---
! # *Returns*:: new Bio::Sequence::NA object
def reverse_complement
s = self.class.new(self)
***************
*** 66,70 ****
end
! # Convert to reverse complement sequence ("atgc" -> "gcat")
def reverse_complement!
self.reverse!
--- 137,148 ----
end
! # Converts the original sequence into its reverse complement.
! # The original sequence is modified.
! #
! # s = Bio::Sequence::NA.new('atgc')
! # puts s.reverse_complement! #=> 'gcat'
! # puts s #=> 'gcat'
! # ---
! # *Returns*:: current Bio::Sequence::NA object (modified)
def reverse_complement!
self.reverse!
***************
*** 72,87 ****
end
! # Aliases for short
alias complement reverse_complement
alias complement! reverse_complement!
! # Translate into the amino acid sequence from the given frame and the
! # selected codon table. The table also can be a Bio::CodonTable object.
! # The 'unknown' character is used for invalid/unknown codon (can be
! # used for 'nnn' and/or gap translation in practice).
#
! # Frame can be 1, 2 or 3 for the forward strand and -1, -2 or -3
! # (4, 5 or 6 is also accepted) for the reverse strand.
def translate(frame = 1, table = 1, unknown = 'X')
if table.is_a?(Bio::CodonTable)
--- 150,235 ----
end
! # Alias for Bio::Sequence::NA#reverse_complement
alias complement reverse_complement
+
+ # Alias for Bio::Sequence::NA#reverse_complement!
alias complement! reverse_complement!
! # Translate into an amino acid sequence.
! #
! # s = Bio::Sequence::NA.new('atggcgtga')
! # puts s.translate #=> "MA*"
#
! # By default, translate starts in reading frame position 1, but you
! # can start in either 2 or 3 as well,
! #
! # puts s.translate(2) #=> "WR"
! # puts s.translate(3) #=> "GV"
! #
! # You may also translate the reverse complement in one step by using frame
! # values of -1, -2, and -3 (or 4, 5, and 6)
! #
! # puts s.translate(-1) #=> "SRH"
! # puts s.translate(4) #=> "SRH"
! # puts s.reverse_complement.translate(1) #=> "SRH"
! #
! # The default codon table in the translate function is the Standard
! # Eukaryotic codon table. The translate function takes either a
! # number or a Bio::CodonTable object for its table argument.
! # The available tables are
! # (NCBI[http://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi?mode=t]):
! #
! # 1. "Standard (Eukaryote)"
! # 2. "Vertebrate Mitochondrial"
! # 3. "Yeast Mitochondorial"
! # 4. "Mold, Protozoan, Coelenterate Mitochondrial and Mycoplasma/Spiroplasma"
! # 5. "Invertebrate Mitochondrial"
! # 6. "Ciliate Macronuclear and Dasycladacean"
! # 9. "Echinoderm Mitochondrial"
! # 10. "Euplotid Nuclear"
! # 11. "Bacteria"
! # 12. "Alternative Yeast Nuclear"
! # 13. "Ascidian Mitochondrial"
! # 14. "Flatworm Mitochondrial"
! # 15. "Blepharisma Macronuclear"
! # 16. "Chlorophycean Mitochondrial"
! # 21. "Trematode Mitochondrial"
! # 22. "Scenedesmus obliquus mitochondrial"
! # 23. "Thraustochytrium Mitochondrial"
! #
! # If you are using anything other than the default table, you must specify
! # frame in the translate method call,
! #
! # puts s.translate #=> "MA*" (using defaults)
! # puts s.translate(1,1) #=> "MA*" (same as above, but explicit)
! # puts s.translate(1,2) #=> "MAW" (different codon table)
! #
! # and using a Bio::CodonTable instance in the translate method call,
! #
! # mt_table = Bio::CodonTable[2]
! # puts s.translate(1, mt_table) #=> "MAW"
! #
! # By default, any invalid or unknown codons (as could happen if the
! # sequence contains ambiguities) will be represented by 'X' in the
! # translated sequence.
! # You may change this to any character of your choice.
! #
! # s = Bio::Sequence::NA.new('atgcNNtga')
! # puts s.translate #=> "MX*"
! # puts s.translate(1,1,'9') #=> "M9*"
! #
! # The translate method considers gaps to be unknown characters and treats
! # them as such (i.e. does not collapse sequences prior to translation), so
! #
! # s = Bio::Sequence::NA.new('atgc--tga')
! # puts s.translate #=> "MX*"
! # ---
! # *Arguments*:
! # * (optional) _frame_: one of 1,2,3,4,5,6,-1,-2,-3 (default 1)
! # * (optional) _table_: Fixnum in range 1,23 or Bio::CodonTable object
! # (default 1)
! # * (optional) _unknown_: Character (default 'X')
! # *Returns*:: Bio::Sequence::AA object
def translate(frame = 1, table = 1, unknown = 'X')
if table.is_a?(Bio::CodonTable)
***************
*** 109,113 ****
end
! # Returns counts of the each codon in the sequence by Hash.
def codon_usage
hash = Hash.new(0)
--- 257,276 ----
end
! # Returns counts of each codon in the sequence in a hash.
! #
! # s = Bio::Sequence::NA.new('atggcgtga')
! # puts s.codon_usage #=> {"gcg"=>1, "tga"=>1, "atg"=>1}
! #
! # This method does not validate codons! Any three letter group is a 'codon'. So,
! #
! # s = Bio::Sequence::NA.new('atggNNtga')
! # puts s.codon_usage #=> {"tga"=>1, "gnn"=>1, "atg"=>1}
! #
! # s = Bio::Sequence::NA.new('atgg--tga')
! # puts s.codon_usage #=> {"tga"=>1, "g--"=>1, "atg"=>1}
! #
! # Also, there is no option to work in any frame other than the first.
! # ---
! # *Returns*:: Hash object
def codon_usage
hash = Hash.new(0)
***************
*** 118,122 ****
end
! # Calculate the ratio of GC / ATGC bases in percent.
def gc_percent
count = self.composition
--- 281,291 ----
end
! # Calculate the ratio of GC / ATGC bases as a percentage rounded to
! # the nearest whole number.
! #
! # s = Bio::Sequence::NA.new('atggcgtga')
! # puts s.gc_percent #=> 55
! # ---
! # *Returns*:: Fixnum
def gc_percent
count = self.composition
***************
*** 127,136 ****
end
! # Show abnormal bases other than 'atgcu'.
def illegal_bases
self.scan(/[^atgcu]/).sort.uniq
end
! # Estimate the weight of this biological string molecule.
def molecular_weight
if self.rna?
--- 296,322 ----
end
! # Returns an alphabetically sorted array of any non-standard bases
! # (other than 'atgcu').
! #
! # s = Bio::Sequence::NA.new('atgStgQccR')
! # puts s.illegal_bases #=> ["q", "r", "s"]
! # ---
! # *Returns*:: Array object
def illegal_bases
self.scan(/[^atgcu]/).sort.uniq
end
! # Estimate molecular weight (using the values from BioPerl's
! # SeqStats.pm[http://doc.bioperl.org/releases/bioperl-1.0.1/Bio/Tools/SeqStats.html] module).
! #
! # s = Bio::Sequence::NA.new('atggcgtga')
! # puts s.molecular_weight #=> 2841.00708
! #
! # RNA and DNA do not have the same molecular weights,
! #
! # s = Bio::Sequence::NA.new('auggcguga')
! # puts s.molecular_weight #=> 2956.94708
! # ---
! # *Returns*:: Float object
def molecular_weight
if self.rna?
***************
*** 141,145 ****
end
! # Convert the universal code string into the regular expression.
def to_re
if self.rna?
--- 327,337 ----
end
! # Create a ruby regular expression instance
! # (Regexp[http://corelib.rubyonrails.org/classes/Regexp.html])
! #
! # s = Bio::Sequence::NA.new('atggcgtga')
! # puts s.to_re #=> /atggcgtga/
! # ---
! # *Returns*:: Regexp object
def to_re
if self.rna?
***************
*** 150,154 ****
end
! # Convert the self string into the list of the names of the each base.
def names
array = []
--- 342,353 ----
end
! # Generate the list of the names of each nucleotide along with the
! # sequence (full name). Names used in bioruby are found in the
! # Bio::NucleicAcid::NAMES hash.
! #
! # s = Bio::Sequence::NA.new('atg')
! # puts s.names #=> ["Adenine", "Thymine", "Guanine"]
! # ---
! # *Returns*:: Array object
def names
array = []
***************
*** 159,176 ****
end
! # Output a DNA string by substituting 'u' to 't'.
def dna
self.tr('u', 't')
end
def dna!
self.tr!('u', 't')
end
! # Output a RNA string by substituting 't' to 'u'.
def rna
self.tr('t', 'u')
end
def rna!
self.tr!('t', 'u')
--- 358,405 ----
end
! # Returns a new sequence object with any 'u' bases changed to 't'.
! # The original sequence is not modified.
! #
! # s = Bio::Sequence::NA.new('augc')
! # puts s.dna #=> 'atgc'
! # puts s #=> 'augc'
! # ---
! # *Returns*:: new Bio::Sequence::NA object
def dna
self.tr('u', 't')
end
+ # Changes any 'u' bases in the original sequence to 't'.
+ # The original sequence is modified.
+ #
+ # s = Bio::Sequence::NA.new('augc')
+ # puts s.dna! #=> 'atgc'
+ # puts s #=> 'atgc'
+ # ---
+ # *Returns*:: current Bio::Sequence::NA object (modified)
def dna!
self.tr!('u', 't')
end
! # Returns a new sequence object with any 't' bases changed to 'u'.
! # The original sequence is not modified.
! #
! # s = Bio::Sequence::NA.new('atgc')
! # puts s.rna #=> 'augc'
! # puts s #=> 'atgc'
! # ---
! # *Returns*:: new Bio::Sequence::NA object
def rna
self.tr('t', 'u')
end
+ # Changes any 't' bases in the original sequence to 'u'.
+ # The original sequence is modified.
+ #
+ # s = Bio::Sequence::NA.new('atgc')
+ # puts s.rna! #=> 'augc'
+ # puts s #=> 'augc'
+ # ---
+ # *Returns*:: current Bio::Sequence::NA object (modified)
def rna!
self.tr!('t', 'u')
More information about the bioruby-cvs
mailing list