[BioRuby-cvs] bioruby/lib/bio sequence.rb,0.45,0.46
Katayama Toshiaki
k at pub.open-bio.org
Tue Nov 15 09:49:02 EST 2005
Update of /home/repository/bioruby/bioruby/lib/bio
In directory pub.open-bio.org:/tmp/cvs-serv31691/lib/bio
Modified Files:
sequence.rb
Log Message:
* converted to RDoc
* Bio::Sequence::NA#codon_usage method is added
Index: sequence.rb
===================================================================
RCS file: /home/repository/bioruby/bioruby/lib/bio/sequence.rb,v
retrieving revision 0.45
retrieving revision 0.46
diff -C2 -d -r0.45 -r0.46
*** sequence.rb 5 Nov 2005 08:22:52 -0000 0.45
--- sequence.rb 15 Nov 2005 14:49:00 -0000 0.46
***************
*** 1,8 ****
#
! # bio/sequence.rb - biological sequence class
#
! # Copyright (C) 2000-2005 KATAYAMA Toshiaki <k at bioruby.org>
! # Copyright (C) 2001 Yoshinori K. Okuji <o at bioruby.org>
! # Copyright (C) 2003 GOTO Naohisa <ng at bioruby.org>
#
# This library is free software; you can redistribute it and/or
--- 1,19 ----
#
! # = bio/sequence.rb - biological sequence class
#
! # Copyright:: Copyright (C) 2000-2005
! # Toshiaki Katayama <k at bioruby.org>,
! # Yoshinori K. Okuji <okuji at embug.org>,
! # Naohisa Goto <ng at bioruby.org>
! # License:: LGPL
! #
! # $Id$
! #
! #--
! # *TODO* remove this functionality?
! # You can use Bio::Seq instead of Bio::Sequence for short.
! #++
! #
! #--
#
# This library is free software; you can redistribute it and/or
***************
*** 20,24 ****
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
#
! # $Id$
#
--- 31,35 ----
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
#
! #++
#
***************
*** 30,359 ****
module Bio
! # Nucleic/Amino Acid sequence
! class Sequence < String
! def self.auto(str)
! moltype = guess(str)
! if moltype == NA
! NA.new(str)
! else
! AA.new(str)
! end
end
! def guess(threshold = 0.9)
! cmp = self.composition
!
! bases = cmp['A'] + cmp['T'] + cmp['G'] + cmp['C'] +
! cmp['a'] + cmp['t'] + cmp['g'] + cmp['c']
! total = self.length - cmp['N'] - cmp['n']
! if bases.to_f / total > threshold
! return NA
! else
! return AA
! end
! end
! def self.guess(str, *args)
! self.new(str).guess(*args)
end
! def to_s
! String.new(self)
! end
! alias to_str to_s
! def seq
! self.class.new(self)
! end
! def normalize!
! initialize(self)
! self
! end
! alias seq! normalize!
! def <<(*arg)
! super(self.class.new(*arg))
! end
! alias concat <<
! def +(*arg)
! self.class.new(super(*arg))
! end
! def subseq(s = 1, e = self.length)
! return nil if s < 1 or e < 1
! s -= 1
! e -= 1
! self[s..e]
end
! def to_fasta(header = '', width = nil)
! ">#{header}\n" +
! if width
! self.to_s.gsub(Regexp.new(".{1,#{width}}"), "\\0\n")
! else
! self.to_s + "\n"
end
end
! def window_search(window_size, step_size = 1)
! i = 0
! 0.step(self.length - window_size, step_size) do |i|
! yield self[i, window_size]
! end
! return self[i + window_size .. -1]
end
! def total(hash)
! hash.default = 0.0 unless hash.default
! sum = 0.0
! self.each_byte do |x|
! begin
! sum += hash[x.chr]
! end
! end
! return sum
end
! def composition
! count = Hash.new(0)
! self.scan(/./) do |x|
! count[x] += 1
end
! return count
! end
! def randomize(hash = nil)
! length = self.length
! if hash
! count = hash.clone
! count.each_value {|x| length += x}
else
! count = self.composition
end
! seq = ''
! tmp = {}
! length.times do
! count.each do |k, v|
! tmp[k] = v * rand
! end
! max = tmp.max {|a, b| a[1] <=> b[1]}
! count[max.first] -= 1
! if block_given?
! yield max.first
! else
! seq += max.first
end
end
- return self.class.new(seq)
end
! def self.randomize(*arg, &block)
! self.new('').randomize(*arg, &block)
end
# This method depends on Locations class, see bio/location.rb
def splicing(position)
! unless position.is_a?(Locations) then
! position = Locations.new(position)
! end
! s = ''
! position.each do |location|
! if location.sequence
! s << location.sequence
! else
! exon = self.subseq(location.from, location.to)
! begin
! exon.complement! if location.strand < 0
! rescue NameError
! end
! s << exon
! end
end
! return self.class.new(s)
end
! # Nucleic Acid sequence
!
! class NA < Sequence
!
! def initialize(str)
! super
! self.downcase!
! self.tr!(" \t\n\r",'')
! end
!
! # This method depends on Locations class, see bio/location.rb
! def splicing(position)
! mRNA = super
! if mRNA.rna?
! mRNA.tr!('t', 'u')
! else
! mRNA.tr!('u', 't')
! end
! mRNA
! end
!
! def complement
! s = self.class.new(self)
! s.complement!
! s
end
! def complement!
! if self.rna?
! self.reverse!
! self.tr!('augcrymkdhvbswn', 'uacgyrkmhdbvswn')
! else
! self.reverse!
! self.tr!('atgcrymkdhvbswn', 'tacgyrkmhdbvswn')
! end
! self
end
!
! def translate(frame = 1, table = 1, unknown = 'X')
! if table.is_a?(Bio::CodonTable)
! ct = table
! else
! ct = Bio::CodonTable[table]
! end
! naseq = self.dna
! case frame
! when 1, 2, 3
! frame -= 1
! when 4, 5, 6
! frame -= 4
! naseq.complement!
! when -1, -2, -3
! frame = -1 - frame
! naseq.complement!
! else
! frame = 0
! end
! nalen = naseq.length - (naseq.length - frame) % 3
! aaseq = naseq[frame, nalen].gsub(/.{3}/) {|codon| ct[codon] or unknown}
! return Bio::Sequence::AA.new(aaseq)
end
! def gc_percent
! count = self.composition
! at = count['a'] + count['t'] + count['u']
! gc = count['g'] + count['c']
! gc = 100 * gc / (at + gc)
! return gc
end
! def illegal_bases
! self.scan(/[^atgcu]/).sort.uniq
! end
! # NucleicAcid is defined in bio/data/na.rb
! def molecular_weight
! if self.rna?
! NucleicAcid.weight(self, true)
! else
! NucleicAcid.weight(self)
! end
! end
! def to_re
! if self.rna?
! NucleicAcid.to_re(self.dna, true)
! else
! NucleicAcid.to_re(self)
! end
end
! def names
! array = []
! self.each_byte do |x|
! array.push(NucleicAcid.names[x.chr.upcase])
! end
! return array
end
! def dna
! self.tr('u', 't')
end
! def dna!
! self.tr!('u', 't')
! end
! def rna
! self.tr('t', 'u')
! end
! def rna!
! self.tr!('t', 'u')
! end
! def rna?
! self.index('u')
! end
! protected :rna?
! def pikachu
! self.dna.tr("atgc", "pika") # joke, of course :-)
! end
end
- # Amino Acid sequence
! class AA < Sequence
! def initialize(str)
! super
! self.upcase!
! self.tr!(" \t\n\r",'')
! end
! # AminoAcid is defined in bio/data/aa.rb
! def molecular_weight
! AminoAcid.weight(self)
! end
! def to_re
! AminoAcid.to_re(self)
! end
! def codes
! array = []
! self.each_byte do |x|
! array.push(AminoAcid.names[x.chr])
! end
! return array
! end
! def names
! self.codes.map do |x|
! AminoAcid.names[x]
! end
end
end
end
- class Seq < Sequence
- attr_accessor :entry_id, :definition, :features, :references, :comments,
- :date, :keywords, :dblinks, :taxonomy, :moltype
- end
end
if __FILE__ == $0
--- 41,444 ----
module Bio
! # Nucleic/Amino Acid sequence
! class Sequence < String
! def self.auto(str)
! moltype = guess(str)
! if moltype == NA
! NA.new(str)
! else
! AA.new(str)
end
+ end
! def guess(threshold = 0.9)
! cmp = self.composition
! bases = cmp['A'] + cmp['T'] + cmp['G'] + cmp['C'] +
! cmp['a'] + cmp['t'] + cmp['g'] + cmp['c']
! total = self.length - cmp['N'] - cmp['n']
! if bases.to_f / total > threshold
! return NA
! else
! return AA
end
+ end
! def self.guess(str, *args)
! self.new(str).guess(*args)
! end
! def to_s
! String.new(self)
! end
! alias to_str to_s
! # Force self to re-initialize for clean up (remove white spaces,
! # case unification).
! def seq
! self.class.new(self)
! end
! # Similar to the 'seq' method, but changes the self object destructively.
! def normalize!
! initialize(self)
! self
! end
! alias seq! normalize!
! def <<(*arg)
! super(self.class.new(*arg))
! end
! alias concat <<
+ def +(*arg)
+ self.class.new(super(*arg))
+ end
! # Returns the subsequence of the self string.
! def subseq(s = 1, e = self.length)
! return nil if s < 1 or e < 1
! s -= 1
! e -= 1
! self[s..e]
! end
!
! # Output the FASTA format string of the sequence. The 1st argument is
! # used as the comment string. If the 2nd option is given, the output
! # sequence will be folded.
! def to_fasta(header = '', width = nil)
! ">#{header}\n" +
! if width
! self.to_s.gsub(Regexp.new(".{1,#{width}}"), "\\0\n")
! else
! self.to_s + "\n"
end
+ end
! # This method iterates on sub string with specified length 'window_size'.
! # By specifing 'step_size', codon sized shifting or spliting genome
! # sequence with ovelapping each end can easily be yielded.
! #
! # The remainder sequence at the terminal end will be returned.
! #
! # Example:
! # # prints average GC% on each 100bp
! # seq.window_search(100) do |subseq|
! # puts subseq.gc
! # end
! # # prints every translated peptide (length 5aa) in the same frame
! # seq.window_search(15, 3) do |subseq|
! # puts subseq.translate
! # end
! # # split genome sequence by 10000bp with 1000bp overlap in fasta format
! # i = 1
! # remainder = seq.window_search(10000, 9000) do |subseq|
! # puts subseq.to_fasta("segment #{i}", 60)
! # i += 1
! # end
! # puts remainder.to_fasta("segment #{i}", 60)
! #
! def window_search(window_size, step_size = 1)
! i = 0
! 0.step(self.length - window_size, step_size) do |i|
! yield self[i, window_size]
! end
! return self[i + window_size .. -1]
! end
!
! # This method receive a hash of residues/bases to the particular values,
! # and sum up the value along with the self sequence. Especially useful
! # to use with the window_search method and amino acid indices etc.
! def total(hash)
! hash.default = 0.0 unless hash.default
! sum = 0.0
! self.each_byte do |x|
! begin
! sum += hash[x.chr]
end
end
+ return sum
+ end
! # Returns a hash of the occurrence counts for each residue or base.
! def composition
! count = Hash.new(0)
! self.scan(/./) do |x|
! count[x] += 1
end
+ return count
+ end
! # Returns a randomized sequence keeping its composition by default.
! # The argument is required when generating a random sequence from the empty
! # sequence (used by the class methods NA.randomize, AA.randomize).
! # If the block is given, yields for each random residue/base.
! def randomize(hash = nil)
! length = self.length
! if hash
! count = hash.clone
! count.each_value {|x| length += x}
! else
! count = self.composition
end
! seq = ''
! tmp = {}
! length.times do
! count.each do |k, v|
! tmp[k] = v * rand
end
! max = tmp.max {|a, b| a[1] <=> b[1]}
! count[max.first] -= 1
! if block_given?
! yield max.first
else
! seq += max.first
end
+ end
+ return self.class.new(seq)
+ end
! # Generate a new random sequence with the given frequency of bases
! # or residues. The sequence length is determined by the sum of each
! # base/residue occurences.
! def self.randomize(*arg, &block)
! self.new('').randomize(*arg, &block)
! end
! # Receive a GenBank style position string and convert it to the Locations
! # objects to splice the sequence itself. See also: bio/location.rb
! #
! # This method depends on Locations class, see bio/location.rb
! def splicing(position)
! unless position.is_a?(Locations) then
! position = Locations.new(position)
! end
! s = ''
! position.each do |location|
! if location.sequence
! s << location.sequence
! else
! exon = self.subseq(location.from, location.to)
! begin
! exon.complement! if location.strand < 0
! rescue NameError
end
+ s << exon
end
end
+ return self.class.new(s)
+ end
!
! # Nucleic Acid sequence
!
! class NA < Sequence
!
! # Generate a nucleic acid sequence object from a string.
! def initialize(str)
! super
! self.downcase!
! self.tr!(" \t\n\r",'')
end
# This method depends on Locations class, see bio/location.rb
def splicing(position)
! mRNA = super
! if mRNA.rna?
! mRNA.tr!('t', 'u')
! else
! mRNA.tr!('u', 't')
end
! mRNA
end
+ # Returns a reverse complement sequence (including the universal codes).
+ def complement
+ s = self.class.new(self)
+ s.complement!
+ s
+ end
! def complement!
! if self.rna?
! self.reverse!
! self.tr!('augcrymkdhvbswn', 'uacgyrkmhdbvswn')
! else
! self.reverse!
! self.tr!('atgcrymkdhvbswn', 'tacgyrkmhdbvswn')
end
+ self
+ end
! # Translate into the amino acid sequence from the given frame and the
! # selected codon table. The table also can be a Bio::CodonTable object.
! # The 'unknown' character is used for invalid/unknown codon (can be
! # used for 'nnn' and/or gap translation in practice).
! #
! # Frame can be 1, 2 or 3 for the forward strand and -1, -2 or -3
! # (4, 5 or 6 is also accepted) for the reverse strand.
! def translate(frame = 1, table = 1, unknown = 'X')
! if table.is_a?(Bio::CodonTable)
! ct = table
! else
! ct = Bio::CodonTable[table]
end
! naseq = self.dna
! case frame
! when 1, 2, 3
! frame -= 1
! when 4, 5, 6
! frame -= 4
! naseq.complement!
! when -1, -2, -3
! frame = -1 - frame
! naseq.complement!
! else
! frame = 0
end
+ nalen = naseq.length - (naseq.length - frame) % 3
+ aaseq = naseq[frame, nalen].gsub(/.{3}/) {|codon| ct[codon] or unknown}
+ return Bio::Sequence::AA.new(aaseq)
+ end
! # Returns counts of the each codon in the sequence by Hash.
! def codon_usage
! hash = Hash.new(0)
! self.window_search(3, 3) do |codon|
! hash[codon] += 1
end
+ return hash
+ end
! # Calculate the ratio of GC / ATGC bases in percent.
! def gc_percent
! count = self.composition
! at = count['a'] + count['t'] + count['u']
! gc = count['g'] + count['c']
! gc = 100 * gc / (at + gc)
! return gc
! end
! # Show abnormal bases other than 'atgcu'.
! def illegal_bases
! self.scan(/[^atgcu]/).sort.uniq
! end
! # Estimate the weight of this biological string molecule.
! # NucleicAcid is defined in bio/data/na.rb
! def molecular_weight
! if self.rna?
! NucleicAcid.weight(self, true)
! else
! NucleicAcid.weight(self)
end
+ end
! # Convert the universal code string into the regular expression.
! def to_re
! if self.rna?
! NucleicAcid.to_re(self.dna, true)
! else
! NucleicAcid.to_re(self)
end
+ end
! # Convert the self string into the list of the names of the each base.
! def names
! array = []
! self.each_byte do |x|
! array.push(NucleicAcid.names[x.chr.upcase])
end
+ return array
+ end
! # Output a DNA string by substituting 'u' to 't'.
! def dna
! self.tr('u', 't')
! end
! def dna!
! self.tr!('u', 't')
! end
! # Output a RNA string by substituting 't' to 'u'.
! def rna
! self.tr('t', 'u')
! end
! def rna!
! self.tr!('t', 'u')
! end
! def rna?
! self.index('u')
! end
! protected :rna?
+ def pikachu
+ self.dna.tr("atgc", "pika") # joke, of course :-)
end
+ end
! # Amino Acid sequence
! class AA < Sequence
! # Generate a amino acid sequence object from a string.
! def initialize(str)
! super
! self.upcase!
! self.tr!(" \t\n\r",'')
! end
! # Estimate the weight of this protein.
! # AminoAcid is defined in bio/data/aa.rb
! def molecular_weight
! AminoAcid.weight(self)
! end
! def to_re
! AminoAcid.to_re(self)
! end
! # Generate the list of the names of the each residue along with the
! # sequence (3 letters code).
! def codes
! array = []
! self.each_byte do |x|
! array.push(AminoAcid.names[x.chr])
end
+ return array
+ end
+ # Similar to codes but returns long names.
+ def names
+ self.codes.map do |x|
+ AminoAcid.names[x]
+ end
end
end
+ end # Sequence
+ class Seq < Sequence
+ attr_accessor :entry_id, :definition, :features, :references, :comments,
+ :date, :keywords, :dblinks, :taxonomy, :moltype
end
+ end # Bio
+
+
if __FILE__ == $0
***************
*** 488,654 ****
end
-
- =begin
-
- = Bio::Sequence
-
- You can use Bio::Seq instead of Bio::Sequence for short.
-
- --- Bio::Sequence#seq
-
- Force self to re-initialize for clean up (remove white spaces,
- case unification).
-
- --- Bio::Sequence#seq!
- --- Bio::Sequence#normalize!
-
- Similar to the 'seq' method, but changes the self object destructively.
-
- --- Bio::Sequence#subseq(start = 1, end = self.length)
-
- Returns the subsequence of the self string.
-
- --- Bio::Sequence#to_fasta(header = '', width = nil)
-
- Output the FASTA format string of the sequence. The 1st argument is
- used as the comment string. If the 2nd option is given, the output
- sequence will be folded.
-
- --- Bio::Sequence#fasta(factory, header = '')
-
- Execute fasta by the factory (Bio::Fasta object) and returns
- Bio::Fasta::Report object. See Bio::Fasta for more details.
-
- --- Bio::Sequence#blast(factory, header = '')
-
- Execute blast by the factory (Bio::Blast object) and returns
- Bio::Blast::Report object. See Bio::Blast for more details.
-
- --- Bio::Sequence#splicing(position)
-
- Receive a GenBank style position string and convert it to the Locations
- objects to splice the sequence itself. See also: bio/location.rb
-
- --- Bio::Sequence#window_search(window_size, step_size = 1)
-
- This method iterates on sub string with specified length 'window_size'.
- By specifing 'step_size', codon sized shifting or spliting genome
- sequence with ovelapping each end can easily be yielded.
-
- The remainder sequence at the terminal end will be returned.
-
- Example:
- # prints average GC% on each 100bp
- seq.window_search(100) do |subseq|
- puts subseq.gc
- end
- # prints every translated peptide (length 5aa) in the same frame
- seq.window_search(15, 3) do |subseq|
- puts subseq.translate
- end
- # split genome sequence by 10000bp with 1000bp overlap in fasta format
- i = 1
- remainder = seq.window_search(10000, 9000) do |subseq|
- puts subseq.to_fasta("segment #{i}", 60)
- i += 1
- end
- puts remainder.to_fasta("segment #{i}", 60)
-
- --- Bio::Sequence#total(hash)
-
- This method receive a hash of residues/bases to the particular values,
- and sum up the value along with the self sequence. Especially useful
- to use with the window_search method and amino acid indices etc.
-
- --- Bio::Sequence#composition
-
- Returns a hash of the occurrence counts for each residue or base.
-
- --- Bio::Sequence#randomize(count = nil)
-
- Returns a randomized sequence keeping its composition by default.
- The argument is required when generating a random sequence from the empty
- sequence (used by the class methods NA.randomize, AA.randomize).
- If the block is given, yields for each random residue/base.
-
- --- Bio::Sequence.randomize(composition)
-
- Generate a new random sequence with the given frequency of bases
- or residues. The sequence length is determined by the sum of each
- base/residue occurences.
-
-
- == Bio::Sequence::NA
-
- --- Bio::Sequence::NA.new(str)
-
- Generate a nucleic acid sequence object from a string.
-
- --- Bio::Sequence::NA#complement
- --- Bio::Sequence::NA#complement!
-
- Returns a reverse complement sequence (including the universal codes).
-
- --- Bio::Sequence::NA#translate(frame = 1, table = 1, unknown = 'X')
-
- Translate into the amino acid sequence from the given frame and the
- selected codon table. The table also can be a Bio::CodonTable object.
- The 'unknown' character is used for invalid/unknown codon (can be
- used for 'nnn' and/or gap translation in practice).
-
- Frame can be 1, 2 or 3 for the forward strand and -1, -2 or -3
- (4, 5 or 6 is also accepted) for the reverse strand.
-
- --- Bio::Sequence::NA#gc_percent
- --- Bio::Sequence::NA#gc
-
- Calculate the ratio of GC / ATGC bases in percent.
-
- --- Bio::Sequence::NA#illegal_bases
-
- Show abnormal bases other than 'atgcu'.
-
- --- Bio::Sequence::NA#molecular_weight
-
- Estimate the weight of this biological string molecule.
-
- --- Bio::Sequence::NA#to_re
-
- Convert the universal code string into the regular expression.
-
- --- Bio::Sequence::NA#names
-
- Convert the self string into the list of the names of the each base.
-
- --- Bio::Sequence::NA#dna
- --- Bio::Sequence::NA#dna!
-
- Output a DNA string by substituting 'u' to 't'.
-
- --- Bio::Sequence::NA#rna
- --- Bio::Sequence::NA#rna!
-
- Output a RNA string by substituting 't' to 'u'.
-
-
- == Bio::Sequence::AA
-
- --- Bio::Sequence::AA.new(str)
-
- Generate a amino acid sequence object from a string.
-
- --- Bio::Sequence::AA#codes
-
- Generate the list of the names of the each residue along with the
- sequence (3 letters code).
-
- --- Bio::Sequence::NA#names
-
- Similar to codes but returns long names.
-
- --- Bio::Sequence::AA#molecular_weight
-
- Estimate the weight of this protein.
-
- =end
--- 573,575 ----
More information about the bioruby-cvs
mailing list