[BioRuby-cvs] bioruby/lib/bio/sequence aa.rb, NONE, 1.1 common.rb,
NONE, 1.1 compat.rb, NONE, 1.1 format.rb, NONE, 1.1 na.rb, NONE, 1.1
Katayama Toshiaki
k at pub.open-bio.org
Sun Jan 22 23:13:38 EST 2006
Update of /home/repository/bioruby/bioruby/lib/bio/sequence
In directory pub.open-bio.org:/tmp/cvs-serv16731/sequence
Added Files:
aa.rb common.rb compat.rb format.rb na.rb
Log Message:
* refactored to store annotations in Bio::Sequence class
* common methods are separated into Bio::Sequence::Common module
* Bio::Sequence no longer inherits String
* Bio::Sequence::NA and AA inherits String and include Bio::Sequence::Common
* lib/bio/sequence.rb is a container for rich sequence
* lib/bio/sequence/common.rb contains Bio::Sequence::Common module
* lib/bio/sequence/na.rb defines Bio::Sequence::NA class
* lib/bio/sequence/aa.rb defines Bio::Sequence::AA class
* lib/bio/sequence/format.rb is for sequence format converter (define output formats)
* lib/bio/sequence/compat.rb is just for backward compatibility
--- NEW FILE: compat.rb ---
# only for backward compatibility, use Bio::Sequence#output(:fasta) instead
module Bio
class Sequence
def to_s
String.new(@seq)
end
alias to_str to_s
module Common
# Output the FASTA format string of the sequence. The 1st argument is
# used as the comment string. If the 2nd option is given, the output
# sequence will be folded.
def to_fasta(header = '', width = nil)
">#{header}\n" +
if width
self.to_s.gsub(Regexp.new(".{1,#{width}}"), "\\0\n")
else
self.to_s + "\n"
end
end
end # Common
class NA
def self.randomize(*arg, &block)
self.new('').randomize(*arg, &block)
end
end # NA
class AA
def self.randomize(*arg, &block)
self.new('').randomize(*arg, &block)
end
end # AA
end # Sequence
end # Bio
--- NEW FILE: na.rb ---
module Bio
class Sequence
# Nucleic Acid sequence
class NA < String
include Bio::Sequence::Common
# Generate a nucleic acid sequence object from a string.
def initialize(str)
super
self.downcase!
self.tr!(" \t\n\r",'')
end
# This method depends on Locations class, see bio/location.rb
def splicing(position)
mRNA = super
if mRNA.rna?
mRNA.tr!('t', 'u')
else
mRNA.tr!('u', 't')
end
mRNA
end
# Returns complement sequence without reversing ("atgc" -> "tacg")
def forward_complement
s = self.class.new(self)
s.forward_complement!
s
end
# Convert to complement sequence without reversing ("atgc" -> "tacg")
def forward_complement!
if self.rna?
self.tr!('augcrymkdhvbswn', 'uacgyrkmhdbvswn')
else
self.tr!('atgcrymkdhvbswn', 'tacgyrkmhdbvswn')
end
self
end
# Returns reverse complement sequence ("atgc" -> "gcat")
def reverse_complement
s = self.class.new(self)
s.reverse_complement!
s
end
# Convert to reverse complement sequence ("atgc" -> "gcat")
def reverse_complement!
self.reverse!
self.forward_complement!
end
# Aliases for short
alias complement reverse_complement
alias complement! reverse_complement!
# Translate into the amino acid sequence from the given frame and the
# selected codon table. The table also can be a Bio::CodonTable object.
# The 'unknown' character is used for invalid/unknown codon (can be
# used for 'nnn' and/or gap translation in practice).
#
# Frame can be 1, 2 or 3 for the forward strand and -1, -2 or -3
# (4, 5 or 6 is also accepted) for the reverse strand.
def translate(frame = 1, table = 1, unknown = 'X')
if table.is_a?(Bio::CodonTable)
ct = table
else
ct = Bio::CodonTable[table]
end
naseq = self.dna
case frame
when 1, 2, 3
from = frame - 1
when 4, 5, 6
from = frame - 4
naseq.complement!
when -1, -2, -3
from = -1 - frame
naseq.complement!
else
from = 0
end
nalen = naseq.length - from
nalen -= nalen % 3
aaseq = naseq[from, nalen].gsub(/.{3}/) {|codon| ct[codon] or unknown}
return Bio::Sequence::AA.new(aaseq)
end
# Returns counts of the each codon in the sequence by Hash.
def codon_usage
hash = Hash.new(0)
self.window_search(3, 3) do |codon|
hash[codon] += 1
end
return hash
end
# Calculate the ratio of GC / ATGC bases in percent.
def gc_percent
count = self.composition
at = count['a'] + count['t'] + count['u']
gc = count['g'] + count['c']
gc = 100 * gc / (at + gc)
return gc
end
# Show abnormal bases other than 'atgcu'.
def illegal_bases
self.scan(/[^atgcu]/).sort.uniq
end
# Estimate the weight of this biological string molecule.
# NucleicAcid is defined in bio/data/na.rb
def molecular_weight
if self.rna?
NucleicAcid.weight(self, true)
else
NucleicAcid.weight(self)
end
end
# Convert the universal code string into the regular expression.
def to_re
if self.rna?
NucleicAcid.to_re(self.dna, true)
else
NucleicAcid.to_re(self)
end
end
# Convert the self string into the list of the names of the each base.
def names
array = []
self.each_byte do |x|
array.push(NucleicAcid.names[x.chr.upcase])
end
return array
end
# Output a DNA string by substituting 'u' to 't'.
def dna
self.tr('u', 't')
end
def dna!
self.tr!('u', 't')
end
# Output a RNA string by substituting 't' to 'u'.
def rna
self.tr('t', 'u')
end
def rna!
self.tr!('t', 'u')
end
def rna?
self.index('u')
end
protected :rna?
def pikachu
self.dna.tr("atgc", "pika") # joke, of course :-)
end
end # NA
end # Sequence
end # Bio
--- NEW FILE: format.rb ---
# porting from N. Goto's feature-output.rb on BioRuby list.
module Bio
class Sequence
# Output the FASTA format string of the sequence. The 1st argument is
# used in the comment line. If the 2nd argument (integer) is given,
# the output sequence will be folded.
def format_fasta(header = nil, width = nil)
header ||= "#{@entry_id} #{@definition}"
">#{header}\n" +
if width
@seq.to_s.gsub(Regexp.new(".{1,#{width}}"), "\\0\n")
else
@seq.to_s + "\n"
end
end
def format_genbank
prefix = ' ' * 5
indent = prefix + ' ' * 16
fwidth = 79 - indent.length
format_features(prefix, indent, fwidth)
end
def format_embl
prefix = 'FT '
indent = prefix + ' ' * 16
fwidth = 80 - indent.length
format_features(prefix, indent, fwidth)
end
private
def format_features(prefix, indent, width)
result = ''
@features.each do |feature|
result << prefix + sprintf("%-16s", feature.feature)
position = feature.position
#position = feature.locations.to_s
head = ''
wrap(position, width).each_line do |line|
result << head << line
head = indent
end
result << format_qualifiers(feature.qualifiers, width)
end
return result
end
def format_qualifiers(qualifiers, indent, width)
qualifiers.each do |qualifier|
q = qualifier.qualifier
v = qualifier.value.to_s
if v == true
lines = wrap('/' + q, width)
elsif q == 'translation'
lines = fold('/' + q + '=' + val, width)
else
if v[/\D/]
#v.delete!("\x00-\x1f\x7f-\xff")
v.gsub!(/"/, '""')
v = '"' + v + '"'
end
lines = wrap('/' + q + '=' + val, width)
end
return lines.gsub(/^/, indent)
end
end
def fold(str, width)
str.gsub(Regexp.new("(.{1,#{width}})"), "\\1\n")
end
def wrap(str, width)
result = []
left = str.dup
while left and left.length > width
line = nil
width.downto(1) do |i|
if left[i..i] == ' ' or /[,;]/ =~ left[(i-1)..(i-1)] then
line = left[0..(i-1)].sub(/ +\z/, '')
left = left[i..-1].sub(/\A +/, '')
break
end
end
if line.nil? then
line = left[0..(width-1)]
left = left[width..-1]
end
result << line
end
result << left if left
return result.join("\n")
end
end # Sequence
end # Bio
--- NEW FILE: common.rb ---
module Bio
class Sequence
module Common
def to_s
String.new(self)
end
alias to_str to_s
# Force self to re-initialize for clean up (remove white spaces,
# case unification).
def seq
self.class.new(self)
end
# Similar to the 'seq' method, but changes the self object destructively.
def normalize!
initialize(self)
self
end
alias seq! normalize!
def <<(*arg)
super(self.class.new(*arg))
end
alias concat <<
def +(*arg)
self.class.new(super(*arg))
end
# Returns the subsequence of the self string.
def subseq(s = 1, e = self.length)
return nil if s < 1 or e < 1
s -= 1
e -= 1
self[s..e]
end
# This method iterates on sub string with specified length 'window_size'.
# By specifing 'step_size', codon sized shifting or spliting genome
# sequence with ovelapping each end can easily be yielded.
#
# The remainder sequence at the terminal end will be returned.
#
# Example:
# # prints average GC% on each 100bp
# seq.window_search(100) do |subseq|
# puts subseq.gc
# end
# # prints every translated peptide (length 5aa) in the same frame
# seq.window_search(15, 3) do |subseq|
# puts subseq.translate
# end
# # split genome sequence by 10000bp with 1000bp overlap in fasta format
# i = 1
# remainder = seq.window_search(10000, 9000) do |subseq|
# puts subseq.to_fasta("segment #{i}", 60)
# i += 1
# end
# puts remainder.to_fasta("segment #{i}", 60)
#
def window_search(window_size, step_size = 1)
i = 0
0.step(self.length - window_size, step_size) do |i|
yield self[i, window_size]
end
return self[i + window_size .. -1]
end
# This method receive a hash of residues/bases to the particular values,
# and sum up the value along with the self sequence. Especially useful
# to use with the window_search method and amino acid indices etc.
def total(hash)
hash.default = 0.0 unless hash.default
sum = 0.0
self.each_byte do |x|
begin
sum += hash[x.chr]
end
end
return sum
end
# Returns a hash of the occurrence counts for each residue or base.
def composition
count = Hash.new(0)
self.scan(/./) do |x|
count[x] += 1
end
return count
end
# Returns a randomized sequence keeping its composition by default.
# The argument is required when generating a random sequence from the empty
# sequence (used by the class methods NA.randomize, AA.randomize).
# If the block is given, yields for each random residue/base.
def randomize(hash = nil)
length = self.length
if hash
count = hash.clone
count.each_value {|x| length += x}
else
count = self.composition
end
seq = ''
tmp = {}
length.times do
count.each do |k, v|
tmp[k] = v * rand
end
max = tmp.max {|a, b| a[1] <=> b[1]}
count[max.first] -= 1
if block_given?
yield max.first
else
seq += max.first
end
end
return self.class.new(seq)
end
# Generate a new random sequence with the given frequency of bases
# or residues. The sequence length is determined by the sum of each
# base/residue occurences.
def self.randomize(*arg, &block)
self.new('').randomize(*arg, &block)
end
# Receive a GenBank style position string and convert it to the Locations
# objects to splice the sequence itself. See also: bio/location.rb
#
# This method depends on Locations class, see bio/location.rb
def splicing(position)
unless position.is_a?(Locations) then
position = Locations.new(position)
end
s = ''
position.each do |location|
if location.sequence
s << location.sequence
else
exon = self.subseq(location.from, location.to)
begin
exon.complement! if location.strand < 0
rescue NameError
end
s << exon
end
end
return self.class.new(s)
end
end # Common
end # Sequence
end # Bio
--- NEW FILE: aa.rb ---
module Bio
class Sequence
# Amino Acid sequence
class AA < String
include Bio::Sequence::Common
# Generate a amino acid sequence object from a string.
def initialize(str)
super
self.upcase!
self.tr!(" \t\n\r",'')
end
# Estimate the weight of this protein.
# AminoAcid is defined in bio/data/aa.rb
def molecular_weight
AminoAcid.weight(self)
end
def to_re
AminoAcid.to_re(self)
end
# Generate the list of the names of the each residue along with the
# sequence (3 letters code).
def codes
array = []
self.each_byte do |x|
array.push(AminoAcid.names[x.chr])
end
return array
end
# Similar to codes but returns long names.
def names
self.codes.map do |x|
AminoAcid.names[x]
end
end
end # AA
end # Sequence
end # Bio
More information about the bioruby-cvs
mailing list