[BioRuby-cvs] bioruby/lib/bio/appl/gcg msf.rb, NONE, 1.1 seq.rb, NONE, 1.1
Naohisa Goto
ngoto at dev.open-bio.org
Thu Dec 14 19:52:55 UTC 2006
Update of /home/repository/bioruby/bioruby/lib/bio/appl/gcg
In directory dev.open-bio.org:/tmp/cvs-serv15819/lib/bio/appl/gcg
Added Files:
msf.rb seq.rb
Log Message:
New files/classes: Bio::GCG::Msf in lib/bio/appl/gcg/msf.rb for GCG msf
multiple sequence alignment format parser, and Bio::GCG::Seq in
lib/bio/appl/gcg/seq.rb for GCG sequence format parser.
Autoload of the classes (in bio.rb) and file format autodetection
(in flatfile.rb) are also supported.
Bio::Alignment::Output#output_msf, #output(:msf, ...) are added
to generate msf formatted string from multiple alignment object.
--- NEW FILE: msf.rb ---
#
# = bio/appl/gcg/msf.rb - GCG multiple sequence alignment (.msf) parser class
#
# Copyright:: Copyright (C) 2003, 2006
# Naohisa Goto <ng at bioruby.org>
# License:: Ruby's
#
# $Id: msf.rb,v 1.1 2006/12/14 19:52:53 ngoto Exp $
#
# = About Bio::GCG::Msf
#
# Please refer document of Bio::GCG::Msf.
#
#---
# (depends on autoload)
#require 'bio/appl/gcg/seq'
#+++
module Bio
module GCG
# The msf is a multiple sequence alignment format developed by Wisconsin.
# Bio::GCG::Msf is a msf format parser.
class Msf #< DB
# delimiter used by Bio::FlatFile
DELIMITER = RS = nil
# Creates a new Msf object.
def initialize(str)
str = str.sub(/\A[\r\n]+/, '')
if /^\!\![A-Z]+\_MULTIPLE\_ALIGNMNENT/ =~ str[/.*/] then
@heading = str[/.*/] # '!!NA_MULTIPLE_ALIGNMENT 1.0' or like this
str.sub!(/.*/, '')
end
str.sub!(/.*\.\.$/m, '')
@description = $&.to_s.sub(/^.*\.\.$/, '').to_s
d = $&.to_s
if m = /(.+)\s+MSF\:\s+(\d+)\s+Type\:\s+(\w)\s+(.+)\s+(Comp)?Check\:\s+(\d+)/.match(d) then
@entry_id = m[1].to_s.strip
@length = (m[2] ? m[2].to_i : nil)
@seq_type = m[3]
@date = m[4].to_s.strip
@checksum = (m[6] ? m[6].to_i : nil)
end
str.sub!(/.*\/\/$/m, '')
a = $&.to_s.split(/^/)
@seq_info = []
a.each do |x|
if /Name\: / =~ x then
s = {}
x.scan(/(\S+)\: +(\S*)/) { |y| s[$1] = $2 }
@seq_info << s
end
end
@data = str
@description.sub!(/\A(\r\n|\r|\n)/, '')
@align = nil
end
# description
attr_reader :description
# ID of the alignment
attr_reader :entry_id
# alignment length
attr_reader :length
# sequence type ("N" for DNA/RNA or "P" for protein)
attr_reader :seq_type
# date
attr_reader :date
# checksum
attr_reader :checksum
# heading
# ('!!NA_MULTIPLE_ALIGNMENT 1.0' or whatever like this)
attr_reader :heading
#---
## data (internally used, will be obsoleted)
#attr_reader :data
#
## seq. info. (internally used, will be obsoleted)
#attr_reader :seq_info
#+++
# symbol comparison table
def symbol_comparison_table
unless defined?(@symbol_comparison_table)
/Symbol comparison table\: +(\S+)/ =~ @description
@symbol_comparison_table = $1
end
@symbol_comparison_table
end
# gap weight
def gap_weight
unless defined?(@gap_weight)
/GapWeight\: +(\S+)/ =~ @description
@gap_weight = $1
end
@gap_weight
end
# gap length weight
def gap_length_weight
unless defined?(@gap_length_weight)
/GapLengthWeight\: +(\S+)/ =~ @description
@gap_length_weight = $1
end
@gap_length_weight
end
# CompCheck field
def compcheck
unless defined?(@compcheck)
if /CompCheck\: +(\d+)/ =~ @description then
@compcheck = $1.to_i
else
@compcheck = nil
end
end
@compcheck
end
# parsing
def do_parse
return if @align
a = @data.strip.split(/\n\n/)
@seq_data = Array.new(@seq_info.size)
@seq_data.collect! { |x| Array.new }
a.each do |x|
b = x.split(/\n/)
nw = 0
if b.size > @seq_info.size then
if /^ +/ =~ b.shift.to_s
nw = $&.to_s.length
end
end
if nw > 0 then
b.each_with_index { |y, i| y[0, nw] = ''; @seq_data[i] << y }
else
b.each_with_index { |y, i|
@seq_data[i] << y.strip.split(/ +/, 2)[1].to_s
}
end
end
case seq_type
when 'P', 'p'
k = Bio::Sequence::AA
when 'N', 'n'
k = Bio::Sequence::NA
else
k = Bio::Sequence::Generic
end
@seq_data.collect! do |x|
y = x.join('')
y.gsub!(/[\s\d]+/, '')
k.new(y)
end
aln = Bio::Alignment.new
@seq_data.each_with_index do |x, i|
aln.store(@seq_info[i]['Name'], x)
end
@align = aln
end
private :do_parse
# returns Bio::Alignment object.
def alignment
do_parse
@align
end
# gets seq data (used internally) (will be obsoleted)
def seq_data
do_parse
@seq_data
end
# validates checksum
def validate_checksum
do_parse
valid = true
total = 0
@seq_data.each_with_index do |x, i|
sum = Bio::GCG::Seq.calc_checksum(x)
if sum != @seq_info[i]['Check'].to_i
valid = false
break
end
total += sum
end
return false unless valid
if @checksum != 0 # "Check:" field of BioPerl is always 0
valid = ((total % 10000) == @checksum)
end
valid
end
end #class Msf
end #module GCG
end # module Bio
--- NEW FILE: seq.rb ---
#
# = bio/appl/gcg/seq.rb - GCG sequence file format class (.seq/.pep file)
#
# Copyright:: Copyright (C) 2003, 2006
# Naohisa Goto <ng at bioruby.org>
# License:: Ruby's
#
# $Id: seq.rb,v 1.1 2006/12/14 19:52:53 ngoto Exp $
#
# = About Bio::GCG::Msf
#
# Please refer document of Bio::GCG::Msf.
#
module Bio
module GCG
#
# = Bio::GCG::Seq
#
# This is GCG sequence file format (.seq or .pep) parser class.
#
# = References
#
# * Information about GCG Wisconsin Package(R)
# http://www.accelrys.com/products/gcg_wisconsin_package .
# * EMBOSS sequence formats
# http://www.hgmp.mrc.ac.uk/Software/EMBOSS/Themes/SequenceFormats.html
# * BioPerl document
# http://docs.bioperl.org/releases/bioperl-1.2.3/Bio/SeqIO/gcg.html
class Seq #< DB
# delimiter used by Bio::FlatFile
DELIMITER = RS = nil
# Creates new instance of this class.
# str must be a GCG seq formatted string.
def initialize(str)
@heading = str[/.*/] # '!!NA_SEQUENCE 1.0' or like this
str = str.sub(/.*/, '')
str.sub!(/.*\.\.$/m, '')
@definition = $&.to_s.sub(/^.*\.\.$/, '').to_s
desc = $&.to_s
if m = /(.+)\s+Length\:\s+(\d+)\s+(.+)\s+Type\:\s+(\w)\s+Check\:\s+(\d+)/.match(desc) then
@entry_id = m[1].to_s.strip
@length = (m[2] ? m[2].to_i : nil)
@date = m[3].to_s.strip
@seq_type = m[4]
@checksum = (m[5] ? m[5].to_i : nil)
end
@data = str
@seq = nil
@definition.strip!
end
# ID field.
attr_reader :entry_id
# Description field.
attr_reader :definition
# "Length:" field.
# Note that sometimes this might differ from real sequence length.
attr_reader :length
# Date field of this entry.
attr_reader :date
# "Type:" field, which indicates sequence type.
# "N" means nucleic acid sequence, "P" means protein sequence.
attr_reader :seq_type
# "Check:" field, which indicates checksum of current sequence.
attr_reader :checksum
# heading
# ('!!NA_SEQUENCE 1.0' or whatever like this)
attr_reader :heading
#---
## data (internally used, will be obsoleted)
#attr_reader :data
#+++
# Sequence data.
# The class of the sequence is Bio::Sequence::NA, Bio::Sequence::AA
# or Bio::Sequence::Generic, according to the sequence type.
def seq
unless @seq then
case @seq_type
when 'N', 'n'
k = Bio::Sequence::NA
when 'P', 'p'
k = Bio::Sequence::AA
else
k = Bio::Sequence
end
@seq = k.new(@data.tr('^-a-zA-Z.~', ''))
end
@seq
end
# If you know the sequence is AA, use this method.
# Returns a Bio::Sequence::AA object.
#
# If you call naseq for protein sequence,
# or aaseq for nucleic sequence, RuntimeError will be raised.
def aaseq
if seq.is_a?(Bio::Sequence::AA) then
@seq
else
raise 'seq_type != \'P\''
end
end
# If you know the sequence is NA, use this method.
# Returens a Bio::Sequence::NA object.
#
# If you call naseq for protein sequence,
# or aaseq for nucleic sequence, RuntimeError will be raised.
def naseq
if seq.is_a?(Bio::Sequence::NA) then
@seq
else
raise 'seq_type != \'N\''
end
end
# Validates checksum.
# If validation succeeds, returns true.
# Otherwise, returns false.
def validate_checksum
checksum == self.class.calc_checksum(seq)
end
#---
# class methods
#+++
# Calculates checksum from given string.
def self.calc_checksum(str)
# Reference: Bio::SeqIO::gcg of BioPerl-1.2.3
idx = 0
sum = 0
str.upcase.tr('^A-Z.~', '').each_byte do |c|
idx += 1
sum += idx * c
idx = 0 if idx >= 57
end
(sum % 10000)
end
# Creates a new GCG sequence format text.
# Parameters can be omitted.
#
# Examples:
# Bio::GCG::Seq.to_gcg(:definition=>'H.sapiens DNA',
# :seq_type=>'N', :entry_id=>'gi-1234567',
# :seq=>seq, :date=>date)
#
def self.to_gcg(hash)
seq = hash[:seq]
if seq.is_a?(Bio::Sequence::NA) then
seq_type = 'N'
elsif seq.is_a?(Bio::Sequence::AA) then
seq_type = 'P'
else
seq_type = (hash[:seq_type] or 'P')
end
if seq_type == 'N' then
head = '!!NA_SEQUENCE 1.0'
else
head = '!!AA_SEQUENCE 1.0'
end
date = (hash[:date] or Time.now.strftime('%B %d, %Y %H:%M'))
entry_id = hash[:entry_id].to_s.strip
len = seq.length
checksum = self.calc_checksum(seq)
definition = hash[:definition].to_s.strip
seq = seq.upcase.gsub(/.{1,50}/, "\\0\n")
seq.gsub!(/.{10}/, "\\0 ")
w = len.to_s.size + 1
i = 1
seq.gsub!(/^/) { |x| s = sprintf("\n%*d ", w, i); i += 50; s }
[ head, "\n", definition, "\n\n",
"#{entry_id} Length: #{len} #{date} " \
"Type: #{seq_type} Check: #{checksum} ..\n",
seq, "\n" ].join('')
end
end #class Seq
end #module GCG
end #module Bio
More information about the bioruby-cvs
mailing list