[BioRuby-cvs] bioruby/lib/bio alignment.rb,1.17,1.18
Naohisa Goto
ngoto at dev.open-bio.org
Thu Dec 14 12:39:48 UTC 2006
Update of /home/repository/bioruby/bioruby/lib/bio
In directory dev.open-bio.org:/tmp/cvs-serv13080/lib/bio
Modified Files:
alignment.rb
Log Message:
Bio::Alignment::ClustalWFormatter was removed and its methods were renamed
and moved to Bio::Alignment::Output.
Output in Phylip interleaved, Phylip non-interleaved, and Molphy
multiple alignment formats is now supported.
Fixed some bugs in ClustalW output for SequenceHash.
Some changes in SequenceHash.
Bio::Alignment::EnumerableExtension#sequence_names was newly added.
The to_fasta and to_clustal methods are now deprecated.
Please use the output methods (output_fasta, output_clustal) instead.
Index: alignment.rb
===================================================================
RCS file: /home/repository/bioruby/bioruby/lib/bio/alignment.rb,v
retrieving revision 1.17
retrieving revision 1.18
diff -C2 -d -r1.17 -r1.18
*** alignment.rb 13 Dec 2006 16:58:39 -0000 1.17
--- alignment.rb 14 Dec 2006 12:39:45 -0000 1.18
***************
*** 623,627 ****
elsif seqclass == Bio::Sequence::NA then
amino = false
! elsif self.find { |x| /[EFILPQ]/i =~ x } then
amino = true
else
--- 623,627 ----
elsif seqclass == Bio::Sequence::NA then
amino = false
! elsif self.each_seq { |x| /[EFILPQ]/i =~ x } then
amino = true
else
***************
*** 856,869 ****
end #module EnumerableExtension
! # ClustalWFormatter is a module to create ClustalW-formatted text
! # from an alignment object.
! #
! # It will be obsoleted and the methods will be frequently changed.
! module ClustalWFormatter
! # Check whether there are same names.
#
# array:: names of the sequences (array of string)
# len:: length to check (default:30)
! def have_same_name?(array, len = 30)
na30 = array.collect do |k|
k.to_s.split(/[\x00\s]/)[0].to_s[0, len].gsub(/\:\;\,\(\)/, '_').to_s
--- 856,882 ----
end #module EnumerableExtension
! module Output
! def output(format, *arg)
! case format
! when :clustal
! output_clustal(*arg)
! when :fasta
! output_fasta(*arg)
! when :phylip
! output_phylip(*arg)
! when :phylipnon
! output_phylipnon(*arg)
! when :molphy
! output_molphy(*arg)
! else
! raise "Unknown format: #{format.inspect}"
! end
! end
!
! # Check whether there are same names for ClustalW format.
#
# array:: names of the sequences (array of string)
# len:: length to check (default:30)
! def __clustal_have_same_name?(array, len = 30)
na30 = array.collect do |k|
k.to_s.split(/[\x00\s]/)[0].to_s[0, len].gsub(/\:\;\,\(\)/, '_').to_s
***************
*** 892,904 ****
end
end
! private :have_same_name?
! # Changes sequence names if there are conflicted names.
#
# array:: names of the sequences (array of string)
# len:: length to check (default:30)
! def avoid_same_name(array, len = 30)
na = array.collect { |k| k.to_s.gsub(/[\r\n\x00]/, ' ') }
! if dupidx = have_same_name?(na, len)
procs = [
Proc.new { |s, i|
--- 905,918 ----
end
end
! private :__clustal_have_same_name?
! # Changes sequence names if there are conflicted names
! # for ClustalW format.
#
# array:: names of the sequences (array of string)
# len:: length to check (default:30)
! def __clustal_avoid_same_name(array, len = 30)
na = array.collect { |k| k.to_s.gsub(/[\r\n\x00]/, ' ') }
! if dupidx = __clustal_have_same_name?(na, len)
procs = [
Proc.new { |s, i|
***************
*** 914,918 ****
na[i] = pr.call(s.to_s, i)
end
! dupidx = have_same_name?(na, len)
break unless dupidx
end
--- 928,932 ----
na[i] = pr.call(s.to_s, i)
end
! dupidx = __clustal_have_same_name?(na, len)
break unless dupidx
end
***************
*** 925,929 ****
na
end
! private :avoid_same_name
# Generates ClustalW-formatted text
--- 939,943 ----
na
end
! private :__clustal_avoid_same_name
# Generates ClustalW-formatted text
***************
*** 931,935 ****
# names:: names of the sequences
# options:: options
! def clustalw_formatter(seqs, names, options = {})
#(original)
aln = [ "CLUSTAL (0.00) multiple sequence alignment\n\n" ]
--- 945,949 ----
# names:: names of the sequences
# options:: options
! def __clustal_formatter(seqs, names, options = {})
#(original)
aln = [ "CLUSTAL (0.00) multiple sequence alignment\n\n" ]
***************
*** 946,950 ****
end
if !options.has_key?(:avoid_same_name) or options[:avoid_same_name]
! sn = avoid_same_name(sn)
end
--- 960,964 ----
end
if !options.has_key?(:avoid_same_name) or options[:avoid_same_name]
! sn = __clustal_avoid_same_name(sn)
end
***************
*** 971,976 ****
mline = (options[:match_line] or seqs.match_line(mopt))
! aseqs = seqs.collect do |s|
! s.to_s.gsub(seqs.gap_regexp, gchar)
end
case options[:case].to_s
--- 985,991 ----
mline = (options[:match_line] or seqs.match_line(mopt))
! aseqs = Array.new(seqs.size).clear
! seqs.each_seq do |s|
! aseqs << s.to_s.gsub(seqs.gap_regexp, gchar)
end
case options[:case].to_s
***************
*** 1006,1012 ****
aln.join('')
end
! private :clustalw_formatter
! end #module ClustalWFormatter
# Bio::Alignment::ArrayExtension is a set of useful methods for
--- 1021,1190 ----
aln.join('')
end
! private :__clustal_formatter
!
! # Generates ClustalW-formatted text
! # seqs:: sequences (must be an alignment object)
! # names:: names of the sequences
! # options:: options
! def output_clustal(options = {})
! __clustal_formatter(self, self.sequence_names, options)
! end
!
! # to_clustal is deprecated. Instead, please use output_clustal.
! #---
! #alias to_clustal output_clustal
! #+++
! def to_clustal(*arg)
! warn "to_clustal is deprecated. Please use output_clustal."
! output_clustal(*arg)
! end
!
! # Generates fasta format text and returns a string.
! def output_fasta(options={})
! #(original)
! width = (options[:width] or 70)
! if options[:avoid_same_name] then
! na = __clustal_avoid_same_name(self.sequence_names, 30)
! else
! na = self.sequence_names.collect do |k|
! k.to_s.gsub(/[\r\n\x00]/, ' ')
! end
! end
! if width and width > 0 then
! w_reg = Regexp.new(".{1,#{width}}")
! self.collect do |s|
! ">#{na.shift}\n" + s.to_s.gsub(w_reg, "\\0\n")
! end.join('')
! else
! self.collect do |s|
! ">#{na.shift}\n" + s.to_s + "\n"
! end.join('')
! end
! end
!
! # generates phylip interleaved alignment format as a string
! def output_phylip(options = {})
! aln, aseqs, lines = __output_phylip_common(options)
! lines.times do
! aseqs.each { |a| aln << a.shift }
! aln << "\n"
! end
! aln.pop if aln[-1] == "\n"
! aln.join('')
! end
!
! # generates Phylip3.2 (old) non-interleaved format as a string
! def output_phylipnon(options = {})
! aln, aseqs, lines = __output_phylip_common(options)
! aln.first + aseqs.join('')
! end
+ # common routine for interleaved/non-interleaved phylip format
+ def __output_phylip_common(options = {})
+ len = self.alignment_length
+ aln = [ " #{self.size} #{len}\n" ]
+ sn = self.sequence_names.collect { |x| x.to_s.gsub(/[\r\n\x00]/, ' ') }
+ if options[:replace_space]
+ sn.collect! { |x| x.gsub(/\s/, '_') }
+ end
+ if !options.has_key?(:escape) or options[:escape]
+ sn.collect! { |x| x.gsub(/[\:\;\,\(\)]/, '_') }
+ end
+ if !options.has_key?(:split) or options[:split]
+ sn.collect! { |x| x.split(/\s/)[0].to_s }
+ end
+ if !options.has_key?(:avoid_same_name) or options[:avoid_same_name]
+ sn = __clustal_avoid_same_name(sn, 10)
+ end
+
+ namewidth = 10
+ seqwidth = (options[:width] or 60)
+ seqwidth = seqwidth.div(10) * 10
+ seqregexp = Regexp.new("(.{1,#{seqwidth.div(10) * 11}})")
+ gchar = (options[:gap_char] or '-')
+
+ aseqs = Array.new(len).clear
+ self.each_seq do |s|
+ aseqs << s.to_s.gsub(self.gap_regexp, gchar)
+ end
+ case options[:case].to_s
+ when /lower/i
+ aseqs.each { |s| s.downcase! }
+ when /upper/i
+ aseqs.each { |s| s.upcase! }
+ end
+
+ aseqs.collect! do |s|
+ snx = sn.shift
+ head = sprintf("%*s", -namewidth, snx.to_s)[0, namewidth]
+ head2 = ' ' * namewidth
+ s << (gchar * (len - s.length))
+ s.gsub!(/(.{1,10})/n, " \\1")
+ s.gsub!(seqregexp, "\\1\n")
+ a = s.split(/^/)
+ head += a.shift
+ ret = a.collect { |x| head2 + x }
+ ret.unshift(head)
+ ret
+ end
+ lines = (len + seqwidth - 1).div(seqwidth)
+ [ aln, aseqs, lines ]
+ end
+
+ # Generates Molphy alignment format text as a string
+ def output_molphy(options = {})
+ len = self.alignment_length
+ header = "#{self.size} #{len}\n"
+ sn = self.sequence_names.collect { |x| x.to_s.gsub(/[\r\n\x00]/, ' ') }
+ if options[:replace_space]
+ sn.collect! { |x| x.gsub(/\s/, '_') }
+ end
+ if !options.has_key?(:escape) or options[:escape]
+ sn.collect! { |x| x.gsub(/[\:\;\,\(\)]/, '_') }
+ end
+ if !options.has_key?(:split) or options[:split]
+ sn.collect! { |x| x.split(/\s/)[0].to_s }
+ end
+ if !options.has_key?(:avoid_same_name) or options[:avoid_same_name]
+ sn = __clustal_avoid_same_name(sn, 30)
+ end
+
+ seqwidth = (options[:width] or 60)
+ seqregexp = Regexp.new("(.{1,#{seqwidth}})")
+ gchar = (options[:gap_char] or '-')
+
+ aseqs = Array.new(len).clear
+ self.each_seq do |s|
+ aseqs << s.to_s.gsub(self.gap_regexp, gchar)
+ end
+ case options[:case].to_s
+ when /lower/i
+ aseqs.each { |s| s.downcase! }
+ when /upper/i
+ aseqs.each { |s| s.upcase! }
+ end
+
+ aseqs.collect! do |s|
+ s << (gchar * (len - s.length))
+ s.gsub!(seqregexp, "\\1\n")
+ sn.shift + "\n" + s
+ end
+ aseqs.unshift(header)
+ aseqs.join('')
+ end
+ end #module Output
+
+ module EnumerableExtension
+ include Output
+
+ # Returns an array of sequence names.
+ # The order of the names must be the same as
+ # the order of <tt>each_seq</tt>.
+ def sequence_names
+ i = 0
+ self.each_seq { |s| i += 1 }
+ (0...i).to_a
+ end
+ end #module EnumerableExtension
# Bio::Alignment::ArrayExtension is a set of useful methods for
***************
*** 1028,1037 ****
each(&block)
end
-
- include ClustalWFormatter
- # Returns a string of Clustal W formatted text of the alignment.
- def to_clustal(options = {})
- clustalw_formatter(self, (0...(self.size)).to_a, options)
- end
end #module ArrayExtension
--- 1206,1209 ----
***************
*** 1060,1065 ****
#
# It works the same as Hash#each_value.
! def each_seq(&block) #:yields: seq
! each_value(&block)
end
--- 1232,1238 ----
#
# It works the same as Hash#each_value.
! def each_seq #:yields: seq
! #each_value(&block)
! each_key { |k| yield self[k] }
end
***************
*** 1123,1135 ****
end
! include ClustalWFormatter
! # Returns a string of Clustal W formatted text of the alignment.
! def to_clustal(options = {})
! seqs = SequenceArray.new
! names = self.keys
! names.each do |k|
! seqs << self[k]
! end
! clustalw_formatter(seqs, names, options)
end
end #module HashExtension
--- 1296,1304 ----
end
! # Returns an array of sequence names.
! # The order of the names must be the same as
! # the order of <tt>each_seq</tt>.
! def sequence_names
! self.keys
end
end #module HashExtension
***************
*** 1783,1787 ****
width = options[:width] unless width
if options[:avoid_same_name] then
! na = avoid_same_name(self.keys, 30)
else
na = self.keys.collect { |k| k.to_s.gsub(/[\r\n\x00]/, ' ') }
--- 1952,1956 ----
width = options[:width] unless width
if options[:avoid_same_name] then
! na = __clustal_avoid_same_name(self.keys, 30)
else
na = self.keys.collect { |k| k.to_s.gsub(/[\r\n\x00]/, ' ') }
***************
*** 1814,1828 ****
#
# The specification of the argument will be changed.
def to_fasta(*arg)
#(original)
self.to_fasta_array(*arg).join('')
end
- include ClustalWFormatter
- # Returns a string of Clustal W formatted text of the alignment.
- def to_clustal(options = {})
- clustalw_formatter(self, self.keys, options)
- end
-
# The method name <tt>consensus</tt> will be obsoleted.
# Please use <tt>consensus_string</tt> instead.
--- 1983,1995 ----
#
# The specification of the argument will be changed.
+ #
+ # Note: <tt>to_fasta</tt> is deprecated.
+ # Please use <tt>output_fasta</tt> instead.
def to_fasta(*arg)
#(original)
+ warn "to_fasta is deprecated. Please use output_fasta."
self.to_fasta_array(*arg).join('')
end
# The method name <tt>consensus</tt> will be obsoleted.
# Please use <tt>consensus_string</tt> instead.
More information about the bioruby-cvs
mailing list