[BioRuby-cvs] bioruby/lib/bio sequence.rb,0.45,0.46

Tue Nov 15 09:49:02 EST 2005

Update of /home/repository/bioruby/bioruby/lib/bio
In directory pub.open-bio.org:/tmp/cvs-serv31691/lib/bio

Modified Files:
	sequence.rb 
Log Message:
* converted to RDoc
* Bio::Sequence::NA#codon_usage method is added

Index: sequence.rb
===================================================================
RCS file: /home/repository/bioruby/bioruby/lib/bio/sequence.rb,v
retrieving revision 0.45
retrieving revision 0.46
diff -C2 -d -r0.45 -r0.46
*** sequence.rb	5 Nov 2005 08:22:52 -0000	0.45
--- sequence.rb	15 Nov 2005 14:49:00 -0000	0.46
***************
*** 1,8 ****
  #
! # bio/sequence.rb - biological sequence class
  #
! #   Copyright (C) 2000-2005 KATAYAMA Toshiaki <k at bioruby.org>
! #   Copyright (C) 2001 Yoshinori K. Okuji <o at bioruby.org>
! #   Copyright (C) 2003 GOTO Naohisa <ng at bioruby.org>
  #
  #  This library is free software; you can redistribute it and/or
--- 1,19 ----
  #
! # = bio/sequence.rb - biological sequence class
  #
! # Copyright::	Copyright (C) 2000-2005
! #		Toshiaki Katayama <k at bioruby.org>,
! #		Yoshinori K. Okuji <okuji at embug.org>,
! #		Naohisa Goto <ng at bioruby.org>
! # License::	LGPL
! #
! # $Id$
! #
! #--
! # *TODO* remove this functionality?
! # You can use Bio::Seq instead of Bio::Sequence for short.
! #++
! #
! #--
  #
  #  This library is free software; you can redistribute it and/or
***************
*** 20,24 ****
  #  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307  USA
  #
! #  $Id$
  #

--- 31,35 ----
  #  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307  USA
  #
! #++
  #

***************
*** 30,359 ****
  module Bio

!   # Nucleic/Amino Acid sequence

!   class Sequence < String

!     def self.auto(str)
!       moltype = guess(str)
!       if moltype == NA
!         NA.new(str)
!       else
!         AA.new(str)
!       end
      end

!     def guess(threshold = 0.9)
!       cmp = self.composition
! 
!       bases = cmp['A'] + cmp['T'] + cmp['G'] + cmp['C'] + 
!               cmp['a'] + cmp['t'] + cmp['g'] + cmp['c']

!       total = self.length - cmp['N'] - cmp['n']

!       if bases.to_f / total > threshold
!         return NA
!       else
!         return AA
!       end
!     end 

!     def self.guess(str, *args)
!       self.new(str).guess(*args)
      end

!     def to_s
!       String.new(self)
!     end
!     alias to_str to_s

!     def seq
!       self.class.new(self)
!     end

!     def normalize!
!       initialize(self)
!       self
!     end
!     alias seq! normalize!

!     def <<(*arg)
!       super(self.class.new(*arg))
!     end
!     alias concat <<

!     def +(*arg)
!       self.class.new(super(*arg))
!     end

!     def subseq(s = 1, e = self.length)
!       return nil if s < 1 or e < 1
!       s -= 1
!       e -= 1
!       self[s..e]
      end

!     def to_fasta(header = '', width = nil)
!       ">#{header}\n" +
!       if width
!         self.to_s.gsub(Regexp.new(".{1,#{width}}"), "\\0\n")
!       else
!         self.to_s + "\n"
        end
      end

!     def window_search(window_size, step_size = 1)
!       i = 0
!       0.step(self.length - window_size, step_size) do |i| 
!         yield self[i, window_size]                        
!       end                          
!       return self[i + window_size .. -1] 
      end

!     def total(hash)
!       hash.default = 0.0 unless hash.default
!       sum = 0.0
!       self.each_byte do |x|
!         begin
!           sum += hash[x.chr]
!         end
!       end
!       return sum
      end

!     def composition
!       count = Hash.new(0)
!       self.scan(/./) do |x|
!         count[x] += 1
        end
!       return count
!     end

!     def randomize(hash = nil)
!       length = self.length
!       if hash
!         count = hash.clone
!         count.each_value {|x| length += x}
        else
!         count = self.composition
        end

!       seq = ''
!       tmp = {}
!       length.times do 
!         count.each do |k, v|
!           tmp[k] = v * rand
!         end
!         max = tmp.max {|a, b| a[1] <=> b[1]}
!         count[max.first] -= 1

!         if block_given?
!           yield max.first
!         else
!           seq += max.first
          end
        end
-       return self.class.new(seq)
      end

!     def self.randomize(*arg, &block)
!       self.new('').randomize(*arg, &block)
      end

      # This method depends on Locations class, see bio/location.rb
      def splicing(position)
!       unless position.is_a?(Locations) then
!         position = Locations.new(position)
!       end
!       s = ''
!       position.each do |location|
!         if location.sequence
!           s << location.sequence
!         else
!           exon = self.subseq(location.from, location.to)
!           begin
!             exon.complement! if location.strand < 0
!           rescue NameError
!           end
!           s << exon
!         end
        end
!       return self.class.new(s)
      end

!     # Nucleic Acid sequence
! 
!     class NA < Sequence
! 
!       def initialize(str)
!         super
!         self.downcase!
!         self.tr!(" \t\n\r",'')
!       end
! 
!       # This method depends on Locations class, see bio/location.rb
!       def splicing(position)
!         mRNA = super
!         if mRNA.rna?
!           mRNA.tr!('t', 'u')
!         else
!           mRNA.tr!('u', 't')
!         end
!         mRNA
!       end
! 
!       def complement
!         s = self.class.new(self)
!         s.complement!
!         s
        end

!       def complement!
!         if self.rna?
!           self.reverse!
!           self.tr!('augcrymkdhvbswn', 'uacgyrkmhdbvswn')
!         else
!           self.reverse!
!           self.tr!('atgcrymkdhvbswn', 'tacgyrkmhdbvswn')
!         end
!         self
        end
! 
!       def translate(frame = 1, table = 1, unknown = 'X')
!         if table.is_a?(Bio::CodonTable)
!           ct = table
!         else
!           ct = Bio::CodonTable[table]
!         end
!         naseq = self.dna
!         case frame
!         when 1, 2, 3
!           frame -= 1
!         when 4, 5, 6
!           frame -= 4
!           naseq.complement!
!         when -1, -2, -3
!           frame = -1 - frame
!           naseq.complement!
!         else
!           frame = 0
!         end
!         nalen = naseq.length - (naseq.length - frame) % 3
!         aaseq = naseq[frame, nalen].gsub(/.{3}/) {|codon| ct[codon] or unknown}
!         return Bio::Sequence::AA.new(aaseq)
        end

!       def gc_percent
!         count = self.composition
!         at = count['a'] + count['t'] + count['u']
!         gc = count['g'] + count['c']
!         gc = 100 * gc / (at + gc)
!         return gc
        end

!       def illegal_bases
!         self.scan(/[^atgcu]/).sort.uniq
!       end

!       # NucleicAcid is defined in bio/data/na.rb
!       def molecular_weight
!         if self.rna?
!           NucleicAcid.weight(self, true)
!         else
!           NucleicAcid.weight(self)
!         end
!       end

!       def to_re
!         if self.rna?
!           NucleicAcid.to_re(self.dna, true)
!         else
!           NucleicAcid.to_re(self)
!         end
        end

!       def names
!         array = []
!         self.each_byte do |x|
!           array.push(NucleicAcid.names[x.chr.upcase])
!         end
!         return array
        end

!       def dna
!         self.tr('u', 't')
        end

!       def dna!
!         self.tr!('u', 't')
!       end

!       def rna
!         self.tr('t', 'u')
!       end

!       def rna!
!         self.tr!('t', 'u')
!       end

!       def rna?
!         self.index('u')
!       end
!       protected :rna?

!       def pikachu
!         self.dna.tr("atgc", "pika")	# joke, of course :-)
!       end

      end

-     # Amino Acid sequence

!     class AA < Sequence

!       def initialize(str)
!         super
!         self.upcase!
!         self.tr!(" \t\n\r",'')
!       end

!       # AminoAcid is defined in bio/data/aa.rb
!       def molecular_weight
!         AminoAcid.weight(self)
!       end

!       def to_re
!         AminoAcid.to_re(self)
!       end

!       def codes
!         array = []
!         self.each_byte do |x|
!           array.push(AminoAcid.names[x.chr])
!         end
!         return array
!       end

!       def names
!         self.codes.map do |x|
!           AminoAcid.names[x]
!         end
        end

      end

    end

-   class Seq < Sequence
-     attr_accessor :entry_id, :definition, :features, :references, :comments,
-       :date, :keywords, :dblinks, :taxonomy, :moltype
-   end

  end

  if __FILE__ == $0

--- 41,444 ----
  module Bio

! # Nucleic/Amino Acid sequence

! class Sequence < String

!   def self.auto(str)
!     moltype = guess(str)
!     if moltype == NA
!       NA.new(str)
!     else
!       AA.new(str)
      end
+   end

!   def guess(threshold = 0.9)
!     cmp = self.composition

!     bases = cmp['A'] + cmp['T'] + cmp['G'] + cmp['C'] + 
!             cmp['a'] + cmp['t'] + cmp['g'] + cmp['c']

!     total = self.length - cmp['N'] - cmp['n']

!     if bases.to_f / total > threshold
!       return NA
!     else
!       return AA
      end
+   end 

!   def self.guess(str, *args)
!     self.new(str).guess(*args)
!   end

!   def to_s
!     String.new(self)
!   end
!   alias to_str to_s

!   # Force self to re-initialize for clean up (remove white spaces,
!   # case unification).
!   def seq
!     self.class.new(self)
!   end

!   # Similar to the 'seq' method, but changes the self object destructively.
!   def normalize!
!     initialize(self)
!     self
!   end
!   alias seq! normalize!

!   def <<(*arg)
!     super(self.class.new(*arg))
!   end
!   alias concat <<

+   def +(*arg)
+     self.class.new(super(*arg))
+   end

!   # Returns the subsequence of the self string.
!   def subseq(s = 1, e = self.length)
!     return nil if s < 1 or e < 1
!     s -= 1
!     e -= 1
!     self[s..e]
!   end
! 
!   # Output the FASTA format string of the sequence.  The 1st argument is
!   # used as the comment string.  If the 2nd option is given, the output
!   # sequence will be folded.
!   def to_fasta(header = '', width = nil)
!     ">#{header}\n" +
!     if width
!       self.to_s.gsub(Regexp.new(".{1,#{width}}"), "\\0\n")
!     else
!       self.to_s + "\n"
      end
+   end

!   # This method iterates on sub string with specified length 'window_size'.
!   # By specifing 'step_size', codon sized shifting or spliting genome
!   # sequence with ovelapping each end can easily be yielded.
!   #
!   # The remainder sequence at the terminal end will be returned.
!   #
!   # Example:
!   #   # prints average GC% on each 100bp
!   #   seq.window_search(100) do |subseq|
!   #     puts subseq.gc
!   #   end
!   #   # prints every translated peptide (length 5aa) in the same frame
!   #   seq.window_search(15, 3) do |subseq|
!   #     puts subseq.translate
!   #   end
!   #   # split genome sequence by 10000bp with 1000bp overlap in fasta format
!   #   i = 1
!   #   remainder = seq.window_search(10000, 9000) do |subseq|
!   #     puts subseq.to_fasta("segment #{i}", 60)
!   #     i += 1
!   #   end
!   #   puts remainder.to_fasta("segment #{i}", 60)
!   #
!   def window_search(window_size, step_size = 1)
!     i = 0
!     0.step(self.length - window_size, step_size) do |i| 
!       yield self[i, window_size]                        
!     end                          
!     return self[i + window_size .. -1] 
!   end
! 
!   # This method receive a hash of residues/bases to the particular values,
!   # and sum up the value along with the self sequence.  Especially useful
!   # to use with the window_search method and amino acid indices etc.
!   def total(hash)
!     hash.default = 0.0 unless hash.default
!     sum = 0.0
!     self.each_byte do |x|
!       begin
!         sum += hash[x.chr]
        end
      end
+     return sum
+   end

!   # Returns a hash of the occurrence counts for each residue or base.
!   def composition
!     count = Hash.new(0)
!     self.scan(/./) do |x|
!       count[x] += 1
      end
+     return count
+   end

!   # Returns a randomized sequence keeping its composition by default.
!   # The argument is required when generating a random sequence from the empty
!   # sequence (used by the class methods NA.randomize, AA.randomize).
!   # If the block is given, yields for each random residue/base.
!   def randomize(hash = nil)
!     length = self.length
!     if hash
!       count = hash.clone
!       count.each_value {|x| length += x}
!     else
!       count = self.composition
      end

!     seq = ''
!     tmp = {}
!     length.times do 
!       count.each do |k, v|
!         tmp[k] = v * rand
        end
!       max = tmp.max {|a, b| a[1] <=> b[1]}
!       count[max.first] -= 1

!       if block_given?
!         yield max.first
        else
!         seq += max.first
        end
+     end
+     return self.class.new(seq)
+   end

!   # Generate a new random sequence with the given frequency of bases
!   # or residues.  The sequence length is determined by the sum of each
!   # base/residue occurences.
!   def self.randomize(*arg, &block)
!     self.new('').randomize(*arg, &block)
!   end

!   # Receive a GenBank style position string and convert it to the Locations
!   # objects to splice the sequence itself.  See also: bio/location.rb
!   #
!   # This method depends on Locations class, see bio/location.rb
!   def splicing(position)
!     unless position.is_a?(Locations) then
!       position = Locations.new(position)
!     end
!     s = ''
!     position.each do |location|
!       if location.sequence
!         s << location.sequence
!       else
!         exon = self.subseq(location.from, location.to)
!         begin
!           exon.complement! if location.strand < 0
!         rescue NameError
          end
+         s << exon
        end
      end
+     return self.class.new(s)
+   end

! 
!   # Nucleic Acid sequence
! 
!   class NA < Sequence
! 
!     # Generate a nucleic acid sequence object from a string.
!     def initialize(str)
!       super
!       self.downcase!
!       self.tr!(" \t\n\r",'')
      end

      # This method depends on Locations class, see bio/location.rb
      def splicing(position)
!       mRNA = super
!       if mRNA.rna?
!         mRNA.tr!('t', 'u')
!       else
!         mRNA.tr!('u', 't')
        end
!       mRNA
      end

+     # Returns a reverse complement sequence (including the universal codes).
+     def complement
+       s = self.class.new(self)
+       s.complement!
+       s
+     end

!     def complement!
!       if self.rna?
!         self.reverse!
!         self.tr!('augcrymkdhvbswn', 'uacgyrkmhdbvswn')
!       else
!         self.reverse!
!         self.tr!('atgcrymkdhvbswn', 'tacgyrkmhdbvswn')
        end
+       self
+     end

!     # Translate into the amino acid sequence from the given frame and the
!     # selected codon table.  The table also can be a Bio::CodonTable object.
!     # The 'unknown' character is used for invalid/unknown codon (can be
!     # used for 'nnn' and/or gap translation in practice).
!     #
!     # Frame can be 1, 2 or 3 for the forward strand and -1, -2 or -3
!     # (4, 5 or 6 is also accepted) for the reverse strand.
!     def translate(frame = 1, table = 1, unknown = 'X')
!       if table.is_a?(Bio::CodonTable)
!         ct = table
!       else
!         ct = Bio::CodonTable[table]
        end
!       naseq = self.dna
!       case frame
!       when 1, 2, 3
!         frame -= 1
!       when 4, 5, 6
!         frame -= 4
!         naseq.complement!
!       when -1, -2, -3
!         frame = -1 - frame
!         naseq.complement!
!       else
!         frame = 0
        end
+       nalen = naseq.length - (naseq.length - frame) % 3
+       aaseq = naseq[frame, nalen].gsub(/.{3}/) {|codon| ct[codon] or unknown}
+       return Bio::Sequence::AA.new(aaseq)
+     end

!     # Returns counts of the each codon in the sequence by Hash.
!     def codon_usage
!       hash = Hash.new(0)
!       self.window_search(3, 3) do |codon|
!         hash[codon] += 1
        end
+       return hash
+     end

!     # Calculate the ratio of GC / ATGC bases in percent.
!     def gc_percent
!       count = self.composition
!       at = count['a'] + count['t'] + count['u']
!       gc = count['g'] + count['c']
!       gc = 100 * gc / (at + gc)
!       return gc
!     end

!     # Show abnormal bases other than 'atgcu'.
!     def illegal_bases
!       self.scan(/[^atgcu]/).sort.uniq
!     end

!     # Estimate the weight of this biological string molecule.
!     # NucleicAcid is defined in bio/data/na.rb
!     def molecular_weight
!       if self.rna?
!         NucleicAcid.weight(self, true)
!       else
!         NucleicAcid.weight(self)
        end
+     end

!     # Convert the universal code string into the regular expression.
!     def to_re
!       if self.rna?
!         NucleicAcid.to_re(self.dna, true)
!       else
!         NucleicAcid.to_re(self)
        end
+     end

!     # Convert the self string into the list of the names of the each base.
!     def names
!       array = []
!       self.each_byte do |x|
!         array.push(NucleicAcid.names[x.chr.upcase])
        end
+       return array
+     end

!     # Output a DNA string by substituting 'u' to 't'.
!     def dna
!       self.tr('u', 't')
!     end

!     def dna!
!       self.tr!('u', 't')
!     end

!     # Output a RNA string by substituting 't' to 'u'.
!     def rna
!       self.tr('t', 'u')
!     end

!     def rna!
!       self.tr!('t', 'u')
!     end

!     def rna?
!       self.index('u')
!     end
!     protected :rna?

+     def pikachu
+       self.dna.tr("atgc", "pika")	# joke, of course :-)
      end

+   end

!   # Amino Acid sequence

!   class AA < Sequence

!     # Generate a amino acid sequence object from a string.
!     def initialize(str)
!       super
!       self.upcase!
!       self.tr!(" \t\n\r",'')
!     end

!     # Estimate the weight of this protein.
!     # AminoAcid is defined in bio/data/aa.rb
!     def molecular_weight
!       AminoAcid.weight(self)
!     end

!     def to_re
!       AminoAcid.to_re(self)
!     end

!     # Generate the list of the names of the each residue along with the
!     # sequence (3 letters code).
!     def codes
!       array = []
!       self.each_byte do |x|
!         array.push(AminoAcid.names[x.chr])
        end
+       return array
+     end

+     # Similar to codes but returns long names.
+     def names
+       self.codes.map do |x|
+         AminoAcid.names[x]
+       end
      end

    end

+ end # Sequence

+ class Seq < Sequence
+   attr_accessor :entry_id, :definition, :features, :references, :comments,
+     :date, :keywords, :dblinks, :taxonomy, :moltype
  end

+ end # Bio
+ 
+ 
  if __FILE__ == $0

***************
*** 488,654 ****
  end

- 
- =begin
- 
- = Bio::Sequence
- 
- You can use Bio::Seq instead of Bio::Sequence for short.
- 
- --- Bio::Sequence#seq
- 
-       Force self to re-initialize for clean up (remove white spaces,
-       case unification).
- 
- --- Bio::Sequence#seq!
- --- Bio::Sequence#normalize!
- 
-       Similar to the 'seq' method, but changes the self object destructively.
- 
- --- Bio::Sequence#subseq(start = 1, end = self.length)
- 
-       Returns the subsequence of the self string.
- 
- --- Bio::Sequence#to_fasta(header = '', width = nil)
- 
-       Output the FASTA format string of the sequence.  The 1st argument is
-       used as the comment string.  If the 2nd option is given, the output
-       sequence will be folded.
- 
- --- Bio::Sequence#fasta(factory, header = '')
- 
-       Execute fasta by the factory (Bio::Fasta object) and returns
-       Bio::Fasta::Report object.  See Bio::Fasta for more details.
- 
- --- Bio::Sequence#blast(factory, header = '')
- 
-       Execute blast by the factory (Bio::Blast object) and returns
-       Bio::Blast::Report object.  See Bio::Blast for more details.
- 
- --- Bio::Sequence#splicing(position)
- 
-       Receive a GenBank style position string and convert it to the Locations
-       objects to splice the sequence itself.  See also: bio/location.rb
- 
- --- Bio::Sequence#window_search(window_size, step_size = 1)
- 
-       This method iterates on sub string with specified length 'window_size'.
-       By specifing 'step_size', codon sized shifting or spliting genome
-       sequence with ovelapping each end can easily be yielded.
- 
-       The remainder sequence at the terminal end will be returned.
- 
-       Example:
-         # prints average GC% on each 100bp
-         seq.window_search(100) do |subseq|
-           puts subseq.gc
-         end
-         # prints every translated peptide (length 5aa) in the same frame
-         seq.window_search(15, 3) do |subseq|
-           puts subseq.translate
-         end
-         # split genome sequence by 10000bp with 1000bp overlap in fasta format
-         i = 1
-         remainder = seq.window_search(10000, 9000) do |subseq|
-           puts subseq.to_fasta("segment #{i}", 60)
-           i += 1
-         end
-         puts remainder.to_fasta("segment #{i}", 60)
- 
- --- Bio::Sequence#total(hash)
- 
-       This method receive a hash of residues/bases to the particular values,
-       and sum up the value along with the self sequence.  Especially useful
-       to use with the window_search method and amino acid indices etc.
- 
- --- Bio::Sequence#composition
- 
-       Returns a hash of the occurrence counts for each residue or base.
- 
- --- Bio::Sequence#randomize(count = nil)
- 
-       Returns a randomized sequence keeping its composition by default.
-       The argument is required when generating a random sequence from the empty
-       sequence (used by the class methods NA.randomize, AA.randomize).
-       If the block is given, yields for each random residue/base.
- 
- --- Bio::Sequence.randomize(composition)
- 
-       Generate a new random sequence with the given frequency of bases
-       or residues.  The sequence length is determined by the sum of each
-       base/residue occurences.
- 
- 
- == Bio::Sequence::NA
- 
- --- Bio::Sequence::NA.new(str)
- 
-       Generate a nucleic acid sequence object from a string.
- 
- --- Bio::Sequence::NA#complement
- --- Bio::Sequence::NA#complement!
- 
-       Returns a reverse complement sequence (including the universal codes).
- 
- --- Bio::Sequence::NA#translate(frame = 1, table = 1, unknown = 'X')
- 
-       Translate into the amino acid sequence from the given frame and the
-       selected codon table.  The table also can be a Bio::CodonTable object.
-       The 'unknown' character is used for invalid/unknown codon (can be
-       used for 'nnn' and/or gap translation in practice).
- 
-       Frame can be 1, 2 or 3 for the forward strand and -1, -2 or -3
-       (4, 5 or 6 is also accepted) for the reverse strand.
- 
- --- Bio::Sequence::NA#gc_percent
- --- Bio::Sequence::NA#gc
- 
-       Calculate the ratio of GC / ATGC bases in percent.
- 
- --- Bio::Sequence::NA#illegal_bases
- 
-       Show abnormal bases other than 'atgcu'.
- 
- --- Bio::Sequence::NA#molecular_weight
- 
-       Estimate the weight of this biological string molecule.
- 
- --- Bio::Sequence::NA#to_re
- 
-       Convert the universal code string into the regular expression.
- 
- --- Bio::Sequence::NA#names
- 
-       Convert the self string into the list of the names of the each base.
- 
- --- Bio::Sequence::NA#dna
- --- Bio::Sequence::NA#dna!
- 
-       Output a DNA string by substituting 'u' to 't'.
- 
- --- Bio::Sequence::NA#rna
- --- Bio::Sequence::NA#rna!
- 
-       Output a RNA string by substituting 't' to 'u'.
- 
- 
- == Bio::Sequence::AA
- 
- --- Bio::Sequence::AA.new(str)
- 
-       Generate a amino acid sequence object from a string.
- 
- --- Bio::Sequence::AA#codes
- 
-       Generate the list of the names of the each residue along with the
-       sequence (3 letters code).
- 
- --- Bio::Sequence::NA#names
- 
-       Similar to codes but returns long names.
- 
- --- Bio::Sequence::AA#molecular_weight
- 
-       Estimate the weight of this protein.
- 
- =end

--- 573,575 ----