[BioRuby-cvs] bioruby/lib/bio/util/restriction_enzyme analysis_basic.rb, NONE, 1.1 analysis.rb, 1.7, 1.8 double_stranded.rb, 1.3, 1.4

Mon Jan 1 23:47:30 UTC 2007

Update of /home/repository/bioruby/bioruby/lib/bio/util/restriction_enzyme
In directory dev.open-bio.org:/tmp/cvs-serv2413/restriction_enzyme

Modified Files:
	analysis.rb double_stranded.rb 
Added Files:
	analysis_basic.rb 
Log Message:


--- NEW FILE: analysis_basic.rb ---
#
# bio/util/restrction_enzyme/analysis_basic.rb - Does the work of fragmenting the DNA from the enzymes
#
# Author::    Trevor Wennblom  <mailto:trevor at corevx.com>
# Copyright:: Copyright (c) 2005-2007 Midwinter Laboratories, LLC (http://midwinterlabs.com)
# License::   Distributes under the same terms as Ruby
#
#  $Id: analysis_basic.rb,v 1.1 2007/01/01 23:47:27 trevor Exp $
#

#--
#if RUBY_VERSION[0..2] == '1.9' or RUBY_VERSION == '2.0'
#  err = "This class makes use of 'include' on ranges quite a bit.  Possibly unstable in development Ruby.  2005/12/20."
#  err += "http://blade.nagaokaut.ac.jp/cgi-bin/vframe.rb/ruby/ruby-talk/167182?167051-169742"
#  raise err
#end
#++

require 'pathname'
libpath = Pathname.new(File.join(File.dirname(__FILE__), ['..'] * 4, 'lib')).cleanpath.to_s
$:.unshift(libpath) unless $:.include?(libpath)

require 'bio'
class Bio::Sequence::NA
  # Example:
  #
  #   seq = Bio::Sequence::NA.new('gaattc')
  #   cuts = seq.cut_with_enzyme('EcoRI')
  #
  # _or_
  #
  #   seq = Bio::Sequence::NA.new('gaattc')
  #   cuts = seq.cut_with_enzyme('g^aattc')
  # ---
  # See Bio::RestrictionEnzyme::Analysis.cut
  def cut_with_enzyme(*args)
    Bio::RestrictionEnzyme::Analysis.cut(self, *args)
  end
  alias cut_with_enzymes cut_with_enzyme
end

require 'pp'

require 'bio/util/restriction_enzyme'
require 'bio/util/restriction_enzyme/analysis/sequence_range'

class Bio::RestrictionEnzyme

#
# bio/util/restrction_enzyme/analysis_basic.rb - Does the work of fragmenting the DNA from the enzymes
#
# Author::    Trevor Wennblom  <mailto:trevor at corevx.com>
# Copyright:: Copyright (c) 2005-2007 Midwinter Laboratories, LLC (http://midwinterlabs.com)
# License::   Distributes under the same terms as Ruby
#
class Analysis

  def self.cut( sequence, *args )
#    self.new.cut( sequence, *args )
  end

  def self.cut_without_permutations( sequence, *args )
    self.new.cut_without_permutations( sequence, *args )
  end

  # Example:
  #
  #   Analysis.cut_without_permutations('gaattc', 'EcoRI')
  #
  # _same as:_
  #
  #   Analysis.cut_without_permutations('gaattc', 'g^aattc')
  # ---
  # *Arguments*
  # * +sequence+: +String+ kind of object that will be used as a nucleic acid sequence
  # * +args+: Series of 
  # *Returns*:: +Hash+ ?(array?) of Bio::RestrictionEnzyme::Analysis::UniqueFragment objects
  def cut_without_permutations( sequence, *args )
    return {} if !sequence.kind_of?(String) or sequence.empty?
    sequence = Bio::Sequence::NA.new( sequence )

    tmp = create_enzyme_actions( sequence, *args )
    enzyme_actions = tmp[0].merge(tmp[1])

    sr_with_cuts = SequenceRange.new( 0, 0, sequence.size-1, sequence.size-1 )
    enzyme_actions.each do |id, enzyme_action|
      enzyme_action.cut_ranges.each do |cut_range|
        sr_with_cuts.add_cut_range(cut_range)
      end
    end

    sr_with_cuts.fragments.primary = sequence
    sr_with_cuts.fragments.complement = sequence.forward_complement
    unique_fragments_for_display( {0 => sr_with_cuts} )
  end

  #########
  protected
  #########
  
  UniqueFragment = Struct.new(:primary, :complement)
  class UniqueFragments < Array
    def primary
      tmp = []
      self.each { |uf| tmp << uf.primary }
      tmp.sort.map { |e| e.tr(' ', '') }
    end
    def complement
      tmp = []
      self.each { |uf| tmp << uf.complement }
      tmp.sort.map { |e| e.tr(' ', '') }
    end
  end

  def unique_fragments_for_display( hash_of_sequence_ranges_with_cuts )
    uf_ary = UniqueFragments.new
    return uf_ary if hash_of_sequence_ranges_with_cuts == nil or hash_of_sequence_ranges_with_cuts.empty?

    hash_of_sequence_ranges_with_cuts.each do |permutation, sr_with_cuts|
      sr_with_cuts.fragments.for_display.each do |fragment|
        uf = UniqueFragment.new
        uf.primary = fragment.primary
        uf.complement = fragment.complement

        duplicate = false
        uf_ary.each do |element|
          if (uf.primary == element.primary) and (uf.complement == element.complement)
            duplicate = true
            break
          end
        end

        uf_ary << uf unless duplicate
      end
    end
    uf_ary
  end

  # Creates an array of EnzymeActions based on the DNA sequence and supplied enzymes.
  #
  # +sequence+:: The string of DNA to match the enzyme recognition sites against
  # +args+:: The enzymes to use.
  def create_enzyme_actions( sequence, *args )
    id = 0
    enzyme_actions_that_sometimes_cut = {}
    enzyme_actions_that_always_cut = {}
    indicies_of_sometimes_cut = []

    args.each do |enzyme|
      enzyme = Bio::RestrictionEnzyme.new(enzyme) unless enzyme.class == Bio::RestrictionEnzyme::DoubleStranded
      find_match_locations( sequence, enzyme.primary.to_re ).each do |offset|
        #enzyme_actions_that_always_cut[id] = enzyme_to_enzyme_action( enzyme, offset )
        enzyme_actions_that_always_cut[id] = enzyme.create_action_at( offset )
        id += 1
      end
    end

    # enzyme_actions_that_always_cut may lose members, the members to be lost are recorded in indicies_of_sometimes_cut

    max = enzyme_actions_that_always_cut.size - 1
    0.upto(max) do |i|
      enzyme_action = enzyme_actions_that_always_cut[i]
      conflict = false
      other_cut_ranges = {}
      enzyme_actions_that_always_cut.each { |key,i_ea| next if i == key; other_cut_ranges[key] = i_ea.cut_ranges }

      other_cut_ranges.each do |key, cut_ranges|
        cut_ranges.each do |cut_range|
          next unless cut_range.class == VerticalCutRange  # we aren't concerned with horizontal cuts
          previous_cut_left = cut_range.range.first 
          previous_cut_right = cut_range.range.last

          if (enzyme_action.right <= previous_cut_left) or
             (enzyme_action.left > previous_cut_right) or
             (enzyme_action.left > previous_cut_left and enzyme_action.right <= previous_cut_right) # in between cuts
            # no conflict
          else
            conflict = true
          end

          indicies_of_sometimes_cut += [i, key] if conflict == true
        end
      end
    end

    indicies_of_sometimes_cut.uniq.each do |i|
      enzyme_actions_that_sometimes_cut[i] = enzyme_actions_that_always_cut[i]
      enzyme_actions_that_always_cut.delete(i)
    end

    [enzyme_actions_that_sometimes_cut, enzyme_actions_that_always_cut]
  end

  # Returns an +Array+ of the match indicies of a RegExp to a string.
  #
  # ---
  # *Arguments*
  # * +string+: The string to scan
  # * +re+: A RegExp to use
  # *Returns*:: +Array+ with indicies of match locations
  def find_match_locations( string, re )
    md = string.match( re )
    locations = []
    counter = 0
    while md
      # save the match index relative to the original string
      locations << (counter += md.begin(0))
      # find the next match
      md = string[ (counter += 1)..-1 ].match( re )
    end
    locations
  end
  
end # Analysis
end # Bio::RestrictionEnzyme

Index: analysis.rb
===================================================================
RCS file: /home/repository/bioruby/bioruby/lib/bio/util/restriction_enzyme/analysis.rb,v
retrieving revision 1.7
retrieving revision 1.8
diff -C2 -d -r1.7 -r1.8
*** analysis.rb	1 Jan 2007 02:16:05 -0000	1.7
--- analysis.rb	1 Jan 2007 23:47:27 -0000	1.8
***************
*** 21,37 ****
  $:.unshift(libpath) unless $:.include?(libpath)
  
! require 'bio'
! class Bio::Sequence::NA
!   # See Bio::RestrictionEnzyme::Analysis.cut
!   def cut_with_enzyme(*args)
!     Bio::RestrictionEnzyme::Analysis.cut(self, *args)
!   end
!   alias cut_with_enzymes cut_with_enzyme
! end
! 
! require 'pp'
! 
! require 'bio/util/restriction_enzyme'
! require 'bio/util/restriction_enzyme/analysis/sequence_range'
  
  class Bio::RestrictionEnzyme
--- 21,25 ----
  $:.unshift(libpath) unless $:.include?(libpath)
  
! require 'bio/util/restriction_enzyme/analysis_basic'
  
  class Bio::RestrictionEnzyme
***************
*** 50,84 ****
    end
  
!   def self.cut_without_permutations( sequence, *args )
!     self.new.cut_without_permutations( sequence, *args )
!   end
! 
!   def self.cut_and_return_by_permutations( sequence, *args )
!     self.new.cut_and_return_by_permutations( sequence, *args )
!   end
! 
!   def cut_without_permutations( sequence, *args )
!     return {} if !sequence.kind_of?(String) or sequence.empty?
!     sequence = Bio::Sequence::NA.new( sequence )
! 
!     #enzyme_actions = create_enzyme_actions( sequence, *args )
!     tmp = create_enzyme_actions( sequence, *args )
!     enzyme_actions = tmp[0].merge(tmp[1])
! 
!     sr_with_cuts = SequenceRange.new( 0, 0, sequence.size-1, sequence.size-1 )
!     enzyme_actions.each do |id, enzyme_action|
!       enzyme_action.cut_ranges.each do |cut_range|
!         sr_with_cuts.add_cut_range(cut_range)
!       end
!     end
! 
!     sr_with_cuts.fragments.primary = sequence
!     sr_with_cuts.fragments.complement = sequence.forward_complement
! 
!     tmp = {}
!     tmp[0] = sr_with_cuts
!     unique_fragments_for_display( tmp )
    end
  
    def cut_and_return_by_permutations( sequence, *args )
      return {} if !sequence.kind_of?(String) or sequence.empty?
--- 38,51 ----
    end
  
!   def cut( sequence, *args )
!     return nil if !sequence.kind_of?(String) or sequence.empty?
!     hash_of_sequence_ranges_with_cuts = cut_and_return_by_permutations( sequence, *args )
!     unique_fragments_for_display( hash_of_sequence_ranges_with_cuts )
    end
  
+   #########
+   protected
+   #########
+   
    def cut_and_return_by_permutations( sequence, *args )
      return {} if !sequence.kind_of?(String) or sequence.empty?
***************
*** 123,134 ****
            previous_cut_right = cut_range.range.last
  
- =begin
- puts "--- #{permutation.inspect} ---"
- puts "Previous cut left: #{previous_cut_left}"
- puts "EA.left #{enzyme_action.left}"
- puts "Previous cut right: #{previous_cut_right}"
- puts "EA.right: #{enzyme_action.right}"
- =end
- 
            # Keep in mind: 
            # * The cut location is to the immediate right of the base located at the index.
--- 90,93 ----
***************
*** 140,148 ****
               (enzyme_action.left > previous_cut_left and enzyme_action.right <= previous_cut_right) # in between cuts
              # no conflict
- #puts "no conflict"
- 
            else
              conflict = true
- #puts "conflict"
            end
          end
--- 99,104 ----
***************
*** 161,178 ****
      end
  
- #pp    hash_of_sequence_ranges_with_cuts
      hash_of_sequence_ranges_with_cuts
    end
  
-   def cut( sequence, *args )
-     return nil if !sequence.kind_of?(String) or sequence.empty?
-     hash_of_sequence_ranges_with_cuts = cut_and_return_by_permutations( sequence, *args )
-     unique_fragments_for_display( hash_of_sequence_ranges_with_cuts )
-   end
- 
-   #########
-   protected
-   #########
- 
    def permute(count, permutations = [[0]])
      return permutations if count <= 1
--- 117,123 ----
***************
*** 190,366 ****
    end
  
-   UniqueFragment = Struct.new(:primary, :complement)
-   class UniqueFragments < Array
-     def primary
-       tmp = []
-       self.each { |uf| tmp << uf.primary }
-       tmp.sort.map { |e| e.tr(' ', '') }
-     end
-     def complement
-       tmp = []
-       self.each { |uf| tmp << uf.complement }
-       tmp.sort.map { |e| e.tr(' ', '') }
-     end
-   end
- 
-   def unique_fragments_for_display( hash_of_sequence_ranges_with_cuts )
-     uf_ary = UniqueFragments.new
-     return uf_ary if hash_of_sequence_ranges_with_cuts == nil or hash_of_sequence_ranges_with_cuts.empty?
- 
-     hash_of_sequence_ranges_with_cuts.each do |permutation, sr_with_cuts|
-       sr_with_cuts.fragments.for_display.each do |fragment|
-         uf = UniqueFragment.new
-         uf.primary = fragment.primary
-         uf.complement = fragment.complement
- 
-         duplicate = false
-         uf_ary.each do |element|
-           if (uf.primary == element.primary) and (uf.complement == element.complement)
-             duplicate = true
-             break
-           end
-         end
- 
-         uf_ary << uf unless duplicate
-       end
-     end
-     uf_ary
-   end
-   
- 
- =begin
-   Strand = Struct.new(:primary, :complement, :p_left, :p_right, :c_left, :c_right)
- 
-   def ts_fragments_to_strands( sequence, fragments )
-     sequence = Bio::Sequence::NA.new( sequence )
-     strands = []
-     fragments.each do |f|
-       p = sequence[f.p_left..f.p_right]
-       c = sequence[f.c_left..f.c_right]
-       strands << Strand.new(p, c, f.p_left, f.p_right, f.c_left, f.c_right)
-     end
-     strands
-   end
- =end
- 
-   # Defines a single enzyme action, in this case being a range that correlates
-   # to the DNA sequence that may contain it's own internal cuts.
-   class EnzymeAction < SequenceRange
-   end
- 
-   # Creates an array of EnzymeActions based on the DNA sequence and supplied enzymes.
-   #
-   # +sequence+:: The string of DNA to match the enzyme recognition sites against
-   # +args+:: The enzymes to use.
-   def create_enzyme_actions( sequence, *args )
-     id = 0
-     enzyme_actions_that_sometimes_cut = {}
-     enzyme_actions_that_always_cut = {}
-     indicies_of_sometimes_cut = []
- 
-     args.each do |enzyme|
-       enzyme = Bio::RestrictionEnzyme.new(enzyme) unless enzyme.class == Bio::RestrictionEnzyme::DoubleStranded
-       find_match_locations( sequence, enzyme.primary.to_re ).each do |offset|
-         enzyme_actions_that_always_cut[id] = enzyme_to_enzyme_action( enzyme, offset )
-         id += 1
-       end
-     end
- 
-     # enzyme_actions_that_always_cut may lose members, the members to be lost are recorded in indicies_of_sometimes_cut
- 
-     max = enzyme_actions_that_always_cut.size - 1
-     0.upto(max) do |i|
-       enzyme_action = enzyme_actions_that_always_cut[i]
-       conflict = false
-       other_cut_ranges = {}
-       #enzyme_actions.each { |key,enzyme_action| next if i == key; puts "i: #{i}, key: #{key}"; previous_cut_ranges += enzyme_action.cut_ranges }
- #      enzyme_actions_that_always_cut.each { |key,i_ea|  next if i == key; puts "i: #{i}, key: #{key}"; other_cut_ranges[key] = i_ea.cut_ranges }
-       enzyme_actions_that_always_cut.each { |key,i_ea| next if i == key; other_cut_ranges[key] = i_ea.cut_ranges }
- #      puts "Enzyme action #{i}:"
- #      pp enzyme_actions[i]
- #      pp enzyme_action
- #      puts "Previous cut ranges:"
- #      pp previous_cut_ranges
- 
-       other_cut_ranges.each do |key, cut_ranges|
-         cut_ranges.each do |cut_range|
-           next unless cut_range.class == VerticalCutRange  # we aren't concerned with horizontal cuts
-           previous_cut_left = cut_range.range.first 
-           previous_cut_right = cut_range.range.last
- 
-           if (enzyme_action.right <= previous_cut_left) or
-              (enzyme_action.left > previous_cut_right) or
-              (enzyme_action.left > previous_cut_left and enzyme_action.right <= previous_cut_right) # in between cuts
-             # no conflict
- #  puts "no conflict"
- 
-           else
-             conflict = true
- #  puts "conflict"
-   #puts "cut range:"
-   #pp cut_range
-   #puts "enzyme action:"
-   #pp enzyme_action
-           end
- 
-           indicies_of_sometimes_cut += [i, key] if conflict == true
-         end
-       end
- 
-       # We don't need to make permutations with this enzyme action if it always cuts
- #      indicies << i if conflict == false
-     end
- #    pp indicies_of_sometimes_cut
- 
-     indicies_of_sometimes_cut.uniq.each do |i|
-       enzyme_actions_that_sometimes_cut[i] = enzyme_actions_that_always_cut[i]
-       enzyme_actions_that_always_cut.delete(i)
-     end
- #puts 'Always cut:'
- #pp enzyme_actions_that_always_cut
- #puts 'Permute:'
- #pp enzyme_actions_that_sometimes_cut
- 
-     [enzyme_actions_that_sometimes_cut, enzyme_actions_that_always_cut]
-   end
- 
-   # Returns the offsets of the match of a RegExp to a string.
-   #
-   # +string+:: The string to scan.
-   # +re+:: A regexp to use.
-   def find_match_locations( string, re )
-     md = string.match( re )
-     locations = []
-     location = -1
-     while md
-       location += md.pre_match.size + 1
-       locations << location
-       # md[0] is the same as $&, or "the match" itself
-       md = (md[0][1..-1] + md.post_match).match( re )
-     end
-     locations
-   end
- 
-   # Takes a RestrictionEnzyme and a numerical offset to the sequence and 
-   # returns an EnzymeAction
-   #
-   # +enzyme+:: RestrictionEnzyme
-   # +offset+:: Numerical offset of where the enzyme action occurs on the seqeunce
-   def enzyme_to_enzyme_action( enzyme, offset )
-     enzyme_action = EnzymeAction.new(offset, offset + enzyme.primary.size-1, offset, offset + enzyme.complement.size-1)
- 
-     enzyme.cut_locations.each do |cut_pair|
-       p = cut_pair[0]     
-       c = cut_pair[1]
-       if c >= p
-         enzyme_action.add_cut_range(offset+p, nil, nil, offset+c)
-       else
-         enzyme_action.add_cut_range(nil, offset+p, offset+c, nil)
-       end
-     end
- 
-     enzyme_action
-   end
- 
  end # Analysis
  end # Bio::RestrictionEnzyme
--- 135,138 ----

Index: double_stranded.rb
===================================================================
RCS file: /home/repository/bioruby/bioruby/lib/bio/util/restriction_enzyme/double_stranded.rb,v
retrieving revision 1.3
retrieving revision 1.4
diff -C2 -d -r1.3 -r1.4
*** double_stranded.rb	1 Jan 2007 05:07:04 -0000	1.3
--- double_stranded.rb	1 Jan 2007 23:47:27 -0000	1.4
***************
*** 14,17 ****
--- 14,19 ----
  require 'bio/db/rebase'
  require 'bio/util/restriction_enzyme'
+ require 'bio/util/restriction_enzyme/analysis/sequence_range'
+ 
  require 'bio/util/restriction_enzyme/cut_symbol'
  require 'bio/util/restriction_enzyme/single_strand'
***************
*** 96,99 ****
--- 98,102 ----
        # Decide if this String is an enzyme name or a pattern
        if Bio::RestrictionEnzyme.enzyme_name?( erp )
+         # FIXME we added this to rebase...
          # Check if it's a known name
          known_enzyme = false
***************
*** 157,160 ****
--- 160,252 ----
      !blunt?
    end
+   
+   # Takes a RestrictionEnzyme object and a numerical offset to the sequence and 
+   # returns an EnzymeAction
+   #
+   # +restriction_enzyme+:: RestrictionEnzyme
+   # +offset+:: Numerical offset of where the enzyme action occurs on the seqeunce
+   def create_action_at( offset )
+   #def enzyme_to_enzyme_action( restriction_enzyme, offset )
+     enzyme_action = EnzymeAction.new( offset, 
+                                       offset + @primary.size-1, 
+                                       offset, 
+                                       offset + @complement.size-1)
+ 
+     @cut_locations.each do |cut_location_pair|
+       # cut_pair is a DoubleStranded::CutLocationPair
+       p, c = cut_location_pair.primary, cut_location_pair.complement
+       if c >= p
+         enzyme_action.add_cut_range(offset+p, nil, nil, offset+c)
+       else
+         enzyme_action.add_cut_range(nil, offset+p, offset+c, nil)
+       end
+     end
+ 
+     enzyme_action
+   end
+   
+   # An EnzymeAction is a way of representing a potential effect that a
+   # RestrictionEnzyme may have on a nucleotide sequence, an 'action'.
+   #
+   # Multiple cuts in multiple locations on a sequence may occur in one
+   # 'action' if it is done by a single enzyme.
+   #
+   # An EnzymeAction is a series of locations that represents where the restriction
+   # enzyme will bind on the sequence, as well as what ranges are cut on the
+   # sequence itself.  The complexity is due to the fact that our virtual
+   # restriction enzyme may create multiple segments from its cutting action, 
+   # on which another restriction enzyme may operate upon.
+   #
+   # For example, the DNA sequence:
+   # 
+   #   5' - G A A T A A A C G A - 3'
+   #   3' - C T T A T T T G C T - 5'
+   #
+   # When mixed with the restriction enzyme with the following cut pattern:
+   #
+   #   5' -   A|A T A A A C|G   - 3'
+   #           +-+         +  
+   #   3' -   T T|A T T T G|C   - 5'
+   #
+   # And also mixed with the restriction enzyme of the following cut pattern:
+   #
+   #   5' -         A A|A C     - 3'
+   #                 +-+  
+   #   3' -         T|T T G     - 5'
+   #
+   # Would result in a DNA sequence with these cuts:
+   # 
+   #   5' - G A|A T A A|A C|G A - 3'
+   #           +-+   +-+   +
+   #   3' - C T T|A T|T T G|C T - 5'
+   #
+   # Or these separate "free-floating" sequences:
+   #
+   #   5' - G A   - 3'
+   #   3' - C T T - 5'
+   #
+   #   5' - A T A A - 3'
+   #   3' -   A T   - 5'
+   #
+   #   5' -   A C - 3'
+   #   3' - T T G - 5'
+   #  
+   #   5' - G A - 3'
+   #   3' - C T - 5'
+   #
+   # This would be represented by two EnzymeActions - one for each
+   # RestrictionEnzyme.
+   #
+   # To initialize an EnzymeAction you must first instantiate it with the
+   # beginning and ending locations of where it will operate on a nucleotide
+   # sequence.
+   #
+   # Next the ranges of cu
+   #
+   # An EnzymeAction is
+   # Defines a single enzyme action, in this case being a range that correlates
+   # to the DNA sequence that may contain it's own internal cuts.
+   class EnzymeAction < Bio::RestrictionEnzyme::Analysis::SequenceRange
+   end
  
    #########