[BioRuby-cvs] bioruby/lib/bio/util/restriction_enzyme README, NONE, 1.1 analysis.rb, NONE, 1.1 cut_symbol.rb, NONE, 1.1 double_stranded.rb, NONE, 1.1 enzymes.yaml, NONE, 1.1 integer.rb, NONE, 1.1 single_strand.rb, NONE, 1.1 single_strand_complement.rb, NONE, 1.1 string_formatting.rb, NONE, 1.1

Trevor Wennblom trevor at pub.open-bio.org
Wed Feb 1 07:27:28 UTC 2006


Update of /home/repository/bioruby/bioruby/lib/bio/util/restriction_enzyme
In directory pub.open-bio.org:/tmp/cvs-serv28844

Added Files:
	README analysis.rb cut_symbol.rb double_stranded.rb 
	enzymes.yaml integer.rb single_strand.rb 
	single_strand_complement.rb string_formatting.rb 
Log Message:
Bio::RestrictionEnzyme


--- NEW FILE: integer.rb ---
#
# bio/util/restrction_enzyme/integer.rb - 
#
# Copyright::  Copyright (C) 2006 Trevor Wennblom <trevor at corevx.com>
# License::    LGPL
#
#  $Id: integer.rb,v 1.1 2006/02/01 07:34:11 trevor Exp $
#
#
#--
#
#  This library is free software; you can redistribute it and/or
#  modify it under the terms of the GNU Lesser General Public
#  License as published by the Free Software Foundation; either
#  version 2 of the License, or (at your option) any later version.
#
#  This library is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
#  Lesser General Public License for more details.
#
#  You should have received a copy of the GNU Lesser General Public
#  License along with this library; if not, write to the Free Software
#  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307  USA
#
#++
#
#

=begin rdoc
bio/util/restrction_enzyme/integer.rb - 
=end
class Integer
  def negative?
    self < 0
  end
end

--- NEW FILE: string_formatting.rb ---
require 'pathname'
libpath = Pathname.new(File.join(File.dirname(__FILE__), ['..'] * 4, 'lib')).cleanpath.to_s
$:.unshift(libpath) unless $:.include?(libpath)

require 'bio/util/restriction_enzyme/cut_symbol'

module Bio; end
class Bio::RestrictionEnzyme

#
# bio/util/restriction_enzyme/string_formatting.rb - Useful functions for string manipulation
#
# Copyright::  Copyright (C) 2006 Trevor Wennblom <trevor at corevx.com>
# License::    LGPL
#
#  $Id: string_formatting.rb,v 1.1 2006/02/01 07:34:11 trevor Exp $
#
#
#--
#
#  This library is free software; you can redistribute it and/or
#  modify it under the terms of the GNU Lesser General Public
#  License as published by the Free Software Foundation; either
#  version 2 of the License, or (at your option) any later version.
#
#  This library is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
#  Lesser General Public License for more details.
#
#  You should have received a copy of the GNU Lesser General Public
#  License along with this library; if not, write to the Free Software
#  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307  USA
#
#++
#
=begin rdoc
bio/util/restriction_enzyme/string_formatting.rb - Useful functions for string manipulation
=end
module StringFormatting
  include CutSymbol
  extend CutSymbol

  # Return the sequence with spacing for alignment.  Does not add whitespace
  # around cut symbols.
  #
  # Example:
  #   pattern = 'n^ng^arraxt^n'
  #   add_spacing( pattern )
  #
  # Returns:
  #   "n^n g^a r r a x t^n"
  #
  def add_spacing( seq, cs = cut_symbol )
    str = ''
    flag = false
    seq.each_byte do |c|
      c = c.chr
      if c == cs
        str += c
        flag = false
      elsif flag
        str += ' ' + c
      else
        str += c
        flag = true
      end
    end
    str
  end

  # Remove extraneous nucleic acid wildcards ('n' padding) from the
  # left and right sides
  def strip_padding( s )
    if s[0].chr == 'n'
      s =~ %r{(n+)(.+)}
      s = $2
    end
    if s[-1].chr == 'n'
      s =~ %r{(.+?)(n+)$}
      s = $1
    end
    s
  end

  # Remove extraneous nucleic acid wildcards ('n' padding) from the
  # left and right sides and remove cut symbols
  def strip_cuts_and_padding( s )
    strip_padding( s.tr(cut_symbol, '') )
  end

  # Return the 'n' padding on the left side of the strand
  def left_padding( s )
    s =~ %r{^n+}
    ret = $&
    ret ? ret : ''  # Don't pass nil values
  end

  # Return the 'n' padding on the right side of the strand
  def right_padding( s )
    s =~ %r{n+$}
    ret = $&
    ret ? ret : ''  # Don't pass nil values
  end


end

end

--- NEW FILE: single_strand.rb ---
require 'pathname'
libpath = Pathname.new(File.join(File.dirname(__FILE__), ['..'] * 4, 'lib')).cleanpath.to_s
$:.unshift(libpath) unless $:.include?(libpath)

require 'bio/util/restriction_enzyme/single_strand/cut_locations_in_enzyme_notation'
require 'bio/util/restriction_enzyme/cut_symbol'
require 'bio/util/restriction_enzyme/string_formatting'
require 'bio/sequence'

class Bio::RestrictionEnzyme

#
# bio/util/restriction_enzyme/single_strand.rb - Single strand of a restriction enzyme sequence
#
# Copyright::  Copyright (C) 2006 Trevor Wennblom <trevor at corevx.com>
# License::    LGPL
#
#  $Id: single_strand.rb,v 1.1 2006/02/01 07:34:11 trevor Exp $
#
#
#--
#
#  This library is free software; you can redistribute it and/or
#  modify it under the terms of the GNU Lesser General Public
#  License as published by the Free Software Foundation; either
#  version 2 of the License, or (at your option) any later version.
#
#  This library is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
#  Lesser General Public License for more details.
#
#  You should have received a copy of the GNU Lesser General Public
#  License along with this library; if not, write to the Free Software
#  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307  USA
#
#++
#

=begin rdoc
bio/util/restriction_enzyme/single_strand.rb - Single strand of a restriction enzyme sequence

A single strand of restriction enzyme sequence pattern with a 5' to 3' 
orientation.
 
DoubleStranded puts the SingleStrand and SingleStrandComplement together to 
create the sequence pattern with cuts on both strands.
=end
class SingleStrand < Bio::Sequence::NA
  include CutSymbol
  include StringFormatting

  # The cut locations in enzyme notation. Contains a 
  # CutLocationsInEnzymeNotation object.
  attr_reader :cut_locations_in_enzyme_notation

  # The cut locations transformed from enzyme index notation to 0-based 
  # array index notation.  Contains an Array
  attr_reader :cut_locations

  # Orientation of the strand, 5' to 3'
  def orientation; [5,3]; end

  # +sequence+:: The enzyme sequence.
  # +c+:: Cut locations in enzyme notation.  See CutLocationsInEnzymeNotation.
  #
  # * +sequence+ is required, +c+ is optional
  # * You cannot provide a sequence with cut symbols and provide cut locations.
  # * If +c+ is omitted, +input_pattern+ must contain a cut symbol.
  # * +sequence+ cannot contain adjacent cut symbols.
  # * +c+ is in enzyme index notation and therefore cannot contain a 0.
  #
  # +sequence+ must be a kind of:
  # * String
  # * Bio::Sequence::NA
  # * Bio::RestrictionEnzyme::SingleStrand
  #
  # +c+ must be a kind of:
  # * Integer, one or more
  # * Array
  # * CutLocationsInEnzymeNotation
  #
  def initialize( sequence, *c )
    c.flatten! # if an array was supplied as an argument
    validate_args(sequence, c)
    sequence.downcase!
    
    if sequence =~ re_cut_symbol
      @cut_locations_in_enzyme_notation = CutLocationsInEnzymeNotation.new( strip_padding(sequence) )
    else
      @cut_locations_in_enzyme_notation = CutLocationsInEnzymeNotation.new( c )
    end

    @stripped = Bio::Sequence::NA.new( strip_cuts_and_padding( sequence ) )
    super( pattern )
    @cut_locations = @cut_locations_in_enzyme_notation.to_array_index
  end

  # Returns true if this enzyme is palindromic with its reverse complement.
  # Does not report if the +cut_locations+ are palindromic or not.
  #
  # Examples:
  # * This would be palindromic: 
  #     5' - ATGCAT - 3'
  #          TACGTA
  #
  # * This would not be palindromic: 
  #     5' - ATGCGTA - 3'
  #          TACGCAT
  #
  def palindromic?
    @stripped.reverse_complement == @stripped
  end

  # Pattern with no cut symbols and no 'n' padding.
  # * <code>SingleStrand.new('garraxt', [-2, 1, 7]).stripped  # "garraxt"</code>
  attr_reader :stripped

  # The sequence with 'n' padding and cut symbols.
  # * <code>SingleStrand.new('garraxt', [-2, 1, 7]).with_cut_symbols  # => "n^ng^arraxt^n"</code>
  def with_cut_symbols
    s = pattern
    @cut_locations_in_enzyme_notation.to_array_index.sort.reverse.each { |c| s.insert(c+1, cut_symbol) }
    s
  end

  # The sequence with 'n' padding on the left and right for cuts larger than the sequence.
  # * <code>SingleStrand.new('garraxt', [-2, 1, 7]).pattern  # => "nngarraxtn"</code>
  def pattern
    return stripped if @cut_locations_in_enzyme_notation.min == nil
    left = (@cut_locations_in_enzyme_notation.min.negative? ? 'n' * @cut_locations_in_enzyme_notation.min.abs : '')

    # Add one more 'n' if a cut is at the last position 
    right = (@cut_locations_in_enzyme_notation.max >= @stripped.length ? 'n' * (@cut_locations_in_enzyme_notation.max - @stripped.length + 1) : '')
    [left, stripped, right].to_s
  end

  # The sequence with 'n' pads, cut symbols, and spacing for alignment.
  # * <code>SingleStrand.new('garraxt', [-2, 1, 7]).with_spaces # => "n^n g^a r r a x t^n"</code>
  def with_spaces
    add_spacing( with_cut_symbols )
  end


  # NOTE: BEING WORKED ON, BUG EXISTS IN Bio::NucleicAcid
=begin  
  to_re - important
  example z = [agc]
  z must match [agcz]
  not just [agc]
=end

  #########
  protected
  #########

  def validate_args( input_pattern, input_cut_locations )
    unless input_pattern.kind_of?(String)
      err = "input_pattern is not a String, Bio::Sequence::NA, or Bio::RestrictionEnzyme::SingleStrand::Sequence object\n"
      err += "pattern: #{input_pattern}\n"
      err += "class: #{input_pattern.class}"
      raise ArgumentError, err
    end

    if ( input_pattern =~ re_cut_symbol ) and !input_cut_locations.empty?
      err = "Cut symbol found in sequence, but cut locations were also supplied.  Ambiguous.\n"
      err += "pattern: #{input_pattern}\n"
      err += "symbol: #{cut_symbol}\n"
      err += "locations: #{input_cut_locations.inspect}"
      raise ArgumentError, err
    end

    input_pattern.each_byte do |c| 
      c = c.chr.downcase
      unless Bio::NucleicAcid::NAMES.has_key?(c) or c == 'x' or c == 'X' or c == cut_symbol
        err = "Invalid character in pattern.\n"
        err += "Not a nucleotide or representation of possible nucleotides.  See Bio::NucleicAcid::NAMES for more information.\n"
        err += "char: #{c}\n"
        err += "input_pattern: #{input_pattern}"
        raise ArgumentError, err
      end
    end
  end

  # Tadayoshi Funaba's method as discussed in Programming Ruby 2ed, p390
  def self.once(*ids)
    for id in ids
      module_eval <<-"end;"
        alias_method :__#{id.to_i}__, :#{id.to_s}
        private :__#{id.to_i}__
        def #{id.to_s}(*args, &block)
          (@__#{id.to_i}__ ||= [__#{id.to_i}__(*args, &block)])[0]
        end
      end;
    end
  end

  once :pattern, :with_cut_symbols, :with_spaces, :to_re

end

end

--- NEW FILE: README ---
todo

--- NEW FILE: enzymes.yaml ---
--- 
TspRI: 
  :c4: "0"
  :c1: "7"
  :len: "5"
  :c2: "-3"
  :c3: "0"
  :pattern: CASTG
  :name: TspRI
  :blunt: "0"
  :ncuts: "2"
MvnI: 
  :c4: "0"
  :c1: "2"
  :len: "4"
  :c2: "2"
  :c3: "0"
  :pattern: CGCG
  :name: MvnI
[...6692 lines suppressed...]
BseMI: 
  :c4: "0"
  :c1: "8"
  :len: "6"
  :c2: "6"
  :c3: "0"
  :pattern: GCAATG
  :name: BseMI
  :blunt: "0"
  :ncuts: "2"
EcoRII: 
  :c4: "0"
  :c1: "-1"
  :len: "5"
  :c2: "5"
  :c3: "0"
  :pattern: CCWGG
  :name: EcoRII
  :blunt: "0"
  :ncuts: "2"

--- NEW FILE: single_strand_complement.rb ---
require 'pathname'
libpath = Pathname.new(File.join(File.dirname(__FILE__), ['..'] * 4, 'lib')).cleanpath.to_s
$:.unshift(libpath) unless $:.include?(libpath)

require 'bio/util/restriction_enzyme/single_strand'

module Bio; end
class Bio::RestrictionEnzyme

#
# bio/util/restriction_enzyme/single_strand_complement.rb - Single strand restriction enzyme sequence in complement orientation
#
# Copyright::  Copyright (C) 2006 Trevor Wennblom <trevor at corevx.com>
# License::    LGPL
#
#  $Id: single_strand_complement.rb,v 1.1 2006/02/01 07:34:11 trevor Exp $
#
#
#--
#
#  This library is free software; you can redistribute it and/or
#  modify it under the terms of the GNU Lesser General Public
#  License as published by the Free Software Foundation; either
#  version 2 of the License, or (at your option) any later version.
#
#  This library is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
#  Lesser General Public License for more details.
#
#  You should have received a copy of the GNU Lesser General Public
#  License along with this library; if not, write to the Free Software
#  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307  USA
#
#++
#
=begin rdoc
bio/util/restriction_enzyme/single_strand_complement.rb - Single strand restriction enzyme sequence in complement orientation

A single strand of restriction enzyme sequence pattern with a 3' to 5' orientation.
=end
class SingleStrandComplement < SingleStrand
  # Orientation of the strand, 3' to 5'
  def orientation; [3, 5]; end
end

end

--- NEW FILE: analysis.rb ---
#if RUBY_VERSION[0..2] == '1.9' or RUBY_VERSION == '2.0'
#  err = "This class makes use of 'include' on ranges quite a bit.  Possibly unstable in development Ruby.  2005/12/20."
#  err += "http://blade.nagaokaut.ac.jp/cgi-bin/vframe.rb/ruby/ruby-talk/167182?167051-169742"
#  raise err
#end

require 'pathname'
libpath = Pathname.new(File.join(File.dirname(__FILE__), ['..'] * 4, 'lib')).cleanpath.to_s
$:.unshift(libpath) unless $:.include?(libpath)

require 'bio'

class Bio::Sequence::NA
  def cut_with_enzyme(*args)
    Bio::RestrictionEnzyme::Analysis.cut(self, *args)
  end
  alias cut_with_enzymes cut_with_enzyme
end

require 'pp'

require 'bio/util/restriction_enzyme'
require 'bio/util/restriction_enzyme/analysis/sequence_range.rb'
require 'bio/util/restriction_enzyme/analysis/permutation.rb'

class Bio::RestrictionEnzyme
#
# bio/util/restrction_enzyme/analysis.rb - Does the work of fragmenting the DNA from the enzymes
#
# Copyright::  Copyright (C) 2006 Trevor Wennblom <trevor at corevx.com>
# License::    LGPL
#
#  $Id: analysis.rb,v 1.1 2006/02/01 07:34:11 trevor Exp $
#
#
#--
#
#  This library is free software; you can redistribute it and/or
#  modify it under the terms of the GNU Lesser General Public
#  License as published by the Free Software Foundation; either
#  version 2 of the License, or (at your option) any later version.
#
#  This library is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
#  Lesser General Public License for more details.
#
#  You should have received a copy of the GNU Lesser General Public
#  License along with this library; if not, write to the Free Software
#  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307  USA
#
#++
#
#

=begin rdoc
bio/util/restrction_enzyme/analysis.rb - Does the work of fragmenting the DNA from the enzymes
=end
class Analysis

  def self.cut( sequence, *args )
    self.new.cut( sequence, *args )
  end

  def self.cut_without_permutations( sequence, *args )
    self.new.cut_without_permutations( sequence, *args )
  end

  def self.cut_and_return_by_permutations( sequence, *args )
    self.new.cut_and_return_by_permutations( sequence, *args )
  end


  def cut_without_permutations( sequence, *args )
    sequence = Bio::Sequence::NA.new( sequence )
    enzyme_actions = create_enzyme_actions( sequence, *args )
    sr_with_cuts = SequenceRange.new( 0, 0, sequence.size-1, sequence.size-1 )
    enzyme_actions.each do |id, enzyme_action|
      enzyme_action.cut_ranges.each do |cut_range|
        sr_with_cuts.add_cut_range(cut_range)
      end
    end

    sr_with_cuts.fragments.primary = sequence
    sr_with_cuts.fragments.complement = sequence.forward_complement

    tmp = {}
    tmp[0] = sr_with_cuts
    unique_fragments_for_display( tmp )
  end

  def cut_and_return_by_permutations( sequence, *args )
    sequence = Bio::Sequence::NA.new( sequence )
    enzyme_actions = create_enzyme_actions( sequence, *args )
    permutations = Permutation.new(enzyme_actions.size).map { |p| p.value }

    # Indexed by permutation.
    hash_of_sequence_ranges_with_cuts = {}

    permutations.each do |permutation|
      previous_cut_ranges = []
      sr_with_cuts = SequenceRange.new( 0, 0, sequence.size-1, sequence.size-1 )

      permutation.each do |id|
        enzyme_action = enzyme_actions[id]

        # conflict is false if the current enzyme action may cut in it's range.
        # conflict is true if it cannot do to a previous enzyme action making
        # a cut where this enzyme action needs a whole recognition site.
        conflict = false

        # If current size of enzyme_action overlaps with previous cut_range, don't cut
        # note that the enzyme action may fall in the middle of a previous enzyme action
        # so all cut locations must be checked that would fall underneath.
        previous_cut_ranges.each do |cut_range|
          next unless cut_range.class == VerticalCutRange  # we aren't concerned with horizontal cuts
          previous_cut_left = cut_range.range.first 
          previous_cut_right = cut_range.range.last

=begin
puts "--- #{permutation.inspect} ---"
puts "Previous cut left: #{previous_cut_left}"
puts "EA.left #{enzyme_action.left}"
puts "Previous cut right: #{previous_cut_right}"
puts "EA.right: #{enzyme_action.right}"
=end

          # Keep in mind: 
          # * The cut location is to the immediate right of the base located at the index.
          #   ex: at^gc -- the cut location is at index 1
          # * The enzyme action location is located at the base of the index.
          #   ex: atgc -- 0 => 'a', 1 => 't', 2 => 'g', 3 => 'c'
          if (enzyme_action.right <= previous_cut_left) or
             (enzyme_action.left > previous_cut_right) or
             (enzyme_action.left > previous_cut_left and enzyme_action.right <= previous_cut_right) # in between cuts
            # no conflict
#puts "no conflict"

          else
            conflict = true
#puts "conflict"
          end
        end

        next if conflict == true
        enzyme_action.cut_ranges.each { |cut_range| sr_with_cuts.add_cut_range(cut_range) }
        previous_cut_ranges += enzyme_action.cut_ranges
      end

      hash_of_sequence_ranges_with_cuts[permutation] = sr_with_cuts
    end

    hash_of_sequence_ranges_with_cuts.each do |permutation, sr_with_cuts|
      sr_with_cuts.fragments.primary = sequence
      sr_with_cuts.fragments.complement = sequence.forward_complement
    end

#pp    hash_of_sequence_ranges_with_cuts
    hash_of_sequence_ranges_with_cuts
  end

  def cut( sequence, *args )
    hash_of_sequence_ranges_with_cuts = cut_and_return_by_permutations( sequence, *args )
    unique_fragments_for_display( hash_of_sequence_ranges_with_cuts )
  end

  #########
  protected
  #########

  UniqueFragment = Struct.new(:primary, :complement)
  class UniqueFragments < Array
    def primary
      tmp = []
      self.each { |uf| tmp << uf.primary }
      tmp.sort.map { |e| e.tr(' ', '') }
    end
    def complement
      tmp = []
      self.each { |uf| tmp << uf.complement }
      tmp.sort.map { |e| e.tr(' ', '') }
    end
  end

  def unique_fragments_for_display( hash_of_sequence_ranges_with_cuts )
    uf_ary = UniqueFragments.new
    hash_of_sequence_ranges_with_cuts.each do |permutation, sr_with_cuts|
      sr_with_cuts.fragments.for_display.each do |fragment|
        uf = UniqueFragment.new
        uf.primary = fragment.primary
        uf.complement = fragment.complement

        duplicate = false
        uf_ary.each do |element|
          if (uf.primary == element.primary) and (uf.complement == element.complement)
            duplicate = true
            break
          end
        end

        uf_ary << uf unless duplicate
      end
    end
    uf_ary
  end
  

=begin
  Strand = Struct.new(:primary, :complement, :p_left, :p_right, :c_left, :c_right)

  def ts_fragments_to_strands( sequence, fragments )
    sequence = Bio::Sequence::NA.new( sequence )
    strands = []
    fragments.each do |f|
      p = sequence[f.p_left..f.p_right]
      c = sequence[f.c_left..f.c_right]
      strands << Strand.new(p, c, f.p_left, f.p_right, f.c_left, f.c_right)
    end
    strands
  end
=end

  # Defines a single enzyme action, in this case being a range that correlates
  # to the DNA sequence that may contain it's own internal cuts.
  class EnzymeAction < SequenceRange
  end

  # Creates an array of EnzymeActions based on the DNA sequence and supplied enzymes.
  #
  # +sequence+:: The string of DNA to match the enzyme recognition sites against
  # +args+:: The enzymes to use.
  def create_enzyme_actions( sequence, *args )
    id = 0
    enzyme_actions = {}

    args.each do |enzyme|
      enzyme = Bio::RestrictionEnzyme.new(enzyme) unless enzyme.class == Bio::RestrictionEnzyme::DoubleStranded
      find_match_locations( sequence, enzyme.primary.to_re ).each do |offset|
        enzyme_actions[id] = enzyme_to_enzyme_action( enzyme, offset )
        id += 1
      end
    end

    enzyme_actions
  end

  # Returns the offsets of the match of a RegExp to a string.
  #
  # +string+:: The string to scan.
  # +re+:: A regexp to use.
  def find_match_locations( string, re )
    md = string.match( re )
    locations = []
    location = -1
    while md
      location += md.pre_match.size + 1
      locations << location
      # md[0] is the same as $&, or "the match" itself
      md = (md[0][1..-1] + md.post_match).match( re )
    end
    locations
  end

  # Takes a RestrictionEnzyme and a numerical offset to the sequence and 
  # returns an EnzymeAction
  #
  # +enzyme+:: RestrictionEnzyme
  # +offset+:: Numerical offset of where the enzyme action occurs on the seqeunce
  def enzyme_to_enzyme_action( enzyme, offset )
    enzyme_action = EnzymeAction.new(offset, offset + enzyme.primary.size-1, offset, offset + enzyme.complement.size-1)

    enzyme.cut_locations.each do |cut_pair|
      p = cut_pair[0]     
      c = cut_pair[1]
      if c >= p
        enzyme_action.add_cut_range(offset+p, nil, nil, offset+c)
      else
        enzyme_action.add_cut_range(nil, offset+p, offset+c, nil)
      end
    end

    enzyme_action
  end

end
end

--- NEW FILE: cut_symbol.rb ---
module Bio; end
class Bio::RestrictionEnzyme

#
# bio/util/restrction_enzyme/cut_symbol.rb - 
#
# Copyright::  Copyright (C) 2006 Trevor Wennblom <trevor at corevx.com>
# License::    LGPL
#
#  $Id: cut_symbol.rb,v 1.1 2006/02/01 07:34:11 trevor Exp $
#
#
#--
#
#  This library is free software; you can redistribute it and/or
#  modify it under the terms of the GNU Lesser General Public
#  License as published by the Free Software Foundation; either
#  version 2 of the License, or (at your option) any later version.
#
#  This library is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
#  Lesser General Public License for more details.
#
#  You should have received a copy of the GNU Lesser General Public
#  License along with this library; if not, write to the Free Software
#  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307  USA
#
#++
#
#

=begin rdoc
bio/util/restrction_enzyme/cut_symbol.rb - 
=end
module CutSymbol

  require 'singleton'

  class CutSymbol__
    include Singleton
    attr_accessor :cut_symbol
    attr_accessor :escaped_cut_symbol
  end

  # NOTE verify this sometime
  def cut_symbol=(c)
    CutSymbol__.instance.cut_symbol = c
  end

  def cut_symbol
    CutSymbol__.instance.cut_symbol ||= '^'
  end

  def escaped_cut_symbol
    CutSymbol__.instance.escaped_cut_symbol ||= "\\#{cut_symbol}"  # \^
  end

  # Used to check if multiple cut symbols are next to each other
  def re_cut_symbol_adjacent
    %r"#{escaped_cut_symbol}{2}"
  end

  # A Regexp of the cut_symbol.  Convenience method.
  def re_cut_symbol
    %r"#{escaped_cut_symbol}"
  end

end
end

--- NEW FILE: double_stranded.rb ---
require 'pathname'
libpath = Pathname.new(File.join(File.dirname(__FILE__), ['..'] * 4, 'lib')).cleanpath.to_s
$:.unshift(libpath) unless $:.include?(libpath)

require 'bio/db/rebase'
require 'bio/util/restriction_enzyme'
require 'bio/util/restriction_enzyme/cut_symbol'
require 'bio/util/restriction_enzyme/single_strand'
require 'bio/util/restriction_enzyme/single_strand_complement'
require 'bio/util/restriction_enzyme/double_stranded/aligned_strands'
require 'bio/util/restriction_enzyme/double_stranded/cut_locations'
require 'bio/util/restriction_enzyme/double_stranded/cut_locations_in_enzyme_notation'

module Bio; end
class Bio::RestrictionEnzyme

#
# bio/util/restriction_enzyme/double_stranded.rb - DoubleStranded restriction enzyme sequence
#
# Copyright::  Copyright (C) 2006 Trevor Wennblom <trevor at corevx.com>
# License::    LGPL
#
#  $Id: double_stranded.rb,v 1.1 2006/02/01 07:34:11 trevor Exp $
#
#
#--
#
#  This library is free software; you can redistribute it and/or
#  modify it under the terms of the GNU Lesser General Public
#  License as published by the Free Software Foundation; either
#  version 2 of the License, or (at your option) any later version.
#
#  This library is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
#  Lesser General Public License for more details.
#
#  You should have received a copy of the GNU Lesser General Public
#  License along with this library; if not, write to the Free Software
#  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307  USA
#
#++
#

=begin rdoc
bio/util/restriction_enzyme/double_stranded.rb - DoubleStranded restriction enzyme sequence

A pair of +SingleStrand+ and +SingleStrandComplement+ objects with methods to
add utility to their relation.

== Notes
 * This is created by Bio::RestrictionEnzyme.new for convenience.
 * The two strands accessible are +primary+ and +complement+.
 * SingleStrand methods may be used on DoubleStranded and they will be passed to +primary+.

=end
class DoubleStranded
  include CutSymbol
  extend CutSymbol
  include StringFormatting
  extend StringFormatting

  # The primary strand
  attr_reader :primary

  # The complement strand
  attr_reader :complement

  # Cut locations in 0-based index format, DoubleStranded::CutLocations object
  attr_reader :cut_locations

  # Cut locations in enzyme index notation, DoubleStranded::CutLocationsInEnzymeNotation object
  attr_reader :cut_locations_in_enzyme_notation

  # +erp+:: Enzyme or Rebase or Pattern.  One of three:  The name of an enzyme.  A REBASE::EnzymeEntry object.  A nucleotide pattern.
  # +raw_cut_pairs+:: The cut locations in enzyme index notation.
  #
  # Enzyme index notation:: 1.._n_, value before 1 is -1
  #
  # Examples of the allowable cut locations for +raw_cut_pairs+ follows.  'p' and
  # 'c' refer to a cut location on the 'p'rimary and 'c'omplement strands.
  #
  #   1, [3,2], [20,22], 57
  #   p, [p,c], [p, c],  p
  #
  # Which is the same as:
  #
  #   1, (3..2), (20..22), 57
  #   p, (p..c), (p..c),   p
  #
  # Examples of partial cuts:
  #   1, [nil,2], [20,nil], 57
  #   p, [p,  c], [p, c],   p
  #
  def initialize(erp, *raw_cut_pairs)
    k = erp.class

    if k == Bio::REBASE::EnzymeEntry
      # Passed a Bio::REBASE::EnzymeEntry object

      unless raw_cut_pairs.empty?
        err = "A Bio::REBASE::EnzymeEntry object was passed, however the cut locations contained values.  Ambiguous or redundant.\n"
        err += "inspect = #{raw_cut_pairs.inspect}"
        raise ArgumentError, err
      end
      initialize_with_rebase( erp )

    elsif erp.kind_of? String
      # Passed something that could be an enzyme pattern or an anzyme name

      # Decide if this String is an enzyme name or a pattern
      if Bio::RestrictionEnzyme.enzyme_name?( erp )
        # Check if it's a known name
        known_enzyme = false
        known_enzyme = true if Bio::RestrictionEnzyme.rebase[ erp ]

        # Try harder to find the enzyme
        unless known_enzyme
          re = %r"^#{erp}$"i
          Bio::RestrictionEnzyme.rebase.each { |name, v| (known_enzyme = true; erp = name; break) if name =~ re }
        end

        if known_enzyme
          initialize_with_rebase( Bio::RestrictionEnzyme.rebase[erp] )
        else
          raise IndexError, "No entry found for enzyme named '#{erp}'"
        end

      else
        # Not an enzyme name, so a pattern is assumed
        if erp =~ re_cut_symbol
          initialize_with_pattern_and_cut_symbols( erp )
        else
          initialize_with_pattern_and_cut_locations( erp, raw_cut_pairs )
        end
      end

    elsif k == NilClass
      err = "Passed a nil value.  Perhaps you tried to pass a Bio::REBASE::EnzymeEntry that does not exist?\n"
      err += "inspect = #{erp.inspect}"
      raise ArgumentError, err
    else
      err = "I don't know what to do with class #{k} for erp.\n"
      err += "inspect = #{erp.inspect}"
      raise ArgumentError, err
    end

  end

  # See AlignedStrands.align
  def aligned_strands
    AlignedStrands.align(@primary.pattern, @complement.pattern)
  end

  # See AlignedStrands.align_with_cuts
  def aligned_strands_with_cuts
    AlignedStrands.align_with_cuts(@primary.pattern, @complement.pattern, @primary.cut_locations, @complement.cut_locations)
  end

  # Returns +true+ if the cut pattern creates blunt fragments
  def blunt?
    as = aligned_strands_with_cuts
    ary = [as.primary, as.complement]
    ary.collect! { |seq| seq.split( cut_symbol ) }
    # convert the cut sections to their lengths
    ary.each { |i| i.collect! { |c| c.length } }
    ary[0] == ary[1]
  end

  # Returns +true+ if the cut pattern creates sticky fragments
  def sticky?
    !blunt?
  end

  #########
  protected
  #########

  def initialize_with_pattern_and_cut_symbols( s )
    p_cl = SingleStrand::CutLocationsInEnzymeNotation.new( strip_padding(s) )
    s = Bio::Sequence::NA.new( strip_cuts_and_padding(s) )

    # * Reflect cuts that are in enzyme notation
    # * 0 is not a valid enzyme index, decrement 0 and all negative
#    c_cl = p_cl.collect { |n| s.length - n }.collect { |n| n <= 0 ? n - 1 : n } #wrong.
    c_cl = p_cl.collect {|n| (n >= s.length or n < 1) ? ((s.length - n) - 1) : (s.length - n)}

    create_cut_locations( p_cl.zip(c_cl) )
    create_primary_and_complement( s, p_cl, c_cl )
  end

  def initialize_with_pattern_and_cut_locations( s, raw_cl )
    create_cut_locations(raw_cl)
    create_primary_and_complement( Bio::Sequence::NA.new(s), @cut_locations_in_enzyme_notation.primary, @cut_locations_in_enzyme_notation.complement )
  end

  def create_primary_and_complement(primary_seq, p_cuts, c_cuts)
    @primary = SingleStrand.new( primary_seq, p_cuts )
    @complement = SingleStrandComplement.new( primary_seq.forward_complement, c_cuts )
  end

  def create_cut_locations(raw_cl)
    @cut_locations_in_enzyme_notation = CutLocationsInEnzymeNotation.new( *raw_cl.collect {|cl| CutLocationPairInEnzymeNotation.new(cl)} )
    @cut_locations = @cut_locations_in_enzyme_notation.to_array_index
  end

  def initialize_with_rebase( e )
    p_cl = [e.primary_strand_cut1, e.primary_strand_cut2]
    c_cl = [e.complementary_strand_cut1, e.complementary_strand_cut2]

    # If there's no cut in REBASE it's represented as a 0.
    # 0 is an invalid index, it just means no cut.
    p_cl.delete(0)
    c_cl.delete(0)
    raise IndexError unless p_cl.size == c_cl.size
    initialize_with_pattern_and_cut_locations( e.pattern, p_cl.zip(c_cl) )
  end

end

end




More information about the bioruby-cvs mailing list