[BioRuby-cvs] bioruby/lib/bio/db/embl format.erb, NONE, 1.1.2.1 common.rb, 1.12, 1.12.2.1 embl.rb, 1.29.2.1, 1.29.2.2

Wed Feb 20 09:56:24 UTC 2008

Update of /home/repository/bioruby/bioruby/lib/bio/db/embl
In directory dev.open-bio.org:/tmp/cvs-serv15755/lib/bio/db/embl

Modified Files:
      Tag: BRANCH-biohackathon2008
	common.rb embl.rb 
Added Files:
      Tag: BRANCH-biohackathon2008
	format.erb 
Log Message:
* Rewrote some of the code for converting EMBL files into Bio::Sequence.
* Added functionality to export Bio::Sequence to EMBL format.

Changes:
* moved Sequence::Format#wrap and #fold to String#wrap and String#fold (defined in bio.rb)
* lib/bio/db/embl/common.rb:
    - rewrote def ref and def references
    - added to_biosequence
    - def references now returns an Array instead of a Bio::References object (tests changed accordingly)
* lib/bio/db/embl/embl.rb:
    - def ft now returns an Array instead of a Bio::Features object (tests changed accordingly)
    - added def data_class
* lib/bio/db/embl/format.erb
* lib/bio/sequence/common.rb
    - added def format_embl


--- NEW FILE: format.erb ---
ID   <%= entry_id %>; SV <%= sequence_version %>; <%= topology %>; <%= molecule_type %>; <%= data_class %>; <%= division %>; <%= seq.length %> BP.
XX
AC   <%= accessions.reject{|a| a.nil?}.join('; ') + ';' %>
XX
DT   <%= date_created %>
DT   <%= date_modified %>
XX
DE   <%= definition %>
XX
KW   <%= keywords.join('; ') %>.
XX
OS   <%= species %>
<%= classification.join('; ').wrap(80, 'OC   ') %>.
XX   
<%= references.collect{|ref| ref.format('embl')}.join("\n") %>
XX
FH   Key             Location/Qualifiers
FH
<%  prefix = 'FT   '
    indent = prefix + ' ' * 16
    fwidth = 80 - indent.length %><%= format_features(prefix, indent, fwidth) %>XX
SQ   Sequence <%= seq.length %> BP; <%= seq.composition.collect{|k,v| "#{v} #{k.upcase}"}.join('; ') + '; ' + (seq.gsub(/[ACTGactg]/, '').length.to_s ) + ' other;' %>
<%= seq.format_embl %>
//
Index: embl.rb
===================================================================
RCS file: /home/repository/bioruby/bioruby/lib/bio/db/embl/embl.rb,v
retrieving revision 1.29.2.1
retrieving revision 1.29.2.2
diff -C2 -d -r1.29.2.1 -r1.29.2.2
*** embl.rb	15 Feb 2008 04:49:37 -0000	1.29.2.1
--- embl.rb	20 Feb 2008 09:56:22 -0000	1.29.2.2
***************
*** 123,126 ****
--- 123,130 ----
    alias molecule_type molecule
  
+   def data_class
+     id_line('DATA_CLASS')
+   end
+   
    def topology
      id_line('TOPOLOGY')
***************
*** 254,258 ****
      unless @data['FT']
        @data['FT'] = Array.new
-       ary = Array.new
        in_quote = false
        @orig['FT'].each_line do |line|
--- 258,261 ----
***************
*** 262,268 ****
          body = line[20,60].chomp # feature value (position, /qualifier=)
          if line =~ /^FT {3}(\S+)/
!           ary.push([ $1, body ]) # [ feature, position, /q="data", ... ]
          elsif body =~ /^ \// and not in_quote
!           ary.last.push(body)    # /q="data..., /q=data, /q
  
            if body =~ /=" / and body !~ /"$/
--- 265,271 ----
          body = line[20,60].chomp # feature value (position, /qualifier=)
          if line =~ /^FT {3}(\S+)/
!           @data['FT'].push([ $1, body ]) # [ feature, position, /q="data", ... ]
          elsif body =~ /^ \// and not in_quote
!           @data['FT'].last.push(body)    # /q="data..., /q=data, /q
  
            if body =~ /=" / and body !~ /"$/
***************
*** 271,275 ****
  
          else
!           ary.last.last << body # ...data..., ...data..."
  
            if body =~ /"$/
--- 274,278 ----
  
          else
!           @data['FT'].last.last << body # ...data..., ...data..."
  
            if body =~ /"$/
***************
*** 279,287 ****
        end
  
!       ary.map! do |subary|
          parse_qualifiers(subary)
        end
  
-       @data['FT'] = Features.new(ary)
      end
      if block_given?
--- 282,289 ----
        end
  
!       @data['FT'].map! do |subary|
          parse_qualifiers(subary)
        end
  
      end
      if block_given?
***************
*** 373,378 ****
      bio_seq.entry_id = self.entry_id
      bio_seq.primary_accession = self.accessions[0]
!     bio_seq.secondary_accessions = self.accessions[1,-1]
      bio_seq.molecule_type = self.molecule_type
      bio_seq.definition = self.description
      bio_seq.topology = self.topology
--- 375,381 ----
      bio_seq.entry_id = self.entry_id
      bio_seq.primary_accession = self.accessions[0]
!     bio_seq.secondary_accessions = self.accessions[1,-1] || []
      bio_seq.molecule_type = self.molecule_type
+     bio_seq.data_class = self.data_class
      bio_seq.definition = self.description
      bio_seq.topology = self.topology
***************
*** 382,386 ****
      bio_seq.sequence_version = self.version
      bio_seq.keywords = self.keywords
!     bio_seq.species = self.os(0)[0]['os'] + ' ' + self.os(0)[0]['name']
      bio_seq.classification = self.oc
      bio_seq.references = self.references
--- 385,389 ----
      bio_seq.sequence_version = self.version
      bio_seq.keywords = self.keywords
!     bio_seq.species = self.fetch('OS')
      bio_seq.classification = self.oc
      bio_seq.references = self.references
***************
*** 435,439 ****
    indent = prefix + ' ' * 16
    fwidth = 80 - indent.length
!   
    parser = Bio::FlatFile.auto('/home/aertsj/LocalDocuments/hackathon/aj224122.embl')
    parser.each do |entry|
--- 438,443 ----
    indent = prefix + ' ' * 16
    fwidth = 80 - indent.length
! 
! #  parser = Bio::FlatFile.auto('/home/aertsj/LocalDocuments/bioruby_biohackathon/bioruby/test/data/embl/AB090716.embl')
    parser = Bio::FlatFile.auto('/home/aertsj/LocalDocuments/hackathon/aj224122.embl')
    parser.each do |entry|

Index: common.rb
===================================================================
RCS file: /home/repository/bioruby/bioruby/lib/bio/db/embl/common.rb,v
retrieving revision 1.12
retrieving revision 1.12.2.1
diff -C2 -d -r1.12 -r1.12.2.1
*** common.rb	5 Apr 2007 23:35:40 -0000	1.12
--- common.rb	20 Feb 2008 09:56:22 -0000	1.12.2.1
***************
*** 241,265 ****
    def ref
      unless @data['R']
!       ary = Array.new
!       get('R').split(/\nRN   /).each do |str|
!         raw = {'RN' => '', 'RC' => '', 'RP' => '', 'RX' => '', 
!                'RA' => '', 'RT' => '', 'RL' => '', 'RG' => ''}
!         str = 'RN   ' + str unless /^RN   / =~ str
!         str.split("\n").each do |line|
!           if /^(R[NPXARLCTG])   (.+)/ =~ line
!             raw[$1] += $2 + ' '
!           else
!             raise "Invalid format in R lines, \n[#{line}]\n"
            end
          end
!         raw.each_value {|v| 
!           v.strip! 
!           v.sub!(/^"/,'')
!           v.sub!(/;$/,'')
!           v.sub!(/"$/,'')
!         }
!         ary.push(raw)
        end
-       @data['R'] = ary
      end
      @data['R']
--- 241,305 ----
    def ref
      unless @data['R']
!       @data['R'] = Array.new
!       # Get the different references as 'blurbs' (the lines together)
!       reference_blurbs = get('R').split(/\nRN   /)
!       reference_blurbs.each_index do |i|
!         reference_blurbs[i] = 'RN   ' + reference_blurbs[i] unless reference_blurbs[i] =~ /^RN   /
!       end
!       
!       # For each reference, we'll first create a hash that looks like below.
!       # Suppose the input is:
!       #   RA   name1, name2, name3
!       #   RA   name4
!       #   RT   some part of the title that
!       #   RT   did not fit on one line
!       # Then the hash looks like:
!       #   h = {
!       #         'RA' => ["name1, name2, name3", "name4"],
!       #         'RT' => ["some part of the title that", "did not fit on one line"]
!       #       }
!       reference_blurbs.each do |rb|
!         line_based_data = Hash.new
!         rb.split(/\n/).each do |line|
!           key, value = line.scan(/^(R[A-Z])   "?(\[?.*[A-Za-z0-9]\]?)/)[0]
!           if line_based_data[key].nil?
!             line_based_data[key] = Array.new
            end
+           line_based_data[key].push(value)
          end
! 
!         # Now we have to sanitize the hash: the authors should be kept in an 
!         # array, the title should be 1 string, ... So the hash should look like:
!         #  h = {
!         #        'RA' => ["name1", "name2", "name3", "name4"],
!         #        'RT' => 'some part of the title that did not fit on one line'
!         #      }
!         line_based_data.keys.each do |key|
!           if ['RC', 'RP', 'RT', 'RL'].include?(key)
!             line_based_data[key] = line_based_data[key].join(' ')
!           elsif ['RA', 'RX'].include?(key)
!             sanitized_data = Array.new
!             line_based_data[key].each do |v|
!               sanitized_data.push(v.split(/\s*,\s*/))
!             end
!             line_based_data[key] = sanitized_data.flatten
!           elsif key == 'RN'
!             line_based_data[key] = line_based_data[key][0].sub(/^\[/,'').sub(/\]$/,'').to_i
!           end
!         end
!         
!         # And put it in @data. @data in the end looks like this:
!         #  data = [
!         #           {
!         #             'RA' => ["name1", "name2", "name3", "name4"],
!         #             'RT' => 'some part of the title that did not fit on one line'
!         #           },
!         #           {
!         #             'RA' => ["name1", "name2", "name3", "name4"],
!         #             'RT' => 'some part of the title that did not fit on one line'
!         #           }
!         #         ]
!         @data['R'].push(line_based_data)
        end
      end
      @data['R']
***************
*** 270,306 ****
    def references
      unless @data['references']
!       ary = self.ref.map {|ent|
!         hash = Hash.new('')
!         ent.each {|key, value|
            case key
            when 'RA'
!             hash['authors'] = value.split(/, /)
            when 'RT'
              hash['title'] = value
            when 'RL'
!             if value =~ /(.*) (\d+) \((\d+)\), (\d+-\d+) \((\d+)\)$/
!               hash['journal'] = $1
!               hash['volume']  = $2
!               hash['issue']   = $3
!               hash['pages']   = $4
!               hash['year']    = $5
!             else
!               hash['journal'] = value
!             end
            when 'RX'  # PUBMED, MEDLINE
!             value.split('.').each {|item|
                tag, xref = item.split(/; /).map {|i| i.strip }
                hash[ tag.downcase ]  = xref
              }
            end
!         }
!         Reference.new(hash)
!       }
!       @data['references'] = References.new(ary)
      end
      @data['references']
    end
  
- 
    # returns contents in the DR line.
    # * Bio::EMBLDB::Common#dr  -> [ <Database cross-reference Hash>* ]
--- 310,345 ----
    def references
      unless @data['references']
!       @data['references'] = Array.new
!       self.ref.each do |ref|
!         hash = Hash.new
!         ref.each do |key, value|
            case key
+           when 'RN'
+             hash['embl_gb_record_number'] = value
+           when 'RC'
+             hash['comments'] = value
+           when 'RX'
+             hash['xrefs'] = value
+           when 'RP'
+             hash['sequence_position'] = value
            when 'RA'
!             hash['authors'] = value
            when 'RT'
              hash['title'] = value
            when 'RL'
!             hash['journal'] = value
            when 'RX'  # PUBMED, MEDLINE
!             value.each {|item|
                tag, xref = item.split(/; /).map {|i| i.strip }
                hash[ tag.downcase ]  = xref
              }
            end
!         end
!         @data['references'].push(Reference.new(hash))
!       end
      end
      @data['references']
    end
  
    # returns contents in the DR line.
    # * Bio::EMBLDB::Common#dr  -> [ <Database cross-reference Hash>* ]