[BioRuby-cvs] bioruby/lib/bio/db/embl format.erb, NONE, 1.1.2.1 common.rb, 1.12, 1.12.2.1 embl.rb, 1.29.2.1, 1.29.2.2
Jan Aerts
aerts at dev.open-bio.org
Wed Feb 20 09:56:24 UTC 2008
Update of /home/repository/bioruby/bioruby/lib/bio/db/embl
In directory dev.open-bio.org:/tmp/cvs-serv15755/lib/bio/db/embl
Modified Files:
Tag: BRANCH-biohackathon2008
common.rb embl.rb
Added Files:
Tag: BRANCH-biohackathon2008
format.erb
Log Message:
* Rewrote some of the code for converting EMBL files into Bio::Sequence.
* Added functionality to export Bio::Sequence to EMBL format.
Changes:
* renamed Sequence::Format#wrap and #fold to String#wrap and #fold (stored in bio.rb)
* lib/bio/db/embl/common.rb:
- rewrote def ref and def references
- added to_biosequence
- def references now returns an Array instead of a Bio::References object (tests changed accordingly)
* lib/bio/db/embl/embl.rb
- def ft now returns Array instead of Bio::Features object (tests changed accordingly)
* lib/bio/db/embl/format.erb
- new file: ERB template for the EMBL output format
* lib/bio/sequence/common.rb
- added def format_embl
--- NEW FILE: format.erb ---
ID <%= entry_id %>; SV <%= sequence_version %>; <%= topology %>; <%= molecule_type %>; <%= data_class %>; <%= division %>; <%= seq.length %> BP.
XX
AC <%= accessions.reject{|a| a.nil?}.join('; ') + ';' %>
XX
DT <%= date_created %>
DT <%= date_modified %>
XX
DE <%= definition %>
XX
KW <%= keywords.join('; ') %>.
XX
OS <%= species %>
<%= classification.join('; ').wrap(80, 'OC ') %>.
XX
<%= references.collect{|ref| ref.format('embl')}.join("\n") %>
XX
FH Key Location/Qualifiers
FH
<% prefix = 'FT '
indent = prefix + ' ' * 16
fwidth = 80 - indent.length %><%= format_features(prefix, indent, fwidth) %>XX
SQ Sequence <%= seq.length %> BP; <%= seq.composition.collect{|k,v| "#{v} #{k.upcase}"}.join('; ') + '; ' + (seq.gsub(/[ACTGactg]/, '').length.to_s ) + ' other;' %>
<%= seq.format_embl %>
//
Index: embl.rb
===================================================================
RCS file: /home/repository/bioruby/bioruby/lib/bio/db/embl/embl.rb,v
retrieving revision 1.29.2.1
retrieving revision 1.29.2.2
diff -C2 -d -r1.29.2.1 -r1.29.2.2
*** embl.rb 15 Feb 2008 04:49:37 -0000 1.29.2.1
--- embl.rb 20 Feb 2008 09:56:22 -0000 1.29.2.2
***************
*** 123,126 ****
--- 123,130 ----
alias molecule_type molecule
+ def data_class
+ id_line('DATA_CLASS')
+ end
+
def topology
id_line('TOPOLOGY')
***************
*** 254,258 ****
unless @data['FT']
@data['FT'] = Array.new
- ary = Array.new
in_quote = false
@orig['FT'].each_line do |line|
--- 258,261 ----
***************
*** 262,268 ****
body = line[20,60].chomp # feature value (position, /qualifier=)
if line =~ /^FT {3}(\S+)/
! ary.push([ $1, body ]) # [ feature, position, /q="data", ... ]
elsif body =~ /^ \// and not in_quote
! ary.last.push(body) # /q="data..., /q=data, /q
if body =~ /=" / and body !~ /"$/
--- 265,271 ----
body = line[20,60].chomp # feature value (position, /qualifier=)
if line =~ /^FT {3}(\S+)/
! @data['FT'].push([ $1, body ]) # [ feature, position, /q="data", ... ]
elsif body =~ /^ \// and not in_quote
! @data['FT'].last.push(body) # /q="data..., /q=data, /q
if body =~ /=" / and body !~ /"$/
***************
*** 271,275 ****
else
! ary.last.last << body # ...data..., ...data..."
if body =~ /"$/
--- 274,278 ----
else
! @data['FT'].last.last << body # ...data..., ...data..."
if body =~ /"$/
***************
*** 279,287 ****
end
! ary.map! do |subary|
parse_qualifiers(subary)
end
- @data['FT'] = Features.new(ary)
end
if block_given?
--- 282,289 ----
end
! @data['FT'].map! do |subary|
parse_qualifiers(subary)
end
end
if block_given?
***************
*** 373,378 ****
bio_seq.entry_id = self.entry_id
bio_seq.primary_accession = self.accessions[0]
! bio_seq.secondary_accessions = self.accessions[1,-1]
bio_seq.molecule_type = self.molecule_type
bio_seq.definition = self.description
bio_seq.topology = self.topology
--- 375,381 ----
bio_seq.entry_id = self.entry_id
bio_seq.primary_accession = self.accessions[0]
! bio_seq.secondary_accessions = self.accessions[1,-1] || []
bio_seq.molecule_type = self.molecule_type
+ bio_seq.data_class = self.data_class
bio_seq.definition = self.description
bio_seq.topology = self.topology
***************
*** 382,386 ****
bio_seq.sequence_version = self.version
bio_seq.keywords = self.keywords
! bio_seq.species = self.os(0)[0]['os'] + ' ' + self.os(0)[0]['name']
bio_seq.classification = self.oc
bio_seq.references = self.references
--- 385,389 ----
bio_seq.sequence_version = self.version
bio_seq.keywords = self.keywords
! bio_seq.species = self.fetch('OS')
bio_seq.classification = self.oc
bio_seq.references = self.references
***************
*** 435,439 ****
indent = prefix + ' ' * 16
fwidth = 80 - indent.length
!
parser = Bio::FlatFile.auto('/home/aertsj/LocalDocuments/hackathon/aj224122.embl')
parser.each do |entry|
--- 438,443 ----
indent = prefix + ' ' * 16
fwidth = 80 - indent.length
!
! # parser = Bio::FlatFile.auto('/home/aertsj/LocalDocuments/bioruby_biohackathon/bioruby/test/data/embl/AB090716.embl')
parser = Bio::FlatFile.auto('/home/aertsj/LocalDocuments/hackathon/aj224122.embl')
parser.each do |entry|
Index: common.rb
===================================================================
RCS file: /home/repository/bioruby/bioruby/lib/bio/db/embl/common.rb,v
retrieving revision 1.12
retrieving revision 1.12.2.1
diff -C2 -d -r1.12 -r1.12.2.1
*** common.rb 5 Apr 2007 23:35:40 -0000 1.12
--- common.rb 20 Feb 2008 09:56:22 -0000 1.12.2.1
***************
*** 241,265 ****
def ref
unless @data['R']
! ary = Array.new
! get('R').split(/\nRN /).each do |str|
! raw = {'RN' => '', 'RC' => '', 'RP' => '', 'RX' => '',
! 'RA' => '', 'RT' => '', 'RL' => '', 'RG' => ''}
! str = 'RN ' + str unless /^RN / =~ str
! str.split("\n").each do |line|
! if /^(R[NPXARLCTG]) (.+)/ =~ line
! raw[$1] += $2 + ' '
! else
! raise "Invalid format in R lines, \n[#{line}]\n"
end
end
! raw.each_value {|v|
! v.strip!
! v.sub!(/^"/,'')
! v.sub!(/;$/,'')
! v.sub!(/"$/,'')
! }
! ary.push(raw)
end
- @data['R'] = ary
end
@data['R']
--- 241,305 ----
def ref
unless @data['R']
! @data['R'] = Array.new
! # Get the different references as 'blurbs' (the lines together)
! reference_blurbs = get('R').split(/\nRN /)
! reference_blurbs.each_index do |i|
! reference_blurbs[i] = 'RN ' + reference_blurbs[i] unless reference_blurbs[i] =~ /^RN /
! end
!
! # For each reference, we'll first create a hash that looks like below.
! # Suppose the input is:
! # RA name1, name2, name3
! # RA name4
! # RT some part of the title that
! # RT did not fit on one line
! # Then the hash looks like:
! # h = {
! # 'RA' => ["name1, name2, name3", "name4"],
! # 'RT' => ["some part of the title that", "did not fit on one line"]
! # }
! reference_blurbs.each do |rb|
! line_based_data = Hash.new
! rb.split(/\n/).each do |line|
! key, value = line.scan(/^(R[A-Z]) "?(\[?.*[A-Za-z0-9]\]?)/)[0]
! if line_based_data[key].nil?
! line_based_data[key] = Array.new
end
+ line_based_data[key].push(value)
end
!
! # Now we have to sanitize the hash: the authors should be kept in an
! # array, the title should be 1 string, ... So the hash should look like:
! # h = {
! # 'RA' => ["name1", "name2", "name3", "name4"],
! # 'RT' => 'some part of the title that did not fit on one line'
! # }
! line_based_data.keys.each do |key|
! if ['RC', 'RP', 'RT', 'RL'].include?(key)
! line_based_data[key] = line_based_data[key].join(' ')
! elsif ['RA', 'RX'].include?(key)
! sanitized_data = Array.new
! line_based_data[key].each do |v|
! sanitized_data.push(v.split(/\s*,\s*/))
! end
! line_based_data[key] = sanitized_data.flatten
! elsif key == 'RN'
! line_based_data[key] = line_based_data[key][0].sub(/^\[/,'').sub(/\]$/,'').to_i
! end
! end
!
! # And put it in @data. @data in the end looks like this:
! # data = [
! # {
! # 'RA' => ["name1", "name2", "name3", "name4"],
! # 'RT' => 'some part of the title that did not fit on one line'
! # },
! # {
! # 'RA' => ["name1", "name2", "name3", "name4"],
! # 'RT' => 'some part of the title that did not fit on one line'
! # }
! # ]
! @data['R'].push(line_based_data)
end
end
@data['R']
***************
*** 270,306 ****
def references
unless @data['references']
! ary = self.ref.map {|ent|
! hash = Hash.new('')
! ent.each {|key, value|
case key
when 'RA'
! hash['authors'] = value.split(/, /)
when 'RT'
hash['title'] = value
when 'RL'
! if value =~ /(.*) (\d+) \((\d+)\), (\d+-\d+) \((\d+)\)$/
! hash['journal'] = $1
! hash['volume'] = $2
! hash['issue'] = $3
! hash['pages'] = $4
! hash['year'] = $5
! else
! hash['journal'] = value
! end
when 'RX' # PUBMED, MEDLINE
! value.split('.').each {|item|
tag, xref = item.split(/; /).map {|i| i.strip }
hash[ tag.downcase ] = xref
}
end
! }
! Reference.new(hash)
! }
! @data['references'] = References.new(ary)
end
@data['references']
end
-
# returns contents in the DR line.
# * Bio::EMBLDB::Common#dr -> [ <Database cross-reference Hash>* ]
--- 310,345 ----
def references
unless @data['references']
! @data['references'] = Array.new
! self.ref.each do |ref|
! hash = Hash.new
! ref.each do |key, value|
case key
+ when 'RN'
+ hash['embl_gb_record_number'] = value
+ when 'RC'
+ hash['comments'] = value
+ when 'RX'
+ hash['xrefs'] = value
+ when 'RP'
+ hash['sequence_position'] = value
when 'RA'
! hash['authors'] = value
when 'RT'
hash['title'] = value
when 'RL'
! hash['journal'] = value
when 'RX' # PUBMED, MEDLINE
! value.each {|item|
tag, xref = item.split(/; /).map {|i| i.strip }
hash[ tag.downcase ] = xref
}
end
! end
! @data['references'].push(Reference.new(hash))
! end
end
@data['references']
end
# returns contents in the DR line.
# * Bio::EMBLDB::Common#dr -> [ <Database cross-reference Hash>* ]
More information about the bioruby-cvs mailing list