[BioRuby-cvs] bioruby/lib/bio/db/embl sptr.rb,1.23,1.24
Mitsuteru C. Nakao
nakao at pub.open-bio.org
Mon Aug 8 03:27:12 EDT 2005
Update of /home/repository/bioruby/bioruby/lib/bio/db/embl
In directory pub.open-bio.org:/tmp/cvs-serv29902/lib/bio/db/embl
Modified Files:
sptr.rb
Log Message:
* Added Bio::SPTR#protein_name method (proposed by Luca Pireddu).
* Added Bio::SPTR#synonyms method (proposed by Luca Pireddu).
* Changed Bio::SPTR#gn to parse the new GN line format (proposed by Luca
Pireddu).
* Changed Bio::SPTR#gene_names method (proposed by Luca Pireddu).
* Changed Bio::SPTR#gene_name method (proposed by Luca Pireddu).
Index: sptr.rb
===================================================================
RCS file: /home/repository/bioruby/bioruby/lib/bio/db/embl/sptr.rb,v
retrieving revision 1.23
retrieving revision 1.24
diff -C2 -d -r1.23 -r1.24
*** sptr.rb 25 Aug 2004 16:57:12 -0000 1.23
--- sptr.rb 8 Aug 2005 06:41:13 -0000 1.24
***************
*** 67,70 ****
--- 67,71 ----
end
+ #
def entry
id_line('ENTRY_NAME')
***************
*** 73,76 ****
--- 74,78 ----
alias entry_id entry
+ #
def molecule
id_line('MOLECULE_TYPE')
***************
*** 78,81 ****
--- 80,84 ----
alias molecule_type molecule
+ #
def sequence_length
id_line('SEQUENCE_LENGTH')
***************
*** 98,102 ****
# #accession -> accessions.first
! @@ac_regrexp = /[OPQ][0-9][A-Z0-9]{3}[0-9]/
# DT Line; date (3/entry)
--- 101,106 ----
# #accession -> accessions.first
! @@ac_regrexp = /[OPQ][0-9][A-Z0-9]{3}[0-9]/
!
# DT Line; date (3/entry)
***************
*** 133,139 ****
--- 137,179 ----
# CONTEINS >=0
#
+ # Returns the proposed official name of the protein
+ def protein_name
+ name = ""
+ if de_line = fetch('DE') then
+ str = de_line[/^[^\[]*/] # everything preceding the first [ (the "contains" part)
+ name = str[/^[^(]*/].strip
+ name << ' (Fragment)' if str =~ /fragment/i
+ end
+ return name
+ end
+ # synonyms are each placed in () following the official name on the DE line
+ # Returns an array of synonyms (unofficial names)
+ def synonyms
+ ary = Array.new
+ if de_line = fetch('DE') then
+ line = de_line.sub(/\[.*\]/,'') # ignore stuff between [ and ]. That's the "contains" part
+ line.scan(/\([^)]+/) do |synonym|
+ unless synonym =~ /fragment/i then
+ ary << synonym[1..-1].strip # index to remove the leading (
+ end
+ end
+ end
+ return ary
+ end
# GN Line: Gene name(s) (>=0, optional)
+ def gn
+ return @data['GN'] if @data['GN']
+
+ case fetch('GN')
+ when /Name=/ then
+ return gn_uniprot_parser
+ else
+ return gn_old_parser
+ end
+ end
+
+ # GN Line: Gene name(s) (>=0, optional)
# GN HNS OR DRDX OR OSMZ OR BGLY.
# GN CECA1 AND CECA2.
***************
*** 146,192 ****
# #gene_names -> Array
#
! def gn
! unless @data['GN']
! if get('GN').size > 0
! names = fetch('GN').sub(/\.$/,'').split(/ AND /)
! names.map! {|synonyms|
! synonyms = synonyms.gsub(/\(|\)/,'').split(/ OR /).map {|e|
! e.strip
! }
}
end
! @data['GN'] = names
end
return @data['GN']
end
! alias gene_names gn
# Bio::SPTR#gene_name -> String
#
def gene_name
! begin
! @data['GN'][0][0]
! rescue NameError
! nil
! end
end
# OS Line; organism species (>=1)
! # "OS Trifolium repens (white clover)"
! #
! # OS Genus species (name).
! # OS Genus species (name0) (name1).
! # OS Genus species (name0) (name1).
! # OS Genus species (name0), G s0 (name0), and G s (name1).
#
! # Bio::EMBLDB#os -> Array w/in Hash
# [{'name'=>'(Human)', 'os'=>'Homo sapiens'},
# {'name'=>'(Rat)', 'os'=>'Rattus norveticus'}]
! # Bio::EMBLDB#os[0]['name'] => "(Human)"
! # Bio::EMBLDB#os[0] => {'name'=>"(Human)", 'os'=>'Homo sapiens'}
! # Bio::EMBLDB#os(0) => "Homo sapiens (Human)"
#
! # Bio::SPTR#os -> Array w/in Hash
# Bio::SPTR#os(num) -> String
--- 186,298 ----
# #gene_names -> Array
#
! def gn_old_parser
! names = Array.new
! if get('GN').size > 0
! names = fetch('GN').sub(/\.$/,'').split(/ AND /)
! names.map! { |synonyms|
! synonyms = synonyms.gsub(/\(|\)/,'').split(/ OR /).map { |e|
! e.strip
}
+ }
+ end
+ return @data['GN'] = names
+ end
+ private :gn_old_parser
+
+
+ # The new format of the GN line is:
+ # GN Name=; Synonyms=[, ...]; OrderedLocusNames=[, ...];
+ # GN ORFNames=[, ...];
+ #
+ # GN and
+ #
+ # Bio::SPTR#gn -> [ <gene record>* ]
+ # where <gene record> is:
+ # { :name => '...',
+ # :synonyms => [ 's1', 's2', ... ],
+ # :loci => [ 'l1', 'l2', ... ],
+ # :orfs => [ 'o1', 'o2', ... ]
+ # }
+ def gn_uniprot_parser
+ @data['GN'] = Array.new
+ gn_line = fetch('GN').strip
+ records = gn_line.split(/\s*and\s*/)
+ records.each do |record|
+ gene_hash = {:name => '', :synonyms => [], :loci => [], :orfs => []}
+ record.each(';') do |element|
+ case element
+ when /Name=/ then
+ gene_hash[:name] = $'[0..-2]
+ when /Synonyms=/ then
+ gene_hash[:synonyms] = $'[0..-2].split(/\s*,\s*/)
+ when /OrderedLocusNames=/ then
+ gene_hash[:loci] = $'[0..-2].split(/\s*,\s*/)
+ when /ORFNames=/ then
+ gene_hash[:orfs] = $'[0..-2].split(/\s*,\s*/)
+ end
end
! @data['GN'] << gene_hash
end
return @data['GN']
end
! private :gn_uniprot_parser
!
!
! # Bio::SPTR#gene_names -> [String]
! def gene_names
! gn # set @data['GN'] if it hasn't been already done
! if @data['GN'].first.class == Hash then
! @data['GN'].collect { |element| element[:name] }
! else
! @data['GN'].first
! end
! end
!
# Bio::SPTR#gene_name -> String
#
def gene_name
! gene_names.first
end
+
# OS Line; organism species (>=1)
! # "OS Genus species (name)."
! # "OS Genus species (name0) (name1)."
! # "OS Genus species (name0) (name1)."
! # "OS Genus species (name0), G s0 (name0), and G s (name0) (name1)."
! # "OS Homo sapiens (Human), and Rarrus norveticus (Rat)"
#
! # Bio::EMBLDB#os -> Array of Hash
# [{'name'=>'(Human)', 'os'=>'Homo sapiens'},
# {'name'=>'(Rat)', 'os'=>'Rattus norveticus'}]
! # Bio::SPTR#os[0]['name'] => "(Human)"
! # Bio::EPTR#os[0] => {'name'=>"(Human)", 'os'=>'Homo sapiens'}
! # Bio::EPTR#os(0) => "Homo sapiens (Human)"
#
! # Bio::SPTR#os -> Array of Hash
# Bio::SPTR#os(num) -> String
+ def os(num = nil)
+ unless @data['OS']
+ os = Array.new
+ fetch('OS').split(/, and|, /).each do |tmp|
+ if tmp =~ /([A-Z][a-z]* *[\w\d \:\'\+\-]+[\w\d])/
+ org = $1
+ tmp =~ /(\(.+\))/
+ os.push({'name' => $1, 'os' => org})
+ else
+ raise "Error: OS Line. #{$!}\n#{fetch('OS')}\n"
+ end
+ end
+ @data['OS'] = os
+ end
+
+ if num
+ # EX. "Trifolium repens (white clover)"
+ return "#{@data['OS'][num]['os']} #{@data['OS'][num]['name']}"
+ else
+ return @data['OS']
+ end
+ end
***************
*** 211,220 ****
def ox
unless @data['OX']
! tmp = fetch('OX').sub(/\.$/,'').split(/;/).map {|e| e.strip }
hsh = Hash.new
! tmp.each {|e|
db,refs = e.split(/=/)
hsh[db] = refs.split(/, */)
! }
@data['OX'] = hsh
end
--- 317,326 ----
def ox
unless @data['OX']
! tmp = fetch('OX').sub(/\.$/,'').split(/;/).map { |e| e.strip }
hsh = Hash.new
! tmp.each do |e|
db,refs = e.split(/=/)
hsh[db] = refs.split(/, */)
! end
@data['OX'] = hsh
end
***************
*** 272,276 ****
begin
! fetch('CC').split(/#{cmt}/)[0].sub(dlm,'').split(dlm).each {|tmp|
if /(^[A-Z ]+[A-Z]): (.+)/ =~ tmp
key = $1
--- 378,382 ----
begin
! fetch('CC').split(/#{cmt}/)[0].sub(dlm,'').split(dlm).each do |tmp|
if /(^[A-Z ]+[A-Z]): (.+)/ =~ tmp
key = $1
***************
*** 282,296 ****
end
else
! raise ["Error: #{entry_id}: CC Lines", '',
tmp, '', '', fetch('CC'),''].join("\n")
end
! }
rescue NameError
if fetch('CC') == ''
return {}
else
! raise "Error: Invalid CC Lines: #{entry_id}: " +
! "\n'#{self.get('CC')}'\n" +
! "(#{$!})"
end
rescue NoMethodError
--- 388,401 ----
end
else
! raise ["Error: [#{entry_id}]: CC Lines", '',
tmp, '', '', fetch('CC'),''].join("\n")
end
! end
rescue NameError
if fetch('CC') == ''
return {}
else
! raise ["Error: Invalid CC Lines: [#{entry_id}]: ",
! "\n'#{self.get('CC')}'\n", "(#{$!})"].join
end
rescue NoMethodError
***************
*** 314,319 ****
# Event, Named isoforms, Comment, [Name, Synonyms, IsoId, Sequnce]+
! tmp = {'Event'=>nil, 'Named isoforms'=>nil, 'Comment'=>nil,
! 'Variants' => []}
if /Event=(.+?);/ =~ ap
--- 419,423 ----
# Event, Named isoforms, Comment, [Name, Synonyms, IsoId, Sequnce]+
! tmp = {'Event' => nil, 'Named isoforms' => nil, 'Comment' => nil, 'Variants' => []}
if /Event=(.+?);/ =~ ap
***************
*** 326,332 ****
tmp['Comment'] = $1
end
! ap.scan(/Name=.+?Sequence=.+?;/).each {|ent|
tmp['Variants'] << cc_ap_variants_parse(ent)
! }
return tmp
--- 430,436 ----
tmp['Comment'] = $1
end
! ap.scan(/Name=.+?Sequence=.+?;/).each do |ent|
tmp['Variants'] << cc_ap_variants_parse(ent)
! end
return tmp
***************
*** 338,344 ****
return db unless db
! db.each {|e|
! db = {'NAME'=>nil,'NOTE'=>nil,'WWW'=>nil,'FTP'=>nil}
! e.sub(/.$/,'').split(/;/).each {|line|
case line
when /NAME=(.+)/
--- 442,448 ----
return db unless db
! db.each do |e|
! db = {'NAME' => nil, 'NOTE' => nil, 'WWW' => nil, 'FTP' => nil}
! e.sub(/.$/,'').split(/;/).each do |line|
case line
when /NAME=(.+)/
***************
*** 351,357 ****
db['FTP'] = $1
end
! }
tmp.push(db)
! }
return tmp
--- 455,461 ----
db['FTP'] = $1
end
! end
tmp.push(db)
! end
return tmp
***************
*** 362,368 ****
return ms unless ms
! ms.each {|m|
mass = {'MW'=>nil,'MW_ERR'=>nil,'METHOD'=>nil,'RANGE'=>nil}
! m.sub(/.$/,'').split(/;/).each {|line|
case line
when /MW=(.+)/
--- 466,472 ----
return ms unless ms
! ms.each do |m|
mass = {'MW'=>nil,'MW_ERR'=>nil,'METHOD'=>nil,'RANGE'=>nil}
! m.sub(/.$/,'').split(/;/).each do |line|
case line
when /MW=(.+)/
***************
*** 375,381 ****
mass['RANGE'] = $1 # RANGE class ?
end
! }
tmp.push(mass)
! }
return tmp
--- 479,485 ----
mass['RANGE'] = $1 # RANGE class ?
end
! end
tmp.push(mass)
! end
return tmp
***************
*** 390,402 ****
def cc_ap_variants_parse(ent)
! tmp = {}
! ent.split(/; /).map {|e| e.split(/=/) }.each {|e|
case e[0]
when 'Sequence'
e[1] = e[1].sub(/;/,'').split(/, /)
end
! tmp[e[0]] = e[1]
! }
! tmp
end
private :cc_ap_variants_parse
--- 494,506 ----
def cc_ap_variants_parse(ent)
! hsh = {}
! ent.split(/; /).map {|e| e.split(/=/) }.each do |e|
case e[0]
when 'Sequence'
e[1] = e[1].sub(/;/,'').split(/, /)
end
! hsh[e[0]] = e[1]
! end
! return hsh
end
private :cc_ap_variants_parse
***************
*** 404,409 ****
-
-
# DR Line; defabases cross-reference (>=0)
# a cross_ref pre one line
--- 508,511 ----
***************
*** 419,423 ****
-
# KW Line; keyword (>=1)
# KW [Keyword;]+
--- 521,524 ----
***************
*** 471,475 ****
end
-
case last_feature
when 'VARSPLIC', 'VARIANT', 'CONFLICT'
--- 572,575 ----
***************
*** 502,507 ****
end
! table.each_key {|k|
! table[k].each {|e|
if / -> / =~ e['Description']
pattern = /([A-Z][A-Z ]*[A-Z]*) -> ([A-Z][A-Z ]*[A-Z]*)/
--- 602,607 ----
end
! table.each_key do |k|
! table[k].each do |e|
if / -> / =~ e['Description']
pattern = /([A-Z][A-Z ]*[A-Z]*) -> ([A-Z][A-Z ]*[A-Z]*)/
***************
*** 522,528 ****
}
end
! }
! }
!
@data['FT'] = table
end
--- 622,627 ----
}
end
! end
! end
@data['FT'] = table
end
***************
*** 549,553 ****
unless @data['SQ']
if fetch('SQ') =~ /(\d+) AA\; (\d+) MW; (.+) CRC64;/
! @data['SQ'] = { 'aalen'=>$1.to_i, 'MW'=>$2.to_i, 'CRC64'=>$3 }
else
raise "Invalid SQ Line: \n'#{fetch('SQ')}'"
--- 648,652 ----
unless @data['SQ']
if fetch('SQ') =~ /(\d+) AA\; (\d+) MW; (.+) CRC64;/
! @data['SQ'] = { 'aalen' => $1.to_i, 'MW' => $2.to_i, 'CRC64' => $3 }
else
raise "Invalid SQ Line: \n'#{fetch('SQ')}'"
***************
*** 624,627 ****
--- 723,727 ----
cmd "Bio::SPTR.new($ent).gn", 'GN'
cmd "Bio::SPTR.new($ent).gene_name"
+ cmd "Bio::SPTR.new($ent).gene_names"
cmd "Bio::SPTR.new($ent).dt", "DT"
***************
*** 632,635 ****
--- 732,737 ----
cmd "Bio::SPTR.new($ent).de", 'DE'
cmd "Bio::SPTR.new($ent).definition"
+ cmd "Bio::SPTR.new($ent).protein_name"
+ cmd "Bio::SPTR.new($ent).synonyms"
cmd "Bio::SPTR.new($ent).kw", 'KW'
***************
*** 684,690 ****
--- Bio::SPTR#entry_id -> str
-
--- Bio::SPTR#molecule -> str
-
--- Bio::SPTR#sequence_length -> int
--- 786,790 ----
***************
*** 695,703 ****
--- Bio::SPTR#accessions -> ary
--- Bio::SPTR#accession -> accessions.first
=== GN line (Gene name(s))
! --- Bio::SPTR#gn -> [ary, ...]
! --- Bio::SPTR#gene_name -> gn[0][0]
=== DT lines (Date)
--- 795,806 ----
--- Bio::SPTR#accessions -> ary
--- Bio::SPTR#accession -> accessions.first
+
=== GN line (Gene name(s))
! --- Bio::SPTR#gn -> [ary, ...] or [{:name => str, :synonyms => [], :loci => [], :orfs => []}]
! --- Bio::SPTR#gene_name -> str
! --- Bio::SPTR#gene_names -> [str] or [str]
!
=== DT lines (Date)
***************
*** 708,715 ****
--- 811,828 ----
key := (created|annotation|sequence)
+
=== DE lines (Description)
--- Bio::SPTR#de -> str
#definition -> str
+
+ --- Bio::SPTR#protein_name
+
+ Returns the proposed official name of the protein
+
+
+ --- Bio::SPTR#synonyms
+
+ Returns an array of synonyms (unofficial names)
=== KW lines (Keyword)
More information about the bioruby-cvs
mailing list