From ngoto at dev.open-bio.org Mon Jun 2 05:33:50 2008
From: ngoto at dev.open-bio.org (Naohisa Goto)
Date: Mon, 02 Jun 2008 09:33:50 +0000
Subject: [BioRuby-cvs] bioruby/lib/bio reference.rb,1.26,1.27
Message-ID: <200806020933.m529Xoou025921@dev.open-bio.org>
Update of /home/repository/bioruby/bioruby/lib/bio
In directory dev.open-bio.org:/tmp/cvs-serv25887
Modified Files:
reference.rb
Log Message:
reverted to 1.24, because of potential security problem about "eval" in
bibtex method.
Index: reference.rb
===================================================================
RCS file: /home/repository/bioruby/bioruby/lib/bio/reference.rb,v
retrieving revision 1.26
retrieving revision 1.27
diff -C2 -d -r1.26 -r1.27
*** reference.rb 31 May 2008 09:36:55 -0000 1.26
--- reference.rb 2 Jun 2008 09:33:48 -0000 1.27
***************
*** 71,74 ****
--- 71,77 ----
attr_reader :abstract
+ # An URL String.
+ attr_reader :url
+
# MeSH terms in an Array.
attr_reader :mesh
***************
*** 77,83 ****
attr_reader :affiliations
- # An URL String.
- attr_reader :url
-
# Create a new Bio::Reference object from a Hash of values.
# Data is extracted from the values for keys:
--- 80,83 ----
***************
*** 232,236 ****
lines << "%P #{@pages}" unless @pages.empty?
lines << "%M #{@pubmed}" unless @pubmed.to_s.empty?
! lines << "%U #{url}" unless url.empty?
lines << "%X #{@abstract}" unless @abstract.empty?
@mesh.each do |term|
--- 232,241 ----
lines << "%P #{@pages}" unless @pages.empty?
lines << "%M #{@pubmed}" unless @pubmed.to_s.empty?
! if @pubmed
! cgi = "http://www.ncbi.nlm.nih.gov/entrez/query.fcgi"
! opts = "cmd=Retrieve&db=PubMed&dopt=Citation&list_uids"
! @url = "#{cgi}?#{opts}=#{@pubmed}"
! end
! lines << "%U #{@url}" unless @url.empty?
lines << "%X #{@abstract}" unless @abstract.empty?
@mesh.each do |term|
***************
*** 294,321 ****
# *Arguments*:
# * (optional) _section_: BiBTeX section as String
- # * (optional) _keywords_: Array of additional keywords, e.g. ['abstract']
# *Returns*:: String
! def bibtex(section = nil, add_keywords = [])
section = "article" unless section
authors = authors_join(' and ', ' and ')
pages = @pages.sub('-', '--')
! keywords = "author title journal year volume number pages url".split(/ /)
! bib = "@#{section}{PMID:#{@pubmed},\n"
! (keywords+add_keywords).each do | kw |
! if kw == 'author'
! ref = authors
! elsif kw == 'title'
! # strip final dot from title
! ref = @title.sub(/\.$/,'')
! elsif kw == 'number'
! ref = @issue
! elsif kw == 'url'
! ref = url
! else
! ref = eval('@'+kw)
! end
! bib += " #{kw.ljust(12)} = {#{ref}},\n" if ref != ''
! end
! bib+"}\n"
end
--- 299,318 ----
# *Arguments*:
# * (optional) _section_: BiBTeX section as String
# *Returns*:: String
! def bibtex(section = nil)
section = "article" unless section
authors = authors_join(' and ', ' and ')
pages = @pages.sub('-', '--')
! return <<-"END".gsub(/\t/, '')
! @#{section}{PMID:#{@pubmed},
! author = {#{authors}},
! title = {#{@title}},
! journal = {#{@journal}},
! year = {#{@year}},
! volume = {#{@volume}},
! number = {#{@issue}},
! pages = {#{pages}},
! }
! END
end
***************
*** 503,518 ****
end
- # Returns a valid URL for pubmed records
- #
- # *Returns*:: String
- def url
- return @url if @url and @url != ''
- if @pubmed != ''
- cgi = "http://www.ncbi.nlm.nih.gov/entrez/query.fcgi"
- opts = "cmd=Retrieve&db=PubMed&dopt=Citation&list_uids"
- return "#{cgi}?#{opts}=#{@pubmed}"
- end
- ''
- end
private
--- 500,503 ----
***************
*** 542,546 ****
end
-
end
--- 527,530 ----
From ngoto at dev.open-bio.org Mon Jun 2 05:47:11 2008
From: ngoto at dev.open-bio.org (Naohisa Goto)
Date: Mon, 02 Jun 2008 09:47:11 +0000
Subject: [BioRuby-cvs] bioruby/lib/bio reference.rb,1.27,1.28
Message-ID: <200806020947.m529lBCN026079@dev.open-bio.org>
Update of /home/repository/bioruby/bioruby/lib/bio
In directory dev.open-bio.org:/tmp/cvs-serv26058/lib/bio
Modified Files:
reference.rb
Log Message:
* New method Bio::Reference#pubmed_url added (renamed the url method in
revision 1.25).
* Bio::Reference#endnote is changed not to overwrite url if url is
already given by user.
Index: reference.rb
===================================================================
RCS file: /home/repository/bioruby/bioruby/lib/bio/reference.rb,v
retrieving revision 1.27
retrieving revision 1.28
diff -C2 -d -r1.27 -r1.28
*** reference.rb 2 Jun 2008 09:33:48 -0000 1.27
--- reference.rb 2 Jun 2008 09:47:08 -0000 1.28
***************
*** 232,241 ****
lines << "%P #{@pages}" unless @pages.empty?
lines << "%M #{@pubmed}" unless @pubmed.to_s.empty?
! if @pubmed
! cgi = "http://www.ncbi.nlm.nih.gov/entrez/query.fcgi"
! opts = "cmd=Retrieve&db=PubMed&dopt=Citation&list_uids"
! @url = "#{cgi}?#{opts}=#{@pubmed}"
! end
! lines << "%U #{@url}" unless @url.empty?
lines << "%X #{@abstract}" unless @abstract.empty?
@mesh.each do |term|
--- 232,237 ----
lines << "%P #{@pages}" unless @pages.empty?
lines << "%M #{@pubmed}" unless @pubmed.to_s.empty?
! url = @url.empty? ? pubmed_url : @url
! lines << "%U #{url}" unless url.empty?
lines << "%X #{@abstract}" unless @abstract.empty?
@mesh.each do |term|
***************
*** 500,503 ****
--- 496,510 ----
end
+ # Returns a valid URL for pubmed records
+ #
+ # *Returns*:: String
+ def pubmed_url
+ unless @pubmed.to_s.empty?
+ cgi = "http://www.ncbi.nlm.nih.gov/entrez/query.fcgi"
+ opts = "cmd=Retrieve&db=PubMed&dopt=Citation&list_uids"
+ return "#{cgi}?#{opts}=#{@pubmed}"
+ end
+ ''
+ end
private
From ngoto at dev.open-bio.org Wed Jun 4 10:56:40 2008
From: ngoto at dev.open-bio.org (Naohisa Goto)
Date: Wed, 04 Jun 2008 14:56:40 +0000
Subject: [BioRuby-cvs] bioruby/lib/bio reference.rb,1.28,1.29
Message-ID: <200806041456.m54Eue8E001532@dev.open-bio.org>
Update of /home/repository/bioruby/bioruby/lib/bio
In directory dev.open-bio.org:/tmp/cvs-serv1512/lib/bio
Modified Files:
reference.rb
Log Message:
improvement of Bio::Reference#bibtex method
Index: reference.rb
===================================================================
RCS file: /home/repository/bioruby/bioruby/lib/bio/reference.rb,v
retrieving revision 1.28
retrieving revision 1.29
diff -C2 -d -r1.28 -r1.29
*** reference.rb 2 Jun 2008 09:47:08 -0000 1.28
--- reference.rb 4 Jun 2008 14:56:37 -0000 1.29
***************
*** 167,184 ****
# *Arguments*:
# * (optional) _style_: String with style identifier
! # * (optional) _option_: Option for styles accepting one
# *Returns*:: String
! def format(style = nil, option = nil)
case style
when 'endnote'
return endnote
when 'bibitem'
! return bibitem(option)
when 'bibtex'
! return bibtex(option)
when 'rd'
! return rd(option)
when /^nature$/i
! return nature(option)
when /^science$/i
return science
--- 167,184 ----
# *Arguments*:
# * (optional) _style_: String with style identifier
! # * (optional) _options_: Options for styles accepting one
# *Returns*:: String
! def format(style = nil, *options)
case style
when 'endnote'
return endnote
when 'bibitem'
! return bibitem(*options)
when 'bibtex'
! return bibtex(*options)
when 'rd'
! return rd(*options)
when /^nature$/i
! return nature(*options)
when /^science$/i
return science
***************
*** 295,314 ****
# *Arguments*:
# * (optional) _section_: BiBTeX section as String
# *Returns*:: String
! def bibtex(section = nil)
section = "article" unless section
authors = authors_join(' and ', ' and ')
! pages = @pages.sub('-', '--')
! return <<-"END".gsub(/\t/, '')
! @#{section}{PMID:#{@pubmed},
! author = {#{authors}},
! title = {#{@title}},
! journal = {#{@journal}},
! year = {#{@year}},
! volume = {#{@volume}},
! number = {#{@issue}},
! pages = {#{pages}},
! }
! END
end
--- 295,340 ----
# *Arguments*:
# * (optional) _section_: BiBTeX section as String
+ # * (optional) _label_: Label string cited by LaTeX documents.
+ # Default is "PMID:#{pubmed}".
+ # * (optional) _keywords_: Hash of additional keywords,
+ # e.g. { 'abstract' => 'This is abstract.' }.
+ # You can also override default keywords.
+ # To disable default keywords, specify false as
+ # value, e.g. { 'url' => false, 'year' => false }.
# *Returns*:: String
! def bibtex(section = nil, label = nil, keywords = {})
section = "article" unless section
authors = authors_join(' and ', ' and ')
! thepages = pages.to_s.empty? ? nil : pages.sub(/\-/, '--')
! unless label then
! label = "PMID:#{pubmed}"
! end
! theurl = if !(url.to_s.empty?) then
! url
! elsif pmurl = pubmed_url and !(pmurl.to_s.empty?) then
! pmurl
! else
! nil
! end
! hash = {
! 'author' => authors.empty? ? nil : authors,
! 'title' => title.to_s.empty? ? nil : title,
! 'number' => issue.to_s.empty? ? nil : issue,
! 'pages' => thepages,
! 'url' => theurl
! }
! keys = %w( author title journal year volume number pages url )
! keys.each do |k|
! hash[k] = self.__send__(k.intern) unless hash.has_key?(k)
! end
! hash.merge!(keywords) { |k, v1, v2| v2.nil? ? v1 : v2 }
! bib = [ "@#{section}{#{label}," ]
! keys.concat((hash.keys - keys).sort)
! keys.each do |kw|
! ref = hash[kw]
! bib.push " #{kw.ljust(12)} = {#{ref}}," if ref
! end
! bib.push "}\n"
! return bib.join("\n")
end
From ngoto at dev.open-bio.org Wed Jun 4 10:58:10 2008
From: ngoto at dev.open-bio.org (Naohisa Goto)
Date: Wed, 04 Jun 2008 14:58:10 +0000
Subject: [BioRuby-cvs] bioruby/test/unit/bio test_reference.rb,1.4,1.5
Message-ID: <200806041458.m54EwAo2001581@dev.open-bio.org>
Update of /home/repository/bioruby/bioruby/test/unit/bio
In directory dev.open-bio.org:/tmp/cvs-serv1561/test/unit/bio
Modified Files:
test_reference.rb
Log Message:
test changed due to the improvement of Bio::Reference#bibtex
Index: test_reference.rb
===================================================================
RCS file: /home/repository/bioruby/bioruby/test/unit/bio/test_reference.rb,v
retrieving revision 1.4
retrieving revision 1.5
diff -C2 -d -r1.4 -r1.5
*** test_reference.rb 31 May 2008 09:36:56 -0000 1.4
--- test_reference.rb 4 Jun 2008 14:58:08 -0000 1.5
***************
*** 103,112 ****
def test_format_bibtex
! str = "@article{PMID:12345678,\n author = {Hoge, J.P. and Fuga, F.B.},\n title = {Title of the study},\n journal = {Theor. J. Hoge},\n year = {2001},\n volume = {12},\n number = {3},\n pages = {123-145},\n url = {http://example.com},\n}\n"
!
assert_equal(str, @obj.format('bibtex'))
assert_equal(str, @obj.bibtex)
end
def test_format_rd
str = "== Title of the study.\n\n* Hoge, J.P. and Fuga, F.B.\n\n* Theor. J. Hoge 2001 12:123-145 [PMID:12345678]\n\nHoge fuga. hoge fuga."
--- 103,147 ----
def test_format_bibtex
! str =<<__END__
! @article{PMID:12345678,
! author = {Hoge, J.P. and Fuga, F.B.},
! title = {Title of the study.},
! journal = {Theor. J. Hoge},
! year = {2001},
! volume = {12},
! number = {3},
! pages = {123--145},
! url = {http://example.com},
! }
! __END__
assert_equal(str, @obj.format('bibtex'))
assert_equal(str, @obj.bibtex)
end
+ def test_format_bibtex_with_arguments
+ str =<<__END__
+ @inproceedings{YourArticle,
+ author = {Hoge, J.P. and Fuga, F.B.},
+ title = {Title of the study.},
+ year = {2001},
+ volume = {12},
+ number = {3},
+ pages = {123--145},
+ booktitle = {Theor. J. Hoge},
+ month = {December},
+ }
+ __END__
+ assert_equal(str, @obj.format('bibtex', 'inproceedings', 'YourArticle',
+ { 'journal' => false,
+ 'url' => false,
+ 'booktitle' => @obj.journal,
+ 'month' => 'December'}))
+ assert_equal(str, @obj.bibtex('inproceedings', 'YourArticle',
+ { 'journal' => false,
+ 'url' => false,
+ 'booktitle' => @obj.journal,
+ 'month' => 'December'}))
+ end
+
def test_format_rd
str = "== Title of the study.\n\n* Hoge, J.P. and Fuga, F.B.\n\n* Theor. J. Hoge 2001 12:123-145 [PMID:12345678]\n\nHoge fuga. hoge fuga."
From ngoto at dev.open-bio.org Fri Jun 13 07:20:26 2008
From: ngoto at dev.open-bio.org (Naohisa Goto)
Date: Fri, 13 Jun 2008 11:20:26 +0000
Subject: [BioRuby-cvs] bioruby/lib/bio reference.rb,1.29,1.30
Message-ID: <200806131120.m5DBKQLQ004888@dev.open-bio.org>
Update of /home/repository/bioruby/bioruby/lib/bio
In directory dev.open-bio.org:/tmp/cvs-serv4830/lib/bio
Modified Files:
reference.rb
Log Message:
modified RDoc for Bio::Reference#bibitem
Index: reference.rb
===================================================================
RCS file: /home/repository/bioruby/bioruby/lib/bio/reference.rb,v
retrieving revision 1.29
retrieving revision 1.30
diff -C2 -d -r1.29 -r1.30
*** reference.rb 4 Jun 2008 14:56:37 -0000 1.29
--- reference.rb 13 Jun 2008 11:20:23 -0000 1.30
***************
*** 252,255 ****
--- 252,257 ----
# {\em Theor. J. Hoge}, 12(3):123--145, 2001.
# ---
+ # *Arguments*:
+ # * (optional) _item_: label string (default: "PMID:#{pubmed}").
# *Returns*:: String
def bibitem(item = nil)
From ngoto at dev.open-bio.org Fri Jun 13 07:37:27 2008
From: ngoto at dev.open-bio.org (Naohisa Goto)
Date: Fri, 13 Jun 2008 11:37:27 +0000
Subject: [BioRuby-cvs]
bioruby/test/unit/bio/util/restriction_enzyme/double_stranded
test_aligned_strands.rb, 1.3, 1.4
Message-ID: <200806131137.m5DBbRnA005201@dev.open-bio.org>
Update of /home/repository/bioruby/bioruby/test/unit/bio/util/restriction_enzyme/double_stranded
In directory dev.open-bio.org:/tmp/cvs-serv5181/test/unit/bio/util/restriction_enzyme/double_stranded
Modified Files:
test_aligned_strands.rb
Log Message:
"require 'bio/sequence'" is needed to run the tests in this file.
Index: test_aligned_strands.rb
===================================================================
RCS file: /home/repository/bioruby/bioruby/test/unit/bio/util/restriction_enzyme/double_stranded/test_aligned_strands.rb,v
retrieving revision 1.3
retrieving revision 1.4
diff -C2 -d -r1.3 -r1.4
*** test_aligned_strands.rb 5 Apr 2007 23:35:44 -0000 1.3
--- test_aligned_strands.rb 13 Jun 2008 11:37:25 -0000 1.4
***************
*** 14,17 ****
--- 14,18 ----
require 'test/unit'
+ require 'bio/sequence'
require 'bio/util/restriction_enzyme/double_stranded/aligned_strands'
require 'bio/util/restriction_enzyme/double_stranded'
From ngoto at dev.open-bio.org Fri Jun 13 07:39:41 2008
From: ngoto at dev.open-bio.org (Naohisa Goto)
Date: Fri, 13 Jun 2008 11:39:41 +0000
Subject: [BioRuby-cvs]
bioruby/test/unit/bio/util/restriction_enzyme/double_stranded
test_aligned_strands.rb, 1.3, 1.3.2.1
Message-ID: <200806131139.m5DBdfXW005450@dev.open-bio.org>
Update of /home/repository/bioruby/bioruby/test/unit/bio/util/restriction_enzyme/double_stranded
In directory dev.open-bio.org:/tmp/cvs-serv5209/test/unit/bio/util/restriction_enzyme/double_stranded
Modified Files:
Tag: BRANCH-biohackathon2008
test_aligned_strands.rb
Log Message:
merged change from rev. 1.3 to 1.4 in the CVS trunk
Index: test_aligned_strands.rb
===================================================================
RCS file: /home/repository/bioruby/bioruby/test/unit/bio/util/restriction_enzyme/double_stranded/test_aligned_strands.rb,v
retrieving revision 1.3
retrieving revision 1.3.2.1
diff -C2 -d -r1.3 -r1.3.2.1
*** test_aligned_strands.rb 5 Apr 2007 23:35:44 -0000 1.3
--- test_aligned_strands.rb 13 Jun 2008 11:39:39 -0000 1.3.2.1
***************
*** 14,17 ****
--- 14,18 ----
require 'test/unit'
+ require 'bio/sequence'
require 'bio/util/restriction_enzyme/double_stranded/aligned_strands'
require 'bio/util/restriction_enzyme/double_stranded'
From ngoto at dev.open-bio.org Tue Jun 17 08:23:52 2008
From: ngoto at dev.open-bio.org (Naohisa Goto)
Date: Tue, 17 Jun 2008 12:23:52 +0000
Subject: [BioRuby-cvs] bioruby/lib/bio reference.rb,1.24.2.6,1.24.2.7
Message-ID: <200806171223.m5HCNqfC020085@dev.open-bio.org>
Update of /home/repository/bioruby/bioruby/lib/bio
In directory dev.open-bio.org:/tmp/cvs-serv20065/lib/bio
Modified Files:
Tag: BRANCH-biohackathon2008
reference.rb
Log Message:
merged changes in trunk (revision 1.30)
Index: reference.rb
===================================================================
RCS file: /home/repository/bioruby/bioruby/lib/bio/reference.rb,v
retrieving revision 1.24.2.6
retrieving revision 1.24.2.7
diff -C2 -d -r1.24.2.6 -r1.24.2.7
*** reference.rb 23 Apr 2008 18:52:18 -0000 1.24.2.6
--- reference.rb 17 Jun 2008 12:23:49 -0000 1.24.2.7
***************
*** 180,186 ****
# *Arguments*:
# * (optional) _style_: String with style identifier
! # * (optional) _option_: Option for styles accepting one
# *Returns*:: String
! def format(style = nil, option = nil)
case style
when 'embl'
--- 180,186 ----
# *Arguments*:
# * (optional) _style_: String with style identifier
! # * (optional) _options_: Options for styles accepting one
# *Returns*:: String
! def format(style = nil, *options)
case style
when 'embl'
***************
*** 189,199 ****
return endnote
when 'bibitem'
! return bibitem(option)
when 'bibtex'
! return bibtex(option)
when 'rd'
! return rd(option)
when /^nature$/i
! return nature(option)
when /^science$/i
return science
--- 189,199 ----
return endnote
when 'bibitem'
! return bibitem(*options)
when 'bibtex'
! return bibtex(*options)
when 'rd'
! return rd(*options)
when /^nature$/i
! return nature(*options)
when /^science$/i
return science
***************
*** 247,256 ****
lines << "%P #{@pages}" unless @pages.empty?
lines << "%M #{@pubmed}" unless @pubmed.to_s.empty?
! if @pubmed
! cgi = "http://www.ncbi.nlm.nih.gov/entrez/query.fcgi"
! opts = "cmd=Retrieve&db=PubMed&dopt=Citation&list_uids"
! @url = "#{cgi}?#{opts}=#{@pubmed}"
! end
! lines << "%U #{@url}" unless @url.empty?
lines << "%X #{@abstract}" unless @abstract.empty?
@mesh.each do |term|
--- 247,252 ----
lines << "%P #{@pages}" unless @pages.empty?
lines << "%M #{@pubmed}" unless @pubmed.to_s.empty?
! u = @url.empty? ? pubmed_url : @url
! lines << "%U #{u}" unless u.empty?
lines << "%X #{@abstract}" unless @abstract.empty?
@mesh.each do |term|
***************
*** 289,292 ****
--- 285,290 ----
# {\em Theor. J. Hoge}, 12(3):123--145, 2001.
# ---
+ # *Arguments*:
+ # * (optional) _item_: label string (default: "PMID:#{pubmed}").
# *Returns*:: String
def bibitem(item = nil)
***************
*** 332,351 ****
# *Arguments*:
# * (optional) _section_: BiBTeX section as String
# *Returns*:: String
! def bibtex(section = nil)
section = "article" unless section
authors = authors_join(' and ', ' and ')
! pages = @pages.sub('-', '--')
! return <<-"END".gsub(/\t/, '')
! @#{section}{PMID:#{@pubmed},
! author = {#{authors}},
! title = {#{@title}},
! journal = {#{@journal}},
! year = {#{@year}},
! volume = {#{@volume}},
! number = {#{@issue}},
! pages = {#{pages}},
! }
! END
end
--- 330,375 ----
# *Arguments*:
# * (optional) _section_: BiBTeX section as String
+ # * (optional) _label_: Label string cited by LaTeX documents.
+ # Default is "PMID:#{pubmed}".
+ # * (optional) _keywords_: Hash of additional keywords,
+ # e.g. { 'abstract' => 'This is abstract.' }.
+ # You can also override default keywords.
+ # To disable default keywords, specify false as
+ # value, e.g. { 'url' => false, 'year' => false }.
# *Returns*:: String
! def bibtex(section = nil, label = nil, keywords = {})
section = "article" unless section
authors = authors_join(' and ', ' and ')
! thepages = pages.to_s.empty? ? nil : pages.sub(/\-/, '--')
! unless label then
! label = "PMID:#{pubmed}"
! end
! theurl = if !(url.to_s.empty?) then
! url
! elsif pmurl = pubmed_url and !(pmurl.to_s.empty?) then
! pmurl
! else
! nil
! end
! hash = {
! 'author' => authors.empty? ? nil : authors,
! 'title' => title.to_s.empty? ? nil : title,
! 'number' => issue.to_s.empty? ? nil : issue,
! 'pages' => thepages,
! 'url' => theurl
! }
! keys = %w( author title journal year volume number pages url )
! keys.each do |k|
! hash[k] = self.__send__(k.intern) unless hash.has_key?(k)
! end
! hash.merge!(keywords) { |k, v1, v2| v2.nil? ? v1 : v2 }
! bib = [ "@#{section}{#{label}," ]
! keys.concat((hash.keys - keys).sort)
! keys.each do |kw|
! ref = hash[kw]
! bib.push " #{kw.ljust(12)} = {#{ref}}," if ref
! end
! bib.push "}\n"
! return bib.join("\n")
end
***************
*** 533,536 ****
--- 557,571 ----
end
+ # Returns a valid URL for pubmed records
+ #
+ # *Returns*:: String
+ def pubmed_url
+ unless @pubmed.to_s.empty?
+ cgi = "http://www.ncbi.nlm.nih.gov/entrez/query.fcgi"
+ opts = "cmd=Retrieve&db=PubMed&dopt=Citation&list_uids"
+ return "#{cgi}?#{opts}=#{@pubmed}"
+ end
+ ''
+ end
private
From ngoto at dev.open-bio.org Tue Jun 17 08:24:44 2008
From: ngoto at dev.open-bio.org (Naohisa Goto)
Date: Tue, 17 Jun 2008 12:24:44 +0000
Subject: [BioRuby-cvs] bioruby/test/unit/bio test_reference.rb, 1.3.2.1,
1.3.2.2
Message-ID: <200806171224.m5HCOiAk020113@dev.open-bio.org>
Update of /home/repository/bioruby/bioruby/test/unit/bio
In directory dev.open-bio.org:/tmp/cvs-serv20093/test/unit/bio
Modified Files:
Tag: BRANCH-biohackathon2008
test_reference.rb
Log Message:
merged changes from trunk (revision 1.5)
Index: test_reference.rb
===================================================================
RCS file: /home/repository/bioruby/bioruby/test/unit/bio/test_reference.rb,v
retrieving revision 1.3.2.1
retrieving revision 1.3.2.2
diff -C2 -d -r1.3.2.1 -r1.3.2.2
*** test_reference.rb 8 May 2008 05:38:01 -0000 1.3.2.1
--- test_reference.rb 17 Jun 2008 12:24:41 -0000 1.3.2.2
***************
*** 92,96 ****
def test_format_endnote
! str = "%0 Journal Article\n%A Hoge, J.P.\n%A Fuga, F.B.\n%D 2001\n%T Title of the study.\n%J Theor. J. Hoge\n%V 12\n%N 3\n%P 123-145\n%M 12345678\n%U http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?cmd=Retrieve&db=PubMed&dopt=Citation&list_uids=12345678\n%X Hoge fuga. hoge fuga.\n%K Hoge\n%+ Tokyo"
assert_equal(str, @obj.format('endnote'))
assert_equal(str, @obj.endnote)
--- 92,96 ----
def test_format_endnote
! str = "%0 Journal Article\n%A Hoge, J.P.\n%A Fuga, F.B.\n%D 2001\n%T Title of the study.\n%J Theor. J. Hoge\n%V 12\n%N 3\n%P 123-145\n%M 12345678\n%U http://example.com\n%X Hoge fuga. hoge fuga.\n%K Hoge\n%+ Tokyo"
assert_equal(str, @obj.format('endnote'))
assert_equal(str, @obj.endnote)
***************
*** 104,122 ****
def test_format_bibtex
! str =< false,
+ 'url' => false,
+ 'booktitle' => @obj.journal,
+ 'month' => 'December'}))
+ assert_equal(str, @obj.bibtex('inproceedings', 'YourArticle',
+ { 'journal' => false,
+ 'url' => false,
+ 'booktitle' => @obj.journal,
+ 'month' => 'December'}))
+ end
+
def test_format_rd
str = "== Title of the study.\n\n* Hoge, J.P. and Fuga, F.B.\n\n* Theor. J. Hoge 2001 12:123-145 [PMID:12345678]\n\nHoge fuga. hoge fuga."
From ngoto at dev.open-bio.org Tue Jun 17 11:25:24 2008
From: ngoto at dev.open-bio.org (Naohisa Goto)
Date: Tue, 17 Jun 2008 15:25:24 +0000
Subject: [BioRuby-cvs] bioruby/lib/bio sequence.rb,0.58.2.11,0.58.2.12
Message-ID: <200806171525.m5HFPOpk020858@dev.open-bio.org>
Update of /home/repository/bioruby/bioruby/lib/bio
In directory dev.open-bio.org:/tmp/cvs-serv20823/lib/bio
Modified Files:
Tag: BRANCH-biohackathon2008
sequence.rb
Log Message:
* Some attributes are added: strandedness (strand information),
release_created, release_modified (release information),
entry_version (version of the entry numbered by database administrator),
organelle (organelle information), other_seqids (sequence IDs other than
accessions), and id_namespace (namespace of accessions).
Most of them are added because corresponding tags are defined in the
INSDSeq XML v1.4 ( http://www.insdc.org/files/documents/INSD_V1.4.dtd ).
The "id_namespace" will be used to output NCBI style fasta format.
* The "taxonomy" attribute is changed to be an alias of the "classification"
attribute.
* The "date" attribute is removed.
* RDoc documents of attributes are updated.
Index: sequence.rb
===================================================================
RCS file: /home/repository/bioruby/bioruby/lib/bio/sequence.rb,v
retrieving revision 0.58.2.11
retrieving revision 0.58.2.12
diff -C2 -d -r0.58.2.11 -r0.58.2.12
*** sequence.rb 24 Apr 2008 14:28:25 -0000 0.58.2.11
--- sequence.rb 17 Jun 2008 15:25:22 -0000 0.58.2.12
***************
*** 118,149 ****
end
! # The sequence identifier. For example, for a sequence
! # of Genbank origin, this is the accession number.
attr_accessor :entry_id
! # A String with a description of the sequence
attr_accessor :definition
! # An Array of Bio::Feature objects
attr_accessor :features
! # An Array of Bio::Reference objects
attr_accessor :references
! # A comment String
attr_accessor :comments
! # Date from sequence source. Often date of deposition.
! attr_accessor :date
!
! # An Array of Strings
attr_accessor :keywords
! # An Array of Strings; links to other database entries.
attr_accessor :dblinks
!
! # A taxonomy String
! attr_accessor :taxonomy
!
# Bio::Sequence::NA/AA
attr_accessor :moltype
--- 118,145 ----
end
! # The sequence identifier (String). For example, for a sequence
! # of Genbank origin, this is the locus name.
! # For a sequence of EMBL origin, this is the primary accession number.
attr_accessor :entry_id
! # A String with a description of the sequence (String)
attr_accessor :definition
! # Features (An Array of Bio::Feature objects)
attr_accessor :features
! # References (An Array of Bio::Reference objects)
attr_accessor :references
! # Comments (String or an Array of String)
attr_accessor :comments
! # Keywords (An Array of String)
attr_accessor :keywords
! # Links to other database entries.
! # (An Array of Bio::Sequence::DBLink objects)
attr_accessor :dblinks
!
# Bio::Sequence::NA/AA
attr_accessor :moltype
***************
*** 157,166 ****
#+++
! # Version number of the sequence (String).
attr_accessor :sequence_version
! # Topology (String). "circular" or "linear".
attr_accessor :topology
# molecular type (String). "DNA" or "RNA" for nucleotide sequence.
attr_accessor :molecule_type
--- 153,170 ----
#+++
! # Version number of the sequence (String or Integer).
! # Unlike entry_version, sequence_version will be changed
! # when the submitter of the sequence updates the entry.
! # Normally, the same entry taken from different databases (EMBL, GenBank,
! # and DDBJ) may have the same sequence_version.
attr_accessor :sequence_version
! # Topology (String). "circular", "linear", or nil.
attr_accessor :topology
+ # Strandedness (String). "single" (single-stranded),
+ # "double" (double-stranded), "mixed" (mixed-stranded), or nil.
+ attr_accessor :strandedness
+
# molecular type (String). "DNA" or "RNA" for nucleotide sequence.
attr_accessor :molecule_type
***************
*** 180,189 ****
attr_accessor :secondary_accessions
! # Created date of the sequence entry (String)
attr_accessor :date_created
! # Last modified date of the sequence entry (String)
attr_accessor :date_modified
# Organism species (String). For example, "Escherichia coli".
attr_accessor :species
--- 184,208 ----
attr_accessor :secondary_accessions
! # Created date of the sequence entry (Date, DateTime, Time, or String)
attr_accessor :date_created
! # Last modified date of the sequence entry (Date, DateTime, Time, or String)
attr_accessor :date_modified
+ # Release information when created (String)
+ attr_accessor :release_created
+
+ # Release information when last-modified (String)
+ attr_accessor :release_modified
+
+ # Version of the entry (String or Integer).
+ # Unlike sequence_version, entry_version is a database
+ # maintainer's internal version number.
+ # The version number will be changed when the database maintainer
+ # modifies the entry.
+ # The same entry in EMBL, GenBank, and DDBJ may have different
+ # entry_version.
+ attr_accessor :entry_version
+
# Organism species (String). For example, "Escherichia coli".
attr_accessor :species
***************
*** 192,195 ****
--- 211,231 ----
# (Array of String)
attr_accessor :classification
+ alias taxonomy classification
+
+ # (not well supported) Organelle information (String).
+ attr_accessor :organelle
+
+ # Namespace of the sequence IDs described in entry_id, primary_accession,
+ # and secondary_accessions methods (String).
+ # For example, 'EMBL', 'GenBank', 'DDBJ', 'RefSeq'.
+ attr_accessor :id_namespace
+
+ # Sequence identifiers which are not described in entry_id,
+ # primary_accession,and secondary_accessions methods
+ # (Array of Bio::Sequence::DBLink objects).
+ # For example, NCBI GI number can be stored.
+ # Note that only identifiers of the entry itself should be stored.
+ # For database cross references, dblinks should be used.
+ attr_accessor :other_seqids
# Guess the type of sequence, Amino Acid or Nucleic Acid, and create a
From ngoto at dev.open-bio.org Tue Jun 17 11:44:24 2008
From: ngoto at dev.open-bio.org (Naohisa Goto)
Date: Tue, 17 Jun 2008 15:44:24 +0000
Subject: [BioRuby-cvs] bioruby/test/unit/bio/sequence test_dblink.rb, NONE,
1.1.2.1
Message-ID: <200806171544.m5HFiOIl021028@dev.open-bio.org>
Update of /home/repository/bioruby/bioruby/test/unit/bio/sequence
In directory dev.open-bio.org:/tmp/cvs-serv21001/test/unit/bio/sequence
Added Files:
Tag: BRANCH-biohackathon2008
test_dblink.rb
Log Message:
New class Bio::Sequence::DBLink is added to store IDs and database names
together in an object.
--- NEW FILE: test_dblink.rb ---
#
# test/unit/bio/sequence/test_dblink.rb - Unit test for Bio::Sequence::DBLink
#
# Copyright:: Copyright (C) 2008 Naohisa Goto
# License:: The Ruby License
#
# $Id: test_dblink.rb,v 1.1.2.1 2008/06/17 15:44:22 ngoto Exp $
#
require 'pathname'
libpath = Pathname.new(File.join(File.dirname(__FILE__), ['..'] * 4, 'lib')).cleanpath.to_s
$:.unshift(libpath) unless $:.include?(libpath)
require 'test/unit'
require 'bio/sequence'
require 'bio/sequence/dblink'
module Bio

  # Exercises the attribute readers of a Bio::Sequence::DBLink that was
  # built directly through the constructor.
  class TestSequenceDBLink < Test::Unit::TestCase

    # Builds one link with a database name, a primary ID and three
    # secondary IDs; every test below reads from this object.
    def setup
      extra_ids = [ 'CAA78466.1', '-', 'mRNA' ]
      @link = Bio::Sequence::DBLink.new('EMBL', 'Z14088', *extra_ids)
    end

    # The first constructor argument is reported as the database name.
    def test_database
      assert_equal('EMBL', @link.database)
    end

    # The second constructor argument is reported as the primary ID.
    def test_id
      assert_equal('Z14088', @link.id)
    end

    # All remaining constructor arguments are reported, in order, as the
    # secondary IDs.
    def test_secondary_ids
      expected = [ 'CAA78466.1', '-', 'mRNA' ]
      assert_equal(expected, @link.secondary_ids)
    end

  end #class

  # Exercises the class-level DR line parsers of Bio::Sequence::DBLink.
  class TestSequenceDBLinkClassMethods < Test::Unit::TestCase

    # An EMBL DR line yields database, primary ID and one secondary ID.
    def test_parse_embl_DR_line
      dr_line = 'DR EPD; EP07077; HS_HBG1.'
      link = Bio::Sequence::DBLink.parse_embl_DR_line(dr_line)

      assert_equal('EPD', link.database)
      assert_equal('EP07077', link.id)
      assert_equal([ 'HS_HBG1' ], link.secondary_ids)
    end

    # A UniProt DR line yields database, primary ID and every further
    # field as a separate secondary ID.
    def test_parse_uniprot_DR_line
      dr_line = 'DR EMBL; Z14088; CAA78466.1; -; mRNA.'
      link = Bio::Sequence::DBLink.parse_uniprot_DR_line(dr_line)

      assert_equal('EMBL', link.database)
      assert_equal('Z14088', link.id)
      assert_equal([ 'CAA78466.1', '-', 'mRNA' ],
                   link.secondary_ids)
    end

  end #class

end #module Bio
From ngoto at dev.open-bio.org Tue Jun 17 11:44:24 2008
From: ngoto at dev.open-bio.org (Naohisa Goto)
Date: Tue, 17 Jun 2008 15:44:24 +0000
Subject: [BioRuby-cvs] bioruby/lib/bio/sequence dblink.rb,NONE,1.1.2.1
Message-ID: <200806171544.m5HFiOF6021023@dev.open-bio.org>
Update of /home/repository/bioruby/bioruby/lib/bio/sequence
In directory dev.open-bio.org:/tmp/cvs-serv21001/lib/bio/sequence
Added Files:
Tag: BRANCH-biohackathon2008
dblink.rb
Log Message:
New class Bio::Sequence::DBLink is added to store IDs and database names
together in an object.
--- NEW FILE: dblink.rb ---
#
# = bio/sequence/dblink.rb - sequence ID with database name
#
# Copyright:: Copyright (C) 2008
# Naohisa Goto
# License:: The Ruby License
#
# $Id: dblink.rb,v 1.1.2.1 2008/06/17 15:44:22 ngoto Exp $
#
require 'bio/sequence'
# Bio::Sequence::DBLink stores IDs with the database name.
# Its main purpose is to store database cross-reference information
# for a sequence entry.
class Bio::Sequence::DBLink

  # Creates a new DBLink object.
  # ---
  # *Arguments*:
  # * _database_: database name, or namespace identifier
  # * _primary_id_: primary identifier
  # * (optional) _secondary_ids_: any number of secondary identifiers
  # *Returns*:: Bio::Sequence::DBLink object
  def initialize(database, primary_id, *secondary_ids)
    @database      = database
    @id            = primary_id
    @secondary_ids = secondary_ids
  end

  # Database name, or namespace identifier (String).
  attr_reader :database

  # Primary identifier (String)
  attr_reader :id

  # Secondary identifiers (Array of String)
  attr_reader :secondary_ids

  #--
  # class methods
  #++

  # Parses DR line in EMBL entry, and returns a DBLink object.
  #
  # The trailing period and the leading "DR" line code (together with
  # all blanks that follow it) are removed.  The rest is split on
  # semicolons into at most three fields, so any text after the second
  # semicolon is kept as one secondary identifier.
  # ---
  # *Arguments*:
  # * _str_: a single DR line as String (it is not modified)
  # *Returns*:: Bio::Sequence::DBLink object
  def self.parse_embl_DR_line(str)
    # String#sub returns a copy, so the in-place edit below never
    # touches the caller's string.
    str = str.sub(/\.\s*\z/, '')
    # Strip the line code and the whole run of blanks after it; flat
    # files pad the code with several blanks, not exactly one.
    str.sub!(/\ADR +/, '')
    self.new(*(str.split(/\s*\;\s*/, 3)))
  end

  # Parses DR line in UniProt entry, and returns a DBLink object.
  #
  # The trailing period and the leading "DR" line code (together with
  # all blanks that follow it) are removed.  The rest is split on every
  # semicolon: the first field is the database, the second the primary
  # identifier, and each remaining field a secondary identifier.
  # ---
  # *Arguments*:
  # * _str_: a single DR line as String (it is not modified)
  # *Returns*:: Bio::Sequence::DBLink object
  def self.parse_uniprot_DR_line(str)
    # String#sub returns a copy, so the in-place edit below never
    # touches the caller's string.
    str = str.sub(/\.\s*\z/, '')
    # Strip the line code and the whole run of blanks after it; flat
    # files pad the code with several blanks, not exactly one.
    str.sub!(/\ADR +/, '')
    self.new(*(str.split(/\s*\;\s*/)))
  end

end #class Bio::Sequence::DBLink
From ngoto at dev.open-bio.org Tue Jun 17 11:50:07 2008
From: ngoto at dev.open-bio.org (Naohisa Goto)
Date: Tue, 17 Jun 2008 15:50:07 +0000
Subject: [BioRuby-cvs] bioruby/lib/bio/sequence format.rb,1.4.2.7,1.4.2.8
Message-ID: <200806171550.m5HFo7Jm021095@dev.open-bio.org>
Update of /home/repository/bioruby/bioruby/lib/bio/sequence
In directory dev.open-bio.org:/tmp/cvs-serv21057/lib/bio/sequence
Modified Files:
Tag: BRANCH-biohackathon2008
format.rb
Log Message:
* In the wrap method, changed to recognize "\n" in given string.
* Some helper methods are added to help formatting date string.
Index: format.rb
===================================================================
RCS file: /home/repository/bioruby/bioruby/lib/bio/sequence/format.rb,v
retrieving revision 1.4.2.7
retrieving revision 1.4.2.8
diff -C2 -d -r1.4.2.7 -r1.4.2.8
*** format.rb 4 Mar 2008 11:10:28 -0000 1.4.2.7
--- format.rb 17 Jun 2008 15:50:05 -0000 1.4.2.8
***************
*** 285,305 ****
def wrap_and_split_lines(str, width)
result = []
! left = str.dup
! while left and left.length > width
! line = nil
! width.downto(1) do |i|
! if left[i..i] == ' ' or /[\,\;]/ =~ left[(i-1)..(i-1)] then
! line = left[0..(i-1)].sub(/ +\z/, '')
! left = left[i..-1].sub(/\A +/, '')
! break
end
end
! if line.nil? then
! line = left[0..(width-1)]
! left = left[width..-1]
! end
! result << line
end
- result << left if left and !(left.to_s.empty?)
return result
end
--- 285,309 ----
def wrap_and_split_lines(str, width)
result = []
! lefts = str.chomp.split(/(?:\r\n|\r|\n)/)
! lefts.each do |left|
! left.rstrip!
! while left and left.length > width
! line = nil
! width.downto(1) do |i|
! if left[i..i] == ' ' or /[\,\;]/ =~ left[(i-1)..(i-1)] then
! line = left[0..(i-1)].sub(/ +\z/, '')
! left = left[i..-1].sub(/\A +/, '')
! break
! end
end
+ if line.nil? then
+ line = left[0..(width-1)]
+ left = left[width..-1]
+ end
+ result << line
+ left = nil if left.to_s.empty?
end
! result << left if left
end
return result
end
***************
*** 320,323 ****
--- 324,352 ----
end
+ #--
+ # internal use only
+ MonthStr = [ nil,
+ 'JAN', 'FEB', 'MAR', 'APR', 'MAY', 'JUN',
+ 'JUL', 'AUG', 'SEP', 'OCT', 'NOV', 'DEC'
+ ].collect { |x| x.freeze }.freeze
+ #++
+
+ # formats a date from Date, DateTime, or Time object, or String.
+ def format_date(d)
+ begin
+ yy = d.year
+ mm = d.month
+ dd = d.day
+ rescue NoMethodError, NameError, ArgumentError, TypeError
+ return sprintf("%-11s", d)
+ end
+ sprintf("%02d-%-3s-%04d", dd, MonthStr[mm], yy)
+ end
+
+ # null date
+ def null_date
+ Date.new(0, 1, 1)
+ end
+
end #module INSDFeatureHelper
From ngoto at dev.open-bio.org Tue Jun 17 11:53:23 2008
From: ngoto at dev.open-bio.org (Naohisa Goto)
Date: Tue, 17 Jun 2008 15:53:23 +0000
Subject: [BioRuby-cvs] bioruby/lib/bio/db/genbank common.rb, 1.11.2.4,
1.11.2.5
Message-ID: <200806171553.m5HFrNlb021165@dev.open-bio.org>
Update of /home/repository/bioruby/bioruby/lib/bio/db/genbank
In directory dev.open-bio.org:/tmp/cvs-serv21145/lib/bio/db/genbank
Modified Files:
Tag: BRANCH-biohackathon2008
common.rb
Log Message:
Bio::GenBank#comment (and Bio::GenPept#comment) is changed not to remove
newlines inside the comment.
Index: common.rb
===================================================================
RCS file: /home/repository/bioruby/bioruby/lib/bio/db/genbank/common.rb,v
retrieving revision 1.11.2.4
retrieving revision 1.11.2.5
diff -C2 -d -r1.11.2.4 -r1.11.2.5
*** common.rb 7 May 2008 12:25:42 -0000 1.11.2.4
--- common.rb 17 Jun 2008 15:53:21 -0000 1.11.2.5
***************
*** 196,200 ****
# COMMENT -- Returns contents of the COMMENT record as a String.
def comment
! field_fetch('COMMENT')
end
--- 196,203 ----
# COMMENT -- Returns contents of the COMMENT record as a String.
def comment
! str = get('COMMENT').to_s.sub(/\ACOMMENT /, '')
! str.gsub!(/^ {12}/, '')
! str.chomp!
! str
end
From ngoto at dev.open-bio.org Tue Jun 17 11:56:20 2008
From: ngoto at dev.open-bio.org (Naohisa Goto)
Date: Tue, 17 Jun 2008 15:56:20 +0000
Subject: [BioRuby-cvs] bioruby/lib/bio/db/genbank genbank.rb, 0.40.2.3,
0.40.2.4
Message-ID: <200806171556.m5HFuKdb021193@dev.open-bio.org>
Update of /home/repository/bioruby/bioruby/lib/bio/db/genbank
In directory dev.open-bio.org:/tmp/cvs-serv21173/lib/bio/db/genbank
Modified Files:
Tag: BRANCH-biohackathon2008
genbank.rb
Log Message:
* Bio::GenBank#to_biosequence is changed to improve support of sequence output
and data exchange.
* Bio::GenBank#date_modified is added. It returns a Date object, or the original value when it cannot be parsed as a date.
Index: genbank.rb
===================================================================
RCS file: /home/repository/bioruby/bioruby/lib/bio/db/genbank/genbank.rb,v
retrieving revision 0.40.2.3
retrieving revision 0.40.2.4
diff -C2 -d -r0.40.2.3 -r0.40.2.4
*** genbank.rb 4 Mar 2008 09:22:35 -0000 0.40.2.3
--- genbank.rb 17 Jun 2008 15:56:18 -0000 0.40.2.4
***************
*** 8,13 ****
--- 8,16 ----
#
+ require 'date'
require 'bio/db'
require 'bio/db/genbank/common'
+ require 'bio/sequence'
+ require 'bio/sequence/dblink'
module Bio
***************
*** 122,129 ****
--- 125,142 ----
alias nalen length
+ # (obsolete???) length of the sequence
def seq_len
seq.length
end
+ # modified date. Returns Date object, String or nil.
+ def date_modified
+ begin
+ Date.parse(self.date)
+ rescue ArgumentError, TypeError, NoMethodError, NameError
+ self.date
+ end
+ end
+
# converts Bio::GenBank to Bio::Sequence
# ---
***************
*** 132,135 ****
--- 145,156 ----
def to_biosequence
sequence = Bio::Sequence.new(seq)
+
+ sequence.id_namespace =
+ if /\_/ =~ self.accession.to_s then
+ 'RefSeq'
+ else
+ 'GenBank'
+ end
+
sequence.entry_id = self.entry_id
***************
*** 137,147 ****
sequence.secondary_accessions = self.accessions - [ self.accession ]
sequence.molecule_type = self.natype
sequence.division = self.division
sequence.topology = self.circular
sequence.sequence_version = self.version
#sequence.date_created = nil #????
! sequence.date_modified = self.date
sequence.definition = self.definition
--- 158,177 ----
sequence.secondary_accessions = self.accessions - [ self.accession ]
+ if /GI\:(.+)/ =~ self.gi.to_s then
+ sequence.other_seqids = [ Bio::Sequence::DBLink.new('GI', $1) ]
+ end
+
sequence.molecule_type = self.natype
sequence.division = self.division
sequence.topology = self.circular
+ sequence.strandedness = case self.strand.to_s.downcase;
+ when 'ss-'; 'single';
+ when 'ds-'; 'double';
+ when 'ms-'; 'mixed';
+ else nil; end
sequence.sequence_version = self.version
#sequence.date_created = nil #????
! sequence.date_modified = date_modified
sequence.definition = self.definition
***************
*** 149,153 ****
sequence.species = self.organism
sequence.classification = self.taxonomy.to_s.sub(/\.\z/, '').split(/\s*\;\s*/)
! #sequence.organnella = nil # not used
sequence.comments = self.comment
sequence.references = self.references
--- 179,183 ----
sequence.species = self.organism
sequence.classification = self.taxonomy.to_s.sub(/\.\z/, '').split(/\s*\;\s*/)
! #sequence.organelle = nil # yet unsupported
sequence.comments = self.comment
sequence.references = self.references
From ngoto at dev.open-bio.org Tue Jun 17 11:59:26 2008
From: ngoto at dev.open-bio.org (Naohisa Goto)
Date: Tue, 17 Jun 2008 15:59:26 +0000
Subject: [BioRuby-cvs] bioruby/lib/bio/db/genbank format_genbank.rb, 1.1.2.4,
1.1.2.5
Message-ID: <200806171559.m5HFxQa4021221@dev.open-bio.org>
Update of /home/repository/bioruby/bioruby/lib/bio/db/genbank
In directory dev.open-bio.org:/tmp/cvs-serv21201/lib/bio/db/genbank
Modified Files:
Tag: BRANCH-biohackathon2008
format_genbank.rb
Log Message:
* Added support for COMMENT.
* Added support for GI number output.
* Many improvements are added.
Index: format_genbank.rb
===================================================================
RCS file: /home/repository/bioruby/bioruby/lib/bio/db/genbank/Attic/format_genbank.rb,v
retrieving revision 1.1.2.4
retrieving revision 1.1.2.5
diff -C2 -d -r1.1.2.4 -r1.1.2.5
*** format_genbank.rb 28 May 2008 13:26:33 -0000 1.1.2.4
--- format_genbank.rb 17 Jun 2008 15:59:24 -0000 1.1.2.5
***************
*** 101,104 ****
--- 101,115 ----
end
+ # formats comments lines as GenBank
+ def comments_format_genbank(cmnts)
+ return '' if !cmnts or cmnts.empty?
+ cmnts = [ cmnts ] unless cmnts.kind_of?(Array)
+ a = []
+ cmnts.each do |str|
+ a.push "COMMENT #{ genbank_wrap(str) }\n"
+ end
+ a.join('')
+ end
+
# formats sequence lines as GenBank
def seq_format_genbank(str)
***************
*** 113,122 ****
end
# Erb template of GenBank format for Bio::Sequence
erb_template <<'__END_OF_TEMPLATE__'
! LOCUS <%= sprintf("%-16s", entry_id) %> <%= sprintf("%11d", length) %> bp <%= sprintf("%3s", '') %><%= sprintf("%-6s", molecule_type) %> <%= sprintf("%-8s", topology) %><%= sprintf("%4s", division) %> <%= sprintf("%-11s", date_modified) %>
DEFINITION <%= genbank_wrap_dot(definition.to_s) %>
ACCESSION <%= genbank_wrap(([ primary_accession ] + (secondary_accessions or [])).join(" ")) %>
! VERSION <%= primary_accession %>.<%= sequence_version %><% unless true or gi_number.to_s.empty? %>GI:<%= gi_number %><% end %>
KEYWORDS <%= genbank_wrap_dot((keywords or []).join('; ')) %>
SOURCE <%= genbank_wrap(species) %>
--- 124,168 ----
end
+ # formats date
+ def date_format_genbank
+ date_modified || date_created || null_date
+ end
+
+ # molecule type
+ def mol_type_genbank
+ if /(DNA|(t|r|m|u|sn|sno)?RNA)/i =~ molecule_type.to_s then
+ $1.sub(/[DR]NA/) { |x| x.upcase }
+ else
+ 'NA'
+ end
+ end
+
+ # NCBI GI number
+ def ncbi_gi_number
+ ids = other_seqids
+ if ids and r = ids.find { |x| x.database == 'GI' } then
+ r.id
+ else
+ nil
+ end
+ end
+
+ # strandedness
+ def strandedness_genbank
+ return nil unless strandedness
+ case strandedness
+ when 'single'; 'ss-';
+ when 'double'; 'ds-';
+ when 'mixed'; 'ms-';
+ else; nil
+ end
+ end
+
# Erb template of GenBank format for Bio::Sequence
erb_template <<'__END_OF_TEMPLATE__'
! LOCUS <%= sprintf("%-16s", entry_id) %> <%= sprintf("%11d", length) %> bp <%= sprintf("%3s", strandedness_genbank) %><%= sprintf("%-6s", mol_type_genbank) %> <%= sprintf("%-8s", topology) %><%= sprintf("%4s", division) %> <%= date_format_genbank %>
DEFINITION <%= genbank_wrap_dot(definition.to_s) %>
ACCESSION <%= genbank_wrap(([ primary_accession ] + (secondary_accessions or [])).join(" ")) %>
! VERSION <%= primary_accession %>.<%= sequence_version %><% if gi = ncbi_gi_number then %> GI:<%= gi %><% end %>
KEYWORDS <%= genbank_wrap_dot((keywords or []).join('; ')) %>
SOURCE <%= genbank_wrap(species) %>
***************
*** 129,132 ****
--- 175,179 ----
%><%= reference_format_genbank(ref, n) %><%
end
+ %><%= comments_format_genbank(comments)
%>FEATURES Location/Qualifiers
<%= format_features_genbank(features || [])
From ngoto at dev.open-bio.org Tue Jun 17 12:04:38 2008
From: ngoto at dev.open-bio.org (Naohisa Goto)
Date: Tue, 17 Jun 2008 16:04:38 +0000
Subject: [BioRuby-cvs] bioruby/lib/bio/db/embl embl.rb,1.29.2.6,1.29.2.7
Message-ID: <200806171604.m5HG4cnr021274@dev.open-bio.org>
Update of /home/repository/bioruby/bioruby/lib/bio/db/embl
In directory dev.open-bio.org:/tmp/cvs-serv21250/lib/bio/db/embl
Modified Files:
Tag: BRANCH-biohackathon2008
embl.rb
Log Message:
* Bio::EMBL#cc is changed to cut heading "CC ".
* Bio::EMBL#to_biosequence is changed to improve support for sequence output
and data exchange.
* To get parse result of DT lines more easily, Bio::EMBL#date_modified,
date_created, release_modified, release_created, and entry_version
methods are added.
Index: embl.rb
===================================================================
RCS file: /home/repository/bioruby/bioruby/lib/bio/db/embl/embl.rb,v
retrieving revision 1.29.2.6
retrieving revision 1.29.2.7
diff -C2 -d -r1.29.2.6 -r1.29.2.7
*** embl.rb 28 May 2008 13:09:03 -0000 1.29.2.6
--- embl.rb 17 Jun 2008 16:04:36 -0000 1.29.2.7
***************
*** 32,39 ****
--- 32,42 ----
#
+ require 'date'
require 'bio/db'
require 'bio/db/embl/common'
require 'bio/compat/features'
require 'bio/compat/references'
+ require 'bio/sequence'
+ require 'bio/sequence/dblink'
module Bio
***************
*** 323,329 ****
# CC Line; comments of notes (>=0)
def cc
! get('CC')
end
!
##
--- 326,332 ----
# CC Line; comments of notes (>=0)
def cc
! get('CC').to_s.gsub(/^CC /, '')
end
! alias comment cc
##
***************
*** 376,379 ****
--- 379,436 ----
#++
+ # modified date. Returns Date object, String or nil.
+ def date_modified
+ parse_date(self.dt['updated'])
+ end
+
+ # created date. Returns Date object, String or nil.
+ def date_created
+ parse_date(self.dt['created'])
+ end
+
+ # release number when last updated
+ def release_modified
+ parse_release_version(self.dt['updated'])[0]
+ end
+
+ # release number when created
+ def release_created
+ parse_release_version(self.dt['created'])[0]
+ end
+
+ # entry version number numbered by EMBL
+ def entry_version
+ parse_release_version(self.dt['updated'])[1]
+ end
+
+ # parse date string. Returns Date object.
+ def parse_date(str)
+ begin
+ Date.parse(str)
+ rescue ArgumentError, TypeError, NoMethodError, NameError
+ str
+ end
+ end
+ private :parse_date
+
+ # extracts release and version numbers from DT line
+ def parse_release_version(str)
+ return [ nil, nil ] unless str
+ a = str.split(/[\(\,\)]/)
+ dstr = a.shift
+ rel = nil
+ ver = nil
+ a.each do |x|
+ case x
+ when /Rel\.\s*(.+)/
+ rel = $1.strip
+ when /Version\s*(.+)/
+ ver = $1.strip
+ end
+ end
+ [ rel, ver ]
+ end
+ private :parse_release_version
+
# converts the entry to Bio::Sequence object
# ---
***************
*** 382,385 ****
--- 439,444 ----
def to_biosequence
bio_seq = Bio::Sequence.new(self.seq)
+
+ bio_seq.id_namespace = 'EMBL'
bio_seq.entry_id = self.entry_id
bio_seq.primary_accession = self.accessions[0]
***************
*** 389,394 ****
bio_seq.definition = self.description
bio_seq.topology = self.topology
! bio_seq.date_created = self.dt['created']
! bio_seq.date_modified = self.dt['updated']
bio_seq.division = self.division
bio_seq.sequence_version = self.version
--- 448,456 ----
bio_seq.definition = self.description
bio_seq.topology = self.topology
! bio_seq.date_created = self.date_created
! bio_seq.date_modified = self.date_modified
! bio_seq.release_created = self.release_created
! bio_seq.release_modified = self.release_modified
! bio_seq.entry_version = self.entry_version
bio_seq.division = self.division
bio_seq.sequence_version = self.version
***************
*** 396,402 ****
bio_seq.species = self.fetch('OS')
bio_seq.classification = self.oc
bio_seq.references = self.references
bio_seq.features = self.ft
!
return bio_seq
end
--- 458,469 ----
bio_seq.species = self.fetch('OS')
bio_seq.classification = self.oc
+ # bio_seq.organelle = self.fetch('OG') # unsupported yet
bio_seq.references = self.references
bio_seq.features = self.ft
! bio_seq.comments = self.cc
! bio_seq.dblinks = get('DR').split(/\n/).collect { |x|
! Bio::Sequence::DBLink.parse_embl_DR_line(x)
! }
!
return bio_seq
end
From ngoto at dev.open-bio.org Tue Jun 17 12:06:06 2008
From: ngoto at dev.open-bio.org (Naohisa Goto)
Date: Tue, 17 Jun 2008 16:06:06 +0000
Subject: [BioRuby-cvs] bioruby/lib/bio/db/embl format_embl.rb, 1.1.2.5,
1.1.2.6
Message-ID: <200806171606.m5HG66iI021322@dev.open-bio.org>
Update of /home/repository/bioruby/bioruby/lib/bio/db/embl
In directory dev.open-bio.org:/tmp/cvs-serv21282/lib/bio/db/embl
Modified Files:
Tag: BRANCH-biohackathon2008
format_embl.rb
Log Message:
* Added support for CC lines (comments).
* Added support for DR lines (database cross references).
* Many improvements.
Index: format_embl.rb
===================================================================
RCS file: /home/repository/bioruby/bioruby/lib/bio/db/embl/Attic/format_embl.rb,v
retrieving revision 1.1.2.5
retrieving revision 1.1.2.6
diff -C2 -d -r1.1.2.5 -r1.1.2.6
*** format_embl.rb 28 May 2008 13:38:07 -0000 1.1.2.5
--- format_embl.rb 17 Jun 2008 16:06:04 -0000 1.1.2.6
***************
*** 2,6 ****
# = bio/db/embl/format_embl.rb - EMBL format generater
#
! # Copyright:: Copyright (C) 2008 Jan Aerts
# License:: The Ruby License
#
--- 2,8 ----
# = bio/db/embl/format_embl.rb - EMBL format generater
#
! # Copyright:: Copyright (C) 2008
! # Jan Aerts ,
! # Naohisa Goto
# License:: The Ruby License
#
***************
*** 125,136 ****
end
# Erb template of EMBL format for Bio::Sequence
erb_template <<'__END_OF_TEMPLATE__'
! ID <%= entry_id %>; SV <%= sequence_version %>; <%= topology %>; <%= molecule_type %>; <%= data_class %>; <%= division %>; <%= seq.length %> BP.
XX
<%= embl_wrap('AC ', accessions.reject{|a| a.nil?}.join('; ') + ';') %>
XX
! DT <%= date_created %>
! DT <%= date_modified %>
XX
<%= embl_wrap('DE ', definition) %>
--- 127,166 ----
end
+ # molecule type
+ def mol_type_embl
+ if mt = molecule_type then
+ mt
+ elsif f = (features or []).find { |f| f.feature == 'source' } and
+ q = f.qualifiers.find { |q| q.qualifier == 'mol_type' } then
+ q.value
+ else
+ 'NA'
+ end
+ end
+
+ # CC line. Comments.
+ def comments_format_embl(cmnts)
+ return '' if !cmnts or cmnts.empty?
+ cmnts = [ cmnts ] unless cmnts.kind_of?(Array)
+ a = []
+ cmnts.each do |str|
+ a.push embl_wrap('CC ', str)
+ end
+ unless a.empty? then
+ a.push "XX "
+ a.push '' # dummy to put "\n" at the end of the string
+ end
+ a.join("\n")
+ end
+
+
# Erb template of EMBL format for Bio::Sequence
erb_template <<'__END_OF_TEMPLATE__'
! ID <%= primary_accession || entry_id %>; SV <%= sequence_version %>; <%= topology %>; <%= mol_type_embl %>; <%= data_class %>; <%= division %>; <%= seq.length %> BP.
XX
<%= embl_wrap('AC ', accessions.reject{|a| a.nil?}.join('; ') + ';') %>
XX
! DT <%= format_date(date_created || null_date) %> (Rel. <%= release_created || 0 %>, Created)
! DT <%= format_date(date_modified || null_date) %> (Rel. <%= release_modified || 0 %>, Last updated, Version <%= entry_version || 0 %>)
XX
<%= embl_wrap('DE ', definition) %>
***************
*** 142,146 ****
XX
<% hash = {}; (references || []).each do |ref| %><%= reference_format_embl(ref, hash) %>
! <% end %>FH Key Location/Qualifiers
FH
<%= format_features_embl(features || []) %>XX
--- 172,181 ----
XX
<% hash = {}; (references || []).each do |ref| %><%= reference_format_embl(ref, hash) %>
! <% end %><% (dblinks || []).each do |r|
! %>DR <%= r.database %>; <%= r.id %><% unless r.secondary_ids.empty? %>; <%= r.secondary_ids[0] %><% end %>.
! <% end %><% if dblinks and !dblinks.empty? then
! %>XX
! <% end %><%= comments_format_embl(comments)
! %>FH Key Location/Qualifiers
FH
<%= format_features_embl(features || []) %>XX
From ngoto at dev.open-bio.org Tue Jun 17 12:09:55 2008
From: ngoto at dev.open-bio.org (Naohisa Goto)
Date: Tue, 17 Jun 2008 16:09:55 +0000
Subject: [BioRuby-cvs] bioruby/test/unit/bio/db/embl test_embl_to_bioseq.rb,
1.1.2.1, 1.1.2.2
Message-ID: <200806171609.m5HG9tFR021392@dev.open-bio.org>
Update of /home/repository/bioruby/bioruby/test/unit/bio/db/embl
In directory dev.open-bio.org:/tmp/cvs-serv21372/test/unit/bio/db/embl
Modified Files:
Tag: BRANCH-biohackathon2008
test_embl_to_bioseq.rb
Log Message:
Unit tests related to Bio::Sequence#date_created and date_modified are
changed because these methods are changed to store Date (or Time or DateTime)
objects instead of String objects.
Index: test_embl_to_bioseq.rb
===================================================================
RCS file: /home/repository/bioruby/bioruby/test/unit/bio/db/embl/Attic/test_embl_to_bioseq.rb,v
retrieving revision 1.1.2.1
retrieving revision 1.1.2.2
diff -C2 -d -r1.1.2.1 -r1.1.2.2
*** test_embl_to_bioseq.rb 20 Feb 2008 09:56:22 -0000 1.1.2.1
--- test_embl_to_bioseq.rb 17 Jun 2008 16:09:53 -0000 1.1.2.2
***************
*** 53,59 ****
end
! def test_dates
! assert_equal('25-OCT-2002 (Rel. 73, Created)', @bio_seq.date_created)
! assert_equal('14-NOV-2006 (Rel. 89, Last updated, Version 3)', @bio_seq.date_modified)
end
--- 53,76 ----
end
! def test_date_created
! # '25-OCT-2002 (Rel. 73, Created)'
! assert_equal(Date.parse('25-OCT-2002'), @bio_seq.date_created)
! end
!
! def test_date_modified
! # '14-NOV-2006 (Rel. 89, Last updated, Version 3)'
! assert_equal(Date.parse('14-NOV-2006'), @bio_seq.date_modified)
! end
!
! def test_release_created
! assert_equal('73', @bio_seq.release_created)
! end
!
! def test_release_modified
! assert_equal('89', @bio_seq.release_modified)
! end
!
! def test_entry_version
! assert_equal('3', @bio_seq.entry_version)
end
***************
*** 129,135 ****
end
! def test_dates
! assert_equal('25-OCT-2002 (Rel. 73, Created)', @bio_seq_2.date_created)
! assert_equal('14-NOV-2006 (Rel. 89, Last updated, Version 3)', @bio_seq_2.date_modified)
end
--- 146,169 ----
end
! def test_date_created
! # '25-OCT-2002 (Rel. 73, Created)'
! assert_equal(Date.parse('25-OCT-2002'), @bio_seq_2.date_created)
! end
!
! def test_date_modified
! # '14-NOV-2006 (Rel. 89, Last updated, Version 3)'
! assert_equal(Date.parse('14-NOV-2006'), @bio_seq_2.date_modified)
! end
!
! def test_release_created
! assert_equal('73', @bio_seq_2.release_created)
! end
!
! def test_release_modified
! assert_equal('89', @bio_seq_2.release_modified)
! end
!
! def test_entry_version
! assert_equal('3', @bio_seq_2.entry_version)
end
From ngoto at dev.open-bio.org Thu Jun 19 08:45:18 2008
From: ngoto at dev.open-bio.org (Naohisa Goto)
Date: Thu, 19 Jun 2008 12:45:18 +0000
Subject: [BioRuby-cvs] bioruby/lib/bio/db/embl format_embl.rb, 1.1.2.6,
1.1.2.7
Message-ID: <200806191245.m5JCjIps000652@dev.open-bio.org>
Update of /home/repository/bioruby/bioruby/lib/bio/db/embl
In directory dev.open-bio.org:/tmp/cvs-serv596/lib/bio/db/embl
Modified Files:
Tag: BRANCH-biohackathon2008
format_embl.rb
Log Message:
avoid error when keywords or classification is nil
Index: format_embl.rb
===================================================================
RCS file: /home/repository/bioruby/bioruby/lib/bio/db/embl/Attic/format_embl.rb,v
retrieving revision 1.1.2.6
retrieving revision 1.1.2.7
diff -C2 -d -r1.1.2.6 -r1.1.2.7
*** format_embl.rb 17 Jun 2008 16:06:04 -0000 1.1.2.6
--- format_embl.rb 19 Jun 2008 12:45:15 -0000 1.1.2.7
***************
*** 166,173 ****
<%= embl_wrap('DE ', definition) %>
XX
! <%= embl_wrap('KW ', keywords.join('; ') + '.') %>
XX
OS <%= species %>
! <%= embl_wrap('OC ', classification.join('; ') + '.') %>
XX
<% hash = {}; (references || []).each do |ref| %><%= reference_format_embl(ref, hash) %>
--- 166,173 ----
<%= embl_wrap('DE ', definition) %>
XX
! <%= embl_wrap('KW ', (keywords || []).join('; ') + '.') %>
XX
OS <%= species %>
! <%= embl_wrap('OC ', (classification || []).join('; ') + '.') %>
XX
<% hash = {}; (references || []).each do |ref| %><%= reference_format_embl(ref, hash) %>
From ngoto at dev.open-bio.org Fri Jun 20 09:22:34 2008
From: ngoto at dev.open-bio.org (Naohisa Goto)
Date: Fri, 20 Jun 2008 13:22:34 +0000
Subject: [BioRuby-cvs] bioruby/lib/bio/db fasta.rb,1.28,1.28.2.1
Message-ID: <200806201322.m5KDMYOR021703@dev.open-bio.org>
Update of /home/repository/bioruby/bioruby/lib/bio/db
In directory dev.open-bio.org:/tmp/cvs-serv21681
Modified Files:
Tag: BRANCH-biohackathon2008
fasta.rb
Log Message:
Split Bio::FastaDefline class into lib/bio/db/fasta/defline.rb
Index: fasta.rb
===================================================================
RCS file: /home/repository/bioruby/bioruby/lib/bio/db/fasta.rb,v
retrieving revision 1.28
retrieving revision 1.28.2.1
diff -C2 -d -r1.28 -r1.28.2.1
*** fasta.rb 5 Apr 2007 23:35:40 -0000 1.28
--- fasta.rb 20 Jun 2008 13:22:31 -0000 1.28.2.1
***************
*** 15,57 ****
# == Examples
#
! # rub = Bio::FastaDefline.new('>gi|671595|emb|CAA85678.1| rubisco large subunit [Perovskia abrotanoides]')
! # rub.entry_id ==> 'gi|671595'
! # rub.get('emb') ==> 'CAA85678.1'
! # rub.emb ==> 'CAA85678.1'
! # rub.gi ==> '671595'
! # rub.accession ==> 'CAA85678'
! # rub.accessions ==> [ 'CAA85678' ]
! # rub.acc_version ==> 'CAA85678.1'
! # rub.locus ==> nil
! # rub.list_ids ==> [["gi", "671595"],
! # ["emb", "CAA85678.1", nil],
! # ["Perovskia abrotanoides"]]
! #
! # ckr = Bio::FastaDefline.new(">gi|2495000|sp|Q63931|CCKR_CAVPO CHOLECYSTOKININ TYPE A RECEPTOR (CCK-A RECEPTOR) (CCK-AR)\001gi|2147182|pir||I51898 cholecystokinin A receptor - guinea pig\001gi|544724|gb|AAB29504.1| cholecystokinin A receptor; CCK-A receptor [Cavia]")
! # ckr.entry_id ==> "gi|2495000"
! # ckr.sp ==> "CCKR_CAVPO"
! # ckr.pir ==> "I51898"
! # ckr.gb ==> "AAB29504.1"
! # ckr.gi ==> "2495000"
! # ckr.accession ==> "AAB29504"
! # ckr.accessions ==> ["Q63931", "AAB29504"]
! # ckr.acc_version ==> "AAB29504.1"
! # ckr.locus ==> nil
! # ckr.description ==>
! # "CHOLECYSTOKININ TYPE A RECEPTOR (CCK-A RECEPTOR) (CCK-AR)"
! # ckr.descriptions ==>
! # ["CHOLECYSTOKININ TYPE A RECEPTOR (CCK-A RECEPTOR) (CCK-AR)",
! # "cholecystokinin A receptor - guinea pig",
! # "cholecystokinin A receptor; CCK-A receptor [Cavia]"]
! # ckr.words ==>
! # ["cavia", "cck-a", "cck-ar", "cholecystokinin", "guinea", "pig",
! # "receptor", "type"]
! # ckr.id_strings ==>
! # ["2495000", "Q63931", "CCKR_CAVPO", "2147182", "I51898",
! # "544724", "AAB29504.1", "Cavia"]
! # ckr.list_ids ==>
! # [["gi", "2495000"], ["sp", "Q63931", "CCKR_CAVPO"],
! # ["gi", "2147182"], ["pir", nil, "I51898"], ["gi", "544724"],
! # ["gb", "AAB29504.1", nil], ["Cavia"]]
#
# == References
--- 15,19 ----
# == Examples
#
! # See documents of Bio::FastaFormat class.
#
# == References
***************
*** 66,69 ****
--- 28,32 ----
require 'bio/db'
require 'bio/sequence'
+ require 'bio/db/fasta/defline'
module Bio
***************
*** 363,825 ****
end #class FastaNumericFormat
-
- # Parsing FASTA Defline, and extract IDs and other informations.
- # IDs are NSIDs (NCBI standard FASTA sequence identifiers)
- # or ":"-separated IDs.
- #
- # specs are described in:
- # ftp://ftp.ncbi.nih.gov/blast/documents/README.formatdb
- # http://blast.wustl.edu/doc/FAQ-Indexing.html#Identifiers
- #
- # === Examples
- #
- # rub = Bio::FastaDefline.new('>gi|671595|emb|CAA85678.1| rubisco large subunit [Perovskia abrotanoides]')
- # rub.entry_id ==> 'gi|671595'
- # rub.get('emb') ==> 'CAA85678.1'
- # rub.emb ==> 'CAA85678.1'
- # rub.gi ==> '671595'
- # rub.accession ==> 'CAA85678'
- # rub.accessions ==> [ 'CAA85678' ]
- # rub.acc_version ==> 'CAA85678.1'
- # rub.locus ==> nil
- # rub.list_ids ==> [["gi", "671595"],
- # ["emb", "CAA85678.1", nil],
- # ["Perovskia abrotanoides"]]
- #
- # ckr = Bio::FastaDefline.new(">gi|2495000|sp|Q63931|CCKR_CAVPO CHOLECYSTOKININ TYPE A RECEPTOR (CCK-A RECEPTOR) (CCK-AR)\001gi|2147182|pir||I51898 cholecystokinin A receptor - guinea pig\001gi|544724|gb|AAB29504.1| cholecystokinin A receptor; CCK-A receptor [Cavia]")
- # ckr.entry_id ==> "gi|2495000"
- # ckr.sp ==> "CCKR_CAVPO"
- # ckr.pir ==> "I51898"
- # ckr.gb ==> "AAB29504.1"
- # ckr.gi ==> "2495000"
- # ckr.accession ==> "AAB29504"
- # ckr.accessions ==> ["Q63931", "AAB29504"]
- # ckr.acc_version ==> "AAB29504.1"
- # ckr.locus ==> nil
- # ckr.description ==>
- # "CHOLECYSTOKININ TYPE A RECEPTOR (CCK-A RECEPTOR) (CCK-AR)"
- # ckr.descriptions ==>
- # ["CHOLECYSTOKININ TYPE A RECEPTOR (CCK-A RECEPTOR) (CCK-AR)",
- # "cholecystokinin A receptor - guinea pig",
- # "cholecystokinin A receptor; CCK-A receptor [Cavia]"]
- # ckr.words ==>
- # ["cavia", "cck-a", "cck-ar", "cholecystokinin", "guinea", "pig",
- # "receptor", "type"]
- # ckr.id_strings ==>
- # ["2495000", "Q63931", "CCKR_CAVPO", "2147182", "I51898",
- # "544724", "AAB29504.1", "Cavia"]
- # ckr.list_ids ==>
- # [["gi", "2495000"], ["sp", "Q63931", "CCKR_CAVPO"],
- # ["gi", "2147182"], ["pir", nil, "I51898"], ["gi", "544724"],
- # ["gb", "AAB29504.1", nil], ["Cavia"]]
- #
- # === Refereneces
- #
- # * Fasta format description (NCBI)
- # http://www.ncbi.nlm.nih.gov/BLAST/fasta.shtml
- #
- # * Frequently Asked Questions: Indexing of Sequence Identifiers (by Warren R. Gish.)
- # http://blast.wustl.edu/doc/FAQ-Indexing.html#Identifiers
- #
- # * README.formatdb
- # ftp://ftp.ncbi.nih.gov/blast/documents/README.formatdb
- #
- class FastaDefline
-
- NSIDs = {
- # NCBI and WU-BLAST
- 'gi' => [ 'gi' ], # NCBI GI
- 'gb' => [ 'acc_version', 'locus' ], # GenBank
- 'emb' => [ 'acc_version', 'locus' ], # EMBL
- 'dbj' => [ 'acc_version', 'locus' ], # DDBJ
- 'sp' => [ 'accession', 'entry_id' ], # SWISS-PROT
- 'pdb' => [ 'entry_id', 'chain' ], # PDB
- 'bbs' => [ 'number' ], # GenInfo Backbone Id
- 'gnl' => [ 'database' , 'entry_id' ], # General database identifier
- 'ref' => [ 'acc_version' , 'locus' ], # NCBI Reference Sequence
- 'lcl' => [ 'entry_id' ], # Local Sequence identifier
-
- # WU-BLAST and NCBI
- 'pir' => [ 'accession', 'entry_id' ], # PIR
- 'prf' => [ 'accession', 'entry_id' ], # Protein Research Foundation
- 'pat' => [ 'country', 'number', 'serial' ], # Patents
-
- # WU-BLAST only
- 'bbm' => [ 'number' ], # NCBI GenInfo Backbone database identifier
- 'gim' => [ 'number' ], # NCBI GenInfo Import identifier
- 'gp' => [ 'acc_version', 'locus' ], # GenPept
- 'oth' => [ 'accession', 'name', 'release' ], # Other (user-definable) identifier
- 'tpd' => [ 'accession', 'name' ], # Third party annotation, DDBJ
- 'tpe' => [ 'accession', 'name' ], # Third party annotation, EMBL
- 'tpg' => [ 'accession', 'name' ], # Third party annotation, GenBank
-
- # Original
- 'ri' => [ 'entry_id', 'rearray_id', 'len' ], # RIKEN FANTOM DB
- }
-
- # Shows array that contains IDs (or ID-like strings).
- # Returns an array of arrays of strings.
- attr_reader :list_ids
-
- # Shows a possibly unique identifier.
- # Returns a string.
- attr_reader :entry_id
-
- # Parses given string.
- def initialize(str)
- @deflines = []
- @info = {}
- @list_ids = []
-
- @entry_id = nil
-
- lines = str.split("\x01")
- lines.each do |line|
- add_defline(line)
- end
- end #def initialize
-
- # Parses given string and adds parsed data.
- def add_defline(str)
- case str
- when /^\>?\s*((?:[^\|\s]*\|)+[^\s]+)\s*(.*)$/
- # NSIDs
- # examples:
- # >gi|9910844|sp|Q9UWG2|RL3_METVA 50S ribosomal protein L3P
- #
- # note: regexp (:?) means grouping without backreferences
- i = $1
- d = $2
- tks = i.split('|')
- tks << '' if i[-1,1] == '|'
- a = parse_NSIDs(tks)
- i = a[0].join('|')
- a.unshift('|')
- d = tks.join('|') + ' ' + d unless tks.empty?
- a << d
- this_line = a
- match_EC(d)
- parse_square_brackets(d).each do |x|
- if !match_EC(x, false) and x =~ /\A[A-Z]/ then
- di = [ x ]
- @list_ids << di
- @info['organism'] = x unless @info['organism']
- end
- end
-
- when /^\>?\s*([a-zA-Z0-9]+\:[^\s]+)\s*(.*)$/
- # examples:
- # >sce:YBR160W CDC28, SRM5; cyclin-dependent protein kinase catalytic subunit [EC:2.7.1.-] [SP:CC28_YEAST]
- # >emb:CACDC28 [X80034] C.albicans CDC28 gene
- i = $1
- d = $2
- a = parse_ColonSepID(i)
- i = a.join(':')
- this_line = [ ':', a , d ]
- match_EC(d)
- parse_square_brackets(d).each do |x|
- if !match_EC(x, false) and x =~ /:/ then
- parse_ColonSepID(x)
- elsif x =~ /\A\s*([A-Z][A-Z0-9_\.]+)\s*\z/ then
- @list_ids << [ $1 ]
- end
- end
-
- when /^\>?\s*(\S+)(?:\s+(.+))?$/
- # examples:
- # >ABC12345 this is test
- i = $1
- d = $2.to_s
- @list_ids << [ i.chomp('.') ]
- this_line = [ '', [ i ], d ]
- match_EC(d)
- else
- i = str
- d = ''
- match_EC(i)
- this_line = [ '', [ i ], d ]
- end
-
- @deflines << this_line
- @entry_id = i unless @entry_id
- end
-
- def match_EC(str, write_flag = true)
- di = nil
- str.scan(/EC\:((:?[\-\d]+\.){3}(:?[\-\d]+))/i) do |x|
- di = [ 'EC', $1 ]
- if write_flag then
- @info['ec'] = di[1] if (!@info['ec'] or @info['ec'].to_s =~ /\-/)
- @list_ids << di
- end
- end
- di
- end
- private :match_EC
-
- def parse_square_brackets(str)
- r = []
- str.scan(/\[([^\]]*)\]/) do |x|
- r << x[0]
- end
- r
- end
- private :parse_square_brackets
-
- def parse_ColonSepID(str)
- di = str.split(':', 2)
- di << nil if di.size <= 1
- @list_ids << di
- di
- end
- private :parse_ColonSepID
-
- def parse_NSIDs(ary)
- # this method destroys ary
- data = []
- while token = ary.shift
- if labels = self.class::NSIDs[token] then
- di = [ token ]
- idtype = token
- labels.each do |x|
- token = ary.shift
- break unless token
- if self.class::NSIDs[token] then
- ary.unshift(token)
- break #each
- end
- if token.length > 0 then
- di << token
- else
- di << nil
- end
- end
- data << di
- else
- if token.length > 0 then
- # UCID (uncontrolled identifiers)
- di = [ token ]
- data << di
- @info['ucid'] = token unless @info['ucid']
- end
- break #while
- end
- end #while
- @list_ids.concat data
- data
- end #def parse_NSIDs
- private :parse_NSIDs
-
-
- # Shows original string.
- # Note that the result of this method may be different from
- # original string which is given in FastaDefline.new method.
- def to_s
- @deflines.collect { |a|
- s = a[0]
- (a[1..-2].collect { |x| x.join(s) }.join(s) + ' ' + a[-1]).strip
- }.join("\x01")
- end
-
- # Shows description.
- def description
- @deflines[0].to_a[-1]
- end
-
- # Returns descriptions.
- def descriptions
- @deflines.collect do |a|
- a[-1]
- end
- end
-
- # Shows ID-like strings.
- # Returns an array of strings.
- def id_strings
- r = []
- @list_ids.each do |a|
- if a.size >= 2 then
- r.concat a[1..-1].find_all { |x| x }
- else
- if a[0].to_s.size > 0 and a[0] =~ /\A[A-Za-z0-9\.\-\_]+\z/
- r << a[0]
- end
- end
- end
- r.concat( words(true, []).find_all do |x|
- x =~ /\A[A-Z][A-Za-z0-9\_]*[0-9]+[A-Za-z0-9\_]+\z/ or
- x =~ /\A[A-Z][A-Z0-9]*\_[A-Z0-9\_]+\z/
- end)
- r
- end
-
- KillWords = [
- 'an', 'the', 'this', 'that',
- 'is', 'are', 'were', 'was', 'be', 'can', 'may', 'might',
- 'as', 'at', 'by', 'for', 'in', 'of', 'on', 'to', 'with',
- 'from', 'and', 'or', 'not',
- 'dna', 'rna', 'mrna', 'cdna', 'orf',
- 'aa', 'nt', 'pct', 'id', 'ec', 'sp', 'subsp',
- 'similar', 'involved', 'identical', 'identity',
- 'cds', 'clone', 'library', 'contig', 'contigs',
- 'homolog', 'homologue', 'homologs', 'homologous',
- 'protein', 'proteins', 'gene', 'genes',
- 'product', 'products', 'sequence', 'sequences',
- 'strain', 'strains', 'region', 'regions',
- ]
- KillWordsHash = {}
- KillWords.each { |x| KillWordsHash[x] = true }
-
- KillRegexpArray = [
- /\A\d{1,3}\%?\z/,
- /\A[A-Z][A-Za-z0-9\_]*[0-9]+[A-Za-z0-9\_]+\z/,
- /\A[A-Z][A-Z0-9]*\_[A-Z0-9\_]+\z/
- ]
-
- # Shows words used in the defline. Returns an Array.
- def words(case_sensitive = nil, kill_regexp = self.class::KillRegexpArray,
- kwhash = self.class::KillWordsHash)
- a = descriptions.join(' ').split(/[\.\,\;\:\(\)\[\]\{\}\<\>\"\'\`\~\/\|\?\!\&\@\#\s\x00-\x1f\x7f]+/)
- a.collect! do |x|
- x.sub!(/\A[\$\*\-\+]+/, '')
- x.sub!(/[\$\*\-\=]+\z/, '')
- if x.size <= 1 then
- nil
- elsif kwhash[x.downcase] then
- nil
- else
- if kill_regexp.find { |expr| expr =~ x } then
- nil
- else
- x
- end
- end
- end
- a.compact!
- a.collect! { |x| x.downcase } unless case_sensitive
- a.sort!
- a.uniq!
- a
- end
-
- # Returns identifires by a database name.
- def get(dbname)
- db = dbname.to_s
- r = nil
- unless r = @info[db] then
- di = @list_ids.find { |x| x[0] == db.to_s }
- if di and di.size <= 2 then
- r = di[-1]
- elsif di then
- labels = self.class::NSIDs[db]
- [ 'acc_version', 'entry_id',
- 'locus', 'accession', 'number'].each do |x|
- if i = labels.index(x) then
- r = di[i+1]
- break if r
- end
- end
- r = di[1..-1].find { |x| x } unless r
- end
- @info[db] = r if r
- end
- r
- end
-
- # Returns an identifier by given type.
- def get_by_type(type_str)
- @list_ids.each do |x|
- if labels = self.class::NSIDs[x[0]] then
- if i = labels.index(type_str) then
- return x[i+1]
- end
- end
- end
- nil
- end
-
- # Returns identifiers by given type.
- def get_all_by_type(*type_strarg)
- d = []
- @list_ids.each do |x|
- if labels = self.class::NSIDs[x[0]] then
- type_strarg.each do |y|
- if i = labels.index(y) then
- d << x[i+1] if x[i+1]
- end
- end
- end
- end
- d
- end
-
- # Shows locus.
- # If the entry has more than two of such IDs,
- # only the first ID are shown.
- # Returns a string or nil.
- def locus
- unless defined?(@locus)
- @locus = get_by_type('locus')
- end
- @locus
- end
-
- # Shows GI.
- # If the entry has more than two of such IDs,
- # only the first ID are shown.
- # Returns a string or nil.
- def gi
- unless defined?(@gi) then
- @gi = get_by_type('gi')
- end
- @gi
- end
-
- # Shows accession with version number.
- # If the entry has more than two of such IDs,
- # only the first ID are shown.
- # Returns a string or nil.
- def acc_version
- unless defined?(@acc_version) then
- @acc_version = get_by_type('acc_version')
- end
- @acc_version
- end
-
- # Shows accession numbers.
- # Returns an array of strings.
- def accessions
- unless defined?(@accessions) then
- @accessions = get_all_by_type('accession', 'acc_version')
- @accessions.collect! { |x| x.sub(/\..*\z/, '') }
- end
- @accessions
- end
-
- # Shows an accession number.
- def accession
- unless defined?(@accession) then
- if acc_version then
- @accession = acc_version.split('.')[0]
- else
- @accession = accessions[0]
- end
- end
- @accession
- end
-
- def method_missing(name, *args)
- # raise ArgumentError,
- # "wrong # of arguments(#{args.size} for 1)" if args.size >= 2
- r = get(name, *args)
- if !r and !(self.class::NSIDs[name.to_s]) then
- raise "NameError: undefined method `#{name.inspect}'"
- end
- r
- end
-
-
- end #class FastaDefline
-
end #module Bio
--- 326,329 ----
From ngoto at dev.open-bio.org Fri Jun 20 09:22:34 2008
From: ngoto at dev.open-bio.org (Naohisa Goto)
Date: Fri, 20 Jun 2008 13:22:34 +0000
Subject: [BioRuby-cvs] bioruby/lib/bio/db/fasta defline.rb,NONE,1.1.2.1
Message-ID: <200806201322.m5KDMYlh021706@dev.open-bio.org>
Update of /home/repository/bioruby/bioruby/lib/bio/db/fasta
In directory dev.open-bio.org:/tmp/cvs-serv21681/fasta
Added Files:
Tag: BRANCH-biohackathon2008
defline.rb
Log Message:
Split Bio::FastaDefline class into lib/bio/db/fasta/defline.rb
--- NEW FILE: defline.rb ---
#
# = bio/db/fasta/defline.rb - FASTA defline parser class
#
# Copyright:: Copyright (C) 2001, 2002
# GOTO Naohisa ,
# Toshiaki Katayama
# License:: The Ruby License
#
# $Id: defline.rb,v 1.1.2.1 2008/06/20 13:22:32 ngoto Exp $
#
# == Description
#
# Bio::FastaDefline is a parser class for definition line (defline)
# of the FASTA format.
#
# == Examples
#
# rub = Bio::FastaDefline.new('>gi|671595|emb|CAA85678.1| rubisco large subunit [Perovskia abrotanoides]')
# rub.entry_id ==> 'gi|671595'
# rub.get('emb') ==> 'CAA85678.1'
# rub.emb ==> 'CAA85678.1'
# rub.gi ==> '671595'
# rub.accession ==> 'CAA85678'
# rub.accessions ==> [ 'CAA85678' ]
# rub.acc_version ==> 'CAA85678.1'
# rub.locus ==> nil
# rub.list_ids ==> [["gi", "671595"],
# ["emb", "CAA85678.1", nil],
# ["Perovskia abrotanoides"]]
#
# ckr = Bio::FastaDefline.new(">gi|2495000|sp|Q63931|CCKR_CAVPO CHOLECYSTOKININ TYPE A RECEPTOR (CCK-A RECEPTOR) (CCK-AR)\001gi|2147182|pir||I51898 cholecystokinin A receptor - guinea pig\001gi|544724|gb|AAB29504.1| cholecystokinin A receptor; CCK-A receptor [Cavia]")
# ckr.entry_id ==> "gi|2495000"
# ckr.sp ==> "CCKR_CAVPO"
# ckr.pir ==> "I51898"
# ckr.gb ==> "AAB29504.1"
# ckr.gi ==> "2495000"
# ckr.accession ==> "AAB29504"
# ckr.accessions ==> ["Q63931", "AAB29504"]
# ckr.acc_version ==> "AAB29504.1"
# ckr.locus ==> nil
# ckr.description ==>
# "CHOLECYSTOKININ TYPE A RECEPTOR (CCK-A RECEPTOR) (CCK-AR)"
# ckr.descriptions ==>
# ["CHOLECYSTOKININ TYPE A RECEPTOR (CCK-A RECEPTOR) (CCK-AR)",
# "cholecystokinin A receptor - guinea pig",
# "cholecystokinin A receptor; CCK-A receptor [Cavia]"]
# ckr.words ==>
# ["cavia", "cck-a", "cck-ar", "cholecystokinin", "guinea", "pig",
# "receptor", "type"]
# ckr.id_strings ==>
# ["2495000", "Q63931", "CCKR_CAVPO", "2147182", "I51898",
# "544724", "AAB29504.1", "Cavia"]
# ckr.list_ids ==>
# [["gi", "2495000"], ["sp", "Q63931", "CCKR_CAVPO"],
# ["gi", "2147182"], ["pir", nil, "I51898"], ["gi", "544724"],
# ["gb", "AAB29504.1", nil], ["Cavia"]]
#
# == References
#
# * FASTA format (WikiPedia)
# http://en.wikipedia.org/wiki/FASTA_format
#
# * Fasta format description (NCBI)
# http://www.ncbi.nlm.nih.gov/BLAST/fasta.shtml
#
module Bio
#--
# split from fasta.rb revision 1.28
#++
# Parses a FASTA defline and extracts IDs and other information.
# IDs are NSIDs (NCBI standard FASTA sequence identifiers)
# or ":"-separated IDs.
#
# specs are described in:
# ftp://ftp.ncbi.nih.gov/blast/documents/README.formatdb
# http://blast.wustl.edu/doc/FAQ-Indexing.html#Identifiers
#
# === Examples
#
# rub = Bio::FastaDefline.new('>gi|671595|emb|CAA85678.1| rubisco large subunit [Perovskia abrotanoides]')
# rub.entry_id ==> 'gi|671595'
# rub.get('emb') ==> 'CAA85678.1'
# rub.emb ==> 'CAA85678.1'
# rub.gi ==> '671595'
# rub.accession ==> 'CAA85678'
# rub.accessions ==> [ 'CAA85678' ]
# rub.acc_version ==> 'CAA85678.1'
# rub.locus ==> nil
# rub.list_ids ==> [["gi", "671595"],
# ["emb", "CAA85678.1", nil],
# ["Perovskia abrotanoides"]]
#
# ckr = Bio::FastaDefline.new(">gi|2495000|sp|Q63931|CCKR_CAVPO CHOLECYSTOKININ TYPE A RECEPTOR (CCK-A RECEPTOR) (CCK-AR)\001gi|2147182|pir||I51898 cholecystokinin A receptor - guinea pig\001gi|544724|gb|AAB29504.1| cholecystokinin A receptor; CCK-A receptor [Cavia]")
# ckr.entry_id ==> "gi|2495000"
# ckr.sp ==> "CCKR_CAVPO"
# ckr.pir ==> "I51898"
# ckr.gb ==> "AAB29504.1"
# ckr.gi ==> "2495000"
# ckr.accession ==> "AAB29504"
# ckr.accessions ==> ["Q63931", "AAB29504"]
# ckr.acc_version ==> "AAB29504.1"
# ckr.locus ==> nil
# ckr.description ==>
# "CHOLECYSTOKININ TYPE A RECEPTOR (CCK-A RECEPTOR) (CCK-AR)"
# ckr.descriptions ==>
# ["CHOLECYSTOKININ TYPE A RECEPTOR (CCK-A RECEPTOR) (CCK-AR)",
# "cholecystokinin A receptor - guinea pig",
# "cholecystokinin A receptor; CCK-A receptor [Cavia]"]
# ckr.words ==>
# ["cavia", "cck-a", "cck-ar", "cholecystokinin", "guinea", "pig",
# "receptor", "type"]
# ckr.id_strings ==>
# ["2495000", "Q63931", "CCKR_CAVPO", "2147182", "I51898",
# "544724", "AAB29504.1", "Cavia"]
# ckr.list_ids ==>
# [["gi", "2495000"], ["sp", "Q63931", "CCKR_CAVPO"],
# ["gi", "2147182"], ["pir", nil, "I51898"], ["gi", "544724"],
# ["gb", "AAB29504.1", nil], ["Cavia"]]
#
# === References
#
# * Fasta format description (NCBI)
# http://www.ncbi.nlm.nih.gov/BLAST/fasta.shtml
#
# * Frequently Asked Questions: Indexing of Sequence Identifiers (by Warren R. Gish.)
# http://blast.wustl.edu/doc/FAQ-Indexing.html#Identifiers
#
# * README.formatdb
# ftp://ftp.ncbi.nih.gov/blast/documents/README.formatdb
#
class FastaDefline

  # Field layout of each NSID (NCBI standard FASTA sequence identifier).
  # The key is the database tag; the value lists, in order, the meaning of
  # the "|"-separated fields that follow the tag in a defline.
  NSIDs = {
    # NCBI and WU-BLAST
    'gi'  => [ 'gi' ],                      # NCBI GI
    'gb'  => [ 'acc_version', 'locus' ],    # GenBank
    'emb' => [ 'acc_version', 'locus' ],    # EMBL
    'dbj' => [ 'acc_version', 'locus' ],    # DDBJ
    'sp'  => [ 'accession', 'entry_id' ],   # SWISS-PROT
    'pdb' => [ 'entry_id', 'chain' ],       # PDB
    'bbs' => [ 'number' ],                  # GenInfo Backbone Id
    'gnl' => [ 'database' , 'entry_id' ],   # General database identifier
    'ref' => [ 'acc_version' , 'locus' ],   # NCBI Reference Sequence
    'lcl' => [ 'entry_id' ],                # Local Sequence identifier
    # WU-BLAST and NCBI
    'pir' => [ 'accession', 'entry_id' ],   # PIR
    'prf' => [ 'accession', 'entry_id' ],   # Protein Research Foundation
    'pat' => [ 'country', 'number', 'serial' ], # Patents
    # WU-BLAST only
    'bbm' => [ 'number' ],                  # NCBI GenInfo Backbone database identifier
    'gim' => [ 'number' ],                  # NCBI GenInfo Import identifier
    'gp'  => [ 'acc_version', 'locus' ],    # GenPept
    'oth' => [ 'accession', 'name', 'release' ], # Other (user-definable) identifier
    'tpd' => [ 'accession', 'name' ],       # Third party annotation, DDBJ
    'tpe' => [ 'accession', 'name' ],       # Third party annotation, EMBL
    'tpg' => [ 'accession', 'name' ],       # Third party annotation, GenBank
    # Original
    'ri'  => [ 'entry_id', 'rearray_id', 'len' ], # RIKEN FANTOM DB
  }

  # Shows array that contains IDs (or ID-like strings).
  # Returns an array of arrays of strings.
  attr_reader :list_ids

  # Shows a possibly unique identifier.
  # Returns a string.
  attr_reader :entry_id

  # Parses given string.
  # The string may hold several deflines joined by "\x01" (Ctrl-A);
  # each of them is parsed in order by add_defline.
  def initialize(str)
    @deflines = []
    @info     = {}
    @list_ids = []
    @entry_id = nil
    str.split("\x01").each { |line| add_defline(line) }
  end #def initialize

  # Parses given string and adds parsed data.
  #
  # Three defline shapes are recognized, tried in this order:
  # "|"-separated NSIDs, a "db:id" identifier, and a bare first word.
  # The entry ID is taken from the first defline that is added.
  def add_defline(str)
    case str
    when /^\>?\s*((?:[^\|\s]*\|)+[^\s]+)\s*(.*)$/
      # NSIDs
      # examples:
      # >gi|9910844|sp|Q9UWG2|RL3_METVA 50S ribosomal protein L3P
      #
      # note: regexp (?:) means grouping without backreferences
      raw_id = $1
      d = $2
      tks = raw_id.split('|')
      # String#split drops trailing empty fields; keep one so that a
      # trailing "|" is recorded as an empty (nil) field.
      tks << '' if raw_id[-1, 1] == '|'
      a = parse_NSIDs(tks)
      # parse_NSIDs returns nothing when the first token is empty
      # (e.g. ">|foo"); fall back to the raw token instead of failing.
      i = a.empty? ? raw_id : a[0].join('|')
      a.unshift('|')
      # Tokens not consumed by parse_NSIDs belong to the description.
      d = tks.join('|') + ' ' + d unless tks.empty?
      a << d
      this_line = a
      match_EC(d)
      parse_square_brackets(d).each do |x|
        if !match_EC(x, false) && x =~ /\A[A-Z]/
          @list_ids << [ x ]
          @info['organism'] = x unless @info['organism']
        end
      end
    when /^\>?\s*([a-zA-Z0-9]+\:[^\s]+)\s*(.*)$/
      # examples:
      # >sce:YBR160W CDC28, SRM5; cyclin-dependent protein kinase catalytic subunit [EC:2.7.1.-] [SP:CC28_YEAST]
      # >emb:CACDC28 [X80034] C.albicans CDC28 gene
      i = $1
      d = $2
      a = parse_ColonSepID(i)
      i = a.join(':')
      this_line = [ ':', a, d ]
      match_EC(d)
      parse_square_brackets(d).each do |x|
        if !match_EC(x, false) && x =~ /:/
          parse_ColonSepID(x)
        elsif x =~ /\A\s*([A-Z][A-Z0-9_\.]+)\s*\z/
          @list_ids << [ $1 ]
        end
      end
    when /^\>?\s*(\S+)(?:\s+(.+))?$/
      # examples:
      # >ABC12345 this is test
      i = $1
      d = $2.to_s
      @list_ids << [ i.chomp('.') ]
      this_line = [ '', [ i ], d ]
      match_EC(d)
    else
      i = str
      d = ''
      match_EC(i)
      this_line = [ '', [ i ], d ]
    end
    @deflines << this_line
    @entry_id = i unless @entry_id
  end

  # Scans str for EC numbers written as "EC:n.n.n.n", where each field
  # consists of digits and/or "-".
  # When write_flag is true, every hit is appended to the ID list, and the
  # stored 'ec' value is set when it is missing or still contains "-".
  # Returns [ 'EC', number ] for the last hit, or nil when nothing matched.
  def match_EC(str, write_flag = true)
    di = nil
    str.scan(/EC\:((?:[\-\d]+\.){3}(?:[\-\d]+))/i) do |groups|
      di = [ 'EC', groups[0] ]
      if write_flag
        @info['ec'] = di[1] if !@info['ec'] || @info['ec'].to_s =~ /\-/
        @list_ids << di
      end
    end
    di
  end
  private :match_EC

  # Returns the contents of every "[...]" found in str, as an Array.
  def parse_square_brackets(str)
    str.scan(/\[([^\]]*)\]/).collect { |x| x[0] }
  end
  private :parse_square_brackets

  # Splits "db:id" at the first colon, records the pair in the ID list,
  # and returns it. The id part is nil when str contains no colon.
  def parse_ColonSepID(str)
    di = str.split(':', 2)
    di << nil if di.size <= 1
    @list_ids << di
    di
  end
  private :parse_ColonSepID

  # Consumes "|"-separated tokens from ary and groups them into
  # [ tag, field, ... ] arrays according to NSIDs.
  # Parsing stops at the first token that is not a known tag; a non-empty
  # one is recorded as an uncontrolled identifier (UCID).
  # The parsed arrays are appended to the ID list and returned.
  # Note that this method destroys ary: consumed tokens are removed.
  def parse_NSIDs(ary)
    data = []
    while (token = ary.shift)
      labels = self.class::NSIDs[token]
      unless labels
        unless token.empty?
          # UCID (uncontrolled identifiers)
          data << [ token ]
          @info['ucid'] = token unless @info['ucid']
        end
        break #while
      end
      di = [ token ]
      labels.each do
        field = ary.shift
        break unless field
        if self.class::NSIDs[field]
          # The next identifier has started; leave its tag for the outer loop.
          ary.unshift(field)
          break #each
        end
        di << (field.empty? ? nil : field)
      end
      data << di
    end #while
    @list_ids.concat(data)
    data
  end #def parse_NSIDs
  private :parse_NSIDs

  # Shows original string.
  # Note that the result of this method may be different from
  # original string which is given in FastaDefline.new method.
  def to_s
    @deflines.collect { |a|
      sep = a[0]
      ids = a[1..-2].collect { |x| x.join(sep) }.join(sep)
      (ids + ' ' + a[-1]).strip
    }.join("\x01")
  end

  # Shows description.
  # Returns the description of the first defline, or nil when there is none.
  def description
    @deflines[0].to_a[-1]
  end

  # Returns descriptions.
  # The result holds one description per defline.
  def descriptions
    @deflines.collect { |a| a[-1] }
  end

  # Shows ID-like strings.
  # Returns an array of strings.
  def id_strings
    r = []
    @list_ids.each do |a|
      if a.size >= 2
        r.concat(a[1..-1].find_all { |x| x })
      elsif a[0].to_s.size > 0 && a[0] =~ /\A[A-Za-z0-9\.\-\_]+\z/
        r << a[0]
      end
    end
    # Words in the descriptions that look like identifiers are included too.
    r.concat(words(true, []).find_all { |x|
      x =~ /\A[A-Z][A-Za-z0-9\_]*[0-9]+[A-Za-z0-9\_]+\z/ ||
        x =~ /\A[A-Z][A-Z0-9]*\_[A-Z0-9\_]+\z/
    })
    r
  end

  # Words (compared in lower case) that are dropped by the words method.
  KillWords = [
    'an', 'the', 'this', 'that',
    'is', 'are', 'were', 'was', 'be', 'can', 'may', 'might',
    'as', 'at', 'by', 'for', 'in', 'of', 'on', 'to', 'with',
    'from', 'and', 'or', 'not',
    'dna', 'rna', 'mrna', 'cdna', 'orf',
    'aa', 'nt', 'pct', 'id', 'ec', 'sp', 'subsp',
    'similar', 'involved', 'identical', 'identity',
    'cds', 'clone', 'library', 'contig', 'contigs',
    'homolog', 'homologue', 'homologs', 'homologous',
    'protein', 'proteins', 'gene', 'genes',
    'product', 'products', 'sequence', 'sequences',
    'strain', 'strains', 'region', 'regions',
  ]

  # Lookup table built from KillWords.
  KillWordsHash = {}
  KillWords.each { |x| KillWordsHash[x] = true }

  # Patterns of words dropped by the words method:
  # short numbers or percentages, and ID-like strings.
  KillRegexpArray = [
    /\A\d{1,3}\%?\z/,
    /\A[A-Z][A-Za-z0-9\_]*[0-9]+[A-Za-z0-9\_]+\z/,
    /\A[A-Z][A-Z0-9]*\_[A-Z0-9\_]+\z/
  ]

  # Shows words used in the defline. Returns an Array.
  #
  # *Arguments*:
  # * (optional) _case_sensitive_: words are lower-cased unless this is true
  # * (optional) _kill_regexp_: Array of Regexp; matching words are dropped
  # * (optional) _kwhash_: Hash whose keys are lower-case words to drop
  # *Returns*:: sorted Array of unique words
  def words(case_sensitive = nil, kill_regexp = self.class::KillRegexpArray,
            kwhash = self.class::KillWordsHash)
    a = descriptions.join(' ').split(/[\.\,\;\:\(\)\[\]\{\}\<\>\"\'\`\~\/\|\?\!\&\@\#\s\x00-\x1f\x7f]+/)
    a.collect! do |x|
      # Strip decorations from both ends before filtering.
      x.sub!(/\A[\$\*\-\+]+/, '')
      x.sub!(/[\$\*\-\=]+\z/, '')
      if x.size <= 1 || kwhash[x.downcase] ||
         kill_regexp.find { |expr| expr =~ x }
        nil
      else
        x
      end
    end
    a.compact!
    a.collect! { |x| x.downcase } unless case_sensitive
    a.sort!
    a.uniq!
    a
  end

  # Returns identifiers by a database name.
  #
  # Values already stored for the name (e.g. 'ec', 'organism', 'ucid' or a
  # previous result) are returned first. Otherwise the first entry of the
  # ID list tagged with the name is used; for entries with several fields
  # the most specific available field is preferred.
  # Returns a string or nil.
  def get(dbname)
    db = dbname.to_s
    r = @info[db]
    return r if r
    di = @list_ids.find { |x| x[0] == db }
    return nil unless di
    if di.size <= 2
      r = di[-1]
    else
      labels = self.class::NSIDs[db] || []
      [ 'acc_version', 'entry_id',
        'locus', 'accession', 'number' ].each do |x|
        i = labels.index(x)
        next unless i
        r = di[i + 1]
        break if r
      end
      r = di[1..-1].find { |x| x } unless r
    end
    @info[db] = r if r
    r
  end

  # Returns an identifier by given type.
  # Only the first ID whose database defines the type is examined, so the
  # result is nil when that field is empty.
  def get_by_type(type_str)
    @list_ids.each do |x|
      labels = self.class::NSIDs[x[0]]
      next unless labels
      i = labels.index(type_str)
      return x[i + 1] if i
    end
    nil
  end

  # Returns identifiers by given type.
  # Accepts one or more type names; empty fields are skipped.
  def get_all_by_type(*type_strarg)
    d = []
    @list_ids.each do |x|
      labels = self.class::NSIDs[x[0]]
      next unless labels
      type_strarg.each do |y|
        i = labels.index(y)
        d << x[i + 1] if i && x[i + 1]
      end
    end
    d
  end

  # Shows locus.
  # If the entry has more than two of such IDs,
  # only the first ID are shown.
  # Returns a string or nil.
  def locus
    # defined? is used because nil is a valid cached result.
    @locus = get_by_type('locus') unless defined?(@locus)
    @locus
  end

  # Shows GI.
  # If the entry has more than two of such IDs,
  # only the first ID are shown.
  # Returns a string or nil.
  def gi
    @gi = get_by_type('gi') unless defined?(@gi)
    @gi
  end

  # Shows accession with version number.
  # If the entry has more than two of such IDs,
  # only the first ID are shown.
  # Returns a string or nil.
  def acc_version
    @acc_version = get_by_type('acc_version') unless defined?(@acc_version)
    @acc_version
  end

  # Shows accession numbers.
  # Version suffixes (".1" etc.) are removed.
  # Returns an array of strings.
  def accessions
    unless defined?(@accessions)
      @accessions = get_all_by_type('accession', 'acc_version')
      @accessions.collect! { |x| x.sub(/\..*\z/, '') }
    end
    @accessions
  end

  # Shows an accession number.
  # The versioned accession is preferred; its version suffix is removed.
  def accession
    unless defined?(@accession)
      if acc_version
        @accession = acc_version.split('.')[0]
      else
        @accession = accessions[0]
      end
    end
    @accession
  end

  # Lets database names be used as method names, e.g. defline.emb is the
  # same as defline.get('emb').
  # Names that are neither known NSID tags nor found by get raise
  # RuntimeError.
  def method_missing(name, *args)
    # raise ArgumentError,
    # "wrong # of arguments(#{args.size} for 1)" if args.size >= 2
    r = get(name, *args)
    if !r && !self.class::NSIDs[name.to_s]
      raise "NameError: undefined method `#{name.inspect}'"
    end
    r
  end

  # Reports the names handled by method_missing, so that respond_to? is
  # truthful and implicit-conversion probes (to_ary, to_str, ...) are
  # declined instead of raising from method_missing.
  def respond_to_missing?(name, include_private = false)
    # Objects created without initialize have no parsed data to look at.
    return super unless @info && @list_ids
    return true if self.class::NSIDs[name.to_s] || get(name)
    super
  end

end #class FastaDefline
end #module Bio
From ngoto at dev.open-bio.org Fri Jun 20 09:30:16 2008
From: ngoto at dev.open-bio.org (Naohisa Goto)
Date: Fri, 20 Jun 2008 13:30:16 +0000
Subject: [BioRuby-cvs] bioruby/lib/bio/db fasta.rb,1.28.2.1,1.28.2.2
Message-ID: <200806201330.m5KDUGds021895@dev.open-bio.org>
Update of /home/repository/bioruby/bioruby/lib/bio/db
In directory dev.open-bio.org:/tmp/cvs-serv21857
Modified Files:
Tag: BRANCH-biohackathon2008
fasta.rb
Log Message:
Here-document separator string in example is changed to avoid confusion
about "END" which is also a reserved word in Ruby.
Index: fasta.rb
===================================================================
RCS file: /home/repository/bioruby/bioruby/lib/bio/db/fasta.rb,v
retrieving revision 1.28.2.1
retrieving revision 1.28.2.2
diff -C2 -d -r1.28.2.1 -r1.28.2.2
*** fasta.rb 20 Jun 2008 13:22:31 -0000 1.28.2.1
--- fasta.rb 20 Jun 2008 13:30:14 -0000 1.28.2.2
***************
*** 3,7 ****
#
# Copyright:: Copyright (C) 2001, 2002
! # GOTO Naohisa ,
# Toshiaki Katayama
# License:: The Ruby License
--- 3,7 ----
#
# Copyright:: Copyright (C) 2001, 2002
! # Naohisa Goto ,
# Toshiaki Katayama
# License:: The Ruby License
***************
*** 45,49 ****
# === Examples
#
! # f_str = <sce:YBR160W CDC28, SRM5; cyclin-dependent protein kinase catalytic subunit [EC:2.7.1.-] [SP:CC28_YEAST]
# MSGELANYKRLEKVGEGTYGVVYKALDLRPGQGQRVVALKKIRLESEDEG
--- 45,49 ----
# === Examples
#
! # f_str = <sce:YBR160W CDC28, SRM5; cyclin-dependent protein kinase catalytic subunit [EC:2.7.1.-] [SP:CC28_YEAST]
# MSGELANYKRLEKVGEGTYGVVYKALDLRPGQGQRVVALKKIRLESEDEG
***************
*** 65,69 ****
# FERLCELLGYDNVFPLIINIKTKSNGGYQLCGSISIIKIEEELKSVGFER
# KTGDPLEWRRLFKKISTICRDIILIPN
! # END
#
# f = Bio::FastaFormat.new(f_str)
--- 65,69 ----
# FERLCELLGYDNVFPLIINIKTKSNGGYQLCGSISIIKIEEELKSVGFER
# KTGDPLEWRRLFKKISTICRDIILIPN
! # END_OF_STRING
#
# f = Bio::FastaFormat.new(f_str)
From ngoto at dev.open-bio.org Fri Jun 20 09:43:38 2008
From: ngoto at dev.open-bio.org (Naohisa Goto)
Date: Fri, 20 Jun 2008 13:43:38 +0000
Subject: [BioRuby-cvs] bioruby/lib/bio/db fasta.rb,1.28.2.2,1.28.2.3
Message-ID: <200806201343.m5KDhcUr021965@dev.open-bio.org>
Update of /home/repository/bioruby/bioruby/lib/bio/db
In directory dev.open-bio.org:/tmp/cvs-serv21945
Modified Files:
Tag: BRANCH-biohackathon2008
fasta.rb
Log Message:
Bio::FastaFormat#to_seq is renamed to to_biosequence with improvement.
The "to_seq" method is now an alias of to_biosequence.
Index: fasta.rb
===================================================================
RCS file: /home/repository/bioruby/bioruby/lib/bio/db/fasta.rb,v
retrieving revision 1.28.2.2
retrieving revision 1.28.2.3
diff -C2 -d -r1.28.2.2 -r1.28.2.3
*** fasta.rb 20 Jun 2008 13:30:14 -0000 1.28.2.2
--- fasta.rb 20 Jun 2008 13:43:36 -0000 1.28.2.3
***************
*** 28,31 ****
--- 28,32 ----
require 'bio/db'
require 'bio/sequence'
+ require 'bio/sequence/dblink'
require 'bio/db/fasta/defline'
***************
*** 217,226 ****
# because of efficiency.
#
! def to_seq
seq
obj = Bio::Sequence.new(@seq)
! obj.definition = self.definition
obj
end
# Parsing FASTA Defline, and extract IDs.
--- 218,243 ----
# because of efficiency.
#
! def to_biosequence
seq
obj = Bio::Sequence.new(@seq)
! d = self.identifiers
! # accessions
! obj.primary_accession = d.accessions.first
! obj.secondary_accessions = d.accessions[1..-1]
! # entry_id
! obj.entry_id = d.locus unless d.locus.to_s.empty?
! # GI
! other = []
! other.push Bio::Sequence::DBLink.new('GI', d.gi) if d.gi
! obj.other_seqids = other unless other.empty?
! # definition
! if d.accessions.empty? and other.empty? then
! obj.definition = self.definition
! else
! obj.definition = d.description
! end
obj
end
+ alias to_seq to_biosequence
# Parsing FASTA Defline, and extract IDs.
From ngoto at dev.open-bio.org Mon Jun 2 09:33:50 2008
From: ngoto at dev.open-bio.org (Naohisa Goto)
Date: Mon, 02 Jun 2008 09:33:50 +0000
Subject: [BioRuby-cvs] bioruby/lib/bio reference.rb,1.26,1.27
Message-ID: <200806020933.m529Xoou025921@dev.open-bio.org>
Update of /home/repository/bioruby/bioruby/lib/bio
In directory dev.open-bio.org:/tmp/cvs-serv25887
Modified Files:
reference.rb
Log Message:
reverted to 1.24, because of potential security problem about "eval" in
bibtex method.
Index: reference.rb
===================================================================
RCS file: /home/repository/bioruby/bioruby/lib/bio/reference.rb,v
retrieving revision 1.26
retrieving revision 1.27
diff -C2 -d -r1.26 -r1.27
*** reference.rb 31 May 2008 09:36:55 -0000 1.26
--- reference.rb 2 Jun 2008 09:33:48 -0000 1.27
***************
*** 71,74 ****
--- 71,77 ----
attr_reader :abstract
+ # An URL String.
+ attr_reader :url
+
# MeSH terms in an Array.
attr_reader :mesh
***************
*** 77,83 ****
attr_reader :affiliations
- # An URL String.
- attr_reader :url
-
# Create a new Bio::Reference object from a Hash of values.
# Data is extracted from the values for keys:
--- 80,83 ----
***************
*** 232,236 ****
lines << "%P #{@pages}" unless @pages.empty?
lines << "%M #{@pubmed}" unless @pubmed.to_s.empty?
! lines << "%U #{url}" unless url.empty?
lines << "%X #{@abstract}" unless @abstract.empty?
@mesh.each do |term|
--- 232,241 ----
lines << "%P #{@pages}" unless @pages.empty?
lines << "%M #{@pubmed}" unless @pubmed.to_s.empty?
! if @pubmed
! cgi = "http://www.ncbi.nlm.nih.gov/entrez/query.fcgi"
! opts = "cmd=Retrieve&db=PubMed&dopt=Citation&list_uids"
! @url = "#{cgi}?#{opts}=#{@pubmed}"
! end
! lines << "%U #{@url}" unless @url.empty?
lines << "%X #{@abstract}" unless @abstract.empty?
@mesh.each do |term|
***************
*** 294,321 ****
# *Arguments*:
# * (optional) _section_: BiBTeX section as String
- # * (optional) _keywords_: Array of additional keywords, e.g. ['abstract']
# *Returns*:: String
! def bibtex(section = nil, add_keywords = [])
section = "article" unless section
authors = authors_join(' and ', ' and ')
pages = @pages.sub('-', '--')
! keywords = "author title journal year volume number pages url".split(/ /)
! bib = "@#{section}{PMID:#{@pubmed},\n"
! (keywords+add_keywords).each do | kw |
! if kw == 'author'
! ref = authors
! elsif kw == 'title'
! # strip final dot from title
! ref = @title.sub(/\.$/,'')
! elsif kw == 'number'
! ref = @issue
! elsif kw == 'url'
! ref = url
! else
! ref = eval('@'+kw)
! end
! bib += " #{kw.ljust(12)} = {#{ref}},\n" if ref != ''
! end
! bib+"}\n"
end
--- 299,318 ----
# *Arguments*:
# * (optional) _section_: BiBTeX section as String
# *Returns*:: String
! def bibtex(section = nil)
section = "article" unless section
authors = authors_join(' and ', ' and ')
pages = @pages.sub('-', '--')
! return <<-"END".gsub(/\t/, '')
! @#{section}{PMID:#{@pubmed},
! author = {#{authors}},
! title = {#{@title}},
! journal = {#{@journal}},
! year = {#{@year}},
! volume = {#{@volume}},
! number = {#{@issue}},
! pages = {#{pages}},
! }
! END
end
***************
*** 503,518 ****
end
- # Returns a valid URL for pubmed records
- #
- # *Returns*:: String
- def url
- return @url if @url and @url != ''
- if @pubmed != ''
- cgi = "http://www.ncbi.nlm.nih.gov/entrez/query.fcgi"
- opts = "cmd=Retrieve&db=PubMed&dopt=Citation&list_uids"
- return "#{cgi}?#{opts}=#{@pubmed}"
- end
- ''
- end
private
--- 500,503 ----
***************
*** 542,546 ****
end
-
end
--- 527,530 ----
From ngoto at dev.open-bio.org Mon Jun 2 09:47:11 2008
From: ngoto at dev.open-bio.org (Naohisa Goto)
Date: Mon, 02 Jun 2008 09:47:11 +0000
Subject: [BioRuby-cvs] bioruby/lib/bio reference.rb,1.27,1.28
Message-ID: <200806020947.m529lBCN026079@dev.open-bio.org>
Update of /home/repository/bioruby/bioruby/lib/bio
In directory dev.open-bio.org:/tmp/cvs-serv26058/lib/bio
Modified Files:
reference.rb
Log Message:
* New method Bio::Reference#pubmed_url added (renamed the url method in
revision 1.25).
* Bio::Reference#endnote is changed not to overwrite url if url is
already given by user.
Index: reference.rb
===================================================================
RCS file: /home/repository/bioruby/bioruby/lib/bio/reference.rb,v
retrieving revision 1.27
retrieving revision 1.28
diff -C2 -d -r1.27 -r1.28
*** reference.rb 2 Jun 2008 09:33:48 -0000 1.27
--- reference.rb 2 Jun 2008 09:47:08 -0000 1.28
***************
*** 232,241 ****
lines << "%P #{@pages}" unless @pages.empty?
lines << "%M #{@pubmed}" unless @pubmed.to_s.empty?
! if @pubmed
! cgi = "http://www.ncbi.nlm.nih.gov/entrez/query.fcgi"
! opts = "cmd=Retrieve&db=PubMed&dopt=Citation&list_uids"
! @url = "#{cgi}?#{opts}=#{@pubmed}"
! end
! lines << "%U #{@url}" unless @url.empty?
lines << "%X #{@abstract}" unless @abstract.empty?
@mesh.each do |term|
--- 232,237 ----
lines << "%P #{@pages}" unless @pages.empty?
lines << "%M #{@pubmed}" unless @pubmed.to_s.empty?
! url = @url.empty? ? pubmed_url : @url
! lines << "%U #{url}" unless url.empty?
lines << "%X #{@abstract}" unless @abstract.empty?
@mesh.each do |term|
***************
*** 500,503 ****
--- 496,510 ----
end
+ # Returns a valid URL for pubmed records
+ #
+ # *Returns*:: String
+ def pubmed_url
+ unless @pubmed.to_s.empty?
+ cgi = "http://www.ncbi.nlm.nih.gov/entrez/query.fcgi"
+ opts = "cmd=Retrieve&db=PubMed&dopt=Citation&list_uids"
+ return "#{cgi}?#{opts}=#{@pubmed}"
+ end
+ ''
+ end
private
From ngoto at dev.open-bio.org Wed Jun 4 14:56:40 2008
From: ngoto at dev.open-bio.org (Naohisa Goto)
Date: Wed, 04 Jun 2008 14:56:40 +0000
Subject: [BioRuby-cvs] bioruby/lib/bio reference.rb,1.28,1.29
Message-ID: <200806041456.m54Eue8E001532@dev.open-bio.org>
Update of /home/repository/bioruby/bioruby/lib/bio
In directory dev.open-bio.org:/tmp/cvs-serv1512/lib/bio
Modified Files:
reference.rb
Log Message:
improvement of Bio::Reference#bibtex method
Index: reference.rb
===================================================================
RCS file: /home/repository/bioruby/bioruby/lib/bio/reference.rb,v
retrieving revision 1.28
retrieving revision 1.29
diff -C2 -d -r1.28 -r1.29
*** reference.rb 2 Jun 2008 09:47:08 -0000 1.28
--- reference.rb 4 Jun 2008 14:56:37 -0000 1.29
***************
*** 167,184 ****
# *Arguments*:
# * (optional) _style_: String with style identifier
! # * (optional) _option_: Option for styles accepting one
# *Returns*:: String
! def format(style = nil, option = nil)
case style
when 'endnote'
return endnote
when 'bibitem'
! return bibitem(option)
when 'bibtex'
! return bibtex(option)
when 'rd'
! return rd(option)
when /^nature$/i
! return nature(option)
when /^science$/i
return science
--- 167,184 ----
# *Arguments*:
# * (optional) _style_: String with style identifier
! # * (optional) _options_: Options for styles accepting one
# *Returns*:: String
! def format(style = nil, *options)
case style
when 'endnote'
return endnote
when 'bibitem'
! return bibitem(*options)
when 'bibtex'
! return bibtex(*options)
when 'rd'
! return rd(*options)
when /^nature$/i
! return nature(*options)
when /^science$/i
return science
***************
*** 295,314 ****
# *Arguments*:
# * (optional) _section_: BiBTeX section as String
# *Returns*:: String
! def bibtex(section = nil)
section = "article" unless section
authors = authors_join(' and ', ' and ')
! pages = @pages.sub('-', '--')
! return <<-"END".gsub(/\t/, '')
! @#{section}{PMID:#{@pubmed},
! author = {#{authors}},
! title = {#{@title}},
! journal = {#{@journal}},
! year = {#{@year}},
! volume = {#{@volume}},
! number = {#{@issue}},
! pages = {#{pages}},
! }
! END
end
--- 295,340 ----
# *Arguments*:
# * (optional) _section_: BiBTeX section as String
+ # * (optional) _label_: Label string cited by LaTeX documents.
+ # Default is "PMID:#{pubmed}".
+ # * (optional) _keywords_: Hash of additional keywords,
+ # e.g. { 'abstract' => 'This is abstract.' }.
+ # You can also override default keywords.
+ # To disable default keywords, specify false as
+ # value, e.g. { 'url' => false, 'year' => false }.
# *Returns*:: String
! def bibtex(section = nil, label = nil, keywords = {})
section = "article" unless section
authors = authors_join(' and ', ' and ')
! thepages = pages.to_s.empty? ? nil : pages.sub(/\-/, '--')
! unless label then
! label = "PMID:#{pubmed}"
! end
! theurl = if !(url.to_s.empty?) then
! url
! elsif pmurl = pubmed_url and !(pmurl.to_s.empty?) then
! pmurl
! else
! nil
! end
! hash = {
! 'author' => authors.empty? ? nil : authors,
! 'title' => title.to_s.empty? ? nil : title,
! 'number' => issue.to_s.empty? ? nil : issue,
! 'pages' => thepages,
! 'url' => theurl
! }
! keys = %w( author title journal year volume number pages url )
! keys.each do |k|
! hash[k] = self.__send__(k.intern) unless hash.has_key?(k)
! end
! hash.merge!(keywords) { |k, v1, v2| v2.nil? ? v1 : v2 }
! bib = [ "@#{section}{#{label}," ]
! keys.concat((hash.keys - keys).sort)
! keys.each do |kw|
! ref = hash[kw]
! bib.push " #{kw.ljust(12)} = {#{ref}}," if ref
! end
! bib.push "}\n"
! return bib.join("\n")
end
From ngoto at dev.open-bio.org Wed Jun 4 14:58:10 2008
From: ngoto at dev.open-bio.org (Naohisa Goto)
Date: Wed, 04 Jun 2008 14:58:10 +0000
Subject: [BioRuby-cvs] bioruby/test/unit/bio test_reference.rb,1.4,1.5
Message-ID: <200806041458.m54EwAo2001581@dev.open-bio.org>
Update of /home/repository/bioruby/bioruby/test/unit/bio
In directory dev.open-bio.org:/tmp/cvs-serv1561/test/unit/bio
Modified Files:
test_reference.rb
Log Message:
test changed due to the improvement of Bio::Reference#bibtex
Index: test_reference.rb
===================================================================
RCS file: /home/repository/bioruby/bioruby/test/unit/bio/test_reference.rb,v
retrieving revision 1.4
retrieving revision 1.5
diff -C2 -d -r1.4 -r1.5
*** test_reference.rb 31 May 2008 09:36:56 -0000 1.4
--- test_reference.rb 4 Jun 2008 14:58:08 -0000 1.5
***************
*** 103,112 ****
def test_format_bibtex
! str = "@article{PMID:12345678,\n author = {Hoge, J.P. and Fuga, F.B.},\n title = {Title of the study},\n journal = {Theor. J. Hoge},\n year = {2001},\n volume = {12},\n number = {3},\n pages = {123-145},\n url = {http://example.com},\n}\n"
!
assert_equal(str, @obj.format('bibtex'))
assert_equal(str, @obj.bibtex)
end
def test_format_rd
str = "== Title of the study.\n\n* Hoge, J.P. and Fuga, F.B.\n\n* Theor. J. Hoge 2001 12:123-145 [PMID:12345678]\n\nHoge fuga. hoge fuga."
--- 103,147 ----
def test_format_bibtex
! str =<<__END__
! @article{PMID:12345678,
! author = {Hoge, J.P. and Fuga, F.B.},
! title = {Title of the study.},
! journal = {Theor. J. Hoge},
! year = {2001},
! volume = {12},
! number = {3},
! pages = {123--145},
! url = {http://example.com},
! }
! __END__
assert_equal(str, @obj.format('bibtex'))
assert_equal(str, @obj.bibtex)
end
+ def test_format_bibtex_with_arguments
+ str =<<__END__
+ @inproceedings{YourArticle,
+ author = {Hoge, J.P. and Fuga, F.B.},
+ title = {Title of the study.},
+ year = {2001},
+ volume = {12},
+ number = {3},
+ pages = {123--145},
+ booktitle = {Theor. J. Hoge},
+ month = {December},
+ }
+ __END__
+ assert_equal(str, @obj.format('bibtex', 'inproceedings', 'YourArticle',
+ { 'journal' => false,
+ 'url' => false,
+ 'booktitle' => @obj.journal,
+ 'month' => 'December'}))
+ assert_equal(str, @obj.bibtex('inproceedings', 'YourArticle',
+ { 'journal' => false,
+ 'url' => false,
+ 'booktitle' => @obj.journal,
+ 'month' => 'December'}))
+ end
+
def test_format_rd
str = "== Title of the study.\n\n* Hoge, J.P. and Fuga, F.B.\n\n* Theor. J. Hoge 2001 12:123-145 [PMID:12345678]\n\nHoge fuga. hoge fuga."
From ngoto at dev.open-bio.org Fri Jun 13 11:20:26 2008
From: ngoto at dev.open-bio.org (Naohisa Goto)
Date: Fri, 13 Jun 2008 11:20:26 +0000
Subject: [BioRuby-cvs] bioruby/lib/bio reference.rb,1.29,1.30
Message-ID: <200806131120.m5DBKQLQ004888@dev.open-bio.org>
Update of /home/repository/bioruby/bioruby/lib/bio
In directory dev.open-bio.org:/tmp/cvs-serv4830/lib/bio
Modified Files:
reference.rb
Log Message:
modified RDoc for Bio::Reference#bibitem
Index: reference.rb
===================================================================
RCS file: /home/repository/bioruby/bioruby/lib/bio/reference.rb,v
retrieving revision 1.29
retrieving revision 1.30
diff -C2 -d -r1.29 -r1.30
*** reference.rb 4 Jun 2008 14:56:37 -0000 1.29
--- reference.rb 13 Jun 2008 11:20:23 -0000 1.30
***************
*** 252,255 ****
--- 252,257 ----
# {\em Theor. J. Hoge}, 12(3):123--145, 2001.
# ---
+ # *Arguments*:
+ # * (optional) _item_: label string (default: "PMID:#{pubmed}").
# *Returns*:: String
def bibitem(item = nil)
From ngoto at dev.open-bio.org Fri Jun 13 11:37:27 2008
From: ngoto at dev.open-bio.org (Naohisa Goto)
Date: Fri, 13 Jun 2008 11:37:27 +0000
Subject: [BioRuby-cvs]
bioruby/test/unit/bio/util/restriction_enzyme/double_stranded
test_aligned_strands.rb, 1.3, 1.4
Message-ID: <200806131137.m5DBbRnA005201@dev.open-bio.org>
Update of /home/repository/bioruby/bioruby/test/unit/bio/util/restriction_enzyme/double_stranded
In directory dev.open-bio.org:/tmp/cvs-serv5181/test/unit/bio/util/restriction_enzyme/double_stranded
Modified Files:
test_aligned_strands.rb
Log Message:
"require 'bio/sequence'" is needed to run the tests in this file.
Index: test_aligned_strands.rb
===================================================================
RCS file: /home/repository/bioruby/bioruby/test/unit/bio/util/restriction_enzyme/double_stranded/test_aligned_strands.rb,v
retrieving revision 1.3
retrieving revision 1.4
diff -C2 -d -r1.3 -r1.4
*** test_aligned_strands.rb 5 Apr 2007 23:35:44 -0000 1.3
--- test_aligned_strands.rb 13 Jun 2008 11:37:25 -0000 1.4
***************
*** 14,17 ****
--- 14,18 ----
require 'test/unit'
+ require 'bio/sequence'
require 'bio/util/restriction_enzyme/double_stranded/aligned_strands'
require 'bio/util/restriction_enzyme/double_stranded'
From ngoto at dev.open-bio.org Fri Jun 13 11:39:41 2008
From: ngoto at dev.open-bio.org (Naohisa Goto)
Date: Fri, 13 Jun 2008 11:39:41 +0000
Subject: [BioRuby-cvs]
bioruby/test/unit/bio/util/restriction_enzyme/double_stranded
test_aligned_strands.rb, 1.3, 1.3.2.1
Message-ID: <200806131139.m5DBdfXW005450@dev.open-bio.org>
Update of /home/repository/bioruby/bioruby/test/unit/bio/util/restriction_enzyme/double_stranded
In directory dev.open-bio.org:/tmp/cvs-serv5209/test/unit/bio/util/restriction_enzyme/double_stranded
Modified Files:
Tag: BRANCH-biohackathon2008
test_aligned_strands.rb
Log Message:
merged change from rev. 1.3 to 1.4 in the CVS trunk
Index: test_aligned_strands.rb
===================================================================
RCS file: /home/repository/bioruby/bioruby/test/unit/bio/util/restriction_enzyme/double_stranded/test_aligned_strands.rb,v
retrieving revision 1.3
retrieving revision 1.3.2.1
diff -C2 -d -r1.3 -r1.3.2.1
*** test_aligned_strands.rb 5 Apr 2007 23:35:44 -0000 1.3
--- test_aligned_strands.rb 13 Jun 2008 11:39:39 -0000 1.3.2.1
***************
*** 14,17 ****
--- 14,18 ----
require 'test/unit'
+ require 'bio/sequence'
require 'bio/util/restriction_enzyme/double_stranded/aligned_strands'
require 'bio/util/restriction_enzyme/double_stranded'
From ngoto at dev.open-bio.org Tue Jun 17 12:23:52 2008
From: ngoto at dev.open-bio.org (Naohisa Goto)
Date: Tue, 17 Jun 2008 12:23:52 +0000
Subject: [BioRuby-cvs] bioruby/lib/bio reference.rb,1.24.2.6,1.24.2.7
Message-ID: <200806171223.m5HCNqfC020085@dev.open-bio.org>
Update of /home/repository/bioruby/bioruby/lib/bio
In directory dev.open-bio.org:/tmp/cvs-serv20065/lib/bio
Modified Files:
Tag: BRANCH-biohackathon2008
reference.rb
Log Message:
merged changes in trunk (revision 1.30)
Index: reference.rb
===================================================================
RCS file: /home/repository/bioruby/bioruby/lib/bio/reference.rb,v
retrieving revision 1.24.2.6
retrieving revision 1.24.2.7
diff -C2 -d -r1.24.2.6 -r1.24.2.7
*** reference.rb 23 Apr 2008 18:52:18 -0000 1.24.2.6
--- reference.rb 17 Jun 2008 12:23:49 -0000 1.24.2.7
***************
*** 180,186 ****
# *Arguments*:
# * (optional) _style_: String with style identifier
! # * (optional) _option_: Option for styles accepting one
# *Returns*:: String
! def format(style = nil, option = nil)
case style
when 'embl'
--- 180,186 ----
# *Arguments*:
# * (optional) _style_: String with style identifier
! # * (optional) _options_: Options for styles accepting one
# *Returns*:: String
! def format(style = nil, *options)
case style
when 'embl'
***************
*** 189,199 ****
return endnote
when 'bibitem'
! return bibitem(option)
when 'bibtex'
! return bibtex(option)
when 'rd'
! return rd(option)
when /^nature$/i
! return nature(option)
when /^science$/i
return science
--- 189,199 ----
return endnote
when 'bibitem'
! return bibitem(*options)
when 'bibtex'
! return bibtex(*options)
when 'rd'
! return rd(*options)
when /^nature$/i
! return nature(*options)
when /^science$/i
return science
***************
*** 247,256 ****
lines << "%P #{@pages}" unless @pages.empty?
lines << "%M #{@pubmed}" unless @pubmed.to_s.empty?
! if @pubmed
! cgi = "http://www.ncbi.nlm.nih.gov/entrez/query.fcgi"
! opts = "cmd=Retrieve&db=PubMed&dopt=Citation&list_uids"
! @url = "#{cgi}?#{opts}=#{@pubmed}"
! end
! lines << "%U #{@url}" unless @url.empty?
lines << "%X #{@abstract}" unless @abstract.empty?
@mesh.each do |term|
--- 247,252 ----
lines << "%P #{@pages}" unless @pages.empty?
lines << "%M #{@pubmed}" unless @pubmed.to_s.empty?
! u = @url.empty? ? pubmed_url : @url
! lines << "%U #{u}" unless u.empty?
lines << "%X #{@abstract}" unless @abstract.empty?
@mesh.each do |term|
***************
*** 289,292 ****
--- 285,290 ----
# {\em Theor. J. Hoge}, 12(3):123--145, 2001.
# ---
+ # *Arguments*:
+ # * (optional) _item_: label string (default: "PMID:#{pubmed}").
# *Returns*:: String
def bibitem(item = nil)
***************
*** 332,351 ****
# *Arguments*:
# * (optional) _section_: BiBTeX section as String
# *Returns*:: String
! def bibtex(section = nil)
section = "article" unless section
authors = authors_join(' and ', ' and ')
! pages = @pages.sub('-', '--')
! return <<-"END".gsub(/\t/, '')
! @#{section}{PMID:#{@pubmed},
! author = {#{authors}},
! title = {#{@title}},
! journal = {#{@journal}},
! year = {#{@year}},
! volume = {#{@volume}},
! number = {#{@issue}},
! pages = {#{pages}},
! }
! END
end
--- 330,375 ----
# *Arguments*:
# * (optional) _section_: BiBTeX section as String
+ # * (optional) _label_: Label string cited by LaTeX documents.
+ # Default is "PMID:#{pubmed}".
+ # * (optional) _keywords_: Hash of additional keywords,
+ # e.g. { 'abstract' => 'This is abstract.' }.
+ # You can also override default keywords.
+ # To disable default keywords, specify false as
+ # value, e.g. { 'url' => false, 'year' => false }.
# *Returns*:: String
! def bibtex(section = nil, label = nil, keywords = {})
section = "article" unless section
authors = authors_join(' and ', ' and ')
! thepages = pages.to_s.empty? ? nil : pages.sub(/\-/, '--')
! unless label then
! label = "PMID:#{pubmed}"
! end
! theurl = if !(url.to_s.empty?) then
! url
! elsif pmurl = pubmed_url and !(pmurl.to_s.empty?) then
! pmurl
! else
! nil
! end
! hash = {
! 'author' => authors.empty? ? nil : authors,
! 'title' => title.to_s.empty? ? nil : title,
! 'number' => issue.to_s.empty? ? nil : issue,
! 'pages' => thepages,
! 'url' => theurl
! }
! keys = %w( author title journal year volume number pages url )
! keys.each do |k|
! hash[k] = self.__send__(k.intern) unless hash.has_key?(k)
! end
! hash.merge!(keywords) { |k, v1, v2| v2.nil? ? v1 : v2 }
! bib = [ "@#{section}{#{label}," ]
! keys.concat((hash.keys - keys).sort)
! keys.each do |kw|
! ref = hash[kw]
! bib.push " #{kw.ljust(12)} = {#{ref}}," if ref
! end
! bib.push "}\n"
! return bib.join("\n")
end
***************
*** 533,536 ****
--- 557,571 ----
end
+ # Returns a valid URL for pubmed records
+ #
+ # *Returns*:: String
+ def pubmed_url
+ unless @pubmed.to_s.empty?
+ cgi = "http://www.ncbi.nlm.nih.gov/entrez/query.fcgi"
+ opts = "cmd=Retrieve&db=PubMed&dopt=Citation&list_uids"
+ return "#{cgi}?#{opts}=#{@pubmed}"
+ end
+ ''
+ end
private
From ngoto at dev.open-bio.org Tue Jun 17 12:24:44 2008
From: ngoto at dev.open-bio.org (Naohisa Goto)
Date: Tue, 17 Jun 2008 12:24:44 +0000
Subject: [BioRuby-cvs] bioruby/test/unit/bio test_reference.rb, 1.3.2.1,
1.3.2.2
Message-ID: <200806171224.m5HCOiAk020113@dev.open-bio.org>
Update of /home/repository/bioruby/bioruby/test/unit/bio
In directory dev.open-bio.org:/tmp/cvs-serv20093/test/unit/bio
Modified Files:
Tag: BRANCH-biohackathon2008
test_reference.rb
Log Message:
merged changes from trunk (revision 1.5)
Index: test_reference.rb
===================================================================
RCS file: /home/repository/bioruby/bioruby/test/unit/bio/test_reference.rb,v
retrieving revision 1.3.2.1
retrieving revision 1.3.2.2
diff -C2 -d -r1.3.2.1 -r1.3.2.2
*** test_reference.rb 8 May 2008 05:38:01 -0000 1.3.2.1
--- test_reference.rb 17 Jun 2008 12:24:41 -0000 1.3.2.2
***************
*** 92,96 ****
def test_format_endnote
! str = "%0 Journal Article\n%A Hoge, J.P.\n%A Fuga, F.B.\n%D 2001\n%T Title of the study.\n%J Theor. J. Hoge\n%V 12\n%N 3\n%P 123-145\n%M 12345678\n%U http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?cmd=Retrieve&db=PubMed&dopt=Citation&list_uids=12345678\n%X Hoge fuga. hoge fuga.\n%K Hoge\n%+ Tokyo"
assert_equal(str, @obj.format('endnote'))
assert_equal(str, @obj.endnote)
--- 92,96 ----
def test_format_endnote
! str = "%0 Journal Article\n%A Hoge, J.P.\n%A Fuga, F.B.\n%D 2001\n%T Title of the study.\n%J Theor. J. Hoge\n%V 12\n%N 3\n%P 123-145\n%M 12345678\n%U http://example.com\n%X Hoge fuga. hoge fuga.\n%K Hoge\n%+ Tokyo"
assert_equal(str, @obj.format('endnote'))
assert_equal(str, @obj.endnote)
***************
*** 104,122 ****
def test_format_bibtex
! str =< false,
+ 'url' => false,
+ 'booktitle' => @obj.journal,
+ 'month' => 'December'}))
+ assert_equal(str, @obj.bibtex('inproceedings', 'YourArticle',
+ { 'journal' => false,
+ 'url' => false,
+ 'booktitle' => @obj.journal,
+ 'month' => 'December'}))
+ end
+
def test_format_rd
str = "== Title of the study.\n\n* Hoge, J.P. and Fuga, F.B.\n\n* Theor. J. Hoge 2001 12:123-145 [PMID:12345678]\n\nHoge fuga. hoge fuga."
From ngoto at dev.open-bio.org Tue Jun 17 15:25:24 2008
From: ngoto at dev.open-bio.org (Naohisa Goto)
Date: Tue, 17 Jun 2008 15:25:24 +0000
Subject: [BioRuby-cvs] bioruby/lib/bio sequence.rb,0.58.2.11,0.58.2.12
Message-ID: <200806171525.m5HFPOpk020858@dev.open-bio.org>
Update of /home/repository/bioruby/bioruby/lib/bio
In directory dev.open-bio.org:/tmp/cvs-serv20823/lib/bio
Modified Files:
Tag: BRANCH-biohackathon2008
sequence.rb
Log Message:
* Some attributes are added: strandedness (strand information),
release_created, release_modified (release information),
entry_version (version of the entry numbered by database administrator),
organelle (organelle information), other_seqids (sequence IDs other than
accessions), and id_namespace (namespace of accessions).
Most of them are added because corresponding tags are defined in the
INSDSeq XML v1.4 ( http://www.insdc.org/files/documents/INSD_V1.4.dtd ).
The "id_namespace" will be used to output NCBI style fasta format.
* The "taxonomy" attribute is changed to be an alias of the "classification"
attribute.
* The "date" attribute is removed.
* RDoc documents of attributes are updated.
Index: sequence.rb
===================================================================
RCS file: /home/repository/bioruby/bioruby/lib/bio/sequence.rb,v
retrieving revision 0.58.2.11
retrieving revision 0.58.2.12
diff -C2 -d -r0.58.2.11 -r0.58.2.12
*** sequence.rb 24 Apr 2008 14:28:25 -0000 0.58.2.11
--- sequence.rb 17 Jun 2008 15:25:22 -0000 0.58.2.12
***************
*** 118,149 ****
end
! # The sequence identifier. For example, for a sequence
! # of Genbank origin, this is the accession number.
attr_accessor :entry_id
! # A String with a description of the sequence
attr_accessor :definition
! # An Array of Bio::Feature objects
attr_accessor :features
! # An Array of Bio::Reference objects
attr_accessor :references
! # A comment String
attr_accessor :comments
! # Date from sequence source. Often date of deposition.
! attr_accessor :date
!
! # An Array of Strings
attr_accessor :keywords
! # An Array of Strings; links to other database entries.
attr_accessor :dblinks
!
! # A taxonomy String
! attr_accessor :taxonomy
!
# Bio::Sequence::NA/AA
attr_accessor :moltype
--- 118,145 ----
end
! # The sequence identifier (String). For example, for a sequence
! # of Genbank origin, this is the locus name.
! # For a sequence of EMBL origin, this is the primary accession number.
attr_accessor :entry_id
! # A String with a description of the sequence (String)
attr_accessor :definition
! # Features (An Array of Bio::Feature objects)
attr_accessor :features
! # References (An Array of Bio::Reference objects)
attr_accessor :references
! # Comments (String or an Array of String)
attr_accessor :comments
! # Keywords (An Array of String)
attr_accessor :keywords
! # Links to other database entries.
! # (An Array of Bio::Sequence::DBLink objects)
attr_accessor :dblinks
!
# Bio::Sequence::NA/AA
attr_accessor :moltype
***************
*** 157,166 ****
#+++
! # Version number of the sequence (String).
attr_accessor :sequence_version
! # Topology (String). "circular" or "linear".
attr_accessor :topology
# molecular type (String). "DNA" or "RNA" for nucleotide sequence.
attr_accessor :molecule_type
--- 153,170 ----
#+++
! # Version number of the sequence (String or Integer).
! # Unlike entry_version, sequence_version will be changed
! # when the submitter of the sequence updates the entry.
! # Normally, the same entry taken from different databases (EMBL, GenBank,
! # and DDBJ) may have the same sequence_version.
attr_accessor :sequence_version
! # Topology (String). "circular", "linear", or nil.
attr_accessor :topology
+ # Strandedness (String). "single" (single-stranded),
+ # "double" (double-stranded), "mixed" (mixed-stranded), or nil.
+ attr_accessor :strandedness
+
# molecular type (String). "DNA" or "RNA" for nucleotide sequence.
attr_accessor :molecule_type
***************
*** 180,189 ****
attr_accessor :secondary_accessions
! # Created date of the sequence entry (String)
attr_accessor :date_created
! # Last modified date of the sequence entry (String)
attr_accessor :date_modified
# Organism species (String). For example, "Escherichia coli".
attr_accessor :species
--- 184,208 ----
attr_accessor :secondary_accessions
! # Created date of the sequence entry (Date, DateTime, Time, or String)
attr_accessor :date_created
! # Last modified date of the sequence entry (Date, DateTime, Time, or String)
attr_accessor :date_modified
+ # Release information when created (String)
+ attr_accessor :release_created
+
+ # Release information when last-modified (String)
+ attr_accessor :release_modified
+
+ # Version of the entry (String or Integer).
+ # Unlike sequence_version, entry_version is a database
+ # maintainer's internal version number.
+ # The version number will be changed when the database maintainer
+ # modifies the entry.
+ # The same enrty in EMBL, GenBank, and DDBJ may have different
+ # entry_version.
+ attr_accessor :entry_version
+
# Organism species (String). For example, "Escherichia coli".
attr_accessor :species
***************
*** 192,195 ****
--- 211,231 ----
# (Array of String)
attr_accessor :classification
+ alias taxonomy classification
+
+ # (not well supported) Organelle information (String).
+ attr_accessor :organelle
+
+ # Namespace of the sequence IDs described in entry_id, primary_accession,
+ # and secondary_accessions methods (String).
+ # For example, 'EMBL', 'GenBank', 'DDBJ', 'RefSeq'.
+ attr_accessor :id_namespace
+
+ # Sequence identifiers which are not described in entry_id,
+ # primary_accession,and secondary_accessions methods
+ # (Array of Bio::Sequence::DBLink objects).
+ # For example, NCBI GI number can be stored.
+ # Note that only identifiers of the entry itself should be stored.
+ # For database cross references, dblinks should be used.
+ attr_accessor :other_seqids
# Guess the type of sequence, Amino Acid or Nucleic Acid, and create a
From ngoto at dev.open-bio.org Tue Jun 17 15:44:24 2008
From: ngoto at dev.open-bio.org (Naohisa Goto)
Date: Tue, 17 Jun 2008 15:44:24 +0000
Subject: [BioRuby-cvs] bioruby/test/unit/bio/sequence test_dblink.rb, NONE,
1.1.2.1
Message-ID: <200806171544.m5HFiOIl021028@dev.open-bio.org>
Update of /home/repository/bioruby/bioruby/test/unit/bio/sequence
In directory dev.open-bio.org:/tmp/cvs-serv21001/test/unit/bio/sequence
Added Files:
Tag: BRANCH-biohackathon2008
test_dblink.rb
Log Message:
New class Bio::Sequence::DBLink is added to store IDs and database names
together in an object.
--- NEW FILE: test_dblink.rb ---
#
# test/unit/bio/sequence/test_dblink.rb - Unit test for Bio::Sequence::DBLink
#
# Copyright:: Copyright (C) 2008 Naohisa Goto
# License:: The Ruby License
#
# $Id: test_dblink.rb,v 1.1.2.1 2008/06/17 15:44:22 ngoto Exp $
#
require 'pathname'
libpath = Pathname.new(File.join(File.dirname(__FILE__), ['..'] * 4, 'lib')).cleanpath.to_s
$:.unshift(libpath) unless $:.include?(libpath)
require 'test/unit'
require 'bio/sequence'
require 'bio/sequence/dblink'
module Bio

  # Unit tests for the reader methods of a Bio::Sequence::DBLink object
  # that was built directly with DBLink.new.
  class TestSequenceDBLink < Test::Unit::TestCase

    # Builds the cross-reference shared by every test in this class:
    # database 'EMBL', primary ID 'Z14088', and three secondary IDs.
    def setup
      secondary = [ 'CAA78466.1', '-', 'mRNA' ]
      @link = Bio::Sequence::DBLink.new('EMBL', 'Z14088', *secondary)
    end

    # The first constructor argument is exposed as #database.
    def test_database
      assert_equal('EMBL', @link.database)
    end

    # The second constructor argument is exposed as #id.
    def test_id
      assert_equal('Z14088', @link.id)
    end

    # All remaining constructor arguments are exposed, in order,
    # as #secondary_ids.
    def test_secondary_ids
      expected = [ 'CAA78466.1', '-', 'mRNA' ]
      assert_equal(expected, @link.secondary_ids)
    end

  end #class

  # Unit tests for the DR-line parsing class methods of
  # Bio::Sequence::DBLink.
  class TestSequenceDBLinkClassMethods < Test::Unit::TestCase

    # An EMBL DR line yields the database, the primary ID, and a single
    # secondary ID with the trailing period removed.
    def test_parse_embl_DR_line
      line = 'DR EPD; EP07077; HS_HBG1.'
      link = Bio::Sequence::DBLink.parse_embl_DR_line(line)
      assert_equal('EPD', link.database)
      assert_equal('EP07077', link.id)
      expected = [ 'HS_HBG1' ]
      assert_equal(expected, link.secondary_ids)
    end

    # A UniProt DR line yields the database, the primary ID, and one
    # secondary ID per remaining semicolon-separated field.
    def test_parse_uniprot_DR_line
      line = 'DR EMBL; Z14088; CAA78466.1; -; mRNA.'
      link = Bio::Sequence::DBLink.parse_uniprot_DR_line(line)
      assert_equal('EMBL', link.database)
      assert_equal('Z14088', link.id)
      expected = [ 'CAA78466.1', '-', 'mRNA' ]
      assert_equal(expected, link.secondary_ids)
    end

  end #class

end #module Bio
From ngoto at dev.open-bio.org Tue Jun 17 15:44:24 2008
From: ngoto at dev.open-bio.org (Naohisa Goto)
Date: Tue, 17 Jun 2008 15:44:24 +0000
Subject: [BioRuby-cvs] bioruby/lib/bio/sequence dblink.rb,NONE,1.1.2.1
Message-ID: <200806171544.m5HFiOF6021023@dev.open-bio.org>
Update of /home/repository/bioruby/bioruby/lib/bio/sequence
In directory dev.open-bio.org:/tmp/cvs-serv21001/lib/bio/sequence
Added Files:
Tag: BRANCH-biohackathon2008
dblink.rb
Log Message:
New class Bio::Sequence::DBLink is added to store IDs and database names
together in an object.
--- NEW FILE: dblink.rb ---
#
# = bio/sequence/dblink.rb - sequence ID with database name
#
# Copyright:: Copyright (C) 2008
# Naohisa Goto
# License:: The Ruby License
#
# $Id: dblink.rb,v 1.1.2.1 2008/06/17 15:44:22 ngoto Exp $
#
require 'bio/sequence'
# Bio::Sequence::DBLink stores IDs with the database name.
# Its main purpose is to store database cross-reference information
# for a sequence entry.
#
# A DBLink holds one database name (or namespace), one primary
# identifier, and zero or more secondary identifiers.
class Bio::Sequence::DBLink

  # Database name, or namespace identifier (String).
  attr_reader :database

  # Primary identifier (String)
  attr_reader :id

  # Secondary identifiers (Array of String)
  attr_reader :secondary_ids

  # Creates a new DBLink object.
  # ---
  # *Arguments*:
  # * (required) _database_: database name or namespace identifier
  # * (required) _primary_id_: primary identifier
  # * (optional) _secondary_ids_: any number of secondary identifiers
  def initialize(database, primary_id, *secondary_ids)
    @database = database
    @id = primary_id
    @secondary_ids = secondary_ids
  end

  #--
  # class methods
  #++

  # Parses DR line in EMBL entry, and returns a DBLink object.
  #
  # The line is split into at most three fields (database, primary ID,
  # and one secondary ID), so any text after the second semicolon is
  # kept together in the single secondary ID.
  # ---
  # *Arguments*:
  # * (required) _str_: DR line as String (not modified)
  # *Returns*:: Bio::Sequence::DBLink object
  #
  # Raises ArgumentError (from DBLink.new) when the line contains fewer
  # than two fields.
  def self.parse_embl_DR_line(str)
    new(*split_DR_line(str, 3))
  end

  # Parses DR line in UniProt entry, and returns a DBLink object.
  #
  # Every semicolon-separated field after the primary ID becomes one
  # secondary ID.
  # ---
  # *Arguments*:
  # * (required) _str_: DR line as String (not modified)
  # *Returns*:: Bio::Sequence::DBLink object
  #
  # Raises ArgumentError (from DBLink.new) when the line contains fewer
  # than two fields.
  def self.parse_uniprot_DR_line(str)
    new(*split_DR_line(str))
  end

  # Shared clean-up for DR lines: removes the trailing period (and any
  # whitespace after it), removes the leading "DR" line code, and splits
  # the rest on semicolons, ignoring whitespace around each semicolon.
  #
  # The line code is stripped together with ANY run of following
  # whitespace, so the width of the gap between "DR" and the database
  # name does not leak into the database field.
  #
  # _limit_ is passed to String#split; 0 means "no limit".
  def self.split_DR_line(str, limit = 0)
    body = str.sub(/\.\s*\z/, '').sub(/\ADR\s+/, '')
    body.split(/\s*\;\s*/, limit)
  end
  private_class_method :split_DR_line

end #class Bio::Sequence::DBLink
From ngoto at dev.open-bio.org Tue Jun 17 15:50:07 2008
From: ngoto at dev.open-bio.org (Naohisa Goto)
Date: Tue, 17 Jun 2008 15:50:07 +0000
Subject: [BioRuby-cvs] bioruby/lib/bio/sequence format.rb,1.4.2.7,1.4.2.8
Message-ID: <200806171550.m5HFo7Jm021095@dev.open-bio.org>
Update of /home/repository/bioruby/bioruby/lib/bio/sequence
In directory dev.open-bio.org:/tmp/cvs-serv21057/lib/bio/sequence
Modified Files:
Tag: BRANCH-biohackathon2008
format.rb
Log Message:
* In the wrap method, changed to recognize "\n" in given string.
* Some helper methods are added to help formatting date string.
Index: format.rb
===================================================================
RCS file: /home/repository/bioruby/bioruby/lib/bio/sequence/format.rb,v
retrieving revision 1.4.2.7
retrieving revision 1.4.2.8
diff -C2 -d -r1.4.2.7 -r1.4.2.8
*** format.rb 4 Mar 2008 11:10:28 -0000 1.4.2.7
--- format.rb 17 Jun 2008 15:50:05 -0000 1.4.2.8
***************
*** 285,305 ****
def wrap_and_split_lines(str, width)
result = []
! left = str.dup
! while left and left.length > width
! line = nil
! width.downto(1) do |i|
! if left[i..i] == ' ' or /[\,\;]/ =~ left[(i-1)..(i-1)] then
! line = left[0..(i-1)].sub(/ +\z/, '')
! left = left[i..-1].sub(/\A +/, '')
! break
end
end
! if line.nil? then
! line = left[0..(width-1)]
! left = left[width..-1]
! end
! result << line
end
- result << left if left and !(left.to_s.empty?)
return result
end
--- 285,309 ----
def wrap_and_split_lines(str, width)
result = []
! lefts = str.chomp.split(/(?:\r\n|\r|\n)/)
! lefts.each do |left|
! left.rstrip!
! while left and left.length > width
! line = nil
! width.downto(1) do |i|
! if left[i..i] == ' ' or /[\,\;]/ =~ left[(i-1)..(i-1)] then
! line = left[0..(i-1)].sub(/ +\z/, '')
! left = left[i..-1].sub(/\A +/, '')
! break
! end
end
+ if line.nil? then
+ line = left[0..(width-1)]
+ left = left[width..-1]
+ end
+ result << line
+ left = nil if left.to_s.empty?
end
! result << left if left
end
return result
end
***************
*** 320,323 ****
--- 324,352 ----
end
+ #--
+ # internal use only
+ MonthStr = [ nil,
+ 'JAN', 'FEB', 'MAR', 'APR', 'MAY', 'JUN',
+ 'JUL', 'AUG', 'SEP', 'OCT', 'NOV', 'DEC'
+ ].collect { |x| x.freeze }.freeze
+ #++
+
+ # formats a date from Date, DateTime, or Time object, or String.
+ def format_date(d)
+ begin
+ yy = d.year
+ mm = d.month
+ dd = d.day
+ rescue NoMethodError, NameError, ArgumentError, TypeError
+ return sprintf("%-11s", d)
+ end
+ sprintf("%02d-%-3s-%04d", dd, MonthStr[mm], yy)
+ end
+
+ # null date
+ def null_date
+ Date.new(0, 1, 1)
+ end
+
end #module INSDFeatureHelper
From ngoto at dev.open-bio.org Tue Jun 17 15:53:23 2008
From: ngoto at dev.open-bio.org (Naohisa Goto)
Date: Tue, 17 Jun 2008 15:53:23 +0000
Subject: [BioRuby-cvs] bioruby/lib/bio/db/genbank common.rb, 1.11.2.4,
1.11.2.5
Message-ID: <200806171553.m5HFrNlb021165@dev.open-bio.org>
Update of /home/repository/bioruby/bioruby/lib/bio/db/genbank
In directory dev.open-bio.org:/tmp/cvs-serv21145/lib/bio/db/genbank
Modified Files:
Tag: BRANCH-biohackathon2008
common.rb
Log Message:
Bio::GenBank#comment (and Bio::GenPept#comment) is changed not to remove
newlines inside the comment.
Index: common.rb
===================================================================
RCS file: /home/repository/bioruby/bioruby/lib/bio/db/genbank/common.rb,v
retrieving revision 1.11.2.4
retrieving revision 1.11.2.5
diff -C2 -d -r1.11.2.4 -r1.11.2.5
*** common.rb 7 May 2008 12:25:42 -0000 1.11.2.4
--- common.rb 17 Jun 2008 15:53:21 -0000 1.11.2.5
***************
*** 196,200 ****
# COMMENT -- Returns contents of the COMMENT record as a String.
def comment
! field_fetch('COMMENT')
end
--- 196,203 ----
# COMMENT -- Returns contents of the COMMENT record as a String.
def comment
! str = get('COMMENT').to_s.sub(/\ACOMMENT /, '')
! str.gsub!(/^ {12}/, '')
! str.chomp!
! str
end
From ngoto at dev.open-bio.org Tue Jun 17 15:56:20 2008
From: ngoto at dev.open-bio.org (Naohisa Goto)
Date: Tue, 17 Jun 2008 15:56:20 +0000
Subject: [BioRuby-cvs] bioruby/lib/bio/db/genbank genbank.rb, 0.40.2.3,
0.40.2.4
Message-ID: <200806171556.m5HFuKdb021193@dev.open-bio.org>
Update of /home/repository/bioruby/bioruby/lib/bio/db/genbank
In directory dev.open-bio.org:/tmp/cvs-serv21173/lib/bio/db/genbank
Modified Files:
Tag: BRANCH-biohackathon2008
genbank.rb
Log Message:
* Bio::GenBank#to_biosequence is changed to improve support of sequence output
and data exchange.
* Bio::GenBank#date_modified is added. It returns a Date object.
Index: genbank.rb
===================================================================
RCS file: /home/repository/bioruby/bioruby/lib/bio/db/genbank/genbank.rb,v
retrieving revision 0.40.2.3
retrieving revision 0.40.2.4
diff -C2 -d -r0.40.2.3 -r0.40.2.4
*** genbank.rb 4 Mar 2008 09:22:35 -0000 0.40.2.3
--- genbank.rb 17 Jun 2008 15:56:18 -0000 0.40.2.4
***************
*** 8,13 ****
--- 8,16 ----
#
+ require 'date'
require 'bio/db'
require 'bio/db/genbank/common'
+ require 'bio/sequence'
+ require 'bio/sequence/dblink'
module Bio
***************
*** 122,129 ****
--- 125,142 ----
alias nalen length
+ # (obsolete???) length of the sequence
def seq_len
seq.length
end
+ # modified date. Returns Date object, String or nil.
+ def date_modified
+ begin
+ Date.parse(self.date)
+ rescue ArgumentError, TypeError, NoMethodError, NameError
+ self.date
+ end
+ end
+
# converts Bio::GenBank to Bio::Sequence
# ---
***************
*** 132,135 ****
--- 145,156 ----
def to_biosequence
sequence = Bio::Sequence.new(seq)
+
+ sequence.id_namespace =
+ if /\_/ =~ self.accession.to_s then
+ 'RefSeq'
+ else
+ 'GenBank'
+ end
+
sequence.entry_id = self.entry_id
***************
*** 137,147 ****
sequence.secondary_accessions = self.accessions - [ self.accession ]
sequence.molecule_type = self.natype
sequence.division = self.division
sequence.topology = self.circular
sequence.sequence_version = self.version
#sequence.date_created = nil #????
! sequence.date_modified = self.date
sequence.definition = self.definition
--- 158,177 ----
sequence.secondary_accessions = self.accessions - [ self.accession ]
+ if /GI\:(.+)/ =~ self.gi.to_s then
+ sequence.other_seqids = [ Bio::Sequence::DBLink.new('GI', $1) ]
+ end
+
sequence.molecule_type = self.natype
sequence.division = self.division
sequence.topology = self.circular
+ sequence.strandedness = case self.strand.to_s.downcase;
+ when 'ss-'; 'single';
+ when 'ds-'; 'double';
+ when 'ms-'; 'mixed';
+ else nil; end
sequence.sequence_version = self.version
#sequence.date_created = nil #????
! sequence.date_modified = date_modified
sequence.definition = self.definition
***************
*** 149,153 ****
sequence.species = self.organism
sequence.classification = self.taxonomy.to_s.sub(/\.\z/, '').split(/\s*\;\s*/)
! #sequence.organnella = nil # not used
sequence.comments = self.comment
sequence.references = self.references
--- 179,183 ----
sequence.species = self.organism
sequence.classification = self.taxonomy.to_s.sub(/\.\z/, '').split(/\s*\;\s*/)
! #sequence.organelle = nil # yet unsupported
sequence.comments = self.comment
sequence.references = self.references
From ngoto at dev.open-bio.org Tue Jun 17 15:59:26 2008
From: ngoto at dev.open-bio.org (Naohisa Goto)
Date: Tue, 17 Jun 2008 15:59:26 +0000
Subject: [BioRuby-cvs] bioruby/lib/bio/db/genbank format_genbank.rb, 1.1.2.4,
1.1.2.5
Message-ID: <200806171559.m5HFxQa4021221@dev.open-bio.org>
Update of /home/repository/bioruby/bioruby/lib/bio/db/genbank
In directory dev.open-bio.org:/tmp/cvs-serv21201/lib/bio/db/genbank
Modified Files:
Tag: BRANCH-biohackathon2008
format_genbank.rb
Log Message:
* Added support for COMMENT.
* Added support for GI number output.
* Many improvements are added.
Index: format_genbank.rb
===================================================================
RCS file: /home/repository/bioruby/bioruby/lib/bio/db/genbank/Attic/format_genbank.rb,v
retrieving revision 1.1.2.4
retrieving revision 1.1.2.5
diff -C2 -d -r1.1.2.4 -r1.1.2.5
*** format_genbank.rb 28 May 2008 13:26:33 -0000 1.1.2.4
--- format_genbank.rb 17 Jun 2008 15:59:24 -0000 1.1.2.5
***************
*** 101,104 ****
--- 101,115 ----
end
+ # formats comments lines as GenBank
+ def comments_format_genbank(cmnts)
+ return '' if !cmnts or cmnts.empty?
+ cmnts = [ cmnts ] unless cmnts.kind_of?(Array)
+ a = []
+ cmnts.each do |str|
+ a.push "COMMENT #{ genbank_wrap(str) }\n"
+ end
+ a.join('')
+ end
+
# formats sequence lines as GenBank
def seq_format_genbank(str)
***************
*** 113,122 ****
end
# Erb template of GenBank format for Bio::Sequence
erb_template <<'__END_OF_TEMPLATE__'
! LOCUS <%= sprintf("%-16s", entry_id) %> <%= sprintf("%11d", length) %> bp <%= sprintf("%3s", '') %><%= sprintf("%-6s", molecule_type) %> <%= sprintf("%-8s", topology) %><%= sprintf("%4s", division) %> <%= sprintf("%-11s", date_modified) %>
DEFINITION <%= genbank_wrap_dot(definition.to_s) %>
ACCESSION <%= genbank_wrap(([ primary_accession ] + (secondary_accessions or [])).join(" ")) %>
! VERSION <%= primary_accession %>.<%= sequence_version %><% unless true or gi_number.to_s.empty? %>GI:<%= gi_number %><% end %>
KEYWORDS <%= genbank_wrap_dot((keywords or []).join('; ')) %>
SOURCE <%= genbank_wrap(species) %>
--- 124,168 ----
end
+ # formats date
+ def date_format_genbank
+ date_modified || date_created || null_date
+ end
+
+ # moleculue type
+ def mol_type_genbank
+ if /(DNA|(t|r|m|u|sn|sno)?RNA)/i =~ molecule_type.to_s then
+ $1.sub(/[DR]NA/) { |x| x.upcase }
+ else
+ 'NA'
+ end
+ end
+
+ # NCBI GI number
+ def ncbi_gi_number
+ ids = other_seqids
+ if ids and r = ids.find { |x| x.database == 'GI' } then
+ r.id
+ else
+ nil
+ end
+ end
+
+ # strandedness
+ def strandedness_genbank
+ return nil unless strandedness
+ case strandedness
+ when 'single'; 'ss-';
+ when 'double'; 'ds-';
+ when 'mixed'; 'ms-';
+ else; nil
+ end
+ end
+
# Erb template of GenBank format for Bio::Sequence
erb_template <<'__END_OF_TEMPLATE__'
! LOCUS <%= sprintf("%-16s", entry_id) %> <%= sprintf("%11d", length) %> bp <%= sprintf("%3s", strandedness_genbank) %><%= sprintf("%-6s", mol_type_genbank) %> <%= sprintf("%-8s", topology) %><%= sprintf("%4s", division) %> <%= date_format_genbank %>
DEFINITION <%= genbank_wrap_dot(definition.to_s) %>
ACCESSION <%= genbank_wrap(([ primary_accession ] + (secondary_accessions or [])).join(" ")) %>
! VERSION <%= primary_accession %>.<%= sequence_version %><% if gi = ncbi_gi_number then %> GI:<%= gi %><% end %>
KEYWORDS <%= genbank_wrap_dot((keywords or []).join('; ')) %>
SOURCE <%= genbank_wrap(species) %>
***************
*** 129,132 ****
--- 175,179 ----
%><%= reference_format_genbank(ref, n) %><%
end
+ %><%= comments_format_genbank(comments)
%>FEATURES Location/Qualifiers
<%= format_features_genbank(features || [])
From ngoto at dev.open-bio.org Tue Jun 17 16:04:38 2008
From: ngoto at dev.open-bio.org (Naohisa Goto)
Date: Tue, 17 Jun 2008 16:04:38 +0000
Subject: [BioRuby-cvs] bioruby/lib/bio/db/embl embl.rb,1.29.2.6,1.29.2.7
Message-ID: <200806171604.m5HG4cnr021274@dev.open-bio.org>
Update of /home/repository/bioruby/bioruby/lib/bio/db/embl
In directory dev.open-bio.org:/tmp/cvs-serv21250/lib/bio/db/embl
Modified Files:
Tag: BRANCH-biohackathon2008
embl.rb
Log Message:
* Bio::EMBL#cc is changed to strip the leading "CC " from each line.
* Bio::EMBL#to_biosequence is changed to improve support for sequence
output and data exchange.
* To get the parsed results of DT lines more easily, Bio::EMBL#date_modified,
date_created, release_modified, release_created, and entry_version
methods are added.
Index: embl.rb
===================================================================
RCS file: /home/repository/bioruby/bioruby/lib/bio/db/embl/embl.rb,v
retrieving revision 1.29.2.6
retrieving revision 1.29.2.7
diff -C2 -d -r1.29.2.6 -r1.29.2.7
*** embl.rb 28 May 2008 13:09:03 -0000 1.29.2.6
--- embl.rb 17 Jun 2008 16:04:36 -0000 1.29.2.7
***************
*** 32,39 ****
--- 32,42 ----
#
+ require 'date'
require 'bio/db'
require 'bio/db/embl/common'
require 'bio/compat/features'
require 'bio/compat/references'
+ require 'bio/sequence'
+ require 'bio/sequence/dblink'
module Bio
***************
*** 323,329 ****
# CC Line; comments of notes (>=0)
def cc
! get('CC')
end
!
##
--- 326,332 ----
# CC Line; comments of notes (>=0)
def cc
! get('CC').to_s.gsub(/^CC /, '')
end
! alias comment cc
##
***************
*** 376,379 ****
--- 379,436 ----
#++
+ # modified date. Returns Date object, String or nil.
+ def date_modified
+ parse_date(self.dt['updated'])
+ end
+
+ # created date. Returns Date object, String or nil.
+ def date_created
+ parse_date(self.dt['created'])
+ end
+
+ # release number when last updated
+ def release_modified
+ parse_release_version(self.dt['updated'])[0]
+ end
+
+ # release number when created
+ def release_created
+ parse_release_version(self.dt['created'])[0]
+ end
+
+ # entry version number numbered by EMBL
+ def entry_version
+ parse_release_version(self.dt['updated'])[1]
+ end
+
+ # parse date string. Returns Date object.
+ def parse_date(str)
+ begin
+ Date.parse(str)
+ rescue ArgumentError, TypeError, NoMethodError, NameError
+ str
+ end
+ end
+ private :parse_date
+
+ # extracts release and version numbers from DT line
+ def parse_release_version(str)
+ return [ nil, nil ] unless str
+ a = str.split(/[\(\,\)]/)
+ dstr = a.shift
+ rel = nil
+ ver = nil
+ a.each do |x|
+ case x
+ when /Rel\.\s*(.+)/
+ rel = $1.strip
+ when /Version\s*(.+)/
+ ver = $1.strip
+ end
+ end
+ [ rel, ver ]
+ end
+ private :parse_release_version
+
# converts the entry to Bio::Sequence object
# ---
***************
*** 382,385 ****
--- 439,444 ----
def to_biosequence
bio_seq = Bio::Sequence.new(self.seq)
+
+ bio_seq.id_namespace = 'EMBL'
bio_seq.entry_id = self.entry_id
bio_seq.primary_accession = self.accessions[0]
***************
*** 389,394 ****
bio_seq.definition = self.description
bio_seq.topology = self.topology
! bio_seq.date_created = self.dt['created']
! bio_seq.date_modified = self.dt['updated']
bio_seq.division = self.division
bio_seq.sequence_version = self.version
--- 448,456 ----
bio_seq.definition = self.description
bio_seq.topology = self.topology
! bio_seq.date_created = self.date_created
! bio_seq.date_modified = self.date_modified
! bio_seq.release_created = self.release_created
! bio_seq.release_modified = self.release_modified
! bio_seq.entry_version = self.entry_version
bio_seq.division = self.division
bio_seq.sequence_version = self.version
***************
*** 396,402 ****
bio_seq.species = self.fetch('OS')
bio_seq.classification = self.oc
bio_seq.references = self.references
bio_seq.features = self.ft
!
return bio_seq
end
--- 458,469 ----
bio_seq.species = self.fetch('OS')
bio_seq.classification = self.oc
+ # bio_seq.organelle = self.fetch('OG') # unsupported yet
bio_seq.references = self.references
bio_seq.features = self.ft
! bio_seq.comments = self.cc
! bio_seq.dblinks = get('DR').split(/\n/).collect { |x|
! Bio::Sequence::DBLink.parse_embl_DR_line(x)
! }
!
return bio_seq
end
From ngoto at dev.open-bio.org Tue Jun 17 16:06:06 2008
From: ngoto at dev.open-bio.org (Naohisa Goto)
Date: Tue, 17 Jun 2008 16:06:06 +0000
Subject: [BioRuby-cvs] bioruby/lib/bio/db/embl format_embl.rb, 1.1.2.5,
1.1.2.6
Message-ID: <200806171606.m5HG66iI021322@dev.open-bio.org>
Update of /home/repository/bioruby/bioruby/lib/bio/db/embl
In directory dev.open-bio.org:/tmp/cvs-serv21282/lib/bio/db/embl
Modified Files:
Tag: BRANCH-biohackathon2008
format_embl.rb
Log Message:
* Added support for CC lines (comments).
* Added support for DR lines (database cross references).
* Many improvements.
Index: format_embl.rb
===================================================================
RCS file: /home/repository/bioruby/bioruby/lib/bio/db/embl/Attic/format_embl.rb,v
retrieving revision 1.1.2.5
retrieving revision 1.1.2.6
diff -C2 -d -r1.1.2.5 -r1.1.2.6
*** format_embl.rb 28 May 2008 13:38:07 -0000 1.1.2.5
--- format_embl.rb 17 Jun 2008 16:06:04 -0000 1.1.2.6
***************
*** 2,6 ****
# = bio/db/embl/format_embl.rb - EMBL format generater
#
! # Copyright:: Copyright (C) 2008 Jan Aerts
# License:: The Ruby License
#
--- 2,8 ----
# = bio/db/embl/format_embl.rb - EMBL format generater
#
! # Copyright:: Copyright (C) 2008
! # Jan Aerts ,
! # Naohisa Goto
# License:: The Ruby License
#
***************
*** 125,136 ****
end
# Erb template of EMBL format for Bio::Sequence
erb_template <<'__END_OF_TEMPLATE__'
! ID <%= entry_id %>; SV <%= sequence_version %>; <%= topology %>; <%= molecule_type %>; <%= data_class %>; <%= division %>; <%= seq.length %> BP.
XX
<%= embl_wrap('AC ', accessions.reject{|a| a.nil?}.join('; ') + ';') %>
XX
! DT <%= date_created %>
! DT <%= date_modified %>
XX
<%= embl_wrap('DE ', definition) %>
--- 127,166 ----
end
+ # moleculue type
+ def mol_type_embl
+ if mt = molecule_type then
+ mt
+ elsif f = (features or []).find { |f| f.feature == 'source' } and
+ q = f.qualifiers.find { |q| q.qualifier == 'mol_type' } then
+ q.value
+ else
+ 'NA'
+ end
+ end
+
+ # CC line. Comments.
+ def comments_format_embl(cmnts)
+ return '' if !cmnts or cmnts.empty?
+ cmnts = [ cmnts ] unless cmnts.kind_of?(Array)
+ a = []
+ cmnts.each do |str|
+ a.push embl_wrap('CC ', str)
+ end
+ unless a.empty? then
+ a.push "XX "
+ a.push '' # dummy to put "\n" at the end of the string
+ end
+ a.join("\n")
+ end
+
+
# Erb template of EMBL format for Bio::Sequence
erb_template <<'__END_OF_TEMPLATE__'
! ID <%= primary_accession || entry_id %>; SV <%= sequence_version %>; <%= topology %>; <%= mol_type_embl %>; <%= data_class %>; <%= division %>; <%= seq.length %> BP.
XX
<%= embl_wrap('AC ', accessions.reject{|a| a.nil?}.join('; ') + ';') %>
XX
! DT <%= format_date(date_created || null_date) %> (Rel. <%= release_created || 0 %>, Created)
! DT <%= format_date(date_modified || null_date) %> (Rel. <%= release_modified || 0 %>, Last updated, Version <%= entry_version || 0 %>)
XX
<%= embl_wrap('DE ', definition) %>
***************
*** 142,146 ****
XX
<% hash = {}; (references || []).each do |ref| %><%= reference_format_embl(ref, hash) %>
! <% end %>FH Key Location/Qualifiers
FH
<%= format_features_embl(features || []) %>XX
--- 172,181 ----
XX
<% hash = {}; (references || []).each do |ref| %><%= reference_format_embl(ref, hash) %>
! <% end %><% (dblinks || []).each do |r|
! %>DR <%= r.database %>; <%= r.id %><% unless r.secondary_ids.empty? %>; <%= r.secondary_ids[0] %><% end %>.
! <% end %><% if dblinks and !dblinks.empty? then
! %>XX
! <% end %><%= comments_format_embl(comments)
! %>FH Key Location/Qualifiers
FH
<%= format_features_embl(features || []) %>XX
From ngoto at dev.open-bio.org Tue Jun 17 16:09:55 2008
From: ngoto at dev.open-bio.org (Naohisa Goto)
Date: Tue, 17 Jun 2008 16:09:55 +0000
Subject: [BioRuby-cvs] bioruby/test/unit/bio/db/embl test_embl_to_bioseq.rb,
1.1.2.1, 1.1.2.2
Message-ID: <200806171609.m5HG9tFR021392@dev.open-bio.org>
Update of /home/repository/bioruby/bioruby/test/unit/bio/db/embl
In directory dev.open-bio.org:/tmp/cvs-serv21372/test/unit/bio/db/embl
Modified Files:
Tag: BRANCH-biohackathon2008
test_embl_to_bioseq.rb
Log Message:
Unit tests related to Bio::Sequence#date_created and date_modified are
changed because these methods now store Date (or Time or DateTime)
objects instead of String objects.
Index: test_embl_to_bioseq.rb
===================================================================
RCS file: /home/repository/bioruby/bioruby/test/unit/bio/db/embl/Attic/test_embl_to_bioseq.rb,v
retrieving revision 1.1.2.1
retrieving revision 1.1.2.2
diff -C2 -d -r1.1.2.1 -r1.1.2.2
*** test_embl_to_bioseq.rb 20 Feb 2008 09:56:22 -0000 1.1.2.1
--- test_embl_to_bioseq.rb 17 Jun 2008 16:09:53 -0000 1.1.2.2
***************
*** 53,59 ****
end
! def test_dates
! assert_equal('25-OCT-2002 (Rel. 73, Created)', @bio_seq.date_created)
! assert_equal('14-NOV-2006 (Rel. 89, Last updated, Version 3)', @bio_seq.date_modified)
end
--- 53,76 ----
end
! def test_date_created
! # '25-OCT-2002 (Rel. 73, Created)'
! assert_equal(Date.parse('25-OCT-2002'), @bio_seq.date_created)
! end
!
! def test_date_modified
! # '14-NOV-2006 (Rel. 89, Last updated, Version 3)'
! assert_equal(Date.parse('14-NOV-2006'), @bio_seq.date_modified)
! end
!
! def test_release_created
! assert_equal('73', @bio_seq.release_created)
! end
!
! def test_release_modified
! assert_equal('89', @bio_seq.release_modified)
! end
!
! def test_entry_version
! assert_equal('3', @bio_seq.entry_version)
end
***************
*** 129,135 ****
end
! def test_dates
! assert_equal('25-OCT-2002 (Rel. 73, Created)', @bio_seq_2.date_created)
! assert_equal('14-NOV-2006 (Rel. 89, Last updated, Version 3)', @bio_seq_2.date_modified)
end
--- 146,169 ----
end
! def test_date_created
! # '25-OCT-2002 (Rel. 73, Created)'
! assert_equal(Date.parse('25-OCT-2002'), @bio_seq_2.date_created)
! end
!
! def test_date_modified
! # '14-NOV-2006 (Rel. 89, Last updated, Version 3)'
! assert_equal(Date.parse('14-NOV-2006'), @bio_seq_2.date_modified)
! end
!
! def test_release_created
! assert_equal('73', @bio_seq_2.release_created)
! end
!
! def test_release_modified
! assert_equal('89', @bio_seq_2.release_modified)
! end
!
! def test_entry_version
! assert_equal('3', @bio_seq_2.entry_version)
end
From ngoto at dev.open-bio.org Thu Jun 19 12:45:18 2008
From: ngoto at dev.open-bio.org (Naohisa Goto)
Date: Thu, 19 Jun 2008 12:45:18 +0000
Subject: [BioRuby-cvs] bioruby/lib/bio/db/embl format_embl.rb, 1.1.2.6,
1.1.2.7
Message-ID: <200806191245.m5JCjIps000652@dev.open-bio.org>
Update of /home/repository/bioruby/bioruby/lib/bio/db/embl
In directory dev.open-bio.org:/tmp/cvs-serv596/lib/bio/db/embl
Modified Files:
Tag: BRANCH-biohackathon2008
format_embl.rb
Log Message:
avoid error when keywords or classification is nil
Index: format_embl.rb
===================================================================
RCS file: /home/repository/bioruby/bioruby/lib/bio/db/embl/Attic/format_embl.rb,v
retrieving revision 1.1.2.6
retrieving revision 1.1.2.7
diff -C2 -d -r1.1.2.6 -r1.1.2.7
*** format_embl.rb 17 Jun 2008 16:06:04 -0000 1.1.2.6
--- format_embl.rb 19 Jun 2008 12:45:15 -0000 1.1.2.7
***************
*** 166,173 ****
<%= embl_wrap('DE ', definition) %>
XX
! <%= embl_wrap('KW ', keywords.join('; ') + '.') %>
XX
OS <%= species %>
! <%= embl_wrap('OC ', classification.join('; ') + '.') %>
XX
<% hash = {}; (references || []).each do |ref| %><%= reference_format_embl(ref, hash) %>
--- 166,173 ----
<%= embl_wrap('DE ', definition) %>
XX
! <%= embl_wrap('KW ', (keywords || []).join('; ') + '.') %>
XX
OS <%= species %>
! <%= embl_wrap('OC ', (classification || []).join('; ') + '.') %>
XX
<% hash = {}; (references || []).each do |ref| %><%= reference_format_embl(ref, hash) %>
From ngoto at dev.open-bio.org Fri Jun 20 13:22:34 2008
From: ngoto at dev.open-bio.org (Naohisa Goto)
Date: Fri, 20 Jun 2008 13:22:34 +0000
Subject: [BioRuby-cvs] bioruby/lib/bio/db fasta.rb,1.28,1.28.2.1
Message-ID: <200806201322.m5KDMYOR021703@dev.open-bio.org>
Update of /home/repository/bioruby/bioruby/lib/bio/db
In directory dev.open-bio.org:/tmp/cvs-serv21681
Modified Files:
Tag: BRANCH-biohackathon2008
fasta.rb
Log Message:
Split Bio::FastaDefline class into lib/bio/db/fasta/defline.rb
Index: fasta.rb
===================================================================
RCS file: /home/repository/bioruby/bioruby/lib/bio/db/fasta.rb,v
retrieving revision 1.28
retrieving revision 1.28.2.1
diff -C2 -d -r1.28 -r1.28.2.1
*** fasta.rb 5 Apr 2007 23:35:40 -0000 1.28
--- fasta.rb 20 Jun 2008 13:22:31 -0000 1.28.2.1
***************
*** 15,57 ****
# == Examples
#
! # rub = Bio::FastaDefline.new('>gi|671595|emb|CAA85678.1| rubisco large subunit [Perovskia abrotanoides]')
! # rub.entry_id ==> 'gi|671595'
! # rub.get('emb') ==> 'CAA85678.1'
! # rub.emb ==> 'CAA85678.1'
! # rub.gi ==> '671595'
! # rub.accession ==> 'CAA85678'
! # rub.accessions ==> [ 'CAA85678' ]
! # rub.acc_version ==> 'CAA85678.1'
! # rub.locus ==> nil
! # rub.list_ids ==> [["gi", "671595"],
! # ["emb", "CAA85678.1", nil],
! # ["Perovskia abrotanoides"]]
! #
! # ckr = Bio::FastaDefline.new(">gi|2495000|sp|Q63931|CCKR_CAVPO CHOLECYSTOKININ TYPE A RECEPTOR (CCK-A RECEPTOR) (CCK-AR)\001gi|2147182|pir||I51898 cholecystokinin A receptor - guinea pig\001gi|544724|gb|AAB29504.1| cholecystokinin A receptor; CCK-A receptor [Cavia]")
! # ckr.entry_id ==> "gi|2495000"
! # ckr.sp ==> "CCKR_CAVPO"
! # ckr.pir ==> "I51898"
! # ckr.gb ==> "AAB29504.1"
! # ckr.gi ==> "2495000"
! # ckr.accession ==> "AAB29504"
! # ckr.accessions ==> ["Q63931", "AAB29504"]
! # ckr.acc_version ==> "AAB29504.1"
! # ckr.locus ==> nil
! # ckr.description ==>
! # "CHOLECYSTOKININ TYPE A RECEPTOR (CCK-A RECEPTOR) (CCK-AR)"
! # ckr.descriptions ==>
! # ["CHOLECYSTOKININ TYPE A RECEPTOR (CCK-A RECEPTOR) (CCK-AR)",
! # "cholecystokinin A receptor - guinea pig",
! # "cholecystokinin A receptor; CCK-A receptor [Cavia]"]
! # ckr.words ==>
! # ["cavia", "cck-a", "cck-ar", "cholecystokinin", "guinea", "pig",
! # "receptor", "type"]
! # ckr.id_strings ==>
! # ["2495000", "Q63931", "CCKR_CAVPO", "2147182", "I51898",
! # "544724", "AAB29504.1", "Cavia"]
! # ckr.list_ids ==>
! # [["gi", "2495000"], ["sp", "Q63931", "CCKR_CAVPO"],
! # ["gi", "2147182"], ["pir", nil, "I51898"], ["gi", "544724"],
! # ["gb", "AAB29504.1", nil], ["Cavia"]]
#
# == References
--- 15,19 ----
# == Examples
#
! # See documents of Bio::FastaFormat class.
#
# == References
***************
*** 66,69 ****
--- 28,32 ----
require 'bio/db'
require 'bio/sequence'
+ require 'bio/db/fasta/defline'
module Bio
***************
*** 363,825 ****
end #class FastaNumericFormat
-
- # Parsing FASTA Defline, and extract IDs and other informations.
- # IDs are NSIDs (NCBI standard FASTA sequence identifiers)
- # or ":"-separated IDs.
- #
- # specs are described in:
- # ftp://ftp.ncbi.nih.gov/blast/documents/README.formatdb
- # http://blast.wustl.edu/doc/FAQ-Indexing.html#Identifiers
- #
- # === Examples
- #
- # rub = Bio::FastaDefline.new('>gi|671595|emb|CAA85678.1| rubisco large subunit [Perovskia abrotanoides]')
- # rub.entry_id ==> 'gi|671595'
- # rub.get('emb') ==> 'CAA85678.1'
- # rub.emb ==> 'CAA85678.1'
- # rub.gi ==> '671595'
- # rub.accession ==> 'CAA85678'
- # rub.accessions ==> [ 'CAA85678' ]
- # rub.acc_version ==> 'CAA85678.1'
- # rub.locus ==> nil
- # rub.list_ids ==> [["gi", "671595"],
- # ["emb", "CAA85678.1", nil],
- # ["Perovskia abrotanoides"]]
- #
- # ckr = Bio::FastaDefline.new(">gi|2495000|sp|Q63931|CCKR_CAVPO CHOLECYSTOKININ TYPE A RECEPTOR (CCK-A RECEPTOR) (CCK-AR)\001gi|2147182|pir||I51898 cholecystokinin A receptor - guinea pig\001gi|544724|gb|AAB29504.1| cholecystokinin A receptor; CCK-A receptor [Cavia]")
- # ckr.entry_id ==> "gi|2495000"
- # ckr.sp ==> "CCKR_CAVPO"
- # ckr.pir ==> "I51898"
- # ckr.gb ==> "AAB29504.1"
- # ckr.gi ==> "2495000"
- # ckr.accession ==> "AAB29504"
- # ckr.accessions ==> ["Q63931", "AAB29504"]
- # ckr.acc_version ==> "AAB29504.1"
- # ckr.locus ==> nil
- # ckr.description ==>
- # "CHOLECYSTOKININ TYPE A RECEPTOR (CCK-A RECEPTOR) (CCK-AR)"
- # ckr.descriptions ==>
- # ["CHOLECYSTOKININ TYPE A RECEPTOR (CCK-A RECEPTOR) (CCK-AR)",
- # "cholecystokinin A receptor - guinea pig",
- # "cholecystokinin A receptor; CCK-A receptor [Cavia]"]
- # ckr.words ==>
- # ["cavia", "cck-a", "cck-ar", "cholecystokinin", "guinea", "pig",
- # "receptor", "type"]
- # ckr.id_strings ==>
- # ["2495000", "Q63931", "CCKR_CAVPO", "2147182", "I51898",
- # "544724", "AAB29504.1", "Cavia"]
- # ckr.list_ids ==>
- # [["gi", "2495000"], ["sp", "Q63931", "CCKR_CAVPO"],
- # ["gi", "2147182"], ["pir", nil, "I51898"], ["gi", "544724"],
- # ["gb", "AAB29504.1", nil], ["Cavia"]]
- #
- # === Refereneces
- #
- # * Fasta format description (NCBI)
- # http://www.ncbi.nlm.nih.gov/BLAST/fasta.shtml
- #
- # * Frequently Asked Questions: Indexing of Sequence Identifiers (by Warren R. Gish.)
- # http://blast.wustl.edu/doc/FAQ-Indexing.html#Identifiers
- #
- # * README.formatdb
- # ftp://ftp.ncbi.nih.gov/blast/documents/README.formatdb
- #
- class FastaDefline
-
- NSIDs = {
- # NCBI and WU-BLAST
- 'gi' => [ 'gi' ], # NCBI GI
- 'gb' => [ 'acc_version', 'locus' ], # GenBank
- 'emb' => [ 'acc_version', 'locus' ], # EMBL
- 'dbj' => [ 'acc_version', 'locus' ], # DDBJ
- 'sp' => [ 'accession', 'entry_id' ], # SWISS-PROT
- 'pdb' => [ 'entry_id', 'chain' ], # PDB
- 'bbs' => [ 'number' ], # GenInfo Backbone Id
- 'gnl' => [ 'database' , 'entry_id' ], # General database identifier
- 'ref' => [ 'acc_version' , 'locus' ], # NCBI Reference Sequence
- 'lcl' => [ 'entry_id' ], # Local Sequence identifier
-
- # WU-BLAST and NCBI
- 'pir' => [ 'accession', 'entry_id' ], # PIR
- 'prf' => [ 'accession', 'entry_id' ], # Protein Research Foundation
- 'pat' => [ 'country', 'number', 'serial' ], # Patents
-
- # WU-BLAST only
- 'bbm' => [ 'number' ], # NCBI GenInfo Backbone database identifier
- 'gim' => [ 'number' ], # NCBI GenInfo Import identifier
- 'gp' => [ 'acc_version', 'locus' ], # GenPept
- 'oth' => [ 'accession', 'name', 'release' ], # Other (user-definable) identifier
- 'tpd' => [ 'accession', 'name' ], # Third party annotation, DDBJ
- 'tpe' => [ 'accession', 'name' ], # Third party annotation, EMBL
- 'tpg' => [ 'accession', 'name' ], # Third party annotation, GenBank
-
- # Original
- 'ri' => [ 'entry_id', 'rearray_id', 'len' ], # RIKEN FANTOM DB
- }
-
- # Shows array that contains IDs (or ID-like strings).
- # Returns an array of arrays of strings.
- attr_reader :list_ids
-
- # Shows a possibly unique identifier.
- # Returns a string.
- attr_reader :entry_id
-
- # Parses given string.
- def initialize(str)
- @deflines = []
- @info = {}
- @list_ids = []
-
- @entry_id = nil
-
- lines = str.split("\x01")
- lines.each do |line|
- add_defline(line)
- end
- end #def initialize
-
- # Parses given string and adds parsed data.
- def add_defline(str)
- case str
- when /^\>?\s*((?:[^\|\s]*\|)+[^\s]+)\s*(.*)$/
- # NSIDs
- # examples:
- # >gi|9910844|sp|Q9UWG2|RL3_METVA 50S ribosomal protein L3P
- #
- # note: regexp (:?) means grouping without backreferences
- i = $1
- d = $2
- tks = i.split('|')
- tks << '' if i[-1,1] == '|'
- a = parse_NSIDs(tks)
- i = a[0].join('|')
- a.unshift('|')
- d = tks.join('|') + ' ' + d unless tks.empty?
- a << d
- this_line = a
- match_EC(d)
- parse_square_brackets(d).each do |x|
- if !match_EC(x, false) and x =~ /\A[A-Z]/ then
- di = [ x ]
- @list_ids << di
- @info['organism'] = x unless @info['organism']
- end
- end
-
- when /^\>?\s*([a-zA-Z0-9]+\:[^\s]+)\s*(.*)$/
- # examples:
- # >sce:YBR160W CDC28, SRM5; cyclin-dependent protein kinase catalytic subunit [EC:2.7.1.-] [SP:CC28_YEAST]
- # >emb:CACDC28 [X80034] C.albicans CDC28 gene
- i = $1
- d = $2
- a = parse_ColonSepID(i)
- i = a.join(':')
- this_line = [ ':', a , d ]
- match_EC(d)
- parse_square_brackets(d).each do |x|
- if !match_EC(x, false) and x =~ /:/ then
- parse_ColonSepID(x)
- elsif x =~ /\A\s*([A-Z][A-Z0-9_\.]+)\s*\z/ then
- @list_ids << [ $1 ]
- end
- end
-
- when /^\>?\s*(\S+)(?:\s+(.+))?$/
- # examples:
- # >ABC12345 this is test
- i = $1
- d = $2.to_s
- @list_ids << [ i.chomp('.') ]
- this_line = [ '', [ i ], d ]
- match_EC(d)
- else
- i = str
- d = ''
- match_EC(i)
- this_line = [ '', [ i ], d ]
- end
-
- @deflines << this_line
- @entry_id = i unless @entry_id
- end
-
- def match_EC(str, write_flag = true)
- di = nil
- str.scan(/EC\:((:?[\-\d]+\.){3}(:?[\-\d]+))/i) do |x|
- di = [ 'EC', $1 ]
- if write_flag then
- @info['ec'] = di[1] if (!@info['ec'] or @info['ec'].to_s =~ /\-/)
- @list_ids << di
- end
- end
- di
- end
- private :match_EC
-
- def parse_square_brackets(str)
- r = []
- str.scan(/\[([^\]]*)\]/) do |x|
- r << x[0]
- end
- r
- end
- private :parse_square_brackets
-
- def parse_ColonSepID(str)
- di = str.split(':', 2)
- di << nil if di.size <= 1
- @list_ids << di
- di
- end
- private :parse_ColonSepID
-
- def parse_NSIDs(ary)
- # this method destroys ary
- data = []
- while token = ary.shift
- if labels = self.class::NSIDs[token] then
- di = [ token ]
- idtype = token
- labels.each do |x|
- token = ary.shift
- break unless token
- if self.class::NSIDs[token] then
- ary.unshift(token)
- break #each
- end
- if token.length > 0 then
- di << token
- else
- di << nil
- end
- end
- data << di
- else
- if token.length > 0 then
- # UCID (uncontrolled identifiers)
- di = [ token ]
- data << di
- @info['ucid'] = token unless @info['ucid']
- end
- break #while
- end
- end #while
- @list_ids.concat data
- data
- end #def parse_NSIDs
- private :parse_NSIDs
-
-
- # Shows original string.
- # Note that the result of this method may be different from
- # original string which is given in FastaDefline.new method.
- def to_s
- @deflines.collect { |a|
- s = a[0]
- (a[1..-2].collect { |x| x.join(s) }.join(s) + ' ' + a[-1]).strip
- }.join("\x01")
- end
-
- # Shows description.
- def description
- @deflines[0].to_a[-1]
- end
-
- # Returns descriptions.
- def descriptions
- @deflines.collect do |a|
- a[-1]
- end
- end
-
- # Shows ID-like strings.
- # Returns an array of strings.
- def id_strings
- r = []
- @list_ids.each do |a|
- if a.size >= 2 then
- r.concat a[1..-1].find_all { |x| x }
- else
- if a[0].to_s.size > 0 and a[0] =~ /\A[A-Za-z0-9\.\-\_]+\z/
- r << a[0]
- end
- end
- end
- r.concat( words(true, []).find_all do |x|
- x =~ /\A[A-Z][A-Za-z0-9\_]*[0-9]+[A-Za-z0-9\_]+\z/ or
- x =~ /\A[A-Z][A-Z0-9]*\_[A-Z0-9\_]+\z/
- end)
- r
- end
-
- KillWords = [
- 'an', 'the', 'this', 'that',
- 'is', 'are', 'were', 'was', 'be', 'can', 'may', 'might',
- 'as', 'at', 'by', 'for', 'in', 'of', 'on', 'to', 'with',
- 'from', 'and', 'or', 'not',
- 'dna', 'rna', 'mrna', 'cdna', 'orf',
- 'aa', 'nt', 'pct', 'id', 'ec', 'sp', 'subsp',
- 'similar', 'involved', 'identical', 'identity',
- 'cds', 'clone', 'library', 'contig', 'contigs',
- 'homolog', 'homologue', 'homologs', 'homologous',
- 'protein', 'proteins', 'gene', 'genes',
- 'product', 'products', 'sequence', 'sequences',
- 'strain', 'strains', 'region', 'regions',
- ]
- KillWordsHash = {}
- KillWords.each { |x| KillWordsHash[x] = true }
-
- KillRegexpArray = [
- /\A\d{1,3}\%?\z/,
- /\A[A-Z][A-Za-z0-9\_]*[0-9]+[A-Za-z0-9\_]+\z/,
- /\A[A-Z][A-Z0-9]*\_[A-Z0-9\_]+\z/
- ]
-
- # Shows words used in the defline. Returns an Array.
- def words(case_sensitive = nil, kill_regexp = self.class::KillRegexpArray,
- kwhash = self.class::KillWordsHash)
- a = descriptions.join(' ').split(/[\.\,\;\:\(\)\[\]\{\}\<\>\"\'\`\~\/\|\?\!\&\@\#\s\x00-\x1f\x7f]+/)
- a.collect! do |x|
- x.sub!(/\A[\$\*\-\+]+/, '')
- x.sub!(/[\$\*\-\=]+\z/, '')
- if x.size <= 1 then
- nil
- elsif kwhash[x.downcase] then
- nil
- else
- if kill_regexp.find { |expr| expr =~ x } then
- nil
- else
- x
- end
- end
- end
- a.compact!
- a.collect! { |x| x.downcase } unless case_sensitive
- a.sort!
- a.uniq!
- a
- end
-
- # Returns identifires by a database name.
- def get(dbname)
- db = dbname.to_s
- r = nil
- unless r = @info[db] then
- di = @list_ids.find { |x| x[0] == db.to_s }
- if di and di.size <= 2 then
- r = di[-1]
- elsif di then
- labels = self.class::NSIDs[db]
- [ 'acc_version', 'entry_id',
- 'locus', 'accession', 'number'].each do |x|
- if i = labels.index(x) then
- r = di[i+1]
- break if r
- end
- end
- r = di[1..-1].find { |x| x } unless r
- end
- @info[db] = r if r
- end
- r
- end
-
- # Returns an identifier by given type.
- def get_by_type(type_str)
- @list_ids.each do |x|
- if labels = self.class::NSIDs[x[0]] then
- if i = labels.index(type_str) then
- return x[i+1]
- end
- end
- end
- nil
- end
-
- # Returns identifiers by given type.
- def get_all_by_type(*type_strarg)
- d = []
- @list_ids.each do |x|
- if labels = self.class::NSIDs[x[0]] then
- type_strarg.each do |y|
- if i = labels.index(y) then
- d << x[i+1] if x[i+1]
- end
- end
- end
- end
- d
- end
-
- # Shows locus.
- # If the entry has more than two of such IDs,
- # only the first ID are shown.
- # Returns a string or nil.
- def locus
- unless defined?(@locus)
- @locus = get_by_type('locus')
- end
- @locus
- end
-
- # Shows GI.
- # If the entry has more than two of such IDs,
- # only the first ID are shown.
- # Returns a string or nil.
- def gi
- unless defined?(@gi) then
- @gi = get_by_type('gi')
- end
- @gi
- end
-
- # Shows accession with version number.
- # If the entry has more than two of such IDs,
- # only the first ID are shown.
- # Returns a string or nil.
- def acc_version
- unless defined?(@acc_version) then
- @acc_version = get_by_type('acc_version')
- end
- @acc_version
- end
-
- # Shows accession numbers.
- # Returns an array of strings.
- def accessions
- unless defined?(@accessions) then
- @accessions = get_all_by_type('accession', 'acc_version')
- @accessions.collect! { |x| x.sub(/\..*\z/, '') }
- end
- @accessions
- end
-
- # Shows an accession number.
- def accession
- unless defined?(@accession) then
- if acc_version then
- @accession = acc_version.split('.')[0]
- else
- @accession = accessions[0]
- end
- end
- @accession
- end
-
- def method_missing(name, *args)
- # raise ArgumentError,
- # "wrong # of arguments(#{args.size} for 1)" if args.size >= 2
- r = get(name, *args)
- if !r and !(self.class::NSIDs[name.to_s]) then
- raise "NameError: undefined method `#{name.inspect}'"
- end
- r
- end
-
-
- end #class FastaDefline
-
end #module Bio
--- 326,329 ----
From ngoto at dev.open-bio.org Fri Jun 20 13:22:34 2008
From: ngoto at dev.open-bio.org (Naohisa Goto)
Date: Fri, 20 Jun 2008 13:22:34 +0000
Subject: [BioRuby-cvs] bioruby/lib/bio/db/fasta defline.rb,NONE,1.1.2.1
Message-ID: <200806201322.m5KDMYlh021706@dev.open-bio.org>
Update of /home/repository/bioruby/bioruby/lib/bio/db/fasta
In directory dev.open-bio.org:/tmp/cvs-serv21681/fasta
Added Files:
Tag: BRANCH-biohackathon2008
defline.rb
Log Message:
Split Bio::FastaDefline class into lib/bio/db/fasta/defline.rb
--- NEW FILE: defline.rb ---
#
# = bio/db/fasta/defline.rb - FASTA defline parser class
#
# Copyright:: Copyright (C) 2001, 2002
# GOTO Naohisa ,
# Toshiaki Katayama
# License:: The Ruby License
#
# $Id: defline.rb,v 1.1.2.1 2008/06/20 13:22:32 ngoto Exp $
#
# == Description
#
# Bio::FastaDefline is a parser class for definition line (defline)
# of the FASTA format.
#
# == Examples
#
# rub = Bio::FastaDefline.new('>gi|671595|emb|CAA85678.1| rubisco large subunit [Perovskia abrotanoides]')
# rub.entry_id ==> 'gi|671595'
# rub.get('emb') ==> 'CAA85678.1'
# rub.emb ==> 'CAA85678.1'
# rub.gi ==> '671595'
# rub.accession ==> 'CAA85678'
# rub.accessions ==> [ 'CAA85678' ]
# rub.acc_version ==> 'CAA85678.1'
# rub.locus ==> nil
# rub.list_ids ==> [["gi", "671595"],
# ["emb", "CAA85678.1", nil],
# ["Perovskia abrotanoides"]]
#
# ckr = Bio::FastaDefline.new(">gi|2495000|sp|Q63931|CCKR_CAVPO CHOLECYSTOKININ TYPE A RECEPTOR (CCK-A RECEPTOR) (CCK-AR)\001gi|2147182|pir||I51898 cholecystokinin A receptor - guinea pig\001gi|544724|gb|AAB29504.1| cholecystokinin A receptor; CCK-A receptor [Cavia]")
# ckr.entry_id ==> "gi|2495000"
# ckr.sp ==> "CCKR_CAVPO"
# ckr.pir ==> "I51898"
# ckr.gb ==> "AAB29504.1"
# ckr.gi ==> "2495000"
# ckr.accession ==> "AAB29504"
# ckr.accessions ==> ["Q63931", "AAB29504"]
# ckr.acc_version ==> "AAB29504.1"
# ckr.locus ==> nil
# ckr.description ==>
# "CHOLECYSTOKININ TYPE A RECEPTOR (CCK-A RECEPTOR) (CCK-AR)"
# ckr.descriptions ==>
# ["CHOLECYSTOKININ TYPE A RECEPTOR (CCK-A RECEPTOR) (CCK-AR)",
# "cholecystokinin A receptor - guinea pig",
# "cholecystokinin A receptor; CCK-A receptor [Cavia]"]
# ckr.words ==>
# ["cavia", "cck-a", "cck-ar", "cholecystokinin", "guinea", "pig",
# "receptor", "type"]
# ckr.id_strings ==>
# ["2495000", "Q63931", "CCKR_CAVPO", "2147182", "I51898",
# "544724", "AAB29504.1", "Cavia"]
# ckr.list_ids ==>
# [["gi", "2495000"], ["sp", "Q63931", "CCKR_CAVPO"],
# ["gi", "2147182"], ["pir", nil, "I51898"], ["gi", "544724"],
# ["gb", "AAB29504.1", nil], ["Cavia"]]
#
# == References
#
# * FASTA format (WikiPedia)
# http://en.wikipedia.org/wiki/FASTA_format
#
# * Fasta format description (NCBI)
# http://www.ncbi.nlm.nih.gov/BLAST/fasta.shtml
#
module Bio
#--
# split from fasta.rb revision 1.28
#++
# Parses a FASTA defline and extracts IDs and other information.
# IDs are NSIDs (NCBI standard FASTA sequence identifiers)
# or ":"-separated IDs.
#
# specs are described in:
# ftp://ftp.ncbi.nih.gov/blast/documents/README.formatdb
# http://blast.wustl.edu/doc/FAQ-Indexing.html#Identifiers
#
# === Examples
#
# rub = Bio::FastaDefline.new('>gi|671595|emb|CAA85678.1| rubisco large subunit [Perovskia abrotanoides]')
# rub.entry_id ==> 'gi|671595'
# rub.get('emb') ==> 'CAA85678.1'
# rub.emb ==> 'CAA85678.1'
# rub.gi ==> '671595'
# rub.accession ==> 'CAA85678'
# rub.accessions ==> [ 'CAA85678' ]
# rub.acc_version ==> 'CAA85678.1'
# rub.locus ==> nil
# rub.list_ids ==> [["gi", "671595"],
# ["emb", "CAA85678.1", nil],
# ["Perovskia abrotanoides"]]
#
# ckr = Bio::FastaDefline.new(">gi|2495000|sp|Q63931|CCKR_CAVPO CHOLECYSTOKININ TYPE A RECEPTOR (CCK-A RECEPTOR) (CCK-AR)\001gi|2147182|pir||I51898 cholecystokinin A receptor - guinea pig\001gi|544724|gb|AAB29504.1| cholecystokinin A receptor; CCK-A receptor [Cavia]")
# ckr.entry_id ==> "gi|2495000"
# ckr.sp ==> "CCKR_CAVPO"
# ckr.pir ==> "I51898"
# ckr.gb ==> "AAB29504.1"
# ckr.gi ==> "2495000"
# ckr.accession ==> "AAB29504"
# ckr.accessions ==> ["Q63931", "AAB29504"]
# ckr.acc_version ==> "AAB29504.1"
# ckr.locus ==> nil
# ckr.description ==>
# "CHOLECYSTOKININ TYPE A RECEPTOR (CCK-A RECEPTOR) (CCK-AR)"
# ckr.descriptions ==>
# ["CHOLECYSTOKININ TYPE A RECEPTOR (CCK-A RECEPTOR) (CCK-AR)",
# "cholecystokinin A receptor - guinea pig",
# "cholecystokinin A receptor; CCK-A receptor [Cavia]"]
# ckr.words ==>
# ["cavia", "cck-a", "cck-ar", "cholecystokinin", "guinea", "pig",
# "receptor", "type"]
# ckr.id_strings ==>
# ["2495000", "Q63931", "CCKR_CAVPO", "2147182", "I51898",
# "544724", "AAB29504.1", "Cavia"]
# ckr.list_ids ==>
# [["gi", "2495000"], ["sp", "Q63931", "CCKR_CAVPO"],
# ["gi", "2147182"], ["pir", nil, "I51898"], ["gi", "544724"],
# ["gb", "AAB29504.1", nil], ["Cavia"]]
#
# === References
#
# * Fasta format description (NCBI)
# http://www.ncbi.nlm.nih.gov/BLAST/fasta.shtml
#
# * Frequently Asked Questions: Indexing of Sequence Identifiers (by Warren R. Gish.)
# http://blast.wustl.edu/doc/FAQ-Indexing.html#Identifiers
#
# * README.formatdb
# ftp://ftp.ncbi.nih.gov/blast/documents/README.formatdb
#
class FastaDefline

  # Known NSID (NCBI standard FASTA sequence identifier) database tags.
  # Each tag maps to the ordered list of field labels that follow the tag
  # in a "|"-separated identifier (e.g. "gb|acc_version|locus").
  NSIDs = {
    # NCBI and WU-BLAST
    'gi'  => [ 'gi' ],                        # NCBI GI
    'gb'  => [ 'acc_version', 'locus' ],      # GenBank
    'emb' => [ 'acc_version', 'locus' ],      # EMBL
    'dbj' => [ 'acc_version', 'locus' ],      # DDBJ
    'sp'  => [ 'accession', 'entry_id' ],     # SWISS-PROT
    'pdb' => [ 'entry_id', 'chain' ],         # PDB
    'bbs' => [ 'number' ],                    # GenInfo Backbone Id
    'gnl' => [ 'database' , 'entry_id' ],     # General database identifier
    'ref' => [ 'acc_version' , 'locus' ],     # NCBI Reference Sequence
    'lcl' => [ 'entry_id' ],                  # Local Sequence identifier
    # WU-BLAST and NCBI
    'pir' => [ 'accession', 'entry_id' ],     # PIR
    'prf' => [ 'accession', 'entry_id' ],     # Protein Research Foundation
    'pat' => [ 'country', 'number', 'serial' ], # Patents
    # WU-BLAST only
    'bbm' => [ 'number' ],                    # NCBI GenInfo Backbone database identifier
    'gim' => [ 'number' ],                    # NCBI GenInfo Import identifier
    'gp'  => [ 'acc_version', 'locus' ],      # GenPept
    'oth' => [ 'accession', 'name', 'release' ], # Other (user-definable) identifier
    'tpd' => [ 'accession', 'name' ],         # Third party annotation, DDBJ
    'tpe' => [ 'accession', 'name' ],         # Third party annotation, EMBL
    'tpg' => [ 'accession', 'name' ],         # Third party annotation, GenBank
    # Original
    'ri'  => [ 'entry_id', 'rearray_id', 'len' ], # RIKEN FANTOM DB
  }

  # Shows array that contains IDs (or ID-like strings).
  # Returns an array of arrays of strings.
  attr_reader :list_ids

  # Shows a possibly unique identifier.
  # Returns a string.
  attr_reader :entry_id

  # Parses given string.
  # Several deflines concatenated with Ctrl-A ("\x01") are split and
  # each of them is handed to add_defline in order.
  def initialize(str)
    @deflines = []
    @info     = {}
    @list_ids = []
    @entry_id = nil
    str.split("\x01").each { |line| add_defline(line) }
  end #def initialize

  # Parses given string and adds parsed data.
  #
  # Three ID styles are recognized, tried in this order:
  # "|"-separated NSIDs, "db:id" colon-separated IDs, and a bare first word.
  # The first ID found over all added deflines becomes entry_id.
  def add_defline(str)
    case str
    when /^\>?\s*((?:[^\|\s]*\|)+[^\s]+)\s*(.*)$/
      # NSIDs
      # examples:
      #   >gi|9910844|sp|Q9UWG2|RL3_METVA 50S ribosomal protein L3P
      #
      # note: regexp (?:) means grouping without backreferences
      i = $1
      d = $2
      tks = i.split('|')
      # String#split drops a trailing empty field; restore it so that
      # "emb|CAA85678.1|" yields an explicit empty (nil) locus.
      tks << '' if i[-1,1] == '|'
      a = parse_NSIDs(tks)
      # parse_NSIDs returns no IDs when the field starts with "|"
      # (e.g. ">|foo bar"); keep the raw ID text in that case instead of
      # calling join on nil.
      i = a[0].join('|') unless a.empty?
      a.unshift('|')
      # tokens that parse_NSIDs did not consume belong to the description
      d = tks.join('|') + ' ' + d unless tks.empty?
      a << d
      this_line = a
      match_EC(d)
      parse_square_brackets(d).each do |x|
        if !match_EC(x, false) and x =~ /\A[A-Z]/ then
          # capitalized bracketed text is treated as an organism name
          @list_ids << [ x ]
          @info['organism'] = x unless @info['organism']
        end
      end
    when /^\>?\s*([a-zA-Z0-9]+\:[^\s]+)\s*(.*)$/
      # examples:
      #   >sce:YBR160W CDC28, SRM5; cyclin-dependent protein kinase catalytic subunit [EC:2.7.1.-] [SP:CC28_YEAST]
      #   >emb:CACDC28 [X80034] C.albicans CDC28 gene
      i = $1
      d = $2
      a = parse_ColonSepID(i)
      i = a.join(':')
      this_line = [ ':', a , d ]
      match_EC(d)
      parse_square_brackets(d).each do |x|
        if !match_EC(x, false) and x =~ /:/ then
          parse_ColonSepID(x)
        elsif x =~ /\A\s*([A-Z][A-Z0-9_\.]+)\s*\z/ then
          @list_ids << [ $1 ]
        end
      end
    when /^\>?\s*(\S+)(?:\s+(.+))?$/
      # examples:
      #   >ABC12345 this is test
      i = $1
      d = $2.to_s
      @list_ids << [ i.chomp('.') ]
      this_line = [ '', [ i ], d ]
      match_EC(d)
    else
      # nothing ID-like (e.g. empty string): keep the raw text as the ID
      i = str
      d = ''
      match_EC(i)
      this_line = [ '', [ i ], d ]
    end
    @deflines << this_line
    @entry_id = i unless @entry_id
    # results memoized by locus, gi, acc_version, accessions and accession
    # may be outdated now that more IDs have been added
    clear_cached_ids
  end

  # Forgets the memoized results of the ID accessor methods so that
  # they are recomputed from the current @list_ids on next call.
  def clear_cached_ids
    [ :@locus, :@gi, :@acc_version, :@accessions, :@accession ].each do |iv|
      remove_instance_variable(iv) if instance_variable_defined?(iv)
    end
  end
  private :clear_cached_ids

  # Scans str for EC numbers written as "EC:n.n.n.n" (case-insensitive;
  # each "n" is made of digits and/or "-").
  # When write_flag is true, every hit is appended to @list_ids and
  # @info['ec'] is set unless it already holds an EC number without "-".
  # Returns [ 'EC', number ] for the last hit, or nil when nothing matched.
  def match_EC(str, write_flag = true)
    di = nil
    str.scan(/EC\:((?:[\-\d]+\.){3}(?:[\-\d]+))/i) do |x|
      di = [ 'EC', x[0] ]
      if write_flag then
        @info['ec'] = di[1] if (!@info['ec'] or @info['ec'].to_s =~ /\-/)
        @list_ids << di
      end
    end
    di
  end
  private :match_EC

  # Returns an array of the strings enclosed in "[...]" in str.
  def parse_square_brackets(str)
    str.scan(/\[([^\]]*)\]/).collect { |x| x[0] }
  end
  private :parse_square_brackets

  # Splits "db:id" at the first colon, records the pair in @list_ids
  # and returns it. The second element is nil when there is no colon.
  def parse_ColonSepID(str)
    di = str.split(':', 2)
    di << nil if di.size <= 1
    @list_ids << di
    di
  end
  private :parse_ColonSepID

  # Parses "|"-separated tokens into ID arrays of the form
  # [ tag, field, ... ], where empty fields become nil.
  # Parsing stops at the first token that is not a known tag; when that
  # token is not empty it is recorded as an UCID (uncontrolled identifier).
  # The IDs are appended to @list_ids and also returned.
  # Consumed tokens are removed from ary; the rest is left for the caller.
  def parse_NSIDs(ary)
    # this method destroys ary
    data = []
    while token = ary.shift
      if labels = self.class::NSIDs[token] then
        di = [ token ]
        labels.each do |x|
          token = ary.shift
          break unless token
          if self.class::NSIDs[token] then
            # next database tag found before all fields were filled
            ary.unshift(token)
            break #each
          end
          if token.length > 0 then
            di << token
          else
            di << nil
          end
        end
        data << di
      else
        if token.length > 0 then
          # UCID (uncontrolled identifiers)
          di = [ token ]
          data << di
          @info['ucid'] = token unless @info['ucid']
        end
        break #while
      end
    end #while
    @list_ids.concat data
    data
  end #def parse_NSIDs
  private :parse_NSIDs

  # Shows original string.
  # Note that the result of this method may be different from
  # original string which is given in FastaDefline.new method.
  def to_s
    @deflines.collect { |a|
      s = a[0]
      (a[1..-2].collect { |x| x.join(s) }.join(s) + ' ' + a[-1]).strip
    }.join("\x01")
  end

  # Shows description of the first defline (nil when there is none).
  def description
    @deflines[0].to_a[-1]
  end

  # Returns descriptions of all deflines as an array of strings.
  def descriptions
    @deflines.collect { |a| a[-1] }
  end

  # Shows ID-like strings.
  # Returns an array of strings.
  def id_strings
    r = []
    @list_ids.each do |a|
      if a.size >= 2 then
        r.concat a[1..-1].compact
      elsif a[0].to_s.size > 0 and a[0] =~ /\A[A-Za-z0-9\.\-\_]+\z/
        r << a[0]
      end
    end
    # words of the descriptions that look like identifiers
    r.concat( words(true, []).find_all do |x|
                x =~ /\A[A-Z][A-Za-z0-9\_]*[0-9]+[A-Za-z0-9\_]+\z/ or
                  x =~ /\A[A-Z][A-Z0-9]*\_[A-Z0-9\_]+\z/
              end)
    r
  end

  # Words which are ignored by the words method.
  KillWords = [
    'an', 'the', 'this', 'that',
    'is', 'are', 'were', 'was', 'be', 'can', 'may', 'might',
    'as', 'at', 'by', 'for', 'in', 'of', 'on', 'to', 'with',
    'from', 'and', 'or', 'not',
    'dna', 'rna', 'mrna', 'cdna', 'orf',
    'aa', 'nt', 'pct', 'id', 'ec', 'sp', 'subsp',
    'similar', 'involved', 'identical', 'identity',
    'cds', 'clone', 'library', 'contig', 'contigs',
    'homolog', 'homologue', 'homologs', 'homologous',
    'protein', 'proteins', 'gene', 'genes',
    'product', 'products', 'sequence', 'sequences',
    'strain', 'strains', 'region', 'regions',
  ]

  # KillWords as a Hash, used for lookup by the words method.
  KillWordsHash = {}
  KillWords.each { |x| KillWordsHash[x] = true }

  # Words matching one of these patterns are ignored by the words method
  # (percentages / small numbers and ID-like words).
  KillRegexpArray = [
    /\A\d{1,3}\%?\z/,
    /\A[A-Z][A-Za-z0-9\_]*[0-9]+[A-Za-z0-9\_]+\z/,
    /\A[A-Z][A-Z0-9]*\_[A-Z0-9\_]+\z/
  ]

  # Shows words used in the defline. Returns an Array.
  #
  # Words are taken from all descriptions, sorted and made unique.
  # Words of one character or less, words found in kwhash and words
  # matching any regexp in kill_regexp are dropped. Unless case_sensitive
  # is true, words are converted to lower case.
  def words(case_sensitive = nil, kill_regexp = self.class::KillRegexpArray,
            kwhash = self.class::KillWordsHash)
    a = descriptions.join(' ').split(/[\.\,\;\:\(\)\[\]\{\}\<\>\"\'\`\~\/\|\?\!\&\@\#\s\x00-\x1f\x7f]+/)
    a.collect! do |x|
      # strip decoration characters at both ends
      x.sub!(/\A[\$\*\-\+]+/, '')
      x.sub!(/[\$\*\-\=]+\z/, '')
      if x.size <= 1 or kwhash[x.downcase] or
          kill_regexp.find { |expr| expr =~ x } then
        nil
      else
        x
      end
    end
    a.compact!
    a.collect! { |x| x.downcase } unless case_sensitive
    a.sort!
    a.uniq!
    a
  end

  # Returns an identifier by a database name.
  #
  # Values stored in @info ('organism', 'ec', 'ucid' and previously found
  # results) are looked up first. Otherwise the first ID whose tag equals
  # dbname is used; for IDs with several fields, the preferred field is
  # chosen in the order acc_version, entry_id, locus, accession, number,
  # and then the first non-nil field. Returns nil when nothing is found.
  def get(dbname)
    db = dbname.to_s
    r = @info[db]
    return r if r
    di = @list_ids.find { |x| x[0] == db }
    return nil unless di
    if di.size <= 2 then
      r = di[-1]
    else
      # a tag unknown to NSIDs has no labels; fall through to the
      # first non-nil field below
      labels = self.class::NSIDs[db] || []
      [ 'acc_version', 'entry_id',
        'locus', 'accession', 'number'].each do |x|
        if i = labels.index(x) then
          r = di[i+1]
          break if r
        end
      end
      r = di[1..-1].find { |x| x } unless r
    end
    @info[db] = r if r
    r
  end

  # Returns an identifier by given type.
  # The field is taken from the first ID whose database tag has a label
  # equal to type_str; the result is nil when that field is empty or
  # when no such ID exists.
  def get_by_type(type_str)
    @list_ids.each do |x|
      if labels = self.class::NSIDs[x[0]] then
        if i = labels.index(type_str) then
          return x[i+1]
        end
      end
    end
    nil
  end

  # Returns identifiers by given type.
  # All non-nil fields whose label equals one of the given types are
  # collected, in order of appearance.
  def get_all_by_type(*type_strarg)
    d = []
    @list_ids.each do |x|
      if labels = self.class::NSIDs[x[0]] then
        type_strarg.each do |y|
          if i = labels.index(y) then
            d << x[i+1] if x[i+1]
          end
        end
      end
    end
    d
  end

  # Shows locus.
  # If the entry has more than two of such IDs,
  # only the first ID are shown.
  # Returns a string or nil.
  def locus
    # defined? is used so that a nil result is memoized too
    unless defined?(@locus)
      @locus = get_by_type('locus')
    end
    @locus
  end

  # Shows GI.
  # If the entry has more than two of such IDs,
  # only the first ID are shown.
  # Returns a string or nil.
  def gi
    unless defined?(@gi) then
      @gi = get_by_type('gi')
    end
    @gi
  end

  # Shows accession with version number.
  # If the entry has more than two of such IDs,
  # only the first ID are shown.
  # Returns a string or nil.
  def acc_version
    unless defined?(@acc_version) then
      @acc_version = get_by_type('acc_version')
    end
    @acc_version
  end

  # Shows accession numbers (version suffix such as ".1" is removed).
  # Returns an array of strings.
  def accessions
    unless defined?(@accessions) then
      @accessions = get_all_by_type('accession', 'acc_version')
      @accessions.collect! { |x| x.sub(/\..*\z/, '') }
    end
    @accessions
  end

  # Shows an accession number.
  # The accession part of acc_version is preferred; otherwise the first
  # element of accessions is returned (nil when there is none).
  def accession
    unless defined?(@accession) then
      if acc_version then
        @accession = acc_version.split('.')[0]
      else
        @accession = accessions[0]
      end
    end
    @accession
  end

  # Enables access by database name, e.g. defline.emb or defline.organism,
  # as a shortcut of get(name).
  # Raises RuntimeError when nothing is found and the name is not
  # a database tag listed in NSIDs.
  def method_missing(name, *args)
    # raise ArgumentError,
    #   "wrong # of arguments(#{args.size} for 1)" if args.size >= 2
    r = get(name, *args)
    if !r and !(self.class::NSIDs[name.to_s]) then
      raise "NameError: undefined method `#{name.inspect}'"
    end
    r
  end

  # Keeps respond_to? consistent with method_missing: true for database
  # tags listed in NSIDs and for names which get can resolve.
  def respond_to_missing?(name, include_private = false)
    key = name.to_s
    return true if self.class::NSIDs[key]
    # @info / @list_ids are not set on an object created without
    # initialize (e.g. by allocate); do not call get in that case
    return true if @info and @list_ids and get(key)
    super
  end

end #class FastaDefline
end #module Bio
From ngoto at dev.open-bio.org Fri Jun 20 13:30:16 2008
From: ngoto at dev.open-bio.org (Naohisa Goto)
Date: Fri, 20 Jun 2008 13:30:16 +0000
Subject: [BioRuby-cvs] bioruby/lib/bio/db fasta.rb,1.28.2.1,1.28.2.2
Message-ID: <200806201330.m5KDUGds021895@dev.open-bio.org>
Update of /home/repository/bioruby/bioruby/lib/bio/db
In directory dev.open-bio.org:/tmp/cvs-serv21857
Modified Files:
Tag: BRANCH-biohackathon2008
fasta.rb
Log Message:
Here-document separator string in example is changed to avoid confusion
about "END", which is also a reserved word in Ruby.
Index: fasta.rb
===================================================================
RCS file: /home/repository/bioruby/bioruby/lib/bio/db/fasta.rb,v
retrieving revision 1.28.2.1
retrieving revision 1.28.2.2
diff -C2 -d -r1.28.2.1 -r1.28.2.2
*** fasta.rb 20 Jun 2008 13:22:31 -0000 1.28.2.1
--- fasta.rb 20 Jun 2008 13:30:14 -0000 1.28.2.2
***************
*** 3,7 ****
#
# Copyright:: Copyright (C) 2001, 2002
! # GOTO Naohisa ,
# Toshiaki Katayama
# License:: The Ruby License
--- 3,7 ----
#
# Copyright:: Copyright (C) 2001, 2002
! # Naohisa Goto ,
# Toshiaki Katayama
# License:: The Ruby License
***************
*** 45,49 ****
# === Examples
#
! # f_str = <<END
# >sce:YBR160W CDC28, SRM5; cyclin-dependent protein kinase catalytic subunit [EC:2.7.1.-] [SP:CC28_YEAST]
# MSGELANYKRLEKVGEGTYGVVYKALDLRPGQGQRVVALKKIRLESEDEG
--- 45,49 ----
# === Examples
#
! # f_str = <<END_OF_STRING
# >sce:YBR160W CDC28, SRM5; cyclin-dependent protein kinase catalytic subunit [EC:2.7.1.-] [SP:CC28_YEAST]
# MSGELANYKRLEKVGEGTYGVVYKALDLRPGQGQRVVALKKIRLESEDEG
***************
*** 65,69 ****
# FERLCELLGYDNVFPLIINIKTKSNGGYQLCGSISIIKIEEELKSVGFER
# KTGDPLEWRRLFKKISTICRDIILIPN
! # END
#
# f = Bio::FastaFormat.new(f_str)
--- 65,69 ----
# FERLCELLGYDNVFPLIINIKTKSNGGYQLCGSISIIKIEEELKSVGFER
# KTGDPLEWRRLFKKISTICRDIILIPN
! # END_OF_STRING
#
# f = Bio::FastaFormat.new(f_str)
From ngoto at dev.open-bio.org Fri Jun 20 13:43:38 2008
From: ngoto at dev.open-bio.org (Naohisa Goto)
Date: Fri, 20 Jun 2008 13:43:38 +0000
Subject: [BioRuby-cvs] bioruby/lib/bio/db fasta.rb,1.28.2.2,1.28.2.3
Message-ID: <200806201343.m5KDhcUr021965@dev.open-bio.org>
Update of /home/repository/bioruby/bioruby/lib/bio/db
In directory dev.open-bio.org:/tmp/cvs-serv21945
Modified Files:
Tag: BRANCH-biohackathon2008
fasta.rb
Log Message:
Bio::FastaFormat#to_seq is renamed to to_biosequence with improvement.
The "to_seq" method is now an alias of to_biosequence.
Index: fasta.rb
===================================================================
RCS file: /home/repository/bioruby/bioruby/lib/bio/db/fasta.rb,v
retrieving revision 1.28.2.2
retrieving revision 1.28.2.3
diff -C2 -d -r1.28.2.2 -r1.28.2.3
*** fasta.rb 20 Jun 2008 13:30:14 -0000 1.28.2.2
--- fasta.rb 20 Jun 2008 13:43:36 -0000 1.28.2.3
***************
*** 28,31 ****
--- 28,32 ----
require 'bio/db'
require 'bio/sequence'
+ require 'bio/sequence/dblink'
require 'bio/db/fasta/defline'
***************
*** 217,226 ****
# because of efficiency.
#
! def to_seq
seq
obj = Bio::Sequence.new(@seq)
! obj.definition = self.definition
obj
end
# Parsing FASTA Defline, and extract IDs.
--- 218,243 ----
# because of efficiency.
#
! def to_biosequence
seq
obj = Bio::Sequence.new(@seq)
! d = self.identifiers
! # accessions
! obj.primary_accession = d.accessions.first
! obj.secondary_accessions = d.accessions[1..-1]
! # entry_id
! obj.entry_id = d.locus unless d.locus.to_s.empty?
! # GI
! other = []
! other.push Bio::Sequence::DBLink.new('GI', d.gi) if d.gi
! obj.other_seqids = other unless other.empty?
! # definition
! if d.accessions.empty? and other.empty? then
! obj.definition = self.definition
! else
! obj.definition = d.description
! end
obj
end
+ alias to_seq to_biosequence
# Parsing FASTA Defline, and extract IDs.