From ngoto at dev.open-bio.org  Mon Jun  2 05:33:50 2008
From: ngoto at dev.open-bio.org (Naohisa Goto)
Date: Mon, 02 Jun 2008 09:33:50 +0000
Subject: [BioRuby-cvs] bioruby/lib/bio reference.rb,1.26,1.27
Message-ID: <200806020933.m529Xoou025921@dev.open-bio.org>

Update of /home/repository/bioruby/bioruby/lib/bio
In directory dev.open-bio.org:/tmp/cvs-serv25887

Modified Files:
	reference.rb 
Log Message:
Reverted to 1.24 because of a potential security problem with "eval" in the
bibtex method.


Index: reference.rb
===================================================================
RCS file: /home/repository/bioruby/bioruby/lib/bio/reference.rb,v
retrieving revision 1.26
retrieving revision 1.27
diff -C2 -d -r1.26 -r1.27
*** reference.rb	31 May 2008 09:36:55 -0000	1.26
--- reference.rb	2 Jun 2008 09:33:48 -0000	1.27
***************
*** 71,74 ****
--- 71,77 ----
      attr_reader :abstract
  
+     # An URL String.
+     attr_reader :url
+ 
      # MeSH terms in an Array.
      attr_reader :mesh
***************
*** 77,83 ****
      attr_reader :affiliations
  
-     # An URL String.
-     attr_reader :url
- 
      # Create a new Bio::Reference object from a Hash of values. 
      # Data is extracted from the values for keys:
--- 80,83 ----
***************
*** 232,236 ****
        lines << "%P #{@pages}" unless @pages.empty?
        lines << "%M #{@pubmed}" unless @pubmed.to_s.empty?
!       lines << "%U #{url}" unless url.empty?
        lines << "%X #{@abstract}" unless @abstract.empty?
        @mesh.each do |term|
--- 232,241 ----
        lines << "%P #{@pages}" unless @pages.empty?
        lines << "%M #{@pubmed}" unless @pubmed.to_s.empty?
!       if @pubmed
!         cgi = "http://www.ncbi.nlm.nih.gov/entrez/query.fcgi"
!         opts = "cmd=Retrieve&db=PubMed&dopt=Citation&list_uids"
!         @url = "#{cgi}?#{opts}=#{@pubmed}"
!       end
!       lines << "%U #{@url}" unless @url.empty?
        lines << "%X #{@abstract}" unless @abstract.empty?
        @mesh.each do |term|
***************
*** 294,321 ****
      # *Arguments*:
      # * (optional) _section_: BiBTeX section as String
-     # * (optional) _keywords_: Array of additional keywords, e.g. ['abstract']
      # *Returns*:: String
!     def bibtex(section = nil, add_keywords = [])
        section = "article" unless section
        authors = authors_join(' and ', ' and ')
        pages   = @pages.sub('-', '--')
!       keywords = "author title journal year volume number pages url".split(/ /)
!       bib = "@#{section}{PMID:#{@pubmed},\n"
!       (keywords+add_keywords).each do | kw |
!         if kw == 'author'
!           ref = authors
!         elsif kw == 'title'
!           # strip final dot from title
!           ref = @title.sub(/\.$/,'')
!         elsif kw == 'number'
!           ref = @issue
!         elsif kw == 'url'
!           ref = url
!         else
!           ref = eval('@'+kw)
!         end
!         bib += "  #{kw.ljust(12)} = {#{ref}},\n" if ref != ''
!       end
!       bib+"}\n"
      end
  
--- 299,318 ----
      # *Arguments*:
      # * (optional) _section_: BiBTeX section as String
      # *Returns*:: String
!     def bibtex(section = nil)
        section = "article" unless section
        authors = authors_join(' and ', ' and ')
        pages   = @pages.sub('-', '--')
!       return <<-"END".gsub(/\t/, '')
!         @#{section}{PMID:#{@pubmed},
!           author  = {#{authors}},
!           title   = {#{@title}},
!           journal = {#{@journal}},
!           year    = {#{@year}},
!           volume  = {#{@volume}},
!           number  = {#{@issue}},
!           pages   = {#{pages}},
!         }
!       END
      end
  
***************
*** 503,518 ****
      end
  
-     # Returns a valid URL for pubmed records
-     #
-     # *Returns*:: String
-     def url
-       return @url if @url and @url != ''
-       if @pubmed != ''
-         cgi = "http://www.ncbi.nlm.nih.gov/entrez/query.fcgi"
-         opts = "cmd=Retrieve&db=PubMed&dopt=Citation&list_uids"
-         return "#{cgi}?#{opts}=#{@pubmed}"
-       end
-       ''
-     end
  
      private
--- 500,503 ----
***************
*** 542,546 ****
      end
  
- 
    end
  
--- 527,530 ----


From ngoto at dev.open-bio.org  Mon Jun  2 05:47:11 2008
From: ngoto at dev.open-bio.org (Naohisa Goto)
Date: Mon, 02 Jun 2008 09:47:11 +0000
Subject: [BioRuby-cvs] bioruby/lib/bio reference.rb,1.27,1.28
Message-ID: <200806020947.m529lBCN026079@dev.open-bio.org>

Update of /home/repository/bioruby/bioruby/lib/bio
In directory dev.open-bio.org:/tmp/cvs-serv26058/lib/bio

Modified Files:
	reference.rb 
Log Message:
* New method Bio::Reference#pubmed_url added (renamed the url method in
  revision 1.25).
* Bio::Reference#endnote is changed not to overwrite url if url is
  already given by user.


Index: reference.rb
===================================================================
RCS file: /home/repository/bioruby/bioruby/lib/bio/reference.rb,v
retrieving revision 1.27
retrieving revision 1.28
diff -C2 -d -r1.27 -r1.28
*** reference.rb	2 Jun 2008 09:33:48 -0000	1.27
--- reference.rb	2 Jun 2008 09:47:08 -0000	1.28
***************
*** 232,241 ****
        lines << "%P #{@pages}" unless @pages.empty?
        lines << "%M #{@pubmed}" unless @pubmed.to_s.empty?
!       if @pubmed
!         cgi = "http://www.ncbi.nlm.nih.gov/entrez/query.fcgi"
!         opts = "cmd=Retrieve&db=PubMed&dopt=Citation&list_uids"
!         @url = "#{cgi}?#{opts}=#{@pubmed}"
!       end
!       lines << "%U #{@url}" unless @url.empty?
        lines << "%X #{@abstract}" unless @abstract.empty?
        @mesh.each do |term|
--- 232,237 ----
        lines << "%P #{@pages}" unless @pages.empty?
        lines << "%M #{@pubmed}" unless @pubmed.to_s.empty?
!       url = @url.empty? ? pubmed_url : @url
!       lines << "%U #{url}" unless url.empty?
        lines << "%X #{@abstract}" unless @abstract.empty?
        @mesh.each do |term|
***************
*** 500,503 ****
--- 496,510 ----
      end
  
+     # Returns a valid URL for pubmed records
+     #
+     # *Returns*:: String
+     def pubmed_url
+       unless @pubmed.to_s.empty?
+         cgi = "http://www.ncbi.nlm.nih.gov/entrez/query.fcgi"
+         opts = "cmd=Retrieve&db=PubMed&dopt=Citation&list_uids"
+         return "#{cgi}?#{opts}=#{@pubmed}"
+       end
+       ''
+     end
  
      private


From ngoto at dev.open-bio.org  Wed Jun  4 10:56:40 2008
From: ngoto at dev.open-bio.org (Naohisa Goto)
Date: Wed, 04 Jun 2008 14:56:40 +0000
Subject: [BioRuby-cvs] bioruby/lib/bio reference.rb,1.28,1.29
Message-ID: <200806041456.m54Eue8E001532@dev.open-bio.org>

Update of /home/repository/bioruby/bioruby/lib/bio
In directory dev.open-bio.org:/tmp/cvs-serv1512/lib/bio

Modified Files:
	reference.rb 
Log Message:
improvement of Bio::Reference#bibtex method


Index: reference.rb
===================================================================
RCS file: /home/repository/bioruby/bioruby/lib/bio/reference.rb,v
retrieving revision 1.28
retrieving revision 1.29
diff -C2 -d -r1.28 -r1.29
*** reference.rb	2 Jun 2008 09:47:08 -0000	1.28
--- reference.rb	4 Jun 2008 14:56:37 -0000	1.29
***************
*** 167,184 ****
      # *Arguments*:
      # * (optional) _style_: String with style identifier
!     # * (optional) _option_: Option for styles accepting one
      # *Returns*:: String
!     def format(style = nil, option = nil)
        case style
        when 'endnote'
          return endnote
        when 'bibitem'
!         return bibitem(option)
        when 'bibtex'
!         return bibtex(option)
        when 'rd'
!         return rd(option)
        when /^nature$/i
!         return nature(option)
        when /^science$/i
          return science
--- 167,184 ----
      # *Arguments*:
      # * (optional) _style_: String with style identifier
!     # * (optional) _options_: Options for styles accepting one
      # *Returns*:: String
!     def format(style = nil, *options)
        case style
        when 'endnote'
          return endnote
        when 'bibitem'
!         return bibitem(*options)
        when 'bibtex'
!         return bibtex(*options)
        when 'rd'
!         return rd(*options)
        when /^nature$/i
!         return nature(*options)
        when /^science$/i
          return science
***************
*** 295,314 ****
      # *Arguments*:
      # * (optional) _section_: BiBTeX section as String
      # *Returns*:: String
!     def bibtex(section = nil)
        section = "article" unless section
        authors = authors_join(' and ', ' and ')
!       pages   = @pages.sub('-', '--')
!       return <<-"END".gsub(/\t/, '')
!         @#{section}{PMID:#{@pubmed},
!           author  = {#{authors}},
!           title   = {#{@title}},
!           journal = {#{@journal}},
!           year    = {#{@year}},
!           volume  = {#{@volume}},
!           number  = {#{@issue}},
!           pages   = {#{pages}},
!         }
!       END
      end
  
--- 295,340 ----
      # *Arguments*:
      # * (optional) _section_: BiBTeX section as String
+     # * (optional) _label_: Label string cited by LaTeX documents.
+     #                       Default is <tt>"PMID:#{pubmed}"</tt>.
+     # * (optional) _keywords_: Hash of additional keywords,
+     #                          e.g. { 'abstract' => 'This is abstract.' }.
+     #                          You can also override default keywords.
+     #                          To disable default keywords, specify false as
+     #                          value, e.g. { 'url' => false, 'year' => false }.
      # *Returns*:: String
!     def bibtex(section = nil, label = nil, keywords = {})
        section = "article" unless section
        authors = authors_join(' and ', ' and ')
!       thepages = pages.to_s.empty? ? nil : pages.sub(/\-/, '--')
!       unless label then
!         label = "PMID:#{pubmed}"
!       end
!       theurl = if !(url.to_s.empty?) then
!                  url
!                elsif pmurl = pubmed_url and !(pmurl.to_s.empty?) then
!                  pmurl
!                else
!                  nil
!                end
!       hash = {
!         'author'  => authors.empty?    ? nil : authors,
!         'title'   => title.to_s.empty? ? nil : title,
!         'number'  => issue.to_s.empty? ? nil : issue,
!         'pages'   => thepages,
!         'url'     => theurl
!       }
!       keys = %w( author title journal year volume number pages url )
!       keys.each do |k|
!         hash[k] = self.__send__(k.intern) unless hash.has_key?(k)
!       end
!       hash.merge!(keywords) { |k, v1, v2| v2.nil? ? v1 : v2 }
!       bib = [ "@#{section}{#{label}," ]
!       keys.concat((hash.keys - keys).sort)
!       keys.each do |kw|
!         ref = hash[kw]
!         bib.push "  #{kw.ljust(12)} = {#{ref}}," if ref
!       end
!       bib.push "}\n"
!       return bib.join("\n")
      end
  

From ngoto at dev.open-bio.org  Wed Jun  4 10:58:10 2008
From: ngoto at dev.open-bio.org (Naohisa Goto)
Date: Wed, 04 Jun 2008 14:58:10 +0000
Subject: [BioRuby-cvs] bioruby/test/unit/bio test_reference.rb,1.4,1.5
Message-ID: <200806041458.m54EwAo2001581@dev.open-bio.org>

Update of /home/repository/bioruby/bioruby/test/unit/bio
In directory dev.open-bio.org:/tmp/cvs-serv1561/test/unit/bio

Modified Files:
	test_reference.rb 
Log Message:
test changed due to the improvement of Bio::Reference#bibtex


Index: test_reference.rb
===================================================================
RCS file: /home/repository/bioruby/bioruby/test/unit/bio/test_reference.rb,v
retrieving revision 1.4
retrieving revision 1.5
diff -C2 -d -r1.4 -r1.5
*** test_reference.rb	31 May 2008 09:36:56 -0000	1.4
--- test_reference.rb	4 Jun 2008 14:58:08 -0000	1.5
***************
*** 103,112 ****
  
      def test_format_bibtex
!       str = "@article{PMID:12345678,\n  author       = {Hoge, J.P. and Fuga, F.B.},\n  title        = {Title of the study},\n  journal      = {Theor. J. Hoge},\n  year         = {2001},\n  volume       = {12},\n  number       = {3},\n  pages        = {123-145},\n  url          = {http://example.com},\n}\n"
! 
        assert_equal(str, @obj.format('bibtex'))
        assert_equal(str, @obj.bibtex)
      end
  
      def test_format_rd
        str = "== Title of the study.\n\n* Hoge, J.P. and Fuga, F.B.\n\n* Theor. J. Hoge 2001 12:123-145 [PMID:12345678]\n\nHoge fuga. hoge fuga."
--- 103,147 ----
  
      def test_format_bibtex
!       str =<<__END__
! @article{PMID:12345678,
!   author       = {Hoge, J.P. and Fuga, F.B.},
!   title        = {Title of the study.},
!   journal      = {Theor. J. Hoge},
!   year         = {2001},
!   volume       = {12},
!   number       = {3},
!   pages        = {123--145},
!   url          = {http://example.com},
! }
! __END__
        assert_equal(str, @obj.format('bibtex'))
        assert_equal(str, @obj.bibtex)
      end
  
+     def test_format_bibtex_with_arguments
+       str =<<__END__
+ @inproceedings{YourArticle,
+   author       = {Hoge, J.P. and Fuga, F.B.},
+   title        = {Title of the study.},
+   year         = {2001},
+   volume       = {12},
+   number       = {3},
+   pages        = {123--145},
+   booktitle    = {Theor. J. Hoge},
+   month        = {December},
+ }
+ __END__
+       assert_equal(str, @obj.format('bibtex', 'inproceedings', 'YourArticle',
+                                     { 'journal'   => false,
+                                       'url' => false,
+                                       'booktitle' => @obj.journal,
+                                       'month' => 'December'}))
+       assert_equal(str, @obj.bibtex('inproceedings', 'YourArticle',
+                                     { 'journal'   => false,
+                                       'url' => false,
+                                       'booktitle' => @obj.journal,
+                                       'month' => 'December'}))
+     end
+ 
      def test_format_rd
        str = "== Title of the study.\n\n* Hoge, J.P. and Fuga, F.B.\n\n* Theor. J. Hoge 2001 12:123-145 [PMID:12345678]\n\nHoge fuga. hoge fuga."


From ngoto at dev.open-bio.org  Fri Jun 13 07:20:26 2008
From: ngoto at dev.open-bio.org (Naohisa Goto)
Date: Fri, 13 Jun 2008 11:20:26 +0000
Subject: [BioRuby-cvs] bioruby/lib/bio reference.rb,1.29,1.30
Message-ID: <200806131120.m5DBKQLQ004888@dev.open-bio.org>

Update of /home/repository/bioruby/bioruby/lib/bio
In directory dev.open-bio.org:/tmp/cvs-serv4830/lib/bio

Modified Files:
	reference.rb 
Log Message:
modified RDoc for Bio::Reference#bibitem 


Index: reference.rb
===================================================================
RCS file: /home/repository/bioruby/bioruby/lib/bio/reference.rb,v
retrieving revision 1.29
retrieving revision 1.30
diff -C2 -d -r1.29 -r1.30
*** reference.rb	4 Jun 2008 14:56:37 -0000	1.29
--- reference.rb	13 Jun 2008 11:20:23 -0000	1.30
***************
*** 252,255 ****
--- 252,257 ----
      #     {\em Theor. J. Hoge}, 12(3):123--145, 2001.
      # ---
+     # *Arguments*:
+     # * (optional) _item_: label string (default: <tt>"PMID:#{pubmed}"</tt>).
      # *Returns*:: String
      def bibitem(item = nil)


From ngoto at dev.open-bio.org  Fri Jun 13 07:37:27 2008
From: ngoto at dev.open-bio.org (Naohisa Goto)
Date: Fri, 13 Jun 2008 11:37:27 +0000
Subject: [BioRuby-cvs]
	bioruby/test/unit/bio/util/restriction_enzyme/double_stranded
	test_aligned_strands.rb, 1.3, 1.4
Message-ID: <200806131137.m5DBbRnA005201@dev.open-bio.org>

Update of /home/repository/bioruby/bioruby/test/unit/bio/util/restriction_enzyme/double_stranded
In directory dev.open-bio.org:/tmp/cvs-serv5181/test/unit/bio/util/restriction_enzyme/double_stranded

Modified Files:
	test_aligned_strands.rb 
Log Message:
"require 'bio/sequence'" is needed to run the tests in this file.


Index: test_aligned_strands.rb
===================================================================
RCS file: /home/repository/bioruby/bioruby/test/unit/bio/util/restriction_enzyme/double_stranded/test_aligned_strands.rb,v
retrieving revision 1.3
retrieving revision 1.4
diff -C2 -d -r1.3 -r1.4
*** test_aligned_strands.rb	5 Apr 2007 23:35:44 -0000	1.3
--- test_aligned_strands.rb	13 Jun 2008 11:37:25 -0000	1.4
***************
*** 14,17 ****
--- 14,18 ----
  
  require 'test/unit'
+ require 'bio/sequence'
  require 'bio/util/restriction_enzyme/double_stranded/aligned_strands'
  require 'bio/util/restriction_enzyme/double_stranded'


From ngoto at dev.open-bio.org  Fri Jun 13 07:39:41 2008
From: ngoto at dev.open-bio.org (Naohisa Goto)
Date: Fri, 13 Jun 2008 11:39:41 +0000
Subject: [BioRuby-cvs]
	bioruby/test/unit/bio/util/restriction_enzyme/double_stranded
	test_aligned_strands.rb, 1.3, 1.3.2.1
Message-ID: <200806131139.m5DBdfXW005450@dev.open-bio.org>

Update of /home/repository/bioruby/bioruby/test/unit/bio/util/restriction_enzyme/double_stranded
In directory dev.open-bio.org:/tmp/cvs-serv5209/test/unit/bio/util/restriction_enzyme/double_stranded

Modified Files:
      Tag: BRANCH-biohackathon2008
	test_aligned_strands.rb 
Log Message:
merged change from rev. 1.3 to 1.4 in the CVS trunk


Index: test_aligned_strands.rb
===================================================================
RCS file: /home/repository/bioruby/bioruby/test/unit/bio/util/restriction_enzyme/double_stranded/test_aligned_strands.rb,v
retrieving revision 1.3
retrieving revision 1.3.2.1
diff -C2 -d -r1.3 -r1.3.2.1
*** test_aligned_strands.rb	5 Apr 2007 23:35:44 -0000	1.3
--- test_aligned_strands.rb	13 Jun 2008 11:39:39 -0000	1.3.2.1
***************
*** 14,17 ****
--- 14,18 ----
  
  require 'test/unit'
+ require 'bio/sequence'
  require 'bio/util/restriction_enzyme/double_stranded/aligned_strands'
  require 'bio/util/restriction_enzyme/double_stranded'


From ngoto at dev.open-bio.org  Tue Jun 17 08:23:52 2008
From: ngoto at dev.open-bio.org (Naohisa Goto)
Date: Tue, 17 Jun 2008 12:23:52 +0000
Subject: [BioRuby-cvs] bioruby/lib/bio reference.rb,1.24.2.6,1.24.2.7
Message-ID: <200806171223.m5HCNqfC020085@dev.open-bio.org>

Update of /home/repository/bioruby/bioruby/lib/bio
In directory dev.open-bio.org:/tmp/cvs-serv20065/lib/bio

Modified Files:
      Tag: BRANCH-biohackathon2008
	reference.rb 
Log Message:
merged changes in trunk (revision 1.30)


Index: reference.rb
===================================================================
RCS file: /home/repository/bioruby/bioruby/lib/bio/reference.rb,v
retrieving revision 1.24.2.6
retrieving revision 1.24.2.7
diff -C2 -d -r1.24.2.6 -r1.24.2.7
*** reference.rb	23 Apr 2008 18:52:18 -0000	1.24.2.6
--- reference.rb	17 Jun 2008 12:23:49 -0000	1.24.2.7
***************
*** 180,186 ****
      # *Arguments*:
      # * (optional) _style_: String with style identifier
!     # * (optional) _option_: Option for styles accepting one
      # *Returns*:: String
!     def format(style = nil, option = nil)
        case style
        when 'embl'
--- 180,186 ----
      # *Arguments*:
      # * (optional) _style_: String with style identifier
!     # * (optional) _options_: Options for styles accepting one
      # *Returns*:: String
!     def format(style = nil, *options)
        case style
        when 'embl'
***************
*** 189,199 ****
          return endnote
        when 'bibitem'
!         return bibitem(option)
        when 'bibtex'
!         return bibtex(option)
        when 'rd'
!         return rd(option)
        when /^nature$/i
!         return nature(option)
        when /^science$/i
          return science
--- 189,199 ----
          return endnote
        when 'bibitem'
!         return bibitem(*options)
        when 'bibtex'
!         return bibtex(*options)
        when 'rd'
!         return rd(*options)
        when /^nature$/i
!         return nature(*options)
        when /^science$/i
          return science
***************
*** 247,256 ****
        lines << "%P #{@pages}" unless @pages.empty?
        lines << "%M #{@pubmed}" unless @pubmed.to_s.empty?
!       if @pubmed
!         cgi = "http://www.ncbi.nlm.nih.gov/entrez/query.fcgi"
!         opts = "cmd=Retrieve&db=PubMed&dopt=Citation&list_uids"
!         @url = "#{cgi}?#{opts}=#{@pubmed}"
!       end
!       lines << "%U #{@url}" unless @url.empty?
        lines << "%X #{@abstract}" unless @abstract.empty?
        @mesh.each do |term|
--- 247,252 ----
        lines << "%P #{@pages}" unless @pages.empty?
        lines << "%M #{@pubmed}" unless @pubmed.to_s.empty?
!       u = @url.empty? ? pubmed_url : @url
!       lines << "%U #{u}" unless u.empty?
        lines << "%X #{@abstract}" unless @abstract.empty?
        @mesh.each do |term|
***************
*** 289,292 ****
--- 285,290 ----
      #     {\em Theor. J. Hoge}, 12(3):123--145, 2001.
      # ---
+     # *Arguments*:
+     # * (optional) _item_: label string (default: <tt>"PMID:#{pubmed}"</tt>).
      # *Returns*:: String
      def bibitem(item = nil)
***************
*** 332,351 ****
      # *Arguments*:
      # * (optional) _section_: BiBTeX section as String
      # *Returns*:: String
!     def bibtex(section = nil)
        section = "article" unless section
        authors = authors_join(' and ', ' and ')
!       pages   = @pages.sub('-', '--')
!       return <<-"END".gsub(/\t/, '')
!         @#{section}{PMID:#{@pubmed},
!           author  = {#{authors}},
!           title   = {#{@title}},
!           journal = {#{@journal}},
!           year    = {#{@year}},
!           volume  = {#{@volume}},
!           number  = {#{@issue}},
!           pages   = {#{pages}},
!         }
!       END
      end
  
--- 330,375 ----
      # *Arguments*:
      # * (optional) _section_: BiBTeX section as String
+     # * (optional) _label_: Label string cited by LaTeX documents.
+     #                       Default is <tt>"PMID:#{pubmed}"</tt>.
+     # * (optional) _keywords_: Hash of additional keywords,
+     #                          e.g. { 'abstract' => 'This is abstract.' }.
+     #                          You can also override default keywords.
+     #                          To disable default keywords, specify false as
+     #                          value, e.g. { 'url' => false, 'year' => false }.
      # *Returns*:: String
!     def bibtex(section = nil, label = nil, keywords = {})
        section = "article" unless section
        authors = authors_join(' and ', ' and ')
!       thepages = pages.to_s.empty? ? nil : pages.sub(/\-/, '--')
!       unless label then
!         label = "PMID:#{pubmed}"
!       end
!       theurl = if !(url.to_s.empty?) then
!                  url
!                elsif pmurl = pubmed_url and !(pmurl.to_s.empty?) then
!                  pmurl
!                else
!                  nil
!                end
!       hash = {
!         'author'  => authors.empty?    ? nil : authors,
!         'title'   => title.to_s.empty? ? nil : title,
!         'number'  => issue.to_s.empty? ? nil : issue,
!         'pages'   => thepages,
!         'url'     => theurl
!       }
!       keys = %w( author title journal year volume number pages url )
!       keys.each do |k|
!         hash[k] = self.__send__(k.intern) unless hash.has_key?(k)
!       end
!       hash.merge!(keywords) { |k, v1, v2| v2.nil? ? v1 : v2 }
!       bib = [ "@#{section}{#{label}," ]
!       keys.concat((hash.keys - keys).sort)
!       keys.each do |kw|
!         ref = hash[kw]
!         bib.push "  #{kw.ljust(12)} = {#{ref}}," if ref
!       end
!       bib.push "}\n"
!       return bib.join("\n")
      end
  
***************
*** 533,536 ****
--- 557,571 ----
      end
  
+     # Returns a valid URL for pubmed records
+     #
+     # *Returns*:: String
+     def pubmed_url
+       unless @pubmed.to_s.empty?
+         cgi = "http://www.ncbi.nlm.nih.gov/entrez/query.fcgi"
+         opts = "cmd=Retrieve&db=PubMed&dopt=Citation&list_uids"
+         return "#{cgi}?#{opts}=#{@pubmed}"
+       end
+       ''
+     end
  
      private


From ngoto at dev.open-bio.org  Tue Jun 17 08:24:44 2008
From: ngoto at dev.open-bio.org (Naohisa Goto)
Date: Tue, 17 Jun 2008 12:24:44 +0000
Subject: [BioRuby-cvs] bioruby/test/unit/bio test_reference.rb, 1.3.2.1,
	1.3.2.2
Message-ID: <200806171224.m5HCOiAk020113@dev.open-bio.org>

Update of /home/repository/bioruby/bioruby/test/unit/bio
In directory dev.open-bio.org:/tmp/cvs-serv20093/test/unit/bio

Modified Files:
      Tag: BRANCH-biohackathon2008
	test_reference.rb 
Log Message:
merged changes from trunk (revision 1.5)


Index: test_reference.rb
===================================================================
RCS file: /home/repository/bioruby/bioruby/test/unit/bio/test_reference.rb,v
retrieving revision 1.3.2.1
retrieving revision 1.3.2.2
diff -C2 -d -r1.3.2.1 -r1.3.2.2
*** test_reference.rb	8 May 2008 05:38:01 -0000	1.3.2.1
--- test_reference.rb	17 Jun 2008 12:24:41 -0000	1.3.2.2
***************
*** 92,96 ****
  
      def test_format_endnote
!       str = "%0 Journal Article\n%A Hoge, J.P.\n%A Fuga, F.B.\n%D 2001\n%T Title of the study.\n%J Theor. J. Hoge\n%V 12\n%N 3\n%P 123-145\n%M 12345678\n%U http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?cmd=Retrieve&db=PubMed&dopt=Citation&list_uids=12345678\n%X Hoge fuga. hoge fuga.\n%K Hoge\n%+ Tokyo"
        assert_equal(str, @obj.format('endnote'))
        assert_equal(str, @obj.endnote)
--- 92,96 ----
  
      def test_format_endnote
!       str = "%0 Journal Article\n%A Hoge, J.P.\n%A Fuga, F.B.\n%D 2001\n%T Title of the study.\n%J Theor. J. Hoge\n%V 12\n%N 3\n%P 123-145\n%M 12345678\n%U http://example.com\n%X Hoge fuga. hoge fuga.\n%K Hoge\n%+ Tokyo"
        assert_equal(str, @obj.format('endnote'))
        assert_equal(str, @obj.endnote)
***************
*** 104,122 ****
  
      def test_format_bibtex
!       str =<<END
!         @article{PMID:12345678,
!           author  = {Hoge, J.P. and Fuga, F.B.},
!           title   = {Title of the study.},
!           journal = {Theor. J. Hoge},
!           year    = {2001},
!           volume  = {12},
!           number  = {3},
!           pages   = {123--145},
!         }
! END
        assert_equal(str, @obj.format('bibtex'))
        assert_equal(str, @obj.bibtex)
      end
  
      def test_format_rd
        str = "== Title of the study.\n\n* Hoge, J.P. and Fuga, F.B.\n\n* Theor. J. Hoge 2001 12:123-145 [PMID:12345678]\n\nHoge fuga. hoge fuga."
--- 104,148 ----
  
      def test_format_bibtex
!       str =<<__END__
! @article{PMID:12345678,
!   author       = {Hoge, J.P. and Fuga, F.B.},
!   title        = {Title of the study.},
!   journal      = {Theor. J. Hoge},
!   year         = {2001},
!   volume       = {12},
!   number       = {3},
!   pages        = {123--145},
!   url          = {http://example.com},
! }
! __END__
        assert_equal(str, @obj.format('bibtex'))
        assert_equal(str, @obj.bibtex)
      end
  
+     def test_format_bibtex_with_arguments
+       str =<<__END__
+ @inproceedings{YourArticle,
+   author       = {Hoge, J.P. and Fuga, F.B.},
+   title        = {Title of the study.},
+   year         = {2001},
+   volume       = {12},
+   number       = {3},
+   pages        = {123--145},
+   booktitle    = {Theor. J. Hoge},
+   month        = {December},
+ }
+ __END__
+       assert_equal(str, @obj.format('bibtex', 'inproceedings', 'YourArticle',
+                                     { 'journal'   => false,
+                                       'url' => false,
+                                       'booktitle' => @obj.journal,
+                                       'month' => 'December'}))
+       assert_equal(str, @obj.bibtex('inproceedings', 'YourArticle',
+                                     { 'journal'   => false,
+                                       'url' => false,
+                                       'booktitle' => @obj.journal,
+                                       'month' => 'December'}))
+     end
+ 
      def test_format_rd
        str = "== Title of the study.\n\n* Hoge, J.P. and Fuga, F.B.\n\n* Theor. J. Hoge 2001 12:123-145 [PMID:12345678]\n\nHoge fuga. hoge fuga."


From ngoto at dev.open-bio.org  Tue Jun 17 11:25:24 2008
From: ngoto at dev.open-bio.org (Naohisa Goto)
Date: Tue, 17 Jun 2008 15:25:24 +0000
Subject: [BioRuby-cvs] bioruby/lib/bio sequence.rb,0.58.2.11,0.58.2.12
Message-ID: <200806171525.m5HFPOpk020858@dev.open-bio.org>

Update of /home/repository/bioruby/bioruby/lib/bio
In directory dev.open-bio.org:/tmp/cvs-serv20823/lib/bio

Modified Files:
      Tag: BRANCH-biohackathon2008
	sequence.rb 
Log Message:
* Some attributes are added: strandedness (strand information), 
  release_created, release_modified (release information),
  entry_version (version of the entry numbered by database administrator),
  organelle (organelle information), other_seqids (sequence IDs other than
  accessions), and id_namespace (namespace of accessions).
  Most of them are added because corresponding tags are defined in the
  INSDSeq XML v1.4 ( http://www.insdc.org/files/documents/INSD_V1.4.dtd ).
  The "id_namespace" will be used to output NCBI style fasta format.
* The "taxonomy" attribute is changed to be an alias of the "classification"
  attribute.
* The "date" attribute is removed.
* RDoc documents of attributes are updated.


Index: sequence.rb
===================================================================
RCS file: /home/repository/bioruby/bioruby/lib/bio/sequence.rb,v
retrieving revision 0.58.2.11
retrieving revision 0.58.2.12
diff -C2 -d -r0.58.2.11 -r0.58.2.12
*** sequence.rb	24 Apr 2008 14:28:25 -0000	0.58.2.11
--- sequence.rb	17 Jun 2008 15:25:22 -0000	0.58.2.12
***************
*** 118,149 ****
    end
    
!   # The sequence identifier.  For example, for a sequence
!   # of Genbank origin, this is the accession number.
    attr_accessor :entry_id
    
!   # A String with a description of the sequence
    attr_accessor :definition
    
!   # An Array of Bio::Feature objects
    attr_accessor :features
    
!   # An Array of Bio::Reference objects
    attr_accessor :references
    
!   # A comment String
    attr_accessor :comments
    
!   # Date from sequence source. Often date of deposition.
!   attr_accessor :date
!   
!   # An Array of Strings
    attr_accessor :keywords
    
!   # An Array of Strings; links to other database entries.
    attr_accessor :dblinks
!   
!   # A taxonomy String
!   attr_accessor :taxonomy
!   
    # Bio::Sequence::NA/AA
    attr_accessor :moltype
--- 118,145 ----
    end
    
!   # The sequence identifier (String).  For example, for a sequence
!   # of Genbank origin, this is the locus name.
!   # For a sequence of EMBL origin, this is the primary accession number.
    attr_accessor :entry_id
    
!   # A String with a description of the sequence (String)
    attr_accessor :definition
    
!   # Features (An Array of Bio::Feature objects)
    attr_accessor :features
    
!   # References (An Array of Bio::Reference objects)
    attr_accessor :references
    
!   # Comments (String or an Array of String)
    attr_accessor :comments
    
!   # Keywords (An Array of String)
    attr_accessor :keywords
    
!   # Links to other database entries.
!   # (An Array of Bio::Sequence::DBLink objects)
    attr_accessor :dblinks
! 
    # Bio::Sequence::NA/AA
    attr_accessor :moltype
***************
*** 157,166 ****
    #+++
    
!   # Version number of the sequence (String).
    attr_accessor :sequence_version
  
!   # Topology (String). "circular" or "linear".
    attr_accessor :topology
  
    # molecular type (String). "DNA" or "RNA" for nucleotide sequence.
    attr_accessor :molecule_type
--- 153,170 ----
    #+++
    
!   # Version number of the sequence (String or Integer).
!   # Unlike <tt>entry_version</tt>, <tt>sequence_version</tt> will be changed
!   # when the submitter of the sequence updates the entry.
!   # Normally, the same entry taken from different databases (EMBL, GenBank,
!   # and DDBJ) may have the same sequence_version.
    attr_accessor :sequence_version
  
!   # Topology (String). "circular", "linear", or nil.
    attr_accessor :topology
  
+   # Strandedness (String). "single" (single-stranded),
+   # "double" (double-stranded), "mixed" (mixed-stranded), or nil.
+   attr_accessor :strandedness
+ 
    # molecular type (String). "DNA" or "RNA" for nucleotide sequence.
    attr_accessor :molecule_type
***************
*** 180,189 ****
    attr_accessor :secondary_accessions
  
!   # Created date of the sequence entry (String)
    attr_accessor :date_created
  
!   # Last modified date of the sequence entry (String)
    attr_accessor :date_modified
  
    # Organism species (String). For example, "Escherichia coli".
    attr_accessor :species
--- 184,208 ----
    attr_accessor :secondary_accessions
  
!   # Created date of the sequence entry (Date, DateTime, Time, or String)
    attr_accessor :date_created
  
!   # Last modified date of the sequence entry (Date, DateTime, Time, or String)
    attr_accessor :date_modified
  
+   # Release information when created (String)
+   attr_accessor :release_created
+ 
+   # Release information when last-modified (String)
+   attr_accessor :release_modified
+ 
+   # Version of the entry (String or Integer).
+   # Unlike <tt>sequence_version</tt>, <tt>entry_version</tt> is a database
+   # maintainer's internal version number.
+   # The version number will be changed when the database maintainer
+   # modifies the entry.
+   # The same entry in EMBL, GenBank, and DDBJ may have different
+   # entry_version.
+   attr_accessor :entry_version
+ 
    # Organism species (String). For example, "Escherichia coli".
    attr_accessor :species
***************
*** 192,195 ****
--- 211,231 ----
    # (Array of String)
    attr_accessor :classification
+   alias taxonomy classification
+ 
+   # (not well supported) Organelle information (String).
+   attr_accessor :organelle
+ 
+   # Namespace of the sequence IDs described in entry_id, primary_accession,
+   # and secondary_accessions methods (String).
+   # For example, 'EMBL', 'GenBank', 'DDBJ', 'RefSeq'.
+   attr_accessor :id_namespace
+ 
+   # Sequence identifiers which are not described in entry_id,
+   # primary_accession, and secondary_accessions methods
+   # (Array of Bio::Sequence::DBLink objects).
+   # For example, NCBI GI number can be stored.
+   # Note that only identifiers of the entry itself should be stored.
+   # For database cross references, <tt>dblinks</tt> should be used.
+   attr_accessor :other_seqids
  
    # Guess the type of sequence, Amino Acid or Nucleic Acid, and create a 


From ngoto at dev.open-bio.org  Tue Jun 17 11:44:24 2008
From: ngoto at dev.open-bio.org (Naohisa Goto)
Date: Tue, 17 Jun 2008 15:44:24 +0000
Subject: [BioRuby-cvs] bioruby/test/unit/bio/sequence test_dblink.rb, NONE,
	1.1.2.1
Message-ID: <200806171544.m5HFiOIl021028@dev.open-bio.org>

Update of /home/repository/bioruby/bioruby/test/unit/bio/sequence
In directory dev.open-bio.org:/tmp/cvs-serv21001/test/unit/bio/sequence

Added Files:
      Tag: BRANCH-biohackathon2008
	test_dblink.rb 
Log Message:
New class Bio::Sequence::DBLink is added to store IDs and database names
together in an object.


--- NEW FILE: test_dblink.rb ---
#
# test/unit/bio/sequence/test_dblink.rb - Unit test for Bio::Sequence::DBLink
#
# Copyright::  Copyright (C) 2008 Naohisa Goto <ng at bioruby.org>
# License::    The Ruby License
#
#  $Id: test_dblink.rb,v 1.1.2.1 2008/06/17 15:44:22 ngoto Exp $
#

require 'pathname'
libpath = Pathname.new(File.join(File.dirname(__FILE__), ['..'] * 4, 'lib')).cleanpath.to_s
$:.unshift(libpath) unless $:.include?(libpath)

require 'test/unit'
require 'bio/sequence'
require 'bio/sequence/dblink'

module Bio
  # Checks the reader methods of a Bio::Sequence::DBLink instance
  # built directly through the constructor.
  class TestSequenceDBLink < Test::Unit::TestCase
    # Creates the DBLink object shared by every test in this class:
    # database 'EMBL', primary ID 'Z14088' and three secondary IDs.
    def setup
      secondary = [ 'CAA78466.1', '-', 'mRNA' ]
      @xref = Bio::Sequence::DBLink.new('EMBL', 'Z14088', *secondary)
    end

    # The first constructor argument is exposed as the database name.
    def test_database
      expected = 'EMBL'
      assert_equal(expected, @xref.database)
    end

    # The second constructor argument is exposed as the primary ID.
    def test_id
      expected = 'Z14088'
      assert_equal(expected, @xref.id)
    end

    # All remaining constructor arguments are exposed, in order,
    # as the secondary IDs.
    def test_secondary_ids
      expected = [ 'CAA78466.1', '-', 'mRNA' ]
      assert_equal(expected, @xref.secondary_ids)
    end
  end #class

  # Checks the DR-line parsing class methods of Bio::Sequence::DBLink.
  class TestSequenceDBLinkClassMethods < Test::Unit::TestCase
    # An EMBL DR line yields the database name, the primary ID and a
    # single secondary ID, with the leading "DR   " tag and the
    # trailing period removed.
    def test_parse_embl_DR_line
      line = 'DR   EPD; EP07077; HS_HBG1.'
      link = Bio::Sequence::DBLink.parse_embl_DR_line(line)
      assert_equal('EPD', link.database)
      assert_equal('EP07077', link.id)
      assert_equal([ 'HS_HBG1' ], link.secondary_ids)
    end

    # A UniProt DR line yields the database name, the primary ID and
    # every remaining field as a secondary ID.
    def test_parse_uniprot_DR_line
      line = 'DR   EMBL; Z14088; CAA78466.1; -; mRNA.'
      link = Bio::Sequence::DBLink.parse_uniprot_DR_line(line)
      expected_secondary = [ 'CAA78466.1', '-', 'mRNA' ]
      assert_equal('EMBL', link.database)
      assert_equal('Z14088', link.id)
      assert_equal(expected_secondary, link.secondary_ids)
    end
  end #class

end #module Bio


From ngoto at dev.open-bio.org  Tue Jun 17 11:44:24 2008
From: ngoto at dev.open-bio.org (Naohisa Goto)
Date: Tue, 17 Jun 2008 15:44:24 +0000
Subject: [BioRuby-cvs] bioruby/lib/bio/sequence dblink.rb,NONE,1.1.2.1
Message-ID: <200806171544.m5HFiOF6021023@dev.open-bio.org>

Update of /home/repository/bioruby/bioruby/lib/bio/sequence
In directory dev.open-bio.org:/tmp/cvs-serv21001/lib/bio/sequence

Added Files:
      Tag: BRANCH-biohackathon2008
	dblink.rb 
Log Message:
New class Bio::Sequence::DBLink is added to store IDs and database names
together in an object.


--- NEW FILE: dblink.rb ---
#
# = bio/sequence/dblink.rb - sequence ID with database name
#
# Copyright::  Copyright (C) 2008
#              Naohisa Goto <ng at bioruby.org>
# License::    The Ruby License
#
# $Id: dblink.rb,v 1.1.2.1 2008/06/17 15:44:22 ngoto Exp $
#

require 'bio/sequence'

# Bio::Sequence::DBLink pairs a database name with the identifiers of
# one entry in that database.
# It is mainly used to hold database cross-reference information
# for a sequence entry.
class Bio::Sequence::DBLink

  # Database name, or namespace identifier (String).
  attr_reader :database

  # Primary identifier (String)
  attr_reader :id

  # Secondary identifiers (Array of String)
  attr_reader :secondary_ids

  # Creates a new DBLink object.
  # ---
  # *Arguments*:
  # * (required) _database_: database name or namespace identifier
  # * (required) _primary_id_: primary identifier
  # * (optional) _secondary_ids_: any number of secondary identifiers
  # *Returns*:: a new Bio::Sequence::DBLink object
  def initialize(database, primary_id, *secondary_ids)
    @database, @id, @secondary_ids = database, primary_id, secondary_ids
  end

  #--
  # class methods
  #++

  class << self

    # Parses DR line in EMBL entry, and returns a DBLink object.
    #
    # The trailing period and the leading "DR   " tag are removed, then
    # the rest is split on semicolons into at most three fields
    # (database, primary ID, one secondary ID).
    def parse_embl_DR_line(str)
      body = str.sub(/\.\s*\z/, '').sub(/\ADR   /, '')
      fields = body.split(/\s*\;\s*/, 3)
      new(*fields)
    end

    # Parses DR line in UniProt entry, and returns a DBLink object.
    #
    # The trailing period and the leading "DR   " tag are removed, then
    # the rest is split on semicolons; every field after the second
    # becomes a secondary ID.
    def parse_uniprot_DR_line(str)
      body = str.sub(/\.\s*\z/, '').sub(/\ADR   /, '')
      fields = body.split(/\s*\;\s*/)
      new(*fields)
    end

  end #class << self

end #class Bio::Sequence::DBLink


From ngoto at dev.open-bio.org  Tue Jun 17 11:50:07 2008
From: ngoto at dev.open-bio.org (Naohisa Goto)
Date: Tue, 17 Jun 2008 15:50:07 +0000
Subject: [BioRuby-cvs] bioruby/lib/bio/sequence format.rb,1.4.2.7,1.4.2.8
Message-ID: <200806171550.m5HFo7Jm021095@dev.open-bio.org>

Update of /home/repository/bioruby/bioruby/lib/bio/sequence
In directory dev.open-bio.org:/tmp/cvs-serv21057/lib/bio/sequence

Modified Files:
      Tag: BRANCH-biohackathon2008
	format.rb 
Log Message:
* In the wrap method, changed to recognize "\n" in given string.
* Some helper methods are added to help formatting date string.


Index: format.rb
===================================================================
RCS file: /home/repository/bioruby/bioruby/lib/bio/sequence/format.rb,v
retrieving revision 1.4.2.7
retrieving revision 1.4.2.8
diff -C2 -d -r1.4.2.7 -r1.4.2.8
*** format.rb	4 Mar 2008 11:10:28 -0000	1.4.2.7
--- format.rb	17 Jun 2008 15:50:05 -0000	1.4.2.8
***************
*** 285,305 ****
    def wrap_and_split_lines(str, width)
      result = []
!     left = str.dup
!     while left and left.length > width
!       line = nil
!       width.downto(1) do |i|
!         if left[i..i] == ' ' or /[\,\;]/ =~ left[(i-1)..(i-1)]  then
!           line = left[0..(i-1)].sub(/ +\z/, '')
!           left = left[i..-1].sub(/\A +/, '')
!           break
          end
        end
!       if line.nil? then
!         line = left[0..(width-1)]
!         left = left[width..-1]
!       end
!       result << line
      end
-     result << left if left and !(left.to_s.empty?)
      return result
    end
--- 285,309 ----
    def wrap_and_split_lines(str, width)
      result = []
!     lefts = str.chomp.split(/(?:\r\n|\r|\n)/)
!     lefts.each do |left|
!       left.rstrip!
!       while left and left.length > width
!         line = nil
!         width.downto(1) do |i|
!           if left[i..i] == ' ' or /[\,\;]/ =~ left[(i-1)..(i-1)]  then
!             line = left[0..(i-1)].sub(/ +\z/, '')
!             left = left[i..-1].sub(/\A +/, '')
!             break
!           end
          end
+         if line.nil? then
+           line = left[0..(width-1)]
+           left = left[width..-1]
+         end
+         result << line
+         left = nil if  left.to_s.empty?
        end
!       result << left if left
      end
      return result
    end
***************
*** 320,323 ****
--- 324,352 ----
    end
  
+   #--
+   # internal use only
+   MonthStr = [ nil, 
+                'JAN', 'FEB', 'MAR', 'APR', 'MAY', 'JUN',
+                'JUL', 'AUG', 'SEP', 'OCT', 'NOV', 'DEC'
+              ].collect { |x| x.freeze }.freeze
+   #++
+ 
+   # formats a date from Date, DateTime, or Time object, or String.
+   def format_date(d)
+     begin
+       yy = d.year
+       mm = d.month
+       dd = d.day
+     rescue NoMethodError, NameError, ArgumentError, TypeError
+       return sprintf("%-11s", d)
+     end
+     sprintf("%02d-%-3s-%04d", dd, MonthStr[mm], yy)
+   end
+ 
+   # null date
+   def null_date
+     Date.new(0, 1, 1)
+   end
+ 
  end #module INSDFeatureHelper
  

From ngoto at dev.open-bio.org  Tue Jun 17 11:53:23 2008
From: ngoto at dev.open-bio.org (Naohisa Goto)
Date: Tue, 17 Jun 2008 15:53:23 +0000
Subject: [BioRuby-cvs] bioruby/lib/bio/db/genbank common.rb, 1.11.2.4,
	1.11.2.5
Message-ID: <200806171553.m5HFrNlb021165@dev.open-bio.org>

Update of /home/repository/bioruby/bioruby/lib/bio/db/genbank
In directory dev.open-bio.org:/tmp/cvs-serv21145/lib/bio/db/genbank

Modified Files:
      Tag: BRANCH-biohackathon2008
	common.rb 
Log Message:
Bio::GenBank#comment (and Bio::GenPept#comment) is changed not to remove
newlines inside the comment.


Index: common.rb
===================================================================
RCS file: /home/repository/bioruby/bioruby/lib/bio/db/genbank/common.rb,v
retrieving revision 1.11.2.4
retrieving revision 1.11.2.5
diff -C2 -d -r1.11.2.4 -r1.11.2.5
*** common.rb	7 May 2008 12:25:42 -0000	1.11.2.4
--- common.rb	17 Jun 2008 15:53:21 -0000	1.11.2.5
***************
*** 196,200 ****
    # COMMENT -- Returns contents of the COMMENT record as a String.
    def comment
!     field_fetch('COMMENT')
    end
  
--- 196,203 ----
    # COMMENT -- Returns contents of the COMMENT record as a String.
    def comment
!     str = get('COMMENT').to_s.sub(/\ACOMMENT     /, '')
!     str.gsub!(/^ {12}/, '')
!     str.chomp!
!     str
    end
  

From ngoto at dev.open-bio.org  Tue Jun 17 11:56:20 2008
From: ngoto at dev.open-bio.org (Naohisa Goto)
Date: Tue, 17 Jun 2008 15:56:20 +0000
Subject: [BioRuby-cvs] bioruby/lib/bio/db/genbank genbank.rb, 0.40.2.3,
	0.40.2.4
Message-ID: <200806171556.m5HFuKdb021193@dev.open-bio.org>

Update of /home/repository/bioruby/bioruby/lib/bio/db/genbank
In directory dev.open-bio.org:/tmp/cvs-serv21173/lib/bio/db/genbank

Modified Files:
      Tag: BRANCH-biohackathon2008
	genbank.rb 
Log Message:
* Bio::GenBank#to_biosequence is changed to improve support of sequence output
  and data exchange.
* Bio::GenBank#date_modified is added. It returns Date object.


Index: genbank.rb
===================================================================
RCS file: /home/repository/bioruby/bioruby/lib/bio/db/genbank/genbank.rb,v
retrieving revision 0.40.2.3
retrieving revision 0.40.2.4
diff -C2 -d -r0.40.2.3 -r0.40.2.4
*** genbank.rb	4 Mar 2008 09:22:35 -0000	0.40.2.3
--- genbank.rb	17 Jun 2008 15:56:18 -0000	0.40.2.4
***************
*** 8,13 ****
--- 8,16 ----
  #
  
+ require 'date'
  require 'bio/db'
  require 'bio/db/genbank/common'
+ require 'bio/sequence'
+ require 'bio/sequence/dblink'
  
  module Bio
***************
*** 122,129 ****
--- 125,142 ----
    alias nalen length
  
+   # (obsolete???) length of the sequence
    def seq_len
      seq.length
    end
  
+   # modified date. Returns Date object, String or nil.
+   def date_modified
+     begin
+       Date.parse(self.date)
+     rescue ArgumentError, TypeError, NoMethodError, NameError
+       self.date
+     end
+   end
+ 
    # converts Bio::GenBank to Bio::Sequence
    # ---
***************
*** 132,135 ****
--- 145,156 ----
    def to_biosequence
      sequence = Bio::Sequence.new(seq)
+ 
+     sequence.id_namespace = 
+       if /\_/ =~ self.accession.to_s then
+         'RefSeq'
+       else
+         'GenBank'
+       end
+ 
      sequence.entry_id = self.entry_id
  
***************
*** 137,147 ****
      sequence.secondary_accessions = self.accessions - [ self.accession ]
  
      sequence.molecule_type = self.natype
      sequence.division = self.division
      sequence.topology = self.circular
  
      sequence.sequence_version = self.version
      #sequence.date_created = nil #????
!     sequence.date_modified = self.date
  
      sequence.definition = self.definition
--- 158,177 ----
      sequence.secondary_accessions = self.accessions - [ self.accession ]
  
+     if /GI\:(.+)/ =~ self.gi.to_s then
+       sequence.other_seqids = [ Bio::Sequence::DBLink.new('GI', $1) ]
+     end
+ 
      sequence.molecule_type = self.natype
      sequence.division = self.division
      sequence.topology = self.circular
+     sequence.strandedness = case self.strand.to_s.downcase;
+                             when 'ss-'; 'single';
+                             when 'ds-'; 'double';
+                             when 'ms-'; 'mixed';
+                             else nil; end
  
      sequence.sequence_version = self.version
      #sequence.date_created = nil #????
!     sequence.date_modified = date_modified
  
      sequence.definition = self.definition
***************
*** 149,153 ****
      sequence.species = self.organism
      sequence.classification = self.taxonomy.to_s.sub(/\.\z/, '').split(/\s*\;\s*/)
!     #sequence.organnella = nil # not used
      sequence.comments = self.comment
      sequence.references = self.references
--- 179,183 ----
      sequence.species = self.organism
      sequence.classification = self.taxonomy.to_s.sub(/\.\z/, '').split(/\s*\;\s*/)
!     #sequence.organelle = nil # yet unsupported
      sequence.comments = self.comment
      sequence.references = self.references


From ngoto at dev.open-bio.org  Tue Jun 17 11:59:26 2008
From: ngoto at dev.open-bio.org (Naohisa Goto)
Date: Tue, 17 Jun 2008 15:59:26 +0000
Subject: [BioRuby-cvs] bioruby/lib/bio/db/genbank format_genbank.rb, 1.1.2.4,
	1.1.2.5
Message-ID: <200806171559.m5HFxQa4021221@dev.open-bio.org>

Update of /home/repository/bioruby/bioruby/lib/bio/db/genbank
In directory dev.open-bio.org:/tmp/cvs-serv21201/lib/bio/db/genbank

Modified Files:
      Tag: BRANCH-biohackathon2008
	format_genbank.rb 
Log Message:
* Added support for COMMENT.
* Added support for GI number output.
* Many improvements are added.


Index: format_genbank.rb
===================================================================
RCS file: /home/repository/bioruby/bioruby/lib/bio/db/genbank/Attic/format_genbank.rb,v
retrieving revision 1.1.2.4
retrieving revision 1.1.2.5
diff -C2 -d -r1.1.2.4 -r1.1.2.5
*** format_genbank.rb	28 May 2008 13:26:33 -0000	1.1.2.4
--- format_genbank.rb	17 Jun 2008 15:59:24 -0000	1.1.2.5
***************
*** 101,104 ****
--- 101,115 ----
      end
  
+     # formats comments lines as GenBank
+     def comments_format_genbank(cmnts)
+       return '' if !cmnts or cmnts.empty?
+       cmnts = [ cmnts ] unless cmnts.kind_of?(Array)
+       a = []
+       cmnts.each do |str|
+         a.push "COMMENT     #{ genbank_wrap(str) }\n"
+       end
+       a.join('')
+     end
+ 
      # formats sequence lines as GenBank
      def seq_format_genbank(str)
***************
*** 113,122 ****
      end
  
      # Erb template of GenBank format for Bio::Sequence
      erb_template <<'__END_OF_TEMPLATE__'
! LOCUS       <%= sprintf("%-16s", entry_id) %> <%= sprintf("%11d", length) %> bp <%= sprintf("%3s", '') %><%= sprintf("%-6s", molecule_type) %>  <%= sprintf("%-8s", topology) %><%= sprintf("%4s", division) %> <%= sprintf("%-11s", date_modified) %>
  DEFINITION  <%= genbank_wrap_dot(definition.to_s) %>
  ACCESSION   <%= genbank_wrap(([ primary_accession ] + (secondary_accessions or [])).join(" ")) %>
! VERSION     <%= primary_accession %>.<%= sequence_version %><% unless true or gi_number.to_s.empty? %>GI:<%= gi_number %><% end %>
  KEYWORDS    <%= genbank_wrap_dot((keywords or []).join('; ')) %>
  SOURCE      <%= genbank_wrap(species) %>
--- 124,168 ----
      end
  
+     # formats date
+     def date_format_genbank
+       date_modified || date_created || null_date
+     end
+ 
+     # moleculue type
+     def mol_type_genbank
+       if /(DNA|(t|r|m|u|sn|sno)?RNA)/i =~ molecule_type.to_s then
+         $1.sub(/[DR]NA/) { |x| x.upcase }
+       else
+         'NA'
+       end
+     end
+ 
+     # NCBI GI number
+     def ncbi_gi_number
+       ids = other_seqids
+       if ids and r = ids.find { |x| x.database == 'GI' } then
+         r.id
+       else
+         nil
+       end
+     end
+ 
+     # strandedness
+     def strandedness_genbank
+       return nil unless strandedness
+       case strandedness
+       when 'single'; 'ss-'; 
+       when 'double'; 'ds-'; 
+       when 'mixed';  'ms-'; 
+       else; nil
+       end
+     end
+ 
      # Erb template of GenBank format for Bio::Sequence
      erb_template <<'__END_OF_TEMPLATE__'
! LOCUS       <%= sprintf("%-16s", entry_id) %> <%= sprintf("%11d", length) %> bp <%= sprintf("%3s", strandedness_genbank) %><%= sprintf("%-6s", mol_type_genbank) %>  <%= sprintf("%-8s", topology) %><%= sprintf("%4s", division) %> <%= date_format_genbank %>
  DEFINITION  <%= genbank_wrap_dot(definition.to_s) %>
  ACCESSION   <%= genbank_wrap(([ primary_accession ] + (secondary_accessions or [])).join(" ")) %>
! VERSION     <%= primary_accession %>.<%= sequence_version %><% if gi = ncbi_gi_number then %>  GI:<%= gi %><% end %>
  KEYWORDS    <%= genbank_wrap_dot((keywords or []).join('; ')) %>
  SOURCE      <%= genbank_wrap(species) %>
***************
*** 129,132 ****
--- 175,179 ----
  %><%= reference_format_genbank(ref, n) %><%
      end
+ %><%= comments_format_genbank(comments)
  %>FEATURES             Location/Qualifiers
  <%= format_features_genbank(features || [])


From ngoto at dev.open-bio.org  Tue Jun 17 12:04:38 2008
From: ngoto at dev.open-bio.org (Naohisa Goto)
Date: Tue, 17 Jun 2008 16:04:38 +0000
Subject: [BioRuby-cvs] bioruby/lib/bio/db/embl embl.rb,1.29.2.6,1.29.2.7
Message-ID: <200806171604.m5HG4cnr021274@dev.open-bio.org>

Update of /home/repository/bioruby/bioruby/lib/bio/db/embl
In directory dev.open-bio.org:/tmp/cvs-serv21250/lib/bio/db/embl

Modified Files:
      Tag: BRANCH-biohackathon2008
	embl.rb 
Log Message:
* Bio::EMBL#cc is changed to cut heading "CC   ".
* Bio::EMBL#to_biosequence is changed to improve support for sequence output
  and data exchange.
* To get parse result of DT lines more easily, Bio::EMBL#date_modified,
  date_created, release_modified, release_created, and entry_version
  methods are added. 


Index: embl.rb
===================================================================
RCS file: /home/repository/bioruby/bioruby/lib/bio/db/embl/embl.rb,v
retrieving revision 1.29.2.6
retrieving revision 1.29.2.7
diff -C2 -d -r1.29.2.6 -r1.29.2.7
*** embl.rb	28 May 2008 13:09:03 -0000	1.29.2.6
--- embl.rb	17 Jun 2008 16:04:36 -0000	1.29.2.7
***************
*** 32,39 ****
--- 32,42 ----
  #
  
+ require 'date'
  require 'bio/db'
  require 'bio/db/embl/common'
  require 'bio/compat/features'
  require 'bio/compat/references'
+ require 'bio/sequence'
+ require 'bio/sequence/dblink'
  
  module Bio
***************
*** 323,329 ****
    # CC Line; comments of notes (>=0)
    def cc
!     get('CC')
    end
! 
  
    ##
--- 326,332 ----
    # CC Line; comments of notes (>=0)
    def cc
!     get('CC').to_s.gsub(/^CC   /, '')
    end
!   alias comment cc
  
    ##
***************
*** 376,379 ****
--- 379,436 ----
    #++
  
+   # modified date. Returns Date object, String or nil.
+   def date_modified
+     parse_date(self.dt['updated'])
+   end
+ 
+   # created date. Returns Date object, String or nil.
+   def date_created
+     parse_date(self.dt['created'])
+   end
+ 
+   # release number when last updated
+   def release_modified
+     parse_release_version(self.dt['updated'])[0]
+   end
+ 
+   # release number when created
+   def release_created
+     parse_release_version(self.dt['created'])[0]
+   end
+ 
+   # entry version number numbered by EMBL
+   def entry_version
+     parse_release_version(self.dt['updated'])[1]
+   end
+ 
+   # parse date string. Returns Date object.
+   def parse_date(str)
+     begin
+       Date.parse(str)
+     rescue ArgumentError, TypeError, NoMethodError, NameError
+       str
+     end
+   end
+   private :parse_date
+ 
+   # extracts release and version numbers from DT line
+   def parse_release_version(str)
+     return [ nil, nil ] unless str
+     a = str.split(/[\(\,\)]/)
+     dstr = a.shift
+     rel = nil
+     ver = nil
+     a.each do |x|
+       case x
+       when /Rel\.\s*(.+)/
+         rel = $1.strip
+       when /Version\s*(.+)/
+         ver = $1.strip
+       end
+     end
+     [ rel, ver ]
+   end
+   private :parse_release_version
+ 
    # converts the entry to Bio::Sequence object
    # ---
***************
*** 382,385 ****
--- 439,444 ----
    def to_biosequence
      bio_seq = Bio::Sequence.new(self.seq)
+ 
+     bio_seq.id_namespace = 'EMBL'
      bio_seq.entry_id = self.entry_id
      bio_seq.primary_accession = self.accessions[0]
***************
*** 389,394 ****
      bio_seq.definition = self.description
      bio_seq.topology = self.topology
!     bio_seq.date_created = self.dt['created']
!     bio_seq.date_modified = self.dt['updated']
      bio_seq.division = self.division
      bio_seq.sequence_version = self.version
--- 448,456 ----
      bio_seq.definition = self.description
      bio_seq.topology = self.topology
!     bio_seq.date_created = self.date_created
!     bio_seq.date_modified = self.date_modified
!     bio_seq.release_created = self.release_created
!     bio_seq.release_modified = self.release_modified
!     bio_seq.entry_version = self.entry_version
      bio_seq.division = self.division
      bio_seq.sequence_version = self.version
***************
*** 396,402 ****
      bio_seq.species = self.fetch('OS')
      bio_seq.classification = self.oc
      bio_seq.references = self.references
      bio_seq.features = self.ft
!     
      return bio_seq
    end
--- 458,469 ----
      bio_seq.species = self.fetch('OS')
      bio_seq.classification = self.oc
+     # bio_seq.organelle = self.fetch('OG') # unsupported yet
      bio_seq.references = self.references
      bio_seq.features = self.ft
!     bio_seq.comments = self.cc
!     bio_seq.dblinks = get('DR').split(/\n/).collect { |x|
!       Bio::Sequence::DBLink.parse_embl_DR_line(x)
!     }
! 
      return bio_seq
    end


From ngoto at dev.open-bio.org  Tue Jun 17 12:06:06 2008
From: ngoto at dev.open-bio.org (Naohisa Goto)
Date: Tue, 17 Jun 2008 16:06:06 +0000
Subject: [BioRuby-cvs] bioruby/lib/bio/db/embl format_embl.rb, 1.1.2.5,
	1.1.2.6
Message-ID: <200806171606.m5HG66iI021322@dev.open-bio.org>

Update of /home/repository/bioruby/bioruby/lib/bio/db/embl
In directory dev.open-bio.org:/tmp/cvs-serv21282/lib/bio/db/embl

Modified Files:
      Tag: BRANCH-biohackathon2008
	format_embl.rb 
Log Message:
* Added support for CC lines (comments).
* Added support for DR lines (database cross references).
* Many improvements.


Index: format_embl.rb
===================================================================
RCS file: /home/repository/bioruby/bioruby/lib/bio/db/embl/Attic/format_embl.rb,v
retrieving revision 1.1.2.5
retrieving revision 1.1.2.6
diff -C2 -d -r1.1.2.5 -r1.1.2.6
*** format_embl.rb	28 May 2008 13:38:07 -0000	1.1.2.5
--- format_embl.rb	17 Jun 2008 16:06:04 -0000	1.1.2.6
***************
*** 2,6 ****
  # = bio/db/embl/format_embl.rb - EMBL format generater
  #
! # Copyright::  Copyright (C) 2008 Jan Aerts <jandot at bioruby.org>
  # License::    The Ruby License
  #
--- 2,8 ----
  # = bio/db/embl/format_embl.rb - EMBL format generater
  #
! # Copyright::  Copyright (C) 2008
! #              Jan Aerts <jandot at bioruby.org>,
! #              Naohisa Goto <ng at bioruby.org>
  # License::    The Ruby License
  #
***************
*** 125,136 ****
      end
  
      # Erb template of EMBL format for Bio::Sequence
      erb_template <<'__END_OF_TEMPLATE__'
! ID   <%= entry_id %>; SV <%= sequence_version %>; <%= topology %>; <%= molecule_type %>; <%= data_class %>; <%= division %>; <%= seq.length %> BP.
  XX   
  <%= embl_wrap('AC   ', accessions.reject{|a| a.nil?}.join('; ') + ';') %>
  XX   
! DT   <%= date_created %>
! DT   <%= date_modified %>
  XX   
  <%= embl_wrap('DE   ', definition) %>
--- 127,166 ----
      end
  
+     # moleculue type
+     def mol_type_embl
+       if mt = molecule_type then
+         mt
+       elsif f = (features or []).find { |f| f.feature == 'source' } and
+           q = f.qualifiers.find { |q| q.qualifier == 'mol_type' } then
+         q.value
+       else
+         'NA'
+       end
+     end
+ 
+     # CC line. Comments.
+     def comments_format_embl(cmnts)
+       return '' if !cmnts or cmnts.empty?
+       cmnts = [ cmnts ] unless cmnts.kind_of?(Array)
+       a = []
+       cmnts.each do |str|
+         a.push embl_wrap('CC   ', str)
+       end
+       unless a.empty? then
+         a.push "XX   "
+         a.push '' # dummy to put "\n" at the end of the string
+       end
+       a.join("\n")
+     end
+ 
+ 
      # Erb template of EMBL format for Bio::Sequence
      erb_template <<'__END_OF_TEMPLATE__'
! ID   <%= primary_accession || entry_id %>; SV <%= sequence_version %>; <%= topology %>; <%= mol_type_embl %>; <%= data_class %>; <%= division %>; <%= seq.length %> BP.
  XX   
  <%= embl_wrap('AC   ', accessions.reject{|a| a.nil?}.join('; ') + ';') %>
  XX   
! DT   <%= format_date(date_created || null_date) %> (Rel. <%= release_created || 0 %>, Created)
! DT   <%= format_date(date_modified || null_date) %> (Rel. <%= release_modified || 0 %>, Last updated, Version <%= entry_version || 0 %>)
  XX   
  <%= embl_wrap('DE   ', definition) %>
***************
*** 142,146 ****
  XX   
  <% hash = {}; (references || []).each do |ref| %><%= reference_format_embl(ref, hash) %>
! <% end %>FH   Key             Location/Qualifiers
  FH   
  <%= format_features_embl(features || []) %>XX   
--- 172,181 ----
  XX   
  <% hash = {}; (references || []).each do |ref| %><%= reference_format_embl(ref, hash) %>
! <% end %><% (dblinks || []).each do |r|
! %>DR   <%= r.database %>; <%= r.id %><% unless r.secondary_ids.empty? %>; <%= r.secondary_ids[0] %><% end %>.
! <% end %><% if dblinks and !dblinks.empty? then
!  %>XX   
! <% end %><%= comments_format_embl(comments)
! %>FH   Key             Location/Qualifiers
  FH   
  <%= format_features_embl(features || []) %>XX   


From ngoto at dev.open-bio.org  Tue Jun 17 12:09:55 2008
From: ngoto at dev.open-bio.org (Naohisa Goto)
Date: Tue, 17 Jun 2008 16:09:55 +0000
Subject: [BioRuby-cvs] bioruby/test/unit/bio/db/embl test_embl_to_bioseq.rb,
	1.1.2.1, 1.1.2.2
Message-ID: <200806171609.m5HG9tFR021392@dev.open-bio.org>

Update of /home/repository/bioruby/bioruby/test/unit/bio/db/embl
In directory dev.open-bio.org:/tmp/cvs-serv21372/test/unit/bio/db/embl

Modified Files:
      Tag: BRANCH-biohackathon2008
	test_embl_to_bioseq.rb 
Log Message:
Unit tests related to Bio::Sequence#date_created and date_modified are
changed because these methods are changed to store Date (or Time or DateTime)
objects instead of String objects.


Index: test_embl_to_bioseq.rb
===================================================================
RCS file: /home/repository/bioruby/bioruby/test/unit/bio/db/embl/Attic/test_embl_to_bioseq.rb,v
retrieving revision 1.1.2.1
retrieving revision 1.1.2.2
diff -C2 -d -r1.1.2.1 -r1.1.2.2
*** test_embl_to_bioseq.rb	20 Feb 2008 09:56:22 -0000	1.1.2.1
--- test_embl_to_bioseq.rb	17 Jun 2008 16:09:53 -0000	1.1.2.2
***************
*** 53,59 ****
      end
      
!     def test_dates
!       assert_equal('25-OCT-2002 (Rel. 73, Created)', @bio_seq.date_created)
!       assert_equal('14-NOV-2006 (Rel. 89, Last updated, Version 3)', @bio_seq.date_modified)
      end
      
--- 53,76 ----
      end
      
!     def test_date_created
!       # '25-OCT-2002 (Rel. 73, Created)'
!       assert_equal(Date.parse('25-OCT-2002'), @bio_seq.date_created)
!     end
! 
!     def test_date_modified
!       # '14-NOV-2006 (Rel. 89, Last updated, Version 3)'
!       assert_equal(Date.parse('14-NOV-2006'), @bio_seq.date_modified)
!     end
! 
!     def test_release_created
!       assert_equal('73', @bio_seq.release_created)
!     end
! 
!     def test_release_modified
!       assert_equal('89', @bio_seq.release_modified)
!     end
! 
!     def test_entry_version
!       assert_equal('3', @bio_seq.entry_version)
      end
      
***************
*** 129,135 ****
      end
      
!     def test_dates
!       assert_equal('25-OCT-2002 (Rel. 73, Created)', @bio_seq_2.date_created)
!       assert_equal('14-NOV-2006 (Rel. 89, Last updated, Version 3)', @bio_seq_2.date_modified)
      end
      
--- 146,169 ----
      end
      
!     def test_date_created
!       # '25-OCT-2002 (Rel. 73, Created)'
!       assert_equal(Date.parse('25-OCT-2002'), @bio_seq_2.date_created)
!     end
! 
!     def test_date_modified
!       # '14-NOV-2006 (Rel. 89, Last updated, Version 3)'
!       assert_equal(Date.parse('14-NOV-2006'), @bio_seq_2.date_modified)
!     end
! 
!     def test_release_created
!       assert_equal('73', @bio_seq_2.release_created)
!     end
! 
!     def test_release_modified
!       assert_equal('89', @bio_seq_2.release_modified)
!     end
! 
!     def test_entry_version
!       assert_equal('3', @bio_seq_2.entry_version)
      end
      

From ngoto at dev.open-bio.org  Thu Jun 19 08:45:18 2008
From: ngoto at dev.open-bio.org (Naohisa Goto)
Date: Thu, 19 Jun 2008 12:45:18 +0000
Subject: [BioRuby-cvs] bioruby/lib/bio/db/embl format_embl.rb, 1.1.2.6,
	1.1.2.7
Message-ID: <200806191245.m5JCjIps000652@dev.open-bio.org>

Update of /home/repository/bioruby/bioruby/lib/bio/db/embl
In directory dev.open-bio.org:/tmp/cvs-serv596/lib/bio/db/embl

Modified Files:
      Tag: BRANCH-biohackathon2008
	format_embl.rb 
Log Message:
avoid error when keywords or classification is nil


Index: format_embl.rb
===================================================================
RCS file: /home/repository/bioruby/bioruby/lib/bio/db/embl/Attic/format_embl.rb,v
retrieving revision 1.1.2.6
retrieving revision 1.1.2.7
diff -C2 -d -r1.1.2.6 -r1.1.2.7
*** format_embl.rb	17 Jun 2008 16:06:04 -0000	1.1.2.6
--- format_embl.rb	19 Jun 2008 12:45:15 -0000	1.1.2.7
***************
*** 166,173 ****
  <%= embl_wrap('DE   ', definition) %>
  XX   
! <%= embl_wrap('KW   ', keywords.join('; ') + '.') %>
  XX   
  OS   <%= species %>
! <%= embl_wrap('OC   ', classification.join('; ') + '.') %>
  XX   
  <% hash = {}; (references || []).each do |ref| %><%= reference_format_embl(ref, hash) %>
--- 166,173 ----
  <%= embl_wrap('DE   ', definition) %>
  XX   
! <%= embl_wrap('KW   ', (keywords || []).join('; ') + '.') %>
  XX   
  OS   <%= species %>
! <%= embl_wrap('OC   ', (classification || []).join('; ') + '.') %>
  XX   
  <% hash = {}; (references || []).each do |ref| %><%= reference_format_embl(ref, hash) %>


From ngoto at dev.open-bio.org  Fri Jun 20 09:22:34 2008
From: ngoto at dev.open-bio.org (Naohisa Goto)
Date: Fri, 20 Jun 2008 13:22:34 +0000
Subject: [BioRuby-cvs] bioruby/lib/bio/db fasta.rb,1.28,1.28.2.1
Message-ID: <200806201322.m5KDMYOR021703@dev.open-bio.org>

Update of /home/repository/bioruby/bioruby/lib/bio/db
In directory dev.open-bio.org:/tmp/cvs-serv21681

Modified Files:
      Tag: BRANCH-biohackathon2008
	fasta.rb 
Log Message:
Split Bio::FastaDefline class into lib/bio/db/fasta/defline.rb


Index: fasta.rb
===================================================================
RCS file: /home/repository/bioruby/bioruby/lib/bio/db/fasta.rb,v
retrieving revision 1.28
retrieving revision 1.28.2.1
diff -C2 -d -r1.28 -r1.28.2.1
*** fasta.rb	5 Apr 2007 23:35:40 -0000	1.28
--- fasta.rb	20 Jun 2008 13:22:31 -0000	1.28.2.1
***************
*** 15,57 ****
  # == Examples
  #
! #       rub = Bio::FastaDefline.new('>gi|671595|emb|CAA85678.1| rubisco large subunit [Perovskia abrotanoides]')
! #       rub.entry_id       ==> 'gi|671595'
! #       rub.get('emb')     ==> 'CAA85678.1'
! #       rub.emb            ==> 'CAA85678.1'
! #       rub.gi             ==> '671595'
! #       rub.accession      ==> 'CAA85678'
! #       rub.accessions     ==> [ 'CAA85678' ]
! #       rub.acc_version    ==> 'CAA85678.1'
! #       rub.locus          ==> nil
! #       rub.list_ids       ==> [["gi", "671595"],
! #                               ["emb", "CAA85678.1", nil],
! #                               ["Perovskia abrotanoides"]]
! #
! #       ckr = Bio::FastaDefline.new(">gi|2495000|sp|Q63931|CCKR_CAVPO CHOLECYSTOKININ TYPE A RECEPTOR (CCK-A RECEPTOR) (CCK-AR)\001gi|2147182|pir||I51898 cholecystokinin A receptor - guinea pig\001gi|544724|gb|AAB29504.1| cholecystokinin A receptor; CCK-A receptor [Cavia]")
! #       ckr.entry_id      ==> "gi|2495000"
! #       ckr.sp            ==> "CCKR_CAVPO"
! #       ckr.pir           ==> "I51898"
! #       ckr.gb            ==> "AAB29504.1"
! #       ckr.gi            ==> "2495000"
! #       ckr.accession     ==> "AAB29504"
! #       ckr.accessions    ==> ["Q63931", "AAB29504"]
! #       ckr.acc_version   ==> "AAB29504.1"
! #       ckr.locus         ==> nil
! #       ckr.description   ==>
! #         "CHOLECYSTOKININ TYPE A RECEPTOR (CCK-A RECEPTOR) (CCK-AR)"
! #       ckr.descriptions  ==>
! #         ["CHOLECYSTOKININ TYPE A RECEPTOR (CCK-A RECEPTOR) (CCK-AR)",
! #          "cholecystokinin A receptor - guinea pig",
! #          "cholecystokinin A receptor; CCK-A receptor [Cavia]"]
! #       ckr.words         ==> 
! #         ["cavia", "cck-a", "cck-ar", "cholecystokinin", "guinea", "pig",
! #          "receptor", "type"]
! #       ckr.id_strings    ==>
! #         ["2495000", "Q63931", "CCKR_CAVPO", "2147182", "I51898",
! #          "544724", "AAB29504.1", "Cavia"]
! #       ckr.list_ids      ==>
! #         [["gi", "2495000"], ["sp", "Q63931", "CCKR_CAVPO"],
! #          ["gi", "2147182"], ["pir", nil, "I51898"], ["gi", "544724"],
! #          ["gb", "AAB29504.1", nil], ["Cavia"]]
  #
  # == References
--- 15,19 ----
  # == Examples
  #
! # See documents of Bio::FastaFormat class.
  #
  # == References
***************
*** 66,69 ****
--- 28,32 ----
  require 'bio/db'
  require 'bio/sequence'
+ require 'bio/db/fasta/defline'
  
  module Bio
***************
*** 363,825 ****
    end #class FastaNumericFormat
  
- 
-   # Parsing FASTA Defline, and extract IDs and other informations.
-   # IDs are NSIDs (NCBI standard FASTA sequence identifiers)
-   # or ":"-separated IDs.
-   # 
-   # specs are described in:
-   # ftp://ftp.ncbi.nih.gov/blast/documents/README.formatdb
-   # http://blast.wustl.edu/doc/FAQ-Indexing.html#Identifiers
-   #
-   # === Examples
-   #
-   #   rub = Bio::FastaDefline.new('>gi|671595|emb|CAA85678.1| rubisco large subunit [Perovskia abrotanoides]')
-   #   rub.entry_id       ==> 'gi|671595'
-   #   rub.get('emb')     ==> 'CAA85678.1'
-   #   rub.emb            ==> 'CAA85678.1'
-   #   rub.gi             ==> '671595'
-   #   rub.accession      ==> 'CAA85678'
-   #   rub.accessions     ==> [ 'CAA85678' ]
-   #   rub.acc_version    ==> 'CAA85678.1'
-   #   rub.locus          ==> nil
-   #   rub.list_ids       ==> [["gi", "671595"],
-   #                           ["emb", "CAA85678.1", nil],
-   #                           ["Perovskia abrotanoides"]]
-   #
-   #   ckr = Bio::FastaDefline.new(">gi|2495000|sp|Q63931|CCKR_CAVPO CHOLECYSTOKININ TYPE A RECEPTOR (CCK-A RECEPTOR) (CCK-AR)\001gi|2147182|pir||I51898 cholecystokinin A receptor - guinea pig\001gi|544724|gb|AAB29504.1| cholecystokinin A receptor; CCK-A receptor [Cavia]")
-   #   ckr.entry_id      ==> "gi|2495000"
-   #   ckr.sp            ==> "CCKR_CAVPO"
-   #   ckr.pir           ==> "I51898"
-   #   ckr.gb            ==> "AAB29504.1"
-   #   ckr.gi            ==> "2495000"
-   #   ckr.accession     ==> "AAB29504"
-   #   ckr.accessions    ==> ["Q63931", "AAB29504"]
-   #   ckr.acc_version   ==> "AAB29504.1"
-   #   ckr.locus         ==> nil
-   #   ckr.description   ==>
-   #     "CHOLECYSTOKININ TYPE A RECEPTOR (CCK-A RECEPTOR) (CCK-AR)"
-   #   ckr.descriptions  ==>
-   #     ["CHOLECYSTOKININ TYPE A RECEPTOR (CCK-A RECEPTOR) (CCK-AR)",
-   #      "cholecystokinin A receptor - guinea pig",
-   #      "cholecystokinin A receptor; CCK-A receptor [Cavia]"]
-   #   ckr.words         ==> 
-   #     ["cavia", "cck-a", "cck-ar", "cholecystokinin", "guinea", "pig",
-   #      "receptor", "type"]
-   #   ckr.id_strings    ==>
-   #     ["2495000", "Q63931", "CCKR_CAVPO", "2147182", "I51898",
-   #      "544724", "AAB29504.1", "Cavia"]
-   #   ckr.list_ids      ==>
-   #     [["gi", "2495000"], ["sp", "Q63931", "CCKR_CAVPO"],
-   #      ["gi", "2147182"], ["pir", nil, "I51898"], ["gi", "544724"],
-   #      ["gb", "AAB29504.1", nil], ["Cavia"]]
-   #
-   # === Refereneces
-   #
-   # * Fasta format description (NCBI)
-   #   http://www.ncbi.nlm.nih.gov/BLAST/fasta.shtml
-   #
-   # * Frequently Asked Questions:  Indexing of Sequence Identifiers (by Warren R. Gish.)
-   #   http://blast.wustl.edu/doc/FAQ-Indexing.html#Identifiers
-   #
-   # * README.formatdb
-   #   ftp://ftp.ncbi.nih.gov/blast/documents/README.formatdb
-   # 
-   class FastaDefline
- 
-     NSIDs = {
-       # NCBI and WU-BLAST
-       'gi'  => [ 'gi' ],                      # NCBI GI
-       'gb'  => [ 'acc_version', 'locus' ],      # GenBank
-       'emb' => [ 'acc_version', 'locus' ],      # EMBL
-       'dbj' => [ 'acc_version', 'locus' ],      # DDBJ
-       'sp'  => [ 'accession', 'entry_id' ],   # SWISS-PROT
-       'pdb' => [ 'entry_id', 'chain' ],       # PDB
-       'bbs' => [ 'number' ],                  # GenInfo Backbone Id
-       'gnl' => [ 'database' , 'entry_id' ],   # General database identifier
-       'ref' => [ 'acc_version' , 'locus' ],     # NCBI Reference Sequence
-       'lcl' => [ 'entry_id' ],                # Local Sequence identifier
- 
-       # WU-BLAST and NCBI
-       'pir' => [ 'accession', 'entry_id' ],   # PIR
-       'prf' => [ 'accession', 'entry_id' ],   # Protein Research Foundation
-       'pat' => [ 'country', 'number', 'serial' ], # Patents
- 
-       # WU-BLAST only
-       'bbm' => [ 'number' ],      # NCBI GenInfo Backbone database identifier
-       'gim' => [ 'number' ],      # NCBI GenInfo Import identifier
-       'gp'  => [ 'acc_version', 'locus' ],      # GenPept
-       'oth' => [ 'accession', 'name', 'release' ],  # Other (user-definable) identifier
-       'tpd' => [ 'accession', 'name' ],       # Third party annotation, DDBJ
-       'tpe' => [ 'accession', 'name' ],       # Third party annotation, EMBL
-       'tpg' => [ 'accession', 'name' ],       # Third party annotation, GenBank
- 
-       # Original
-       'ri'  => [ 'entry_id', 'rearray_id', 'len' ], # RIKEN FANTOM DB
-     }
- 
-     # Shows array that contains IDs (or ID-like strings).
-     # Returns an array of arrays of strings.
-     attr_reader :list_ids
- 
-     # Shows a possibly unique identifier.
-     # Returns a string.
-     attr_reader :entry_id
- 
-     # Parses given string.
-     def initialize(str)
-       @deflines = []
-       @info = {}
-       @list_ids = []
- 
-       @entry_id = nil
- 
-       lines = str.split("\x01")
-       lines.each do |line|
-         add_defline(line)
-       end
-     end #def initialize
- 
-     # Parses given string and adds parsed data.
-     def add_defline(str)
-       case str
-       when /^\>?\s*((?:[^\|\s]*\|)+[^\s]+)\s*(.*)$/
-         # NSIDs
-         # examples:
-         # >gi|9910844|sp|Q9UWG2|RL3_METVA 50S ribosomal protein L3P
-         #
-         # note: regexp (:?) means grouping without backreferences
-         i = $1
-         d = $2
-         tks = i.split('|')
-         tks << '' if i[-1,1] == '|'
-         a = parse_NSIDs(tks)
-         i = a[0].join('|')
-         a.unshift('|')
-         d = tks.join('|') + ' ' + d unless tks.empty?
-         a << d
-         this_line = a
-         match_EC(d)
-         parse_square_brackets(d).each do |x|
-           if !match_EC(x, false) and x =~ /\A[A-Z]/ then
-             di = [  x ]
-             @list_ids << di
-             @info['organism'] = x unless @info['organism']
-           end
-         end
- 
-       when /^\>?\s*([a-zA-Z0-9]+\:[^\s]+)\s*(.*)$/
-         # examples:
-         # >sce:YBR160W  CDC28, SRM5; cyclin-dependent protein kinase catalytic subunit [EC:2.7.1.-] [SP:CC28_YEAST]
-         # >emb:CACDC28 [X80034] C.albicans CDC28 gene 
-         i = $1
-         d = $2
-         a = parse_ColonSepID(i)
-         i = a.join(':')
-         this_line = [ ':', a , d ]
-         match_EC(d)
-         parse_square_brackets(d).each do |x|
-           if !match_EC(x, false) and x =~ /:/ then
-             parse_ColonSepID(x)
-           elsif x =~ /\A\s*([A-Z][A-Z0-9_\.]+)\s*\z/ then
-             @list_ids << [ $1 ]
-           end
-         end
- 
-       when /^\>?\s*(\S+)(?:\s+(.+))?$/
-         # examples:
-         # >ABC12345 this is test
-         i = $1
-         d = $2.to_s
-         @list_ids << [ i.chomp('.') ]
-         this_line = [  '', [ i ], d ]
-         match_EC(d)
-       else
-         i = str
-         d = ''
-         match_EC(i)
-         this_line = [ '', [ i ], d ]
-       end
- 
-       @deflines << this_line
-       @entry_id = i unless @entry_id
-     end
- 
-     def match_EC(str, write_flag = true)
-       di = nil
-       str.scan(/EC\:((:?[\-\d]+\.){3}(:?[\-\d]+))/i) do |x|
-         di = [ 'EC', $1 ]
-         if write_flag then
-           @info['ec'] = di[1] if (!@info['ec'] or @info['ec'].to_s =~ /\-/)
-           @list_ids << di
-         end
-       end
-       di
-     end
-     private :match_EC
- 
-     def parse_square_brackets(str)
-       r = []
-       str.scan(/\[([^\]]*)\]/) do |x|
-         r << x[0]
-       end
-       r
-     end
-     private :parse_square_brackets
- 
-     def parse_ColonSepID(str)
-       di = str.split(':', 2)
-       di << nil if di.size <= 1 
-       @list_ids << di
-       di
-     end
-     private :parse_ColonSepID
- 
-     def parse_NSIDs(ary)
-       # this method destroys ary
-       data = []
-       while token = ary.shift
-         if labels = self.class::NSIDs[token] then
-           di = [ token ]
-           idtype = token
-           labels.each do |x|
-             token = ary.shift
-             break unless token
-             if self.class::NSIDs[token] then
-               ary.unshift(token)
-               break #each
-             end
-             if token.length > 0 then
-               di << token
-             else
-               di << nil
-             end
-           end
-           data << di
-         else
-           if token.length > 0 then
-             # UCID (uncontrolled identifiers)
-             di = [ token ]
-             data << di
-             @info['ucid'] = token unless @info['ucid']
-           end
-           break #while
-         end
-       end #while
-       @list_ids.concat data
-       data
-     end #def parse_NSIDs
-     private :parse_NSIDs
- 
- 
-     # Shows original string.
-     # Note that the result of this method may be different from
-     # original string which is given in FastaDefline.new method.
-     def to_s
-       @deflines.collect { |a|
-         s = a[0]
-         (a[1..-2].collect { |x| x.join(s) }.join(s) + ' ' + a[-1]).strip
-       }.join("\x01")
-     end
- 
-     # Shows description.
-     def description
-       @deflines[0].to_a[-1]
-     end
- 
-     # Returns descriptions.
-     def descriptions
-       @deflines.collect do |a|
-         a[-1]
-       end
-     end
- 
-     # Shows ID-like strings.
-     # Returns an array of strings.
-     def id_strings
-       r = []
-       @list_ids.each do |a|
-         if a.size >= 2 then
-           r.concat a[1..-1].find_all { |x| x }
-         else
-           if a[0].to_s.size > 0 and a[0] =~ /\A[A-Za-z0-9\.\-\_]+\z/
-             r << a[0]
-           end
-         end
-       end
-       r.concat( words(true, []).find_all do |x|
-                  x =~ /\A[A-Z][A-Za-z0-9\_]*[0-9]+[A-Za-z0-9\_]+\z/ or
-                    x =~ /\A[A-Z][A-Z0-9]*\_[A-Z0-9\_]+\z/
-                end)
-       r
-     end
- 
-     KillWords = [
-       'an', 'the', 'this', 'that',
-       'is', 'are', 'were', 'was', 'be', 'can', 'may', 'might',
-       'as', 'at', 'by', 'for', 'in', 'of', 'on', 'to', 'with',
-       'from', 'and', 'or', 'not',
-       'dna', 'rna', 'mrna', 'cdna', 'orf',
-       'aa', 'nt', 'pct', 'id', 'ec', 'sp', 'subsp',
-       'similar', 'involved', 'identical', 'identity',
-       'cds', 'clone', 'library', 'contig', 'contigs',
-       'homolog', 'homologue', 'homologs', 'homologous',
-       'protein', 'proteins', 'gene', 'genes',
-       'product', 'products', 'sequence', 'sequences', 
-       'strain', 'strains', 'region', 'regions',
-     ]
-     KillWordsHash = {}
-     KillWords.each { |x| KillWordsHash[x] = true }
- 
-     KillRegexpArray = [
-       /\A\d{1,3}\%?\z/,
-       /\A[A-Z][A-Za-z0-9\_]*[0-9]+[A-Za-z0-9\_]+\z/,
-       /\A[A-Z][A-Z0-9]*\_[A-Z0-9\_]+\z/
-     ]
- 
-     # Shows words used in the defline. Returns an Array.
-     def words(case_sensitive = nil, kill_regexp = self.class::KillRegexpArray,
-               kwhash = self.class::KillWordsHash)
-       a = descriptions.join(' ').split(/[\.\,\;\:\(\)\[\]\{\}\<\>\"\'\`\~\/\|\?\!\&\@\#\s\x00-\x1f\x7f]+/)
-       a.collect! do |x|
-         x.sub!(/\A[\$\*\-\+]+/, '')
-         x.sub!(/[\$\*\-\=]+\z/, '')
-         if x.size <= 1 then
-           nil
-         elsif kwhash[x.downcase] then
-           nil
-         else
-           if kill_regexp.find { |expr| expr =~ x } then
-             nil
-           else
-             x
-           end
-         end
-       end
-       a.compact!
-       a.collect! { |x| x.downcase } unless case_sensitive
-       a.sort!
-       a.uniq!
-       a
-     end
- 
-     # Returns identifires by a database name.
-     def get(dbname)
-       db = dbname.to_s
-       r = nil
-       unless r = @info[db] then
-         di = @list_ids.find { |x| x[0] == db.to_s }
-         if di and di.size <= 2 then
-           r = di[-1]
-         elsif di then
-           labels = self.class::NSIDs[db]
-           [ 'acc_version', 'entry_id',
-             'locus', 'accession', 'number'].each do |x|
-             if i = labels.index(x) then
-               r = di[i+1]
-               break if r
-             end
-           end
-           r = di[1..-1].find { |x| x } unless r
-         end
-         @info[db] = r if r
-       end
-       r
-     end
- 
-     # Returns an identifier by given type.
-     def get_by_type(type_str)
-       @list_ids.each do |x|
-         if labels = self.class::NSIDs[x[0]] then
-           if i = labels.index(type_str) then
-             return x[i+1]
-           end
-         end
-       end
-       nil
-     end
- 
-     # Returns identifiers by given type.
-     def get_all_by_type(*type_strarg)
-       d = []
-       @list_ids.each do |x|
-         if labels = self.class::NSIDs[x[0]] then
-           type_strarg.each do |y|
-             if i = labels.index(y) then
-               d << x[i+1] if x[i+1]
-             end
-           end
-         end
-       end
-       d
-     end
- 
-     # Shows locus.
-     # If the entry has more than two of such IDs,
-     # only the first ID are shown.
-     # Returns a string or nil.
-     def locus
-       unless defined?(@locus)
-         @locus = get_by_type('locus')
-       end
-       @locus
-     end
- 
-     # Shows GI.
-     # If the entry has more than two of such IDs,
-     # only the first ID are shown.
-     # Returns a string or nil.
-     def gi
-       unless defined?(@gi) then
-         @gi = get_by_type('gi')
-       end
-       @gi
-     end
- 
-     # Shows accession with version number.
-     # If the entry has more than two of such IDs,
-     # only the first ID are shown.
-     # Returns a string or nil.
-     def acc_version
-       unless defined?(@acc_version) then
-         @acc_version = get_by_type('acc_version')
-       end
-       @acc_version
-     end
- 
-     # Shows accession numbers.
-     # Returns an array of strings.
-     def accessions
-       unless defined?(@accessions) then
-         @accessions = get_all_by_type('accession', 'acc_version')
-         @accessions.collect! { |x| x.sub(/\..*\z/, '') }
-       end
-       @accessions
-     end
- 
-     # Shows an accession number.
-     def accession
-       unless defined?(@accession) then
-         if acc_version then
-           @accession = acc_version.split('.')[0]
-         else
-           @accession = accessions[0]
-         end
-       end
-       @accession
-     end
-     
-     def method_missing(name, *args)
-       # raise ArgumentError,
-       # "wrong # of arguments(#{args.size} for 1)" if args.size >= 2
-       r = get(name, *args)
-       if !r and !(self.class::NSIDs[name.to_s]) then
-         raise "NameError: undefined method `#{name.inspect}'"
-       end
-       r
-     end
-     
- 
-   end #class FastaDefline
- 
  end #module Bio
  
--- 326,329 ----


From ngoto at dev.open-bio.org  Fri Jun 20 09:22:34 2008
From: ngoto at dev.open-bio.org (Naohisa Goto)
Date: Fri, 20 Jun 2008 13:22:34 +0000
Subject: [BioRuby-cvs] bioruby/lib/bio/db/fasta defline.rb,NONE,1.1.2.1
Message-ID: <200806201322.m5KDMYlh021706@dev.open-bio.org>

Update of /home/repository/bioruby/bioruby/lib/bio/db/fasta
In directory dev.open-bio.org:/tmp/cvs-serv21681/fasta

Added Files:
      Tag: BRANCH-biohackathon2008
	defline.rb 
Log Message:
Split Bio::FastaDefline class into lib/bio/db/fasta/defline.rb


--- NEW FILE: defline.rb ---
#
# = bio/db/fasta/defline.rb - FASTA defline parser class
#
# Copyright::  Copyright (C) 2001, 2002
#              GOTO Naohisa <ngoto at gen-info.osaka-u.ac.jp>,
#              Toshiaki Katayama <k at bioruby.org>
# License::    The Ruby License
#
# $Id: defline.rb,v 1.1.2.1 2008/06/20 13:22:32 ngoto Exp $
# 
# == Description
# 
# Bio::FastaDefline is a parser class for definition line (defline)
# of the FASTA format.
#
# == Examples
#
#       rub = Bio::FastaDefline.new('>gi|671595|emb|CAA85678.1| rubisco large subunit [Perovskia abrotanoides]')
#       rub.entry_id       ==> 'gi|671595'
#       rub.get('emb')     ==> 'CAA85678.1'
#       rub.emb            ==> 'CAA85678.1'
#       rub.gi             ==> '671595'
#       rub.accession      ==> 'CAA85678'
#       rub.accessions     ==> [ 'CAA85678' ]
#       rub.acc_version    ==> 'CAA85678.1'
#       rub.locus          ==> nil
#       rub.list_ids       ==> [["gi", "671595"],
#                               ["emb", "CAA85678.1", nil],
#                               ["Perovskia abrotanoides"]]
#
#       ckr = Bio::FastaDefline.new(">gi|2495000|sp|Q63931|CCKR_CAVPO CHOLECYSTOKININ TYPE A RECEPTOR (CCK-A RECEPTOR) (CCK-AR)\001gi|2147182|pir||I51898 cholecystokinin A receptor - guinea pig\001gi|544724|gb|AAB29504.1| cholecystokinin A receptor; CCK-A receptor [Cavia]")
#       ckr.entry_id      ==> "gi|2495000"
#       ckr.sp            ==> "CCKR_CAVPO"
#       ckr.pir           ==> "I51898"
#       ckr.gb            ==> "AAB29504.1"
#       ckr.gi            ==> "2495000"
#       ckr.accession     ==> "AAB29504"
#       ckr.accessions    ==> ["Q63931", "AAB29504"]
#       ckr.acc_version   ==> "AAB29504.1"
#       ckr.locus         ==> nil
#       ckr.description   ==>
#         "CHOLECYSTOKININ TYPE A RECEPTOR (CCK-A RECEPTOR) (CCK-AR)"
#       ckr.descriptions  ==>
#         ["CHOLECYSTOKININ TYPE A RECEPTOR (CCK-A RECEPTOR) (CCK-AR)",
#          "cholecystokinin A receptor - guinea pig",
#          "cholecystokinin A receptor; CCK-A receptor [Cavia]"]
#       ckr.words         ==> 
#         ["cavia", "cck-a", "cck-ar", "cholecystokinin", "guinea", "pig",
#          "receptor", "type"]
#       ckr.id_strings    ==>
#         ["2495000", "Q63931", "CCKR_CAVPO", "2147182", "I51898",
#          "544724", "AAB29504.1", "Cavia"]
#       ckr.list_ids      ==>
#         [["gi", "2495000"], ["sp", "Q63931", "CCKR_CAVPO"],
#          ["gi", "2147182"], ["pir", nil, "I51898"], ["gi", "544724"],
#          ["gb", "AAB29504.1", nil], ["Cavia"]]
#
# == References
#
# * FASTA format (WikiPedia)
#   http://en.wikipedia.org/wiki/FASTA_format
#   
# * Fasta format description (NCBI)
#   http://www.ncbi.nlm.nih.gov/BLAST/fasta.shtml
#

module Bio

  #--
  # split from fasta.rb revision 1.28
  #++

  # Parses a FASTA defline and extracts IDs and other information.
  # IDs are NSIDs (NCBI standard FASTA sequence identifiers)
  # or ":"-separated IDs.
  # 
  # specs are described in:
  # ftp://ftp.ncbi.nih.gov/blast/documents/README.formatdb
  # http://blast.wustl.edu/doc/FAQ-Indexing.html#Identifiers
  #
  # === Examples
  #
  #   rub = Bio::FastaDefline.new('>gi|671595|emb|CAA85678.1| rubisco large subunit [Perovskia abrotanoides]')
  #   rub.entry_id       ==> 'gi|671595'
  #   rub.get('emb')     ==> 'CAA85678.1'
  #   rub.emb            ==> 'CAA85678.1'
  #   rub.gi             ==> '671595'
  #   rub.accession      ==> 'CAA85678'
  #   rub.accessions     ==> [ 'CAA85678' ]
  #   rub.acc_version    ==> 'CAA85678.1'
  #   rub.locus          ==> nil
  #   rub.list_ids       ==> [["gi", "671595"],
  #                           ["emb", "CAA85678.1", nil],
  #                           ["Perovskia abrotanoides"]]
  #
  #   ckr = Bio::FastaDefline.new(">gi|2495000|sp|Q63931|CCKR_CAVPO CHOLECYSTOKININ TYPE A RECEPTOR (CCK-A RECEPTOR) (CCK-AR)\001gi|2147182|pir||I51898 cholecystokinin A receptor - guinea pig\001gi|544724|gb|AAB29504.1| cholecystokinin A receptor; CCK-A receptor [Cavia]")
  #   ckr.entry_id      ==> "gi|2495000"
  #   ckr.sp            ==> "CCKR_CAVPO"
  #   ckr.pir           ==> "I51898"
  #   ckr.gb            ==> "AAB29504.1"
  #   ckr.gi            ==> "2495000"
  #   ckr.accession     ==> "AAB29504"
  #   ckr.accessions    ==> ["Q63931", "AAB29504"]
  #   ckr.acc_version   ==> "AAB29504.1"
  #   ckr.locus         ==> nil
  #   ckr.description   ==>
  #     "CHOLECYSTOKININ TYPE A RECEPTOR (CCK-A RECEPTOR) (CCK-AR)"
  #   ckr.descriptions  ==>
  #     ["CHOLECYSTOKININ TYPE A RECEPTOR (CCK-A RECEPTOR) (CCK-AR)",
  #      "cholecystokinin A receptor - guinea pig",
  #      "cholecystokinin A receptor; CCK-A receptor [Cavia]"]
  #   ckr.words         ==> 
  #     ["cavia", "cck-a", "cck-ar", "cholecystokinin", "guinea", "pig",
  #      "receptor", "type"]
  #   ckr.id_strings    ==>
  #     ["2495000", "Q63931", "CCKR_CAVPO", "2147182", "I51898",
  #      "544724", "AAB29504.1", "Cavia"]
  #   ckr.list_ids      ==>
  #     [["gi", "2495000"], ["sp", "Q63931", "CCKR_CAVPO"],
  #      ["gi", "2147182"], ["pir", nil, "I51898"], ["gi", "544724"],
  #      ["gb", "AAB29504.1", nil], ["Cavia"]]
  #
  # === References
  #
  # * Fasta format description (NCBI)
  #   http://www.ncbi.nlm.nih.gov/BLAST/fasta.shtml
  #
  # * Frequently Asked Questions:  Indexing of Sequence Identifiers (by Warren R. Gish.)
  #   http://blast.wustl.edu/doc/FAQ-Indexing.html#Identifiers
  #
  # * README.formatdb
  #   ftp://ftp.ncbi.nih.gov/blast/documents/README.formatdb
  # 
  class FastaDefline

    # Table of known identifier tags for "|"-separated IDs.
    # Each key is a tag that may appear in an ID (e.g. "gi" in
    # "gi|671595"); the value lists, in order, the names of the fields
    # that follow the tag.  parse_NSIDs reads at most that many fields
    # after a tag, and get_by_type / get_all_by_type use the position of a
    # name in the list to pick the matching field.
    NSIDs = {
      # NCBI and WU-BLAST
      'gi'  => [ 'gi' ],                      # NCBI GI
      'gb'  => [ 'acc_version', 'locus' ],      # GenBank
      'emb' => [ 'acc_version', 'locus' ],      # EMBL
      'dbj' => [ 'acc_version', 'locus' ],      # DDBJ
      'sp'  => [ 'accession', 'entry_id' ],   # SWISS-PROT
      'pdb' => [ 'entry_id', 'chain' ],       # PDB
      'bbs' => [ 'number' ],                  # GenInfo Backbone Id
      'gnl' => [ 'database' , 'entry_id' ],   # General database identifier
      'ref' => [ 'acc_version' , 'locus' ],     # NCBI Reference Sequence
      'lcl' => [ 'entry_id' ],                # Local Sequence identifier

      # WU-BLAST and NCBI
      'pir' => [ 'accession', 'entry_id' ],   # PIR
      'prf' => [ 'accession', 'entry_id' ],   # Protein Research Foundation
      'pat' => [ 'country', 'number', 'serial' ], # Patents

      # WU-BLAST only
      'bbm' => [ 'number' ],      # NCBI GenInfo Backbone database identifier
      'gim' => [ 'number' ],      # NCBI GenInfo Import identifier
      'gp'  => [ 'acc_version', 'locus' ],      # GenPept
      'oth' => [ 'accession', 'name', 'release' ],  # Other (user-definable) identifier
      'tpd' => [ 'accession', 'name' ],       # Third party annotation, DDBJ
      'tpe' => [ 'accession', 'name' ],       # Third party annotation, EMBL
      'tpg' => [ 'accession', 'name' ],       # Third party annotation, GenBank

      # Original
      'ri'  => [ 'entry_id', 'rearray_id', 'len' ], # RIKEN FANTOM DB
    }

    # Shows array that contains IDs (or ID-like strings).
    # Returns an array of arrays.  The inner arrays hold strings, and may
    # hold nil where a field of a "|"-separated ID was empty or where a
    # ":"-separated ID had no second part.
    attr_reader :list_ids

    # Shows a possibly unique identifier, taken from the first defline
    # that was parsed (later deflines do not change it).
    # Returns a string, or nil when no defline has been parsed.
    attr_reader :entry_id

    # Creates a new object from a defline string.
    # The string is cut at every "\x01" (Ctrl-A) character and each piece
    # is passed, in order, to add_defline.
    def initialize(str)
      @deflines = []
      @info     = {}
      @list_ids = []
      @entry_id = nil

      str.split("\x01").each { |piece| add_defline(piece) }
    end #def initialize

    # Parses given string (one defline) and adds parsed data.
    #
    # The following forms are tried in this order:
    # 1. "|"-separated IDs, e.g. ">gi|9910844|sp|Q9UWG2|RL3_METVA desc"
    # 2. ":"-separated IDs, e.g. ">sce:YBR160W desc"
    # 3. a single word followed by an optional description
    # Any other string is stored as it is, with an empty description.
    #
    # The parsed line is appended to @deflines, IDs found are appended to
    # @list_ids, and @entry_id is set if it has not been set yet.
    def add_defline(str)
      case str
      when /^\>?\s*((?:[^\|\s]*\|)+[^\s]+)\s*(.*)$/
        # NSIDs
        # examples:
        # >gi|9910844|sp|Q9UWG2|RL3_METVA 50S ribosomal protein L3P
        #
        # note: regexp (?:) means grouping without backreferences
        i = $1
        d = $2
        tks = i.split('|')
        # split drops the trailing empty field; put it back so that an
        # ID ending with "|" is seen as having an empty last field
        tks << '' if i[-1,1] == '|'
        a = parse_NSIDs(tks)
        # parse_NSIDs returns an empty array when the ID starts with "|"
        # (e.g. ">|foo"); in that case keep the raw ID string instead of
        # failing with NoMethodError on a[0] being nil
        i = a[0].join('|') unless a.empty?
        a.unshift('|')
        # tokens which parse_NSIDs did not consume go back in front of
        # the description
        d = tks.join('|') + ' ' + d unless tks.empty?
        a << d
        this_line = a
        match_EC(d)
        parse_square_brackets(d).each do |x|
          # bracketed text starting with a capital letter (and not an EC
          # number) is taken as an organism name
          if !match_EC(x, false) and x =~ /\A[A-Z]/ then
            di = [  x ]
            @list_ids << di
            @info['organism'] = x unless @info['organism']
          end
        end

      when /^\>?\s*([a-zA-Z0-9]+\:[^\s]+)\s*(.*)$/
        # examples:
        # >sce:YBR160W  CDC28, SRM5; cyclin-dependent protein kinase catalytic subunit [EC:2.7.1.-] [SP:CC28_YEAST]
        # >emb:CACDC28 [X80034] C.albicans CDC28 gene 
        i = $1
        d = $2
        a = parse_ColonSepID(i)
        i = a.join(':')
        this_line = [ ':', a , d ]
        match_EC(d)
        parse_square_brackets(d).each do |x|
          if !match_EC(x, false) and x =~ /:/ then
            parse_ColonSepID(x)
          elsif x =~ /\A\s*([A-Z][A-Z0-9_\.]+)\s*\z/ then
            @list_ids << [ $1 ]
          end
        end

      when /^\>?\s*(\S+)(?:\s+(.+))?$/
        # examples:
        # >ABC12345 this is test
        i = $1
        d = $2.to_s
        @list_ids << [ i.chomp('.') ]
        this_line = [  '', [ i ], d ]
        match_EC(d)
      else
        i = str
        d = ''
        match_EC(i)
        this_line = [ '', [ i ], d ]
      end

      @deflines << this_line
      @entry_id = i unless @entry_id
    end

    # Looks for EC numbers written as "EC:n.n.n.n" in the given string.
    # The "EC" prefix is matched case-insensitively and each n consists
    # of digits and/or "-".
    #
    # When write_flag is true, every EC number found is appended to
    # @list_ids as [ 'EC', number ], and @info['ec'] is set when it is
    # not set yet or when its current value contains "-".
    #
    # Returns [ 'EC', number ] for the last EC number found, or nil when
    # the string contains none.
    def match_EC(str, write_flag = true)
      di = nil
      # (?:...) groups without capturing, so the only capture is the
      # whole EC number
      str.scan(/EC\:((?:[\-\d]+\.){3}(?:[\-\d]+))/i) do |x|
        di = [ 'EC', x[0] ]
        if write_flag then
          @info['ec'] = di[1] if (!@info['ec'] or @info['ec'].to_s =~ /\-/)
          @list_ids << di
        end
      end
      di
    end
    private :match_EC

    # Collects the text enclosed by each "[" ... "]" pair in the string.
    # Returns an array of strings without the brackets, in order of
    # appearance (empty array when there is no bracket pair).
    def parse_square_brackets(str)
      str.scan(/\[([^\]]*)\]/).collect { |captured| captured[0] }
    end
    private :parse_square_brackets

    # Splits a ":"-separated ID at its first ":" and records the result
    # in @list_ids.  When there is nothing to split off, nil is appended
    # in place of the second part.
    # Returns the array that was recorded.
    def parse_ColonSepID(str)
      pair = str.split(':', 2)
      pair.push(nil) unless pair.size > 1
      @list_ids.push(pair)
      pair
    end
    private :parse_ColonSepID

    # Reads IDs from an array of tokens (an ID string split at "|").
    #
    # A token found in NSIDs starts a new ID; the tokens after it are
    # taken as its fields, up to the number of labels defined for the
    # tag, stopping early when the tokens run out or another tag shows
    # up.  Empty fields are stored as nil.  The first token that is not
    # a known tag ends the parse; if it is not empty it is stored as a
    # one-element ID and remembered in @info['ucid'] (first one only).
    #
    # Note that this method destroys ary: consumed tokens are removed and
    # whatever was not consumed is left in it.
    #
    # The IDs read are appended to @list_ids and also returned.
    def parse_NSIDs(ary)
      table  = self.class::NSIDs
      parsed = []
      while head = ary.shift
        labels = table[head]
        unless labels
          # UCID (uncontrolled identifiers)
          unless head.empty?
            parsed << [ head ]
            @info['ucid'] ||= head
          end
          break #while
        end

        entry = [ head ]
        labels.size.times do
          field = ary.shift
          break unless field
          if table[field]
            # next tag reached: give it back for the outer loop
            ary.unshift(field)
            break #times
          end
          entry << (field.empty? ? nil : field)
        end
        parsed << entry
      end #while
      @list_ids.concat(parsed)
      parsed
    end #def parse_NSIDs
    private :parse_NSIDs


    # Rebuilds a defline string from the parsed data.
    # For each parsed defline the IDs are joined with the separator that
    # was recognized ("|", ":" or nothing), followed by a space and the
    # description; the deflines are then joined with "\x01".
    # Note that the result may differ from the string originally given
    # to FastaDefline.new.
    def to_s
      rebuilt = @deflines.collect do |parts|
        sep = parts[0]
        ids = parts[1..-2].collect { |id| id.join(sep) }.join(sep)
        (ids + ' ' + parts[-1]).strip
      end
      rebuilt.join("\x01")
    end

    # Shows the description of the first defline.
    # Returns nil when no defline has been parsed.
    def description
      first_line = @deflines.first
      first_line.to_a.last
    end

    # Returns the descriptions of all deflines, one string per defline,
    # in the order the deflines were parsed.
    def descriptions
      @deflines.map { |parts| parts.last }
    end

    # Shows ID-like strings.
    # The result consists of, in this order:
    # * every non-nil field of the IDs in @list_ids that have a tag,
    # * one-element IDs made only of letters, digits, ".", "-" and "_",
    # * words of the descriptions that look like identifiers.
    # Returns an array of strings.
    def id_strings
      found = []
      @list_ids.each do |entry|
        if entry.size >= 2
          found.concat(entry[1..-1].select { |field| field })
        else
          sole = entry[0]
          if !sole.to_s.empty? && sole =~ /\A[A-Za-z0-9\.\-\_]+\z/
            found << sole
          end
        end
      end
      id_like = words(true, []).select do |word|
        word =~ /\A[A-Z][A-Za-z0-9\_]*[0-9]+[A-Za-z0-9\_]+\z/ ||
          word =~ /\A[A-Z][A-Z0-9]*\_[A-Z0-9\_]+\z/
      end
      found.concat(id_like)
      found
    end

    # Words dropped by the words method (compared in lower case).
    KillWords = %w[
      an the this that
      is are were was be can may might
      as at by for in of on to with
      from and or not
      dna rna mrna cdna orf
      aa nt pct id ec sp subsp
      similar involved identical identity
      cds clone library contig contigs
      homolog homologue homologs homologous
      protein proteins gene genes
      product products sequence sequences
      strain strains region regions
    ]

    # The same words as hash keys (value is always true), for quick lookup.
    KillWordsHash = KillWords.inject({}) do |table, word|
      table[word] = true
      table
    end

    # Words matching any of these patterns are dropped by the words
    # method unless another list is given there.
    KillRegexpArray = [
      /\A\d{1,3}\%?\z/,
      /\A[A-Z][A-Za-z0-9\_]*[0-9]+[A-Za-z0-9\_]+\z/,
      /\A[A-Z][A-Z0-9]*\_[A-Z0-9\_]+\z/,
    ]

    # Shows words used in the descriptions of the deflines.
    #
    # The descriptions are joined and split at punctuation, whitespace
    # and control characters.  Leading "$", "*", "-", "+" and trailing
    # "$", "*", "-", "=" are removed from each word.  A word is dropped
    # when it is one character or shorter, when its lower-case form is a
    # key of kwhash, or when it matches any pattern in kill_regexp.
    # Unless case_sensitive is true, the words are converted to lower
    # case.
    #
    # Returns a sorted Array of words without duplicates.
    def words(case_sensitive = nil, kill_regexp = self.class::KillRegexpArray,
              kwhash = self.class::KillWordsHash)
      tokens = descriptions.join(' ').split(/[\.\,\;\:\(\)\[\]\{\}\<\>\"\'\`\~\/\|\?\!\&\@\#\s\x00-\x1f\x7f]+/)
      kept = []
      tokens.each do |token|
        token.sub!(/\A[\$\*\-\+]+/, '')
        token.sub!(/[\$\*\-\=]+\z/, '')
        next if token.size <= 1
        next if kwhash[token.downcase]
        next if kill_regexp.find { |expr| expr =~ token }
        kept << (case_sensitive ? token : token.downcase)
      end
      kept.sort.uniq
    end

    # Returns an identifier by a database name (a String or a Symbol).
    #
    # A value already stored in @info under that name is returned as it
    # is.  Otherwise the first ID in @list_ids whose tag equals the name
    # is used: when it has at most one field that field is returned;
    # when it has more, the fields labelled 'acc_version', 'entry_id',
    # 'locus', 'accession' and 'number' in NSIDs are tried in this order
    # and then any non-nil field.  A value found is kept in @info.
    #
    # Returns nil when nothing is found.
    def get(dbname)
      db = dbname.to_s
      cached = @info[db]
      return cached if cached

      entry = @list_ids.find { |ids| ids[0] == db }
      return cached unless entry

      if entry.size <= 2
        value = entry[-1]
      else
        labels = self.class::NSIDs[db]
        value = nil
        [ 'acc_version', 'entry_id',
          'locus', 'accession', 'number' ].each do |label|
          pos = labels.index(label)
          next unless pos
          value = entry[pos + 1]
          break if value
        end
        value ||= entry[1..-1].find { |field| field }
      end
      @info[db] = value if value
      value
    end

    # Returns an identifier by given type (a label name used in NSIDs,
    # e.g. 'accession' or 'locus').
    # Only the first ID in @list_ids whose tag has such a label is
    # looked at; the result is nil when that field is empty or when no
    # ID has the label.
    def get_by_type(type_str)
      table = self.class::NSIDs
      @list_ids.each do |entry|
        labels = table[entry[0]]
        next unless labels
        pos = labels.index(type_str)
        return entry[pos + 1] if pos
      end
      nil
    end

    # Returns identifiers by given types (label names used in NSIDs).
    # Every ID in @list_ids is examined, and for each of the given types
    # its field is collected when the ID has that label and the field is
    # not empty.
    # Returns an array of strings, in the order of @list_ids.
    def get_all_by_type(*type_strarg)
      table = self.class::NSIDs
      collected = []
      @list_ids.each do |entry|
        labels = table[entry[0]]
        next unless labels
        type_strarg.each do |wanted|
          pos = labels.index(wanted)
          next unless pos
          field = entry[pos + 1]
          collected << field if field
        end
      end
      collected
    end

    # Shows locus.
    # When the entry has two or more IDs with a locus field, only the
    # first of them is looked at.  The result is looked up once and then
    # remembered.
    # Returns a string or nil.
    def locus
      return @locus if defined?(@locus)
      @locus = get_by_type('locus')
    end

    # Shows GI.
    # When the entry has two or more GIs, only the first one is shown.
    # The result is looked up once and then remembered.
    # Returns a string or nil.
    def gi
      return @gi if defined?(@gi)
      @gi = get_by_type('gi')
    end

    # Shows accession with version number.
    # When the entry has two or more IDs with such a field, only the
    # first of them is looked at.  The result is looked up once and then
    # remembered.
    # Returns a string or nil.
    def acc_version
      return @acc_version if defined?(@acc_version)
      @acc_version = get_by_type('acc_version')
    end

    # Shows accession numbers.
    # All 'accession' and 'acc_version' fields are collected, and
    # everything from the first "." on (the version) is cut off.
    # The result is built once and then remembered.
    # Returns an array of strings.
    def accessions
      return @accessions if defined?(@accessions)
      versioned = get_all_by_type('accession', 'acc_version')
      @accessions = versioned.collect! { |acc| acc.sub(/\..*\z/, '') }
    end

    # Shows an accession number.
    # This is acc_version without its version part when acc_version is
    # available, or else the first element of accessions.
    # The result is determined once and then remembered.
    # Returns a string or nil.
    def accession
      return @accession if defined?(@accession)
      versioned = acc_version
      @accession = versioned ? versioned.split('.')[0] : accessions[0]
    end
    
    # Lets identifiers be read with method-call syntax, e.g. obj.emb is
    # the same as obj.get('emb').
    #
    # The method name and any arguments are passed on to get.  When get
    # finds nothing, nil is returned if the name is a key of NSIDs;
    # otherwise a RuntimeError with a "NameError: undefined method"
    # message is raised.
    def method_missing(name, *args)
      # raise ArgumentError,
      # "wrong # of arguments(#{args.size} for 1)" if args.size >= 2
      r = get(name, *args)
      if !r and !(self.class::NSIDs[name.to_s]) then
        raise "NameError: undefined method `#{name.inspect}'"
      end
      r
    end

    # Tells respond_to? which names method_missing answers: the keys of
    # NSIDs and any name for which get currently finds a value.
    #
    # This also keeps implicit conversion checks made by Ruby itself
    # (e.g. to_ary, tried by puts and Array#flatten) away from
    # method_missing, whose RuntimeError would otherwise not be treated
    # as "no such method" and would reach the caller.
    def respond_to_missing?(name, include_private = false)
      return true if self.class::NSIDs.key?(name.to_s)
      return true if get(name)
      super
    end
    

  end #class FastaDefline

end #module Bio


From ngoto at dev.open-bio.org  Fri Jun 20 09:30:16 2008
From: ngoto at dev.open-bio.org (Naohisa Goto)
Date: Fri, 20 Jun 2008 13:30:16 +0000
Subject: [BioRuby-cvs] bioruby/lib/bio/db fasta.rb,1.28.2.1,1.28.2.2
Message-ID: <200806201330.m5KDUGds021895@dev.open-bio.org>

Update of /home/repository/bioruby/bioruby/lib/bio/db
In directory dev.open-bio.org:/tmp/cvs-serv21857

Modified Files:
      Tag: BRANCH-biohackathon2008
	fasta.rb 
Log Message:
Here-document separator string in the example is changed to avoid confusion
about "END" which is also a reserved word in Ruby.


Index: fasta.rb
===================================================================
RCS file: /home/repository/bioruby/bioruby/lib/bio/db/fasta.rb,v
retrieving revision 1.28.2.1
retrieving revision 1.28.2.2
diff -C2 -d -r1.28.2.1 -r1.28.2.2
*** fasta.rb	20 Jun 2008 13:22:31 -0000	1.28.2.1
--- fasta.rb	20 Jun 2008 13:30:14 -0000	1.28.2.2
***************
*** 3,7 ****
  #
  # Copyright::  Copyright (C) 2001, 2002
! #              GOTO Naohisa <ngoto at gen-info.osaka-u.ac.jp>,
  #              Toshiaki Katayama <k at bioruby.org>
  # License::    The Ruby License
--- 3,7 ----
  #
  # Copyright::  Copyright (C) 2001, 2002
! #              Naohisa Goto <ng at bioruby.org>,
  #              Toshiaki Katayama <k at bioruby.org>
  # License::    The Ruby License
***************
*** 45,49 ****
    # === Examples
    #
!   #   f_str = <<END
    #   >sce:YBR160W  CDC28, SRM5; cyclin-dependent protein kinase catalytic subunit [EC:2.7.1.-] [SP:CC28_YEAST]
    #   MSGELANYKRLEKVGEGTYGVVYKALDLRPGQGQRVVALKKIRLESEDEG
--- 45,49 ----
    # === Examples
    #
!   #   f_str = <<END_OF_STRING
    #   >sce:YBR160W  CDC28, SRM5; cyclin-dependent protein kinase catalytic subunit [EC:2.7.1.-] [SP:CC28_YEAST]
    #   MSGELANYKRLEKVGEGTYGVVYKALDLRPGQGQRVVALKKIRLESEDEG
***************
*** 65,69 ****
    #   FERLCELLGYDNVFPLIINIKTKSNGGYQLCGSISIIKIEEELKSVGFER
    #   KTGDPLEWRRLFKKISTICRDIILIPN
!   #   END
    #
    #   f = Bio::FastaFormat.new(f_str)
--- 65,69 ----
    #   FERLCELLGYDNVFPLIINIKTKSNGGYQLCGSISIIKIEEELKSVGFER
    #   KTGDPLEWRRLFKKISTICRDIILIPN
!   #   END_OF_STRING
    #
    #   f = Bio::FastaFormat.new(f_str)


From ngoto at dev.open-bio.org  Fri Jun 20 09:43:38 2008
From: ngoto at dev.open-bio.org (Naohisa Goto)
Date: Fri, 20 Jun 2008 13:43:38 +0000
Subject: [BioRuby-cvs] bioruby/lib/bio/db fasta.rb,1.28.2.2,1.28.2.3
Message-ID: <200806201343.m5KDhcUr021965@dev.open-bio.org>

Update of /home/repository/bioruby/bioruby/lib/bio/db
In directory dev.open-bio.org:/tmp/cvs-serv21945

Modified Files:
      Tag: BRANCH-biohackathon2008
	fasta.rb 
Log Message:
Bio::FastaFormat#to_seq is renamed to to_biosequence with improvement.
The "to_seq" method is now an alias of to_biosequence.


Index: fasta.rb
===================================================================
RCS file: /home/repository/bioruby/bioruby/lib/bio/db/fasta.rb,v
retrieving revision 1.28.2.2
retrieving revision 1.28.2.3
diff -C2 -d -r1.28.2.2 -r1.28.2.3
*** fasta.rb	20 Jun 2008 13:30:14 -0000	1.28.2.2
--- fasta.rb	20 Jun 2008 13:43:36 -0000	1.28.2.3
***************
*** 28,31 ****
--- 28,32 ----
  require 'bio/db'
  require 'bio/sequence'
+ require 'bio/sequence/dblink'
  require 'bio/db/fasta/defline'
  
***************
*** 217,226 ****
      # because of efficiency.
      # 
!     def to_seq
        seq
        obj = Bio::Sequence.new(@seq)
!       obj.definition = self.definition
        obj
      end
  
      # Parsing FASTA Defline, and extract IDs.
--- 218,243 ----
      # because of efficiency.
      # 
!     def to_biosequence
        seq
        obj = Bio::Sequence.new(@seq)
!       d = self.identifiers
!       # accessions
!       obj.primary_accession = d.accessions.first
!       obj.secondary_accessions = d.accessions[1..-1]
!       # entry_id
!       obj.entry_id = d.locus unless d.locus.to_s.empty?
!       # GI
!       other = []
!       other.push Bio::Sequence::DBLink.new('GI', d.gi) if d.gi
!       obj.other_seqids = other unless other.empty?
!       # definition
!       if d.accessions.empty? and other.empty? then
!         obj.definition = self.definition
!       else
!         obj.definition = d.description
!       end
        obj
      end
+     alias to_seq to_biosequence
  
      # Parsing FASTA Defline, and extract IDs.


From ngoto at dev.open-bio.org  Mon Jun  2 09:33:50 2008
From: ngoto at dev.open-bio.org (Naohisa Goto)
Date: Mon, 02 Jun 2008 09:33:50 +0000
Subject: [BioRuby-cvs] bioruby/lib/bio reference.rb,1.26,1.27
Message-ID: <200806020933.m529Xoou025921@dev.open-bio.org>

Update of /home/repository/bioruby/bioruby/lib/bio
In directory dev.open-bio.org:/tmp/cvs-serv25887

Modified Files:
	reference.rb 
Log Message:
reverted to 1.24, because of potential security problem about "eval" in
bibtex method.


Index: reference.rb
===================================================================
RCS file: /home/repository/bioruby/bioruby/lib/bio/reference.rb,v
retrieving revision 1.26
retrieving revision 1.27
diff -C2 -d -r1.26 -r1.27
*** reference.rb	31 May 2008 09:36:55 -0000	1.26
--- reference.rb	2 Jun 2008 09:33:48 -0000	1.27
***************
*** 71,74 ****
--- 71,77 ----
      attr_reader :abstract
  
+     # An URL String.
+     attr_reader :url
+ 
      # MeSH terms in an Array.
      attr_reader :mesh
***************
*** 77,83 ****
      attr_reader :affiliations
  
-     # An URL String.
-     attr_reader :url
- 
      # Create a new Bio::Reference object from a Hash of values. 
      # Data is extracted from the values for keys:
--- 80,83 ----
***************
*** 232,236 ****
        lines << "%P #{@pages}" unless @pages.empty?
        lines << "%M #{@pubmed}" unless @pubmed.to_s.empty?
!       lines << "%U #{url}" unless url.empty?
        lines << "%X #{@abstract}" unless @abstract.empty?
        @mesh.each do |term|
--- 232,241 ----
        lines << "%P #{@pages}" unless @pages.empty?
        lines << "%M #{@pubmed}" unless @pubmed.to_s.empty?
!       if @pubmed
!         cgi = "http://www.ncbi.nlm.nih.gov/entrez/query.fcgi"
!         opts = "cmd=Retrieve&db=PubMed&dopt=Citation&list_uids"
!         @url = "#{cgi}?#{opts}=#{@pubmed}"
!       end
!       lines << "%U #{@url}" unless @url.empty?
        lines << "%X #{@abstract}" unless @abstract.empty?
        @mesh.each do |term|
***************
*** 294,321 ****
      # *Arguments*:
      # * (optional) _section_: BiBTeX section as String
-     # * (optional) _keywords_: Array of additional keywords, e.g. ['abstract']
      # *Returns*:: String
!     def bibtex(section = nil, add_keywords = [])
        section = "article" unless section
        authors = authors_join(' and ', ' and ')
        pages   = @pages.sub('-', '--')
!       keywords = "author title journal year volume number pages url".split(/ /)
!       bib = "@#{section}{PMID:#{@pubmed},\n"
!       (keywords+add_keywords).each do | kw |
!         if kw == 'author'
!           ref = authors
!         elsif kw == 'title'
!           # strip final dot from title
!           ref = @title.sub(/\.$/,'')
!         elsif kw == 'number'
!           ref = @issue
!         elsif kw == 'url'
!           ref = url
!         else
!           ref = eval('@'+kw)
!         end
!         bib += "  #{kw.ljust(12)} = {#{ref}},\n" if ref != ''
!       end
!       bib+"}\n"
      end
  
--- 299,318 ----
      # *Arguments*:
      # * (optional) _section_: BiBTeX section as String
      # *Returns*:: String
!     def bibtex(section = nil)
        section = "article" unless section
        authors = authors_join(' and ', ' and ')
        pages   = @pages.sub('-', '--')
!       return <<-"END".gsub(/\t/, '')
!         @#{section}{PMID:#{@pubmed},
!           author  = {#{authors}},
!           title   = {#{@title}},
!           journal = {#{@journal}},
!           year    = {#{@year}},
!           volume  = {#{@volume}},
!           number  = {#{@issue}},
!           pages   = {#{pages}},
!         }
!       END
      end
  
***************
*** 503,518 ****
      end
  
-     # Returns a valid URL for pubmed records
-     #
-     # *Returns*:: String
-     def url
-       return @url if @url and @url != ''
-       if @pubmed != ''
-         cgi = "http://www.ncbi.nlm.nih.gov/entrez/query.fcgi"
-         opts = "cmd=Retrieve&db=PubMed&dopt=Citation&list_uids"
-         return "#{cgi}?#{opts}=#{@pubmed}"
-       end
-       ''
-     end
  
      private
--- 500,503 ----
***************
*** 542,546 ****
      end
  
- 
    end
  
--- 527,530 ----


From ngoto at dev.open-bio.org  Mon Jun  2 09:47:11 2008
From: ngoto at dev.open-bio.org (Naohisa Goto)
Date: Mon, 02 Jun 2008 09:47:11 +0000
Subject: [BioRuby-cvs] bioruby/lib/bio reference.rb,1.27,1.28
Message-ID: <200806020947.m529lBCN026079@dev.open-bio.org>

Update of /home/repository/bioruby/bioruby/lib/bio
In directory dev.open-bio.org:/tmp/cvs-serv26058/lib/bio

Modified Files:
	reference.rb 
Log Message:
* New method Bio::Reference#pubmed_url added (renamed the url method in
  revision 1.25).
* Bio::Reference#endnote is changed not to overwrite url if url is
  already given by user.


Index: reference.rb
===================================================================
RCS file: /home/repository/bioruby/bioruby/lib/bio/reference.rb,v
retrieving revision 1.27
retrieving revision 1.28
diff -C2 -d -r1.27 -r1.28
*** reference.rb	2 Jun 2008 09:33:48 -0000	1.27
--- reference.rb	2 Jun 2008 09:47:08 -0000	1.28
***************
*** 232,241 ****
        lines << "%P #{@pages}" unless @pages.empty?
        lines << "%M #{@pubmed}" unless @pubmed.to_s.empty?
!       if @pubmed
!         cgi = "http://www.ncbi.nlm.nih.gov/entrez/query.fcgi"
!         opts = "cmd=Retrieve&db=PubMed&dopt=Citation&list_uids"
!         @url = "#{cgi}?#{opts}=#{@pubmed}"
!       end
!       lines << "%U #{@url}" unless @url.empty?
        lines << "%X #{@abstract}" unless @abstract.empty?
        @mesh.each do |term|
--- 232,237 ----
        lines << "%P #{@pages}" unless @pages.empty?
        lines << "%M #{@pubmed}" unless @pubmed.to_s.empty?
!       url = @url.empty? ? pubmed_url : @url
!       lines << "%U #{url}" unless url.empty?
        lines << "%X #{@abstract}" unless @abstract.empty?
        @mesh.each do |term|
***************
*** 500,503 ****
--- 496,510 ----
      end
  
+     # Returns a valid URL for pubmed records
+     #
+     # *Returns*:: String
+     def pubmed_url
+       unless @pubmed.to_s.empty?
+         cgi = "http://www.ncbi.nlm.nih.gov/entrez/query.fcgi"
+         opts = "cmd=Retrieve&db=PubMed&dopt=Citation&list_uids"
+         return "#{cgi}?#{opts}=#{@pubmed}"
+       end
+       ''
+     end
  
      private


From ngoto at dev.open-bio.org  Wed Jun  4 14:56:40 2008
From: ngoto at dev.open-bio.org (Naohisa Goto)
Date: Wed, 04 Jun 2008 14:56:40 +0000
Subject: [BioRuby-cvs] bioruby/lib/bio reference.rb,1.28,1.29
Message-ID: <200806041456.m54Eue8E001532@dev.open-bio.org>

Update of /home/repository/bioruby/bioruby/lib/bio
In directory dev.open-bio.org:/tmp/cvs-serv1512/lib/bio

Modified Files:
	reference.rb 
Log Message:
improvement of Bio::Reference#bibtex method


Index: reference.rb
===================================================================
RCS file: /home/repository/bioruby/bioruby/lib/bio/reference.rb,v
retrieving revision 1.28
retrieving revision 1.29
diff -C2 -d -r1.28 -r1.29
*** reference.rb	2 Jun 2008 09:47:08 -0000	1.28
--- reference.rb	4 Jun 2008 14:56:37 -0000	1.29
***************
*** 167,184 ****
      # *Arguments*:
      # * (optional) _style_: String with style identifier
!     # * (optional) _option_: Option for styles accepting one
      # *Returns*:: String
!     def format(style = nil, option = nil)
        case style
        when 'endnote'
          return endnote
        when 'bibitem'
!         return bibitem(option)
        when 'bibtex'
!         return bibtex(option)
        when 'rd'
!         return rd(option)
        when /^nature$/i
!         return nature(option)
        when /^science$/i
          return science
--- 167,184 ----
      # *Arguments*:
      # * (optional) _style_: String with style identifier
!     # * (optional) _options_: Options for styles accepting one
      # *Returns*:: String
!     def format(style = nil, *options)
        case style
        when 'endnote'
          return endnote
        when 'bibitem'
!         return bibitem(*options)
        when 'bibtex'
!         return bibtex(*options)
        when 'rd'
!         return rd(*options)
        when /^nature$/i
!         return nature(*options)
        when /^science$/i
          return science
***************
*** 295,314 ****
      # *Arguments*:
      # * (optional) _section_: BiBTeX section as String
      # *Returns*:: String
!     def bibtex(section = nil)
        section = "article" unless section
        authors = authors_join(' and ', ' and ')
!       pages   = @pages.sub('-', '--')
!       return <<-"END".gsub(/\t/, '')
!         @#{section}{PMID:#{@pubmed},
!           author  = {#{authors}},
!           title   = {#{@title}},
!           journal = {#{@journal}},
!           year    = {#{@year}},
!           volume  = {#{@volume}},
!           number  = {#{@issue}},
!           pages   = {#{pages}},
!         }
!       END
      end
  
--- 295,340 ----
      # *Arguments*:
      # * (optional) _section_: BiBTeX section as String
+     # * (optional) _label_: Label string cited by LaTeX documents.
+     #                       Default is <tt>"PMID:#{pubmed}"</tt>.
+     # * (optional) _keywords_: Hash of additional keywords,
+     #                          e.g. { 'abstract' => 'This is abstract.' }.
+     #                          You can also override default keywords.
+     #                          To disable default keywords, specify false as
+     #                          value, e.g. { 'url' => false, 'year' => false }.
      # *Returns*:: String
!     def bibtex(section = nil, label = nil, keywords = {})
        section = "article" unless section
        authors = authors_join(' and ', ' and ')
!       thepages = pages.to_s.empty? ? nil : pages.sub(/\-/, '--')
!       unless label then
!         label = "PMID:#{pubmed}"
!       end
!       theurl = if !(url.to_s.empty?) then
!                  url
!                elsif pmurl = pubmed_url and !(pmurl.to_s.empty?) then
!                  pmurl
!                else
!                  nil
!                end
!       hash = {
!         'author'  => authors.empty?    ? nil : authors,
!         'title'   => title.to_s.empty? ? nil : title,
!         'number'  => issue.to_s.empty? ? nil : issue,
!         'pages'   => thepages,
!         'url'     => theurl
!       }
!       keys = %w( author title journal year volume number pages url )
!       keys.each do |k|
!         hash[k] = self.__send__(k.intern) unless hash.has_key?(k)
!       end
!       hash.merge!(keywords) { |k, v1, v2| v2.nil? ? v1 : v2 }
!       bib = [ "@#{section}{#{label}," ]
!       keys.concat((hash.keys - keys).sort)
!       keys.each do |kw|
!         ref = hash[kw]
!         bib.push "  #{kw.ljust(12)} = {#{ref}}," if ref
!       end
!       bib.push "}\n"
!       return bib.join("\n")
      end
  

From ngoto at dev.open-bio.org  Wed Jun  4 14:58:10 2008
From: ngoto at dev.open-bio.org (Naohisa Goto)
Date: Wed, 04 Jun 2008 14:58:10 +0000
Subject: [BioRuby-cvs] bioruby/test/unit/bio test_reference.rb,1.4,1.5
Message-ID: <200806041458.m54EwAo2001581@dev.open-bio.org>

Update of /home/repository/bioruby/bioruby/test/unit/bio
In directory dev.open-bio.org:/tmp/cvs-serv1561/test/unit/bio

Modified Files:
	test_reference.rb 
Log Message:
test changed due to the improvement of Bio::Reference#bibtex


Index: test_reference.rb
===================================================================
RCS file: /home/repository/bioruby/bioruby/test/unit/bio/test_reference.rb,v
retrieving revision 1.4
retrieving revision 1.5
diff -C2 -d -r1.4 -r1.5
*** test_reference.rb	31 May 2008 09:36:56 -0000	1.4
--- test_reference.rb	4 Jun 2008 14:58:08 -0000	1.5
***************
*** 103,112 ****
  
      def test_format_bibtex
!       str = "@article{PMID:12345678,\n  author       = {Hoge, J.P. and Fuga, F.B.},\n  title        = {Title of the study},\n  journal      = {Theor. J. Hoge},\n  year         = {2001},\n  volume       = {12},\n  number       = {3},\n  pages        = {123-145},\n  url          = {http://example.com},\n}\n"
! 
        assert_equal(str, @obj.format('bibtex'))
        assert_equal(str, @obj.bibtex)
      end
  
      def test_format_rd
        str = "== Title of the study.\n\n* Hoge, J.P. and Fuga, F.B.\n\n* Theor. J. Hoge 2001 12:123-145 [PMID:12345678]\n\nHoge fuga. hoge fuga."
--- 103,147 ----
  
      def test_format_bibtex
!       str =<<__END__
! @article{PMID:12345678,
!   author       = {Hoge, J.P. and Fuga, F.B.},
!   title        = {Title of the study.},
!   journal      = {Theor. J. Hoge},
!   year         = {2001},
!   volume       = {12},
!   number       = {3},
!   pages        = {123--145},
!   url          = {http://example.com},
! }
! __END__
        assert_equal(str, @obj.format('bibtex'))
        assert_equal(str, @obj.bibtex)
      end
  
+     def test_format_bibtex_with_arguments
+       str =<<__END__
+ @inproceedings{YourArticle,
+   author       = {Hoge, J.P. and Fuga, F.B.},
+   title        = {Title of the study.},
+   year         = {2001},
+   volume       = {12},
+   number       = {3},
+   pages        = {123--145},
+   booktitle    = {Theor. J. Hoge},
+   month        = {December},
+ }
+ __END__
+       assert_equal(str, @obj.format('bibtex', 'inproceedings', 'YourArticle',
+                                     { 'journal'   => false,
+                                       'url' => false,
+                                       'booktitle' => @obj.journal,
+                                       'month' => 'December'}))
+       assert_equal(str, @obj.bibtex('inproceedings', 'YourArticle',
+                                     { 'journal'   => false,
+                                       'url' => false,
+                                       'booktitle' => @obj.journal,
+                                       'month' => 'December'}))
+     end
+ 
      def test_format_rd
        str = "== Title of the study.\n\n* Hoge, J.P. and Fuga, F.B.\n\n* Theor. J. Hoge 2001 12:123-145 [PMID:12345678]\n\nHoge fuga. hoge fuga."


From ngoto at dev.open-bio.org  Fri Jun 13 11:20:26 2008
From: ngoto at dev.open-bio.org (Naohisa Goto)
Date: Fri, 13 Jun 2008 11:20:26 +0000
Subject: [BioRuby-cvs] bioruby/lib/bio reference.rb,1.29,1.30
Message-ID: <200806131120.m5DBKQLQ004888@dev.open-bio.org>

Update of /home/repository/bioruby/bioruby/lib/bio
In directory dev.open-bio.org:/tmp/cvs-serv4830/lib/bio

Modified Files:
	reference.rb 
Log Message:
modified RDoc for Bio::Reference#bibitem 


Index: reference.rb
===================================================================
RCS file: /home/repository/bioruby/bioruby/lib/bio/reference.rb,v
retrieving revision 1.29
retrieving revision 1.30
diff -C2 -d -r1.29 -r1.30
*** reference.rb	4 Jun 2008 14:56:37 -0000	1.29
--- reference.rb	13 Jun 2008 11:20:23 -0000	1.30
***************
*** 252,255 ****
--- 252,257 ----
      #     {\em Theor. J. Hoge}, 12(3):123--145, 2001.
      # ---
+     # *Arguments*:
+     # * (optional) _item_: label string (default: <tt>"PMID:#{pubmed}"</tt>).
      # *Returns*:: String
      def bibitem(item = nil)


From ngoto at dev.open-bio.org  Fri Jun 13 11:37:27 2008
From: ngoto at dev.open-bio.org (Naohisa Goto)
Date: Fri, 13 Jun 2008 11:37:27 +0000
Subject: [BioRuby-cvs]
	bioruby/test/unit/bio/util/restriction_enzyme/double_stranded
	test_aligned_strands.rb, 1.3, 1.4
Message-ID: <200806131137.m5DBbRnA005201@dev.open-bio.org>

Update of /home/repository/bioruby/bioruby/test/unit/bio/util/restriction_enzyme/double_stranded
In directory dev.open-bio.org:/tmp/cvs-serv5181/test/unit/bio/util/restriction_enzyme/double_stranded

Modified Files:
	test_aligned_strands.rb 
Log Message:
"require 'bio/sequence'" is needed to run the tests in this file.


Index: test_aligned_strands.rb
===================================================================
RCS file: /home/repository/bioruby/bioruby/test/unit/bio/util/restriction_enzyme/double_stranded/test_aligned_strands.rb,v
retrieving revision 1.3
retrieving revision 1.4
diff -C2 -d -r1.3 -r1.4
*** test_aligned_strands.rb	5 Apr 2007 23:35:44 -0000	1.3
--- test_aligned_strands.rb	13 Jun 2008 11:37:25 -0000	1.4
***************
*** 14,17 ****
--- 14,18 ----
  
  require 'test/unit'
+ require 'bio/sequence'
  require 'bio/util/restriction_enzyme/double_stranded/aligned_strands'
  require 'bio/util/restriction_enzyme/double_stranded'


From ngoto at dev.open-bio.org  Fri Jun 13 11:39:41 2008
From: ngoto at dev.open-bio.org (Naohisa Goto)
Date: Fri, 13 Jun 2008 11:39:41 +0000
Subject: [BioRuby-cvs]
	bioruby/test/unit/bio/util/restriction_enzyme/double_stranded
	test_aligned_strands.rb, 1.3, 1.3.2.1
Message-ID: <200806131139.m5DBdfXW005450@dev.open-bio.org>

Update of /home/repository/bioruby/bioruby/test/unit/bio/util/restriction_enzyme/double_stranded
In directory dev.open-bio.org:/tmp/cvs-serv5209/test/unit/bio/util/restriction_enzyme/double_stranded

Modified Files:
      Tag: BRANCH-biohackathon2008
	test_aligned_strands.rb 
Log Message:
merged change from rev. 1.3 to 1.4 in the CVS trunk


Index: test_aligned_strands.rb
===================================================================
RCS file: /home/repository/bioruby/bioruby/test/unit/bio/util/restriction_enzyme/double_stranded/test_aligned_strands.rb,v
retrieving revision 1.3
retrieving revision 1.3.2.1
diff -C2 -d -r1.3 -r1.3.2.1
*** test_aligned_strands.rb	5 Apr 2007 23:35:44 -0000	1.3
--- test_aligned_strands.rb	13 Jun 2008 11:39:39 -0000	1.3.2.1
***************
*** 14,17 ****
--- 14,18 ----
  
  require 'test/unit'
+ require 'bio/sequence'
  require 'bio/util/restriction_enzyme/double_stranded/aligned_strands'
  require 'bio/util/restriction_enzyme/double_stranded'


From ngoto at dev.open-bio.org  Tue Jun 17 12:23:52 2008
From: ngoto at dev.open-bio.org (Naohisa Goto)
Date: Tue, 17 Jun 2008 12:23:52 +0000
Subject: [BioRuby-cvs] bioruby/lib/bio reference.rb,1.24.2.6,1.24.2.7
Message-ID: <200806171223.m5HCNqfC020085@dev.open-bio.org>

Update of /home/repository/bioruby/bioruby/lib/bio
In directory dev.open-bio.org:/tmp/cvs-serv20065/lib/bio

Modified Files:
      Tag: BRANCH-biohackathon2008
	reference.rb 
Log Message:
merged changes in trunk (revision 1.30)


Index: reference.rb
===================================================================
RCS file: /home/repository/bioruby/bioruby/lib/bio/reference.rb,v
retrieving revision 1.24.2.6
retrieving revision 1.24.2.7
diff -C2 -d -r1.24.2.6 -r1.24.2.7
*** reference.rb	23 Apr 2008 18:52:18 -0000	1.24.2.6
--- reference.rb	17 Jun 2008 12:23:49 -0000	1.24.2.7
***************
*** 180,186 ****
      # *Arguments*:
      # * (optional) _style_: String with style identifier
!     # * (optional) _option_: Option for styles accepting one
      # *Returns*:: String
!     def format(style = nil, option = nil)
        case style
        when 'embl'
--- 180,186 ----
      # *Arguments*:
      # * (optional) _style_: String with style identifier
!     # * (optional) _options_: Options for styles accepting one
      # *Returns*:: String
!     def format(style = nil, *options)
        case style
        when 'embl'
***************
*** 189,199 ****
          return endnote
        when 'bibitem'
!         return bibitem(option)
        when 'bibtex'
!         return bibtex(option)
        when 'rd'
!         return rd(option)
        when /^nature$/i
!         return nature(option)
        when /^science$/i
          return science
--- 189,199 ----
          return endnote
        when 'bibitem'
!         return bibitem(*options)
        when 'bibtex'
!         return bibtex(*options)
        when 'rd'
!         return rd(*options)
        when /^nature$/i
!         return nature(*options)
        when /^science$/i
          return science
***************
*** 247,256 ****
        lines << "%P #{@pages}" unless @pages.empty?
        lines << "%M #{@pubmed}" unless @pubmed.to_s.empty?
!       if @pubmed
!         cgi = "http://www.ncbi.nlm.nih.gov/entrez/query.fcgi"
!         opts = "cmd=Retrieve&db=PubMed&dopt=Citation&list_uids"
!         @url = "#{cgi}?#{opts}=#{@pubmed}"
!       end
!       lines << "%U #{@url}" unless @url.empty?
        lines << "%X #{@abstract}" unless @abstract.empty?
        @mesh.each do |term|
--- 247,252 ----
        lines << "%P #{@pages}" unless @pages.empty?
        lines << "%M #{@pubmed}" unless @pubmed.to_s.empty?
!       u = @url.empty? ? pubmed_url : @url
!       lines << "%U #{u}" unless u.empty?
        lines << "%X #{@abstract}" unless @abstract.empty?
        @mesh.each do |term|
***************
*** 289,292 ****
--- 285,290 ----
      #     {\em Theor. J. Hoge}, 12(3):123--145, 2001.
      # ---
+     # *Arguments*:
+     # * (optional) _item_: label string (default: <tt>"PMID:#{pubmed}"</tt>).
      # *Returns*:: String
      def bibitem(item = nil)
***************
*** 332,351 ****
      # *Arguments*:
      # * (optional) _section_: BiBTeX section as String
      # *Returns*:: String
!     def bibtex(section = nil)
        section = "article" unless section
        authors = authors_join(' and ', ' and ')
!       pages   = @pages.sub('-', '--')
!       return <<-"END".gsub(/\t/, '')
!         @#{section}{PMID:#{@pubmed},
!           author  = {#{authors}},
!           title   = {#{@title}},
!           journal = {#{@journal}},
!           year    = {#{@year}},
!           volume  = {#{@volume}},
!           number  = {#{@issue}},
!           pages   = {#{pages}},
!         }
!       END
      end
  
--- 330,375 ----
      # *Arguments*:
      # * (optional) _section_: BiBTeX section as String
+     # * (optional) _label_: Label string cited by LaTeX documents.
+     #                       Default is <tt>"PMID:#{pubmed}"</tt>.
+     # * (optional) _keywords_: Hash of additional keywords,
+     #                          e.g. { 'abstract' => 'This is abstract.' }.
+     #                          You can also override default keywords.
+     #                          To disable default keywords, specify false as
+     #                          value, e.g. { 'url' => false, 'year' => false }.
      # *Returns*:: String
!     def bibtex(section = nil, label = nil, keywords = {})
        section = "article" unless section
        authors = authors_join(' and ', ' and ')
!       thepages = pages.to_s.empty? ? nil : pages.sub(/\-/, '--')
!       unless label then
!         label = "PMID:#{pubmed}"
!       end
!       theurl = if !(url.to_s.empty?) then
!                  url
!                elsif pmurl = pubmed_url and !(pmurl.to_s.empty?) then
!                  pmurl
!                else
!                  nil
!                end
!       hash = {
!         'author'  => authors.empty?    ? nil : authors,
!         'title'   => title.to_s.empty? ? nil : title,
!         'number'  => issue.to_s.empty? ? nil : issue,
!         'pages'   => thepages,
!         'url'     => theurl
!       }
!       keys = %w( author title journal year volume number pages url )
!       keys.each do |k|
!         hash[k] = self.__send__(k.intern) unless hash.has_key?(k)
!       end
!       hash.merge!(keywords) { |k, v1, v2| v2.nil? ? v1 : v2 }
!       bib = [ "@#{section}{#{label}," ]
!       keys.concat((hash.keys - keys).sort)
!       keys.each do |kw|
!         ref = hash[kw]
!         bib.push "  #{kw.ljust(12)} = {#{ref}}," if ref
!       end
!       bib.push "}\n"
!       return bib.join("\n")
      end
  
***************
*** 533,536 ****
--- 557,571 ----
      end
  
+     # Returns a valid URL for pubmed records
+     #
+     # *Returns*:: String
+     def pubmed_url
+       unless @pubmed.to_s.empty?
+         cgi = "http://www.ncbi.nlm.nih.gov/entrez/query.fcgi"
+         opts = "cmd=Retrieve&db=PubMed&dopt=Citation&list_uids"
+         return "#{cgi}?#{opts}=#{@pubmed}"
+       end
+       ''
+     end
  
      private


From ngoto at dev.open-bio.org  Tue Jun 17 12:24:44 2008
From: ngoto at dev.open-bio.org (Naohisa Goto)
Date: Tue, 17 Jun 2008 12:24:44 +0000
Subject: [BioRuby-cvs] bioruby/test/unit/bio test_reference.rb, 1.3.2.1,
	1.3.2.2
Message-ID: <200806171224.m5HCOiAk020113@dev.open-bio.org>

Update of /home/repository/bioruby/bioruby/test/unit/bio
In directory dev.open-bio.org:/tmp/cvs-serv20093/test/unit/bio

Modified Files:
      Tag: BRANCH-biohackathon2008
	test_reference.rb 
Log Message:
merged changes from trunk (revision 1.5)


Index: test_reference.rb
===================================================================
RCS file: /home/repository/bioruby/bioruby/test/unit/bio/test_reference.rb,v
retrieving revision 1.3.2.1
retrieving revision 1.3.2.2
diff -C2 -d -r1.3.2.1 -r1.3.2.2
*** test_reference.rb	8 May 2008 05:38:01 -0000	1.3.2.1
--- test_reference.rb	17 Jun 2008 12:24:41 -0000	1.3.2.2
***************
*** 92,96 ****
  
      def test_format_endnote
!       str = "%0 Journal Article\n%A Hoge, J.P.\n%A Fuga, F.B.\n%D 2001\n%T Title of the study.\n%J Theor. J. Hoge\n%V 12\n%N 3\n%P 123-145\n%M 12345678\n%U http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?cmd=Retrieve&db=PubMed&dopt=Citation&list_uids=12345678\n%X Hoge fuga. hoge fuga.\n%K Hoge\n%+ Tokyo"
        assert_equal(str, @obj.format('endnote'))
        assert_equal(str, @obj.endnote)
--- 92,96 ----
  
      def test_format_endnote
!       str = "%0 Journal Article\n%A Hoge, J.P.\n%A Fuga, F.B.\n%D 2001\n%T Title of the study.\n%J Theor. J. Hoge\n%V 12\n%N 3\n%P 123-145\n%M 12345678\n%U http://example.com\n%X Hoge fuga. hoge fuga.\n%K Hoge\n%+ Tokyo"
        assert_equal(str, @obj.format('endnote'))
        assert_equal(str, @obj.endnote)
***************
*** 104,122 ****
  
      def test_format_bibtex
!       str =<<END
!         @article{PMID:12345678,
!           author  = {Hoge, J.P. and Fuga, F.B.},
!           title   = {Title of the study.},
!           journal = {Theor. J. Hoge},
!           year    = {2001},
!           volume  = {12},
!           number  = {3},
!           pages   = {123--145},
!         }
! END
        assert_equal(str, @obj.format('bibtex'))
        assert_equal(str, @obj.bibtex)
      end
  
      def test_format_rd
        str = "== Title of the study.\n\n* Hoge, J.P. and Fuga, F.B.\n\n* Theor. J. Hoge 2001 12:123-145 [PMID:12345678]\n\nHoge fuga. hoge fuga."
--- 104,148 ----
  
      def test_format_bibtex
!       str =<<__END__
! @article{PMID:12345678,
!   author       = {Hoge, J.P. and Fuga, F.B.},
!   title        = {Title of the study.},
!   journal      = {Theor. J. Hoge},
!   year         = {2001},
!   volume       = {12},
!   number       = {3},
!   pages        = {123--145},
!   url          = {http://example.com},
! }
! __END__
        assert_equal(str, @obj.format('bibtex'))
        assert_equal(str, @obj.bibtex)
      end
  
+     def test_format_bibtex_with_arguments
+       str =<<__END__
+ @inproceedings{YourArticle,
+   author       = {Hoge, J.P. and Fuga, F.B.},
+   title        = {Title of the study.},
+   year         = {2001},
+   volume       = {12},
+   number       = {3},
+   pages        = {123--145},
+   booktitle    = {Theor. J. Hoge},
+   month        = {December},
+ }
+ __END__
+       assert_equal(str, @obj.format('bibtex', 'inproceedings', 'YourArticle',
+                                     { 'journal'   => false,
+                                       'url' => false,
+                                       'booktitle' => @obj.journal,
+                                       'month' => 'December'}))
+       assert_equal(str, @obj.bibtex('inproceedings', 'YourArticle',
+                                     { 'journal'   => false,
+                                       'url' => false,
+                                       'booktitle' => @obj.journal,
+                                       'month' => 'December'}))
+     end
+ 
      def test_format_rd
        str = "== Title of the study.\n\n* Hoge, J.P. and Fuga, F.B.\n\n* Theor. J. Hoge 2001 12:123-145 [PMID:12345678]\n\nHoge fuga. hoge fuga."


From ngoto at dev.open-bio.org  Tue Jun 17 15:25:24 2008
From: ngoto at dev.open-bio.org (Naohisa Goto)
Date: Tue, 17 Jun 2008 15:25:24 +0000
Subject: [BioRuby-cvs] bioruby/lib/bio sequence.rb,0.58.2.11,0.58.2.12
Message-ID: <200806171525.m5HFPOpk020858@dev.open-bio.org>

Update of /home/repository/bioruby/bioruby/lib/bio
In directory dev.open-bio.org:/tmp/cvs-serv20823/lib/bio

Modified Files:
      Tag: BRANCH-biohackathon2008
	sequence.rb 
Log Message:
* Some attributes are added: strandedness (strand information), 
  release_created, release_modified (release information),
  entry_version (version of the entry numbered by database administrator),
  organelle (organelle information), other_seqids (sequence IDs other than
  accessions), and id_namespace (namespace of accessions).
  Most of them are added because corresponding tags are defined in the
  INSDSeq XML v1.4 ( http://www.insdc.org/files/documents/INSD_V1.4.dtd ).
  The "id_namespace" will be used to output NCBI style fasta format.
* The "taxonomy" attribute is changed to be an alias of the "classification"
  attribute.
* The "date" attribute is removed.
* RDoc documents of attributes are updated.


Index: sequence.rb
===================================================================
RCS file: /home/repository/bioruby/bioruby/lib/bio/sequence.rb,v
retrieving revision 0.58.2.11
retrieving revision 0.58.2.12
diff -C2 -d -r0.58.2.11 -r0.58.2.12
*** sequence.rb	24 Apr 2008 14:28:25 -0000	0.58.2.11
--- sequence.rb	17 Jun 2008 15:25:22 -0000	0.58.2.12
***************
*** 118,149 ****
    end
    
!   # The sequence identifier.  For example, for a sequence
!   # of Genbank origin, this is the accession number.
    attr_accessor :entry_id
    
!   # A String with a description of the sequence
    attr_accessor :definition
    
!   # An Array of Bio::Feature objects
    attr_accessor :features
    
!   # An Array of Bio::Reference objects
    attr_accessor :references
    
!   # A comment String
    attr_accessor :comments
    
!   # Date from sequence source. Often date of deposition.
!   attr_accessor :date
!   
!   # An Array of Strings
    attr_accessor :keywords
    
!   # An Array of Strings; links to other database entries.
    attr_accessor :dblinks
!   
!   # A taxonomy String
!   attr_accessor :taxonomy
!   
    # Bio::Sequence::NA/AA
    attr_accessor :moltype
--- 118,145 ----
    end
    
!   # The sequence identifier (String).  For example, for a sequence
!   # of Genbank origin, this is the locus name.
!   # For a sequence of EMBL origin, this is the primary accession number.
    attr_accessor :entry_id
    
!   # A String with a description of the sequence (String)
    attr_accessor :definition
    
!   # Features (An Array of Bio::Feature objects)
    attr_accessor :features
    
!   # References (An Array of Bio::Reference objects)
    attr_accessor :references
    
!   # Comments (String or an Array of String)
    attr_accessor :comments
    
!   # Keywords (An Array of String)
    attr_accessor :keywords
    
!   # Links to other database entries.
!   # (An Array of Bio::Sequence::DBLink objects)
    attr_accessor :dblinks
! 
    # Bio::Sequence::NA/AA
    attr_accessor :moltype
***************
*** 157,166 ****
    #+++
    
!   # Version number of the sequence (String).
    attr_accessor :sequence_version
  
!   # Topology (String). "circular" or "linear".
    attr_accessor :topology
  
    # molecular type (String). "DNA" or "RNA" for nucleotide sequence.
    attr_accessor :molecule_type
--- 153,170 ----
    #+++
    
!   # Version number of the sequence (String or Integer).
!   # Unlike <tt>entry_version</tt>, <tt>sequence_version</tt> will be changed
!   # when the submitter of the sequence updates the entry.
!   # Normally, the same entry taken from different databases (EMBL, GenBank,
!   # and DDBJ) may have the same sequence_version.
    attr_accessor :sequence_version
  
!   # Topology (String). "circular", "linear", or nil.
    attr_accessor :topology
  
+   # Strandedness (String). "single" (single-stranded),
+   # "double" (double-stranded), "mixed" (mixed-stranded), or nil.
+   attr_accessor :strandedness
+ 
    # molecular type (String). "DNA" or "RNA" for nucleotide sequence.
    attr_accessor :molecule_type
***************
*** 180,189 ****
    attr_accessor :secondary_accessions
  
!   # Created date of the sequence entry (String)
    attr_accessor :date_created
  
!   # Last modified date of the sequence entry (String)
    attr_accessor :date_modified
  
    # Organism species (String). For example, "Escherichia coli".
    attr_accessor :species
--- 184,208 ----
    attr_accessor :secondary_accessions
  
!   # Created date of the sequence entry (Date, DateTime, Time, or String)
    attr_accessor :date_created
  
!   # Last modified date of the sequence entry (Date, DateTime, Time, or String)
    attr_accessor :date_modified
  
+   # Release information when created (String)
+   attr_accessor :release_created
+ 
+   # Release information when last-modified (String)
+   attr_accessor :release_modified
+ 
+   # Version of the entry (String or Integer).
+   # Unlike <tt>sequence_version</tt>, <tt>entry_version</tt> is a database
+   # maintainer's internal version number.
+   # The version number will be changed when the database maintainer
+   # modifies the entry.
+   # The same enrty in EMBL, GenBank, and DDBJ may have different
+   # entry_version.
+   attr_accessor :entry_version
+ 
    # Organism species (String). For example, "Escherichia coli".
    attr_accessor :species
***************
*** 192,195 ****
--- 211,231 ----
    # (Array of String)
    attr_accessor :classification
+   alias taxonomy classification
+ 
+   # (not well supported) Organelle information (String).
+   attr_accessor :organelle
+ 
+   # Namespace of the sequence IDs described in entry_id, primary_accession,
+   # and secondary_accessions methods (String).
+   # For example, 'EMBL', 'GenBank', 'DDBJ', 'RefSeq'.
+   attr_accessor :id_namespace
+ 
+   # Sequence identifiers which are not described in entry_id,
+   # primary_accession,and secondary_accessions methods
+   # (Array of Bio::Sequence::DBLink objects).
+   # For example, NCBI GI number can be stored.
+   # Note that only identifiers of the entry itself should be stored.
+   # For database cross references, <tt>dblinks</tt> should be used.
+   attr_accessor :other_seqids
  
    # Guess the type of sequence, Amino Acid or Nucleic Acid, and create a 


From ngoto at dev.open-bio.org  Tue Jun 17 15:44:24 2008
From: ngoto at dev.open-bio.org (Naohisa Goto)
Date: Tue, 17 Jun 2008 15:44:24 +0000
Subject: [BioRuby-cvs] bioruby/test/unit/bio/sequence test_dblink.rb, NONE,
	1.1.2.1
Message-ID: <200806171544.m5HFiOIl021028@dev.open-bio.org>

Update of /home/repository/bioruby/bioruby/test/unit/bio/sequence
In directory dev.open-bio.org:/tmp/cvs-serv21001/test/unit/bio/sequence

Added Files:
      Tag: BRANCH-biohackathon2008
	test_dblink.rb 
Log Message:
New class Bio::Sequence::DBLink is added to store IDs and database names
together in an object.


--- NEW FILE: test_dblink.rb ---
#
# test/unit/bio/sequence/test_dblink.rb - Unit test for Bio::Sequence::DBLink
#
# Copyright::  Copyright (C) 2008 Naohisa Goto <ng at bioruby.org>
# License::    The Ruby License
#
#  $Id: test_dblink.rb,v 1.1.2.1 2008/06/17 15:44:22 ngoto Exp $
#

require 'pathname'
libpath = Pathname.new(File.join(File.dirname(__FILE__), ['..'] * 4, 'lib')).cleanpath.to_s
$:.unshift(libpath) unless $:.include?(libpath)

require 'test/unit'
require 'bio/sequence'
require 'bio/sequence/dblink'

module Bio
  # Checks the attribute readers of a DBLink object that was built
  # directly with Bio::Sequence::DBLink.new.
  class TestSequenceDBLink < Test::Unit::TestCase
    # Fixture: database 'EMBL', primary ID 'Z14088', and three
    # secondary IDs passed as trailing arguments.
    def setup
      secondary = [ 'CAA78466.1', '-', 'mRNA' ]
      @link = Bio::Sequence::DBLink.new('EMBL', 'Z14088', *secondary)
    end

    # The first constructor argument is returned by #database.
    def test_database
      assert_equal('EMBL', @link.database)
    end

    # The second constructor argument is returned by #id.
    def test_id
      assert_equal('Z14088', @link.id)
    end

    # All remaining constructor arguments are returned, in order,
    # by #secondary_ids.
    def test_secondary_ids
      expected = [ 'CAA78466.1', '-', 'mRNA' ]
      assert_equal(expected, @link.secondary_ids)
    end
  end #class

  # Checks the class-level DR line parsers of Bio::Sequence::DBLink.
  class TestSequenceDBLinkClassMethods < Test::Unit::TestCase
    # An EMBL DR line with one secondary identifier.
    def test_parse_embl_DR_line
      line = 'DR   EPD; EP07077; HS_HBG1.'
      link = Bio::Sequence::DBLink.parse_embl_DR_line(line)
      assert_equal('EPD', link.database)
      assert_equal('EP07077', link.id)
      assert_equal([ 'HS_HBG1' ], link.secondary_ids)
    end

    # A UniProt DR line with three secondary identifiers.
    def test_parse_uniprot_DR_line
      line = 'DR   EMBL; Z14088; CAA78466.1; -; mRNA.'
      link = Bio::Sequence::DBLink.parse_uniprot_DR_line(line)
      expected = [ 'CAA78466.1', '-', 'mRNA' ]
      assert_equal('EMBL', link.database)
      assert_equal('Z14088', link.id)
      assert_equal(expected, link.secondary_ids)
    end
  end #class

end #module Bio


From ngoto at dev.open-bio.org  Tue Jun 17 15:44:24 2008
From: ngoto at dev.open-bio.org (Naohisa Goto)
Date: Tue, 17 Jun 2008 15:44:24 +0000
Subject: [BioRuby-cvs] bioruby/lib/bio/sequence dblink.rb,NONE,1.1.2.1
Message-ID: <200806171544.m5HFiOF6021023@dev.open-bio.org>

Update of /home/repository/bioruby/bioruby/lib/bio/sequence
In directory dev.open-bio.org:/tmp/cvs-serv21001/lib/bio/sequence

Added Files:
      Tag: BRANCH-biohackathon2008
	dblink.rb 
Log Message:
New class Bio::Sequence::DBLink is added to store IDs and database names
together in an object.


--- NEW FILE: dblink.rb ---
#
# = bio/sequence/dblink.rb - sequence ID with database name
#
# Copyright::  Copyright (C) 2008
#              Naohisa Goto <ng at bioruby.org>
# License::    The Ruby License
#
# $Id: dblink.rb,v 1.1.2.1 2008/06/17 15:44:22 ngoto Exp $
#

require 'bio/sequence'

# Bio::Sequence::DBLink stores IDs with the database name.
# Its main purpose is to store database cross-reference information
# for a sequence entry.
class Bio::Sequence::DBLink

  # Database name, or namespace identifier (String).
  attr_reader :database

  # Primary identifier (String).
  attr_reader :id

  # Secondary identifiers (Array of String).
  attr_reader :secondary_ids

  # Creates a new DBLink object.
  # ---
  # *Arguments*:
  # * (required) _database_: database name, returned by #database
  # * (required) _primary_id_: primary identifier, returned by #id
  # * (optional) _secondary_ids_: any further arguments are collected
  #   into an Array and returned by #secondary_ids
  def initialize(database, primary_id, *secondary_ids)
    @database, @id, @secondary_ids = database, primary_id, secondary_ids
  end

  #--
  # class methods
  #++

  # Parses a DR line of an EMBL entry and returns a DBLink object.
  #
  # A trailing "." (with any whitespace after it) and a leading "DR   "
  # are dropped. The rest is split on ";" into at most three fields:
  # database, primary ID and, if present, one secondary ID.
  # The given string itself is not modified.
  def self.parse_embl_DR_line(str)
    body = str.sub(/\.\s*\z/, '').sub(/\ADR   /, '')
    fields = body.split(/\s*\;\s*/, 3)
    new(*fields)
  end

  # Parses a DR line of a UniProt entry and returns a DBLink object.
  #
  # A trailing "." (with any whitespace after it) and a leading "DR   "
  # are dropped. The rest is split on every ";": the first field is the
  # database, the second is the primary ID, and all following fields
  # become secondary IDs. The given string itself is not modified.
  def self.parse_uniprot_DR_line(str)
    body = str.sub(/\.\s*\z/, '').sub(/\ADR   /, '')
    fields = body.split(/\s*\;\s*/)
    new(*fields)
  end

end #class Bio::Sequence::DBLink


From ngoto at dev.open-bio.org  Tue Jun 17 15:50:07 2008
From: ngoto at dev.open-bio.org (Naohisa Goto)
Date: Tue, 17 Jun 2008 15:50:07 +0000
Subject: [BioRuby-cvs] bioruby/lib/bio/sequence format.rb,1.4.2.7,1.4.2.8
Message-ID: <200806171550.m5HFo7Jm021095@dev.open-bio.org>

Update of /home/repository/bioruby/bioruby/lib/bio/sequence
In directory dev.open-bio.org:/tmp/cvs-serv21057/lib/bio/sequence

Modified Files:
      Tag: BRANCH-biohackathon2008
	format.rb 
Log Message:
* In the wrap method, changed to recognize "\n" in given string.
* Some helper methods are added to help formatting date string.


Index: format.rb
===================================================================
RCS file: /home/repository/bioruby/bioruby/lib/bio/sequence/format.rb,v
retrieving revision 1.4.2.7
retrieving revision 1.4.2.8
diff -C2 -d -r1.4.2.7 -r1.4.2.8
*** format.rb	4 Mar 2008 11:10:28 -0000	1.4.2.7
--- format.rb	17 Jun 2008 15:50:05 -0000	1.4.2.8
***************
*** 285,305 ****
    def wrap_and_split_lines(str, width)
      result = []
!     left = str.dup
!     while left and left.length > width
!       line = nil
!       width.downto(1) do |i|
!         if left[i..i] == ' ' or /[\,\;]/ =~ left[(i-1)..(i-1)]  then
!           line = left[0..(i-1)].sub(/ +\z/, '')
!           left = left[i..-1].sub(/\A +/, '')
!           break
          end
        end
!       if line.nil? then
!         line = left[0..(width-1)]
!         left = left[width..-1]
!       end
!       result << line
      end
-     result << left if left and !(left.to_s.empty?)
      return result
    end
--- 285,309 ----
    def wrap_and_split_lines(str, width)
      result = []
!     lefts = str.chomp.split(/(?:\r\n|\r|\n)/)
!     lefts.each do |left|
!       left.rstrip!
!       while left and left.length > width
!         line = nil
!         width.downto(1) do |i|
!           if left[i..i] == ' ' or /[\,\;]/ =~ left[(i-1)..(i-1)]  then
!             line = left[0..(i-1)].sub(/ +\z/, '')
!             left = left[i..-1].sub(/\A +/, '')
!             break
!           end
          end
+         if line.nil? then
+           line = left[0..(width-1)]
+           left = left[width..-1]
+         end
+         result << line
+         left = nil if  left.to_s.empty?
        end
!       result << left if left
      end
      return result
    end
***************
*** 320,323 ****
--- 324,352 ----
    end
  
+   #--
+   # internal use only
+   MonthStr = [ nil, 
+                'JAN', 'FEB', 'MAR', 'APR', 'MAY', 'JUN',
+                'JUL', 'AUG', 'SEP', 'OCT', 'NOV', 'DEC'
+              ].collect { |x| x.freeze }.freeze
+   #++
+ 
+   # formats a date from Date, DateTime, or Time object, or String.
+   def format_date(d)
+     begin
+       yy = d.year
+       mm = d.month
+       dd = d.day
+     rescue NoMethodError, NameError, ArgumentError, TypeError
+       return sprintf("%-11s", d)
+     end
+     sprintf("%02d-%-3s-%04d", dd, MonthStr[mm], yy)
+   end
+ 
+   # null date
+   def null_date
+     Date.new(0, 1, 1)
+   end
+ 
  end #module INSDFeatureHelper
  

From ngoto at dev.open-bio.org  Tue Jun 17 15:53:23 2008
From: ngoto at dev.open-bio.org (Naohisa Goto)
Date: Tue, 17 Jun 2008 15:53:23 +0000
Subject: [BioRuby-cvs] bioruby/lib/bio/db/genbank common.rb, 1.11.2.4,
	1.11.2.5
Message-ID: <200806171553.m5HFrNlb021165@dev.open-bio.org>

Update of /home/repository/bioruby/bioruby/lib/bio/db/genbank
In directory dev.open-bio.org:/tmp/cvs-serv21145/lib/bio/db/genbank

Modified Files:
      Tag: BRANCH-biohackathon2008
	common.rb 
Log Message:
Bio::GenBank#comment (and Bio::GenPept#comment) is changed not to remove
newlines inside the comment.


Index: common.rb
===================================================================
RCS file: /home/repository/bioruby/bioruby/lib/bio/db/genbank/common.rb,v
retrieving revision 1.11.2.4
retrieving revision 1.11.2.5
diff -C2 -d -r1.11.2.4 -r1.11.2.5
*** common.rb	7 May 2008 12:25:42 -0000	1.11.2.4
--- common.rb	17 Jun 2008 15:53:21 -0000	1.11.2.5
***************
*** 196,200 ****
    # COMMENT -- Returns contents of the COMMENT record as a String.
    def comment
!     field_fetch('COMMENT')
    end
  
--- 196,203 ----
    # COMMENT -- Returns contents of the COMMENT record as a String.
    def comment
!     str = get('COMMENT').to_s.sub(/\ACOMMENT     /, '')
!     str.gsub!(/^ {12}/, '')
!     str.chomp!
!     str
    end
  

From ngoto at dev.open-bio.org  Tue Jun 17 15:56:20 2008
From: ngoto at dev.open-bio.org (Naohisa Goto)
Date: Tue, 17 Jun 2008 15:56:20 +0000
Subject: [BioRuby-cvs] bioruby/lib/bio/db/genbank genbank.rb, 0.40.2.3,
	0.40.2.4
Message-ID: <200806171556.m5HFuKdb021193@dev.open-bio.org>

Update of /home/repository/bioruby/bioruby/lib/bio/db/genbank
In directory dev.open-bio.org:/tmp/cvs-serv21173/lib/bio/db/genbank

Modified Files:
      Tag: BRANCH-biohackathon2008
	genbank.rb 
Log Message:
* Bio::GenBank#to_biosequence is changed to improve support of sequence output
  and data exchange.
* Bio::GenBank#date_modified is added. It returns a Date object.


Index: genbank.rb
===================================================================
RCS file: /home/repository/bioruby/bioruby/lib/bio/db/genbank/genbank.rb,v
retrieving revision 0.40.2.3
retrieving revision 0.40.2.4
diff -C2 -d -r0.40.2.3 -r0.40.2.4
*** genbank.rb	4 Mar 2008 09:22:35 -0000	0.40.2.3
--- genbank.rb	17 Jun 2008 15:56:18 -0000	0.40.2.4
***************
*** 8,13 ****
--- 8,16 ----
  #
  
+ require 'date'
  require 'bio/db'
  require 'bio/db/genbank/common'
+ require 'bio/sequence'
+ require 'bio/sequence/dblink'
  
  module Bio
***************
*** 122,129 ****
--- 125,142 ----
    alias nalen length
  
+   # (obsolete???) length of the sequence
    def seq_len
      seq.length
    end
  
+   # modified date. Returns Date object, String or nil.
+   def date_modified
+     begin
+       Date.parse(self.date)
+     rescue ArgumentError, TypeError, NoMethodError, NameError
+       self.date
+     end
+   end
+ 
    # converts Bio::GenBank to Bio::Sequence
    # ---
***************
*** 132,135 ****
--- 145,156 ----
    def to_biosequence
      sequence = Bio::Sequence.new(seq)
+ 
+     sequence.id_namespace = 
+       if /\_/ =~ self.accession.to_s then
+         'RefSeq'
+       else
+         'GenBank'
+       end
+ 
      sequence.entry_id = self.entry_id
  
***************
*** 137,147 ****
      sequence.secondary_accessions = self.accessions - [ self.accession ]
  
      sequence.molecule_type = self.natype
      sequence.division = self.division
      sequence.topology = self.circular
  
      sequence.sequence_version = self.version
      #sequence.date_created = nil #????
!     sequence.date_modified = self.date
  
      sequence.definition = self.definition
--- 158,177 ----
      sequence.secondary_accessions = self.accessions - [ self.accession ]
  
+     if /GI\:(.+)/ =~ self.gi.to_s then
+       sequence.other_seqids = [ Bio::Sequence::DBLink.new('GI', $1) ]
+     end
+ 
      sequence.molecule_type = self.natype
      sequence.division = self.division
      sequence.topology = self.circular
+     sequence.strandedness = case self.strand.to_s.downcase;
+                             when 'ss-'; 'single';
+                             when 'ds-'; 'double';
+                             when 'ms-'; 'mixed';
+                             else nil; end
  
      sequence.sequence_version = self.version
      #sequence.date_created = nil #????
!     sequence.date_modified = date_modified
  
      sequence.definition = self.definition
***************
*** 149,153 ****
      sequence.species = self.organism
      sequence.classification = self.taxonomy.to_s.sub(/\.\z/, '').split(/\s*\;\s*/)
!     #sequence.organnella = nil # not used
      sequence.comments = self.comment
      sequence.references = self.references
--- 179,183 ----
      sequence.species = self.organism
      sequence.classification = self.taxonomy.to_s.sub(/\.\z/, '').split(/\s*\;\s*/)
!     #sequence.organelle = nil # yet unsupported
      sequence.comments = self.comment
      sequence.references = self.references


From ngoto at dev.open-bio.org  Tue Jun 17 15:59:26 2008
From: ngoto at dev.open-bio.org (Naohisa Goto)
Date: Tue, 17 Jun 2008 15:59:26 +0000
Subject: [BioRuby-cvs] bioruby/lib/bio/db/genbank format_genbank.rb, 1.1.2.4,
	1.1.2.5
Message-ID: <200806171559.m5HFxQa4021221@dev.open-bio.org>

Update of /home/repository/bioruby/bioruby/lib/bio/db/genbank
In directory dev.open-bio.org:/tmp/cvs-serv21201/lib/bio/db/genbank

Modified Files:
      Tag: BRANCH-biohackathon2008
	format_genbank.rb 
Log Message:
* Added support for COMMENT.
* Added support for GI number output.
* Many improvements are added.


Index: format_genbank.rb
===================================================================
RCS file: /home/repository/bioruby/bioruby/lib/bio/db/genbank/Attic/format_genbank.rb,v
retrieving revision 1.1.2.4
retrieving revision 1.1.2.5
diff -C2 -d -r1.1.2.4 -r1.1.2.5
*** format_genbank.rb	28 May 2008 13:26:33 -0000	1.1.2.4
--- format_genbank.rb	17 Jun 2008 15:59:24 -0000	1.1.2.5
***************
*** 101,104 ****
--- 101,115 ----
      end
  
+     # formats comments lines as GenBank
+     def comments_format_genbank(cmnts)
+       return '' if !cmnts or cmnts.empty?
+       cmnts = [ cmnts ] unless cmnts.kind_of?(Array)
+       a = []
+       cmnts.each do |str|
+         a.push "COMMENT     #{ genbank_wrap(str) }\n"
+       end
+       a.join('')
+     end
+ 
      # formats sequence lines as GenBank
      def seq_format_genbank(str)
***************
*** 113,122 ****
      end
  
      # Erb template of GenBank format for Bio::Sequence
      erb_template <<'__END_OF_TEMPLATE__'
! LOCUS       <%= sprintf("%-16s", entry_id) %> <%= sprintf("%11d", length) %> bp <%= sprintf("%3s", '') %><%= sprintf("%-6s", molecule_type) %>  <%= sprintf("%-8s", topology) %><%= sprintf("%4s", division) %> <%= sprintf("%-11s", date_modified) %>
  DEFINITION  <%= genbank_wrap_dot(definition.to_s) %>
  ACCESSION   <%= genbank_wrap(([ primary_accession ] + (secondary_accessions or [])).join(" ")) %>
! VERSION     <%= primary_accession %>.<%= sequence_version %><% unless true or gi_number.to_s.empty? %>GI:<%= gi_number %><% end %>
  KEYWORDS    <%= genbank_wrap_dot((keywords or []).join('; ')) %>
  SOURCE      <%= genbank_wrap(species) %>
--- 124,168 ----
      end
  
+     # formats date
+     def date_format_genbank
+       date_modified || date_created || null_date
+     end
+ 
+     # moleculue type
+     def mol_type_genbank
+       if /(DNA|(t|r|m|u|sn|sno)?RNA)/i =~ molecule_type.to_s then
+         $1.sub(/[DR]NA/) { |x| x.upcase }
+       else
+         'NA'
+       end
+     end
+ 
+     # NCBI GI number
+     def ncbi_gi_number
+       ids = other_seqids
+       if ids and r = ids.find { |x| x.database == 'GI' } then
+         r.id
+       else
+         nil
+       end
+     end
+ 
+     # strandedness
+     def strandedness_genbank
+       return nil unless strandedness
+       case strandedness
+       when 'single'; 'ss-'; 
+       when 'double'; 'ds-'; 
+       when 'mixed';  'ms-'; 
+       else; nil
+       end
+     end
+ 
      # Erb template of GenBank format for Bio::Sequence
      erb_template <<'__END_OF_TEMPLATE__'
! LOCUS       <%= sprintf("%-16s", entry_id) %> <%= sprintf("%11d", length) %> bp <%= sprintf("%3s", strandedness_genbank) %><%= sprintf("%-6s", mol_type_genbank) %>  <%= sprintf("%-8s", topology) %><%= sprintf("%4s", division) %> <%= date_format_genbank %>
  DEFINITION  <%= genbank_wrap_dot(definition.to_s) %>
  ACCESSION   <%= genbank_wrap(([ primary_accession ] + (secondary_accessions or [])).join(" ")) %>
! VERSION     <%= primary_accession %>.<%= sequence_version %><% if gi = ncbi_gi_number then %>  GI:<%= gi %><% end %>
  KEYWORDS    <%= genbank_wrap_dot((keywords or []).join('; ')) %>
  SOURCE      <%= genbank_wrap(species) %>
***************
*** 129,132 ****
--- 175,179 ----
  %><%= reference_format_genbank(ref, n) %><%
      end
+ %><%= comments_format_genbank(comments)
  %>FEATURES             Location/Qualifiers
  <%= format_features_genbank(features || [])


From ngoto at dev.open-bio.org  Tue Jun 17 16:04:38 2008
From: ngoto at dev.open-bio.org (Naohisa Goto)
Date: Tue, 17 Jun 2008 16:04:38 +0000
Subject: [BioRuby-cvs] bioruby/lib/bio/db/embl embl.rb,1.29.2.6,1.29.2.7
Message-ID: <200806171604.m5HG4cnr021274@dev.open-bio.org>

Update of /home/repository/bioruby/bioruby/lib/bio/db/embl
In directory dev.open-bio.org:/tmp/cvs-serv21250/lib/bio/db/embl

Modified Files:
      Tag: BRANCH-biohackathon2008
	embl.rb 
Log Message:
* Bio::EMBL#cc is changed to cut heading "CC   ".
* Bio::EMBL#to_biosequence is changed to improve support for sequence output
  and data exchange.
* To get parse result of DT lines more easily, Bio::EMBL#date_modified,
  date_created, release_modified, release_created, and entry_version
  methods are added. 


Index: embl.rb
===================================================================
RCS file: /home/repository/bioruby/bioruby/lib/bio/db/embl/embl.rb,v
retrieving revision 1.29.2.6
retrieving revision 1.29.2.7
diff -C2 -d -r1.29.2.6 -r1.29.2.7
*** embl.rb	28 May 2008 13:09:03 -0000	1.29.2.6
--- embl.rb	17 Jun 2008 16:04:36 -0000	1.29.2.7
***************
*** 32,39 ****
--- 32,42 ----
  #
  
+ require 'date'
  require 'bio/db'
  require 'bio/db/embl/common'
  require 'bio/compat/features'
  require 'bio/compat/references'
+ require 'bio/sequence'
+ require 'bio/sequence/dblink'
  
  module Bio
***************
*** 323,329 ****
    # CC Line; comments of notes (>=0)
    def cc
!     get('CC')
    end
! 
  
    ##
--- 326,332 ----
    # CC Line; comments of notes (>=0)
    def cc
!     get('CC').to_s.gsub(/^CC   /, '')
    end
!   alias comment cc
  
    ##
***************
*** 376,379 ****
--- 379,436 ----
    #++
  
+   # modified date. Returns Date object, String or nil.
+   def date_modified
+     parse_date(self.dt['updated'])
+   end
+ 
+   # created date. Returns Date object, String or nil.
+   def date_created
+     parse_date(self.dt['created'])
+   end
+ 
+   # release number when last updated
+   def release_modified
+     parse_release_version(self.dt['updated'])[0]
+   end
+ 
+   # release number when created
+   def release_created
+     parse_release_version(self.dt['created'])[0]
+   end
+ 
+   # entry version number numbered by EMBL
+   def entry_version
+     parse_release_version(self.dt['updated'])[1]
+   end
+ 
+   # parse date string. Returns Date object.
+   def parse_date(str)
+     begin
+       Date.parse(str)
+     rescue ArgumentError, TypeError, NoMethodError, NameError
+       str
+     end
+   end
+   private :parse_date
+ 
+   # extracts release and version numbers from DT line
+   def parse_release_version(str)
+     return [ nil, nil ] unless str
+     a = str.split(/[\(\,\)]/)
+     dstr = a.shift
+     rel = nil
+     ver = nil
+     a.each do |x|
+       case x
+       when /Rel\.\s*(.+)/
+         rel = $1.strip
+       when /Version\s*(.+)/
+         ver = $1.strip
+       end
+     end
+     [ rel, ver ]
+   end
+   private :parse_release_version
+ 
    # converts the entry to Bio::Sequence object
    # ---
***************
*** 382,385 ****
--- 439,444 ----
    def to_biosequence
      bio_seq = Bio::Sequence.new(self.seq)
+ 
+     bio_seq.id_namespace = 'EMBL'
      bio_seq.entry_id = self.entry_id
      bio_seq.primary_accession = self.accessions[0]
***************
*** 389,394 ****
      bio_seq.definition = self.description
      bio_seq.topology = self.topology
!     bio_seq.date_created = self.dt['created']
!     bio_seq.date_modified = self.dt['updated']
      bio_seq.division = self.division
      bio_seq.sequence_version = self.version
--- 448,456 ----
      bio_seq.definition = self.description
      bio_seq.topology = self.topology
!     bio_seq.date_created = self.date_created
!     bio_seq.date_modified = self.date_modified
!     bio_seq.release_created = self.release_created
!     bio_seq.release_modified = self.release_modified
!     bio_seq.entry_version = self.entry_version
      bio_seq.division = self.division
      bio_seq.sequence_version = self.version
***************
*** 396,402 ****
      bio_seq.species = self.fetch('OS')
      bio_seq.classification = self.oc
      bio_seq.references = self.references
      bio_seq.features = self.ft
!     
      return bio_seq
    end
--- 458,469 ----
      bio_seq.species = self.fetch('OS')
      bio_seq.classification = self.oc
+     # bio_seq.organelle = self.fetch('OG') # unsupported yet
      bio_seq.references = self.references
      bio_seq.features = self.ft
!     bio_seq.comments = self.cc
!     bio_seq.dblinks = get('DR').split(/\n/).collect { |x|
!       Bio::Sequence::DBLink.parse_embl_DR_line(x)
!     }
! 
      return bio_seq
    end


From ngoto at dev.open-bio.org  Tue Jun 17 16:06:06 2008
From: ngoto at dev.open-bio.org (Naohisa Goto)
Date: Tue, 17 Jun 2008 16:06:06 +0000
Subject: [BioRuby-cvs] bioruby/lib/bio/db/embl format_embl.rb, 1.1.2.5,
	1.1.2.6
Message-ID: <200806171606.m5HG66iI021322@dev.open-bio.org>

Update of /home/repository/bioruby/bioruby/lib/bio/db/embl
In directory dev.open-bio.org:/tmp/cvs-serv21282/lib/bio/db/embl

Modified Files:
      Tag: BRANCH-biohackathon2008
	format_embl.rb 
Log Message:
* Added support for CC lines (comments).
* Added support for DR lines (database cross references).
* Many improvements.


Index: format_embl.rb
===================================================================
RCS file: /home/repository/bioruby/bioruby/lib/bio/db/embl/Attic/format_embl.rb,v
retrieving revision 1.1.2.5
retrieving revision 1.1.2.6
diff -C2 -d -r1.1.2.5 -r1.1.2.6
*** format_embl.rb	28 May 2008 13:38:07 -0000	1.1.2.5
--- format_embl.rb	17 Jun 2008 16:06:04 -0000	1.1.2.6
***************
*** 2,6 ****
  # = bio/db/embl/format_embl.rb - EMBL format generater
  #
! # Copyright::  Copyright (C) 2008 Jan Aerts <jandot at bioruby.org>
  # License::    The Ruby License
  #
--- 2,8 ----
  # = bio/db/embl/format_embl.rb - EMBL format generater
  #
! # Copyright::  Copyright (C) 2008
! #              Jan Aerts <jandot at bioruby.org>,
! #              Naohisa Goto <ng at bioruby.org>
  # License::    The Ruby License
  #
***************
*** 125,136 ****
      end
  
      # Erb template of EMBL format for Bio::Sequence
      erb_template <<'__END_OF_TEMPLATE__'
! ID   <%= entry_id %>; SV <%= sequence_version %>; <%= topology %>; <%= molecule_type %>; <%= data_class %>; <%= division %>; <%= seq.length %> BP.
  XX   
  <%= embl_wrap('AC   ', accessions.reject{|a| a.nil?}.join('; ') + ';') %>
  XX   
! DT   <%= date_created %>
! DT   <%= date_modified %>
  XX   
  <%= embl_wrap('DE   ', definition) %>
--- 127,166 ----
      end
  
+     # moleculue type
+     def mol_type_embl
+       if mt = molecule_type then
+         mt
+       elsif f = (features or []).find { |f| f.feature == 'source' } and
+           q = f.qualifiers.find { |q| q.qualifier == 'mol_type' } then
+         q.value
+       else
+         'NA'
+       end
+     end
+ 
+     # CC line. Comments.
+     def comments_format_embl(cmnts)
+       return '' if !cmnts or cmnts.empty?
+       cmnts = [ cmnts ] unless cmnts.kind_of?(Array)
+       a = []
+       cmnts.each do |str|
+         a.push embl_wrap('CC   ', str)
+       end
+       unless a.empty? then
+         a.push "XX   "
+         a.push '' # dummy to put "\n" at the end of the string
+       end
+       a.join("\n")
+     end
+ 
+ 
      # Erb template of EMBL format for Bio::Sequence
      erb_template <<'__END_OF_TEMPLATE__'
! ID   <%= primary_accession || entry_id %>; SV <%= sequence_version %>; <%= topology %>; <%= mol_type_embl %>; <%= data_class %>; <%= division %>; <%= seq.length %> BP.
  XX   
  <%= embl_wrap('AC   ', accessions.reject{|a| a.nil?}.join('; ') + ';') %>
  XX   
! DT   <%= format_date(date_created || null_date) %> (Rel. <%= release_created || 0 %>, Created)
! DT   <%= format_date(date_modified || null_date) %> (Rel. <%= release_modified || 0 %>, Last updated, Version <%= entry_version || 0 %>)
  XX   
  <%= embl_wrap('DE   ', definition) %>
***************
*** 142,146 ****
  XX   
  <% hash = {}; (references || []).each do |ref| %><%= reference_format_embl(ref, hash) %>
! <% end %>FH   Key             Location/Qualifiers
  FH   
  <%= format_features_embl(features || []) %>XX   
--- 172,181 ----
  XX   
  <% hash = {}; (references || []).each do |ref| %><%= reference_format_embl(ref, hash) %>
! <% end %><% (dblinks || []).each do |r|
! %>DR   <%= r.database %>; <%= r.id %><% unless r.secondary_ids.empty? %>; <%= r.secondary_ids[0] %><% end %>.
! <% end %><% if dblinks and !dblinks.empty? then
!  %>XX   
! <% end %><%= comments_format_embl(comments)
! %>FH   Key             Location/Qualifiers
  FH   
  <%= format_features_embl(features || []) %>XX   


From ngoto at dev.open-bio.org  Tue Jun 17 16:09:55 2008
From: ngoto at dev.open-bio.org (Naohisa Goto)
Date: Tue, 17 Jun 2008 16:09:55 +0000
Subject: [BioRuby-cvs] bioruby/test/unit/bio/db/embl test_embl_to_bioseq.rb,
	1.1.2.1, 1.1.2.2
Message-ID: <200806171609.m5HG9tFR021392@dev.open-bio.org>

Update of /home/repository/bioruby/bioruby/test/unit/bio/db/embl
In directory dev.open-bio.org:/tmp/cvs-serv21372/test/unit/bio/db/embl

Modified Files:
      Tag: BRANCH-biohackathon2008
	test_embl_to_bioseq.rb 
Log Message:
Unit tests related to Bio::Sequence#date_created and date_modified were
changed because these methods now store Date (or Time or DateTime)
objects instead of String objects.


Index: test_embl_to_bioseq.rb
===================================================================
RCS file: /home/repository/bioruby/bioruby/test/unit/bio/db/embl/Attic/test_embl_to_bioseq.rb,v
retrieving revision 1.1.2.1
retrieving revision 1.1.2.2
diff -C2 -d -r1.1.2.1 -r1.1.2.2
*** test_embl_to_bioseq.rb	20 Feb 2008 09:56:22 -0000	1.1.2.1
--- test_embl_to_bioseq.rb	17 Jun 2008 16:09:53 -0000	1.1.2.2
***************
*** 53,59 ****
      end
      
!     def test_dates
!       assert_equal('25-OCT-2002 (Rel. 73, Created)', @bio_seq.date_created)
!       assert_equal('14-NOV-2006 (Rel. 89, Last updated, Version 3)', @bio_seq.date_modified)
      end
      
--- 53,76 ----
      end
      
!     def test_date_created
!       # '25-OCT-2002 (Rel. 73, Created)'
!       assert_equal(Date.parse('25-OCT-2002'), @bio_seq.date_created)
!     end
! 
!     def test_date_modified
!       # '14-NOV-2006 (Rel. 89, Last updated, Version 3)'
!       assert_equal(Date.parse('14-NOV-2006'), @bio_seq.date_modified)
!     end
! 
!     def test_release_created
!       assert_equal('73', @bio_seq.release_created)
!     end
! 
!     def test_release_modified
!       assert_equal('89', @bio_seq.release_modified)
!     end
! 
!     def test_entry_version
!       assert_equal('3', @bio_seq.entry_version)
      end
      
***************
*** 129,135 ****
      end
      
!     def test_dates
!       assert_equal('25-OCT-2002 (Rel. 73, Created)', @bio_seq_2.date_created)
!       assert_equal('14-NOV-2006 (Rel. 89, Last updated, Version 3)', @bio_seq_2.date_modified)
      end
      
--- 146,169 ----
      end
      
!     def test_date_created
!       # '25-OCT-2002 (Rel. 73, Created)'
!       assert_equal(Date.parse('25-OCT-2002'), @bio_seq_2.date_created)
!     end
! 
!     def test_date_modified
!       # '14-NOV-2006 (Rel. 89, Last updated, Version 3)'
!       assert_equal(Date.parse('14-NOV-2006'), @bio_seq_2.date_modified)
!     end
! 
!     def test_release_created
!       assert_equal('73', @bio_seq_2.release_created)
!     end
! 
!     def test_release_modified
!       assert_equal('89', @bio_seq_2.release_modified)
!     end
! 
!     def test_entry_version
!       assert_equal('3', @bio_seq_2.entry_version)
      end
      

From ngoto at dev.open-bio.org  Thu Jun 19 12:45:18 2008
From: ngoto at dev.open-bio.org (Naohisa Goto)
Date: Thu, 19 Jun 2008 12:45:18 +0000
Subject: [BioRuby-cvs] bioruby/lib/bio/db/embl format_embl.rb, 1.1.2.6,
	1.1.2.7
Message-ID: <200806191245.m5JCjIps000652@dev.open-bio.org>

Update of /home/repository/bioruby/bioruby/lib/bio/db/embl
In directory dev.open-bio.org:/tmp/cvs-serv596/lib/bio/db/embl

Modified Files:
      Tag: BRANCH-biohackathon2008
	format_embl.rb 
Log Message:
avoid error when keywords or classification is nil


Index: format_embl.rb
===================================================================
RCS file: /home/repository/bioruby/bioruby/lib/bio/db/embl/Attic/format_embl.rb,v
retrieving revision 1.1.2.6
retrieving revision 1.1.2.7
diff -C2 -d -r1.1.2.6 -r1.1.2.7
*** format_embl.rb	17 Jun 2008 16:06:04 -0000	1.1.2.6
--- format_embl.rb	19 Jun 2008 12:45:15 -0000	1.1.2.7
***************
*** 166,173 ****
  <%= embl_wrap('DE   ', definition) %>
  XX   
! <%= embl_wrap('KW   ', keywords.join('; ') + '.') %>
  XX   
  OS   <%= species %>
! <%= embl_wrap('OC   ', classification.join('; ') + '.') %>
  XX   
  <% hash = {}; (references || []).each do |ref| %><%= reference_format_embl(ref, hash) %>
--- 166,173 ----
  <%= embl_wrap('DE   ', definition) %>
  XX   
! <%= embl_wrap('KW   ', (keywords || []).join('; ') + '.') %>
  XX   
  OS   <%= species %>
! <%= embl_wrap('OC   ', (classification || []).join('; ') + '.') %>
  XX   
  <% hash = {}; (references || []).each do |ref| %><%= reference_format_embl(ref, hash) %>


From ngoto at dev.open-bio.org  Fri Jun 20 13:22:34 2008
From: ngoto at dev.open-bio.org (Naohisa Goto)
Date: Fri, 20 Jun 2008 13:22:34 +0000
Subject: [BioRuby-cvs] bioruby/lib/bio/db fasta.rb,1.28,1.28.2.1
Message-ID: <200806201322.m5KDMYOR021703@dev.open-bio.org>

Update of /home/repository/bioruby/bioruby/lib/bio/db
In directory dev.open-bio.org:/tmp/cvs-serv21681

Modified Files:
      Tag: BRANCH-biohackathon2008
	fasta.rb 
Log Message:
Split Bio::FastaDefline class into lib/bio/db/fasta/defline.rb


Index: fasta.rb
===================================================================
RCS file: /home/repository/bioruby/bioruby/lib/bio/db/fasta.rb,v
retrieving revision 1.28
retrieving revision 1.28.2.1
diff -C2 -d -r1.28 -r1.28.2.1
*** fasta.rb	5 Apr 2007 23:35:40 -0000	1.28
--- fasta.rb	20 Jun 2008 13:22:31 -0000	1.28.2.1
***************
*** 15,57 ****
  # == Examples
  #
! #       rub = Bio::FastaDefline.new('>gi|671595|emb|CAA85678.1| rubisco large subunit [Perovskia abrotanoides]')
! #       rub.entry_id       ==> 'gi|671595'
! #       rub.get('emb')     ==> 'CAA85678.1'
! #       rub.emb            ==> 'CAA85678.1'
! #       rub.gi             ==> '671595'
! #       rub.accession      ==> 'CAA85678'
! #       rub.accessions     ==> [ 'CAA85678' ]
! #       rub.acc_version    ==> 'CAA85678.1'
! #       rub.locus          ==> nil
! #       rub.list_ids       ==> [["gi", "671595"],
! #                               ["emb", "CAA85678.1", nil],
! #                               ["Perovskia abrotanoides"]]
! #
! #       ckr = Bio::FastaDefline.new(">gi|2495000|sp|Q63931|CCKR_CAVPO CHOLECYSTOKININ TYPE A RECEPTOR (CCK-A RECEPTOR) (CCK-AR)\001gi|2147182|pir||I51898 cholecystokinin A receptor - guinea pig\001gi|544724|gb|AAB29504.1| cholecystokinin A receptor; CCK-A receptor [Cavia]")
! #       ckr.entry_id      ==> "gi|2495000"
! #       ckr.sp            ==> "CCKR_CAVPO"
! #       ckr.pir           ==> "I51898"
! #       ckr.gb            ==> "AAB29504.1"
! #       ckr.gi            ==> "2495000"
! #       ckr.accession     ==> "AAB29504"
! #       ckr.accessions    ==> ["Q63931", "AAB29504"]
! #       ckr.acc_version   ==> "AAB29504.1"
! #       ckr.locus         ==> nil
! #       ckr.description   ==>
! #         "CHOLECYSTOKININ TYPE A RECEPTOR (CCK-A RECEPTOR) (CCK-AR)"
! #       ckr.descriptions  ==>
! #         ["CHOLECYSTOKININ TYPE A RECEPTOR (CCK-A RECEPTOR) (CCK-AR)",
! #          "cholecystokinin A receptor - guinea pig",
! #          "cholecystokinin A receptor; CCK-A receptor [Cavia]"]
! #       ckr.words         ==> 
! #         ["cavia", "cck-a", "cck-ar", "cholecystokinin", "guinea", "pig",
! #          "receptor", "type"]
! #       ckr.id_strings    ==>
! #         ["2495000", "Q63931", "CCKR_CAVPO", "2147182", "I51898",
! #          "544724", "AAB29504.1", "Cavia"]
! #       ckr.list_ids      ==>
! #         [["gi", "2495000"], ["sp", "Q63931", "CCKR_CAVPO"],
! #          ["gi", "2147182"], ["pir", nil, "I51898"], ["gi", "544724"],
! #          ["gb", "AAB29504.1", nil], ["Cavia"]]
  #
  # == References
--- 15,19 ----
  # == Examples
  #
! # See documents of Bio::FastaFormat class.
  #
  # == References
***************
*** 66,69 ****
--- 28,32 ----
  require 'bio/db'
  require 'bio/sequence'
+ require 'bio/db/fasta/defline'
  
  module Bio
***************
*** 363,825 ****
    end #class FastaNumericFormat
  
- 
-   # Parsing FASTA Defline, and extract IDs and other informations.
-   # IDs are NSIDs (NCBI standard FASTA sequence identifiers)
-   # or ":"-separated IDs.
-   # 
-   # specs are described in:
-   # ftp://ftp.ncbi.nih.gov/blast/documents/README.formatdb
-   # http://blast.wustl.edu/doc/FAQ-Indexing.html#Identifiers
-   #
-   # === Examples
-   #
-   #   rub = Bio::FastaDefline.new('>gi|671595|emb|CAA85678.1| rubisco large subunit [Perovskia abrotanoides]')
-   #   rub.entry_id       ==> 'gi|671595'
-   #   rub.get('emb')     ==> 'CAA85678.1'
-   #   rub.emb            ==> 'CAA85678.1'
-   #   rub.gi             ==> '671595'
-   #   rub.accession      ==> 'CAA85678'
-   #   rub.accessions     ==> [ 'CAA85678' ]
-   #   rub.acc_version    ==> 'CAA85678.1'
-   #   rub.locus          ==> nil
-   #   rub.list_ids       ==> [["gi", "671595"],
-   #                           ["emb", "CAA85678.1", nil],
-   #                           ["Perovskia abrotanoides"]]
-   #
-   #   ckr = Bio::FastaDefline.new(">gi|2495000|sp|Q63931|CCKR_CAVPO CHOLECYSTOKININ TYPE A RECEPTOR (CCK-A RECEPTOR) (CCK-AR)\001gi|2147182|pir||I51898 cholecystokinin A receptor - guinea pig\001gi|544724|gb|AAB29504.1| cholecystokinin A receptor; CCK-A receptor [Cavia]")
-   #   ckr.entry_id      ==> "gi|2495000"
-   #   ckr.sp            ==> "CCKR_CAVPO"
-   #   ckr.pir           ==> "I51898"
-   #   ckr.gb            ==> "AAB29504.1"
-   #   ckr.gi            ==> "2495000"
-   #   ckr.accession     ==> "AAB29504"
-   #   ckr.accessions    ==> ["Q63931", "AAB29504"]
-   #   ckr.acc_version   ==> "AAB29504.1"
-   #   ckr.locus         ==> nil
-   #   ckr.description   ==>
-   #     "CHOLECYSTOKININ TYPE A RECEPTOR (CCK-A RECEPTOR) (CCK-AR)"
-   #   ckr.descriptions  ==>
-   #     ["CHOLECYSTOKININ TYPE A RECEPTOR (CCK-A RECEPTOR) (CCK-AR)",
-   #      "cholecystokinin A receptor - guinea pig",
-   #      "cholecystokinin A receptor; CCK-A receptor [Cavia]"]
-   #   ckr.words         ==> 
-   #     ["cavia", "cck-a", "cck-ar", "cholecystokinin", "guinea", "pig",
-   #      "receptor", "type"]
-   #   ckr.id_strings    ==>
-   #     ["2495000", "Q63931", "CCKR_CAVPO", "2147182", "I51898",
-   #      "544724", "AAB29504.1", "Cavia"]
-   #   ckr.list_ids      ==>
-   #     [["gi", "2495000"], ["sp", "Q63931", "CCKR_CAVPO"],
-   #      ["gi", "2147182"], ["pir", nil, "I51898"], ["gi", "544724"],
-   #      ["gb", "AAB29504.1", nil], ["Cavia"]]
-   #
-   # === Refereneces
-   #
-   # * Fasta format description (NCBI)
-   #   http://www.ncbi.nlm.nih.gov/BLAST/fasta.shtml
-   #
-   # * Frequently Asked Questions:  Indexing of Sequence Identifiers (by Warren R. Gish.)
-   #   http://blast.wustl.edu/doc/FAQ-Indexing.html#Identifiers
-   #
-   # * README.formatdb
-   #   ftp://ftp.ncbi.nih.gov/blast/documents/README.formatdb
-   # 
-   class FastaDefline
- 
-     NSIDs = {
-       # NCBI and WU-BLAST
-       'gi'  => [ 'gi' ],                      # NCBI GI
-       'gb'  => [ 'acc_version', 'locus' ],      # GenBank
-       'emb' => [ 'acc_version', 'locus' ],      # EMBL
-       'dbj' => [ 'acc_version', 'locus' ],      # DDBJ
-       'sp'  => [ 'accession', 'entry_id' ],   # SWISS-PROT
-       'pdb' => [ 'entry_id', 'chain' ],       # PDB
-       'bbs' => [ 'number' ],                  # GenInfo Backbone Id
-       'gnl' => [ 'database' , 'entry_id' ],   # General database identifier
-       'ref' => [ 'acc_version' , 'locus' ],     # NCBI Reference Sequence
-       'lcl' => [ 'entry_id' ],                # Local Sequence identifier
- 
-       # WU-BLAST and NCBI
-       'pir' => [ 'accession', 'entry_id' ],   # PIR
-       'prf' => [ 'accession', 'entry_id' ],   # Protein Research Foundation
-       'pat' => [ 'country', 'number', 'serial' ], # Patents
- 
-       # WU-BLAST only
-       'bbm' => [ 'number' ],      # NCBI GenInfo Backbone database identifier
-       'gim' => [ 'number' ],      # NCBI GenInfo Import identifier
-       'gp'  => [ 'acc_version', 'locus' ],      # GenPept
-       'oth' => [ 'accession', 'name', 'release' ],  # Other (user-definable) identifier
-       'tpd' => [ 'accession', 'name' ],       # Third party annotation, DDBJ
-       'tpe' => [ 'accession', 'name' ],       # Third party annotation, EMBL
-       'tpg' => [ 'accession', 'name' ],       # Third party annotation, GenBank
- 
-       # Original
-       'ri'  => [ 'entry_id', 'rearray_id', 'len' ], # RIKEN FANTOM DB
-     }
- 
-     # Shows array that contains IDs (or ID-like strings).
-     # Returns an array of arrays of strings.
-     attr_reader :list_ids
- 
-     # Shows a possibly unique identifier.
-     # Returns a string.
-     attr_reader :entry_id
- 
-     # Parses given string.
-     def initialize(str)
-       @deflines = []
-       @info = {}
-       @list_ids = []
- 
-       @entry_id = nil
- 
-       lines = str.split("\x01")
-       lines.each do |line|
-         add_defline(line)
-       end
-     end #def initialize
- 
-     # Parses given string and adds parsed data.
-     def add_defline(str)
-       case str
-       when /^\>?\s*((?:[^\|\s]*\|)+[^\s]+)\s*(.*)$/
-         # NSIDs
-         # examples:
-         # >gi|9910844|sp|Q9UWG2|RL3_METVA 50S ribosomal protein L3P
-         #
-         # note: regexp (:?) means grouping without backreferences
-         i = $1
-         d = $2
-         tks = i.split('|')
-         tks << '' if i[-1,1] == '|'
-         a = parse_NSIDs(tks)
-         i = a[0].join('|')
-         a.unshift('|')
-         d = tks.join('|') + ' ' + d unless tks.empty?
-         a << d
-         this_line = a
-         match_EC(d)
-         parse_square_brackets(d).each do |x|
-           if !match_EC(x, false) and x =~ /\A[A-Z]/ then
-             di = [  x ]
-             @list_ids << di
-             @info['organism'] = x unless @info['organism']
-           end
-         end
- 
-       when /^\>?\s*([a-zA-Z0-9]+\:[^\s]+)\s*(.*)$/
-         # examples:
-         # >sce:YBR160W  CDC28, SRM5; cyclin-dependent protein kinase catalytic subunit [EC:2.7.1.-] [SP:CC28_YEAST]
-         # >emb:CACDC28 [X80034] C.albicans CDC28 gene 
-         i = $1
-         d = $2
-         a = parse_ColonSepID(i)
-         i = a.join(':')
-         this_line = [ ':', a , d ]
-         match_EC(d)
-         parse_square_brackets(d).each do |x|
-           if !match_EC(x, false) and x =~ /:/ then
-             parse_ColonSepID(x)
-           elsif x =~ /\A\s*([A-Z][A-Z0-9_\.]+)\s*\z/ then
-             @list_ids << [ $1 ]
-           end
-         end
- 
-       when /^\>?\s*(\S+)(?:\s+(.+))?$/
-         # examples:
-         # >ABC12345 this is test
-         i = $1
-         d = $2.to_s
-         @list_ids << [ i.chomp('.') ]
-         this_line = [  '', [ i ], d ]
-         match_EC(d)
-       else
-         i = str
-         d = ''
-         match_EC(i)
-         this_line = [ '', [ i ], d ]
-       end
- 
-       @deflines << this_line
-       @entry_id = i unless @entry_id
-     end
- 
-     def match_EC(str, write_flag = true)
-       di = nil
-       str.scan(/EC\:((:?[\-\d]+\.){3}(:?[\-\d]+))/i) do |x|
-         di = [ 'EC', $1 ]
-         if write_flag then
-           @info['ec'] = di[1] if (!@info['ec'] or @info['ec'].to_s =~ /\-/)
-           @list_ids << di
-         end
-       end
-       di
-     end
-     private :match_EC
- 
-     def parse_square_brackets(str)
-       r = []
-       str.scan(/\[([^\]]*)\]/) do |x|
-         r << x[0]
-       end
-       r
-     end
-     private :parse_square_brackets
- 
-     def parse_ColonSepID(str)
-       di = str.split(':', 2)
-       di << nil if di.size <= 1 
-       @list_ids << di
-       di
-     end
-     private :parse_ColonSepID
- 
-     def parse_NSIDs(ary)
-       # this method destroys ary
-       data = []
-       while token = ary.shift
-         if labels = self.class::NSIDs[token] then
-           di = [ token ]
-           idtype = token
-           labels.each do |x|
-             token = ary.shift
-             break unless token
-             if self.class::NSIDs[token] then
-               ary.unshift(token)
-               break #each
-             end
-             if token.length > 0 then
-               di << token
-             else
-               di << nil
-             end
-           end
-           data << di
-         else
-           if token.length > 0 then
-             # UCID (uncontrolled identifiers)
-             di = [ token ]
-             data << di
-             @info['ucid'] = token unless @info['ucid']
-           end
-           break #while
-         end
-       end #while
-       @list_ids.concat data
-       data
-     end #def parse_NSIDs
-     private :parse_NSIDs
- 
- 
-     # Shows original string.
-     # Note that the result of this method may be different from
-     # original string which is given in FastaDefline.new method.
-     def to_s
-       @deflines.collect { |a|
-         s = a[0]
-         (a[1..-2].collect { |x| x.join(s) }.join(s) + ' ' + a[-1]).strip
-       }.join("\x01")
-     end
- 
-     # Shows description.
-     def description
-       @deflines[0].to_a[-1]
-     end
- 
-     # Returns descriptions.
-     def descriptions
-       @deflines.collect do |a|
-         a[-1]
-       end
-     end
- 
-     # Shows ID-like strings.
-     # Returns an array of strings.
-     def id_strings
-       r = []
-       @list_ids.each do |a|
-         if a.size >= 2 then
-           r.concat a[1..-1].find_all { |x| x }
-         else
-           if a[0].to_s.size > 0 and a[0] =~ /\A[A-Za-z0-9\.\-\_]+\z/
-             r << a[0]
-           end
-         end
-       end
-       r.concat( words(true, []).find_all do |x|
-                  x =~ /\A[A-Z][A-Za-z0-9\_]*[0-9]+[A-Za-z0-9\_]+\z/ or
-                    x =~ /\A[A-Z][A-Z0-9]*\_[A-Z0-9\_]+\z/
-                end)
-       r
-     end
- 
-     KillWords = [
-       'an', 'the', 'this', 'that',
-       'is', 'are', 'were', 'was', 'be', 'can', 'may', 'might',
-       'as', 'at', 'by', 'for', 'in', 'of', 'on', 'to', 'with',
-       'from', 'and', 'or', 'not',
-       'dna', 'rna', 'mrna', 'cdna', 'orf',
-       'aa', 'nt', 'pct', 'id', 'ec', 'sp', 'subsp',
-       'similar', 'involved', 'identical', 'identity',
-       'cds', 'clone', 'library', 'contig', 'contigs',
-       'homolog', 'homologue', 'homologs', 'homologous',
-       'protein', 'proteins', 'gene', 'genes',
-       'product', 'products', 'sequence', 'sequences', 
-       'strain', 'strains', 'region', 'regions',
-     ]
-     KillWordsHash = {}
-     KillWords.each { |x| KillWordsHash[x] = true }
- 
-     KillRegexpArray = [
-       /\A\d{1,3}\%?\z/,
-       /\A[A-Z][A-Za-z0-9\_]*[0-9]+[A-Za-z0-9\_]+\z/,
-       /\A[A-Z][A-Z0-9]*\_[A-Z0-9\_]+\z/
-     ]
- 
-     # Shows words used in the defline. Returns an Array.
-     def words(case_sensitive = nil, kill_regexp = self.class::KillRegexpArray,
-               kwhash = self.class::KillWordsHash)
-       a = descriptions.join(' ').split(/[\.\,\;\:\(\)\[\]\{\}\<\>\"\'\`\~\/\|\?\!\&\@\#\s\x00-\x1f\x7f]+/)
-       a.collect! do |x|
-         x.sub!(/\A[\$\*\-\+]+/, '')
-         x.sub!(/[\$\*\-\=]+\z/, '')
-         if x.size <= 1 then
-           nil
-         elsif kwhash[x.downcase] then
-           nil
-         else
-           if kill_regexp.find { |expr| expr =~ x } then
-             nil
-           else
-             x
-           end
-         end
-       end
-       a.compact!
-       a.collect! { |x| x.downcase } unless case_sensitive
-       a.sort!
-       a.uniq!
-       a
-     end
- 
-     # Returns identifires by a database name.
-     def get(dbname)
-       db = dbname.to_s
-       r = nil
-       unless r = @info[db] then
-         di = @list_ids.find { |x| x[0] == db.to_s }
-         if di and di.size <= 2 then
-           r = di[-1]
-         elsif di then
-           labels = self.class::NSIDs[db]
-           [ 'acc_version', 'entry_id',
-             'locus', 'accession', 'number'].each do |x|
-             if i = labels.index(x) then
-               r = di[i+1]
-               break if r
-             end
-           end
-           r = di[1..-1].find { |x| x } unless r
-         end
-         @info[db] = r if r
-       end
-       r
-     end
- 
-     # Returns an identifier by given type.
-     def get_by_type(type_str)
-       @list_ids.each do |x|
-         if labels = self.class::NSIDs[x[0]] then
-           if i = labels.index(type_str) then
-             return x[i+1]
-           end
-         end
-       end
-       nil
-     end
- 
-     # Returns identifiers by given type.
-     def get_all_by_type(*type_strarg)
-       d = []
-       @list_ids.each do |x|
-         if labels = self.class::NSIDs[x[0]] then
-           type_strarg.each do |y|
-             if i = labels.index(y) then
-               d << x[i+1] if x[i+1]
-             end
-           end
-         end
-       end
-       d
-     end
- 
-     # Shows locus.
-     # If the entry has more than two of such IDs,
-     # only the first ID are shown.
-     # Returns a string or nil.
-     def locus
-       unless defined?(@locus)
-         @locus = get_by_type('locus')
-       end
-       @locus
-     end
- 
-     # Shows GI.
-     # If the entry has more than two of such IDs,
-     # only the first ID are shown.
-     # Returns a string or nil.
-     def gi
-       unless defined?(@gi) then
-         @gi = get_by_type('gi')
-       end
-       @gi
-     end
- 
-     # Shows accession with version number.
-     # If the entry has more than two of such IDs,
-     # only the first ID are shown.
-     # Returns a string or nil.
-     def acc_version
-       unless defined?(@acc_version) then
-         @acc_version = get_by_type('acc_version')
-       end
-       @acc_version
-     end
- 
-     # Shows accession numbers.
-     # Returns an array of strings.
-     def accessions
-       unless defined?(@accessions) then
-         @accessions = get_all_by_type('accession', 'acc_version')
-         @accessions.collect! { |x| x.sub(/\..*\z/, '') }
-       end
-       @accessions
-     end
- 
-     # Shows an accession number.
-     def accession
-       unless defined?(@accession) then
-         if acc_version then
-           @accession = acc_version.split('.')[0]
-         else
-           @accession = accessions[0]
-         end
-       end
-       @accession
-     end
-     
-     def method_missing(name, *args)
-       # raise ArgumentError,
-       # "wrong # of arguments(#{args.size} for 1)" if args.size >= 2
-       r = get(name, *args)
-       if !r and !(self.class::NSIDs[name.to_s]) then
-         raise "NameError: undefined method `#{name.inspect}'"
-       end
-       r
-     end
-     
- 
-   end #class FastaDefline
- 
  end #module Bio
  
--- 326,329 ----


From ngoto at dev.open-bio.org  Fri Jun 20 13:22:34 2008
From: ngoto at dev.open-bio.org (Naohisa Goto)
Date: Fri, 20 Jun 2008 13:22:34 +0000
Subject: [BioRuby-cvs] bioruby/lib/bio/db/fasta defline.rb,NONE,1.1.2.1
Message-ID: <200806201322.m5KDMYlh021706@dev.open-bio.org>

Update of /home/repository/bioruby/bioruby/lib/bio/db/fasta
In directory dev.open-bio.org:/tmp/cvs-serv21681/fasta

Added Files:
      Tag: BRANCH-biohackathon2008
	defline.rb 
Log Message:
Split Bio::FastaDefline class into lib/bio/db/fasta/defline.rb


--- NEW FILE: defline.rb ---
#
# = bio/db/fasta/defline.rb - FASTA defline parser class
#
# Copyright::  Copyright (C) 2001, 2002
#              GOTO Naohisa <ngoto at gen-info.osaka-u.ac.jp>,
#              Toshiaki Katayama <k at bioruby.org>
# License::    The Ruby License
#
# $Id: defline.rb,v 1.1.2.1 2008/06/20 13:22:32 ngoto Exp $
# 
# == Description
# 
# Bio::FastaDefline is a parser class for the definition line (defline)
# of the FASTA format.
#
# == Examples
#
#       rub = Bio::FastaDefline.new('>gi|671595|emb|CAA85678.1| rubisco large subunit [Perovskia abrotanoides]')
#       rub.entry_id       ==> 'gi|671595'
#       rub.get('emb')     ==> 'CAA85678.1'
#       rub.emb            ==> 'CAA85678.1'
#       rub.gi             ==> '671595'
#       rub.accession      ==> 'CAA85678'
#       rub.accessions     ==> [ 'CAA85678' ]
#       rub.acc_version    ==> 'CAA85678.1'
#       rub.locus          ==> nil
#       rub.list_ids       ==> [["gi", "671595"],
#                               ["emb", "CAA85678.1", nil],
#                               ["Perovskia abrotanoides"]]
#
#       ckr = Bio::FastaDefline.new(">gi|2495000|sp|Q63931|CCKR_CAVPO CHOLECYSTOKININ TYPE A RECEPTOR (CCK-A RECEPTOR) (CCK-AR)\001gi|2147182|pir||I51898 cholecystokinin A receptor - guinea pig\001gi|544724|gb|AAB29504.1| cholecystokinin A receptor; CCK-A receptor [Cavia]")
#       ckr.entry_id      ==> "gi|2495000"
#       ckr.sp            ==> "CCKR_CAVPO"
#       ckr.pir           ==> "I51898"
#       ckr.gb            ==> "AAB29504.1"
#       ckr.gi            ==> "2495000"
#       ckr.accession     ==> "AAB29504"
#       ckr.accessions    ==> ["Q63931", "AAB29504"]
#       ckr.acc_version   ==> "AAB29504.1"
#       ckr.locus         ==> nil
#       ckr.description   ==>
#         "CHOLECYSTOKININ TYPE A RECEPTOR (CCK-A RECEPTOR) (CCK-AR)"
#       ckr.descriptions  ==>
#         ["CHOLECYSTOKININ TYPE A RECEPTOR (CCK-A RECEPTOR) (CCK-AR)",
#          "cholecystokinin A receptor - guinea pig",
#          "cholecystokinin A receptor; CCK-A receptor [Cavia]"]
#       ckr.words         ==> 
#         ["cavia", "cck-a", "cck-ar", "cholecystokinin", "guinea", "pig",
#          "receptor", "type"]
#       ckr.id_strings    ==>
#         ["2495000", "Q63931", "CCKR_CAVPO", "2147182", "I51898",
#          "544724", "AAB29504.1", "Cavia"]
#       ckr.list_ids      ==>
#         [["gi", "2495000"], ["sp", "Q63931", "CCKR_CAVPO"],
#          ["gi", "2147182"], ["pir", nil, "I51898"], ["gi", "544724"],
#          ["gb", "AAB29504.1", nil], ["Cavia"]]
#
# == References
#
# * FASTA format (Wikipedia)
#   http://en.wikipedia.org/wiki/FASTA_format
#   
# * Fasta format description (NCBI)
#   http://www.ncbi.nlm.nih.gov/BLAST/fasta.shtml
#

module Bio

  #--
  # split from fasta.rb revision 1.28
  #++

  # Parses a FASTA defline and extracts IDs and other information.
  # IDs are NSIDs (NCBI standard FASTA sequence identifiers)
  # or ":"-separated IDs.
  # 
  # specs are described in:
  # ftp://ftp.ncbi.nih.gov/blast/documents/README.formatdb
  # http://blast.wustl.edu/doc/FAQ-Indexing.html#Identifiers
  #
  # === Examples
  #
  #   rub = Bio::FastaDefline.new('>gi|671595|emb|CAA85678.1| rubisco large subunit [Perovskia abrotanoides]')
  #   rub.entry_id       ==> 'gi|671595'
  #   rub.get('emb')     ==> 'CAA85678.1'
  #   rub.emb            ==> 'CAA85678.1'
  #   rub.gi             ==> '671595'
  #   rub.accession      ==> 'CAA85678'
  #   rub.accessions     ==> [ 'CAA85678' ]
  #   rub.acc_version    ==> 'CAA85678.1'
  #   rub.locus          ==> nil
  #   rub.list_ids       ==> [["gi", "671595"],
  #                           ["emb", "CAA85678.1", nil],
  #                           ["Perovskia abrotanoides"]]
  #
  #   ckr = Bio::FastaDefline.new(">gi|2495000|sp|Q63931|CCKR_CAVPO CHOLECYSTOKININ TYPE A RECEPTOR (CCK-A RECEPTOR) (CCK-AR)\001gi|2147182|pir||I51898 cholecystokinin A receptor - guinea pig\001gi|544724|gb|AAB29504.1| cholecystokinin A receptor; CCK-A receptor [Cavia]")
  #   ckr.entry_id      ==> "gi|2495000"
  #   ckr.sp            ==> "CCKR_CAVPO"
  #   ckr.pir           ==> "I51898"
  #   ckr.gb            ==> "AAB29504.1"
  #   ckr.gi            ==> "2495000"
  #   ckr.accession     ==> "AAB29504"
  #   ckr.accessions    ==> ["Q63931", "AAB29504"]
  #   ckr.acc_version   ==> "AAB29504.1"
  #   ckr.locus         ==> nil
  #   ckr.description   ==>
  #     "CHOLECYSTOKININ TYPE A RECEPTOR (CCK-A RECEPTOR) (CCK-AR)"
  #   ckr.descriptions  ==>
  #     ["CHOLECYSTOKININ TYPE A RECEPTOR (CCK-A RECEPTOR) (CCK-AR)",
  #      "cholecystokinin A receptor - guinea pig",
  #      "cholecystokinin A receptor; CCK-A receptor [Cavia]"]
  #   ckr.words         ==> 
  #     ["cavia", "cck-a", "cck-ar", "cholecystokinin", "guinea", "pig",
  #      "receptor", "type"]
  #   ckr.id_strings    ==>
  #     ["2495000", "Q63931", "CCKR_CAVPO", "2147182", "I51898",
  #      "544724", "AAB29504.1", "Cavia"]
  #   ckr.list_ids      ==>
  #     [["gi", "2495000"], ["sp", "Q63931", "CCKR_CAVPO"],
  #      ["gi", "2147182"], ["pir", nil, "I51898"], ["gi", "544724"],
  #      ["gb", "AAB29504.1", nil], ["Cavia"]]
  #
  # === References
  #
  # * Fasta format description (NCBI)
  #   http://www.ncbi.nlm.nih.gov/BLAST/fasta.shtml
  #
  # * Frequently Asked Questions:  Indexing of Sequence Identifiers (by Warren R. Gish.)
  #   http://blast.wustl.edu/doc/FAQ-Indexing.html#Identifiers
  #
  # * README.formatdb
  #   ftp://ftp.ncbi.nih.gov/blast/documents/README.formatdb
  # 
  class FastaDefline

    # Known NSID database tags.  Each tag maps to the ordered list of field
    # labels that follow the tag in a "|"-separated identifier, e.g.
    # "gb|<acc_version>|<locus>".  Looked up through self.class so that a
    # subclass may supply its own table.
    NSIDs = {
      # NCBI and WU-BLAST
      'gi'  => [ 'gi' ],                      # NCBI GI
      'gb'  => [ 'acc_version', 'locus' ],      # GenBank
      'emb' => [ 'acc_version', 'locus' ],      # EMBL
      'dbj' => [ 'acc_version', 'locus' ],      # DDBJ
      'sp'  => [ 'accession', 'entry_id' ],   # SWISS-PROT
      'pdb' => [ 'entry_id', 'chain' ],       # PDB
      'bbs' => [ 'number' ],                  # GenInfo Backbone Id
      'gnl' => [ 'database' , 'entry_id' ],   # General database identifier
      'ref' => [ 'acc_version' , 'locus' ],     # NCBI Reference Sequence
      'lcl' => [ 'entry_id' ],                # Local Sequence identifier

      # WU-BLAST and NCBI
      'pir' => [ 'accession', 'entry_id' ],   # PIR
      'prf' => [ 'accession', 'entry_id' ],   # Protein Research Foundation
      'pat' => [ 'country', 'number', 'serial' ], # Patents

      # WU-BLAST only
      'bbm' => [ 'number' ],      # NCBI GenInfo Backbone database identifier
      'gim' => [ 'number' ],      # NCBI GenInfo Import identifier
      'gp'  => [ 'acc_version', 'locus' ],      # GenPept
      'oth' => [ 'accession', 'name', 'release' ],  # Other (user-definable) identifier
      'tpd' => [ 'accession', 'name' ],       # Third party annotation, DDBJ
      'tpe' => [ 'accession', 'name' ],       # Third party annotation, EMBL
      'tpg' => [ 'accession', 'name' ],       # Third party annotation, GenBank

      # Original
      'ri'  => [ 'entry_id', 'rearray_id', 'len' ], # RIKEN FANTOM DB
    }

    # Shows array that contains IDs (or ID-like strings).
    # Returns an array of arrays of strings.
    attr_reader :list_ids

    # Shows a possibly unique identifier.
    # Returns a string.
    attr_reader :entry_id

    # Parses given string.
    # A string may hold several deflines separated by "\x01" (Ctrl-A);
    # each of them is handed to add_defline in order.
    def initialize(str)
      @deflines = []
      @info = {}
      @list_ids = []

      @entry_id = nil

      str.split("\x01").each { |line| add_defline(line) }
    end #def initialize

    # Parses given string and adds parsed data.
    #
    # Three layouts are recognized, tried in this order:
    # * "|"-separated NSIDs          (">gi|123|gb|AB000001.1| desc")
    # * ":"-separated database IDs   (">sce:YBR160W desc")
    # * a bare ID followed by text   (">ABC12345 desc")
    # Anything else is stored verbatim as an ID with empty description.
    #
    # The first ID seen becomes entry_id; later calls never overwrite it.
    def add_defline(str)
      # Values derived from @list_ids are memoized by the accessors;
      # forget them so that they reflect the line being added.
      reset_memoized_ids

      case str
      when /^\>?\s*((?:[^\|\s]*\|)+[^\s]+)\s*(.*)$/
        # NSIDs
        # examples:
        # >gi|9910844|sp|Q9UWG2|RL3_METVA 50S ribosomal protein L3P
        #
        # note: regexp (?:) means grouping without backreferences
        i = $1
        d = $2
        tks = i.split('|')
        # String#split drops a trailing empty field; put it back so that
        # "gb|X|" still yields an (empty) second field.
        tks << '' if i[-1,1] == '|'
        a = parse_NSIDs(tks)
        # parse_NSIDs returns nothing for input such as "|foo"; keep the
        # raw identifier in that case instead of calling join on nil.
        i = a[0].join('|') unless a.empty?
        a.unshift('|')
        # tokens parse_NSIDs did not consume belong to the description
        d = tks.join('|') + ' ' + d unless tks.empty?
        a << d
        this_line = a
        match_EC(d)
        parse_square_brackets(d).each do |x|
          # a bracketed, capitalized, non-EC text is taken as organism name
          if !match_EC(x, false) && x =~ /\A[A-Z]/
            @list_ids << [ x ]
            @info['organism'] = x unless @info['organism']
          end
        end

      when /^\>?\s*([a-zA-Z0-9]+\:[^\s]+)\s*(.*)$/
        # examples:
        # >sce:YBR160W  CDC28, SRM5; cyclin-dependent protein kinase catalytic subunit [EC:2.7.1.-] [SP:CC28_YEAST]
        # >emb:CACDC28 [X80034] C.albicans CDC28 gene 
        i = $1
        d = $2
        a = parse_ColonSepID(i)
        i = a.join(':')
        this_line = [ ':', a , d ]
        match_EC(d)
        parse_square_brackets(d).each do |x|
          if !match_EC(x, false) && x =~ /:/
            parse_ColonSepID(x)
          elsif x =~ /\A\s*([A-Z][A-Z0-9_\.]+)\s*\z/
            @list_ids << [ $1 ]
          end
        end

      when /^\>?\s*(\S+)(?:\s+(.+))?$/
        # examples:
        # >ABC12345 this is test
        i = $1
        d = $2.to_s
        @list_ids << [ i.chomp('.') ]
        this_line = [  '', [ i ], d ]
        match_EC(d)
      else
        i = str
        d = ''
        match_EC(i)
        this_line = [ '', [ i ], d ]
      end

      @deflines << this_line
      @entry_id = i unless @entry_id
    end

    # Drops the values memoized by locus, gi, acc_version, accessions
    # and accession.
    def reset_memoized_ids
      [ :@locus, :@gi, :@acc_version, :@accessions, :@accession ].each do |iv|
        remove_instance_variable(iv) if instance_variable_defined?(iv)
      end
    end
    private :reset_memoized_ids

    # Scans str for EC numbers written as "EC:n.n.n.n" (each field being
    # digits and/or "-").  Returns [ 'EC', number ] for the last one found,
    # or nil.  When write_flag is true, every hit is appended to @list_ids
    # and @info['ec'] is set unless it already holds a number without "-".
    def match_EC(str, write_flag = true)
      di = nil
      str.scan(/EC\:((?:[\-\d]+\.){3}(?:[\-\d]+))/i) do |m|
        di = [ 'EC', m[0] ]
        if write_flag
          @info['ec'] = di[1] if !@info['ec'] || @info['ec'].to_s =~ /\-/
          @list_ids << di
        end
      end
      di
    end
    private :match_EC

    # Returns the contents of every "[...]" in str, without the brackets.
    def parse_square_brackets(str)
      str.scan(/\[([^\]]*)\]/).collect { |m| m[0] }
    end
    private :parse_square_brackets

    # Splits "db:id" at the first colon, records the pair in @list_ids
    # and returns it.  The second element is nil when there is no colon.
    def parse_ColonSepID(str)
      di = str.split(':', 2)
      di << nil if di.size <= 1 
      @list_ids << di
      di
    end
    private :parse_ColonSepID

    # Consumes tokens of a "|"-separated identifier from the front of ary
    # and groups them as [ tag, field, ... ] according to NSIDs.
    # Parsing stops at the first token that is not a known tag; if that
    # token is non-empty it is kept as an uncontrolled identifier (UCID).
    # Tokens left in ary afterwards were not consumed.
    # The groups are appended to @list_ids and also returned.
    def parse_NSIDs(ary)
      # this method destroys ary
      data = []
      while token = ary.shift
        if labels = self.class::NSIDs[token]
          di = [ token ]
          labels.each do
            token = ary.shift
            break unless token
            if self.class::NSIDs[token]
              # next tag begins before all fields were filled
              ary.unshift(token)
              break #each
            end
            di << (token.length > 0 ? token : nil)
          end
          data << di
        else
          if token.length > 0
            # UCID (uncontrolled identifiers)
            data << [ token ]
            @info['ucid'] = token unless @info['ucid']
          end
          break #while
        end
      end #while
      @list_ids.concat data
      data
    end #def parse_NSIDs
    private :parse_NSIDs


    # Shows original string.
    # Note that the result of this method may be different from
    # original string which is given in FastaDefline.new method.
    def to_s
      @deflines.collect { |a|
        # a is [ separator, id-group, ..., description ]
        s = a[0]
        (a[1..-2].collect { |x| x.join(s) }.join(s) + ' ' + a[-1]).strip
      }.join("\x01")
    end

    # Shows description.
    # Returns the description of the first defline, or nil if none
    # has been parsed.
    def description
      @deflines[0].to_a[-1]
    end

    # Returns descriptions.
    # One string per parsed defline.
    def descriptions
      @deflines.collect { |a| a[-1] }
    end

    # Shows ID-like strings.
    # Returns an array of strings.
    def id_strings
      r = []
      @list_ids.each do |a|
        if a.size >= 2
          # skip the database tag and unset (nil) fields
          r.concat a[1..-1].find_all { |x| x }
        elsif a[0].to_s.size > 0 && a[0] =~ /\A[A-Za-z0-9\.\-\_]+\z/
          r << a[0]
        end
      end
      # words in the descriptions that look like identifiers
      r.concat( words(true, []).find_all do |x|
                 x =~ /\A[A-Z][A-Za-z0-9\_]*[0-9]+[A-Za-z0-9\_]+\z/ ||
                   x =~ /\A[A-Z][A-Z0-9]*\_[A-Z0-9\_]+\z/
               end)
      r
    end

    # Words that the words method drops (compared in lower case).
    KillWords = [
      'an', 'the', 'this', 'that',
      'is', 'are', 'were', 'was', 'be', 'can', 'may', 'might',
      'as', 'at', 'by', 'for', 'in', 'of', 'on', 'to', 'with',
      'from', 'and', 'or', 'not',
      'dna', 'rna', 'mrna', 'cdna', 'orf',
      'aa', 'nt', 'pct', 'id', 'ec', 'sp', 'subsp',
      'similar', 'involved', 'identical', 'identity',
      'cds', 'clone', 'library', 'contig', 'contigs',
      'homolog', 'homologue', 'homologs', 'homologous',
      'protein', 'proteins', 'gene', 'genes',
      'product', 'products', 'sequence', 'sequences', 
      'strain', 'strains', 'region', 'regions',
    ]
    # KillWords as a Hash, for lookup by key.
    KillWordsHash = {}
    KillWords.each { |x| KillWordsHash[x] = true }

    # Words matching any of these are dropped by the words method:
    # short numbers / percentages and identifier-like tokens.
    KillRegexpArray = [
      /\A\d{1,3}\%?\z/,
      /\A[A-Z][A-Za-z0-9\_]*[0-9]+[A-Za-z0-9\_]+\z/,
      /\A[A-Z][A-Z0-9]*\_[A-Z0-9\_]+\z/
    ]

    # Shows words used in the defline. Returns an Array.
    #
    # The descriptions are split on punctuation and whitespace; words of
    # one character or less, words found in kwhash and words matching any
    # of kill_regexp are dropped.  The result is sorted and unique, and
    # downcased unless case_sensitive is true.
    def words(case_sensitive = nil, kill_regexp = self.class::KillRegexpArray,
              kwhash = self.class::KillWordsHash)
      a = descriptions.join(' ').split(/[\.\,\;\:\(\)\[\]\{\}\<\>\"\'\`\~\/\|\?\!\&\@\#\s\x00-\x1f\x7f]+/)
      a.collect! do |x|
        # trim decoration characters left at both ends of the word
        w = x.sub(/\A[\$\*\-\+]+/, '').sub(/[\$\*\-\=]+\z/, '')
        if w.size <= 1 || kwhash[w.downcase] ||
            kill_regexp.find { |expr| expr =~ w }
          nil
        else
          w
        end
      end
      a.compact!
      a.collect! { |x| x.downcase } unless case_sensitive
      a.sort!
      a.uniq!
      a
    end

    # Returns identifiers by a database name.
    #
    # dbname may be a String or a Symbol.  Values already stored in @info
    # (e.g. 'ec', 'organism', 'ucid') take precedence; otherwise the first
    # entry of list_ids tagged with dbname is used and its most ID-like
    # field is returned.  A value found is cached in @info.
    # Returns nil when nothing is found.
    def get(dbname)
      db = dbname.to_s
      r = @info[db]
      return r if r

      di = @list_ids.find { |x| x[0] == db }
      if di && di.size <= 2
        r = di[-1]
      elsif di
        labels = self.class::NSIDs[db]
        # preferred fields, in this order
        [ 'acc_version', 'entry_id',
          'locus', 'accession', 'number'].each do |x|
          if i = labels.index(x)
            r = di[i+1]
            break if r
          end
        end
        # fall back to the first field that is set
        r = di[1..-1].find { |x| x } unless r
      end
      @info[db] = r if r
      r
    end

    # Returns an identifier by given type.
    # type_str is a field label used in NSIDs (e.g. 'locus').  The field of
    # the first entry in list_ids whose database defines that label is
    # returned, even when that field is nil.
    def get_by_type(type_str)
      @list_ids.each do |x|
        if labels = self.class::NSIDs[x[0]]
          if i = labels.index(type_str)
            return x[i+1]
          end
        end
      end
      nil
    end

    # Returns identifiers by given type.
    # Takes one or more field labels and returns every non-nil matching
    # field, in list_ids order.
    def get_all_by_type(*type_strarg)
      d = []
      @list_ids.each do |x|
        if labels = self.class::NSIDs[x[0]]
          type_strarg.each do |y|
            if i = labels.index(y)
              d << x[i+1] if x[i+1]
            end
          end
        end
      end
      d
    end

    # Shows locus.
    # If the entry has more than two of such IDs,
    # only the first ID are shown.
    # Returns a string or nil.
    def locus
      @locus = get_by_type('locus') unless defined?(@locus)
      @locus
    end

    # Shows GI.
    # If the entry has more than two of such IDs,
    # only the first ID are shown.
    # Returns a string or nil.
    def gi
      @gi = get_by_type('gi') unless defined?(@gi)
      @gi
    end

    # Shows accession with version number.
    # If the entry has more than two of such IDs,
    # only the first ID are shown.
    # Returns a string or nil.
    def acc_version
      @acc_version = get_by_type('acc_version') unless defined?(@acc_version)
      @acc_version
    end

    # Shows accession numbers.
    # Returns an array of strings.
    # Version suffixes (".1" etc.) are removed.
    def accessions
      unless defined?(@accessions)
        @accessions = get_all_by_type('accession', 'acc_version')
        @accessions.collect! { |x| x.sub(/\..*\z/, '') }
      end
      @accessions
    end

    # Shows an accession number.
    # Taken from acc_version (version suffix removed) when available,
    # otherwise the first of accessions.  Returns a string or nil.
    def accession
      unless defined?(@accession)
        if acc_version
          @accession = acc_version.split('.')[0]
        else
          @accession = accessions[0]
        end
      end
      @accession
    end
    
    # Treats an unknown method name as a database name and returns
    # get(name).  Raises RuntimeError when nothing is found and the name
    # is not a database listed in NSIDs.
    def method_missing(name, *args)
      # raise ArgumentError,
      # "wrong # of arguments(#{args.size} for 1)" if args.size >= 2
      r = get(name, *args)
      if !r && !(self.class::NSIDs[name.to_s])
        raise "NameError: undefined method `#{name.inspect}'"
      end
      r
    end

    # Tells respond_to? which names method_missing can answer.
    # Without this, Ruby's implicit conversion probes (to_ary, to_str, ...)
    # run into method_missing and its RuntimeError.
    def respond_to_missing?(name, include_private = false)
      key = name.to_s
      if self.class::NSIDs.key?(key) || get(key)
        true
      else
        super
      end
    end
    

  end #class FastaDefline

end #module Bio


From ngoto at dev.open-bio.org  Fri Jun 20 13:30:16 2008
From: ngoto at dev.open-bio.org (Naohisa Goto)
Date: Fri, 20 Jun 2008 13:30:16 +0000
Subject: [BioRuby-cvs] bioruby/lib/bio/db fasta.rb,1.28.2.1,1.28.2.2
Message-ID: <200806201330.m5KDUGds021895@dev.open-bio.org>

Update of /home/repository/bioruby/bioruby/lib/bio/db
In directory dev.open-bio.org:/tmp/cvs-serv21857

Modified Files:
      Tag: BRANCH-biohackathon2008
	fasta.rb 
Log Message:
Here-document separator string in example is changed to avoid confusion
about "END" which is also a reserved word in Ruby.


Index: fasta.rb
===================================================================
RCS file: /home/repository/bioruby/bioruby/lib/bio/db/fasta.rb,v
retrieving revision 1.28.2.1
retrieving revision 1.28.2.2
diff -C2 -d -r1.28.2.1 -r1.28.2.2
*** fasta.rb	20 Jun 2008 13:22:31 -0000	1.28.2.1
--- fasta.rb	20 Jun 2008 13:30:14 -0000	1.28.2.2
***************
*** 3,7 ****
  #
  # Copyright::  Copyright (C) 2001, 2002
! #              GOTO Naohisa <ngoto at gen-info.osaka-u.ac.jp>,
  #              Toshiaki Katayama <k at bioruby.org>
  # License::    The Ruby License
--- 3,7 ----
  #
  # Copyright::  Copyright (C) 2001, 2002
! #              Naohisa Goto <ng at bioruby.org>,
  #              Toshiaki Katayama <k at bioruby.org>
  # License::    The Ruby License
***************
*** 45,49 ****
    # === Examples
    #
!   #   f_str = <<END
    #   >sce:YBR160W  CDC28, SRM5; cyclin-dependent protein kinase catalytic subunit [EC:2.7.1.-] [SP:CC28_YEAST]
    #   MSGELANYKRLEKVGEGTYGVVYKALDLRPGQGQRVVALKKIRLESEDEG
--- 45,49 ----
    # === Examples
    #
!   #   f_str = <<END_OF_STRING
    #   >sce:YBR160W  CDC28, SRM5; cyclin-dependent protein kinase catalytic subunit [EC:2.7.1.-] [SP:CC28_YEAST]
    #   MSGELANYKRLEKVGEGTYGVVYKALDLRPGQGQRVVALKKIRLESEDEG
***************
*** 65,69 ****
    #   FERLCELLGYDNVFPLIINIKTKSNGGYQLCGSISIIKIEEELKSVGFER
    #   KTGDPLEWRRLFKKISTICRDIILIPN
!   #   END
    #
    #   f = Bio::FastaFormat.new(f_str)
--- 65,69 ----
    #   FERLCELLGYDNVFPLIINIKTKSNGGYQLCGSISIIKIEEELKSVGFER
    #   KTGDPLEWRRLFKKISTICRDIILIPN
!   #   END_OF_STRING
    #
    #   f = Bio::FastaFormat.new(f_str)


From ngoto at dev.open-bio.org  Fri Jun 20 13:43:38 2008
From: ngoto at dev.open-bio.org (Naohisa Goto)
Date: Fri, 20 Jun 2008 13:43:38 +0000
Subject: [BioRuby-cvs] bioruby/lib/bio/db fasta.rb,1.28.2.2,1.28.2.3
Message-ID: <200806201343.m5KDhcUr021965@dev.open-bio.org>

Update of /home/repository/bioruby/bioruby/lib/bio/db
In directory dev.open-bio.org:/tmp/cvs-serv21945

Modified Files:
      Tag: BRANCH-biohackathon2008
	fasta.rb 
Log Message:
Bio::FastaFormat#to_seq is renamed to to_biosequence with improvement.
The "to_seq" method is now an alias of to_biosequence.


Index: fasta.rb
===================================================================
RCS file: /home/repository/bioruby/bioruby/lib/bio/db/fasta.rb,v
retrieving revision 1.28.2.2
retrieving revision 1.28.2.3
diff -C2 -d -r1.28.2.2 -r1.28.2.3
*** fasta.rb	20 Jun 2008 13:30:14 -0000	1.28.2.2
--- fasta.rb	20 Jun 2008 13:43:36 -0000	1.28.2.3
***************
*** 28,31 ****
--- 28,32 ----
  require 'bio/db'
  require 'bio/sequence'
+ require 'bio/sequence/dblink'
  require 'bio/db/fasta/defline'
  
***************
*** 217,226 ****
      # because of efficiency.
      # 
!     def to_seq
        seq
        obj = Bio::Sequence.new(@seq)
!       obj.definition = self.definition
        obj
      end
  
      # Parsing FASTA Defline, and extract IDs.
--- 218,243 ----
      # because of efficiency.
      # 
!     def to_biosequence
        seq
        obj = Bio::Sequence.new(@seq)
!       d = self.identifiers
!       # accessions
!       obj.primary_accession = d.accessions.first
!       obj.secondary_accessions = d.accessions[1..-1]
!       # entry_id
!       obj.entry_id = d.locus unless d.locus.to_s.empty?
!       # GI
!       other = []
!       other.push Bio::Sequence::DBLink.new('GI', d.gi) if d.gi
!       obj.other_seqids = other unless other.empty?
!       # definition
!       if d.accessions.empty? and other.empty? then
!         obj.definition = self.definition
!       else
!         obj.definition = d.description
!       end
        obj
      end
+     alias to_seq to_biosequence
  
      # Parsing FASTA Defline, and extract IDs.