[BioRuby-cvs] bioruby/lib/bio/db fasta.rb,1.28,1.28.2.1
Naohisa Goto
ngoto at dev.open-bio.org
Fri Jun 20 13:22:34 UTC 2008
Update of /home/repository/bioruby/bioruby/lib/bio/db
In directory dev.open-bio.org:/tmp/cvs-serv21681
Modified Files:
Tag: BRANCH-biohackathon2008
fasta.rb
Log Message:
Split Bio::FastaDefline class into lib/bio/db/fasta/defline.rb
Index: fasta.rb
===================================================================
RCS file: /home/repository/bioruby/bioruby/lib/bio/db/fasta.rb,v
retrieving revision 1.28
retrieving revision 1.28.2.1
diff -C2 -d -r1.28 -r1.28.2.1
*** fasta.rb 5 Apr 2007 23:35:40 -0000 1.28
--- fasta.rb 20 Jun 2008 13:22:31 -0000 1.28.2.1
***************
*** 15,57 ****
# == Examples
#
! # rub = Bio::FastaDefline.new('>gi|671595|emb|CAA85678.1| rubisco large subunit [Perovskia abrotanoides]')
! # rub.entry_id ==> 'gi|671595'
! # rub.get('emb') ==> 'CAA85678.1'
! # rub.emb ==> 'CAA85678.1'
! # rub.gi ==> '671595'
! # rub.accession ==> 'CAA85678'
! # rub.accessions ==> [ 'CAA85678' ]
! # rub.acc_version ==> 'CAA85678.1'
! # rub.locus ==> nil
! # rub.list_ids ==> [["gi", "671595"],
! # ["emb", "CAA85678.1", nil],
! # ["Perovskia abrotanoides"]]
! #
! # ckr = Bio::FastaDefline.new(">gi|2495000|sp|Q63931|CCKR_CAVPO CHOLECYSTOKININ TYPE A RECEPTOR (CCK-A RECEPTOR) (CCK-AR)\001gi|2147182|pir||I51898 cholecystokinin A receptor - guinea pig\001gi|544724|gb|AAB29504.1| cholecystokinin A receptor; CCK-A receptor [Cavia]")
! # ckr.entry_id ==> "gi|2495000"
! # ckr.sp ==> "CCKR_CAVPO"
! # ckr.pir ==> "I51898"
! # ckr.gb ==> "AAB29504.1"
! # ckr.gi ==> "2495000"
! # ckr.accession ==> "AAB29504"
! # ckr.accessions ==> ["Q63931", "AAB29504"]
! # ckr.acc_version ==> "AAB29504.1"
! # ckr.locus ==> nil
! # ckr.description ==>
! # "CHOLECYSTOKININ TYPE A RECEPTOR (CCK-A RECEPTOR) (CCK-AR)"
! # ckr.descriptions ==>
! # ["CHOLECYSTOKININ TYPE A RECEPTOR (CCK-A RECEPTOR) (CCK-AR)",
! # "cholecystokinin A receptor - guinea pig",
! # "cholecystokinin A receptor; CCK-A receptor [Cavia]"]
! # ckr.words ==>
! # ["cavia", "cck-a", "cck-ar", "cholecystokinin", "guinea", "pig",
! # "receptor", "type"]
! # ckr.id_strings ==>
! # ["2495000", "Q63931", "CCKR_CAVPO", "2147182", "I51898",
! # "544724", "AAB29504.1", "Cavia"]
! # ckr.list_ids ==>
! # [["gi", "2495000"], ["sp", "Q63931", "CCKR_CAVPO"],
! # ["gi", "2147182"], ["pir", nil, "I51898"], ["gi", "544724"],
! # ["gb", "AAB29504.1", nil], ["Cavia"]]
#
# == References
--- 15,19 ----
# == Examples
#
! # See documents of Bio::FastaFormat class.
#
# == References
***************
*** 66,69 ****
--- 28,32 ----
require 'bio/db'
require 'bio/sequence'
+ require 'bio/db/fasta/defline'
module Bio
***************
*** 363,825 ****
end #class FastaNumericFormat
-
- # Parsing FASTA Defline, and extract IDs and other informations.
- # IDs are NSIDs (NCBI standard FASTA sequence identifiers)
- # or ":"-separated IDs.
- #
- # specs are described in:
- # ftp://ftp.ncbi.nih.gov/blast/documents/README.formatdb
- # http://blast.wustl.edu/doc/FAQ-Indexing.html#Identifiers
- #
- # === Examples
- #
- # rub = Bio::FastaDefline.new('>gi|671595|emb|CAA85678.1| rubisco large subunit [Perovskia abrotanoides]')
- # rub.entry_id ==> 'gi|671595'
- # rub.get('emb') ==> 'CAA85678.1'
- # rub.emb ==> 'CAA85678.1'
- # rub.gi ==> '671595'
- # rub.accession ==> 'CAA85678'
- # rub.accessions ==> [ 'CAA85678' ]
- # rub.acc_version ==> 'CAA85678.1'
- # rub.locus ==> nil
- # rub.list_ids ==> [["gi", "671595"],
- # ["emb", "CAA85678.1", nil],
- # ["Perovskia abrotanoides"]]
- #
- # ckr = Bio::FastaDefline.new(">gi|2495000|sp|Q63931|CCKR_CAVPO CHOLECYSTOKININ TYPE A RECEPTOR (CCK-A RECEPTOR) (CCK-AR)\001gi|2147182|pir||I51898 cholecystokinin A receptor - guinea pig\001gi|544724|gb|AAB29504.1| cholecystokinin A receptor; CCK-A receptor [Cavia]")
- # ckr.entry_id ==> "gi|2495000"
- # ckr.sp ==> "CCKR_CAVPO"
- # ckr.pir ==> "I51898"
- # ckr.gb ==> "AAB29504.1"
- # ckr.gi ==> "2495000"
- # ckr.accession ==> "AAB29504"
- # ckr.accessions ==> ["Q63931", "AAB29504"]
- # ckr.acc_version ==> "AAB29504.1"
- # ckr.locus ==> nil
- # ckr.description ==>
- # "CHOLECYSTOKININ TYPE A RECEPTOR (CCK-A RECEPTOR) (CCK-AR)"
- # ckr.descriptions ==>
- # ["CHOLECYSTOKININ TYPE A RECEPTOR (CCK-A RECEPTOR) (CCK-AR)",
- # "cholecystokinin A receptor - guinea pig",
- # "cholecystokinin A receptor; CCK-A receptor [Cavia]"]
- # ckr.words ==>
- # ["cavia", "cck-a", "cck-ar", "cholecystokinin", "guinea", "pig",
- # "receptor", "type"]
- # ckr.id_strings ==>
- # ["2495000", "Q63931", "CCKR_CAVPO", "2147182", "I51898",
- # "544724", "AAB29504.1", "Cavia"]
- # ckr.list_ids ==>
- # [["gi", "2495000"], ["sp", "Q63931", "CCKR_CAVPO"],
- # ["gi", "2147182"], ["pir", nil, "I51898"], ["gi", "544724"],
- # ["gb", "AAB29504.1", nil], ["Cavia"]]
- #
- # === Refereneces
- #
- # * Fasta format description (NCBI)
- # http://www.ncbi.nlm.nih.gov/BLAST/fasta.shtml
- #
- # * Frequently Asked Questions: Indexing of Sequence Identifiers (by Warren R. Gish.)
- # http://blast.wustl.edu/doc/FAQ-Indexing.html#Identifiers
- #
- # * README.formatdb
- # ftp://ftp.ncbi.nih.gov/blast/documents/README.formatdb
- #
- class FastaDefline
-
- NSIDs = {
- # NCBI and WU-BLAST
- 'gi' => [ 'gi' ], # NCBI GI
- 'gb' => [ 'acc_version', 'locus' ], # GenBank
- 'emb' => [ 'acc_version', 'locus' ], # EMBL
- 'dbj' => [ 'acc_version', 'locus' ], # DDBJ
- 'sp' => [ 'accession', 'entry_id' ], # SWISS-PROT
- 'pdb' => [ 'entry_id', 'chain' ], # PDB
- 'bbs' => [ 'number' ], # GenInfo Backbone Id
- 'gnl' => [ 'database' , 'entry_id' ], # General database identifier
- 'ref' => [ 'acc_version' , 'locus' ], # NCBI Reference Sequence
- 'lcl' => [ 'entry_id' ], # Local Sequence identifier
-
- # WU-BLAST and NCBI
- 'pir' => [ 'accession', 'entry_id' ], # PIR
- 'prf' => [ 'accession', 'entry_id' ], # Protein Research Foundation
- 'pat' => [ 'country', 'number', 'serial' ], # Patents
-
- # WU-BLAST only
- 'bbm' => [ 'number' ], # NCBI GenInfo Backbone database identifier
- 'gim' => [ 'number' ], # NCBI GenInfo Import identifier
- 'gp' => [ 'acc_version', 'locus' ], # GenPept
- 'oth' => [ 'accession', 'name', 'release' ], # Other (user-definable) identifier
- 'tpd' => [ 'accession', 'name' ], # Third party annotation, DDBJ
- 'tpe' => [ 'accession', 'name' ], # Third party annotation, EMBL
- 'tpg' => [ 'accession', 'name' ], # Third party annotation, GenBank
-
- # Original
- 'ri' => [ 'entry_id', 'rearray_id', 'len' ], # RIKEN FANTOM DB
- }
-
- # Shows array that contains IDs (or ID-like strings).
- # Returns an array of arrays of strings.
- attr_reader :list_ids
-
- # Shows a possibly unique identifier.
- # Returns a string.
- attr_reader :entry_id
-
- # Parses given string.
- def initialize(str)
- @deflines = []
- @info = {}
- @list_ids = []
-
- @entry_id = nil
-
- lines = str.split("\x01")
- lines.each do |line|
- add_defline(line)
- end
- end #def initialize
-
- # Parses given string and adds parsed data.
- def add_defline(str)
- case str
- when /^\>?\s*((?:[^\|\s]*\|)+[^\s]+)\s*(.*)$/
- # NSIDs
- # examples:
- # >gi|9910844|sp|Q9UWG2|RL3_METVA 50S ribosomal protein L3P
- #
- # note: regexp (:?) means grouping without backreferences
- i = $1
- d = $2
- tks = i.split('|')
- tks << '' if i[-1,1] == '|'
- a = parse_NSIDs(tks)
- i = a[0].join('|')
- a.unshift('|')
- d = tks.join('|') + ' ' + d unless tks.empty?
- a << d
- this_line = a
- match_EC(d)
- parse_square_brackets(d).each do |x|
- if !match_EC(x, false) and x =~ /\A[A-Z]/ then
- di = [ x ]
- @list_ids << di
- @info['organism'] = x unless @info['organism']
- end
- end
-
- when /^\>?\s*([a-zA-Z0-9]+\:[^\s]+)\s*(.*)$/
- # examples:
- # >sce:YBR160W CDC28, SRM5; cyclin-dependent protein kinase catalytic subunit [EC:2.7.1.-] [SP:CC28_YEAST]
- # >emb:CACDC28 [X80034] C.albicans CDC28 gene
- i = $1
- d = $2
- a = parse_ColonSepID(i)
- i = a.join(':')
- this_line = [ ':', a , d ]
- match_EC(d)
- parse_square_brackets(d).each do |x|
- if !match_EC(x, false) and x =~ /:/ then
- parse_ColonSepID(x)
- elsif x =~ /\A\s*([A-Z][A-Z0-9_\.]+)\s*\z/ then
- @list_ids << [ $1 ]
- end
- end
-
- when /^\>?\s*(\S+)(?:\s+(.+))?$/
- # examples:
- # >ABC12345 this is test
- i = $1
- d = $2.to_s
- @list_ids << [ i.chomp('.') ]
- this_line = [ '', [ i ], d ]
- match_EC(d)
- else
- i = str
- d = ''
- match_EC(i)
- this_line = [ '', [ i ], d ]
- end
-
- @deflines << this_line
- @entry_id = i unless @entry_id
- end
-
- def match_EC(str, write_flag = true)
- di = nil
- str.scan(/EC\:((:?[\-\d]+\.){3}(:?[\-\d]+))/i) do |x|
- di = [ 'EC', $1 ]
- if write_flag then
- @info['ec'] = di[1] if (!@info['ec'] or @info['ec'].to_s =~ /\-/)
- @list_ids << di
- end
- end
- di
- end
- private :match_EC
-
- def parse_square_brackets(str)
- r = []
- str.scan(/\[([^\]]*)\]/) do |x|
- r << x[0]
- end
- r
- end
- private :parse_square_brackets
-
- def parse_ColonSepID(str)
- di = str.split(':', 2)
- di << nil if di.size <= 1
- @list_ids << di
- di
- end
- private :parse_ColonSepID
-
- def parse_NSIDs(ary)
- # this method destroys ary
- data = []
- while token = ary.shift
- if labels = self.class::NSIDs[token] then
- di = [ token ]
- idtype = token
- labels.each do |x|
- token = ary.shift
- break unless token
- if self.class::NSIDs[token] then
- ary.unshift(token)
- break #each
- end
- if token.length > 0 then
- di << token
- else
- di << nil
- end
- end
- data << di
- else
- if token.length > 0 then
- # UCID (uncontrolled identifiers)
- di = [ token ]
- data << di
- @info['ucid'] = token unless @info['ucid']
- end
- break #while
- end
- end #while
- @list_ids.concat data
- data
- end #def parse_NSIDs
- private :parse_NSIDs
-
-
- # Shows original string.
- # Note that the result of this method may be different from
- # original string which is given in FastaDefline.new method.
- def to_s
- @deflines.collect { |a|
- s = a[0]
- (a[1..-2].collect { |x| x.join(s) }.join(s) + ' ' + a[-1]).strip
- }.join("\x01")
- end
-
- # Shows description.
- def description
- @deflines[0].to_a[-1]
- end
-
- # Returns descriptions.
- def descriptions
- @deflines.collect do |a|
- a[-1]
- end
- end
-
- # Shows ID-like strings.
- # Returns an array of strings.
- def id_strings
- r = []
- @list_ids.each do |a|
- if a.size >= 2 then
- r.concat a[1..-1].find_all { |x| x }
- else
- if a[0].to_s.size > 0 and a[0] =~ /\A[A-Za-z0-9\.\-\_]+\z/
- r << a[0]
- end
- end
- end
- r.concat( words(true, []).find_all do |x|
- x =~ /\A[A-Z][A-Za-z0-9\_]*[0-9]+[A-Za-z0-9\_]+\z/ or
- x =~ /\A[A-Z][A-Z0-9]*\_[A-Z0-9\_]+\z/
- end)
- r
- end
-
- KillWords = [
- 'an', 'the', 'this', 'that',
- 'is', 'are', 'were', 'was', 'be', 'can', 'may', 'might',
- 'as', 'at', 'by', 'for', 'in', 'of', 'on', 'to', 'with',
- 'from', 'and', 'or', 'not',
- 'dna', 'rna', 'mrna', 'cdna', 'orf',
- 'aa', 'nt', 'pct', 'id', 'ec', 'sp', 'subsp',
- 'similar', 'involved', 'identical', 'identity',
- 'cds', 'clone', 'library', 'contig', 'contigs',
- 'homolog', 'homologue', 'homologs', 'homologous',
- 'protein', 'proteins', 'gene', 'genes',
- 'product', 'products', 'sequence', 'sequences',
- 'strain', 'strains', 'region', 'regions',
- ]
- KillWordsHash = {}
- KillWords.each { |x| KillWordsHash[x] = true }
-
- KillRegexpArray = [
- /\A\d{1,3}\%?\z/,
- /\A[A-Z][A-Za-z0-9\_]*[0-9]+[A-Za-z0-9\_]+\z/,
- /\A[A-Z][A-Z0-9]*\_[A-Z0-9\_]+\z/
- ]
-
- # Shows words used in the defline. Returns an Array.
- def words(case_sensitive = nil, kill_regexp = self.class::KillRegexpArray,
- kwhash = self.class::KillWordsHash)
- a = descriptions.join(' ').split(/[\.\,\;\:\(\)\[\]\{\}\<\>\"\'\`\~\/\|\?\!\&\@\#\s\x00-\x1f\x7f]+/)
- a.collect! do |x|
- x.sub!(/\A[\$\*\-\+]+/, '')
- x.sub!(/[\$\*\-\=]+\z/, '')
- if x.size <= 1 then
- nil
- elsif kwhash[x.downcase] then
- nil
- else
- if kill_regexp.find { |expr| expr =~ x } then
- nil
- else
- x
- end
- end
- end
- a.compact!
- a.collect! { |x| x.downcase } unless case_sensitive
- a.sort!
- a.uniq!
- a
- end
-
- # Returns identifires by a database name.
- def get(dbname)
- db = dbname.to_s
- r = nil
- unless r = @info[db] then
- di = @list_ids.find { |x| x[0] == db.to_s }
- if di and di.size <= 2 then
- r = di[-1]
- elsif di then
- labels = self.class::NSIDs[db]
- [ 'acc_version', 'entry_id',
- 'locus', 'accession', 'number'].each do |x|
- if i = labels.index(x) then
- r = di[i+1]
- break if r
- end
- end
- r = di[1..-1].find { |x| x } unless r
- end
- @info[db] = r if r
- end
- r
- end
-
- # Returns an identifier by given type.
- def get_by_type(type_str)
- @list_ids.each do |x|
- if labels = self.class::NSIDs[x[0]] then
- if i = labels.index(type_str) then
- return x[i+1]
- end
- end
- end
- nil
- end
-
- # Returns identifiers by given type.
- def get_all_by_type(*type_strarg)
- d = []
- @list_ids.each do |x|
- if labels = self.class::NSIDs[x[0]] then
- type_strarg.each do |y|
- if i = labels.index(y) then
- d << x[i+1] if x[i+1]
- end
- end
- end
- end
- d
- end
-
- # Shows locus.
- # If the entry has more than two of such IDs,
- # only the first ID are shown.
- # Returns a string or nil.
- def locus
- unless defined?(@locus)
- @locus = get_by_type('locus')
- end
- @locus
- end
-
- # Shows GI.
- # If the entry has more than two of such IDs,
- # only the first ID are shown.
- # Returns a string or nil.
- def gi
- unless defined?(@gi) then
- @gi = get_by_type('gi')
- end
- @gi
- end
-
- # Shows accession with version number.
- # If the entry has more than two of such IDs,
- # only the first ID are shown.
- # Returns a string or nil.
- def acc_version
- unless defined?(@acc_version) then
- @acc_version = get_by_type('acc_version')
- end
- @acc_version
- end
-
- # Shows accession numbers.
- # Returns an array of strings.
- def accessions
- unless defined?(@accessions) then
- @accessions = get_all_by_type('accession', 'acc_version')
- @accessions.collect! { |x| x.sub(/\..*\z/, '') }
- end
- @accessions
- end
-
- # Shows an accession number.
- def accession
- unless defined?(@accession) then
- if acc_version then
- @accession = acc_version.split('.')[0]
- else
- @accession = accessions[0]
- end
- end
- @accession
- end
-
- def method_missing(name, *args)
- # raise ArgumentError,
- # "wrong # of arguments(#{args.size} for 1)" if args.size >= 2
- r = get(name, *args)
- if !r and !(self.class::NSIDs[name.to_s]) then
- raise "NameError: undefined method `#{name.inspect}'"
- end
- r
- end
-
-
- end #class FastaDefline
-
end #module Bio
--- 326,329 ----
More information about the bioruby-cvs mailing list