[BioRuby-cvs] bioruby/lib/bio/db/fasta defline.rb,NONE,1.1.2.1

Fri Jun 20 13:22:34 UTC 2008

Update of /home/repository/bioruby/bioruby/lib/bio/db/fasta
In directory dev.open-bio.org:/tmp/cvs-serv21681/fasta

Added Files:
      Tag: BRANCH-biohackathon2008
	defline.rb 
Log Message:
Split Bio::FastaDefline class into lib/bio/db/fasta/defline.rb


--- NEW FILE: defline.rb ---
#
# = bio/db/fasta/defline.rb - FASTA defline parser class
#
# Copyright::  Copyright (C) 2001, 2002
#              GOTO Naohisa <ngoto at gen-info.osaka-u.ac.jp>,
#              Toshiaki Katayama <k at bioruby.org>
# License::    The Ruby License
#
# $Id: defline.rb,v 1.1.2.1 2008/06/20 13:22:32 ngoto Exp $
# 
# == Description
# 
# Bio::FastaDefline is a parser class for definition line (defline)
# of the FASTA format.
#
# == Examples
#
#       rub = Bio::FastaDefline.new('>gi|671595|emb|CAA85678.1| rubisco large subunit [Perovskia abrotanoides]')
#       rub.entry_id       ==> 'gi|671595'
#       rub.get('emb')     ==> 'CAA85678.1'
#       rub.emb            ==> 'CAA85678.1'
#       rub.gi             ==> '671595'
#       rub.accession      ==> 'CAA85678'
#       rub.accessions     ==> [ 'CAA85678' ]
#       rub.acc_version    ==> 'CAA85678.1'
#       rub.locus          ==> nil
#       rub.list_ids       ==> [["gi", "671595"],
#                               ["emb", "CAA85678.1", nil],
#                               ["Perovskia abrotanoides"]]
#
#       ckr = Bio::FastaDefline.new(">gi|2495000|sp|Q63931|CCKR_CAVPO CHOLECYSTOKININ TYPE A RECEPTOR (CCK-A RECEPTOR) (CCK-AR)\001gi|2147182|pir||I51898 cholecystokinin A receptor - guinea pig\001gi|544724|gb|AAB29504.1| cholecystokinin A receptor; CCK-A receptor [Cavia]")
#       ckr.entry_id      ==> "gi|2495000"
#       ckr.sp            ==> "CCKR_CAVPO"
#       ckr.pir           ==> "I51898"
#       ckr.gb            ==> "AAB29504.1"
#       ckr.gi            ==> "2495000"
#       ckr.accession     ==> "AAB29504"
#       ckr.accessions    ==> ["Q63931", "AAB29504"]
#       ckr.acc_version   ==> "AAB29504.1"
#       ckr.locus         ==> nil
#       ckr.description   ==>
#         "CHOLECYSTOKININ TYPE A RECEPTOR (CCK-A RECEPTOR) (CCK-AR)"
#       ckr.descriptions  ==>
#         ["CHOLECYSTOKININ TYPE A RECEPTOR (CCK-A RECEPTOR) (CCK-AR)",
#          "cholecystokinin A receptor - guinea pig",
#          "cholecystokinin A receptor; CCK-A receptor [Cavia]"]
#       ckr.words         ==> 
#         ["cavia", "cck-a", "cck-ar", "cholecystokinin", "guinea", "pig",
#          "receptor", "type"]
#       ckr.id_strings    ==>
#         ["2495000", "Q63931", "CCKR_CAVPO", "2147182", "I51898",
#          "544724", "AAB29504.1", "Cavia"]
#       ckr.list_ids      ==>
#         [["gi", "2495000"], ["sp", "Q63931", "CCKR_CAVPO"],
#          ["gi", "2147182"], ["pir", nil, "I51898"], ["gi", "544724"],
#          ["gb", "AAB29504.1", nil], ["Cavia"]]
#
# == References
#
# * FASTA format (WikiPedia)
#   http://en.wikipedia.org/wiki/FASTA_format
#   
# * Fasta format description (NCBI)
#   http://www.ncbi.nlm.nih.gov/BLAST/fasta.shtml
#

module Bio

  #--
  # split from fasta.rb revision 1.28
  #++

  # Parsing FASTA Defline, and extract IDs and other informations.
  # IDs are NSIDs (NCBI standard FASTA sequence identifiers)
  # or ":"-separated IDs.
  # 
  # specs are described in:
  # ftp://ftp.ncbi.nih.gov/blast/documents/README.formatdb
  # http://blast.wustl.edu/doc/FAQ-Indexing.html#Identifiers
  #
  # === Examples
  #
  #   rub = Bio::FastaDefline.new('>gi|671595|emb|CAA85678.1| rubisco large subunit [Perovskia abrotanoides]')
  #   rub.entry_id       ==> 'gi|671595'
  #   rub.get('emb')     ==> 'CAA85678.1'
  #   rub.emb            ==> 'CAA85678.1'
  #   rub.gi             ==> '671595'
  #   rub.accession      ==> 'CAA85678'
  #   rub.accessions     ==> [ 'CAA85678' ]
  #   rub.acc_version    ==> 'CAA85678.1'
  #   rub.locus          ==> nil
  #   rub.list_ids       ==> [["gi", "671595"],
  #                           ["emb", "CAA85678.1", nil],
  #                           ["Perovskia abrotanoides"]]
  #
  #   ckr = Bio::FastaDefline.new(">gi|2495000|sp|Q63931|CCKR_CAVPO CHOLECYSTOKININ TYPE A RECEPTOR (CCK-A RECEPTOR) (CCK-AR)\001gi|2147182|pir||I51898 cholecystokinin A receptor - guinea pig\001gi|544724|gb|AAB29504.1| cholecystokinin A receptor; CCK-A receptor [Cavia]")
  #   ckr.entry_id      ==> "gi|2495000"
  #   ckr.sp            ==> "CCKR_CAVPO"
  #   ckr.pir           ==> "I51898"
  #   ckr.gb            ==> "AAB29504.1"
  #   ckr.gi            ==> "2495000"
  #   ckr.accession     ==> "AAB29504"
  #   ckr.accessions    ==> ["Q63931", "AAB29504"]
  #   ckr.acc_version   ==> "AAB29504.1"
  #   ckr.locus         ==> nil
  #   ckr.description   ==>
  #     "CHOLECYSTOKININ TYPE A RECEPTOR (CCK-A RECEPTOR) (CCK-AR)"
  #   ckr.descriptions  ==>
  #     ["CHOLECYSTOKININ TYPE A RECEPTOR (CCK-A RECEPTOR) (CCK-AR)",
  #      "cholecystokinin A receptor - guinea pig",
  #      "cholecystokinin A receptor; CCK-A receptor [Cavia]"]
  #   ckr.words         ==> 
  #     ["cavia", "cck-a", "cck-ar", "cholecystokinin", "guinea", "pig",
  #      "receptor", "type"]
  #   ckr.id_strings    ==>
  #     ["2495000", "Q63931", "CCKR_CAVPO", "2147182", "I51898",
  #      "544724", "AAB29504.1", "Cavia"]
  #   ckr.list_ids      ==>
  #     [["gi", "2495000"], ["sp", "Q63931", "CCKR_CAVPO"],
  #      ["gi", "2147182"], ["pir", nil, "I51898"], ["gi", "544724"],
  #      ["gb", "AAB29504.1", nil], ["Cavia"]]
  #
  # === Refereneces
  #
  # * Fasta format description (NCBI)
  #   http://www.ncbi.nlm.nih.gov/BLAST/fasta.shtml
  #
  # * Frequently Asked Questions:  Indexing of Sequence Identifiers (by Warren R. Gish.)
  #   http://blast.wustl.edu/doc/FAQ-Indexing.html#Identifiers
  #
  # * README.formatdb
  #   ftp://ftp.ncbi.nih.gov/blast/documents/README.formatdb
  # 
  class FastaDefline

    NSIDs = {
      # NCBI and WU-BLAST
      'gi'  => [ 'gi' ],                      # NCBI GI
      'gb'  => [ 'acc_version', 'locus' ],      # GenBank
      'emb' => [ 'acc_version', 'locus' ],      # EMBL
      'dbj' => [ 'acc_version', 'locus' ],      # DDBJ
      'sp'  => [ 'accession', 'entry_id' ],   # SWISS-PROT
      'pdb' => [ 'entry_id', 'chain' ],       # PDB
      'bbs' => [ 'number' ],                  # GenInfo Backbone Id
      'gnl' => [ 'database' , 'entry_id' ],   # General database identifier
      'ref' => [ 'acc_version' , 'locus' ],     # NCBI Reference Sequence
      'lcl' => [ 'entry_id' ],                # Local Sequence identifier

      # WU-BLAST and NCBI
      'pir' => [ 'accession', 'entry_id' ],   # PIR
      'prf' => [ 'accession', 'entry_id' ],   # Protein Research Foundation
      'pat' => [ 'country', 'number', 'serial' ], # Patents

      # WU-BLAST only
      'bbm' => [ 'number' ],      # NCBI GenInfo Backbone database identifier
      'gim' => [ 'number' ],      # NCBI GenInfo Import identifier
      'gp'  => [ 'acc_version', 'locus' ],      # GenPept
      'oth' => [ 'accession', 'name', 'release' ],  # Other (user-definable) identifier
      'tpd' => [ 'accession', 'name' ],       # Third party annotation, DDBJ
      'tpe' => [ 'accession', 'name' ],       # Third party annotation, EMBL
      'tpg' => [ 'accession', 'name' ],       # Third party annotation, GenBank

      # Original
      'ri'  => [ 'entry_id', 'rearray_id', 'len' ], # RIKEN FANTOM DB
    }

    # Shows array that contains IDs (or ID-like strings).
    # Returns an array of arrays of strings.
    attr_reader :list_ids

    # Shows a possibly unique identifier.
    # Returns a string.
    attr_reader :entry_id

    # Parses given string.
    def initialize(str)
      @deflines = []
      @info = {}
      @list_ids = []

      @entry_id = nil

      lines = str.split("\x01")
      lines.each do |line|
        add_defline(line)
      end
    end #def initialize

    # Parses given string and adds parsed data.
    def add_defline(str)
      case str
      when /^\>?\s*((?:[^\|\s]*\|)+[^\s]+)\s*(.*)$/
        # NSIDs
        # examples:
        # >gi|9910844|sp|Q9UWG2|RL3_METVA 50S ribosomal protein L3P
        #
        # note: regexp (:?) means grouping without backreferences
        i = $1
        d = $2
        tks = i.split('|')
        tks << '' if i[-1,1] == '|'
        a = parse_NSIDs(tks)
        i = a[0].join('|')
        a.unshift('|')
        d = tks.join('|') + ' ' + d unless tks.empty?
        a << d
        this_line = a
        match_EC(d)
        parse_square_brackets(d).each do |x|
          if !match_EC(x, false) and x =~ /\A[A-Z]/ then
            di = [  x ]
            @list_ids << di
            @info['organism'] = x unless @info['organism']
          end
        end

      when /^\>?\s*([a-zA-Z0-9]+\:[^\s]+)\s*(.*)$/
        # examples:
        # >sce:YBR160W  CDC28, SRM5; cyclin-dependent protein kinase catalytic subunit [EC:2.7.1.-] [SP:CC28_YEAST]
        # >emb:CACDC28 [X80034] C.albicans CDC28 gene 
        i = $1
        d = $2
        a = parse_ColonSepID(i)
        i = a.join(':')
        this_line = [ ':', a , d ]
        match_EC(d)
        parse_square_brackets(d).each do |x|
          if !match_EC(x, false) and x =~ /:/ then
            parse_ColonSepID(x)
          elsif x =~ /\A\s*([A-Z][A-Z0-9_\.]+)\s*\z/ then
            @list_ids << [ $1 ]
          end
        end

      when /^\>?\s*(\S+)(?:\s+(.+))?$/
        # examples:
        # >ABC12345 this is test
        i = $1
        d = $2.to_s
        @list_ids << [ i.chomp('.') ]
        this_line = [  '', [ i ], d ]
        match_EC(d)
      else
        i = str
        d = ''
        match_EC(i)
        this_line = [ '', [ i ], d ]
      end

      @deflines << this_line
      @entry_id = i unless @entry_id
    end

    def match_EC(str, write_flag = true)
      di = nil
      str.scan(/EC\:((:?[\-\d]+\.){3}(:?[\-\d]+))/i) do |x|
        di = [ 'EC', $1 ]
        if write_flag then
          @info['ec'] = di[1] if (!@info['ec'] or @info['ec'].to_s =~ /\-/)
          @list_ids << di
        end
      end
      di
    end
    private :match_EC

    def parse_square_brackets(str)
      r = []
      str.scan(/\[([^\]]*)\]/) do |x|
        r << x[0]
      end
      r
    end
    private :parse_square_brackets

    def parse_ColonSepID(str)
      di = str.split(':', 2)
      di << nil if di.size <= 1 
      @list_ids << di
      di
    end
    private :parse_ColonSepID

    def parse_NSIDs(ary)
      # this method destroys ary
      data = []
      while token = ary.shift
        if labels = self.class::NSIDs[token] then
          di = [ token ]
          idtype = token
          labels.each do |x|
            token = ary.shift
            break unless token
            if self.class::NSIDs[token] then
              ary.unshift(token)
              break #each
            end
            if token.length > 0 then
              di << token
            else
              di << nil
            end
          end
          data << di
        else
          if token.length > 0 then
            # UCID (uncontrolled identifiers)
            di = [ token ]
            data << di
            @info['ucid'] = token unless @info['ucid']
          end
          break #while
        end
      end #while
      @list_ids.concat data
      data
    end #def parse_NSIDs
    private :parse_NSIDs


    # Shows original string.
    # Note that the result of this method may be different from
    # original string which is given in FastaDefline.new method.
    def to_s
      @deflines.collect { |a|
        s = a[0]
        (a[1..-2].collect { |x| x.join(s) }.join(s) + ' ' + a[-1]).strip
      }.join("\x01")
    end

    # Shows description.
    def description
      @deflines[0].to_a[-1]
    end

    # Returns descriptions.
    def descriptions
      @deflines.collect do |a|
        a[-1]
      end
    end

    # Shows ID-like strings.
    # Returns an array of strings.
    def id_strings
      r = []
      @list_ids.each do |a|
        if a.size >= 2 then
          r.concat a[1..-1].find_all { |x| x }
        else
          if a[0].to_s.size > 0 and a[0] =~ /\A[A-Za-z0-9\.\-\_]+\z/
            r << a[0]
          end
        end
      end
      r.concat( words(true, []).find_all do |x|
                 x =~ /\A[A-Z][A-Za-z0-9\_]*[0-9]+[A-Za-z0-9\_]+\z/ or
                   x =~ /\A[A-Z][A-Z0-9]*\_[A-Z0-9\_]+\z/
               end)
      r
    end

    KillWords = [
      'an', 'the', 'this', 'that',
      'is', 'are', 'were', 'was', 'be', 'can', 'may', 'might',
      'as', 'at', 'by', 'for', 'in', 'of', 'on', 'to', 'with',
      'from', 'and', 'or', 'not',
      'dna', 'rna', 'mrna', 'cdna', 'orf',
      'aa', 'nt', 'pct', 'id', 'ec', 'sp', 'subsp',
      'similar', 'involved', 'identical', 'identity',
      'cds', 'clone', 'library', 'contig', 'contigs',
      'homolog', 'homologue', 'homologs', 'homologous',
      'protein', 'proteins', 'gene', 'genes',
      'product', 'products', 'sequence', 'sequences', 
      'strain', 'strains', 'region', 'regions',
    ]
    KillWordsHash = {}
    KillWords.each { |x| KillWordsHash[x] = true }

    KillRegexpArray = [
      /\A\d{1,3}\%?\z/,
      /\A[A-Z][A-Za-z0-9\_]*[0-9]+[A-Za-z0-9\_]+\z/,
      /\A[A-Z][A-Z0-9]*\_[A-Z0-9\_]+\z/
    ]

    # Shows words used in the defline. Returns an Array.
    def words(case_sensitive = nil, kill_regexp = self.class::KillRegexpArray,
              kwhash = self.class::KillWordsHash)
      a = descriptions.join(' ').split(/[\.\,\;\:\(\)\[\]\{\}\<\>\"\'\`\~\/\|\?\!\&\@\#\s\x00-\x1f\x7f]+/)
      a.collect! do |x|
        x.sub!(/\A[\$\*\-\+]+/, '')
        x.sub!(/[\$\*\-\=]+\z/, '')
        if x.size <= 1 then
          nil
        elsif kwhash[x.downcase] then
          nil
        else
          if kill_regexp.find { |expr| expr =~ x } then
            nil
          else
            x
          end
        end
      end
      a.compact!
      a.collect! { |x| x.downcase } unless case_sensitive
      a.sort!
      a.uniq!
      a
    end

    # Returns identifires by a database name.
    def get(dbname)
      db = dbname.to_s
      r = nil
      unless r = @info[db] then
        di = @list_ids.find { |x| x[0] == db.to_s }
        if di and di.size <= 2 then
          r = di[-1]
        elsif di then
          labels = self.class::NSIDs[db]
          [ 'acc_version', 'entry_id',
            'locus', 'accession', 'number'].each do |x|
            if i = labels.index(x) then
              r = di[i+1]
              break if r
            end
          end
          r = di[1..-1].find { |x| x } unless r
        end
        @info[db] = r if r
      end
      r
    end

    # Returns an identifier by given type.
    def get_by_type(type_str)
      @list_ids.each do |x|
        if labels = self.class::NSIDs[x[0]] then
          if i = labels.index(type_str) then
            return x[i+1]
          end
        end
      end
      nil
    end

    # Returns identifiers by given type.
    def get_all_by_type(*type_strarg)
      d = []
      @list_ids.each do |x|
        if labels = self.class::NSIDs[x[0]] then
          type_strarg.each do |y|
            if i = labels.index(y) then
              d << x[i+1] if x[i+1]
            end
          end
        end
      end
      d
    end

    # Shows locus.
    # If the entry has more than two of such IDs,
    # only the first ID are shown.
    # Returns a string or nil.
    def locus
      unless defined?(@locus)
        @locus = get_by_type('locus')
      end
      @locus
    end

    # Shows GI.
    # If the entry has more than two of such IDs,
    # only the first ID are shown.
    # Returns a string or nil.
    def gi
      unless defined?(@gi) then
        @gi = get_by_type('gi')
      end
      @gi
    end

    # Shows accession with version number.
    # If the entry has more than two of such IDs,
    # only the first ID are shown.
    # Returns a string or nil.
    def acc_version
      unless defined?(@acc_version) then
        @acc_version = get_by_type('acc_version')
      end
      @acc_version
    end

    # Shows accession numbers.
    # Returns an array of strings.
    def accessions
      unless defined?(@accessions) then
        @accessions = get_all_by_type('accession', 'acc_version')
        @accessions.collect! { |x| x.sub(/\..*\z/, '') }
      end
      @accessions
    end

    # Shows an accession number.
    def accession
      unless defined?(@accession) then
        if acc_version then
          @accession = acc_version.split('.')[0]
        else
          @accession = accessions[0]
        end
      end
      @accession
    end
    
    def method_missing(name, *args)
      # raise ArgumentError,
      # "wrong # of arguments(#{args.size} for 1)" if args.size >= 2
      r = get(name, *args)
      if !r and !(self.class::NSIDs[name.to_s]) then
        raise "NameError: undefined method `#{name.inspect}'"
      end
      r
    end
    

  end #class FastaDefline

end #module Bio