[BioRuby-cvs] bioruby/lib/bio/db/embl sptr.rb,1.23,1.24

Mon Aug 8 03:27:12 EDT 2005

Update of /home/repository/bioruby/bioruby/lib/bio/db/embl
In directory pub.open-bio.org:/tmp/cvs-serv29902/lib/bio/db/embl

Modified Files:
	sptr.rb 
Log Message:
* Added Bio::SPTR#protein_name method (proposed by Luca Pireddu).
* Added Bio::SPTR#synonyms method (proposed by Luca Pireddu).
* Changed Bio::SPTR#gn to parse the new GN line format (proposed by Luca
  Pireddu).
* Changed Bio::SPTR#gene_names method (proposed by Luca Pireddu).
* Changed Bio::SPTR#gene_name method (proposed by Luca Pireddu).


Index: sptr.rb
===================================================================
RCS file: /home/repository/bioruby/bioruby/lib/bio/db/embl/sptr.rb,v
retrieving revision 1.23
retrieving revision 1.24
diff -C2 -d -r1.23 -r1.24
*** sptr.rb	25 Aug 2004 16:57:12 -0000	1.23
--- sptr.rb	8 Aug 2005 06:41:13 -0000	1.24
***************
*** 67,70 ****
--- 67,71 ----
    end
  
+   #
    def entry
      id_line('ENTRY_NAME')
***************
*** 73,76 ****
--- 74,78 ----
    alias entry_id entry
  
+   #
    def molecule
      id_line('MOLECULE_TYPE')
***************
*** 78,81 ****
--- 80,84 ----
    alias molecule_type molecule
  
+   #
    def sequence_length
      id_line('SEQUENCE_LENGTH')
***************
*** 98,102 ****
    #          #accession  -> accessions.first
  
!   @@ac_regrexp = /[OPQ][0-9][A-Z0-9]{3}[0-9]/
  
    # DT Line; date (3/entry)
--- 101,106 ----
    #          #accession  -> accessions.first
  
!   @@ac_regrexp = /[OPQ][0-9][A-Z0-9]{3}[0-9]/ 
! 
  
    # DT Line; date (3/entry)
***************
*** 133,139 ****
--- 137,179 ----
    # CONTEINS       >=0
    #
+   # Returns the proposed official name of the protein
+   def protein_name
+     name = ""
+     if de_line = fetch('DE') then
+       str = de_line[/^[^\[]*/] # everything preceding the first [ (the "contains" part)
+       name = str[/^[^(]*/].strip
+       name << ' (Fragment)' if str =~ /fragment/i
+     end
+     return name
+   end
+   # synonyms are each placed in () following the official name on the DE line
+   # Returns an array of synonyms (unofficial names)
+   def synonyms
+     ary = Array.new
+     if de_line = fetch('DE') then
+       line = de_line.sub(/\[.*\]/,'') # ignore stuff between [ and ].  That's the "contains" part
+       line.scan(/\([^)]+/) do |synonym| 
+         unless synonym =~ /fragment/i then 
+           ary << synonym[1..-1].strip # index to remove the leading (  
+         end
+       end
+     end
+     return ary
+   end
  
  
    # GN Line: Gene name(s) (>=0, optional)
+   def gn
+     return @data['GN'] if @data['GN']
+ 
+     case fetch('GN')
+     when /Name=/ then
+       return gn_uniprot_parser
+     else
+       return gn_old_parser
+     end
+   end
+ 
+   # GN Line: Gene name(s) (>=0, optional)
    # GN   HNS OR DRDX OR OSMZ OR BGLY.
    # GN   CECA1 AND CECA2.
***************
*** 146,192 ****
    #          #gene_names -> Array
    #
!   def gn 
!     unless @data['GN']
!       if get('GN').size > 0
!         names = fetch('GN').sub(/\.$/,'').split(/ AND /)
!         names.map! {|synonyms|
!           synonyms = synonyms.gsub(/\(|\)/,'').split(/ OR /).map {|e|
!             e.strip 
!           }
          }
        end
!       @data['GN'] = names
      end
      return @data['GN']
    end
!   alias gene_names gn
  
    # Bio::SPTR#gene_name -> String
    #
    def gene_name
!     begin
!       @data['GN'][0][0]
!     rescue NameError
!       nil
!     end
    end
  
    # OS Line; organism species (>=1)
!   # "OS   Trifolium repens (white clover)"
!   #
!   # OS   Genus species (name).
!   # OS   Genus species (name0) (name1).
!   # OS   Genus species (name0) (name1).
!   # OS   Genus species (name0), G s0 (name0), and G s (name1).
    #
!   # Bio::EMBLDB#os  -> Array w/in Hash
    # [{'name'=>'(Human)', 'os'=>'Homo sapiens'}, 
    #  {'name'=>'(Rat)', 'os'=>'Rattus norveticus'}]
!   # Bio::EMBLDB#os[0]['name'] => "(Human)"
!   # Bio::EMBLDB#os[0] => {'name'=>"(Human)", 'os'=>'Homo sapiens'}
!   # Bio::EMBLDB#os(0) => "Homo sapiens (Human)"
    #
!   # Bio::SPTR#os -> Array w/in Hash
    # Bio::SPTR#os(num) -> String
    
  
--- 186,298 ----
    #          #gene_names -> Array
    #
!   def gn_old_parser
!     names = Array.new
!     if get('GN').size > 0
!       names = fetch('GN').sub(/\.$/,'').split(/ AND /)
!       names.map! { |synonyms|
!         synonyms = synonyms.gsub(/\(|\)/,'').split(/ OR /).map { |e|
!           e.strip 
          }
+       }
+     end
+     return @data['GN'] = names
+   end
+   private :gn_old_parser
+ 
+ 
+   # The new format of the GN line is:
+   # GN   Name=; Synonyms=[, ...]; OrderedLocusNames=[, ...];
+   # GN   ORFNames=[, ...];
+   # 
+   # GN   and
+   #
+   # Bio::SPTR#gn -> [ <gene record>* ]
+   #   where <gene record> is:
+   #                    { :name => '...', 
+   #                      :synonyms => [ 's1', 's2', ... ],
+   #                      :loci   => [ 'l1', 'l2', ... ],
+   #                      :orfs     => [ 'o1', 'o2', ... ] 
+   #                    }
+   def gn_uniprot_parser
+     @data['GN'] = Array.new
+     gn_line = fetch('GN').strip
+     records = gn_line.split(/\s*and\s*/)
+     records.each do |record|
+       gene_hash = {:name => '', :synonyms => [], :loci => [], :orfs => []}
+       record.each(';') do |element|
+         case element
+         when /Name=/ then
+           gene_hash[:name] = $'[0..-2]
+         when /Synonyms=/ then
+           gene_hash[:synonyms] = $'[0..-2].split(/\s*,\s*/)
+         when /OrderedLocusNames=/ then
+           gene_hash[:loci] = $'[0..-2].split(/\s*,\s*/)
+         when /ORFNames=/ then
+           gene_hash[:orfs] = $'[0..-2].split(/\s*,\s*/)
+         end
        end
!       @data['GN'] << gene_hash
      end
      return @data['GN']
    end
!   private :gn_uniprot_parser
! 
! 
!   # Bio::SPTR#gene_names -> [String]
!   def gene_names
!     gn # set @data['GN'] if it hasn't been already done
!     if @data['GN'].first.class == Hash then
!       @data['GN'].collect { |element| element[:name] }
!     else
!       @data['GN'].first
!     end
!   end
! 
  
    # Bio::SPTR#gene_name -> String
    #
    def gene_name
!     gene_names.first
    end
  
+ 
    # OS Line; organism species (>=1)
!   # "OS   Genus species (name)."
!   # "OS   Genus species (name0) (name1)."
!   # "OS   Genus species (name0) (name1)."
!   # "OS   Genus species (name0), G s0 (name0), and G s (name0) (name1)."
!   # "OS   Homo sapiens (Human), and Rarrus norveticus (Rat)"
    #
!   # Bio::EMBLDB#os  -> Array of Hash
    # [{'name'=>'(Human)', 'os'=>'Homo sapiens'}, 
    #  {'name'=>'(Rat)', 'os'=>'Rattus norveticus'}]
!   # Bio::SPTR#os[0]['name'] => "(Human)"
!   # Bio::SPTR#os[0] => {'name'=>"(Human)", 'os'=>'Homo sapiens'}
!   # Bio::SPTR#os(0) => "Homo sapiens (Human)"
    #
!   # Bio::SPTR#os -> Array of Hash
    # Bio::SPTR#os(num) -> String
+   def os(num = nil)
+     unless @data['OS']
+       os = Array.new
+       fetch('OS').split(/, and|, /).each do |tmp|
+         if tmp =~ /([A-Z][a-z]* *[\w\d \:\'\+\-]+[\w\d])/
+           org = $1
+           tmp =~ /(\(.+\))/ 
+           os.push({'name' => $1, 'os' => org})
+         else
+           raise "Error: OS Line. #{$!}\n#{fetch('OS')}\n"
+         end
+       end
+       @data['OS'] = os
+     end
+ 
+     if num
+       # EX. "Trifolium repens (white clover)"
+       return "#{@data['OS'][num]['os']} #{@data['OS'][num]['name']}"
+     else
+       return @data['OS']
+     end
+   end
    
  
***************
*** 211,220 ****
    def ox
      unless @data['OX']
!       tmp = fetch('OX').sub(/\.$/,'').split(/;/).map {|e| e.strip }
        hsh = Hash.new
!       tmp.each {|e|
          db,refs = e.split(/=/)
          hsh[db] = refs.split(/, */)
!       }
        @data['OX'] = hsh
      end
--- 317,326 ----
    def ox
      unless @data['OX']
!       tmp = fetch('OX').sub(/\.$/,'').split(/;/).map { |e| e.strip }
        hsh = Hash.new
!       tmp.each do |e|
          db,refs = e.split(/=/)
          hsh[db] = refs.split(/, */)
!       end
        @data['OX'] = hsh
      end
***************
*** 272,276 ****
  
        begin
!         fetch('CC').split(/#{cmt}/)[0].sub(dlm,'').split(dlm).each {|tmp|
            if /(^[A-Z ]+[A-Z]): (.+)/ =~ tmp
              key  = $1
--- 378,382 ----
  
        begin
!         fetch('CC').split(/#{cmt}/)[0].sub(dlm,'').split(dlm).each do |tmp|
            if /(^[A-Z ]+[A-Z]): (.+)/ =~ tmp
              key  = $1
***************
*** 282,296 ****
              end
            else
!             raise ["Error: #{entry_id}: CC Lines", '',
                     tmp, '', '', fetch('CC'),''].join("\n")
            end
!         }
        rescue NameError
          if fetch('CC') == ''
            return {}
          else
!           raise "Error: Invalid CC Lines: #{entry_id}: " + 
!                         "\n'#{self.get('CC')}'\n" +
!                         "(#{$!})"
          end
        rescue NoMethodError
--- 388,401 ----
              end
            else
!             raise ["Error: [#{entry_id}]: CC Lines", '',
                     tmp, '', '', fetch('CC'),''].join("\n")
            end
!         end
        rescue NameError
          if fetch('CC') == ''
            return {}
          else
!           raise ["Error: Invalid CC Lines: [#{entry_id}]: ",
!                  "\n'#{self.get('CC')}'\n", "(#{$!})"].join
          end
        rescue NoMethodError
***************
*** 314,319 ****
  
        # Event, Named isoforms, Comment, [Name, Synonyms, IsoId, Sequnce]+
!       tmp = {'Event'=>nil, 'Named isoforms'=>nil, 'Comment'=>nil,
!              'Variants' => []}
  
        if /Event=(.+?);/ =~ ap
--- 419,423 ----
  
        # Event, Named isoforms, Comment, [Name, Synonyms, IsoId, Sequnce]+
!       tmp = {'Event' => nil, 'Named isoforms' => nil, 'Comment' => nil, 'Variants'  => []}
  
        if /Event=(.+?);/ =~ ap
***************
*** 326,332 ****
          tmp['Comment'] = $1
        end
!       ap.scan(/Name=.+?Sequence=.+?;/).each {|ent|
          tmp['Variants'] << cc_ap_variants_parse(ent)
!       }
        return tmp
  
--- 430,436 ----
          tmp['Comment'] = $1
        end
!       ap.scan(/Name=.+?Sequence=.+?;/).each do |ent|
          tmp['Variants'] << cc_ap_variants_parse(ent)
!       end
        return tmp
  
***************
*** 338,344 ****
        return db unless db
  
!       db.each {|e|
!         db = {'NAME'=>nil,'NOTE'=>nil,'WWW'=>nil,'FTP'=>nil}
!         e.sub(/.$/,'').split(/;/).each {|line|
            case line
            when /NAME=(.+)/
--- 442,448 ----
        return db unless db
  
!       db.each do |e|
!         db = {'NAME' => nil, 'NOTE' => nil, 'WWW' => nil, 'FTP' => nil}
!         e.sub(/.$/,'').split(/;/).each do |line|
            case line
            when /NAME=(.+)/
***************
*** 351,357 ****
              db['FTP'] = $1
            end 
!         }
          tmp.push(db)
!       }
        return tmp
  
--- 455,461 ----
              db['FTP'] = $1
            end 
!         end
          tmp.push(db)
!       end
        return tmp
  
***************
*** 362,368 ****
        return ms unless ms
  
!       ms.each {|m|
          mass = {'MW'=>nil,'MW_ERR'=>nil,'METHOD'=>nil,'RANGE'=>nil}
!         m.sub(/.$/,'').split(/;/).each {|line|
            case line
            when /MW=(.+)/
--- 466,472 ----
        return ms unless ms
  
!       ms.each do |m|
          mass = {'MW'=>nil,'MW_ERR'=>nil,'METHOD'=>nil,'RANGE'=>nil}
!         m.sub(/.$/,'').split(/;/).each do |line|
            case line
            when /MW=(.+)/
***************
*** 375,381 ****
              mass['RANGE'] = $1          # RANGE class ? 
            end 
!         }
          tmp.push(mass)
!       }
        return tmp
  
--- 479,485 ----
              mass['RANGE'] = $1          # RANGE class ? 
            end 
!         end
          tmp.push(mass)
!       end
        return tmp
  
***************
*** 390,402 ****
  
    def cc_ap_variants_parse(ent)
!     tmp = {}
!     ent.split(/; /).map {|e| e.split(/=/) }.each {|e|
        case e[0]
        when 'Sequence'
          e[1] = e[1].sub(/;/,'').split(/, /)
        end
!       tmp[e[0]] = e[1]
!     }
!     tmp
    end
    private :cc_ap_variants_parse
--- 494,506 ----
  
    def cc_ap_variants_parse(ent)
!     hsh = {}
!     ent.split(/; /).map {|e| e.split(/=/) }.each do |e|
        case e[0]
        when 'Sequence'
          e[1] = e[1].sub(/;/,'').split(/, /)
        end
!       hsh[e[0]] = e[1]
!     end
!     return hsh
    end
    private :cc_ap_variants_parse
***************
*** 404,409 ****
  
  
- 
- 
    # DR Line; defabases cross-reference (>=0)
    # a cross_ref pre one line
--- 508,511 ----
***************
*** 419,423 ****
  
  
- 
    # KW Line; keyword (>=1)
    # KW   [Keyword;]+
--- 521,524 ----
***************
*** 471,475 ****
            end
  
- 
            case last_feature
            when 'VARSPLIC', 'VARIANT', 'CONFLICT'
--- 572,575 ----
***************
*** 502,507 ****
        end
  
!       table.each_key {|k|
!         table[k].each {|e|
            if / -> / =~ e['Description']
              pattern = /([A-Z][A-Z ]*[A-Z]*) -> ([A-Z][A-Z ]*[A-Z]*)/
--- 602,607 ----
        end
  
!       table.each_key do |k|
!         table[k].each do |e|
            if / -> / =~ e['Description']
              pattern = /([A-Z][A-Z ]*[A-Z]*) -> ([A-Z][A-Z ]*[A-Z]*)/
***************
*** 522,528 ****
              }
            end
!         }
!       }
! 
        @data['FT'] = table
      end
--- 622,627 ----
              }
            end
!         end
!       end
        @data['FT'] = table
      end
***************
*** 549,553 ****
      unless @data['SQ']
        if fetch('SQ') =~ /(\d+) AA\; (\d+) MW; (.+) CRC64;/
!         @data['SQ'] = { 'aalen'=>$1.to_i, 'MW'=>$2.to_i, 'CRC64'=>$3 }
        else
          raise "Invalid SQ Line: \n'#{fetch('SQ')}'"
--- 648,652 ----
      unless @data['SQ']
        if fetch('SQ') =~ /(\d+) AA\; (\d+) MW; (.+) CRC64;/
!         @data['SQ'] = { 'aalen' => $1.to_i, 'MW' => $2.to_i, 'CRC64' => $3 }
        else
          raise "Invalid SQ Line: \n'#{fetch('SQ')}'"
***************
*** 624,627 ****
--- 723,727 ----
      cmd "Bio::SPTR.new($ent).gn", 'GN'
      cmd "Bio::SPTR.new($ent).gene_name"
+     cmd "Bio::SPTR.new($ent).gene_names"
  
      cmd "Bio::SPTR.new($ent).dt", "DT"
***************
*** 632,635 ****
--- 732,737 ----
      cmd "Bio::SPTR.new($ent).de", 'DE'
      cmd "Bio::SPTR.new($ent).definition"
+     cmd "Bio::SPTR.new($ent).protein_name"
+     cmd "Bio::SPTR.new($ent).synonyms"
  
      cmd "Bio::SPTR.new($ent).kw", 'KW'
***************
*** 684,690 ****
  
  --- Bio::SPTR#entry_id -> str
- 
  --- Bio::SPTR#molecule -> str
- 
  --- Bio::SPTR#sequence_length -> int
      
--- 786,790 ----
***************
*** 695,703 ****
  --- Bio::SPTR#accessions -> ary
  --- Bio::SPTR#accession -> accessions.first
   
  === GN line (Gene name(s))
  
! --- Bio::SPTR#gn -> [ary, ...]
! --- Bio::SPTR#gene_name -> gn[0][0]
  
  === DT lines (Date) 
--- 795,806 ----
  --- Bio::SPTR#accessions -> ary
  --- Bio::SPTR#accession -> accessions.first
+ 
   
  === GN line (Gene name(s))
  
! --- Bio::SPTR#gn -> [ary, ...] or [{:name => str, :synonyms => [], :loci => [], :orfs => []}]
! --- Bio::SPTR#gene_name -> str
! --- Bio::SPTR#gene_names -> [str] or [str]
! 
  
  === DT lines (Date) 
***************
*** 708,715 ****
--- 811,828 ----
        key := (created|annotation|sequence)
  
+ 
  === DE lines (Description)
  
  --- Bio::SPTR#de -> str
               #definition -> str
+ 
+ --- Bio::SPTR#protein_name
+ 
+       Returns the proposed official name of the protein
+ 
+ 
+ --- Bio::SPTR#synonyms
+ 
+       Returns an array of synonyms (unofficial names)
  
  === KW lines (Keyword)