[BioRuby-cvs] bioruby/lib/bio/db/embl sptr.rb,1.33,1.34

Sat Jul 15 15:29:28 UTC 2006

Update of /home/repository/bioruby/bioruby/lib/bio/db/embl
In directory dev.open-bio.org:/tmp/cvs-serv8890/lib/bio/db/embl

Modified Files:
	sptr.rb 
Log Message:
* Refactored code for parsing CC lines.
* Added Bio::SPTR#ref method.
* Added comprehensive tests for the recent updates of UniProt. 
* Added OH line parser (Bio::SPTR#oh).
* Added HI line parser (Bio::SPTR#hi). 


Index: sptr.rb
===================================================================
RCS file: /home/repository/bioruby/bioruby/lib/bio/db/embl/sptr.rb,v
retrieving revision 1.33
retrieving revision 1.34
diff -C2 -d -r1.33 -r1.34
*** sptr.rb	11 Jul 2006 15:52:51 -0000	1.33
--- sptr.rb	15 Jul 2006 15:29:26 -0000	1.34
***************
*** 187,200 ****
    # === GN Line: Gene name(s) (>=0, optional)
    def gn
!     return @data['GN'] if @data['GN']
! 
!     case fetch('GN')
!     when /Name=/ then
!       return gn_uniprot_parser
!     else
!       return gn_old_parser
      end
    end
  
    # returns contents in the old style GN line.
    # === GN Line: Gene name(s) (>=0, optional)
--- 187,202 ----
    # === GN Line: Gene name(s) (>=0, optional)
    def gn
!     unless @data['GN']
!       case fetch('GN')
!       when /Name=/,/ORFNames=/
!         @data['GN'] = gn_uniprot_parser
!       else
!         @data['GN'] = gn_old_parser
!       end
      end
+     @data['GN']
    end
  
+ 
    # returns contents in the old style GN line.
    # === GN Line: Gene name(s) (>=0, optional)
***************
*** 218,222 ****
        }
      end
!     return @data['GN'] = names
    end
    private :gn_old_parser
--- 220,224 ----
        }
      end
!     @data['GN'] = names
    end
    private :gn_old_parser
***************
*** 348,351 ****
--- 350,380 ----
    end
  
+   # === The OH Line;  
+   #
+   # OH   NCBI_TaxID=TaxID; HostName.
+   # http://br.expasy.org/sprot/userman.html#OH_line
+   def oh
+     unless @data['OH']
+       @data['OH'] = fetch('OH').split("\. ").map {|x|
+         if x =~ /NCBI_TaxID=(\d+);/
+           taxid = $1
+         else
+           raise ArgumentError, ["Error: Invalid OH line format (#{self.entry_id}):",
+                                 $!, "\n", get('OH'), "\n"].join
+           
+         end
+         if x =~ /NCBI_TaxID=\d+; (.+)/ 
+           host_name = $1
+           host_name.sub!(/\.$/, '')
+         else
+           host_name = nil
+         end
+         {'NCBI_TaxID' => taxid, 'HostName' => host_name}
+       }
+     end
+     @data['OH']
+   end
+ 
+ 
    
    # Bio::EMBLDB::Common#ref -> Array
***************
*** 353,356 ****
--- 382,543 ----
    # RN RC RP RX RA RT RL
  
+   # returns contents in the R lines.
+   # * Bio::EMBLDB::Common#ref -> [ <refernece information Hash>* ]
+   # where <reference information Hash> is:
+   #  {'RN' => '', 'RC' => '', 'RP' => '', 'RX' => '', 
+   #   'RA' => '', 'RT' => '', 'RL' => '', 'RG' => ''}
+   # 
+   # R Lines
+   # * RN RC RP RX RA RT RL RG
+   def ref
+     unless @data['R']
+       @data['R'] = [get('R').split(/\nRN   /)].flatten.map { |str|
+         hash = {'RN' => '', 'RC' => '', 'RP' => '', 'RX' => '', 
+                'RA' => '', 'RT' => '', 'RL' => '', 'RG' => ''}
+         str = 'RN   ' + str unless /^RN   / =~ str
+ 
+         str.split("\n").each do |line|
+           if /^(R[NPXARLCTG])   (.+)/ =~ line
+             hash[$1] += $2 + ' '
+           else
+             raise "Invalid format in R lines, \n[#{line}]\n"
+           end
+         end
+ 
+         hash['RN'] = set_RN(hash['RN'])
+         hash['RC'] = set_RC(hash['RC'])
+         hash['RP'] = set_RP(hash['RP'])
+         hash['RX'] = set_RX(hash['RX'])
+         hash['RA'] = set_RA(hash['RA'])
+         hash['RT'] = set_RT(hash['RT'])
+         hash['RL'] = set_RL(hash['RL'])
+         hash['RG'] = set_RG(hash['RG'])
+ 
+         hash
+       }
+ 
+     end
+     @data['R']
+   end
+ 
+   def set_RN(data)
+     data.strip
+   end
+ 
+   def set_RC(data)
+     data.scan(/([STP]\w+)=(.+);/).map { |comment|
+       [comment[1].split(/, and |, /)].flatten.map { |text|
+         {'Token' => comment[0], 'Text' => text}
+       }
+     }.flatten
+   end
+   private :set_RC
+ 
+   def set_RP(data)
+     data = data.strip
+     data = data.sub(/\.$/, '')
+     data.split(/, AND |, /i).map {|x| 
+       x = x.strip
+       x = x.gsub('  ', ' ')
+     }
+   end
+   private :set_RP
+ 
+   def set_RX(data)
+     rx = {'MEDLINE' => nil, 'PubMed' => nil, 'DOI' => nil}
+     if data =~ /MEDLINE=(.+?);/
+       rx['MEDLINE'] = $1
+     end
+     if data =~ /PubMed=(.+?);/
+       rx['PubMed'] = $1
+     end
+     if data =~ /DOI=(.+?);/
+       rx['DOI'] = $1
+     end
+     rx
+   end
+   private :set_RX
+ 
+   def set_RA(data)
+     data = data.sub(/; *$/, '')
+   end
+   private :set_RA
+ 
+   def set_RT(data)
+     data = data.sub(/; *$/, '')
+     data = data.gsub(/(^"|"$)/, '')
+   end
+   private :set_RT
+ 
+   def set_RL(data)
+     data = data.strip
+   end
+   private :set_RL
+ 
+   def set_RG(data)
+     data = data.split('; ')
+   end
+   private :set_RG
+ 
+ 
+ 
+   # returns Bio::Reference object from Bio::EMBLDB::Common#ref.
+   # * Bio::EMBLDB::Common#ref -> Bio::References
+   def references
+     unless @data['references']
+       ary = self.ref.map {|ent|
+         hash = Hash.new('')
+         ent.each {|key, value|
+           case key
+           when 'RA'
+             hash['authors'] = value.split(/, /)
+           when 'RT'
+             hash['title'] = value
+           when 'RL'
+             if value =~ /(.*) (\d+) \((\d+)\), (\d+-\d+) \((\d+)\)$/
+               hash['journal'] = $1
+               hash['volume']  = $2
+               hash['issue']   = $3
+               hash['pages']   = $4
+               hash['year']    = $5
+             else
+               hash['journal'] = value
+             end
+           when 'RX'  # PUBMED, MEDLINE
+             value.split('.').each {|item|
+               tag, xref = item.split(/; /).map {|i| i.strip }
+               hash[ tag.downcase ]  = xref
+             }
+           end
+         }
+         Reference.new(hash)
+       }
+       @data['references'] = References.new(ary)
+     end
+     @data['references']
+   end
+ 
+ 
+ 
+ 
+ 
+ 
+   # === The HI line
+   # Bio::SPTR#hi #=> hash
+   def hi
+     unless @data['HI']
+       @data['HI'] = []
+       fetch('HI').split(/\. /).each do |hlist|
+         hash = {'Category' => '',  'Keywords' => [], 'Keyword' => ''}
+         hash['Category'], hash['Keywords'] = hlist.split(': ')
+         hash['Keywords'] = hash['Keywords'].split('; ')
+         hash['Keyword'] = hash['Keywords'].pop
+         hash['Keyword'].sub!(/\.$/, '')
+         @data['HI'] << hash
+       end
+     end
+     @data['HI']
+   end
+ 
  
    @@cc_topics = ['PHARMACEUTICAL',
***************
*** 422,429 ****
    # See also http://www.expasy.org/sprot/userman.html#CC_line
    #
!   def cc(tag = nil)
      unless @data['CC']
        cc  = Hash.new
!       cmt = '-' * (77 - 4 + 1)
        dlm = /-!- /
  
--- 609,616 ----
    # See also http://www.expasy.org/sprot/userman.html#CC_line
    #
!   def cc(topic = nil)
      unless @data['CC']
        cc  = Hash.new
!       comment_border= '-' * (77 - 4 + 1)
        dlm = /-!- /
  
***************
*** 433,446 ****
        cc_raw = fetch('CC')
  
        cc_raw.sub!(/ *---.+---/m, '')
        # Not any CC Lines without the copyright statement.
        return cc if cc_raw == ''
  
- 
        begin
!         cc_raw.split(/#{cmt}/)[0].sub(dlm,'').split(dlm).each do |tmp|
            if /(^[A-Z ]+[A-Z]): (.+)/ =~ tmp
              key  = $1
!             body = $2.gsub(/- (?!AND)/,'-')
              unless cc[key]
                cc[key] = [body]
--- 620,640 ----
        cc_raw = fetch('CC')
  
+       # Removing the copyright statement.
        cc_raw.sub!(/ *---.+---/m, '')
+ 
        # Not any CC Lines without the copyright statement.
        return cc if cc_raw == ''
  
        begin
!         cc_raw, copyright = cc_raw.split(/#{comment_border}/)[0]
!         cc_raw = cc_raw.sub(dlm,'')
!         cc_raw.split(dlm).each do |tmp|
!           tmp = tmp.strip
! 
            if /(^[A-Z ]+[A-Z]): (.+)/ =~ tmp
              key  = $1
!             body = $2
!             body.gsub!(/- (?!AND)/,'-')
!             body.strip!
              unless cc[key]
                cc[key] = [body]
***************
*** 449,454 ****
              end
            else
!             raise ["Error: [#{entry_id}]: CC Lines", '',
!                    tmp, '', '', fetch('CC'),''].join("\n")
            end
          end
--- 643,648 ----
              end
            else
!             raise ["Error: [#{entry_id}]: CC Lines", '"', tmp, '"',
!                    '', get('CC'),''].join("\n")
            end
          end
***************
*** 466,493 ****
      end
  
-     case tag
-     when 'ALTERNATIVE PRODUCTS'
-       ap = @data['CC']['ALTERNATIVE PRODUCTS'].to_s
-       return ap unless ap
- 
-       # Event, Named isoforms, Comment, [Name, Synonyms, IsoId, Sequnce]+
-       tmp = {'Event' => nil, 'Named isoforms' => nil, 'Comment' => nil, 
-              'Variants'  => []}
- 
-       if /Event=(.+?);/ =~ ap
-         tmp['Event'] = $1
-       end
-       if /Named isoforms=(\S+?);/ =~ ap
-         tmp['Named isoforms'] = $1
-       end
-       if /Comment=(.+?);/m =~ ap
-         tmp['Comment'] = $1
-       end
-       ap.scan(/Name=.+?Sequence=.+?;/).each do |ent|
-         tmp['Variants'] << cc_ap_variants_parse(ent)
-       end
-       return tmp
- 
  
      when 'DATABASE'
        # DATABASE: NAME=Text[; NOTE=Text][; WWW="Address"][; FTP="Address"].
--- 660,719 ----
      end
  
  
+     case topic
+     when 'ALLERGEN'
+       return @data['CC'][topic]
+     when 'ALTERNATIVE PRODUCTS'
+       return cc_alternative_products(@data['CC'][topic])
+     when 'BIOPHYSICOCHEMICAL PROPERTIES'
+       return cc_biophysiochemical_properties(@data['CC'][topic])
+     when 'BIOTECHNOLOGY'
+       return @data['CC'][topic]
+     when 'CATALITIC ACTIVITY'
+       return cc_catalytic_activity(@data['CC'][topic])
+     when 'CAUTION'
+       return cc_caution(@data['CC'][topic])
+     when 'COFACTOR'
+       return @data['CC'][topic]
+     when 'DEVELOPMENTAL STAGE'
+       return @data['CC'][topic].to_s
+     when 'DISEASE'
+       return @data['CC'][topic].to_s
+     when 'DOMAIN'
+       return @data['CC'][topic]
+     when 'ENZYME REGULATION'
+       return @data['CC'][topic].to_s
+     when 'FUNCTION'
+       return @data['CC'][topic].to_s
+     when 'INDUCTION'
+       return @data['CC'][topic].to_s
+     when 'INTERACTION'
+       return cc_interaction(@data['CC'][topic])
+     when 'MASS SPECTROMETRY'
+       return cc_mass_spectrometry(@data['CC'][topic])
+     when 'MISCELLANEOUS'
+       return @data['CC'][topic]
+     when 'PATHWAY'
+       return cc_pathway(@data['CC'][topic])
+     when 'PHARMACEUTICAL'
+       return @data['CC'][topic]
+     when 'POLYMORPHISM'
+       return @data['CC'][topic]
+     when 'PTM'
+       return @data['CC'][topic]
+     when 'RNA EDITING'
+       return cc_rna_editing(@data['CC'][topic])
+     when 'SIMILARITY'
+       return @data['CC'][topic]
+     when 'SUBCELLULAR LOCATION'
+       return cc_subcellular_location(@data['CC'][topic])
+     when 'SUBUNIT'
+       return @data['CC'][topic]
+     when 'TISSUE SPECIFICITY'
+       return @data['CC'][topic]
+     when 'TOXIC DOSE'
+       return @data['CC'][topic]
+     when 'WEB RESOURCE'
+       return cc_web_resource(@data['CC'][topic])
      when 'DATABASE'
        # DATABASE: NAME=Text[; NOTE=Text][; WWW="Address"][; FTP="Address"].
***************
*** 513,566 ****
        end
        return tmp
- 
-     when 'MASS SPECTOROMETRY'
-       # MASS SPECTROMETRY: MW=XXX[; MW_ERR=XX][; METHOD=XX][;RANGE=XX-XX].
-       tmp = Array.new
-       ms = @data['CC']['MASS SPECTOROMETRY']
-       return ms unless ms
- 
-       ms.each do |m|
-         mass = {'MW' => nil,'MW_ERR' => nil,'METHOD' => nil,'RANGE' => nil}
-         m.sub(/.$/,'').split(/;/).each do |line|
-           case line
-           when /MW=(.+)/
-             mass['MW'] = $1.to_f
-           when /MW_ERR=(.+)/
-             mass['MW_ERR'] = $1.to_f
-           when /METHOD="(.+)"/
-             mass['METHOD'] = $1.to_s
-           when /RANGE="(\d+-\d+)"/ 
-             mass['RANGE'] = $1          # RANGE class ? 
-           end 
-         end
-         tmp.push(mass)
-       end
-       return tmp
- 
-     when 'INTERACTION'
-       return cc_interaction_parse(@data['CC']['INTERACTION'].to_s)
- 
      when nil
        return @data['CC']
- 
      else
!       return @data['CC'][tag]
      end
    end
  
  
!   def cc_ap_variants_parse(ent)
!     hsh = {}
!     ent.split(/; /).map {|e| e.split(/=/) }.each do |e|
        case e[0]
!       when 'Sequence'
          e[1] = e[1].sub(/;/,'').split(/, /)
        end
!       hsh[e[0]] = e[1]
      end
!     return hsh
    end
!   private :cc_ap_variants_parse
  
  
--- 739,827 ----
        end
        return tmp
      when nil
        return @data['CC']
      else
!       return @data['CC'][topic]
      end
    end
  
  
+   def cc_alternative_products(data)
+     ap = data.to_s
+     return ap unless ap
  
!     # Event, Named isoforms, Comment, [Name, Synonyms, IsoId, Sequnce]+
!     tmp = {'Event' => "", 'Named isoforms' => "", 'Comment' => "", 
!            'Variants'  => []}
!     if /Event=(.+?);/ =~ ap
!       tmp['Event'] = $1
!       tmp['Event'] = tmp['Event'].sub(/;/,'').split(/, /)
!     end
!     if /Named isoforms=(\S+?);/ =~ ap
!       tmp['Named isoforms'] = $1
!     end
!     if /Comment=(.+?);/m =~ ap
!       tmp['Comment'] = $1
!     end
!     ap.scan(/Name=.+?Sequence=.+?;/).each do |ent|
!       tmp['Variants'] << cc_alternative_products_variants(ent)
!     end
!     return tmp
!   end
!   private :cc_alternative_products
! 
!   def cc_alternative_products_variants(data)
!     variant = {'Name' => '', 'Synonyms' => [], 'IsoId' => [], 'Sequence' => []}
!     data.split(/; /).map {|x| x.split(/=/) }.each do |e|
        case e[0]
!       when 'Sequence', 'Synonyms', 'IsoId'
          e[1] = e[1].sub(/;/,'').split(/, /)
        end
!       variant[e[0]] = e[1]
      end
!     variant
    end
!   private :cc_alternative_products_variants
! 
! 
!   def cc_biophysiochemical_properties(data)
!     data = data[0]
! 
!     hash = {'Absorption' => {}, 
!             'Kinetic parameters' => {},
!             'pH dependence' => "",
!             'Redox potential' => "",
!             'Temperature dependence' => ""}
!     if data =~ /Absorption: Abs\(max\)=(.+?);/
!       hash['Absorption']['Abs(max)'] = $1
!     end
!     if data =~ /Absorption: Abs\(max\)=.+; Note=(.+?);/
!       hash['Absorption']['Note'] = $1
!     end
!     if data =~ /Kinetic parameters: KM=(.+?); Vmax=(.+?);/
!       hash['Kinetic parameters']['KM'] = $1
!       hash['Kinetic parameters']['Vmax'] = $2
!     end
!     if data =~ /Kinetic parameters: KM=.+; Vmax=.+; Note=(.+?);/
!       hash['Kinetic parameters']['Note'] = $1
!     end
!     if data =~ /pH dependence: (.+?);/
!       hash['pH dependence'] = $1
!     end
!     if data =~ /Redox potential: (.+?);/
!       hash['Redox potential'] = $1
!     end
!     if data =~ /Temperature dependence: (.+?);/
!       hash['Temperature dependence'] = $1
!     end
!     hash
!   end
!   private :cc_biophysiochemical_properties
! 
! 
!   def cc_caution(data)
!     data.to_s
!   end
!   private :cc_caution
  
  
***************
*** 568,583 ****
    #
    #   CC       P46527:CDKN1B; NbExp=1; IntAct=EBI-359815, EBI-519280;
!   def cc_interaction_parse(str)
      it = str.scan(/(.+?); NbExp=(.+?); IntAct=(.+?);/)
      it.map {|ent|
!       {:partner_id => ent[0].strip,
!        :nbexp      => ent[1].strip, 
!        :intact_acc => ent[2].split(', ') }
      }
    end
!   private :cc_interaction_parse
  
    # returns databases cross-references in the DR lines.
!   # * Bio::EMBLDB#dr  -> Hash w/in Array
    #
    # === DR Line; defabases cross-reference (>=0)
--- 829,944 ----
    #
    #   CC       P46527:CDKN1B; NbExp=1; IntAct=EBI-359815, EBI-519280;
!   def cc_interaction(data)
!     str = data.to_s
      it = str.scan(/(.+?); NbExp=(.+?); IntAct=(.+?);/)
      it.map {|ent|
!       ent.map! {|x| x.strip }
!       if ent[0] =~ /^(.+):(.+)/
!         spac = $1
!         spid = $2.split(' ')[0]
!         optid = nil
!       elsif ent[0] =~ /Self/
!         spac = self.entry_id
!         spid = self.entry_id
!         optid = nil
!       end
!       if ent[0] =~ /^.+:.+ (.+)/
!         optid = $1
!       end
! 
!       {'SP_Ac' => spac,
!        'identifier' => spid,
!        'NbExp' => ent[1],
!        'IntAct' => ent[2].split(', '),
!        'optional_identifier' => optid}
      }
    end
!   private :cc_interaction
! 
! 
!   def cc_mass_spectrometry(data)
!     # MASS SPECTROMETRY: MW=XXX[; MW_ERR=XX][; METHOD=XX][;RANGE=XX-XX].
!     return data unless data
! 
!     data.map { |m|
!       mass = {'MW' => nil, 'MW_ERR' => nil, 'METHOD' => nil, 'RANGE' => nil,
!               'NOTE' => nil}
!       m.sub(/.$/,'').split(/;/).each do |line|
!         case line
!         when /MW=(.+)/
!           mass['MW'] = $1
!         when /MW_ERR=(.+)/
!           mass['MW_ERR'] = $1
!         when /METHOD=(.+)/
!           mass['METHOD'] = $1
!         when /RANGE=(\d+-\d+)/ 
!           mass['RANGE'] = $1          # RANGE class ? 
!         when /NOTE=(.+)/
!           mass['NOTE'] = $1
!         end 
!       end
!       mass
!     }
!   end
!   private :cc_mass_spectrometry
! 
! 
!   def cc_pathway(data)
!     data.map {|x| x.sub(/\.$/, '') }.map {|x|
!       x.split(/; | and |: /)
!     }[0]
!   end
!   private :cc_pathway
! 
! 
!   def cc_rna_editing(data)
!  data = data.to_s
!     entry = {'Modified_positions' => [], 'Note' => ""}
!     if data =~ /Modified_positions=(.+?)(\.|;)/
!       entry['Modified_positions'] = $1.sub(/\.$/, '').split(', ')
!     else
!       raise ArgumentError, "Invarid CC RNA Editing lines (#{self.entry_id}):#{$!}\n#{get('CC')}"
!     end
!     if data =~ /Note=(.+)/
!       entry['Note'] = $1
!     end
!     entry
!   end
!   private :cc_rna_editing
! 
! 
!   def cc_subcellular_location(data)
!     data.map {|x| 
!       x.split('. ').map {|y| 
!         y.split('; ').map {|z| 
!           z.sub(/\.$/, '') 
!         } 
!       } 
!     }[0]
!   end
!   private :cc_subcellular_location
! 
!   
!   # CC   -!- WEB RESOURCE: NAME=ResourceName[; NOTE=FreeText][; URL=WWWAddress].  
!   def cc_web_resource(data)
!     data.map {|x|
!       entry = {'NAME' => nil, 'NOTE' => nil, 'URL' => nil}
!       x.split(';').each do |y|
!         case y
!         when /NAME=(.+)/
!           entry['NAME'] = $1.strip
!         when /NOTE=(.+)/
!           entry['NOTE'] = $1.strip
!         when /URL="(.+)"/
!           entry['URL'] = $1.strip
!         end
!       end
!       entry
!     }
!   end
!   
  
    # returns databases cross-references in the DR lines.
!   # * Bio::SPTR#dr  -> Hash w/in Array
    #
    # === DR Line; defabases cross-reference (>=0)
***************
*** 591,594 ****
--- 952,973 ----
      'SWISS-2DPAGE','TIGR','TRANSFAC','TUBERCULIST','WORMPEP','YEPD','ZFIN']
  
+   # Backup Bio::EMBLDB#dr as embl_dr
+   alias :embl_dr :dr 
+ 
+   # Bio::SPTR#dr
+   def dr(key = nil)
+     unless key
+       embl_dr
+     else
+       embl_dr[key].map {|x|
+         {'Accession' => x[0],
+          'Version' => x[1],
+          ' ' => x[2],
+          'Molecular Type' => x[3]}
+       }
+     end
+   end
+ 
+ 
    # Bio::EMBLDB::Common#kw - Array
    #                    #keywords  -> Array
***************
*** 598,602 ****
  
  
!   # returns conteins in the feature table.
    #
    # == Examples
--- 977,981 ----
  
  
!   # returns contents in the feature table.
    #
    # == Examples
***************
*** 612,621 ****
    #      feature['FTId'] #=> ''
    #      feature['diff'] #=> []
    #    end
    #  end
    #
    # * Bio::SPTR#ft -> Hash
!   #    {FEATURE_KEY => [{'From' => int, 'To' => int, 'diff' => [],
!   #                      'Description' => aStr, 'FTId' => aStr}],...}
    #
    # returns an Array of the information about the feature_name in the feature table.
--- 991,1003 ----
    #      feature['FTId'] #=> ''
    #      feature['diff'] #=> []
+   #      feature['original'] #=> [feature_key, '1', '21', '', '']
    #    end
    #  end
    #
    # * Bio::SPTR#ft -> Hash
!   #    {FEATURE_KEY => [{'From' => int, 'To' => int, 
!   #                      'Description' => aStr, 'FTId' => aStr,
!   #                      'diff' => [original_residues, changed_residues],
!   #                      'original' => aAry }],...}
    #
    # returns an Array of the information about the feature_name in the feature table.
***************
*** 634,637 ****
--- 1016,1024 ----
    #   -----   -----------------
    #
+   # Note: 'FROM' and 'TO' endopoints are allowed to use non-numerial charactors 
+   # including '<', '>' or '?'. (c.f. '<1', '?42')
+   #
+   # See also http://www.expasy.org/sprot/userman.html#FT_line
+   #
    def ft(feature_key = nil)
      return ft[feature_key] if feature_key
***************
*** 640,693 ****
      table = []
      begin
!     get('FT').split("\n").each do |line|
!       if line =~ /^FT   \w/
!         feature = line.chomp.ljust(74)
!         table << [feature[ 5..12].strip,   # Feature Name
!                   feature[14..19].strip,   # From
!                   feature[21..26].strip,   # To
!                   feature[34..74].strip ]  # Description
!       else
!         table.last << line.chomp.sub!(/^FT +/, '')
!       end
!     end
! 
!     # Join Desctiption lines
!     table = table.map { |feature| 
!       ftid = feature.pop if feature.last =~ /FTId=/
!       if feature.size > 4
!         feature = [feature[0], feature[1], feature[2], 
!                    feature[3, feature.size - 3].join(" ")]
        end
-       feature << ftid
-     }
  
!     hash = {}
!     table.each do |feature|
!       hash[feature[0]] = [] unless hash[feature[0]]
! 
!       hash[feature[0]] << {
!         'From' => feature[1].to_i, 
!         'To'   => feature[2].to_i, 
!         'Description' => feature[3], 
!         'FTId' => feature[4].to_s.sub(/\/FTId=/, '').sub(/\.$/, ''),
!         'diff' => []
        }
  
!       case feature[0]
!       when 'VARSPLIC', 'VARIANT', 'VAR_SEQ', 'CONFLICT'
!         case hash[feature[0]].last['Description']
!         when /(\w[\w ]*\w*) - ?> (\w[\w ]*\w*)/
!           original_res = $1
!           changed_res = $2
!           original_res = original_res.gsub(/ /,'').strip
!           chenged_res = changed_res.gsub(/ /,'').strip
!         when /Missing/i
!           original_res = seq.subseq(hash[feature[0]].last['From'],
!                                     hash[feature[0]].last['To'])
!           changed_res = ''
          end
-         hash[feature[0]].last['diff'] = [original_res, chenged_res]
        end
-     end
      rescue
        raise "Invalid FT Lines(#{$!}) in #{entry_id}:, \n'#{self.get('FT')}'\n"
--- 1027,1083 ----
      table = []
      begin
!       get('FT').split("\n").each do |line|
!         if line =~ /^FT   \w/
!           feature = line.chomp.ljust(74)
!           table << [feature[ 5..12].strip,   # Feature Name
!                     feature[14..19].strip,   # From
!                     feature[21..26].strip,   # To
!                     feature[34..74].strip ]  # Description
!         else
!           table.last << line.chomp.sub!(/^FT +/, '')
!         end
        end
  
!       # Joining Description lines
!       table = table.map { |feature| 
!         ftid = feature.pop if feature.last =~ /FTId=/
!         if feature.size > 4
!           feature = [feature[0], 
!                      feature[1], 
!                      feature[2], 
!                      feature[3, feature.size - 3].join(" ")]
!         end
!         feature << if ftid then ftid else '' end
        }
  
!       hash = {}
!       table.each do |feature|
!         hash[feature[0]] = [] unless hash[feature[0]]
!         hash[feature[0]] << {
!           # Removing '<', '>' or '?' in FROM/TO endopoint.
!           'From' => feature[1].sub(/\D/, '').to_i,  
!           'To'   => feature[2].sub(/\D/, '').to_i, 
!           'Description' => feature[3], 
!           'FTId' => feature[4].to_s.sub(/\/FTId=/, '').sub(/\.$/, ''),
!           'diff' => [],
!           'original' => feature
!         }
! 
!         case feature[0]
!         when 'VARSPLIC', 'VARIANT', 'VAR_SEQ', 'CONFLICT'
!           case hash[feature[0]].last['Description']
!           when /(\w[\w ]*\w*) - ?> (\w[\w ]*\w*)/
!             original_res = $1
!             changed_res = $2
!             original_res = original_res.gsub(/ /,'').strip
!             chenged_res = changed_res.gsub(/ /,'').strip
!           when /Missing/i
!             original_res = seq.subseq(hash[feature[0]].last['From'],
!                                       hash[feature[0]].last['To'])
!             changed_res = ''
!           end
!           hash[feature[0]].last['diff'] = [original_res, chenged_res]
          end
        end
      rescue
        raise "Invalid FT Lines(#{$!}) in #{entry_id}:, \n'#{self.get('FT')}'\n"
***************
*** 872,875 ****
--- 1262,1266 ----
    # OC - organism classification      (>=1 per entry)
    # OX - organism taxonomy x-ref      (>=1 per entry)
+   # OH - Organism Host
    # RN - reference number             (>=1 per entry)
    # RP - reference positions          (>=1 per entry)
***************
*** 879,882 ****
--- 1270,1274 ----
    # RT - reference title              (>=0 per entry; optional)
    # RL - reference location           (>=1 per entry)
+   # RG - reference group(s)
    # CC - comments or notes            (>=0 per entry; optional)
    # DR - database cross-references    (>=0 per entry; optional)