[Biopython-dev] ipi parser

Brad Chapman chapmanb at uga.edu
Wed May 19 05:50:17 EDT 2004


Hi Pierre and Jeff;

Pierre:
> >I'm trying to use the Swissprot parser to parse IPI. I read that the 
> >parser should have been fixed for IPI however I get an error on date 
> >when I try to parse ipi.HUMAN I get:
[...]
> >ValueError: invalid literal for int(): Human

Jeff:
> These errors are nearly always due to changes in the formats of the 
> records that occur from time to time.  Do you have a sample file, or 
> accession number, that I can use to see what's going on?

I took a look at this using the ipi.HUMAN.dat file from
ftp://ftp.infobiogen.fr/pub/db/ipi/current/ and was able to
reproduce the error. It looks like the problem was that the DT lines
are different than expected:

DT   01-AUG-2003 (IPI Human rel. 2.22, Created)
DT   01-AUG-2003 (IPI Human rel. 2.22, Last sequence update)

They've got the 'IPI Human' bit before 'rel.' and SProt tries to get
the version information from the third column (which is 'Human')
since it normally expects a rational 'Rel.' part only.

Also, the versions are now dotted and not just integers, and the
third DT line is missing.

Finally, some of these IPI files are missing the DE line.

I updated the SProt parser to handle this and a patch to
Bio/SwissProt/SProt.py is attached. I also updated the tests with an
example file, and added an IPI expression to swissprot in the
registry so that the FormatIO system can also handle these files.

Jeff, let me know if I broke anything or did anything else bad.

Pierre, hope this fixes your problem.
Brad
-------------- next part --------------
? SProt.diff
Index: SProt.py
===================================================================
RCS file: /home/repository/biopython/biopython/Bio/SwissProt/SProt.py,v
retrieving revision 1.28
retrieving revision 1.29
diff -c -r1.28 -r1.29
*** SProt.py	18 May 2004 13:58:35 -0000	1.28
--- SProt.py	19 May 2004 14:01:33 -0000	1.29
***************
*** 377,386 ****
      def _scan_dt(self, uhandle, consumer):
          self._scan_line('DT', uhandle, consumer.date, exactly_one=1)
          self._scan_line('DT', uhandle, consumer.date, exactly_one=1)
!         self._scan_line('DT', uhandle, consumer.date, exactly_one=1)
  
      def _scan_de(self, uhandle, consumer):
!         self._scan_line('DE', uhandle, consumer.description, one_or_more=1)
      
      def _scan_gn(self, uhandle, consumer):
          self._scan_line('GN', uhandle, consumer.gene_name, any_number=1)
--- 377,388 ----
      def _scan_dt(self, uhandle, consumer):
          self._scan_line('DT', uhandle, consumer.date, exactly_one=1)
          self._scan_line('DT', uhandle, consumer.date, exactly_one=1)
!         # IPI doesn't necessarily contain the third line about annotations
!         self._scan_line('DT', uhandle, consumer.date, up_to_one=1)
  
      def _scan_de(self, uhandle, consumer):
!         # IPI can be missing a DE line
!         self._scan_line('DE', uhandle, consumer.description, any_number=1)
      
      def _scan_gn(self, uhandle, consumer):
          self._scan_line('GN', uhandle, consumer.gene_name, any_number=1)
***************
*** 526,554 ****
      
      def date(self, line):
          uprline = string.upper(line)
          cols = line.split()
          if uprline.find("CREATED") >= 0:
!             # ws:2001-12-05 prevent e.g. (IPIrel. , created)
!             # !no number given! from crashing
!             if self._chomp(cols[3]) == '':                            #<=
!                 self.data.created = cols[1], 0                        #<=
! 	    else:	                                              #<=
!                 self.data.created = cols[1], int(self._chomp(cols[3]))
          elif uprline.find('LAST SEQUENCE UPDATE') >= 0:
!             # ws:2001-12-05 prevent e.g. (IPIrel. , created)
!             # !no number given! from crashing
!             if self._chomp(cols[3]) == '':                            #<=
!                 self.data.sequence_update = cols[1], 0                #<=
! 	    else:                                                     #<=
!                 self.data.sequence_update = cols[1], int(self._chomp(cols[3]))
          elif uprline.find( 'LAST ANNOTATION UPDATE') >= 0:
!             # ws:2001-12-05 prevent e.g. (IPIrel. , created)
!             # !no number given! from crashing
!             if self._chomp(cols[3]) == '':                               #<=
!                 self.data.annotation_update = cols[1], 0                 #<=
! 	    else:                                                        #<=
!                 self.data.annotation_update = cols[1], \
!                                               int(self._chomp(cols[3]))  #<=
          else:
              raise SyntaxError, "I don't understand the date line %s" % line
      
--- 528,565 ----
      
      def date(self, line):
          uprline = string.upper(line)
+         
+         # find where the version information will be located
+         # This is needed for when you have cases like IPI where
+         # the release verison is in a different spot:
+         # DT   08-JAN-2002 (IPI Human rel. 2.3, Created)
+         uprcols = uprline.split()
+         rel_index = -1
+         for index in range(len(uprcols)):
+             if uprcols[index].find("REL.") >= 0:
+                 rel_index = index
+         assert rel_index >= 0, \
+                 "Could not find Rel. in DT line: %s" % (line)
+         version_index = rel_index + 1
+         # get the version information
          cols = line.split()
+         str_version = self._chomp(cols[version_index])
+         # no version number
+         if str_version == '':
+             version = 0
+         # dot versioned
+         elif str_version.find(".") >= 0:
+             version = str_version
+         # integer versioned
+         else:
+             version = int(str_version)
+ 
          if uprline.find("CREATED") >= 0:
!             self.data.created = cols[1], version
          elif uprline.find('LAST SEQUENCE UPDATE') >= 0:
!             self.data.sequence_update = cols[1], version
          elif uprline.find( 'LAST ANNOTATION UPDATE') >= 0:
!             self.data.annotation_update = cols[1], version
          else:
              raise SyntaxError, "I don't understand the date line %s" % line
      


More information about the Biopython-dev mailing list