[Biopython-dev] Re: [BioPython] parse IPI data with biopythons SwissProt parser

Thu Dec 20 22:40:35 EST 2001

(moved from biopython)

Hi Wolfgang,

These look like relatively minor changes.  I'd like to incorporate
them into the SProt.py file in the standard distribution, if you don't
mind.  However, I'm having a little bit of trouble reconstructing the
patch from the description given.  Do you mind sending me your
SProt.py file with all the changes necessary?

Thanks,
Jeff

On Tue, Dec 18, 2001 at 10:21:31AM +0100, Wolfgang Schueler wrote:
> Hi all,
> 
> the IPI database at EBI contains proteins from the human genome
> from SWISS-PROT, TrEMBL, RefSeq and Ensembl and is available in a 
> SWISS-PROT format.
> Nevertheless, there are minor differences from real SWISS-PROT data which 
> prevent the use of the SWISS-PROT parser of Biopython 1.00a3.
> 
> The following modifications of SProt.py allowed the parsing of the 
> IPI data (IPI is described at http://www.ebi.ac.uk/IPI/IPIhelp.html).
> 
> Maybe it is helpful for someone.
> Wolfgang
> 
> 
> 
> 
> # ws: changes in _RecordConsumer.date()            for IPI
> #                _RecordConsumer.identification()  for IPI
> #                _Scanner._scan_reference()        crashing SwissProt entry
> #                _Scanner._scan_dt()               for IPI
> #                _Scanner._scan_de()               for IPI
> 
>     def _scan_dt(self, uhandle, consumer):
>         self._scan_line('DT', uhandle, consumer.date, exactly_one=1)
> #        self._scan_line('DT', uhandle, consumer.date, exactly_one=1)
> #ws:2001-12-05----------------------------------------v========v----  #
> # IPI does not use 'last annotation update'
>         self._scan_line('DT', uhandle, consumer.date, one_or_more=1)  #
> # 
>                                               ^========^------#
>  #       self._scan_line('DT', uhandle, consumer.date, exactly_one=1) #
> #^--------------------------------------------------------------------# 
> 
> 
>     def _scan_de(self, uhandle, consumer):
> #ws:2001-12-05-----------------------------------------------v========v---- 
>  # IPI IPI00029727.2: no DE entry	
>         self._scan_line('DE', uhandle, consumer.description, 
> any_number=1) # was one_or_more
> #------------------------------------------------------------^========^
>     def _scan_reference(self, uhandle, consumer):
>         while 1:
>             if safe_peekline(uhandle)[:2] != 'RN':
>                 break
>             self._scan_rn(uhandle, consumer)
>             self._scan_rp(uhandle, consumer)
>             self._scan_rc(uhandle, consumer)
>             self._scan_rx(uhandle, consumer)
> # ws:2001-12-05 added, entry exists with RL before RA
> # ----------v==============================v
>             self._scan_rl(uhandle, consumer)
> #-----------^==============================^ 
> 
>             self._scan_ra(uhandle, consumer)
>             self._scan_rt(uhandle, consumer)
>             self._scan_rl(uhandle, consumer)
> 
> 
>     def identification(self, line):
>         cols = string.split(line)
>         self.data.entry_name = cols[1]
>         self.data.data_class = self._chomp(cols[2])    # don't want ';'
>         self.data.molecule_type = self._chomp(cols[3]) # don't want ';'
>         self.data.sequence_length = int(cols[4])
> 
>         # data class can be 'STANDARD' or 'PRELIMINARY'
> # ws:2001-12-05 added to be IPI conform -------------------------v=====v
>         if self.data.data_class not in ['STANDARD','PRELIMINARY','IPI']:
> # ---------------------------------------------------------------^=====^
>             raise SyntaxError, "Unrecognized data class %s is in 
> line\n%s" % \
>                   (self.data.data_class, line)
>         # molecule_type should be 'PRT' for PRoTein
>         if self.data.molecule_type != 'PRT':
>             raise SyntaxError, "Unrecognized molecule type %s in 
> line\n%s" % \
>                   (self.data.molecule_type, line)
> 
>     def date(self, line):
>         uprline = string.upper(line)
>         if string.find(uprline, 'CREATED') >= 0:
>             cols = string.split(line)
> # ws:2001-12-05 added lines to prevent crash at '(IPIrel. , created)'
> # !no number given!
>             if self._chomp(cols[3]) == '':                            #<=
> 	       self.data.created = cols[1], 0                         #<=
> 	    else:	                                              #<=
>                self.data.created = cols[1], int(self._chomp(cols[3]))
> #-----------^=^--------------------------------------------------------
>         elif string.find(uprline, 'LAST SEQUENCE UPDATE') >= 0:
>             cols = string.split(line)
> # ws:2001-12-05 added lines to prevent crash at '(IPIrel. , created)'
> # !no number given!
>             if self._chomp(cols[3]) == '': 
>        #<=
> 	       self.data.sequence_update = cols[1], 0                         #<=
> 	    else:                                                             #<=
>                self.data.sequence_update = cols[1], 
> int(self._chomp(cols[3]))
> #-----------^=^----------------------------------------------------------------
>         elif string.find(uprline, 'LAST ANNOTATION UPDATE') >= 0:
>             cols = string.split(line)
> # ws:2001-12-05 added lines to prevent crash at '(IPIrel. , created)'
> # !no number given!
>             if self._chomp(cols[3]) == '': 
>        #<=
> 	       self.data.annotation_update = cols[1], 0                       #<=
> 	    else:                                                             #<=
>                self.data.annotation_update = cols[1], 
> int(self._chomp(cols[3]))  #<=
> #-----------^=^----------------------------------------------------------------
>         else:
>             raise SyntaxError, "I don't understand the date line %s" % line
> 
> 
> _______________________________________________
> BioPython mailing list  -  BioPython at biopython.org
> http://biopython.org/mailman/listinfo/biopython