[Biojava-l] GenBank parsing change

Scott Markel smarkel@netgenics.com
Fri, 04 Jan 2002 09:55:32 -0800


Simon,

Thanks for trying out the new code.  We'll check it in to the BioJava
CVS repository.

Scott

Simon Foote wrote:
> 
> No objections here.
> 
> I made the change, and ran my script that generates a blast database of
> all the bacterial proteins in GenBank using the release 127 gbbct files
> and it worked fine.  Also, worked fine on all the daily update files
> up to today.
> 
> Only had to make a slight modification to my script to catch a few
> records that, for some unexplained reason, didn't have a type (i.e.
> DNA, RNA, etc.) in the LOCUS line, thus causing an incorrect number of
> tokens exception.
> 
> Aside from that it parsed all the files flawlessly.
> 
> Cheers,
> Simon Foote
> 
> --
> Bioinformatics Specialist
> Institute for Biological Sciences
> National Research Council of Canada
> [T] 613-991-4342  [F] 613-952-9092
> 
> Scott Markel wrote:
> 
> >NCBI has changed the formatting of the GenBank LOCUS line for release
> >127, released on 15 December.  The change allows for larger sequences
> >and longer locus names.  It also allows tokenization-based parsing
> >rather than column-based parsing.  See section 1.4.1 ("LOCUS line
> >format change : to accomodate longer names and sequences") in
> >ftp://ftp.ncbi.nih.gov/genbank/release.notes/gb126.release.notes for
> >details.
> >
> >We thought about changing the parsing to handle only the new format.  It
> >would be nice to be able to just tokenize on white space and get rid of
> >the old format, but that would have caused us, and presumably others,
> >some maintenance headaches since the old files still exist.  The
> >approach we took was to support both formats.
> >
> >To handle this issue for our customers we've made the following change
> >to the GenbankContext class in biojava/bio/seq/io/GenbankFormat.java.
> >
> >Any objections to us checking this change in so that others can use it
> >too?
> >
> >Scott
> >
> >PS We haven't yet made the corresponding change to the writer, but there
> >we would just support the new format, just like NCBI does.
> >
> >-----------------------------------------------------------------------
> >
> >current BioJava code -
> >
> >    private     void processHeaderLine(String line)
> >            throws ParseException
> >    {
> >        if(line.startsWith(GenbankFormat.LOCUS_TAG))
> >        {
> >            // the LOCUS line is a special case because it contains the
> >            // locus, size, molecule type, GenBank division, and the date
> >            // of last modification.
> >            this.saveSeqAnno();
> >            StringTokenizer lineTokens = new StringTokenizer(line);
> >            headerTag = lineTokens.nextToken();
> >            headerTagText = new StringBuffer(lineTokens.nextToken());
> >
> >            this.saveSeqAnno();
> >            headerTag = GenbankFormat.SIZE_TAG;
> >            headerTagText = new StringBuffer(lineTokens.nextToken());
> >            // read past 'bp'
> >            lineTokens.nextToken();
> >
> >            // At this point there are three optional fields, strand number,
> >            // type, and circularity.
> >            if(line.charAt(34) != ' ')
> >            {
> >                this.saveSeqAnno();
> >                headerTag = GenbankFormat.STRAND_NUMBER_TAG;
> >                if(line.charAt(37) == ' ')
> >                    headerTagText = new StringBuffer(lineTokens.nextToken());
> >                else // Both STRAND and TYPE fields are in the token
> >                {
> >                    String fields = lineTokens.nextToken();
> >                    headerTagText = new StringBuffer(fields.substring(0,3));
> >
> >                    this.saveSeqAnno();
> >                    headerTag = GenbankFormat.TYPE_TAG;
> >                    headerTagText = new StringBuffer(fields.substring(3));
> >                }
> >            }
> >            else
> >            if(line.charAt(37) != ' ')
> >            {
> >                this.saveSeqAnno();
> >                headerTag = GenbankFormat.TYPE_TAG;// Check this; may be under PROP
> >                headerTagText = new StringBuffer(lineTokens.nextToken());
> >            }
> >
> >            if(line.charAt(43) != ' ')
> >            {
> >                this.saveSeqAnno();
> >                headerTag = GenbankFormat.CIRCULAR_TAG;
> >                headerTagText = new StringBuffer(lineTokens.nextToken());
> >            }
> >
> >            this.saveSeqAnno();
> >            headerTag = GenbankFormat.DIVISION_TAG; // May be under PROP
> >            headerTagText = new StringBuffer(lineTokens.nextToken());
> >
> >            this.saveSeqAnno();
> >            headerTag = GenbankFormat.DATE_TAG;
> >            headerTagText = new StringBuffer(lineTokens.nextToken());
> >        }
> >        else if(line.startsWith(GenbankFormat.VERSION_TAG))
> >        {
> >            // VERSION line is a special case because it contains both
> >            // the VERSION field and the GI number
> >            this.saveSeqAnno();
> >            StringTokenizer lineTokens = new StringTokenizer(line);
> >            headerTag = lineTokens.nextToken();
> >            headerTagText = new StringBuffer(lineTokens.nextToken());
> >
> >            String nextToken = lineTokens.nextToken();
> >            if(nextToken.startsWith(GenbankFormat.GI_TAG))
> >            {
> >                this.saveSeqAnno();
> >                headerTag = GenbankFormat.GI_TAG; // Possibly should be UID?
> >                headerTagText =
> >                        new StringBuffer(nextToken.substring(3));
> >            }
> >        }
> >        else if (hasHeaderTag(line))
> >        {       // line has     a header tag
> >            this.saveSeqAnno();
> >            headerTag = line.substring(0, TAG_LENGTH).trim();
> >            headerTagText =     new     StringBuffer(line.substring(TAG_LENGTH));
> >        }
> >        else
> >        {       // keep appending tag text value
> >            headerTagText.append(" " + line.substring(TAG_LENGTH));
> >        }
> >    }
> >
> >-----------------------------------------------------------------------
> >
> >modified code -
> >
> >    private void processHeaderLine(String line)
> >        throws ParseException
> >    {
> >        if(line.startsWith(GenbankFormat.LOCUS_TAG))
> >        {
> >                // Genbank changed the format of the Locus line for release 127.
> >                // The new format is incompatible with the old.
> >                if(this.isLocusLinePre127(line))
> >                {
> >                        this.parseLocusLinePre127(line);
> >                }
> >                else
> >                {
> >                        this.parseLocusLinePost127(line);
> >                }
> >        }
> >        else if (line.startsWith(GenbankFormat.VERSION_TAG))
> >        {
> >            // VERSION line is a special case because it contains both
> >            // the VERSION field and the GI number
> >            this.saveSeqAnno();
> >            StringTokenizer lineTokens = new StringTokenizer(line);
> >            headerTag = lineTokens.nextToken();
> >            headerTagText = new StringBuffer(lineTokens.nextToken());
> >
> >            if (lineTokens.hasMoreTokens()) {
> >                String nextToken = lineTokens.nextToken();
> >                if(nextToken.startsWith(GenbankFormat.GI_TAG))
> >                {
> >                    this.saveSeqAnno();
> >                    headerTag = GenbankFormat.GI_TAG; // Possibly should be UID?
> >                    headerTagText =
> >                        new StringBuffer(nextToken.substring(3));
> >                }
> >            }
> >        }
> >        else if (hasHeaderTag(line))
> >        {       // line has a header tag
> >            this.saveSeqAnno();
> >            headerTag = line.substring(0, TAG_LENGTH).trim();
> >            headerTagText = new StringBuffer(line.substring(TAG_LENGTH));
> >        }
> >        // gbpri1.seq (Release 125.0) has a line which is not
> >        // TAG_LENGTH long. Patch offered by Ron Kuhn (rkuhn@cellomics.com)
> >        else if (line.length() >= TAG_LENGTH)
> >        {       // keep appending tag text value
> >            headerTagText.append(" " + line.substring(TAG_LENGTH));
> >        }
> >    }
> >
> >    /**
> >     * Checks which version of the locus line format is used.  The algorithm
> >     * switches on the size of the line; <75 means pre-127, otherwise it's 127.
> >     *
> >     * @param theLine the line to check the format of.
> >     * @return TRUE if the line is in Genbank release 126 or earlier format.
> >     * FALSE otherwise
> >     */
> >    private boolean isLocusLinePre127(String theLine)
> >    {
> >        return (theLine.length() < 75);
> >    }
> >
> >    /**
> >     * Parses the locus line assuming it is in pre release 127 format.
> >     *
> >     * @param theLine Locus line to parse.
> >     * @throws ParseException If the line is too short.
> >     */
> >    private void parseLocusLinePre127(String theLine)
> >        throws ParseException
> >    {
> >                if (theLine.length() < 73)
> >                {
> >                        throw new ParseException("LOCUS line too short [" + theLine
> >+ "]");
> >                }
> >
> >                saveSeqAnno2(GenbankFormat.LOCUS_TAG, theLine.substring(12, 22));
> >                saveSeqAnno2(GenbankFormat.SIZE_TAG, theLine.substring(22, 29));
> >                saveSeqAnno2(GenbankFormat.STRAND_NUMBER_TAG, theLine.substring(33,
> >35));
> >                saveSeqAnno2(GenbankFormat.TYPE_TAG, theLine.substring(36, 41));
> >                saveSeqAnno2(GenbankFormat.CIRCULAR_TAG, theLine.substring(42,
> >52));
> >                saveSeqAnno2(GenbankFormat.DIVISION_TAG, theLine.substring(52,
> >55));
> >                saveSeqAnno2(GenbankFormat.DATE_TAG, theLine.substring(62, 73));
> >    }
> >
> >    /**
> >     * Parses the locus line assuming it is in post release 127 format.
> >     *
> >     * @param theLine Locus line to parse.
> >     * @throws ParseException If the line is too short.
> >     */
> >    private void parseLocusLinePost127(String theLine)
> >        throws ParseException
> >    {
> >                if (theLine.length() < 79)
> >                {
> >                        throw new ParseException("LOCUS line too short [" + theLine
> >+ "]");
> >                }
> >
> >                StringTokenizer locusTokens = new StringTokenizer(theLine);
> >                if(locusTokens.countTokens() != 8)
> >                {
> >                        throw new ParseException("LOCUS line incorrectly tokenized
> >[" + theLine + "]");
> >                }
> >                // LOCUS tag; not stored
> >                locusTokens.nextToken();
> >                // Locus name
> >                saveSeqAnno2(GenbankFormat.LOCUS_TAG, locusTokens.nextToken());
> >                // Sequence length
> >                saveSeqAnno2(GenbankFormat.SIZE_TAG, locusTokens.nextToken());
> >                // "bp"; not stored
> >                locusTokens.nextToken();
> >                // Strand information
> >                // Both the strand and type are in the same token.  The strand
> >                // information is an optional part, so this is a bit hairy
> >                String strandString = locusTokens.nextToken();
> >                StringTokenizer strandTokens = new StringTokenizer(strandString,
> >"-");
> >                if(strandTokens.countTokens() > 1)
> >                {
> >                        saveSeqAnno2(GenbankFormat.STRAND_NUMBER_TAG,
> >strandTokens.nextToken());
> >                }
> >                saveSeqAnno2(GenbankFormat.TYPE_TAG, strandTokens.nextToken());
> >                // Circularity
> >                saveSeqAnno2(GenbankFormat.CIRCULAR_TAG, locusTokens.nextToken());
> >                // Division code
> >                saveSeqAnno2(GenbankFormat.DIVISION_TAG, locusTokens.nextToken());
> >                // Date in dd-MMM-yyyy format
> >                saveSeqAnno2(GenbankFormat.DATE_TAG, locusTokens.nextToken());
> >    }
> >
> >    /**
> >     * Passes the tag and the text to the listener.
> >     *
> >     * @throws ParseException Thrown when an error occurs parsing the file
> >     */
> >    private void saveSeqAnno()
> >        throws ParseException
> >    {
> >        if (!headerTag.equals(""))
> >        { // save tag and its text
> >            listener.addSequenceProperty(headerTag, headerTagText.toString());
> >            headerTag = "";
> >            headerTagText = new StringBuffer("");
> >        }
> >    }
> >
> >    /**
> >     * Private method to process a header tag and associated value.
> >     *
> >     * @param tag The tag to add
> >     * @param value The value of the associated tag
> >     * @throws ParseException Thrown when an error occurs parsing the file
> >     */
> >        private void saveSeqAnno2(String tag, String value)
> >        throws ParseException
> >        {
> >                value = value.trim();   // strip whitespace
> >                if (value.length() > 0) {
> >                        this.saveSeqAnno();
> >                        headerTag = tag;
> >                headerTagText = new StringBuffer(value);
> >                }
> >        }
> >
> >-----------------------------------------------------------------------
> >

-- 
Scott Markel, Ph.D.       NetGenics, Inc.
smarkel@netgenics.com     4350 Executive Drive
Tel: 858 455 5223         Suite 260
FAX: 858 455 1388         San Diego, CA  92121