[Biojava-l] GenBank parsing change

Scott Markel smarkel@netgenics.com
Fri, 28 Dec 2001 16:02:13 -0800


NCBI has changed the formatting of the GenBank LOCUS line for release
127, released on 15 December.  The change allows for larger sequences
and longer locus names.  It also allows tokenization-based parsing
rather than column-based parsing.  See section 1.4.1 ("LOCUS line
format change : to accomodate longer names and sequences") in
ftp://ftp.ncbi.nih.gov/genbank/release.notes/gb126.release.notes for
details.

We thought about changing the parsing to handle only the new format.  It
would be nice to be able to just tokenize on white space and get rid of
the old format, but that would have caused us, and presumably others,
some maintenance headaches since the old files still exist.  The
approach we took was to support both formats.

To handle this issue for our customers we've made the following change
to the GenbankContext class in biojava/bio/seq/io/GenbankFormat.java.

Any objections to us checking this change in so that others can use it
too?

Scott

PS We haven't yet made the corresponding change to the writer, but there
we would just support the new format, just like NCBI does.

-----------------------------------------------------------------------

current BioJava code -

    /**
     * Processes one line of the header section, updating the pending
     * header tag (headerTag) and its accumulated text (headerTagText).
     *
     * LOCUS and VERSION lines are split into several tag/value pairs;
     * a line for which hasHeaderTag(line) is true starts a new tag; any
     * other line is appended to the text of the current tag.  Before a
     * new tag is set, saveSeqAnno() is called for the pending one.
     *
     * NOTE(review): the LOCUS branch mixes whitespace tokenization with
     * fixed character positions (34, 37 and 43), so it depends on the
     * column-based LOCUS layout; a line shorter than 44 characters makes
     * charAt throw.
     *
     * @param line the header line to process
     * @throws ParseException declared by this method; no explicit throw
     * appears in the body, so presumably it comes from saveSeqAnno()
     */
    private     void processHeaderLine(String line)
            throws ParseException
    {
        if(line.startsWith(GenbankFormat.LOCUS_TAG))
        {
            // the LOCUS line is a special case because it contains the
            // locus, size, molecule type, GenBank division, and the date
            // of last modification.
            this.saveSeqAnno();
            StringTokenizer lineTokens = new StringTokenizer(line);
            // first token is the tag itself, second is the locus name
            headerTag = lineTokens.nextToken();
            headerTagText = new StringBuffer(lineTokens.nextToken());

            // third token is stored as the sequence size
            this.saveSeqAnno();
            headerTag = GenbankFormat.SIZE_TAG;
            headerTagText = new StringBuffer(lineTokens.nextToken());
            // read past 'bp'
            lineTokens.nextToken();

            // At this point there are three optional fields, strand number,
            // type, and circularity.
            // Presence of each optional field is detected by looking at a
            // fixed character position rather than at the tokens.
            if(line.charAt(34) != ' ')
            {
                this.saveSeqAnno();
                headerTag = GenbankFormat.STRAND_NUMBER_TAG;
                if(line.charAt(37) == ' ')
                    headerTagText = new StringBuffer(lineTokens.nextToken());
                else // Both STRAND and TYPE fields are in the token
                {
                    // the first three characters are the strand part,
                    // the remainder is the type
                    String fields = lineTokens.nextToken();
                    headerTagText = new StringBuffer(fields.substring(0,3));

                    this.saveSeqAnno();
                    headerTag = GenbankFormat.TYPE_TAG;
                    headerTagText = new StringBuffer(fields.substring(3));
                }
            }
            else
            if(line.charAt(37) != ' ')
            {
                // no strand field, but a type field is present
                this.saveSeqAnno();
                headerTag = GenbankFormat.TYPE_TAG;// Check this; may be under PROP
                headerTagText = new StringBuffer(lineTokens.nextToken());
            }

            if(line.charAt(43) != ' ')
            {
                // a non-blank character at index 43 means the circularity
                // field is present
                this.saveSeqAnno();
                headerTag = GenbankFormat.CIRCULAR_TAG;
                headerTagText = new StringBuffer(lineTokens.nextToken());
            }

            this.saveSeqAnno();
            headerTag = GenbankFormat.DIVISION_TAG; // May be under PROP
            headerTagText = new StringBuffer(lineTokens.nextToken());

            // the date is left pending; it is saved when the next tag starts
            this.saveSeqAnno();
            headerTag = GenbankFormat.DATE_TAG;
            headerTagText = new StringBuffer(lineTokens.nextToken());
        }
        else if(line.startsWith(GenbankFormat.VERSION_TAG))
        {
            // VERSION line is a special case because it contains both
            // the VERSION field and the GI number
            this.saveSeqAnno();
            StringTokenizer lineTokens = new StringTokenizer(line);
            headerTag = lineTokens.nextToken();
            headerTagText = new StringBuffer(lineTokens.nextToken());

            // NOTE(review): a third token is read unconditionally, so a
            // VERSION line without one makes nextToken() throw
            String nextToken = lineTokens.nextToken();
            if(nextToken.startsWith(GenbankFormat.GI_TAG))
            {
                this.saveSeqAnno();
                headerTag = GenbankFormat.GI_TAG; // Possibly should be UID?
                // skip the first three characters of the token and keep
                // the rest as the value
                headerTagText =
                        new StringBuffer(nextToken.substring(3));
            }
        }
        else if (hasHeaderTag(line))
        {       // line has a header tag: the first TAG_LENGTH characters
                // are the tag, the rest of the line is its text
            this.saveSeqAnno();
            headerTag = line.substring(0, TAG_LENGTH).trim();
            headerTagText =     new     StringBuffer(line.substring(TAG_LENGTH));
        }
        else
        {       // keep appending tag text value
            // NOTE(review): substring(TAG_LENGTH) throws if the line is
            // shorter than TAG_LENGTH
            headerTagText.append(" " + line.substring(TAG_LENGTH));
        }
    }

-----------------------------------------------------------------------

modified code -

    /**
     * Processes one line of the header section, updating the pending
     * header tag (headerTag) and its accumulated text (headerTagText).
     *
     * <ul>
     * <li>A LOCUS line is handed to the pre- or post-release-127 parser,
     *     depending on isLocusLinePre127(line).</li>
     * <li>A VERSION line yields the VERSION value and, when a following
     *     token starts with GenbankFormat.GI_TAG, a separate GI value.</li>
     * <li>A line for which hasHeaderTag(line) is true starts a new tag.</li>
     * <li>Any other line of at least TAG_LENGTH characters is appended to
     *     the text of the current tag; shorter lines are ignored.</li>
     * </ul>
     *
     * @param line the header line to process
     * @throws ParseException if a LOCUS line cannot be parsed, or if a
     * VERSION line carries no value after the tag
     */
    private void processHeaderLine(String line)
        throws ParseException
    {
        if (line.startsWith(GenbankFormat.LOCUS_TAG))
        {
            // GenBank changed the format of the LOCUS line for release 127.
            // The new format is incompatible with the old.
            if (this.isLocusLinePre127(line))
            {
                this.parseLocusLinePre127(line);
            }
            else
            {
                this.parseLocusLinePost127(line);
            }
        }
        else if (line.startsWith(GenbankFormat.VERSION_TAG))
        {
            // VERSION line is a special case because it contains both
            // the VERSION field and the GI number
            this.saveSeqAnno();
            StringTokenizer lineTokens = new StringTokenizer(line);
            // Need the tag and at least one value token.  Without this
            // check nextToken() would throw an unchecked
            // NoSuchElementException instead of the declared ParseException.
            if (lineTokens.countTokens() < 2)
            {
                throw new ParseException("VERSION line has no value ["
                                         + line + "]");
            }
            headerTag = lineTokens.nextToken();
            headerTagText = new StringBuffer(lineTokens.nextToken());

            // The GI number is optional.
            if (lineTokens.hasMoreTokens())
            {
                String nextToken = lineTokens.nextToken();
                if (nextToken.startsWith(GenbankFormat.GI_TAG))
                {
                    this.saveSeqAnno();
                    headerTag = GenbankFormat.GI_TAG; // Possibly should be UID?
                    // drop the first three characters of the token and
                    // keep the rest as the value
                    headerTagText =
                        new StringBuffer(nextToken.substring(3));
                }
            }
        }
        else if (hasHeaderTag(line))
        {   // line has a header tag
            this.saveSeqAnno();
            headerTag = line.substring(0, TAG_LENGTH).trim();
            headerTagText = new StringBuffer(line.substring(TAG_LENGTH));
        }
        // gbpri1.seq (Release 125.0) has a line which is not
        // TAG_LENGTH long. Patch offered by Ron Kuhn (rkuhn@cellomics.com)
        else if (line.length() >= TAG_LENGTH)
        {   // keep appending tag text value
            headerTagText.append(" " + line.substring(TAG_LENGTH));
        }
    }

    /**
     * Determines which LOCUS line layout a line uses, judging purely by
     * its length: a line shorter than 75 characters is treated as the
     * layout used up to release 126, anything longer as the release 127
     * layout.
     *
     * @param theLine the LOCUS line to classify.
     * @return true if the line is shorter than 75 characters (release 126
     * or earlier layout), false otherwise.
     */
    private boolean isLocusLinePre127(String theLine)
    {
        // lines at or above this length are treated as release 127 format
        final int pre127LengthLimit = 75;
        final int lineLength = theLine.length();
        return lineLength < pre127LengthLimit;
    }

    /**
     * Parses a LOCUS line laid out in the fixed-column format used before
     * release 127.  Each field is cut out of the line by character
     * position and handed to saveSeqAnno2, which trims it and skips it
     * when it is blank.
     *
     * @param theLine Locus line to parse.
     * @throws ParseException If the line is shorter than 73 characters.
     */
    private void parseLocusLinePre127(String theLine)
        throws ParseException
    {
        if (theLine.length() < 73)
        {
            throw new ParseException("LOCUS line too short [" + theLine + "]");
        }

        // Tags in the order they are saved, paired with the
        // [start, end) character range of each field in the line.
        final String[] fieldTags = {
            GenbankFormat.LOCUS_TAG,
            GenbankFormat.SIZE_TAG,
            GenbankFormat.STRAND_NUMBER_TAG,
            GenbankFormat.TYPE_TAG,
            GenbankFormat.CIRCULAR_TAG,
            GenbankFormat.DIVISION_TAG,
            GenbankFormat.DATE_TAG
        };
        final int[][] fieldColumns = {
            {12, 22},
            {22, 29},
            {33, 35},
            {36, 41},
            {42, 52},
            {52, 55},
            {62, 73}
        };

        for (int i = 0; i < fieldTags.length; i++)
        {
            final String field =
                theLine.substring(fieldColumns[i][0], fieldColumns[i][1]);
            saveSeqAnno2(fieldTags[i], field);
        }
    }

    /**
     * Parses the locus line assuming it is in post release 127 format.
     *
     * @param theLine Locus line to parse.
     * @throws ParseException If the line is too short.
     */
    private void parseLocusLinePost127(String theLine)
        throws ParseException
    {
                if (theLine.length() < 79)
                {
                        throw new ParseException("LOCUS line too short [" + theLine
+ "]");
                }

                StringTokenizer locusTokens = new StringTokenizer(theLine);
                if(locusTokens.countTokens() != 8)
                {
                        throw new ParseException("LOCUS line incorrectly tokenized
[" + theLine + "]");
                }
                // LOCUS tag; not stored
                locusTokens.nextToken();
                // Locus name
                saveSeqAnno2(GenbankFormat.LOCUS_TAG, locusTokens.nextToken());
                // Sequence length
                saveSeqAnno2(GenbankFormat.SIZE_TAG, locusTokens.nextToken());
                // "bp"; not stored
                locusTokens.nextToken();
                // Strand information
                // Both the strand and type are in the same token.  The strand
                // information is an optional part, so this is a bit hairy
                String strandString = locusTokens.nextToken();
                StringTokenizer strandTokens = new StringTokenizer(strandString,
"-");
                if(strandTokens.countTokens() > 1)
                {
                        saveSeqAnno2(GenbankFormat.STRAND_NUMBER_TAG,
strandTokens.nextToken());
                }
                saveSeqAnno2(GenbankFormat.TYPE_TAG, strandTokens.nextToken());
                // Circularity
                saveSeqAnno2(GenbankFormat.CIRCULAR_TAG, locusTokens.nextToken());
                // Division code
                saveSeqAnno2(GenbankFormat.DIVISION_TAG, locusTokens.nextToken());
                // Date in dd-MMM-yyyy format
                saveSeqAnno2(GenbankFormat.DATE_TAG, locusTokens.nextToken());
    }

    /**
     * Hands the pending tag and its text to the listener, then clears
     * both.  Does nothing when no tag is pending (the tag is the empty
     * string).
     *
     * @throws ParseException Thrown when an error occurs parsing the file
     */
    private void saveSeqAnno()
        throws ParseException
    {
        if (headerTag.equals(""))
        {
            // nothing pending
            return;
        }

        // save tag and its text
        final String pendingText = headerTagText.toString();
        listener.addSequenceProperty(headerTag, pendingText);

        // reset so the same pair is not reported twice
        headerTag = "";
        headerTagText = new StringBuffer("");
    }

    /**
     * Makes the given tag and value the pending header pair, after
     * flushing the previously pending pair via saveSeqAnno().  The value
     * is trimmed first; if nothing remains, the call has no effect and
     * the previously pending pair is left untouched.
     *
     * @param tag The tag to add
     * @param value The value of the associated tag
     * @throws ParseException Thrown when an error occurs parsing the file
     */
    private void saveSeqAnno2(String tag, String value)
        throws ParseException
    {
        final String trimmed = value.trim();   // strip whitespace
        if (trimmed.length() == 0)
        {
            // blank field: nothing to record
            return;
        }

        this.saveSeqAnno();
        headerTag = tag;
        headerTagText = new StringBuffer(trimmed);
    }

-----------------------------------------------------------------------
-- 
Scott Markel, Ph.D.       NetGenics, Inc.
smarkel@netgenics.com     4350 Executive Drive
Tel: 858 455 5223         Suite 260
FAX: 858 455 1388         San Diego, CA  92121