[Biojava-l] GenBank parsing change
Scott Markel
smarkel@netgenics.com
Fri, 04 Jan 2002 09:55:32 -0800
Simon,
Thanks for trying out the new code. We'll check it in to the BioJava
CVS repository.
Scott
Simon Foote wrote:
>
> No objections here.
>
> I made the change, and ran my script that generates a blast database of
> all the bacterial proteins in GenBank using the release 127 gbbct files
> and it worked fine. It also worked fine on all the daily update files
> up to today.
>
> I only had to make a slight modification to my script to catch a few
> records that, for some unexplained reason, didn't have a type (i.e.
> DNA, RNA, etc.) in the LOCUS line, thus causing an "incorrect number
> of tokens" exception.
>
> Aside from that it parsed all the files flawlessly.
>
> Cheers,
> Simon Foote
>
> --
> Bioinformatics Specialist
> Institute for Biological Sciences
> National Research Council of Canada
> [T] 613-991-4342 [F] 613-952-9092
>
> Scott Markel wrote:
>
> >NCBI has changed the formatting of the GenBank LOCUS line for release
> >127, released on 15 December. The change allows for larger sequences
> >and longer locus names. It also allows a tokenization based parsing
> >rather than a column based parsing. See section 1.4.1 ("LOCUS line
> >format change : to accomodate longer names and sequences") in
> >ftp://ftp.ncbi.nih.gov/genbank/release.notes/gb126.release.notes for
> >details.
> >
> >We thought about changing the parsing to handle only the new format. It
> >would be nice to be able to just tokenize on white space and get rid of
> >the old format, but that would have caused us, and presumably others,
> >some maintenance headaches since the old files still exist. The
> >approach we took was to support both formats.
> >
> >To handle this issue for our customers we've made the following change
> >to the GenbankContext class in biojava/bio/seq/io/GenbankFormat.java.
> >
> >Any objections to us checking this change in so that others can use it
> >too?
> >
> >Scott
> >
> >PS We haven't yet made the corresponding change to the writer, but there
> >we would just support the new format, just like NCBI does.
> >
> >-----------------------------------------------------------------------
> >
> >current BioJava code -
> >
> > private void processHeaderLine(String line) // routes one header line by its leading tag: LOCUS, VERSION, other tagged line, or continuation text
> > throws ParseException
> > {
> > if(line.startsWith(GenbankFormat.LOCUS_TAG))
> > {
> > // the LOCUS line is a special case because it contains the
> > // locus, size, molecule type, GenBank division, and the date
> > // of last modification.
> > this.saveSeqAnno(); // hand any tag/text still pending to the listener
> > StringTokenizer lineTokens = new StringTokenizer(line); // default delimiters, i.e. splits on whitespace
> > headerTag = lineTokens.nextToken(); // first token: the LOCUS tag itself
> > headerTagText = new StringBuffer(lineTokens.nextToken()); // second token: locus name
> >
> > this.saveSeqAnno(); // emit the locus pair before starting the size pair
> > headerTag = GenbankFormat.SIZE_TAG;
> > headerTagText = new StringBuffer(lineTokens.nextToken()); // third token: size
> > // read past 'bp'
> > lineTokens.nextToken();
> >
> > // At this point there are three optional fields, strand number,
> > // type, and circularity.
> > if(line.charAt(34) != ' ') // fixed-column probe: non-blank at index 34 => strand field present. NOTE: the charAt probes here and below throw StringIndexOutOfBoundsException on lines shorter than 44 characters
> > {
> > this.saveSeqAnno();
> > headerTag = GenbankFormat.STRAND_NUMBER_TAG;
> > if(line.charAt(37) == ' ') // blank at index 37 => the next token is the strand alone
> > headerTagText = new StringBuffer(lineTokens.nextToken());
> > else // Both STRAND and TYPE fields are in the token
> > {
> > String fields = lineTokens.nextToken();
> > headerTagText = new StringBuffer(fields.substring(0,3)); // first three characters are stored as the strand
> >
> > this.saveSeqAnno();
> > headerTag = GenbankFormat.TYPE_TAG;
> > headerTagText = new StringBuffer(fields.substring(3)); // remainder is stored as the type
> > }
> > }
> > else
> > if(line.charAt(37) != ' ') // no strand, but non-blank at index 37 => the next token is the type
> > {
> > this.saveSeqAnno();
> > headerTag = GenbankFormat.TYPE_TAG;// Check this; may be under PROP
> > headerTagText = new StringBuffer(lineTokens.nextToken());
> > }
> >
> > if(line.charAt(43) != ' ') // non-blank at index 43 => a circularity token is present
> > {
> > this.saveSeqAnno();
> > headerTag = GenbankFormat.CIRCULAR_TAG;
> > headerTagText = new StringBuffer(lineTokens.nextToken());
> > }
> >
> > this.saveSeqAnno();
> > headerTag = GenbankFormat.DIVISION_TAG; // May be under PROP
> > headerTagText = new StringBuffer(lineTokens.nextToken());
> >
> > this.saveSeqAnno();
> > headerTag = GenbankFormat.DATE_TAG;
> > headerTagText = new StringBuffer(lineTokens.nextToken()); // the date pair is not flushed here; it stays pending until the next saveSeqAnno() call
> > }
> > else if(line.startsWith(GenbankFormat.VERSION_TAG))
> > {
> > // VERSION line is a special case because it contains both
> > // the VERSION field and the GI number
> > this.saveSeqAnno();
> > StringTokenizer lineTokens = new StringTokenizer(line);
> > headerTag = lineTokens.nextToken(); // first token: the VERSION tag itself
> > headerTagText = new StringBuffer(lineTokens.nextToken()); // second token: the version value
> >
> > String nextToken = lineTokens.nextToken(); // NOTE: unguarded; throws NoSuchElementException when the line has only two tokens
> > if(nextToken.startsWith(GenbankFormat.GI_TAG))
> > {
> > this.saveSeqAnno();
> > headerTag = GenbankFormat.GI_TAG; // Possibly should be UID?
> > headerTagText =
> > new StringBuffer(nextToken.substring(3)); // drops the first three characters (presumably "GI:" -- confirm against GI_TAG)
> > }
> > }
> > else if (hasHeaderTag(line))
> > { // line has a header tag
> > this.saveSeqAnno();
> > headerTag = line.substring(0, TAG_LENGTH).trim(); // tag = first TAG_LENGTH characters, trimmed
> > headerTagText = new StringBuffer(line.substring(TAG_LENGTH)); // text = everything after the tag column
> > }
> > else
> > { // keep appending tag text value
> > headerTagText.append(" " + line.substring(TAG_LENGTH)); // NOTE: throws StringIndexOutOfBoundsException when the line is shorter than TAG_LENGTH
> > }
> > }
> >
> >-----------------------------------------------------------------------
> >
> >modified code -
> >
> > /**
> >  * Handles a single line from the header part of a record. The line is
> >  * classified in this order: LOCUS line, VERSION line, any other line
> >  * for which hasHeaderTag(line) is true, and finally continuation text
> >  * for the tag currently being collected.
> >  *
> >  * @param line the header line to process.
> >  * @throws ParseException propagated from the LOCUS parsers or from
> >  * saveSeqAnno().
> >  */
> > private void processHeaderLine(String line)
> >   throws ParseException
> > {
> >   // LOCUS: Genbank changed the layout for release 127 and the new
> >   // layout is incompatible with the old, so pick the matching parser.
> >   if (line.startsWith(GenbankFormat.LOCUS_TAG))
> >   {
> >     if (!this.isLocusLinePre127(line))
> >     {
> >       this.parseLocusLinePost127(line);
> >     }
> >     else
> >     {
> >       this.parseLocusLinePre127(line);
> >     }
> >     return;
> >   }
> >
> >   // VERSION: special case because the line carries both the VERSION
> >   // field and, optionally, the GI number.
> >   if (line.startsWith(GenbankFormat.VERSION_TAG))
> >   {
> >     this.saveSeqAnno();
> >     StringTokenizer versionTokens = new StringTokenizer(line);
> >     headerTag = versionTokens.nextToken();
> >     headerTagText = new StringBuffer(versionTokens.nextToken());
> >
> >     if (!versionTokens.hasMoreTokens())
> >     {
> >       return; // no third token, so nothing more to record
> >     }
> >     String thirdToken = versionTokens.nextToken();
> >     if (thirdToken.startsWith(GenbankFormat.GI_TAG))
> >     {
> >       this.saveSeqAnno();
> >       headerTag = GenbankFormat.GI_TAG; // Possibly should be UID?
> >       headerTagText = new StringBuffer(thirdToken.substring(3));
> >     }
> >     return;
> >   }
> >
> >   // Any other tagged line starts a new tag/text pair.
> >   if (hasHeaderTag(line))
> >   {
> >     this.saveSeqAnno();
> >     headerTag = line.substring(0, TAG_LENGTH).trim();
> >     headerTagText = new StringBuffer(line.substring(TAG_LENGTH));
> >     return;
> >   }
> >
> >   // Continuation text for the current tag. gbpri1.seq (Release 125.0)
> >   // has a line which is not TAG_LENGTH long, so shorter lines are
> >   // skipped. Patch offered by Ron Kuhn (rkuhn@cellomics.com)
> >   if (line.length() >= TAG_LENGTH)
> >   {
> >     headerTagText.append(" ").append(line.substring(TAG_LENGTH));
> >   }
> > }
> >
> > /**
> > * Checks which version of the locus line format is used. The algorithm
> > * switches on the size of the line; <75 means pre-127, otherwise it's 127.
> > *
> > * @param theLine the line to check the format of.
> > * @return TRUE if the line is in Genbank release 126 or earlier format.
> > * FALSE otherwise
> > */
> > private boolean isLocusLinePre127(String theLine)
> > {
> > return (theLine.length() < 75);
> > }
> >
> > /**
> > * Parses the locus line assuming it is in pre release 127 format.
> > *
> > * @param theLine Locus line to parse.
> > * @throws ParseException If the line is too short.
> > */
> > private void parseLocusLinePre127(String theLine)
> > throws ParseException
> > {
> > if (theLine.length() < 73)
> > {
> > throw new ParseException("LOCUS line too short [" + theLine
> >+ "]");
> > }
> >
> > saveSeqAnno2(GenbankFormat.LOCUS_TAG, theLine.substring(12, 22));
> > saveSeqAnno2(GenbankFormat.SIZE_TAG, theLine.substring(22, 29));
> > saveSeqAnno2(GenbankFormat.STRAND_NUMBER_TAG, theLine.substring(33,
> >35));
> > saveSeqAnno2(GenbankFormat.TYPE_TAG, theLine.substring(36, 41));
> > saveSeqAnno2(GenbankFormat.CIRCULAR_TAG, theLine.substring(42,
> >52));
> > saveSeqAnno2(GenbankFormat.DIVISION_TAG, theLine.substring(52,
> >55));
> > saveSeqAnno2(GenbankFormat.DATE_TAG, theLine.substring(62, 73));
> > }
> >
> > /**
> >  * Parses the locus line assuming it is in post release 127 format:
> >  * eight whitespace-separated tokens, of which the LOCUS tag and "bp"
> >  * are skipped and the remaining ones are stored.
> >  *
> >  * @param theLine Locus line to parse.
> >  * @throws ParseException If the line is shorter than 79 characters or
> >  * does not split into exactly eight tokens.
> >  */
> > private void parseLocusLinePost127(String theLine)
> >   throws ParseException
> > {
> >   if (theLine.length() < 79)
> >   {
> >     throw new ParseException("LOCUS line too short [" + theLine + "]");
> >   }
> >
> >   StringTokenizer locusTokens = new StringTokenizer(theLine);
> >   if (locusTokens.countTokens() != 8)
> >   {
> >     // The message literal must stay on one line; a Java string
> >     // literal cannot span a line break.
> >     throw new ParseException(
> >       "LOCUS line incorrectly tokenized [" + theLine + "]");
> >   }
> >   // LOCUS tag; not stored
> >   locusTokens.nextToken();
> >   // Locus name
> >   saveSeqAnno2(GenbankFormat.LOCUS_TAG, locusTokens.nextToken());
> >   // Sequence length
> >   saveSeqAnno2(GenbankFormat.SIZE_TAG, locusTokens.nextToken());
> >   // "bp"; not stored
> >   locusTokens.nextToken();
> >   // Strand information
> >   // Both the strand and type are in the same token, separated by "-".
> >   // The strand is optional: with a single sub-token only the type is
> >   // stored, with more than one the first is the strand and the
> >   // second is the type.
> >   String strandString = locusTokens.nextToken();
> >   StringTokenizer strandTokens = new StringTokenizer(strandString, "-");
> >   if (strandTokens.countTokens() > 1)
> >   {
> >     saveSeqAnno2(GenbankFormat.STRAND_NUMBER_TAG,
> >       strandTokens.nextToken());
> >   }
> >   saveSeqAnno2(GenbankFormat.TYPE_TAG, strandTokens.nextToken());
> >   // Circularity
> >   saveSeqAnno2(GenbankFormat.CIRCULAR_TAG, locusTokens.nextToken());
> >   // Division code
> >   saveSeqAnno2(GenbankFormat.DIVISION_TAG, locusTokens.nextToken());
> >   // Date in dd-MMM-yyyy format
> >   saveSeqAnno2(GenbankFormat.DATE_TAG, locusTokens.nextToken());
> > }
> >
> > /**
> > * Passes the tag and the text to the listener.
> > *
> > * @throws ParseException Thrown when an error occurs parsing the file
> > */
> > private void saveSeqAnno()
> > throws ParseException
> > {
> > if (!headerTag.equals(""))
> > { // save tag and its text
> > listener.addSequenceProperty(headerTag, headerTagText.toString());
> > headerTag = "";
> > headerTagText = new StringBuffer("");
> > }
> > }
> >
> > /**
> > * Private method to process a header tag and associated value.
> > *
> > * @param tag The tag to add
> > * @param value The value of the associated tag
> > * @throws ParseException Thrown when an error occurs parsing the file
> > */
> > private void saveSeqAnno2(String tag, String value)
> > throws ParseException
> > {
> > value = value.trim(); // strip whitespace
> > if (value.length() > 0) {
> > this.saveSeqAnno();
> > headerTag = tag;
> > headerTagText = new StringBuffer(value);
> > }
> > }
> >
> >-----------------------------------------------------------------------
> >
--
Scott Markel, Ph.D. NetGenics, Inc.
smarkel@netgenics.com 4350 Executive Drive
Tel: 858 455 5223 Suite 260
FAX: 858 455 1388 San Diego, CA 92121