[Biojava-l] EMBL Parser

Lorna Morris lmorris at ebi.ac.uk
Wed Nov 19 11:09:41 EST 2003


Hello

I'm using biojava to parse an EMBL Flat file, modify it, and dump it out 
to file at the end. However when I used SeqIOTools.writeEmbl the file 
created, did not have correctly ordered and nested RN, RP, RX, RA, RT 
and RL lines. These lines should occur in repeated sets, one set for 
each reference in the flat file. I've modified some of the biojava 
classes and added 2 new classes to correct this. Everything works fine 
now. I'm attaching the classes to this mail.

Files modified:

EmblLikeFormat
EmblFileFormer
SeqIOEventEmitter
GenEmblPropertyComparator

Files added:

ReferenceAnnotation.java
EmblReferenceComparator.java

If you need any more details on the changes I've made let me know. Thanks,

Lorna



-------------- next part --------------
/*
 *                    BioJava development code
 *
 * This code may be freely distributed and modified under the
 * terms of the GNU Lesser General Public Licence.  This should
 * be distributed with the code.  If you do not have a copy,
 * see:
 *
 *      http://www.gnu.org/copyleft/lesser.html
 *
 * Copyright for this code is held jointly by the individual
 * authors.  These should be listed in @author doc comments.
 *
 * For more information on the BioJava project and its aims,
 * or to join the biojava-l mailing list, visit the home page
 * at:
 *
 *      http://www.biojava.org/
 *
 */

package org.biojava.bio.seq.io;


import java.io.PrintStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Iterator;
import java.util.List;
import org.biojava.bio.seq.Feature;
import org.biojava.bio.seq.StrandedFeature;
import org.biojava.bio.symbol.Alphabet;
import org.biojava.bio.symbol.IllegalAlphabetException;
import org.biojava.bio.symbol.IllegalSymbolException;
import org.biojava.bio.symbol.Symbol;
import org.biojava.bio.taxa.EbiFormat;
import org.biojava.bio.taxa.Taxon;
import org.biojava.bio.BioException;

/**
 * <p><code>EmblFileFormer</code> performs the detailed formatting of
 * EMBL entries for writing to a <code>PrintStream</code>. Currently
 * the formatting of the header is not correct. This really needs to
 * be addressed in the parser which is merging fields which should
 * remain separate.</p>
 *
 * <p>The event generator used to feed events to this class should
 * enforce ordering of those events. This class will stream data
 * directly to the <code>PrintStream</code>.</p>
 *
 * <p>This implementation requires that all the symbols be added in
 * one block as it does not buffer the tokenized symbols between
 * calls.</p>
 *
 * @author Keith James
 * @author Len Trigg (Taxon output)
 * @since 1.2
 */
public class EmblFileFormer extends AbstractGenEmblFileFormer
    implements SeqFileFormer
{
    // Tags which are special cases, not having "XX" after them
    private static final List NON_SEPARATED_TAGS = new ArrayList();

    static
    {
        NON_SEPARATED_TAGS.add(EmblLikeFormat.SOURCE_TAG);
        NON_SEPARATED_TAGS.add(EmblLikeFormat.REFERENCE_TAG);
        NON_SEPARATED_TAGS.add(EmblLikeFormat.COORDINATE_TAG);
        NON_SEPARATED_TAGS.add(EmblLikeFormat.REF_ACCESSION_TAG);
        NON_SEPARATED_TAGS.add(EmblLikeFormat.AUTHORS_TAG);
        NON_SEPARATED_TAGS.add(EmblLikeFormat.TITLE_TAG);
        NON_SEPARATED_TAGS.add(EmblLikeFormat.FEATURE_TAG);
        NON_SEPARATED_TAGS.add(EmblLikeFormat.JOURNAL_TAG);//Lorna: added
        NON_SEPARATED_TAGS.add(EmblLikeFormat.SEPARATOR_TAG);//Lorna: added
    }

    // Number of residues printed on one sequence line
    private static final int RESIDUES_PER_LINE = 60;

    // 19 spaces
    private static final String FT_LEADER =
        EmblLikeFormat.FEATURE_TABLE_TAG + "                   ";

    // 3 spaces
    private static final String SQ_LEADER = "   ";

    // 80 spaces
    private static final String EMPTY_LINE =
        "                                        " +
        "                                        ";

    private PrintStream stream;

    // Stored by setName; not read anywhere in this class
    private String idLine;
    // Accession line cached from the PROPERTY_EMBL_ACCESSIONS event
    // until the AC tag itself is seen
    private String accLine;

    /**
     * Creates a new <code>EmblFileFormer</code> using
     * <code>System.out</code> stream.
     */
    protected EmblFileFormer()
    {
        this(System.out);
    }

    /**
     * Creates a new <code>EmblFileFormer</code> using the specified
     * stream.
     *
     * @param stream a <code>PrintStream</code>.
     */
    protected EmblFileFormer(PrintStream stream)
    {
        super();
        this.stream = stream;
    }

    /**
     * Returns the stream to which formatted output is written.
     *
     * @return a <code>PrintStream</code>.
     */
    public PrintStream getPrintStream()
    {
        return stream;
    }

    /**
     * Sets the stream to which formatted output is written.
     *
     * @param stream a <code>PrintStream</code>.
     */
    public void setPrintStream(PrintStream stream)
    {
        this.stream = stream;
    }

    /**
     * Records the sequence name. Nothing is printed by this call.
     *
     * @param id a <code>String</code> name.
     */
    public void setName(String id) throws ParseException
    {
        idLine = id;
    }

    /**
     * Resets the residue counters ready for a new sequence.
     */
    public void startSequence() throws ParseException
    {
       aCount = 0;
       cCount = 0;
       gCount = 0;
       tCount = 0;
       oCount = 0;
    }

    /**
     * Prints the end-of-entry marker.
     */
    public void endSequence() throws ParseException
    {
        stream.println(EmblLikeFormat.END_SEQUENCE_TAG);
    }

    /**
     * Does nothing; the URI is not written.
     */
    public void setURI(String uri) throws ParseException { }

    /**
     * Prints the separator line, the SQ summary line and then the
     * symbols, 60 per line in blocks of 10, with the running residue
     * count right-aligned at column 80.
     *
     * @param alpha the <code>Alphabet</code> supplying the "token"
     * tokenization used to print the symbols.
     * @param syms the array holding the symbols.
     * @param start index of the first symbol to print.
     * @param length number of symbols to print.
     *
     * @exception IllegalAlphabetException if the alphabet cannot
     * tokenize the symbols.
     */
    public void addSymbols(Alphabet  alpha,
                           Symbol [] syms,
                           int       start,
                           int       length)
        throws IllegalAlphabetException
    {
        try
        {
            int end = start + length - 1;

            for (int i = start; i <= end; i++)
            {
                Symbol sym = syms[i];

                if (sym == a)
                    aCount++;
                else if (sym == c)
                    cCount++;
                else if (sym == g)
                    gCount++;
                else if (sym == t)
                    tCount++;
                else
                    oCount++;
            }

            StringBuffer sb = new StringBuffer(EmblLikeFormat.SEPARATOR_TAG);
            sb.append(nl);
            sb.append("SQ   Sequence ");
            sb.append(length + " BP; ");
            sb.append(aCount + " A; ");
            sb.append(cCount + " C; ");
            sb.append(gCount + " G; ");
            sb.append(tCount + " T; ");
            sb.append(oCount + " other;");

            // Print sequence summary header
            stream.println(sb);

            // All lines hold 60 symbols, except the last (if partial)
            for (int offset = 0; offset < length; offset += RESIDUES_PER_LINE)
            {
                // Prep the whitespace
                StringBuffer sq = new StringBuffer(EMPTY_LINE);

                // How long is this chunk?
                int len = Math.min(RESIDUES_PER_LINE, length - offset);
                // Prepare a Symbol array same length as chunk
                Symbol [] sa = new Symbol [len];

                // Get symbols and format into blocks of tokens
                System.arraycopy(syms, start + offset, sa, 0, len);

                String blocks = (formatTokenBlock(new StringBuffer(), sa, 10,
                         alpha.getTokenization("token"))).toString();

                sq.replace(5, blocks.length() + 5, blocks);

                // Calculate the running residue count and add to the line
                String count = Integer.toString(offset + len);
                sq.replace((80 - count.length()), 80, count);

                // Print formatted sequence line
                stream.println(sq);
            }
        }
        catch (BioException ex)
        {
            throw new IllegalAlphabetException(ex, "Alphabet not tokenizing");
        }
    }

    /**
     * Formats one header property and prints it, followed by an "XX"
     * separator line unless the tag is one of the non-separated tags.
     *
     * <p>Special cases: the accession collection is cached rather
     * than printed; an organism <code>Taxon</code> is expanded into
     * OS, OC and OX lines; DT, DR, RA and RX collections are printed
     * one value per line without wrapping.</p>
     *
     * @param key the property key; its <code>toString()</code> is
     * used as the line tag.
     * @param value a <code>String</code>, <code>Collection</code> or
     * <code>Taxon</code>; any other value (including null) prints the
     * bare tag.
     */
    public void addSequenceProperty(Object key, Object value)
        throws ParseException
    {
        // Separators are no longer dropped here
        // (lorna: I've changed this so they are ignored in SeqIOEventEmitter)

        String tag = key.toString();
        String leader = tag + SQ_LEADER;
        String line = "";
        int wrapWidth = 85 - leader.length();

        // Special case: accession number
        if (key.equals(EmblProcessor.PROPERTY_EMBL_ACCESSIONS))
        {
            accLine = buildPropertyLine((Collection) value, ";", true);
            return;
        }
        else if (key.equals(EmblLikeFormat.ACCESSION_TAG))
        {
            // May be null if no accession collection has been seen
            line = accLine;
        }
        else if (key.equals(OrganismParser.PROPERTY_ORGANISM))
        {
            Taxon taxon = (Taxon) value;
            addSequenceProperty(EmblLikeFormat.SOURCE_TAG, taxon);
            addSequenceProperty(EmblLikeFormat.ORGANISM_TAG, taxon.getParent());
            addSequenceProperty(EmblLikeFormat.ORGANISM_XREF_TAG, taxon);
            return;
        }

        if (value instanceof String)
        {
            line = (String) value;
        }
        else if (value instanceof Collection)
        {
            if (isOneValuePerLineTag(key))
            {
                // Each value goes onto its own line and is never wrapped
                line = buildPropertyLine((Collection) value, nl + leader, false);
                wrapWidth = Integer.MAX_VALUE;
            }
            else
            {
                line = buildPropertyLine((Collection) value, " ", false);
            }
        }
        else if (value instanceof Taxon)
        {
            if (key.equals(EmblLikeFormat.ORGANISM_TAG)) {
                line = EbiFormat.getInstance().serialize((Taxon) value);
            } else if (key.equals(EmblLikeFormat.SOURCE_TAG)) {
                line = EbiFormat.getInstance().serializeSource((Taxon) value);
            } else if (key.equals(EmblLikeFormat.ORGANISM_XREF_TAG)) {
                line = EbiFormat.getInstance().serializeXRef((Taxon) value);
            }
        }

        // A null line (AC tag with no cached accessions) is treated
        // like an empty one instead of failing on line.length()
        if (line == null || line.length() == 0)
        {
            stream.println(tag);
        }
        else
        {
            StringBuffer sb =
                formatSequenceProperty(new StringBuffer(), line, leader, wrapWidth);
            stream.println(sb);
        }
        // Special case: those which don't get separated
        if (! NON_SEPARATED_TAGS.contains(key))
            stream.println(EmblLikeFormat.SEPARATOR_TAG);
        // Special case: feature header
        if (key.equals(EmblLikeFormat.FEATURE_TAG))
            stream.println(EmblLikeFormat.FEATURE_TAG);
    }

    /**
     * Tells whether a collection-valued property is printed with one
     * value per line (DT, DR, RA and RX) rather than space-joined.
     */
    private static boolean isOneValuePerLineTag(Object key)
    {
        return key.equals(EmblLikeFormat.DATE_TAG)
            || key.equals(EmblLikeFormat.DR_TAG)
            || key.equals(EmblLikeFormat.AUTHORS_TAG)
            || key.equals(EmblLikeFormat.REF_ACCESSION_TAG);
    }

    /**
     * Prints the feature key and location line(s) of a feature.
     *
     * @param templ the feature template; its <code>type</code> and
     * <code>location</code> are printed, and its strand too when it
     * is a <code>StrandedFeature.Template</code>.
     */
    public void startFeature(Feature.Template templ)
        throws ParseException
    {
        int strand = 0;

        if (templ instanceof StrandedFeature.Template)
            strand = ((StrandedFeature.Template) templ).strand.getValue();

        StringBuffer sb = new StringBuffer(FT_LEADER);
        sb = formatLocationBlock(sb, templ.location, strand, FT_LEADER, 80);
        // Overwrite the blank leader with the feature key from column 6
        sb.replace(5, 5 + templ.type.length(), templ.type);
        stream.println(sb);
    }

    /**
     * Does nothing; features need no closing line.
     */
    public void endFeature() throws ParseException { }

    /**
     * Prints a feature qualifier. A <code>Collection</code> value is
     * printed as one qualifier per element.
     *
     * @param key the qualifier key.
     * @param value the qualifier value, or a <code>Collection</code>
     * of values.
     */
    public void addFeatureProperty(Object key, Object value)
    {
        // Don't print internal data structures
        if (key.equals(Feature.PROPERTY_DATA_KEY))
            return;

        // The value may be a collection if several qualifiers of the
        // same type are present in a feature
        if (value instanceof Collection)
        {
            for (Iterator vi = ((Collection) value).iterator(); vi.hasNext();)
            {
                printQualifier(key, vi.next());
            }
        }
        else
        {
            printQualifier(key, value);
        }
    }

    /**
     * Formats a single qualifier and prints it as an FT block.
     */
    private void printQualifier(Object key, Object value)
    {
        String qualifier =
            formatQualifier(new StringBuffer(), key, value).substring(0);
        StringBuffer fb =
            formatQualifierBlock(new StringBuffer(), qualifier, FT_LEADER, 80);
        stream.println(fb);
    }

    /**
     * Joins the string forms of the elements of a collection.
     *
     * @param property the values to join. A null element contributes
     * an empty string.
     * @param separator the text placed after each value.
     * @param terminate if true the separator after the last value is
     * kept, otherwise it is removed.
     *
     * @return the joined text; the empty string for an empty
     * collection.
     */
    private String buildPropertyLine(Collection property,
                                     String separator,
                                     boolean terminate)
    {
        StringBuffer sb = new StringBuffer();

        for (Iterator pi = property.iterator(); pi.hasNext();)
        {
            Object item = pi.next();

            // Lists built by the parser may hold null for empty lines
            if (item != null)
                sb.append(item.toString());
            sb.append(separator);
        }

        // Nothing to trim from an empty buffer; substring with a
        // negative end index would throw
        if (terminate || sb.length() == 0)
        {
            return sb.toString();
        }

        return sb.substring(0, sb.length() - separator.length());
    }
}
-------------- next part --------------
/*
 *                    BioJava development code
 *
 * This code may be freely distributed and modified under the
 * terms of the GNU Lesser General Public Licence.  This should
 * be distributed with the code.  If you do not have a copy,
 * see:
 *
 *      http://www.gnu.org/copyleft/lesser.html
 *
 * Copyright for this code is held jointly by the individual
 * authors.  These should be listed in @author doc comments.
 *
 * For more information on the BioJava project and its aims,
 * or to join the biojava-l mailing list, visit the home page
 * at:
 *
 *      http://www.biojava.org/
 *
 */

package org.biojava.bio.seq.io;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.PrintStream;
import java.io.Serializable;
import java.util.Vector;
import java.util.ArrayList;

import org.biojava.bio.seq.Sequence;
import org.biojava.bio.symbol.IllegalSymbolException;
import org.biojava.utils.ParseErrorEvent;
import org.biojava.utils.ParseErrorListener;
import org.biojava.utils.ParseErrorSource;
import org.biojava.utils.ChangeVetoException;

/**
 * <p>
 * Format processor for handling EMBL records and similar files.  This
 * takes a very simple approach: all `normal' attribute lines are
 * passed to the listener as a tag (first two characters) and a value
 * (the rest of the line from the 6th character onwards).  Any data
 * between the special `SQ' line and the "//" entry terminator is
 * passed as a SymbolReader.
 * </p>
 *
 * <p>
 * The lines of one literature reference (an `RN' line and the
 * reference lines following it) are an exception: they are collected
 * into a single <code>ReferenceAnnotation</code> which is passed to
 * the listener under the key <code>ReferenceAnnotation.class</code>.
 * </p>
 *
 * <p>
 * This low-level format processor should normally be used in
 * conjunction with one or more `filter' objects, such as
 * EmblProcessor.
 * </p>
 *
 * <p>
 * Many ideas borrowed from the old EmblFormat processor by Thomas
 * Down and Thad Welch.
 * </p>
 *
 * @author Thomas Down
 * @author Greg Cox
 * @author Keith James
 * @author Len Trigg
 * @since 1.1
 */

public class EmblLikeFormat
    implements
            SequenceFormat,
            Serializable,
            ParseErrorSource,
            ParseErrorListener
{
    public static final String DEFAULT = "EMBL";

    protected static final String ID_TAG = "ID";
    protected static final String SIZE_TAG = "SIZE";
    protected static final String STRAND_NUMBER_TAG = "STRANDS";
    protected static final String TYPE_TAG = "TYPE";
    protected static final String CIRCULAR_TAG = "CIRCULAR";
    protected static final String DIVISION_TAG = "DIVISION";
    protected static final String DR_TAG = "DR"; //Lorna: new tag

    protected static final String ACCESSION_TAG = "AC";
    protected static final String VERSION_TAG = "SV";
    protected static final String DATE_TAG = "DT";
    protected static final String DEFINITION_TAG = "DE";
    protected static final String KEYWORDS_TAG = "KW";
    protected static final String SOURCE_TAG = "OS";
    protected static final String ORGANISM_TAG = "OC";
    protected static final String ORGANISM_XREF_TAG = "OX";
    protected static final String REFERENCE_TAG = "RN";
    protected static final String COORDINATE_TAG = "RP";
    protected static final String REF_ACCESSION_TAG = "RX";
    protected static final String AUTHORS_TAG = "RA";
    protected static final String TITLE_TAG = "RT";
    protected static final String JOURNAL_TAG = "RL";
    protected static final String COMMENT_TAG = "CC";
    protected static final String FEATURE_TAG = "FH";
    protected static final String SEPARATOR_TAG = "XX";
    protected static final String FEATURE_TABLE_TAG = "FT";
    protected static final String START_SEQUENCE_TAG = "SQ";
    protected static final String END_SEQUENCE_TAG = "//";

    private static final String PREMATURE_END_MESSAGE =
        "Premature end of stream or missing end tag '//' for EMBL";

    private boolean elideSymbols = false;
    private Vector mListeners = new Vector();

    /**
     * <p>Specifies whether the symbols (SQ) part of the entry should
     * be ignored. If this property is set to <code>true</code>, the
     * parser will never call addSymbols on the
     * <code>SeqIOListener</code>, but parsing will be faster if
     * you're only interested in header information.</p>
     *
     * <p> This property also allows the header to be parsed for files
     * which have invalid sequence data.</p>
     */
    public void setElideSymbols(boolean b)
    {
        elideSymbols = b;
    }

    /**
     * Return a flag indicating if symbol data will be skipped
     * when parsing streams.
     */
    public boolean getElideSymbols()
    {
        return elideSymbols;
    }

    /**
     * Reads one entry from the reader and reports it to the listener.
     *
     * <p>Attribute lines are passed on as tag/value properties, with
     * the value null for lines of five characters or fewer. Each
     * reference is passed on as one <code>ReferenceAnnotation</code>.
     * Lines after the SQ line are fed to the symbol parser unless
     * symbols are being elided.</p>
     *
     * @param reader the source of lines.
     * @param symParser supplies the stream parser for the sequence
     * lines.
     * @param listener receives the parse events.
     *
     * @return true if non-whitespace data follows the "//" line of
     * this entry, false if the end of the stream was reached.
     *
     * @exception IOException if the stream ends before a "//" line
     * is seen.
     */
    public boolean readSequence(BufferedReader     reader,
                                SymbolTokenization symParser,
                                SeqIOListener      listener)
        throws IllegalSymbolException, IOException, ParseException
    {
        if (listener instanceof ParseErrorSource) {
            ((ParseErrorSource) listener).addParseErrorListener(this);
        }

        String          line;
        // A line read by the reference collector which did not belong
        // to the reference and still has to be processed here
        String          pendingLine           = null;
        StreamParser    sparser               = null;
        boolean         hasMoreSequence       = true;
        boolean         hasInternalWhitespace = false;

        listener.startSequence();

        while (true)
        {
            if (pendingLine != null)
            {
                line = pendingLine;
                pendingLine = null;
            }
            else
            {
                line = reader.readLine();
            }

            if (line == null)
                break;

            if (line.startsWith(END_SEQUENCE_TAG))
            {
                if (sparser != null)
                {
                    // End of symbol data
                    sparser.close();
                    sparser = null;
                }

                // Allows us to tolerate trailing whitespace without
                // thinking that there is another Sequence to follow
                while (true)
                {
                    reader.mark(1);
                    int c = reader.read();

                    if (c == -1)
                    {
                        hasMoreSequence = false;
                        break;
                    }

                    if (Character.isWhitespace((char) c))
                    {
                        hasInternalWhitespace = true;
                        continue;
                    }

                    if (hasInternalWhitespace)
                        System.err.println("Warning: whitespace found between sequence entries");

                    reader.reset();
                    break;
                }

                listener.endSequence();
                return hasMoreSequence;
            }
            else if (line.startsWith(START_SEQUENCE_TAG))
            {
                // Adding a null property to flush the last feature;
                // Needed for Swissprot files because there is no gap
                // between the feature table and the sequence data
                listener.addSequenceProperty(SEPARATOR_TAG, "");

                sparser = symParser.parseStream(listener);
            }
            else if (sparser != null)
            {
                // Sequence line
                if (! elideSymbols)
                    processSequenceLine(line, sparser);
            }
            else
            {
                // Normal attribute line
                String tag  = line.substring(0, 2);
                String rest = null;
                if (line.length() > 5)
                {
                    rest = line.substring(5);
                }

                if (tag.equals(REFERENCE_TAG))
                {
                    // lorna: the complete set of reference lines goes
                    // out as a single annotation event
                    pendingLine = readReferenceBlock(reader, rest, listener);
                }
                else
                {
                    listener.addSequenceProperty(tag, rest);
                }
            }
        }

        if (sparser != null)
            sparser.close();

        throw new IOException(PREMATURE_END_MESSAGE);
    }

    /**
     * Collects the lines of one reference, starting after its RN
     * line, into a <code>ReferenceAnnotation</code> and passes it to
     * the listener under the key
     * <code>ReferenceAnnotation.class</code>.
     *
     * <p>The reference ends with its "XX" line, which is consumed and
     * stored like the other lines. It also ends at the first line
     * which is neither "XX" nor a reference line other than RN (files
     * without separators); that line is not consumed.</p>
     *
     * <p>If the annotation vetoes a change the stack trace is printed
     * and the reference is not passed to the listener.</p>
     *
     * @param reader the source of lines, positioned after the RN
     * line.
     * @param referenceValue the value of the RN line, possibly null.
     * @param listener receives the finished annotation.
     *
     * @return the line which ended the reference without belonging to
     * it, or null if there is none.
     *
     * @exception IOException if the stream ends inside the reference.
     */
    private String readReferenceBlock(BufferedReader reader,
                                      String         referenceValue,
                                      SeqIOListener  listener)
        throws IOException, ParseException
    {
        // Created only when a reference really starts, not per line
        ReferenceAnnotation refAnnot = new ReferenceAnnotation();
        String unconsumed = null;

        try
        {
            refAnnot.setProperty(REFERENCE_TAG, referenceValue);

            String tag = REFERENCE_TAG;

            while (! tag.equals(SEPARATOR_TAG))
            {
                String line = reader.readLine();

                if (line == null)
                    throw new IOException(PREMATURE_END_MESSAGE);

                tag = line.substring(0, 2);

                if (! isReferenceBlockTag(tag))
                {
                    unconsumed = line;
                    break;
                }

                String rest = null; // stays null for XX lines
                if (line.length() > 5)
                {
                    rest = line.substring(5);
                }

                addReferenceValue(refAnnot, tag, rest);
            }

            listener.addSequenceProperty(ReferenceAnnotation.class, refAnnot);
        }
        catch (ChangeVetoException cve)
        {
            cve.printStackTrace();
        }

        return unconsumed;
    }

    /**
     * Tells whether a line with this tag continues the current
     * reference: the "XX" separator, or any tag starting with `R'
     * except RN, which starts the next reference.
     */
    private static boolean isReferenceBlockTag(String tag)
    {
        if (tag.equals(SEPARATOR_TAG))
            return true;

        return tag.startsWith("R") && ! tag.equals(REFERENCE_TAG);
    }

    /**
     * Stores a value under a tag of the reference. The first value
     * is stored as it is; when a tag repeats, a String value is
     * turned into a list and later values are appended to that list.
     */
    private static void addReferenceValue(ReferenceAnnotation refAnnot,
                                          String              tag,
                                          String              value)
        throws ChangeVetoException
    {
        if (! refAnnot.containsProperty(tag))
        {
            refAnnot.setProperty(tag, value);
            return;
        }

        Object existing = refAnnot.getProperty(tag);

        if (existing instanceof ArrayList)
        {
            ((ArrayList) existing).add(value);
        }
        else if (existing instanceof String)
        {
            ArrayList values = new ArrayList();
            values.add(existing);
            values.add(value);
            refAnnot.setProperty(tag, values);
        }
    }

    /**
     * Dispatch symbol data from SQ-block line of an EMBL-like file.
     * The line is split at spaces; `.' and `~' are passed on as `-'.
     * Processing of the line stops at the first token which starts
     * with a digit.
     */
    protected void processSequenceLine(String line, StreamParser parser)
        throws IllegalSymbolException, ParseException
    {
        char[] cline = line.toCharArray();
        int parseStart = 0;
        int parseEnd   = 0;

        while (parseStart < cline.length)
        {
            while (parseStart < cline.length && cline[parseStart] == ' ')
                ++parseStart;
            if (parseStart >= cline.length)
                break;

            if (Character.isDigit(cline[parseStart]))
                return;

            // Scan from the first character of the token so that a
            // leading '.' or '~' is converted as well; the character
            // at parseStart is known not to be a space
            parseEnd = parseStart;
            while (parseEnd < cline.length && cline[parseEnd] != ' ') {
                if (cline[parseEnd] == '.' || cline[parseEnd] == '~') {
                   cline[parseEnd] = '-';
                }
                ++parseEnd;
            }

            // Got a segment of read sequence data
            parser.characters(cline, parseStart, parseEnd - parseStart);

            parseStart = parseEnd;
        }
    }

    /**
     * Writes a sequence to the stream in the default format.
     *
     * @param seq a <code>Sequence</code> to write out.
     * @param os a <code>PrintStream</code> object.
     *
     * @exception IOException if an error occurs.
     */
    public void writeSequence(Sequence seq, PrintStream os)
        throws IOException
    {
        writeSequence(seq, getDefaultFormat(), os);
    }

    /**
     * <code>writeSequence</code> writes a sequence to the specified
     * <code>PrintStream</code>, using the specified format.
     *
     * @param seq a <code>Sequence</code> to write out.
     * @param format a <code>String</code> indicating which sub-format
     * of those available from a particular
     * <code>SequenceFormat</code> implemention to use when
     * writing; "EMBL" or "SWISSPROT", ignoring case.
     * @param os a <code>PrintStream</code> object.
     *
     * @exception IOException if an error occurs.
     * @exception IllegalArgumentException if the format is null or
     * not recognised.
     * @deprecated use writeSequence(Sequence seq, PrintStream os)
     */
    public void writeSequence(Sequence seq, String format, PrintStream os)
        throws IOException
    {
        SeqFileFormer former;

        // Literal first, so a null format is reported as unknown
        if ("EMBL".equalsIgnoreCase(format))
            former = new EmblFileFormer();
        else if ("SWISSPROT".equalsIgnoreCase(format))
            former = new SwissprotFileFormer();
        else
            throw new IllegalArgumentException("Unknown format '"
                                               + format
                                               + "'");
        former.setPrintStream(os);

        SeqIOEventEmitter emitter =
            new SeqIOEventEmitter(GenEmblPropertyComparator.INSTANCE,
                                  GenEmblFeatureComparator.INSTANCE);

        emitter.getSeqIOEvents(seq, former);
    }

    /**
     * <code>getDefaultFormat</code> returns the String identifier for
     * the default format written by a <code>SequenceFormat</code>
     * implementation.
     *
     * @return a <code>String</code>.
     * @deprecated
     */
    public String getDefaultFormat()
    {
        return DEFAULT;
    }

    /**
     * <p>
     * This method determines the behaviour when a bad line is processed.
     * Some options are to log the error, throw an exception, ignore it
     * completely, or pass the event through.
     * </p>
     *
     * <p>
     * This method should be overwritten when different behavior is desired.
     * </p>
     *
     * @param theEvent The event that contains the bad line and token.
     */
    public void BadLineParsed(ParseErrorEvent theEvent)
    {
        notifyParseErrorEvent(theEvent);
    }

    /**
     * Adds a parse error listener to the list of listeners if it isn't already
     * included.
     *
     * @param theListener Listener to be added.
     */
    public synchronized void addParseErrorListener(ParseErrorListener theListener)
    {
        if (! mListeners.contains(theListener))
        {
            mListeners.addElement(theListener);
        }
    }

    /**
     * Removes a parse error listener from the list of listeners if it is
     * included.
     *
     * @param theListener Listener to be removed.
     */
    public synchronized void removeParseErrorListener(ParseErrorListener theListener)
    {
        if (mListeners.contains(theListener))
        {
            mListeners.removeElement(theListener);
        }
    }

    // Protected methods
    /**
     * Passes the event on to all the listeners registered for ParseErrorEvents.
     *
     * @param theEvent The event to be handed to the listeners.
     */
    protected void notifyParseErrorEvent(ParseErrorEvent theEvent)
    {
        Vector listeners;
        // Work on a copy so listeners are called outside the lock
        synchronized(this)
        {
            listeners = (Vector)mListeners.clone();
        }

        for (int index = 0; index < listeners.size(); index++)
        {
            ParseErrorListener client = (ParseErrorListener)listeners.elementAt(index);
            client.BadLineParsed(theEvent);
        }
    }
}
-------------- next part --------------
/*
 * Created by IntelliJ IDEA.
 * User: lmorris
 * Date: Nov 14, 2003
 * Time: 11:11:52 AM
 * To change template for new class use 
 * Code Style | Class Templates options (Tools | IDE Options).
 */
package org.biojava.bio.seq.io;

import java.util.Comparator;
import java.util.List;
import java.util.ArrayList;

/**
 * <code>EmblReferenceComparator</code> compares the tags of the
 * lines of an EMBL reference by the order in which this class lists
 * them: RN, RP, RX, RA, RT, RL and finally the XX separator.
 *
 * <p>Objects which are not one of these tags have position -1 and so
 * sort before all of them.</p>
 */
public class EmblReferenceComparator implements Comparator {

    static final Comparator INSTANCE = new EmblReferenceComparator();

    private List tagOrder;

    /**
     * Creates a comparator with the fixed reference tag order.
     */
    public EmblReferenceComparator()
    {
        final Object [] orderedTags = {
            EmblLikeFormat.REFERENCE_TAG,
            EmblLikeFormat.COORDINATE_TAG,
            EmblLikeFormat.REF_ACCESSION_TAG,
            EmblLikeFormat.AUTHORS_TAG,
            EmblLikeFormat.TITLE_TAG,
            EmblLikeFormat.JOURNAL_TAG,
            EmblLikeFormat.SEPARATOR_TAG
        };

        tagOrder = new ArrayList();
        for (int i = 0; i < orderedTags.length; i++)
        {
            tagOrder.add(orderedTags[i]);
        }
    }

    /**
     * Compares two tags by their position in the reference tag order.
     *
     * @return the position of the first tag minus the position of
     * the second.
     */
    public int compare(Object o1, Object o2)
    {
        final int firstPosition  = tagOrder.indexOf(o1);
        final int secondPosition = tagOrder.indexOf(o2);

        return firstPosition - secondPosition;
    }

}
-------------- next part --------------
/*
 *                    BioJava development code
 *
 * This code may be freely distributed and modified under the
 * terms of the GNU Lesser General Public Licence.  This should
 * be distributed with the code.  If you do not have a copy,
 * see:
 *
 *      http://www.gnu.org/copyleft/lesser.html
 *
 * Copyright for this code is held jointly by the individual
 * authors.  These should be listed in @author doc comments.
 *
 * For more information on the BioJava project and its aims,
 * or to join the biojava-l mailing list, visit the home page
 * at:
 *
 *      http://www.biojava.org/
 *
 */

package org.biojava.bio.seq.io;

import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;

/**
 * <p><code>GenEmblPropertyComparator</code> compares Genbank/EMBL
 * file format tags by the order in which they should appear in their
 * respective formats.</p>
 *
 * <p>EMBL tags sort before Genbank tags. This is arbitrary. Given the
 * subtle differences in the values accompanying equivalent tags in
 * these formats the two sets shouldn't be mixed anyway.</p>
 *
 * <p>Any tags which belong to neither set sort before anything
 * else.</p>
 *
 * @author Keith James
 */
final class GenEmblPropertyComparator implements Comparator
{
    static final Comparator INSTANCE = new GenEmblPropertyComparator();

    private List tagOrder;

    private GenEmblPropertyComparator()
    {
        // The RN tag has no entry of its own: the slot after the
        // organism tags is keyed by ReferenceAnnotation.class.
        final Object [] emblOrder = {
            EmblLikeFormat.ID_TAG,
            EmblLikeFormat.ACCESSION_TAG,
            EmblLikeFormat.VERSION_TAG,
            EmblLikeFormat.DATE_TAG,
            EmblLikeFormat.DEFINITION_TAG,
            EmblLikeFormat.KEYWORDS_TAG,
            EmblLikeFormat.SOURCE_TAG,
            EmblLikeFormat.ORGANISM_TAG,
            ReferenceAnnotation.class,
            EmblLikeFormat.DR_TAG, //lorna:added 21.08.03
            EmblLikeFormat.COORDINATE_TAG,
            EmblLikeFormat.REF_ACCESSION_TAG,
            EmblLikeFormat.AUTHORS_TAG,
            EmblLikeFormat.TITLE_TAG,
            EmblLikeFormat.JOURNAL_TAG,
            EmblLikeFormat.COMMENT_TAG,
            EmblLikeFormat.FEATURE_TAG
        };

        final Object [] genbankOrder = {
            GenbankFormat.LOCUS_TAG,
            GenbankFormat.SIZE_TAG,
            GenbankFormat.STRAND_NUMBER_TAG,
            GenbankFormat.TYPE_TAG,
            GenbankFormat.CIRCULAR_TAG,
            GenbankFormat.DIVISION_TAG,
            GenbankFormat.DATE_TAG,
            GenbankFormat.DEFINITION_TAG,
            GenbankFormat.ACCESSION_TAG,
            GenbankFormat.VERSION_TAG,
            GenbankFormat.GI_TAG,
            GenbankFormat.KEYWORDS_TAG,
            GenbankFormat.SOURCE_TAG,
            GenbankFormat.ORGANISM_TAG,
            GenbankFormat.REFERENCE_TAG,
            GenbankFormat.AUTHORS_TAG,
            GenbankFormat.TITLE_TAG,
            GenbankFormat.JOURNAL_TAG,
            GenbankFormat.COMMENT_TAG,
            GenbankFormat.FEATURE_TAG
        };

        tagOrder = new ArrayList();
        appendAll(emblOrder);
        appendAll(genbankOrder);
    }

    /**
     * Appends the tags to the order list, keeping their sequence.
     */
    private void appendAll(Object [] tags)
    {
        for (int i = 0; i < tags.length; i++)
        {
            tagOrder.add(tags[i]);
        }
    }

    /**
     * Compares two tags by their first position in the tag order.
     *
     * @return the position of the first tag minus the position of
     * the second; a tag which is not listed has position -1.
     */
    public int compare(Object o1, Object o2)
    {
        final int firstPosition  = tagOrder.indexOf(o1);
        final int secondPosition = tagOrder.indexOf(o2);

        return firstPosition - secondPosition;
    }
}
-------------- next part --------------
/*
 * Created by IntelliJ IDEA.
 * User: lmorris
 * Date: Nov 14, 2003
 * Time: 11:45:41 AM
 * To change template for new class use 
 * Code Style | Class Templates options (Tools | IDE Options).
 */
package org.biojava.bio.seq.io;

import org.biojava.bio.AbstractAnnotation;
import org.biojava.utils.ChangeVetoException;

import java.util.Map;
import java.util.HashMap;

public class ReferenceAnnotation extends AbstractAnnotation {

    /**
     * The properties map. This may be null if no property values have
     * yet been set.
     */
    private Map properties;

    /**
     * Creates an annotation for a single reference. The new instance
     * already carries one property: an empty value stored under
     * <code>EmblLikeFormat.SEPARATOR_TAG</code>, because every
     * reference is followed by an empty XX line.
     */
    public ReferenceAnnotation() {
        super();
        try {
            // All references have an empty XX line.
            this.setProperty(EmblLikeFormat.SEPARATOR_TAG, "");
        } catch (ChangeVetoException e) {
            // NOTE(review): a veto is not expected on an instance that
            // is still being constructed - confirm. The failure is
            // reported on stderr and the instance is left without the
            // separator property.
            e.printStackTrace();
        }
    }

    /**
     * Returns the backing property map, creating it on first use.
     *
     * @return the <code>Map</code> holding this annotation's
     * properties; never null.
     */
    protected Map getProperties() {
        if (!propertiesAllocated()) {
            properties = new HashMap();
        }
        return properties;
    }

    /**
     * Reports whether the backing property map has been created yet.
     *
     * @return true if the map exists, false if no property has been
     * stored so far.
     */
    protected boolean propertiesAllocated() {
        return properties != null;
    }
}
-------------- next part --------------
/*
 *                    BioJava development code
 *
 * This code may be freely distributed and modified under the
 * terms of the GNU Lesser General Public Licence.  This should
 * be distributed with the code.  If you do not have a copy,
 * see:
 *
 *      http://www.gnu.org/copyleft/lesser.html
 *
 * Copyright for this code is held jointly by the individual
 * authors.  These should be listed in @author doc comments.
 *
 * For more information on the BioJava project and its aims,
 * or to join the biojava-l mailing list, visit the home page
 * at:
 *
 *      http://www.biojava.org/
 *
 */

package org.biojava.bio.seq.io;

import java.util.*;

import org.biojava.bio.Annotation;
import org.biojava.bio.BioError;
import org.biojava.bio.seq.Feature;
import org.biojava.bio.seq.FeatureHolder;
import org.biojava.bio.seq.Sequence;
import org.biojava.bio.symbol.IllegalAlphabetException;
import org.biojava.bio.symbol.Symbol;

/**
 * <code>SeqIOEventEmitter</code> is a utility class which scans a
 * <code>Sequence</code> object and sends events describing its
 * constituent data to a <code>SeqIOListener</code>. The listener
 * should be able to reconstruct the <code>Sequence</code> from these
 * events.
 *
 * @author Keith James
 * @since 1.2
*/
class SeqIOEventEmitter
{
    // Zero-length prototype handed to List.toArray so that the result
    // is typed as Symbol [].
    private static final Symbol [] symProto = new Symbol [0];

    // Ordering applied to the keys of the sequence annotation.
    private final Comparator seqPropComparator;
    // Ordering applied to the flattened feature list.
    private final Comparator featureComparator;

    /**
     * Creates a new <code>SeqIOEventEmitter</code>.
     *
     * @param seqPropComparator a <code>Comparator</code> used to
     * order the sequence property keys before they are emitted.
     * @param featureComparator a <code>Comparator</code> used to
     * order the features before they are emitted.
     */
    SeqIOEventEmitter(Comparator seqPropComparator,
                      Comparator featureComparator)
    {
        this.seqPropComparator = seqPropComparator;
        this.featureComparator = featureComparator;
    }

    /**
     * <code>getSeqIOEvents</code> scans a <code>Sequence</code>
     * object and sends events describing its data to the
     * <code>SeqIOListener</code>. The events are sent in this order:
     * sequence start, name, URI, sequence properties, features (each
     * with its own properties), symbols, sequence end.
     *
     * @param seq a <code>Sequence</code>.
     * @param listener a <code>SeqIOListener</code>.
     *
     * @throws BioError if the listener rejects the symbols or fails
     * while handling an event.
     */
    void getSeqIOEvents(Sequence seq, SeqIOListener listener)
    {
        try
        {
            // Inform listener of sequence start
            listener.startSequence();

            // Pass name to listener
            listener.setName(seq.getName());

            // Pass URN to listener
            listener.setURI(seq.getURN());

            // Pass sequence properties to listener
            Annotation a = seq.getAnnotation();
            List sKeys = new ArrayList(a.keys());
            Collections.sort(sKeys, seqPropComparator);

            for (Iterator ki = sKeys.iterator(); ki.hasNext();)
            {
                Object key = ki.next();

                if (key.equals(ReferenceAnnotation.class))
                {
                    // The references are stored together under one
                    // key and are expanded one reference at a time.
                    emitReferences(a.getProperty(key), listener);
                }
                else if (!key.equals(EmblLikeFormat.SEPARATOR_TAG))
                {
                    // lorna: XX is ignored at the sequence level;
                    // separators are only emitted from within the
                    // references.
                    listener.addSequenceProperty(key, a.getProperty(key));
                }
            }

            // Recurse through sub feature tree, flattening it for
            // EMBL. The final order of the features is whatever
            // featureComparator dictates.
            List subs = getSubFeatures(seq);
            Collections.sort(subs, featureComparator);

            for (Iterator fi = subs.iterator(); fi.hasNext();)
            {
                // The template is required to call startFeature
                Feature.Template t = ((Feature) fi.next()).makeTemplate();

                // Inform listener of feature start
                listener.startFeature(t);

                // Pass feature properties (i.e. qualifiers to
                // listener)
                // FIXME: this will drop all non-comparable keys
                List fKeys = comparableList(t.annotation.keys());
                Collections.sort(fKeys);

                for (Iterator ki = fKeys.iterator(); ki.hasNext();)
                {
                    Object key = ki.next();
                    listener.addFeatureProperty(key, t.annotation.getProperty(key));
                }

                // Inform listener of feature end
                listener.endFeature();
            }

            // Add symbols
            listener.addSymbols(seq.getAlphabet(),
                                (Symbol []) seq.toList().toArray(symProto),
                                0,
                                seq.length());

            // Inform listener of sequence end
            listener.endSequence();
        }
        catch (IllegalAlphabetException iae)
        {
            // This should never happen as the alphabet is being used
            // by this Sequence instance
            throw new BioError("An internal error occurred processing symbols",iae);
        }
        catch (ParseException pe)
        {
            throw new BioError("An internal error occurred creating SeqIO events",pe);
        }
    }

    /**
     * <code>emitReferences</code> sends the properties of every
     * <code>ReferenceAnnotation</code> in a list to the listener as
     * sequence properties. The references are handled in list order
     * and the keys of each one are ordered with
     * <code>EmblReferenceComparator.INSTANCE</code>, so the lines of
     * one reference are emitted together before the next reference
     * starts.
     *
     * @param value the value stored under the
     * <code>ReferenceAnnotation.class</code> key. Anything that is
     * not a <code>List</code> is ignored; every element of the list
     * must be a <code>ReferenceAnnotation</code>.
     * @param listener a <code>SeqIOListener</code>.
     *
     * @throws ParseException if the listener fails to accept a
     * property.
     */
    private void emitReferences(Object value, SeqIOListener listener)
        throws ParseException
    {
        if (!(value instanceof List))
        {
            return;
        }

        // The same ordering applies to every reference, so it is
        // looked up once rather than once per reference.
        Comparator refPropComparator = EmblReferenceComparator.INSTANCE;

        for (Iterator ri = ((List) value).iterator(); ri.hasNext();)
        {
            ReferenceAnnotation refAnnot = (ReferenceAnnotation) ri.next();

            List refKeys = new ArrayList(refAnnot.getProperties().keySet());
            Collections.sort(refKeys, refPropComparator);

            for (Iterator kit = refKeys.iterator(); kit.hasNext();)
            {
                Object refKey = kit.next();
                // adds all the R* tags and final XX tag
                listener.addSequenceProperty(refKey, refAnnot.getProperty(refKey));
            }
        }
    }

    /**
     * <code>getSubFeatures</code> is a recursive method which returns
     * a list of all <code>Feature</code>s within a
     * <code>FeatureHolder</code>. The descendants of a feature are
     * placed in the list before the feature itself.
     *
     * @param fh a <code>FeatureHolder</code>.
     *
     * @return a <code>List</code>.
     */
    private static List getSubFeatures(FeatureHolder fh)
    {
        List subfeat = new ArrayList();

        for (Iterator fi = fh.features(); fi.hasNext();)
        {
            FeatureHolder sfh = (FeatureHolder) fi.next();

            subfeat.addAll(getSubFeatures(sfh));
            subfeat.add(sfh);
        }
        return subfeat;
    }

    /**
     * <code>comparableList</code> copies the elements of a collection
     * which implement <code>Comparable</code> into a new list, in
     * iteration order. All other elements are left out.
     *
     * @param coll a <code>Collection</code>.
     *
     * @return a new <code>List</code> of the comparable elements.
     */
    private List comparableList(Collection coll)
    {
        List res = new ArrayList();

        for (Iterator i = coll.iterator(); i.hasNext();)
        {
            Object o = i.next();

            if (o instanceof Comparable)
            {
                res.add(o);
            }
        }
        return res;
    }
}


More information about the Biojava-l mailing list