[Biojava-l] Re: Multiple questions (mark.schreiber@novartis.com)

Franck franckv at ebi.ac.uk
Mon Jan 16 10:59:42 EST 2006


Hi,

Sorry for the late response!
As for point 2) (is there a wrapper for SeqIOTools.fileToBiojava(..)?):
for one of my projects I've written a factory class which returns a 
Sequence object from a URI or a string. The formats taken into 
account are EMBL, GenBank and SwissProt.
This project is still ongoing and not fully tested, but so far this code 
works with my sequences.
If it can help someone...

Franck

p.s. You can find the java file attached.
-------------- next part --------------
package uk.ac.ebi.ftv;

import java.io.*;
import java.net.URL;
import java.net.MalformedURLException;
import java.util.regex.Pattern;

import org.biojava.bio.seq.Sequence;
import org.biojava.bio.seq.SequenceIterator;
import org.biojava.bio.seq.io.SeqIOTools;
import org.biojava.bio.seq.io.SequenceBuilder;
import org.biojava.bio.BioException;

/**
 * Project FTV : Feature Table Viewer
 * F. Valentin - Jul 2005
 * Copyright (c) European Bioinformatics Institute 2005
 * <p/>
 * Static helpers that build a BioJava {@link Sequence} from a string holding either
 * the flat-file entry itself or a URL pointing to it. The format (EMBL, GenBank/DDBJ
 * or SwissProt) is chosen by matching the first line of the entry against the
 * patterns declared below.
 * <p/>
 * $Header$
 * Version : $Name$
 * <p/>
 * <p/>
 * $Log$
 */
public abstract class SequenceFactory {

	/* ----------------------- Class variables    --------------------------- */

	// According to the documentation the first line of EMBL and SwissProt files is
	// defined as follows :
	// EMBL := ID \s+ <entryname> \s+ <dataclass>; \s+ [circular] \s+ <molecule>; \s+
	//                <division>; \s+ <seqlength> \s+ BP.
	// <entryname> := \p{Alpha} \w+
	// <dataclass> := standard
	// <molecule>  := .+  (should be the same as the value in the mol_type qualifier).
	// <division>  := (PHG)|(CON)|... (see EMBL documentation)
	// <seqlength> := \d+
	// ------------------------------------------------------------------------------
	// SwissProt := ID \s+ <entryname> \s+ <dataclass>; \s+ <type>; <length> \s+ AA.
	// <entryname> := \w{1,12}
	// <dataclass> := (STANDARD) | (PRELIMINARY)
	// <type>      := PRT
	// <length>    := \d+
	// ------------------------------------------------------------------------------
	// GenBank := LOCUS \s{7} <locusname> \s <length> \s bp \s <strandtype><molecule>
	//            \s{2} <type_adn> \s <division> \s <date>
	// <locusname>  := \w ( (\w(?<=\w)) | (\s(?=\s)) ){11}
	// <length>     := \s ( (\s(?<=\s)) | (\d (?=\d) ){4} \d
	// <strandtype> := \s{3} ([sdm]s-)
	// <molecule>   := (NA\s) | ( (DNA) | (tRNA) | (rRNA) | (mRNA) | (uRNA) | (snRNA) | (snoRNA) )
	// <type_adn>   := (circular) | (linear \s \s)
	// <division>   := \w{3}
	// <date>       := // date format dd-MMM-yyyy
	// ------------------------------------------------------------------------------
	// DDBJ := the format seems to be the same as Genbank.
	// TODO need to be confirmed.
	//
	// We don't strictly follow these definitions. The important point here is to
	// be able to distinguish the different formats. However, if new formats are
	// added it's important to adapt the tests to keep the choice deterministic !

	// \A anchors each pattern to the very start of the input, so only the first line
	// of the entry is inspected; MULTILINE lets '$' match at the end of that line.
	private static final Pattern EMBL_PATTERN      = Pattern.compile("\\AID.+BP\\.\\s*$",     Pattern.MULTILINE);
	private static final Pattern GENBANK_PATTERN   = Pattern.compile("\\ALOCUS.+\\d{4}\\s*$", Pattern.MULTILINE);
	private static final Pattern SWISSPROT_PATTERN = Pattern.compile("\\AID.+AA\\.\\s*$",     Pattern.MULTILINE);

	// Matches a single-line string that starts with "<scheme>:" and is therefore
	// treated as a URL (see http://www.ietf.org/rfc/rfc2396.txt chap 3.1).
	// '.' does not match line terminators, so a multi-line entry never matches.
	private static final Pattern URL_PATTERN = Pattern.compile("\\A\\w*(\\w|\\d|\\+|-|\\.):.+$");

	/* ------------------------- Class methods    --------------------------- */

	/**
	 * Create the biojava object Sequence according to the first line of the string.
	 * Only the first sequence found in the string is returned.
	 *
	 * @param st A string representing the sequence (a complete flat-file entry).
	 * @return the sequence object.
	 * @throws FtvUserException if the first line matches none of the known formats,
	 *                          or if BioJava fails to read the entry.
	 */
	private static Sequence createSequenceFromString(String st) throws FtvUserException {
		BufferedReader   br = new BufferedReader(new StringReader(st));
		SequenceIterator iterator;

		// EMBL must be tested before SwissProt only by convention: both start with
		// "ID" but are told apart by the trailing "BP." / "AA." of the first line.
		if (EMBL_PATTERN.matcher(st).find()) {
			iterator = SeqIOTools.readEmbl(br);
		}
		// Genbank/DDBJ format
		else if (GENBANK_PATTERN.matcher(st).find()) {
			iterator = SeqIOTools.readGenbank(br);
		}
		// SwissProt format
		else if (SWISSPROT_PATTERN.matcher(st).find()) {
			iterator = SeqIOTools.readSwissprot(br);
		}
		else {
			throw new FtvUserException(FtvUtil.MSG_SEQ_FORMAT_UNKNOWN);
		}

		// We read only the first sequence from the iterator (we use an iterator here because
		// it's simpler than creating the Sequence object directly, see StreamReader's
		// implementation to see what has to be done).
		try {
			return iterator.nextSequence();
		} catch (BioException e) {
			// Print the trace between the separators; the previous e.getStackTrace()
			// call discarded its result and printed nothing.
			System.out.println("-------------------------");
			e.printStackTrace(System.out);
			System.out.println("-------------------------");
			throw new FtvUserException("BioException : " + e.getMessage());
		}
	}

	/**
	 * Create a Sequence object according to the sort of string given as a parameter :<br>
	 * The string can be :<br>
	 *    - the sequence itself.<br>
	 *    - a URI to the sequence.<br>
	 *        eg. http://www.ebi.ac.uk/cgi-bin/dbfetch?db=EMBL&amp;id=j00021&amp;format=embl&amp;style=raw<br>
	 *            ftp://www.asite.fr/sequence.embl
	 *
	 * @param st string that represents a sequence; must not be null.
	 * @return the sequence object.
	 * @throws FtvUserException if the URL protocol is unknown, the remote file is not
	 *                          found, or the entry cannot be parsed.
	 * @throws IOException      if reading from the URL fails for any other reason.
	 */
	public static Sequence createSequence(String st) throws FtvUserException, IOException {
		// If the string has no protocol defined, this is the sequence itself.
		// (See http://www.ietf.org/rfc/rfc2396.txt chap 3.1)
		if (!URL_PATTERN.matcher(st).matches()) {
			return createSequenceFromString(st);
		}

		StringBuffer   sb_sequence = new StringBuffer();
		BufferedReader in          = null;
		try {
			in = new BufferedReader(new InputStreamReader(new URL(st).openStream()));

			String seq_line;
			while ((seq_line = in.readLine()) != null) {
				sb_sequence.append(seq_line).append("\n");
			}
		} catch (MalformedURLException e) {
			throw new FtvUserException(FtvUtil.MSG_PROTOCOL_UNKNOWN);
		} catch (FileNotFoundException e)  {
			throw new FtvUserException(FtvUtil.MSG_FILE_NOT_FOUND);
		} finally {
			// Always release the connection, including when readLine() fails.
			if (in != null) {
				try {
					in.close();
				} catch (IOException ignored) {
					// Either the entry has been fully read already, or a more
					// relevant exception is propagating; a close failure must
					// not mask it.
				}
			}
		}
		return createSequenceFromString(sb_sequence.toString());
	}
}


More information about the Biojava-l mailing list