[Biojava-l] fileToBiojava question

Bernd Jagla bernd.jagla at pasteur.fr
Tue Sep 21 12:47:21 UTC 2010


  Sorry for the wrong reply...
Here is the FULL code; I marked the passages that are important in red:

Thanks for looking at it!!!!

Bernd


package org.pasteur.pf2.biojava;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.util.Iterator;
import java.util.NoSuchElementException;

import org.biojava.bio.BioException;
import org.biojava.bio.seq.Sequence;
import org.biojava.bio.seq.SequenceIterator;
import org.biojava.bio.seq.io.SeqIOTools;
import org.biojava.bio.seq.io.SymbolTokenization;
import org.biojava.bio.symbol.Alphabet;
import org.biojava.bio.symbol.AlphabetManager;
import org.biojava.bio.symbol.SymbolList;
import org.biojavax.RichObjectFactory;
import org.biojavax.bio.seq.io.RichSequenceFormat;
import org.knime.core.data.DataCell;
import org.knime.core.data.DataColumnSpec;
import org.knime.core.data.DataColumnSpecCreator;
import org.knime.core.data.DataTableSpec;
import org.knime.core.data.RowKey;
import org.knime.core.data.container.BlobDataCell;
import org.knime.core.data.def.DefaultRow;
import org.knime.core.data.def.StringCell;
import org.knime.core.node.BufferedDataContainer;
import org.knime.core.node.BufferedDataTable;
import org.knime.core.node.CanceledExecutionException;
import org.knime.core.node.ExecutionContext;
import org.knime.core.node.ExecutionMonitor;
import org.knime.core.node.InvalidSettingsException;
import org.knime.core.node.NodeLogger;
import org.knime.core.node.NodeModel;
import org.knime.core.node.NodeSettingsRO;
import org.knime.core.node.NodeSettingsWO;
import org.knime.core.node.defaultnodesettings.SettingsModelString;
import org.biojavax.bio.seq.io.EMBLFormat;
import org.biojavax.bio.seq.io.FastaFormat;
import org.biojavax.bio.seq.io.GenbankFormat;
import org.biojavax.bio.seq.io.INSDseqFormat;
import org.biojavax.bio.seq.io.RichSequenceBuilderFactory;
import org.biojavax.bio.seq.io.RichSequenceFormat;
import org.biojavax.bio.seq.io.RichStreamReader;
import org.biojavax.bio.seq.io.UniProtFormat;
import org.pasteur.pf2.datatypes.*;
/**
  * This is the model implementation of FastAReader. Reads a FASTA file 
into two
  * columns: seq_name and sequence
  *
  * @author Bernd Jagla
  */
@SuppressWarnings("deprecation")
public class FastAReaderNodeModel extends NodeModel {
     // the logger instance
     private static final NodeLogger logger = NodeLogger
             .getLogger(FastQReaderNodeModel.class);
     private Alphabet alpha;
     private SequenceIterator iter;

     /**
      * the settings key which is used to retrieve and store the 
settings (from
      * the dialog or from a settings file) (package visibility to be 
usable from
      * the dialog).
      */
     private static final String FAR_name = "far_name";

     private static final String FAR_fileFormat = "far_ff";

     private static final String FAR_alphabet = "far_alph";

     private final SettingsModelString m_fpname = createFAR_fpname();
     private final SettingsModelString m_fformat = createFileFormat();
     private final SettingsModelString m_alphabet = createAlphabet();

     /**
      * Constructor for the node model.
      */
     protected FastAReaderNodeModel() {
         super(0, 1);
     }

     /**
      * {@inheritDoc}
      */
     @Override
     protected BufferedDataTable[] execute(final BufferedDataTable[] inData,
             final ExecutionContext exec) throws Exception {

         // TODO do something here
         logger.info("Node Model Stub... this is not yet implemented !");

         // the data table spec of the single output table,
         // the table will have three columns:
         DataColumnSpec[] allColSpecs = new DataColumnSpec[1];
         allColSpecs[0] = new DataColumnSpecCreator("sequence", 
SequenceDataCell.TYPE)
                 .createSpec();
         DataTableSpec outputSpec = new DataTableSpec(allColSpecs);
         // the execution context will provide us with storage capacity, 
in this
         // case a data container to which we will add rows sequentially
         // Note, this container can also handle arbitrary big data 
tables, it
         // will buffer to disc if necessary.
         BufferedDataContainer container = 
exec.createDataContainer(outputSpec);
         // let's add m_count rows to it
         // once we are done, we close the container and return its table
         FileReader fp = new FileReader(m_fpname.getStringValue());

         exec.checkCanceled();
         //String form = m_fformat.getStringValue();
         //String alphabet = m_alphabet.getStringValue();
         String form = "genbank";
         String alphabet = "DNA";

         BufferedReader br = new BufferedReader(fp);
         // String line = br.readLine();
         int count = 0;
         SequenceIterator iter = (SequenceIterator) 
SeqIOTools.fileToBiojava(
                 form, alphabet, br);

         while (iter.hasNext()) {
             exec.checkCanceled();
             RowKey key = new RowKey("Row " + count);
             exec.setProgress("Row " + count);
             // System.out.println(fastq.getSequence());
             Sequence seq = iter.nextSequence();
             String seqName = seq.getName();
             // String seqName = "asdf";
             //String sequence = seq.seqString();
             System.err.println("reading: " + seqName + " " + seq.length());
             SequenceDataCell seqCell = new SequenceDataCell(seqName, seq);
             container.addRowToTable(new DefaultRow(key, seqCell));
             count++;
         }
         System.err.println("finished reading file");
         br.close();
         fp.close();
         container.close();
         return new BufferedDataTable[] { container.getTable() };
     }

     /**
      * Makes a <code>SequenceIterator</code> look like an
      * <code>Iterator {@code <Sequence>}</code>
      *
      * @param iter
      *            The <CODE>SequenceIterator</CODE>
      * @return An <CODE>Iterator</CODE> that returns only 
<CODE>Sequence</CODE>
      *         objects. <B>You cannot call <code>remove()</code> on this
      *         iterator!</B>
      */
     public Iterator<Sequence> asIterator(SequenceIterator iter) {
         final SequenceIterator it = iter;
         return new Iterator<Sequence>() {
             public boolean hasNext() {
                 return it.hasNext();
             }

             public Sequence next() {
                 try {
                     return it.nextSequence();
                 } catch (BioException e) {
                     NoSuchElementException ex = new 
NoSuchElementException();
                     ex.initCause(e);
                     throw ex;
                 }
             }

             public void remove() {
                 throw new UnsupportedOperationException();
             }
         };
     }

     public static RichSequenceFormat formatForName(String name)
             throws ClassNotFoundException, InstantiationException,
             IllegalAccessException {
         // determine the format to use
         RichSequenceFormat format;
         if (name.equalsIgnoreCase("fasta")) {
             format = (RichSequenceFormat) new FastaFormat();
         } else if (name.equalsIgnoreCase("genbank")) {
             format = (RichSequenceFormat) new GenbankFormat();
         } else if (name.equalsIgnoreCase("uniprot")) {
             format = new UniProtFormat();
         } else if (name.equalsIgnoreCase("embl")) {
             format = new EMBLFormat();
         } else if (name.equalsIgnoreCase("INSDseq")) {
             format = new INSDseqFormat();
         } else {
             Class formatClass = Class.forName(name);
             format = (RichSequenceFormat) formatClass.newInstance();
         }
         return format;
     }

     /**
      * {@inheritDoc}
      */
     @Override
     protected void reset() {
     }

     /**
      * {@inheritDoc}
      */
     @Override
     protected DataTableSpec[] configure(final DataTableSpec[] inSpecs)
             throws InvalidSettingsException {
         DataColumnSpec[] allColSpecs = new DataColumnSpec[1];
         allColSpecs[0] = new DataColumnSpecCreator("sequence", 
SequenceDataCell.TYPE)
                 .createSpec();
         DataTableSpec outputSpec = new DataTableSpec(allColSpecs);

         return new DataTableSpec[] { outputSpec };

     }

     /**
      * {@inheritDoc}
      */
     @Override
     protected void saveSettingsTo(final NodeSettingsWO settings) {
         m_alphabet.saveSettingsTo(settings);
         m_fformat.saveSettingsTo(settings);
         m_fpname.saveSettingsTo(settings);
     }

     /**
      * {@inheritDoc}
      */
     @Override
     protected void loadValidatedSettingsFrom(final NodeSettingsRO settings)
             throws InvalidSettingsException {
         m_alphabet.loadSettingsFrom(settings);
         m_fformat.loadSettingsFrom(settings);
         m_fpname.loadSettingsFrom(settings);
     }

     /**
      * {@inheritDoc}
      */
     @Override
     protected void validateSettings(final NodeSettingsRO settings)
             throws InvalidSettingsException {
         m_alphabet.validateSettings(settings);
         m_fformat.validateSettings(settings);
         m_fpname.validateSettings(settings);
     }

     /**
      * {@inheritDoc}
      */
     @Override
     protected void loadInternals(final File internDir,
             final ExecutionMonitor exec) throws IOException,
             CanceledExecutionException {
     }

     /**
      * {@inheritDoc}
      */
     @Override
     protected void saveInternals(final File internDir,
             final ExecutionMonitor exec) throws IOException,
             CanceledExecutionException {
     }

     public static SettingsModelString createFAR_fpname() {
         return new SettingsModelString(FAR_name, "");
     }

     public static SettingsModelString createFileFormat() {
         return new SettingsModelString(FAR_fileFormat, "FASTA");
     }

     public static SettingsModelString createAlphabet() {
         return new SettingsModelString(FAR_alphabet, "RNA");
     }

}


On 9/21/2010 2:40 PM, simon rayner wrote:
> hi,
>
> can you repost to the biojava group along with the full code (just in 
> case there is a missing import or something)?  you only replied to me, 
> and not to the biojava mailing list
>
> thanks
>
> simon
>
> On Tue, Sep 21, 2010 at 8:18 PM, Bernd Jagla <bernd.jagla at pasteur.fr 
> <mailto:bernd.jagla at pasteur.fr>> wrote:
>
>     Thanks for the quick reply!
>
>     Here is some code that should have all the important parts:
>
>     String form = "genbank";
>     String alphabet = "dna";
>     BufferedReader br = new BufferedReader(fp);
>     SequenceIterator iter = (SequenceIterator) SeqIOTools.fileToBiojava(
>                     form, alphabet, br);
>             while (iter.hasNext()) {
>                 Sequence seq = iter.nextSequence();
>     => Exception thrown
>                 String seqName = seq.getName();
>               }
>
>
>     When trying to simplify the code a bit I now get the following error:
>     Execute failed: Could not initialize class
>     org.biojava.bio.seq.FeatureFilter
>
>     I assume that in the previous times I had a spelling error??
>     Then the exception got thrown during the initialization of "iter"
>
>     Thanks,
>
>     Bernd
>
>
>     On 9/21/2010 2:07 PM, simon rayner wrote:
>>     hi,
>>
>>     can you post the code you are trying to run along with the full
>>     error? it will help to figure out what is happening.  There are
>>     now loaders for biojavax as well, which work well and are
>>     described in the biojavax docs here
>>     http://biojava.org/wiki/BioJava:BioJavaXDocs#Example
>>
>>     but yeah, it's confusing unless you happen to be a real java
>>     guru.  i keep having to refer back to the docs because i keep
>>     forgetting which class does what
>>
>>     On Tue, Sep 21, 2010 at 7:46 PM, Bernd Jagla
>>     <bernd.jagla at pasteur.fr <mailto:bernd.jagla at pasteur.fr>> wrote:
>>
>>          Hello,
>>
>>         I am getting a little frustrated with the wiki page (I guess
>>         I don't spend enough time reading and testing). I have the
>>         impression that some of the documentation relates to version
>>         3 whereas others relate to 1.5 or 1.7.
>>         So sorry if this all sounds a bit confused... ;(
>>
>>         I believe I am using 1.7.1. (I wasn't able to find a readme
>>         file that contains that information) even though I would
>>         probably like to use version 3. But as I am stuck with an
>>         older Eclipse version I think it will be even worse when I
>>         try that.
>>
>>         Anyways, I am trying to read in sequence files using
>>         SeqIOTools.fileToBiojava, which seems to be deprecated, with
>>         the following parameters: "genbank", "dna", bufferedReader.
>>
>>         somehow this works with "fasta" but with genbank I get the
>>         following exception:
>>         Execute failed: Unknown file type '524300'
>>         in some cases I get:
>>         Unknown file type '262156'
>>
>>         Does this mean anything to you?
>>
>>         Or how do you read in a sequence file? I am looking for a
>>         generic way that covers many file types (genbank, fasta,
>>         swissprot...)
>>
>>         Once I have this I will probably be able to get to the
>>         feature information using the information from the tutorial.
>>
>>         Thanks for your time.
>>
>>         Bernd
>>
>>
>>
>>         _______________________________________________
>>         Biojava-l mailing list  - Biojava-l at lists.open-bio.org
>>         <mailto:Biojava-l at lists.open-bio.org>
>>         http://lists.open-bio.org/mailman/listinfo/biojava-l
>>
>>
>>
>>
>>     -- 
>>     Simon Rayner
>>
>>     State Key Laboratory of Virology
>>     Wuhan Institute of Virology
>>     Chinese Academy of Sciences
>>     Wuhan, Hubei 430071
>>     P.R.China
>>
>>     +86 (27) 87199895 (office)
>>     +86 18627113001 (cell)
>>
>
>
>
> -- 
> Simon Rayner
>
> State Key Laboratory of Virology
> Wuhan Institute of Virology
> Chinese Academy of Sciences
> Wuhan, Hubei 430071
> P.R.China
>
> +86 (27) 87199895 (office)
> +86 18627113001 (cell)
>



More information about the Biojava-l mailing list