[Biojava-l] RE: Bug in HashedAlphabetIndex??
Schreiber, Mark
mark.schreiber@agresearch.co.nz
Wed, 7 Mar 2001 17:16:38 +1300
Actually, after looking at it in the debugger, I find that the FiniteAlphabet
produced by the statement
//create a cross product of N dna alphabets
FiniteAlphabet nOrderAlpha =
(FiniteAlphabet)AlphabetManager.getCrossProductAlphabet(
Collections.nCopies(order.intValue(),DNATools.getDNA())
);
is very different depending on the value returned by order.intValue(). If it
is 3, then a shiny happy SimpleCrossProduct object is returned; if it is
larger than 4, a SparseCrossProduct object is returned??
Is this a "feature"??
Mark
> -----Original Message-----
> From: Schreiber, Mark
> Sent: Wednesday, March 07, 2001 4:21 PM
> To: 'biojava-l@biojava.org'
> Subject: Bug in HashedAlphabetIndex??
>
>
> Hi,
>
> I think there may be a bug in HashedAlphabetIndex.
>
> When I run the program below with an order greater than 4,
> I get the following exception. If I use an argument less
> than or equal to 4, the program works fine. The problem seems
> to be unrelated to the sequence file used.
>
> java.lang.NullPointerException
> at
> org.biojava.bio.symbol.HashedAlphabetIndex$HashComparator.comp
> are(HashedAlphabetIndex.java:74)
> at java.util.Arrays.mergeSort(Arrays.java:1181)
> at java.util.Arrays.mergeSort(Arrays.java:1188)
> at java.util.Arrays.mergeSort(Arrays.java:1188)
> at java.util.Arrays.mergeSort(Arrays.java:1188)
> at java.util.Arrays.mergeSort(Arrays.java:1188)
> at java.util.Arrays.mergeSort(Arrays.java:1188)
> at java.util.Arrays.mergeSort(Arrays.java:1188)
> at java.util.Arrays.mergeSort(Arrays.java:1188)
> at java.util.Arrays.mergeSort(Arrays.java:1188)
> at java.util.Arrays.sort(Arrays.java:1128)
> at
> org.biojava.bio.symbol.HashedAlphabetIndex.<init>(HashedAlphab
> etIndex.java:61)
> at
> org.biojava.bio.symbol.AlphabetManager.getAlphabetIndex(Alphab
> etManager.java:924)
> at
> org.biojava.bio.dist.SimpleDistribution.<init>(SimpleDistribut
> ion.java:96)
> at
> org.biojava.bio.dist.DistributionFactory$DefaultDistributionFa
> ctory.createDistribution(DistributionFactory.java:84)
> at WindowCount.WindowCount.main(WindowCount.java:39)
>
> The problem seems to be spawned in this section of code:
>
> //create a cross product of N dna alphabets
> FiniteAlphabet nOrderAlpha =
> (FiniteAlphabet)AlphabetManager.getCrossProductAlphabet(
>
> Collections.nCopies(order.intValue(),DNATools.getDNA())
> );
>
> //create a distribution for the alphabet and a trainer.
> Distribution d =
> DistributionFactory.DEFAULT.createDistribution(nOrderAlpha);
>
> Can anyone think of what may be going wrong?
>
> Mark
>
>
>
> ####PROGRAM STARTS HERE#####
>
> package WindowCount;
>
> import org.biojava.bio.*;
> import org.biojava.utils.*;
> import org.biojava.bio.dist.*;
> import org.biojava.bio.seq.db.*;
> import org.biojava.bio.seq.io.*;
> import org.biojava.bio.seq.*;
> import org.biojava.bio.seq.impl.*;
> import org.biojava.bio.symbol.*;
>
> import java.util.*;
> import java.io.*;
>
> /**
> * Title: WindowCount
> * Description: A program to find the distribution of nmers
> in a fasta library
> * Copyright: Copyright (c) 2001
> * Company: AgResearch
> * @author Mark Schreiber
> * @version 1.0
> */
> public class WindowCount {
>
> public static void main(String[] args) {
> try{
> File infile = new File(args[0]);
> Integer order = new Integer(args[1]);
> Double threshold = new Double(1.0 /
> Math.pow(4.0,(double)order.intValue()));
> FiniteAlphabet dna = DNATools.getDNA();
> SequenceDB seqs = readSequenceDB(infile,dna);
>
> //create a cross product of N dna alphabets
> FiniteAlphabet nOrderAlpha =
> (FiniteAlphabet)AlphabetManager.getCrossProductAlphabet(
>
> Collections.nCopies(order.intValue(),DNATools.getDNA())
> );
>
> //create a distribution for the alphabet and a trainer.
> Distribution d =
> DistributionFactory.DEFAULT.createDistribution(nOrderAlpha);
> DistributionTrainer dt = new SimpleDistributionTrainer(d);
> DistributionTrainerContext context =
> new SimpleDistributionTrainerContext();
>
> //for each sequence
> SequenceIterator iter = seqs.sequenceIterator();
> while (iter.hasNext()) {
> SymbolList s = iter.nextSequence();
> SymbolList nseq =
> SymbolListViews.orderNSymbolList(s,order.intValue());
>
> //add nmer counts to the distribution
> Iterator nmers = nseq.iterator();
> while(nmers.hasNext()){
> Object nmer = nmers.next();
> try{
> dt.addCount(context,(AtomicSymbol)nmer,1.0);
> //System.out.println("+");
> }catch(ClassCastException cce){
> //System.err.println(".");
> continue;// ignore the redundant basis symbols
> }
> }
> }
>
> //train the distribution.
> dt.train(0.0); //No pseudo-counts
>
> //return the list of nmer symbols in the alphabet
> SymbolList nOrderSymbols = nOrderAlpha.symbols();
>
> //Add each symbol and its counts to a collection so
> they can be sorted
> Iterator symbols = nOrderSymbols.iterator();
> SortedMap tree = new TreeMap();
> while(symbols.hasNext()){
> AtomicSymbol s = (AtomicSymbol)symbols.next();
> Double weight = new Double(d.getWeight(s)); // the key
> tree.put(weight,s);
> }
>
> //Print out the nmers above the threshold
> SortedMap sig = tree.tailMap(threshold);
> Set keys = sig.keySet();
> System.out.println("threshold = " + threshold.doubleValue());
> System.out.println("\nNMER\tWEIGHT");
> Iterator keysI = keys.iterator();
> while(keysI.hasNext()){
> Double key = (Double)keysI.next();
> AtomicSymbol value = (AtomicSymbol)sig.get(key);
> output(key, value);
> }
>
> }catch(IOException ioe){
> ioe.printStackTrace(System.err);
> }catch(Exception e){
> e.printStackTrace(System.err);
> }
> }
>
> /**
> * Create a sequence database from a fasta file.
> */
> public static SequenceDB readSequenceDB(File seqFile,
> Alphabet alpha)
> throws Exception {
> HashSequenceDB seqDB = new HashSequenceDB(IDMaker.byName);
>
> SequenceBuilderFactory sbFact = new
> FastaDescriptionLineParser.Factory(
>
> SimpleSequenceBuilder.FACTORY);
> FastaFormat fFormat = new FastaFormat();
> for(
> SequenceIterator seqI = new StreamReader(
> new FileInputStream(seqFile),
> fFormat,
> alpha.getParser("token"),
> sbFact
> );
> seqI.hasNext();
> ) {
> Sequence seq = seqI.nextSequence();
> seqDB.addSequence(seq);
> }
>
> return seqDB;
> }
>
> public static void output(Double d, AtomicSymbol s){
> //get the symbols that make up the atomic symbol
> List syms = ((BasisSymbol)s).getSymbols();
> //print the symbol
> Iterator iter = syms.iterator();
> while (iter.hasNext()) {
> Symbol subSymbol = (Symbol)iter.next();
> System.out.print(subSymbol.getToken());
> }
> //print the double value
> System.out.println("\t" + d.doubleValue());
> }
>
> public static void usage(){
> System.out.println("\n\n\t***USAGE***\n\n");
> System.out.println("java WindowCount <file> [size]");
> System.out.println("\n\tfile\tFile in Fasta Format");
> System.out.println("\tsize\tSize of nmers to count");
> //bail out!
> System.exit(0);
> }
> }
>