[Biojava-l] RE: Bug in HashedAlphabetIndex??

Schreiber, Mark mark.schreiber@agresearch.co.nz
Wed, 7 Mar 2001 17:16:38 +1300


Actually after loooking at the debugger I find that the Finite alphabet
produced by the statement

  //create a cross product of N dna alphabets
  FiniteAlphabet nOrderAlpha =
(FiniteAlphabet)AlphabetManager.getCrossProductAlphabet(
 
Collections.nCopies(order.intValue(),DNATools.getDNA())
                         );

is very different depending on the value returned by order.intValue() If it
is 3 then a shiney happy SimpleCrossProduct object is returned if it is
larger than 4 a SparseCrossProduct object is returned??

Is this a "feature"??

Mark

> -----Original Message-----
> From: Schreiber, Mark 
> Sent: Wednesday, March 07, 2001 4:21 PM
> To: 'biojava-l@biojava.org'
> Subject: Bug in HashedAlphabetIndex??
> 
> 
> Hi,
> 
> I think there may be a bug in HashedAlphabetIndex,
> 
> when I run the program below with an order of greater than 4 
> I get the following exception. If I use an argument of less 
> than or equal to 4 the program works fine. The problem seems 
> to be unrelated to the sequence file used.
> 
> java.lang.NullPointerException
> 	at 
> org.biojava.bio.symbol.HashedAlphabetIndex$HashComparator.comp
> are(HashedAlphabetIndex.java:74)
> 	at java.util.Arrays.mergeSort(Arrays.java:1181)
> 	at java.util.Arrays.mergeSort(Arrays.java:1188)
> 	at java.util.Arrays.mergeSort(Arrays.java:1188)
> 	at java.util.Arrays.mergeSort(Arrays.java:1188)
> 	at java.util.Arrays.mergeSort(Arrays.java:1188)
> 	at java.util.Arrays.mergeSort(Arrays.java:1188)
> 	at java.util.Arrays.mergeSort(Arrays.java:1188)
> 	at java.util.Arrays.mergeSort(Arrays.java:1188)
> 	at java.util.Arrays.mergeSort(Arrays.java:1188)
> 	at java.util.Arrays.sort(Arrays.java:1128)
> 	at 
> org.biojava.bio.symbol.HashedAlphabetIndex.<init>(HashedAlphab
> etIndex.java:61)
> 	at 
> org.biojava.bio.symbol.AlphabetManager.getAlphabetIndex(Alphab
> etManager.java:924)
> 	at 
> org.biojava.bio.dist.SimpleDistribution.<init>(SimpleDistribut
> ion.java:96)
> 	at 
> org.biojava.bio.dist.DistributionFactory$DefaultDistributionFa
> ctory.createDistribution(DistributionFactory.java:84)
> 	at WindowCount.WindowCount.main(WindowCount.java:39)
> 
> The problem seems to be spawned in this section of code:
> 
>       //create a cross product of N dna alphabets
>       FiniteAlphabet nOrderAlpha = 
> (FiniteAlphabet)AlphabetManager.getCrossProductAlphabet(
>                                  
> Collections.nCopies(order.intValue(),DNATools.getDNA())
>                                 );
> 
>       //create a distribution for the alphabet and a trainer.
>       Distribution d = 
> DistributionFactory.DEFAULT.createDistribution(nOrderAlpha);
> 
> Can anyone think of what may be going wrong?
> 
> Mark
> 
> 
> 
> ####PROGRAM STARTS HERE#####
> 
> package WindowCount;
> 
> import org.biojava.bio.*;
> import org.biojava.utils.*;
> import org.biojava.bio.dist.*;
> import org.biojava.bio.seq.db.*;
> import org.biojava.bio.seq.io.*;
> import org.biojava.bio.seq.*;
> import org.biojava.bio.seq.impl.*;
> import org.biojava.bio.symbol.*;
> 
> import java.util.*;
> import java.io.*;
> 
> /**
>  * Title:        WindowCount
>  * Description:  A program to find the distribution of nmers 
> in a fasta library
>  * Copyright:    Copyright (c) 2001
>  * Company:      AgResearch
>  * @author Mark Schreiber
>  * @version 1.0
>  */
> public class WindowCount {
> 
>   public static void main(String[] args) {
>     try{
>       File infile = new File(args[0]);
>       Integer order = new Integer(args[1]);
>       Double threshold = new Double(1.0 / 
> Math.pow(4.0,(double)order.intValue()));
>       FiniteAlphabet dna = DNATools.getDNA();
>       SequenceDB seqs = readSequenceDB(infile,dna);
> 
>       //create a cross product of N dna alphabets
>       FiniteAlphabet nOrderAlpha = 
> (FiniteAlphabet)AlphabetManager.getCrossProductAlphabet(
>                                  
> Collections.nCopies(order.intValue(),DNATools.getDNA())
>                                 );
> 
>       //create a distribution for the alphabet and a trainer.
>       Distribution d = 
> DistributionFactory.DEFAULT.createDistribution(nOrderAlpha);
>       DistributionTrainer dt = new SimpleDistributionTrainer(d);
>       DistributionTrainerContext context =
>                                new SimpleDistributionTrainerContext();
> 
>       //for each sequence
>       SequenceIterator iter = seqs.sequenceIterator();
>       while (iter.hasNext()) {
>         SymbolList s = iter.nextSequence();
>         SymbolList nseq = 
> SymbolListViews.orderNSymbolList(s,order.intValue());
> 
>         //add nmer counts to the distribution
>         Iterator nmers = nseq.iterator();
>         while(nmers.hasNext()){
>           Object nmer = nmers.next();
>           try{
>             dt.addCount(context,(AtomicSymbol)nmer,1.0);
>             //System.out.println("+");
>           }catch(ClassCastException cce){
>             //System.err.println(".");
>             continue;// ignore the redundant basis symbols
>           }
>         }
>       }
> 
>       //train the distribution.
>       dt.train(0.0); //No pseudo-counts
> 
>       //return the list of nmer symbols in the alphabet
>       SymbolList nOrderSymbols = nOrderAlpha.symbols();
> 
>       //Add each symbol and its counts to a collection so 
> they can be sorted
>       Iterator symbols = nOrderSymbols.iterator();
>       SortedMap tree = new TreeMap();
>       while(symbols.hasNext()){
>         AtomicSymbol s = (AtomicSymbol)symbols.next();
>         Double weight = new Double(d.getWeight(s)); // the key
>         tree.put(weight,s);
>       }
> 
>       //Print out the nmers above the threshold
>       SortedMap sig = tree.tailMap(threshold);
>       Set keys = sig.keySet();
>       System.out.println("threshold = " + threshold.doubleValue());
>       System.out.println("\nNMER\tWEIGHT");
>       Iterator keysI = keys.iterator();
>       while(keysI.hasNext()){
>         Double key = (Double)keysI.next();
>         AtomicSymbol value = (AtomicSymbol)sig.get(key);
>         output(key, value);
>       }
> 
>     }catch(IOException ioe){
>       ioe.printStackTrace(System.err);
>     }catch(Exception e){
>      e.printStackTrace(System.err);
>     }
>   }
> 
>     /**
>      * Create a sequence database from a fasta file.
>      */
>     public static SequenceDB readSequenceDB(File seqFile, 
> Alphabet alpha)
>             throws Exception {
>     HashSequenceDB seqDB = new HashSequenceDB(IDMaker.byName);
> 
>     SequenceBuilderFactory sbFact = new 
> FastaDescriptionLineParser.Factory(
>                                             
> SimpleSequenceBuilder.FACTORY);
>     FastaFormat fFormat = new FastaFormat();
>     for(
>       SequenceIterator seqI = new StreamReader(
>         new FileInputStream(seqFile),
>         fFormat,
>         alpha.getParser("token"),
>         sbFact
>       );
>       seqI.hasNext();
>     ) {
>       Sequence seq = seqI.nextSequence();
>       seqDB.addSequence(seq);
>     }
> 
>     return seqDB;
>   }
> 
>   public static void output(Double d, AtomicSymbol s){
>      //get the symbols that make up the atomic symbol
>      List syms = ((BasisSymbol)s).getSymbols();
>      //print the symbol
>      Iterator iter = syms.iterator();
>      while (iter.hasNext()) {
>        Symbol subSymbol = (Symbol)iter.next();
>        System.out.print(subSymbol.getToken());
>      }
>      //print the double value
>      System.out.println("\t" + d.doubleValue());
>   }
> 
>   public static void usage(){
>     System.out.println("\n\n\t***USAGE***\n\n");
>     System.out.println("java WindowCount <file> [size]");
>     System.out.println("\n\tfile\tFile in Fasta Format");
>     System.out.println("\tsize\tSize of nmers to count");
>     //bail out!
>     System.exit(0);
>   }
> }
>