[Biojava-l]

Mon May 5 13:31:06 EDT 2003

Hi,

This message refers to the WeightMatrixDemo example at "BioJava In Anger" page again (http://bioconf.otago.ac.nz/biojava/weightMatrix.htm).  I am trying to understand the null model behind this.  Experts out there, please help!

I slightly modified the code as below (DNA version):

import java.util.*;
import org.biojava.bio.dist.*;
import org.biojava.bio.dp.*;
import org.biojava.bio.seq.*;
import org.biojava.bio.symbol.*;

public class WeightMatrixDemo {
  public static void main(String[] args) throws Exception{
    //make an Alignment of a motif.
    Map map = new HashMap();
    map.put("seq0", DNATools.createDNA("aggag"));
    map.put("seq1", DNATools.createDNA("aggaa"));
    map.put("seq2", DNATools.createDNA("aggag"));
    map.put("seq3", DNATools.createDNA("aagag"));
    Alignment align = new SimpleAlignment(map);

    //make a Distribution[] of the motif
    Distribution[] dists =
        DistributionTools.distOverAlignment(align, false, 0.01);

    // Zhen added this block of code.
    for(int i = 0; i < dists.length; i++) {
        Distribution nullmodel = dists[i].getNullModel();
        FiniteAlphabet dna = DNATools.getDNA();
        Iterator dnaSymbols = dna.iterator();
        while(dnaSymbols.hasNext()) {
            Symbol s = (Symbol)dnaSymbols.next();
            System.out.println(s.getName() + "\t" + nullmodel.getWeight(s));
        }
        System.out.println();
    }

    //make a Weight Matrix
    WeightMatrix matrix = new SimpleWeightMatrix(dists);

    //the sequence to score against
    Sequence seq = DNATools.createDNASequence("aaagcctaggaagaggagctgat","seq");

    //annotate the sequence with the weight matrix using a low threshold (0.1)
    WeightMatrixAnnotator wma = new WeightMatrixAnnotator(matrix, 0.1);
    seq = wma.annotate(seq);

    //output match information
    for (Iterator it = seq.features(); it.hasNext(); ) {
      Feature f = (Feature)it.next();
      Location loc = f.getLocation();
      System.out.println("Match at " + loc.getMin()+"-"+loc.getMax());
      System.out.println("\tscore : "+f.getAnnotation().getProperty("score"));
    }
  }
}

The output is:
adenine 0.25
guanine 0.25
thymine 0.25
cytosine        0.25

adenine 0.25
guanine 0.25
thymine 0.25
cytosine        0.25

adenine 0.25
guanine 0.25
thymine 0.25
cytosine        0.25

adenine 0.25
guanine 0.25
thymine 0.25
cytosine        0.25

adenine 0.25
guanine 0.25
thymine 0.25
cytosine        0.25

Match at 8-12
        score : 0.18613993419374553
Match at 11-15
        score : 0.18613993419374553
Match at 14-18
        score : 0.5574914238570782

I understand 0.25 is 1/4 for 4 nucleotides.  However, let's look at the protein version:

import java.util.*;
import org.biojava.bio.dist.*;
import org.biojava.bio.dp.*;
import org.biojava.bio.seq.*;
import org.biojava.bio.symbol.*;

public class WeightMatrixDemo {
  public static void main(String[] args) throws Exception{
    //make an Alignment of a motif.
    Map map = new HashMap();
    map.put("seq0", ProteinTools.createProtein("aggag"));
    map.put("seq1", ProteinTools.createProtein("aggaa"));
    map.put("seq2", ProteinTools.createProtein("aggag"));
    map.put("seq3", ProteinTools.createProtein("aagag"));
    Alignment align = new SimpleAlignment(map);

    //make a Distribution[] of the motif
    Distribution[] dists =
        DistributionTools.distOverAlignment(align, false, 0.01);

    // Zhen added this block of code.
    for(int i = 0; i < dists.length; i++) {
        Distribution nullmodel = dists[i].getNullModel();
        FiniteAlphabet pro = ProteinTools.getAlphabet();
        Iterator proSymbols = pro.iterator();
        while(proSymbols.hasNext()) {
            Symbol s = (Symbol)proSymbols.next();
            System.out.println(s.getName() + "\t" + nullmodel.getWeight(s));
        }
        System.out.println();
    }

    //make a Weight Matrix
    WeightMatrix matrix = new SimpleWeightMatrix(dists);

    //the sequence to score against
    Sequence seq = ProteinTools.createProteinSequence("aaagcctaggaagaggagctgat","seq");

    //annotate the sequence with the weight matrix using a low threshold (0.1)
    WeightMatrixAnnotator wma = new WeightMatrixAnnotator(matrix, 0.1);
    seq = wma.annotate(seq);

    //output match information
    for (Iterator it = seq.features(); it.hasNext(); ) {
      Feature f = (Feature)it.next();
      Location loc = f.getLocation();
      System.out.println("Match at " + loc.getMin()+"-"+loc.getMax());
      System.out.println("\tscore : "+f.getAnnotation().getProperty("score"));
    }
  }
}

The output is:
SER     0.045454545454545456
ILE     0.045454545454545456
TYR     0.045454545454545456
ARG     0.045454545454545456
PHE     0.045454545454545456
ASN     0.045454545454545456
GLY     0.045454545454545456
VAL     0.045454545454545456
CYS     0.045454545454545456
LYS     0.045454545454545456
THR     0.045454545454545456
ALA     0.045454545454545456
LEU     0.045454545454545456
MET     0.045454545454545456
PRO     0.045454545454545456
TRP     0.045454545454545456
GLU     0.045454545454545456
GLN     0.045454545454545456
SEC     0.045454545454545456
HIS     0.045454545454545456
ASP     0.045454545454545456

SER     0.045454545454545456
ILE     0.045454545454545456
TYR     0.045454545454545456
ARG     0.045454545454545456
PHE     0.045454545454545456
ASN     0.045454545454545456
GLY     0.045454545454545456
VAL     0.045454545454545456
CYS     0.045454545454545456
LYS     0.045454545454545456
THR     0.045454545454545456
ALA     0.045454545454545456
LEU     0.045454545454545456
MET     0.045454545454545456
PRO     0.045454545454545456
TRP     0.045454545454545456
GLU     0.045454545454545456
GLN     0.045454545454545456
SEC     0.045454545454545456
HIS     0.045454545454545456
ASP     0.045454545454545456

SER     0.045454545454545456
ILE     0.045454545454545456
TYR     0.045454545454545456
ARG     0.045454545454545456
PHE     0.045454545454545456
ASN     0.045454545454545456
GLY     0.045454545454545456
VAL     0.045454545454545456
CYS     0.045454545454545456
LYS     0.045454545454545456
THR     0.045454545454545456
ALA     0.045454545454545456
LEU     0.045454545454545456
MET     0.045454545454545456
PRO     0.045454545454545456
TRP     0.045454545454545456
GLU     0.045454545454545456
GLN     0.045454545454545456
SEC     0.045454545454545456
HIS     0.045454545454545456
ASP     0.045454545454545456

SER     0.045454545454545456
ILE     0.045454545454545456
TYR     0.045454545454545456
ARG     0.045454545454545456
PHE     0.045454545454545456
ASN     0.045454545454545456
GLY     0.045454545454545456
VAL     0.045454545454545456
CYS     0.045454545454545456
LYS     0.045454545454545456
THR     0.045454545454545456
ALA     0.045454545454545456
LEU     0.045454545454545456
MET     0.045454545454545456
PRO     0.045454545454545456
TRP     0.045454545454545456
GLU     0.045454545454545456
GLN     0.045454545454545456
SEC     0.045454545454545456
HIS     0.045454545454545456
ASP     0.045454545454545456

SER     0.045454545454545456
ILE     0.045454545454545456
TYR     0.045454545454545456
ARG     0.045454545454545456
PHE     0.045454545454545456
ASN     0.045454545454545456
GLY     0.045454545454545456
VAL     0.045454545454545456
CYS     0.045454545454545456
LYS     0.045454545454545456
THR     0.045454545454545456
ALA     0.045454545454545456
LEU     0.045454545454545456
MET     0.045454545454545456
PRO     0.045454545454545456
TRP     0.045454545454545456
GLU     0.045454545454545456
GLN     0.045454545454545456
SEC     0.045454545454545456
HIS     0.045454545454545456
ASP     0.045454545454545456

Match at 8-12
        score : 0.1853491381982068
Match at 11-15
        score : 0.18534913819820675
Match at 14-18
        score : 0.5558789919338315

Now, 0.045454545454545456 is 1/22.  Can anyone tell me why 22 here?

I appreciate your help.

Zhen