[Dynamite] ...getting started...

Ewan Birney birney@ebi.ac.uk
Fri, 3 Mar 2000 17:18:46 +0000 (GMT)


This is really just to get us started on this. We have a lot of
decisions to make, but the only way we are going to get there
is by sending IDL to each other.... ;)







// A weighted sum over the real symbols of an alphabet (see Alphabet below).
typedef sequence<float> WeightVector;

// Maps between the three representations of a symbol used in this draft:
// a character, an integer index, and a WeightVector.
interface Alphabet
{
  // every symbol in the alphabet corresponds to a non-negative integer 0,1,2...(size-1)
  // negative integers represent ambiguous symbols, e.g. "N" "Y" "R" for DNA, "X" for proteins, and wildcards "*"
  //  -1 is always 'N' and -2 is always '*'
  // these ambiguities can be represented by a weighted sum over the real symbols in the alphabet
  //  e.g. "Y" = 0.5 * "C" + 0.5 * "T"
  //       "*" = "A" + "C" + "G" + "T"
  //  (NB this is not a probability distribution, as it doesn't sum to one, although in general ambiguous characters are probabilistic)
  // such weighted sums are represented by WeightVectors
  //

  // IDL attributes are declared without parentheses.
  attribute string name;              // name of alphabet e.g. "DNA"
  attribute string alphabet_string;   // e.g. "acgt"

  // want attributes for adding symbols like "n" (and upper-case symbols like "A" that map down to "a")
  // but associative containers are the best way of doing this and i need to check glib, etc
  // probably also want a constructor but i don't know how to do this yet

  // IDL has no "int" or "bool"; "long" and "boolean" are the corresponding types.
  long size();
  boolean equal_to (in Alphabet a);

  boolean contains (in char c);

  // char -> other representations
  long char2int (in char c);
  WeightVector char2vec (in char c);

  // int -> other representations
  char int2char (in long i);
  WeightVector int2vec (in long i);

  // WeightVector -> other representations
  long vec2int (in WeightVector w);    // makes a best guess
  char vec2char (in WeightVector w);   // makes a best guess

  boolean has_complement();            // basically TRUE if this is DNA or RNA
  long complement_int (in long i);
  char complement_char (in char c);    // this should preserve the case if possible, so that "A" maps to "T" and not "t"
  WeightVector complement_vec (in WeightVector vec);

  char unknown_char();    // = int2char(-1); returns 'n' for DNA, 'x' for protein
  char wildcard_char();   // = int2char(-2); returns '*'

  // probably add enum unknown_int = -1, wildcard_int = -2
  // maybe have unknown_vec() and wildcard_vec() for completeness

};



// A sequence together with its identifiers and its alphabet.
// NOTE(review): strict OMG IDL compilers treat identifiers that differ only in
// case as colliding, so "seq" inside "Seq" and "alphabet" next to "Alphabet"
// may be rejected - confirm against the target ORB before settling the names.
interface Seq { // should this be read-only, at least for the public interface?
  // IDL attributes are declared without parentheses.
  attribute string seq;
  attribute string display_id; // human-readable
  attribute string accession_number; // could be ""
  attribute string primary_id; // id guaranteed by the implementation to represent this
  
  // other methods?

  Alphabet alphabet(); // in the sequence?
};



// separate modules for the different models or not?
// what about code sharing between them?

module SingleModel { // Single means emits only one sequence

  // Forward declarations: State and Transition refer to each other.
  interface State;
  interface Transition;

  typedef sequence<float> ProbabilityEmission;

  // A transition between two states. Interface members must be attributes or
  // operations in IDL, so the fields are declared as attributes.
  interface Transition {
    attribute State from;
    attribute State to;
    attribute float transition_probability;
    attribute ProbabilityEmission emission; // emission on the transitions.
  };

  typedef sequence<Transition> TransitionList;

  interface State {
    TransitionList all_Transitions();
  };

  typedef sequence<State> StateList;

  interface Model {
    StateList all_States();
  };


  //
  // Have not done alignment yet
  //

  // Forward declaration only, so that AlignmentFactory can name the type;
  // the interface itself still has to be defined.
  interface Alignment;

  interface AlignmentFactory {
    // Type first, then name (the draft had these reversed).
    // NOTE(review): strict IDL compilers treat "model"/"Model" and "seq"/"Seq"
    // as case-insensitive collisions - confirm against the target ORB.
    attribute Model model;
    // also here a function pointer for compile-time function for this model
    Alignment make_alignment(in Seq seq);

    // can throw exceptions/errors of bad alphabet, other things...
  };


};




-----------------------------------------------------------------
Ewan Birney. Mobile: +44 (0)7970 151230
<birney@ebi.ac.uk>
-----------------------------------------------------------------