[Biojava-l] BioSQL

Len Trigg len at reeltwo.com
Mon Nov 1 15:13:31 EST 2004


Mark Schreiber wrote:
> Does anyone have a current BioSQL schema that matches the BioJava 
> bindings? Preferably one for Oracle.

AFAIK, current BioSQL CVS matches what BioJava expects -- Hilmar
recently added the extra table that BioJava uses.  I've also attached
the Oracle schema that I've used (it's a bit simpler than the full
BioSQL Oracle schema).

Cheers,
Len.

-------------- next part --------------
-- conventions:
-- <table_name>_id is primary internal id (usually autogenerated)

-- Authors: Ewan Birney, Elia Stupka
-- Contributors: Hilmar Lapp, Aaron Mackey
--
-- Copyright Ewan Birney. You may use, modify, and distribute this code under
-- the same terms as Perl. See the Perl Artistic License.
--
-- comments to biosql - biosql-l at open-bio.org

--
-- Migration of the MySQL schema to InnoDB by Hilmar Lapp <hlapp at gmx.net>
-- Post-Cape Town changes by Hilmar Lapp.
-- Singapore changes by Hilmar Lapp and Aaron Mackey.
--


CREATE SEQUENCE biodatabase_pk_seq
	INCREMENT BY 1 START WITH 1 NOMAXVALUE NOMINVALUE NOCYCLE NOORDER;
CREATE SEQUENCE taxon_pk_seq
	INCREMENT BY 1 START WITH 1 NOMAXVALUE NOMINVALUE NOCYCLE NOORDER;
CREATE SEQUENCE ontology_pk_seq
	INCREMENT BY 1 START WITH 1 NOMAXVALUE NOMINVALUE NOCYCLE NOORDER;
CREATE SEQUENCE term_pk_seq
	INCREMENT BY 1 START WITH 1 NOMAXVALUE NOMINVALUE NOCYCLE NOORDER;
CREATE SEQUENCE term_relationship_pk_seq
	INCREMENT BY 1 START WITH 1 NOMAXVALUE NOMINVALUE NOCYCLE NOORDER;
CREATE SEQUENCE term_path_pk_seq
	INCREMENT BY 1 START WITH 1 NOMAXVALUE NOMINVALUE NOCYCLE NOORDER;
CREATE SEQUENCE bioentry_pk_seq
	INCREMENT BY 1 START WITH 1 NOMAXVALUE NOMINVALUE NOCYCLE NOORDER;
CREATE SEQUENCE bioentry_relationship_pk_seq
	INCREMENT BY 1 START WITH 1 NOMAXVALUE NOMINVALUE NOCYCLE NOORDER;
CREATE SEQUENCE dbxref_pk_seq
	INCREMENT BY 1 START WITH 1 NOMAXVALUE NOMINVALUE NOCYCLE NOORDER;
CREATE SEQUENCE reference_pk_seq
 	INCREMENT BY 1 START WITH 1 NOMAXVALUE NOMINVALUE NOCYCLE NOORDER;
CREATE SEQUENCE anncomment_pk_seq
 	INCREMENT BY 1 START WITH 1 NOMAXVALUE NOMINVALUE NOCYCLE NOORDER;
CREATE SEQUENCE seqfeature_pk_seq
 	INCREMENT BY 1 START WITH 1 NOMAXVALUE NOMINVALUE NOCYCLE NOORDER;
CREATE SEQUENCE seqfeature_relationship_pk_seq
 	INCREMENT BY 1 START WITH 1 NOMAXVALUE NOMINVALUE NOCYCLE NOORDER;
CREATE SEQUENCE location_pk_seq
 	INCREMENT BY 1 START WITH 1 NOMAXVALUE NOMINVALUE NOCYCLE NOORDER;


-- database have bioentries. That is about it.
-- we do not store different versions of a database as different dbids
-- (there is no concept of versions of database). There is a concept of
-- versions of entries. Versions of databases deserve their own table and
-- join to bioentry table for tracking with versions of entries 

CREATE TABLE biodatabase (
  	biodatabase_id 	int  NOT NULL ,
  	name           	VARCHAR(128) NOT NULL,
	authority	VARCHAR(128),
	description	VARCHAR2(250),
	PRIMARY KEY (biodatabase_id),
  	UNIQUE (name)
) TABLESPACE "BIOSQL_DATA";

CREATE INDEX db_auth on biodatabase(authority) TABLESPACE "BIOSQL_INDEX";

-- we could insist that taxa are NCBI taxon id, but on reflection I made this
-- an optional extra line, as many flat file formats do not have the NCBI id
--
-- no organelle/sub species

-- corresponds to the node table of the NCBI taxonomy databaase
CREATE TABLE taxon (
       taxon_id		int  NOT NULL ,
       ncbi_taxon_id 	int,
       parent_taxon_id	int ,
       node_rank	VARCHAR(32),
       genetic_code	INT ,
       mito_genetic_code INT ,
       left_value	int ,
       right_value	int ,
       PRIMARY KEY (taxon_id),
       UNIQUE (ncbi_taxon_id),
       UNIQUE (left_value),
       UNIQUE (right_value)
) TABLESPACE "BIOSQL_DATA";

CREATE INDEX taxparent ON taxon(parent_taxon_id) TABLESPACE "BIOSQL_INDEX";

-- corresponds to the names table of the NCBI taxonomy databaase
CREATE TABLE taxon_name (
       taxon_id		int  NOT NULL,
       name		VARCHAR(255) NOT NULL,
       name_class	VARCHAR(32) NOT NULL,
       UNIQUE (taxon_id,name,name_class)
) TABLESPACE "BIOSQL_DATA";

CREATE INDEX taxnametaxonid ON taxon_name(taxon_id) TABLESPACE "BIOSQL_INDEX";
CREATE INDEX taxnamename    ON taxon_name(name) TABLESPACE "BIOSQL_INDEX";

-- this is the namespace (controlled vocabulary) ontology terms live in
-- we chose to have a separate table for this instead of reusing biodatabase
CREATE TABLE ontology (
       	ontology_id        int  NOT NULL ,
       	name	   	   VARCHAR(32) NOT NULL,
       	definition	   VARCHAR2(250),
	PRIMARY KEY (ontology_id),
	UNIQUE (name)
) TABLESPACE "BIOSQL_DATA";

-- any controlled vocab term, everything from full ontology
-- terms eg GO IDs to the various keys allowed as qualifiers
CREATE TABLE term (
       	term_id   int  NOT NULL ,
       	name	   	   VARCHAR(255) NOT NULL,
       	definition	   VARCHAR2(250),
	identifier	   VARCHAR(40),
	is_obsolete	   CHAR(1),
	ontology_id	   int  NOT NULL,
	PRIMARY KEY (term_id),
	UNIQUE (name,ontology_id),
	UNIQUE (identifier)
) TABLESPACE "BIOSQL_DATA";

CREATE INDEX term_ont ON term(ontology_id) TABLESPACE "BIOSQL_INDEX";

-- ontology terms have synonyms, here is how to store them
CREATE TABLE term_synonym (
       name		  VARCHAR(255) NOT NULL,
       term_id		  int  NOT NULL,
       PRIMARY KEY (term_id,name)
) TABLESPACE "BIOSQL_DATA";

-- ontology terms to dbxref association: ontology terms have dbxrefs
CREATE TABLE term_dbxref (
       	term_id	          int  NOT NULL,
       	dbxref_id         int  NOT NULL,
	rank		  SMALLINT,
	PRIMARY KEY (term_id, dbxref_id)
) TABLESPACE "BIOSQL_DATA";

CREATE INDEX trmdbxref_dbxrefid ON term_dbxref(dbxref_id) TABLESPACE "BIOSQL_INDEX";

-- relationship between controlled vocabulary / ontology term
-- we use subject/predicate/object but this could also
-- be thought of as child/relationship-type/parent.
-- the subject/predicate/object naming is better as we
-- can think of the graph as composed of statements.
--
-- we also treat the relationshiptypes / predicates as
-- controlled terms in themselves; this is quite useful
-- as a lot of systems (eg GO) will soon require
-- ontologies of relationship types (eg subtle differences
-- in the partOf relationship)
--
-- this table probably won''t be filled for a while, the core
-- will just treat ontologies as flat lists of terms

CREATE TABLE term_relationship (
        term_relationship_id int  NOT NULL ,
       	subject_term_id	int  NOT NULL,
       	predicate_term_id    int  NOT NULL,
       	object_term_id       int  NOT NULL,
	ontology_id	int  NOT NULL,
	PRIMARY KEY (term_relationship_id),
	UNIQUE (subject_term_id,predicate_term_id,object_term_id,ontology_id)
) TABLESPACE "BIOSQL_DATA";

CREATE INDEX trmrel_predicateid ON term_relationship(predicate_term_id) TABLESPACE "BIOSQL_INDEX";
CREATE INDEX trmrel_objectid ON term_relationship(object_term_id) TABLESPACE "BIOSQL_INDEX";
CREATE INDEX trmrel_ontid ON term_relationship(ontology_id) TABLESPACE "BIOSQL_INDEX";
-- you may want to add this for mysql because MySQL often is broken with
-- respect to using the composite index for the initial keys
--CREATE INDEX ontrel_subjectid ON term_relationship(subject_term_id);

-- the infamous transitive closure table on ontology term relationships
-- this is a warehouse approach - you will need to update this regularly
--
-- the triple of (subject, predicate, object) is the same as for ontology
-- relationships, with the exception of predicate being the greatest common
-- denominator of the relationships types visited in the path (i.e., if
-- relationship type A is-a relationship type B, the greatest common
-- denominator for path containing both types A and B is B)
--
-- See the GO database or Chado schema for other (and possibly better
-- documented) implementations of the transitive closure table approach.
CREATE TABLE term_path (
        term_path_id         int  NOT NULL ,
       	subject_term_id	     int  NOT NULL,
       	predicate_term_id    int  NOT NULL,
       	object_term_id       int  NOT NULL,
	ontology_id          int  NOT NULL,
	distance	     int ,
	PRIMARY KEY (term_path_id),
	UNIQUE (subject_term_id,predicate_term_id,object_term_id,ontology_id,distance)
) TABLESPACE "BIOSQL_DATA";

CREATE INDEX trmpath_predicateid ON term_path(predicate_term_id) TABLESPACE "BIOSQL_INDEX";
CREATE INDEX trmpath_objectid ON term_path(object_term_id) TABLESPACE "BIOSQL_INDEX";
CREATE INDEX trmpath_ontid ON term_path(ontology_id) TABLESPACE "BIOSQL_INDEX";
-- you may want to add this for mysql because MySQL often is broken with
-- respect to using the composite index for the initial keys
--CREATE INDEX trmpath_subjectid ON term_path(subject_term_id);


-- BioJava addition
CREATE TABLE term_relationship_term (
        term_relationship_id    int  DEFAULT 0 NOT NULL,
        term_id                 int  DEFAULT 0 NOT NULL,
        PRIMARY KEY (term_relationship_id,term_id),
) TABLESPACE "BIOSQL_DATA";

ALTER TABLE term_relationship_term ADD CONSTRAINT uni_term_relationship_id 
        UNIQUE (term_relationship_id) ENABLE VALIDATE;
ALTER TABLE term_relationship_term ADD CONSTRAINT uni_term_id 
        UNIQUE (term_id) ENABLE VALIDATE;


-- we can be a bioentry without a biosequence, but not visa-versa
-- most things are going to be keyed off bioentry_id
--
-- accession is the stable id, display_id is a potentially volatile,
-- human readable name.
--
-- Version may be unknown, may be undefined, or may not exist for a certain
-- accession or database (namespace). We require it here to avoid RDBMS-
-- dependend enforcement variants (version is in a compound alternative key),
-- and to simplify query construction for UK look-ups. If there is no version
-- the convention is to put 0 (zero) here. Likewise, a record with a version
-- of zero means the version is to be interpreted as NULL.
--
-- not all entries have a taxon, but many do.
-- one bioentry only has one taxon! (weirdo chimerias are not handled. tough)
--
-- Name maps to display_id in bioperl. We have a different column name
-- here to avoid confusion with the naming convention for foreign keys.

CREATE TABLE bioentry (
	bioentry_id	int  NOT NULL ,
  	biodatabase_id  int  NOT NULL,
  	taxon_id     	int ,
  	name		VARCHAR(40) NOT NULL,
  	accession    	VARCHAR(40) NOT NULL,
  	identifier   	VARCHAR(40),
	division	VARCHAR(6),
  	description  	VARCHAR2(250),
  	version 	SMALLINT  NOT NULL, 
	PRIMARY KEY (bioentry_id),
  	UNIQUE (accession,biodatabase_id,version),
  	UNIQUE (identifier)
) TABLESPACE "BIOSQL_DATA";

CREATE INDEX bioentry_name ON bioentry(name) TABLESPACE "BIOSQL_INDEX";
CREATE INDEX bioentry_db   ON bioentry(biodatabase_id) TABLESPACE "BIOSQL_INDEX";
CREATE INDEX bioentry_tax  ON bioentry(taxon_id) TABLESPACE "BIOSQL_INDEX";



--
-- bioentry-bioentry relationships: these are typed
--
CREATE TABLE bioentry_relationship (
        bioentry_relationship_id int  NOT NULL ,
   	object_bioentry_id 	int  NOT NULL,
   	subject_bioentry_id 	int  NOT NULL,
   	term_id 		int  NOT NULL,
   	rank 			INT,
   	PRIMARY KEY (bioentry_relationship_id),
	UNIQUE (object_bioentry_id,subject_bioentry_id,term_id)
) TABLESPACE "BIOSQL_DATA";

CREATE INDEX bioentryrel_trm   ON bioentry_relationship(term_id) TABLESPACE "BIOSQL_INDEX";
CREATE INDEX bioentryrel_child ON bioentry_relationship(subject_bioentry_id) TABLESPACE "BIOSQL_INDEX";
-- you may want to add this for mysql because MySQL often is broken with
-- respect to using the composite index for the initial keys
--CREATE INDEX bioentryrel_parent ON bioentry_relationship(object_bioentry_id);

-- for deep (depth > 1) bioentry relationship trees we need a transitive
-- closure table too
CREATE TABLE bioentry_path (
   	object_bioentry_id 	int  NOT NULL,
   	subject_bioentry_id 	int  NOT NULL,
   	term_id 		int  NOT NULL,
	distance	     	int ,
	UNIQUE (object_bioentry_id,subject_bioentry_id,term_id,distance)
) TABLESPACE "BIOSQL_DATA";

CREATE INDEX bioentrypath_trm   ON bioentry_path(term_id) TABLESPACE "BIOSQL_INDEX";
CREATE INDEX bioentrypath_child ON bioentry_path(subject_bioentry_id) TABLESPACE "BIOSQL_INDEX";
-- you may want to add this for mysql because MySQL often is broken with
-- respect to using the composite index for the initial keys
--CREATE INDEX bioentrypath_parent ON bioentry_path(object_bioentry_id);

-- some bioentries will have a sequence
-- biosequence because sequence is sometimes a reserved word

CREATE TABLE biosequence (
  	bioentry_id     int  NOT NULL,
  	version     	SMALLINT, 
  	length      	int,
  	alphabet        VARCHAR(10),
  	seq 		LONG,
	PRIMARY KEY (bioentry_id)
) TABLESPACE "BIOSQL_DATA";

-- add these only if you want them:
-- ALTER TABLE biosequence ADD COLUMN ( isoelec_pt NUMERIC(4,2) );
-- ALTER TABLE biosequence ADD COLUMN (	mol_wgt DOUBLE PRECISION );
-- ALTER TABLE biosequence ADD COLUMN ( perc_gc DOUBLE PRECISION );

-- database cross-references (e.g., GenBank:AC123456.1)
--
-- Version may be unknown, may be undefined, or may not exist for a certain
-- accession or database (namespace). We require it here to avoid RDBMS-
-- dependend enforcement variants (version is in a compound alternative key),
-- and to simplify query construction for UK look-ups. If there is no version
-- the convention is to put 0 (zero) here. Likewise, a record with a version
-- of zero means the version is to be interpreted as NULL.
--
CREATE TABLE dbxref (
        dbxref_id	int  NOT NULL ,
        dbname          VARCHAR(40) NOT NULL,
        accession       VARCHAR(40) NOT NULL,
	version		SMALLINT  NOT NULL,
	PRIMARY KEY (dbxref_id),
        UNIQUE(accession, dbname, version)
) TABLESPACE "BIOSQL_DATA";

CREATE INDEX dbxref_db  ON dbxref(dbname) TABLESPACE "BIOSQL_INDEX";

-- for roundtripping embl/genbank, we need to have the "optional ID"
-- for the dbxref.
--
-- another use of this table could be for storing
-- descriptive text for a dbxref. for example, we may want to
-- know stuff about the interpro accessions we store (without
-- importing all of interpro), so we can attach the text
-- description as a synonym
CREATE TABLE dbxref_qualifier_value (
       	dbxref_id 		int  NOT NULL,
       	term_id 		int  NOT NULL,
  	rank  		   	INT DEFAULT 0 NOT NULL,
       	value			VARCHAR2(100),
	PRIMARY KEY (dbxref_id,term_id,rank)
) TABLESPACE "BIOSQL_DATA";

CREATE INDEX dbxrefqual_dbx ON dbxref_qualifier_value(dbxref_id) TABLESPACE "BIOSQL_INDEX";
CREATE INDEX dbxrefqual_trm ON dbxref_qualifier_value(term_id) TABLESPACE "BIOSQL_INDEX";

-- Direct dblinks. It is tempting to do this
-- from bioentry_id to bioentry_id. But that wont work
-- during updates of one database - we will have to edit
-- this table each time. Better to do the join through accession
-- and db each time. Should be almost as cheap

CREATE TABLE bioentry_dbxref ( 
       	bioentry_id        int  NOT NULL,
       	dbxref_id          int  NOT NULL,
  	rank  		   SMALLINT,
	PRIMARY KEY (bioentry_id,dbxref_id)
) TABLESPACE "BIOSQL_DATA";

CREATE INDEX dblink_dbx  ON bioentry_dbxref(dbxref_id) TABLESPACE "BIOSQL_INDEX";

-- We can have multiple references per bioentry, but one reference
-- can also be used for the same bioentry.
--
-- No two references can reference the same reference database entry
-- (dbxref_id). This is where the MEDLINE id goes: PUBMED:123456.

CREATE TABLE reference (
  	reference_id       int  NOT NULL ,
	dbxref_id	   int ,
  	location 	   VARCHAR2(100) NOT NULL,
  	title    	   VARCHAR2(100),
  	authors  	   VARCHAR2(100) NOT NULL,
  	crc	   	   VARCHAR(32),
	PRIMARY KEY (reference_id),
	UNIQUE (dbxref_id),
	UNIQUE (crc)
) TABLESPACE "BIOSQL_DATA";

-- bioentry to reference associations
CREATE TABLE bioentry_reference (
  	bioentry_id 	int  NOT NULL,
  	reference_id 	int  NOT NULL,
  	start_pos	int,
  	end_pos	  	int,
  	rank  		SMALLINT DEFAULT 0 NOT NULL,
  	PRIMARY KEY(bioentry_id,reference_id,rank)
) TABLESPACE "BIOSQL_DATA";

CREATE INDEX bioentryref_ref ON bioentry_reference(reference_id) TABLESPACE "BIOSQL_INDEX";


-- We can have multiple comments per seqentry, and
-- comments can have embedded '\n' characters

CREATE TABLE anncomment (
  	comment_id  	int  NOT NULL ,
  	bioentry_id    	int  NOT NULL,
  	comment_text   	VARCHAR2(100) NOT NULL,
  	rank   		SMALLINT DEFAULT 0 NOT NULL,
	PRIMARY KEY (comment_id),
  	UNIQUE(bioentry_id, rank)
) TABLESPACE "BIOSQL_DATA";


-- tag/value and ontology term annotation for bioentries goes here
CREATE TABLE bioentry_qualifier_value (
	bioentry_id   		int  NOT NULL,
   	term_id  		int  NOT NULL,
   	value         		VARCHAR2(100),
	rank			INT DEFAULT 0 NOT NULL,
	UNIQUE (bioentry_id,term_id,rank)
) TABLESPACE "BIOSQL_DATA";

CREATE INDEX bioentryqual_trm ON bioentry_qualifier_value(term_id) TABLESPACE "BIOSQL_INDEX";

-- feature table. We cleanly handle
--   - simple locations
--   - split locations
--   - split locations on remote sequences

CREATE TABLE seqfeature (
   	seqfeature_id 		int  NOT NULL ,
   	bioentry_id   		int  NOT NULL,
   	type_term_id		int  NOT NULL,
   	source_term_id  	int  NOT NULL,
	display_name		VARCHAR(64),
   	rank 			SMALLINT  DEFAULT 0  NOT NULL,
	PRIMARY KEY (seqfeature_id),
	UNIQUE (bioentry_id,type_term_id,source_term_id,rank)
) TABLESPACE "BIOSQL_DATA";

CREATE INDEX seqfeature_trm  ON seqfeature(type_term_id) TABLESPACE "BIOSQL_INDEX";
CREATE INDEX seqfeature_fsrc ON seqfeature(source_term_id) TABLESPACE "BIOSQL_INDEX";
-- you may want to add this for mysql because MySQL often is broken with
-- respect to using the composite index for the initial keys
--CREATE INDEX seqfeature_bioentryid ON seqfeature(bioentry_id);

-- seqfeatures can be arranged in containment hierarchies.
-- one can imagine storing other relationships between features,
-- in this case the term_id can be used to type the relationship

CREATE TABLE seqfeature_relationship (
        seqfeature_relationship_id int  NOT NULL ,
   	object_seqfeature_id	int  NOT NULL,
   	subject_seqfeature_id 	int  NOT NULL,
   	term_id 	int  NOT NULL,
   	rank 			INT,
   	PRIMARY KEY (seqfeature_relationship_id),
	UNIQUE (object_seqfeature_id,subject_seqfeature_id,term_id)
) TABLESPACE "BIOSQL_DATA";

CREATE INDEX seqfeaturerel_trm   ON seqfeature_relationship(term_id) TABLESPACE "BIOSQL_INDEX";
CREATE INDEX seqfeaturerel_child ON seqfeature_relationship(subject_seqfeature_id) TABLESPACE "BIOSQL_INDEX";
-- you may want to add this for mysql because MySQL often is broken with
-- respect to using the composite index for the initial keys
--CREATE INDEX seqfeaturerel_parent ON seqfeature_relationship(object_seqfeature_id);

-- for deep (depth > 1) seqfeature relationship trees we need a transitive
-- closure table too
CREATE TABLE seqfeature_path (
   	object_seqfeature_id	int  NOT NULL,
   	subject_seqfeature_id 	int  NOT NULL,
   	term_id 		int  NOT NULL,
	distance	     	int ,
	UNIQUE (object_seqfeature_id,subject_seqfeature_id,term_id,distance)
) TABLESPACE "BIOSQL_DATA";

CREATE INDEX seqfeaturepath_trm   ON seqfeature_path(term_id) TABLESPACE "BIOSQL_INDEX";
CREATE INDEX seqfeaturepath_child ON seqfeature_path(subject_seqfeature_id) TABLESPACE "BIOSQL_INDEX";
-- you may want to add this for mysql because MySQL often is broken with
-- respect to using the composite index for the initial keys
--CREATE INDEX seqfeaturerel_parent ON seqfeature_path(object_seqfeature_id);

-- tag/value associations - or ontology annotations
CREATE TABLE seqfeature_qualifier_value (
	seqfeature_id 		int  NOT NULL,
   	term_id 		int  NOT NULL,
   	rank 			SMALLINT DEFAULT 0  NOT NULL,
   	value  			VARCHAR2(4000) NOT NULL,
   	PRIMARY KEY (seqfeature_id,term_id,rank)
) TABLESPACE "BIOSQL_DATA";

CREATE INDEX seqfeaturequal_trm ON seqfeature_qualifier_value(term_id) TABLESPACE "BIOSQL_INDEX";
   
-- DBXrefs for features. This is necessary for genome oriented viewpoints,
-- where you have a few have long sequences (contigs, or chromosomes) with many
-- features on them. In that case the features are the semantic scope for
-- their annotation bundles, not the bioentry they are attached to.

CREATE TABLE seqfeature_dbxref ( 
       	seqfeature_id      int  NOT NULL,
       	dbxref_id          int  NOT NULL,
  	rank  		   SMALLINT,
	PRIMARY KEY (seqfeature_id,dbxref_id)
) TABLESPACE "BIOSQL_DATA";

CREATE INDEX feadblink_dbx  ON seqfeature_dbxref(dbxref_id) TABLESPACE "BIOSQL_INDEX";

-- basically we model everything as potentially having
-- any number of locations, ie, a split location. SimpleLocations
-- just have one location. We need to have a location id for the qualifier
-- associations of fuzzy locations.

-- please do not try to model complex assemblies with this thing. It wont
-- work. Check out the ensembl schema for this.

-- we allow nulls for start/end - this is useful for fuzzies as
-- standard range queries will not be included

-- for remote locations, the join to make is to DBXref
-- the FK to term is a possibility to store the type of the
-- location for determining in one hit whether it's a fuzzy or not

CREATE TABLE location (
	location_id		int  NOT NULL ,
   	seqfeature_id		int  NOT NULL,
	dbxref_id		int ,
	term_id			int ,
   	start_pos              	int,
   	end_pos                	int,
   	strand             	INT NOT NULL,
   	rank          		SMALLINT DEFAULT 0  NOT NULL,
	PRIMARY KEY (location_id),
   	UNIQUE (seqfeature_id, rank)
) TABLESPACE "BIOSQL_DATA";

CREATE INDEX seqfeatureloc_start ON location(start_pos, end_pos) TABLESPACE "BIOSQL_INDEX";
CREATE INDEX seqfeatureloc_dbx   ON location(dbxref_id) TABLESPACE "BIOSQL_INDEX";
CREATE INDEX seqfeatureloc_trm   ON location(term_id) TABLESPACE "BIOSQL_INDEX";

-- location qualifiers - mainly intended for fuzzies but anything
-- can go in here
-- some controlled vocab terms have slots;
-- fuzzies could be modeled as min_start(5), max_start(5)
-- 
-- there is no restriction on extending the fuzzy ontology
-- for your own nefarious aims, although the bio* apis will
-- most likely ignore these
CREATE TABLE location_qualifier_value (
	location_id		int  NOT NULL,
   	term_id 		int  NOT NULL,
   	value  			VARCHAR(255) NOT NULL,
   	int_value 		int,
	PRIMARY KEY (location_id,term_id)
) TABLESPACE "BIOSQL_DATA";

CREATE INDEX locationqual_trm ON location_qualifier_value(term_id) TABLESPACE "BIOSQL_INDEX";

--
-- Create the foreign key constraints
--

-- ontology term

ALTER TABLE term ADD CONSTRAINT FKont_term
	FOREIGN KEY (ontology_id) REFERENCES ontology(ontology_id)
	ON DELETE CASCADE;

-- term synonyms

ALTER TABLE term_synonym ADD CONSTRAINT FKterm_syn
	FOREIGN KEY (term_id) REFERENCES term(term_id)
	ON DELETE CASCADE;

-- term_dbxref

ALTER TABLE term_dbxref ADD CONSTRAINT FKdbxref_trmdbxref
       	FOREIGN KEY (dbxref_id) REFERENCES dbxref(dbxref_id)
	ON DELETE CASCADE;
ALTER TABLE term_dbxref ADD CONSTRAINT FKterm_trmdbxref
      FOREIGN KEY (term_id) REFERENCES term(term_id)
	ON DELETE CASCADE;

-- term_relationship

ALTER TABLE term_relationship ADD CONSTRAINT FKtrmsubject_trmrel
	FOREIGN KEY (subject_term_id) REFERENCES term(term_id)
	ON DELETE CASCADE;
ALTER TABLE term_relationship ADD CONSTRAINT FKtrmpredicate_trmrel
       	FOREIGN KEY (predicate_term_id) REFERENCES term(term_id)
	ON DELETE CASCADE;
ALTER TABLE term_relationship ADD CONSTRAINT FKtrmobject_trmrel
       	FOREIGN KEY (object_term_id) REFERENCES term(term_id)
	ON DELETE CASCADE;
ALTER TABLE term_relationship ADD CONSTRAINT FKterm_trmrel
       	FOREIGN KEY (ontology_id) REFERENCES ontology(ontology_id)
	ON DELETE CASCADE;

-- term_path

ALTER TABLE term_path ADD CONSTRAINT FKtrmsubject_trmpath
	FOREIGN KEY (subject_term_id) REFERENCES term(term_id)
	ON DELETE CASCADE;
ALTER TABLE term_path ADD CONSTRAINT FKtrmpredicate_trmpath
       	FOREIGN KEY (predicate_term_id) REFERENCES term(term_id)
	ON DELETE CASCADE;
ALTER TABLE term_path ADD CONSTRAINT FKtrmobject_trmpath
       	FOREIGN KEY (object_term_id) REFERENCES term(term_id)
	ON DELETE CASCADE;
ALTER TABLE term_path ADD CONSTRAINT FKontology_trmpath
       	FOREIGN KEY (ontology_id) REFERENCES ontology(ontology_id)
	ON DELETE CASCADE;

-- taxon, taxon_name

-- unfortunately, we can't constrain parent_taxon_id as it is violated
-- occasionally by the downloads available from NCBI
-- ALTER TABLE taxon ADD CONSTRAINT FKtaxon_taxon
--         FOREIGN KEY (parent_taxon_id) REFERENCES taxon(taxon_id);
ALTER TABLE taxon_name ADD CONSTRAINT FKtaxon_taxonname
        FOREIGN KEY (taxon_id) REFERENCES taxon(taxon_id)
        ON DELETE CASCADE;

-- bioentry

ALTER TABLE bioentry ADD CONSTRAINT FKtaxon_bioentry
	FOREIGN KEY (taxon_id) REFERENCES taxon(taxon_id);
ALTER TABLE bioentry ADD CONSTRAINT FKbiodatabase_bioentry
	FOREIGN KEY (biodatabase_id) REFERENCES biodatabase(biodatabase_id);

-- bioentry_relationship

ALTER TABLE bioentry_relationship ADD CONSTRAINT FKterm_bioentryrel
	FOREIGN KEY (term_id) REFERENCES term(term_id);
ALTER TABLE bioentry_relationship ADD CONSTRAINT FKparentent_bioentryrel
	FOREIGN KEY (object_bioentry_id) REFERENCES bioentry(bioentry_id)
	ON DELETE CASCADE;
ALTER TABLE bioentry_relationship ADD CONSTRAINT FKchildent_bioentryrel
	FOREIGN KEY (subject_bioentry_id) REFERENCES bioentry(bioentry_id)
	ON DELETE CASCADE;

-- bioentry_path

ALTER TABLE bioentry_path ADD CONSTRAINT FKterm_bioentrypath
	FOREIGN KEY (term_id) REFERENCES term(term_id);
ALTER TABLE bioentry_path ADD CONSTRAINT FKparentent_bioentrypath
	FOREIGN KEY (object_bioentry_id) REFERENCES bioentry(bioentry_id)
	ON DELETE CASCADE;
ALTER TABLE bioentry_path ADD CONSTRAINT FKchildent_bioentrypath
	FOREIGN KEY (subject_bioentry_id) REFERENCES bioentry(bioentry_id)
	ON DELETE CASCADE;

-- biosequence

ALTER TABLE biosequence ADD CONSTRAINT FKbioentry_bioseq
	FOREIGN KEY (bioentry_id) REFERENCES bioentry(bioentry_id)
	ON DELETE CASCADE;

-- comment

ALTER TABLE anncomment ADD CONSTRAINT FKbioentry_comment
	FOREIGN KEY(bioentry_id) REFERENCES bioentry(bioentry_id)
	ON DELETE CASCADE;

-- bioentry_dbxref

ALTER TABLE bioentry_dbxref ADD CONSTRAINT FKbioentry_dblink
        FOREIGN KEY (bioentry_id) REFERENCES bioentry(bioentry_id)
	ON DELETE CASCADE;
ALTER TABLE bioentry_dbxref ADD CONSTRAINT FKdbxref_dblink
       	FOREIGN KEY (dbxref_id) REFERENCES dbxref(dbxref_id)
	ON DELETE CASCADE;

-- dbxref_qualifier_value

ALTER TABLE dbxref_qualifier_value ADD CONSTRAINT FKtrm_dbxrefqual
	FOREIGN KEY (term_id) REFERENCES term(term_id);
ALTER TABLE dbxref_qualifier_value ADD CONSTRAINT FKdbxref_dbxrefqual
	FOREIGN KEY (dbxref_id) REFERENCES dbxref(dbxref_id)
	ON DELETE CASCADE;

-- bioentry_reference

ALTER TABLE bioentry_reference ADD CONSTRAINT FKbioentry_entryref
	FOREIGN KEY (bioentry_id) REFERENCES bioentry(bioentry_id)
	ON DELETE CASCADE;
ALTER TABLE bioentry_reference ADD CONSTRAINT FKreference_entryref
	FOREIGN KEY (reference_id) REFERENCES reference(reference_id)
	ON DELETE CASCADE;

-- bioentry_qualifier_value

ALTER TABLE bioentry_qualifier_value ADD CONSTRAINT FKbioentry_entqual
	FOREIGN KEY (bioentry_id) REFERENCES bioentry(bioentry_id)
	ON DELETE CASCADE;
ALTER TABLE bioentry_qualifier_value ADD CONSTRAINT FKterm_entqual
	FOREIGN KEY (term_id) REFERENCES term(term_id);

-- reference 
ALTER TABLE reference ADD CONSTRAINT FKdbxref_reference
      FOREIGN KEY ( dbxref_id ) REFERENCES dbxref ( dbxref_id ) ;

-- seqfeature

ALTER TABLE seqfeature ADD CONSTRAINT FKterm_seqfeature
	FOREIGN KEY (type_term_id) REFERENCES term(term_id);
ALTER TABLE seqfeature ADD CONSTRAINT FKsourceterm_seqfeature
	FOREIGN KEY (source_term_id) REFERENCES term(term_id);
ALTER TABLE seqfeature ADD CONSTRAINT FKbioentry_seqfeature
	FOREIGN KEY (bioentry_id) REFERENCES bioentry(bioentry_id)
	ON DELETE CASCADE;

-- seqfeature_relationship

ALTER TABLE seqfeature_relationship ADD CONSTRAINT FKterm_seqfeatrel
	FOREIGN KEY (term_id) REFERENCES term(term_id);
ALTER TABLE seqfeature_relationship ADD CONSTRAINT FKparentfeat_seqfeatrel
	FOREIGN KEY (object_seqfeature_id) REFERENCES seqfeature(seqfeature_id)
	ON DELETE CASCADE;
ALTER TABLE seqfeature_relationship ADD CONSTRAINT FKchildfeat_seqfeatrel
	FOREIGN KEY (subject_seqfeature_id) REFERENCES seqfeature(seqfeature_id)
	ON DELETE CASCADE;

-- seqfeature_path

ALTER TABLE seqfeature_path ADD CONSTRAINT FKterm_seqfeatpath
	FOREIGN KEY (term_id) REFERENCES term(term_id);
ALTER TABLE seqfeature_path ADD CONSTRAINT FKparentfeat_seqfeatpath
	FOREIGN KEY (object_seqfeature_id) REFERENCES seqfeature(seqfeature_id)
	ON DELETE CASCADE;
ALTER TABLE seqfeature_path ADD CONSTRAINT FKchildfeat_seqfeatpath
	FOREIGN KEY (subject_seqfeature_id) REFERENCES seqfeature(seqfeature_id)
	ON DELETE CASCADE;

-- seqfeature_qualifier_value
ALTER TABLE seqfeature_qualifier_value ADD CONSTRAINT FKterm_featqual
	FOREIGN KEY (term_id) REFERENCES term(term_id);
ALTER TABLE seqfeature_qualifier_value ADD CONSTRAINT FKseqfeature_featqual
	FOREIGN KEY (seqfeature_id) REFERENCES seqfeature(seqfeature_id)
	ON DELETE CASCADE;

-- seqfeature_dbxref

ALTER TABLE seqfeature_dbxref ADD CONSTRAINT FKseqfeature_feadblink
        FOREIGN KEY (seqfeature_id) REFERENCES seqfeature(seqfeature_id)
	ON DELETE CASCADE;
ALTER TABLE seqfeature_dbxref ADD CONSTRAINT FKdbxref_feadblink
       	FOREIGN KEY (dbxref_id) REFERENCES dbxref(dbxref_id)
	ON DELETE CASCADE;

-- location

ALTER TABLE location ADD CONSTRAINT FKseqfeature_location
	FOREIGN KEY (seqfeature_id) REFERENCES seqfeature(seqfeature_id)
	ON DELETE CASCADE;
ALTER TABLE location ADD CONSTRAINT FKdbxref_location
	FOREIGN KEY (dbxref_id) REFERENCES dbxref(dbxref_id);
ALTER TABLE location ADD CONSTRAINT FKterm_featloc
	FOREIGN KEY (term_id) REFERENCES term(term_id);

-- location_qualifier_value

ALTER TABLE location_qualifier_value ADD CONSTRAINT FKfeatloc_locqual
	FOREIGN KEY (location_id) REFERENCES location(location_id)
	ON DELETE CASCADE;
ALTER TABLE location_qualifier_value ADD CONSTRAINT FKterm_locqual
	FOREIGN KEY (term_id) REFERENCES term(term_id);


--
-- Triggers for automatic primary key generation and other sanity checks
--

CREATE OR REPLACE TRIGGER BID_location
  BEFORE INSERT
  on location
  -- 
  for each row
BEGIN
IF :new.location_id IS NULL THEN
    SELECT location_pk_seq.nextval INTO :new.location_id FROM DUAL;
END IF;
END;
/


CREATE OR REPLACE TRIGGER BID_seqfeature
  BEFORE INSERT
  on seqfeature
  -- 
  for each row
BEGIN
IF :new.seqfeature_id IS NULL THEN
    SELECT seqfeature_pk_seq.nextval INTO :new.seqfeature_id FROM DUAL;
END IF;
END;
/


CREATE TRIGGER BID_seqfeature_relationship
  BEFORE INSERT
  on seqfeature_relationship
  -- 
  for each row
BEGIN
IF :new.seqfeature_relationship_id IS NULL THEN
    SELECT seqfeature_relationship_pk_seq.nextval INTO :new.seqfeature_relationship_id FROM DUAL;
END IF;
END;
/


CREATE OR REPLACE TRIGGER BID_anncomment
  BEFORE INSERT
  on anncomment
  -- 
  for each row
BEGIN
IF :new.comment_id IS NULL THEN
    SELECT anncomment_pk_seq.nextval INTO :new.comment_id FROM DUAL;
END IF;
END;
/


CREATE OR REPLACE TRIGGER BID_reference
  BEFORE INSERT
  on reference
  -- 
  for each row
BEGIN
IF :new.reference_id IS NULL THEN
    SELECT reference_pk_seq.nextval INTO :new.reference_id FROM DUAL;
END IF;
END;
/


CREATE OR REPLACE TRIGGER BID_bioentry_relationship
  BEFORE INSERT
  on bioentry_relationship
  -- 
  for each row
BEGIN
IF :new.bioentry_relationship_id IS NULL THEN
    SELECT bioentry_relationship_pk_seq.nextval INTO :new.bioentry_relationship_id FROM DUAL;
END IF;
END;
/


CREATE OR REPLACE TRIGGER BID_bioentry
  BEFORE INSERT
  on bioentry
  -- 
  for each row
BEGIN
IF :new.bioentry_id IS NULL THEN
    SELECT bioentry_pk_seq.nextval INTO :new.bioentry_id FROM DUAL;
END IF;
-- IF :new.Division IS NULL THEN
--    :new.Division := 'UNK';
-- END IF;
END;
/


CREATE OR REPLACE TRIGGER BID_term
  BEFORE INSERT
  on term
  -- 
  for each row
BEGIN
IF :new.term_id IS NULL THEN
    SELECT term_pk_seq.nextval INTO :new.term_id FROM DUAL;
END IF;
END;
/


CREATE OR REPLACE TRIGGER BID_term_relationship
  BEFORE INSERT
  on term_relationship
  -- 
  for each row
BEGIN
IF :new.term_relationship_id IS NULL THEN
    SELECT term_relationship_pk_seq.nextval INTO :new.term_relationship_id FROM DUAL;
END IF;
END;
/


CREATE OR REPLACE TRIGGER BID_term_path
  BEFORE INSERT
  on term_path
  -- 
  for each row
BEGIN
IF :new.term_path_id IS NULL THEN
    SELECT term_path_pk_seq.nextval INTO :new.term_path_id FROM DUAL;
END IF;
END;
/

CREATE OR REPLACE TRIGGER BID_ontology
  BEFORE INSERT
  on ontology
  -- 
  for each row
BEGIN
IF :new.ontology_id IS NULL THEN
    SELECT ontology_pk_seq.nextval INTO :new.ontology_id FROM DUAL;
END IF;
END;
/


CREATE OR REPLACE TRIGGER BID_taxon
  BEFORE INSERT
  on taxon
  -- 
  for each row
BEGIN
IF :new.taxon_id IS NULL THEN
    SELECT taxon_pk_seq.nextval INTO :new.taxon_id FROM DUAL;
END IF;
END;
/


CREATE OR REPLACE TRIGGER BID_biodatabase
  BEFORE INSERT
  on biodatabase
  -- 
  for each row
BEGIN
IF :new.biodatabase_id IS NULL THEN
    SELECT biodatabase_pk_seq.nextval INTO :new.biodatabase_id FROM DUAL;
END IF;
END;
/


CREATE OR REPLACE TRIGGER BID_dbxref
  BEFORE INSERT
  on dbxref
  -- 
  for each row
BEGIN
IF :new.dbxref_id IS NULL THEN
    SELECT dbxref_pk_seq.nextval INTO :new.dbxref_id FROM DUAL;
END IF;
END;
/

-------------- next part --------------





More information about the Biojava-l mailing list