[BioSQL-l] Oracle support...

Len Trigg len at reeltwo.com
Tue Jul 29 17:49:37 EDT 2003


Hi all,

I have been using BioJava for some bioinformatics problems, and use
BioSQL for storing sequences in mysql. All in all, it's very nice. 

I then had to use Oracle as the database, and ran into a whole pile of
headaches. I am definitely no expert in Oracle, so that may account
for a large part of my problems, but I guess it also says something
when the procedure for creating the biosql database in mysql pretty
much involves executing one sql file, but the oracle setup consists of
around 140 files, and the table and field names you end up with don't
match those in the BioSQL relational model described in
biosql-ERD.pdf. Trying to fathom what is going on is a nightmare for
those who have only had experience with the mysql installation.

In desperation, I actually just ported the mysql schema file to Oracle
directly and have happily been using that. There were a couple of
minor niggles that I thought I would mention because either they might
warrant changes to the BioSQL schema, or someone might know of
workarounds.

First, I discovered that "comment" and "synonym" are reserved words in
Oracle, so I had to rename the comment table, and the synonym field of
term_synonym table. Perhaps the main schema could be changed to avoid
these reserved words.

Second, to support autoincrement fields, I created an oracle sequence
and trigger for each table. Having one per table makes it easy to
access the last id assigned when doing an insert (which BioJava makes
use of).

I have attached my ported schema file (and one to drop the schema) for
those interested. It should be possible to autogenerate this file, as
I gather that is how the postgres one is created. I think that a
"vanilla" alternative to the existing oracle schema would make a nice
addition for those who don't need extra bells and whistles. These
files correspond to current BioSQL CVS.

Cheers,
Len.

-------------- next part --------------
-- conventions:
-- <table_name>_id is primary internal id (usually autogenerated)

-- Authors: Ewan Birney, Elia Stupka
-- Contributors: Hilmar Lapp, Aaron Mackey
--
-- Copyright Ewan Birney. You may use, modify, and distribute this code under
-- the same terms as Perl. See the Perl Artistic License.
--
-- comments to biosql - biosql-l at open-bio.org

--
-- Migration of the MySQL schema to InnoDB by Hilmar Lapp <hlapp at gmx.net>
-- Post-Cape Town changes by Hilmar Lapp.
-- Singapore changes by Hilmar Lapp and Aaron Mackey.
--


CREATE SEQUENCE SE_biodatabase
	INCREMENT BY 1 START WITH 1 NOMAXVALUE NOMINVALUE NOCYCLE NOORDER;
CREATE SEQUENCE SE_taxon
	INCREMENT BY 1 START WITH 1 NOMAXVALUE NOMINVALUE NOCYCLE NOORDER;
CREATE SEQUENCE SE_ontology
	INCREMENT BY 1 START WITH 1 NOMAXVALUE NOMINVALUE NOCYCLE NOORDER;
CREATE SEQUENCE SE_term
	INCREMENT BY 1 START WITH 1 NOMAXVALUE NOMINVALUE NOCYCLE NOORDER;
CREATE SEQUENCE SE_term_relationship
	INCREMENT BY 1 START WITH 1 NOMAXVALUE NOMINVALUE NOCYCLE NOORDER;
CREATE SEQUENCE SE_term_path
	INCREMENT BY 1 START WITH 1 NOMAXVALUE NOMINVALUE NOCYCLE NOORDER;
CREATE SEQUENCE SE_bioentry
	INCREMENT BY 1 START WITH 1 NOMAXVALUE NOMINVALUE NOCYCLE NOORDER;
CREATE SEQUENCE SE_bioentry_relationship
	INCREMENT BY 1 START WITH 1 NOMAXVALUE NOMINVALUE NOCYCLE NOORDER;
CREATE SEQUENCE SE_dbxref
	INCREMENT BY 1 START WITH 1 NOMAXVALUE NOMINVALUE NOCYCLE NOORDER;
CREATE SEQUENCE SE_reference
 	INCREMENT BY 1 START WITH 1 NOMAXVALUE NOMINVALUE NOCYCLE NOORDER;
CREATE SEQUENCE SE_comment_
 	INCREMENT BY 1 START WITH 1 NOMAXVALUE NOMINVALUE NOCYCLE NOORDER;
CREATE SEQUENCE SE_seqfeature
 	INCREMENT BY 1 START WITH 1 NOMAXVALUE NOMINVALUE NOCYCLE NOORDER;
CREATE SEQUENCE SE_seqfeature_relationship
 	INCREMENT BY 1 START WITH 1 NOMAXVALUE NOMINVALUE NOCYCLE NOORDER;
CREATE SEQUENCE SE_location
 	INCREMENT BY 1 START WITH 1 NOMAXVALUE NOMINVALUE NOCYCLE NOORDER;


-- database have bioentries. That is about it.
-- we do not store different versions of a database as different dbids
-- (there is no concept of versions of database). There is a concept of
-- versions of entries. Versions of databases deserve their own table and
-- join to bioentry table for tracking with versions of entries 

CREATE TABLE biodatabase (
  	biodatabase_id 	int  NOT NULL ,
  	name           	VARCHAR(128) NOT NULL,
	authority	VARCHAR(128),
	description	VARCHAR2(100),
	PRIMARY KEY (biodatabase_id),
  	UNIQUE (name)
) TABLESPACE "SYMGENE_DATA";

CREATE INDEX db_auth on biodatabase(authority) TABLESPACE "SYMGENE_INDEX";

-- we could insist that taxa are NCBI taxon id, but on reflection I made this
-- an optional extra line, as many flat file formats do not have the NCBI id
--
-- no organelle/sub species

-- corresponds to the node table of the NCBI taxonomy databaase
CREATE TABLE taxon (
       taxon_id		int  NOT NULL ,
       ncbi_taxon_id 	int,
       parent_taxon_id	int ,
       node_rank	VARCHAR(32),
       genetic_code	INT ,
       mito_genetic_code INT ,
       left_value	int ,
       right_value	int ,
       PRIMARY KEY (taxon_id),
       UNIQUE (ncbi_taxon_id),
       UNIQUE (left_value),
       UNIQUE (right_value)
) TABLESPACE "SYMGENE_DATA";

CREATE INDEX taxparent ON taxon(parent_taxon_id) TABLESPACE "SYMGENE_INDEX";

-- corresponds to the names table of the NCBI taxonomy databaase
CREATE TABLE taxon_name (
       taxon_id		int  NOT NULL,
       name		VARCHAR(255) NOT NULL,
       name_class	VARCHAR(32) NOT NULL,
       UNIQUE (taxon_id,name,name_class)
) TABLESPACE "SYMGENE_DATA";

CREATE INDEX taxnametaxonid ON taxon_name(taxon_id) TABLESPACE "SYMGENE_INDEX";
CREATE INDEX taxnamename    ON taxon_name(name) TABLESPACE "SYMGENE_INDEX";

-- this is the namespace (controlled vocabulary) ontology terms live in
-- we chose to have a separate table for this instead of reusing biodatabase
CREATE TABLE ontology (
       	ontology_id        int  NOT NULL ,
       	name	   	   VARCHAR(32) NOT NULL,
       	definition	   VARCHAR2(100),
	PRIMARY KEY (ontology_id),
	UNIQUE (name)
) TABLESPACE "SYMGENE_DATA";

-- any controlled vocab term, everything from full ontology
-- terms eg GO IDs to the various keys allowed as qualifiers
CREATE TABLE term (
       	term_id   int  NOT NULL ,
       	name	   	   VARCHAR(255) NOT NULL,
       	definition	   VARCHAR2(100),
	identifier	   VARCHAR(40),
	is_obsolete	   CHAR(1),
	ontology_id	   int  NOT NULL,
	PRIMARY KEY (term_id),
	UNIQUE (name,ontology_id),
	UNIQUE (identifier)
) TABLESPACE "SYMGENE_DATA";

CREATE INDEX term_ont ON term(ontology_id) TABLESPACE "SYMGENE_INDEX";

-- ontology terms have synonyms, here is how to store them
CREATE TABLE term_synonym (
       synonym_		  VARCHAR(255) NOT NULL,
       term_id		  int  NOT NULL,
       PRIMARY KEY (term_id,synonym_)
) TABLESPACE "SYMGENE_DATA";

-- ontology terms to dbxref association: ontology terms have dbxrefs
CREATE TABLE term_dbxref (
       	term_id	          int  NOT NULL,
       	dbxref_id         int  NOT NULL,
	rank		  SMALLINT,
	PRIMARY KEY (term_id, dbxref_id)
) TABLESPACE "SYMGENE_DATA";

CREATE INDEX trmdbxref_dbxrefid ON term_dbxref(dbxref_id) TABLESPACE "SYMGENE_INDEX";

-- relationship between controlled vocabulary / ontology term
-- we use subject/predicate/object but this could also
-- be thought of as child/relationship-type/parent.
-- the subject/predicate/object naming is better as we
-- can think of the graph as composed of statements.
--
-- we also treat the relationshiptypes / predicates as
-- controlled terms in themselves; this is quite useful
-- as a lot of systems (eg GO) will soon require
-- ontologies of relationship types (eg subtle differences
-- in the partOf relationship)
--
-- this table probably won''t be filled for a while, the core
-- will just treat ontologies as flat lists of terms

CREATE TABLE term_relationship (
        term_relationship_id int  NOT NULL ,
       	subject_term_id	int  NOT NULL,
       	predicate_term_id    int  NOT NULL,
       	object_term_id       int  NOT NULL,
	ontology_id	int  NOT NULL,
	PRIMARY KEY (term_relationship_id),
	UNIQUE (subject_term_id,predicate_term_id,object_term_id,ontology_id)
) TABLESPACE "SYMGENE_DATA";

CREATE INDEX trmrel_predicateid ON term_relationship(predicate_term_id) TABLESPACE "SYMGENE_INDEX";
CREATE INDEX trmrel_objectid ON term_relationship(object_term_id) TABLESPACE "SYMGENE_INDEX";
CREATE INDEX trmrel_ontid ON term_relationship(ontology_id) TABLESPACE "SYMGENE_INDEX";
-- you may want to add this for mysql because MySQL often is broken with
-- respect to using the composite index for the initial keys
--CREATE INDEX ontrel_subjectid ON term_relationship(subject_term_id);

-- the infamous transitive closure table on ontology term relationships
-- this is a warehouse approach - you will need to update this regularly
--
-- the triple of (subject, predicate, object) is the same as for ontology
-- relationships, with the exception of predicate being the greatest common
-- denominator of the relationships types visited in the path (i.e., if
-- relationship type A is-a relationship type B, the greatest common
-- denominator for path containing both types A and B is B)
--
-- See the GO database or Chado schema for other (and possibly better
-- documented) implementations of the transitive closure table approach.
CREATE TABLE term_path (
        term_path_id         int  NOT NULL ,
       	subject_term_id	     int  NOT NULL,
       	predicate_term_id    int  NOT NULL,
       	object_term_id       int  NOT NULL,
	ontology_id          int  NOT NULL,
	distance	     int ,
	PRIMARY KEY (term_path_id),
	UNIQUE (subject_term_id,predicate_term_id,object_term_id,ontology_id,distance)
) TABLESPACE "SYMGENE_DATA";

CREATE INDEX trmpath_predicateid ON term_path(predicate_term_id) TABLESPACE "SYMGENE_INDEX";
CREATE INDEX trmpath_objectid ON term_path(object_term_id) TABLESPACE "SYMGENE_INDEX";
CREATE INDEX trmpath_ontid ON term_path(ontology_id) TABLESPACE "SYMGENE_INDEX";
-- you may want to add this for mysql because MySQL often is broken with
-- respect to using the composite index for the initial keys
--CREATE INDEX trmpath_subjectid ON term_path(subject_term_id);

-- we can be a bioentry without a biosequence, but not visa-versa
-- most things are going to be keyed off bioentry_id
--
-- accession is the stable id, display_id is a potentially volatile,
-- human readable name.
--
-- Version may be unknown, may be undefined, or may not exist for a certain
-- accession or database (namespace). We require it here to avoid RDBMS-
-- dependend enforcement variants (version is in a compound alternative key),
-- and to simplify query construction for UK look-ups. If there is no version
-- the convention is to put 0 (zero) here. Likewise, a record with a version
-- of zero means the version is to be interpreted as NULL.
--
-- not all entries have a taxon, but many do.
-- one bioentry only has one taxon! (weirdo chimerias are not handled. tough)
--
-- Name maps to display_id in bioperl. We have a different column name
-- here to avoid confusion with the naming convention for foreign keys.

CREATE TABLE bioentry (
	bioentry_id	int  NOT NULL ,
  	biodatabase_id  int  NOT NULL,
  	taxon_id     	int ,
  	name		VARCHAR(40) NOT NULL,
  	accession    	VARCHAR(40) NOT NULL,
  	identifier   	VARCHAR(40),
	division	VARCHAR(6),
  	description  	VARCHAR2(100),
  	version 	SMALLINT  NOT NULL, 
	PRIMARY KEY (bioentry_id),
  	UNIQUE (accession,biodatabase_id,version),
  	UNIQUE (identifier)
) TABLESPACE "SYMGENE_DATA";

CREATE INDEX bioentry_name ON bioentry(name) TABLESPACE "SYMGENE_INDEX";
CREATE INDEX bioentry_db   ON bioentry(biodatabase_id) TABLESPACE "SYMGENE_INDEX";
CREATE INDEX bioentry_tax  ON bioentry(taxon_id) TABLESPACE "SYMGENE_INDEX";

--
-- bioentry-bioentry relationships: these are typed
--
CREATE TABLE bioentry_relationship (
        bioentry_relationship_id int  NOT NULL ,
   	object_bioentry_id 	int  NOT NULL,
   	subject_bioentry_id 	int  NOT NULL,
   	term_id 		int  NOT NULL,
   	rank 			INT,
   	PRIMARY KEY (bioentry_relationship_id),
	UNIQUE (object_bioentry_id,subject_bioentry_id,term_id)
) TABLESPACE "SYMGENE_DATA";

CREATE INDEX bioentryrel_trm   ON bioentry_relationship(term_id) TABLESPACE "SYMGENE_INDEX";
CREATE INDEX bioentryrel_child ON bioentry_relationship(subject_bioentry_id) TABLESPACE "SYMGENE_INDEX";
-- you may want to add this for mysql because MySQL often is broken with
-- respect to using the composite index for the initial keys
--CREATE INDEX bioentryrel_parent ON bioentry_relationship(object_bioentry_id);

-- for deep (depth > 1) bioentry relationship trees we need a transitive
-- closure table too
CREATE TABLE bioentry_path (
   	object_bioentry_id 	int  NOT NULL,
   	subject_bioentry_id 	int  NOT NULL,
   	term_id 		int  NOT NULL,
	distance	     	int ,
	UNIQUE (object_bioentry_id,subject_bioentry_id,term_id,distance)
) TABLESPACE "SYMGENE_DATA";

CREATE INDEX bioentrypath_trm   ON bioentry_path(term_id) TABLESPACE "SYMGENE_INDEX";
CREATE INDEX bioentrypath_child ON bioentry_path(subject_bioentry_id) TABLESPACE "SYMGENE_INDEX";
-- you may want to add this for mysql because MySQL often is broken with
-- respect to using the composite index for the initial keys
--CREATE INDEX bioentrypath_parent ON bioentry_path(object_bioentry_id);

-- some bioentries will have a sequence
-- biosequence because sequence is sometimes a reserved word

CREATE TABLE biosequence (
  	bioentry_id     int  NOT NULL,
  	version     	SMALLINT, 
  	length      	int,
  	alphabet        VARCHAR(10),
  	seq 		LONG,
	PRIMARY KEY (bioentry_id)
) TABLESPACE "SYMGENE_DATA";

-- add these only if you want them:
-- ALTER TABLE biosequence ADD COLUMN ( isoelec_pt NUMERIC(4,2) );
-- ALTER TABLE biosequence ADD COLUMN (	mol_wgt DOUBLE PRECISION );
-- ALTER TABLE biosequence ADD COLUMN ( perc_gc DOUBLE PRECISION );

-- database cross-references (e.g., GenBank:AC123456.1)
--
-- Version may be unknown, may be undefined, or may not exist for a certain
-- accession or database (namespace). We require it here to avoid RDBMS-
-- dependend enforcement variants (version is in a compound alternative key),
-- and to simplify query construction for UK look-ups. If there is no version
-- the convention is to put 0 (zero) here. Likewise, a record with a version
-- of zero means the version is to be interpreted as NULL.
--
CREATE TABLE dbxref (
        dbxref_id	int  NOT NULL ,
        dbname          VARCHAR(40) NOT NULL,
        accession       VARCHAR(40) NOT NULL,
	version		SMALLINT  NOT NULL,
	PRIMARY KEY (dbxref_id),
        UNIQUE(accession, dbname, version)
) TABLESPACE "SYMGENE_DATA";

CREATE INDEX dbxref_db  ON dbxref(dbname) TABLESPACE "SYMGENE_INDEX";

-- for roundtripping embl/genbank, we need to have the "optional ID"
-- for the dbxref.
--
-- another use of this table could be for storing
-- descriptive text for a dbxref. for example, we may want to
-- know stuff about the interpro accessions we store (without
-- importing all of interpro), so we can attach the text
-- description as a synonym
CREATE TABLE dbxref_qualifier_value (
       	dbxref_id 		int  NOT NULL,
       	term_id 		int  NOT NULL,
  	rank  		   	INT DEFAULT 0 NOT NULL,
       	value			VARCHAR2(100),
	PRIMARY KEY (dbxref_id,term_id,rank)
) TABLESPACE "SYMGENE_DATA";

CREATE INDEX dbxrefqual_dbx ON dbxref_qualifier_value(dbxref_id) TABLESPACE "SYMGENE_INDEX";
CREATE INDEX dbxrefqual_trm ON dbxref_qualifier_value(term_id) TABLESPACE "SYMGENE_INDEX";

-- Direct dblinks. It is tempting to do this
-- from bioentry_id to bioentry_id. But that wont work
-- during updates of one database - we will have to edit
-- this table each time. Better to do the join through accession
-- and db each time. Should be almost as cheap

CREATE TABLE bioentry_dbxref ( 
       	bioentry_id        int  NOT NULL,
       	dbxref_id          int  NOT NULL,
  	rank  		   SMALLINT,
	PRIMARY KEY (bioentry_id,dbxref_id)
) TABLESPACE "SYMGENE_DATA";

CREATE INDEX dblink_dbx  ON bioentry_dbxref(dbxref_id) TABLESPACE "SYMGENE_INDEX";

-- We can have multiple references per bioentry, but one reference
-- can also be used for the same bioentry.
--
-- No two references can reference the same reference database entry
-- (dbxref_id). This is where the MEDLINE id goes: PUBMED:123456.

CREATE TABLE reference (
  	reference_id       int  NOT NULL ,
	dbxref_id	   int ,
  	location 	   VARCHAR2(100) NOT NULL,
  	title    	   VARCHAR2(100),
  	authors  	   VARCHAR2(100) NOT NULL,
  	crc	   	   VARCHAR(32),
	PRIMARY KEY (reference_id),
	UNIQUE (dbxref_id),
	UNIQUE (crc)
) TABLESPACE "SYMGENE_DATA";

-- bioentry to reference associations
CREATE TABLE bioentry_reference (
  	bioentry_id 	int  NOT NULL,
  	reference_id 	int  NOT NULL,
  	start_pos	int,
  	end_pos	  	int,
  	rank  		SMALLINT DEFAULT 0 NOT NULL,
  	PRIMARY KEY(bioentry_id,reference_id,rank)
) TABLESPACE "SYMGENE_DATA";

CREATE INDEX bioentryref_ref ON bioentry_reference(reference_id) TABLESPACE "SYMGENE_INDEX";


-- We can have multiple comments per seqentry, and
-- comments can have embedded '\n' characters

CREATE TABLE comment_ (
  	comment_id  	int  NOT NULL ,
  	bioentry_id    	int  NOT NULL,
  	comment_text   	VARCHAR2(100) NOT NULL,
  	rank   		SMALLINT DEFAULT 0 NOT NULL,
	PRIMARY KEY (comment_id),
  	UNIQUE(bioentry_id, rank)
) TABLESPACE "SYMGENE_DATA";


-- tag/value and ontology term annotation for bioentries goes here
CREATE TABLE bioentry_qualifier_value (
	bioentry_id   		int  NOT NULL,
   	term_id  		int  NOT NULL,
   	value         		VARCHAR2(100),
	rank			INT DEFAULT 0 NOT NULL,
	UNIQUE (bioentry_id,term_id,rank)
) TABLESPACE "SYMGENE_DATA";

CREATE INDEX bioentryqual_trm ON bioentry_qualifier_value(term_id) TABLESPACE "SYMGENE_INDEX";

-- feature table. We cleanly handle
--   - simple locations
--   - split locations
--   - split locations on remote sequences

CREATE TABLE seqfeature (
   	seqfeature_id 		int  NOT NULL ,
   	bioentry_id   		int  NOT NULL,
   	type_term_id		int  NOT NULL,
   	source_term_id  	int  NOT NULL,
	display_name		VARCHAR(64),
   	rank 			SMALLINT  DEFAULT 0  NOT NULL,
	PRIMARY KEY (seqfeature_id),
	UNIQUE (bioentry_id,type_term_id,source_term_id,rank)
) TABLESPACE "SYMGENE_DATA";

CREATE INDEX seqfeature_trm  ON seqfeature(type_term_id) TABLESPACE "SYMGENE_INDEX";
CREATE INDEX seqfeature_fsrc ON seqfeature(source_term_id) TABLESPACE "SYMGENE_INDEX";
-- you may want to add this for mysql because MySQL often is broken with
-- respect to using the composite index for the initial keys
--CREATE INDEX seqfeature_bioentryid ON seqfeature(bioentry_id);

-- seqfeatures can be arranged in containment hierarchies.
-- one can imagine storing other relationships between features,
-- in this case the term_id can be used to type the relationship

CREATE TABLE seqfeature_relationship (
        seqfeature_relationship_id int  NOT NULL ,
   	object_seqfeature_id	int  NOT NULL,
   	subject_seqfeature_id 	int  NOT NULL,
   	term_id 	int  NOT NULL,
   	rank 			INT,
   	PRIMARY KEY (seqfeature_relationship_id),
	UNIQUE (object_seqfeature_id,subject_seqfeature_id,term_id)
) TABLESPACE "SYMGENE_DATA";

CREATE INDEX seqfeaturerel_trm   ON seqfeature_relationship(term_id) TABLESPACE "SYMGENE_INDEX";
CREATE INDEX seqfeaturerel_child ON seqfeature_relationship(subject_seqfeature_id) TABLESPACE "SYMGENE_INDEX";
-- you may want to add this for mysql because MySQL often is broken with
-- respect to using the composite index for the initial keys
--CREATE INDEX seqfeaturerel_parent ON seqfeature_relationship(object_seqfeature_id);

-- for deep (depth > 1) seqfeature relationship trees we need a transitive
-- closure table too
CREATE TABLE seqfeature_path (
   	object_seqfeature_id	int  NOT NULL,
   	subject_seqfeature_id 	int  NOT NULL,
   	term_id 		int  NOT NULL,
	distance	     	int ,
	UNIQUE (object_seqfeature_id,subject_seqfeature_id,term_id,distance)
) TABLESPACE "SYMGENE_DATA";

CREATE INDEX seqfeaturepath_trm   ON seqfeature_path(term_id) TABLESPACE "SYMGENE_INDEX";
CREATE INDEX seqfeaturepath_child ON seqfeature_path(subject_seqfeature_id) TABLESPACE "SYMGENE_INDEX";
-- you may want to add this for mysql because MySQL often is broken with
-- respect to using the composite index for the initial keys
--CREATE INDEX seqfeaturerel_parent ON seqfeature_path(object_seqfeature_id);

-- tag/value associations - or ontology annotations
CREATE TABLE seqfeature_qualifier_value (
	seqfeature_id 		int  NOT NULL,
   	term_id 		int  NOT NULL,
   	rank 			SMALLINT DEFAULT 0  NOT NULL,
   	value  			VARCHAR2(100) NOT NULL,
   	PRIMARY KEY (seqfeature_id,term_id,rank)
) TABLESPACE "SYMGENE_DATA";

CREATE INDEX seqfeaturequal_trm ON seqfeature_qualifier_value(term_id) TABLESPACE "SYMGENE_INDEX";
   
-- DBXrefs for features. This is necessary for genome oriented viewpoints,
-- where you have a few have long sequences (contigs, or chromosomes) with many
-- features on them. In that case the features are the semantic scope for
-- their annotation bundles, not the bioentry they are attached to.

CREATE TABLE seqfeature_dbxref ( 
       	seqfeature_id      int  NOT NULL,
       	dbxref_id          int  NOT NULL,
  	rank  		   SMALLINT,
	PRIMARY KEY (seqfeature_id,dbxref_id)
) TABLESPACE "SYMGENE_DATA";

CREATE INDEX feadblink_dbx  ON seqfeature_dbxref(dbxref_id) TABLESPACE "SYMGENE_INDEX";

-- basically we model everything as potentially having
-- any number of locations, ie, a split location. SimpleLocations
-- just have one location. We need to have a location id for the qualifier
-- associations of fuzzy locations.

-- please do not try to model complex assemblies with this thing. It wont
-- work. Check out the ensembl schema for this.

-- we allow nulls for start/end - this is useful for fuzzies as
-- standard range queries will not be included

-- for remote locations, the join to make is to DBXref
-- the FK to term is a possibility to store the type of the
-- location for determining in one hit whether it's a fuzzy or not

CREATE TABLE location (
	location_id		int  NOT NULL ,
   	seqfeature_id		int  NOT NULL,
	dbxref_id		int ,
	term_id			int ,
   	start_pos              	int,
   	end_pos                	int,
   	strand             	INT NOT NULL,
   	rank          		SMALLINT DEFAULT 0  NOT NULL,
	PRIMARY KEY (location_id),
   	UNIQUE (seqfeature_id, rank)
) TABLESPACE "SYMGENE_DATA";

CREATE INDEX seqfeatureloc_start ON location(start_pos, end_pos) TABLESPACE "SYMGENE_INDEX";
CREATE INDEX seqfeatureloc_dbx   ON location(dbxref_id) TABLESPACE "SYMGENE_INDEX";
CREATE INDEX seqfeatureloc_trm   ON location(term_id) TABLESPACE "SYMGENE_INDEX";

-- location qualifiers - mainly intended for fuzzies but anything
-- can go in here
-- some controlled vocab terms have slots;
-- fuzzies could be modeled as min_start(5), max_start(5)
-- 
-- there is no restriction on extending the fuzzy ontology
-- for your own nefarious aims, although the bio* apis will
-- most likely ignore these
CREATE TABLE location_qualifier_value (
	location_id		int  NOT NULL,
   	term_id 		int  NOT NULL,
   	value  			VARCHAR(255) NOT NULL,
   	int_value 		int,
	PRIMARY KEY (location_id,term_id)
) TABLESPACE "SYMGENE_DATA";

CREATE INDEX locationqual_trm ON location_qualifier_value(term_id) TABLESPACE "SYMGENE_INDEX";

--
-- Create the foreign key constraints
--

-- ontology term

ALTER TABLE term ADD CONSTRAINT FKont_term
	FOREIGN KEY (ontology_id) REFERENCES ontology(ontology_id)
	ON DELETE CASCADE;

-- term synonyms

ALTER TABLE term_synonym ADD CONSTRAINT FKterm_syn
	FOREIGN KEY (term_id) REFERENCES term(term_id)
	ON DELETE CASCADE;

-- term_dbxref

ALTER TABLE term_dbxref ADD CONSTRAINT FKdbxref_trmdbxref
       	FOREIGN KEY (dbxref_id) REFERENCES dbxref(dbxref_id)
	ON DELETE CASCADE;
ALTER TABLE term_dbxref ADD CONSTRAINT FKterm_trmdbxref
      FOREIGN KEY (term_id) REFERENCES term(term_id)
	ON DELETE CASCADE;

-- term_relationship

ALTER TABLE term_relationship ADD CONSTRAINT FKtrmsubject_trmrel
	FOREIGN KEY (subject_term_id) REFERENCES term(term_id)
	ON DELETE CASCADE;
ALTER TABLE term_relationship ADD CONSTRAINT FKtrmpredicate_trmrel
       	FOREIGN KEY (predicate_term_id) REFERENCES term(term_id)
	ON DELETE CASCADE;
ALTER TABLE term_relationship ADD CONSTRAINT FKtrmobject_trmrel
       	FOREIGN KEY (object_term_id) REFERENCES term(term_id)
	ON DELETE CASCADE;
ALTER TABLE term_relationship ADD CONSTRAINT FKterm_trmrel
       	FOREIGN KEY (ontology_id) REFERENCES ontology(ontology_id)
	ON DELETE CASCADE;

-- term_path

ALTER TABLE term_path ADD CONSTRAINT FKtrmsubject_trmpath
	FOREIGN KEY (subject_term_id) REFERENCES term(term_id)
	ON DELETE CASCADE;
ALTER TABLE term_path ADD CONSTRAINT FKtrmpredicate_trmpath
       	FOREIGN KEY (predicate_term_id) REFERENCES term(term_id)
	ON DELETE CASCADE;
ALTER TABLE term_path ADD CONSTRAINT FKtrmobject_trmpath
       	FOREIGN KEY (object_term_id) REFERENCES term(term_id)
	ON DELETE CASCADE;
ALTER TABLE term_path ADD CONSTRAINT FKontology_trmpath
       	FOREIGN KEY (ontology_id) REFERENCES ontology(ontology_id)
	ON DELETE CASCADE;

-- taxon, taxon_name

-- unfortunately, we can't constrain parent_taxon_id as it is violated
-- occasionally by the downloads available from NCBI
-- ALTER TABLE taxon ADD CONSTRAINT FKtaxon_taxon
--         FOREIGN KEY (parent_taxon_id) REFERENCES taxon(taxon_id);
ALTER TABLE taxon_name ADD CONSTRAINT FKtaxon_taxonname
        FOREIGN KEY (taxon_id) REFERENCES taxon(taxon_id)
        ON DELETE CASCADE;

-- bioentry

ALTER TABLE bioentry ADD CONSTRAINT FKtaxon_bioentry
	FOREIGN KEY (taxon_id) REFERENCES taxon(taxon_id);
ALTER TABLE bioentry ADD CONSTRAINT FKbiodatabase_bioentry
	FOREIGN KEY (biodatabase_id) REFERENCES biodatabase(biodatabase_id);

-- bioentry_relationship

ALTER TABLE bioentry_relationship ADD CONSTRAINT FKterm_bioentryrel
	FOREIGN KEY (term_id) REFERENCES term(term_id);
ALTER TABLE bioentry_relationship ADD CONSTRAINT FKparentent_bioentryrel
	FOREIGN KEY (object_bioentry_id) REFERENCES bioentry(bioentry_id)
	ON DELETE CASCADE;
ALTER TABLE bioentry_relationship ADD CONSTRAINT FKchildent_bioentryrel
	FOREIGN KEY (subject_bioentry_id) REFERENCES bioentry(bioentry_id)
	ON DELETE CASCADE;

-- bioentry_path

ALTER TABLE bioentry_path ADD CONSTRAINT FKterm_bioentrypath
	FOREIGN KEY (term_id) REFERENCES term(term_id);
ALTER TABLE bioentry_path ADD CONSTRAINT FKparentent_bioentrypath
	FOREIGN KEY (object_bioentry_id) REFERENCES bioentry(bioentry_id)
	ON DELETE CASCADE;
ALTER TABLE bioentry_path ADD CONSTRAINT FKchildent_bioentrypath
	FOREIGN KEY (subject_bioentry_id) REFERENCES bioentry(bioentry_id)
	ON DELETE CASCADE;

-- biosequence

ALTER TABLE biosequence ADD CONSTRAINT FKbioentry_bioseq
	FOREIGN KEY (bioentry_id) REFERENCES bioentry(bioentry_id)
	ON DELETE CASCADE;

-- comment

ALTER TABLE comment_ ADD CONSTRAINT FKbioentry_comment
	FOREIGN KEY(bioentry_id) REFERENCES bioentry(bioentry_id)
	ON DELETE CASCADE;

-- bioentry_dbxref

ALTER TABLE bioentry_dbxref ADD CONSTRAINT FKbioentry_dblink
        FOREIGN KEY (bioentry_id) REFERENCES bioentry(bioentry_id)
	ON DELETE CASCADE;
ALTER TABLE bioentry_dbxref ADD CONSTRAINT FKdbxref_dblink
       	FOREIGN KEY (dbxref_id) REFERENCES dbxref(dbxref_id)
	ON DELETE CASCADE;

-- dbxref_qualifier_value

ALTER TABLE dbxref_qualifier_value ADD CONSTRAINT FKtrm_dbxrefqual
	FOREIGN KEY (term_id) REFERENCES term(term_id);
ALTER TABLE dbxref_qualifier_value ADD CONSTRAINT FKdbxref_dbxrefqual
	FOREIGN KEY (dbxref_id) REFERENCES dbxref(dbxref_id)
	ON DELETE CASCADE;

-- bioentry_reference

ALTER TABLE bioentry_reference ADD CONSTRAINT FKbioentry_entryref
	FOREIGN KEY (bioentry_id) REFERENCES bioentry(bioentry_id)
	ON DELETE CASCADE;
ALTER TABLE bioentry_reference ADD CONSTRAINT FKreference_entryref
	FOREIGN KEY (reference_id) REFERENCES reference(reference_id)
	ON DELETE CASCADE;

-- bioentry_qualifier_value

ALTER TABLE bioentry_qualifier_value ADD CONSTRAINT FKbioentry_entqual
	FOREIGN KEY (bioentry_id) REFERENCES bioentry(bioentry_id)
	ON DELETE CASCADE;
ALTER TABLE bioentry_qualifier_value ADD CONSTRAINT FKterm_entqual
	FOREIGN KEY (term_id) REFERENCES term(term_id);

-- reference 
ALTER TABLE reference ADD CONSTRAINT FKdbxref_reference
      FOREIGN KEY ( dbxref_id ) REFERENCES dbxref ( dbxref_id ) ;

-- seqfeature

ALTER TABLE seqfeature ADD CONSTRAINT FKterm_seqfeature
	FOREIGN KEY (type_term_id) REFERENCES term(term_id);
ALTER TABLE seqfeature ADD CONSTRAINT FKsourceterm_seqfeature
	FOREIGN KEY (source_term_id) REFERENCES term(term_id);
ALTER TABLE seqfeature ADD CONSTRAINT FKbioentry_seqfeature
	FOREIGN KEY (bioentry_id) REFERENCES bioentry(bioentry_id)
	ON DELETE CASCADE;

-- seqfeature_relationship

ALTER TABLE seqfeature_relationship ADD CONSTRAINT FKterm_seqfeatrel
	FOREIGN KEY (term_id) REFERENCES term(term_id);
ALTER TABLE seqfeature_relationship ADD CONSTRAINT FKparentfeat_seqfeatrel
	FOREIGN KEY (object_seqfeature_id) REFERENCES seqfeature(seqfeature_id)
	ON DELETE CASCADE;
ALTER TABLE seqfeature_relationship ADD CONSTRAINT FKchildfeat_seqfeatrel
	FOREIGN KEY (subject_seqfeature_id) REFERENCES seqfeature(seqfeature_id)
	ON DELETE CASCADE;

-- seqfeature_path

ALTER TABLE seqfeature_path ADD CONSTRAINT FKterm_seqfeatpath
	FOREIGN KEY (term_id) REFERENCES term(term_id);
ALTER TABLE seqfeature_path ADD CONSTRAINT FKparentfeat_seqfeatpath
	FOREIGN KEY (object_seqfeature_id) REFERENCES seqfeature(seqfeature_id)
	ON DELETE CASCADE;
ALTER TABLE seqfeature_path ADD CONSTRAINT FKchildfeat_seqfeatpath
	FOREIGN KEY (subject_seqfeature_id) REFERENCES seqfeature(seqfeature_id)
	ON DELETE CASCADE;

-- seqfeature_qualifier_value
ALTER TABLE seqfeature_qualifier_value ADD CONSTRAINT FKterm_featqual
	FOREIGN KEY (term_id) REFERENCES term(term_id);
ALTER TABLE seqfeature_qualifier_value ADD CONSTRAINT FKseqfeature_featqual
	FOREIGN KEY (seqfeature_id) REFERENCES seqfeature(seqfeature_id)
	ON DELETE CASCADE;

-- seqfeature_dbxref

ALTER TABLE seqfeature_dbxref ADD CONSTRAINT FKseqfeature_feadblink
        FOREIGN KEY (seqfeature_id) REFERENCES seqfeature(seqfeature_id)
	ON DELETE CASCADE;
ALTER TABLE seqfeature_dbxref ADD CONSTRAINT FKdbxref_feadblink
       	FOREIGN KEY (dbxref_id) REFERENCES dbxref(dbxref_id)
	ON DELETE CASCADE;

-- location

ALTER TABLE location ADD CONSTRAINT FKseqfeature_location
	FOREIGN KEY (seqfeature_id) REFERENCES seqfeature(seqfeature_id)
	ON DELETE CASCADE;
ALTER TABLE location ADD CONSTRAINT FKdbxref_location
	FOREIGN KEY (dbxref_id) REFERENCES dbxref(dbxref_id);
ALTER TABLE location ADD CONSTRAINT FKterm_featloc
	FOREIGN KEY (term_id) REFERENCES term(term_id);

-- location_qualifier_value

ALTER TABLE location_qualifier_value ADD CONSTRAINT FKfeatloc_locqual
	FOREIGN KEY (location_id) REFERENCES location(location_id)
	ON DELETE CASCADE;
ALTER TABLE location_qualifier_value ADD CONSTRAINT FKterm_locqual
	FOREIGN KEY (term_id) REFERENCES term(term_id);


--
-- Triggers for automatic primary key generation and other sanity checks
--

CREATE OR REPLACE TRIGGER BID_location
  BEFORE INSERT
  on location
  -- 
  for each row
BEGIN
IF :new.location_id IS NULL THEN
    SELECT SE_location.nextval INTO :new.location_id FROM DUAL;
END IF;
END;
/


CREATE OR REPLACE TRIGGER BID_seqfeature
  BEFORE INSERT
  on seqfeature
  -- 
  for each row
BEGIN
IF :new.seqfeature_id IS NULL THEN
    SELECT SE_seqfeature.nextval INTO :new.seqfeature_id FROM DUAL;
END IF;
END;
/


CREATE TRIGGER BID_seqfeature_relationship
  BEFORE INSERT
  on seqfeature_relationship
  -- 
  for each row
BEGIN
IF :new.seqfeature_relationship_id IS NULL THEN
    SELECT SE_seqfeature_relationship.nextval INTO :new.seqfeature_relationship_id FROM DUAL;
END IF;
END;
/


CREATE OR REPLACE TRIGGER BID_comment
  BEFORE INSERT
  on comment_
  -- 
  for each row
BEGIN
IF :new.comment_id IS NULL THEN
    SELECT SE_comment_.nextval INTO :new.comment_id FROM DUAL;
END IF;
END;
/


CREATE OR REPLACE TRIGGER BID_reference
  BEFORE INSERT
  on reference
  -- 
  for each row
BEGIN
IF :new.reference_id IS NULL THEN
    SELECT SE_reference.nextval INTO :new.reference_id FROM DUAL;
END IF;
END;
/


CREATE OR REPLACE TRIGGER BID_bioentry_relationship
  BEFORE INSERT
  on bioentry_relationship
  -- 
  for each row
BEGIN
IF :new.bioentry_relationship_id IS NULL THEN
    SELECT SE_bioentry_relationship.nextval INTO :new.bioentry_relationship_id FROM DUAL;
END IF;
END;
/


CREATE OR REPLACE TRIGGER BID_bioentry
  BEFORE INSERT
  on bioentry
  -- 
  for each row
BEGIN
IF :new.bioentry_id IS NULL THEN
    SELECT SE_bioentry.nextval INTO :new.bioentry_id FROM DUAL;
END IF;
-- IF :new.Division IS NULL THEN
--    :new.Division := 'UNK';
-- END IF;
END;
/


CREATE OR REPLACE TRIGGER BID_term
  BEFORE INSERT
  on term
  -- 
  for each row
BEGIN
IF :new.term_id IS NULL THEN
    SELECT SE_term.nextval INTO :new.term_id FROM DUAL;
END IF;
END;
/


CREATE OR REPLACE TRIGGER BID_term_relationship
  BEFORE INSERT
  on term_relationship
  -- 
  for each row
BEGIN
IF :new.term_relationship_id IS NULL THEN
    SELECT SE_term_relationship.nextval INTO :new.term_relationship_id FROM DUAL;
END IF;
END;
/


CREATE OR REPLACE TRIGGER BID_term_path
  BEFORE INSERT
  on term_path
  -- 
  for each row
BEGIN
IF :new.term_path_id IS NULL THEN
    SELECT SE_term_path.nextval INTO :new.term_path_id FROM DUAL;
END IF;
END;
/

CREATE OR REPLACE TRIGGER BID_ontology
  BEFORE INSERT
  on ontology
  -- 
  for each row
BEGIN
IF :new.ontology_id IS NULL THEN
    SELECT SE_ontology.nextval INTO :new.ontology_id FROM DUAL;
END IF;
END;
/


CREATE OR REPLACE TRIGGER BID_taxon
  BEFORE INSERT
  on taxon
  -- 
  for each row
BEGIN
IF :new.taxon_id IS NULL THEN
    SELECT SE_taxon.nextval INTO :new.taxon_id FROM DUAL;
END IF;
END;
/


CREATE OR REPLACE TRIGGER BID_biodatabase
  BEFORE INSERT
  on biodatabase
  -- 
  for each row
BEGIN
IF :new.biodatabase_id IS NULL THEN
    SELECT SE_biodatabase.nextval INTO :new.biodatabase_id FROM DUAL;
END IF;
END;
/


CREATE OR REPLACE TRIGGER BID_dbxref
  BEFORE INSERT
  on dbxref
  -- 
  for each row
BEGIN
IF :new.dbxref_id IS NULL THEN
    SELECT SE_dbxref.nextval INTO :new.dbxref_id FROM DUAL;
END IF;
END;
/

-------------- next part --------------

-------------- next part --------------
DROP SEQUENCE SE_biodatabase;
DROP SEQUENCE SE_taxon;
DROP SEQUENCE SE_ontology;
DROP SEQUENCE SE_term;
DROP SEQUENCE SE_term_relationship;
DROP SEQUENCE SE_term_path;
DROP SEQUENCE SE_bioentry;
DROP SEQUENCE SE_bioentry_relationship;
DROP SEQUENCE SE_dbxref;
DROP SEQUENCE SE_reference;
DROP SEQUENCE SE_comment_;
DROP SEQUENCE SE_seqfeature;
DROP SEQUENCE SE_seqfeature_relationship;
DROP SEQUENCE SE_location;

DROP TABLE biodatabase cascade constraints;
DROP INDEX db_auth ;
DROP TABLE taxon cascade constraints;
DROP INDEX taxparent ;
DROP TABLE taxon_name cascade constraints;
DROP INDEX taxnametaxonid;
DROP INDEX taxnamename;
DROP TABLE ontology cascade constraints;
DROP TABLE term cascade constraints;
DROP INDEX term_ont ;
DROP TABLE term_synonym cascade constraints;
DROP TABLE term_dbxref cascade constraints;
DROP INDEX trmdbxref_dbxrefid ;
DROP TABLE term_relationship cascade constraints;
DROP INDEX trmrel_predicateid ;
DROP INDEX trmrel_objectid;
DROP INDEX trmrel_ontid ;
--DROP INDEX ontrel_subjectid ON term_relationshipsubject_term_id);
DROP TABLE term_path cascade constraints;
DROP INDEX trmpath_predicateid ;
DROP INDEX trmpath_objectid ;
DROP INDEX trmpath_ontid ;
--DROP INDEX trmpath_subjectid ON term_pathsubject_term_id);
DROP TABLE bioentry cascade constraints;
DROP INDEX bioentry_name ;
DROP INDEX bioentry_db   ;
DROP INDEX bioentry_tax  ;
DROP TABLE bioentry_relationship cascade constraints;
DROP INDEX bioentryrel_trm   ;
DROP INDEX bioentryrel_child ;
--DROP INDEX bioentryrel_parent ;
DROP TABLE bioentry_path cascade constraints;
DROP INDEX bioentrypath_trm   ;
DROP INDEX bioentrypath_child ;
--DROP INDEX bioentrypath_parent ON bioentry_pathobject_bioentry_id);
DROP TABLE biosequence cascade constraints; 
DROP TABLE dbxref   cascade constraints;
DROP INDEX dbxref_db  ;
DROP TABLE dbxref_qualifier_value  cascade constraints;
DROP INDEX dbxrefqual_dbx ;
DROP INDEX dbxrefqual_trm ;
DROP TABLE bioentry_dbxref  cascade constraints ;
DROP INDEX dblink_dbx  ;
DROP TABLE reference  cascade constraints;
DROP TABLE bioentry_reference  cascade constraints;
DROP INDEX bioentryref_ref ;
DROP TABLE comment_  cascade constraints;
DROP TABLE bioentry_qualifier_value  cascade constraints;
DROP INDEX bioentryqual_trm ;
DROP TABLE seqfeature  cascade constraints;
DROP INDEX seqfeature_trm  ;
DROP INDEX seqfeature_fsrc ;
--DROP INDEX seqfeature_bioentryid ON seqfeaturebioentry_id);
DROP TABLE seqfeature_relationship  cascade constraints;
DROP INDEX seqfeaturerel_trm   ;
DROP INDEX seqfeaturerel_child ;
--DROP INDEX seqfeaturerel_parent ON seqfeature_relationshipobject_seqfeature_id);
DROP TABLE seqfeature_path  cascade constraints;
DROP INDEX seqfeaturepath_trm   ;
DROP INDEX seqfeaturepath_child ;
--DROP INDEX seqfeaturerel_parent ON seqfeature_pathobject_seqfeature_id);
DROP TABLE seqfeature_qualifier_value  cascade constraints;
DROP INDEX seqfeaturequal_trm ;
DROP TABLE seqfeature_dbxref   cascade constraints;
DROP INDEX feadblink_dbx  ;
DROP TABLE location  cascade constraints;
DROP INDEX seqfeatureloc_start ;
DROP INDEX seqfeatureloc_dbx ;
DROP INDEX seqfeatureloc_trm ;
DROP TABLE location_qualifier_value  cascade constraints;
DROP INDEX locationqual_trm ;
-- Create the foreign key constraints
-------------- next part --------------




More information about the BioSQL-l mailing list