[BioSQL-l] Re: Postgres version support
Jason E. Stewart
jason@openinformatics.com
22 Oct 2002 10:55:17 -0600
--=-=-=
"Hilmar Lapp" <hlapp@gnf.org> writes:
> The MAGEstk folks in fact came up with a XML-schema definition for
> describing a schema, and a corresponding converter to DDL of
> particular flavors (which I believe so far is Pg only, not
> sure). Jason, correct me if this description is too sloppy.
Hey all,
The definition was designed for Genex, and is currently DTD-based
(although the DTD is so simple, it could be turned into a schema with
no effort).
MAGEstk uses that format when it converts a UML model into relational
DB representation.
There is a utility method, xml2sql(), that takes a set of table files
and converts them into SQL DDL for the DB backend currently in use -
to support multiple backends we would simply pass in an active DB
handle and switch on the DB driver type.
Yes, currently Pg is the only supported backend - primarily because we
needed sequences and views for Genex, and MySQL didn't have them, but
now we also use inheritance as well.
The code in xml2sql is pretty simple, but at the moment it isn't very
generic. I'm happy to have other projects use it, and I'd be thrilled
if it became a supported part of BioPerl. I'd be happy to help make it
more generic and maintain it.
I'll include the DTD, and example table file, and the xml2sql()
subroutine from the Genex project.
Cheers,
jas.
--
--=-=-=
Content-Disposition: attachment; filename=xml2sql
Content-Description: xml2sql
=item ($sql,\@all_tables,\@audit_tables,\@all_views = xml2sql(@file_name_list);
Description: A method for converting an XML description of a relation
DB table (using table.dtd) into SQL
Return Value: a list of values:
$sql = SQL string for creating the table
\@all_tables = array ref of table names to be created
\@audit_tables = array ref of table names with audit_fks
\@all_views = array ref of view names to be created
Errors: Will call die() on error
=cut
sub xml2sql {
my (@files,$MASTER_SEQ) = @_;
$MASTER_SEQ = 'GENEX_ID_SEQ' unless defined $MASTER_SEQ;
my @defer_fkeys;
my @defer_views;
my @all_views;
my @all_tables;
my %audit_tables;
my $output;
my $parser = XML::Xerces::DOMParser->new();
my %docs;
my %inherit_tables;
# we pre-parse all the files and save their docs
foreach my $file (@files) {
error(caller=>join(':',(caller())[1,2]),
message=>"couldn't open $file for input")
unless -f $file;
my $ERROR_HANDLER = XML::Xerces::PerlErrorHandler->new();
$parser->setErrorHandler($ERROR_HANDLER);
$parser->parse($file);
my $doc = $parser->getDocument();
$file =~ s|.*/||;
$file =~ s|\.xml||;
$docs{$file} = $doc;
}
# Since we use Postgres' inheritance system, we have to order
# the tables so that child tables get created after parent tables.
# Step one is to find all the tables which do not inherit and add
# them to a tmp list, and file away those that do inherit in a
# dependancy list hashed on who they inherit from
my @inheritance_tmp;
my %depends;
foreach my $file (keys %docs) {
my $doc = $docs{$file};
my $table = $doc->getDocumentElement();
my $inherits_from = $table->getAttribute('inherits_from')
or die "Couldn't get inheritance for table: $file";
if ($inherits_from eq 'none') {
# we don't inherit from anything, so order is irrelevant
push (@inheritance_tmp,$file);
} else {
# store the table as a dependancy of the table it inherits from
push(@{$depends{uc($inherits_from)}},$file);
# keep track of all tables that are inherited from
$inherit_tables{$inherits_from}++;
}
}
# now we go through the tmp list, and add the table, and
# all tables that inherited from it
my @inheritance_set;
foreach my $file (@inheritance_tmp) {
push(@inheritance_set,$docs{$file});
foreach my $dep_file (@{$depends{uc($file)}}) {
push(@inheritance_set,$docs{$dep_file});
}
}
# Create a master sequence for all records in the db. The sequence
# generator itself is an 8-byte integer, with a max value of
# roughly 9 quintillion, but in PostgreSQL, v.7.1.* use datatype
# integer for storing the value, which is 4-byte, with a limit of
# roughly 2 billion. PostgreSQL v.7.2.* uses the datatype
# biginteger, which is 8-byte, and thus meets the same 9
# quintillion limit.
$output .= <<SQL;
CREATE SEQUENCE "$MASTER_SEQ";
SQL
foreach my $doc (@inheritance_set) {
my $table = $doc->getDocumentElement();
my $table_name = $table->getAttribute('name')
or die "Couldn't get name for table: " . $table->serialize;
push(@all_tables,$table_name);
my @column_sql;
my $has_ro_groupname = 0;
my @columns;
my $inherits_from = $table->getAttribute('inherits_from')
or die "Couldn't get inheritance for table: " . $table->serialize;
if ($table->getAttribute('type') eq 'VIEW') {
@columns = $table->getElementsByTagName('column');
my %from;
foreach my $column (@columns) {
my %attrs = $column->getAttributes();
$has_ro_groupname = 1
if $attrs{name} eq 'ro_groupname';
die "No name for column in $table_name"
unless exists $attrs{name};
die "No source_table for column $attrs{name} in VIEW $table_name"
unless exists $attrs{source_table};
push(@column_sql,qq[\t$attrs{source_table}."$attrs{name}"]);
$from{$attrs{source_table}}++;
}
my $column_sql = join(",\n", @column_sql);
my $from = join(", ", keys %from);
my $where = $table->getAttribute('where');
my $view .= <<SQL;
CREATE VIEW $table_name AS
SELECT $column_sql
FROM $from
WHERE $where;
SQL
# save the view data to be added to the output after all tables
# are created
push(@defer_views,$view);
# save the view name to return to the caller
push(@all_views,$table_name);
} else {
@columns = $table->getElementsByTagName('column');
foreach my $column (@columns) {
my %attrs = $column->getAttributes();
die "No name for column in $table_name"
unless exists $attrs{name};
die "No type for column $attrs{name} in $table_name"
unless exists $attrs{type};
my $null = '';
$null = 'NOT NULL' if $attrs{not_null} eq 'true';
push(@column_sql,qq[\t"$attrs{name}"\t$attrs{type} $null]);
$has_ro_groupname = 1
if $attrs{name} eq 'ro_groupname';
# we add the table to the Audit hash if it has an audit fkey
$audit_tables{$table_name}++
if $attrs{name} eq 'audit_fk' and $table_name ne 'Audit';
# we add the table to the Audit hash if it
# inherits from a table with an audit fkey
$audit_tables{$table_name}++
if exists $audit_tables{$inherits_from};
}
# do the primary key
my ($pkey) = $table->getElementsByTagName('primary_key');
if (defined $pkey) {
my %attrs = $pkey->getAttributes();
die "No column_id for pkey in $table_name"
unless exists $attrs{column_id};
push(@column_sql,"\tPRIMARY KEY ($attrs{column_id})");
}
###
### PostgreSQL inheritance cannot see child table IDs when linking,
### so we have to be cautious when using foreign key constraints.
### We only allow constraints to tables that aren't inherited from
###
my @fkeys = $table->getElementsByTagName('foreign_key');
foreach my $column (@fkeys) {
my %attrs = $column->getAttributes();
# don't include fkeys to tables which are inherited from
next if $inherit_tables{$attrs{foreign_table}};
# some fkeys refer to views which, not being tables, cannot
# have FOREIGN KEY clauses which point to them
next if $attrs{write_sql} eq 'false';
die "No column_id for fkey in $table_name"
unless exists $attrs{column_id};
die "No foreign_table for fkey $attrs{column_id} in $table_name"
unless exists $attrs{foreign_table};
die "No foreign_table_pkey for fkey $attrs{column_id} in $table_name"
unless exists $attrs{foreign_table_pkey};
# split on whitespace, join on commas
my $constraint = "$ {table_name}_$attrs{column_id}";
my $fkey = <<SQL;
ALTER TABLE $table_name ADD CONSTRAINT $constraint
FOREIGN KEY ($attrs{column_id})
REFERENCES $attrs{foreign_table} ($attrs{foreign_table_pkey});
SQL
# we store the fkey for later, after all the tables are created
# that way, we don't have to worry about creation order
push(@defer_fkeys,$fkey);
}
my @uniques = $table->getElementsByTagName('unique');
foreach my $column (@uniques) {
my %attrs = $column->getAttributes();
die "No column_ids for unique in $table_name"
unless exists $attrs{column_ids};
# split on whitespace, join on commas
my $string = join(', ',split(' ',$attrs{column_ids}));
push(@column_sql,"\tUNIQUE ($string)");
}
my $column_sql = join(",\n", @column_sql);
$output .= <<SQL;
CREATE TABLE $table_name (
$column_sql
)
SQL
$output .= qq[INHERITS ($inherits_from)]
if ($inherits_from ne 'none');
$output .= ";\n";
} # if VIEW
# if this table or view has security the selectable view we create
# will enforce it in the where clause, otherwise it's public
my $WHERE = '';
my $from = qq[$table_name];
if ($has_ro_groupname) {
$WHERE = 'WHERE ro_groupname=GroupLink.groupname AND GroupLink.username=user()';
$from .= ', GroupLink'
unless $table_name eq 'GroupLink';
}
my @inherited_columns;
if ($inherits_from ne 'none') {
@inherited_columns = $docs{$inherits_from}->getElementsByTagName('column');
}
my $columns = join(",\n", map {$table_name . '."' . $_->getAttribute('name'). '"'}
@columns, @inherited_columns);
my $view = "$ {table_name}_view";
my $sql = <<SQL;
CREATE VIEW $view AS
SELECT $columns
FROM $from
$WHERE;
GRANT SELECT ON $view TO PUBLIC;
SQL
# we defer creation of the view until after all tables are created
# this way we don't need to know when GroupLink is created
push(@defer_views,$sql);
# save the view name to return to the caller
push(@all_views,$view);
# reset the list
@column_sql = ();
my @indices = $table->getElementsByTagName('index');
foreach my $index (@indices) {
my %attrs = $index->getAttributes();
die "No name for index in $table_name"
unless exists $attrs{name};
die "No column_id for index $attrs{name} in $table_name"
unless exists $attrs{column_id};
push(@column_sql,qq[CREATE INDEX $attrs{name} on $table_name ( "$attrs{column_id}" );]);
}
# now we only grant priveleges to genex
push(@column_sql,qq[GRANT ALL on $table_name to genex;])
unless $table->getAttribute('type') eq 'VIEW';
$output .= join("\n", @column_sql) . "\n";
}
# now we add on all the pieces that needed to wait until all tables
# were created
$output .= join("\n",@defer_views,@defer_fkeys);
return ($output,\@all_tables,[keys %audit_tables],\@all_views);
}
--=-=-=
Content-Disposition: attachment; filename=table.dtd
Content-Description: table.dtd
<!-- ====================================== -->
<!-- RDBMS Table Definition DTD (table.dtd) -->
<!-- ====================================== -->
<!-- Copyright 2001-2002 Jason E. Stewart
All rights reserved -->
<!-- Table Type Entities -->
<!ENTITY data_table "DATA" >
<!ENTITY validation_table
"VALIDATION" >
<!ENTITY subset_table "SUBSET" >
<!ENTITY linking_table "LINKING" >
<!ENTITY system_table "SYSTEM" >
<!ENTITY view "VIEW" >
<!-- Foreign Key Type Entities -->
<!ENTITY fkey_linking "LINKING_TABLE" >
<!ENTITY fkey_lookup "LOOKUP_TABLE" >
<!ENTITY fkey_oto "ONE_TO_ONE" >
<!ENTITY fkey_mto "MANY_TO_ONE" >
<!ELEMENT table (column|
unique|
index|
linking_keys|
foreign_key|
primary_key)* >
<!ATTLIST table
type CDATA #REQUIRED
name CDATA #REQUIRED
comment CDATA #IMPLIED
where CDATA #IMPLIED
inherits_from
CDATA "none"
can_self_reference
(true|false) "false"
is_abstract
(true|false) "false" >
<!ELEMENT column EMPTY >
<!ATTLIST column
name ID #REQUIRED
full_name CDATA #REQUIRED
type CDATA #REQUIRED
comment CDATA #IMPLIED
not_null (true|false) "false"
source_table
CDATA #IMPLIED >
<!ELEMENT unique EMPTY >
<!ATTLIST unique
column_ids IDREFS #REQUIRED >
<!ELEMENT primary_key EMPTY >
<!ATTLIST primary_key
column_id IDREF #REQUIRED
serial (true|false) "true" >
<!--
the write_sql attribute enables us to indicate where table
references exist, but we don't want them to be defined by an
actual FOREIGN KEY constraint in the DB. This will enable the
API to have a getter method for this value, but it just wont'
have a constraint in the DB.
the can_self_reference attribute is for those cases when a table
has a foreign key to itself, and that foreign key is permitted
to point to the same object. For example, this happens when
groups are used for setting permissions, and the groups themselves
have a group that defines the permissions for who can add or delete
members from the group.
-->
<!ELEMENT foreign_key EMPTY >
<!ATTLIST foreign_key
column_id IDREF #REQUIRED
foreign_table
CDATA #REQUIRED
foreign_table_pkey
CDATA #REQUIRED
fkey_type CDATA #REQUIRED
can_cascade
(true|false) "false"
write_sql (true|false) "true"
can_self_reference
(true|false) "false" >
<!ELEMENT linking_keys EMPTY >
<!ATTLIST linking_keys
link1 IDREF #REQUIRED
link2 IDREF #REQUIRED >
<!ELEMENT index EMPTY >
<!ATTLIST index
name CDATA #REQUIRED
column_id IDREF #REQUIRED >
<!--
Local Variables:
dtd-xml-flag: t
End:
-->
--=-=-=
Content-Type: text/xml
Content-Disposition: attachment; filename=PhysicalBioAssay.xml
Content-Description: Physical BioAssay table
<?xml version="1.0" standalone="no"?>
<!DOCTYPE table SYSTEM "../../DTD/table.dtd">
<table name="PhysicalBioAssay"
type="&data_table;"
comment=" The PhysicalBioAssay table is used in two related, but
distinct fashions: to represent the primary data for gene
expression experiments and to represent processed data such as
averaged values across replicates or ratio values for treatment
vs. controls. In representing the primary data, it describes
an instance of an ArrayDesign that has been used to assay gene
expression levels in a preparation of the mRNA from a sample
that has been subjected to a specific treatment in the context
of an experiment. It contains detailed information as to the
treatment and protocols used on this ArrayDesign instance. The
actual expression levels for each spot are stored in the
PhysicalBioAssaySpots table. In general, the PhysicalBioAssay
table has the same relationship to the PhysicalBioAssaySpots
table as the ArrayDesign table has to the ArrayDesignSpots
table. However, there several cases in which the relationship
breaks down to some extent. First, there may be
PhysicalBioAssays for which no ArrayDesign is given. Second, for
spots that represent controls on a given ArrayDesign, we may
choose not to store expression levels for the control spots.
Finally, in cases of ArrayDesigns with replicate spots (i.e. the
same Reporter is used in different locations on the
ArrayDesign), replicates will be broken out into separate
PhysicalBioAssay rows. For example, an ArrayDesign with 1000
distinct Reporters, each of which has been replicated 2
times on the ArrayDesign will have 3000 corresponding
ArrayDesignSpot rows, while a given instance of the ArrayDesign
will be represented by three distinct PhysicalBioAssay rows,
each of which has 1000 corresponding PhysicalBioAssaySpots
rows. A related example is when mutiple channels are used to
detect the signal of the expression level for each spot. In this
case, an ArrayDesign that has 1000 distinct Reporters may
be represented by two or more PhysicalBioAssay rows, each with
1000 PhysicalBioAssaySpot rows representing the expression level
reported by one channel. PhysicalBioAssays that represent
primary data will be grouped into sets around the ReplicateDef
table. This set of primary data may then be used to define
derived PhysicalBioAssays representing the averaged values
across members of the replicate set. Further, these
ReplicateDefs can be used to describe TreatmentDefs which define
the levels of the treatment and associated control
ReplicateDefs. The TreatmentDef may reference a third type of
PhysicalBioAssay that is derived from the ratio between the
averaged values for the treatment replicates and the averaged
values of the control replicates.">
<column name="pba_pk"
full_name="Accession Number"
type="serial"
comment=""/>
<column name="name"
full_name="Array Name"
type="varchar(128)"
comment=""/>
<column name="description"
full_name="Description"
type="text"
comment=" a more verbose description of the purpose of this
PhysicalBioAssay"/>
<column name="es_fk"
full_name="Primary Experiment Set"
type="int4"
not_null="true"
comment=" the primary experiment in which the PhysicalBioAssay
was taken (the ExperimentSet in which it was submitted)"/>
<foreign_key column_id="es_fk"
foreign_table ="ExperimentSet"
foreign_table_pkey ="es_pk"
fkey_type ="&fkey_mto;"/>
<column name="array_fk"
full_name="Array"
not_null="true"
type="int4"
comment="The chip or array used with this assay"/>
<foreign_key column_id="array_fk"
foreign_table ="Array"
foreign_table_pkey ="array_pk"
fkey_type ="&fkey_oto;"/>
<column name="provider_con_fk"
full_name="Data Provider"
type="int4"
comment=" the researcher that provided the array info it is
important to destinquish between this and the rw_groupname
because if data is imported from another DB then the owner of
the data will not be the individual that submitted it."/>
<foreign_key column_id="provider_con_fk"
foreign_table ="Contact"
foreign_table_pkey ="con_pk"
fkey_type ="&fkey_oto;"/>
<column name="ro_groupname"
full_name="Read-Only Group Name"
not_null="true"
type="name"
comment="The group with permission to view this data"/>
<foreign_key column_id="ro_groupname"
foreign_table ="GroupSec"
foreign_table_pkey ="name"
fkey_type ="&fkey_oto;"/>
<column name="rw_groupname"
full_name="Read/Write Group Name"
not_null="true"
type="name"
comment="The group with permission to modify this data"/>
<foreign_key column_id="rw_groupname"
foreign_table ="GroupSec"
foreign_table_pkey ="name"
fkey_type ="&fkey_oto;"/>
<column name="audit_fk"
full_name="Audit"
not_null="true"
type="int4"
comment="The audit entry when this data was last modified"/>
<foreign_key column_id="audit_fk"
foreign_table ="Audit"
foreign_table_pkey ="audit_pk"
fkey_type ="&fkey_oto;"/>
<primary_key column_id="pba_pk"/>
</table>
--=-=-=--