[MOBY-guts] biomoby commit
Paul Gordon
gordonp at dev.open-bio.org
Thu Jun 28 16:45:45 UTC 2007
gordonp
Thu Jun 28 12:45:44 EDT 2007
Update of /home/repository/moby/moby-live/Java/src/main/ca/ucalgary/seahawk/services
In directory dev.open-bio.org:/tmp/cvs-serv4488/src/main/ca/ucalgary/seahawk/services
Modified Files:
MobyClient.java MobyComplexBuilder.java
Added Files:
IterativeMatchResult.java RegexParser.java
Log Message:
Major changes to Seahawk 'services' package to make MOB rule parsing much more powerful
moby-live/Java/src/main/ca/ucalgary/seahawk/services IterativeMatchResult.java,NONE,1.1 RegexParser.java,NONE,1.1 MobyClient.java,1.14,1.15 MobyComplexBuilder.java,1.9,1.10
===================================================================
RCS file: /home/repository/moby/moby-live/Java/src/main/ca/ucalgary/seahawk/services/MobyClient.java,v
retrieving revision 1.14
retrieving revision 1.15
diff -u -r1.14 -r1.15
--- /home/repository/moby/moby-live/Java/src/main/ca/ucalgary/seahawk/services/MobyClient.java 2007/06/08 20:30:21 1.14
+++ /home/repository/moby/moby-live/Java/src/main/ca/ucalgary/seahawk/services/MobyClient.java 2007/06/28 16:45:44 1.15
@@ -37,6 +37,7 @@
public static final String DATA_MAPPING_XML_RESOURCE = "ca/ucalgary/seahawk/resources/mobyBuilderRules.xml";
public static final String RESOURCE_SYSTEM_PROPERTY = "seahawk.rules";
public static final String RULE_SET_TAG = "object";
+ public static final String RULE_NAME_ATTR = "name";
public static final String PREFIX_TAG = "prefix";
public static final String PREFIX_ATTR = "value";
public static final String ARTICLENAME_RULE_TAG = "articlename";
@@ -47,6 +48,8 @@
public static final String DATATYPE_RULE_ATTR = "value";
public static final String MEMBER_RULE_TAG = "member";
public static final String MEMBER_RULE_ATTR = "value";
+ public static final String MEMBERS_RULE_TAG = "inheritMembers";
+ public static final String MEMBERS_RULE_ATTR = "rule";
public static final String URL_REGEX_TAG = "url_regex";
public static final String REGEX_TAG = "regex";
public static final String XPATH_TAG = "xpath";
@@ -55,6 +58,7 @@
public static final String WHITESPACE_ATTR_NORMALIZE_VAL = "normalize";
public static final String WHITESPACE_ATTR_STRIP_FLANKING_VAL = "flanking";
public static final String WHITESPACE_ATTR_KEEP_VAL = "keep";
+ public static final String DATATYPE_ATTR = "datatype";
public static final String ENCODING_ATTR = "encoding";
public static final String ENCODING_ATTR_BASE64_VAL = "Base64";
public static final String ENCODING_ATTR_NONE_VAL = "none";
@@ -67,6 +71,8 @@
private HashMap xpathMap;
private HashMap urlRegexMap;
private Map<Pattern,MobyComplexBuilder> regexMap;
+ private Map<String,MobyComplexBuilder> builderNameMap;
+ private Map<String,Pattern> patternNameMap;
private URL dataMappingXMLURL;
private DocumentBuilder docBuilder;
private static org.apache.log4j.Logger logger = org.apache.log4j.Logger.getLogger(MobyClient.class);
@@ -88,6 +94,8 @@
xpathMap = new HashMap();
urlRegexMap = new HashMap();
regexMap = new HashMap<Pattern,MobyComplexBuilder>();
+ builderNameMap = new HashMap<String,MobyComplexBuilder>();
+ patternNameMap = new HashMap<String,Pattern>();
nsContext = new NamespaceContextImpl();
DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
@@ -201,6 +209,8 @@
continue;
}
+ String ruleName = ruleSet.getAttribute(RULE_NAME_ATTR);
+
Vector regexStrings = new Vector();
Vector urlRegexStrings = new Vector();
Vector xpathStrings = new Vector();
@@ -250,7 +260,7 @@
dataTypeString = getDataType(ruleMember);
}
else if(isMemberRule(ruleMember)){
- addMemberMapping(ruleMember, memberMap);
+ addMemberMapping(ruleMember, memberMap, dataTypeString);
}
// TODO add other production rules
else{
@@ -279,7 +289,7 @@
addXPathMapping((String) xpathStrings.elementAt(j), namespaceMap, articleNameString);
}
for(int j = 0; j < regexStrings.size(); j++){
- addRegexMapping((String) regexStrings.elementAt(j), namespaceMap, articleNameString);
+ addRegexMapping((String) regexStrings.elementAt(j), namespaceMap, articleNameString, ruleName);
}
for(int j = 0; j < urlRegexStrings.size(); j++){
addURLRegexMapping((String) urlRegexStrings.elementAt(j), namespaceMap, articleNameString);
@@ -296,7 +306,7 @@
addXPathMapping((String) xpathStrings.elementAt(j), namespaceMap, dataTypeString, memberMap, articleNameString);
}
for(int j = 0; j < regexStrings.size(); j++){
- addRegexMapping((String) regexStrings.elementAt(j), namespaceMap, dataTypeString, memberMap, articleNameString);
+ addRegexMapping((String) regexStrings.elementAt(j), namespaceMap, dataTypeString, memberMap, articleNameString, ruleName);
}
for(int j = 0; j < urlRegexStrings.size(); j++){
addURLRegexMapping((String) urlRegexStrings.elementAt(j), namespaceMap, dataTypeString, memberMap, articleNameString);
@@ -330,8 +340,15 @@
return e != null && ARTICLENAME_RULE_TAG.equals(e.getLocalName());
}
+ // Defined or inherited rule spec.
public boolean isMemberRule(Element e){
- return e != null && MEMBER_RULE_TAG.equals(e.getLocalName());
+ return e != null && (MEMBER_RULE_TAG.equals(e.getLocalName()) ||
+ MEMBERS_RULE_TAG.equals(e.getLocalName()));
+ }
+
+ // Inherited rule spec.
+ public boolean isMembersRule(Element e){
+ return e != null && MEMBERS_RULE_TAG.equals(e.getLocalName());
}
public boolean isNamespaceRule(Element e){
@@ -402,19 +419,58 @@
return str;
}
- protected void addMemberMapping(Element memTag, Map<String,String[]> membersMap) throws Exception{
+ protected void addMemberMapping(Element memTag, Map<String,String[]> membersMap, String dataTypeName)
+ throws Exception{
if(!isMemberRule(memTag)){
throw new Exception("Element provided to addMemberMapping (" +
(memTag == null ? null : memTag.getLocalName()) +
") was not a member rule element");
}
-
+ String ruleValue = memTag.getTextContent();
String memberNameKey = memTag.getAttribute(DATATYPE_RULE_ATTR);
+ if(ruleValue == null || ruleValue.length() == 0){
+ System.err.println("Object member " + memberNameKey + " has a blank value rule");
+ }
+
+ String memberDataTypeSetting = memTag.getAttribute(DATATYPE_ATTR);
+ if(isMembersRule(memTag)){
+ String membersRuleName = memTag.getAttribute(MEMBERS_RULE_ATTR);
+ if(membersRuleName == null || membersRuleName.length() == 0){
+ throw new Exception("Attribute "+MEMBERS_RULE_ATTR+" is missing from the " +
+ "member rule tag '" + memTag.getNodeName()+"'");
+ }
+ MobyComplexBuilder membersBuilder = builderNameMap.get(membersRuleName);
+ if(membersBuilder == null){
+ throw new Exception("Attribute "+MEMBERS_RULE_ATTR+" refers to a rule (" +
+ membersRuleName+") that does not exist");
+ }
+ MobyDataType dataType = MobyDataType.getDataType(dataTypeName);
+ if(!dataType.inheritsFrom(membersBuilder.getDataType())){
+ throw new Exception("Data type produced by inherited rule (" + membersRuleName +
+ ") is not a subtype of the current rule (" + dataType.getName() + ")");
+ }
+ // Borrow members from the builder, unless they already exist in the
+ // production rule (i.e. "member" rules override "inheritMembers" rules),
+ // BUT, we need to tell the builder that the regex to match is not
+ // the one from the current rule, but the one from the inherited rule
+ // (applied to the value created by ruleValue)
+ // the string array for the inheritance rule looks like ["ruleName1", "ruleSrcValueExpr1"]
+ if(membersMap.containsKey(MobyComplexBuilder.INHERITED_MEMBERS_SENTINEL)){
+ throw new Exception("More than one member inheritance tag was given, which is illegal");
+ }
+ String[] inheritanceRuleSpecs = new String[2];
+ inheritanceRuleSpecs[0] = membersRuleName;
+ inheritanceRuleSpecs[1] = ruleValue;
+ membersMap.put(MobyComplexBuilder.INHERITED_MEMBERS_SENTINEL, inheritanceRuleSpecs);
+ return;
+ }
+
if(memberNameKey == null || memberNameKey.length() == 0){
throw new Exception("Element provided to addMemberMapping did not " +
"have a non-blank " + DATATYPE_RULE_ATTR +
" attribute as required");
}
+
String memberWhitespaceSetting = memTag.getAttribute(WHITESPACE_ATTR);
if(memberWhitespaceSetting == null || memberWhitespaceSetting.length() == 0){
memberWhitespaceSetting = WHITESPACE_ATTR_KEEP_VAL; // default is to keep whitespace
@@ -429,6 +485,7 @@
"), overriding with default of " + WHITESPACE_ATTR_KEEP_VAL);
memberWhitespaceSetting = WHITESPACE_ATTR_KEEP_VAL;
}
+
String memberEncodingSetting = memTag.getAttribute(ENCODING_ATTR);
if(memberEncodingSetting == null || memberEncodingSetting.length() == 0){
memberEncodingSetting = ENCODING_ATTR_NONE_VAL; // default is to not encode
@@ -446,13 +503,16 @@
return;
}
- String ruleValue = memTag.getTextContent();
- if(ruleValue == null || ruleValue.length() == 0){
- System.err.println("Object member " + memberNameKey + " has a blank value rule");
- }
- membersMap.put(memberNameKey, new String[]{ruleValue,
+ // Leave memberRuleName blank unless we inherited members via
+ // a rule attribute. If memberRuleName is null,
+ // in another method we will see if any capture value in the member
+ // rule refers to a \p{ruleName} string in the regex
+ String memberRuleName = memTag.getAttribute(MEMBERS_RULE_ATTR);
+ membersMap.put(memberNameKey, new String[]{ruleValue,
+ memberDataTypeSetting,
memberWhitespaceSetting,
- memberEncodingSetting});
+ memberEncodingSetting,
+ memberRuleName});
}
protected void addNamespaceMapping(Element nsTag, Map<String,String> namespaceStrings) throws Exception{
@@ -1016,7 +1076,7 @@
for(int i = 0; i < mobyObj.length; i++){
nsRules.put(mobyObj[i], "$0");
}
- addRegexMapping(regexp, nsRules, articleName);
+ addRegexMapping(regexp, nsRules, articleName, (String) null);
}
public void addURLRegexMapping(String url_regexp, String[] mobyObj, String articleName){ //mobyObj<--mobyNamespaces
@@ -1035,11 +1095,58 @@
/**
* Converts seahawk-specific regex syntax into generic Java syntax (e.g \N becomes a
- * match for any IUPAC DNA character, \P any amino acid)
+ * match for any IUPAC DNA character, \P any amino acid). We pass in the member map so that
+ * this method can populate the last field of rules that inherit complex members (by virtue
+ * of using the capture group whose value is derived from \p{ruleName}), with ruleName
*/
- protected String processRegExp(String regex){
- return regex.replaceAll("\\\\N", "[acgtunxACGTUNX]")
- .replaceAll("\\\\P", "[ARNDCQEGHILKMFPSTWYVBZXarndcqeghilkmfpstwyvbz*]");
+ protected String processRegExp(String regex, Map<String,String[]> membersMap) throws Exception{
+ String returnValue = regex.replaceAll("\\\\N", "[acgtunxACGTUNX]")
+ .replaceAll("\\\\P", "[ARNDCQEGHILKMFPSTWYVBZXarndcqeghilkmfpstwyvbz*]");
+
+ // Now see if there are any references to other patterns (by rule name)
+ // with the \p{} syntax specific to Seahawk
+ Pattern charClassPattern = Pattern.compile("\\\\p\\{([A-Za-z0-9]+)\\}");
+ Matcher charClassMatcher = charClassPattern.matcher(returnValue);
+ Map<Integer,String> capGroup2RuleReference = new HashMap<Integer,String>();
+ while(charClassMatcher.find()){
+ String reference = charClassMatcher.group(1);
+ if(RegexParser.isPosixCharacterClass(reference)){
+ continue;
+ }
+
+ if(!patternNameMap.containsKey(reference)){
+ throw new Exception("\\p{"+reference+"} in regex does not refer " +
+ "to a known Seahawk rule, cannot build the regex");
+ }
+
+ int capGroup = RegexParser.locationToCaptureGroupNumber(regex, charClassMatcher.start(1));
+ if(capGroup > 0){ // sanity check
+ capGroup2RuleReference.put(capGroup, reference);
+ }
+
+ // Replace ref with regex, eliminating any nested capture groups, for efficiency
+ // (otherwise we'd need to shift all the $# refs in the rules map to compensate)
+ returnValue = returnValue.replaceFirst("\\\\p\\{"+reference+"\\}",
+ patternNameMap.get(reference).pattern().replaceAll("\\\\", "\\\\\\\\").replaceAll("\\((?!\\?)", "(?:"));
+ }
+
+ if(!capGroup2RuleReference.isEmpty()){
+ // Update any member rule that uses one of the capture values referring to a
+ // \p{ruleName} reference
+ for(String[] rule: membersMap.values()){
+ for(Integer capGroupNum: capGroup2RuleReference.keySet()){
+ if(rule[0].matches("^\\s*\\$"+capGroupNum+"\\s*$")){
+ // This is where the rule actually gets the subrule reference update,
+ // UNLESS it was already specified (probably by an inheritMembers tag)
+ if(rule[4] == null || rule[4].length() == 0){
+ rule[4] = capGroup2RuleReference.get(capGroupNum);
+ }
+ break;
+ }
+ }
+ }
+ }
+ return returnValue;
}
/**
@@ -1060,13 +1167,13 @@
return url_regex_flexible;
}
- protected void addRegexMapping(String regexp, Map<String,String> nsRules, String articleName){ //nsRules = Map<String ns, String regex_replacement>
+ protected void addRegexMapping(String regexp, Map<String,String> nsRules, String articleName, String ruleName){ //nsRules = Map<String ns, String regex_replacement>
if(nsRules == null || nsRules.size() == 0){
System.err.println("Ignoring empty namespace-only regex rule mappings");
return;
}
- addRegexMapping(regexp, nsRules, (String) null, (Map<String,String[]>) null, articleName);
+ addRegexMapping(regexp, nsRules, (String) null, (Map<String,String[]>) null, articleName, null);
}
protected void addURLRegexMapping(String url_regexp, Map url_nsRules, String articleName){ //nsRules = Map<String ns, String regex_replacement>
@@ -1080,29 +1187,128 @@
public void addRegexMapping(String regexp, Map<String,String> nsRules,
String mobyDataType, Map<String,String[]> membersMap){
- addRegexMapping(regexp, nsRules, mobyDataType, membersMap, null);
+ addRegexMapping(regexp, nsRules, mobyDataType, membersMap, null, null);
}
- public void addRegexMapping(String regexp, Map<String,String> nsRules, String mobyDataType, Map<String,String[]> membersMap, String articleName){
+ /**
+ * This method looks for HAS members in a rule, and if they exist creates new
+ * capture groups around the accessed capture groups so that they can be further
+ * processed in MobyComplexBuilder (e.g. (\d)+ becomes ((\d)+) so we can deconstruct
+ * that part of the regex and capture *each* \d for the HAS (0 or more) member relationship.
+ * We also need to modify all of the rules that access capture groups to bump up their
+ * numbers due to the added capture groups here (transparent to the user).
+ *
+ * membersMap, nsRules, and articleName get their capture group references modified accordingly,
+ * and the new regex is returned with its extra capture groups.
+ */
+ private String handleHASMembers(String regexp, Map<String,String> nsRules, String mobyDataType,
+ Map<String,String[]> membersMap, StringBuffer articleName) throws Exception{
+ if(mobyDataType == null){
+ //System.err.println("Got null data type for regex "+regexp);
+ return regexp; //must be a base object
+ }
+ MobyDataType dataType = MobyDataType.getDataType(mobyDataType, SeahawkOptions.getRegistry());
+ if(dataType == null){
+ throw new Exception("Cannot find definition of data type "+mobyDataType+
+ " in the ontology, therefore the rule cannot be properly parsed");
+ }
+ MobyRelationship[] memberRelationships = dataType.getChildren();
+
+ String newRegexp = processRegExp(regexp, membersMap);
+ Map<Integer,Boolean> captured = new HashMap<Integer,Boolean>();
+ for(MobyRelationship memberRelationship: memberRelationships){
+ if(memberRelationship.getRelationshipType() == Central.iHAS){
+ String[] rule = membersMap.get(memberRelationship.getName());
+ if(rule == null){
+ System.err.println("Skipping HAS member "+memberRelationship.getName() +
+ " without a rule");
+ continue;
+ }
+ Pattern pattern = Pattern.compile(newRegexp, Pattern.DOTALL | Pattern.COMMENTS);
+ int groupCount = RegexParser.groupCount(pattern);
+ for(int i = 0; i < groupCount; i++){
+ if(captured.containsKey(i)){ //autoboxed int
+ // Already encapsulated the capture group due to another
+ // HAS member, don't need to add anything
+ System.err.println("Skipping processing of capture group "+i+
+ ", it's already been processed by another member in this rule");
+ continue;
+ }
+ if(rule[0].matches("^.*\\$"+i+"(?=\\D.*|\\z)")){
+ //System.err.println("Substituting "+i+" with encapsulating capture group, " +
+ // "due to HAS condition of member "+memberRelationship.getName() +
+ // " with rule " + rule[0]);
+
+ // Now actually update the regex with the new cap group
+ // including any quantity modifier associated with it.
+ boolean INCL_QUANTIFIER = true;
+ int capGroupRange[] = RegexParser.getCaptureGroupRange(pattern, i, INCL_QUANTIFIER);
+ newRegexp = newRegexp.substring(0, capGroupRange[0])+"("+
+ newRegexp.substring(capGroupRange[0], capGroupRange[1]+1) + ")" +
+ (capGroupRange[1]+1 < newRegexp.length() ? newRegexp.substring(capGroupRange[1]+1) : "");
+ // Bump up all the capture group reference higher than this one, in all rules
+ for(int j = i; j < groupCount; j++){
+ for(String memberName: membersMap.keySet()){
+ String[] memberRule = membersMap.get(memberName);
+ memberRule[0] = memberRule[0].replaceAll("\\$"+i+"(?=\\D.*|\\z)", "\\$"+(i+1));
+ }
+ for(String nsName: nsRules.keySet()){
+ String nsRule = nsRules.get(nsName);
+ nsRules.put(nsName, nsRule.replaceAll("\\$"+i+"(?=\\D.*|\\z)", "\\$"+(i+1)));
+ }
+ articleName.replace(0, articleName.length(),
+ articleName.toString().replaceAll("\\$"+i+"(?=\\D.*|\\z)", "\\$"+(i+1)));
+ }
+ captured.put(i, true); //autobox both key and value
+ for(int j = groupCount; j >= i; j--){
+ captured.remove(j);
+ captured.put(j+1, true);
+ }
+ } //end if (rule contains group reference)
+ } // end for(group count)
+ } // end if (member's relationship is HAS)
+ else{
+ System.err.println("Relationship for member " + memberRelationship.getName() + " of "+ mobyDataType +
+ "is *not* HAS");
+ }
+ } //end for(member replationships)
+ return newRegexp;
+ }
+
+ public void addRegexMapping(String regexp, Map<String,String> nsRules, String mobyDataType,
+ Map<String,String[]> membersMap, String articleName,
+ String ruleName){
try{
+ // Use a StringBuffer so it's mutable by handleHASMembers()
+ StringBuffer articleNameBuffer = new StringBuffer(articleName == null ? "" : articleName);
+ regexp = handleHASMembers(regexp, nsRules, mobyDataType, membersMap, articleNameBuffer);
+
// Pattern.DOTALL to allow ".*" to span multiple lines, also allow comments (# to EOL) and whitespace
// for better readability in the rules file.
- Pattern pattern = Pattern.compile(processRegExp(regexp), Pattern.DOTALL | Pattern.COMMENTS);
+ Pattern pattern = Pattern.compile(processRegExp(regexp, membersMap), Pattern.DOTALL | Pattern.COMMENTS);
// Base object
if(mobyDataType == null || mobyDataType.length() == 0){
regexMap.put(pattern, new MobyComplexBuilder("Object",
membersMap,
nsRules,
- articleName));
- return;
+ this,
+ articleNameBuffer.toString()));
}
-
// Complex Object
- regexMap.put(pattern, new MobyComplexBuilder(mobyDataType,
- membersMap,
- nsRules,
- articleName));
+ else{
+ regexMap.put(pattern, new MobyComplexBuilder(mobyDataType,
+ membersMap,
+ nsRules,
+ this,
+ articleNameBuffer.toString()));
+ }
+
+ if(ruleName != null && ruleName.length() != 0){
+ patternNameMap.put(ruleName, pattern);
+ builderNameMap.put(ruleName, regexMap.get(pattern));
+ }
+
}catch(Exception e){
System.err.println("Could not create regular expression statement from '" +
regexp + "': " + e);
@@ -1121,6 +1327,7 @@
urlRegexMap.put(pattern, new MobyComplexBuilder("Object",
membersMap,
url_nsRules,
+ this,
articleName));
return;
}
@@ -1129,6 +1336,7 @@
urlRegexMap.put(pattern, new MobyComplexBuilder(mobyDataType,
membersMap,
url_nsRules,
+ this,
articleName));
}catch(Exception e){
System.err.println("Could not create URL regular expression statement from '" +
@@ -1169,6 +1377,7 @@
xpathMap.put(xpath, new MobyComplexBuilder("Object",
membersMap,
nsRules,
+ this,
articleName));
return;
}
@@ -1177,6 +1386,7 @@
xpathMap.put(xpath, new MobyComplexBuilder(mobyDataType,
membersMap,
nsRules,
+ this,
articleName));
}catch(Exception e){
@@ -1286,5 +1496,19 @@
}
return true; // be optimistic by default, assume it's alive
}
+
+ /**
+ * If a rule was given this name, the MOBY object builder for the rule is returned.
+ */
+ public MobyComplexBuilder getBuilder(String ruleName){
+ return builderNameMap.get(ruleName);
+ }
+
+ /**
+ * If a rule was given this name, the regex pattern for the rule is returned.
+ */
+ public Pattern getPattern(String ruleName){
+ return patternNameMap.get(ruleName);
+ }
}
===================================================================
RCS file: /home/repository/moby/moby-live/Java/src/main/ca/ucalgary/seahawk/services/MobyComplexBuilder.java,v
retrieving revision 1.9
retrieving revision 1.10
diff -u -r1.9 -r1.10
--- /home/repository/moby/moby-live/Java/src/main/ca/ucalgary/seahawk/services/MobyComplexBuilder.java 2007/06/16 00:28:11 1.9
+++ /home/repository/moby/moby-live/Java/src/main/ca/ucalgary/seahawk/services/MobyComplexBuilder.java 2007/06/28 16:45:44 1.10
@@ -12,9 +12,8 @@
import javax.xml.parsers.*;
import javax.xml.namespace.NamespaceContext;
-import java.util.Iterator;
-import java.util.Map;
-import java.util.regex.Matcher;
+import java.util.*;
+import java.util.regex.*;
/**
* Class used by MobyClient to build the MOBY rule database from a configuration file.
@@ -22,6 +21,7 @@
public class MobyComplexBuilder{
public static final String ANON_ARTICLE = "_seahawk_data";
+ public static final String INHERITED_MEMBERS_SENTINEL = "_seahawk_member_inheritance";
private static XPathFactory xPathFactory;
private static DocumentBuilder docBuilder;
@@ -31,6 +31,7 @@
private MobyDataType mobyDataType;
private MobyNamespace[] mobyNS;
private MobyRelationship[] children;
+ private MobyClient client;
static{
xPathFactory = XPathFactory.newInstance();
@@ -43,11 +44,11 @@
}
}
- public MobyComplexBuilder(String dataType, Map<String,String[]> members, Map<String,String> nameSpaces) throws Exception{
- this(dataType, members, nameSpaces, ANON_ARTICLE);
+ public MobyComplexBuilder(String dataType, Map<String,String[]> members, Map<String,String> nameSpaces, MobyClient cl) throws Exception{
+ this(dataType, members, nameSpaces, cl, ANON_ARTICLE);
}
- public MobyComplexBuilder(String dataType, Map<String,String[]> members, Map<String,String> nameSpaces, String articleName) throws Exception{
+ public MobyComplexBuilder(String dataType, Map<String,String[]> members, Map<String,String> nameSpaces, MobyClient cl, String articleName) throws Exception{
mobyDataType = MobyDataType.getDataType(dataType, SeahawkOptions.getRegistry());
// Not an existing Ontology data type
if(mobyDataType == null){
@@ -56,6 +57,7 @@
articleNameRule = articleName;
memberRules = members;
+ client = cl;
if(mobyDataType != null && mobyDataType.getName() != null &&
!mobyDataType.getName().equals("Object")){
@@ -66,9 +68,21 @@
for(int i = 0; children != null && i < children.length; i++){
String childName = children[i].getName();
if(!memberRules.containsKey(childName)){
- throw new Exception("Ruleset for object " + mobyDataType.getName() +
- " does not include a rule for required member " +
- children[i]);
+ // See if there's an inherited member
+ if(!memberRules.containsKey(INHERITED_MEMBERS_SENTINEL)){
+ throw new Exception("Ruleset for object " + mobyDataType.getName() +
+ " does not include a rule for required member '" +
+ childName+"'");
+ }
+ String inheritedRuleName = memberRules.get(INHERITED_MEMBERS_SENTINEL)[0];
+ MobyComplexBuilder inheritedBuilder = client.getBuilder(inheritedRuleName);
+ if(inheritedBuilder.getDataType().getChild(childName) == null){
+ throw new Exception("Ruleset for object " + mobyDataType.getName() +
+ " does not include a rule for required member '" +
+ childName+"', nor does the inherited rule '"+
+ inheritedRuleName+"'");
+ }
+
}
for(int j = 0; j < ruleNames.length; j++){
if(ruleNames[j] == childName){
@@ -82,11 +96,50 @@
System.err.println("Ignoring member rules for object type " +
mobyDataType.getName() + " (not in the MOBY ontology):");
for(int i = 0; i < ruleNames.length; i++){
- if(ruleNames[i] != null){
+ if(ruleNames[i] != null && !ruleNames[i].equals(INHERITED_MEMBERS_SENTINEL)){
System.err.println("Extra member: " + ruleNames[i]);
}
}
}
+
+ // Check that any casting done on members of the object is kosher
+ // (do this now because we now have the data type of the parent object for sure)
+ for(Map.Entry<String,String[]> member: members.entrySet()){
+ if(member.getKey().equals(INHERITED_MEMBERS_SENTINEL)){
+ continue;
+ }
+ String memberDataTypeSetting = member.getValue()[1];
+ if(memberDataTypeSetting != null && memberDataTypeSetting.length() > 0){
+ MobyRelationship child = mobyDataType.getChild(member.getKey());
+ String defaultDataTypeName = child.getDataTypeName();
+ MobyDataType castType = MobyDataType.getDataType(memberDataTypeSetting,
+ SeahawkOptions.getRegistry());
+ MobyDataType origType = MobyDataType.getDataType(defaultDataTypeName,
+ SeahawkOptions.getRegistry());
+ if(castType == null){
+ System.err.println("Could not find the data type "+memberDataTypeSetting+
+ " in the ontology, ignoring the data type cast"+
+ " that says it should override " + defaultDataTypeName);
+ member.getValue()[1] = null;
+ }
+ else if(origType == null){
+ System.err.println("Could not find the data type "+defaultDataTypeName+
+ " in the ontology, ignoring the data type cast" +
+ " that overrides it to data type (" + memberDataTypeSetting +
+ "), can't tell if the cast is legal or not)");
+ member.getValue()[1] = null;
+ }
+ else if(!castType.inheritsFrom(origType)){
+ System.err.println("The data type "+memberDataTypeSetting+
+ " does not inherit from the member's type in the ontology (" +
+ defaultDataTypeName+"), ignoring the data type cast");
+ member.getValue()[1] = null;
+ }
+ else{
+ // It's safe if we got this far, leave it as-is
+ }
+ }
+ }
}
namespaceRules = nameSpaces;
@@ -124,7 +177,8 @@
}
// Set article name, if available
- if(articleNameRule != null && articleNameRule.length() > 0){
+ if(articleNameRule != null && articleNameRule.length() > 0 &&
+ !articleNameRule.equals(ANON_ARTICLE)){
mobyObj.setName(evaluateRule(matcher,
articleNameRule,
MobyClient.WHITESPACE_ATTR_STRIP_FLANKING_VAL,
@@ -159,36 +213,194 @@
}
}
- // Set members if available and required
- if(mobyObj instanceof MobyDataComposite){
- if(children != null){
- for(int i = 0; i < children.length; i++){
- String memberName = children[i].getName();
- String[] resultSpec = memberRules.get(memberName);
+ // Done?
+ if(!(mobyObj instanceof MobyDataComposite) || children == null){
+ return mobyObj;
+ }
+
+ // There's another rule to populate members from
+ if(memberRules.containsKey(INHERITED_MEMBERS_SENTINEL)){
+ // String[2] = {ruleName, ruleValue}
+ String[] ruleSpec = memberRules.get(INHERITED_MEMBERS_SENTINEL);
+ MobyComplexBuilder inheritedBuilder = client.getBuilder(ruleSpec[0]);
+ String result = evaluateRule(matcher, ruleSpec[1], null, null, data);
+ Matcher submatcher = client.getPattern(ruleSpec[0]).matcher(result);
+ if(!submatcher.find()){
+ throw new MobyException("Pattern of inherited rule '"+ruleSpec[0]+
+ "' does not match data given: " + result);
+ }
+ MobyDataObject inheritedResult = inheritedBuilder.apply(submatcher, result.getBytes());
+
+ // Now copy all the fields from the inheritedResults to our object
+ if(inheritedResult instanceof MobyDataComposite){
+ ((MobyDataComposite) mobyObj).putAll((MobyDataComposite) inheritedResult);
+ }
+ }
+
+ // Otherwise set members if available and required
+ for(int i = 0; i < children.length; i++){
+ String memberName = children[i].getName();
+ String[] resultSpec = memberRules.get(memberName);
+ if(resultSpec == null){
+ // Must have been populated by the inherited rule?
+ if(!((MobyDataComposite) mobyObj).containsKey(memberName)){
+ throw new MobyException("Member " +memberName+" does not have a rule, nor" +
+ " was not populated by any inherited rule");
+ }
+ continue;
+ }
- MobyRelationship memberRelationship = mobyDataType.getChild(memberName);
- // Is the field a list rather than a single value? If so,
- // we will need to reinterpret the member value specification
- // as many times as the regex capture group matched...
- if(memberRelationship.getRelationshipType() == Central.iHAS){
-
+ String dataTypeName = children[i].getDataTypeName();
+ // First: does the rule cast the object to some subtype?
+ if(resultSpec[1] != null && resultSpec[1].length() > 0){
+ // Safe to cast if we got this far (e.g. put String where an Object is specified)
+ dataTypeName = resultSpec[1];
+ }
+
+ // Is the field a list rather than a single value? If so,
+ // we will need to reinterpret the member value specification
+ // as many times as the regex capture group matched...
+ String[] results = new String[1];
+ if(children[i].getRelationshipType() == Central.iHAS){
+ // We need to reconstruct the capture group so that we
+ // can save each value, not just its last one.
+ try{
+ // results may be more than one element in this case
+ results = evaluateIterativeRule(matcher, resultSpec[0], resultSpec[2], resultSpec[3], data);
+ } catch(Exception e){
+ e.printStackTrace();
+ throw new MobyException("Error while evaluating HAS (iterative evaluation) rule: " + e);
+ }
+ }
+ // else is iHASA, scalar context
+ else{
+ // only one result to process and add the to object
+ results[0] = evaluateRule(matcher, resultSpec[0], resultSpec[2], resultSpec[3], data);
+ }
+
+ for(String result: results){
+
+ if(PrimitiveTypes.isPrimitive(dataTypeName)){
+ ((MobyDataComposite) mobyObj).put(memberName,
+ MobyDataObject.createInstanceFromString(dataTypeName, result));
+ }
+ else{
+ // Recursively call the object creator for complex members
+
+ // Fetch the member's MobyComplexBuilder by name from the rule spec
+ String memberRuleName = resultSpec[4];
+ if(memberRuleName == null || memberRuleName.length() == 0){
+ throw new MobyException("Composite member "+memberName+
+ " does not have an associated rule to produce it");
+ }
+ MobyComplexBuilder memberBuilder = client.getBuilder(memberRuleName);
+ if(memberBuilder == null){
+ throw new MobyException("The object builder for the rule " + memberRuleName +
+ " could not be found");
+ }
+ Pattern memberPattern = client.getPattern(memberRuleName);
+
+ // Now apply the regex for the capture group, so we can use
+ // the results in a recursive call to this method (stop condition
+ // is when all members are primitives).
+ Matcher memberMatcher = memberPattern.matcher(result);
+
+ if(memberMatcher.find()){
+ ((MobyDataComposite) mobyObj).put(memberName,
+ memberBuilder.apply(memberMatcher));
}
- // else is iHASA, scalar context
else{
- String result = evaluateRule(matcher, resultSpec[0], resultSpec[1], resultSpec[2], data);
-
- ((MobyDataComposite) mobyObj).put(memberName,
- MobyDataObject.createInstanceFromString(children[i].getDataTypeName(), result));
+ System.err.println("Could not match pattern \"" + memberPattern.pattern() +
+ "\" to string \"" + result + "\", abandoning " +
+ dataTypeName + " object creation");
}
}
+ } //end for results
+ } //end for children
+
+ return mobyObj;
+ }
+
+ /**
+ * @return a list of rule values to interpret
+ */
+ public String[] evaluateIterativeRule(Matcher matcher, String result, String whitespaceMode, String encodingMode, byte[] data)
+ throws MobyException, Exception{
+ Vector<String> capturedValues = new Vector<String>();
+
+ // Figure out which capture group needs to be evaluated iteratively
+ Vector<Integer> captureGroups = new Vector<Integer>();
+ // This only really works with fewer than 10 capture groups, otherwise $1 and $10 can get captured, etc.
+ for(int captureGroup = 1; captureGroup <= matcher.groupCount(); captureGroup++){
+ if(result.indexOf("$"+captureGroup) != -1){
+ //System.err.println("Adding capture group " + captureGroup +
+ // " to evaluation list due to its presence in rule " + result);
+ captureGroups.add(captureGroup);
}
}
+ // No capture group found, so just return single evaluation on whole expression (may use $0)
+ if(captureGroups.isEmpty()){
+ return new String[]{evaluateRule(matcher, result, whitespaceMode, encodingMode, data)};
+ }
+
+ // Pull out the capture group's pattern, by counting parentheses
+ Map<Integer,String> captureGroupRegexes = new HashMap<Integer,String>();
+ for(Integer captureGroup: captureGroups){
+ captureGroupRegexes.put(captureGroup.intValue(),
+ RegexParser.getCaptureGroupRegex(matcher.pattern(),
+ captureGroup.intValue()));
+ }
+
+ // Find all instances of the capture group, save their value
+ Map<Integer,String[]> captureGroupSubvalues = new HashMap<Integer,String[]>();
+ for(Map.Entry<Integer,String> entry: captureGroupRegexes.entrySet()){
+ // Why fetch cap group # -1? Because we auto-encapsulated the cap groups the HAS member
+ // refers to, specifically so we get the whole match for processing, rather than just
+ // the last one, i.e. (\d)+ in a regex rule became ((\d)+) in MobyClient so we can
+ // find each \d in the whole match and add them individually as HAS (0 or more) members
+ // in the object instance.
+ String wholeMatch = matcher.group(entry.getKey().intValue()-1);
+ String regex = entry.getValue();
+ if(regex.length() == 0){
+ throw new Exception("Encountered empty regex in capture group " + entry.getKey());
+ }
+ //System.err.println("Applying regex "+regex+" to "+wholeMatch+", whole pattern was "+matcher.pattern().pattern());
+ // If the pattern matches more than once, the matches must be contiguous, hence the \G
+ Pattern captureGroupPattern = Pattern.compile("\\G"+regex,
+ Pattern.DOTALL | Pattern.COMMENTS);
+ Vector<String> subValues = new Vector<String>();
+ Matcher subMatcher = captureGroupPattern.matcher(wholeMatch);
+ int lastSubMatcherEnd = -1;
+ while(subMatcher.find()){
+ subValues.add(subMatcher.group());
+ lastSubMatcherEnd = subMatcher.end();
+ }
- return mobyObj;
+ // For safety, we should do a sanity check that there
+ // is no unmatched input left over, as we'd expect
+ if(lastSubMatcherEnd != -1 && lastSubMatcherEnd != wholeMatch.length()){
+ throw new Exception("The submatcher for capture group "+entry.getKey()+
+ " did not match to the last char of \"" + wholeMatch +
+ "\", should have ended match at index " + (wholeMatch.length()-1) +
+ ", but instead matched until " + (lastSubMatcherEnd-1));
+ }
+ captureGroupSubvalues.put(entry.getKey().intValue(),
+ (String[]) subValues.toArray(new String[subValues.size()]));
+ }
+
+ // Now, iteratively substitute each subvalue from each capture group into the rules
+ IterativeMatchResult iterMatch = new IterativeMatchResult(matcher, captureGroupSubvalues);
+ for(int i = 0; i < iterMatch.getNumIterations(); i++){
+ iterMatch.setIteration(i);
+ capturedValues.add(evaluateRule(iterMatch, result, whitespaceMode, encodingMode, data));
+ }
+
+ // Return all the values we found
+ return (String[]) capturedValues.toArray(new String[capturedValues.size()]);
}
// takes a rule and evaluates $# variables and XPath expressions
- private String evaluateRule(Matcher matcher, String result, String whitespaceMode, String encodingMode, byte[] data)
+ private String evaluateRule(MatchResult match, String result, String whitespaceMode, String encodingMode, byte[] data)
throws MobyException{
byte[] resultBytes = result.getBytes();
boolean nonBasic = result.length() != 2 || result.indexOf("$") != 0;
@@ -202,24 +414,24 @@
doc = docBuilder.newDocument();
}
- // Replace any $0, $1, etc. in the replacement string with the values found in the matcher
+ // Replace any $0, $1, etc. in the replacement string with the values found in the match
// Note that this is not perfect: if you had "$1 $2", and $1 had value "$250", you'd get $250$2,
// then you'd substitute $2's value of "per metre", you'd get "per metre50 per metre" instead of
// "$250 per metre". Not sure of a good way around this yet (i.e. when varValue had $k in it where k > j)...
- for(int j = 0; j <= matcher.groupCount(); j++){
+ for(int j = 0; j <= match.groupCount(); j++){
// A replaceAll() for binary data
if(data != null && isBinary){
int srcPos = 0;
for(int varIndex = result.indexOf("$"+j, srcPos);
varIndex != -1;
varIndex = result.indexOf("$"+j, srcPos)){
- int varValueLength = matcher.end(j)-matcher.start(j);
+ int varValueLength = match.end(j)-match.start(j);
int varLen = ("$"+j).getBytes().length;
byte[] newResultBytes = new byte[resultBytes.length+varValueLength-varLen];
if(varIndex > 0){
System.arraycopy(resultBytes, 0, newResultBytes, 0, varIndex);
}
- System.arraycopy(data, matcher.start(j), newResultBytes, varIndex, varValueLength);
+ System.arraycopy(data, match.start(j), newResultBytes, varIndex, varValueLength);
int remaining = result.length()-varIndex-varLen;
if(remaining > 0){
System.arraycopy(resultBytes, varIndex+varLen, newResultBytes, varIndex+varValueLength,
@@ -230,8 +442,8 @@
}
}
// $# substitution in a string, considerably simpler!
- else{
- String varValue = matcher.group(j);
+ else if(result.matches("^\\$"+j+"(?=\\D.*|\\z)")){
+ String varValue = match.group(j);
if(whitespaceMode == null){
// do nothing
}
@@ -244,7 +456,7 @@
else if(whitespaceMode.equals(MobyClient.WHITESPACE_ATTR_STRIP_FLANKING_VAL)){
varValue = varValue.trim(); // removes leading and trailing whitespace
}
- result = result.replaceAll("\\$"+j, varValue);
+ result = result.replaceAll("\\$"+j+"(?=\\D|\\z)", varValue);
// Binary data and XPath are incompatible, since many bytes are not allowed in XML
// so only create the node list if not Base64 encoding
More information about the MOBY-guts
mailing list