[MOBY-guts] biomoby commit

Paul Gordon gordonp at dev.open-bio.org
Thu Jun 28 16:45:45 UTC 2007


gordonp
Thu Jun 28 12:45:44 EDT 2007
Update of /home/repository/moby/moby-live/Java/src/main/ca/ucalgary/seahawk/services
In directory dev.open-bio.org:/tmp/cvs-serv4488/src/main/ca/ucalgary/seahawk/services

Modified Files:
	MobyClient.java MobyComplexBuilder.java 
Added Files:
	IterativeMatchResult.java RegexParser.java 
Log Message:
Major changes to Seahawk 'services' package to make MOB rule parsing much more powerful
moby-live/Java/src/main/ca/ucalgary/seahawk/services IterativeMatchResult.java,NONE,1.1 RegexParser.java,NONE,1.1 MobyClient.java,1.14,1.15 MobyComplexBuilder.java,1.9,1.10
===================================================================
RCS file: /home/repository/moby/moby-live/Java/src/main/ca/ucalgary/seahawk/services/MobyClient.java,v
retrieving revision 1.14
retrieving revision 1.15
diff -u -r1.14 -r1.15
--- /home/repository/moby/moby-live/Java/src/main/ca/ucalgary/seahawk/services/MobyClient.java	2007/06/08 20:30:21	1.14
+++ /home/repository/moby/moby-live/Java/src/main/ca/ucalgary/seahawk/services/MobyClient.java	2007/06/28 16:45:44	1.15
@@ -37,6 +37,7 @@
     public static final String DATA_MAPPING_XML_RESOURCE = "ca/ucalgary/seahawk/resources/mobyBuilderRules.xml";
     public static final String RESOURCE_SYSTEM_PROPERTY = "seahawk.rules";
     public static final String RULE_SET_TAG = "object";
+    public static final String RULE_NAME_ATTR = "name";
     public static final String PREFIX_TAG = "prefix";
     public static final String PREFIX_ATTR = "value";
     public static final String ARTICLENAME_RULE_TAG = "articlename";
@@ -47,6 +48,8 @@
     public static final String DATATYPE_RULE_ATTR = "value";
     public static final String MEMBER_RULE_TAG = "member";
     public static final String MEMBER_RULE_ATTR = "value";
+    public static final String MEMBERS_RULE_TAG = "inheritMembers";
+    public static final String MEMBERS_RULE_ATTR = "rule";
     public static final String URL_REGEX_TAG = "url_regex";
     public static final String REGEX_TAG = "regex";
     public static final String XPATH_TAG = "xpath";
@@ -55,6 +58,7 @@
     public static final String WHITESPACE_ATTR_NORMALIZE_VAL = "normalize";
     public static final String WHITESPACE_ATTR_STRIP_FLANKING_VAL = "flanking";
     public static final String WHITESPACE_ATTR_KEEP_VAL = "keep";
+    public static final String DATATYPE_ATTR = "datatype";
     public static final String ENCODING_ATTR = "encoding";
     public static final String ENCODING_ATTR_BASE64_VAL = "Base64";
     public static final String ENCODING_ATTR_NONE_VAL = "none";
@@ -67,6 +71,8 @@
     private HashMap xpathMap; 
     private HashMap urlRegexMap; 
     private Map<Pattern,MobyComplexBuilder> regexMap; 
+    private Map<String,MobyComplexBuilder> builderNameMap; 
+    private Map<String,Pattern> patternNameMap; 
     private URL dataMappingXMLURL;
     private DocumentBuilder docBuilder;
     private static org.apache.log4j.Logger logger = org.apache.log4j.Logger.getLogger(MobyClient.class);
@@ -88,6 +94,8 @@
 	xpathMap = new HashMap();
 	urlRegexMap = new HashMap();
 	regexMap = new HashMap<Pattern,MobyComplexBuilder>();
+	builderNameMap = new HashMap<String,MobyComplexBuilder>(); 
+	patternNameMap = new HashMap<String,Pattern>(); 
 	nsContext = new NamespaceContextImpl();
 
  	DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
@@ -201,6 +209,8 @@
 		continue;
 	    }
 
+	    String ruleName = ruleSet.getAttribute(RULE_NAME_ATTR);
+
 	    Vector regexStrings = new Vector();
 	    Vector urlRegexStrings = new Vector();
 	    Vector xpathStrings = new Vector();
@@ -250,7 +260,7 @@
 		    dataTypeString = getDataType(ruleMember);
 		}
 		else if(isMemberRule(ruleMember)){
-		    addMemberMapping(ruleMember, memberMap);
+		    addMemberMapping(ruleMember, memberMap, dataTypeString);
 		}
 		// TODO add other production rules
 		else{
@@ -279,7 +289,7 @@
 		    addXPathMapping((String) xpathStrings.elementAt(j), namespaceMap, articleNameString);
 		}
 		for(int j = 0; j < regexStrings.size(); j++){
-		    addRegexMapping((String) regexStrings.elementAt(j), namespaceMap, articleNameString);
+		    addRegexMapping((String) regexStrings.elementAt(j), namespaceMap, articleNameString, ruleName);
 		}
 		for(int j = 0; j < urlRegexStrings.size(); j++){
 		    addURLRegexMapping((String) urlRegexStrings.elementAt(j), namespaceMap, articleNameString);
@@ -296,7 +306,7 @@
 		    addXPathMapping((String) xpathStrings.elementAt(j), namespaceMap, dataTypeString, memberMap, articleNameString);
 		}
 		for(int j = 0; j < regexStrings.size(); j++){
-		    addRegexMapping((String) regexStrings.elementAt(j), namespaceMap, dataTypeString, memberMap, articleNameString);
+		    addRegexMapping((String) regexStrings.elementAt(j), namespaceMap, dataTypeString, memberMap, articleNameString, ruleName);
 		}
 		for(int j = 0; j < urlRegexStrings.size(); j++){
 		    addURLRegexMapping((String) urlRegexStrings.elementAt(j), namespaceMap, dataTypeString, memberMap, articleNameString);
@@ -330,8 +340,15 @@
 	return e != null && ARTICLENAME_RULE_TAG.equals(e.getLocalName());
     }
 
+    // Defined or inherited rule spec.
     public boolean isMemberRule(Element e){
-	return e != null && MEMBER_RULE_TAG.equals(e.getLocalName());
+	return e != null && (MEMBER_RULE_TAG.equals(e.getLocalName()) ||
+			     MEMBERS_RULE_TAG.equals(e.getLocalName()));
+    }
+
+    // Inherited rule spec.
+    public boolean isMembersRule(Element e){
+	return e != null && MEMBERS_RULE_TAG.equals(e.getLocalName());
     }
 
     public boolean isNamespaceRule(Element e){
@@ -402,19 +419,58 @@
 	return str;
     }
 
-    protected void addMemberMapping(Element memTag, Map<String,String[]> membersMap) throws Exception{
+    protected void addMemberMapping(Element memTag, Map<String,String[]> membersMap, String dataTypeName) 
+	throws Exception{
 	if(!isMemberRule(memTag)){
 	    throw new Exception("Element provided to addMemberMapping (" +
 				(memTag == null ? null : memTag.getLocalName()) + 
 				") was not a member rule element");
 	}
-
+	String ruleValue = memTag.getTextContent();
 	String memberNameKey = memTag.getAttribute(DATATYPE_RULE_ATTR);
+	if(ruleValue == null || ruleValue.length() == 0){
+	    System.err.println("Object member " + memberNameKey + " has a blank value rule");
+	}
+
+	String memberDataTypeSetting = memTag.getAttribute(DATATYPE_ATTR);
+	if(isMembersRule(memTag)){
+	    String membersRuleName = memTag.getAttribute(MEMBERS_RULE_ATTR);
+	    if(membersRuleName == null || membersRuleName.length() == 0){
+		throw new Exception("Attribute "+MEMBERS_RULE_ATTR+" is missing from the " +
+				    "member rule tag '" + memTag.getNodeName()+"'");
+	    }
+	    MobyComplexBuilder membersBuilder = builderNameMap.get(membersRuleName);
+	    if(membersBuilder == null){
+		throw new Exception("Attribute "+MEMBERS_RULE_ATTR+" refers to a rule (" +
+				    membersRuleName+") that does not exist");
+	    }
+	    MobyDataType dataType = MobyDataType.getDataType(dataTypeName);
+	    if(!dataType.inheritsFrom(membersBuilder.getDataType())){
+		throw new Exception("Data type produced by inherited rule (" + membersRuleName + 
+				    ") is not a subtype of the current rule (" + dataType.getName() + ")"); 
+	    }
+	    // Borrow members from the builder, unless they already exist in the
+	    // production rule (i.e. "member" rules override "inheritMembers" rules),
+	    // BUT, we need to tell the builder that the regex to match is not
+	    // the one from the current rule, but the one from the inherited rule
+	    // (applied to the value created by ruleValue)
+	    // the string array for the inheritance rule looks like ["ruleName1", "ruleSrcValueExpr1"]
+	    if(membersMap.containsKey(MobyComplexBuilder.INHERITED_MEMBERS_SENTINEL)){
+		throw new Exception("More than one member inheritance tag was given, which is illegal");
+	    }
+	    String[] inheritanceRuleSpecs = new String[2];
+	    inheritanceRuleSpecs[0] = membersRuleName;
+	    inheritanceRuleSpecs[1] = ruleValue;
+	    membersMap.put(MobyComplexBuilder.INHERITED_MEMBERS_SENTINEL, inheritanceRuleSpecs);
+	    return;
+	}
+
 	if(memberNameKey == null || memberNameKey.length() == 0){
 	    throw new Exception("Element provided to addMemberMapping did not " +
 				"have a non-blank " + DATATYPE_RULE_ATTR + 
 				" attribute as required");
 	}
+
 	String memberWhitespaceSetting = memTag.getAttribute(WHITESPACE_ATTR);
 	if(memberWhitespaceSetting == null || memberWhitespaceSetting.length() == 0){
 	    memberWhitespaceSetting = WHITESPACE_ATTR_KEEP_VAL;  // default is to keep whitespace
@@ -429,6 +485,7 @@
 			       "), overriding with default of " + WHITESPACE_ATTR_KEEP_VAL);
 	    memberWhitespaceSetting = WHITESPACE_ATTR_KEEP_VAL;
 	}
+
 	String memberEncodingSetting = memTag.getAttribute(ENCODING_ATTR);
 	if(memberEncodingSetting == null || memberEncodingSetting.length() == 0){
 	    memberEncodingSetting = ENCODING_ATTR_NONE_VAL;  // default is to not encode
@@ -446,13 +503,16 @@
 	    return;
 	}
 
-	String ruleValue = memTag.getTextContent();
-	if(ruleValue == null || ruleValue.length() == 0){
-	    System.err.println("Object member " + memberNameKey + " has a blank value rule");
-	}
-	membersMap.put(memberNameKey, new String[]{ruleValue, 
+	// Leave memberRuleName blank unless we inherited members via 
+        // a rule attribute.  If memberRuleName is null,
+        // in another method we will see if any capture value in the member
+        // rule refers to a \p{ruleName} string in the regex 
+	String memberRuleName = memTag.getAttribute(MEMBERS_RULE_ATTR);
+	membersMap.put(memberNameKey, new String[]{ruleValue,
+						   memberDataTypeSetting,
 						   memberWhitespaceSetting, 
-						   memberEncodingSetting});
+						   memberEncodingSetting,
+	                                           memberRuleName});
     }
 
     protected void addNamespaceMapping(Element nsTag, Map<String,String> namespaceStrings) throws Exception{
@@ -1016,7 +1076,7 @@
 	for(int i = 0; i < mobyObj.length; i++){
 	    nsRules.put(mobyObj[i], "$0");
 	}
-	addRegexMapping(regexp, nsRules, articleName);
+	addRegexMapping(regexp, nsRules, articleName, (String) null);
     }
 
     public void addURLRegexMapping(String url_regexp, String[] mobyObj, String articleName){ //mobyObj<--mobyNamespaces
@@ -1035,11 +1095,58 @@
 
     /**
      * Converts seahawk-specific regex syntax into generic Java syntax (e.g \N becomes a
-     * match for any IUPAC DNA character, \P any amino acid)
+     * match for any IUPAC DNA character, \P any amino acid).  We pass in the member map so that
+     * this method can populate the last field of rules that inherit complex members (by virtue 
+     * of using the capture group whose value is derived from \p{ruleName}) with that ruleName.
      */
-    protected String processRegExp(String regex){
-	return regex.replaceAll("\\\\N", "[acgtunxACGTUNX]")
-                    .replaceAll("\\\\P", "[ARNDCQEGHILKMFPSTWYVBZXarndcqeghilkmfpstwyvbz*]");
+    protected String processRegExp(String regex, Map<String,String[]> membersMap) throws Exception{
+	String returnValue = regex.replaceAll("\\\\N", "[acgtunxACGTUNX]")
+	    .replaceAll("\\\\P", "[ARNDCQEGHILKMFPSTWYVBZXarndcqeghilkmfpstwyvbz*]");
+	
+	// Now see if there are any references to other patterns (by rule name) 
+	// with the \p{} syntax specific to Seahawk
+	Pattern charClassPattern = Pattern.compile("\\\\p\\{([A-Za-z0-9]+)\\}");
+	Matcher charClassMatcher = charClassPattern.matcher(returnValue);
+	Map<Integer,String> capGroup2RuleReference = new HashMap<Integer,String>();
+	while(charClassMatcher.find()){
+	    String reference = charClassMatcher.group(1);
+	    if(RegexParser.isPosixCharacterClass(reference)){
+		continue;
+	    }
+
+	    if(!patternNameMap.containsKey(reference)){
+		throw new Exception("\\p{"+reference+"} in regex does not refer " +
+				    "to a known Seahawk rule, cannot build the regex");
+	    }
+
+	    int capGroup = RegexParser.locationToCaptureGroupNumber(regex, charClassMatcher.start(1));
+	    if(capGroup > 0){  // sanity check
+		capGroup2RuleReference.put(capGroup, reference);
+	    }
+
+	    // Replace ref with regex, eliminating any nested capture groups, for efficiency
+	    // (otherwise we'd need to shift all the $# refs in the rules map to compensate)
+	    returnValue = returnValue.replaceFirst("\\\\p\\{"+reference+"\\}", 
+						   patternNameMap.get(reference).pattern().replaceAll("\\\\", "\\\\\\\\").replaceAll("\\((?!\\?)", "(?:"));
+	}
+
+	if(!capGroup2RuleReference.isEmpty()){
+	    // Update any member rule that uses one of the capture values referring to a
+	    // \p{ruleName} reference
+	    for(String[] rule: membersMap.values()){
+		for(Integer capGroupNum: capGroup2RuleReference.keySet()){
+		    if(rule[0].matches("^\\s*\\$"+capGroupNum+"\\s*$")){
+			// This is where the rule actually gets the subrule reference update,
+			// UNLESS it was already specified (probably by an inheritMembers tag)
+			if(rule[4] == null || rule[4].length() == 0){
+			    rule[4] = capGroup2RuleReference.get(capGroupNum);
+			}
+			break;
+		    }
+		}
+	    }
+	}
+	return returnValue;
     }
 
     /**
@@ -1060,13 +1167,13 @@
 	return url_regex_flexible;
     }
 
-    protected void addRegexMapping(String regexp, Map<String,String> nsRules, String articleName){ //nsRules = Map<String ns, String regex_replacement>
+    protected void addRegexMapping(String regexp, Map<String,String> nsRules, String articleName, String ruleName){ //nsRules = Map<String ns, String regex_replacement>
 	if(nsRules == null || nsRules.size() == 0){
 	    System.err.println("Ignoring empty namespace-only regex rule mappings");
 	    return;
 	}
 
-	addRegexMapping(regexp, nsRules, (String) null, (Map<String,String[]>) null, articleName);
+	addRegexMapping(regexp, nsRules, (String) null, (Map<String,String[]>) null, articleName, null);
     }
 
     protected void addURLRegexMapping(String url_regexp, Map url_nsRules, String articleName){ //nsRules = Map<String ns, String regex_replacement>
@@ -1080,29 +1187,128 @@
 
     public void addRegexMapping(String regexp, Map<String,String> nsRules, 
 				String mobyDataType, Map<String,String[]> membersMap){
-	addRegexMapping(regexp, nsRules, mobyDataType, membersMap, null);
+	addRegexMapping(regexp, nsRules, mobyDataType, membersMap, null, null);
     }
 
-    public void addRegexMapping(String regexp, Map<String,String> nsRules, String mobyDataType, Map<String,String[]> membersMap, String articleName){ 
+    /**
+     * This method looks for HAS members in a rule, and if they exist creates new
+     * capture groups around the accessed capture groups so that they can be further
+     * processed in MobyComplexBuilder (e.g. (\d)+ becomes ((\d)+) so we can deconstruct
+     * that part of the regex to capture *each* \d for the HAS (0 or more) member relationship.
+     * We also need to modify all of the rules that access capture groups to bump up their
+     * numbers due to the added capture groups here (transparent to the user).
+     *
+     * membersMap, nsRules, and articleName get their capture group references modified accordingly, 
+     * and the new regex is returned with its extra capture groups.
+     */
+    private String handleHASMembers(String regexp, Map<String,String> nsRules, String mobyDataType,
+				    Map<String,String[]> membersMap, StringBuffer articleName) throws Exception{
+	if(mobyDataType == null){
+	    //System.err.println("Got null data type for regex "+regexp);
+	    return regexp; //must be a base object
+	}
+	MobyDataType dataType = MobyDataType.getDataType(mobyDataType, SeahawkOptions.getRegistry());	
+	if(dataType == null){
+	    throw new Exception("Cannot find definition of data type "+mobyDataType+
+				" in the ontology, therefore the rule cannot be properly parsed");
+	}
+	MobyRelationship[] memberRelationships = dataType.getChildren();
+	
+	String newRegexp = processRegExp(regexp, membersMap);
+	Map<Integer,Boolean> captured = new HashMap<Integer,Boolean>();
+	for(MobyRelationship memberRelationship: memberRelationships){
+	    if(memberRelationship.getRelationshipType() == Central.iHAS){
+		String[] rule = membersMap.get(memberRelationship.getName());
+		if(rule == null){
+		    System.err.println("Skipping HAS member "+memberRelationship.getName() + 
+				       " without a rule");		    
+		    continue;
+		}
+		Pattern pattern = Pattern.compile(newRegexp, Pattern.DOTALL | Pattern.COMMENTS);
+		int groupCount = RegexParser.groupCount(pattern);
+		for(int i = 0; i < groupCount; i++){
+		    if(captured.containsKey(i)){ //autoboxed int
+			// Already encapsulated the capture group due to another
+			// HAS member, don't need to add anything
+			System.err.println("Skipping processing of capture group "+i+
+					   ", it's already been processed by another member in this rule");
+			continue;
+		    }
+		    if(rule[0].matches("^.*\\$"+i+"(?=\\D.*|\\z)")){
+			//System.err.println("Substituting "+i+" with encapsulating capture group, " +
+			//		   "due to HAS condition of member "+memberRelationship.getName() +
+			//		   " with rule " + rule[0]);
+			
+			// Now actually update the regex with the new cap group
+			// including any quantity modifier associated with it.
+			boolean INCL_QUANTIFIER = true;
+			int capGroupRange[] = RegexParser.getCaptureGroupRange(pattern, i, INCL_QUANTIFIER);
+			newRegexp = newRegexp.substring(0, capGroupRange[0])+"("+
+			    newRegexp.substring(capGroupRange[0], capGroupRange[1]+1) + ")" +
+			    (capGroupRange[1]+1 < newRegexp.length() ? newRegexp.substring(capGroupRange[1]+1) : "");
+			// Bump up all the capture group references higher than this one, in all rules
+			for(int j = i; j < groupCount; j++){
+			    for(String memberName: membersMap.keySet()){
+				String[] memberRule = membersMap.get(memberName);
+				memberRule[0] = memberRule[0].replaceAll("\\$"+i+"(?=\\D.*|\\z)", "\\$"+(i+1));
+			    }
+			    for(String nsName: nsRules.keySet()){
+				String nsRule = nsRules.get(nsName);
+				nsRules.put(nsName, nsRule.replaceAll("\\$"+i+"(?=\\D.*|\\z)", "\\$"+(i+1)));
+			    }
+			    articleName.replace(0, articleName.length(), 
+						articleName.toString().replaceAll("\\$"+i+"(?=\\D.*|\\z)", "\\$"+(i+1)));
+			}
+			captured.put(i, true); //autobox both key and value
+			for(int j = groupCount; j >= i; j--){
+			    captured.remove(j);
+			    captured.put(j+1, true);
+			}
+		    }  //end if (rule contains group reference)
+		} // end for(group count)
+	    }  // end if (member's relationship is HAS)
+	    else{
+		System.err.println("Relationship for member " + memberRelationship.getName() + " of "+ mobyDataType +
+				   "is *not* HAS");
+	    }
+	}  //end for(member relationships)
+	return newRegexp;
+    }
+
+    public void addRegexMapping(String regexp, Map<String,String> nsRules, String mobyDataType, 
+				Map<String,String[]> membersMap, String articleName,
+				String ruleName){ 
 	try{
+	    // Use a StringBuffer so it's mutable by handleHASMembers()
+	    StringBuffer articleNameBuffer = new StringBuffer(articleName == null ? "" : articleName); 
+	    regexp = handleHASMembers(regexp, nsRules, mobyDataType, membersMap, articleNameBuffer);
+
 	    // Pattern.DOTALL to allow ".*" to span multiple lines, also allow comments (# to EOL) and whitespace
 	    // for better readability in the rules file.
-	    Pattern pattern = Pattern.compile(processRegExp(regexp), Pattern.DOTALL | Pattern.COMMENTS);	
+	    Pattern pattern = Pattern.compile(processRegExp(regexp, membersMap), Pattern.DOTALL | Pattern.COMMENTS);	
 
 	    // Base object
 	    if(mobyDataType == null || mobyDataType.length() == 0){
 		regexMap.put(pattern, new MobyComplexBuilder("Object", 
 							     membersMap, 
 							     nsRules,
-							     articleName));
-		return;
+							     this,
+							     articleNameBuffer.toString()));
 	    }
-
 	    // Complex Object
-	    regexMap.put(pattern, new MobyComplexBuilder(mobyDataType, 
-							 membersMap, 
-							 nsRules,
-							 articleName));
+	    else{
+		regexMap.put(pattern, new MobyComplexBuilder(mobyDataType, 
+							     membersMap, 
+							     nsRules,
+							     this,
+							     articleNameBuffer.toString()));
+	    }
+
+	    if(ruleName != null && ruleName.length() != 0){
+		patternNameMap.put(ruleName, pattern);
+		builderNameMap.put(ruleName, regexMap.get(pattern));
+	    }
+
 	}catch(Exception e){
 	    System.err.println("Could not create regular expression statement from '" +
 			       regexp + "': " + e);
@@ -1121,6 +1327,7 @@
 		urlRegexMap.put(pattern, new MobyComplexBuilder("Object", 
 								membersMap, 
 								url_nsRules,
+								this,
 								articleName));
 		return;
 	    }
@@ -1129,6 +1336,7 @@
 	    urlRegexMap.put(pattern, new MobyComplexBuilder(mobyDataType, 
 							    membersMap, 
 							    url_nsRules,
+							    this,
 							    articleName));
 	}catch(Exception e){
 	    System.err.println("Could not create URL regular expression statement from '" +
@@ -1169,6 +1377,7 @@
 		xpathMap.put(xpath, new MobyComplexBuilder("Object", 
 							   membersMap, 
 							   nsRules,
+							   this,
 							   articleName));
 		return;
 	    }
@@ -1177,6 +1386,7 @@
 	    xpathMap.put(xpath, new MobyComplexBuilder(mobyDataType, 
 						       membersMap, 
 						       nsRules,
+						       this,
 						       articleName));
 
 	}catch(Exception e){
@@ -1286,5 +1496,19 @@
 	}
 	return true;  // be optimistic by default, assume it's alive
     }
+
+    /**
+     * If a rule was given this name, the MOBY object builder for the rule is returned.
+     */
+    public MobyComplexBuilder getBuilder(String ruleName){
+	return builderNameMap.get(ruleName);
+    }
+
+    /**
+     * If a rule was given this name, the regex pattern for the rule is returned.
+     */
+    public Pattern getPattern(String ruleName){
+	return patternNameMap.get(ruleName);
+    }
 }
 

===================================================================
RCS file: /home/repository/moby/moby-live/Java/src/main/ca/ucalgary/seahawk/services/MobyComplexBuilder.java,v
retrieving revision 1.9
retrieving revision 1.10
diff -u -r1.9 -r1.10
--- /home/repository/moby/moby-live/Java/src/main/ca/ucalgary/seahawk/services/MobyComplexBuilder.java	2007/06/16 00:28:11	1.9
+++ /home/repository/moby/moby-live/Java/src/main/ca/ucalgary/seahawk/services/MobyComplexBuilder.java	2007/06/28 16:45:44	1.10
@@ -12,9 +12,8 @@
 import javax.xml.parsers.*;
 import javax.xml.namespace.NamespaceContext;
 
-import java.util.Iterator;
-import java.util.Map;
-import java.util.regex.Matcher;
+import java.util.*;
+import java.util.regex.*;
 
 /**
  * Class used by MobyClient to build the MOBY rule database from a configuration file.
@@ -22,6 +21,7 @@
 
 public class MobyComplexBuilder{
     public static final String ANON_ARTICLE = "_seahawk_data";
+    public static final String INHERITED_MEMBERS_SENTINEL = "_seahawk_member_inheritance";
     private static XPathFactory xPathFactory;
     private static DocumentBuilder docBuilder;
 
@@ -31,6 +31,7 @@
     private MobyDataType mobyDataType;
     private MobyNamespace[] mobyNS;
     private MobyRelationship[] children;
+    private MobyClient client;
 
     static{
 	xPathFactory = XPathFactory.newInstance();
@@ -43,11 +44,11 @@
 	}
     }
 
-    public MobyComplexBuilder(String dataType, Map<String,String[]> members, Map<String,String> nameSpaces) throws Exception{
-	this(dataType, members, nameSpaces, ANON_ARTICLE);
+    public MobyComplexBuilder(String dataType, Map<String,String[]> members, Map<String,String> nameSpaces, MobyClient cl) throws Exception{
+	this(dataType, members, nameSpaces, cl, ANON_ARTICLE);
     }
 
-    public MobyComplexBuilder(String dataType, Map<String,String[]> members, Map<String,String> nameSpaces, String articleName) throws Exception{
+    public MobyComplexBuilder(String dataType, Map<String,String[]> members, Map<String,String> nameSpaces, MobyClient cl, String articleName) throws Exception{
 	mobyDataType = MobyDataType.getDataType(dataType, SeahawkOptions.getRegistry());
 	// Not an existing Ontology data type
 	if(mobyDataType == null){
@@ -56,6 +57,7 @@
 
 	articleNameRule = articleName;
 	memberRules = members;
+	client = cl;
 
 	if(mobyDataType != null && mobyDataType.getName() != null && 
 	   !mobyDataType.getName().equals("Object")){
@@ -66,9 +68,21 @@
 	    for(int i = 0; children != null && i < children.length; i++){
 		String childName = children[i].getName();
 		if(!memberRules.containsKey(childName)){
-		    throw new Exception("Ruleset for object " + mobyDataType.getName() +
-					" does not include a rule for required member " +
-					children[i]);
+		    // See if there's an inherited member
+		    if(!memberRules.containsKey(INHERITED_MEMBERS_SENTINEL)){
+			throw new Exception("Ruleset for object " + mobyDataType.getName() +
+					    " does not include a rule for required member '" +
+					    childName+"'");
+		    }
+		    String inheritedRuleName = memberRules.get(INHERITED_MEMBERS_SENTINEL)[0];
+		    MobyComplexBuilder inheritedBuilder = client.getBuilder(inheritedRuleName);
+		    if(inheritedBuilder.getDataType().getChild(childName) == null){
+			throw new Exception("Ruleset for object " + mobyDataType.getName() +
+					    " does not include a rule for required member '" +
+					    childName+"', nor does the inherited rule '"+
+					    inheritedRuleName+"'");
+		    }
+		    
 		}
 		for(int j = 0; j < ruleNames.length; j++){
 		    if(ruleNames[j] == childName){
@@ -82,11 +96,50 @@
 		System.err.println("Ignoring member rules for object type " + 
 				   mobyDataType.getName() + " (not in the MOBY ontology):");
 		for(int i = 0; i < ruleNames.length; i++){
-		    if(ruleNames[i] != null){
+		    if(ruleNames[i] != null && !ruleNames[i].equals(INHERITED_MEMBERS_SENTINEL)){
 			System.err.println("Extra member: " + ruleNames[i]);
 		    }
 		}
 	    }
+
+	    // Check that any casting done on members of the object is kosher
+	    // (do this now because we now have the data type of the parent object for sure)
+	    for(Map.Entry<String,String[]> member: members.entrySet()){
+		if(member.getKey().equals(INHERITED_MEMBERS_SENTINEL)){
+		    continue;
+		}
+		String memberDataTypeSetting = member.getValue()[1];
+		if(memberDataTypeSetting != null && memberDataTypeSetting.length() > 0){
+		    MobyRelationship child = mobyDataType.getChild(member.getKey());
+		    String defaultDataTypeName = child.getDataTypeName();
+		    MobyDataType castType = MobyDataType.getDataType(memberDataTypeSetting, 
+								     SeahawkOptions.getRegistry());
+		    MobyDataType origType = MobyDataType.getDataType(defaultDataTypeName, 
+								     SeahawkOptions.getRegistry());
+		    if(castType == null){
+			System.err.println("Could not find the data type "+memberDataTypeSetting+
+					   " in the ontology, ignoring the data type cast"+
+					   " that says it should override " + defaultDataTypeName);
+			member.getValue()[1] = null;
+		    }
+		    else if(origType == null){
+			System.err.println("Could not find the data type "+defaultDataTypeName+
+					   " in the ontology, ignoring the data type cast" + 
+					   " that overrides it to data type (" + memberDataTypeSetting +
+					   "), can't tell if the cast is legal or not)");
+			member.getValue()[1] = null;
+		    }
+		    else if(!castType.inheritsFrom(origType)){
+			System.err.println("The data type "+memberDataTypeSetting+
+					   " does not inherit from the member's type in the ontology (" +
+					   defaultDataTypeName+"), ignoring the data type cast");
+			member.getValue()[1] = null;
+		    }
+		    else{
+			// It's safe if we got this far, leave it as-is
+		    }
+		}
+	    }
 	}
 
 	namespaceRules = nameSpaces;
@@ -124,7 +177,8 @@
 	}
 
 	// Set article name, if available
-	if(articleNameRule != null && articleNameRule.length() > 0){
+	if(articleNameRule != null && articleNameRule.length() > 0 && 
+	   !articleNameRule.equals(ANON_ARTICLE)){
 	    mobyObj.setName(evaluateRule(matcher, 
 					 articleNameRule, 
 					 MobyClient.WHITESPACE_ATTR_STRIP_FLANKING_VAL,
@@ -159,36 +213,194 @@
 	    }
 	}
 
-	// Set members if available and required
-	if(mobyObj instanceof MobyDataComposite){
-	    if(children != null){
-		for(int i = 0; i < children.length; i++){
-		    String memberName = children[i].getName();
-		    String[] resultSpec = memberRules.get(memberName);
+	// Done?
+	if(!(mobyObj instanceof MobyDataComposite) || children == null){
+	    return mobyObj;
+	}
+	
+	// There's another rule to populate members from
+	if(memberRules.containsKey(INHERITED_MEMBERS_SENTINEL)){
+	    // String[2] = {ruleName, ruleValue}
+	    String[] ruleSpec = memberRules.get(INHERITED_MEMBERS_SENTINEL);
+	    MobyComplexBuilder inheritedBuilder = client.getBuilder(ruleSpec[0]);
+	    String result = evaluateRule(matcher, ruleSpec[1], null, null, data);
+	    Matcher submatcher = client.getPattern(ruleSpec[0]).matcher(result);
+	    if(!submatcher.find()){
+		throw new MobyException("Pattern of inherited rule '"+ruleSpec[0]+
+					"' does not match data given: " + result);
+	    }
+	    MobyDataObject inheritedResult = inheritedBuilder.apply(submatcher, result.getBytes());
+	    
+	    // Now copy all the fields from the inheritedResults to our object
+	    if(inheritedResult instanceof MobyDataComposite){
+		((MobyDataComposite) mobyObj).putAll((MobyDataComposite) inheritedResult);
+	    }
+	}
+	
+	// Otherwise set members if available and required
+	for(int i = 0; i < children.length; i++){
+	    String memberName = children[i].getName();
+	    String[] resultSpec = memberRules.get(memberName);
+	    if(resultSpec == null){
+		// Must have been populated by the inherited rule?
+		if(!((MobyDataComposite) mobyObj).containsKey(memberName)){
+		    throw new MobyException("Member " +memberName+" does not have a rule, nor" +
+					    " was not populated by any inherited rule");
+		}
+		continue;
+	    }
 
-		    MobyRelationship memberRelationship = mobyDataType.getChild(memberName);
-		    // Is the field a list rather than a single value?  If so,
-		    // we will need to reinterpret the member value specification
-		    // as many times as the regex capture group matched...
-		    if(memberRelationship.getRelationshipType() == Central.iHAS){
-			
+	    String dataTypeName = children[i].getDataTypeName();
+	    // First: does the rule cast the object to some subtype?
+	    if(resultSpec[1] != null && resultSpec[1].length() > 0){			
+		// Safe to cast if we got this far (e.g. put String where an Object is specified)
+		dataTypeName = resultSpec[1]; 
+	    }
+	    
+	    // Is the field a list rather than a single value?  If so,
+	    // we will need to reinterpret the member value specification
+	    // as many times as the regex capture group matched...
+	    String[] results = new String[1];
+	    if(children[i].getRelationshipType() == Central.iHAS){
+		// We need to reconstruct the capture group so that we
+		// can save each value, not just its last one.
+		try{
+		    // results may be more than one element in this case
+		    results = evaluateIterativeRule(matcher, resultSpec[0], resultSpec[2], resultSpec[3], data);
+		} catch(Exception e){
+		    e.printStackTrace();
+		    throw new MobyException("Error while evaluating HAS (iterative evaluation) rule: " + e);
+		}
+	    }
+	    // else is iHASA, scalar context
+	    else{
+		// only one result to process and add to the object
+		results[0] = evaluateRule(matcher, resultSpec[0], resultSpec[2], resultSpec[3], data);
+	    }
+	    
+	    for(String result: results){
+		
+		if(PrimitiveTypes.isPrimitive(dataTypeName)){
+		    ((MobyDataComposite) mobyObj).put(memberName, 
+						      MobyDataObject.createInstanceFromString(dataTypeName, result));
+		}
+		else{
+		    // Recursively call the object creator for complex members
+		    
+		    // Fetch the member's MobyComplexBuilder by name from the rule spec
+		    String memberRuleName = resultSpec[4];
+		    if(memberRuleName == null || memberRuleName.length() == 0){
+			throw new MobyException("Composite member "+memberName+
+						" does not have an associated rule to produce it");
+		    }
+		    MobyComplexBuilder memberBuilder = client.getBuilder(memberRuleName);
+		    if(memberBuilder == null){
+			throw new MobyException("The object builder for the rule " + memberRuleName +
+						" could not be found");
+		    }
+		    Pattern memberPattern = client.getPattern(memberRuleName);
+		    
+		    // Now apply the regex for the capture group, so we can use
+		    // the results in a recursive call to this method (stop condition
+		    // is when all members are primitives).
+		    Matcher memberMatcher = memberPattern.matcher(result);
+		    
+		    if(memberMatcher.find()){
+			((MobyDataComposite) mobyObj).put(memberName,
+							  memberBuilder.apply(memberMatcher));
 		    }
-		    // else is iHASA, scalar context
 		    else{
-			String result = evaluateRule(matcher, resultSpec[0], resultSpec[1], resultSpec[2], data);
-
-			((MobyDataComposite) mobyObj).put(memberName, 
-			    MobyDataObject.createInstanceFromString(children[i].getDataTypeName(), result));
+			System.err.println("Could not match pattern \"" + memberPattern.pattern() +
+					   "\" to string \"" + result + "\", abandoning " +
+					   dataTypeName + " object creation");
 		    }
 		}
+	    } //end for results
+	}  //end for children
+
+	return mobyObj;
+    }
+
+    /**
+     * @return a list of rule values to interpret
+     */
+    public String[] evaluateIterativeRule(Matcher matcher, String result, String whitespaceMode, String encodingMode, byte[] data)
+	throws MobyException, Exception{
+	Vector<String> capturedValues = new Vector<String>();
+
+	// Figure out which capture group needs to be evaluated iteratively
+	Vector<Integer> captureGroups = new Vector<Integer>();
+	// This only really works with fewer than 10 capture groups, otherwise a reference to $10 is also detected as $1, etc.
+	for(int captureGroup = 1; captureGroup <= matcher.groupCount(); captureGroup++){
+	    if(result.indexOf("$"+captureGroup) != -1){
+		//System.err.println("Adding capture group " + captureGroup + 
+		//		   " to evaluation list due to its presence in rule " + result);
+		captureGroups.add(captureGroup);
 	    }
 	}
+	// No capture group found, so just return single evaluation on whole expression (may use $0)
+	if(captureGroups.isEmpty()){
+	    return new String[]{evaluateRule(matcher, result, whitespaceMode, encodingMode, data)};
+	}
+	
+	// Pull out the capture group's pattern, by counting parentheses
+	Map<Integer,String> captureGroupRegexes = new HashMap<Integer,String>();
+	for(Integer captureGroup: captureGroups){
+	    captureGroupRegexes.put(captureGroup.intValue(),
+				    RegexParser.getCaptureGroupRegex(matcher.pattern(), 
+								     captureGroup.intValue()));
+	}
+
+	// Find all instances of the capture group, save their value
+	Map<Integer,String[]> captureGroupSubvalues = new HashMap<Integer,String[]>();
+	for(Map.Entry<Integer,String> entry: captureGroupRegexes.entrySet()){
+	    // Why fetch cap group # -1?  Because we auto-encapsulated the cap groups the HAS member
+	    // refers to, specifically so we get the whole match for processing, rather than just
+	    // the last one, i.e. (\d)+ in a regex rule became ((\d)+) in MobyClient so we can 
+	    // find each \d in the whole match and add them individually as HAS (0 or more) members
+	    // in the object instance.
+	    String wholeMatch = matcher.group(entry.getKey().intValue()-1);
+	    String regex = entry.getValue();
+	    if(regex.length() == 0){
+		throw new Exception("Encountered empty regex in capture group " + entry.getKey());
+	    }
+	    //System.err.println("Applying regex "+regex+" to "+wholeMatch+", whole pattern was "+matcher.pattern().pattern());
+	    // If the pattern matches more than once, the matches must be contiguous, hence the \G
+	    Pattern captureGroupPattern = Pattern.compile("\\G"+regex, 
+							  Pattern.DOTALL | Pattern.COMMENTS);
+	    Vector<String> subValues = new Vector<String>();
+	    Matcher subMatcher = captureGroupPattern.matcher(wholeMatch);
+	    int lastSubMatcherEnd = -1;
+	    while(subMatcher.find()){
+		subValues.add(subMatcher.group());
+		lastSubMatcherEnd = subMatcher.end();
+	    }
 
-	return mobyObj;
+	    // For safety, we should do a sanity check that there
+	    // is no unmatched input left over, as we'd expect
+	    if(lastSubMatcherEnd != -1 && lastSubMatcherEnd != wholeMatch.length()){
+		throw new Exception("The submatcher for capture group "+entry.getKey()+ 
+				    " did not match to the last char of \"" + wholeMatch + 
+				    "\", should have ended match at index " + (wholeMatch.length()-1) +
+				    ", but instead matched until " + (lastSubMatcherEnd-1));
+	    }
+	    captureGroupSubvalues.put(entry.getKey().intValue(), 
+				      (String[]) subValues.toArray(new String[subValues.size()]));
+	}
+
+	// Now, iteratively substitute each subvalue from each capture group into the rules 
+	IterativeMatchResult iterMatch = new IterativeMatchResult(matcher, captureGroupSubvalues);
+	for(int i = 0; i < iterMatch.getNumIterations(); i++){
+	    iterMatch.setIteration(i);
+	    capturedValues.add(evaluateRule(iterMatch, result, whitespaceMode, encodingMode, data));
+	}
+
+	// Return all the values we found
+	return (String[]) capturedValues.toArray(new String[capturedValues.size()]);
     }
 
     // takes a rule and evaluates $# variables and XPath expressions
-    private String evaluateRule(Matcher matcher, String result, String whitespaceMode, String encodingMode, byte[] data)
+    private String evaluateRule(MatchResult match, String result, String whitespaceMode, String encodingMode, byte[] data)
 	throws MobyException{
 	byte[] resultBytes = result.getBytes();
 	boolean nonBasic = result.length() != 2 || result.indexOf("$") != 0;
@@ -202,24 +414,24 @@
 	    doc = docBuilder.newDocument();
 	}
 
-	// Replace any $0, $1, etc. in the replacement string with the values found in the matcher
+	// Replace any $0, $1, etc. in the replacement string with the values found in the match
 	// Note that this is not perfect: if you had "$1 $2", and $1 had value "$250", you'd get $250$2,
 	// then you'd substitute $2's value of "per metre", you'd get "per metre50 per metre" instead of
 	// "$250 per metre".  Not sure of a good way around this yet (i.e. when varValue had $k in it where k > j)...
-	for(int j = 0; j <= matcher.groupCount(); j++){
+	for(int j = 0; j <= match.groupCount(); j++){
 	    // A replaceAll() for binary data
 	    if(data != null && isBinary){
 		int srcPos = 0;
 		for(int varIndex = result.indexOf("$"+j, srcPos); 
 		    varIndex != -1; 
 		    varIndex = result.indexOf("$"+j, srcPos)){
-		    int varValueLength = matcher.end(j)-matcher.start(j);
+		    int varValueLength = match.end(j)-match.start(j);
 		    int varLen = ("$"+j).getBytes().length;
 		    byte[] newResultBytes = new byte[resultBytes.length+varValueLength-varLen];
 		    if(varIndex > 0){			
 			System.arraycopy(resultBytes, 0, newResultBytes, 0, varIndex);
 		    }
-		    System.arraycopy(data, matcher.start(j), newResultBytes, varIndex, varValueLength);
+		    System.arraycopy(data, match.start(j), newResultBytes, varIndex, varValueLength);
 		    int remaining = result.length()-varIndex-varLen;
 		    if(remaining > 0){
 			System.arraycopy(resultBytes, varIndex+varLen, newResultBytes, varIndex+varValueLength, 
@@ -230,8 +442,8 @@
 		}
 	    }
 	    // $# substitution in a string, considerably simpler!
-	    else{
-		String varValue = matcher.group(j);
+	    else if(result.matches("^\\$"+j+"(?=\\D.*|\\z)")){
+		String varValue = match.group(j);
 		if(whitespaceMode == null){
 		    // do nothing
 		}
@@ -244,7 +456,7 @@
 		else if(whitespaceMode.equals(MobyClient.WHITESPACE_ATTR_STRIP_FLANKING_VAL)){
 		    varValue = varValue.trim(); // removes leading and trailing whitespace
 		}
-		result = result.replaceAll("\\$"+j, varValue);
+		result = result.replaceAll("\\$"+j+"(?=\\D|\\z)", varValue);
 		
 		// Binary data and XPath are incompatible, since many bytes are not allowed in XML
 		// so only create the node list if not Base64 encoding




More information about the MOBY-guts mailing list