[MOBY-guts] biomoby commit
Gary Schiltz
gss at pub.open-bio.org
Wed Mar 15 00:45:49 UTC 2006
gss
Tue Mar 14 19:45:49 EST 2006
Update of /home/repository/moby/s-moby/ref-impl/semanticmoby.org/src/org/semanticmoby/ref/tools
In directory pub.open-bio.org:/tmp/cvs-serv28286/src/org/semanticmoby/ref/tools
Modified Files:
MetadataRetriever.java
Log Message:
Simplified, fixed several bugs; converted line endings to Unix
s-moby/ref-impl/semanticmoby.org/src/org/semanticmoby/ref/tools MetadataRetriever.java,1.4,1.5
===================================================================
RCS file: /home/repository/moby/s-moby/ref-impl/semanticmoby.org/src/org/semanticmoby/ref/tools/MetadataRetriever.java,v
retrieving revision 1.4
retrieving revision 1.5
diff -u -r1.4 -r1.5
--- /home/repository/moby/s-moby/ref-impl/semanticmoby.org/src/org/semanticmoby/ref/tools/MetadataRetriever.java 2006/01/31 00:30:08 1.4
+++ /home/repository/moby/s-moby/ref-impl/semanticmoby.org/src/org/semanticmoby/ref/tools/MetadataRetriever.java 2006/03/15 00:45:49 1.5
@@ -1,29 +1,22 @@
package org.semanticmoby.ref.tools;
-import java.io.InputStream;
-import java.net.URL;
-import java.net.URLConnection;
-import java.util.HashSet;
-import java.util.Iterator;
-import java.util.Set;
-import java.util.StringTokenizer;
-
-import org.semanticmoby.graph.MOBYResource;
-import org.semanticmoby.vocabulary.MOBY;
-
-import com.hp.hpl.jena.rdf.model.Model;
-import com.hp.hpl.jena.rdf.model.ModelFactory;
-import com.hp.hpl.jena.rdf.model.RDFNode;
-import com.hp.hpl.jena.rdf.model.Statement;
-import com.hp.hpl.jena.rdf.model.StmtIterator;
+import java.io.*;
+import java.net.*;
+import java.util.*;
+
+import org.semanticmoby.api.*;
+import org.semanticmoby.graph.*;
+import org.semanticmoby.vocabulary.*;
+
+import com.hp.hpl.jena.rdf.model.*;
/**
- * This class is responsible for retrieving a list of words to associate
- * with a resource. It traverses the resource graph searching for MOBY
- * metadata properties, and builds a set of URLs from the metadata property
- * values. For each of the URLs in the resulting set, it then opens an HTTP
- * connection to the URL and reads all the words it finds, up to a set
- * number of bytes, and adds these words to a set.
+ * This class is responsible for retrieving a list of words to associate with
+ * a resource. It traverses the resource graph searching for moby:metadata
+ * properties, and collects a set of URLs from the property values. For each
+ * URL in the resulting set, it then opens a connection to the URL and reads
+ * all the words it finds, up to a set number of bytes, and adds these words
+ * to a set.
*/
public class MetadataRetriever {
@@ -37,90 +30,84 @@
*/
int maxBytes;
+ /**
+ * The set of keywords being accumulated
+ */
private Set keywords = null;
+ /**
+ * Creates an instance for retrieving up to some number of bytes
+ * from moby:metadata URLs associated with a model
+ * @param model the model containing the moby:metadata statements
+ * @param maxBytes the maximum number of bytes to retrieve
+ */
public MetadataRetriever(Model model, int maxBytes) {
this.model = model;
this.maxBytes = maxBytes;
}
+ /**
+ * Retrieves, if necessary, and returns the set of keywords
+ */
public Set getKeywords() {
if (keywords == null) {
- keywords = retrieveKeywords();
+ retrieveKeywords();
}
return keywords;
}
- private Set retrieveKeywords() {
+ /**
+ * Retrieves and returns the set of keywords from the moby:metadata
+ * URL properties in the model
+ */
+ private void retrieveKeywords() {
- Set words = new HashSet();
+ keywords = new HashSet();
Iterator it = getMetadataURLs().iterator();
int addedSoFar = 0;
while (it.hasNext() && addedSoFar < maxBytes) {
String url = (String) it.next();
- addedSoFar += addKeywords(words, url, addedSoFar);
+ addedSoFar += addKeywords(url, addedSoFar);
}
- return words;
}
- private int addKeywords(Set words, String urlString, int addedSoFar) {
-
+ /**
+ * Adds keywords for the given URL string and returns the number of
+ * bytes added.
+ */
+ private int addKeywords(String urlString, int addedSoFar) {
+
int added = 0;
InputStream stream = null;
+ int remaining = maxBytes - addedSoFar;
try {
+ // Open a connection to the URL from which keywords are to be read
URL url = new URL(urlString);
stream = url.openStream();
-
- int available = stream.available();
- int allowed = maxBytes - addedSoFar;
-
- // If reading all that is available would result in some words not
- // being read, we might read part of a word. In that case, read
- // 100 bytes extra and discard the last word.
- int toRead = 0;
- boolean discardLastWord;
-
- if (allowed >= available) {
- // The number of bytes allowed to be read is at least as
- // many as what is available, so read everything and don't
- // discard the last word
- toRead = available;
- discardLastWord = false;
- } else {
- // The number of bytes available is greater than the number
- // allowed to be read, so read the allowed number plus a
- // few extra in order to hopefully get to a token separator,
- // but discard the last word read.
- toRead = allowed + 20;
- discardLastWord = true;
- }
-
- // Read up to toRead bytes into a buffer
- byte[] buffer = new byte[toRead];
- stream.read(buffer, 0, toRead);
-
- // Use a string tokenizer to break the buffer into words
- StringTokenizer st = new StringTokenizer(new String(buffer));
- while (st.hasMoreTokens()) {
- String word = st.nextToken();
-
- // remove all symbols except for letters, _, -, ., '
- word.toLowerCase();
- word.replaceAll("[^a-z:-_.\']", "");
-
- // Add the word if either we are not discarding the final
- // one, or if this word is not the final one
- if ((! discardLastWord) || st.hasMoreTokens()) {
-
- words.add(word);
- added += word.length();
- }
+
+ // Create a buffer for accumulating characters from the stream
+ StringBuffer sb = new StringBuffer();
+
+ char ch;
+ while (((ch = (char) stream.read()) != -1) && (remaining > 0)) {
+ if (Character.isWhitespace(ch)) {
+ // If we hit a whitespace character, convert the buffer
+ // into a lowercase string, add it to the keyword set, and
+ // reset the buffer
+ keywords.add(sb.toString().toLowerCase());
+ sb = new StringBuffer();
+ } else {
+ // Add the character to the buffer
+ sb.append(ch);
+ added++;
+ remaining--;
+ }
}
} catch (Throwable t) {
- t.printStackTrace();
+ Log.error("Error while reading " + urlString + ": " + t.getMessage());
} finally {
// Regardless of what happens, try to close the stream if it
// was created
@@ -128,7 +115,8 @@
try {
stream.close();
} catch (Throwable t) {
- t.printStackTrace();
+ Log.error("Error closing connection to " +
+ urlString + ": " + t.getMessage());
}
}
}
@@ -153,19 +141,20 @@
// not a literal, so add it inside a try/catch block
urls.add(stmt.getString());
} catch (Throwable t) {
- t.printStackTrace();
+ Log.warn("A moby:metadata property with a " +
+ "non-string value was found");
}
}
return urls;
}
-
+
/*
public static void main(String[] args) throws Throwable {
String resourceURL =
- "http://www.semanticmoby.org/examples/proxies/gramene-query";
+ "http://www.semanticmoby.org/examples/proxies/flybase-genequery";
Model m = ModelFactory.createDefaultModel();
m.read(resourceURL);
- MetadataRetriever r = new MetadataRetriever(m, 15);
+ MetadataRetriever r = new MetadataRetriever(m, 100000);
Set keys = r.getKeywords();
for (Iterator it = keys.iterator(); it.hasNext();) {
System.out.println(it.next());
More information about the MOBY-guts
mailing list