[MOBY-guts] biomoby commit

Gary Schiltz gss at pub.open-bio.org
Wed Mar 15 00:45:49 UTC 2006


gss
Tue Mar 14 19:45:49 EST 2006
Update of /home/repository/moby/s-moby/ref-impl/semanticmoby.org/src/org/semanticmoby/ref/tools
In directory pub.open-bio.org:/tmp/cvs-serv28286/src/org/semanticmoby/ref/tools

Modified Files:
	MetadataRetriever.java 
Log Message:
Simplified, fixed several bugs; converted line endings to Unix

s-moby/ref-impl/semanticmoby.org/src/org/semanticmoby/ref/tools MetadataRetriever.java,1.4,1.5
===================================================================
RCS file: /home/repository/moby/s-moby/ref-impl/semanticmoby.org/src/org/semanticmoby/ref/tools/MetadataRetriever.java,v
retrieving revision 1.4
retrieving revision 1.5
diff -u -r1.4 -r1.5
--- /home/repository/moby/s-moby/ref-impl/semanticmoby.org/src/org/semanticmoby/ref/tools/MetadataRetriever.java	2006/01/31 00:30:08	1.4
+++ /home/repository/moby/s-moby/ref-impl/semanticmoby.org/src/org/semanticmoby/ref/tools/MetadataRetriever.java	2006/03/15 00:45:49	1.5
@@ -1,29 +1,22 @@
 package org.semanticmoby.ref.tools;
 
-import java.io.InputStream;
-import java.net.URL;
-import java.net.URLConnection;
-import java.util.HashSet;
-import java.util.Iterator;
-import java.util.Set;
-import java.util.StringTokenizer;
-
-import org.semanticmoby.graph.MOBYResource;
-import org.semanticmoby.vocabulary.MOBY;
-
-import com.hp.hpl.jena.rdf.model.Model;
-import com.hp.hpl.jena.rdf.model.ModelFactory;
-import com.hp.hpl.jena.rdf.model.RDFNode;
-import com.hp.hpl.jena.rdf.model.Statement;
-import com.hp.hpl.jena.rdf.model.StmtIterator;
+import java.io.*;
+import java.net.*;
+import java.util.*;
+
+import org.semanticmoby.api.*;
+import org.semanticmoby.graph.*;
+import org.semanticmoby.vocabulary.*;
+
+import com.hp.hpl.jena.rdf.model.*;
 
 /**
- * This class is responsible for retrieving a list of words to associate
- * with a resource. It traverses the resource graph searching for MOBY
- * metadata properties, and builds a set of URLs from the metadata property
- * values. For each of the URLs in the resulting set, it then opens an HTTP
- * connection to the URL and reads all the words it finds, up to a set
- * number of bytes, and adds these words to a set.
+ * This class is responsible for retrieving a list of words to associate with
+ * a resource. It traverses the resource graph searching for moby:metadata
+ * properties, and collects a set of URLs from the property values. For each
+ * URL in the resulting set, it then opens a connection to the URL and reads
+ * all the words it finds, up to a set number of bytes, and adds these words
+ * to a set.
  */
 public class MetadataRetriever {
 
@@ -37,90 +30,84 @@
      */
     int maxBytes;
 
+    /**
+     * The set of keywords being accumulated
+     */
     private Set keywords = null;
 
+    /**
+     * Creates an instance for retrieving up to some number of bytes
+     * from moby:metadata URLs associated with a model
+     * @param model the model containing the moby:metadata statements
+     * @param maxBytes the maximum number of bytes to retrieve
+     */
     public MetadataRetriever(Model model, int maxBytes) {
         this.model = model;
         this.maxBytes = maxBytes;
     }
 
+    /**
+     * Retrieves, if necessary, and returns the set of keywords
+     */
     public Set getKeywords() {
         if (keywords == null) {
-            keywords = retrieveKeywords();
+            retrieveKeywords();
         }
         return keywords;
     }
 
-    private Set retrieveKeywords() {
+    /**
+     * Retrieves and returns the set of keywords from the moby:metadata
+     * URL properties in the model
+     */
+    private void retrieveKeywords() {
 
-        Set words = new HashSet();
+    	keywords = new HashSet();
 
         Iterator it = getMetadataURLs().iterator();
         int addedSoFar = 0;
 
         while (it.hasNext() && addedSoFar < maxBytes) {
             String url = (String) it.next();
-            addedSoFar += addKeywords(words, url, addedSoFar);
+            addedSoFar += addKeywords(url, addedSoFar);
         }
-        return words;
     }
 
-    private int addKeywords(Set words, String urlString, int addedSoFar) {
-
+    /**
+     * Adds keywords for the given URL string and returns the number of
+     * bytes added.
+     */
+    private int addKeywords(String urlString, int addedSoFar) {
+    	
         int added = 0;
         InputStream stream = null;
+        int remaining = maxBytes - addedSoFar;
 
         try {
+        	// Open a connection to the URL from which keywords are to be read
             URL url = new URL(urlString);
             stream = url.openStream();
-
-            int available = stream.available();
-            int allowed = maxBytes - addedSoFar;
-
-            // If reading all that is available would result in some words not
-            // being read, we might read part of a word. In that case, read
-            // 100 bytes extra and discard the last word.
-            int toRead = 0;
-            boolean discardLastWord;
-
-            if (allowed >= available) {
-                // The number of bytes allowed to be read is at least as
-                // many as what is available, so read everything and don't
-                // discard the last word
-                toRead = available;
-                discardLastWord = false;
-            } else {
-                // The number of bytes available is greater than the number
-                // allowed to be read, so read the allowed number plus a
-                // few extra in order to hopefully get to a token separator,
-                // but discard the last word read.
-                toRead = allowed + 20;
-                discardLastWord = true;
-            }
-
-            // Read up to toRead bytes into a buffer
-            byte[] buffer = new byte[toRead];
-            stream.read(buffer, 0, toRead);
-
-            // Use a string tokenizer to break the buffer into words
-            StringTokenizer st = new StringTokenizer(new String(buffer));
-            while (st.hasMoreTokens()) {
-                String word = st.nextToken();
-
-                // remove all symbols except for letters, _, -, ., '
-                word.toLowerCase();
-                word.replaceAll("[^a-z:-_.\']", "");
-
-                // Add the word if either we are not discarding the final
-                // one, or if this word is not the final one
-                if ((! discardLastWord) || st.hasMoreTokens()) {
-
-                    words.add(word);
-                    added += word.length();
-                }
+            
+            // Create a buffer for accumulating characters from the stream
+            StringBuffer sb = new StringBuffer();
+            
+            int c;
+            while (((c = stream.read()) != -1) && (remaining > 0)) {
+                // Compare the raw int to -1 before casting, so that end of
+                // stream is detected correctly
+                char ch = (char) c;
+                if (Character.isWhitespace(ch)) {
+                    // If we hit a whitespace character, convert the buffer
+                    // into a lowercase string, add it to the keyword set,
+                    // and reset the buffer
+                    keywords.add(sb.toString().toLowerCase());
+                    sb = new StringBuffer();
+                } else {
+                    // Append the character to the buffer
+                    sb.append(ch);
+                    added++;
+                    remaining--;
+                }
             }
         } catch (Throwable t) {
-            t.printStackTrace();
+        	Log.error("Error while reading " + urlString + ": " + t.getMessage());
         } finally {
             // Regardless of what happens, try to close the stream if it
             // was created
@@ -128,7 +115,8 @@
                 try {
                     stream.close();
                 } catch (Throwable t) {
-                    t.printStackTrace();
+                	Log.error("Error closing connection to " +
+                			  urlString + ": " + t.getMessage());
                 }
             }
         }
@@ -153,19 +141,20 @@
                 // not a literal, so add it inside a try/catch block
                 urls.add(stmt.getString());
             } catch (Throwable t) {
-                t.printStackTrace();
+            	Log.warn("A moby:metadata property with a " +
+            			 "non-string value was found");
             }
         }
         return urls;
     }
-
+    
     /*
     public static void main(String[] args) throws Throwable {
         String resourceURL =
-            "http://www.semanticmoby.org/examples/proxies/gramene-query";
+            "http://www.semanticmoby.org/examples/proxies/flybase-genequery";
         Model m = ModelFactory.createDefaultModel();
         m.read(resourceURL);
-        MetadataRetriever r = new MetadataRetriever(m, 15);
+        MetadataRetriever r = new MetadataRetriever(m, 100000);
         Set keys = r.getKeywords();
         for (Iterator it = keys.iterator(); it.hasNext();) {
             System.out.println(it.next());




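For anyone wanting to try the retriever outside of the commented-out main method above, the sketch below shows the same usage as a standalone class. It is only an illustration: the class name MetadataRetrieverExample is made up here, and the proxy URL and the 100000-byte limit are simply the values that appear in the diff.

import java.util.Iterator;
import java.util.Set;

import com.hp.hpl.jena.rdf.model.Model;
import com.hp.hpl.jena.rdf.model.ModelFactory;

import org.semanticmoby.ref.tools.MetadataRetriever;

public class MetadataRetrieverExample {

    public static void main(String[] args) throws Throwable {
        // Proxy description to read; the URL is the one used in the
        // commented-out main method in the diff above
        String resourceURL =
            "http://www.semanticmoby.org/examples/proxies/flybase-genequery";

        // Load the resource description into a Jena model
        Model m = ModelFactory.createDefaultModel();
        m.read(resourceURL);

        // Collect up to 100000 bytes of keywords from the moby:metadata URLs
        MetadataRetriever r = new MetadataRetriever(m, 100000);
        Set keywords = r.getKeywords();
        for (Iterator it = keywords.iterator(); it.hasNext();) {
            System.out.println(it.next());
        }
    }
}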