incubator-droids-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From thors...@apache.org
Subject svn commit: r813386 - in /incubator/droids/trunk/droids-solr: ./ example/conf/ src/main/java/org/apache/droids/solr/ src/test/java/org/apache/droids/solr/
Date Thu, 10 Sep 2009 11:53:45 GMT
Author: thorsten
Date: Thu Sep 10 11:53:44 2009
New Revision: 813386

URL: http://svn.apache.org/viewvc?rev=813386&view=rev
Log:
Droids-62 Customizable solr handler. 
due-to Bertil Chapuis.
thanks Bertil Chapuis

Added:
    incubator/droids/trunk/droids-solr/src/main/java/org/apache/droids/solr/AdvancedSolrHandler.java
    incubator/droids/trunk/droids-solr/src/test/java/org/apache/droids/solr/AdvancedSolrHandleTest.java
    incubator/droids/trunk/droids-solr/src/test/java/org/apache/droids/solr/MockContentEntity.java
Modified:
    incubator/droids/trunk/droids-solr/example/conf/schema.xml
    incubator/droids/trunk/droids-solr/pom.xml

Modified: incubator/droids/trunk/droids-solr/example/conf/schema.xml
URL: http://svn.apache.org/viewvc/incubator/droids/trunk/droids-solr/example/conf/schema.xml?rev=813386&r1=813385&r2=813386&view=diff
==============================================================================
--- incubator/droids/trunk/droids-solr/example/conf/schema.xml (original)
+++ incubator/droids/trunk/droids-solr/example/conf/schema.xml Thu Sep 10 11:53:44 2009
@@ -62,7 +62,10 @@
    <field name="id"      type="string" indexed="true" stored="true" required="true" />

    <field name="name"    type="string" indexed="true" stored="true" />
    <field name="host"    type="string" indexed="true" stored="true" />
-   <field name="content" type="text"   indexed="true" stored="false" />   
+   <field name="mime"    type="string" indexed="true" stored="true" />
+   <field name="content" type="text"   indexed="true" stored="true" />
+   <field name="selector" type="text"   indexed="true" stored="true" />
+
  </fields>
 
  <!-- Field to use to determine and enforce document uniqueness. 

Modified: incubator/droids/trunk/droids-solr/pom.xml
URL: http://svn.apache.org/viewvc/incubator/droids/trunk/droids-solr/pom.xml?rev=813386&r1=813385&r2=813386&view=diff
==============================================================================
--- incubator/droids/trunk/droids-solr/pom.xml (original)
+++ incubator/droids/trunk/droids-solr/pom.xml Thu Sep 10 11:53:44 2009
@@ -41,6 +41,17 @@
       <version>${pom.version}</version>
     </dependency>
     <dependency>
+        <groupId>nekohtml</groupId>
+        <artifactId>nekohtml</artifactId>
+        <version>${nekohtml.version}</version>
+        <exclusions>
+          <exclusion>
+            <groupId>xml-apis</groupId>
+            <artifactId>xml-apis</artifactId>
+          </exclusion>
+        </exclusions>
+      </dependency>
+    <dependency>
       <groupId>org.apache.solr</groupId>
       <artifactId>solr-solrj</artifactId>
       <version>1.3.0</version>

Added: incubator/droids/trunk/droids-solr/src/main/java/org/apache/droids/solr/AdvancedSolrHandler.java
URL: http://svn.apache.org/viewvc/incubator/droids/trunk/droids-solr/src/main/java/org/apache/droids/solr/AdvancedSolrHandler.java?rev=813386&view=auto
==============================================================================
--- incubator/droids/trunk/droids-solr/src/main/java/org/apache/droids/solr/AdvancedSolrHandler.java (added)
+++ incubator/droids/trunk/droids-solr/src/main/java/org/apache/droids/solr/AdvancedSolrHandler.java Thu Sep 10 11:53:44 2009
@@ -0,0 +1,431 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.droids.solr;
+
+import java.io.IOException;
+import java.net.URI;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.Set;
+import java.util.Stack;
+import java.util.Map.Entry;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.droids.api.ContentEntity;
+import org.apache.droids.api.Handler;
+import org.apache.droids.exception.DroidsException;
+import org.apache.solr.client.solrj.SolrServer;
+import org.apache.solr.client.solrj.SolrServerException;
+import org.apache.solr.common.SolrInputDocument;
+import org.cyberneko.html.parsers.SAXParser;
+import org.xml.sax.Attributes;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.InputSource;
+import org.xml.sax.Locator;
+import org.xml.sax.SAXException;
+import org.xml.sax.SAXNotRecognizedException;
+import org.xml.sax.SAXNotSupportedException;
+
+/**
+ * A Droids Handler that lets callers specify selectors for storing
+ * parts of a document in a Solr index.
+ * 
+ * A selector is an Entry made of a key, which names the Solr field, and
+ * a value, which is a path selector.
+ * 
+ * Path selectors are always absolute and support zero-based indexes.
+ * 
+ * Here are some examples:
+ * 	- /html[0]/div[0]
+ * 	- /html[0]/div[0]/p[0]
+ * 	- /html[0]/div[1]/p[2]
+ */
+public class AdvancedSolrHandler implements Handler {
+
+	/**
+	 * A solr server
+	 */
+	private SolrServer server;
+	
+	/**
+	 * The selectors allow to save specific parts of the document in the index.
+	 * The HashMap's key matches the Solr field.
+	 * The HashMap's value is an absolute path corresponding to an element.
+	 */
+	private HashMap<String, String> selectors;
+	
+	/**
+	 * A content handler
+	 */
+	private SolrContentHandler contentHandler = new SolrContentHandler(selectors);
+
+	/**
+	 * An HTML parser
+	 */
+	private SAXParser parser;
+	
+	/**
+	 * @return the current solr server
+	 */
+	public SolrServer getServer() {
+		return server;
+	}
+
+	/**
+	 * @param solr a solr server 
+	 */
+	public void setServer(SolrServer solr) {
+		this.server = solr;
+	}
+
+	/**
+	 * @return the current path selectors
+	 */
+	public HashMap<String, String> getSelectors() {
+		return selectors;
+	}
+
+	/**
+	 * @param selectors a hash map containing path selectors
+	 */
+	public void setSelectors(HashMap<String, String> selectors) {
+		contentHandler.initPatterns(selectors);
+		this.selectors = selectors;
+	}
+
+	/* 
+	 * @see org.apache.droids.api.Handler#handle(java.net.URI, org.apache.droids.api.ContentEntity)
+	 */
+	public void handle(URI uri, ContentEntity entity) throws IOException, DroidsException {
+		SolrInputDocument doc = createSolrInputDocument(uri, entity);
+		try {
+			server.add(doc);
+		} catch (SolrServerException e) {
+			throw new DroidsException(e);
+		}
+	}
+
+	/**
+	 * Generates a SolrInputDocument from a URI and a ContentEntity,
+	 * which together describe the document to be saved in the index.
+	 * 
+	 * @param uri the URI of the document
+	 * @param entity the content entity of the document
+	 * @return the Solr input document built from the URI and the entity
+	 */
+	private SolrInputDocument createSolrInputDocument(URI uri, ContentEntity entity) {
+		SolrInputDocument doc = new SolrInputDocument();
+
+		doc.setField("id", uri.getPath());
+		doc.setField("name", uri.toASCIIString());
+		doc.setField("host", uri.getHost());
+		doc.setField("mime", entity.getMimeType());
+		doc.setField("content", entity.getParse().getText());
+		
+		if (parser == null) initParser();
+		
+		if (selectors.size() > 0) {
+			contentHandler.initDocument(doc);
+			try {
+				parser.setContentHandler(contentHandler);
+				parser.parse(new InputSource(entity.obtainContent()));
+			} catch (IOException e) {
+				e.printStackTrace();
+			} catch (SAXException e) {
+				e.printStackTrace();
+			}
+		}
+		
+		return doc;
+	}
+
+	/**
+	 * Initializes a CyberNeko HTML parser configured to report element names in lower case.
+	 * 
+	 * @return the newly created parser
+	 */
+	private SAXParser initParser() {
+		parser = new SAXParser();
+		try {
+			parser.setProperty("http://cyberneko.org/html/properties/names/elems", "lower");
+			parser.setFeature("http://cyberneko.org/html/features/balance-tags/ignore-outside-content", false);
+			parser.setFeature("http://cyberneko.org/html/features/balance-tags/document-fragment", true);
+			parser.setFeature("http://cyberneko.org/html/features/report-errors", false);
+		} catch (SAXNotRecognizedException ex) {
+			throw new IllegalStateException(ex);
+		} catch (SAXNotSupportedException ex) {
+			throw new IllegalStateException(ex);
+		}
+		return parser;
+	}
+
+	/**
+	 * A class that implements a SAX ContentHandler and uses patterns to record document
+	 * elements in a SolrInputDocument.
+	 */
+	private class SolrContentHandler implements ContentHandler {
+
+		private SolrInputDocument doc;
+		
+		/**
+		 * the patterns which match element's path
+		 */
+		private HashMap<String, Pattern> patterns = new HashMap<String, Pattern>();
+
+		/**
+		 * stores the values which match the patterns
+		 */
+		private HashMap<String, String> valueRecorders = new HashMap<String, String>();
+
+		/**
+		 * A two dimensional stack used to store the current path
+		 */
+		private Stack<Stack<String>> path = new Stack<Stack<String>>();
+
+		private Integer level = 0;
+
+		private Integer lastLevel = 0;
+		
+		/**
+		 * Constructor
+		 * 
+		 * @param selectors a HashMap which contains selectors
+		 */
+		public SolrContentHandler(HashMap<String, String> selectors) {
+			initPatterns(selectors);
+		}
+		
+		/**
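+		 * Rebuilds the patterns from the given selectors.
+		 * @param selectors path selectors keyed by Solr field name; a null map leaves the patterns unchanged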
+		 */
+		public void initPatterns(HashMap<String, String> selectors) {
+			if (selectors != null) {
+				
+				// clear the current patterns
+				patterns.clear();
+				
+				// pattern for the element and its index
+				final Pattern p = Pattern.compile("^([a-zA-Z:-_\\.]+)(\\[([0-9]*)\\]){0,1}$");
+	
+				// for each selector
+				Set<String> keys = selectors.keySet();
+				for (String key : keys) {
+					// creating a pattern
+					String regex = "^";
+					String selector = selectors.get(key);
+					String[] elements = selector.split("/");
+					// which match all the elements and their respective indices
+					for (String element : elements) {
+						Matcher m = p.matcher(element);
+						if (m.find()) {
+							String elementName = m.group(1);
+							String elementIndex = m.group(3);
+							regex += "/" + elementName;
+							if (elementIndex == null) {
+								regex += "\\[[0-9]*\\]";
+							} else {
+								regex += "\\[" + elementIndex + "\\]";
+							}
+						}
+					}
+					regex += "$";
+	
+					// storing the new Pattern
+					Pattern pattern = Pattern.compile(regex);
+					patterns.put(key, pattern);
+				}
+			}
+		}
+		
+		/**
+		 * Initialization of the document used for indexation
+		 * 
+		 * @param doc a solr document
+		 */
+		public void initDocument(SolrInputDocument doc) {
+			this.doc = doc;
+		}
+
+		/* 
+		 * @see org.xml.sax.ContentHandler#startDocument()
+		 */
+		@Override
+		public void startDocument() throws SAXException {
+			level = 0;
+		}
+
+		/* 
+		 * @see org.xml.sax.ContentHandler#endDocument()
+		 */
+		@Override
+		public void endDocument() throws SAXException {
+			level = 0;
+		}
+
+		/* 
+		 * @see org.xml.sax.ContentHandler#startElement(java.lang.String, java.lang.String, java.lang.String, org.xml.sax.Attributes)
+		 */
+		@Override
+		public void startElement(String uri, String localName, String qName, Attributes atts) throws SAXException {
+			// set the level properties.
+			level++;
+
+			// go down in the hierarchy of elements.
+			if (level == lastLevel && path.size() > 0) {
+				path.get(path.size() - 1).add(localName);
+			} else if (level > lastLevel) {
+				Stack<String> s = new Stack<String>();
+				s.add(localName);
+				path.add(s);
+			}
+
+			// if the path matches a pattern, starts recording the matching content.
+			String path = getCurrentPath();
+			Iterator<Entry<String, Pattern>> entries = patterns.entrySet().iterator();
+			while (entries.hasNext()) {
+				Entry<String, Pattern> entry = entries.next();
+				String patternName = entry.getKey();
+				Pattern patternValue = entry.getValue();
+				Matcher matcher = patternValue.matcher(path);
+				if (matcher.find()) {
+					valueRecorders.put(patternName, "");
+				}
+			}
+
+		}
+
+		/* 
+		 * @see org.xml.sax.ContentHandler#endElement(java.lang.String, java.lang.String, java.lang.String)
+		 */
+		@Override
+		public void endElement(String uri, String localName, String qName) throws SAXException {
+			// check if we climb in the hierarchy.
+			if (level < lastLevel && path.size() > 0) { 
+				path.pop();
+			}
+			
+			// set the level properties.
+			lastLevel = level;
+			level--;
+			
+			// if the path matches a selector, stores the matching content.
+			String path = getCurrentPath();
+			Iterator<Entry<String, Pattern>> entries = patterns.entrySet().iterator();
+			while (entries.hasNext()) {
+				Entry<String, Pattern> entry = entries.next();
+				String patternName = entry.getKey();
+				Pattern patternValue = entry.getValue();
+				Matcher matcher = patternValue.matcher(path);
+				if (matcher.find()) {
+					// add the matching content to the solr document.
+					String value = valueRecorders.remove(patternName);
+					doc.addField(patternName, value);
+				}
+			}
+		}
+
+		/* 
+		 * @see org.xml.sax.ContentHandler#characters(char[], int, int)
+		 */
+		@Override
+		public void characters(char[] ch, int start, int length) throws SAXException {
+			// store the content in each recorder
+			Set<String> keys = valueRecorders.keySet();
+			for (String key : keys) {
+				String recorder = valueRecorders.get(key);
+				recorder += new String(Arrays.copyOfRange(ch, start, start + length));
+				valueRecorders.put(key, recorder);
+			}
+		}
+		
+		/* 
+		 * @see org.xml.sax.ContentHandler#ignorableWhitespace(char[], int, int)
+		 */
+		@Override
+		public void ignorableWhitespace(char[] ch, int start, int length) throws SAXException {
+			characters(ch, start, length);
+		}
+
+		/* 
+		 * @see org.xml.sax.ContentHandler#startPrefixMapping(java.lang.String, java.lang.String)
+		 */
+		@Override
+		public void startPrefixMapping(String prefix, String uri) throws SAXException {
+
+		}
+
+		/* 
+		 * @see org.xml.sax.ContentHandler#endPrefixMapping(java.lang.String)
+		 */
+		@Override
+		public void endPrefixMapping(String prefix) throws SAXException {
+
+		}
+
+		/* 
+		 * @see org.xml.sax.ContentHandler#processingInstruction(java.lang.String, java.lang.String)
+		 */
+		@Override
+		public void processingInstruction(String target, String data) throws SAXException {
+
+		}
+
+		/* 
+		 * @see org.xml.sax.ContentHandler#setDocumentLocator(org.xml.sax.Locator)
+		 */
+		@Override
+		public void setDocumentLocator(Locator locator) {
+
+		}
+
+		/* 
+		 * @see org.xml.sax.ContentHandler#skippedEntity(java.lang.String)
+		 */
+		@Override
+		public void skippedEntity(String name) throws SAXException {
+
+		}
+
+		/**
+		 * Computes the current path by crossing the path stack.
+		 * 
+		 * @return a path
+		 */
+		private String getCurrentPath() {
+			String p = "";
+
+			// find the element at each level
+			for (Stack<String> h : path) {
+				String element = h.get(h.size() - 1);
+				Integer index = -1;
+				// find the element's index 
+				for (String e : h) {
+					if (e.equals(element)) {
+						index++;
+					}
+				}
+				// path with the index at each level
+				p += "/" + element + "[" + index + "]";
+			}
+			return p;
+		}
+
+	}
+}

Added: incubator/droids/trunk/droids-solr/src/test/java/org/apache/droids/solr/AdvancedSolrHandleTest.java
URL: http://svn.apache.org/viewvc/incubator/droids/trunk/droids-solr/src/test/java/org/apache/droids/solr/AdvancedSolrHandleTest.java?rev=813386&view=auto
==============================================================================
--- incubator/droids/trunk/droids-solr/src/test/java/org/apache/droids/solr/AdvancedSolrHandleTest.java (added)
+++ incubator/droids/trunk/droids-solr/src/test/java/org/apache/droids/solr/AdvancedSolrHandleTest.java Thu Sep 10 11:53:44 2009
@@ -0,0 +1,124 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.droids.solr;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.net.URI;
+import java.net.URISyntaxException;
+import java.util.HashMap;
+
+import junit.framework.TestCase;
+
+import org.apache.droids.api.ContentEntity;
+import org.apache.droids.api.Parse;
+import org.apache.droids.exception.DroidsException;
+import org.apache.solr.client.solrj.SolrQuery;
+import org.apache.solr.client.solrj.SolrServer;
+import org.apache.solr.client.solrj.SolrServerException;
+import org.apache.solr.client.solrj.embedded.EmbeddedSolrServer;
+import org.apache.solr.client.solrj.response.QueryResponse;
+import org.apache.solr.common.SolrDocument;
+import org.apache.solr.common.SolrInputDocument;
+import org.apache.solr.core.CoreContainer;
+import org.apache.solr.core.CoreDescriptor;
+import org.apache.solr.core.SolrCore;
+import org.apache.solr.core.SolrResourceLoader;
+
+public class AdvancedSolrHandleTest extends TestCase {
+
+	SolrServer solr;
+	
+	String simpleHtmlPage = "" +
+		"<html>" +
+			"<body>" +
+				"<div>" +
+					"<p>p0</p>" +
+					"<p>p1</p>" +
+					"<p>p2</p>" +
+				"</div>" +
+				"<div>" +
+					"<p>p3</p>" +
+					"<p>p4</p>" +
+					"<p>p5</p>" +
+				"</div>" +
+			"</body>" +
+		"</html>";
+
+	protected String getSolrHome() {
+		return "example";
+	}
+
+	@Override
+	public void setUp() throws Exception {
+		super.setUp();
+
+		SolrResourceLoader loader = new SolrResourceLoader(getSolrHome());
+		CoreContainer container = new CoreContainer(loader);
+		CoreDescriptor descriptor = new CoreDescriptor(container, "cname", ".");
+		SolrCore core = container.create(descriptor);
+		container.register(core.getName(), core, false);
+
+		solr = new EmbeddedSolrServer(container, core.getName());
+	}
+
+	public void tearDown() throws Exception {
+		// remove everything....
+		solr.deleteByQuery("*:*");
+		solr.commit();
+	}
+	
+	public void performSelection(String html, String field, String selector, String expectedValue) throws IOException, DroidsException, URISyntaxException, SolrServerException {
+		AdvancedSolrHandler handler = new AdvancedSolrHandler();
+		handler.setServer(solr);
+		
+		HashMap<String, String> selectors = new HashMap<String, String>();
+		selectors.put(field, selector);
+		handler.setSelectors(selectors);
+		
+		MockContentEntity contentEntity = new MockContentEntity();
+		contentEntity.setCharset("UTF-8");
+		contentEntity.setMimeType("text/html");
+		contentEntity.setText(html);
+		
+		handler.handle(new URI("http://localhost/"), contentEntity);
+		solr.commit();
+		
+		SolrQuery query = new SolrQuery();
+		query.setQuery("*:*");
+		query.setFields(field);
+		QueryResponse response = solr.query(query);
+		
+		SolrDocument doc = response.getResults().iterator().next();
+		String value = (String)doc.getFieldValue(field);
+		
+		assertEquals(expectedValue, value);
+	}
+	
+	public void testSelectorA() throws Exception {
+		performSelection(simpleHtmlPage, "selector", "/html[0]/body[0]/div[0]/p[0]", "p0");
+	}
+	
+	public void testSelectorB() throws Exception {
+		performSelection(simpleHtmlPage, "selector", "/html[0]/body[0]/div[1]/p[1]", "p4");
+	}
+	
+	public void testSelectorC() throws Exception {
+		performSelection(simpleHtmlPage, "selector", "/html[0]/body[0]/div[1]", "p3p4p5");
+	}
+
+}
\ No newline at end of file

Added: incubator/droids/trunk/droids-solr/src/test/java/org/apache/droids/solr/MockContentEntity.java
URL: http://svn.apache.org/viewvc/incubator/droids/trunk/droids-solr/src/test/java/org/apache/droids/solr/MockContentEntity.java?rev=813386&view=auto
==============================================================================
--- incubator/droids/trunk/droids-solr/src/test/java/org/apache/droids/solr/MockContentEntity.java (added)
+++ incubator/droids/trunk/droids-solr/src/test/java/org/apache/droids/solr/MockContentEntity.java Thu Sep 10 11:53:44 2009
@@ -0,0 +1,68 @@
+package org.apache.droids.solr;
+
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.ArrayList;
+import java.util.Collection;
+
+import org.apache.droids.api.ContentEntity;
+import org.apache.droids.api.Link;
+import org.apache.droids.api.Parse;
+import org.apache.droids.parse.ParseImpl;
+
+public class MockContentEntity implements ContentEntity {
+
+	private String text;
+	
+	private String charset = "UTF-8";
+	
+	private String mimeType = "text/html";
+	
+	private Collection<Link> outlinks = new ArrayList<Link>();
+
+	public Collection<Link> getOutlinks() {
+		return outlinks;
+	}
+
+	public void setOutlinks(Collection<Link> outlinks) {
+		this.outlinks = outlinks;
+	}
+
+	public String getText() {
+		return text;
+	}
+
+	public void setText(String text) {
+		this.text = text;
+	}
+	
+	@Override
+	public String getCharset() {
+		return charset;
+	}
+
+	public void setCharset(String charset) {
+		this.charset = charset;
+	}
+	
+	@Override
+	public String getMimeType() {
+		return mimeType;
+	}
+	
+	public void setMimeType(String mimeType) {
+		this.mimeType = mimeType;
+	}
+
+	@Override
+	public Parse getParse() {
+		return new ParseImpl(text, outlinks);
+	}
+
+	@Override
+	public InputStream obtainContent() throws IOException {
+		return new ByteArrayInputStream(text.getBytes());
+	}
+
+}



Mime
View raw message