From droids-commits-return-557-apmail-incubator-droids-commits-archive=incubator.apache.org@incubator.apache.org Thu Mar 28 11:22:41 2013 Return-Path: X-Original-To: apmail-incubator-droids-commits-archive@minotaur.apache.org Delivered-To: apmail-incubator-droids-commits-archive@minotaur.apache.org Received: from mail.apache.org (hermes.apache.org [140.211.11.3]) by minotaur.apache.org (Postfix) with SMTP id 41691FD4F for ; Thu, 28 Mar 2013 11:22:41 +0000 (UTC) Received: (qmail 83703 invoked by uid 500); 28 Mar 2013 11:20:48 -0000 Delivered-To: apmail-incubator-droids-commits-archive@incubator.apache.org Received: (qmail 83650 invoked by uid 500); 28 Mar 2013 11:20:47 -0000 Mailing-List: contact droids-commits-help@incubator.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: droids-dev@incubator.apache.org Delivered-To: mailing list droids-commits@incubator.apache.org Received: (qmail 83623 invoked by uid 99); 28 Mar 2013 11:20:46 -0000 Received: from athena.apache.org (HELO athena.apache.org) (140.211.11.136) by apache.org (qpsmtpd/0.29) with ESMTP; Thu, 28 Mar 2013 11:20:46 +0000 X-ASF-Spam-Status: No, hits=-2000.0 required=5.0 tests=ALL_TRUSTED X-Spam-Check-By: apache.org Received: from [140.211.11.4] (HELO eris.apache.org) (140.211.11.4) by apache.org (qpsmtpd/0.29) with ESMTP; Thu, 28 Mar 2013 11:20:32 +0000 Received: from eris.apache.org (localhost [127.0.0.1]) by eris.apache.org (Postfix) with ESMTP id 30536238889B; Thu, 28 Mar 2013 11:18:11 +0000 (UTC) Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit Subject: svn commit: r1462042 - in /incubator/droids/branches/0.2.x-cleanup: droids-nekohtml/ droids-nekohtml/src/ droids-nekohtml/src/main/ droids-nekohtml/src/main/java/ droids-nekohtml/src/main/java/org/ droids-nekohtml/src/main/java/org/apache/ droids-nekoh... Date: Thu, 28 Mar 2013 11:18:11 -0000 To: droids-commits@incubator.apache.org From: tobr@apache.org X-Mailer: svnmailer-1.0.8-patched Message-Id: <20130328111811.30536238889B@eris.apache.org> X-Virus-Checked: Checked by ClamAV on apache.org Author: tobr Date: Thu Mar 28 11:18:10 2013 New Revision: 1462042 URL: http://svn.apache.org/r1462042 Log: added module for extracting data given by selectors Added: incubator/droids/branches/0.2.x-cleanup/droids-nekohtml/ incubator/droids/branches/0.2.x-cleanup/droids-nekohtml/pom.xml (with props) incubator/droids/branches/0.2.x-cleanup/droids-nekohtml/src/ incubator/droids/branches/0.2.x-cleanup/droids-nekohtml/src/main/ incubator/droids/branches/0.2.x-cleanup/droids-nekohtml/src/main/java/ incubator/droids/branches/0.2.x-cleanup/droids-nekohtml/src/main/java/org/ incubator/droids/branches/0.2.x-cleanup/droids-nekohtml/src/main/java/org/apache/ incubator/droids/branches/0.2.x-cleanup/droids-nekohtml/src/main/java/org/apache/droids/ incubator/droids/branches/0.2.x-cleanup/droids-nekohtml/src/main/java/org/apache/droids/nekohtml/ incubator/droids/branches/0.2.x-cleanup/droids-nekohtml/src/main/java/org/apache/droids/nekohtml/NekoHtmlParser.java - copied, changed from r1461977, incubator/droids/branches/0.2.x-cleanup/droids-solr/src/main/java/org/apache/droids/solr/AdvancedSolrHandler.java incubator/droids/branches/0.2.x-cleanup/droids-nekohtml/src/test/ incubator/droids/branches/0.2.x-cleanup/droids-nekohtml/src/test/java/ incubator/droids/branches/0.2.x-cleanup/droids-nekohtml/src/test/java/org/ incubator/droids/branches/0.2.x-cleanup/droids-nekohtml/src/test/java/org/apache/ incubator/droids/branches/0.2.x-cleanup/droids-nekohtml/src/test/java/org/apache/droids/ incubator/droids/branches/0.2.x-cleanup/droids-nekohtml/src/test/java/org/apache/droids/nekohtml/ incubator/droids/branches/0.2.x-cleanup/droids-nekohtml/src/test/java/org/apache/droids/nekohtml/NekoHtmlParserTest.java - copied, changed from r1461977, incubator/droids/branches/0.2.x-cleanup/droids-solr/src/test/java/org/apache/droids/solr/AdvancedSolrHandleTest.java Removed: incubator/droids/branches/0.2.x-cleanup/droids-solr/src/main/java/org/apache/droids/solr/AdvancedSolrHandler.java incubator/droids/branches/0.2.x-cleanup/droids-solr/src/test/java/org/apache/droids/solr/AdvancedSolrHandleTest.java Added: incubator/droids/branches/0.2.x-cleanup/droids-nekohtml/pom.xml URL: http://svn.apache.org/viewvc/incubator/droids/branches/0.2.x-cleanup/droids-nekohtml/pom.xml?rev=1462042&view=auto ============================================================================== --- incubator/droids/branches/0.2.x-cleanup/droids-nekohtml/pom.xml (added) +++ incubator/droids/branches/0.2.x-cleanup/droids-nekohtml/pom.xml Thu Mar 28 11:18:10 2013 @@ -0,0 +1,62 @@ + + + + 4.0.0 + jar + + + org.apache.droids + droids + 0.3.0-incubating-SNAPSHOT + + + droids-nekohtml + APACHE DROIDS NEKOHTML PARSER + + + 1.9.18 + + + + + org.apache.droids + droids-core + ${project.version} + + + net.sourceforge.nekohtml + nekohtml + ${nekohtml.version} + + + + junit + junit + ${junit.version} + test + + + ch.qos.logback + logback-classic + ${logback.version} + test + + + Propchange: incubator/droids/branches/0.2.x-cleanup/droids-nekohtml/pom.xml ------------------------------------------------------------------------------ svn:eol-style = native Propchange: incubator/droids/branches/0.2.x-cleanup/droids-nekohtml/pom.xml ------------------------------------------------------------------------------ svn:keywords = Author Date Id Revision Propchange: incubator/droids/branches/0.2.x-cleanup/droids-nekohtml/pom.xml ------------------------------------------------------------------------------ svn:mime-type = text/xml Copied: incubator/droids/branches/0.2.x-cleanup/droids-nekohtml/src/main/java/org/apache/droids/nekohtml/NekoHtmlParser.java (from r1461977, incubator/droids/branches/0.2.x-cleanup/droids-solr/src/main/java/org/apache/droids/solr/AdvancedSolrHandler.java) URL: http://svn.apache.org/viewvc/incubator/droids/branches/0.2.x-cleanup/droids-nekohtml/src/main/java/org/apache/droids/nekohtml/NekoHtmlParser.java?p2=incubator/droids/branches/0.2.x-cleanup/droids-nekohtml/src/main/java/org/apache/droids/nekohtml/NekoHtmlParser.java&p1=incubator/droids/branches/0.2.x-cleanup/droids-solr/src/main/java/org/apache/droids/solr/AdvancedSolrHandler.java&r1=1461977&r2=1462042&rev=1462042&view=diff ============================================================================== --- incubator/droids/branches/0.2.x-cleanup/droids-solr/src/main/java/org/apache/droids/solr/AdvancedSolrHandler.java (original) +++ incubator/droids/branches/0.2.x-cleanup/droids-nekohtml/src/main/java/org/apache/droids/nekohtml/NekoHtmlParser.java Thu Mar 28 11:18:10 2013 @@ -14,7 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.droids.solr; +package org.apache.droids.nekohtml; import java.io.IOException; import java.util.Arrays; @@ -28,10 +28,8 @@ import java.util.regex.Matcher; import java.util.regex.Pattern; import org.apache.droids.core.DroidsException; +import org.apache.droids.core.Parser; import org.apache.droids.core.Task; -import org.apache.solr.client.solrj.SolrServer; -import org.apache.solr.client.solrj.SolrServerException; -import org.apache.solr.common.SolrInputDocument; import org.cyberneko.html.parsers.SAXParser; import org.xml.sax.Attributes; import org.xml.sax.ContentHandler; @@ -42,11 +40,11 @@ import org.xml.sax.SAXNotRecognizedExcep import org.xml.sax.SAXNotSupportedException; /** - * A Droids Handler which allows to specify selectors to store - * documents' parts in a Solr index. + * A Droids Parser which allows to specify selectors to extract + * documents parts. *

- * A selector is an Entry made of a key which matches the solr fiel and - * of a value which correspond to a path selector. + * A selector is an Entry made of a key and + * a value which correspond to a path selector. *

* Path selectors are always absolute and supports indexes. *

@@ -55,18 +53,9 @@ import org.xml.sax.SAXNotSupportedExcept * - /html[0]/div[0]/p[0] * - /html[0]/div[1]/p[2] */ -public class AdvancedSolrHandler extends SolrHandler { - /** - * The selectors allow to save specific parts of the document in the index. - * The HashMap's key matches the Solr field. - * The HashMap's value is an absolute path corresponding to an element. - */ +public class NekoHtmlParser implements Parser { private Map selectors; - - /** - * A content handler - */ - private SolrContentHandler contentHandler = new SolrContentHandler(selectors); + private Map patterns; /** * An HTML parser @@ -74,62 +63,23 @@ public class AdvancedSolrHandler extends private SAXParser parser; - public AdvancedSolrHandler(String solrUrl) { - super(solrUrl); - } - - public AdvancedSolrHandler(SolrServer solrServer) { - super(solrServer); + public NekoHtmlParser() { + this(new HashMap()); } - - /** - * @return the current path selectors - */ - public Map getSelectors() { - return selectors; + public NekoHtmlParser(HashMap selectors) { + this.patterns = new HashMap(); + setSelectors(selectors); + if (parser == null) initParser(); } - /** - * @param selectors an hash map containing path selectors - */ - public void setSelectors(HashMap selectors) { - contentHandler.initPatterns(selectors); - this.selectors = selectors; - } - /* - * @see org.apache.droids.api.Handler#handle(java.net.URI, org.apache.droids.api.DroidsContentEntity) - */ @Override - public void handle(Task task) throws DroidsException, IOException { - SolrInputDocument doc = createSolrInputDocument(task); - try { - getSolrServer().add(doc); - } catch (SolrServerException e) { - throw new DroidsException(e); - } - } - - /** - * Generates a SolrInputDocument from an URI and a DroidsContentEntity - * which correspond to the document which need to be saved in the index - * - * @param task the task - * @return - */ - private SolrInputDocument createSolrInputDocument(Task task) { - SolrInputDocument doc = new SolrInputDocument(); - - doc.setField("id", task.getURI().getPath()); - doc.setField("name", task.getURI().toASCIIString()); - doc.setField("contentType", task.getContentEntity().getContentType()); - doc.setField("content", task.getParserData().getText()); - - if (parser == null) initParser(); + public void parse(T task) throws DroidsException { + NekoContentHandler contentHandler; - if (!selectors.isEmpty()) { - contentHandler.initDocument(doc); + if (!patterns.isEmpty()) { + contentHandler = new NekoContentHandler(task, patterns); try { parser.setContentHandler(contentHandler); parser.parse(new InputSource(task.getContentEntity().getContent())); @@ -140,15 +90,35 @@ public class AdvancedSolrHandler extends } } - return doc; } /** + * Get the selectors. + * + * @return the map of selectors + */ + public Map getSelectors() { + return selectors; + } + + /** + * The selectors allow to save specific parts of the document in the index. + * The key of the map is used to identify the rule. + * The value contains the selection path rule e.g. /html[0]/div[0] + * + * @param selectors Map of selectors + */ + public void setSelectors(HashMap selectors) { + this.selectors = selectors; + initPatterns(); + } + + /** + * * Initialize a Cyber Necko parser configured to return lower case element's names * - * @return */ - private SAXParser initParser() { + private void initParser() { parser = new SAXParser(); try { parser.setProperty("http://cyberneko.org/html/properties/names/elems", "lower"); @@ -160,26 +130,59 @@ public class AdvancedSolrHandler extends } catch (SAXNotSupportedException ex) { throw new IllegalStateException(ex); } - return parser; } - @Override - public void finish() { + /** + * Initialize patterns. + * Transform the selector map to regex rules. + */ + private void initPatterns() { + if (selectors != null) { - } + // clear the current patterns + patterns.clear(); - @Override - public void cleanUp(String query) { + // pattern for the element and its index + final Pattern p = Pattern.compile("^([a-zA-Z:-_\\.]+)(\\[([0-9]*)\\]){0,1}$"); + + // for each selector + Set keys = selectors.keySet(); + for (String key : keys) { + // creating a pattern + String regex = "^"; + String selector = selectors.get(key); + String[] elements = selector.split("/"); + // which match all the elements and their respective indices + for (String element : elements) { + Matcher m = p.matcher(element); + if (m.find()) { + String elementName = m.group(1); + String elementIndex = m.group(3); + regex += "/" + elementName; + if (elementIndex == null) { + regex += "\\[[0-9]*\\]"; + } else { + regex += "\\[" + elementIndex + "\\]"; + } + } + } + regex += "$"; + // storing the new Pattern + Pattern pattern = Pattern.compile(regex); + patterns.put(key, pattern); + } + } } + /** - * A class that implements a SAX ContentHandler and uses patterns to record documents - * elements in a SolrInputDocuement. + * A class that implements a SAX ContentHandler and uses patterns to + * extract elements. */ - private class SolrContentHandler implements ContentHandler { + private class NekoContentHandler implements ContentHandler { - private SolrInputDocument doc; + private T task; /** * the patterns which match element's path @@ -200,66 +203,9 @@ public class AdvancedSolrHandler extends private int lastLevel = 0; - /** - * Constructor - * - * @param selectors an Map which contains selectors - */ - public SolrContentHandler(Map selectors) { - initPatterns(selectors); - } - - /** - * Initialize patterns - * - * @param selectors - */ - public void initPatterns(Map selectors) { - if (selectors != null) { - - // clear the current patterns - patterns.clear(); - - // pattern for the element and its index - final Pattern p = Pattern.compile("^([a-zA-Z:-_\\.]+)(\\[([0-9]*)\\]){0,1}$"); - - // for each selector - Set keys = selectors.keySet(); - for (String key : keys) { - // creating a pattern - String regex = "^"; - String selector = selectors.get(key); - String[] elements = selector.split("/"); - // which match all the elements and their respective indices - for (String element : elements) { - Matcher m = p.matcher(element); - if (m.find()) { - String elementName = m.group(1); - String elementIndex = m.group(3); - regex += "/" + elementName; - if (elementIndex == null) { - regex += "\\[[0-9]*\\]"; - } else { - regex += "\\[" + elementIndex + "\\]"; - } - } - } - regex += "$"; - - // storing the new Pattern - Pattern pattern = Pattern.compile(regex); - patterns.put(key, pattern); - } - } - } - - /** - * Initialization of the document used for indexation - * - * @param doc a solr document - */ - public void initDocument(SolrInputDocument doc) { - this.doc = doc; + public NekoContentHandler(T task, Map patterns) { + this.task = task; + this.patterns = patterns; } /* @@ -335,7 +281,7 @@ public class AdvancedSolrHandler extends if (matcher.find()) { // add the matching content to the solr document. String value = valueRecorders.remove(patternName); - doc.addField(patternName, value); + task.getParserData().add(patternName, value); } } } Copied: incubator/droids/branches/0.2.x-cleanup/droids-nekohtml/src/test/java/org/apache/droids/nekohtml/NekoHtmlParserTest.java (from r1461977, incubator/droids/branches/0.2.x-cleanup/droids-solr/src/test/java/org/apache/droids/solr/AdvancedSolrHandleTest.java) URL: http://svn.apache.org/viewvc/incubator/droids/branches/0.2.x-cleanup/droids-nekohtml/src/test/java/org/apache/droids/nekohtml/NekoHtmlParserTest.java?p2=incubator/droids/branches/0.2.x-cleanup/droids-nekohtml/src/test/java/org/apache/droids/nekohtml/NekoHtmlParserTest.java&p1=incubator/droids/branches/0.2.x-cleanup/droids-solr/src/test/java/org/apache/droids/solr/AdvancedSolrHandleTest.java&r1=1461977&r2=1462042&rev=1462042&view=diff ============================================================================== --- incubator/droids/branches/0.2.x-cleanup/droids-solr/src/test/java/org/apache/droids/solr/AdvancedSolrHandleTest.java (original) +++ incubator/droids/branches/0.2.x-cleanup/droids-nekohtml/src/test/java/org/apache/droids/nekohtml/NekoHtmlParserTest.java Thu Mar 28 11:18:10 2013 @@ -14,35 +14,24 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.droids.solr; +package org.apache.droids.nekohtml; import java.io.ByteArrayInputStream; -import java.io.IOException; +import java.io.UnsupportedEncodingException; import java.net.URI; import java.net.URISyntaxException; import java.util.HashMap; -import org.apache.droids.core.ContentEntity; +import org.apache.droids.core.BasicTask; import org.apache.droids.core.DroidsException; import org.apache.droids.core.Task; -import org.apache.solr.client.solrj.SolrQuery; -import org.apache.solr.client.solrj.SolrServer; -import org.apache.solr.client.solrj.SolrServerException; -import org.apache.solr.client.solrj.embedded.EmbeddedSolrServer; -import org.apache.solr.client.solrj.response.QueryResponse; -import org.apache.solr.common.SolrDocument; -import org.apache.solr.core.CoreContainer; -import org.junit.After; -import org.junit.Before; import org.junit.Test; import static org.junit.Assert.*; -public class AdvancedSolrHandleTest { - - SolrServer solr; - - String simpleHtmlPage = "" + +public class NekoHtmlParserTest { + private Task task; + private static final String TEST_CONTENT = "" + "" + "" + "

" + @@ -58,65 +47,42 @@ public class AdvancedSolrHandleTest { "" + ""; - protected String getSolrHome() { - return "example"; - } - - @Before - public void setUp() throws Exception { - CoreContainer.Initializer initializer = new CoreContainer.Initializer(); - CoreContainer coreContainer = initializer.initialize(); - solr = new EmbeddedSolrServer(coreContainer, ""); - } - - @After - public void tearDown() throws Exception { - // remove everything.... - solr.deleteByQuery("*:*"); - solr.commit(); + public NekoHtmlParserTest() { + try { + task = new BasicTask(new URI("http://localhost/")); + + task.getContentEntity().setContent(new ByteArrayInputStream(TEST_CONTENT.getBytes("UTF-8"))); + } catch (URISyntaxException e) { + e.printStackTrace(); + } catch (UnsupportedEncodingException e) { + e.printStackTrace(); + } } - public void performSelection(String html, String field, String selector, String expectedValue) throws IOException, DroidsException, URISyntaxException, SolrServerException { - AdvancedSolrHandler handler = new AdvancedSolrHandler(solr); - + public void performSelection(String field, String selector, String expectedValue) throws URISyntaxException, DroidsException { HashMap selectors = new HashMap(); selectors.put(field, selector); - handler.setSelectors(selectors); - - Task task = new SolrTask(new URI("http://localhost/")); - - ContentEntity contentEntity = task.getContentEntity(); - contentEntity.setContentType("text/html"); - contentEntity.setCharset("UTF-8"); - contentEntity.setContent(new ByteArrayInputStream(html.getBytes("UTF-8"))); - - handler.handle(task); - solr.commit(); - SolrQuery query = new SolrQuery(); - query.setQuery("*:*"); - query.setFields(field); - QueryResponse response = solr.query(query); + NekoHtmlParser parser = new NekoHtmlParser(selectors); - SolrDocument doc = response.getResults().iterator().next(); - String value = (String) doc.getFieldValue(field); + parser.parse(task); - assertEquals(expectedValue, value); + assertEquals(expectedValue, task.getParserData().get(field)); } @Test public void testSelectorA() throws Exception { - performSelection(simpleHtmlPage, "selector", "/html[0]/body[0]/div[0]/p[0]", "p0"); + performSelection("selector", "/html[0]/body[0]/div[0]/p[0]", "p0"); } @Test public void testSelectorB() throws Exception { - performSelection(simpleHtmlPage, "selector", "/html[0]/body[0]/div[1]/p[1]", "p4"); + performSelection("selector", "/html[0]/body[0]/div[1]/p[1]", "p4"); } @Test public void testSelectorC() throws Exception { - performSelection(simpleHtmlPage, "selector", "/html[0]/body[0]/div[1]", "p3p4p5"); + performSelection("selector", "/html[0]/body[0]/div[1]", "p3p4p5"); } } \ No newline at end of file