incubator-droids-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From t...@apache.org
Subject svn commit: r1462042 - in /incubator/droids/branches/0.2.x-cleanup: droids-nekohtml/ droids-nekohtml/src/ droids-nekohtml/src/main/ droids-nekohtml/src/main/java/ droids-nekohtml/src/main/java/org/ droids-nekohtml/src/main/java/org/apache/ droids-nekoh...
Date Thu, 28 Mar 2013 11:18:11 GMT
Author: tobr
Date: Thu Mar 28 11:18:10 2013
New Revision: 1462042

URL: http://svn.apache.org/r1462042
Log:
added module for extracting data given by selectors

Added:
    incubator/droids/branches/0.2.x-cleanup/droids-nekohtml/
    incubator/droids/branches/0.2.x-cleanup/droids-nekohtml/pom.xml   (with props)
    incubator/droids/branches/0.2.x-cleanup/droids-nekohtml/src/
    incubator/droids/branches/0.2.x-cleanup/droids-nekohtml/src/main/
    incubator/droids/branches/0.2.x-cleanup/droids-nekohtml/src/main/java/
    incubator/droids/branches/0.2.x-cleanup/droids-nekohtml/src/main/java/org/
    incubator/droids/branches/0.2.x-cleanup/droids-nekohtml/src/main/java/org/apache/
    incubator/droids/branches/0.2.x-cleanup/droids-nekohtml/src/main/java/org/apache/droids/
    incubator/droids/branches/0.2.x-cleanup/droids-nekohtml/src/main/java/org/apache/droids/nekohtml/
    incubator/droids/branches/0.2.x-cleanup/droids-nekohtml/src/main/java/org/apache/droids/nekohtml/NekoHtmlParser.java
      - copied, changed from r1461977, incubator/droids/branches/0.2.x-cleanup/droids-solr/src/main/java/org/apache/droids/solr/AdvancedSolrHandler.java
    incubator/droids/branches/0.2.x-cleanup/droids-nekohtml/src/test/
    incubator/droids/branches/0.2.x-cleanup/droids-nekohtml/src/test/java/
    incubator/droids/branches/0.2.x-cleanup/droids-nekohtml/src/test/java/org/
    incubator/droids/branches/0.2.x-cleanup/droids-nekohtml/src/test/java/org/apache/
    incubator/droids/branches/0.2.x-cleanup/droids-nekohtml/src/test/java/org/apache/droids/
    incubator/droids/branches/0.2.x-cleanup/droids-nekohtml/src/test/java/org/apache/droids/nekohtml/
    incubator/droids/branches/0.2.x-cleanup/droids-nekohtml/src/test/java/org/apache/droids/nekohtml/NekoHtmlParserTest.java
      - copied, changed from r1461977, incubator/droids/branches/0.2.x-cleanup/droids-solr/src/test/java/org/apache/droids/solr/AdvancedSolrHandleTest.java
Removed:
    incubator/droids/branches/0.2.x-cleanup/droids-solr/src/main/java/org/apache/droids/solr/AdvancedSolrHandler.java
    incubator/droids/branches/0.2.x-cleanup/droids-solr/src/test/java/org/apache/droids/solr/AdvancedSolrHandleTest.java

Added: incubator/droids/branches/0.2.x-cleanup/droids-nekohtml/pom.xml
URL: http://svn.apache.org/viewvc/incubator/droids/branches/0.2.x-cleanup/droids-nekohtml/pom.xml?rev=1462042&view=auto
==============================================================================
--- incubator/droids/branches/0.2.x-cleanup/droids-nekohtml/pom.xml (added)
+++ incubator/droids/branches/0.2.x-cleanup/droids-nekohtml/pom.xml Thu Mar 28 11:18:10 2013
@@ -0,0 +1,62 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+<project xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"
+         xmlns="http://maven.apache.org/POM/4.0.0"
+         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+    <modelVersion>4.0.0</modelVersion>
+    <packaging>jar</packaging>
+
+    <parent>
+        <groupId>org.apache.droids</groupId>
+        <artifactId>droids</artifactId>
+        <version>0.3.0-incubating-SNAPSHOT</version>
+    </parent>
+
+    <artifactId>droids-nekohtml</artifactId>
+    <name>APACHE DROIDS NEKOHTML PARSER</name>
+
+    <properties>
+        <nekohtml.version>1.9.18</nekohtml.version>
+    </properties>
+
+    <dependencies>
+        <dependency>
+            <groupId>org.apache.droids</groupId>
+            <artifactId>droids-core</artifactId>
+            <version>${project.version}</version>
+        </dependency>
+        <dependency>
+            <groupId>net.sourceforge.nekohtml</groupId>
+            <artifactId>nekohtml</artifactId>
+            <version>${nekohtml.version}</version>
+        </dependency>
+        <!-- FOR TESTING -->
+        <dependency>
+            <groupId>junit</groupId>
+            <artifactId>junit</artifactId>
+            <version>${junit.version}</version>
+            <scope>test</scope>
+        </dependency>
+        <dependency>
+            <groupId>ch.qos.logback</groupId>
+            <artifactId>logback-classic</artifactId>
+            <version>${logback.version}</version>
+            <scope>test</scope>
+        </dependency>
+    </dependencies>
+</project>

Propchange: incubator/droids/branches/0.2.x-cleanup/droids-nekohtml/pom.xml
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: incubator/droids/branches/0.2.x-cleanup/droids-nekohtml/pom.xml
------------------------------------------------------------------------------
    svn:keywords = Author Date Id Revision

Propchange: incubator/droids/branches/0.2.x-cleanup/droids-nekohtml/pom.xml
------------------------------------------------------------------------------
    svn:mime-type = text/xml

Copied: incubator/droids/branches/0.2.x-cleanup/droids-nekohtml/src/main/java/org/apache/droids/nekohtml/NekoHtmlParser.java (from r1461977, incubator/droids/branches/0.2.x-cleanup/droids-solr/src/main/java/org/apache/droids/solr/AdvancedSolrHandler.java)
URL: http://svn.apache.org/viewvc/incubator/droids/branches/0.2.x-cleanup/droids-nekohtml/src/main/java/org/apache/droids/nekohtml/NekoHtmlParser.java?p2=incubator/droids/branches/0.2.x-cleanup/droids-nekohtml/src/main/java/org/apache/droids/nekohtml/NekoHtmlParser.java&p1=incubator/droids/branches/0.2.x-cleanup/droids-solr/src/main/java/org/apache/droids/solr/AdvancedSolrHandler.java&r1=1461977&r2=1462042&rev=1462042&view=diff
==============================================================================
--- incubator/droids/branches/0.2.x-cleanup/droids-solr/src/main/java/org/apache/droids/solr/AdvancedSolrHandler.java (original)
+++ incubator/droids/branches/0.2.x-cleanup/droids-nekohtml/src/main/java/org/apache/droids/nekohtml/NekoHtmlParser.java Thu Mar 28 11:18:10 2013
@@ -14,7 +14,7 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-package org.apache.droids.solr;
+package org.apache.droids.nekohtml;
 
 import java.io.IOException;
 import java.util.Arrays;
@@ -28,10 +28,8 @@ import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 
 import org.apache.droids.core.DroidsException;
+import org.apache.droids.core.Parser;
 import org.apache.droids.core.Task;
-import org.apache.solr.client.solrj.SolrServer;
-import org.apache.solr.client.solrj.SolrServerException;
-import org.apache.solr.common.SolrInputDocument;
 import org.cyberneko.html.parsers.SAXParser;
 import org.xml.sax.Attributes;
 import org.xml.sax.ContentHandler;
@@ -42,11 +40,11 @@ import org.xml.sax.SAXNotRecognizedExcep
 import org.xml.sax.SAXNotSupportedException;
 
 /**
- * A Droids Handler which allows to specify selectors to store
- * documents' parts in a Solr index.
+ * A Droids Parser which allows to specify selectors to extract
+ * documents parts.
  * <p/>
- * A selector is an Entry made of a key which matches the solr fiel and
- * of a value which correspond to a path selector.
+ * A selector is an Entry made of a key and
+ * a value which correspond to a path selector.
  * <p/>
  * Path selectors are always absolute and supports indexes.
  * <p/>
@@ -55,18 +53,9 @@ import org.xml.sax.SAXNotSupportedExcept
  * - /html[0]/div[0]/p[0]
  * - /html[0]/div[1]/p[2]
  */
-public class AdvancedSolrHandler extends SolrHandler {
-    /**
-     * The selectors allow to save specific parts of the document in the index.
-     * The HashMap's key matches the Solr field.
-     * The HashMap's value is an absolute path corresponding to an element.
-     */
+public class NekoHtmlParser<T extends Task> implements Parser<T> {
     private Map<String, String> selectors;
-
-    /**
-     * A content handler
-     */
-    private SolrContentHandler contentHandler = new SolrContentHandler(selectors);
+    private Map<String, Pattern> patterns;
 
     /**
      * An HTML parser
@@ -74,62 +63,23 @@ public class AdvancedSolrHandler extends
     private SAXParser parser;
 
 
-    public AdvancedSolrHandler(String solrUrl) {
-        super(solrUrl);
-    }
-
-    public AdvancedSolrHandler(SolrServer solrServer) {
-        super(solrServer);
+    public NekoHtmlParser() {
+        this(new HashMap<String, String>());
     }
 
-
-    /**
-     * @return the current path selectors
-     */
-    public Map<String, String> getSelectors() {
-        return selectors;
+    public NekoHtmlParser(HashMap<String, String> selectors) {
+        this.patterns =  new HashMap<String, Pattern>();
+        setSelectors(selectors);
+        if (parser == null) initParser();
     }
 
-    /**
-     * @param selectors an hash map containing path selectors
-     */
-    public void setSelectors(HashMap<String, String> selectors) {
-        contentHandler.initPatterns(selectors);
-        this.selectors = selectors;
-    }
 
-    /*
-     * @see org.apache.droids.api.Handler#handle(java.net.URI, org.apache.droids.api.DroidsContentEntity)
-     */
     @Override
-    public void handle(Task task) throws DroidsException, IOException {
-        SolrInputDocument doc = createSolrInputDocument(task);
-        try {
-            getSolrServer().add(doc);
-        } catch (SolrServerException e) {
-            throw new DroidsException(e);
-        }
-    }
-
-    /**
-     * Generates a SolrInputDocument from an URI and a DroidsContentEntity
-     * which correspond to the document which need to be saved in the index
-     *
-     * @param task   the task
-     * @return
-     */
-    private SolrInputDocument createSolrInputDocument(Task task) {
-        SolrInputDocument doc = new SolrInputDocument();
-
-        doc.setField("id", task.getURI().getPath());
-        doc.setField("name", task.getURI().toASCIIString());
-        doc.setField("contentType", task.getContentEntity().getContentType());
-        doc.setField("content", task.getParserData().getText());
-
-        if (parser == null) initParser();
+    public void parse(T task) throws DroidsException {
+        NekoContentHandler contentHandler;
 
-        if (!selectors.isEmpty()) {
-            contentHandler.initDocument(doc);
+        if (!patterns.isEmpty()) {
+            contentHandler = new NekoContentHandler(task, patterns);
             try {
                 parser.setContentHandler(contentHandler);
                 parser.parse(new InputSource(task.getContentEntity().getContent()));
@@ -140,15 +90,35 @@ public class AdvancedSolrHandler extends
             }
         }
 
-        return doc;
     }
 
     /**
+     * Get the selectors.
+     *
+     * @return the map of selectors
+     */
+    public Map<String, String> getSelectors() {
+        return selectors;
+    }
+
+    /**
+     * The selectors allow to save specific parts of the document in the index.
+     * The key of the map is used to identify the rule.
+     * The value contains the selection path rule e.g. /html[0]/div[0]
+     *
+     * @param selectors Map of selectors
+     */
+    public void setSelectors(HashMap<String, String> selectors) {
+        this.selectors = selectors;
+        initPatterns();
+    }
+
+    /**
+     *
      * Initialize a Cyber Necko parser configured to return lower case element's names
      *
-     * @return
      */
-    private SAXParser initParser() {
+    private void initParser() {
         parser = new SAXParser();
         try {
             parser.setProperty("http://cyberneko.org/html/properties/names/elems", "lower");
@@ -160,26 +130,59 @@ public class AdvancedSolrHandler extends
         } catch (SAXNotSupportedException ex) {
             throw new IllegalStateException(ex);
         }
-        return parser;
     }
 
-    @Override
-    public void finish() {
+    /**
+     * Initialize patterns.
+     * Transform the selector map to regex rules.
+     */
+    private void initPatterns() {
+        if (selectors != null) {
 
-    }
+            // clear the current patterns
+            patterns.clear();
 
-    @Override
-    public void cleanUp(String query) {
+            // pattern for the element and its index
+            final Pattern p = Pattern.compile("^([a-zA-Z:-_\\.]+)(\\[([0-9]*)\\]){0,1}$");
+
+            // for each selector
+            Set<String> keys = selectors.keySet();
+            for (String key : keys) {
+                // creating a pattern
+                String regex = "^";
+                String selector = selectors.get(key);
+                String[] elements = selector.split("/");
+                // which match all the elements and their respective indices
+                for (String element : elements) {
+                    Matcher m = p.matcher(element);
+                    if (m.find()) {
+                        String elementName = m.group(1);
+                        String elementIndex = m.group(3);
+                        regex += "/" + elementName;
+                        if (elementIndex == null) {
+                            regex += "\\[[0-9]*\\]";
+                        } else {
+                            regex += "\\[" + elementIndex + "\\]";
+                        }
+                    }
+                }
+                regex += "$";
 
+                // storing the new Pattern
+                Pattern pattern = Pattern.compile(regex);
+                patterns.put(key, pattern);
+            }
+        }
     }
 
+
     /**
-     * A class that implements a SAX ContentHandler and uses patterns to record documents
-     * elements in a SolrInputDocuement.
+     * A class that implements a SAX ContentHandler and uses patterns to
+     * extract elements.
      */
-    private class SolrContentHandler implements ContentHandler {
+    private class NekoContentHandler implements ContentHandler {
 
-        private SolrInputDocument doc;
+        private T task;
 
         /**
          * the patterns which match element's path
@@ -200,66 +203,9 @@ public class AdvancedSolrHandler extends
 
         private int lastLevel = 0;
 
-        /**
-         * Constructor
-         *
-         * @param selectors an Map which contains selectors
-         */
-        public SolrContentHandler(Map<String, String> selectors) {
-            initPatterns(selectors);
-        }
-
-        /**
-         * Initialize patterns
-         *
-         * @param selectors
-         */
-        public void initPatterns(Map<String, String> selectors) {
-            if (selectors != null) {
-
-                // clear the current patterns
-                patterns.clear();
-
-                // pattern for the element and its index
-                final Pattern p = Pattern.compile("^([a-zA-Z:-_\\.]+)(\\[([0-9]*)\\]){0,1}$");
-
-                // for each selector
-                Set<String> keys = selectors.keySet();
-                for (String key : keys) {
-                    // creating a pattern
-                    String regex = "^";
-                    String selector = selectors.get(key);
-                    String[] elements = selector.split("/");
-                    // which match all the elements and their respective indices
-                    for (String element : elements) {
-                        Matcher m = p.matcher(element);
-                        if (m.find()) {
-                            String elementName = m.group(1);
-                            String elementIndex = m.group(3);
-                            regex += "/" + elementName;
-                            if (elementIndex == null) {
-                                regex += "\\[[0-9]*\\]";
-                            } else {
-                                regex += "\\[" + elementIndex + "\\]";
-                            }
-                        }
-                    }
-                    regex += "$";
-
-                    // storing the new Pattern
-                    Pattern pattern = Pattern.compile(regex);
-                    patterns.put(key, pattern);
-                }
-            }
-        }
-
-        /**
-         * Initialization of the document used for indexation
-         *
-         * @param doc a solr document
-         */
-        public void initDocument(SolrInputDocument doc) {
-            this.doc = doc;
+        public NekoContentHandler(T task, Map<String, Pattern> patterns) {
+            this.task = task;
+            this.patterns = patterns;
         }
 
         /*
@@ -335,7 +281,7 @@ public class AdvancedSolrHandler extends
                 if (matcher.find()) {
                     // add the matching content to the solr document.
                     String value = valueRecorders.remove(patternName);
-                    doc.addField(patternName, value);
+                    task.getParserData().add(patternName, value);
                 }
             }
         }

Copied: incubator/droids/branches/0.2.x-cleanup/droids-nekohtml/src/test/java/org/apache/droids/nekohtml/NekoHtmlParserTest.java (from r1461977, incubator/droids/branches/0.2.x-cleanup/droids-solr/src/test/java/org/apache/droids/solr/AdvancedSolrHandleTest.java)
URL: http://svn.apache.org/viewvc/incubator/droids/branches/0.2.x-cleanup/droids-nekohtml/src/test/java/org/apache/droids/nekohtml/NekoHtmlParserTest.java?p2=incubator/droids/branches/0.2.x-cleanup/droids-nekohtml/src/test/java/org/apache/droids/nekohtml/NekoHtmlParserTest.java&p1=incubator/droids/branches/0.2.x-cleanup/droids-solr/src/test/java/org/apache/droids/solr/AdvancedSolrHandleTest.java&r1=1461977&r2=1462042&rev=1462042&view=diff
==============================================================================
--- incubator/droids/branches/0.2.x-cleanup/droids-solr/src/test/java/org/apache/droids/solr/AdvancedSolrHandleTest.java (original)
+++ incubator/droids/branches/0.2.x-cleanup/droids-nekohtml/src/test/java/org/apache/droids/nekohtml/NekoHtmlParserTest.java Thu Mar 28 11:18:10 2013
@@ -14,35 +14,24 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-package org.apache.droids.solr;
+package org.apache.droids.nekohtml;
 
 import java.io.ByteArrayInputStream;
-import java.io.IOException;
+import java.io.UnsupportedEncodingException;
 import java.net.URI;
 import java.net.URISyntaxException;
 import java.util.HashMap;
 
-import org.apache.droids.core.ContentEntity;
+import org.apache.droids.core.BasicTask;
 import org.apache.droids.core.DroidsException;
 import org.apache.droids.core.Task;
-import org.apache.solr.client.solrj.SolrQuery;
-import org.apache.solr.client.solrj.SolrServer;
-import org.apache.solr.client.solrj.SolrServerException;
-import org.apache.solr.client.solrj.embedded.EmbeddedSolrServer;
-import org.apache.solr.client.solrj.response.QueryResponse;
-import org.apache.solr.common.SolrDocument;
-import org.apache.solr.core.CoreContainer;
-import org.junit.After;
-import org.junit.Before;
 import org.junit.Test;
 
 import static org.junit.Assert.*;
 
-public class AdvancedSolrHandleTest {
-
-    SolrServer solr;
-
-    String simpleHtmlPage = "" +
+public class NekoHtmlParserTest {
+    private Task task;
+    private static final String TEST_CONTENT = "" +
             "<html>" +
             "<body>" +
             "<div>" +
@@ -58,65 +47,42 @@ public class AdvancedSolrHandleTest {
             "</body>" +
             "</html>";
 
-    protected String getSolrHome() {
-        return "example";
-    }
-
-    @Before
-    public void setUp() throws Exception {
-        CoreContainer.Initializer initializer = new CoreContainer.Initializer();
-        CoreContainer coreContainer = initializer.initialize();
-        solr = new EmbeddedSolrServer(coreContainer, "");
-    }
-
-    @After
-    public void tearDown() throws Exception {
-        // remove everything....
-        solr.deleteByQuery("*:*");
-        solr.commit();
+    public NekoHtmlParserTest() {
+        try {
+            task = new BasicTask(new URI("http://localhost/"));
+
+            task.getContentEntity().setContent(new ByteArrayInputStream(TEST_CONTENT.getBytes("UTF-8")));
+        } catch (URISyntaxException e) {
+            e.printStackTrace();
+        } catch (UnsupportedEncodingException e) {
+            e.printStackTrace();
+        }
     }
 
-    public void performSelection(String html, String field, String selector, String expectedValue) throws IOException, DroidsException, URISyntaxException, SolrServerException {
-        AdvancedSolrHandler handler = new AdvancedSolrHandler(solr);
-
+    public void performSelection(String field, String selector, String expectedValue) throws URISyntaxException, DroidsException {
         HashMap<String, String> selectors = new HashMap<String, String>();
         selectors.put(field, selector);
-        handler.setSelectors(selectors);
-
-        Task task = new SolrTask(new URI("http://localhost/"));
-
-        ContentEntity contentEntity = task.getContentEntity();
-        contentEntity.setContentType("text/html");
-        contentEntity.setCharset("UTF-8");
-        contentEntity.setContent(new ByteArrayInputStream(html.getBytes("UTF-8")));
-
-        handler.handle(task);
-        solr.commit();
 
-        SolrQuery query = new SolrQuery();
-        query.setQuery("*:*");
-        query.setFields(field);
-        QueryResponse response = solr.query(query);
+        NekoHtmlParser<Task> parser = new NekoHtmlParser<Task>(selectors);
 
-        SolrDocument doc = response.getResults().iterator().next();
-        String value = (String) doc.getFieldValue(field);
+        parser.parse(task);
 
-        assertEquals(expectedValue, value);
+        assertEquals(expectedValue, task.getParserData().get(field));
     }
 
     @Test
     public void testSelectorA() throws Exception {
-        performSelection(simpleHtmlPage, "selector", "/html[0]/body[0]/div[0]/p[0]", "p0");
+        performSelection("selector", "/html[0]/body[0]/div[0]/p[0]", "p0");
     }
 
     @Test
     public void testSelectorB() throws Exception {
-        performSelection(simpleHtmlPage, "selector", "/html[0]/body[0]/div[1]/p[1]", "p4");
+        performSelection("selector", "/html[0]/body[0]/div[1]/p[1]", "p4");
     }
 
     @Test
     public void testSelectorC() throws Exception {
-        performSelection(simpleHtmlPage, "selector", "/html[0]/body[0]/div[1]", "p3p4p5");
+        performSelection("selector", "/html[0]/body[0]/div[1]", "p3p4p5");
     }
 
 }
\ No newline at end of file



Mime
View raw message