jackrabbit-oak-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From alexparvule...@apache.org
Subject svn commit: r1439803 - in /jackrabbit/oak/trunk/oak-lucene/src: main/java/org/apache/jackrabbit/oak/plugins/index/lucene/ test/java/org/apache/jackrabbit/oak/plugins/index/lucene/
Date Tue, 29 Jan 2013 09:49:25 GMT
Author: alexparvulescu
Date: Tue Jan 29 09:49:25 2013
New Revision: 1439803

URL: http://svn.apache.org/viewvc?rev=1439803&view=rev
Log:
OAK-154 Full text search index - refactored Tika code to prevent various ClassNotFound exceptions
(heavily inspired by jackrabbit code)

Modified:
    jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexConstants.java
    jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexDiff.java
    jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexUpdate.java
    jackrabbit/oak/trunk/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexQueryTest.java

Modified: jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexConstants.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexConstants.java?rev=1439803&r1=1439802&r2=1439803&view=diff
==============================================================================
--- jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexConstants.java (original)
+++ jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexConstants.java Tue Jan 29 09:49:25 2013
@@ -20,7 +20,6 @@ import org.apache.jackrabbit.oak.plugins
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.standard.StandardAnalyzer;
 import org.apache.lucene.util.Version;
-import org.apache.tika.Tika;
 
 public interface LuceneIndexConstants extends IndexConstants {
 
@@ -32,6 +31,4 @@ public interface LuceneIndexConstants ex
 
     Analyzer ANALYZER = new StandardAnalyzer(VERSION);
 
-    Tika TIKA = new Tika();
-
 }

Modified: jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexDiff.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexDiff.java?rev=1439803&r1=1439802&r2=1439803&view=diff
==============================================================================
--- jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexDiff.java (original)
+++ jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexDiff.java Tue Jan 29 09:49:25 2013
@@ -32,6 +32,9 @@ import org.apache.jackrabbit.oak.spi.sta
 import org.apache.jackrabbit.oak.spi.state.NodeState;
 import org.apache.jackrabbit.oak.spi.state.NodeStateUtils;
 import org.apache.jackrabbit.oak.spi.state.ReadOnlyBuilder;
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.Parser;
 
 /**
  * {@link IndexHook} implementation that is responsible for keeping the
@@ -52,6 +55,9 @@ public class LuceneIndexDiff implements 
 
     private final Map<String, LuceneIndexUpdate> updates;
 
+    private final Parser parser = new AutoDetectParser(
+            TikaConfig.getDefaultConfig());
+
     private LuceneIndexDiff(LuceneIndexDiff parent, NodeBuilder node,
             String name, String path, Map<String, LuceneIndexUpdate> updates) {
         this.parent = parent;
@@ -66,7 +72,7 @@ public class LuceneIndexDiff implements 
                 NodeBuilder child = index.child(indexName);
                 if (isIndexNode(child) && !this.updates.containsKey(getPath())) {
                     this.updates.put(getPath(), new LuceneIndexUpdate(
-                            getPath(), child));
+                            getPath(), child, parser));
                 }
             }
         }

Modified: jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexUpdate.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexUpdate.java?rev=1439803&r1=1439802&r2=1439803&view=diff
==============================================================================
--- jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexUpdate.java (original)
+++ jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexUpdate.java Tue Jan 29 09:49:25 2013
@@ -22,6 +22,7 @@ import static org.apache.jackrabbit.oak.
 
 import java.io.Closeable;
 import java.io.IOException;
+import java.io.InputStream;
 import java.util.Map;
 import java.util.Set;
 import java.util.TreeMap;
@@ -29,6 +30,7 @@ import java.util.TreeSet;
 
 import javax.jcr.PropertyType;
 
+import org.apache.jackrabbit.JcrConstants;
 import org.apache.jackrabbit.oak.api.Blob;
 import org.apache.jackrabbit.oak.api.CommitFailedException;
 import org.apache.jackrabbit.oak.api.PropertyState;
@@ -40,12 +42,21 @@ import org.apache.lucene.index.IndexWrit
 import org.apache.lucene.index.IndexWriterConfig;
 import org.apache.lucene.index.SerialMergeScheduler;
 import org.apache.lucene.search.PrefixQuery;
-import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.WriteOutContentHandler;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 
 import com.google.common.base.Preconditions;
 
 class LuceneIndexUpdate implements Closeable, LuceneIndexConstants {
 
+    private static final Logger log = LoggerFactory
+            .getLogger(LuceneIndexUpdate.class);
+
     private static IndexWriterConfig getIndexWriterConfig() {
         // FIXME: Hack needed to make Lucene work in an OSGi environment
         Thread thread = Thread.currentThread();
@@ -62,6 +73,16 @@ class LuceneIndexUpdate implements Close
 
     private static final IndexWriterConfig config = getIndexWriterConfig();
 
+    /**
+     * Parser used for extracting text content from binary properties for full
+     * text indexing.
+     */
+    private final Parser parser;
+    /**
+     * The media types supported by the parser used.
+     */
+    private Set<MediaType> supportedMediaTypes;
+
     private final String path;
 
     private final NodeBuilder index;
@@ -70,9 +91,10 @@ class LuceneIndexUpdate implements Close
 
     private final Set<String> remove = new TreeSet<String>();
 
-    public LuceneIndexUpdate(String path, NodeBuilder index) {
+    public LuceneIndexUpdate(String path, NodeBuilder index, Parser parser) {
         this.path = path;
         this.index = index;
+        this.parser = parser;
     }
 
     public void insert(String path, NodeBuilder value) {
@@ -155,27 +177,18 @@ class LuceneIndexUpdate implements Close
             path = "/" + path;
         }
         writer.updateDocument(newPathTerm(path), makeDocument(path, state));
-        // for (ChildNodeEntry entry : state.getChildNodeEntries()) {
-        // if (NodeStateUtils.isHidden(entry.getName())) {
-        // continue;
-        // }
-        // addSubtreeWriter(writer, concat(path, entry.getName()),
-        // entry.getNodeState(), paths);
-        // }
     }
 
-    private static Document makeDocument(String path, NodeState state) {
+    private Document makeDocument(String path, NodeState state) {
         Document document = new Document();
         document.add(newPathField(path));
         for (PropertyState property : state.getProperties()) {
-            String pname = property.getName();
             switch (property.getType().tag()) {
             case PropertyType.BINARY:
-                for (Blob v : property.getValue(Type.BINARIES)) {
-                    document.add(newPropertyField(pname, parseStringValue(v)));
-                }
+                addBinaryValue(document, property, state);
                 break;
             default:
+                String pname = property.getName();
                 for (String v : property.getValue(Type.STRINGS)) {
                     document.add(newPropertyField(pname, v));
                 }
@@ -185,13 +198,74 @@ class LuceneIndexUpdate implements Close
         return document;
     }
 
-    private static String parseStringValue(Blob v) {
+    private void addBinaryValue(Document doc, PropertyState property,
+            NodeState state) {
+        String type = getOrNull(state, JcrConstants.JCR_MIMETYPE);
+        if (type == null || !isSupportedMediaType(type)) {
+            return;
+        }
+        Metadata metadata = new Metadata();
+        metadata.set(Metadata.CONTENT_TYPE, type);
+        // jcr:encoding is not mandatory
+        String encoding = getOrNull(state, JcrConstants.JCR_ENCODING);
+        if (encoding != null) {
+            metadata.set(Metadata.CONTENT_ENCODING, encoding);
+        }
+
+        String name = property.getName();
+        for (Blob v : property.getValue(Type.BINARIES)) {
+            doc.add(newPropertyField(name, parseStringValue(v, metadata)));
+        }
+    }
+
+    private static String getOrNull(NodeState state, String name) {
+        PropertyState p = state.getProperty(name);
+        if (p != null) {
+            return p.getValue(Type.STRING);
+        }
+        return null;
+    }
+
+    /**
+     * Returns <code>true</code> if the provided type is among the types
+     * supported by the Tika parser we are using.
+     *
+     * @param type  the type to check.
+     * @return whether the type is supported by the Tika parser we are using.
+     */
+    private boolean isSupportedMediaType(final String type) {
+        if (supportedMediaTypes == null) {
+            supportedMediaTypes = parser.getSupportedTypes(null);
+        }
+        return supportedMediaTypes.contains(MediaType.parse(type));
+    }
+
+    private String parseStringValue(Blob v, Metadata metadata) {
+        WriteOutContentHandler handler = new WriteOutContentHandler();
         try {
-            return TIKA.parseToString(v.getNewStream());
-        } catch (IOException e) {
-        } catch (TikaException e) {
+            InputStream stream = v.getNewStream();
+            try {
+                parser.parse(stream, handler, metadata, new ParseContext());
+            } finally {
+                stream.close();
+            }
+        } catch (LinkageError e) {
+            // Capture and ignore errors caused by extraction libraries
+            // not being present. This is equivalent to disabling
+            // selected media types in configuration, so we can simply
+            // ignore these errors.
+        } catch (Throwable t) {
+            // Capture and report any other full text extraction problems.
+            // The special STOP exception is used for normal termination.
+            if (!handler.isWriteLimitReached(t)) {
+                log.debug("Failed to extract text from a binary property."
+                        + " This is a fairly common case, and nothing to"
+                        + " worry about. The stack trace is included to"
+                        + " help improve the text extraction feature.", t);
+                return "TextExtractionError";
+            }
         }
-        return "";
+        return handler.toString();
     }
 
     @Override

Modified: jackrabbit/oak/trunk/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexQueryTest.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexQueryTest.java?rev=1439803&r1=1439802&r2=1439803&view=diff
==============================================================================
--- jackrabbit/oak/trunk/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexQueryTest.java (original)
+++ jackrabbit/oak/trunk/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexQueryTest.java Tue Jan 29 09:49:25 2013
@@ -53,7 +53,6 @@ public class LuceneIndexQueryTest extend
     }
 
     @Test
-    @Ignore("OAK-420")
     public void sql2() throws Exception {
         test("sql2.txt");
     }
@@ -94,7 +93,6 @@ public class LuceneIndexQueryTest extend
     }
 
     @Test
-    @Ignore("OAK-420")
     public void ischildnodeTest() throws Exception {
         JsopUtil.apply(
                 root,
@@ -114,4 +112,4 @@ public class LuceneIndexQueryTest extend
         assertEquals("/, /parents", result.next());
         assertFalse(result.hasNext());
     }
-}
\ No newline at end of file
+}



Mime
View raw message