jackrabbit-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From ju...@apache.org
Subject svn commit: r1072006 - in /jackrabbit/trunk/jackrabbit-core/src: main/java/org/apache/jackrabbit/core/query/lucene/ test/java/org/apache/jackrabbit/core/query/lucene/ test/resources/META-INF/ test/resources/META-INF/services/
Date Fri, 18 Feb 2011 14:27:16 GMT
Author: jukka
Date: Fri Feb 18 14:27:15 2011
New Revision: 1072006

URL: http://svn.apache.org/viewvc?rev=1072006&view=rev
Log:
JCR-2885: Move tika-parsers dependency to deployment packages

Drop the JackrabbitParser backwards compatibility layer in favor of
a new tikaConfigPath configuration option.

Use the Tika autoloading feature to automatically load a blocking
parser class for test cases that measure index queue functionality.
We no longer need to pollute src/main/java with this test functionality.

Added:
    jackrabbit/trunk/jackrabbit-core/src/test/java/org/apache/jackrabbit/core/query/lucene/BlockingParser.java
    jackrabbit/trunk/jackrabbit-core/src/test/resources/META-INF/
    jackrabbit/trunk/jackrabbit-core/src/test/resources/META-INF/services/
    jackrabbit/trunk/jackrabbit-core/src/test/resources/META-INF/services/org.apache.tika.parser.Parser
Removed:
    jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/JackrabbitParser.java
Modified:
    jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/SearchIndex.java
    jackrabbit/trunk/jackrabbit-core/src/test/java/org/apache/jackrabbit/core/query/lucene/IndexingQueueTest.java

Modified: jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/SearchIndex.java
URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/SearchIndex.java?rev=1072006&r1=1072005&r2=1072006&view=diff
==============================================================================
--- jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/SearchIndex.java (original)
+++ jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/SearchIndex.java Fri Feb 18 14:27:15 2011
@@ -60,6 +60,8 @@ import org.apache.lucene.search.Similari
 import org.apache.lucene.search.Sort;
 import org.apache.lucene.search.SortField;
 import org.apache.lucene.search.TermQuery;
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.parser.AutoDetectParser;
 import org.apache.tika.parser.Parser;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -74,6 +76,8 @@ import javax.xml.parsers.ParserConfigura
 import java.io.File;
 import java.io.IOException;
 import java.io.InputStream;
+import java.net.MalformedURLException;
+import java.net.URL;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Collection;
@@ -207,9 +211,15 @@ public class SearchIndex extends Abstrac
     private final JackrabbitAnalyzer analyzer = new JackrabbitAnalyzer();
 
     /**
-     * The parser for extracting text content from binary properties.
+     * Path of the Tika configuration file used for text extraction.
      */
-    private final JackrabbitParser parser = new JackrabbitParser();
+    private String tikaConfigPath = null;
+
+    /**
+     * The Tika parser for extracting text content from binary properties.
+     * Initialized by the {@link #getParser()} method during first access.
+     */
+    private Parser parser = null;
 
     /**
      * The namespace mappings used internally.
@@ -864,12 +874,66 @@ public class SearchIndex extends Abstrac
     }
 
     /**
+     * Returns the path of the Tika configuration used for text extraction.
+     *
+     * @return path of the Tika configuration file
+     */
+    public String getTikaConfigPath() {
+        return tikaConfigPath;
+    }
+
+    /**
+     * Sets the path of the Tika configuration used for text extraction.
+     * The path can be either a file system or a class resource path.
+     * The default setting is the tika-config.xml class resource relative
+     * to org.apache.jackrabbit.core.query.lucene.
+     *
+     * @param tikaConfigPath path of the Tika configuration file
+     */
+    public void setTikaConfigPath(String tikaConfigPath) {
+        this.tikaConfigPath = tikaConfigPath;
+    }
+
+    /**
      * Returns the parser used for extracting text content
      * from binary properties for full text indexing.
      *
      * @return the configured parser
      */
-    public Parser getParser() {
+    public synchronized Parser getParser() {
+        if (parser == null) {
+            URL url = null;
+            if (tikaConfigPath != null) {
+                File file = new File(tikaConfigPath);
+                if (file.exists()) {
+                    try {
+                        url = file.toURI().toURL();
+                    } catch (MalformedURLException e) {
+                        log.warn("Invalid Tika configuration path: " + file, e);
+                    }
+                } else {
+                    ClassLoader loader = SearchIndex.class.getClassLoader();
+                    url = loader.getResource(tikaConfigPath);
+                }
+            }
+            if (url == null) {
+                url = SearchIndex.class.getResource("tika-config.xml");
+            }
+
+            TikaConfig config = null;
+            if (url != null) {
+                try {
+                    config = new TikaConfig(url);
+                } catch (Exception e) {
+                    log.warn("Tika configuration not available: " + url, e);
+                }
+            }
+            if (config == null) {
+                config = TikaConfig.getDefaultConfig();
+            }
+
+            parser = new AutoDetectParser(config);
+        }
         return parser;
     }
 
@@ -1078,7 +1142,7 @@ public class SearchIndex extends Abstrac
             throws RepositoryException {
         NodeIndexer indexer = new NodeIndexer(
                 node, getContext().getItemStateManager(), nsMappings,
-                getContext().getExecutor(), parser);
+                getContext().getExecutor(), getParser());
         indexer.setSupportHighlighting(supportHighlighting);
         indexer.setIndexingConfiguration(indexingConfig);
         indexer.setIndexFormatVersion(indexFormatVersion);
@@ -1906,7 +1970,9 @@ public class SearchIndex extends Abstrac
      * @deprecated 
      */
     public void setTextFilterClasses(String filterClasses) {
-        parser.setTextFilterClasses(filterClasses);
+        log.warn("The textFilterClasses configuration parameter has"
+                + " been deprecated, and the configured value will"
+                + " be ignored: {}", filterClasses);
     }
 
     /**

Added: jackrabbit/trunk/jackrabbit-core/src/test/java/org/apache/jackrabbit/core/query/lucene/BlockingParser.java
URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-core/src/test/java/org/apache/jackrabbit/core/query/lucene/BlockingParser.java?rev=1072006&view=auto
==============================================================================
--- jackrabbit/trunk/jackrabbit-core/src/test/java/org/apache/jackrabbit/core/query/lucene/BlockingParser.java (added)
+++ jackrabbit/trunk/jackrabbit-core/src/test/java/org/apache/jackrabbit/core/query/lucene/BlockingParser.java Fri Feb 18 14:27:15 2011
@@ -0,0 +1,85 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.jackrabbit.core.query.lucene;
+
+import java.io.InputStream;
+import java.util.Collections;
+import java.util.Set;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.EmptyParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+public class BlockingParser extends EmptyParser {
+
+    public static final MediaType TYPE = MediaType.application("x-blocked");
+
+    /**
+     * Flag for blocking text extraction.
+     */
+    private static volatile boolean blocked = false;
+
+    /**
+     * Waits until text extraction is no longer blocked.
+     */
+    private synchronized static void waitIfBlocked() {
+        try {
+            while (blocked) {
+                BlockingParser.class.wait();
+            }
+        } catch (InterruptedException e) {
+            throw new RuntimeException("Text extraction block interrupted", e);
+        }
+    }
+
+    /**
+     * Blocks text extraction.
+     */
+    static synchronized void block() {
+        blocked = true;
+    }
+
+    /**
+     * Unblocks text extraction.
+     */
+    static synchronized void unblock() {
+        blocked = false;
+        BlockingParser.class.notifyAll();
+    }
+
+    @Override
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return Collections.singleton(TYPE);
+    }
+
+    @Override
+    public void parse(
+            InputStream stream, ContentHandler handler,
+            Metadata metadata, ParseContext context)
+            throws SAXException {
+        waitIfBlocked();
+        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+        xhtml.startDocument();
+        xhtml.element("p", "The quick brown fox jumped over the lazy dog.");
+        xhtml.endDocument();
+    }
+
+}

Modified: jackrabbit/trunk/jackrabbit-core/src/test/java/org/apache/jackrabbit/core/query/lucene/IndexingQueueTest.java
URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-core/src/test/java/org/apache/jackrabbit/core/query/lucene/IndexingQueueTest.java?rev=1072006&r1=1072005&r2=1072006&view=diff
==============================================================================
--- jackrabbit/trunk/jackrabbit-core/src/test/java/org/apache/jackrabbit/core/query/lucene/IndexingQueueTest.java (original)
+++ jackrabbit/trunk/jackrabbit-core/src/test/java/org/apache/jackrabbit/core/query/lucene/IndexingQueueTest.java Fri Feb 18 14:27:15 2011
@@ -16,15 +16,14 @@
  */
 package org.apache.jackrabbit.core.query.lucene;
 
-import java.io.ByteArrayInputStream;
 import java.io.File;
 import java.io.FilenameFilter;
 import java.io.IOException;
-import java.io.InputStream;
 import java.util.Calendar;
 
 import javax.jcr.Node;
 import javax.jcr.NodeIterator;
+import javax.jcr.PropertyType;
 import javax.jcr.RepositoryException;
 import javax.jcr.query.Query;
 
@@ -40,26 +39,20 @@ import org.apache.jackrabbit.core.query.
  */
 public class IndexingQueueTest extends AbstractIndexingTest {
 
-    private static final File TEMP_DIR = new File(System.getProperty("java.io.tmpdir"));

-
-    private static final String CONTENT_TYPE = "text/plain";
-
-    private static final String ENCODING = "UTF-8";
+    private static final File TEMP_DIR =
+        new File(System.getProperty("java.io.tmpdir")); 
 
     public void testQueue() throws Exception {
         SearchIndex index = getSearchIndex();
         IndexingQueue queue = index.getIndex().getIndexingQueue();
 
-        JackrabbitParser.block();
+        BlockingParser.block();
         assertEquals(0, queue.getNumPendingDocuments());
 
-        String text = "the quick brown fox jumps over the lazy dog.";
-        InputStream in = new ByteArrayInputStream(text.getBytes(ENCODING));
         Node resource = testRootNode.addNode(nodeName1, "nt:resource");
-        resource.setProperty("jcr:data", in);
+        resource.setProperty("jcr:data", "", PropertyType.BINARY);
         resource.setProperty("jcr:lastModified", Calendar.getInstance());
-        resource.setProperty("jcr:mimeType", CONTENT_TYPE);
-        resource.setProperty("jcr:encoding", ENCODING);
+        resource.setProperty("jcr:mimeType", BlockingParser.TYPE.toString());
         session.save();
 
         assertEquals(1, queue.getNumPendingDocuments());
@@ -68,7 +61,7 @@ public class IndexingQueueTest extends A
         NodeIterator nodes = q.execute().getNodes();
         assertFalse(nodes.hasNext());
 
-        JackrabbitParser.unblock();
+        BlockingParser.unblock();
         index.flush();
         assertEquals(0, queue.getNumPendingDocuments());
 
@@ -78,13 +71,12 @@ public class IndexingQueueTest extends A
     }
 
     public void testInitialIndex() throws Exception {
-        JackrabbitParser.block();
+        BlockingParser.block();
         File indexDir = new File(getSearchIndex().getPath());
 
         // fill workspace
         Node testFolder = testRootNode.addNode("folder", "nt:folder");
-        String text = "the quick brown fox jumps over the lazy dog.";
-        int num = createFiles(testFolder, text.getBytes(ENCODING), 10, 2, 0);
+        int num = createFiles(testFolder, 10, 2, 0);
         session.save();
 
         // shutdown workspace
@@ -104,7 +96,7 @@ public class IndexingQueueTest extends A
 
         int initialNumExtractorFiles = getNumExtractorFiles();
 
-        JackrabbitParser.unblock();
+        BlockingParser.unblock();
         Thread t = new Thread(new Runnable() {
             public void run() {
                 try {
@@ -139,7 +131,7 @@ public class IndexingQueueTest extends A
      * Test case for JCR-2082
      */
     public void testReaderUpToDate() throws Exception {
-        JackrabbitParser.block();
+        BlockingParser.block();
         SearchIndex index = getSearchIndex();
         File indexDir = new File(index.getPath());
 
@@ -158,7 +150,7 @@ public class IndexingQueueTest extends A
             fail("Unable to delete index directory");
         }
 
-        JackrabbitParser.unblock();
+        BlockingParser.unblock();
         // start workspace again by getting a session
         session = getHelper().getSuperuserSession(WORKSPACE_NAME);
 
@@ -168,27 +160,24 @@ public class IndexingQueueTest extends A
         assertEquals(1, getSize(q.execute().getNodes()));
     }
 
-    private int createFiles(Node folder, byte[] data,
-                            int filesPerLevel, int levels, int count)
+    private int createFiles(
+            Node folder, int filesPerLevel, int levels, int count)
             throws RepositoryException {
         levels--;
         for (int i = 0; i < filesPerLevel; i++) {
             // create files
             Node file = folder.addNode("file" + i, "nt:file");
-            InputStream in = new ByteArrayInputStream(data);
             Node resource = file.addNode("jcr:content", "nt:resource");
-            resource.setProperty("jcr:data", in);
+            resource.setProperty("jcr:data", "", PropertyType.BINARY);
             resource.setProperty("jcr:lastModified", Calendar.getInstance());
-            resource.setProperty("jcr:mimeType", CONTENT_TYPE);
-            resource.setProperty("jcr:encoding", ENCODING);
+            resource.setProperty("jcr:mimeType", BlockingParser.TYPE.toString());
             count++;
         }
         if (levels > 0) {
             for (int i = 0; i < filesPerLevel; i++) {
                 // create files
                 Node subFolder = folder.addNode("folder" + i, "nt:folder");
-                count = createFiles(subFolder, data,
-                        filesPerLevel, levels, count);
+                count = createFiles(subFolder, filesPerLevel, levels, count);
             }
         }
         return count;

Added: jackrabbit/trunk/jackrabbit-core/src/test/resources/META-INF/services/org.apache.tika.parser.Parser
URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-core/src/test/resources/META-INF/services/org.apache.tika.parser.Parser?rev=1072006&view=auto
==============================================================================
--- jackrabbit/trunk/jackrabbit-core/src/test/resources/META-INF/services/org.apache.tika.parser.Parser (added)
+++ jackrabbit/trunk/jackrabbit-core/src/test/resources/META-INF/services/org.apache.tika.parser.Parser Fri Feb 18 14:27:15 2011
@@ -0,0 +1,16 @@
+#  Licensed to the Apache Software Foundation (ASF) under one or more
+#  contributor license agreements.  See the NOTICE file distributed with
+#  this work for additional information regarding copyright ownership.
+#  The ASF licenses this file to You under the Apache License, Version 2.0
+#  (the "License"); you may not use this file except in compliance with
+#  the License.  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+
+org.apache.jackrabbit.core.query.lucene.BlockingParser



Mime
View raw message