jackrabbit-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From alexparvule...@apache.org
Subject svn commit: r1327116 - in /jackrabbit/trunk/jackrabbit-core/src: main/java/org/apache/jackrabbit/core/query/lucene/ test/java/org/apache/jackrabbit/core/query/lucene/
Date Tue, 17 Apr 2012 14:50:00 GMT
Author: alexparvulescu
Date: Tue Apr 17 14:49:59 2012
New Revision: 1327116

URL: http://svn.apache.org/viewvc?rev=1327116&view=rev
Log:
JCR-3296 Indexing ignored file types creates some garbage

Added:
    jackrabbit/trunk/jackrabbit-core/src/test/java/org/apache/jackrabbit/core/query/lucene/LazyTextExtractorFieldTest.java
  (with props)
Modified:
    jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/LazyTextExtractorField.java
    jackrabbit/trunk/jackrabbit-core/src/test/java/org/apache/jackrabbit/core/query/lucene/TestAll.java

Modified: jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/LazyTextExtractorField.java
URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/LazyTextExtractorField.java?rev=1327116&r1=1327115&r2=1327116&view=diff
==============================================================================
--- jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/LazyTextExtractorField.java	(original)
+++ jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/LazyTextExtractorField.java	Tue Apr 17 14:49:59 2012
@@ -30,10 +30,10 @@ import org.apache.lucene.document.Field.
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.BodyContentHandler;
+import org.apache.tika.sax.WriteOutContentHandler;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
-import org.xml.sax.SAXException;
-import org.xml.sax.helpers.DefaultHandler;
 
 /**
  * <code>LazyTextExtractorField</code> implements a Lucene field with a String
@@ -53,13 +53,6 @@ public class LazyTextExtractorField exte
         LoggerFactory.getLogger(LazyTextExtractorField.class);
 
     /**
-     * The exception used to forcibly terminate the extraction process
-     * when the maximum field length is reached.
-     */
-    private static final SAXException STOP =
-        new SAXException("max field length reached");
-
-    /**
      * The extracted text content of the given binary value.
      * Set to non-null when the text extraction task finishes.
      */
@@ -85,8 +78,12 @@ public class LazyTextExtractorField exte
                 highlighting ? Store.YES : Store.NO,
                 withNorms ? Field.Index.ANALYZED : Field.Index.ANALYZED_NO_NORMS,
                 highlighting ? TermVector.WITH_OFFSETS : TermVector.NO);
-        executor.execute(
-                new ParsingTask(parser, value, metadata, maxFieldLength));
+        executor.execute(new ParsingTask(parser, value, metadata,
+                maxFieldLength) {
+            public void setExtractedText(String value) {
+                LazyTextExtractorField.this.setExtractedText(value);
+            }
+        });
     }
 
     /**
@@ -152,7 +149,7 @@ public class LazyTextExtractorField exte
     /**
      * The background task for extracting text from a binary value.
      */
-    private class ParsingTask extends DefaultHandler implements LowPriorityTask {
+    abstract static class ParsingTask extends BodyContentHandler implements LowPriorityTask {
 
         private final Parser parser;
 
@@ -160,17 +157,21 @@ public class LazyTextExtractorField exte
 
         private final Metadata metadata;
 
-        private final int maxFieldLength;
+        private final WriteOutContentHandler writeOutContentHandler;
 
-        private final StringBuilder builder = new StringBuilder();
+        public ParsingTask(Parser parser, InternalValue value,
+                Metadata metadata, int maxFieldLength) {
+            this(new WriteOutContentHandler(maxFieldLength), parser, value,
+                    metadata);
+        }
 
-        public ParsingTask(
-                Parser parser, InternalValue value, Metadata metadata,
-                int maxFieldLength) {
+        private ParsingTask(WriteOutContentHandler writeOutContentHandler,
+                Parser parser, InternalValue value, Metadata metadata) {
+            super(writeOutContentHandler);
+            this.writeOutContentHandler = writeOutContentHandler;
             this.parser = parser;
             this.value = value;
             this.metadata = metadata;
-            this.maxFieldLength = maxFieldLength;
         }
 
         public void run() {
@@ -189,36 +190,20 @@ public class LazyTextExtractorField exte
             } catch (Throwable t) {
                 // Capture and report any other full text extraction problems.
                 // The special STOP exception is used for normal termination.
-                if (t != STOP) {
+                if (!writeOutContentHandler.isWriteLimitReached(t)) {
                     log.debug("Failed to extract text from a binary property."
                             + " This is a fairly common case, and nothing to"
                             + " worry about. The stack trace is included to"
                             + " help improve the text extraction feature.", t);
-                    builder.replace(0, builder.length(), "TextExtractionError");
+                    setExtractedText("TextExtractionError");
+                    return;
                 }
             } finally {
                 value.discard();
             }
-            setExtractedText(builder.toString());
-        }
-
-        @Override
-        public void characters(char[] ch, int start, int length)
-                throws SAXException {
-            builder.append(
-                    ch, start,
-                    Math.min(length, maxFieldLength - builder.length()));
-            if (builder.length() >= maxFieldLength) {
-                throw STOP;
-            }
-        }
-
-        @Override
-        public void ignorableWhitespace(char[] ch, int start, int length)
-                throws SAXException {
-            characters(ch, start, length);
+            setExtractedText(writeOutContentHandler.toString());
         }
 
+        protected abstract void setExtractedText(String value);
     }
-
 }

Added: jackrabbit/trunk/jackrabbit-core/src/test/java/org/apache/jackrabbit/core/query/lucene/LazyTextExtractorFieldTest.java
URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-core/src/test/java/org/apache/jackrabbit/core/query/lucene/LazyTextExtractorFieldTest.java?rev=1327116&view=auto
==============================================================================
--- jackrabbit/trunk/jackrabbit-core/src/test/java/org/apache/jackrabbit/core/query/lucene/LazyTextExtractorFieldTest.java	(added)
+++ jackrabbit/trunk/jackrabbit-core/src/test/java/org/apache/jackrabbit/core/query/lucene/LazyTextExtractorFieldTest.java	Tue Apr 17 14:49:59 2012
@@ -0,0 +1,51 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.jackrabbit.core.query.lucene;
+
+import org.apache.jackrabbit.core.data.RandomInputStream;
+import org.apache.jackrabbit.core.query.AbstractIndexingTest;
+import org.apache.jackrabbit.core.query.lucene.LazyTextExtractorField.ParsingTask;
+import org.apache.jackrabbit.core.value.InternalValue;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.Parser;
+
+public class LazyTextExtractorFieldTest extends AbstractIndexingTest {
+
+    /**
+     * @see <a
+     *      href="https://issues.apache.org/jira/browse/JCR-3296">JCR-3296</a>
+     *      Indexing ignored file types creates some garbage
+     */
+    public void testEmptyParser() throws Exception {
+
+        InternalValue val = InternalValue
+                .create(new RandomInputStream(1, 1024));
+
+        Metadata metadata = new Metadata();
+        metadata.set(Metadata.CONTENT_TYPE, "application/java-archive");
+        metadata.set(Metadata.CONTENT_ENCODING, "UTF-8");
+
+        Parser p = getSearchIndex().getParser();
+
+        ParsingTask task = new ParsingTask(p, val, metadata, Integer.MAX_VALUE) {
+            public void setExtractedText(String value) {
+                assertEquals("", value);
+            }
+        };
+        task.run();
+    }
+}

Propchange: jackrabbit/trunk/jackrabbit-core/src/test/java/org/apache/jackrabbit/core/query/lucene/LazyTextExtractorFieldTest.java
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: jackrabbit/trunk/jackrabbit-core/src/test/java/org/apache/jackrabbit/core/query/lucene/LazyTextExtractorFieldTest.java
------------------------------------------------------------------------------
    svn:keywords = Author Date Id Revision Rev URL

Modified: jackrabbit/trunk/jackrabbit-core/src/test/java/org/apache/jackrabbit/core/query/lucene/TestAll.java
URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-core/src/test/java/org/apache/jackrabbit/core/query/lucene/TestAll.java?rev=1327116&r1=1327115&r2=1327116&view=diff
==============================================================================
--- jackrabbit/trunk/jackrabbit-core/src/test/java/org/apache/jackrabbit/core/query/lucene/TestAll.java	(original)
+++ jackrabbit/trunk/jackrabbit-core/src/test/java/org/apache/jackrabbit/core/query/lucene/TestAll.java	Tue Apr 17 14:49:59 2012
@@ -44,6 +44,7 @@ public class TestAll extends TestCase {
         suite.addTestSuite(IndexingConfigurationImplTest.class);
         suite.addTestSuite(SQL2IndexingAggregateTest.class);
         suite.addTestSuite(SQL2IndexingAggregateTest2.class);
+        suite.addTestSuite(LazyTextExtractorFieldTest.class);
 
         return suite;
     }



Mime
View raw message