jackrabbit-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From mreut...@apache.org
Subject svn commit: r489000 - in /jackrabbit/trunk/jackrabbit-core: ./ src/main/java/org/apache/jackrabbit/core/query/ src/main/java/org/apache/jackrabbit/core/query/lucene/
Date Wed, 20 Dec 2006 09:35:15 GMT
Author: mreutegg
Date: Wed Dec 20 01:35:14 2006
New Revision: 489000

URL: http://svn.apache.org/viewvc?view=rev&rev=489000
Log:
JCR-415: Enhance indexing of binary content
- Use text-extractor module in jackrabbit-core

Added:
    jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/JackrabbitTextExtractor.java
  (with props)
    jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/TextExtractorFilter.java
  (with props)
    jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/TextExtractorReader.java
  (with props)
    jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/TextFilterExtractor.java
  (with props)
Modified:
    jackrabbit/trunk/jackrabbit-core/pom.xml
    jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/TextFilter.java
    jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/NodeIndexer.java
    jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/SearchIndex.java
    jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/TextPlainTextFilter.java

Modified: jackrabbit/trunk/jackrabbit-core/pom.xml
URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-core/pom.xml?view=diff&rev=489000&r1=488999&r2=489000
==============================================================================
--- jackrabbit/trunk/jackrabbit-core/pom.xml (original)
+++ jackrabbit/trunk/jackrabbit-core/pom.xml Wed Dec 20 01:35:14 2006
@@ -300,6 +300,11 @@
       <version>${pom.version}</version>
     </dependency>
     <dependency>
+      <groupId>org.apache.jackrabbit</groupId>
+      <artifactId>jackrabbit-text-extractors</artifactId>
+      <version>${pom.version}</version>
+    </dependency>
+    <dependency>
       <groupId>log4j</groupId>
       <artifactId>log4j</artifactId>
       <version>1.2.8</version>

Modified: jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/TextFilter.java
URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/TextFilter.java?view=diff&rev=489000&r1=488999&r2=489000
==============================================================================
--- jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/TextFilter.java
(original)
+++ jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/TextFilter.java
Wed Dec 20 01:35:14 2006
@@ -29,6 +29,9 @@
  * mime type ({@link #canFilter(String)} and if one of them returns
  * <code>true</code> the text representation is created with
  * {@link #doFilter(PropertyState, String)}
+ *
+ * @deprecated use the {@link org.apache.jackrabbit.extractor.TextExtractor}
+ *             interface
  */
 public interface TextFilter {
 

Added: jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/JackrabbitTextExtractor.java
URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/JackrabbitTextExtractor.java?view=auto&rev=489000
==============================================================================
--- jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/JackrabbitTextExtractor.java
(added)
+++ jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/JackrabbitTextExtractor.java
Wed Dec 20 01:35:14 2006
@@ -0,0 +1,198 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.jackrabbit.core.query.lucene;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.Reader;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.HashSet;
+import java.util.Iterator;
+import java.util.Set;
+import java.util.StringTokenizer;
+
+import org.apache.jackrabbit.core.query.TextFilter;
+import org.apache.jackrabbit.extractor.CompositeTextExtractor;
+import org.apache.jackrabbit.extractor.DelegatingTextExtractor;
+import org.apache.jackrabbit.extractor.EmptyTextExtractor;
+import org.apache.jackrabbit.extractor.TextExtractor;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Backwards-compatible Jackrabbit text extractor component. This class
+ * implements the following functionality:
+ * <ul>
+ *   <li>
+ *     Parses the configured {@link TextExtractor} and {@link TextFilter}
+ *     class names and instantiates the configured classes.
+ *   </li>
+ *   <li>
+ *     Acts as the delegate extractor for any configured
+ *     {@link DelegatingTextExtractor} instances.
+ *   </li>
+ *   <li>
+ *     Maintains a {@link CompositeTextExtractor} instance that contains
+ *     all the configured extractors and to which all text extraction calls
+ *     are delegated.
+ *   </li>
+ *   <li>
+ *     Creates a {@link TextFilterExtractor} adapter for a configured
+ *     {@link TextFilter} instance when it is first used and adds that adapter
+ *     to the composite extractor for use in text extraction.
+ *   </li>
+ *   <li>
+ *     Logs a warning and creates a dummy {@link EmptyTextExtractor} instance
+ *     for any unsupported content types when first detected. The dummy
+ *     extractor is added to the composite extractor to prevent future
+ *     warnings about the same content type.
+ *   </li>
+ * </ul>
+ */
+public class JackrabbitTextExtractor implements TextExtractor {
+
+    /**
+     * Logger instance.
+     */
+    private static final Logger logger =
+        LoggerFactory.getLogger(JackrabbitTextExtractor.class);
+
+    /**
+     * Set of content types that are known to be supported by the
+     * composite extractor.
+     */
+    private final Set types = new HashSet();
+
+    /**
+     * Composite extractor used for all text extraction tasks. Contains
+     * all the {@link TextExtractor} instances for directly supported content
+     * types, the {@link TextFilterExtractor} adapters for backwards
+     * compatibility with configured {@link TextFilter} instances that have
+     * already been used, and the dummy {@link EmptyTextExtractor} instances
+     * created for unsupported content types.
+     */
+    private final CompositeTextExtractor extractor =
+        new CompositeTextExtractor();
+
+    /**
+     * Configured {@link TextFilter} instances. Used for backwards
+     * compatibility with existing configuration files and {@link TextFilter}
+     * implementations.
+     */
+    private final Collection filters = new ArrayList();
+
+    /**
+     * Creates a Jackrabbit text extractor containing the configured component
+     * classes.
+     *
+     * @param classes configured {@link TextExtractor} (and {@link TextFilter})
+     *                class names (space- or comma-separated)
+     */
+    public JackrabbitTextExtractor(String classes) {
+        logger.debug("JackrabbitTextExtractor({})", classes);
+        StringTokenizer tokenizer = new StringTokenizer(classes, ", \t\n\r\f");
+        while (tokenizer.hasMoreTokens()) {
+            String name = tokenizer.nextToken();
+            try {
+                Object object = Class.forName(name).newInstance();
+                if (object instanceof DelegatingTextExtractor) {
+                    ((DelegatingTextExtractor) object)
+                        .setDelegateTextExtractor(this);
+                }
+                if (object instanceof TextExtractor) {
+                    extractor.addTextExtractor((TextExtractor) object);
+                } else if (object instanceof TextFilter) {
+                    filters.add(object);
+                } else {
+                    logger.warn("Unknown text extractor class: {}", name);
+                }
+            } catch (ClassNotFoundException e) {
+                logger.warn("Extractor class not found: " + name, e);
+            } catch (LinkageError e) {
+                logger.warn("Extractor dependency not found: " + name, e);
+            } catch (IllegalAccessException e) {
+                logger.warn("Extractor constructor not accessible: " + name, e);
+            } catch (InstantiationException e) {
+                logger.warn("Extractor instantiation failed: " + name, e);
+            }
+        }
+
+        types.addAll(Arrays.asList(extractor.getContentTypes()));
+    }
+
+    //-------------------------------------------------------< TextExtractor >
+
+    /**
+     * Returns the content types that the component extractors are known
+     * to support.
+     *
+     * @return supported content types
+     */
+    public String[] getContentTypes() {
+        return extractor.getContentTypes(); // and then some
+    }
+
+    /**
+     * Extracts the text content from the given binary stream. The given
+     * content type is used to look up a configured text extractor to which
+     * to delegate the request.
+     * <p>
+     * If a matching extractor is not found, then the configured text filters
+     * are searched for an instance that claims to support the given content type.
+     * A text extractor adapter is created for that filter and saved in the
+     * extractor map for future use before delegating the request to the
+     * adapter.
+     * <p>
+     * If not even a text filter is found for the given content type, a warning
+     * is logged and an empty text extractor is created for that content type
+     * and saved in the extractor map for future use before delegating the
+     * request to the empty extractor.
+     *
+     * @param stream binary stream
+     * @param type content type
+     * @param encoding character encoding, or <code>null</code>
+     * @return reader for the text content of the binary stream
+     * @throws IOException if the binary stream can not be read
+     */
+    public Reader extractText(InputStream stream, String type, String encoding)
+            throws IOException {
+        logger.debug("extractText(stream, {}, {})", type, encoding);
+        if (!types.contains(type)) {
+            Iterator iterator = filters.iterator();
+            while (iterator.hasNext()) {
+                TextFilter filter = (TextFilter) iterator.next();
+                if (filter.canFilter(type)) {
+                    types.add(type);
+                    extractor.addTextExtractor(
+                            new TextFilterExtractor(type, filter));
+                    break;
+                }
+            }
+        }
+
+        if (!types.contains(type)) {
+            logger.warn("Full text indexing of {} is not supported", type);
+            types.add(type);
+            extractor.addTextExtractor(new EmptyTextExtractor(type));
+        }
+
+        return extractor.extractText(stream, type, encoding);
+    }
+
+}

Propchange: jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/JackrabbitTextExtractor.java
------------------------------------------------------------------------------
    svn:eol-style = native

Modified: jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/NodeIndexer.java
URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/NodeIndexer.java?view=diff&rev=489000&r1=488999&r2=489000
==============================================================================
--- jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/NodeIndexer.java
(original)
+++ jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/NodeIndexer.java
Wed Dec 20 01:35:14 2006
@@ -17,13 +17,14 @@
 package org.apache.jackrabbit.core.query.lucene;
 
 import org.apache.jackrabbit.core.PropertyId;
-import org.apache.jackrabbit.core.query.TextFilter;
 import org.apache.jackrabbit.core.state.ItemStateException;
 import org.apache.jackrabbit.core.state.ItemStateManager;
 import org.apache.jackrabbit.core.state.NoSuchItemStateException;
 import org.apache.jackrabbit.core.state.NodeState;
 import org.apache.jackrabbit.core.state.PropertyState;
+import org.apache.jackrabbit.core.value.BLOBFileValue;
 import org.apache.jackrabbit.core.value.InternalValue;
+import org.apache.jackrabbit.extractor.TextExtractor;
 import org.apache.jackrabbit.name.NoPrefixDeclaredException;
 import org.apache.jackrabbit.name.Path;
 import org.apache.jackrabbit.name.QName;
@@ -37,12 +38,11 @@
 import javax.jcr.NamespaceException;
 import javax.jcr.PropertyType;
 import javax.jcr.RepositoryException;
+
+import java.io.InputStream;
 import java.io.Reader;
 import java.util.Calendar;
-import java.util.Collections;
 import java.util.Iterator;
-import java.util.List;
-import java.util.Map;
 import java.util.Set;
 
 /**
@@ -72,9 +72,9 @@
     protected final NamespaceMappings mappings;
 
     /**
-     * List of text filters in use.
+     * Content extractor.
      */
-    protected final List textFilters;
+    protected final TextExtractor extractor;
 
     /**
      * Creates a new node indexer.
@@ -82,16 +82,16 @@
      * @param node          the node state to index.
      * @param stateProvider the persistent item state manager to retrieve properties.
      * @param mappings      internal namespace mappings.
-     * @param textFilters   List of {@link org.apache.jackrabbit.core.query.TextFilter}s.
+     * @param extractor     content extractor
      */
     protected NodeIndexer(NodeState node,
                           ItemStateManager stateProvider,
                           NamespaceMappings mappings,
-                          List textFilters) {
+                          TextExtractor extractor) {
         this.node = node;
         this.stateProvider = stateProvider;
         this.mappings = mappings;
-        this.textFilters = textFilters;
+        this.extractor = extractor;
     }
 
     /**
@@ -100,8 +100,7 @@
      * @param node          the node state to index.
      * @param stateProvider the state provider to retrieve property values.
      * @param mappings      internal namespace mappings.
-     * @param textFilters   list of text filters to use for indexing binary
-     *                      properties.
+     * @param extractor     text extractor
      * @return the lucene Document.
      * @throws RepositoryException if an error occurs while reading property
      *                             values from the <code>ItemStateProvider</code>.
@@ -109,9 +108,9 @@
     public static Document createDocument(NodeState node,
                                           ItemStateManager stateProvider,
                                           NamespaceMappings mappings,
-                                          List textFilters)
+                                          TextExtractor extractor)
             throws RepositoryException {
-        NodeIndexer indexer = new NodeIndexer(node, stateProvider, mappings, textFilters);
+        NodeIndexer indexer = new NodeIndexer(node, stateProvider, mappings, extractor);
         return indexer.createDoc();
     }
 
@@ -256,14 +255,16 @@
      * Adds the binary value to the document as the named field.
      * <p/>
      * This implementation checks if this {@link #node} is of type nt:resource
-     * and if that is the case, tries to extract text from the data atom using
-     * the {@link #textFilters}.
+     * and if that is the case, tries to extract text from the binary property
+     * using the {@link #extractor}.
      *
      * @param doc           The document to which to add the field
      * @param fieldName     The name of the field to add
      * @param internalValue The value for the field to add to the document.
      */
-    protected void addBinaryValue(Document doc, String fieldName, Object internalValue) {
+    protected void addBinaryValue(Document doc,
+                                  String fieldName,
+                                  Object internalValue) {
         // 'check' if node is of type nt:resource
         try {
             String jcrData = mappings.getPrefix(QName.NS_JCR_URI) + ":data";
@@ -271,43 +272,53 @@
                 // don't know how to index
                 return;
             }
-            if (node.hasPropertyName(QName.JCR_MIMETYPE)) {
-                PropertyState dataProp = (PropertyState) stateProvider.getItemState(
-                        new PropertyId(node.getNodeId(), QName.JCR_DATA));
-                PropertyState mimeTypeProp =
-                        (PropertyState) stateProvider.getItemState(
-                                new PropertyId(node.getNodeId(), QName.JCR_MIMETYPE));
+
+            InternalValue typeValue = getValue(QName.JCR_MIMETYPE);
+            if (typeValue != null) {
+                String type = typeValue.internalValue().toString();
 
                 // jcr:encoding is not mandatory
                 String encoding = null;
-                if (node.hasPropertyName(QName.JCR_ENCODING)) {
-                    PropertyState encodingProp =
-                            (PropertyState) stateProvider.getItemState(
-                                    new PropertyId(node.getNodeId(), QName.JCR_ENCODING));
-                    encoding = encodingProp.getValues()[0].internalValue().toString();
-                }
-
-                String mimeType = mimeTypeProp.getValues()[0].internalValue().toString();
-                Map fields = Collections.EMPTY_MAP;
-                for (Iterator it = textFilters.iterator(); it.hasNext();) {
-                    TextFilter filter = (TextFilter) it.next();
-                    // use the first filter that can handle the mimeType
-                    if (filter.canFilter(mimeType)) {
-                        fields = filter.doFilter(dataProp, encoding);
-                        break;
-                    }
+                InternalValue encodingValue = getValue(QName.JCR_ENCODING);
+                if (encodingValue != null) {
+                    encoding = encodingValue.internalValue().toString();
                 }
 
-                for (Iterator it = fields.keySet().iterator(); it.hasNext();) {
-                    String field = (String) it.next();
-                    Reader r = (Reader) fields.get(field);
-                    doc.add(new Field(field, r));
-                }
+                InputStream stream =
+                        ((BLOBFileValue) internalValue).getStream();
+                Reader reader =
+                        new TextExtractorReader(extractor, stream, type, encoding);
+                doc.add(new Field(FieldNames.FULLTEXT, reader));
             }
         } catch (Exception e) {
             // TODO: How to recover from a transient indexing failure?
             log.warn("Exception while indexing binary property: " + e.toString());
             log.debug("Dump: ", e);
+        }
+    }
+
+    /**
+     * Utility method that extracts the first value of the named property
+     * of the current node. Returns <code>null</code> if the property does
+     * not exist or contains no values.
+     *
+     * @param name property name
+     * @return value of the named property, or <code>null</code>
+     * @throws ItemStateException if the property can not be accessed
+     */
+    protected InternalValue getValue(QName name) throws ItemStateException {
+        try {
+            PropertyId id = new PropertyId(node.getNodeId(), name);
+            PropertyState property =
+                (PropertyState) stateProvider.getItemState(id);
+            InternalValue[] values = property.getValues();
+            if (values.length > 0) {
+                return values[0];
+            } else {
+                return null;
+            }
+        } catch (NoSuchItemStateException e) {
+            return null;
         }
     }
 

Modified: jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/SearchIndex.java
URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/SearchIndex.java?view=diff&rev=489000&r1=488999&r2=489000
==============================================================================
--- jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/SearchIndex.java
(original)
+++ jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/SearchIndex.java
Wed Dec 20 01:35:14 2006
@@ -23,10 +23,11 @@
 import org.apache.jackrabbit.core.query.AbstractQueryHandler;
 import org.apache.jackrabbit.core.query.ExecutableQuery;
 import org.apache.jackrabbit.core.query.QueryHandlerContext;
-import org.apache.jackrabbit.core.query.TextFilter;
 import org.apache.jackrabbit.core.query.QueryHandler;
 import org.apache.jackrabbit.core.state.NodeState;
 import org.apache.jackrabbit.core.state.NodeStateIterator;
+import org.apache.jackrabbit.extractor.DefaultTextExtractor;
+import org.apache.jackrabbit.extractor.TextExtractor;
 import org.apache.jackrabbit.name.NoPrefixDeclaredException;
 import org.apache.jackrabbit.name.QName;
 import org.apache.jackrabbit.name.NameFormat;
@@ -50,9 +51,7 @@
 import java.io.File;
 import java.util.Iterator;
 import java.util.List;
-import java.util.StringTokenizer;
 import java.util.ArrayList;
-import java.util.Collections;
 import java.util.HashSet;
 import java.util.Set;
 
@@ -91,11 +90,6 @@
     public static final int DEFAULT_MAX_FIELD_LENGTH = 10000;
 
     /**
-     * Default text filters.
-     */
-    public static final String DEFAULT_TEXT_FILTERS = TextPlainTextFilter.class.getName();
-
-    /**
      * The actual index
      */
     private MultiIndex index;
@@ -106,9 +100,17 @@
     private Analyzer analyzer;
 
     /**
-     * List of {@link org.apache.jackrabbit.core.query.TextFilter} instance.
+     * List of text extractor and text filter class names. The configured
+     * classes will be instantiated and used to extract text content from
+     * binary properties.
+     */
+    private String textFilterClasses =
+        DefaultTextExtractor.class.getName();
+
+    /**
+     * Text extractor for extracting text content of binary properties.
      */
-    private List textFilters;
+    private TextExtractor extractor;
 
     /**
      * The location of the search index.
@@ -199,7 +201,6 @@
      */
     public SearchIndex() {
         this.analyzer = new StandardAnalyzer(new String[]{});
-        setTextFilterClasses(DEFAULT_TEXT_FILTERS);
     }
 
     /**
@@ -242,6 +243,8 @@
             }
         }
 
+        extractor = new JackrabbitTextExtractor(textFilterClasses);
+
         index = new MultiIndex(indexDir, this, context.getItemStateManager(),
                 context.getRootId(), excludedIDs, nsMappings);
         if (index.getRedoLogApplied() || forceConsistencyCheck) {
@@ -413,13 +416,12 @@
     }
 
     /**
-     * Returns an unmodifiable list of {@link TextFilter} configured for
-     * this search index.
+     * Returns the text extractor in use for indexing.
      *
-     * @return unmodifiable list of text filters.
+     * @return the text extractor in use for indexing.
      */
-    protected List getTextFilters() {
-        return textFilters;
+    public TextExtractor getTextExtractor() {
+        return extractor;
     }
 
     /**
@@ -473,7 +475,7 @@
     protected Document createDocument(NodeState node, NamespaceMappings nsMappings)
             throws RepositoryException {
         return NodeIndexer.createDocument(node, getContext().getItemStateManager(),
-                nsMappings, textFilters);
+                nsMappings, extractor);
     }
 
     /**
@@ -753,34 +755,17 @@
     }
 
     /**
-     * Sets a new set of text filter classes that are in use for indexing
-     * binary properties. The <code>filterClasses</code> must be a comma
-     * separated <code>String</code> of fully qualified class names implementing
-     * {@link org.apache.jackrabbit.core.query.TextFilter}. Each class must
-     * provide a default constructor.
-     * </p>
-     * Filter class names that cannot be resolved are skipped and a warn message
-     * is logged.
+     * Sets the list of text extractors (and text filters) to use for
+     * extracting text content from binary properties. The list must be
+     * comma (or whitespace) separated, and contain fully qualified class
+     * names of the {@link TextExtractor} (and {@link org.apache.jackrabbit.core.query.TextFilter}) classes
+     * to be used. The configured classes must all have a public default
+     * constructor.
      *
-     * @param filterClasses comma separated list of filter class names
+     * @param filterClasses comma separated list of class names
      */
     public void setTextFilterClasses(String filterClasses) {
-        List filters = new ArrayList();
-        StringTokenizer tokenizer = new StringTokenizer(filterClasses, ", \t\n\r\f");
-        while (tokenizer.hasMoreTokens()) {
-            String className = tokenizer.nextToken();
-            try {
-                Class filterClass = Class.forName(className);
-                TextFilter filter = (TextFilter) filterClass.newInstance();
-                filters.add(filter);
-            } catch (Exception e) {
-                log.warn("Invalid TextFilter class: " + className, e);
-            } catch (LinkageError e) {
-                log.warn("Missing dependency for text filter: " + className);
-                log.warn(e.toString());
-            }
-        }
-        textFilters = Collections.unmodifiableList(filters);
+        this.textFilterClasses = filterClasses;
     }
 
     /**
@@ -790,14 +775,7 @@
      * @return class names of the text filters in use.
      */
     public String getTextFilterClasses() {
-        StringBuffer names = new StringBuffer();
-        String delim = "";
-        for (Iterator it = textFilters.iterator(); it.hasNext();) {
-            names.append(delim);
-            names.append(it.next().getClass().getName());
-            delim = ",";
-        }
-        return names.toString();
+        return textFilterClasses;
     }
 
     /**

Added: jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/TextExtractorFilter.java
URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/TextExtractorFilter.java?view=auto&rev=489000
==============================================================================
--- jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/TextExtractorFilter.java
(added)
+++ jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/TextExtractorFilter.java
Wed Dec 20 01:35:14 2006
@@ -0,0 +1,120 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.jackrabbit.core.query.lucene;
+
+import java.io.IOException;
+import java.io.Reader;
+import java.util.HashMap;
+import java.util.Map;
+
+import javax.jcr.RepositoryException;
+
+import org.apache.jackrabbit.core.query.TextFilter;
+import org.apache.jackrabbit.core.state.PropertyState;
+import org.apache.jackrabbit.core.value.BLOBFileValue;
+import org.apache.jackrabbit.core.value.InternalValue;
+import org.apache.jackrabbit.extractor.TextExtractor;
+
+/**
+ * Utility base class for migrating functionality from existing implementations
+ * of the deprecated {@link TextFilter} interface to the new
+ * {@link TextExtractor} interface. Once the functionality of an existing
+ * TextFilter has been copied to a new TextExtractor, the original class can
+ * be replaced with the following template to keep backwards compatibility
+ * while avoiding the burden of maintaining duplicate code:
+ * <pre>
+ * <b>public class</b> SomeTextFilter <b>extends</b> TextExtractorFilter {
+ *     <b>public</b> SomeTextFilter() {
+ *         <b>super</b>(<b>new</b> SomeTextExtractor());
+ *     }
+ * }
+ * </pre>
+ */
+public class TextExtractorFilter implements TextFilter {
+
+    /**
+     * The adapted text extractor.
+     */
+    private final TextExtractor extractor;
+
+    /**
+     * Creates a text filter adapter for the given text extractor.
+     *
+     * @param extractor adapted text extractor
+     */
+    public TextExtractorFilter(TextExtractor extractor) {
+        this.extractor = extractor;
+    }
+
+    /**
+     * Returns true if the adapted text extractor supports the given
+     * content type.
+     *
+     * @param mimeType content type
+     * @return <code>true</code> if the content type is supported,
+     *         <code>false</code> otherwise
+     */
+    public boolean canFilter(String mimeType) {
+        mimeType = mimeType.toLowerCase();
+        String[] types = extractor.getContentTypes();
+        for (int i = 0; i < types.length; i++) {
+            if (types[i].equals(mimeType)) {
+                return true;
+            }
+        }
+        return false;
+    }
+
+    /**
+     * Extracts text content of the given binary property using the adapted
+     * text extractor.
+     *
+     * @param data binary property
+     * @param encoding character encoding, or <code>null</code>
+     * @return map that contains a reader for the extracted text as
+     *         the {@link FieldNames#FULLTEXT} entry
+     * @throws RepositoryException if the binary property can not be read
+     */
+    public Map doFilter(PropertyState data, String encoding)
+            throws RepositoryException {
+        InternalValue[] values = data.getValues();
+        if (values.length == 1) {
+            try {
+                String type = "application/octet-stream";
+                String[] types = extractor.getContentTypes();
+                if (types.length > 0) {
+                    type = types[0];
+                }
+
+                BLOBFileValue blob = (BLOBFileValue) values[0].internalValue();
+                Reader reader =
+                    extractor.extractText(blob.getStream(), type, encoding);
+
+                Map result = new HashMap();
+                result.put(FieldNames.FULLTEXT, reader);
+                return result;
+            } catch (IOException e) {
+                throw new RepositoryException("Text extraction error", e);
+            }
+        } else {
+            // multi value not supported
+            throw new RepositoryException(
+                    "Multi-valued binary properties not supported.");
+        }
+    }
+
+}

Propchange: jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/TextExtractorFilter.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/TextExtractorReader.java
URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/TextExtractorReader.java?view=auto&rev=489000
==============================================================================
--- jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/TextExtractorReader.java
(added)
+++ jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/TextExtractorReader.java
Wed Dec 20 01:35:14 2006
@@ -0,0 +1,114 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.jackrabbit.core.query.lucene;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.Reader;
+
+import org.apache.jackrabbit.extractor.TextExtractor;
+
+/**
+ * Reader that extracts the text content of a binary stream for reading
+ * only when the first character is requested. This class is used by the
+ * {@link NodeIndexer} class to postpone text extraction to when the
+ * content is actually needed.
+ *
+ * @see <a href="http://issues.apache.org/jira/browse/JCR-264">JCR-264</a>
+ */
+public class TextExtractorReader extends Reader {
+
+    /**
+     * Text extractor to use in extracting text content from the binary stream.
+     */
+    private final TextExtractor extractor;
+
+    /**
+     * Binary stream from which to extract the content for reading.
+     */
+    private final InputStream stream;
+
+    /**
+     * Content type of the binary stream.
+     */
+    private final String type;
+
+    /**
+     * Character encoding of the binary stream, or <code>null</code>.
+     */
+    private final String encoding;
+
+    /**
+     * Reader for the extracted text content. Remains <code>null</code> until
+     * the first read request triggers the text extraction.
+     */
+    private Reader reader;
+
+    /**
+     * Creates a reader that extracts the text content from the given binary
+     * stream. No extraction takes place until the first read.
+     *
+     * @param extractor text extractor
+     * @param stream binary stream
+     * @param type content type
+     * @param encoding character encoding, or <code>null</code>
+     */
+    public TextExtractorReader(
+            TextExtractor extractor, InputStream stream,
+            String type, String encoding) {
+        this.extractor = extractor;
+        this.stream = stream;
+        this.type = type;
+        this.encoding = encoding;
+        this.reader = null; // extraction is deferred until the first read
+    }
+
+    //--------------------------------------------------------------< Reader >
+
+    /**
+     * Reads up to the given number of characters to the given buffer position
+     * from the extracted text content reader. Uses the text extractor to
+     * create the text content reader when first invoked.
+     *
+     * @param buffer buffer to place characters in
+     * @param offset buffer offset
+     * @param length maximum number of characters to read
+     * @return number of characters read, as returned by the extracted text reader
+     * @throws IOException if text extraction fails
+     */
+    public int read(char[] buffer, int offset, int length) throws IOException {
+        if (reader == null) {
+            reader = extractor.extractText(stream, type, encoding);
+        }
+        return reader.read(buffer, offset, length);
+    }
+
+    /**
+     * Closes the reader of the extracted text if extraction has taken place;
+     * otherwise closes the binary stream directly.
+     *
+     * @throws IOException if the reader or stream can not be closed
+     */
+    public void close() throws IOException {
+        if (reader != null) {
+            reader.close();
+        } else {
+            stream.close();
+        }
+    }
+
+}

Propchange: jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/TextExtractorReader.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/TextFilterExtractor.java
URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/TextFilterExtractor.java?view=auto&rev=489000
==============================================================================
--- jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/TextFilterExtractor.java (added)
+++ jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/TextFilterExtractor.java Wed Dec 20 01:35:14 2006
@@ -0,0 +1,124 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.jackrabbit.core.query.lucene;
+
+import java.io.FilterReader;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.Reader;
+import java.io.StringReader;
+import java.util.Map;
+
+import javax.jcr.RepositoryException;
+
+import org.apache.jackrabbit.core.PropertyId;
+import org.apache.jackrabbit.core.query.TextFilter;
+import org.apache.jackrabbit.core.state.ItemState;
+import org.apache.jackrabbit.core.state.PropertyState;
+import org.apache.jackrabbit.core.value.BLOBFileValue;
+import org.apache.jackrabbit.core.value.InternalValue;
+import org.apache.jackrabbit.extractor.TextExtractor;
+
+/**
+ * Adapter class for achieving backwards compatibility with classes
+ * implementing the deprecated {@link TextFilter} interface. This class
+ * implements the {@link TextExtractor} interface through calls to an
+ * underlying {@link TextFilter} instance.
+ */
+public class TextFilterExtractor implements TextExtractor {
+
+    /**
+     * Supported content types; the array given to the constructor, stored as is.
+     */
+    private final String[] types;
+
+    /**
+     * The adapted text filter.
+     */
+    private final TextFilter filter;
+
+    /**
+     * Creates a text extractor adapter that supports the given content
+     * types using the given text filter.
+     *
+     * @param types supported content types
+     * @param filter text filter to be adapted
+     */
+    public TextFilterExtractor(String[] types, TextFilter filter) {
+        this.types = types;
+        this.filter = filter;
+    }
+
+    /**
+     * Creates a text extractor adapter that supports the given single
+     * content type using the given text filter.
+     *
+     * @param type supported content type
+     * @param filter text filter to be adapted
+     */
+    public TextFilterExtractor(String type, TextFilter filter) {
+        this(new String[] { type }, filter);
+    }
+
+    /**
+     * Returns the supported content types (the internal array, not a copy).
+     *
+     * @return supported content types
+     */
+    public String[] getContentTypes() {
+        return types;
+    }
+
+    /**
+     * Extracts the text content of the given binary stream by calling the
+     * underlying {@link TextFilter} instance with a dummy {@link PropertyState}
+     * (see {@link TextFilter#doFilter(PropertyState, String)}). An empty reader
+     * is returned when the filter fails or produces no fulltext reader.
+     *
+     * @param stream binary stream
+     * @param type content type; not used by this adapter
+     * @param encoding character encoding, or <code>null</code>
+     * @return reader for the extracted text content, possibly empty
+     * @throws IOException not raised when the filter throws a RepositoryException
+     */
+    public Reader extractText(InputStream stream, String type, String encoding)
+            throws IOException {
+        final InternalValue value = InternalValue.create(stream);
+        try {
+            PropertyState state = new PropertyState(
+                    (PropertyId) null, ItemState.STATUS_EXISTING, true); // dummy state, no id
+            state.setValues(new InternalValue[] { value });
+            Map fields = filter.doFilter(state, encoding);
+            Object fulltext = fields.get(FieldNames.FULLTEXT);
+            if (fulltext instanceof Reader) {
+                return new FilterReader((Reader) fulltext) { // discards the value on close
+                    public void close() throws IOException {
+                        super.close();
+                        ((BLOBFileValue) value.internalValue()).discard();
+                    }
+                };
+            } else {
+                ((BLOBFileValue) value.internalValue()).discard();
+                return new StringReader(""); // no fulltext reader produced by the filter
+            }
+        } catch (RepositoryException e) { // filter failure is not propagated
+            ((BLOBFileValue) value.internalValue()).discard();
+            return new StringReader("");
+        }
+    }
+
+}

Propchange: jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/TextFilterExtractor.java
------------------------------------------------------------------------------
    svn:eol-style = native

Modified: jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/TextPlainTextFilter.java
URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/TextPlainTextFilter.java?view=diff&rev=489000&r1=488999&r2=489000
==============================================================================
--- jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/TextPlainTextFilter.java (original)
+++ jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/TextPlainTextFilter.java Wed Dec 20 01:35:14 2006
@@ -16,65 +16,21 @@
  */
 package org.apache.jackrabbit.core.query.lucene;
 
-import org.apache.jackrabbit.core.query.TextFilter;
-import org.apache.jackrabbit.core.state.PropertyState;
-import org.apache.jackrabbit.core.value.BLOBFileValue;
-import org.apache.jackrabbit.core.value.InternalValue;
-
-import javax.jcr.RepositoryException;
-import java.io.InputStreamReader;
-import java.io.Reader;
-import java.io.UnsupportedEncodingException;
-import java.util.HashMap;
-import java.util.Map;
+import org.apache.jackrabbit.extractor.PlainTextExtractor;
 
 /**
- * Implements a {@link org.apache.jackrabbit.core.query.TextFilter} that handles binary properties of mime-type
- * text/plain.
+ * Text filter for <code>text/plain</code> content.
+ *
+ * @deprecated use {@link PlainTextExtractor}, this class is kept for
+ *             backwards compatibility with existing configuration files
  */
-public class TextPlainTextFilter implements TextFilter {
+public class TextPlainTextFilter extends TextExtractorFilter {
 
     /**
-     * Returns <code>true</code> for <code>text/plain</code>; <code>false</code>
-     * in all other cases.
-     * @param mimeType the mime-type.
-     * @return <code>true</code> for <code>text/plain</code>; <code>false</code>
-     * in all other cases.
+     * Creates a text filter for <code>text/plain</code> content.
      */
-    public boolean canFilter(String mimeType) {
-        return "text/plain".equalsIgnoreCase(mimeType);
+    public TextPlainTextFilter() {
+        super(new PlainTextExtractor());
     }
 
-    /**
-     * Returns a map with a single entry for field {@link FieldNames#FULLTEXT}.
-     * @param data the data property.
-     * @param encoding the encoding
-     * @return a map with a single Reader value for field
-     *  {@link FieldNames#FULLTEXT}.
-     * @throws RepositoryException if encoding is not supported or data is a
-     *  multi-value property.
-     */
-    public Map doFilter(PropertyState data, String encoding) throws RepositoryException {
-        InternalValue[] values = data.getValues();
-        if (values.length == 1) {
-            BLOBFileValue blob = (BLOBFileValue) values[0].internalValue();
-            try {
-                Reader reader;
-                if (encoding == null) {
-                    // use platform default
-                    reader = new InputStreamReader(blob.getStream());
-                } else {
-                    reader = new InputStreamReader(blob.getStream(), encoding);
-                }
-                Map result = new HashMap();
-                result.put(FieldNames.FULLTEXT, reader);
-                return result;
-            } catch (UnsupportedEncodingException e) {
-                throw new RepositoryException(e);
-            }
-        } else {
-            // multi value not supported
-            throw new RepositoryException("Multi-valued binary properties not supported.");
-        }
-    }
 }



Mime
View raw message