Author: mreutegg
Date: Wed Dec 20 01:35:14 2006
New Revision: 489000
URL: http://svn.apache.org/viewvc?view=rev&rev=489000
Log:
JCR-415: Enhance indexing of binary content
- Use text-extractor module in jackrabbit-core
Added:
jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/JackrabbitTextExtractor.java
(with props)
jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/TextExtractorFilter.java
(with props)
jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/TextExtractorReader.java
(with props)
jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/TextFilterExtractor.java
(with props)
Modified:
jackrabbit/trunk/jackrabbit-core/pom.xml
jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/TextFilter.java
jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/NodeIndexer.java
jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/SearchIndex.java
jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/TextPlainTextFilter.java
Modified: jackrabbit/trunk/jackrabbit-core/pom.xml
URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-core/pom.xml?view=diff&rev=489000&r1=488999&r2=489000
==============================================================================
--- jackrabbit/trunk/jackrabbit-core/pom.xml (original)
+++ jackrabbit/trunk/jackrabbit-core/pom.xml Wed Dec 20 01:35:14 2006
@@ -300,6 +300,11 @@
<version>${pom.version}</version>
</dependency>
<dependency>
+ <groupId>org.apache.jackrabbit</groupId>
+ <artifactId>jackrabbit-text-extractors</artifactId>
+ <version>${pom.version}</version>
+ </dependency>
+ <dependency>
<groupId>log4j</groupId>
<artifactId>log4j</artifactId>
<version>1.2.8</version>
Modified: jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/TextFilter.java
URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/TextFilter.java?view=diff&rev=489000&r1=488999&r2=489000
==============================================================================
--- jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/TextFilter.java
(original)
+++ jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/TextFilter.java
Wed Dec 20 01:35:14 2006
@@ -29,6 +29,9 @@
* mime type ({@link #canFilter(String)} and if one of them returns
* <code>true</code> the text representation is created with
* {@link #doFilter(PropertyState, String)}
+ *
+ * @deprecated use the {@link org.apache.jackrabbit.extractor.TextExtractor}
+ * interface
*/
public interface TextFilter {
Added: jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/JackrabbitTextExtractor.java
URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/JackrabbitTextExtractor.java?view=auto&rev=489000
==============================================================================
--- jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/JackrabbitTextExtractor.java
(added)
+++ jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/JackrabbitTextExtractor.java
Wed Dec 20 01:35:14 2006
@@ -0,0 +1,198 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.jackrabbit.core.query.lucene;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.Reader;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.HashSet;
+import java.util.Iterator;
+import java.util.Set;
+import java.util.StringTokenizer;
+
+import org.apache.jackrabbit.core.query.TextFilter;
+import org.apache.jackrabbit.extractor.CompositeTextExtractor;
+import org.apache.jackrabbit.extractor.DelegatingTextExtractor;
+import org.apache.jackrabbit.extractor.EmptyTextExtractor;
+import org.apache.jackrabbit.extractor.TextExtractor;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Backwards-compatible Jackrabbit text extractor component. This class
+ * implements the following functionality:
+ * <ul>
+ * <li>
+ * Parses the configured {@link TextExtractor} and {@link TextFilter}
+ * class names and instantiates the configured classes.
+ * </li>
+ * <li>
+ * Acts as the delegate extractor for any configured
+ * {@link DelegatingTextExtractor} instances.
+ * </li>
+ * <li>
+ * Maintains a {@link CompositeTextExtractor} instance that contains
+ * all the configured extractors and to which all text extraction calls
+ * are delegated.
+ * </li>
+ * <li>
+ * Creates a {@link TextFilterExtractor} adapter for a configured
+ * {@link TextFilter} instance when it is first used and adds that adapter
+ * to the composite extractor for use in text extraction.
+ * </li>
+ * <li>
+ * Logs a warning and creates a dummy {@link EmptyTextExtractor} instance
+ * for any unsupported content types when first detected. The dummy
+ * extractor is added to the composite extractor to prevent future
+ * warnings about the same content type.
+ * </li>
+ * </ul>
+ */
+public class JackrabbitTextExtractor implements TextExtractor {
+
+ /**
+ * Logger instance.
+ */
+ private static final Logger logger =
+ LoggerFactory.getLogger(JackrabbitTextExtractor.class);
+
+ /**
+ * Set of content types that are known to be supported by the
+ * composite extractor.
+ */
+ private final Set types = new HashSet();
+
+ /**
+ * Composite extractor used for all text extraction tasks. Contains
+ * all the {@link TextExtractor} instances for directly supported content
+ * types, the {@link TextFilterExtractor} adapters for backwards
+ * compatibility with configured {@link TextFilter} instances that have
+ * already been used, and the dummy {@link EmptyTextExtractor} instances
+ * created for unsupported content types.
+ */
+ private final CompositeTextExtractor extractor =
+ new CompositeTextExtractor();
+
+ /**
+ * Configured {@link TextFilter} instances. Used for backwards
+ * compatibility with existing configuration files and {@link TextFilter}
+ * implementations.
+ */
+ private final Collection filters = new ArrayList();
+
+ /**
+ * Creates a Jackrabbit text extractor containing the configured component
+ * classes.
+ *
+ * @param classes configured {@link TextExtractor} (and {@link TextFilter})
+ * class names (space- or comma-separated)
+ */
+ public JackrabbitTextExtractor(String classes) {
+ logger.debug("JackrabbitTextExtractor({})", classes);
+ StringTokenizer tokenizer = new StringTokenizer(classes, ", \t\n\r\f");
+ while (tokenizer.hasMoreTokens()) {
+ String name = tokenizer.nextToken();
+ try {
+ Object object = Class.forName(name).newInstance();
+ if (object instanceof DelegatingTextExtractor) {
+ ((DelegatingTextExtractor) object)
+ .setDelegateTextExtractor(this);
+ }
+ if (object instanceof TextExtractor) {
+ extractor.addTextExtractor((TextExtractor) object);
+ } else if (object instanceof TextFilter) {
+ filters.add(object);
+ } else {
+ logger.warn("Unknown text extractor class: {}", name);
+ }
+ } catch (ClassNotFoundException e) {
+ logger.warn("Extractor class not found: " + name, e);
+ } catch (LinkageError e) {
+ logger.warn("Extractor dependency not found: " + name, e);
+ } catch (IllegalAccessException e) {
+ logger.warn("Extractor constructor not accessible: " + name, e);
+ } catch (InstantiationException e) {
+ logger.warn("Extractor instantiation failed: " + name, e);
+ }
+ }
+
+ types.addAll(Arrays.asList(extractor.getContentTypes()));
+ }
+
+ //-------------------------------------------------------< TextExtractor >
+
+ /**
+ * Returns the content types that the component extractors are known
+ * to support.
+ *
+ * @return supported content types
+ */
+ public String[] getContentTypes() {
+ return extractor.getContentTypes(); // and then some
+ }
+
+ /**
+ * Extracts the text content from the given binary stream. The given
+ * content type is used to look up a configured text extractor to which
+ * to delegate the request.
+ * <p>
+ * If a matching extractor is not found, then the configured text filters
+ * are searched for an instance that claims to support the given content
+ * type. A text extractor adapter is created for that filter and added to
+ * the composite extractor for future use before delegating the request
+ * to the adapter.
+ * <p>
+ * If not even a text filter is found for the given content type, a warning
+ * is logged and an empty text extractor is created for that content type
+ * and added to the composite extractor for future use before delegating
+ * the request to the empty extractor.
+ *
+ * @param stream binary stream
+ * @param type content type
+ * @param encoding character encoding, or <code>null</code>
+ * @return reader for the text content of the binary stream
+ * @throws IOException if the binary stream can not be read
+ */
+ public Reader extractText(InputStream stream, String type, String encoding)
+ throws IOException {
+ logger.debug("extractText(stream, {}, {})", type, encoding);
+ if (!types.contains(type)) {
+ Iterator iterator = filters.iterator();
+ while (iterator.hasNext()) {
+ TextFilter filter = (TextFilter) iterator.next();
+ if (filter.canFilter(type)) {
+ types.add(type);
+ extractor.addTextExtractor(
+ new TextFilterExtractor(type, filter));
+ break;
+ }
+ }
+ }
+
+ if (!types.contains(type)) {
+ logger.warn("Full text indexing of {} is not supported", type);
+ types.add(type);
+ extractor.addTextExtractor(new EmptyTextExtractor(type));
+ }
+
+ return extractor.extractText(stream, type, encoding);
+ }
+
+}
Propchange: jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/JackrabbitTextExtractor.java
------------------------------------------------------------------------------
svn:eol-style = native
Modified: jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/NodeIndexer.java
URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/NodeIndexer.java?view=diff&rev=489000&r1=488999&r2=489000
==============================================================================
--- jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/NodeIndexer.java
(original)
+++ jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/NodeIndexer.java
Wed Dec 20 01:35:14 2006
@@ -17,13 +17,14 @@
package org.apache.jackrabbit.core.query.lucene;
import org.apache.jackrabbit.core.PropertyId;
-import org.apache.jackrabbit.core.query.TextFilter;
import org.apache.jackrabbit.core.state.ItemStateException;
import org.apache.jackrabbit.core.state.ItemStateManager;
import org.apache.jackrabbit.core.state.NoSuchItemStateException;
import org.apache.jackrabbit.core.state.NodeState;
import org.apache.jackrabbit.core.state.PropertyState;
+import org.apache.jackrabbit.core.value.BLOBFileValue;
import org.apache.jackrabbit.core.value.InternalValue;
+import org.apache.jackrabbit.extractor.TextExtractor;
import org.apache.jackrabbit.name.NoPrefixDeclaredException;
import org.apache.jackrabbit.name.Path;
import org.apache.jackrabbit.name.QName;
@@ -37,12 +38,11 @@
import javax.jcr.NamespaceException;
import javax.jcr.PropertyType;
import javax.jcr.RepositoryException;
+
+import java.io.InputStream;
import java.io.Reader;
import java.util.Calendar;
-import java.util.Collections;
import java.util.Iterator;
-import java.util.List;
-import java.util.Map;
import java.util.Set;
/**
@@ -72,9 +72,9 @@
protected final NamespaceMappings mappings;
/**
- * List of text filters in use.
+ * Content extractor.
*/
- protected final List textFilters;
+ protected final TextExtractor extractor;
/**
* Creates a new node indexer.
@@ -82,16 +82,16 @@
* @param node the node state to index.
* @param stateProvider the persistent item state manager to retrieve properties.
* @param mappings internal namespace mappings.
- * @param textFilters List of {@link org.apache.jackrabbit.core.query.TextFilter}s.
+ * @param extractor content extractor
*/
protected NodeIndexer(NodeState node,
ItemStateManager stateProvider,
NamespaceMappings mappings,
- List textFilters) {
+ TextExtractor extractor) {
this.node = node;
this.stateProvider = stateProvider;
this.mappings = mappings;
- this.textFilters = textFilters;
+ this.extractor = extractor;
}
/**
@@ -100,8 +100,7 @@
* @param node the node state to index.
* @param stateProvider the state provider to retrieve property values.
* @param mappings internal namespace mappings.
- * @param textFilters list of text filters to use for indexing binary
- * properties.
+ * @param extractor text extractor
* @return the lucene Document.
* @throws RepositoryException if an error occurs while reading property
* values from the <code>ItemStateProvider</code>.
@@ -109,9 +108,9 @@
public static Document createDocument(NodeState node,
ItemStateManager stateProvider,
NamespaceMappings mappings,
- List textFilters)
+ TextExtractor extractor)
throws RepositoryException {
- NodeIndexer indexer = new NodeIndexer(node, stateProvider, mappings, textFilters);
+ NodeIndexer indexer = new NodeIndexer(node, stateProvider, mappings, extractor);
return indexer.createDoc();
}
@@ -256,14 +255,16 @@
* Adds the binary value to the document as the named field.
* <p/>
* This implementation checks if this {@link #node} is of type nt:resource
- * and if that is the case, tries to extract text from the data atom using
- * the {@link #textFilters}.
+ * and if that is the case, tries to extract text from the binary property
+ * using the {@link #extractor}.
*
* @param doc The document to which to add the field
* @param fieldName The name of the field to add
* @param internalValue The value for the field to add to the document.
*/
- protected void addBinaryValue(Document doc, String fieldName, Object internalValue) {
+ protected void addBinaryValue(Document doc,
+ String fieldName,
+ Object internalValue) {
// 'check' if node is of type nt:resource
try {
String jcrData = mappings.getPrefix(QName.NS_JCR_URI) + ":data";
@@ -271,43 +272,53 @@
// don't know how to index
return;
}
- if (node.hasPropertyName(QName.JCR_MIMETYPE)) {
- PropertyState dataProp = (PropertyState) stateProvider.getItemState(
- new PropertyId(node.getNodeId(), QName.JCR_DATA));
- PropertyState mimeTypeProp =
- (PropertyState) stateProvider.getItemState(
- new PropertyId(node.getNodeId(), QName.JCR_MIMETYPE));
+
+ InternalValue typeValue = getValue(QName.JCR_MIMETYPE);
+ if (typeValue != null) {
+ String type = typeValue.internalValue().toString();
// jcr:encoding is not mandatory
String encoding = null;
- if (node.hasPropertyName(QName.JCR_ENCODING)) {
- PropertyState encodingProp =
- (PropertyState) stateProvider.getItemState(
- new PropertyId(node.getNodeId(), QName.JCR_ENCODING));
- encoding = encodingProp.getValues()[0].internalValue().toString();
- }
-
- String mimeType = mimeTypeProp.getValues()[0].internalValue().toString();
- Map fields = Collections.EMPTY_MAP;
- for (Iterator it = textFilters.iterator(); it.hasNext();) {
- TextFilter filter = (TextFilter) it.next();
- // use the first filter that can handle the mimeType
- if (filter.canFilter(mimeType)) {
- fields = filter.doFilter(dataProp, encoding);
- break;
- }
+ InternalValue encodingValue = getValue(QName.JCR_ENCODING);
+ if (encodingValue != null) {
+ encoding = encodingValue.internalValue().toString();
}
- for (Iterator it = fields.keySet().iterator(); it.hasNext();) {
- String field = (String) it.next();
- Reader r = (Reader) fields.get(field);
- doc.add(new Field(field, r));
- }
+ InputStream stream =
+ ((BLOBFileValue) internalValue).getStream();
+ Reader reader =
+ new TextExtractorReader(extractor, stream, type, encoding);
+ doc.add(new Field(FieldNames.FULLTEXT, reader));
}
} catch (Exception e) {
// TODO: How to recover from a transient indexing failure?
log.warn("Exception while indexing binary property: " + e.toString());
log.debug("Dump: ", e);
+ }
+ }
+
+ /**
+ * Utility method that extracts the first value of the named property
+ * of the current node. Returns <code>null</code> if the property does
+ * not exist or contains no values.
+ *
+ * @param name property name
+ * @return value of the named property, or <code>null</code>
+ * @throws ItemStateException if the property can not be accessed
+ */
+ protected InternalValue getValue(QName name) throws ItemStateException {
+ try {
+ PropertyId id = new PropertyId(node.getNodeId(), name);
+ PropertyState property =
+ (PropertyState) stateProvider.getItemState(id);
+ InternalValue[] values = property.getValues();
+ if (values.length > 0) {
+ return values[0];
+ } else {
+ return null;
+ }
+ } catch (NoSuchItemStateException e) {
+ return null;
}
}
Modified: jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/SearchIndex.java
URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/SearchIndex.java?view=diff&rev=489000&r1=488999&r2=489000
==============================================================================
--- jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/SearchIndex.java
(original)
+++ jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/SearchIndex.java
Wed Dec 20 01:35:14 2006
@@ -23,10 +23,11 @@
import org.apache.jackrabbit.core.query.AbstractQueryHandler;
import org.apache.jackrabbit.core.query.ExecutableQuery;
import org.apache.jackrabbit.core.query.QueryHandlerContext;
-import org.apache.jackrabbit.core.query.TextFilter;
import org.apache.jackrabbit.core.query.QueryHandler;
import org.apache.jackrabbit.core.state.NodeState;
import org.apache.jackrabbit.core.state.NodeStateIterator;
+import org.apache.jackrabbit.extractor.DefaultTextExtractor;
+import org.apache.jackrabbit.extractor.TextExtractor;
import org.apache.jackrabbit.name.NoPrefixDeclaredException;
import org.apache.jackrabbit.name.QName;
import org.apache.jackrabbit.name.NameFormat;
@@ -50,9 +51,7 @@
import java.io.File;
import java.util.Iterator;
import java.util.List;
-import java.util.StringTokenizer;
import java.util.ArrayList;
-import java.util.Collections;
import java.util.HashSet;
import java.util.Set;
@@ -91,11 +90,6 @@
public static final int DEFAULT_MAX_FIELD_LENGTH = 10000;
/**
- * Default text filters.
- */
- public static final String DEFAULT_TEXT_FILTERS = TextPlainTextFilter.class.getName();
-
- /**
* The actual index
*/
private MultiIndex index;
@@ -106,9 +100,17 @@
private Analyzer analyzer;
/**
- * List of {@link org.apache.jackrabbit.core.query.TextFilter} instance.
+ * List of text extractor and text filter class names. The configured
+ * classes will be instantiated and used to extract text content from
+ * binary properties.
+ */
+ private String textFilterClasses =
+ DefaultTextExtractor.class.getName();
+
+ /**
+ * Text extractor for extracting text content of binary properties.
*/
- private List textFilters;
+ private TextExtractor extractor;
/**
* The location of the search index.
@@ -199,7 +201,6 @@
*/
public SearchIndex() {
this.analyzer = new StandardAnalyzer(new String[]{});
- setTextFilterClasses(DEFAULT_TEXT_FILTERS);
}
/**
@@ -242,6 +243,8 @@
}
}
+ extractor = new JackrabbitTextExtractor(textFilterClasses);
+
index = new MultiIndex(indexDir, this, context.getItemStateManager(),
context.getRootId(), excludedIDs, nsMappings);
if (index.getRedoLogApplied() || forceConsistencyCheck) {
@@ -413,13 +416,12 @@
}
/**
- * Returns an unmodifiable list of {@link TextFilter} configured for
- * this search index.
+ * Returns the text extractor in use for indexing.
*
- * @return unmodifiable list of text filters.
+ * @return the text extractor in use for indexing.
*/
- protected List getTextFilters() {
- return textFilters;
+ public TextExtractor getTextExtractor() {
+ return extractor;
}
/**
@@ -473,7 +475,7 @@
protected Document createDocument(NodeState node, NamespaceMappings nsMappings)
throws RepositoryException {
return NodeIndexer.createDocument(node, getContext().getItemStateManager(),
- nsMappings, textFilters);
+ nsMappings, extractor);
}
/**
@@ -753,34 +755,17 @@
}
/**
- * Sets a new set of text filter classes that are in use for indexing
- * binary properties. The <code>filterClasses</code> must be a comma
- * separated <code>String</code> of fully qualified class names implementing
- * {@link org.apache.jackrabbit.core.query.TextFilter}. Each class must
- * provide a default constructor.
- * </p>
- * Filter class names that cannot be resolved are skipped and a warn message
- * is logged.
+ * Sets the list of text extractors (and text filters) to use for
+ * extracting text content from binary properties. The list must be
+ * comma (or whitespace) separated, and contain fully qualified class
+ * names of the {@link TextExtractor} (and {@link org.apache.jackrabbit.core.query.TextFilter}) classes
+ * to be used. The configured classes must all have a public default
+ * constructor.
*
- * @param filterClasses comma separated list of filter class names
+ * @param filterClasses comma separated list of class names
*/
public void setTextFilterClasses(String filterClasses) {
- List filters = new ArrayList();
- StringTokenizer tokenizer = new StringTokenizer(filterClasses, ", \t\n\r\f");
- while (tokenizer.hasMoreTokens()) {
- String className = tokenizer.nextToken();
- try {
- Class filterClass = Class.forName(className);
- TextFilter filter = (TextFilter) filterClass.newInstance();
- filters.add(filter);
- } catch (Exception e) {
- log.warn("Invalid TextFilter class: " + className, e);
- } catch (LinkageError e) {
- log.warn("Missing dependency for text filter: " + className);
- log.warn(e.toString());
- }
- }
- textFilters = Collections.unmodifiableList(filters);
+ this.textFilterClasses = filterClasses;
}
/**
@@ -790,14 +775,7 @@
* @return class names of the text filters in use.
*/
public String getTextFilterClasses() {
- StringBuffer names = new StringBuffer();
- String delim = "";
- for (Iterator it = textFilters.iterator(); it.hasNext();) {
- names.append(delim);
- names.append(it.next().getClass().getName());
- delim = ",";
- }
- return names.toString();
+ return textFilterClasses;
}
/**
Added: jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/TextExtractorFilter.java
URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/TextExtractorFilter.java?view=auto&rev=489000
==============================================================================
--- jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/TextExtractorFilter.java
(added)
+++ jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/TextExtractorFilter.java
Wed Dec 20 01:35:14 2006
@@ -0,0 +1,120 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.jackrabbit.core.query.lucene;
+
+import java.io.IOException;
+import java.io.Reader;
+import java.util.HashMap;
+import java.util.Map;
+
+import javax.jcr.RepositoryException;
+
+import org.apache.jackrabbit.core.query.TextFilter;
+import org.apache.jackrabbit.core.state.PropertyState;
+import org.apache.jackrabbit.core.value.BLOBFileValue;
+import org.apache.jackrabbit.core.value.InternalValue;
+import org.apache.jackrabbit.extractor.TextExtractor;
+
+/**
+ * Utility base class for migrating functionality from existing implementations
+ * of the deprecated {@link TextFilter} interface to the new
+ * {@link TextExtractor} interface. Once the functionality of an existing
+ * TextFilter has been copied to a new TextExtractor, the original class can
+ * be replaced with the following template to keep backwards compatibility
+ * while avoiding the burden of maintaining duplicate code:
+ * <pre>
+ * <b>public class</b> SomeTextFilter <b>extends</b> TextExtractorFilter {
+ * <b>public</b> SomeTextFilter() {
+ * <b>super</b>(<b>new</b> SomeTextExtractor());
+ * }
+ * }
+ * </pre>
+ */
+public class TextExtractorFilter implements TextFilter {
+
+ /**
+ * The adapted text extractor.
+ */
+ private final TextExtractor extractor;
+
+ /**
+ * Creates a text filter adapter for the given text extractor.
+ *
+ * @param extractor adapted text extractor
+ */
+ public TextExtractorFilter(TextExtractor extractor) {
+ this.extractor = extractor;
+ }
+
+ /**
+ * Returns true if the adapted text extractor supports the given
+ * content type.
+ *
+ * @param mimeType content type
+ * @return <code>true</code> if the content type is supported,
+ * <code>false</code> otherwise
+ */
+ public boolean canFilter(String mimeType) {
+ mimeType = mimeType.toLowerCase();
+ String[] types = extractor.getContentTypes();
+ for (int i = 0; i < types.length; i++) {
+ if (types[i].equals(mimeType)) {
+ return true;
+ }
+ }
+ return false;
+ }
+
+ /**
+ * Extracts text content of the given binary property using the adapted
+ * text extractor.
+ *
+ * @param data binary property
+ * @param encoding character encoding, or <code>null</code>
+ * @return map that contains a reader for the extracted text as
+ * the {@link FieldNames#FULLTEXT} entry
+ * @throws RepositoryException if the binary property can not be read
+ */
+ public Map doFilter(PropertyState data, String encoding)
+ throws RepositoryException {
+ InternalValue[] values = data.getValues();
+ if (values.length == 1) {
+ try {
+ String type = "application/octet-stream";
+ String[] types = extractor.getContentTypes();
+ if (types.length > 0) {
+ type = types[0];
+ }
+
+ BLOBFileValue blob = (BLOBFileValue) values[0].internalValue();
+ Reader reader =
+ extractor.extractText(blob.getStream(), type, encoding);
+
+ Map result = new HashMap();
+ result.put(FieldNames.FULLTEXT, reader);
+ return result;
+ } catch (IOException e) {
+ throw new RepositoryException("Text extraction error", e);
+ }
+ } else {
+ // multi value not supported
+ throw new RepositoryException(
+ "Multi-valued binary properties not supported.");
+ }
+ }
+
+}
Propchange: jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/TextExtractorFilter.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/TextExtractorReader.java
URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/TextExtractorReader.java?view=auto&rev=489000
==============================================================================
--- jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/TextExtractorReader.java
(added)
+++ jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/TextExtractorReader.java
Wed Dec 20 01:35:14 2006
@@ -0,0 +1,114 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.jackrabbit.core.query.lucene;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.Reader;
+
+import org.apache.jackrabbit.extractor.TextExtractor;
+
+/**
+ * Reader that extracts the text content of a binary stream for reading
+ * only when the first character is requested. This class is used by the
+ * {@link NodeIndexer} class to postpone text extraction to when the
+ * content is actually needed.
+ *
+ * @see <a href="http://issues.apache.org/jira/browse/JCR-264">JCR-264</a>
+ */
+public class TextExtractorReader extends Reader {
+
+ /**
+ * Text extractor to use in extracting text content from the binary stream.
+ */
+ private final TextExtractor extractor;
+
+ /**
+ * Binary stream from which to extract the content for reading.
+ */
+ private final InputStream stream;
+
+ /**
+ * Content type of the binary stream.
+ */
+ private final String type;
+
+ /**
+ * Character encoding of the binary stream, or <code>null</code>.
+ */
+ private final String encoding;
+
+ /**
+ * Reader for the extracted text content. Set to <code>null</code> until
+ * the first character request triggers the text extraction.
+ */
+ private Reader reader;
+
+ /**
+ * Creates a reader that extracts the text content from the given binary
+ * stream.
+ *
+ * @param extractor text extractor
+ * @param stream binary stream
+ * @param type content type
+ * @param encoding character encoding, or <code>null</code>
+ */
+ public TextExtractorReader(
+ TextExtractor extractor, InputStream stream,
+ String type, String encoding) {
+ this.extractor = extractor;
+ this.stream = stream;
+ this.type = type;
+ this.encoding = encoding;
+ this.reader = null;
+ }
+
+ //--------------------------------------------------------------< Reader >
+
+ /**
+ * Reads up to the given number of characters to the given buffer position
+ * from the extracted text content reader. Uses the text extractor to
+ * create the text content reader when first invoked.
+ *
+ * @param buffer buffer to place characters in
+ * @param offset buffer offset
+ * @param length maximum number of characters to read
+ * @return number of read characters
+ * @throws IOException if text extraction fails
+ */
+ public int read(char[] buffer, int offset, int length) throws IOException {
+ if (reader == null) {
+ reader = extractor.extractText(stream, type, encoding);
+ }
+ return reader.read(buffer, offset, length);
+ }
+
+ /**
+ * Closes the reader of the extracted text, or the binary stream if the
+ * text content was never extracted.
+ *
+ * @throws IOException if the reader or stream can not be closed
+ */
+ public void close() throws IOException {
+ if (reader != null) {
+ reader.close();
+ } else {
+ stream.close();
+ }
+ }
+
+}
Propchange: jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/TextExtractorReader.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/TextFilterExtractor.java
URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/TextFilterExtractor.java?view=auto&rev=489000
==============================================================================
--- jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/TextFilterExtractor.java
(added)
+++ jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/TextFilterExtractor.java
Wed Dec 20 01:35:14 2006
@@ -0,0 +1,124 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.jackrabbit.core.query.lucene;
+
+import java.io.FilterReader;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.Reader;
+import java.io.StringReader;
+import java.util.Map;
+
+import javax.jcr.RepositoryException;
+
+import org.apache.jackrabbit.core.PropertyId;
+import org.apache.jackrabbit.core.query.TextFilter;
+import org.apache.jackrabbit.core.state.ItemState;
+import org.apache.jackrabbit.core.state.PropertyState;
+import org.apache.jackrabbit.core.value.BLOBFileValue;
+import org.apache.jackrabbit.core.value.InternalValue;
+import org.apache.jackrabbit.extractor.TextExtractor;
+
+/**
+ * Adapter class for achieving backwards compatibility with classes
+ * implementing the deprecated {@link TextFilter} interface. This class
+ * implements the {@link TextExtractor} interface through calls to an
+ * underlying {@link TextFilter} instance.
+ */
+public class TextFilterExtractor implements TextExtractor {
+
+ /**
+ * Supported content types.
+ */
+ private final String[] types;
+
+ /**
+ * The adapted text filter.
+ */
+ private final TextFilter filter;
+
+ /**
+ * Creates a text extractor adapter that supports the given content
+ * types using the given text filter.
+ *
+ * @param types supported content types
+ * @param filter text filter to be adapted
+ */
+ public TextFilterExtractor(String[] types, TextFilter filter) {
+ this.types = types; // the array is stored as given, not copied
+ this.filter = filter;
+ }
+
+ /**
+ * Creates a text extractor adapter that supports the given content
+ * type using the given text filter.
+ *
+ * @param type supported content type
+ * @param filter text filter to be adapted
+ */
+ public TextFilterExtractor(String type, TextFilter filter) {
+ this(new String[] { type }, filter); // single type wrapped in a one-element array
+ }
+
+ /**
+ * Returns the supported content types.
+ *
+ * @return supported content types
+ */
+ public String[] getContentTypes() {
+ return types; // the stored array itself, not a copy
+ }
+
+ /**
+ * Extracts the text content of the given binary stream by calling the
+ * underlying {@link TextFilter} instance. A dummy {@link PropertyState}
+ * instance is created to comply with the
+ * {@link TextFilter#doFilter(PropertyState, String)} method signature.
+ *
+ * @param stream binary stream
+ * @param type content type (not used by this adapter)
+ * @param encoding character encoding, or <code>null</code>
+ * @return reader for the extracted text content (empty if none is available)
+ * @throws IOException if the adapted call fails
+ */
+ public Reader extractText(InputStream stream, String type, String encoding)
+ throws IOException {
+ final InternalValue value = InternalValue.create(stream); // wrapped so it can be handed to the filter via a PropertyState
+ try {
+ PropertyState state = new PropertyState(
+ (PropertyId) null, ItemState.STATUS_EXISTING, true); // dummy state: there is no real property id
+ state.setValues(new InternalValue[] { value });
+ Map fields = filter.doFilter(state, encoding); // only the encoding is forwarded; doFilter has no type parameter
+ Object fulltext = fields.get(FieldNames.FULLTEXT);
+ if (fulltext instanceof Reader) {
+ return new FilterReader((Reader) fulltext) {
+ public void close() throws IOException {
+ super.close();
+ ((BLOBFileValue) value.internalValue()).discard(); // value is discarded only once the caller closes the returned reader
+ }
+ };
+ } else { // no Reader under FULLTEXT (missing or of another type)
+ ((BLOBFileValue) value.internalValue()).discard();
+ return new StringReader("");
+ }
+ } catch (RepositoryException e) { // filter failure is not propagated: discard the value and return empty text
+ ((BLOBFileValue) value.internalValue()).discard();
+ return new StringReader(""); // NOTE(review): unchecked exceptions bypass discard() -- confirm whether that matters
+ }
+ }
+
+}
Propchange: jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/TextFilterExtractor.java
------------------------------------------------------------------------------
svn:eol-style = native
Modified: jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/TextPlainTextFilter.java
URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/TextPlainTextFilter.java?view=diff&rev=489000&r1=488999&r2=489000
==============================================================================
--- jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/TextPlainTextFilter.java
(original)
+++ jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/TextPlainTextFilter.java
Wed Dec 20 01:35:14 2006
@@ -16,65 +16,21 @@
*/
package org.apache.jackrabbit.core.query.lucene;
-import org.apache.jackrabbit.core.query.TextFilter;
-import org.apache.jackrabbit.core.state.PropertyState;
-import org.apache.jackrabbit.core.value.BLOBFileValue;
-import org.apache.jackrabbit.core.value.InternalValue;
-
-import javax.jcr.RepositoryException;
-import java.io.InputStreamReader;
-import java.io.Reader;
-import java.io.UnsupportedEncodingException;
-import java.util.HashMap;
-import java.util.Map;
+import org.apache.jackrabbit.extractor.PlainTextExtractor;
/**
- * Implements a {@link org.apache.jackrabbit.core.query.TextFilter} that handles binary properties
of mime-type
- * text/plain.
+ * Text filter for <code>text/plain</code> content.
+ *
+ * @deprecated use {@link PlainTextExtractor}, this class is kept for
+ * backwards compatibility with existing configuration files
*/
-public class TextPlainTextFilter implements TextFilter {
+public class TextPlainTextFilter extends TextExtractorFilter {
/**
- * Returns <code>true</code> for <code>text/plain</code>; <code>false</code>
- * in all other cases.
- * @param mimeType the mime-type.
- * @return <code>true</code> for <code>text/plain</code>; <code>false</code>
- * in all other cases.
+ * Creates a text filter for <code>text/plain</code> content.
*/
- public boolean canFilter(String mimeType) {
- return "text/plain".equalsIgnoreCase(mimeType);
+ public TextPlainTextFilter() {
+ super(new PlainTextExtractor());
}
- /**
- * Returns a map with a single entry for field {@link FieldNames#FULLTEXT}.
- * @param data the data property.
- * @param encoding the encoding
- * @return a map with a single Reader value for field
- * {@link FieldNames#FULLTEXT}.
- * @throws RepositoryException if encoding is not supported or data is a
- * multi-value property.
- */
- public Map doFilter(PropertyState data, String encoding) throws RepositoryException {
- InternalValue[] values = data.getValues();
- if (values.length == 1) {
- BLOBFileValue blob = (BLOBFileValue) values[0].internalValue();
- try {
- Reader reader;
- if (encoding == null) {
- // use platform default
- reader = new InputStreamReader(blob.getStream());
- } else {
- reader = new InputStreamReader(blob.getStream(), encoding);
- }
- Map result = new HashMap();
- result.put(FieldNames.FULLTEXT, reader);
- return result;
- } catch (UnsupportedEncodingException e) {
- throw new RepositoryException(e);
- }
- } else {
- // multi value not supported
- throw new RepositoryException("Multi-valued binary properties not supported.");
- }
- }
}
|