jackrabbit-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From mreut...@apache.org
Subject svn commit: r160905 - in incubator/jackrabbit/trunk: ./ src/java/META-INF/ src/java/META-INF/services/ src/java/org/apache/jackrabbit/core/search/ src/java/org/apache/jackrabbit/core/search/lucene/
Date Mon, 11 Apr 2005 15:30:33 GMT
Author: mreutegg
Date: Mon Apr 11 08:30:31 2005
New Revision: 160905

URL: http://svn.apache.org/viewcvs?view=rev&rev=160905
Log:
Implement indexing of jcr:data property of nt:resource nodes.

Added:
    incubator/jackrabbit/trunk/src/java/META-INF/
    incubator/jackrabbit/trunk/src/java/META-INF/services/
    incubator/jackrabbit/trunk/src/java/META-INF/services/org.apache.jackrabbit.core.search.TextFilterService
    incubator/jackrabbit/trunk/src/java/org/apache/jackrabbit/core/search/TextFilter.java
  (with props)
    incubator/jackrabbit/trunk/src/java/org/apache/jackrabbit/core/search/TextFilterService.java
  (with props)
    incubator/jackrabbit/trunk/src/java/org/apache/jackrabbit/core/search/TextPlainTextFilter.java
  (with props)
Modified:
    incubator/jackrabbit/trunk/project.xml
    incubator/jackrabbit/trunk/src/java/org/apache/jackrabbit/core/search/lucene/NodeIndexer.java

Modified: incubator/jackrabbit/trunk/project.xml
URL: http://svn.apache.org/viewcvs/incubator/jackrabbit/trunk/project.xml?view=diff&r1=160904&r2=160905
==============================================================================
--- incubator/jackrabbit/trunk/project.xml (original)
+++ incubator/jackrabbit/trunk/project.xml Mon Apr 11 08:30:31 2005
@@ -366,6 +366,7 @@
        <includes>
          <include>**/*.xml</include>
          <include>**/*.properties</include>
+         <include>**/*.TextFilterService</include>
        </includes>
      </resource>
    </resources>

Added: incubator/jackrabbit/trunk/src/java/META-INF/services/org.apache.jackrabbit.core.search.TextFilterService
URL: http://svn.apache.org/viewcvs/incubator/jackrabbit/trunk/src/java/META-INF/services/org.apache.jackrabbit.core.search.TextFilterService?view=auto&rev=160905
==============================================================================
--- incubator/jackrabbit/trunk/src/java/META-INF/services/org.apache.jackrabbit.core.search.TextFilterService
(added)
+++ incubator/jackrabbit/trunk/src/java/META-INF/services/org.apache.jackrabbit.core.search.TextFilterService
Mon Apr 11 08:30:31 2005
@@ -0,0 +1,21 @@
+# Copyright 2004-2005 The Apache Software Foundation or its licensors,
+#                     as applicable.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+#
+# This file lists all available TextFilter implementations that are shipped
+# with Jackrabbit.
+#
+
+org.apache.jackrabbit.core.search.TextPlainTextFilter

Added: incubator/jackrabbit/trunk/src/java/org/apache/jackrabbit/core/search/TextFilter.java
URL: http://svn.apache.org/viewcvs/incubator/jackrabbit/trunk/src/java/org/apache/jackrabbit/core/search/TextFilter.java?view=auto&rev=160905
==============================================================================
--- incubator/jackrabbit/trunk/src/java/org/apache/jackrabbit/core/search/TextFilter.java
(added)
+++ incubator/jackrabbit/trunk/src/java/org/apache/jackrabbit/core/search/TextFilter.java
Mon Apr 11 08:30:31 2005
@@ -0,0 +1,59 @@
+/*
+ * Copyright 2004-2005 The Apache Software Foundation or its licensors,
+ *                     as applicable.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.jackrabbit.core.search;
+
+import org.apache.jackrabbit.core.state.PropertyState;
+
+import javax.jcr.RepositoryException;
+import java.util.Map;
+
+/**
+ * Defines an interface for extracting text out of binary properties according
+ * to their mime-type.
+ *
+ * @see TextFilterService
+ */
+public interface TextFilter {
+
+    /**
+     * Returns <code>true</code> if this <code>TextFilter</code> can index
+     * content of <code>mimeType</code>; <code>false</code> otherwise.
+     *
+     * @param mimeType the mime type of the content to index.
+     * @return whether this <code>TextFilter</code> can index content of
+     *         <code>mimeType</code>.
+     */
+    public boolean canFilter(String mimeType);
+
+    /**
+     * Creates a text representation of a binary property <code>data</code>.
+     * The returned map contains {@link java.io.Reader} values. Keys to the
+     * reader values are <code>String</code>s that serve as field names.
+     * <p/>
+     * E.g. a TextFilter for a html document may extract multiple fields: one
+     * for the title and one for the whole content.
+     *
+     * @param data     the data property that contains the binary content.
+     * @param encoding the encoding of the content or <code>null</code> if
+     *                 <code>data</code> does not use encoding.
+     * @return the extracted text.
+     * @throws RepositoryException if an error occurs while reading from the
+     *                             node or if the data is malformed.
+     */
+    public Map doFilter(PropertyState data, String encoding)
+            throws RepositoryException;
+}

Propchange: incubator/jackrabbit/trunk/src/java/org/apache/jackrabbit/core/search/TextFilter.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: incubator/jackrabbit/trunk/src/java/org/apache/jackrabbit/core/search/TextFilterService.java
URL: http://svn.apache.org/viewcvs/incubator/jackrabbit/trunk/src/java/org/apache/jackrabbit/core/search/TextFilterService.java?view=auto&rev=160905
==============================================================================
--- incubator/jackrabbit/trunk/src/java/org/apache/jackrabbit/core/search/TextFilterService.java
(added)
+++ incubator/jackrabbit/trunk/src/java/org/apache/jackrabbit/core/search/TextFilterService.java
Mon Apr 11 08:30:31 2005
@@ -0,0 +1,118 @@
+/*
+ * Copyright 2004-2005 The Apache Software Foundation or its licensors,
+ *                     as applicable.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.jackrabbit.core.search;
+
+import org.apache.log4j.Logger;
+import org.apache.jackrabbit.core.state.PropertyState;
+
+import javax.jcr.RepositoryException;
+import java.util.Map;
+import java.util.Iterator;
+import java.util.Collections;
+import java.util.List;
+import java.util.ArrayList;
+
+import sun.misc.Service;
+
+/**
+ * Implements a service that looks up {@link TextFilter} implementations that
+ * are registered in a jar file as providers for
+ * <code>META-INF/services/org.apache.jackrabbit.core.search.TextFilterService</code>.
+ * E.g. the jackrabbit jar file contains entries for some {@link TextFilter}
+ * implementations such as {@link TextPlainTextFilter}. Custom
+ * {@link TextFilter} implementations may be added to Jackrabbit by packaging
+ * them into a jar file together with a
+ * <code>META-INF/services/org.apache.jackrabbit.core.search.TextFilterService</code>
+ * file that contains the names of the custom {@link TextFilter} classes. Those
+ * filters are then automatically loaded by Jackrabbit on startup.
+ * <p/>
+ * See also: <a href="http://java.sun.com/products/jdk/1.3/docs/guide/jar/jar.html">
+ * JAR File Specification</a>
+ * <p/>
+ * {@link TextFilter} implementations are asked if they can handle a certain
+ * mime type ({@link TextFilter#canFilter(String)}) and if one of them returns
+ * <code>true</code> the text representation is created with {@link
+ * TextFilter#doFilter(PropertyState, String)}.
+ */
+public class TextFilterService {
+
+    /**
+     * Logger instance for this class.
+     */
+    private static final Logger log = Logger.getLogger(TextFilterService.class);
+
+    /**
+     * List of all {@link TextFilter}s known to the system.
+     */
+    private static final List filters = new ArrayList();
+
+    /**
+     * Initializes the {@link #filters} list.
+     */
+    static {
+        Iterator it = Service.providers(TextFilterService.class);
+        while (it.hasNext()) {
+            filters.add(it.next());
+        }
+    }
+
+    /**
+     * Extracts text from a binary property which claims to be of a certain
+     * mime-type. This method eventually calls
+     * {@link TextFilter#doFilter(PropertyState, String)}.
+     *
+     * @param data     the binary data
+     * @param mimeType the mime type
+     * @return the extracted content
+     * @throws RepositoryException if an error occurs while creating the index
+     *                             layout. This includes the case where
+     *                             <code>data</code> is not according to
+     *                             <code>mimeType</code>.
+     */
+    public static Map extractText(PropertyState data,
+                                  String mimeType,
+                                  String encoding) throws RepositoryException {
+        TextFilter filter = getFilter(mimeType);
+        if (filter != null) {
+            return filter.doFilter(data, encoding);
+        } else {
+            return Collections.EMPTY_MAP;
+        }
+    }
+
+    /**
+     * Looks up the {@link TextFilter} that can extract text from binary content
+     * with a certain <code>mimeType</code>.
+     *
+     * @param mimeType the mime type of the content to filter.
+     * @return the {@link TextFilter} indexer instance or <code>null</code> if
+     *         there is no indexer that can handle content of
+     *         <code>mimeType</code>.
+     */
+    private static TextFilter getFilter(String mimeType) {
+        log.debug("Find TextFilter for mime-type: " + mimeType);
+        for (Iterator it = filters.iterator(); it.hasNext();) {
+            TextFilter filter = (TextFilter) it.next();
+            if (filter.canFilter(mimeType)) {
+                log.debug("Found TextFilter implementation: " + filter.getClass().getName());
+                return filter;
+            }
+        }
+        log.debug("No TextFilter found");
+        return null;
+    }
+}

Propchange: incubator/jackrabbit/trunk/src/java/org/apache/jackrabbit/core/search/TextFilterService.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: incubator/jackrabbit/trunk/src/java/org/apache/jackrabbit/core/search/TextPlainTextFilter.java
URL: http://svn.apache.org/viewcvs/incubator/jackrabbit/trunk/src/java/org/apache/jackrabbit/core/search/TextPlainTextFilter.java?view=auto&rev=160905
==============================================================================
--- incubator/jackrabbit/trunk/src/java/org/apache/jackrabbit/core/search/TextPlainTextFilter.java
(added)
+++ incubator/jackrabbit/trunk/src/java/org/apache/jackrabbit/core/search/TextPlainTextFilter.java
Mon Apr 11 08:30:31 2005
@@ -0,0 +1,80 @@
+/*
+ * Copyright 2004-2005 The Apache Software Foundation or its licensors,
+ *                     as applicable.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.jackrabbit.core.search;
+
+import org.apache.jackrabbit.core.state.PropertyState;
+import org.apache.jackrabbit.core.InternalValue;
+import org.apache.jackrabbit.core.BLOBFileValue;
+import org.apache.jackrabbit.core.search.lucene.FieldNames;
+
+import javax.jcr.RepositoryException;
+import java.util.Map;
+import java.util.HashMap;
+import java.io.InputStreamReader;
+import java.io.UnsupportedEncodingException;
+import java.io.Reader;
+
+/**
+ * Implements a {@link TextFilter} that handles binary properties of mime-type
+ * text/plain.
+ */
+public class TextPlainTextFilter implements TextFilter {
+
+    /**
+     * Returns <code>true</code> for <code>text/plain</code>; <code>false</code>
+     * in all other cases.
+     * @param mimeType the mime-type.
+     * @return <code>true</code> for <code>text/plain</code>; <code>false</code>
+     * in all other cases.
+     */
+    public boolean canFilter(String mimeType) {
+        return "text/plain".equalsIgnoreCase(mimeType);
+    }
+
+    /**
+     * Returns a map with a single entry for field {@link FieldNames#FULLTEXT}.
+     * @param data the data property.
+     * @param encoding the encoding
+     * @return a map with a single Reader value for field
+     *  {@link FieldNames#FULLTEXT}.
+     * @throws RepositoryException if encoding is not supported or data is a
+     *  multi-value property.
+     */
+    public Map doFilter(PropertyState data, String encoding) throws RepositoryException {
+        InternalValue[] values = data.getValues();
+        if (values.length > 0) {
+            BLOBFileValue blob = (BLOBFileValue) values[0].internalValue();
+            try {
+                Reader reader;
+                if (encoding == null) {
+                    // use platform default
+                    reader = new InputStreamReader(blob.getStream());
+                } else {
+                    reader = new InputStreamReader(blob.getStream(), encoding);
+                }
+                Map result = new HashMap();
+                result.put(FieldNames.FULLTEXT, reader);
+                return result;
+            } catch (UnsupportedEncodingException e) {
+                throw new RepositoryException(e);
+            }
+        } else {
+            // multi value not supported
+            throw new RepositoryException("Multi-valued binary properties not supported.");
+        }
+    }
+}

Propchange: incubator/jackrabbit/trunk/src/java/org/apache/jackrabbit/core/search/TextPlainTextFilter.java
------------------------------------------------------------------------------
    svn:eol-style = native

Modified: incubator/jackrabbit/trunk/src/java/org/apache/jackrabbit/core/search/lucene/NodeIndexer.java
URL: http://svn.apache.org/viewcvs/incubator/jackrabbit/trunk/src/java/org/apache/jackrabbit/core/search/lucene/NodeIndexer.java?view=diff&r1=160904&r2=160905
==============================================================================
--- incubator/jackrabbit/trunk/src/java/org/apache/jackrabbit/core/search/lucene/NodeIndexer.java
(original)
+++ incubator/jackrabbit/trunk/src/java/org/apache/jackrabbit/core/search/lucene/NodeIndexer.java
Mon Apr 11 08:30:31 2005
@@ -29,6 +29,9 @@
 import org.apache.jackrabbit.core.InternalValue;
 import org.apache.jackrabbit.core.QName;
 import org.apache.jackrabbit.core.Path;
+import org.apache.jackrabbit.core.Constants;
+import org.apache.jackrabbit.core.search.TextFilterService;
+import org.apache.log4j.Logger;
 
 import javax.jcr.NamespaceException;
 import javax.jcr.PropertyType;
@@ -36,18 +39,39 @@
 import java.util.Calendar;
 import java.util.Iterator;
 import java.util.List;
+import java.util.Map;
+import java.io.Reader;
 
 /**
  * Creates a lucene <code>Document</code> object from a {@link javax.jcr.Node}.
- *
- * todo add support for indexing of nt:resource. e.g. when mime type is text/*
  */
 public class NodeIndexer {
 
     /**
+     * The logger instance for this class.
+     */
+    private static final Logger log = Logger.getLogger(NodeIndexer.class);
+
+    /**
+     * QName for jcr:encoding
+     */
+    private static final QName JCR_ENCODING = new QName(Constants.NS_JCR_URI, "encoding");
+
+    /**
+     * QName for jcr:mimeType
+     */
+    private static final QName JCR_MIMETYPE = new QName(Constants.NS_JCR_URI, "mimeType");
+
+    /**
+     * QName for jcr:data
+     */
+    private static final QName JCR_DATA = new QName(Constants.NS_JCR_URI, "data");
+
+    /**
      * The <code>NodeState</code> of the node to index
      */
     protected final NodeState node;
+
     /**
      * The persistent item state provider
      */
@@ -222,15 +246,45 @@
     /**
      * Adds the binary value to the document as the named field.
      * <p>
-     * This implementation does nothing as binary indexing is not implemented
-     * here.
+     * This implementation checks if this {@link #node} is of type nt:resource
+     * and if that is the case, tries to extract text from the data atom using
+     * {@link TextFilterService} and add a {@link FieldNames#FULLTEXT} field
+     * to the document.
      * 
      * @param doc The document to which to add the field
      * @param fieldName The name of the field to add
      * @param internalValue The value for the field to add to the document.
      */
     protected void addBinaryValue(Document doc, String fieldName, Object internalValue) {
-        // don't know how to index -> ignore
+        // 'check' if node is of type nt:resource
+        try {
+            String jcrData = mappings.getPrefix(Constants.NS_JCR_URI) + ":data";
+            if (!jcrData.equals(fieldName)) {
+                // don't know how to index
+                return;
+            }
+            if (node.hasPropertyEntry(JCR_ENCODING)
+                    && node.hasPropertyEntry(JCR_MIMETYPE)) {
+                PropertyState dataProp = (PropertyState) stateProvider.getItemState(new PropertyId(node.getUUID(),
JCR_DATA));
+                PropertyState mimeTypeProp = (PropertyState) stateProvider.getItemState(new
PropertyId(node.getUUID(), JCR_MIMETYPE));
+                PropertyState encodingProp = (PropertyState) stateProvider.getItemState(new
PropertyId(node.getUUID(), JCR_ENCODING));
+
+                Map fields = TextFilterService.extractText(dataProp,
+                        mimeTypeProp.getValues()[0].internalValue().toString(),
+                        encodingProp.getValues()[0].internalValue().toString());
+                for (Iterator it = fields.keySet().iterator(); it.hasNext();) {
+                    String field = (String) it.next();
+                    Reader r = (Reader) fields.get(field);
+                    doc.add(Field.Text(field, r));
+                }
+            }
+        } catch (ItemStateException e) {
+            log.warn("Exception while indexing binary property: " + e.toString());
+            log.debug("Dump: ", e);
+        } catch (RepositoryException e) {
+            log.warn("Exception while indexing binary property: " + e.toString());
+            log.debug("Dump: ", e);
+        }
     }
     
     /**



Mime
View raw message