jackrabbit-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From mreut...@apache.org
Subject svn commit: r537791 - in /jackrabbit/trunk/jackrabbit-core/src: main/java/org/apache/jackrabbit/core/query/lucene/ test/java/org/apache/jackrabbit/core/query/
Date Mon, 14 May 2007 11:52:15 GMT
Author: mreutegg
Date: Mon May 14 04:52:14 2007
New Revision: 537791

URL: http://svn.apache.org/viewvc?view=rev&rev=537791
Log:
JCR-920: rep:excerpt() should also work on properties

Added:
    jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/HighlightingExcerptProvider.java
  (with props)
Modified:
    jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/AbstractExcerpt.java
    jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/DefaultHighlighter.java
    jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/DefaultXMLExcerpt.java
    jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/RowIteratorImpl.java
    jackrabbit/trunk/jackrabbit-core/src/test/java/org/apache/jackrabbit/core/query/TextExtractorTest.java

Modified: jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/AbstractExcerpt.java
URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/AbstractExcerpt.java?view=diff&rev=537791&r1=537790&r2=537791
==============================================================================
--- jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/AbstractExcerpt.java (original)
+++ jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/AbstractExcerpt.java Mon May 14 04:52:14 2007
@@ -24,20 +24,28 @@
 import org.apache.lucene.index.TermDocs;
 import org.apache.lucene.index.TermFreqVector;
 import org.apache.lucene.index.TermPositionVector;
+import org.apache.lucene.index.TermVectorOffsetInfo;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Token;
 import org.apache.jackrabbit.core.NodeId;
 
 import java.io.IOException;
+import java.io.StringReader;
+import java.io.Reader;
 import java.util.Set;
 import java.util.HashSet;
 import java.util.Iterator;
+import java.util.TreeMap;
+import java.util.SortedMap;
+import java.util.Arrays;
 
 /**
  * <code>AbstractExcerpt</code> implements base functionality for an excerpt
  * provider.
  */
-public abstract class AbstractExcerpt implements ExcerptProvider {
+public abstract class AbstractExcerpt implements HighlightingExcerptProvider {
 
     /**
      * Logger instance for this class.
@@ -144,6 +152,14 @@
     }
 
     /**
+     * {@inheritDoc}
+     */
+    public String highlight(String text) throws IOException {
+        return createExcerpt(createTermPositionVector(text),
+                text, 1, (text.length() + 1) * 2);
+    }
+
+    /**
      * Creates an excerpt for the given <code>text</code> using token offset
      * information provided by <code>tpv</code>.
      *
@@ -180,5 +196,86 @@
             }
         }
         return relevantTerms;
+    }
+
+    /**
+     * @param text the text.
+     * @return a <code>TermPositionVector</code> for the given text.
+     */
+    private TermPositionVector createTermPositionVector(String text) {
+        // term -> TermVectorOffsetInfo[]
+        final SortedMap termMap = new TreeMap();
+        Reader r = new StringReader(text);
+        TokenStream ts = index.getTextAnalyzer().tokenStream("", r);
+        Token t;
+        try {
+            while ((t = ts.next()) != null) {
+                TermVectorOffsetInfo[] info =
+                        (TermVectorOffsetInfo[]) termMap.get(t.termText());
+                if (info == null) {
+                    info = new TermVectorOffsetInfo[1];
+                } else {
+                    TermVectorOffsetInfo[] tmp = info;
+                    info = new TermVectorOffsetInfo[tmp.length + 1];
+                    System.arraycopy(tmp, 0, info, 0, tmp.length);
+                }
+                info[info.length - 1] = new TermVectorOffsetInfo(
+                        t.startOffset(), t.endOffset());
+                termMap.put(t.termText(), info);
+            }
+        } catch (IOException e) {
+            // should never happen, we are reading from a string
+        }
+
+        return new TermPositionVector() {
+
+            private String[] terms =
+                    (String[]) termMap.keySet().toArray(new String[termMap.size()]);
+
+            public int[] getTermPositions(int index) {
+                return null;
+            }
+
+            public TermVectorOffsetInfo[] getOffsets(int index) {
+                TermVectorOffsetInfo[] info = TermVectorOffsetInfo.EMPTY_OFFSET_INFO;
+                if (index >= 0 && index < terms.length) {
+                    info = (TermVectorOffsetInfo[]) termMap.get(terms[index]);
+                }
+                return info;
+            }
+
+            public String getField() {
+                return "";
+            }
+
+            public int size() {
+                return terms.length;
+            }
+
+            public String[] getTerms() {
+                return terms;
+            }
+
+            public int[] getTermFrequencies() {
+                int[] freqs = new int[terms.length];
+                for (int i = 0; i < terms.length; i++) {
+                    freqs[i] = ((TermVectorOffsetInfo[]) termMap.get(terms[i])).length;
+                }
+                return freqs;
+            }
+
+            public int indexOf(String term) {
+                int res = Arrays.binarySearch(terms, term);
+                return res >= 0 ? res : -1;
+            }
+
+            public int[] indexesOf(String[] terms, int start, int len) {
+                int res[] = new int[len];
+                for (int i = 0; i < len; i++) {
+                    res[i] = indexOf(terms[i]);
+                }
+                return res;
+            }
+        };
     }
 }

Modified: jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/DefaultHighlighter.java
URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/DefaultHighlighter.java?view=diff&rev=537791&r1=537790&r2=537791
==============================================================================
--- jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/DefaultHighlighter.java (original)
+++ jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/DefaultHighlighter.java Mon May 14 04:52:14 2007
@@ -183,7 +183,23 @@
             throws IOException {
         if (offsets == null || offsets.length == 0) {
             // nothing to highlight
-            return excerptStart + excerptEnd;
+            StringBuffer text = new StringBuffer(excerptStart);
+            text.append(fragmentStart);
+            int min = text.length();
+            char[] buf = new char[surround * 2];
+            int len = reader.read(buf);
+            text.append(buf, 0, len);
+            if (len == buf.length) {
+                for (int i = text.length() - 1; i > min; i--) {
+                    if (Character.isWhitespace(text.charAt(i))) {
+                        text.delete(i, text.length());
+                        text.append(" ...");
+                        break;
+                    }
+                }
+            }
+            text.append(fragmentEnd).append(excerptEnd);
+            return text.toString();
         }
         int lastOffset = offsets.length; // Math.min(10, offsets.length); // 10 terms is plenty?
         ArrayList fragmentInfoList = new ArrayList();
@@ -334,12 +350,14 @@
                 }
                 sb.append(Text.encodeIllegalXMLCharacters(
                         new String(cbuf, 0, EOF ? skip : (surround - skippedChars))));
-                char lastChar = sb.charAt(sb.length() - 1);
-                if (lastChar != '.' && lastChar != '!' && lastChar != '?') {
-                    sb.append(" ...");
+                if (!EOF) {
+                    char lastChar = sb.charAt(sb.length() - 1);
+                    if (lastChar != '.' && lastChar != '!' && lastChar != '?') {
+                        sb.append(" ...");
+                    }
                 }
-                sb.append(fragmentEnd);
             }
+            sb.append(fragmentEnd);
         }
         sb.append(excerptEnd);
         return sb.toString();

Modified: jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/DefaultXMLExcerpt.java
URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/DefaultXMLExcerpt.java?view=diff&rev=537791&r1=537790&r2=537791
==============================================================================
--- jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/DefaultXMLExcerpt.java (original)
+++ jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/DefaultXMLExcerpt.java Mon May 14 04:52:14 2007
@@ -32,15 +32,15 @@
  * &lt;/excerpt>
  * </pre>
  */
-class DefaultXMLExcerpt extends AbstractExcerpt {
+public class DefaultXMLExcerpt extends AbstractExcerpt {
 
     /**
      * {@inheritDoc}
      */
     protected String createExcerpt(TermPositionVector tpv,
-                                 String text,
-                                 int maxFragments,
-                                 int maxFragmentSize)
+                                   String text,
+                                   int maxFragments,
+                                   int maxFragmentSize)
             throws IOException {
         return DefaultHighlighter.highlight(tpv, getQueryTerms(), text,
                 "<highlight>", "</highlight>", maxFragments, maxFragmentSize / 2);

Added: jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/HighlightingExcerptProvider.java
URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/HighlightingExcerptProvider.java?view=auto&rev=537791
==============================================================================
--- jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/HighlightingExcerptProvider.java (added)
+++ jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/HighlightingExcerptProvider.java Mon May 14 04:52:14 2007
@@ -0,0 +1,37 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.jackrabbit.core.query.lucene;
+
+import java.io.IOException;
+
+/**
+ * <code>HighlightingExcerptProvider</code> extends the
+ * <code>ExcerptProvider</code> interface with a method that highlights matching
+ * terms in arbitrary text.
+ */
+public interface HighlightingExcerptProvider extends ExcerptProvider {
+
+    /**
+     * Highlights the matching terms in the passed <code>text</code>.
+     *
+     * @param text the input text.
+     * @return the highlighted text.
+     * @throws IOException if an error occurs while highlighting the text.
+     */
+    public String highlight(String text)
+            throws IOException;
+}

Propchange: jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/HighlightingExcerptProvider.java
------------------------------------------------------------------------------
    svn:eol-style = native

Modified: jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/RowIteratorImpl.java
URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/RowIteratorImpl.java?view=diff&rev=537791&r1=537790&r2=537791
==============================================================================
--- jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/RowIteratorImpl.java (original)
+++ jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/RowIteratorImpl.java Mon May 14 04:52:14 2007
@@ -303,7 +303,12 @@
             try {
                 QName prop = NameFormat.parse(propertyName, resolver);
                 if (!propertySet.contains(prop)) {
-                    throw new ItemNotFoundException(propertyName);
+                    if (isExcerptFunction(propertyName)) {
+                        // excerpt function with parameter
+                        return getExcerpt(propertyName);
+                    } else {
+                        throw new ItemNotFoundException(propertyName);
+                    }
                 }
                 if (node.hasProperty(prop)) {
                     Property p = node.getProperty(prop);
@@ -388,12 +393,19 @@
             }
             String pathStr = excerptCall.substring(
                     idx + EXCERPT_FUNC_LPAR.length(), end).trim();
+            String decodedPath = ISO9075.decode(pathStr);
             try {
-                NodeImpl n = (NodeImpl) node.getNode(ISO9075.decode(pathStr));
+                NodeImpl n = (NodeImpl) node.getNode(decodedPath);
                 return createExcerpt(n.getNodeId());
             } catch (PathNotFoundException e) {
-                // does not exist
-                return null;
+                // does not exist or references a property
+                try {
+                    Property p = node.getProperty(decodedPath);
+                    return highlight(p.getValue().getString());
+                } catch (PathNotFoundException e1) {
+                    // does not exist
+                    return null;
+                }
             }
         }
 
@@ -417,6 +429,28 @@
                 } else {
                     return null;
                 }
+            } catch (IOException e) {
+                return null;
+            }
+        }
+
+        /**
+         * Highlights the matching terms in the passed <code>text</code>.
+         *
+         * @return a StringValue or <code>null</code> if highlighting fails.
+         */
+        private Value highlight(String text) {
+            if (!(excerptProvider instanceof HighlightingExcerptProvider)) {
+                return null;
+            }
+            HighlightingExcerptProvider hep =
+                    (HighlightingExcerptProvider) excerptProvider;
+            try {
+                long time = System.currentTimeMillis();
+                text = hep.highlight(text);
+                time = System.currentTimeMillis() - time;
+                log.debug("Highlighted text in {} ms.", new Long(time));
+                return new StringValue(text);
             } catch (IOException e) {
                 return null;
             }

Modified: jackrabbit/trunk/jackrabbit-core/src/test/java/org/apache/jackrabbit/core/query/TextExtractorTest.java
URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-core/src/test/java/org/apache/jackrabbit/core/query/TextExtractorTest.java?view=diff&rev=537791&r1=537790&r2=537791
==============================================================================
--- jackrabbit/trunk/jackrabbit-core/src/test/java/org/apache/jackrabbit/core/query/TextExtractorTest.java (original)
+++ jackrabbit/trunk/jackrabbit-core/src/test/java/org/apache/jackrabbit/core/query/TextExtractorTest.java Mon May 14 04:52:14 2007
@@ -32,14 +32,20 @@
 
     private static final String TEST_FOLDER = "test-data";
 
+    private int fileCount = 0;
+
     public void testImport() throws Exception {
         File sourceFolder = new File(TEST_FOLDER);
         // only run if there is test data
         if (!sourceFolder.exists()) {
             return;
         }
+        long time = System.currentTimeMillis();
         addContents(sourceFolder,
                 testRootNode.addNode(sourceFolder.getName(), "nt:folder"));
+        superuser.save();
+        time = System.currentTimeMillis() - time;
+        System.out.println("Imported " + fileCount + " files in " + time + " ms.");
     }
 
     /**
@@ -56,8 +62,10 @@
                 } else {
                     addFile(n, f);
                     System.out.println("Added file: " + f.getAbsolutePath());
-                    // save after a file had been added
-                    n.getSession().save();
+                    // save after 100 files
+                    if (++fileCount % 100 == 0) {
+                        n.getSession().save();
+                    }
                 }
             }
         }



Mime
View raw message