jackrabbit-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From mreut...@apache.org
Subject svn commit: r612123 - in /jackrabbit/trunk/jackrabbit-core/src: main/java/org/apache/jackrabbit/core/query/lucene/ test/java/org/apache/jackrabbit/core/query/ test/repository/workspaces/default/
Date Tue, 15 Jan 2008 14:23:45 GMT
Author: mreutegg
Date: Tue Jan 15 06:23:43 2008
New Revision: 612123

URL: http://svn.apache.org/viewvc?rev=612123&view=rev
Log:
JCR-1313: Additional excerpt provider implementation

Added:
    jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/WeightedHTMLExcerpt.java
  (with props)
    jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/WeightedHighlighter.java
  (with props)
    jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/WeightedXMLExcerpt.java
  (with props)
    jackrabbit/trunk/jackrabbit-core/src/test/java/org/apache/jackrabbit/core/query/ExcerptTest.java
  (with props)
Modified:
    jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/AbstractExcerpt.java
    jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/DefaultHighlighter.java
    jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/DefaultXMLExcerpt.java
    jackrabbit/trunk/jackrabbit-core/src/test/java/org/apache/jackrabbit/core/query/AbstractQueryTest.java
    jackrabbit/trunk/jackrabbit-core/src/test/java/org/apache/jackrabbit/core/query/TestAll.java
    jackrabbit/trunk/jackrabbit-core/src/test/repository/workspaces/default/workspace.xml

Modified: jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/AbstractExcerpt.java
URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/AbstractExcerpt.java?rev=612123&r1=612122&r2=612123&view=diff
==============================================================================
--- jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/AbstractExcerpt.java
(original)
+++ jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/AbstractExcerpt.java
Tue Jan 15 06:23:43 2008
@@ -140,11 +140,8 @@
                 return createExcerpt((TermPositionVector) tfv, text.toString(),
                         maxFragments, maxFragmentSize);
             } else {
-                log.debug("No TermPositionVector on Fulltext field, using {}",
-                        SimpleExcerptProvider.class.getName());
-                SimpleExcerptProvider exProvider = new SimpleExcerptProvider();
-                exProvider.init(query, index);
-                return exProvider.getExcerpt(id, maxFragments, maxFragmentSize);
+                log.debug("No TermPositionVector on Fulltext field.");
+                return null;
             }
         } finally {
             reader.close();

Modified: jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/DefaultHighlighter.java
URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/DefaultHighlighter.java?rev=612123&r1=612122&r2=612123&view=diff
==============================================================================
--- jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/DefaultHighlighter.java
(original)
+++ jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/DefaultHighlighter.java
Tue Jan 15 06:23:43 2008
@@ -42,7 +42,7 @@
  * @see org.apache.lucene.index.TermPositionVector
  * @see org.apache.lucene.index.TermFreqVector
  */
-class DefaultHighlighter {
+public class DefaultHighlighter {
 
     /**
      * A default value of <tt>3</tt>
@@ -50,9 +50,9 @@
     public static final int DEFAULT_MAXFRAGMENTS = 3;
 
     /**
-     * A default value of <tt>80</tt>
+     * A default value of <tt>75</tt>
      */
-    public static final int DEFAULT_SURROUND = 80;
+    public static final int DEFAULT_SURROUND = 75;
 
     public static final String START_EXCERPT = "<excerpt>";
 
@@ -62,28 +62,11 @@
 
     public static final String END_FRAGMENT_SEPARATOR = "</fragment>";
 
-    private DefaultHighlighter() {
-    }
+    public static final String START_HIGHLIGHT = "<highlight>";
 
-    /**
-     * @param tvec       the term position vector for this hit
-     * @param queryTerms the query terms.
-     * @param text       the original text that was used to create the tokens.
-     * @param prepend    the string used to prepend a highlighted token, for
-     *                   example <tt>&quot;&lt;b&gt;&quot;</tt>
-     * @param append     the string used to append a highlighted token, for
-     *                   example <tt>&quot;&lt;/b&gt;&quot;</tt>
-     * @return a String with text fragments where tokens from the query are
-     *         highlighted
-     */
-    public static String highlight(TermPositionVector tvec,
-                                   Set queryTerms,
-                                   String text,
-                                   String prepend,
-                                   String append)
-            throws IOException {
-        return highlight(tvec, queryTerms, text, prepend, append,
-                DEFAULT_MAXFRAGMENTS, DEFAULT_SURROUND);
+    public static final String END_HIGHLIGHT = "</highlight>";
+
+    protected DefaultHighlighter() {
     }
 
     /**
@@ -118,6 +101,46 @@
                                    int maxFragments,
                                    int surround)
             throws IOException {
+        return new DefaultHighlighter().doHighlight(tvec, queryTerms, text,
+                excerptStart, excerptEnd, fragmentStart, fragmentEnd, hlStart,
+                hlEnd, maxFragments, surround);
+    }
+
+    /**
+     * @param tvec         the term position vector for this hit
+     * @param queryTerms   the query terms.
+     * @param text         the original text that was used to create the tokens.
+     * @param maxFragments the maximum number of fragments
+     * @param surround     the maximum number of chars surrounding a highlighted
+     *                     token
+     * @return a String with text fragments where tokens from the query are
+     *         highlighted
+     */
+    public static String highlight(TermPositionVector tvec,
+                                   Set queryTerms,
+                                   String text,
+                                   int maxFragments,
+                                   int surround)
+            throws IOException {
+        return highlight(tvec, queryTerms, text, START_EXCERPT, END_EXCERPT,
+                START_FRAGMENT_SEPARATOR, END_FRAGMENT_SEPARATOR,
+                START_HIGHLIGHT, END_HIGHLIGHT, maxFragments, surround);
+    }
+
+    /**
+     * @see #highlight(TermPositionVector, Set, String, String, String, String, String, String, String, int, int)
+     */
+    protected String doHighlight(TermPositionVector tvec,
+                                 Set queryTerms,
+                                 String text,
+                                 String excerptStart,
+                                 String excerptEnd,
+                                 String fragmentStart,
+                                 String fragmentEnd,
+                                 String hlStart,
+                                 String hlEnd,
+                                 int maxFragments,
+                                 int surround) throws IOException {
         String[] terms = new String[queryTerms.size()];
         Iterator it = queryTerms.iterator();
         for (int i = 0; it.hasNext(); i++) {
@@ -138,68 +161,42 @@
             java.util.Arrays.sort(offsets, new TermVectorOffsetInfoSorter());
         }
 
-        return mergeFragments(offsets, new StringReader(text), excerptStart,
+        return mergeFragments(offsets, text, excerptStart,
                 excerptEnd, fragmentStart, fragmentEnd, hlStart, hlEnd,
                 maxFragments, surround);
     }
 
-    /**
-     * @param tvec         the term position vector for this hit
-     * @param queryTerms   the query terms.
-     * @param text         the original text that was used to create the tokens.
-     * @param prepend      the string used to prepend a highlighted token, for
-     *                     example <tt>&quot;&lt;b&gt;&quot;</tt>
-     * @param append       the string used to append a highlighted token, for
-     *                     example <tt>&quot;&lt;/b&gt;&quot;</tt>
-     * @param maxFragments the maximum number of fragments
-     * @param surround     the maximum number of chars surrounding a highlighted
-     *                     token
-     * @return a String with text fragments where tokens from the query are
-     *         highlighted
-     */
-    public static String highlight(TermPositionVector tvec,
-                                   Set queryTerms,
-                                   String text,
-                                   String prepend,
-                                   String append,
-                                   int maxFragments,
-                                   int surround)
-            throws IOException {
-        return highlight(tvec, queryTerms, text, START_EXCERPT, END_EXCERPT,
-                START_FRAGMENT_SEPARATOR, END_FRAGMENT_SEPARATOR, prepend,
-                append, maxFragments, surround);
-    }
-
-    private static String mergeFragments(TermVectorOffsetInfo[] offsets,
-                                         StringReader reader,
-                                         String excerptStart,
-                                         String excerptEnd,
-                                         String fragmentStart,
-                                         String fragmentEnd,
-                                         String hlStart,
-                                         String hlEnd,
-                                         int maxFragments,
-                                         int surround)
+    protected String mergeFragments(TermVectorOffsetInfo[] offsets,
+                                    String text,
+                                    String excerptStart,
+                                    String excerptEnd,
+                                    String fragmentStart,
+                                    String fragmentEnd,
+                                    String hlStart,
+                                    String hlEnd,
+                                    int maxFragments,
+                                    int surround)
             throws IOException {
+        StringReader reader = new StringReader(text);
         if (offsets == null || offsets.length == 0) {
             // nothing to highlight
-            StringBuffer text = new StringBuffer(excerptStart);
-            text.append(fragmentStart);
-            int min = text.length();
+            StringBuffer excerpt = new StringBuffer(excerptStart);
+            excerpt.append(fragmentStart);
+            int min = excerpt.length();
             char[] buf = new char[surround * 2];
             int len = reader.read(buf);
-            text.append(buf, 0, len);
+            excerpt.append(buf, 0, len);
             if (len == buf.length) {
-                for (int i = text.length() - 1; i > min; i--) {
-                    if (Character.isWhitespace(text.charAt(i))) {
-                        text.delete(i, text.length());
-                        text.append(" ...");
+                for (int i = excerpt.length() - 1; i > min; i--) {
+                    if (Character.isWhitespace(excerpt.charAt(i))) {
+                        excerpt.delete(i, excerpt.length());
+                        excerpt.append(" ...");
                         break;
                     }
                 }
             }
-            text.append(fragmentEnd).append(excerptEnd);
-            return text.toString();
+            excerpt.append(fragmentEnd).append(excerptEnd);
+            return excerpt.toString();
         }
         int lastOffset = offsets.length; // Math.min(10, offsets.length); // 10 terms is plenty?
         ArrayList fragmentInfoList = new ArrayList();

Modified: jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/DefaultXMLExcerpt.java
URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/DefaultXMLExcerpt.java?rev=612123&r1=612122&r2=612123&view=diff
==============================================================================
--- jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/DefaultXMLExcerpt.java
(original)
+++ jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/DefaultXMLExcerpt.java
Tue Jan 15 06:23:43 2008
@@ -43,6 +43,6 @@
                                    int maxFragmentSize)
             throws IOException {
         return DefaultHighlighter.highlight(tpv, getQueryTerms(), text,
-                "<highlight>", "</highlight>", maxFragments, maxFragmentSize / 2);
+                maxFragments, maxFragmentSize / 2);
     }
 }

Added: jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/WeightedHTMLExcerpt.java
URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/WeightedHTMLExcerpt.java?rev=612123&view=auto
==============================================================================
--- jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/WeightedHTMLExcerpt.java
(added)
+++ jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/WeightedHTMLExcerpt.java
Tue Jan 15 06:23:43 2008
@@ -0,0 +1,52 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.jackrabbit.core.query.lucene;
+
+import org.apache.lucene.index.TermPositionVector;
+
+import java.io.IOException;
+
+/**
+ * <code>WeightedHTMLExcerpt</code> creates a HTML excerpt with the following
+ * format:
+ * <pre>
+ * &lt;div>
+ *     &lt;span>&lt;strong>Jackrabbit&lt;/strong> implements both the mandatory XPath and optional SQL &lt;strong>query&lt;/strong> syntax.&lt;/span>
+ *     &lt;span>Before parsing the XPath &lt;strong>query&lt;/strong> in &lt;strong>Jackrabbit&lt;/strong>, the statement is surrounded&lt;/span>
+ * &lt;/div>
+ * </pre>
+ * In contrast to {@link DefaultHTMLExcerpt} this implementation weights
+ * fragments based on the proximity of highlighted terms. Highlighted terms that
+ * are adjacent have a higher weight. In addition, the more highlighted terms,
+ * the higher the weight.
+ * 
+ * @see WeightedHighlighter
+ */
+public class WeightedHTMLExcerpt extends AbstractExcerpt {
+
+    /**
+     * {@inheritDoc}
+     */
+    protected String createExcerpt(TermPositionVector tpv,
+                                   String text,
+                                   int maxFragments,
+                                   int maxFragmentSize) throws IOException {
+        return WeightedHighlighter.highlight(tpv, getQueryTerms(), text,
+                "<div>", "</div>", "<span>", "</span>", "<strong>", "</strong>",
+                maxFragments, maxFragmentSize / 2);
+    }
+}

Propchange: jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/WeightedHTMLExcerpt.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/WeightedHighlighter.java
URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/WeightedHighlighter.java?rev=612123&view=auto
==============================================================================
--- jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/WeightedHighlighter.java
(added)
+++ jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/WeightedHighlighter.java
Tue Jan 15 06:23:43 2008
@@ -0,0 +1,355 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.jackrabbit.core.query.lucene;
+
+import org.apache.lucene.index.TermPositionVector;
+import org.apache.lucene.index.TermVectorOffsetInfo;
+import org.apache.lucene.util.PriorityQueue;
+
+import java.util.Set;
+import java.util.Iterator;
+import java.util.ArrayList;
+import java.util.BitSet;
+import java.util.List;
+import java.util.IdentityHashMap;
+import java.util.Map;
+import java.util.LinkedList;
+import java.io.IOException;
+
+/**
+ * <code>WeightedHighlighter</code> implements a highlighter that weights the
+ * fragments based on the proximity of the highlighted terms to each other. The
+ * returned fragments are not necessarily in sequence as the text occurs in the
+ * content.
+ */
+public class WeightedHighlighter extends DefaultHighlighter {
+
+    /**
+     * Punctuation characters that mark the end of a sentence.
+     */
+    private static final BitSet PUNCTUATION = new BitSet();
+
+    static {
+        PUNCTUATION.set('.');
+        PUNCTUATION.set('!');
+        PUNCTUATION.set(0xa1); // inverted exclamation mark
+        PUNCTUATION.set('?');
+        PUNCTUATION.set(0xbf); // inverted question mark
+        // todo add more
+    }
+
+    protected WeightedHighlighter() {
+    }
+
+    /**
+     * @param tvec          the term position vector for this hit
+     * @param queryTerms    the query terms.
+     * @param text          the original text that was used to create the
+     *                      tokens.
+     * @param excerptStart  this string is prepended to the excerpt
+     * @param excerptEnd    this string is appended to the excerpt
+     * @param fragmentStart this string is prepended to every fragment
+     * @param fragmentEnd   this string is appended to the end of every
+     *                      fragment.
+     * @param hlStart       the string used to prepend a highlighted token, for
+     *                      example <tt>&quot;&lt;b&gt;&quot;</tt>
+     * @param hlEnd         the string used to append a highlighted token, for
+     *                      example <tt>&quot;&lt;/b&gt;&quot;</tt>
+     * @param maxFragments  the maximum number of fragments
+     * @param surround      the maximum number of chars surrounding a
+     *                      highlighted token
+     * @return a String with text fragments where tokens from the query are
+     *         highlighted
+     */
+    public static String highlight(TermPositionVector tvec,
+                                   Set queryTerms,
+                                   String text,
+                                   String excerptStart,
+                                   String excerptEnd,
+                                   String fragmentStart,
+                                   String fragmentEnd,
+                                   String hlStart,
+                                   String hlEnd,
+                                   int maxFragments,
+                                   int surround) throws IOException {
+        return new WeightedHighlighter().doHighlight(tvec, queryTerms, text,
+                excerptStart, excerptEnd, fragmentStart, fragmentEnd, hlStart,
+                hlEnd, maxFragments, surround);
+    }
+
+    /**
+     * @param tvec         the term position vector for this hit
+     * @param queryTerms   the query terms.
+     * @param text         the original text that was used to create the tokens.
+     * @param maxFragments the maximum number of fragments
+     * @param surround     the maximum number of chars surrounding a highlighted
+     *                     token
+     * @return a String with text fragments where tokens from the query are
+     *         highlighted
+     */
+    public static String highlight(TermPositionVector tvec,
+                                   Set queryTerms,
+                                   String text,
+                                   int maxFragments,
+                                   int surround) throws IOException {
+        return highlight(tvec, queryTerms, text, START_EXCERPT, END_EXCERPT,
+                START_FRAGMENT_SEPARATOR, END_FRAGMENT_SEPARATOR,
+                START_HIGHLIGHT, END_HIGHLIGHT, maxFragments, surround);
+    }
+
+    protected String mergeFragments(TermVectorOffsetInfo[] offsets,
+                                    String text,
+                                    String excerptStart,
+                                    String excerptEnd,
+                                    String fragmentStart,
+                                    String fragmentEnd,
+                                    String hlStart,
+                                    String hlEnd,
+                                    int maxFragments,
+                                    int surround) {
+
+        if (offsets == null || offsets.length == 0) {
+            // nothing to highlight
+            StringBuffer excerpt = new StringBuffer(excerptStart);
+            excerpt.append(fragmentStart);
+            int min = excerpt.length();
+            excerpt.append(text.substring(0, Math.min(text.length(), surround * 2)));
+            if (text.length() > excerpt.length()) {
+                for (int i = excerpt.length() - 1; i > min; i--) {
+                    if (Character.isWhitespace(excerpt.charAt(i))) {
+                        excerpt.delete(i, excerpt.length());
+                        excerpt.append(" ...");
+                        break;
+                    }
+                }
+            }
+            excerpt.append(fragmentEnd).append(excerptEnd);
+            return excerpt.toString();
+        }
+
+        PriorityQueue bestFragments = new FragmentInfoPriorityQueue(maxFragments);
+        for (int i = 0; i < offsets.length; i++) {
+            FragmentInfo fi = new FragmentInfo(offsets[i], surround * 2);
+            for (int j = i + 1; j < offsets.length; j++) {
+                if (!fi.add(offsets[j], text)) {
+                    break;
+                }
+            }
+            bestFragments.insert(fi);
+        }
+
+        // retrieve fragment infos from queue and fill into list, least
+        // fragment comes out first
+        List infos = new LinkedList();
+        while (bestFragments.size() > 0) {
+            FragmentInfo fi = (FragmentInfo) bestFragments.pop();
+            infos.add(0, fi);
+        }
+
+        Map offsetInfos = new IdentityHashMap();
+        // remove overlapping fragment infos
+        for (Iterator it = infos.iterator(); it.hasNext(); ) {
+            FragmentInfo fi = (FragmentInfo) it.next();
+            boolean overlap = false;
+            for (Iterator fit = fi.iterator(); fit.hasNext() && !overlap; ) {
+                TermVectorOffsetInfo oi = (TermVectorOffsetInfo) fit.next();
+                if (offsetInfos.containsKey(oi)) {
+                    overlap = true;
+                }
+            }
+            if (overlap) {
+                it.remove();
+            } else {
+                for (Iterator oit = fi.iterator(); oit.hasNext(); ) {
+                    offsetInfos.put(oit.next(), null);
+                }
+            }
+        }
+
+        // create excerpts
+        StringBuffer sb = new StringBuffer(excerptStart);
+        for (Iterator it = infos.iterator(); it.hasNext(); ) {
+            FragmentInfo fi = (FragmentInfo) it.next();
+            sb.append(fragmentStart);
+            int limit = Math.max(0, fi.getStartOffset() / 2 + fi.getEndOffset() / 2 - surround);
+            int len = startFragment(sb, text, fi.getStartOffset(), limit);
+            TermVectorOffsetInfo lastOffsetInfo = null;
+            for (Iterator fIt = fi.iterator(); fIt.hasNext(); ) {
+                TermVectorOffsetInfo oi = (TermVectorOffsetInfo) fIt.next();
+                if (lastOffsetInfo != null) {
+                    // fill in text between terms
+                    sb.append(text.substring(lastOffsetInfo.getEndOffset(), oi.getStartOffset()));
+                }
+                sb.append(hlStart);
+                sb.append(text.substring(oi.getStartOffset(), oi.getEndOffset()));
+                sb.append(hlEnd);
+                lastOffsetInfo = oi;
+            }
+            limit = Math.min(text.length(), fi.getStartOffset() - len + (surround * 2));
+            endFragment(sb, text, fi.getEndOffset(), limit);
+            sb.append(fragmentEnd);
+        }
+        sb.append(excerptEnd);
+        return sb.toString();
+    }
+
+    /**
+     * Writes the start of a fragment to the string buffer <code>sb</code>. The
+     * first occurrence of a matching term is indicated by the
+     * <code>offset</code> into the <code>text</code>.
+     *
+     * @param sb     where to append the start of the fragment.
+     * @param text   the original text.
+     * @param offset the start offset of the first matching term in the
+     *               fragment.
+     * @param limit  do not go back further than <code>limit</code>.
+     * @return the length of the start fragment that was appended to
+     *         <code>sb</code>.
+     */
+    private static int startFragment(StringBuffer sb, String text, int offset, int limit) {
+        if (limit == 0) {
+            // append all
+            sb.append(text.substring(0, offset));
+            return offset;
+        }
+        String intro = "... ";
+        int start = offset;
+        for (int i = offset - 1; i >= limit; i--) {
+            if (Character.isWhitespace(text.charAt(i))) {
+                // potential start
+                start = i + 1;
+                if (i - 1 >= limit && PUNCTUATION.get(text.charAt(i - 1))) {
+                    // start of sentence found
+                    intro = "";
+                    break;
+                }
+            }
+        }
+        sb.append(intro).append(text.substring(start, offset));
+        return offset - start;
+    }
+
+    /**
+     * Writes the end of a fragment to the string buffer <code>sb</code>. The
+     * last occurrence of a matching term is indicated by the
+     * <code>offset</code> into the <code>text</code>.
+     *
+     * @param sb     where to append the start of the fragment.
+     * @param text   the original text.
+     * @param offset the end offset of the last matching term in the fragment.
+     * @param limit  do not go further than <code>limit</code>.
+     */
+    private static void endFragment(StringBuffer sb, String text, int offset, int limit) {
+        if (limit == text.length()) {
+            // append all
+            sb.append(text.substring(offset));
+            return;
+        }
+        int end = offset;
+        for (int i = end; i < limit; i++) {
+            if (Character.isWhitespace(text.charAt(i))) {
+                // potential end
+                end = i;
+            }
+        }
+        sb.append(text.substring(offset, end)).append(" ...");
+    }
+
+    private static class FragmentInfo {
+        ArrayList offsetInfosList;
+        int startOffset;
+        int endOffset;
+        int maxFragmentSize;
+        int quality;
+
+        public FragmentInfo(TermVectorOffsetInfo offsetinfo, int maxFragmentSize) {
+            offsetInfosList = new ArrayList();
+            offsetInfosList.add(offsetinfo);
+            startOffset = offsetinfo.getStartOffset();
+            endOffset = offsetinfo.getEndOffset();
+            this.maxFragmentSize = maxFragmentSize;
+            quality = 0;
+        }
+
+        public boolean add(TermVectorOffsetInfo offsetinfo, String text) {
+            if (offsetinfo.getEndOffset() > (startOffset + maxFragmentSize)) {
+                return false;
+            }
+            offsetInfosList.add(offsetinfo);
+            if (offsetinfo.getStartOffset() - endOffset <= 3) {
+                // boost quality when terms are adjacent
+                // and only separated by whitespace character
+                boolean boost = true;
+                for (int i = endOffset; i < offsetinfo.getStartOffset(); i++) {
+                    if (!Character.isWhitespace(text.charAt(i))) {
+                        boost = false;
+                        break;
+                    }
+                }
+                if (boost) {
+                    quality += 10;
+                } else {
+                    quality++;
+                }
+            } else {
+                quality++;
+            }
+            endOffset = offsetinfo.getEndOffset();
+            return true;
+        }
+
+        public Iterator iterator() {
+            return offsetInfosList.iterator();
+        }
+
+        public int getStartOffset() {
+            return startOffset;
+        }
+
+        public int getEndOffset() {
+            return endOffset;
+        }
+
+        public int getQuality() {
+            return quality;
+        }
+
+    }
+
+    private static class FragmentInfoPriorityQueue extends PriorityQueue {
+
+        public FragmentInfoPriorityQueue(int size) {
+            initialize(size);
+        }
+
+        /**
+         * Checks the quality of two {@link FragmentInfo} objects. The one with
+         * the lower quality is considered less than the other. If both
+         * fragments have the same quality, the one with the higher start offset
+         * is considered the lesser. This will result in a queue that keeps the
+         * {@link FragmentInfo} with the best quality.
+         */
+        protected boolean lessThan(Object a, Object b) {
+            FragmentInfo infoA = (FragmentInfo) a;
+            FragmentInfo infoB = (FragmentInfo) b;
+            if (infoA.getQuality() == infoB.getQuality()) {
+                return infoA.getStartOffset() > infoB.getStartOffset();
+            }
+            return infoA.getQuality() < infoB.getQuality();
+        }
+    }
+}

Propchange: jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/WeightedHighlighter.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/WeightedXMLExcerpt.java
URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/WeightedXMLExcerpt.java?rev=612123&view=auto
==============================================================================
--- jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/WeightedXMLExcerpt.java
(added)
+++ jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/WeightedXMLExcerpt.java
Tue Jan 15 06:23:43 2008
@@ -0,0 +1,54 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.jackrabbit.core.query.lucene;
+
+import org.apache.lucene.index.TermPositionVector;
+
+import java.io.IOException;
+
+/**
+ * <code>WeightedXMLExcerpt</code> creates an XML excerpt of a matching node. In
+ * contrast to {@link DefaultXMLExcerpt} this implementation weights fragments
+ * based on the proximity of highlighted terms. Highlighted terms that are
+ * adjacent have a higher weight. In addition, the more highlighted terms, the
+ * higher the weight.
+ * <br/>
+ * E.g. if you search for 'jackrabbit' and 'query' you may get the following
+ * result for a node:
+ * <pre>
+ * &lt;excerpt>
+ *     &lt;fragment>&lt;highlight>Jackrabbit&lt;/highlight> implements both the mandatory XPath and optional SQL &lt;highlight>query&lt;/highlight> syntax.&lt;/fragment>
+ *     &lt;fragment>Before parsing the XPath &lt;highlight>query&lt;/highlight> in &lt;highlight>Jackrabbit&lt;/highlight>, the statement is surrounded&lt;/fragment>
+ * &lt;/excerpt>
+ * </pre>
+ *
+ * @see WeightedHighlighter
+ */
+public class WeightedXMLExcerpt extends AbstractExcerpt {
+
+    /**
+     * {@inheritDoc}
+     */
+    protected String createExcerpt(TermPositionVector tpv,
+                                   String text,
+                                   int maxFragments,
+                                   int maxFragmentSize)
+            throws IOException {
+        return WeightedHighlighter.highlight(tpv, getQueryTerms(), text,
+                maxFragments, maxFragmentSize / 2);
+    }
+}

Propchange: jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/WeightedXMLExcerpt.java
------------------------------------------------------------------------------
    svn:eol-style = native

Modified: jackrabbit/trunk/jackrabbit-core/src/test/java/org/apache/jackrabbit/core/query/AbstractQueryTest.java
URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-core/src/test/java/org/apache/jackrabbit/core/query/AbstractQueryTest.java?rev=612123&r1=612122&r2=612123&view=diff
==============================================================================
--- jackrabbit/trunk/jackrabbit-core/src/test/java/org/apache/jackrabbit/core/query/AbstractQueryTest.java (original)
+++ jackrabbit/trunk/jackrabbit-core/src/test/java/org/apache/jackrabbit/core/query/AbstractQueryTest.java Tue Jan 15 06:23:43 2008
@@ -21,6 +21,7 @@
 import javax.jcr.query.QueryResult;
 import javax.jcr.query.RowIterator;
 import javax.jcr.query.Query;
+import javax.jcr.query.QueryManager;
 import javax.jcr.NodeIterator;
 import javax.jcr.RepositoryException;
 import javax.jcr.Node;
@@ -36,6 +37,18 @@
  */
 public class AbstractQueryTest extends AbstractJCRTest {
 
+    protected QueryManager qm;
+
+    protected void setUp() throws Exception {
+        super.setUp();
+        qm = superuser.getWorkspace().getQueryManager();
+    }
+
+    protected void tearDown() throws Exception {
+        qm = null;
+        super.tearDown();
+    }
+
     /**
      * Checks if the <code>result</code> contains a number of <code>hits</code>.
      *
@@ -115,7 +128,7 @@
      */
     protected void executeXPathQuery(String xpath, Node[] nodes)
             throws RepositoryException {
-        QueryResult res = superuser.getWorkspace().getQueryManager().createQuery(xpath, Query.XPATH).execute();
+        QueryResult res = qm.createQuery(xpath, Query.XPATH).execute();
         checkResult(res, nodes);
     }
 
@@ -127,7 +140,7 @@
      */
     protected void executeSQLQuery(String sql, Node[] nodes)
             throws RepositoryException {
-        QueryResult res = superuser.getWorkspace().getQueryManager().createQuery(sql, Query.SQL).execute();
+        QueryResult res = qm.createQuery(sql, Query.SQL).execute();
         checkResult(res, nodes);
     }
 
@@ -156,6 +169,23 @@
         for (Iterator it = resultPaths.iterator(); it.hasNext();) {
             String path = (String) it.next();
             assertTrue(path + " is not expected to be part of the result set", expectedPaths.contains(path));
+        }
+    }
+
+    /**
+     * Executes the query specified by <code>statement</code> and returns the
+     * query result.
+     *
+     * @param statement either a SQL or XPath statement.
+     * @return the query result.
+     * @throws RepositoryException if an error occurs.
+     */
+    protected QueryResult executeQuery(String statement)
+            throws RepositoryException {
+        if (statement.trim().toLowerCase().startsWith("select")) {
+            return qm.createQuery(statement, Query.SQL).execute();
+        } else {
+            return qm.createQuery(statement, Query.XPATH).execute();
         }
     }
 }

Added: jackrabbit/trunk/jackrabbit-core/src/test/java/org/apache/jackrabbit/core/query/ExcerptTest.java
URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-core/src/test/java/org/apache/jackrabbit/core/query/ExcerptTest.java?rev=612123&view=auto
==============================================================================
--- jackrabbit/trunk/jackrabbit-core/src/test/java/org/apache/jackrabbit/core/query/ExcerptTest.java (added)
+++ jackrabbit/trunk/jackrabbit-core/src/test/java/org/apache/jackrabbit/core/query/ExcerptTest.java Tue Jan 15 06:23:43 2008
@@ -0,0 +1,140 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.jackrabbit.core.query;
+
+import javax.jcr.RepositoryException;
+import javax.jcr.Value;
+import javax.jcr.Node;
+import javax.jcr.query.QueryResult;
+import javax.jcr.query.Row;
+import javax.jcr.query.RowIterator;
+
+/**
+ * <code>ExcerptTest</code> checks if HTML excerpts are created correctly. The
+ * test cases assume the following implementation details:
+ * <ul>
+ * <li>An excerpt is enclosed with a &lt;div> element</li>
+ * <li>A fragment is enclosed with a &lt;span> element</li>
+ * <li>Terms are highlighted with a &lt;strong> element</li>
+ * <li>The maximum number of fragments created is three</li>
+ * <li>The maximum excerpt length is 150 characters</li>
+ * <li>A fragment contains at most 75 characters (excluding '... ') before the first term is highlighted</li>
+ * <li>At least the following sentence separators are recognized: '.', '!' and '?'</li>
+ * <li>If there is additional text after the fragment end, ' ...' is appended to the fragment</li>
+ * <li>If the fragment starts within a sentence, then the fragment is prefixed with '... '</li>
+ * </ul>
+ */
+public class ExcerptTest extends AbstractQueryTest {
+
+    private static final String EXCERPT_START = "<div><span>";
+
+    private static final String EXCERPT_END = "</span></div>";
+
+    public void testHightlightFirstWord() throws RepositoryException {
+        checkExcerpt("jackrabbit bla bla bla",
+                "<strong>jackrabbit</strong> bla bla bla",
+                "jackrabbit");
+    }
+
+    public void testHightlightLastWord() throws RepositoryException {
+        checkExcerpt("bla bla bla jackrabbit",
+                "bla bla bla <strong>jackrabbit</strong>",
+                "jackrabbit");
+    }
+
+    public void testHightlightWordBetween() throws RepositoryException {
+        checkExcerpt("bla bla jackrabbit bla bla",
+                "bla bla <strong>jackrabbit</strong> bla bla",
+                "jackrabbit");
+    }
+
+    public void testMoreTextDotsAtEnd() throws RepositoryException {
+        checkExcerpt("bla bla jackrabbit bla bla bla bla bla bla bla bla bla bla bla bla
bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla",
+                "bla bla <strong>jackrabbit</strong> bla bla bla bla bla bla
bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla
bla bla bla ...",
+                "jackrabbit");
+    }
+
+    public void testMoreTextDotsAtStart() throws RepositoryException {
+        checkExcerpt("bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla
bla bla bla jackrabbit bla bla bla bla",
+                "... bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla
<strong>jackrabbit</strong> bla bla bla bla",
+                "jackrabbit");
+    }
+
+    public void testMoreTextDotsAtStartAndEnd() throws RepositoryException {
+        checkExcerpt("bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla
bla bla bla jackrabbit bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla
bla bla",
+                "... bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla
<strong>jackrabbit</strong> bla bla bla bla bla bla bla bla bla bla bla bla bla
bla bla bla bla ...",
+                "jackrabbit");
+    }
+
+    public void testPunctuationStartsFragment() throws RepositoryException {
+        checkExcerpt("bla bla bla bla bla bla bla bla. bla bla bla bla bla bla bla bla bla
bla bla bla jackrabbit bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla
bla bla",
+                "bla bla bla bla bla bla bla bla bla bla bla bla <strong>jackrabbit</strong>
bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla",
+                "jackrabbit");
+    }
+
+    public void testPunctuationStartsFragmentEndsWithDots() throws RepositoryException {
+        checkExcerpt("bla bla bla bla bla bla bla bla. bla bla bla bla bla bla bla bla bla
bla bla bla jackrabbit bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla
bla bla bla bla bla bla bla",
+                "bla bla bla bla bla bla bla bla bla bla bla bla <strong>jackrabbit</strong>
bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla ...",
+                "jackrabbit");
+    }
+
+    public void testHighlightMultipleTerms() throws RepositoryException {
+        checkExcerpt("bla bla bla apache jackrabbit bla bla bla",
+                "bla bla bla <strong>apache</strong> <strong>jackrabbit</strong>
bla bla bla",
+                "apache jackrabbit");
+    }
+
+    public void testPreferPhrase() throws RepositoryException {
+        checkExcerpt("bla apache bla jackrabbit bla bla bla bla bla bla bla bla bla bla bla
bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla
bla bla bla bla bla apache jackrabbit bla bla bla",
+                "... bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla <strong>apache</strong>
<strong>jackrabbit</strong> bla bla bla</span><span>bla <strong>apache</strong>
bla <strong>jackrabbit</strong> bla bla bla bla bla bla bla bla bla bla bla bla
bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla ...",
+                "apache jackrabbit");
+    }
+
+    private void checkExcerpt(String text, String fragmentText, String terms)
+            throws RepositoryException {
+        String excerpt = createExcerpt(fragmentText);
+        createTestData(text);
+        String stmt = getStatement(terms);
+        QueryResult result = executeQuery(stmt);
+        RowIterator rows = result.getRows();
+        assertEquals(1, rows.getSize());
+        assertEquals(excerpt, getExcerpt(rows.nextRow()));
+    }
+
+    private String getStatement(String terms) {
+        return testPath + "/*[jcr:contains(., '"+ terms + "')]/rep:excerpt(.)";
+    }
+
+    private void createTestData(String text) throws RepositoryException {
+        Node n = testRootNode.addNode(nodeName1);
+        n.setProperty("text", text);
+        testRootNode.save();
+    }
+
+    private String getExcerpt(Row row) throws RepositoryException {
+        Value v = row.getValue("rep:excerpt(.)");
+        if (v != null) {
+            return v.getString();
+        } else {
+            return null;
+        }
+    }
+
+    private String createExcerpt(String fragments) {
+        return EXCERPT_START + fragments + EXCERPT_END;
+    }
+}

Propchange: jackrabbit/trunk/jackrabbit-core/src/test/java/org/apache/jackrabbit/core/query/ExcerptTest.java
------------------------------------------------------------------------------
    svn:eol-style = native

Modified: jackrabbit/trunk/jackrabbit-core/src/test/java/org/apache/jackrabbit/core/query/TestAll.java
URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-core/src/test/java/org/apache/jackrabbit/core/query/TestAll.java?rev=612123&r1=612122&r2=612123&view=diff
==============================================================================
--- jackrabbit/trunk/jackrabbit-core/src/test/java/org/apache/jackrabbit/core/query/TestAll.java (original)
+++ jackrabbit/trunk/jackrabbit-core/src/test/java/org/apache/jackrabbit/core/query/TestAll.java Tue Jan 15 06:23:43 2008
@@ -54,7 +54,8 @@
         suite.addTestSuite(PathQueryNodeTest.class);
         suite.addTestSuite(SynonymProviderTest.class);
         suite.addTestSuite(ArrayHitsTest.class);
-        
+        suite.addTestSuite(ExcerptTest.class);
+
         // exclude long running tests per default
         //suite.addTestSuite(MassiveRangeTest.class);
         //suite.addTestSuite(ConcurrentQueryTest.class);

Modified: jackrabbit/trunk/jackrabbit-core/src/test/repository/workspaces/default/workspace.xml
URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-core/src/test/repository/workspaces/default/workspace.xml?rev=612123&r1=612122&r2=612123&view=diff
==============================================================================
--- jackrabbit/trunk/jackrabbit-core/src/test/repository/workspaces/default/workspace.xml (original)
+++ jackrabbit/trunk/jackrabbit-core/src/test/repository/workspaces/default/workspace.xml Tue Jan 15 06:23:43 2008
@@ -38,6 +38,8 @@
     <param name="path" value="${wsp.home}/index" />
     <param name="synonymProviderClass" value="org.apache.jackrabbit.core.query.lucene.PropertiesSynonymProvider"/>
     <param name="synonymProviderConfigPath" value="../synonyms.properties"/>
+    <param name="supportHighlighting" value="true"/>
+    <param name="excerptProviderClass" value="org.apache.jackrabbit.core.query.lucene.WeightedHTMLExcerpt"/>
   </SearchIndex>
 </Workspace>
 



Mime
View raw message