jackrabbit-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From mreut...@apache.org
Subject svn commit: r535792 - in /jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene: DefaultHighlighter.java DefaultXMLExcerpt.java
Date Mon, 07 May 2007 08:59:34 GMT
Author: mreutegg
Date: Mon May  7 01:59:33 2007
New Revision: 535792

URL: http://svn.apache.org/viewvc?view=rev&rev=535792
Log:
JCR-898: Improve excerpt fragments

Modified:
    jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/DefaultHighlighter.java
    jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/DefaultXMLExcerpt.java

Modified: jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/DefaultHighlighter.java
URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/DefaultHighlighter.java?view=diff&rev=535792&r1=535791&r2=535792
==============================================================================
--- jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/DefaultHighlighter.java (original)
+++ jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/DefaultHighlighter.java Mon May  7 01:59:33 2007
@@ -176,8 +176,10 @@
         int skip;
         int nextStart;
         int skippedChars;
+        int firstWhitespace;
         for (int i = 0; i < bestFragmentsList.size(); i++) {
             fi = (FragmentInfo) bestFragmentsList.get(i);
+            fi.trim();
             nextStart = fi.getStartOffset();
             skip = nextStart - pos;
             if (skip > surround * 2) {
@@ -212,19 +214,39 @@
                 pos += skip;
             }
             // start fragment
-            skippedChars = 0;
             cbuf = new char[nextStart - pos];
+            skippedChars = Math.max(cbuf.length - 1, 0);
+            firstWhitespace = skippedChars;
             reader.read(cbuf, 0, nextStart - pos);
             pos += (nextStart - pos);
             sb.append(START_FRAGMENT_SEPARATOR);
-            // find first whitespace
-            for (; skippedChars < cbuf.length; skippedChars++) {
-                if (Character.isWhitespace(cbuf[skippedChars])) {
-                    skippedChars += 1;
-                    break;
+            // find last period followed by whitespace
+            if (cbuf.length > 0) {
+                for (; skippedChars >= 0; skippedChars--) {
+                    if (Character.isWhitespace(cbuf[skippedChars])) {
+                        firstWhitespace = skippedChars;
+                        if (skippedChars - 1 >= 0 &&
+                                cbuf[skippedChars - 1] == '.') {
+                            skippedChars++;
+                            break;
+                        }
+                    }
+                }
+            }
+            boolean sentenceStart = true;
+            if (skippedChars == -1) {
+                if (pos == cbuf.length) {
+                    // this fragment is the start of the text -> skip none
+                    skippedChars = 0;
+                } else {
+                    sentenceStart = false;
+                    skippedChars = firstWhitespace + 1;
                 }
             }
 
+            if (!sentenceStart) {
+                sb.append("... ");
+            }
             sb.append(Text.encodeIllegalXMLCharacters(
                     new String(cbuf, skippedChars, cbuf.length - skippedChars)));
 
@@ -272,6 +294,10 @@
                 }
                 sb.append(Text.encodeIllegalXMLCharacters(
                         new String(cbuf, 0, EOF ? skip : (surround - skippedChars))));
+                char lastChar = sb.charAt(sb.length() - 1);
+                if (lastChar != '.' && lastChar != '!' && lastChar != '?') {
+                    sb.append(" ...");
+                }
                 sb.append(END_FRAGMENT_SEPARATOR);
             }
         }
@@ -319,6 +345,16 @@
 
         public int numTerms() {
             return numTerms;
+        }
+
+        public void trim() {
+            int end = startOffset + (mergeGap / 2);
+            for (Iterator it = offsetInfosList.iterator(); it.hasNext(); ) {
+                TermVectorOffsetInfo tvoi = (TermVectorOffsetInfo) it.next();
+                if (tvoi.getStartOffset() > end) {
+                    it.remove();
+                }
+            }
         }
     }
 

Modified: jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/DefaultXMLExcerpt.java
URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/DefaultXMLExcerpt.java?view=diff&rev=535792&r1=535791&r2=535792
==============================================================================
--- jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/DefaultXMLExcerpt.java (original)
+++ jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/DefaultXMLExcerpt.java Mon May  7 01:59:33 2007
@@ -103,12 +103,15 @@
                 // if a term text ends with characters that are considered noise
                 // then the offset of the next field will be off by the number
                 // of noise characters.
-                // therefore we delete noise characters at the end of the text
-                for (int j = text.length() - 1; j >= 0; j--) {
-                    if (Character.isLetterOrDigit(text.charAt(j))) {
-                        break;
-                    } else {
-                        text.deleteCharAt(j);
+                // therefore we delete noise characters at the end of the text.
+                // this process is required for all but the last field
+                if (i < fields.length - 1) {
+                    for (int j = text.length() - 1; j >= 0; j--) {
+                        if (Character.isLetterOrDigit(text.charAt(j))) {
+                            break;
+                        } else {
+                            text.deleteCharAt(j);
+                        }
                     }
                 }
                 separator = " ";



Mime
View raw message