lucene-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From markrmil...@apache.org
Subject svn commit: r1095375 - in /lucene/dev/branches/branch_3x: ./ lucene/ lucene/contrib/ lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/ lucene/contrib/highlighter/src/test/org/apache/lucene/search/highlight/ solr/ solr/src/java/org...
Date Wed, 20 Apr 2011 11:52:14 GMT
Author: markrmiller
Date: Wed Apr 20 11:52:13 2011
New Revision: 1095375

URL: http://svn.apache.org/viewvc?rev=1095375&view=rev
Log:
LUCENE-2939: Highlighter should try and use maxDocCharsToAnalyze in WeightedSpanTermExtractor
when adding a new field to MemoryIndex as well as when using CachingTokenStream
SOLR-2390: Performance of usePhraseHighlighter is terrible on very large Documents, regardless
of hl.maxDocCharsToAnalyze

Added:
    lucene/dev/branches/branch_3x/lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/OffsetLimitTokenFilter.java
      - copied unchanged from r1095260, lucene/dev/trunk/lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/OffsetLimitTokenFilter.java
    lucene/dev/branches/branch_3x/lucene/contrib/highlighter/src/test/org/apache/lucene/search/highlight/OffsetLimitTokenFilterTest.java
      - copied, changed from r1095260, lucene/dev/trunk/lucene/contrib/highlighter/src/test/org/apache/lucene/search/highlight/OffsetLimitTokenFilterTest.java
Modified:
    lucene/dev/branches/branch_3x/   (props changed)
    lucene/dev/branches/branch_3x/lucene/   (props changed)
    lucene/dev/branches/branch_3x/lucene/contrib/CHANGES.txt
    lucene/dev/branches/branch_3x/lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/Highlighter.java
    lucene/dev/branches/branch_3x/lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/QueryScorer.java
    lucene/dev/branches/branch_3x/lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/WeightedSpanTermExtractor.java
    lucene/dev/branches/branch_3x/solr/   (props changed)
    lucene/dev/branches/branch_3x/solr/CHANGES.txt
    lucene/dev/branches/branch_3x/solr/src/java/org/apache/solr/highlight/DefaultSolrHighlighter.java

Modified: lucene/dev/branches/branch_3x/lucene/contrib/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/contrib/CHANGES.txt?rev=1095375&r1=1095374&r2=1095375&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/contrib/CHANGES.txt (original)
+++ lucene/dev/branches/branch_3x/lucene/contrib/CHANGES.txt Wed Apr 20 11:52:13 2011
@@ -2,10 +2,15 @@ Lucene contrib change Log
 
 ======================= Lucene 3.x (not yet released) =======================
 
-Bug fixes
+Bug Fixes
 
  * LUCENE-3026: SmartChineseAnalyzer's WordTokenFilter threw NullPointerException
    on sentences longer than 32,767 characters.  (wangzhenghang via Robert Muir)
+   
+ * LUCENE-2939: Highlighter should try and use maxDocCharsToAnalyze in 
+   WeightedSpanTermExtractor when adding a new field to MemoryIndex as well as 
+   when using CachingTokenStream. This can be a significant performance bug for
+   large documents. (Mark Miller)
 
 New Features
 

Modified: lucene/dev/branches/branch_3x/lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/Highlighter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/Highlighter.java?rev=1095375&r1=1095374&r2=1095375&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/Highlighter.java
(original)
+++ lucene/dev/branches/branch_3x/lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/Highlighter.java
Wed Apr 20 11:52:13 2011
@@ -197,6 +197,11 @@ public class Highlighter
 	    tokenStream.reset();
 	    
 		TextFragment currentFrag =	new TextFragment(newText,newText.length(), docFrags.size());
+		
+    if (fragmentScorer instanceof QueryScorer) {
+      ((QueryScorer) fragmentScorer).setMaxDocCharsToAnalyze(maxDocCharsToAnalyze);
+    }
+    
 		TokenStream newStream = fragmentScorer.init(tokenStream);
 		if(newStream != null) {
 		  tokenStream = newStream;

Modified: lucene/dev/branches/branch_3x/lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/QueryScorer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/QueryScorer.java?rev=1095375&r1=1095374&r2=1095375&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/QueryScorer.java
(original)
+++ lucene/dev/branches/branch_3x/lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/QueryScorer.java
Wed Apr 20 11:52:13 2011
@@ -54,6 +54,7 @@ public class QueryScorer implements Scor
   private IndexReader reader;
   private boolean skipInitExtractor;
   private boolean wrapToCaching = true;
+  private int maxCharsToAnalyze;
 
   /**
    * @param query Query to use for highlighting
@@ -209,7 +210,7 @@ public class QueryScorer implements Scor
   private TokenStream initExtractor(TokenStream tokenStream) throws IOException {
     WeightedSpanTermExtractor qse = defaultField == null ? new WeightedSpanTermExtractor()
         : new WeightedSpanTermExtractor(defaultField);
-
+    qse.setMaxDocCharsToAnalyze(maxCharsToAnalyze);
     qse.setExpandMultiTermQuery(expandMultiTermQuery);
     qse.setWrapIfNotCachingTokenFilter(wrapToCaching);
     if (reader == null) {
@@ -265,4 +266,8 @@ public class QueryScorer implements Scor
   public void setWrapIfNotCachingTokenFilter(boolean wrap) {
     this.wrapToCaching = wrap;
   }
+
+  public void setMaxDocCharsToAnalyze(int maxDocCharsToAnalyze) {
+    this.maxCharsToAnalyze = maxDocCharsToAnalyze;
+  }
 }

Modified: lucene/dev/branches/branch_3x/lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/WeightedSpanTermExtractor.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/WeightedSpanTermExtractor.java?rev=1095375&r1=1095374&r2=1095375&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/WeightedSpanTermExtractor.java
(original)
+++ lucene/dev/branches/branch_3x/lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/WeightedSpanTermExtractor.java
Wed Apr 20 11:52:13 2011
@@ -57,6 +57,7 @@ public class WeightedSpanTermExtractor {
   private boolean expandMultiTermQuery;
   private boolean cachedTokenStream;
   private boolean wrapToCaching = true;
+  private int maxDocCharsToAnalyze;
 
   public WeightedSpanTermExtractor() {
   }
@@ -323,13 +324,13 @@ public class WeightedSpanTermExtractor {
 
   private IndexReader getReaderForField(String field) throws IOException {
     if(wrapToCaching && !cachedTokenStream && !(tokenStream instanceof CachingTokenFilter))
{
-      tokenStream = new CachingTokenFilter(tokenStream);
+      tokenStream = new CachingTokenFilter(new OffsetLimitTokenFilter(tokenStream, maxDocCharsToAnalyze));
       cachedTokenStream = true;
     }
     IndexReader reader = readers.get(field);
     if (reader == null) {
       MemoryIndex indexer = new MemoryIndex();
-      indexer.addField(field, tokenStream);
+      indexer.addField(field, new OffsetLimitTokenFilter(tokenStream, maxDocCharsToAnalyze));
       tokenStream.reset();
       IndexSearcher searcher = indexer.createSearcher();
       reader = searcher.getIndexReader();
@@ -575,4 +576,8 @@ public class WeightedSpanTermExtractor {
 
   }
 
+  protected final void setMaxDocCharsToAnalyze(int maxDocCharsToAnalyze) {
+    this.maxDocCharsToAnalyze = maxDocCharsToAnalyze;
+  }
+
 }

Copied: lucene/dev/branches/branch_3x/lucene/contrib/highlighter/src/test/org/apache/lucene/search/highlight/OffsetLimitTokenFilterTest.java
(from r1095260, lucene/dev/trunk/lucene/contrib/highlighter/src/test/org/apache/lucene/search/highlight/OffsetLimitTokenFilterTest.java)
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/contrib/highlighter/src/test/org/apache/lucene/search/highlight/OffsetLimitTokenFilterTest.java?p2=lucene/dev/branches/branch_3x/lucene/contrib/highlighter/src/test/org/apache/lucene/search/highlight/OffsetLimitTokenFilterTest.java&p1=lucene/dev/trunk/lucene/contrib/highlighter/src/test/org/apache/lucene/search/highlight/OffsetLimitTokenFilterTest.java&r1=1095260&r2=1095375&rev=1095375&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/contrib/highlighter/src/test/org/apache/lucene/search/highlight/OffsetLimitTokenFilterTest.java
(original)
+++ lucene/dev/branches/branch_3x/lucene/contrib/highlighter/src/test/org/apache/lucene/search/highlight/OffsetLimitTokenFilterTest.java
Wed Apr 20 11:52:13 2011
@@ -1 +1 @@
-package org.apache.lucene.search.highlight;

/**
 * Licensed to the Apache Software Foundation
(ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed
with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses
this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use
this file except in compliance with
 * the License.  You may obtain a copy of the License
at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable
law or agreed to in writing, software
 * distributed under the License is distributed on an
"AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
 * limitations under
the License.
 */

import java.io.Reader;
import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;

public
class OffsetLimitTokenFilterTest extends BaseTokenStreamTestCase {
  
  public void testFilter()
throws Exception {
    TokenStream stream = new MockTokenizer(new StringReader(
        "short
toolong evenmuchlongertext a ab toolong foo"),
        MockTokenizer.WHITESPACE, false);
   OffsetLimitTokenFilter filter = new OffsetLimitTokenFilter(stream, 10);
    assertTokenStreamContents(filter,
new String[] {"short", "toolong"});
    
    stream = new MockTokenizer(new StringReader(
   "short toolong evenmuchlongertext a ab toolong foo"),
    MockTokenizer.WHITESPACE, false);
   filter = new OffsetLimitTokenFilter(stream, 12);
    assertTokenStreamContents(filter,
new String[] {"short", "toolong"});
    
    stream = new MockTokenizer(new StringReader(
       "short toolong evenmuchlongertext a ab toolong foo"),
        MockTokenizer.WHITESPACE,
false);
   
  filter = new OffsetLimitTokenFilter(stream, 30);
    assertTokenStreamContents(filter, new
String[] {"short", "toolong",
        "evenmuchlongertext"});
    
    
    checkOneTermReuse(new
Analyzer() {
      
      @Override
      public TokenStream tokenStream(String fieldName,
Reader reader) {
        return new OffsetLimitTokenFilter(new MockTokenizer(reader,
    
       MockTokenizer.WHITESPACE, false), 10);
      }
    }, "llenges", "llenges");
  }
}
\ No newline at end of file
+package org.apache.lucene.search.highlight;

/**
 * Licensed to the Apache Software Foundation
(ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed
with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses
this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use
this file except in compliance with
 * the License.  You may obtain a copy of the License
at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable
law or agreed to in writing, software
 * distributed under the License is distributed on an
"AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
 * limitations under
the License.
 */

import java.io.Reader;
import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.WhitespaceAnalyzer;

import org.apache.lucene.analysis.TokenStream;
import
org.apache.lucene.util.Version;

public class OffsetLimitTokenFilterTest extends BaseTokenStreamTestCase
{
  
  public void testFilter() throws Exception {

    TokenStream stream = new WhitespaceAnalyzer(Version.LUCENE_CURRENT)
       .tokenStream("field", new StringReader(
            "short toolong evenmuchlongertext
a ab toolong foo"));
      
    OffsetLimitTokenFilter filter = new OffsetLimitTokenFilter(stream,
10);
    assertTokenStreamContents(filter, new String[] {"short", "toolong"});
    
    stream
= new WhitespaceAnalyzer(Version.LUCENE_CURRENT).tokenStream("field", new StringReader(
 
      "short toolong evenmuchlongertext a ab toolong foo"));

    filter = new OffsetLimitTokenFilter(stream,
12);
    assertTokenStreamContents(filter, new String[] {"short", "toolong"});
  
    stream
= new WhitespaceAnalyzer(Version.LUCENE_CURRENT).tokenStream(
        "field", new StringReader(
            "short toolong evenmuchlongertext
a ab toolong foo"));

    filter = new OffsetLimitTokenFilter(stream, 30);
    assertTokenStreamContents(filter,
new String[] {"short", "toolong",
        "evenmuchlongertext"});
    
    
    checkOneTermReuse(new
Analyzer() {
      
      @Override
      public TokenStream tokenStream(String fieldName,
Reader reader) {
        return new WhitespaceAnalyzer(Version.LUCENE_CURRENT).tokenStream(fieldName,
reader);
      }
    }, "llenges", "llenges");
  }
}
\ No newline at end of file

Modified: lucene/dev/branches/branch_3x/solr/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/CHANGES.txt?rev=1095375&r1=1095374&r2=1095375&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/solr/CHANGES.txt (original)
+++ lucene/dev/branches/branch_3x/solr/CHANGES.txt Wed Apr 20 11:52:13 2011
@@ -75,6 +75,9 @@ Bug Fixes
 
 * SOLR-2333: The "rename" core admin action does not persist the new name to solr.xml
   (Rasmus Hahn, Paul R. Brown via Mark Miller)
+  
+* SOLR-2390: Performance of usePhraseHighlighter is terrible on very large Documents, 
+  regardless of hl.maxDocCharsToAnalyze. (Mark Miller)
 
 Other Changes
 ----------------------

Modified: lucene/dev/branches/branch_3x/solr/src/java/org/apache/solr/highlight/DefaultSolrHighlighter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/src/java/org/apache/solr/highlight/DefaultSolrHighlighter.java?rev=1095375&r1=1095374&r2=1095375&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/solr/src/java/org/apache/solr/highlight/DefaultSolrHighlighter.java
(original)
+++ lucene/dev/branches/branch_3x/solr/src/java/org/apache/solr/highlight/DefaultSolrHighlighter.java
Wed Apr 20 11:52:13 2011
@@ -431,12 +431,20 @@ public class DefaultSolrHighlighter exte
         // fall back to analyzer
         tstream = createAnalyzerTStream(schema, fieldName, docTexts[j]);
       }
-                   
+      
+      int maxCharsToAnalyze = params.getFieldInt(fieldName,
+          HighlightParams.MAX_CHARS,
+          Highlighter.DEFAULT_MAX_CHARS_TO_ANALYZE);
+      
       Highlighter highlighter;
       if (Boolean.valueOf(req.getParams().get(HighlightParams.USE_PHRASE_HIGHLIGHTER, "true")))
{
         // TODO: this is not always necessary - eventually we would like to avoid this wrap
         //       when it is not needed.
-        tstream = new CachingTokenFilter(tstream);
+        if (maxCharsToAnalyze < 0) {
+          tstream = new CachingTokenFilter(tstream);
+        } else {
+          tstream = new CachingTokenFilter(new OffsetLimitTokenFilter(tstream, maxCharsToAnalyze));
+        }
         
         // get highlighter
         highlighter = getPhraseHighlighter(query, fieldName, req, (CachingTokenFilter) tstream);
@@ -449,9 +457,6 @@ public class DefaultSolrHighlighter exte
         highlighter = getHighlighter(query, fieldName, req);
       }
       
-      int maxCharsToAnalyze = params.getFieldInt(fieldName,
-          HighlightParams.MAX_CHARS,
-          Highlighter.DEFAULT_MAX_CHARS_TO_ANALYZE);
       if (maxCharsToAnalyze < 0) {
         highlighter.setMaxDocCharsToAnalyze(docTexts[j].length());
       } else {



Mime
View raw message