Subject: svn commit: r1458867 - in /lucene/dev/branches/branch_4x: ./ lucene/ lucene/highlighter/ lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/ lucene/highlighter/src/test/org/apache/lucene/search/postingshighlight/
Date: Wed, 20 Mar 2013 15:02:54 -0000
To: commits@lucene.apache.org
From: mikemccand@apache.org

Author: mikemccand
Date: Wed Mar 20 15:02:54 2013
New Revision: 1458867

URL: http://svn.apache.org/r1458867
Log:
LUCENE-4856: If there are no matches for a given field, return the first maxPassages sentences

Modified:
    lucene/dev/branches/branch_4x/   (props changed)
    lucene/dev/branches/branch_4x/lucene/   (props changed)
    lucene/dev/branches/branch_4x/lucene/CHANGES.txt   (contents, props changed)
    lucene/dev/branches/branch_4x/lucene/highlighter/   (props changed)
    lucene/dev/branches/branch_4x/lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/PostingsHighlighter.java
    lucene/dev/branches/branch_4x/lucene/highlighter/src/test/org/apache/lucene/search/postingshighlight/TestPostingsHighlighter.java

Modified: lucene/dev/branches/branch_4x/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/CHANGES.txt?rev=1458867&r1=1458866&r2=1458867&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/CHANGES.txt (original)
+++ lucene/dev/branches/branch_4x/lucene/CHANGES.txt Wed Mar 20 15:02:54 2013
@@ -90,6 +90,9 @@ New Features
   takes int[] docIDs instead of TopDocs.
   (Robert Muir, Mike McCandless)
 
+* LUCENE-4856: If there are no matches for a given field, return the
+  first maxPassages sentences (Robert Muir, Mike McCandless)
+
 API Changes
 
 * LUCENE-4844: removed TaxonomyReader.getParent(), you should use

Modified: lucene/dev/branches/branch_4x/lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/PostingsHighlighter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/PostingsHighlighter.java?rev=1458867&r1=1458866&r2=1458867&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/PostingsHighlighter.java (original)
+++ lucene/dev/branches/branch_4x/lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/PostingsHighlighter.java Wed Mar 20 15:02:54 2013
@@ -19,6 +19,7 @@ package org.apache.lucene.search.posting
 import java.io.IOException;
 import java.text.BreakIterator;
+import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Comparator;
 import java.util.HashMap;
@@ -32,6 +33,7 @@ import java.util.TreeSet;
 import org.apache.lucene.index.AtomicReader;
 import org.apache.lucene.index.AtomicReaderContext;
 import org.apache.lucene.index.DocsAndPositionsEnum;
+import org.apache.lucene.index.FieldInfo.IndexOptions;
 import org.apache.lucene.index.FieldInfo;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.IndexReaderContext;
@@ -41,7 +43,6 @@ import org.apache.lucene.index.StoredFie
 import org.apache.lucene.index.Term;
 import org.apache.lucene.index.Terms;
 import org.apache.lucene.index.TermsEnum;
-import org.apache.lucene.index.FieldInfo.IndexOptions;
 import org.apache.lucene.search.IndexSearcher;
 import org.apache.lucene.search.Query;
 import org.apache.lucene.search.ScoreDoc;
@@ -142,7 +143,7 @@ public class PostingsHighlighter {
     this.scorer = scorer;
     this.formatter = formatter;
   }
-  
+
   /**
    * Highlights the top passages from a single field.
    *
@@ -152,7 +153,8 @@ public class PostingsHighlighter {
    * @param searcher searcher that was previously used to execute the query.
    * @param topDocs TopDocs containing the summary result documents to highlight.
    * @return Array of formatted snippets corresponding to the documents in topDocs.
-   *         If no highlights were found for a document, its value is null.
+   *         If no highlights were found for a document, the
+   *         first sentence for the field will be returned.
    * @throws IOException if an I/O error occurred during processing
    * @throws IllegalArgumentException if field was indexed without
    *         {@link IndexOptions#DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS}
@@ -172,7 +174,9 @@ public class PostingsHighlighter {
    * @param maxPassages The maximum number of top-N ranked passages used to
    *        form the highlighted snippets.
    * @return Array of formatted snippets corresponding to the documents in topDocs.
-   *         If no highlights were found for a document, its value is null.
+   *         If no highlights were found for a document, the
+   *         first {@code maxPassages} sentences from the
+   *         field will be returned.
    * @throws IOException if an I/O error occurred during processing
    * @throws IllegalArgumentException if field was indexed without
    *         {@link IndexOptions#DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS}
@@ -201,7 +205,8 @@ public class PostingsHighlighter {
    * @param topDocs TopDocs containing the summary result documents to highlight.
    * @return Map keyed on field name, containing the array of formatted snippets
    *         corresponding to the documents in topDocs.
-   *         If no highlights were found for a document, its value is null.
+   *         If no highlights were found for a document, the
+   *         first sentence from the field will be returned.
    * @throws IOException if an I/O error occurred during processing
    * @throws IllegalArgumentException if field was indexed without
    *         {@link IndexOptions#DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS}
@@ -231,7 +236,9 @@ public class PostingsHighlighter {
    *        form the highlighted snippets.
    * @return Map keyed on field name, containing the array of formatted snippets
    *         corresponding to the documents in topDocs.
-   *         If no highlights were found for a document, its value is null.
+   *         If no highlights were found for a document, the
+   *         first {@code maxPassages} sentences from the
+   *         field will be returned.
    * @throws IOException if an I/O error occurred during processing
    * @throws IllegalArgumentException if field was indexed without
    *         {@link IndexOptions#DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS}
@@ -259,7 +266,9 @@ public class PostingsHighlighter {
    *        form the highlighted snippets.
    * @return Map keyed on field name, containing the array of formatted snippets
    *         corresponding to the documents in topDocs.
-   *         If no highlights were found for a document, its value is null.
+   *         If no highlights were found for a document, the
+   *         first {@code maxPassages} sentences from the field will
+   *         be returned.
    * @throws IOException if an I/O error occurred during processing
    * @throws IllegalArgumentException if field was indexed without
    *         {@link IndexOptions#DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS}
@@ -327,7 +336,7 @@ public class PostingsHighlighter {
     DocsAndPositionsEnum postings[] = null;
     TermsEnum termsEnum = null;
     int lastLeaf = -1;
-    
+
     for (int i = 0; i < docids.length; i++) {
       String content = contents[i];
       if (content.length() == 0) {
@@ -347,8 +356,12 @@ public class PostingsHighlighter {
         postings = new DocsAndPositionsEnum[terms.length];
       }
       Passage passages[] = highlightDoc(field, terms, content.length(), bi, doc - subContext.docBase, termsEnum, postings, maxPassages);
+      if (passages.length == 0) {
+        passages = getEmptyHighlight(field, bi, maxPassages);
+      }
       if (passages.length > 0) {
-        // otherwise a null snippet
+        // otherwise a null snippet (eg if field is missing
+        // entirely from the doc)
         highlights.put(doc, formatter.format(passages, content));
       }
       lastLeaf = leaf;
@@ -476,7 +489,35 @@ public class PostingsHighlighter {
       }
       current.score += weights[off.id] * scorer.tf(tf, current.endOffset - current.startOffset);
     }
-    return new Passage[0];
+
+    // Dead code but compiler disagrees:
+    assert false;
+    return null;
+  }
+
+  /** Called to summarize a document when no hits were
+   *  found. By default this just returns the first
+   *  {@code maxPassages} sentences; subclasses can override
+   *  to customize. */
+  protected Passage[] getEmptyHighlight(String fieldName, BreakIterator bi, int maxPassages) {
+    // BreakIterator should be un-next'd:
+    List<Passage> passages = new ArrayList<Passage>();
+    int pos = bi.current();
+    assert pos == 0;
+    while (passages.size() < maxPassages) {
+      int next = bi.next();
+      if (next == BreakIterator.DONE) {
+        break;
+      }
+      Passage passage = new Passage();
+      passage.score = Float.NaN;
+      passage.startOffset = pos;
+      passage.endOffset = next;
+      passages.add(passage);
+      pos = next;
+    }
+
+    return passages.toArray(new Passage[passages.size()]);
   }
 
   private static class OffsetsEnum implements Comparable<OffsetsEnum> {

Modified: lucene/dev/branches/branch_4x/lucene/highlighter/src/test/org/apache/lucene/search/postingshighlight/TestPostingsHighlighter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/highlighter/src/test/org/apache/lucene/search/postingshighlight/TestPostingsHighlighter.java?rev=1458867&r1=1458866&r2=1458867&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/highlighter/src/test/org/apache/lucene/search/postingshighlight/TestPostingsHighlighter.java (original)
+++ lucene/dev/branches/branch_4x/lucene/highlighter/src/test/org/apache/lucene/search/postingshighlight/TestPostingsHighlighter.java Wed Mar 20 15:02:54 2013
@@ -20,6 +20,7 @@ package org.apache.lucene.search.posting
 import java.io.BufferedReader;
 import java.io.IOException;
 import java.io.InputStreamReader;
+import java.text.BreakIterator;
 import java.util.Map;
 
 import org.apache.lucene.analysis.Analyzer;
@@ -373,7 +374,6 @@ public class TestPostingsHighlighter ext
     assertEquals(1, snippets.length);
     assertTrue(snippets[0].contains("Square"));
     assertTrue(snippets[0].contains("Porter"));
-    //System.out.println("GOT: " + snippets.length + "; " + Arrays.toString(snippets));
     ir.close();
     dir.close();
   }
@@ -547,4 +547,205 @@
     ir.close();
     dir.close();
   }
+
+  /** Make sure highlighter returns first N sentences if
+   *  there were no hits. */
+  public void testEmptyHighlights() throws Exception {
+    Directory dir = newDirectory();
+    IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random()));
+    iwc.setMergePolicy(newLogMergePolicy());
+    RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
+
+    FieldType offsetsType = new FieldType(TextField.TYPE_STORED);
+    offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
+    Document doc = new Document();
+
+    Field body = new Field("body", "test this is. another sentence this test has. far away is that planet.", offsetsType);
+    doc.add(body);
+    iw.addDocument(doc);
+
+    IndexReader ir = iw.getReader();
+    iw.close();
+
+    IndexSearcher searcher = newSearcher(ir);
+    PostingsHighlighter highlighter = new PostingsHighlighter();
+    Query query = new TermQuery(new Term("body", "highlighting"));
+    int[] docIDs = new int[] {0};
+    String snippets[] = highlighter.highlightFields(new String[] {"body"}, query, searcher, docIDs, 2).get("body");
+    assertEquals(1, snippets.length);
+    assertEquals("test this is. another sentence this test has. ", snippets[0]);
+
+    ir.close();
+    dir.close();
+  }
+
+  /** Make sure we can customize how an empty
+   *  highlight is returned. */
+  public void testCustomEmptyHighlights() throws Exception {
+    Directory dir = newDirectory();
+    IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random()));
+    iwc.setMergePolicy(newLogMergePolicy());
+    RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
+
+    FieldType offsetsType = new FieldType(TextField.TYPE_STORED);
+    offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
+    Document doc = new Document();
+
+    Field body = new Field("body", "test this is. another sentence this test has. far away is that planet.", offsetsType);
+    doc.add(body);
+    iw.addDocument(doc);
+
+    IndexReader ir = iw.getReader();
+    iw.close();
+
+    IndexSearcher searcher = newSearcher(ir);
+    PostingsHighlighter highlighter = new PostingsHighlighter() {
+      @Override
+      public Passage[] getEmptyHighlight(String fieldName, BreakIterator bi, int maxPassages) {
+        return new Passage[0];
+      }
+    };
+    Query query = new TermQuery(new Term("body", "highlighting"));
+    int[] docIDs = new int[] {0};
+    String snippets[] = highlighter.highlightFields(new String[] {"body"}, query, searcher, docIDs, 2).get("body");
+    assertEquals(1, snippets.length);
+    assertNull(snippets[0]);
+
+    ir.close();
+    dir.close();
+  }
+
+  /** Make sure highlighter returns whole text when there
+   *  are no hits and BreakIterator is null. */
+  public void testEmptyHighlightsWhole() throws Exception {
+    Directory dir = newDirectory();
+    IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random()));
+    iwc.setMergePolicy(newLogMergePolicy());
+    RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
+
+    FieldType offsetsType = new FieldType(TextField.TYPE_STORED);
+    offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
+    Document doc = new Document();
+
+    Field body = new Field("body", "test this is. another sentence this test has. far away is that planet.", offsetsType);
+    doc.add(body);
+    iw.addDocument(doc);
+
+    IndexReader ir = iw.getReader();
+    iw.close();
+
+    IndexSearcher searcher = newSearcher(ir);
+    PostingsHighlighter highlighter = new PostingsHighlighter(10000, null, new PassageScorer(), new PassageFormatter());
+    Query query = new TermQuery(new Term("body", "highlighting"));
+    int[] docIDs = new int[] {0};
+    String snippets[] = highlighter.highlightFields(new String[] {"body"}, query, searcher, docIDs, 2).get("body");
+    assertEquals(1, snippets.length);
+    assertEquals("test this is. another sentence this test has. far away is that planet.", snippets[0]);
+
+    ir.close();
+    dir.close();
+  }
+
+  /** Make sure highlighter is OK with entirely missing
+   *  field. */
+  public void testFieldIsMissing() throws Exception {
+    Directory dir = newDirectory();
+    IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random()));
+    iwc.setMergePolicy(newLogMergePolicy());
+    RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
+
+    FieldType offsetsType = new FieldType(TextField.TYPE_STORED);
+    offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
+    Document doc = new Document();
+
+    Field body = new Field("body", "test this is. another sentence this test has. far away is that planet.", offsetsType);
+    doc.add(body);
+    iw.addDocument(doc);
+
+    IndexReader ir = iw.getReader();
+    iw.close();
+
+    IndexSearcher searcher = newSearcher(ir);
+    PostingsHighlighter highlighter = new PostingsHighlighter();
+    Query query = new TermQuery(new Term("bogus", "highlighting"));
+    int[] docIDs = new int[] {0};
+    String snippets[] = highlighter.highlightFields(new String[] {"bogus"}, query, searcher, docIDs, 2).get("bogus");
+    assertEquals(1, snippets.length);
+    assertNull(snippets[0]);
+
+    ir.close();
+    dir.close();
+  }
+
+  public void testFieldIsJustSpace() throws Exception {
+    Directory dir = newDirectory();
+    IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random()));
+    iwc.setMergePolicy(newLogMergePolicy());
+    RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
+
+    FieldType offsetsType = new FieldType(TextField.TYPE_STORED);
+    offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
+
+    Document doc = new Document();
+    doc.add(new Field("body", " ", offsetsType));
+    doc.add(new Field("id", "id", offsetsType));
+    iw.addDocument(doc);
+
+    doc = new Document();
+    doc.add(new Field("body", "something", offsetsType));
+    iw.addDocument(doc);
+
+    IndexReader ir = iw.getReader();
+    iw.close();
+
+    IndexSearcher searcher = newSearcher(ir);
+    PostingsHighlighter highlighter = new PostingsHighlighter();
+    int docID = searcher.search(new TermQuery(new Term("id", "id")), 1).scoreDocs[0].doc;
+
+    Query query = new TermQuery(new Term("body", "highlighting"));
+    int[] docIDs = new int[1];
+    docIDs[0] = docID;
+    String snippets[] = highlighter.highlightFields(new String[] {"body"}, query, searcher, docIDs, 2).get("body");
+    assertEquals(1, snippets.length);
+    assertEquals(" ", snippets[0]);
+
+    ir.close();
+    dir.close();
+  }
+
+  public void testFieldIsEmptyString() throws Exception {
+    Directory dir = newDirectory();
+    IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random()));
+    iwc.setMergePolicy(newLogMergePolicy());
+    RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
+
+    FieldType offsetsType = new FieldType(TextField.TYPE_STORED);
+    offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
+
+    Document doc = new Document();
+    doc.add(new Field("body", "", offsetsType));
+    doc.add(new Field("id", "id", offsetsType));
+    iw.addDocument(doc);
+
+    doc = new Document();
+    doc.add(new Field("body", "something", offsetsType));
+    iw.addDocument(doc);
+
+    IndexReader ir = iw.getReader();
+    iw.close();
+
+    IndexSearcher searcher = newSearcher(ir);
+    PostingsHighlighter highlighter = new PostingsHighlighter();
+    int docID = searcher.search(new TermQuery(new Term("id", "id")), 1).scoreDocs[0].doc;
+
+    Query query = new TermQuery(new Term("body", "highlighting"));
+    int[] docIDs = new int[1];
+    docIDs[0] = docID;
+    String snippets[] = highlighter.highlightFields(new String[] {"body"}, query, searcher, docIDs, 2).get("body");
+    assertEquals(1, snippets.length);
+    assertNull(snippets[0]);
+
+    ir.close();
+    dir.close();
+  }
 }
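
Below is a small standalone usage sketch (not part of r1458867) of how the LUCENE-4856 change looks from the caller's side: the query matches nothing in the document, so highlightFields() returns the first maxPassages sentences of the field instead of a null snippet. The highlightFields(String[], Query, IndexSearcher, int[], int) call mirrors the tests above; the class name, RAMDirectory, StandardAnalyzer and Version.LUCENE_42 are illustrative assumptions and are not taken from the commit.

// Illustrative sketch only; assumes a Lucene 4.x (branch_4x-era) classpath.
import java.util.Map;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.FieldInfo.IndexOptions;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.postingshighlight.PostingsHighlighter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;

public class EmptyHighlightSketch {
  public static void main(String[] args) throws Exception {
    Directory dir = new RAMDirectory();
    IndexWriter iw = new IndexWriter(dir,
        new IndexWriterConfig(Version.LUCENE_42, new StandardAnalyzer(Version.LUCENE_42)));

    // PostingsHighlighter requires the field to be indexed with offsets.
    FieldType offsetsType = new FieldType(TextField.TYPE_STORED);
    offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);

    Document doc = new Document();
    doc.add(new Field("body",
        "test this is. another sentence this test has. far away is that planet.", offsetsType));
    iw.addDocument(doc);
    iw.close();

    IndexReader ir = DirectoryReader.open(dir);
    IndexSearcher searcher = new IndexSearcher(ir);

    // "highlighting" does not occur in the document, so there is nothing to highlight.
    Query query = new TermQuery(new Term("body", "highlighting"));

    PostingsHighlighter highlighter = new PostingsHighlighter();
    int[] docIDs = new int[] {0};  // the single document added above
    Map<String,String[]> snippets =
        highlighter.highlightFields(new String[] {"body"}, query, searcher, docIDs, 2);

    // With LUCENE-4856 this prints the first two sentences of the field
    // ("test this is. another sentence this test has. ") rather than null.
    System.out.println(snippets.get("body")[0]);

    ir.close();
    dir.close();
  }
}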