Subject: svn commit: r1458867 - in /lucene/dev/branches/branch_4x: ./ lucene/ lucene/highlighter/ lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/ lucene/highlighter/src/test/org/apache/lucene/search/postingshighlight/
Date: Wed, 20 Mar 2013 15:02:54 -0000
To: commits@lucene.apache.org
From: mikemccand@apache.org

Author: mikemccand
Date: Wed Mar 20 15:02:54 2013
New Revision: 1458867

URL: http://svn.apache.org/r1458867
Log:
LUCENE-4856: If there are no matches for a given field, return the first maxPassages sentences

Modified:
    lucene/dev/branches/branch_4x/   (props changed)
    lucene/dev/branches/branch_4x/lucene/   (props changed)
    lucene/dev/branches/branch_4x/lucene/CHANGES.txt   (contents, props changed)
    lucene/dev/branches/branch_4x/lucene/highlighter/   (props changed)
    lucene/dev/branches/branch_4x/lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/PostingsHighlighter.java
    lucene/dev/branches/branch_4x/lucene/highlighter/src/test/org/apache/lucene/search/postingshighlight/TestPostingsHighlighter.java

Modified: lucene/dev/branches/branch_4x/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/CHANGES.txt?rev=1458867&r1=1458866&r2=1458867&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/CHANGES.txt (original)
+++ lucene/dev/branches/branch_4x/lucene/CHANGES.txt Wed Mar 20 15:02:54 2013
@@ -90,6 +90,9 @@ New Features
   takes int[] docIDs instead of TopDocs.
   (Robert Muir, Mike McCandless)
 
+* LUCENE-4856: If there are no matches for a given field, return the
+  first maxPassages sentences (Robert Muir, Mike McCandless)
+
 API Changes
 
 * LUCENE-4844: removed TaxonomyReader.getParent(), you should use

Modified: lucene/dev/branches/branch_4x/lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/PostingsHighlighter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/PostingsHighlighter.java?rev=1458867&r1=1458866&r2=1458867&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/PostingsHighlighter.java (original)
+++ lucene/dev/branches/branch_4x/lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/PostingsHighlighter.java Wed Mar 20 15:02:54 2013
@@ -19,6 +19,7 @@ package org.apache.lucene.search.posting
 import java.io.IOException;
 import java.text.BreakIterator;
+import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Comparator;
 import java.util.HashMap;
@@ -32,6 +33,7 @@ import java.util.TreeSet;
 import org.apache.lucene.index.AtomicReader;
 import org.apache.lucene.index.AtomicReaderContext;
 import org.apache.lucene.index.DocsAndPositionsEnum;
+import org.apache.lucene.index.FieldInfo.IndexOptions;
 import org.apache.lucene.index.FieldInfo;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.IndexReaderContext;
@@ -41,7 +43,6 @@ import org.apache.lucene.index.StoredFie
 import org.apache.lucene.index.Term;
 import org.apache.lucene.index.Terms;
 import org.apache.lucene.index.TermsEnum;
-import org.apache.lucene.index.FieldInfo.IndexOptions;
 import org.apache.lucene.search.IndexSearcher;
 import org.apache.lucene.search.Query;
 import org.apache.lucene.search.ScoreDoc;
@@ -142,7 +143,7 @@ public class PostingsHighlighter {
     this.scorer = scorer;
     this.formatter = formatter;
   }
-  
+
   /**
    * Highlights the top passages from a single field.
    *
@@ -152,7 +153,8 @@ public class PostingsHighlighter {
    * @param searcher searcher that was previously used to execute the query.
    * @param topDocs TopDocs containing the summary result documents to highlight.
    * @return Array of formatted snippets corresponding to the documents in topDocs.
-   *         If no highlights were found for a document, its value is null.
+   *         If no highlights were found for a document, the
+   *         first sentence for the field will be returned.
    * @throws IOException if an I/O error occurred during processing
    * @throws IllegalArgumentException if field was indexed without
    *         {@link IndexOptions#DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS}
@@ -172,7 +174,9 @@ public class PostingsHighlighter {
    * @param maxPassages The maximum number of top-N ranked passages used to
    *        form the highlighted snippets.
    * @return Array of formatted snippets corresponding to the documents in topDocs.
-   *         If no highlights were found for a document, its value is null.
+   *         If no highlights were found for a document, the
+   *         first {@code maxPassages} sentences from the
+   *         field will be returned.
    * @throws IOException if an I/O error occurred during processing
    * @throws IllegalArgumentException if field was indexed without
    *         {@link IndexOptions#DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS}
@@ -201,7 +205,8 @@ public class PostingsHighlighter {
    * @param topDocs TopDocs containing the summary result documents to highlight.
    * @return Map keyed on field name, containing the array of formatted snippets
    *         corresponding to the documents in topDocs.
-   *         If no highlights were found for a document, its value is null.
+   *         If no highlights were found for a document, the
+   *         first sentence from the field will be returned.
    * @throws IOException if an I/O error occurred during processing
    * @throws IllegalArgumentException if field was indexed without
    *         {@link IndexOptions#DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS}
@@ -231,7 +236,9 @@ public class PostingsHighlighter {
    *        form the highlighted snippets.
    * @return Map keyed on field name, containing the array of formatted snippets
    *         corresponding to the documents in topDocs.
-   *         If no highlights were found for a document, its value is null.
+   *         If no highlights were found for a document, the
+   *         first {@code maxPassages} sentences from the
+   *         field will be returned.
    * @throws IOException if an I/O error occurred during processing
    * @throws IllegalArgumentException if field was indexed without
    *         {@link IndexOptions#DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS}
@@ -259,7 +266,9 @@ public class PostingsHighlighter {
    *        form the highlighted snippets.
    * @return Map keyed on field name, containing the array of formatted snippets
    *         corresponding to the documents in topDocs.
-   *         If no highlights were found for a document, its value is null.
+   *         If no highlights were found for a document, the
+   *         first {@code maxPassages} sentences from the field will
+   *         be returned.
    * @throws IOException if an I/O error occurred during processing
    * @throws IllegalArgumentException if field was indexed without
    *         {@link IndexOptions#DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS}
@@ -327,7 +336,7 @@ public class PostingsHighlighter {
     DocsAndPositionsEnum postings[] = null;
     TermsEnum termsEnum = null;
     int lastLeaf = -1;
-    
+
     for (int i = 0; i < docids.length; i++) {
       String content = contents[i];
       if (content.length() == 0) {
@@ -347,8 +356,12 @@ public class PostingsHighlighter {
         postings = new DocsAndPositionsEnum[terms.length];
       }
       Passage passages[] = highlightDoc(field, terms, content.length(), bi, doc - subContext.docBase, termsEnum, postings, maxPassages);
+      if (passages.length == 0) {
+        passages = getEmptyHighlight(field, bi, maxPassages);
+      }
       if (passages.length > 0) {
-        // otherwise a null snippet
+        // otherwise a null snippet (eg if field is missing
+        // entirely from the doc)
         highlights.put(doc, formatter.format(passages, content));
       }
       lastLeaf = leaf;
@@ -476,7 +489,35 @@ public class PostingsHighlighter {
       }
       current.score += weights[off.id] * scorer.tf(tf, current.endOffset - current.startOffset);
     }
-    return new Passage[0];
+
+    // Dead code but compiler disagrees:
+    assert false;
+    return null;
+  }
+
+  /** Called to summarize a document when no hits were
+   *  found. By default this just returns the first
+   *  {@code maxPassages} sentences; subclasses can override
+   *  to customize. */
+  protected Passage[] getEmptyHighlight(String fieldName, BreakIterator bi, int maxPassages) {
+    // BreakIterator should be un-next'd:
+    List<Passage> passages = new ArrayList<Passage>();
+    int pos = bi.current();
+    assert pos == 0;
+    while (passages.size() < maxPassages) {
+      int next = bi.next();
+      if (next == BreakIterator.DONE) {
+        break;
+      }
+      Passage passage = new Passage();
+      passage.score = Float.NaN;
+      passage.startOffset = pos;
+      passage.endOffset = next;
+      passages.add(passage);
+      pos = next;
+    }
+
+    return passages.toArray(new Passage[passages.size()]);
   }
 
   private static class OffsetsEnum implements Comparable<OffsetsEnum> {

Modified: lucene/dev/branches/branch_4x/lucene/highlighter/src/test/org/apache/lucene/search/postingshighlight/TestPostingsHighlighter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/highlighter/src/test/org/apache/lucene/search/postingshighlight/TestPostingsHighlighter.java?rev=1458867&r1=1458866&r2=1458867&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/highlighter/src/test/org/apache/lucene/search/postingshighlight/TestPostingsHighlighter.java (original)
+++ lucene/dev/branches/branch_4x/lucene/highlighter/src/test/org/apache/lucene/search/postingshighlight/TestPostingsHighlighter.java Wed Mar 20 15:02:54 2013
@@ -20,6 +20,7 @@ package org.apache.lucene.search.posting
 import java.io.BufferedReader;
 import java.io.IOException;
 import java.io.InputStreamReader;
+import java.text.BreakIterator;
 import java.util.Map;
 
 import org.apache.lucene.analysis.Analyzer;
@@ -373,7 +374,6 @@ public class TestPostingsHighlighter ext
     assertEquals(1, snippets.length);
     assertTrue(snippets[0].contains("Square"));
     assertTrue(snippets[0].contains("Porter"));
-    //System.out.println("GOT: " + snippets.length + "; " + Arrays.toString(snippets));
     ir.close();
     dir.close();
   }
@@ -547,4 +547,205 @@
     ir.close();
     dir.close();
   }
+
+  /** Make sure highlighter returns first N sentences if
+   *  there were no hits. */
+  public void testEmptyHighlights() throws Exception {
+    Directory dir = newDirectory();
+    IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random()));
+    iwc.setMergePolicy(newLogMergePolicy());
+    RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
+
+    FieldType offsetsType = new FieldType(TextField.TYPE_STORED);
+    offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
+    Document doc = new Document();
+
+    Field body = new Field("body", "test this is. another sentence this test has. far away is that planet.", offsetsType);
+    doc.add(body);
+    iw.addDocument(doc);
+
+    IndexReader ir = iw.getReader();
+    iw.close();
+
+    IndexSearcher searcher = newSearcher(ir);
+    PostingsHighlighter highlighter = new PostingsHighlighter();
+    Query query = new TermQuery(new Term("body", "highlighting"));
+    int[] docIDs = new int[] {0};
+    String snippets[] = highlighter.highlightFields(new String[] {"body"}, query, searcher, docIDs, 2).get("body");
+    assertEquals(1, snippets.length);
+    assertEquals("test this is. another sentence this test has. ", snippets[0]);
+
+    ir.close();
+    dir.close();
+  }
+
+  /** Make sure we can customize how an empty
+   *  highlight is returned. */
+  public void testCustomEmptyHighlights() throws Exception {
+    Directory dir = newDirectory();
+    IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random()));
+    iwc.setMergePolicy(newLogMergePolicy());
+    RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
+
+    FieldType offsetsType = new FieldType(TextField.TYPE_STORED);
+    offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
+    Document doc = new Document();
+
+    Field body = new Field("body", "test this is. another sentence this test has. far away is that planet.", offsetsType);
+    doc.add(body);
+    iw.addDocument(doc);
+
+    IndexReader ir = iw.getReader();
+    iw.close();
+
+    IndexSearcher searcher = newSearcher(ir);
+    PostingsHighlighter highlighter = new PostingsHighlighter() {
+      @Override
+      public Passage[] getEmptyHighlight(String fieldName, BreakIterator bi, int maxPassages) {
+        return new Passage[0];
+      }
+    };
+    Query query = new TermQuery(new Term("body", "highlighting"));
+    int[] docIDs = new int[] {0};
+    String snippets[] = highlighter.highlightFields(new String[] {"body"}, query, searcher, docIDs, 2).get("body");
+    assertEquals(1, snippets.length);
+    assertNull(snippets[0]);
+
+    ir.close();
+    dir.close();
+  }
+
+  /** Make sure highlighter returns whole text when there
+   *  are no hits and BreakIterator is null. */
+  public void testEmptyHighlightsWhole() throws Exception {
+    Directory dir = newDirectory();
+    IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random()));
+    iwc.setMergePolicy(newLogMergePolicy());
+    RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
+
+    FieldType offsetsType = new FieldType(TextField.TYPE_STORED);
+    offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
+    Document doc = new Document();
+
+    Field body = new Field("body", "test this is. another sentence this test has. far away is that planet.", offsetsType);
+    doc.add(body);
+    iw.addDocument(doc);
+
+    IndexReader ir = iw.getReader();
+    iw.close();
+
+    IndexSearcher searcher = newSearcher(ir);
+    PostingsHighlighter highlighter = new PostingsHighlighter(10000, null, new PassageScorer(), new PassageFormatter());
+    Query query = new TermQuery(new Term("body", "highlighting"));
+    int[] docIDs = new int[] {0};
+    String snippets[] = highlighter.highlightFields(new String[] {"body"}, query, searcher, docIDs, 2).get("body");
+    assertEquals(1, snippets.length);
+    assertEquals("test this is. another sentence this test has. far away is that planet.", snippets[0]);
+
+    ir.close();
+    dir.close();
+  }
+
+  /** Make sure highlighter is OK with entirely missing
+   *  field. */
+  public void testFieldIsMissing() throws Exception {
+    Directory dir = newDirectory();
+    IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random()));
+    iwc.setMergePolicy(newLogMergePolicy());
+    RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
+
+    FieldType offsetsType = new FieldType(TextField.TYPE_STORED);
+    offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
+    Document doc = new Document();
+
+    Field body = new Field("body", "test this is. another sentence this test has. far away is that planet.", offsetsType);
+    doc.add(body);
+    iw.addDocument(doc);
+
+    IndexReader ir = iw.getReader();
+    iw.close();
+
+    IndexSearcher searcher = newSearcher(ir);
+    PostingsHighlighter highlighter = new PostingsHighlighter();
+    Query query = new TermQuery(new Term("bogus", "highlighting"));
+    int[] docIDs = new int[] {0};
+    String snippets[] = highlighter.highlightFields(new String[] {"bogus"}, query, searcher, docIDs, 2).get("bogus");
+    assertEquals(1, snippets.length);
+    assertNull(snippets[0]);
+
+    ir.close();
+    dir.close();
+  }
+
+  public void testFieldIsJustSpace() throws Exception {
+    Directory dir = newDirectory();
+    IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random()));
+    iwc.setMergePolicy(newLogMergePolicy());
+    RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
+
+    FieldType offsetsType = new FieldType(TextField.TYPE_STORED);
+    offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
+
+    Document doc = new Document();
+    doc.add(new Field("body", " ", offsetsType));
+    doc.add(new Field("id", "id", offsetsType));
+    iw.addDocument(doc);
+
+    doc = new Document();
+    doc.add(new Field("body", "something", offsetsType));
+    iw.addDocument(doc);
+
+    IndexReader ir = iw.getReader();
+    iw.close();
+
+    IndexSearcher searcher = newSearcher(ir);
+    PostingsHighlighter highlighter = new PostingsHighlighter();
+    int docID = searcher.search(new TermQuery(new Term("id", "id")), 1).scoreDocs[0].doc;
+
+    Query query = new TermQuery(new Term("body", "highlighting"));
+    int[] docIDs = new int[1];
+    docIDs[0] = docID;
+    String snippets[] = highlighter.highlightFields(new String[] {"body"}, query, searcher, docIDs, 2).get("body");
+    assertEquals(1, snippets.length);
+    assertEquals(" ", snippets[0]);
+
+    ir.close();
+    dir.close();
+  }
+
+  public void testFieldIsEmptyString() throws Exception {
+    Directory dir = newDirectory();
+    IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random()));
+    iwc.setMergePolicy(newLogMergePolicy());
+    RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
+
+    FieldType offsetsType = new FieldType(TextField.TYPE_STORED);
+    offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
+
+    Document doc = new Document();
+    doc.add(new Field("body", "", offsetsType));
+    doc.add(new Field("id", "id", offsetsType));
+    iw.addDocument(doc);
+
+    doc = new Document();
+    doc.add(new Field("body", "something", offsetsType));
+    iw.addDocument(doc);
+
+    IndexReader ir = iw.getReader();
+    iw.close();
+
+    IndexSearcher searcher = newSearcher(ir);
+    PostingsHighlighter highlighter = new PostingsHighlighter();
+    int docID = searcher.search(new TermQuery(new Term("id", "id")), 1).scoreDocs[0].doc;
+
+    Query query = new TermQuery(new Term("body", "highlighting"));
+    int[] docIDs = new int[1];
+    docIDs[0] = docID;
+    String snippets[] = highlighter.highlightFields(new String[] {"body"}, query, searcher, docIDs, 2).get("body");
+    assertEquals(1, snippets.length);
+    assertNull(snippets[0]);
+
+    ir.close();
+    dir.close();
+  }
 }
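
Below is a small standalone usage sketch (not part of r1458867) of how the LUCENE-4856 change looks from the caller's side: the query matches nothing in the document, so highlightFields() returns the first maxPassages sentences of the field instead of a null snippet. The highlightFields(String[], Query, IndexSearcher, int[], int) call mirrors the tests above; the class name, RAMDirectory, StandardAnalyzer and Version.LUCENE_42 are illustrative assumptions and are not taken from the commit.

// Illustrative sketch only; assumes a Lucene 4.x (branch_4x-era) classpath.
import java.util.Map;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.FieldInfo.IndexOptions;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.postingshighlight.PostingsHighlighter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;

public class EmptyHighlightSketch {
  public static void main(String[] args) throws Exception {
    Directory dir = new RAMDirectory();
    IndexWriter iw = new IndexWriter(dir,
        new IndexWriterConfig(Version.LUCENE_42, new StandardAnalyzer(Version.LUCENE_42)));

    // PostingsHighlighter requires the field to be indexed with offsets.
    FieldType offsetsType = new FieldType(TextField.TYPE_STORED);
    offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);

    Document doc = new Document();
    doc.add(new Field("body",
        "test this is. another sentence this test has. far away is that planet.", offsetsType));
    iw.addDocument(doc);
    iw.close();

    IndexReader ir = DirectoryReader.open(dir);
    IndexSearcher searcher = new IndexSearcher(ir);

    // "highlighting" does not occur in the document, so there is nothing to highlight.
    Query query = new TermQuery(new Term("body", "highlighting"));

    PostingsHighlighter highlighter = new PostingsHighlighter();
    int[] docIDs = new int[] {0};  // the single document added above
    Map<String,String[]> snippets =
        highlighter.highlightFields(new String[] {"body"}, query, searcher, docIDs, 2);

    // With LUCENE-4856 this prints the first two sentences of the field
    // ("test this is. another sentence this test has. ") rather than null.
    System.out.println(snippets.get("body")[0]);

    ir.close();
    dir.close();
  }
}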