From: mikemccand@apache.org
To: commits@lucene.apache.org
Subject: svn commit: r1459817 - in /lucene/dev/branches/branch_4x: ./ lucene/ lucene/highlighter/ lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/ lucene/highlighter/src/test/org/apache/lucene/search/postingshighlight/ solr/ solr/core/ so...
Date: Fri, 22 Mar 2013 14:51:44 -0000

Author: mikemccand
Date: Fri Mar 22 14:51:44 2013
New Revision: 1459817

URL: http://svn.apache.org/r1459817
Log:
LUCENE-4860: per-field control over scoring and formatting

Modified:
    lucene/dev/branches/branch_4x/   (props changed)
    lucene/dev/branches/branch_4x/lucene/   (props changed)
    lucene/dev/branches/branch_4x/lucene/CHANGES.txt   (contents, props changed)
    lucene/dev/branches/branch_4x/lucene/highlighter/   (props changed)
    lucene/dev/branches/branch_4x/lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/Passage.java
    lucene/dev/branches/branch_4x/lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/PostingsHighlighter.java
    lucene/dev/branches/branch_4x/lucene/highlighter/src/test/org/apache/lucene/search/postingshighlight/TestPostingsHighlighter.java
    lucene/dev/branches/branch_4x/lucene/highlighter/src/test/org/apache/lucene/search/postingshighlight/TestPostingsHighlighterRanking.java
    lucene/dev/branches/branch_4x/solr/   (props changed)
    lucene/dev/branches/branch_4x/solr/core/   (props changed)
    lucene/dev/branches/branch_4x/solr/core/src/java/org/apache/solr/highlight/PostingsSolrHighlighter.java

Modified: lucene/dev/branches/branch_4x/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/CHANGES.txt?rev=1459817&r1=1459816&r2=1459817&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/CHANGES.txt (original)
+++ lucene/dev/branches/branch_4x/lucene/CHANGES.txt Fri Mar 22 14:51:44 2013
@@ -103,6 +103,12 @@ New Features
 
 * LUCENE-4752: New SortingMergePolicy (in lucene/misc) that sorts
   documents before merging segments.
   (Adrien Grand, Shai Erera, David Smiley)
 
+* LUCENE-4860: Customize scoring and formatting per-field in
+  PostingsHighlighter by subclassing and overriding the getFormatter
+  and/or getScorer methods. This also changes Passage.getMatchTerms()
+  to return BytesRef[] instead of Term[]. (Robert Muir, Mike
+  McCandless)
+
 API Changes
 
 * LUCENE-4844: removed TaxonomyReader.getParent(), you should use

Modified: lucene/dev/branches/branch_4x/lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/Passage.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/Passage.java?rev=1459817&r1=1459816&r2=1459817&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/Passage.java (original)
+++ lucene/dev/branches/branch_4x/lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/Passage.java Fri Mar 22 14:51:44 2013
@@ -17,8 +17,8 @@ package org.apache.lucene.search.posting
  * limitations under the License.
  */
 
-import org.apache.lucene.index.Term;
 import org.apache.lucene.util.ArrayUtil;
+import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.RamUsageEstimator;
 import org.apache.lucene.util.SorterTemplate;
 
@@ -36,15 +36,15 @@ public final class Passage {
 
   int matchStarts[] = new int[8];
   int matchEnds[] = new int[8];
-  Term matchTerms[] = new Term[8];
+  BytesRef matchTerms[] = new BytesRef[8];
   int numMatches = 0;
 
-  void addMatch(int startOffset, int endOffset, Term term) {
+  void addMatch(int startOffset, int endOffset, BytesRef term) {
    assert startOffset >= this.startOffset && startOffset <= this.endOffset;
    if (numMatches == matchStarts.length) {
      matchStarts = ArrayUtil.grow(matchStarts, numMatches+1);
      matchEnds = ArrayUtil.grow(matchEnds, numMatches+1);
-      Term newMatchTerms[] = new Term[ArrayUtil.oversize(numMatches+1, RamUsageEstimator.NUM_BYTES_OBJECT_REF)];
+      BytesRef newMatchTerms[] = new BytesRef[ArrayUtil.oversize(numMatches+1, RamUsageEstimator.NUM_BYTES_OBJECT_REF)];
      System.arraycopy(matchTerms, 0, newMatchTerms, 0, numMatches);
      matchTerms = newMatchTerms;
    }
@@ -57,7 +57,7 @@ public final class Passage {
  void sort() {
    final int starts[] = matchStarts;
    final int ends[] = matchEnds;
-    final Term terms[] = matchTerms;
+    final BytesRef terms[] = matchTerms;
    new SorterTemplate() {
      @Override
      protected void swap(int i, int j) {
@@ -69,7 +69,7 @@ public final class Passage {
        ends[i] = ends[j];
        ends[j] = temp;
 
-        Term tempTerm = terms[i];
+        BytesRef tempTerm = terms[i];
        terms[i] = terms[j];
        terms[j] = tempTerm;
      }
@@ -157,11 +157,11 @@ public final class Passage {
  }
 
  /**
-   * Term of the matches, corresponding with {@link #getMatchStarts()}.
+   * BytesRef (term text) of the matches, corresponding with {@link #getMatchStarts()}.
   * <p>
   * Only {@link #getNumMatches()} are valid.
   */
-  public Term[] getMatchTerms() {
+  public BytesRef[] getMatchTerms() {
    return matchTerms;
  }
}

Modified: lucene/dev/branches/branch_4x/lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/PostingsHighlighter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/PostingsHighlighter.java?rev=1459817&r1=1459816&r2=1459817&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/PostingsHighlighter.java (original)
+++ lucene/dev/branches/branch_4x/lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/PostingsHighlighter.java Fri Mar 22 14:51:44 2013
@@ -97,8 +97,8 @@ public class PostingsHighlighter {
 
  private final int maxLength;
  private final BreakIterator breakIterator;
-  private final PassageScorer scorer;
-  private final PassageFormatter formatter;
+
+  /** Set the first time {@link #getFormatter} is called,
+   *  and then reused. */
+  private PassageFormatter defaultFormatter;
+
+  /** Set the first time {@link #getScorer} is called,
+   *  and then reused. */
+  private PassageScorer defaultScorer;
 
  /**
   * Creates a new highlighter with default parameters.
@@ -113,7 +119,7 @@ public class PostingsHighlighter {
   * @throws IllegalArgumentException if maxLength is negative or Integer.MAX_VALUE
   */
  public PostingsHighlighter(int maxLength) {
-    this(maxLength, BreakIterator.getSentenceInstance(Locale.ROOT), new PassageScorer(), new PassageFormatter());
+    this(maxLength, BreakIterator.getSentenceInstance(Locale.ROOT));
  }
 
  /**
@@ -122,11 +128,9 @@ public class PostingsHighlighter {
   * @param breakIterator used for finding passage
   *        boundaries; pass null to highlight the entire
   *        content as a single Passage.
-   * @param scorer used for ranking passages.
-   * @param formatter used for formatting passages into highlighted snippets.
   * @throws IllegalArgumentException if maxLength is negative or Integer.MAX_VALUE
   */
-  public PostingsHighlighter(int maxLength, BreakIterator breakIterator, PassageScorer scorer, PassageFormatter formatter) {
+  public PostingsHighlighter(int maxLength, BreakIterator breakIterator) {
    if (maxLength < 0 || maxLength == Integer.MAX_VALUE) {
      // two reasons: no overflow problems in BreakIterator.preceding(offset+1),
      // our sentinel in the offsets queue uses this value to terminate.
@@ -135,13 +139,30 @@ public class PostingsHighlighter {
    if (breakIterator == null) {
      breakIterator = new WholeBreakIterator();
    }
-    if (scorer == null || formatter == null) {
-      throw new NullPointerException();
-    }
    this.maxLength = maxLength;
    this.breakIterator = breakIterator;
-    this.scorer = scorer;
-    this.formatter = formatter;
+  }
+
+  /** Returns the {@link PassageFormatter} to use for
+   *  formatting passages into highlighted snippets. This
+   *  returns a new {@code PassageFormatter} by default;
+   *  subclasses can override to customize. */
+  protected PassageFormatter getFormatter(String field) {
+    if (defaultFormatter == null) {
+      defaultFormatter = new PassageFormatter();
+    }
+    return defaultFormatter;
+  }
+
+  /** Returns the {@link PassageScorer} to use for
+   *  ranking passages. This
+   *  returns a new {@code PassageScorer} by default;
+   *  subclasses can override to customize. */
+  protected PassageScorer getScorer(String field) {
+    if (defaultScorer == null) {
+      defaultScorer = new PassageScorer();
+    }
+    return defaultScorer;
  }
 
  /**
@@ -302,7 +323,13 @@ public class PostingsHighlighter {
      Term ceiling = new Term(field, UnicodeUtil.BIG_TERM);
      SortedSet fieldTerms = queryTerms.subSet(floor, ceiling);
      // TODO: should we have some reasonable defaults for term pruning? (e.g. stopwords)
-      Term terms[] = fieldTerms.toArray(new Term[fieldTerms.size()]);
+
+      // Strip off the redundant field:
+      BytesRef terms[] = new BytesRef[fieldTerms.size()];
+      int termUpto = 0;
+      for(Term term : fieldTerms) {
+        terms[termUpto++] = term.bytes();
+      }
 
      Map fieldHighlights = highlightField(field, contents[i], bi, terms, docids, leaves, maxPassages);
 
      String[] result = new String[docids.length];
@@ -333,7 +360,7 @@ public class PostingsHighlighter {
    return contents;
  }
 
-  private Map highlightField(String field, String contents[], BreakIterator bi, Term terms[], int[] docids, List leaves, int maxPassages) throws IOException {
+  private Map highlightField(String field, String contents[], BreakIterator bi, BytesRef terms[], int[] docids, List leaves, int maxPassages) throws IOException {
    Map highlights = new HashMap();
 
    // reuse in the real sense... for docs in same segment we just advance our old enum
    TermsEnum termsEnum = null;
    int lastLeaf = -1;
 
+    PassageFormatter fieldFormatter = getFormatter(field);
+    if (fieldFormatter == null) {
+      throw new NullPointerException("PassageFormatter cannot be null");
+    }
+
    for (int i = 0; i < docids.length; i++) {
      String content = contents[i];
      if (content.length() == 0) {
@@ -366,7 +398,7 @@ public class PostingsHighlighter {
      if (passages.length > 0) {
        // otherwise a null snippet (eg if field is missing
        // entirely from the doc)
-        highlights.put(doc, formatter.format(passages, content));
+        highlights.put(doc, fieldFormatter.format(passages, content));
      }
      lastLeaf = leaf;
    }
 
@@ -377,8 +409,12 @@ public class PostingsHighlighter {
  // algorithm: treat sentence snippets as miniature documents
  // we can intersect these with the postings lists via BreakIterator.preceding(offset),s
  // score each sentence as norm(sentenceStartOffset) * sum(weight * tf(freq))
-  private Passage[] highlightDoc(String field, Term terms[], int contentLength, BreakIterator bi, int doc,
+  private Passage[] highlightDoc(String field, BytesRef terms[], int contentLength, BreakIterator bi, int doc,
      TermsEnum termsEnum, DocsAndPositionsEnum[] postings, int n) throws IOException {
+    PassageScorer scorer = getScorer(field);
+    if (scorer == null) {
+      throw new NullPointerException("PassageScorer cannot be null");
+    }
    PriorityQueue pq = new PriorityQueue();
    float weights[] = new float[terms.length];
    // initialize postings
@@ -389,7 +425,7 @@ public class PostingsHighlighter {
        continue;
      } else if (de == null) {
        postings[i] = EMPTY; // initially
-        if (!termsEnum.seekExact(terms[i].bytes(), true)) {
+        if (!termsEnum.seekExact(terms[i], true)) {
          continue; // term not found
        }
        de = postings[i] = termsEnum.docsAndPositions(null, null, DocsAndPositionsEnum.FLAG_OFFSETS);

Modified: lucene/dev/branches/branch_4x/lucene/highlighter/src/test/org/apache/lucene/search/postingshighlight/TestPostingsHighlighter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/highlighter/src/test/org/apache/lucene/search/postingshighlight/TestPostingsHighlighter.java?rev=1459817&r1=1459816&r2=1459817&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/highlighter/src/test/org/apache/lucene/search/postingshighlight/TestPostingsHighlighter.java (original)
+++ lucene/dev/branches/branch_4x/lucene/highlighter/src/test/org/apache/lucene/search/postingshighlight/TestPostingsHighlighter.java Fri Mar 22 14:51:44 2013
@@ -456,7 +456,7 @@ public class TestPostingsHighlighter ext
    iw.close();
 
    IndexSearcher searcher = newSearcher(ir);
-    PostingsHighlighter highlighter = new PostingsHighlighter(10000, null, new PassageScorer(), new PassageFormatter());
+    PostingsHighlighter highlighter = new PostingsHighlighter(10000, null);
    Query query = new TermQuery(new Term("body", "test"));
    TopDocs topDocs = searcher.search(query, null, 10, Sort.INDEXORDER);
    assertEquals(1, topDocs.totalHits);
@@ -526,7 +526,7 @@ public class TestPostingsHighlighter ext
 
    IndexSearcher searcher = newSearcher(ir);
 
-    PostingsHighlighter highlighter = new PostingsHighlighter(10000, null, new PassageScorer(), new PassageFormatter()) {
+    PostingsHighlighter highlighter = new PostingsHighlighter(10000, null) {
      @Override
      protected String[][] loadFieldValues(IndexSearcher searcher, String[] fields, int[] docids, int maxLength) throws IOException {
        assert fields.length == 1;
@@ -635,7 +635,7 @@ public class TestPostingsHighlighter ext
    iw.close();
 
    IndexSearcher searcher = newSearcher(ir);
-    PostingsHighlighter highlighter = new PostingsHighlighter(10000, null, new PassageScorer(), new PassageFormatter());
+    PostingsHighlighter highlighter = new PostingsHighlighter(10000, null);
    Query query = new TermQuery(new Term("body", "highlighting"));
    int[] docIDs = new int[] {0};
    String snippets[] = highlighter.highlightFields(new String[] {"body"}, query, searcher, docIDs, 2).get("body");

Modified: lucene/dev/branches/branch_4x/lucene/highlighter/src/test/org/apache/lucene/search/postingshighlight/TestPostingsHighlighterRanking.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/highlighter/src/test/org/apache/lucene/search/postingshighlight/TestPostingsHighlighterRanking.java?rev=1459817&r1=1459816&r2=1459817&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/highlighter/src/test/org/apache/lucene/search/postingshighlight/TestPostingsHighlighterRanking.java (original)
+++ lucene/dev/branches/branch_4x/lucene/highlighter/src/test/org/apache/lucene/search/postingshighlight/TestPostingsHighlighterRanking.java Fri Mar 22 14:51:44 2013
@@ -112,16 +112,26 @@ public class TestPostingsHighlighterRank
  private void checkQuery(IndexSearcher is, Query query, int doc,
                          int maxTopN) throws IOException {
    for (int n = 1; n < maxTopN; n++) {
-      FakePassageFormatter f1 = new FakePassageFormatter();
+      final FakePassageFormatter f1 = new FakePassageFormatter();
      PostingsHighlighter p1 = new PostingsHighlighter(Integer.MAX_VALUE-1,
-                                                       BreakIterator.getSentenceInstance(Locale.ROOT),
-                                                       new PassageScorer(),
-                                                       f1);
-      FakePassageFormatter f2 = new FakePassageFormatter();
+                                                       BreakIterator.getSentenceInstance(Locale.ROOT)) {
+          @Override
+          protected PassageFormatter getFormatter(String field) {
+            assertEquals("body", field);
+            return f1;
+          }
+        };
+
+      final FakePassageFormatter f2 = new FakePassageFormatter();
      PostingsHighlighter p2 = new PostingsHighlighter(Integer.MAX_VALUE-1,
-                                                       BreakIterator.getSentenceInstance(Locale.ROOT),
-                                                       new PassageScorer(),
-                                                       f2);
+                                                       BreakIterator.getSentenceInstance(Locale.ROOT)) {
+          @Override
+          protected PassageFormatter getFormatter(String field) {
+            assertEquals("body", field);
+            return f2;
+          }
+        };
+
      BooleanQuery bq = new BooleanQuery(false);
      bq.add(query, BooleanClause.Occur.MUST);
      bq.add(new TermQuery(new Term("id", Integer.toString(doc))), BooleanClause.Occur.MUST);
@@ -170,8 +180,7 @@ public class TestPostingsHighlighterRank
        // we use a very simple analyzer. so we can assert the matches are correct
        int lastMatchStart = -1;
        for (int i = 0; i < p.getNumMatches(); i++) {
-          Term term = p.getMatchTerms()[i];
-          assertEquals("body", term.field());
+          BytesRef term = p.getMatchTerms()[i];
          int matchStart = p.getMatchStarts()[i];
          assertTrue(matchStart >= 0);
          // must at least start within the passage
@@ -184,9 +193,8 @@ public class TestPostingsHighlighterRank
          // single character terms
          assertEquals(matchStart+1, matchEnd);
          // and the offsets must be correct...
-          BytesRef bytes = term.bytes();
-          assertEquals(1, bytes.length);
-          assertEquals((char)bytes.bytes[bytes.offset], Character.toLowerCase(content.charAt(matchStart)));
+          assertEquals(1, term.length);
+          assertEquals((char)term.bytes[term.offset], Character.toLowerCase(content.charAt(matchStart)));
        }
        // record just the start/end offset for simplicity
        seen.add(new Pair(p.getStartOffset(), p.getEndOffset()));
@@ -262,9 +270,12 @@ public class TestPostingsHighlighterRank
 
    IndexSearcher searcher = newSearcher(ir);
    PostingsHighlighter highlighter = new PostingsHighlighter(10000,
-                                                              BreakIterator.getSentenceInstance(Locale.ROOT),
-                                                              new PassageScorer(1.2f, 0, 87),
-                                                              new PassageFormatter());
+                                                              BreakIterator.getSentenceInstance(Locale.ROOT)) {
+        @Override
+        protected PassageScorer getScorer(String field) {
+          return new PassageScorer(1.2f, 0, 87);
+        }
+      };
    Query query = new TermQuery(new Term("body", "test"));
    TopDocs topDocs = searcher.search(query, null, 10, Sort.INDEXORDER);
    assertEquals(1, topDocs.totalHits);
@@ -299,9 +310,12 @@ public class TestPostingsHighlighterRank
 
    IndexSearcher searcher = newSearcher(ir);
    PostingsHighlighter highlighter = new PostingsHighlighter(10000,
-                                                              BreakIterator.getSentenceInstance(Locale.ROOT),
-                                                              new PassageScorer(0, 0.75f, 87),
-                                                              new PassageFormatter());
+                                                              BreakIterator.getSentenceInstance(Locale.ROOT)) {
+        @Override
+        protected PassageScorer getScorer(String field) {
+          return new PassageScorer(0, 0.75f, 87);
+        }
+      };
    BooleanQuery query = new BooleanQuery();
    query.add(new TermQuery(new Term("body", "foo")), BooleanClause.Occur.SHOULD);
    query.add(new TermQuery(new Term("body", "bar")), BooleanClause.Occur.SHOULD);

Modified: lucene/dev/branches/branch_4x/solr/core/src/java/org/apache/solr/highlight/PostingsSolrHighlighter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/solr/core/src/java/org/apache/solr/highlight/PostingsSolrHighlighter.java?rev=1459817&r1=1459816&r2=1459817&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/solr/core/src/java/org/apache/solr/highlight/PostingsSolrHighlighter.java (original)
+++ lucene/dev/branches/branch_4x/solr/core/src/java/org/apache/solr/highlight/PostingsSolrHighlighter.java Fri Mar 22 14:51:44 2013
@@ -97,7 +97,7 @@ public class PostingsSolrHighlighter ext
    if (pivot == null) {
      pivot = "87";
    }
-    PassageScorer scorer = new PassageScorer(Float.parseFloat(k1), Float.parseFloat(b), Float.parseFloat(pivot));
+    final PassageScorer scorer = new PassageScorer(Float.parseFloat(k1), Float.parseFloat(b), Float.parseFloat(pivot));
 
    // formatter parameters: preTag/postTag/ellipsis
    String preTag = attributes.get("preTag");
@@ -112,7 +112,7 @@ public class PostingsSolrHighlighter ext
    if (ellipsis == null) {
      ellipsis = "... ";
    }
-    PassageFormatter formatter = new PassageFormatter(preTag, postTag, ellipsis);
+    final PassageFormatter formatter = new PassageFormatter(preTag, postTag, ellipsis);
 
    String summarizeEmpty = attributes.get("summarizeEmpty");
    final boolean summarizeEmptyBoolean;
@@ -127,7 +127,7 @@ public class PostingsSolrHighlighter ext
    if (attributes.containsKey("maxLength")) {
      maxLength = Integer.parseInt(attributes.get("maxLength"));
    }
-    highlighter = new PostingsHighlighter(maxLength, breakIterator, scorer, formatter) {
+    highlighter = new PostingsHighlighter(maxLength, breakIterator) {
      @Override
      protected Passage[] getEmptyHighlight(String fieldName, BreakIterator bi, int maxPassages) {
        if (summarizeEmptyBoolean) {
@@ -136,6 +136,16 @@ public class PostingsSolrHighlighter ext
          return new Passage[0];
        }
      }
+
+      @Override
+      protected PassageFormatter getFormatter(String fieldName) {
+        return formatter;
+      }
+
+      @Override
+      protected PassageScorer getScorer(String fieldName) {
+        return scorer;
+      }
    };
  }
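
For reference, below is a minimal caller-side sketch of the new per-field hooks. It is not part of this commit: the class name PerFieldHighlightExample, the field names "title"/"body", and the tag strings are made up for illustration. The constructors and the highlightFields() call mirror the ones exercised in the tests and in PostingsSolrHighlighter above.

import java.io.IOException;
import java.util.Map;

import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.postingshighlight.PassageFormatter;
import org.apache.lucene.search.postingshighlight.PassageScorer;
import org.apache.lucene.search.postingshighlight.PostingsHighlighter;

public class PerFieldHighlightExample {

  /** Highlights "title" and "body", using different formatting and scoring for the title. */
  public static Map<String,String[]> highlight(IndexSearcher searcher, Query query, int[] docIDs)
      throws IOException {
    PostingsHighlighter highlighter = new PostingsHighlighter(10000, null) {
      @Override
      protected PassageFormatter getFormatter(String field) {
        if ("title".equals(field)) {
          // hypothetical per-field tags; PassageFormatter(preTag, postTag, ellipsis)
          // is the same ctor PostingsSolrHighlighter uses above
          return new PassageFormatter("<em class=\"title\">", "</em>", "... ");
        }
        return super.getFormatter(field); // default (cached) formatter for other fields
      }

      @Override
      protected PassageScorer getScorer(String field) {
        if ("title".equals(field)) {
          // per-field scoring parameters, as in TestPostingsHighlighterRanking
          return new PassageScorer(1.2f, 0, 87);
        }
        return super.getScorer(field); // default (cached) scorer for other fields
      }
    };
    // highlightFields returns one snippet array per requested field, keyed by field name
    return highlighter.highlightFields(new String[] {"title", "body"}, query, searcher, docIDs, 2);
  }
}

Because getFormatter/getScorer receive the field name, a subclass can also consult per-field configuration instead of hard-coding field names as this sketch does.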