lucene-java-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From markrmil...@apache.org
Subject svn commit: r720669 - in /lucene/java/trunk: CHANGES.txt src/java/org/apache/lucene/search/spans/NearSpansOrdered.java src/test/org/apache/lucene/search/spans/TestPayloadSpans.java
Date Tue, 25 Nov 2008 23:38:56 GMT
Author: markrmiller
Date: Tue Nov 25 15:38:56 2008
New Revision: 720669

URL: http://svn.apache.org/viewvc?rev=720669&view=rev
Log:
LUCENE-1465: NearSpansOrdered returns payloads from first possible match 
   rather than the correct, shortest match; Payloads could be returned even
   if the max slop was exceeded; The wrong payload could be returned in 
   certain situations.

Modified:
    lucene/java/trunk/CHANGES.txt
    lucene/java/trunk/src/java/org/apache/lucene/search/spans/NearSpansOrdered.java
    lucene/java/trunk/src/test/org/apache/lucene/search/spans/TestPayloadSpans.java

Modified: lucene/java/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/java/trunk/CHANGES.txt?rev=720669&r1=720668&r2=720669&view=diff
==============================================================================
--- lucene/java/trunk/CHANGES.txt (original)
+++ lucene/java/trunk/CHANGES.txt Tue Nov 25 15:38:56 2008
@@ -1,4 +1,4 @@
-Lucene Change Log
+Lucene Change Log
 $Id$
 
 ======================= Trunk (not yet released) =======================
@@ -49,6 +49,11 @@
    are non-congruent (same field name maps to different field
    numbers).  This bug was introduced with LUCENE-1219.  (Andrzej
    Bialecki via Mike McCandless).
+   
+3. LUCENE-1465: NearSpansOrdered returns payloads from first possible match 
+   rather than the correct, shortest match; Payloads could be returned even
+   if the max slop was exceeded; The wrong payload could be returned in 
+   certain situations. (Jonathan Mamou, Greg Shackles, Mark Miller)
 
 New features
 

Modified: lucene/java/trunk/src/java/org/apache/lucene/search/spans/NearSpansOrdered.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/search/spans/NearSpansOrdered.java?rev=720669&r1=720668&r2=720669&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/search/spans/NearSpansOrdered.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/search/spans/NearSpansOrdered.java Tue Nov 25 15:38:56 2008
@@ -20,11 +20,15 @@
 import org.apache.lucene.index.IndexReader;
 
 import java.io.IOException;
+import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Comparator;
+import java.util.HashSet;
+import java.util.Iterator;
 import java.util.LinkedList;
 import java.util.List;
 import java.util.Collection;
+import java.util.Set;
 
 /** A Spans that is formed from the ordered subspans of a SpanNearQuery
  * where the subspans do not overlap and have a maximum slop between them.
@@ -234,17 +238,22 @@
   private boolean shrinkToAfterShortestMatch() throws IOException {
     matchStart = subSpans[subSpans.length - 1].start();
     matchEnd = subSpans[subSpans.length - 1].end();
+    Set possibleMatchPayloads = new HashSet();
     if (subSpans[subSpans.length - 1].isPayloadAvailable()) {
-      matchPayload.addAll(subSpans[subSpans.length - 1].getPayload());
+      possibleMatchPayloads.addAll(subSpans[subSpans.length - 1].getPayload());
     }
+
+    Collection possiblePayload = null;
+    
     int matchSlop = 0;
     int lastStart = matchStart;
     int lastEnd = matchEnd;
     for (int i = subSpans.length - 2; i >= 0; i--) {
       PayloadSpans prevSpans = subSpans[i];
-      
-      if (subSpans[i].isPayloadAvailable()) {
-        matchPayload.addAll(0, subSpans[i].getPayload());
+      if (prevSpans.isPayloadAvailable()) {
+        Collection payload = prevSpans.getPayload();
+        possiblePayload = new ArrayList(payload.size());
+        possiblePayload.addAll(payload);
       }
       
       int prevStart = prevSpans.start();
@@ -265,9 +274,19 @@
           } else { // prevSpans still before (lastStart, lastEnd)
             prevStart = ppStart;
             prevEnd = ppEnd;
+            if (prevSpans.isPayloadAvailable()) {
+              Collection payload = prevSpans.getPayload();
+              possiblePayload = new ArrayList(payload.size());
+              possiblePayload.addAll(payload);
+            }
           }
         }
       }
+
+      if (possiblePayload != null) {
+        possibleMatchPayloads.addAll(possiblePayload);
+      }
+      
       assert prevStart <= matchStart;
       if (matchStart > prevEnd) { // Only non overlapping spans add to slop.
         matchSlop += (matchStart - prevEnd);
@@ -280,7 +299,14 @@
       lastStart = prevStart;
       lastEnd = prevEnd;
     }
-    return matchSlop <= allowedSlop; // ordered and allowed slop
+    
+    boolean match = matchSlop <= allowedSlop;
+    
+    if(match && possibleMatchPayloads.size() > 0) {
+      matchPayload.addAll(possibleMatchPayloads);
+    }
+
+    return match; // ordered and allowed slop
   }
 
   public String toString() {

Modified: lucene/java/trunk/src/test/org/apache/lucene/search/spans/TestPayloadSpans.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/test/org/apache/lucene/search/spans/TestPayloadSpans.java?rev=720669&r1=720668&r2=720669&view=diff
==============================================================================
--- lucene/java/trunk/src/test/org/apache/lucene/search/spans/TestPayloadSpans.java (original)
+++ lucene/java/trunk/src/test/org/apache/lucene/search/spans/TestPayloadSpans.java Tue Nov 25 15:38:56 2008
@@ -18,15 +18,15 @@
 
 import java.io.IOException;
 import java.io.Reader;
+import java.io.StringReader;
 import java.util.Collection;
 import java.util.HashSet;
 import java.util.Iterator;
 import java.util.Set;
 
-import junit.framework.TestCase;
-
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.LowerCaseTokenizer;
+import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
@@ -34,6 +34,7 @@
 import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
+import org.apache.lucene.index.CorruptIndexException;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.IndexWriter;
 import org.apache.lucene.index.Payload;
@@ -42,8 +43,10 @@
 import org.apache.lucene.search.IndexSearcher;
 import org.apache.lucene.search.Similarity;
 import org.apache.lucene.search.TermQuery;
+import org.apache.lucene.search.TopDocs;
 import org.apache.lucene.search.payloads.PayloadHelper;
 import org.apache.lucene.search.payloads.PayloadSpanUtil;
+import org.apache.lucene.store.LockObtainFailedException;
 import org.apache.lucene.store.RAMDirectory;
 import org.apache.lucene.util.LuceneTestCase;
 
@@ -143,13 +146,16 @@
 
     spanNearQuery = new SpanNearQuery(clauses, 6, true);
      
-   
+    // xx within 6 of rr
+    
     SpanQuery[] clauses2 = new SpanQuery[2];
      
     clauses2[0] = new SpanTermQuery(new Term(PayloadHelper.FIELD, "yy"));
     clauses2[1] = spanNearQuery;
      
     SpanNearQuery nestedSpanNearQuery = new SpanNearQuery(clauses2, 6, false);
+    
+    // yy within 6 of xx within 6 of rr
 
     spans = nestedSpanNearQuery.getPayloadSpans(searcher.getIndexReader());
     assertTrue("spans is null and it shouldn't be", spans != null);
@@ -221,6 +227,114 @@
     checkSpans(spans, 2, new int[]{8, 8});
   }
   
+  public void testShrinkToAfterShortestMatch() throws CorruptIndexException,
+      LockObtainFailedException, IOException {
+    RAMDirectory directory = new RAMDirectory();
+    IndexWriter writer = new IndexWriter(directory, new TestPayloadAnalyzer(),
+        IndexWriter.MaxFieldLength.LIMITED);
+    Document doc = new Document();
+    doc.add(new Field("content", new StringReader("a b c d e f g h i j a k")));
+    writer.addDocument(doc);
+    writer.close();
+
+    IndexSearcher is = new IndexSearcher(directory);
+
+    SpanTermQuery stq1 = new SpanTermQuery(new Term("content", "a"));
+    SpanTermQuery stq2 = new SpanTermQuery(new Term("content", "k"));
+    SpanQuery[] sqs = { stq1, stq2 };
+    SpanNearQuery snq = new SpanNearQuery(sqs, 1, true);
+    PayloadSpans spans = snq.getPayloadSpans(is.getIndexReader());
+
+    TopDocs topDocs = is.search(snq, 1);
+    Set payloadSet = new HashSet();
+    for (int i = 0; i < topDocs.scoreDocs.length; i++) {
+      while (spans.next()) {
+        Collection payloads = spans.getPayload();
+
+        for (Iterator it = payloads.iterator(); it.hasNext();) {
+          payloadSet.add(new String((byte[]) it.next()));
+        }
+      }
+    }
+    assertEquals(2, payloadSet.size());
+    assertTrue(payloadSet.contains("a:Noise:10"));
+    assertTrue(payloadSet.contains("k:Noise:11"));
+  }
+  
+  public void testShrinkToAfterShortestMatch2() throws CorruptIndexException,
+      LockObtainFailedException, IOException {
+    RAMDirectory directory = new RAMDirectory();
+    IndexWriter writer = new IndexWriter(directory, new TestPayloadAnalyzer(),
+        IndexWriter.MaxFieldLength.LIMITED);
+    Document doc = new Document();
+    doc.add(new Field("content", new StringReader("a b a d k f a h i k a k")));
+    writer.addDocument(doc);
+    writer.close();
+
+    IndexSearcher is = new IndexSearcher(directory);
+
+    SpanTermQuery stq1 = new SpanTermQuery(new Term("content", "a"));
+    SpanTermQuery stq2 = new SpanTermQuery(new Term("content", "k"));
+    SpanQuery[] sqs = { stq1, stq2 };
+    SpanNearQuery snq = new SpanNearQuery(sqs, 0, true);
+    PayloadSpans spans = snq.getPayloadSpans(is.getIndexReader());
+
+    TopDocs topDocs = is.search(snq, 1);
+    Set payloadSet = new HashSet();
+    for (int i = 0; i < topDocs.scoreDocs.length; i++) {
+      while (spans.next()) {
+        Collection payloads = spans.getPayload();
+        int cnt = 0;
+        for (Iterator it = payloads.iterator(); it.hasNext();) {
+          payloadSet.add(new String((byte[]) it.next()));
+        }
+      }
+    }
+    assertEquals(2, payloadSet.size());
+    assertTrue(payloadSet.contains("a:Noise:10"));
+    assertTrue(payloadSet.contains("k:Noise:11"));
+  }
+  
+  public void testShrinkToAfterShortestMatch3() throws CorruptIndexException,
+      LockObtainFailedException, IOException {
+    RAMDirectory directory = new RAMDirectory();
+    IndexWriter writer = new IndexWriter(directory, new TestPayloadAnalyzer(),
+        IndexWriter.MaxFieldLength.LIMITED);
+    Document doc = new Document();
+    doc.add(new Field("content", new StringReader("j k a l f k k p a t a k l k t a")));
+    writer.addDocument(doc);
+    writer.close();
+
+    IndexSearcher is = new IndexSearcher(directory);
+
+    SpanTermQuery stq1 = new SpanTermQuery(new Term("content", "a"));
+    SpanTermQuery stq2 = new SpanTermQuery(new Term("content", "k"));
+    SpanQuery[] sqs = { stq1, stq2 };
+    SpanNearQuery snq = new SpanNearQuery(sqs, 0, true);
+    PayloadSpans spans = snq.getPayloadSpans(is.getIndexReader());
+
+    TopDocs topDocs = is.search(snq, 1);
+    Set payloadSet = new HashSet();
+    for (int i = 0; i < topDocs.scoreDocs.length; i++) {
+      while (spans.next()) {
+        Collection payloads = spans.getPayload();
+
+        for (Iterator it = payloads.iterator(); it.hasNext();) {
+          payloadSet.add(new String((byte[]) it.next()));
+        }
+      }
+    }
+    assertEquals(2, payloadSet.size());
+    if(DEBUG) {
+      Iterator pit = payloadSet.iterator();
+      while (pit.hasNext()) {
+        System.out.println("match:" + pit.next());
+      }
+    }
+    assertTrue(payloadSet.contains("a:Noise:10"));
+    assertTrue(payloadSet.contains("k:Noise:11"));
+  }
+  
   public void testPayloadSpanUtil() throws Exception {
     RAMDirectory directory = new RAMDirectory();
     PayloadAnalyzer analyzer = new PayloadAnalyzer();
@@ -383,4 +497,13 @@
       return false;
     }
   }
+  
+  public class TestPayloadAnalyzer extends Analyzer {
+
+    public TokenStream tokenStream(String fieldName, Reader reader) {
+      TokenStream result = new LowerCaseTokenizer(reader);
+      result = new PayloadFilter(result, fieldName);
+      return result;
+    }
+  }
 }
\ No newline at end of file



Mime
View raw message