lucene-java-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From mikemcc...@apache.org
Subject svn commit: r780220 - in /lucene/java/trunk: ./ src/java/org/apache/lucene/index/ src/test/org/apache/lucene/index/ src/test/org/apache/lucene/search/
Date Sat, 30 May 2009 09:36:11 GMT
Author: mikemccand
Date: Sat May 30 09:36:10 2009
New Revision: 780220

URL: http://svn.apache.org/viewvc?rev=780220&view=rev
Log:
LUCENE-1542: properly index first token(s) with 0 position increment

Modified:
    lucene/java/trunk/CHANGES.txt
    lucene/java/trunk/common-build.xml
    lucene/java/trunk/src/java/org/apache/lucene/index/DocInverterPerField.java
    lucene/java/trunk/src/java/org/apache/lucene/index/DocumentsWriter.java
    lucene/java/trunk/src/java/org/apache/lucene/index/DocumentsWriterThreadState.java
    lucene/java/trunk/src/java/org/apache/lucene/index/IndexWriter.java
    lucene/java/trunk/src/test/org/apache/lucene/index/TestIndexWriter.java
    lucene/java/trunk/src/test/org/apache/lucene/search/TestPositionIncrement.java

Modified: lucene/java/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/java/trunk/CHANGES.txt?rev=780220&r1=780219&r2=780220&view=diff
==============================================================================
--- lucene/java/trunk/CHANGES.txt (original)
+++ lucene/java/trunk/CHANGES.txt Sat May 30 09:36:10 2009
@@ -71,6 +71,17 @@
     with SortField.FIELD_DOC (it was unnecessary as Lucene breaks ties
     internally by docID). (Shai Erera via Michael McCandless)
 
+ 6. LUCENE-1542: When the first token(s) have 0 position increment,
+    IndexWriter used to incorrectly record the position as -1, if no
+    payload is present, or Integer.MAX_VALUE if a payload is present.
+    This causes positional queries to fail to match.  The bug is now
+    fixed, but if your app relies on the buggy behavior then you must
+    call IndexWriter.setAllowMinus1Position().  That API is deprecated
+    so you must fix your application, and rebuild your index, to not
+    rely on this behavior by the 3.0 release of Lucene. (Jonathan
+    Mamou, Mark Miller via Mike McCandless)
+
+
 API Changes
 
 1. LUCENE-1419: Add expert API to set custom indexing chain. This API is 
@@ -186,6 +197,16 @@
 10. LUCENE-1647: Fix case where IndexReader.undeleteAll would cause
     the segment's deletion count to be incorrect. (Mike McCandless)
 
+11. LUCENE-1542: When the first token(s) have 0 position increment,
+    IndexWriter used to incorrectly record the position as -1, if no
+    payload is present, or Integer.MAX_VALUE if a payload is present.
+    This causes positional queries to fail to match.  The bug is now
+    fixed, but if your app relies on the buggy behavior then you must
+    call IndexWriter.setAllowMinus1Position().  That API is deprecated
+    so you must fix your application, and rebuild your index, to not
+    rely on this behavior by the 3.0 release of Lucene. (Jonathan
+    Mamou, Mark Miller via Mike McCandless)
+
  New features
 
  1. LUCENE-1411: Added expert API to open an IndexWriter on a prior

Modified: lucene/java/trunk/common-build.xml
URL: http://svn.apache.org/viewvc/lucene/java/trunk/common-build.xml?rev=780220&r1=780219&r2=780220&view=diff
==============================================================================
--- lucene/java/trunk/common-build.xml (original)
+++ lucene/java/trunk/common-build.xml Sat May 30 09:36:10 2009
@@ -42,7 +42,7 @@
   <property name="Name" value="Lucene"/>
   <property name="dev.version" value="2.9-dev"/>
   <property name="version" value="${dev.version}"/>
-  <property name="compatibility.tag" value="lucene_2_4_back_compat_tests_20090526"/>
+  <property name="compatibility.tag" value="lucene_2_4_back_compat_tests_20090530"/>
   <property name="spec.version" value="${version}"/>	
   <property name="year" value="2000-${current.year}"/>
   <property name="final.name" value="lucene-${name}-${version}"/>

Modified: lucene/java/trunk/src/java/org/apache/lucene/index/DocInverterPerField.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/index/DocInverterPerField.java?rev=780220&r1=780219&r2=780220&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/index/DocInverterPerField.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/index/DocInverterPerField.java Sat May 30 09:36:10 2009
@@ -126,6 +126,9 @@
           // reset the TokenStream to the first token
           stream.reset();
 
+          // deprecated
+          final boolean allowMinus1Position = docState.allowMinus1Position;
+
           try {
             int offsetEnd = fieldState.offset-1;
             
@@ -162,7 +165,11 @@
               }
               
               final int posIncr = posIncrAttribute.getPositionIncrement();
-              fieldState.position += posIncr - 1;
+              fieldState.position += posIncr;
+              if (allowMinus1Position || fieldState.position > 0) {
+                fieldState.position--;
+              }
+
               if (posIncr == 0)
                 fieldState.numOverlap++;
 

Modified: lucene/java/trunk/src/java/org/apache/lucene/index/DocumentsWriter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/index/DocumentsWriter.java?rev=780220&r1=780219&r2=780220&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/index/DocumentsWriter.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/index/DocumentsWriter.java Sat May 30 09:36:10 2009
@@ -150,6 +150,9 @@
     Document doc;
     String maxTermPrefix;
 
+    // deprecated
+    boolean allowMinus1Position;
+
     // Only called by asserts
     public boolean testPoint(String name) {
       return docWriter.writer.testPoint(name);
@@ -298,6 +301,11 @@
       threadStates[i].docState.similarity = similarity;
   }
 
+  synchronized void setAllowMinus1Position() {
+    for(int i=0;i<threadStates.length;i++)
+      threadStates[i].docState.allowMinus1Position = true;;
+  }
+
   /** Set how much RAM we can use before flushing. */
   synchronized void setRAMBufferSizeMB(double mb) {
     if (mb == IndexWriter.DISABLE_AUTO_FLUSH) {

Modified: lucene/java/trunk/src/java/org/apache/lucene/index/DocumentsWriterThreadState.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/index/DocumentsWriterThreadState.java?rev=780220&r1=780219&r2=780220&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/index/DocumentsWriterThreadState.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/index/DocumentsWriterThreadState.java Sat May 30 09:36:10 2009
@@ -40,6 +40,7 @@
     docState.infoStream = docWriter.infoStream;
     docState.similarity = docWriter.similarity;
     docState.docWriter = docWriter;
+    docState.allowMinus1Position = docWriter.writer.getAllowMinus1Position();
     consumer = docWriter.consumer.addThread(this);
   }
 

Modified: lucene/java/trunk/src/java/org/apache/lucene/index/IndexWriter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/index/IndexWriter.java?rev=780220&r1=780219&r2=780220&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/index/IndexWriter.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/index/IndexWriter.java Sat May 30 09:36:10 2009
@@ -5485,6 +5485,22 @@
     throw oom;
   }
 
+  // deprecated
+  private boolean allowMinus1Position;
+
+  /** Deprecated: emulates IndexWriter's buggy behavior when
+   *  first token(s) have positionIncrement==0 (ie, prior to
+   *  fixing LUCENE-1542) */
+  public void setAllowMinus1Position() {
+    allowMinus1Position = true;
+    docWriter.setAllowMinus1Position();
+  }
+
+  // deprecated
+  boolean getAllowMinus1Position() {
+    return allowMinus1Position;
+  }
+
   // Used only by assert for testing.  Current points:
   //   startDoFlush
   //   startCommitMerge

Modified: lucene/java/trunk/src/test/org/apache/lucene/index/TestIndexWriter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/test/org/apache/lucene/index/TestIndexWriter.java?rev=780220&r1=780219&r2=780220&view=diff
==============================================================================
--- lucene/java/trunk/src/test/org/apache/lucene/index/TestIndexWriter.java (original)
+++ lucene/java/trunk/src/test/org/apache/lucene/index/TestIndexWriter.java Sat May 30 09:36:10 2009
@@ -3594,7 +3594,7 @@
     TermPositions tps = s.getIndexReader().termPositions(new Term("field", "a"));
     assertTrue(tps.next());
     assertEquals(1, tps.freq());
-    assertEquals(-1, tps.nextPosition());
+    assertEquals(0, tps.nextPosition());
     w.close();
 
     assertTrue(_TestUtil.checkIndex(dir));

Modified: lucene/java/trunk/src/test/org/apache/lucene/search/TestPositionIncrement.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/test/org/apache/lucene/search/TestPositionIncrement.java?rev=780220&r1=780219&r2=780220&view=diff
==============================================================================
--- lucene/java/trunk/src/test/org/apache/lucene/search/TestPositionIncrement.java (original)
+++ lucene/java/trunk/src/test/org/apache/lucene/search/TestPositionIncrement.java Sat May 30 09:36:10 2009
@@ -17,8 +17,11 @@
  * limitations under the License.
  */
 
-import java.io.IOException;
 import java.io.Reader;
+import java.io.IOException;
+import java.io.StringReader;
+import java.util.Collection;
+import java.util.Iterator;
 
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.StopFilter;
@@ -26,14 +29,27 @@
 import org.apache.lucene.analysis.WhitespaceAnalyzer;
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
 import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
 import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.Term;
+import org.apache.lucene.index.TermPositions;
 import org.apache.lucene.queryParser.QueryParser;
-import org.apache.lucene.store.RAMDirectory;
+import org.apache.lucene.store.MockRAMDirectory;
+import org.apache.lucene.store.Directory;
 import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.analysis.LowerCaseTokenizer;
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.index.Payload;
+import org.apache.lucene.search.payloads.PayloadSpanUtil;
+import org.apache.lucene.search.spans.PayloadSpans;
+import org.apache.lucene.search.spans.SpanNearQuery;
+import org.apache.lucene.search.spans.SpanQuery;
+import org.apache.lucene.search.spans.SpanTermQuery;
+import org.apache.lucene.search.spans.Spans;
 
 /**
  * Term position unit test.
@@ -48,7 +64,7 @@
       public TokenStream tokenStream(String fieldName, Reader reader) {
         return new TokenStream() {
           private final String[] TOKENS = {"1", "2", "3", "4", "5"};
-          private final int[] INCREMENTS = {1, 2, 1, 0, 1};
+          private final int[] INCREMENTS = {0, 2, 1, 0, 1};
           private int i = 0;
 
           PositionIncrementAttribute posIncrAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
@@ -67,7 +83,7 @@
         };
       }
     };
-    RAMDirectory store = new RAMDirectory();
+    Directory store = new MockRAMDirectory();
     IndexWriter writer = new IndexWriter(store, analyzer, true,
                                          IndexWriter.MaxFieldLength.LIMITED);
     Document d = new Document();
@@ -75,8 +91,20 @@
     writer.addDocument(d);
     writer.optimize();
     writer.close();
+    
 
     IndexSearcher searcher = new IndexSearcher(store);
+    
+    TermPositions pos = searcher.getIndexReader().termPositions(new Term("field", "1"));
+    pos.next();
+    // first token should be at position 0
+    assertEquals(0, pos.nextPosition());
+    
+    pos = searcher.getIndexReader().termPositions(new Term("field", "2"));
+    pos.next();
+    // second token should be at position 2
+    assertEquals(2, pos.nextPosition());
+    
     PhraseQuery q;
     ScoreDoc[] hits;
 
@@ -202,4 +230,146 @@
       StopFilter.setEnablePositionIncrementsDefault(dflt);
     }
   }
+  
+  public void testPayloadsPos0() throws Exception {
+    for(int x=0;x<2;x++) {
+      Directory dir = new MockRAMDirectory();
+      IndexWriter writer = new IndexWriter(dir,
+                                           new TestPayloadAnalyzer(), true,
+                                           IndexWriter.MaxFieldLength.LIMITED);
+      if (x == 1) {
+        writer.setAllowMinus1Position();
+      }
+      Document doc = new Document();
+      doc.add(new Field("content",
+                        new StringReader("a a b c d e a f g h i j a b k k")));
+      writer.addDocument(doc);
+
+      IndexReader r = writer.getReader();
+
+      TermPositions tp = r.termPositions(new Term("content", "a"));
+      int count = 0;
+      assertTrue(tp.next());
+      // "a" occurs 4 times
+      assertEquals(4, tp.freq());
+      int expected;
+      if (x == 1) {
+        expected = Integer.MAX_VALUE;
+      } else {
+        expected = 0;
+      }
+      assertEquals(expected, tp.nextPosition());
+      if (x == 1) {
+        continue;
+      }
+      assertEquals(1, tp.nextPosition());
+      assertEquals(3, tp.nextPosition());
+      assertEquals(6, tp.nextPosition());
+
+      // only one doc has "a"
+      assertFalse(tp.next());
+
+      IndexSearcher is = new IndexSearcher(r);
+    
+      SpanTermQuery stq1 = new SpanTermQuery(new Term("content", "a"));
+      SpanTermQuery stq2 = new SpanTermQuery(new Term("content", "k"));
+      SpanQuery[] sqs = { stq1, stq2 };
+      SpanNearQuery snq = new SpanNearQuery(sqs, 30, false);
+
+      count = 0;
+      boolean sawZero = false;
+      //System.out.println("\ngetPayloadSpans test");
+      PayloadSpans pspans = snq.getPayloadSpans(is.getIndexReader());
+      while (pspans.next()) {
+        //System.out.println(pspans.doc() + " - " + pspans.start() + " - "+ pspans.end());
+        Collection payloads = pspans.getPayload();
+        sawZero |= pspans.start() == 0;
+        for (Iterator it = payloads.iterator(); it.hasNext();) {
+          count++;
+          it.next();
+          //System.out.println(new String((byte[]) it.next()));
+        }
+      }
+      assertEquals(5, count);
+      assertTrue(sawZero);
+
+      //System.out.println("\ngetSpans test");
+      Spans spans = snq.getSpans(is.getIndexReader());
+      count = 0;
+      sawZero = false;
+      while (spans.next()) {
+        count++;
+        sawZero |= spans.start() == 0;
+        //System.out.println(spans.doc() + " - " + spans.start() + " - " + spans.end());
+      }
+      assertEquals(4, count);
+      assertTrue(sawZero);
+		
+      //System.out.println("\nPayloadSpanUtil test");
+
+      sawZero = false;
+      PayloadSpanUtil psu = new PayloadSpanUtil(is.getIndexReader());
+      Collection pls = psu.getPayloadsForQuery(snq);
+      count = pls.size();
+      for (Iterator it = pls.iterator(); it.hasNext();) {
+        String s = new String((byte[]) it.next());
+        //System.out.println(s);
+        sawZero |= s.equals("pos: 0");
+      }
+      assertEquals(5, count);
+      assertTrue(sawZero);
+      writer.close();
+      is.getIndexReader().close();
+      dir.close();
+    }
+  }
+}
+
+class TestPayloadAnalyzer extends Analyzer {
+
+  public TokenStream tokenStream(String fieldName, Reader reader) {
+    TokenStream result = new LowerCaseTokenizer(reader);
+    return new PayloadFilter(result, fieldName);
+  }
+}
+
+class PayloadFilter extends TokenFilter {
+  String fieldName;
+
+  int pos;
+
+  int i;
+
+  final PositionIncrementAttribute posIncrAttr;
+  final PayloadAttribute payloadAttr;
+  final TermAttribute termAttr;
+
+  public PayloadFilter(TokenStream input, String fieldName) {
+    super(input);
+    this.fieldName = fieldName;
+    pos = 0;
+    i = 0;
+    posIncrAttr = (PositionIncrementAttribute) input.addAttribute(PositionIncrementAttribute.class);
+    payloadAttr = (PayloadAttribute) input.addAttribute(PayloadAttribute.class);
+    termAttr = (TermAttribute) input.addAttribute(TermAttribute.class);
+  }
+
+  public boolean incrementToken() throws IOException {
+    if (input.incrementToken()) {
+      payloadAttr.setPayload(new Payload(("pos: " + pos).getBytes()));
+      int posIncr;
+      if (i % 2 == 1) {
+        posIncr = 1;
+      } else {
+        posIncr = 0;
+      }
+      posIncrAttr.setPositionIncrement(posIncr);
+      pos += posIncr;
+      // System.out.println("term=" + termAttr.term() + " pos=" + pos);
+      i++;
+      return true;
+    } else {
+      return false;
+    }
+  }
 }



Mime
View raw message