lucene-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From rm...@apache.org
Subject svn commit: r1334448 - in /lucene/dev/trunk/lucene: core/src/java/org/apache/lucene/codecs/pulsing/ core/src/test/org/apache/lucene/index/ test-framework/src/java/org/apache/lucene/util/
Date Sat, 05 May 2012 16:53:16 GMT
Author: rmuir
Date: Sat May  5 16:53:16 2012
New Revision: 1334448

URL: http://svn.apache.org/viewvc?rev=1334448&view=rev
Log:
LUCENE-4031: support offsets in Pulsing

Modified:
    lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/pulsing/PulsingPostingsReader.java
    lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/pulsing/PulsingPostingsWriter.java
    lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/index/TestPostingsOffsets.java
    lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/util/_TestUtil.java

Modified: lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/pulsing/PulsingPostingsReader.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/pulsing/PulsingPostingsReader.java?rev=1334448&r1=1334447&r2=1334448&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/pulsing/PulsingPostingsReader.java (original)
+++ lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/pulsing/PulsingPostingsReader.java Sat May  5 16:53:16 2012
@@ -148,7 +148,7 @@ public class PulsingPostingsReader exten
     PulsingTermState termState = (PulsingTermState) _termState;
 
     // if we have positions, its total TF, otherwise its computed based on docFreq.
-    long count = fieldInfo.indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS ? termState.totalTermFreq : termState.docFreq;
+    long count = fieldInfo.indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0 ? termState.totalTermFreq : termState.docFreq;
     //System.out.println("  count=" + count + " threshold=" + maxPositions);
 
     if (count <= maxPositions) {
@@ -217,7 +217,11 @@ public class PulsingPostingsReader exten
   @Override
   public DocsAndPositionsEnum docsAndPositions(FieldInfo field, BlockTermState _termState, Bits liveDocs, DocsAndPositionsEnum reuse,
                                                boolean needsOffsets) throws IOException {
-    //System.out.println("D&P: field=" + field.name);
+    if (field.indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) < 0) {
+      return null;
+    } else if (needsOffsets && field.indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) < 0) {
+      return null;
+    }
 
     final PulsingTermState termState = (PulsingTermState) _termState;
 
@@ -258,6 +262,7 @@ public class PulsingPostingsReader exten
     private final ByteArrayDataInput postings = new ByteArrayDataInput();
     private final IndexOptions indexOptions;
     private final boolean storePayloads;
+    private final boolean storeOffsets;
     private Bits liveDocs;
     private int docID = -1;
     private int accum;
@@ -267,6 +272,7 @@ public class PulsingPostingsReader exten
     public PulsingDocsEnum(FieldInfo fieldInfo) {
       indexOptions = fieldInfo.indexOptions;
       storePayloads = fieldInfo.storePayloads;
+      storeOffsets = fieldInfo.indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
     }
 
     public PulsingDocsEnum reset(Bits liveDocs, PulsingTermState termState) {
@@ -314,7 +320,7 @@ public class PulsingPostingsReader exten
             freq = postings.readVInt();     // else read freq
           }
 
-          if (indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) {
+          if (indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0) {
             // Skip positions
             if (storePayloads) {
               for(int pos=0;pos<freq;pos++) {
@@ -322,6 +328,10 @@ public class PulsingPostingsReader exten
                 if ((posCode & 1) != 0) {
                   payloadLength = postings.readVInt();
                 }
+                if (storeOffsets && (postings.readVInt() & 1) != 0) {
+                  // new offset length
+                  postings.readVInt();
+                }
                 if (payloadLength != 0) {
                   postings.skipBytes(payloadLength);
                 }
@@ -330,6 +340,10 @@ public class PulsingPostingsReader exten
               for(int pos=0;pos<freq;pos++) {
                 // TODO: skipVInt
                 postings.readVInt();
+                if (storeOffsets && (postings.readVInt() & 1) != 0) {
+                  // new offset length
+                  postings.readVInt();
+                }
               }
             }
           }
@@ -367,6 +381,10 @@ public class PulsingPostingsReader exten
     private byte[] postingsBytes;
     private final ByteArrayDataInput postings = new ByteArrayDataInput();
     private final boolean storePayloads;
+    private final boolean storeOffsets;
+    // note: we could actually reuse across different options, if we passed this to reset()
+    // and re-init'ed storeOffsets accordingly (made it non-final)
+    private final IndexOptions indexOptions;
 
     private Bits liveDocs;
     private int docID = -1;
@@ -376,15 +394,19 @@ public class PulsingPostingsReader exten
     private int position;
     private int payloadLength;
     private BytesRef payload;
+    private int startOffset;
+    private int offsetLength;
 
     private boolean payloadRetrieved;
 
     public PulsingDocsAndPositionsEnum(FieldInfo fieldInfo) {
+      indexOptions = fieldInfo.indexOptions;
       storePayloads = fieldInfo.storePayloads;
+      storeOffsets = fieldInfo.indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
     }
 
     boolean canReuse(FieldInfo fieldInfo) {
-      return storePayloads == fieldInfo.storePayloads;
+      return indexOptions == fieldInfo.indexOptions && storePayloads == fieldInfo.storePayloads;
     }
 
     public PulsingDocsAndPositionsEnum reset(Bits liveDocs, PulsingTermState termState) {
@@ -401,6 +423,8 @@ public class PulsingPostingsReader exten
       posPending = 0;
       docID = -1;
       accum = 0;
+      startOffset = storeOffsets ? 0 : -1; // always return -1 if no offsets are stored
+      offsetLength = 0;
       //System.out.println("PR d&p reset storesPayloads=" + storePayloads + " bytes=" + bytes.length + " this=" + this);
       return this;
     }
@@ -427,6 +451,7 @@ public class PulsingPostingsReader exten
           freq = postings.readVInt();     // else read freq
         }
         posPending = freq;
+        startOffset = storeOffsets ? 0 : -1; // always return -1 if no offsets are stored
 
         if (liveDocs == null || liveDocs.get(accum)) {
           //System.out.println("  return docID=" + docID + " freq=" + freq);
@@ -480,6 +505,15 @@ public class PulsingPostingsReader exten
       } else {
         position += postings.readVInt();
       }
+      
+      if (storeOffsets) {
+        int offsetCode = postings.readVInt();
+        if ((offsetCode & 1) != 0) {
+          // new offset length
+          offsetLength = postings.readVInt();
+        }
+        startOffset += offsetCode >>> 1;
+      }
 
       //System.out.println("PR d&p nextPos return pos=" + position + " this=" + this);
       return position;
@@ -487,12 +521,12 @@ public class PulsingPostingsReader exten
 
     @Override
     public int startOffset() {
-      return -1;
+      return startOffset;
     }
 
     @Override
     public int endOffset() {
-      return -1;
+      return startOffset + offsetLength;
     }
 
     private void skipPositions() throws IOException {

Modified: lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/pulsing/PulsingPostingsWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/pulsing/PulsingPostingsWriter.java?rev=1334448&r1=1334447&r2=1334448&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/pulsing/PulsingPostingsWriter.java (original)
+++ lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/pulsing/PulsingPostingsWriter.java Sat May  5 16:53:16 2012
@@ -79,6 +79,8 @@ public final class PulsingPostingsWriter
     int termFreq;                                 // only incremented on first position for a given doc
     int pos;
     int docID;
+    int startOffset;
+    int endOffset;
   }
 
   // TODO: -- lazy init this?  ie, if every single term
@@ -123,9 +125,6 @@ public final class PulsingPostingsWriter
   @Override
   public void setField(FieldInfo fieldInfo) {
     this.indexOptions = fieldInfo.indexOptions;
-    if (indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0) {
-      throw new UnsupportedOperationException("this codec cannot index offsets: " + indexOptions);
-    }
     if (DEBUG) System.out.println("PW field=" + fieldInfo.name + " indexOptions=" + indexOptions);
     storePayloads = fieldInfo.storePayloads;
     wrappedPostingsWriter.setField(fieldInfo);
@@ -186,11 +185,13 @@ public final class PulsingPostingsWriter
     if (pendingCount == -1) {
       // We've already seen too many docs for this term --
       // just forward to our fallback writer
-      wrappedPostingsWriter.addPosition(position, payload, -1, -1);
+      wrappedPostingsWriter.addPosition(position, payload, startOffset, endOffset);
     } else {
       // buffer up
       final Position pos = pending[pendingCount++];
       pos.pos = position;
+      pos.startOffset = startOffset;
+      pos.endOffset = endOffset;
       pos.docID = currentDoc.docID;
       if (payload != null && payload.length > 0) {
         if (pos.payload == null) {
@@ -240,10 +241,11 @@ public final class PulsingPostingsWriter
       // given codec wants to store other interesting
       // stuff, it could use this pulsing codec to do so
 
-      if (indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) {
+      if (indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0) {
         int lastDocID = 0;
         int pendingIDX = 0;
         int lastPayloadLength = -1;
+        int lastOffsetLength = -1;
         while(pendingIDX < pendingCount) {
           final Position doc = pending[pendingIDX];
 
@@ -260,14 +262,15 @@ public final class PulsingPostingsWriter
           }
 
           int lastPos = 0;
+          int lastOffset = 0;
           for(int posIDX=0;posIDX<doc.termFreq;posIDX++) {
             final Position pos = pending[pendingIDX++];
             assert pos.docID == doc.docID;
             final int posDelta = pos.pos - lastPos;
             lastPos = pos.pos;
             if (DEBUG) System.out.println("    write pos=" + pos.pos);
+            final int payloadLength = pos.payload == null ? 0 : pos.payload.length;
             if (storePayloads) {
-              final int payloadLength = pos.payload == null ? 0 : pos.payload.length;
               if (payloadLength != lastPayloadLength) {
                 buffer.writeVInt((posDelta << 1)|1);
                 buffer.writeVInt(payloadLength);
@@ -275,12 +278,28 @@ public final class PulsingPostingsWriter
               } else {
                 buffer.writeVInt(posDelta << 1);
               }
-              if (payloadLength > 0) {
-                buffer.writeBytes(pos.payload.bytes, 0, pos.payload.length);
-              }
             } else {
               buffer.writeVInt(posDelta);
             }
+            
+            if (indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0) {
+              //System.out.println("write=" + pos.startOffset + "," + pos.endOffset);
+              int offsetDelta = pos.startOffset - lastOffset;
+              int offsetLength = pos.endOffset - pos.startOffset;
+              if (offsetLength != lastOffsetLength) {
+                buffer.writeVInt(offsetDelta << 1 | 1);
+                buffer.writeVInt(offsetLength);
+              } else {
+                buffer.writeVInt(offsetDelta << 1);
+              }
+              lastOffset = pos.startOffset;
+              lastOffsetLength = offsetLength;             
+            }
+            
+            if (payloadLength > 0) {
+              assert storePayloads;
+              buffer.writeBytes(pos.payload.bytes, 0, pos.payload.length);
+            }
           }
         }
       } else if (indexOptions == IndexOptions.DOCS_AND_FREQS) {
@@ -387,7 +406,7 @@ public final class PulsingPostingsWriter
           wrappedPostingsWriter.startDoc(doc.docID, doc.termFreq);
         }
         if (DEBUG) System.out.println("PW:   wrapped.addPos pos=" + pos.pos);
-        wrappedPostingsWriter.addPosition(pos.pos, pos.payload, -1, -1);
+        wrappedPostingsWriter.addPosition(pos.pos, pos.payload, pos.startOffset, pos.endOffset);
       }
       //wrappedPostingsWriter.finishDoc();
     } else {

Modified: lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/index/TestPostingsOffsets.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/index/TestPostingsOffsets.java?rev=1334448&r1=1334447&r2=1334448&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/index/TestPostingsOffsets.java (original)
+++ lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/index/TestPostingsOffsets.java Sat May  5 16:53:16 2012
@@ -30,6 +30,8 @@ import org.apache.lucene.analysis.Token;
 import org.apache.lucene.codecs.Codec;
 import org.apache.lucene.codecs.lucene40.Lucene40PostingsFormat;
 import org.apache.lucene.codecs.memory.MemoryPostingsFormat;
+import org.apache.lucene.codecs.nestedpulsing.NestedPulsingPostingsFormat;
+import org.apache.lucene.codecs.pulsing.Pulsing40PostingsFormat;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
 import org.apache.lucene.document.FieldType;
@@ -56,11 +58,13 @@ public class TestPostingsOffsets extends
     iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random()));
     
     if (Codec.getDefault().getName().equals("Lucene40")) {
-      // pulsing etc are not implemented
-      if (random().nextBoolean()) {
-        iwc.setCodec(_TestUtil.alwaysPostingsFormat(new Lucene40PostingsFormat()));
-      } else {
-        iwc.setCodec(_TestUtil.alwaysPostingsFormat(new MemoryPostingsFormat()));
+      // sep etc are not implemented
+      switch(random().nextInt(4)) {
+        case 0: iwc.setCodec(_TestUtil.alwaysPostingsFormat(new Lucene40PostingsFormat())); break;
+        case 1: iwc.setCodec(_TestUtil.alwaysPostingsFormat(new MemoryPostingsFormat())); break;
+        case 2: iwc.setCodec(_TestUtil.alwaysPostingsFormat(
+            new Pulsing40PostingsFormat(_TestUtil.nextInt(random(), 1, 3)))); break;
+        case 3: iwc.setCodec(_TestUtil.alwaysPostingsFormat(new NestedPulsingPostingsFormat())); break;
       }
     }
   }
@@ -73,6 +77,11 @@ public class TestPostingsOffsets extends
 
     FieldType ft = new FieldType(TextField.TYPE_UNSTORED);
     ft.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
+    if (random().nextBoolean()) {
+      ft.setStoreTermVectors(true);
+      ft.setStoreTermVectorPositions(random().nextBoolean());
+      ft.setStoreTermVectorOffsets(random().nextBoolean());
+    }
     Token[] tokens = new Token[] {
       makeToken("a", 1, 0, 6),
       makeToken("b", 1, 8, 9),
@@ -132,11 +141,13 @@ public class TestPostingsOffsets extends
     Analyzer analyzer = withPayloads ? new MockPayloadAnalyzer() : new MockAnalyzer(random());
     iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, analyzer);
     if (Codec.getDefault().getName().equals("Lucene40")) {
-      // pulsing etc are not implemented
-      if (random().nextBoolean()) {
-        iwc.setCodec(_TestUtil.alwaysPostingsFormat(new Lucene40PostingsFormat()));
-      } else {
-        iwc.setCodec(_TestUtil.alwaysPostingsFormat(new MemoryPostingsFormat()));
+      // sep etc are not implemented
+      switch(random().nextInt(4)) {
+        case 0: iwc.setCodec(_TestUtil.alwaysPostingsFormat(new Lucene40PostingsFormat())); break;
+        case 1: iwc.setCodec(_TestUtil.alwaysPostingsFormat(new MemoryPostingsFormat())); break;
+        case 2: iwc.setCodec(_TestUtil.alwaysPostingsFormat(
+            new Pulsing40PostingsFormat(_TestUtil.nextInt(random(), 1, 3)))); break;
+        case 3: iwc.setCodec(_TestUtil.alwaysPostingsFormat(new NestedPulsingPostingsFormat())); break;
       }
     }
     iwc.setMergePolicy(newLogMergePolicy()); // will rely on docids a bit for skipping

Modified: lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/util/_TestUtil.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/util/_TestUtil.java?rev=1334448&r1=1334447&r2=1334448&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/util/_TestUtil.java (original)
+++ lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/util/_TestUtil.java Sat May  5 16:53:16 2012
@@ -574,6 +574,12 @@ public class _TestUtil {
    *  default codecs and formats, but always writes in the specified
    *  format. */
   public static Codec alwaysPostingsFormat(final PostingsFormat format) {
+    // TODO: we really need for postings impls etc to announce themselves
+    // (and maybe their params, too) to infostream on flush and merge.
+    // otherwise in a real debugging situation we won't know whats going on!
+    if (LuceneTestCase.VERBOSE) {
+      System.out.println("forcing postings format to:" + format);
+    }
     return new Lucene40Codec() {
       @Override
       public PostingsFormat getPostingsFormatForField(String field) {



Mime
View raw message