lucene-java-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From busc...@apache.org
Subject svn commit: r518486 - in /lucene/java/trunk: ./ src/java/org/apache/lucene/analysis/ src/java/org/apache/lucene/index/ src/java/org/apache/lucene/store/ src/site/src/documentation/content/xdocs/ src/test/org/apache/lucene/index/ src/test/org/apache/luc...
Date Thu, 15 Mar 2007 05:15:45 GMT
Author: buschmi
Date: Wed Mar 14 22:15:43 2007
New Revision: 518486

URL: http://svn.apache.org/viewvc?view=rev&rev=518486
Log:
LUCENE-755: Added the ability to store arbitrary binary metadata (payloads) in the posting list.

Added:
    lucene/java/trunk/src/java/org/apache/lucene/index/Payload.java
    lucene/java/trunk/src/test/org/apache/lucene/index/TestPayloads.java
Modified:
    lucene/java/trunk/CHANGES.txt
    lucene/java/trunk/src/java/org/apache/lucene/analysis/Token.java
    lucene/java/trunk/src/java/org/apache/lucene/index/DocumentWriter.java
    lucene/java/trunk/src/java/org/apache/lucene/index/FieldInfo.java
    lucene/java/trunk/src/java/org/apache/lucene/index/FieldInfos.java
    lucene/java/trunk/src/java/org/apache/lucene/index/FilterIndexReader.java
    lucene/java/trunk/src/java/org/apache/lucene/index/IndexReader.java
    lucene/java/trunk/src/java/org/apache/lucene/index/MultiReader.java
    lucene/java/trunk/src/java/org/apache/lucene/index/MultipleTermPositions.java
    lucene/java/trunk/src/java/org/apache/lucene/index/ParallelReader.java
    lucene/java/trunk/src/java/org/apache/lucene/index/SegmentMerger.java
    lucene/java/trunk/src/java/org/apache/lucene/index/SegmentReader.java
    lucene/java/trunk/src/java/org/apache/lucene/index/SegmentTermDocs.java
    lucene/java/trunk/src/java/org/apache/lucene/index/SegmentTermPositions.java
    lucene/java/trunk/src/java/org/apache/lucene/index/TermPositions.java
    lucene/java/trunk/src/java/org/apache/lucene/store/BufferedIndexOutput.java
    lucene/java/trunk/src/java/org/apache/lucene/store/FSDirectory.java
    lucene/java/trunk/src/java/org/apache/lucene/store/IndexOutput.java
    lucene/java/trunk/src/java/org/apache/lucene/store/RAMOutputStream.java
    lucene/java/trunk/src/site/src/documentation/content/xdocs/fileformats.xml
    lucene/java/trunk/src/test/org/apache/lucene/store/MockRAMOutputStream.java

Modified: lucene/java/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/java/trunk/CHANGES.txt?view=diff&rev=518486&r1=518485&r2=518486
==============================================================================
--- lucene/java/trunk/CHANGES.txt (original)
+++ lucene/java/trunk/CHANGES.txt Wed Mar 14 22:15:43 2007
@@ -82,6 +82,13 @@
 
  2. LUCENE-822: Added FieldSelector capabilities to Searchable for use with RemoteSearcher, and other Searchable implementations. (Mark Miller, Grant Ingersoll)
 
+ 3. LUCENE-755: Added the ability to store arbitrary binary metadata in the posting list.
+    These metadata are called Payloads. For every position of a Token one Payload in the form
+    of a variable length byte array can be stored in the prox file.
+    Remark: The APIs introduced with this feature are in experimental state and thus
+            contain appropriate warnings in the javadocs.
+    (Michael Busch)
+
 Optimizations
 
  1. LUCENE-761: The proxStream is now cloned lazily in SegmentTermPositions

Modified: lucene/java/trunk/src/java/org/apache/lucene/analysis/Token.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/analysis/Token.java?view=diff&rev=518486&r1=518485&r2=518486
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/analysis/Token.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/analysis/Token.java Wed Mar 14 22:15:43 2007
@@ -1,5 +1,8 @@
 package org.apache.lucene.analysis;
 
+import org.apache.lucene.index.Payload;
+import org.apache.lucene.index.TermPositions;
+
 /**
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
@@ -20,23 +23,40 @@
 /** A Token is an occurence of a term from the text of a field.  It consists of
   a term's text, the start and end offset of the term in the text of the field,
   and a type string.
-
+  <p>
   The start and end offsets permit applications to re-associate a token with
   its source text, e.g., to display highlighted query terms in a document
   browser, or to show matching text fragments in a KWIC (KeyWord In Context)
   display, etc.
-
+  <p>
   The type is an interned string, assigned by a lexical analyzer
   (a.k.a. tokenizer), naming the lexical or syntactic class that the token
   belongs to.  For example an end of sentence marker token might be implemented
-  with type "eos".  The default token type is "word".  */
-
+  with type "eos".  The default token type is "word".  
+  <p>
+  A Token can optionally have metadata (a.k.a. Payload) in the form of a variable
+  length byte array. Use {@link TermPositions#getPayloadLength()} and 
+  {@link TermPositions#getPayload(byte[], int)} to retrieve the payloads from the index.
+  
+  <br><br>
+  <b>
+  Warning: The status of the Payloads feature is experimental. The APIs
+  introduced here might change in the future and will not be supported anymore
+  in such a case. If you want to use this feature in a production environment
+  you should wait for an official release.
+  </b> 
+
+  @see org.apache.lucene.index.Payload
+  */
+  // TODO: Remove warning after API has been finalized
 public class Token implements Cloneable {
   String termText;				  // the text of the term
   int startOffset;				  // start in source text
   int endOffset;				  // end in source text
   String type = "word";				  // lexical type
-
+  
+  Payload payload;
+  
   private int positionIncrement = 1;
 
   /** Constructs a Token with the given term text, and start & end offsets.
@@ -114,6 +134,36 @@
 
   /** Returns this Token's lexical type.  Defaults to "word". */
   public final String type() { return type; }
+
+  /** 
+   * Sets this Token's payload.<br>
+   * <br>
+   * <b>
+   * Warning: The status of the Payloads feature is experimental. The APIs
+   * introduced here might change in the future and will not be supported anymore
+   * in such a case. If you want to use this feature in a production environment
+   * you should wait for an official release.
+   * </b>  
+   */
+  // TODO: Remove warning after API has been finalized
+  public void setPayload(Payload payload) {
+    this.payload = payload;
+  }
+  
+  /** 
+   * Returns this Token's payload.<br> 
+   * <br>
+   * <b>
+   * Warning: The status of the Payloads feature is experimental. The APIs
+   * introduced here might change in the future and will not be supported anymore
+   * in such a case. If you want to use this feature in a production environment
+   * you should wait for an official release.
+   * </b>   
+   */
+  // TODO: Remove warning after API has been finalized
+  public Payload getPayload() {
+    return this.payload;
+  }
 
   public String toString() {
     StringBuffer sb = new StringBuffer();

Modified: lucene/java/trunk/src/java/org/apache/lucene/index/DocumentWriter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/index/DocumentWriter.java?view=diff&rev=518486&r1=518485&r2=518486
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/index/DocumentWriter.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/index/DocumentWriter.java Wed Mar 14 22:15:43 2007
@@ -31,6 +31,7 @@
 import java.io.Reader;
 import java.io.StringReader;
 import java.util.Arrays;
+import java.util.BitSet;
 import java.util.Enumeration;
 import java.util.Hashtable;
 import java.util.Iterator;
@@ -69,34 +70,41 @@
 
   final void addDocument(String segment, Document doc)
           throws CorruptIndexException, IOException {
-    // write field names
+    // create field infos
     fieldInfos = new FieldInfos();
     fieldInfos.add(doc);
-    fieldInfos.write(directory, segment + ".fnm");
-
-    // write field values
-    FieldsWriter fieldsWriter =
-            new FieldsWriter(directory, segment, fieldInfos);
-    try {
-      fieldsWriter.addDocument(doc);
-    } finally {
-      fieldsWriter.close();
-    }
-
+    
     // invert doc into postingTable
     postingTable.clear();			  // clear postingTable
     fieldLengths = new int[fieldInfos.size()];    // init fieldLengths
     fieldPositions = new int[fieldInfos.size()];  // init fieldPositions
     fieldOffsets = new int[fieldInfos.size()];    // init fieldOffsets
-
+    fieldStoresPayloads = new BitSet(fieldInfos.size());
+    
     fieldBoosts = new float[fieldInfos.size()];	  // init fieldBoosts
     Arrays.fill(fieldBoosts, doc.getBoost());
 
+    // Before we write the FieldInfos we invert the Document. The reason is that
+    // during inversion the TokenStreams of tokenized fields are being processed 
+    // and we might encounter tokens that have payloads associated with them. In 
+    // this case we have to update the FieldInfo of the particular field.
     invertDocument(doc);
 
     // sort postingTable into an array
     Posting[] postings = sortPostingTable();
+    
+    // write field infos 
+    fieldInfos.write(directory, segment + ".fnm");
 
+    // write field values
+    FieldsWriter fieldsWriter =
+            new FieldsWriter(directory, segment, fieldInfos);
+    try {
+      fieldsWriter.addDocument(doc);
+    } finally {
+      fieldsWriter.close();
+    }
+    
     /*
     for (int i = 0; i < postings.length; i++) {
       Posting posting = postings[i];
@@ -125,6 +133,10 @@
   private int[] fieldPositions;
   private int[] fieldOffsets;
   private float[] fieldBoosts;
+  
+  // If any of the tokens of a particular field carry a payload
+  // then we enable payloads for that field. 
+  private BitSet fieldStoresPayloads;
 
   // Tokenizes the fields of a document into Postings.
   private final void invertDocument(Document doc)
@@ -144,9 +156,9 @@
         if (!field.isTokenized()) {		  // un-tokenized field
           String stringValue = field.stringValue();
           if(field.isStoreOffsetWithTermVector())
-            addPosition(fieldName, stringValue, position++, new TermVectorOffsetInfo(offset, offset + stringValue.length()));
+            addPosition(fieldName, stringValue, position++, null, new TermVectorOffsetInfo(offset, offset + stringValue.length()));
           else
-            addPosition(fieldName, stringValue, position++, null);
+            addPosition(fieldName, stringValue, position++, null, null);
           offset += stringValue.length();
           length++;
         } else 
@@ -167,10 +179,19 @@
             for (Token t = stream.next(); t != null; t = stream.next()) {
               position += (t.getPositionIncrement() - 1);
               
-              if(field.isStoreOffsetWithTermVector())
-                addPosition(fieldName, t.termText(), position++, new TermVectorOffsetInfo(offset + t.startOffset(), offset + t.endOffset()));
-              else
-                addPosition(fieldName, t.termText(), position++, null);
+              Payload payload = t.getPayload();
+              if (payload != null) {
+                // enable payloads for this field
+              	fieldStoresPayloads.set(fieldNumber);
+              }
+              
+              TermVectorOffsetInfo termVectorOffsetInfo;
+              if (field.isStoreOffsetWithTermVector()) {
+                termVectorOffsetInfo = new TermVectorOffsetInfo(offset + t.startOffset(), offset + t.endOffset());
+              } else {
+                termVectorOffsetInfo = null;
+              }
+              addPosition(fieldName, t.termText(), position++, payload, termVectorOffsetInfo);
               
               lastToken = t;
               if (++length >= maxFieldLength) {
@@ -194,11 +215,16 @@
         fieldOffsets[fieldNumber] = offset;
       }
     }
+    
+    // update fieldInfos for all fields that have one or more tokens with payloads
+    for (int i = fieldStoresPayloads.nextSetBit(0); i >= 0; i = fieldStoresPayloads.nextSetBit(i+1)) { 
+    	fieldInfos.fieldInfo(i).storePayloads = true;
+    }
   }
 
   private final Term termBuffer = new Term("", ""); // avoid consing
 
-  private final void addPosition(String field, String text, int position, TermVectorOffsetInfo offset) {
+  private final void addPosition(String field, String text, int position, Payload payload, TermVectorOffsetInfo offset) {
     termBuffer.set(field, text);
     //System.out.println("Offset: " + offset);
     Posting ti = (Posting) postingTable.get(termBuffer);
@@ -209,9 +235,25 @@
         int[] positions = ti.positions;
         System.arraycopy(positions, 0, newPositions, 0, freq);
         ti.positions = newPositions;
+        
+        if (ti.payloads != null) {
+          // the current field stores payloads
+          Payload[] newPayloads = new Payload[freq * 2];  // grow payloads array
+          Payload[] payloads = ti.payloads;
+          System.arraycopy(payloads, 0, newPayloads, 0, payloads.length);
+          ti.payloads = newPayloads;
+        }
       }
       ti.positions[freq] = position;		  // add new position
 
+      if (payload != null) {
+        if (ti.payloads == null) {
+          // lazily allocate payload array
+          ti.payloads = new Payload[ti.positions.length];
+        }
+        ti.payloads[freq] = payload;
+      }
+      
       if (offset != null) {
         if (ti.offsets.length == freq){
           TermVectorOffsetInfo [] newOffsets = new TermVectorOffsetInfo[freq*2];
@@ -224,7 +266,7 @@
       ti.freq = freq + 1;			  // update frequency
     } else {					  // word not seen before
       Term term = new Term(field, text, false);
-      postingTable.put(term, new Posting(term, position, offset));
+      postingTable.put(term, new Posting(term, position, payload, offset));
     }
   }
 
@@ -307,10 +349,31 @@
                                 termIndexInterval);
       TermInfo ti = new TermInfo();
       String currentField = null;
-
+      boolean currentFieldHasPayloads = false;
+      
       for (int i = 0; i < postings.length; i++) {
         Posting posting = postings[i];
 
+        // check to see if we switched to a new field
+        String termField = posting.term.field();
+        if (currentField != termField) {
+          // changing field - see if there is something to save
+          currentField = termField;
+          FieldInfo fi = fieldInfos.fieldInfo(currentField);
+          currentFieldHasPayloads = fi.storePayloads;
+          if (fi.storeTermVector) {
+            if (termVectorWriter == null) {
+              termVectorWriter =
+                new TermVectorsWriter(directory, segment, fieldInfos);
+              termVectorWriter.openDocument();
+            }
+            termVectorWriter.openField(currentField);
+
+          } else if (termVectorWriter != null) {
+            termVectorWriter.closeField();
+          }
+        }
+        
         // add an entry to the dictionary with pointers to prox and freq files
         ti.set(1, freq.getFilePointer(), prox.getFilePointer(), -1);
         tis.add(posting.term, ti);
@@ -326,28 +389,62 @@
 
         int lastPosition = 0;			  // write positions
         int[] positions = posting.positions;
+        Payload[] payloads = posting.payloads;
+        int lastPayloadLength = -1;
+        
+        
+        // The following encoding is being used for positions and payloads:
+        // Case 1: current field does not store payloads
+        //           Positions     -> <PositionDelta>^freq
+        //           PositionDelta -> VInt
+        //         The PositionDelta is the difference between the current
+        //         and the previous position
+        // Case 2: current field stores payloads
+        //           Positions     -> <PositionDelta, Payload>^freq
+        //           Payload       ->  <PayloadLength?, PayloadData>
+        //           PositionDelta -> VInt
+        //           PayloadLength -> VInt
+        //           PayloadData   -> byte^PayloadLength
+        //         In this case PositionDelta/2 is the difference between
+        //         the current and the previous position. If PositionDelta
+        //         is odd, then a PayloadLength encoded as VInt follows,
+        //         if PositionDelta is even, then it is assumed that the
+        //         length of the current Payload equals the length of the
+        //         previous Payload.        
         for (int j = 0; j < postingFreq; j++) {		  // use delta-encoding
           int position = positions[j];
-          prox.writeVInt(position - lastPosition);
-          lastPosition = position;
-        }
-        // check to see if we switched to a new field
-        String termField = posting.term.field();
-        if (currentField != termField) {
-          // changing field - see if there is something to save
-          currentField = termField;
-          FieldInfo fi = fieldInfos.fieldInfo(currentField);
-          if (fi.storeTermVector) {
-            if (termVectorWriter == null) {
-              termVectorWriter =
-                new TermVectorsWriter(directory, segment, fieldInfos);
-              termVectorWriter.openDocument();
+          int delta = position - lastPosition;
+          if (currentFieldHasPayloads) {
+            int payloadLength = 0;
+            Payload payload = null;
+            if (payloads != null) {
+              payload = payloads[j];
+              if (payload != null) {
+                payloadLength = payload.length;
+              }
             }
-            termVectorWriter.openField(currentField);
-
-          } else if (termVectorWriter != null) {
-            termVectorWriter.closeField();
+            if (payloadLength == lastPayloadLength) {
+            	// the length of the current payload equals the length
+            	// of the previous one. So we do not have to store the length
+            	// again and we only shift the position delta by one bit
+              prox.writeVInt(delta * 2);
+            } else {
+            	// the length of the current payload is different from the
+            	// previous one. We shift the position delta, set the lowest
+            	// bit and store the current payload length as VInt.
+              prox.writeVInt(delta * 2 + 1);
+              prox.writeVInt(payloadLength);
+              lastPayloadLength = payloadLength;
+            }
+            if (payloadLength > 0) {
+            	// write current payload
+              prox.writeBytes(payload.data, payload.offset, payload.length);
+            }
+          } else {
+          	// field does not store payloads, just write position delta as VInt
+            prox.writeVInt(delta);
           }
+          lastPosition = position;
         }
         if (termVectorWriter != null && termVectorWriter.isFieldOpen()) {
             termVectorWriter.addTerm(posting.term.text(), postingFreq, posting.positions, posting.offsets);
@@ -397,18 +494,27 @@
   Term term;					  // the Term
   int freq;					  // its frequency in doc
   int[] positions;				  // positions it occurs at
+  Payload[] payloads; // the payloads of the terms
   TermVectorOffsetInfo [] offsets;
+  
 
-  Posting(Term t, int position, TermVectorOffsetInfo offset) {
+  Posting(Term t, int position, Payload payload, TermVectorOffsetInfo offset) {
     term = t;
     freq = 1;
     positions = new int[1];
     positions[0] = position;
+    
+    if (payload != null) {
+      payloads = new Payload[1];
+      payloads[0] = payload;
+    } else 
+      payloads = null;    
+    
+
     if(offset != null){
-    offsets = new TermVectorOffsetInfo[1];
-    offsets[0] = offset;
-    }
-    else
+      offsets = new TermVectorOffsetInfo[1];
+      offsets[0] = offset;
+    } else
       offsets = null;
   }
 }

Modified: lucene/java/trunk/src/java/org/apache/lucene/index/FieldInfo.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/index/FieldInfo.java?view=diff&rev=518486&r1=518485&r2=518486
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/index/FieldInfo.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/index/FieldInfo.java Wed Mar 14 22:15:43 2007
@@ -28,9 +28,12 @@
   boolean storePositionWithTermVector;
 
   boolean omitNorms; // omit norms associated with indexed fields
+  
+  boolean storePayloads; // whether this field stores payloads together with term positions
 
   FieldInfo(String na, boolean tk, int nu, boolean storeTermVector, 
-            boolean storePositionWithTermVector,  boolean storeOffsetWithTermVector, boolean omitNorms) {
+            boolean storePositionWithTermVector,  boolean storeOffsetWithTermVector, 
+            boolean omitNorms, boolean storePayloads) {
     name = na;
     isIndexed = tk;
     number = nu;
@@ -38,5 +41,6 @@
     this.storeOffsetWithTermVector = storeOffsetWithTermVector;
     this.storePositionWithTermVector = storePositionWithTermVector;
     this.omitNorms = omitNorms;
+    this.storePayloads = storePayloads;
   }
 }

Modified: lucene/java/trunk/src/java/org/apache/lucene/index/FieldInfos.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/index/FieldInfos.java?view=diff&rev=518486&r1=518485&r2=518486
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/index/FieldInfos.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/index/FieldInfos.java Wed Mar 14 22:15:43 2007
@@ -39,6 +39,7 @@
   static final byte STORE_POSITIONS_WITH_TERMVECTOR = 0x4;
   static final byte STORE_OFFSET_WITH_TERMVECTOR = 0x8;
   static final byte OMIT_NORMS = 0x10;
+  static final byte STORE_PAYLOADS = 0x20;
   
   private ArrayList byNumber = new ArrayList();
   private HashMap byName = new HashMap();
@@ -156,9 +157,29 @@
    */
   public void add(String name, boolean isIndexed, boolean storeTermVector,
                   boolean storePositionWithTermVector, boolean storeOffsetWithTermVector, boolean omitNorms) {
+    add(name, isIndexed, storeTermVector, storePositionWithTermVector,
+        storeOffsetWithTermVector, omitNorms, false);
+  }
+  
+  /** If the field is not yet known, adds it. If it is known, checks to make
+   *  sure that the isIndexed flag is the same as was given previously for this
+   *  field. If not - marks it as being indexed.  Same goes for the TermVector
+   * parameters.
+   *
+   * @param name The name of the field
+   * @param isIndexed true if the field is indexed
+   * @param storeTermVector true if the term vector should be stored
+   * @param storePositionWithTermVector true if the term vector with positions should be stored
+   * @param storeOffsetWithTermVector true if the term vector with offsets should be stored
+   * @param omitNorms true if the norms for the indexed field should be omitted
+   * @param storePayloads true if payloads should be stored for this field
+   */
+  public void add(String name, boolean isIndexed, boolean storeTermVector,
+                  boolean storePositionWithTermVector, boolean storeOffsetWithTermVector,
+                  boolean omitNorms, boolean storePayloads) {
     FieldInfo fi = fieldInfo(name);
     if (fi == null) {
-      addInternal(name, isIndexed, storeTermVector, storePositionWithTermVector, storeOffsetWithTermVector, omitNorms);
+      addInternal(name, isIndexed, storeTermVector, storePositionWithTermVector, storeOffsetWithTermVector, omitNorms, storePayloads);
     } else {
       if (fi.isIndexed != isIndexed) {
         fi.isIndexed = true;                      // once indexed, always index
@@ -175,6 +196,9 @@
       if (fi.omitNorms != omitNorms) {
         fi.omitNorms = false;                // once norms are stored, always store
       }
+      if (fi.storePayloads != storePayloads) {
+        fi.storePayloads = true;
+      }
 
     }
   }
@@ -182,10 +206,10 @@
 
   private void addInternal(String name, boolean isIndexed,
                            boolean storeTermVector, boolean storePositionWithTermVector, 
-                           boolean storeOffsetWithTermVector, boolean omitNorms) {
+                           boolean storeOffsetWithTermVector, boolean omitNorms, boolean storePayloads) {
     FieldInfo fi =
       new FieldInfo(name, isIndexed, byNumber.size(), storeTermVector, storePositionWithTermVector,
-              storeOffsetWithTermVector, omitNorms);
+              storeOffsetWithTermVector, omitNorms, storePayloads);
     byNumber.add(fi);
     byName.put(name, fi);
   }
@@ -271,6 +295,7 @@
       if (fi.storePositionWithTermVector) bits |= STORE_POSITIONS_WITH_TERMVECTOR;
       if (fi.storeOffsetWithTermVector) bits |= STORE_OFFSET_WITH_TERMVECTOR;
       if (fi.omitNorms) bits |= OMIT_NORMS;
+      if (fi.storePayloads) bits |= STORE_PAYLOADS;
       output.writeString(fi.name);
       output.writeByte(bits);
     }
@@ -286,8 +311,9 @@
       boolean storePositionsWithTermVector = (bits & STORE_POSITIONS_WITH_TERMVECTOR) != 0;
       boolean storeOffsetWithTermVector = (bits & STORE_OFFSET_WITH_TERMVECTOR) != 0;
       boolean omitNorms = (bits & OMIT_NORMS) != 0;
-
-      addInternal(name, isIndexed, storeTermVector, storePositionsWithTermVector, storeOffsetWithTermVector, omitNorms);
+      boolean storePayloads = (bits & STORE_PAYLOADS) != 0;
+      
+      addInternal(name, isIndexed, storeTermVector, storePositionsWithTermVector, storeOffsetWithTermVector, omitNorms, storePayloads);
     }    
   }
 

Modified: lucene/java/trunk/src/java/org/apache/lucene/index/FilterIndexReader.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/index/FilterIndexReader.java?view=diff&rev=518486&r1=518485&r2=518486
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/index/FilterIndexReader.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/index/FilterIndexReader.java Wed Mar 14 22:15:43 2007
@@ -62,6 +62,14 @@
     public int nextPosition() throws IOException {
       return ((TermPositions) this.in).nextPosition();
     }
+    
+    public int getPayloadLength() {
+      return ((TermPositions) this.in).getPayloadLength();
+    }
+
+    public byte[] getPayload(byte[] data, int offset) throws IOException {
+      return ((TermPositions) this.in).getPayload(data, offset);
+    }
   }
 
   /** Base class for filtering {@link TermEnum} implementations. */

Modified: lucene/java/trunk/src/java/org/apache/lucene/index/IndexReader.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/index/IndexReader.java?view=diff&rev=518486&r1=518485&r2=518486
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/index/IndexReader.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/index/IndexReader.java Wed Mar 14 22:15:43 2007
@@ -67,6 +67,8 @@
     public static final FieldOption ALL = new FieldOption ("ALL");
     // all indexed fields
     public static final FieldOption INDEXED = new FieldOption ("INDEXED");
+    // all fields that store payloads
+    public static final FieldOption STORES_PAYLOADS = new FieldOption ("STORES_PAYLOADS");
     // all fields which are not indexed
     public static final FieldOption UNINDEXED = new FieldOption ("UNINDEXED");
     // all fields which are indexed with termvectors enables

Modified: lucene/java/trunk/src/java/org/apache/lucene/index/MultiReader.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/index/MultiReader.java?view=diff&rev=518486&r1=518485&r2=518486
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/index/MultiReader.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/index/MultiReader.java Wed Mar 14 22:15:43 2007
@@ -455,5 +455,12 @@
   public int nextPosition() throws IOException {
     return ((TermPositions)current).nextPosition();
   }
-
+  
+  public int getPayloadLength() {
+    return ((TermPositions)current).getPayloadLength();
+  }
+   
+  public byte[] getPayload(byte[] data, int offset) throws IOException {
+    return ((TermPositions)current).getPayload(data, offset);
+  }
 }

Modified: lucene/java/trunk/src/java/org/apache/lucene/index/MultipleTermPositions.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/index/MultipleTermPositions.java?view=diff&rev=518486&r1=518485&r2=518486
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/index/MultipleTermPositions.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/index/MultipleTermPositions.java Wed Mar 14 22:15:43 2007
@@ -191,5 +191,23 @@
   public int read(int[] arg0, int[] arg1) throws IOException {
     throw new UnsupportedOperationException();
   }
+  
+  
+  /**
+   * Not implemented.
+   * @throws UnsupportedOperationException
+   */
+  public int getPayloadLength() {
+    throw new UnsupportedOperationException();
+  }
+   
+  /**
+   * Not implemented.
+   * @throws UnsupportedOperationException
+   */
+  public byte[] getPayload(byte[] data, int offset) throws IOException {
+    throw new UnsupportedOperationException();
+  }
+
 
 }

Modified: lucene/java/trunk/src/java/org/apache/lucene/index/ParallelReader.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/index/ParallelReader.java?view=diff&rev=518486&r1=518485&r2=518486
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/index/ParallelReader.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/index/ParallelReader.java Wed Mar 14 22:15:43 2007
@@ -419,7 +419,15 @@
       return ((TermPositions)termDocs).nextPosition();
     }
 
+    public int getPayloadLength() {
+      return ((TermPositions)termDocs).getPayloadLength();
+    }
+
+    public byte[] getPayload(byte[] data, int offset) throws IOException {
+      return ((TermPositions)termDocs).getPayload(data, offset);
+    }
   }
 
 }
+
 

Added: lucene/java/trunk/src/java/org/apache/lucene/index/Payload.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/index/Payload.java?view=auto&rev=518486
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/index/Payload.java (added)
+++ lucene/java/trunk/src/java/org/apache/lucene/index/Payload.java Wed Mar 14 22:15:43 2007
@@ -0,0 +1,114 @@
+package org.apache.lucene.index;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.Serializable;
+
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenStream;
+
+ /**
+  *  A Payload is metadata that can be stored together with each occurrence 
+  *  of a term. This metadata is stored inline in the posting list of the
+  *  specific term.  
+  *  <p>
+  *  To store payloads in the index a {@link TokenStream} has to be used that
+  *  produces {@link Token}s containing payload data.
+  *  <p>
+  *  Use {@link TermPositions#getPayloadLength()} and {@link TermPositions#getPayload(byte[], int)}
+  *  to retrieve the payloads from the index.<br>
+  *  <br>
+  *  
+  *  <b>
+  *  Warning: The status of the Payloads feature is experimental. The APIs
+  *  introduced here might change in the future and will not be supported anymore
+  *  in such a case. If you want to use this feature in a production environment
+  *  you should wait for an official release.
+  *  </b>
+  */    
+  // TODO: Remove warning after API has been finalized
+  public class Payload implements Serializable {
+    protected byte[] data;
+    protected int offset;
+    protected int length;
+    
+    protected Payload() {
+      // no-arg constructor since this class implements Serializable
+    }
+    
+    /**
+     * Creates a new payload with the given array as data.
+     * 
+     * @param data the data of this payload
+     */
+    public Payload(byte[] data) {
+      this(data, 0, data.length);
+    }
+
+    /**
+     * Creates a new payload that uses a slice of the given array as data; the array is not copied.
+     * @param data the byte array holding the data of this payload
+     * @param offset the offset of the first payload byte in the data byte array
+     * @param length the number of payload bytes, starting at offset
+     * @throws IllegalArgumentException if offset or length is negative or the slice exceeds the array
+     */
+    public Payload(byte[] data, int offset, int length) {
+      if (offset < 0 || length < 0 || length > data.length - offset) {  // subtraction cannot overflow
+        throw new IllegalArgumentException();
+      }
+      this.data = data;
+      this.offset = offset;
+      this.length = length;
+    }
+    
+    public int length() {
+      return this.length;
+    }
+    
+    /**
+     * Returns the byte at the given index.
+     */
+    public byte byteAt(int index) {
+      if (0 <= index && index < this.length) {
+        return this.data[this.offset + index];    
+      }
+      throw new ArrayIndexOutOfBoundsException(index);
+    }
+    
+    /**
+     * Allocates a new byte array, copies the payload data into it and returns it. 
+     */
+    public byte[] toByteArray() {
+      byte[] retArray = new byte[this.length];
+      System.arraycopy(this.data, this.offset, retArray, 0, this.length);
+      return retArray;
+    }
+    
+    /**
+     * Copies the payload data to a byte array.
+     * @param target the target byte array
+     * @param targetOffset the offset in the target byte array at which the first byte is written
+     * @throws ArrayIndexOutOfBoundsException if targetOffset is negative or the payload does not fit
+     */
+    public void copyTo(byte[] target, int targetOffset) {
+      if (targetOffset < 0 || this.length > target.length - targetOffset) {  // space left in target
+        throw new ArrayIndexOutOfBoundsException();
+      }
+      System.arraycopy(this.data, this.offset, target, targetOffset, this.length);
+    }
+}

Modified: lucene/java/trunk/src/java/org/apache/lucene/index/SegmentMerger.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/index/SegmentMerger.java?view=diff&rev=518486&r1=518485&r2=518486
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/index/SegmentMerger.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/index/SegmentMerger.java Wed Mar 14 22:15:43 2007
@@ -157,11 +157,11 @@
   }
 
   private void addIndexed(IndexReader reader, FieldInfos fieldInfos, Collection names, boolean storeTermVectors, boolean storePositionWithTermVector,
-                         boolean storeOffsetWithTermVector) throws IOException {
+                         boolean storeOffsetWithTermVector, boolean storePayloads) throws IOException {
     Iterator i = names.iterator();
     while (i.hasNext()) {
       String field = (String)i.next();
-      fieldInfos.add(field, true, storeTermVectors, storePositionWithTermVector, storeOffsetWithTermVector, !reader.hasNorms(field));
+      fieldInfos.add(field, true, storeTermVectors, storePositionWithTermVector, storeOffsetWithTermVector, !reader.hasNorms(field), storePayloads);
     }
   }
 
@@ -176,11 +176,12 @@
     int docCount = 0;
     for (int i = 0; i < readers.size(); i++) {
       IndexReader reader = (IndexReader) readers.elementAt(i);
-      addIndexed(reader, fieldInfos, reader.getFieldNames(IndexReader.FieldOption.TERMVECTOR_WITH_POSITION_OFFSET), true, true, true);
-      addIndexed(reader, fieldInfos, reader.getFieldNames(IndexReader.FieldOption.TERMVECTOR_WITH_POSITION), true, true, false);
-      addIndexed(reader, fieldInfos, reader.getFieldNames(IndexReader.FieldOption.TERMVECTOR_WITH_OFFSET), true, false, true);
-      addIndexed(reader, fieldInfos, reader.getFieldNames(IndexReader.FieldOption.TERMVECTOR), true, false, false);
-      addIndexed(reader, fieldInfos, reader.getFieldNames(IndexReader.FieldOption.INDEXED), false, false, false);
+      addIndexed(reader, fieldInfos, reader.getFieldNames(IndexReader.FieldOption.TERMVECTOR_WITH_POSITION_OFFSET), true, true, true, false);
+      addIndexed(reader, fieldInfos, reader.getFieldNames(IndexReader.FieldOption.TERMVECTOR_WITH_POSITION), true, true, false, false);
+      addIndexed(reader, fieldInfos, reader.getFieldNames(IndexReader.FieldOption.TERMVECTOR_WITH_OFFSET), true, false, true, false);
+      addIndexed(reader, fieldInfos, reader.getFieldNames(IndexReader.FieldOption.TERMVECTOR), true, false, false, false);
+      addIndexed(reader, fieldInfos, reader.getFieldNames(IndexReader.FieldOption.STORES_PAYLOADS), false, false, false, true);
+      addIndexed(reader, fieldInfos, reader.getFieldNames(IndexReader.FieldOption.INDEXED), false, false, false, false);
       fieldInfos.add(reader.getFieldNames(IndexReader.FieldOption.UNINDEXED), false);
     }
     fieldInfos.write(directory, segment + ".fnm");
@@ -326,6 +327,8 @@
       termInfosWriter.add(smis[0].term, termInfo);
     }
   }
+  
+  private byte[] payloadBuffer = null;
 
   /** Process postings from multiple segments all positioned on the
    *  same term. Writes out merged entries into freqOutput and
@@ -342,6 +345,8 @@
     int lastDoc = 0;
     int df = 0;					  // number of docs w/ term
     resetSkip();
+    boolean storePayloads = fieldInfos.fieldInfo(smis[0].term.field).storePayloads;
+    int lastPayloadLength = -1;   // ensures that we write the first length
     for (int i = 0; i < n; i++) {
       SegmentMergeInfo smi = smis[i];
       TermPositions postings = smi.getPositions();
@@ -361,7 +366,7 @@
         df++;
 
         if ((df % skipInterval) == 0) {
-          bufferSkip(lastDoc);
+          bufferSkip(lastDoc, storePayloads, lastPayloadLength);
         }
 
         int docCode = (doc - lastDoc) << 1;	  // use low bit to flag freq=1
@@ -374,11 +379,33 @@
           freqOutput.writeVInt(docCode);	  // write doc
           freqOutput.writeVInt(freq);		  // write frequency in doc
         }
-
+        
+        /** See {@link DocumentWriter#writePostings(Posting[], String)} for 
+         *  documentation about the encoding of positions and payloads
+         */
         int lastPosition = 0;			  // write position deltas
         for (int j = 0; j < freq; j++) {
           int position = postings.nextPosition();
-          proxOutput.writeVInt(position - lastPosition);
+          int delta = position - lastPosition;
+          if (storePayloads) {
+            int payloadLength = postings.getPayloadLength();
+            if (payloadLength == lastPayloadLength) {
+              proxOutput.writeVInt(delta * 2);
+            } else {
+              proxOutput.writeVInt(delta * 2 + 1);
+              proxOutput.writeVInt(payloadLength);
+              lastPayloadLength = payloadLength;
+            }
+            if (payloadLength > 0) {
+              if (payloadBuffer == null || payloadBuffer.length < payloadLength) {
+                payloadBuffer = new byte[payloadLength];
+              }
+              postings.getPayload(payloadBuffer, 0);
+              proxOutput.writeBytes(payloadBuffer, 0, payloadLength);
+            }
+          } else {
+            proxOutput.writeVInt(delta);
+          }
           lastPosition = position;
         }
       }
@@ -388,21 +415,59 @@
 
   private RAMOutputStream skipBuffer = new RAMOutputStream();
   private int lastSkipDoc;
+  private int lastSkipPayloadLength;
   private long lastSkipFreqPointer;
   private long lastSkipProxPointer;
 
   private void resetSkip() {
     skipBuffer.reset();
     lastSkipDoc = 0;
+    lastSkipPayloadLength = -1;  // we don't have to write the first length in the skip list
     lastSkipFreqPointer = freqOutput.getFilePointer();
     lastSkipProxPointer = proxOutput.getFilePointer();
   }
 
-  private void bufferSkip(int doc) throws IOException {
+  private void bufferSkip(int doc, boolean storePayloads, int payloadLength) throws IOException {
     long freqPointer = freqOutput.getFilePointer();
     long proxPointer = proxOutput.getFilePointer();
 
-    skipBuffer.writeVInt(doc - lastSkipDoc);
+    // To efficiently store payloads in the posting lists we do not store the length of
+    // every payload. Instead we omit the length for a payload if the previous payload had
+    // the same length.
+    // However, in order to support skipping, the payload length at every skip point must be known.
+    // So we use the same length encoding that we use for the posting lists for the skip data as well:
+    // Case 1: current field does not store payloads
+    //           SkipDatum                 --> DocSkip, FreqSkip, ProxSkip
+    //           DocSkip,FreqSkip,ProxSkip --> VInt
+    //           DocSkip records the document number before every SkipInterval th  document in TermFreqs. 
+    //           Document numbers are represented as differences from the previous value in the sequence.
+    // Case 2: current field stores payloads
+    //           SkipDatum                 --> DocSkip, PayloadLength?, FreqSkip,ProxSkip
+    //           DocSkip,FreqSkip,ProxSkip --> VInt
+    //           PayloadLength             --> VInt    
+    //         In this case DocSkip/2 is the difference between
+    //         the current and the previous value. If DocSkip
+    //         is odd, then a PayloadLength encoded as VInt follows,
+    //         if DocSkip is even, then it is assumed that the
+    //         current payload length equals the length at the previous
+    //         skip point
+    if (storePayloads) {
+      int delta = doc - lastSkipDoc;
+      if (payloadLength == lastSkipPayloadLength) {
+        // the current payload length equals the length at the previous skip point,
+        // so we don't store the length again
+        skipBuffer.writeVInt(delta * 2);
+      } else {
+        // the payload length is different from the previous one. We shift the DocSkip, 
+        // set the lowest bit and store the current payload length as VInt.
+        skipBuffer.writeVInt(delta * 2 + 1);
+        skipBuffer.writeVInt(payloadLength);
+        lastSkipPayloadLength = payloadLength;
+      }
+    } else {
+      // current field does not store payloads
+      skipBuffer.writeVInt(doc - lastSkipDoc);
+    }
     skipBuffer.writeVInt((int) (freqPointer - lastSkipFreqPointer));
     skipBuffer.writeVInt((int) (proxPointer - lastSkipProxPointer));
 

Modified: lucene/java/trunk/src/java/org/apache/lucene/index/SegmentReader.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/index/SegmentReader.java?view=diff&rev=518486&r1=518485&r2=518486
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/index/SegmentReader.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/index/SegmentReader.java Wed Mar 14 22:15:43 2007
@@ -374,6 +374,9 @@
       else if (!fi.isIndexed && fieldOption == IndexReader.FieldOption.UNINDEXED) {
         fieldSet.add(fi.name);
       }
+      else if (fi.storePayloads && fieldOption == IndexReader.FieldOption.STORES_PAYLOADS) {
+        fieldSet.add(fi.name);
+      }
       else if (fi.isIndexed && fieldOption == IndexReader.FieldOption.INDEXED) {
         fieldSet.add(fi.name);
       }
@@ -582,7 +585,12 @@
     
     return termVectorsReader.get(docNumber);
   }
-
+  
+  /** Returns the field infos of this segment */
+  FieldInfos fieldInfos() {
+    return fieldInfos;
+  }
+  
   /**
    * Return the name of the segment this reader is reading.
    */

Modified: lucene/java/trunk/src/java/org/apache/lucene/index/SegmentTermDocs.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/index/SegmentTermDocs.java?view=diff&rev=518486&r1=518485&r2=518486
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/index/SegmentTermDocs.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/index/SegmentTermDocs.java Wed Mar 14 22:15:43 2007
@@ -39,6 +39,9 @@
   private long proxPointer;
   private long skipPointer;
   private boolean haveSkipped;
+  
+  private int payloadLengthAtLastSkip;
+  protected boolean currentFieldStoresPayloads;
 
   protected SegmentTermDocs(SegmentReader parent) {
     this.parent = parent;
@@ -49,23 +52,31 @@
 
   public void seek(Term term) throws IOException {
     TermInfo ti = parent.tis.get(term);
-    seek(ti);
+    seek(ti, term);
   }
 
   public void seek(TermEnum termEnum) throws IOException {
     TermInfo ti;
+    Term term;
     
     // use comparison of fieldinfos to verify that termEnum belongs to the same segment as this SegmentTermDocs
-    if (termEnum instanceof SegmentTermEnum && ((SegmentTermEnum) termEnum).fieldInfos == parent.fieldInfos)          // optimized case
-      ti = ((SegmentTermEnum) termEnum).termInfo();
-    else                                          // punt case
-      ti = parent.tis.get(termEnum.term());
-      
-    seek(ti);
+    if (termEnum instanceof SegmentTermEnum && ((SegmentTermEnum) termEnum).fieldInfos == parent.fieldInfos) {        // optimized case
+      SegmentTermEnum segmentTermEnum = ((SegmentTermEnum) termEnum);
+      term = segmentTermEnum.term();
+      ti = segmentTermEnum.termInfo();
+    } else  {                                         // punt case
+      term = termEnum.term();
+      ti = parent.tis.get(term);        
+    }
+    
+    seek(ti, term);
   }
 
-  void seek(TermInfo ti) throws IOException {
+  void seek(TermInfo ti, Term term) throws IOException {
     count = 0;
+    payloadLengthAtLastSkip = 0;
+    FieldInfo fi = parent.fieldInfos.fieldInfo(term.field);
+    currentFieldStoresPayloads = (fi != null) ? fi.storePayloads : false;
     if (ti == null) {
       df = 0;
     } else {
@@ -141,7 +152,7 @@
   }
 
   /** Overridden by SegmentTermPositions to skip in prox stream. */
-  protected void skipProx(long proxPointer) throws IOException {}
+  protected void skipProx(long proxPointer, int payloadLength) throws IOException {}
 
   /** Optimized implementation. */
   public boolean skipTo(int target) throws IOException {
@@ -157,6 +168,7 @@
 
       // scan skip data
       int lastSkipDoc = skipDoc;
+      int lastPayloadLength = 0;
       long lastFreqPointer = freqStream.getFilePointer();
       long lastProxPointer = -1;
       int numSkipped = -1 - (count % skipInterval);
@@ -165,6 +177,7 @@
         lastSkipDoc = skipDoc;
         lastFreqPointer = freqPointer;
         lastProxPointer = proxPointer;
+        lastPayloadLength = payloadLengthAtLastSkip;
         
         if (skipDoc != 0 && skipDoc >= doc)
           numSkipped += skipInterval;
@@ -172,7 +185,21 @@
         if(skipCount >= numSkips)
           break;
 
-        skipDoc += skipStream.readVInt();
+        if (currentFieldStoresPayloads) {
+          // the current field stores payloads.
+          // if the doc delta is odd then we have
+          // to read the current payload length
+          // because it differs from the length of the
+          // previous payload
+          int delta = skipStream.readVInt();
+          if ((delta & 1) != 0) {
+            payloadLengthAtLastSkip = skipStream.readVInt();
+          }
+          delta >>>= 1;
+          skipDoc += delta;
+        } else {
+          skipDoc += skipStream.readVInt();
+        }
         freqPointer += skipStream.readVInt();
         proxPointer += skipStream.readVInt();
 
@@ -182,7 +209,7 @@
       // if we found something to skip, then skip it
       if (lastFreqPointer > freqStream.getFilePointer()) {
         freqStream.seek(lastFreqPointer);
-        skipProx(lastProxPointer);
+        skipProx(lastProxPointer, lastPayloadLength);
 
         doc = lastSkipDoc;
         count += numSkipped;

Modified: lucene/java/trunk/src/java/org/apache/lucene/index/SegmentTermPositions.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/index/SegmentTermPositions.java?view=diff&rev=518486&r1=518485&r2=518486
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/index/SegmentTermPositions.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/index/SegmentTermPositions.java Wed Mar 14 22:15:43 2007
@@ -27,6 +27,12 @@
   private int proxCount;
   private int position;
   
+  // the current payload length
+  private int payloadLength;
+  // indicates whether the payload of the current position still
+  // has to be read from the proxStream
+  private boolean needToLoadPayload;
+  
   // these variables are being used to remember information
   // for a lazy skip
   private long lazySkipPointer = 0;
@@ -37,13 +43,15 @@
     this.proxStream = null;  // the proxStream will be cloned lazily when nextPosition() is called for the first time
   }
 
-  final void seek(TermInfo ti) throws IOException {
-    super.seek(ti);
+  final void seek(TermInfo ti, Term term) throws IOException {
+    super.seek(ti, term);
     if (ti != null)
       lazySkipPointer = ti.proxPointer;
     
     lazySkipProxCount = 0;
     proxCount = 0;
+    payloadLength = 0;
+    needToLoadPayload = false;
   }
 
   public final void close() throws IOException {
@@ -55,9 +63,28 @@
     // perform lazy skips if neccessary
     lazySkip();
     proxCount--;
-    return position += proxStream.readVInt();
+    return position += readDeltaPosition();
   }
 
+  private final int readDeltaPosition() throws IOException {
+    int delta = proxStream.readVInt();
+    if (currentFieldStoresPayloads) {
+      // if the current field stores payloads then
+      // the position delta is shifted one bit to the left.
+      // if the LSB is set, then we have to read the current
+      // payload length
+      if ((delta & 1) != 0) {
+        payloadLength = proxStream.readVInt();
+      } 
+      delta >>>= 1;
+      needToLoadPayload = true;
+    } else {
+      payloadLength = 0;
+      needToLoadPayload = false;
+    }
+    return delta;
+  }
+  
   protected final void skippingDoc() throws IOException {
     // we remember to skip a document lazily
     lazySkipProxCount += freq;
@@ -82,16 +109,27 @@
 
 
   /** Called by super.skipTo(). */
-  protected void skipProx(long proxPointer) throws IOException {
+  protected void skipProx(long proxPointer, int payloadLength) throws IOException {
     // we save the pointer, we might have to skip there lazily
     lazySkipPointer = proxPointer;
     lazySkipProxCount = 0;
     proxCount = 0;
+    this.payloadLength = payloadLength;
+    needToLoadPayload = false;
   }
 
   private void skipPositions(int n) throws IOException {
-    for (int f = n; f > 0; f--)         // skip unread positions
-      proxStream.readVInt();
+    for (int f = n; f > 0; f--) {        // skip unread positions
+      readDeltaPosition();
+      skipPayload();
+    }      
+  }
+  
+  private void skipPayload() throws IOException {
+    if (needToLoadPayload && payloadLength > 0) {
+      proxStream.seek(proxStream.getFilePointer() + payloadLength);
+    }
+    needToLoadPayload = false;
   }
 
   // It is not always neccessary to move the prox pointer
@@ -109,6 +147,10 @@
       // clone lazily
       proxStream = (IndexInput)parent.proxStream.clone();
     }
+    
+    // we might have to skip the current payload
+    // if it was not read yet
+    skipPayload();
       
     if (lazySkipPointer != 0) {
       proxStream.seek(lazySkipPointer);
@@ -119,6 +161,32 @@
       skipPositions(lazySkipProxCount);
       lazySkipProxCount = 0;
     }
+  }
+  
+  public int getPayloadLength() {
+    return payloadLength;
+  }
+
+  public byte[] getPayload(byte[] data, int offset) throws IOException {
+    if (!needToLoadPayload) {
+      throw new IOException("Payload cannot be loaded more than once for the same term position.");
+    }
+
+    // read payloads lazily
+    byte[] retArray;
+    int retOffset;
+    if (data == null || data.length - offset < payloadLength) {
+      // the array is too small to store the payload data,
+      // so we allocate a new one
+      retArray = new byte[payloadLength];
+      retOffset = 0;
+    } else {
+      retArray = data;
+      retOffset = offset;
+    }
+    proxStream.readBytes(retArray, retOffset, payloadLength);
+    needToLoadPayload = false;
+    return retArray;
   }
 
 }

Modified: lucene/java/trunk/src/java/org/apache/lucene/index/TermPositions.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/index/TermPositions.java?view=diff&rev=518486&r1=518485&r2=518486
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/index/TermPositions.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/index/TermPositions.java Wed Mar 14 22:15:43 2007
@@ -32,10 +32,53 @@
     extends TermDocs
 {
     /** Returns next position in the current document.  It is an error to call
-	this more than {@link #freq()} times
-	without calling {@link #next()}<p> This is
-	invalid until {@link #next()} is called for
-	the first time.
+    this more than {@link #freq()} times
+    without calling {@link #next()}<p> This is
+    invalid until {@link #next()} is called for
+    the first time.
     */
     int nextPosition() throws IOException;
+    
+    /** 
+     * Returns the length of the payload at the current term position.
+     * This is invalid until {@link #nextPosition()} is called for
+     * the first time.<br>
+     * <br>
+     * <b>
+     * Warning: The status of the Payloads feature is experimental. The APIs
+     * introduced here might change in the future and will not be supported anymore
+     * in such a case. If you want to use this feature in a production environment
+     * you should wait for an official release.
+     * </b> 
+     * @return length of the current payload in number of bytes
+     */
+    // TODO: Remove warning after API has been finalized
+    int getPayloadLength();
+    
+    /** 
+     * Returns the payload data at the current term position.
+     * This is invalid until {@link #nextPosition()} is called for
+     * the first time.
+     * This method must not be called more than once after each call
+     * of {@link #nextPosition()}. However, payloads are loaded lazily,
+     * so if the payload data for the current position is not needed,
+     * this method need not be called at all for performance reasons.<br>
+     * <br>
+     * <b>
+     * Warning: The status of the Payloads feature is experimental. The APIs
+     * introduced here might change in the future and will not be supported anymore
+     * in such a case. If you want to use this feature in a production environment
+     * you should wait for an official release.
+     * </b>
+     * 
+     * @param data the array into which the data of this payload is to be
+     *             stored, if it is big enough; otherwise, a new byte[] array
+     *             is allocated for this purpose. 
+     * @param offset the offset in the array into which the data of this payload
+     *               is to be stored.
+     * @return a byte[] array containing the data of this payload
+     * @throws IOException
+     */
+    // TODO: Remove warning after API has been finalized
+    byte[] getPayload(byte[] data, int offset) throws IOException;
 }

Modified: lucene/java/trunk/src/java/org/apache/lucene/store/BufferedIndexOutput.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/store/BufferedIndexOutput.java?view=diff&rev=518486&r1=518485&r2=518486
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/store/BufferedIndexOutput.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/store/BufferedIndexOutput.java Wed Mar 14 22:15:43 2007
@@ -24,8 +24,8 @@
   static final int BUFFER_SIZE = 1024;
 
   private final byte[] buffer = new byte[BUFFER_SIZE];
-  private long bufferStart = 0;			  // position in file of buffer
-  private int bufferPosition = 0;		  // position in buffer
+  private long bufferStart = 0;           // position in file of buffer
+  private int bufferPosition = 0;         // position in buffer
 
   /** Writes a single byte.
    * @see IndexInput#readByte()
@@ -41,12 +41,12 @@
    * @param length the number of bytes to write
    * @see IndexInput#readBytes(byte[],int,int)
    */
-  public void writeBytes(byte[] b, int length) throws IOException {
+  public void writeBytes(byte[] b, int offset, int length) throws IOException {
     int bytesLeft = BUFFER_SIZE - bufferPosition;
     // is there enough space in the buffer?
     if (bytesLeft >= length) {
       // we add the data to the end of the buffer
-      System.arraycopy(b, 0, buffer, bufferPosition, length);
+      System.arraycopy(b, offset, buffer, bufferPosition, length);
       bufferPosition += length;
       // if the buffer is full, flush it
       if (BUFFER_SIZE - bufferPosition == 0)
@@ -58,7 +58,7 @@
         if (bufferPosition > 0)
           flush();
         // and write data at once
-        flushBuffer(b, length);
+        flushBuffer(b, offset, length);
         bufferStart += length;
       } else {
         // we fill/flush the buffer (until the input is written)
@@ -66,7 +66,7 @@
         int pieceLength;
         while (pos < length) {
           pieceLength = (length - pos < bytesLeft) ? length - pos : bytesLeft;
-          System.arraycopy(b, pos, buffer, bufferPosition, pieceLength);
+          System.arraycopy(b, pos + offset, buffer, bufferPosition, pieceLength);
           pos += pieceLength;
           bufferPosition += pieceLength;
           // if the buffer is full, flush it
@@ -92,8 +92,18 @@
    * @param b the bytes to write
    * @param len the number of bytes to write
    */
-  protected abstract void flushBuffer(byte[] b, int len) throws IOException;
+  private void flushBuffer(byte[] b, int len) throws IOException {
+    flushBuffer(b, 0, len);
+  }
 
+  /** Expert: implements buffer write.  Writes bytes at the current position in
+   * the output.
+   * @param b the bytes to write
+   * @param offset the offset in the byte array
+   * @param len the number of bytes to write
+   */
+  protected abstract void flushBuffer(byte[] b, int offset, int len) throws IOException;
+  
   /** Closes this stream to further operations. */
   public void close() throws IOException {
     flush();

Modified: lucene/java/trunk/src/java/org/apache/lucene/store/FSDirectory.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/store/FSDirectory.java?view=diff&rev=518486&r1=518485&r2=518486
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/store/FSDirectory.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/store/FSDirectory.java Wed Mar 14 22:15:43 2007
@@ -588,8 +588,8 @@
   }
 
   /** output methods: */
-  public void flushBuffer(byte[] b, int size) throws IOException {
-    file.write(b, 0, size);
+  public void flushBuffer(byte[] b, int offset, int size) throws IOException {
+    file.write(b, offset, size);
   }
   public void close() throws IOException {
     // only close the file if it has not been closed yet

Modified: lucene/java/trunk/src/java/org/apache/lucene/store/IndexOutput.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/store/IndexOutput.java?view=diff&rev=518486&r1=518485&r2=518486
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/store/IndexOutput.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/store/IndexOutput.java Wed Mar 14 22:15:43 2007
@@ -36,7 +36,17 @@
    * @param length the number of bytes to write
    * @see IndexInput#readBytes(byte[],int,int)
    */
-  public abstract void writeBytes(byte[] b, int length) throws IOException;
+  public void writeBytes(byte[] b, int length) throws IOException {
+    writeBytes(b, 0, length);
+  }
+
+  /** Writes an array of bytes.
+   * @param b the bytes to write
+   * @param offset the offset in the byte array
+   * @param length the number of bytes to write
+   * @see IndexInput#readBytes(byte[],int,int)
+   */
+  public abstract void writeBytes(byte[] b, int offset, int length) throws IOException;
 
   /** Writes an int as four bytes.
    * @see IndexInput#readInt()

Modified: lucene/java/trunk/src/java/org/apache/lucene/store/RAMOutputStream.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/store/RAMOutputStream.java?view=diff&rev=518486&r1=518485&r2=518486
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/store/RAMOutputStream.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/store/RAMOutputStream.java Wed Mar 14 22:15:43 2007
@@ -66,7 +66,7 @@
     file.setLength(0);
   }
 
-  public void flushBuffer(byte[] src, int len) throws IOException {
+  public void flushBuffer(byte[] src, int offset, int len) throws IOException {
     byte[] buffer;
     int bufferPos = 0;
     while (bufferPos != len) {
@@ -81,7 +81,7 @@
       else
         buffer = (byte[]) file.buffers.get(bufferNumber);
 
-      System.arraycopy(src, bufferPos, buffer, bufferOffset, bytesToCopy);
+      System.arraycopy(src, offset + bufferPos, buffer, bufferOffset, bytesToCopy);
       bufferPos += bytesToCopy;
       pointer += bytesToCopy;
     }

Modified: lucene/java/trunk/src/site/src/documentation/content/xdocs/fileformats.xml
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/site/src/documentation/content/xdocs/fileformats.xml?view=diff&rev=518486&r1=518485&r2=518486
==============================================================================
--- lucene/java/trunk/src/site/src/documentation/content/xdocs/fileformats.xml (original)
+++ lucene/java/trunk/src/site/src/documentation/content/xdocs/fileformats.xml Wed Mar 14 22:15:43 2007
@@ -1013,6 +1013,7 @@
                         <li>If the third lowest-order bit is set (0x04), term positions are stored with the term vectors.</li>
                         <li>If the fourth lowest-order bit is set (0x08), term offsets are stored with the term vectors.</li>
                         <li>If the fifth lowest-order bit is set (0x10), norms are omitted for the indexed field.</li>
+                        <li>If the sixth lowest-order bit is set (0x20), payloads are stored for the indexed field.</li>
                     </ul>
                 </p>
 
@@ -1298,9 +1299,9 @@
                     <sup>DocFreq/SkipInterval</sup>
                 </p>
                 <p>SkipDatum --&gt;
-                    DocSkip,FreqSkip,ProxSkip
+                    DocSkip,PayloadLength?,FreqSkip,ProxSkip
                 </p>
-                <p>DocDelta,Freq,DocSkip,FreqSkip,ProxSkip --&gt;
+                <p>DocDelta,Freq,DocSkip,PayloadLength,FreqSkip,ProxSkip --&gt;
                     VInt
                 </p>
                 <p>TermFreqs
@@ -1328,9 +1329,17 @@
                     SkipInterval
                     <sup>th</sup>
                     document in TermFreqs.
-                    Document numbers are represented as differences
-                    from the previous value in the sequence. FreqSkip
-                    and ProxSkip record the position of every
+                    If payloads are disabled for the term's field,
+                    then DocSkip represents the difference from the
+                    previous value in the sequence.
+                    If payloads are enabled for the term's field, 
+                    then DocSkip/2 represents the difference from the
+                    previous value in the sequence. If payloads are enabled
+                    and DocSkip is odd,
+                    then PayloadLength is stored indicating the length 
+                    of the last payload before the SkipInterval<sup>th</sup>
+                    document in TermPositions.
+					FreqSkip and ProxSkip record the position of every
                     SkipInterval
                     <sup>th</sup>
                     entry in FreqFile and
@@ -1379,12 +1388,21 @@
                     <sup>DocFreq</sup>
                 </p>
                 <p>Positions --&gt;
-                    &lt;PositionDelta&gt;
+                    &lt;PositionDelta,Payload?&gt;
                     <sup>Freq</sup>
                 </p>
+                <p>Payload --&gt;
+                    &lt;PayloadLength?,PayloadData&gt;
+                </p>
                 <p>PositionDelta --&gt;
                     VInt
                 </p>
+                <p>PayloadLength --&gt;
+                    VInt
+                </p>
+                <p>PayloadData --&gt;
+                    byte<sup>PayloadLength</sup>
+                </p>
                 <p>TermPositions
                     are ordered by term (the term is implicit, from the .tis file).
                 </p>
@@ -1393,18 +1411,29 @@
                     number is implicit from the .frq file).
                 </p>
                 <p>PositionDelta
-                    is the difference between the position of the current occurrence in
+                    is, if payloads are disabled for the term's field, the difference 
+                    between the position of the current occurrence in
                     the document and the previous occurrence (or zero, if this is the
                     first occurrence in this document).
+                    If payloads are enabled for the term's field, then PositionDelta/2
+                    is the difference between the current and the previous position. If
+                    payloads are enabled and PositionDelta is odd, then PayloadLength is 
+                    stored, indicating the length of the payload at the current term position.
                 </p>
                 <p>
                     For example, the TermPositions for a
                     term which occurs as the fourth term in one document, and as the
                     fifth and ninth term in a subsequent document, would be the following
-                    sequence of VInts:
+                    sequence of VInts (payloads disabled):
                 </p>
                 <p>4,
                     5, 4
+                </p>
+                <p>PayloadData
+                    is metadata associated with the current term position. If PayloadLength
+                    is stored at the current position, then it indicates the length of this 
+                    Payload. If PayloadLength is not stored, then this Payload has the same
+                    length as the Payload at the previous position.
                 </p>
             </section>
             <section id="Normalization Factors"><title>Normalization Factors</title>

Added: lucene/java/trunk/src/test/org/apache/lucene/index/TestPayloads.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/test/org/apache/lucene/index/TestPayloads.java?view=auto&rev=518486
==============================================================================
--- lucene/java/trunk/src/test/org/apache/lucene/index/TestPayloads.java (added)
+++ lucene/java/trunk/src/test/org/apache/lucene/index/TestPayloads.java Wed Mar 14 22:15:43 2007
@@ -0,0 +1,443 @@
+package org.apache.lucene.index;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.File;
+import java.io.IOException;
+import java.io.Reader;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.Random;
+
+import junit.framework.TestCase;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.WhitespaceAnalyzer;
+import org.apache.lucene.analysis.WhitespaceTokenizer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.FSDirectory;
+import org.apache.lucene.store.RAMDirectory;
+
+
+public class TestPayloads extends TestCase {
+    
+    // Simple tests to test the Payload class
+    public void testPayload() throws Exception {
+        byte[] testData = "This is a test!".getBytes();
+        Payload payload = new Payload(testData);
+        assertEquals("Wrong payload length.", testData.length, payload.length());
+        
+        // test copyTo()
+        byte[] target = new byte[testData.length - 1];
+        try {
+            payload.copyTo(target, 0);
+            fail("Expected exception not thrown");
+        } catch (Exception expected) {
+            // expected exception
+        }
+        
+        target = new byte[testData.length + 3];
+        payload.copyTo(target, 3);
+        
+        for (int i = 0; i < testData.length; i++) {
+            assertEquals(testData[i], target[i + 3]);
+        }
+        
+
+        // test toByteArray()
+        target = payload.toByteArray();
+        assertByteArrayEquals(testData, target);
+
+        // test byteAt()
+        for (int i = 0; i < testData.length; i++) {
+            assertEquals(payload.byteAt(i), testData[i]);
+        }
+        
+        try {
+            payload.byteAt(testData.length + 1);
+            fail("Expected exception not thrown");
+        } catch (Exception expected) {
+            // expected exception
+        }
+    }
+
+    // Tests whether the DocumentWriter and SegmentMerger correctly enable the
+    // payload bit in the FieldInfo
+    public void testPayloadFieldBit() throws Exception {
+        Directory ram = new RAMDirectory();
+        PayloadAnalyzer analyzer = new PayloadAnalyzer();
+        IndexWriter writer = new IndexWriter(ram, analyzer, true);
+        Document d = new Document();
+        // this field won't have any payloads
+        d.add(new Field("f1", "This field has no payloads", Field.Store.NO, Field.Index.TOKENIZED));
+        // this field will have payloads in all docs, however not for all term positions,
+        // so this field is used to check if the DocumentWriter correctly enables the payloads bit
+        // even if only some term positions have payloads
+        d.add(new Field("f2", "This field has payloads in all docs", Field.Store.NO, Field.Index.TOKENIZED));
+        d.add(new Field("f2", "This field has payloads in all docs", Field.Store.NO, Field.Index.TOKENIZED));
+        // this field is used to verify if the SegmentMerger enables payloads for a field if it has payloads 
+        // enabled in only some documents
+        d.add(new Field("f3", "This field has payloads in some docs", Field.Store.NO, Field.Index.TOKENIZED));
+        // only add payload data for field f2
+        analyzer.setPayloadData("f2", 1, "somedata".getBytes(), 0, 1);
+        writer.addDocument(d);
+        // flush
+        writer.close();        
+        
+        // only one segment in the index, so we can cast to SegmentReader
+        SegmentReader reader = (SegmentReader) IndexReader.open(ram);
+        FieldInfos fi = reader.fieldInfos();
+        assertFalse("Payload field bit should not be set.", fi.fieldInfo("f1").storePayloads);
+        assertTrue("Payload field bit should be set.", fi.fieldInfo("f2").storePayloads);
+        assertFalse("Payload field bit should not be set.", fi.fieldInfo("f3").storePayloads);
+        reader.close();
+        
+        // now we add another document which has payloads for field f3 and verify if the SegmentMerger
+        // enabled payloads for that field
+        writer = new IndexWriter(ram, analyzer, true);
+        d = new Document();
+        d.add(new Field("f1", "This field has no payloads", Field.Store.NO, Field.Index.TOKENIZED));
+        d.add(new Field("f2", "This field has payloads in all docs", Field.Store.NO, Field.Index.TOKENIZED));
+        d.add(new Field("f2", "This field has payloads in all docs", Field.Store.NO, Field.Index.TOKENIZED));
+        d.add(new Field("f3", "This field has payloads in some docs", Field.Store.NO, Field.Index.TOKENIZED));
+        // add payload data for field f2 and f3
+        analyzer.setPayloadData("f2", "somedata".getBytes(), 0, 1);
+        analyzer.setPayloadData("f3", "somedata".getBytes(), 0, 3);
+        writer.addDocument(d);
+        // force merge
+        writer.optimize();
+        // flush
+        writer.close();
+
+        // only one segment in the index, so we can cast to SegmentReader
+        reader = (SegmentReader) IndexReader.open(ram);
+        fi = reader.fieldInfos();
+        assertFalse("Payload field bit should not be set.", fi.fieldInfo("f1").storePayloads);
+        assertTrue("Payload field bit should be set.", fi.fieldInfo("f2").storePayloads);
+        assertTrue("Payload field bit should be set.", fi.fieldInfo("f3").storePayloads);
+        reader.close();        
+    }
+
+    // Tests if payloads are correctly stored and loaded using both RAMDirectory and FSDirectory
+    public void testPayloadsEncoding() throws Exception {
+        // first perform the test using a RAMDirectory
+        Directory dir = new RAMDirectory();
+        performTest(dir);
+        
+        // now use a FSDirectory and repeat same test
+        String dirName = "test_payloads"; 
+        dir = FSDirectory.getDirectory(dirName);
+        performTest(dir);
+        rmDir(dirName);
+    }
+    
+    // builds an index with payloads in the given Directory and performs
+    // different tests to verify the payload encoding
+    private void performTest(Directory dir) throws Exception {
+        PayloadAnalyzer analyzer = new PayloadAnalyzer();
+        IndexWriter writer = new IndexWriter(dir, analyzer, true);
+        
+        // should be in sync with value in TermInfosWriter
+        final int skipInterval = 16;
+        
+        final int numTerms = 5;
+        final String fieldName = "f1";
+        
+        int numDocs = skipInterval + 1; 
+        // create content for the test documents with just a few terms
+        Term[] terms = generateTerms(fieldName, numTerms);
+        StringBuffer sb = new StringBuffer();
+        for (int i = 0; i < terms.length; i++) {
+            sb.append(terms[i].text);
+            sb.append(" ");
+        }
+        String content = sb.toString();
+        
+        
+        int payloadDataLength = numTerms * numDocs * 2 + numTerms * numDocs * (numDocs - 1) / 2;
+        byte[] payloadData = generateRandomData(payloadDataLength);
+        
+        Document d = new Document();
+        d.add(new Field(fieldName, content, Field.Store.NO, Field.Index.TOKENIZED));
+        // add the same document multiple times to have the same payload lengths for all
+        // occurrences within two consecutive skip intervals
+        int offset = 0;
+        for (int i = 0; i < 2 * numDocs; i++) {
+            analyzer.setPayloadData(fieldName, payloadData, offset, 1);
+            offset += numTerms;
+            writer.addDocument(d);
+        }
+        
+        // now we make sure to have different payload lengths at the next skip point
+        for (int i = 0; i < numDocs; i++) {
+            analyzer.setPayloadData(fieldName, payloadData, offset, i);
+            offset += i * numTerms;
+            writer.addDocument(d);
+        }
+        
+        writer.optimize();
+        // flush
+        writer.close();
+        
+        
+        /*
+         * Verify the index
+         * first we test if all payloads are stored correctly
+         */        
+        IndexReader reader = IndexReader.open(dir);
+        
+        byte[] verifyPayloadData = new byte[payloadDataLength];
+        offset = 0;
+        TermPositions[] tps = new TermPositions[numTerms];
+        for (int i = 0; i < numTerms; i++) {
+            tps[i] = reader.termPositions(terms[i]);
+        }
+        
+        while (tps[0].next()) {
+            for (int i = 1; i < numTerms; i++) {
+                tps[i].next();
+            }
+            int freq = tps[0].freq();
+
+            for (int i = 0; i < freq; i++) {
+                for (int j = 0; j < numTerms; j++) {
+                    tps[j].nextPosition();
+                    tps[j].getPayload(verifyPayloadData, offset);
+                    offset += tps[j].getPayloadLength();
+                }
+            }
+        }
+        
+        for (int i = 0; i < numTerms; i++) {
+            tps[i].close();
+        }
+        
+        assertByteArrayEquals(payloadData, verifyPayloadData);
+        
+        /*
+         *  test lazy skipping
+         */        
+        TermPositions tp = reader.termPositions(terms[0]);
+        tp.next();
+        tp.nextPosition();
+        // now we don't read this payload
+        tp.nextPosition();
+        assertEquals("Wrong payload length.", 1, tp.getPayloadLength());
+        byte[] payload = tp.getPayload(null, 0);
+        assertEquals(payload[0], payloadData[numTerms]);
+        tp.nextPosition();
+        
+        // we don't read this payload and skip to a different document
+        tp.skipTo(5);
+        tp.nextPosition();
+        assertEquals("Wrong payload length.", 1, tp.getPayloadLength());
+        payload = tp.getPayload(null, 0);
+        assertEquals(payload[0], payloadData[5 * numTerms]);
+                
+        
+        /*
+         * Test different lengths at skip points
+         */
+        tp.seek(terms[1]);
+        tp.next();
+        tp.nextPosition();
+        assertEquals("Wrong payload length.", 1, tp.getPayloadLength());
+        tp.skipTo(skipInterval - 1);
+        tp.nextPosition();
+        assertEquals("Wrong payload length.", 1, tp.getPayloadLength());
+        tp.skipTo(2 * skipInterval - 1);
+        tp.nextPosition();
+        assertEquals("Wrong payload length.", 1, tp.getPayloadLength());
+        tp.skipTo(3 * skipInterval - 1);
+        tp.nextPosition();
+        assertEquals("Wrong payload length.", 3 * skipInterval - 2 * numDocs - 1, tp.getPayloadLength());
+        
+        /*
+         * Test multiple calls of getPayload()
+         */
+        tp.getPayload(null, 0);
+        try {
+            // it is forbidden to call getPayload() more than once
+            // without calling nextPosition()
+            tp.getPayload(null, 0);
+            fail("Expected exception not thrown");
+        } catch (Exception expected) {
+            // expected exception
+        }
+        
+        reader.close();
+        
+        // test long payload
+        analyzer = new PayloadAnalyzer();
+        writer = new IndexWriter(dir, analyzer, true);
+        String singleTerm = "lucene";
+        
+        d = new Document();
+        d.add(new Field(fieldName, singleTerm, Field.Store.NO, Field.Index.TOKENIZED));
+        // add a payload whose length is greater than the buffer size of BufferedIndexOutput
+        payloadData = generateRandomData(2000);
+        analyzer.setPayloadData(fieldName, payloadData, 100, 1500);
+        writer.addDocument(d);
+
+        
+        writer.optimize();
+        // flush
+        writer.close();
+        
+        reader = IndexReader.open(dir);
+        tp = reader.termPositions(new Term(fieldName, singleTerm));
+        tp.next();
+        tp.nextPosition();
+
+        verifyPayloadData = new byte[tp.getPayloadLength()];
+        tp.getPayload(verifyPayloadData, 0);
+        byte[] portion = new byte[1500];
+        System.arraycopy(payloadData, 100, portion, 0, 1500);
+        
+        assertByteArrayEquals(portion, verifyPayloadData);
+        reader.close();
+        
+    }
+    
+    private byte[] generateRandomData(int n) {
+        Random rnd = new Random();
+        byte[] data = new byte[n];
+        rnd.nextBytes(data);
+        return data;
+    }
+    
+    private Term[] generateTerms(String fieldName, int n) {
+        int maxDigits = (int) (Math.log(n) / Math.log(10));
+        Term[] terms = new Term[n];
+        StringBuffer sb = new StringBuffer();
+        for (int i = 0; i < n; i++) {
+            sb.setLength(0);
+            sb.append("t");
+            int zeros = maxDigits - (int) (Math.log(i) / Math.log(10));
+            for (int j = 0; j < zeros; j++) {
+                sb.append("0");
+            }
+            sb.append(i);
+            terms[i] = new Term(fieldName, sb.toString());
+        }
+        return terms;
+    }
+
+
+    private void rmDir(String dir) {
+        File fileDir = new File(dir);
+        if (fileDir.exists()) {
+          File[] files = fileDir.listFiles();
+          if (files != null) {
+            for (int i = 0; i < files.length; i++) {
+              files[i].delete();
+            }
+          }
+          fileDir.delete();
+        }
+      }
+
+ 
+
+    void assertByteArrayEquals(byte[] b1, byte[] b2) {
+        if (b1.length != b2.length) {
+          fail("Byte arrays have different lengths: " + b1.length + ", " + b2.length);
+        }
+        
+        for (int i = 0; i < b1.length; i++) {
+          if (b1[i] != b2[i]) {
+            fail("Byte arrays different at index " + i + ": " + b1[i] + ", " + b2[i]);
+          }
+        }
+      }    
+    
+    
+    /**
+     * This Analyzer uses a WhitespaceTokenizer and a PayloadFilter.
+     */
+    private static class PayloadAnalyzer extends Analyzer {
+        Map fieldToData = new HashMap();
+        
+        void setPayloadData(String field, byte[] data, int offset, int length) {
+            fieldToData.put(field, new PayloadData(0, data, offset, length));
+        }
+
+        void setPayloadData(String field, int numFieldInstancesToSkip, byte[] data, int offset, int length) {
+            fieldToData.put(field, new PayloadData(numFieldInstancesToSkip, data, offset, length));
+        }
+        
+        public TokenStream tokenStream(String fieldName, Reader reader) {
+            PayloadData payload = (PayloadData) fieldToData.get(fieldName);
+            TokenStream ts = new WhitespaceTokenizer(reader);
+            if (payload != null) {
+                if (payload.numFieldInstancesToSkip == 0) {
+                    ts = new PayloadFilter(ts, payload.data, payload.offset, payload.length);
+                } else {
+                    payload.numFieldInstancesToSkip--;
+                }
+            }
+            return ts;
+        }
+        
+        private static class PayloadData {
+            byte[] data;
+            int offset;
+            int length;
+            int numFieldInstancesToSkip;
+            
+            PayloadData(int skip, byte[] data, int offset, int length) {
+                numFieldInstancesToSkip = skip;
+                this.data = data;
+                this.offset = offset;
+                this.length = length;
+            }
+        }
+    }
+
+    
+    /**
+     * This Filter adds payloads to the tokens.
+     */
+    private static class PayloadFilter extends TokenFilter {
+        private byte[] data;
+        private int length;
+        private int offset;
+        
+        public PayloadFilter(TokenStream in, byte[] data, int offset, int length) {
+            super(in);
+            this.data = data;
+            this.length = length;
+            this.offset = offset;
+        }
+        
+        public Token next() throws IOException {
+            Token nextToken = input.next();
+            if (nextToken != null && offset + length <= data.length) {
+              nextToken.setPayload(new Payload(data, offset, length));
+              offset += length;
+            }            
+            
+            return nextToken;
+        }
+      }
+}

Modified: lucene/java/trunk/src/test/org/apache/lucene/store/MockRAMOutputStream.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/test/org/apache/lucene/store/MockRAMOutputStream.java?view=diff&rev=518486&r1=518485&r2=518486
==============================================================================
--- lucene/java/trunk/src/test/org/apache/lucene/store/MockRAMOutputStream.java (original)
+++ lucene/java/trunk/src/test/org/apache/lucene/store/MockRAMOutputStream.java Wed Mar 14 22:15:43 2007
@@ -48,7 +48,7 @@
     }
   }
 
-  public void flushBuffer(byte[] src, int len) throws IOException {
+  public void flushBuffer(byte[] src, int offset, int len) throws IOException {
     long freeSpace = dir.maxSize - dir.sizeInBytes();
     long realUsage = 0;
 
@@ -63,14 +63,14 @@
     if (dir.maxSize != 0 && freeSpace <= len) {
       if (freeSpace > 0 && freeSpace < len) {
         realUsage += freeSpace;
-        super.flushBuffer(src, (int) freeSpace);
+        super.flushBuffer(src, offset, (int) freeSpace);
       }
       if (realUsage > dir.maxUsedSize) {
         dir.maxUsedSize = realUsage;
       }
       throw new IOException("fake disk full at " + dir.getRecomputedActualSizeInBytes() + " bytes");
     } else {
-      super.flushBuffer(src, len);
+      super.flushBuffer(src, offset, len);
     }
 
     if (first) {



Mime
View raw message