lucene-java-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From busc...@apache.org
Subject svn commit: r797715 - in /lucene/java/trunk: ./ src/java/org/apache/lucene/analysis/ src/java/org/apache/lucene/analysis/standard/ src/java/org/apache/lucene/index/ src/test/org/apache/lucene/index/
Date Sat, 25 Jul 2009 04:11:34 GMT
Author: buschmi
Date: Sat Jul 25 04:11:33 2009
New Revision: 797715

URL: http://svn.apache.org/viewvc?rev=797715&view=rev
Log:
LUCENE-1448: Add TokenStream.end() to perform end-of-stream operations. Fixes offset problems
when multiple fields with the same name are added to a document.

Modified:
    lucene/java/trunk/CHANGES.txt
    lucene/java/trunk/src/java/org/apache/lucene/analysis/Analyzer.java
    lucene/java/trunk/src/java/org/apache/lucene/analysis/CachingTokenFilter.java
    lucene/java/trunk/src/java/org/apache/lucene/analysis/CharTokenizer.java
    lucene/java/trunk/src/java/org/apache/lucene/analysis/KeywordTokenizer.java
    lucene/java/trunk/src/java/org/apache/lucene/analysis/TeeSinkTokenFilter.java
    lucene/java/trunk/src/java/org/apache/lucene/analysis/TokenFilter.java
    lucene/java/trunk/src/java/org/apache/lucene/analysis/TokenStream.java
    lucene/java/trunk/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java
    lucene/java/trunk/src/java/org/apache/lucene/index/DocInverterPerField.java
    lucene/java/trunk/src/test/org/apache/lucene/index/TestIndexWriter.java

Modified: lucene/java/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/java/trunk/CHANGES.txt?rev=797715&r1=797714&r2=797715&view=diff
==============================================================================
--- lucene/java/trunk/CHANGES.txt (original)
+++ lucene/java/trunk/CHANGES.txt Sat Jul 25 04:11:33 2009
@@ -609,6 +609,11 @@
     CONSTANT_SCORE_AUTO_REWRITE tries to pick the most performant
     constant-score rewrite method.  (Mike McCandless)
     
+34. LUCENE-1448: Added TokenStream.end(), to perform end-of-stream
+    operations.  This is currently used to fix offset problems when 
+    multiple fields with the same name are added to a document.
+    (Mike McCandless, Mark Miller, Michael Busch)
+    
 Optimizations
 
  1. LUCENE-1427: Fixed QueryWrapperFilter to not waste time computing

Modified: lucene/java/trunk/src/java/org/apache/lucene/analysis/Analyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/analysis/Analyzer.java?rev=797715&r1=797714&r2=797715&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/analysis/Analyzer.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/analysis/Analyzer.java Sat Jul 25 04:11:33
2009
@@ -24,6 +24,8 @@
 import org.apache.lucene.util.CloseableThreadLocal;
 import org.apache.lucene.store.AlreadyClosedException;
 
+import org.apache.lucene.document.Fieldable;
+
 /** An Analyzer builds TokenStreams, which analyze text.  It thus represents a
  *  policy for extracting index terms from text.
  *  <p>
@@ -123,6 +125,24 @@
     return 0;
   }
 
+  /**
+   * Just like {@link #getPositionIncrementGap}, except for
+   * Token offsets instead.  By default this returns 1 for
+   * tokenized fields and, as if the fields were joined
+   * with an extra space character, and 0 for un-tokenized
+   * fields.  This method is only called if the field
+   * produced at least one token for indexing.
+   *
+   * @param Fieldable the field just indexed
+   * @return offset gap, added to the next token emitted from {@link #tokenStream(String,Reader)}
+   */
+  public int getOffsetGap(Fieldable field) {
+    if (field.isTokenized())
+      return 1;
+    else
+      return 0;
+  }
+
   /** Frees persistent resources used by this Analyzer */
   public void close() {
     tokenStreams.close();

Modified: lucene/java/trunk/src/java/org/apache/lucene/analysis/CachingTokenFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/analysis/CachingTokenFilter.java?rev=797715&r1=797714&r2=797715&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/analysis/CachingTokenFilter.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/analysis/CachingTokenFilter.java Sat Jul
25 04:11:33 2009
@@ -36,6 +36,7 @@
 public class CachingTokenFilter extends TokenFilter {
   private List cache = null;
   private Iterator iterator = null; 
+  private AttributeSource.State finalState;
   
   public CachingTokenFilter(TokenStream input) {
     super(input);
@@ -69,6 +70,12 @@
     restoreState((AttributeSource.State) iterator.next());
     return true;
   }
+  
+  public final void end() throws IOException {
+    if (finalState != null) {
+      restoreState(finalState);
+    }
+  }
 
   public void reset() throws IOException {
     if(cache != null) {
@@ -80,6 +87,9 @@
     while(input.incrementToken()) {
       cache.add(captureState());
     }
+    // capture final state
+    input.end();
+    finalState = captureState();
   }
 
 }

Modified: lucene/java/trunk/src/java/org/apache/lucene/analysis/CharTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/analysis/CharTokenizer.java?rev=797715&r1=797714&r2=797715&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/analysis/CharTokenizer.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/analysis/CharTokenizer.java Sat Jul 25 04:11:33
2009
@@ -63,6 +63,7 @@
         offset += dataLen;
         dataLen = input.read(ioBuffer);
         if (dataLen == -1) {
+          dataLen = 0;                            // so next offset += dataLen won't decrement
offset
           if (length > 0)
             break;
           else
@@ -93,6 +94,12 @@
     offsetAtt.setOffset(input.correctOffset(start), input.correctOffset(start+length));
     return true;
   }
+  
+  public final void end() {
+    // set final offset
+    int finalOffset = input.correctOffset(offset);
+    offsetAtt.setOffset(finalOffset, finalOffset);
+  }
 
   /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should
    * not be overridden. Delegates to the backwards compatibility layer. */

Modified: lucene/java/trunk/src/java/org/apache/lucene/analysis/KeywordTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/analysis/KeywordTokenizer.java?rev=797715&r1=797714&r2=797715&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/analysis/KeywordTokenizer.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/analysis/KeywordTokenizer.java Sat Jul 25
04:11:33 2009
@@ -31,6 +31,7 @@
   private static final int DEFAULT_BUFFER_SIZE = 256;
 
   private boolean done;
+  private int finalOffset;
   private TermAttribute termAtt;
   private OffsetAttribute offsetAtt;
   
@@ -59,11 +60,17 @@
           buffer = termAtt.resizeTermBuffer(1+buffer.length);
       }
       termAtt.setTermLength(upto);
-      offsetAtt.setOffset(input.correctOffset(0), input.correctOffset(upto));
+      finalOffset = input.correctOffset(upto);
+      offsetAtt.setOffset(input.correctOffset(0), finalOffset);
       return true;
     }
     return false;
   }
+  
+  public final void end() {
+    // set final offset 
+    offsetAtt.setOffset(finalOffset, finalOffset);
+  }
 
   /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should
    * not be overridden. Delegates to the backwards compatibility layer. */

Modified: lucene/java/trunk/src/java/org/apache/lucene/analysis/TeeSinkTokenFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/analysis/TeeSinkTokenFilter.java?rev=797715&r1=797714&r2=797715&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/analysis/TeeSinkTokenFilter.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/analysis/TeeSinkTokenFilter.java Sat Jul
25 04:11:33 2009
@@ -149,6 +149,17 @@
     return false;
   }
   
+  public final void end() throws IOException {
+    super.end();
+    AttributeSource.State finalState = captureState();
+    for (Iterator it = sinks.iterator(); it.hasNext(); ) {
+      final SinkTokenStream sink = (SinkTokenStream) ((WeakReference) it.next()).get();
+      if (sink != null) {
+        sink.setFinalState(finalState);
+      }
+    }
+  }
+  
   /**
    * A filter that decides which {@link AttributeSource} states to store in the sink.
    */
@@ -162,6 +173,7 @@
   
   public static final class SinkTokenStream extends TokenStream {
     private final List cachedStates = new LinkedList();
+    private AttributeSource.State finalState;
     private Iterator it = null;
     private SinkFilter filter;
     
@@ -181,6 +193,10 @@
       cachedStates.add(state);
     }
     
+    private void setFinalState(AttributeSource.State finalState) {
+      this.finalState = finalState;
+    }
+    
     public final boolean incrementToken() throws IOException {
       // lazy init the iterator
       if (it == null) {
@@ -195,12 +211,18 @@
       restoreState(state);
       return true;
     }
+  
+    public final void end() throws IOException {
+      if (finalState != null) {
+        restoreState(finalState);
+      }
+    }
     
     public final void reset() {
       it = cachedStates.iterator();
     }
   }
-  
+    
   private static final SinkFilter ACCEPT_ALL_FILTER = new SinkFilter() {
     public boolean accept(AttributeSource source) {
       return true;

Modified: lucene/java/trunk/src/java/org/apache/lucene/analysis/TokenFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/analysis/TokenFilter.java?rev=797715&r1=797714&r2=797715&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/analysis/TokenFilter.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/analysis/TokenFilter.java Sat Jul 25 04:11:33
2009
@@ -39,6 +39,13 @@
     this.input = input;
   }
   
+  /** Performs end-of-stream operations, if any, and calls then <code>end()</code>
on the
+   * input TokenStream.<p/> 
+   * <b>NOTE:</b> Be sure to call <code>super.end()</code> first
when overriding this method.*/
+  public void end() throws IOException {
+    input.end();
+  }
+  
   /** Close the input TokenStream. */
   public void close() throws IOException {
     input.close();

Modified: lucene/java/trunk/src/java/org/apache/lucene/analysis/TokenStream.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/analysis/TokenStream.java?rev=797715&r1=797714&r2=797715&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/analysis/TokenStream.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/analysis/TokenStream.java Sat Jul 25 04:11:33
2009
@@ -268,6 +268,22 @@
     tokenWrapper.delegate = token;
     return true;
   }
+  
+  /**
+   * This method is called by the consumer after the last token has been consumed, 
+   * i.e. after {@link #incrementToken()} returned <code>false</code> (using
the new TokenStream API)
+   * or after {@link #next(Token)} or {@link #next()} returned <code>null</code>
(old TokenStream API).
+   * <p/>
+   * This method can be used to perform any end-of-stream operations, such as setting the
final
+   * offset of a stream. The final offset of a stream might differ from the offset of the
last token
+   * e.g. in case one or more whitespaces followed after the last token, but a {@link WhitespaceTokenizer}
+   * was used.
+   * 
+   * @throws IOException
+   */
+  public void end() throws IOException {
+    // do nothing by default
+  }
 
   /** Returns the next token in the stream, or null at EOS.
    *  When possible, the input Token should be used as the

Modified: lucene/java/trunk/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java?rev=797715&r1=797714&r2=797715&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java
(original)
+++ lucene/java/trunk/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java
Sat Jul 25 04:11:33 2009
@@ -183,6 +183,12 @@
         posIncr++;
     }
   }
+  
+  public final void end() {
+    // set final offset
+    int finalOffset = input.correctOffset(scanner.yychar() + scanner.yylength());
+    offsetAtt.setOffset(finalOffset, finalOffset);
+  }
 
   /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should
    * not be overridden. Delegates to the backwards compatibility layer. */

Modified: lucene/java/trunk/src/java/org/apache/lucene/index/DocInverterPerField.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/index/DocInverterPerField.java?rev=797715&r1=797714&r2=797715&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/index/DocInverterPerField.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/index/DocInverterPerField.java Sat Jul 25
04:11:33 2009
@@ -74,6 +74,8 @@
       // tokenized.
       if (field.isIndexed() && doInvert) {
 
+        final boolean anyToken;
+        
         if (fieldState.length > 0)
           fieldState.position += docState.analyzer.getPositionIncrementGap(fieldInfo.name);
 
@@ -95,6 +97,7 @@
           fieldState.offset += valueLength;
           fieldState.length++;
           fieldState.position++;
+          anyToken = valueLength > 0;
         } else {                                  // tokenized field
           final TokenStream stream;
           final TokenStream streamValue = field.tokenStreamValue();
@@ -124,6 +127,8 @@
           // reset the TokenStream to the first token
           stream.reset();
 
+          final int startLength = fieldState.length;
+          
           // deprecated
           final boolean allowMinus1Position = docState.allowMinus1Position;
 
@@ -183,12 +188,18 @@
 
               hasMoreTokens = stream.incrementToken();
             }
-            fieldState.offset = offsetEnd+1;
+            // trigger streams to perform end-of-stream operations
+            stream.end();
+            
+            fieldState.offset += offsetAttribute.endOffset();
+            anyToken = fieldState.length > startLength;
           } finally {
             stream.close();
           }
         }
 
+        if (anyToken)
+          fieldState.offset += docState.analyzer.getOffsetGap(field);
         fieldState.boost *= field.getBoost();
       }
     }

Modified: lucene/java/trunk/src/test/org/apache/lucene/index/TestIndexWriter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/test/org/apache/lucene/index/TestIndexWriter.java?rev=797715&r1=797714&r2=797715&view=diff
==============================================================================
--- lucene/java/trunk/src/test/org/apache/lucene/index/TestIndexWriter.java (original)
+++ lucene/java/trunk/src/test/org/apache/lucene/index/TestIndexWriter.java Sat Jul 25 04:11:33
2009
@@ -17,17 +17,26 @@
  * limitations under the License.
  */
 
-import java.io.*;
+import java.io.ByteArrayOutputStream;
+import java.io.File;
+import java.io.IOException;
+import java.io.PrintStream;
+import java.io.Reader;
+import java.io.StringReader;
 import java.util.ArrayList;
 import java.util.Arrays;
-import java.util.List;
-import java.util.Random;
-import java.util.Map;
 import java.util.HashMap;
 import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+import java.util.Random;
 
 import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.CachingTokenFilter;
+import org.apache.lucene.analysis.SimpleAnalyzer;
 import org.apache.lucene.analysis.SinkTokenizer;
+import org.apache.lucene.analysis.StopAnalyzer;
+import org.apache.lucene.analysis.TeeSinkTokenFilter;
 import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
@@ -35,8 +44,6 @@
 import org.apache.lucene.analysis.WhitespaceTokenizer;
 import org.apache.lucene.analysis.standard.StandardAnalyzer;
 import org.apache.lucene.analysis.standard.StandardTokenizer;
-import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
-import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
 import org.apache.lucene.search.IndexSearcher;
@@ -55,7 +62,6 @@
 import org.apache.lucene.store.MockRAMDirectory;
 import org.apache.lucene.store.RAMDirectory;
 import org.apache.lucene.store.SingleInstanceLockFactory;
-import org.apache.lucene.util.AttributeSource;
 import org.apache.lucene.util.LuceneTestCase;
 import org.apache.lucene.util.UnicodeUtil;
 import org.apache.lucene.util._TestUtil;
@@ -4150,20 +4156,238 @@
     Field f = new Field("field", "abcd", Field.Store.NO, Field.Index.NOT_ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS);
     doc.add(f);
     doc.add(f);
+    Field f2 = new Field("field", "", Field.Store.NO, Field.Index.NOT_ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS);
+    doc.add(f2);
+    doc.add(f);
     w.addDocument(doc);
     w.close();
 
     IndexReader r = IndexReader.open(dir);
     TermVectorOffsetInfo[] termOffsets = ((TermPositionVector) r.getTermFreqVector(0, "field")).getOffsets(0);
-    assertEquals(2, termOffsets.length);
+
+    // Token "" occurred once
+    assertEquals(1, termOffsets.length);
+    assertEquals(8, termOffsets[0].getStartOffset());
+    assertEquals(8, termOffsets[0].getEndOffset());
+
+    // Token "abcd" occurred three times
+    termOffsets = ((TermPositionVector) r.getTermFreqVector(0, "field")).getOffsets(1);
+    assertEquals(3, termOffsets.length);
     assertEquals(0, termOffsets[0].getStartOffset());
     assertEquals(4, termOffsets[0].getEndOffset());
     assertEquals(4, termOffsets[1].getStartOffset());
     assertEquals(8, termOffsets[1].getEndOffset());
+    assertEquals(8, termOffsets[2].getStartOffset());
+    assertEquals(12, termOffsets[2].getEndOffset());
+    r.close();
+    dir.close();
+  }
+
+  // LUCENE-1442
+  public void testDoubleOffsetCounting2() throws Exception {
+    MockRAMDirectory dir = new MockRAMDirectory();
+    IndexWriter w = new IndexWriter(dir, new SimpleAnalyzer(), IndexWriter.MaxFieldLength.LIMITED);
+    Document doc = new Document();
+    Field f = new Field("field", "abcd", Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS);
+    doc.add(f);
+    doc.add(f);
+    w.addDocument(doc);
+    w.close();
+
+    IndexReader r = IndexReader.open(dir);
+    TermVectorOffsetInfo[] termOffsets = ((TermPositionVector) r.getTermFreqVector(0, "field")).getOffsets(0);
+    assertEquals(2, termOffsets.length);
+    assertEquals(0, termOffsets[0].getStartOffset());
+    assertEquals(4, termOffsets[0].getEndOffset());
+    assertEquals(5, termOffsets[1].getStartOffset());
+    assertEquals(9, termOffsets[1].getEndOffset());
+    r.close();
+    dir.close();
+  }
+
+  // LUCENE-1448
+  public void testEndOffsetPositionCharAnalyzer() throws Exception {
+    MockRAMDirectory dir = new MockRAMDirectory();
+    IndexWriter w = new IndexWriter(dir, new WhitespaceAnalyzer(), IndexWriter.MaxFieldLength.LIMITED);
+    Document doc = new Document();
+    Field f = new Field("field", "abcd   ", Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS);
+    doc.add(f);
+    doc.add(f);
+    w.addDocument(doc);
+    w.close();
+
+    IndexReader r = IndexReader.open(dir);
+    TermVectorOffsetInfo[] termOffsets = ((TermPositionVector) r.getTermFreqVector(0, "field")).getOffsets(0);
+    assertEquals(2, termOffsets.length);
+    assertEquals(0, termOffsets[0].getStartOffset());
+    assertEquals(4, termOffsets[0].getEndOffset());
+    assertEquals(8, termOffsets[1].getStartOffset());
+    assertEquals(12, termOffsets[1].getEndOffset());
+    r.close();
+    dir.close();
+  }
+
+  // LUCENE-1448
+  public void testEndOffsetPositionWithCachingTokenFilter() throws Exception {
+    MockRAMDirectory dir = new MockRAMDirectory();
+    Analyzer analyzer = new WhitespaceAnalyzer();
+    IndexWriter w = new IndexWriter(dir, analyzer, IndexWriter.MaxFieldLength.LIMITED);
+    Document doc = new Document();
+    TokenStream stream = new CachingTokenFilter(analyzer.tokenStream("field", new StringReader("abcd
  ")));
+    Field f = new Field("field", stream, Field.TermVector.WITH_POSITIONS_OFFSETS);
+    doc.add(f);
+    doc.add(f);
+    w.addDocument(doc);
+    w.close();
+
+    IndexReader r = IndexReader.open(dir);
+    TermVectorOffsetInfo[] termOffsets = ((TermPositionVector) r.getTermFreqVector(0, "field")).getOffsets(0);
+    assertEquals(2, termOffsets.length);
+    assertEquals(0, termOffsets[0].getStartOffset());
+    assertEquals(4, termOffsets[0].getEndOffset());
+    assertEquals(8, termOffsets[1].getStartOffset());
+    assertEquals(12, termOffsets[1].getEndOffset());
+    r.close();
+    dir.close();
+  }
+
+  // LUCENE-1448
+  public void testEndOffsetPositionWithTeeSinkTokenFilter() throws Exception {
+    MockRAMDirectory dir = new MockRAMDirectory();
+    Analyzer analyzer = new WhitespaceAnalyzer();
+    IndexWriter w = new IndexWriter(dir, analyzer, IndexWriter.MaxFieldLength.LIMITED);
+    Document doc = new Document();
+    TeeSinkTokenFilter tee = new TeeSinkTokenFilter(analyzer.tokenStream("field", new StringReader("abcd
  ")));
+    TokenStream sink = tee.newSinkTokenStream();
+    Field f1 = new Field("field", tee, Field.TermVector.WITH_POSITIONS_OFFSETS);
+    Field f2 = new Field("field", sink, Field.TermVector.WITH_POSITIONS_OFFSETS);
+    doc.add(f1);
+    doc.add(f2);
+    w.addDocument(doc);
+    w.close();
+
+    IndexReader r = IndexReader.open(dir);
+    TermVectorOffsetInfo[] termOffsets = ((TermPositionVector) r.getTermFreqVector(0, "field")).getOffsets(0);
+    assertEquals(2, termOffsets.length);
+    assertEquals(0, termOffsets[0].getStartOffset());
+    assertEquals(4, termOffsets[0].getEndOffset());
+    assertEquals(8, termOffsets[1].getStartOffset());
+    assertEquals(12, termOffsets[1].getEndOffset());
+    r.close();
+    dir.close();
+  }
+  
+  // LUCENE-1448
+  public void testEndOffsetPositionStopFilter() throws Exception {
+    MockRAMDirectory dir = new MockRAMDirectory();
+    IndexWriter w = new IndexWriter(dir, new StopAnalyzer(), IndexWriter.MaxFieldLength.LIMITED);
+    Document doc = new Document();
+    Field f = new Field("field", "abcd the", Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS);
+    doc.add(f);
+    doc.add(f);
+    w.addDocument(doc);
+    w.close();
+
+    IndexReader r = IndexReader.open(dir);
+    TermVectorOffsetInfo[] termOffsets = ((TermPositionVector) r.getTermFreqVector(0, "field")).getOffsets(0);
+    assertEquals(2, termOffsets.length);
+    assertEquals(0, termOffsets[0].getStartOffset());
+    assertEquals(4, termOffsets[0].getEndOffset());
+    assertEquals(9, termOffsets[1].getStartOffset());
+    assertEquals(13, termOffsets[1].getEndOffset());
+    r.close();
+    dir.close();
+  }
+
+  // LUCENE-1448
+  public void testEndOffsetPositionStandard() throws Exception {
+    MockRAMDirectory dir = new MockRAMDirectory();
+    IndexWriter w = new IndexWriter(dir, new StandardAnalyzer(), IndexWriter.MaxFieldLength.LIMITED);
+    Document doc = new Document();
+    Field f = new Field("field", "abcd the  ", Field.Store.NO,
+        Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS);
+    Field f2 = new Field("field", "crunch man", Field.Store.NO,
+        Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS);
+    doc.add(f);
+    doc.add(f2);
+    w.addDocument(doc);
+    w.close();
+
+    IndexReader r = IndexReader.open(dir);
+    TermPositionVector tpv = ((TermPositionVector) r.getTermFreqVector(0, "field"));
+    TermVectorOffsetInfo[] termOffsets = tpv.getOffsets(0);
+    assertEquals(1, termOffsets.length);
+    assertEquals(0, termOffsets[0].getStartOffset());
+    assertEquals(4, termOffsets[0].getEndOffset());
+    termOffsets = tpv.getOffsets(1);
+    assertEquals(11, termOffsets[0].getStartOffset());
+    assertEquals(17, termOffsets[0].getEndOffset());
+    termOffsets = tpv.getOffsets(2);
+    assertEquals(18, termOffsets[0].getStartOffset());
+    assertEquals(21, termOffsets[0].getEndOffset());
+    r.close();
+    dir.close();
+  }
+
+  // LUCENE-1448
+  public void testEndOffsetPositionStandardEmptyField() throws Exception {
+    MockRAMDirectory dir = new MockRAMDirectory();
+    IndexWriter w = new IndexWriter(dir, new StandardAnalyzer(), IndexWriter.MaxFieldLength.LIMITED);
+    Document doc = new Document();
+    Field f = new Field("field", "", Field.Store.NO,
+                        Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS);
+    Field f2 = new Field("field", "crunch man", Field.Store.NO,
+        Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS);
+    doc.add(f);
+    doc.add(f2);
+    w.addDocument(doc);
+    w.close();
+
+    IndexReader r = IndexReader.open(dir);
+    TermPositionVector tpv = ((TermPositionVector) r.getTermFreqVector(0, "field"));
+    TermVectorOffsetInfo[] termOffsets = tpv.getOffsets(0);
+    assertEquals(1, termOffsets.length);
+    assertEquals(0, termOffsets[0].getStartOffset());
+    assertEquals(6, termOffsets[0].getEndOffset());
+    termOffsets = tpv.getOffsets(1);
+    assertEquals(7, termOffsets[0].getStartOffset());
+    assertEquals(10, termOffsets[0].getEndOffset());
+    r.close();
+    dir.close();
+  }
+
+  // LUCENE-1448
+  public void testEndOffsetPositionStandardEmptyField2() throws Exception {
+    MockRAMDirectory dir = new MockRAMDirectory();
+    IndexWriter w = new IndexWriter(dir, new StandardAnalyzer(), IndexWriter.MaxFieldLength.LIMITED);
+    Document doc = new Document();
+
+    Field f = new Field("field", "abcd", Field.Store.NO,
+                        Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS);
+    doc.add(f);
+    doc.add(new Field("field", "", Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS));
+
+    Field f2 = new Field("field", "crunch", Field.Store.NO,
+        Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS);
+    doc.add(f2);
+
+    w.addDocument(doc);
+    w.close();
+
+    IndexReader r = IndexReader.open(dir);
+    TermPositionVector tpv = ((TermPositionVector) r.getTermFreqVector(0, "field"));
+    TermVectorOffsetInfo[] termOffsets = tpv.getOffsets(0);
+    assertEquals(1, termOffsets.length);
+    assertEquals(0, termOffsets[0].getStartOffset());
+    assertEquals(4, termOffsets[0].getEndOffset());
+    termOffsets = tpv.getOffsets(1);
+    assertEquals(5, termOffsets[0].getStartOffset());
+    assertEquals(11, termOffsets[0].getEndOffset());
     r.close();
     dir.close();
   }
 
+
   // LUCENE-1468 -- make sure opening an IndexWriter with
   // create=true does not remove non-index files
   



Mime
View raw message