lucene-java-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From yo...@apache.org
Subject svn commit: r596398 - in /lucene/java/trunk/src: java/org/apache/lucene/analysis/Token.java java/org/apache/lucene/index/DocumentsWriter.java test/org/apache/lucene/index/TestDocumentWriter.java
Date Mon, 19 Nov 2007 19:10:39 GMT
Author: yonik
Date: Mon Nov 19 11:10:37 2007
New Revision: 596398

URL: http://svn.apache.org/viewvc?rev=596398&view=rev
Log:
LUCENE-1057: call clear when reusing token, change clear to only reset essential fields,
re-add Token.clone()

Modified:
    lucene/java/trunk/src/java/org/apache/lucene/analysis/Token.java
    lucene/java/trunk/src/java/org/apache/lucene/index/DocumentsWriter.java
    lucene/java/trunk/src/test/org/apache/lucene/index/TestDocumentWriter.java

Modified: lucene/java/trunk/src/java/org/apache/lucene/analysis/Token.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/analysis/Token.java?rev=596398&r1=596397&r2=596398&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/analysis/Token.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/analysis/Token.java Mon Nov 19 11:10:37 2007
@@ -361,14 +361,29 @@
     return sb.toString();
   }
 
-  /** Reset all state for this token back to defaults. */
+  /** Resets the term text, payload, and positionIncrement to default.
+   * Other fields such as startOffset, endOffset and the token type are
+   * not reset since they are normally overwritten by the tokenizer. */
   public void clear() {
     payload = null;
     // Leave termBuffer to allow re-use
     termLength = 0;
     termText = null;
     positionIncrement = 1;
-    startOffset = endOffset = 0;
-    type = DEFAULT_TYPE;
+    // startOffset = endOffset = 0;
+    // type = DEFAULT_TYPE;
+  }
+
+  public Object clone() {
+    try {
+      Token t = (Token)super.clone();
+      if (termBuffer != null) {
+        t.termBuffer = null;
+        t.setTermBuffer(termBuffer, 0, termLength);
+      }
+      return t;
+    } catch (CloneNotSupportedException e) {
+      throw new RuntimeException(e);  // shouldn't happen
+    }
   }
 }

Modified: lucene/java/trunk/src/java/org/apache/lucene/index/DocumentsWriter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/index/DocumentsWriter.java?rev=596398&r1=596397&r2=596398&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/index/DocumentsWriter.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/index/DocumentsWriter.java Mon Nov 19 11:10:37 2007
@@ -1281,6 +1281,7 @@
         if (!field.isTokenized()) {		  // un-tokenized field
           String stringValue = field.stringValue();
           Token token = localToken;
+          token.clear();
           token.setTermText(stringValue);
           token.setStartOffset(offset);
           token.setEndOffset(offset + stringValue.length());
@@ -1319,7 +1320,10 @@
           try {
             offsetEnd = offset-1;
             Token token;
-            while((token = stream.next(localToken)) != null) {
+            for(;;) {
+              localToken.clear();
+              token = stream.next(localToken);
+              if (token == null) break;
               position += (token.getPositionIncrement() - 1);
               addPosition(token);
               if (++length >= maxFieldLength) {

Modified: lucene/java/trunk/src/test/org/apache/lucene/index/TestDocumentWriter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/test/org/apache/lucene/index/TestDocumentWriter.java?rev=596398&r1=596397&r2=596398&view=diff
==============================================================================
--- lucene/java/trunk/src/test/org/apache/lucene/index/TestDocumentWriter.java (original)
+++ lucene/java/trunk/src/test/org/apache/lucene/index/TestDocumentWriter.java Mon Nov 19 11:10:37 2007
@@ -17,22 +17,17 @@
  * limitations under the License.
  */
 
-import org.apache.lucene.util.LuceneTestCase;
-import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.SimpleAnalyzer;
-import org.apache.lucene.analysis.Token;
-import org.apache.lucene.analysis.WhitespaceAnalyzer;
-import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.WhitespaceTokenizer;
-import org.apache.lucene.document.*;
+import org.apache.lucene.analysis.*;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
 import org.apache.lucene.document.Field.TermVector;
+import org.apache.lucene.document.Fieldable;
 import org.apache.lucene.search.Similarity;
 import org.apache.lucene.store.RAMDirectory;
+import org.apache.lucene.util.LuceneTestCase;
 
-import java.io.Reader;
 import java.io.IOException;
-
-import java.util.Arrays;
+import java.io.Reader;
 
 public class TestDocumentWriter extends LuceneTestCase {
   private RAMDirectory dir;
@@ -130,7 +125,71 @@
     assertEquals(0, termPositions.nextPosition());
     assertEquals(502, termPositions.nextPosition());
   }
-  
+
+  public void testTokenReuse() throws IOException {
+    Analyzer analyzer = new Analyzer() {
+      public TokenStream tokenStream(String fieldName, Reader reader) {
+        return new TokenFilter(new WhitespaceTokenizer(reader)) {
+          boolean first=true;
+          Token buffered;
+
+          public Token next() throws IOException {
+            return input.next();
+          }
+
+          public Token next(Token result) throws IOException {
+            if (buffered != null) {
+              Token t = buffered;
+              buffered=null;
+              return t;
+            }
+            Token t = input.next(result);
+            if (t==null) return null;
+            if (Character.isDigit(t.termBuffer()[0])) {
+              t.setPositionIncrement(t.termBuffer()[0] - '0');
+            }
+            if (first) {
+              // set payload on first position only
+              t.setPayload(new Payload(new byte[]{100}));
+              first = false;
+            }
+
+            // index a "synonym" for every token
+            buffered = (Token)t.clone();
+            buffered.setPayload(null);
+            buffered.setPositionIncrement(0);
+            buffered.setTermBuffer(new char[]{'b'}, 0, 1);
+
+            return t;
+          }
+        };
+      }
+    };
+
+    IndexWriter writer = new IndexWriter(dir, analyzer, true);
+
+    Document doc = new Document();
+    doc.add(new Field("f1", "a 5 a a", Field.Store.YES, Field.Index.TOKENIZED));
+
+    writer.addDocument(doc);
+    writer.flush();
+    SegmentInfo info = writer.newestSegment();
+    writer.close();
+    SegmentReader reader = SegmentReader.get(info);
+
+    TermPositions termPositions = reader.termPositions(new Term("f1", "a"));
+    assertTrue(termPositions.next());
+    int freq = termPositions.freq();
+    assertEquals(3, freq);
+    assertEquals(0, termPositions.nextPosition());
+    assertEquals(true, termPositions.isPayloadAvailable());
+    assertEquals(6, termPositions.nextPosition());
+    assertEquals(false, termPositions.isPayloadAvailable());
+    assertEquals(7, termPositions.nextPosition());
+    assertEquals(false, termPositions.isPayloadAvailable());
+  }
+
+
   public void testPreAnalyzedField() throws IOException {
     Similarity similarity = Similarity.getDefault();
     IndexWriter writer = new IndexWriter(dir, new SimpleAnalyzer(), true);



Mime
View raw message