lucene-dev mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From Marvin Humphrey <mar...@rectangular.com>
Subject bytecount as prefix
Date Tue, 11 Apr 2006 19:05:55 GMT
Greets,

I'm back working on converting Lucene to using a byte count instead  
of a char count at as a prefix at the head of each String.  Three  
tests are failing: TestIndexModifier, TestConstantScoreRangeQuery,  
and TestRangeFilter.

Why those and not others?

Marvin Humphrey
Rectangular Research
http://www.rectangular.com/


     [junit] Testsuite: org.apache.lucene.index.TestIndexModifier
     [junit] Tests run: 3, Failures: 0, Errors: 0, Time elapsed:  
26.784 sec

     [junit] ------------- Standard Error -----------------
     [junit] java.lang.RuntimeException: Internal error: 2 deleted 0  
documents, term=id:242
     [junit]     at org.apache.lucene.index.IndexThread.run 
(TestIndexModifier.java:245)
     [junit] java.lang.RuntimeException: Internal error: 1 deleted 0  
documents, term=id:241
     [junit]     at org.apache.lucene.index.IndexThread.run 
(TestIndexModifier.java:245)
     [junit] java.lang.RuntimeException: Internal error: 2 deleted 0  
documents, term=id:287
     [junit]     at org.apache.lucene.index.IndexThread.run 
(TestIndexModifier.java:245)
     [junit] java.lang.RuntimeException: Internal error: 1 deleted 0  
documents, term=id:286
     [junit]     at org.apache.lucene.index.IndexThread.run 
(TestIndexModifier.java:245)
     [junit] java.lang.RuntimeException: Internal error: 2 deleted 0  
documents, term=id:294
     [junit]     at org.apache.lucene.index.IndexThread.run 
(TestIndexModifier.java:245)
     [junit] java.lang.RuntimeException: Internal error: 1 deleted 0  
documents, term=id:291
     [junit]     at org.apache.lucene.index.IndexThread.run 
(TestIndexModifier.java:245)
     [junit] ------------- ---------------- ---------------

     [junit] Testsuite:  
org.apache.lucene.search.TestConstantScoreRangeQuery
     [junit] Tests run: 7, Failures: 2, Errors: 0, Time elapsed: 1.07  
sec

     [junit] Testcase: testRangeQueryId 
(org.apache.lucene.search.TestConstantScoreRangeQuery):   FAILED
     [junit] find all expected:<10001> but was:<0>
     [junit] junit.framework.AssertionFailedError: find all  
expected:<10001> but was:<0>
     [junit]     at  
org.apache.lucene.search.TestConstantScoreRangeQuery.testRangeQueryId 
(TestConstantScoreRangeQuery.java:220)
     [junit]     at sun.reflect.NativeMethodAccessorImpl.invoke0 
(Native Method)
     [junit]     at sun.reflect.NativeMethodAccessorImpl.invoke 
(NativeMethodAccessorImpl.java:39)
     [junit]     at sun.reflect.DelegatingMethodAccessorImpl.invoke 
(DelegatingMethodAccessorImpl.java:25)


     [junit] Testcase: testRangeQueryRand 
(org.apache.lucene.search.TestConstantScoreRangeQuery): FAILED
     [junit] find all expected:<10001> but was:<0>
     [junit] junit.framework.AssertionFailedError: find all  
expected:<10001> but was:<0>
     [junit]     at  
org.apache.lucene.search.TestConstantScoreRangeQuery.testRangeQueryRand( 
TestConstantScoreRangeQuery.java:300)
     [junit]     at sun.reflect.NativeMethodAccessorImpl.invoke0 
(Native Method)
     [junit]     at sun.reflect.NativeMethodAccessorImpl.invoke 
(NativeMethodAccessorImpl.java:39)
     [junit]     at sun.reflect.DelegatingMethodAccessorImpl.invoke 
(DelegatingMethodAccessorImpl.java:25)


     [junit] Test  
org.apache.lucene.search.TestConstantScoreRangeQuery FAILED


     [junit] Testcase: testRangeFilterId 
(org.apache.lucene.search.TestRangeFilter):      FAILED
     [junit] find all expected:<10001> but was:<0>
     [junit] junit.framework.AssertionFailedError: find all  
expected:<10001> but was:<0>
     [junit]     at  
org.apache.lucene.search.TestRangeFilter.testRangeFilterId 
(TestRangeFilter.java:63)
     [junit]     at sun.reflect.NativeMethodAccessorImpl.invoke0 
(Native Method)
     [junit]     at sun.reflect.NativeMethodAccessorImpl.invoke 
(NativeMethodAccessorImpl.java:39)
     [junit]     at sun.reflect.DelegatingMethodAccessorImpl.invoke 
(DelegatingMethodAccessorImpl.java:25)


     [junit] Testcase: testRangeFilterRand 
(org.apache.lucene.search.TestRangeFilter):    FAILED
     [junit] find all expected:<10001> but was:<0>
     [junit] junit.framework.AssertionFailedError: find all  
expected:<10001> but was:<0>
     [junit]     at  
org.apache.lucene.search.TestRangeFilter.testRangeFilterRand 
(TestRangeFilter.java:142)
     [junit]     at sun.reflect.NativeMethodAccessorImpl.invoke0 
(Native Method)
     [junit]     at sun.reflect.NativeMethodAccessorImpl.invoke 
(NativeMethodAccessorImpl.java:39)
     [junit]     at sun.reflect.DelegatingMethodAccessorImpl.invoke 
(DelegatingMethodAccessorImpl.java:25)


     [junit] Test org.apache.lucene.search.TestRangeFilter FAILED


slothbear:~/Desktop/search_engine_stuff/lucene_393239 marvin$ svn diff
Index: src/java/org/apache/lucene/index/TermBuffer.java
===================================================================
--- src/java/org/apache/lucene/index/TermBuffer.java    (revision  
393239)
+++ src/java/org/apache/lucene/index/TermBuffer.java    (working copy)
@@ -18,42 +18,41 @@
import java.io.IOException;
import org.apache.lucene.store.IndexInput;
+import org.apache.lucene.util.StringHelper;
final class TermBuffer implements Cloneable {
-  private static final char[] NO_CHARS = new char[0];
+  private static final byte[] NO_BYTES = new byte[0];
    private String field;
-  private char[] text = NO_CHARS;
-  private int textLength;
+  private byte[] bytes = NO_BYTES;
+  private int bytesLength;
    private Term term;                            // cached
    public final int compareTo(TermBuffer other) {
      if (field == other.field)                    // fields are  
interned
-      return compareChars(text, textLength, other.text,  
other.textLength);
+      return compareBytes(bytes, bytesLength, other.bytes,  
other.bytesLength);
      else
        return field.compareTo(other.field);
    }
-  private static final int compareChars(char[] v1, int len1,
-                                        char[] v2, int len2) {
+  private static final int compareBytes(byte[] bytes1, int len1,
+                                        byte[] bytes2, int len2) {
      int end = Math.min(len1, len2);
      for (int k = 0; k < end; k++) {
-      char c1 = v1[k];
-      char c2 = v2[k];
-      if (c1 != c2) {
-        return c1 - c2;
+      if (bytes1[k] != bytes2[k]) {
+        return bytes1[k] - bytes2[k];
        }
      }
      return len1 - len2;
    }
-  private final void setTextLength(int newLength) {
-    if (text.length < newLength) {
-      char[] newText = new char[newLength];
-      System.arraycopy(text, 0, newText, 0, textLength);
-      text = newText;
+  private final void setBytesLength(int newLength) {
+    if (bytes.length < newLength) {
+      byte[] newBytes = new byte[newLength];
+      System.arraycopy(bytes, 0, newBytes, 0, bytesLength);
+      bytes = newBytes;
      }
-    textLength = newLength;
+    bytesLength = newLength;
    }
    public final void read(IndexInput input, FieldInfos fieldInfos)
@@ -62,28 +61,29 @@
      int start = input.readVInt();
      int length = input.readVInt();
      int totalLength = start + length;
-    setTextLength(totalLength);
-    input.readChars(this.text, start, length);
+    setBytesLength(totalLength);
+    input.readBytes(this.bytes, start, length);
      this.field = fieldInfos.fieldName(input.readVInt());
    }
-  public final void set(Term term) {
-    if (term == null) {
+  public final void set(Term t) {
+    if (t == null) {
        reset();
        return;
      }
-    // copy text into the buffer
-    setTextLength(term.text().length());
-    term.text().getChars(0, term.text().length(), text, 0);
-
-    this.field = term.field();
-    this.term = term;
+    // convert chars into UTF-8 bytes, store in buffer
+    try {
+        bytes = t.text().getBytes("UTF-8");
+    } catch (java.io.UnsupportedEncodingException e) { }
+    setBytesLength(bytes.length);
+    this.field = t.field();
+    this.term = t;
    }
    public final void set(TermBuffer other) {
-    setTextLength(other.textLength);
-    System.arraycopy(other.text, 0, text, 0, textLength);
+    setBytesLength(other.bytesLength);
+    System.arraycopy(other.bytes, 0, bytes, 0, bytesLength);
      this.field = other.field;
      this.term = other.term;
@@ -91,7 +91,7 @@
    public void reset() {
      this.field = null;
-    this.textLength = 0;
+    this.bytesLength = 0;
      this.term = null;
    }
@@ -100,8 +100,12 @@
        return null;
      if (term == null)
-      term = new Term(field, new String(text, 0, textLength), false);
+      try {
+        term = new Term(field,
+            new String(bytes, 0, bytesLength, "UTF-8"), false );
+    } catch (java.io.UnsupportedEncodingException e) { }
+
      return term;
    }
@@ -111,8 +115,8 @@
        clone = (TermBuffer)super.clone();
      } catch (CloneNotSupportedException e) {}
-    clone.text = new char[text.length];
-    System.arraycopy(text, 0, clone.text, 0, textLength);
+    clone.bytes = new byte[bytes.length];
+    System.arraycopy(bytes, 0, clone.bytes, 0, bytesLength);
      return clone;
    }
Index: src/java/org/apache/lucene/index/TermInfosWriter.java
===================================================================
--- src/java/org/apache/lucene/index/TermInfosWriter.java        
(revision 393239)
+++ src/java/org/apache/lucene/index/TermInfosWriter.java        
(working copy)
@@ -33,6 +33,8 @@
    private IndexOutput output;
    private Term lastTerm = new Term("", "");
    private TermInfo lastTi = new TermInfo();
+  private static final byte[] NO_BYTES = new byte[0];
+  private byte[] lastBytes = NO_BYTES;
    private long size = 0;
    // TODO: the default values for these two parameters should be  
settable from
@@ -121,16 +123,21 @@
    private final void writeTerm(Term term)
         throws IOException {
-    int start = StringHelper.stringDifference(lastTerm.text,  
term.text);
-    int length = term.text.length() - start;
+    byte[] bytes = term.text().getBytes("UTF-8");
+    int totalLength = bytes.length;
+    int start = StringHelper.bytesDifference(lastBytes, bytes);
+    int diffLength = totalLength - start;
+
      output.writeVInt(start);                   // write shared  
prefix length
-    output.writeVInt(length);                  // write delta length
-    output.writeChars(term.text, start, length);  // write delta chars
+    output.writeVInt(diffLength);                  // write delta  
length
+    for (int i = start ; i < totalLength; i++) {
+      output.writeByte(bytes[i]);              // write delta UTF-8  
bytes
+    }
      output.writeVInt(fieldInfos.fieldNumber(term.field)); // write  
field num
-    lastTerm = term;
+    lastBytes = bytes;
    }
Index: src/java/org/apache/lucene/store/IndexInput.java
===================================================================
--- src/java/org/apache/lucene/store/IndexInput.java    (revision  
393239)
+++ src/java/org/apache/lucene/store/IndexInput.java    (working copy)
@@ -17,13 +17,14 @@
   */
import java.io.IOException;
+import org.apache.lucene.util.StringHelper;
/** Abstract base class for input from a file in a {@link Directory}.  A
   * random-access input stream.  Used for all Lucene index input  
operations.
   * @see Directory
   */
public abstract class IndexInput implements Cloneable {
-  private char[] chars;                           // used by  
readString()
+  private byte[] bytes;                           // used by  
readString()
    /** Reads and returns a single byte.
     * @see IndexOutput#writeByte(byte)
@@ -87,10 +88,10 @@
     */
    public String readString() throws IOException {
      int length = readVInt();
-    if (chars == null || length > chars.length)
-      chars = new char[length];
-    readChars(chars, 0, length);
-    return new String(chars, 0, length);
+    if (bytes == null || length > bytes.length)
+      bytes = new byte[length];
+    readBytes(bytes, 0, length);
+    return new String(bytes, 0, length, "UTF-8");
    }
    /** Reads UTF-8 encoded characters into an array.
@@ -104,15 +105,29 @@
      final int end = start + length;
      for (int i = start; i < end; i++) {
        byte b = readByte();
-      if ((b & 0x80) == 0)
-       buffer[i] = (char)(b & 0x7F);
-      else if ((b & 0xE0) != 0xE0) {
-       buffer[i] = (char)(((b & 0x1F) << 6)
-                | (readByte() & 0x3F));
-      } else
-       buffer[i] = (char)(((b & 0x0F) << 12)
-               | ((readByte() & 0x3F) << 6)
-               |  (readByte() & 0x3F));
+      switch (StringHelper.TRAILING_BYTES_FOR_UTF8[b & 0xFF]) {
+        case 0:
+          buffer[i] = (char)(b & 0x7F);
+          break;
+        case 1:
+          buffer[i] = (char)(((b & 0x1F) << 6)
+            | (readByte() & 0x3F));
+          break;
+        case 2:
+          buffer[i] = (char)(((b & 0x0F) << 12)
+            | ((readByte() & 0x3F) << 6)
+            |  (readByte() & 0x3F));
+          break;
+        case 3:
+          int utf32 = (((b & 0x0F) << 18)
+            | ((readByte() & 0x3F) << 12)
+            | ((readByte() & 0x3F) << 6)
+            |  (readByte() & 0x3F));
+          buffer[i] = (char)((utf32 >> 10) + 0xD7C0);
+          i++;
+          buffer[i] = (char)((utf32 & 0x03FF) + 0xDC00);
+          break;
+      }
      }
    }
@@ -148,7 +163,7 @@
        clone = (IndexInput)super.clone();
      } catch (CloneNotSupportedException e) {}
-    clone.chars = null;
+    clone.bytes = null;
      return clone;
    }
Index: src/java/org/apache/lucene/store/IndexOutput.java
===================================================================
--- src/java/org/apache/lucene/store/IndexOutput.java   (revision  
393239)
+++ src/java/org/apache/lucene/store/IndexOutput.java   (working copy)
@@ -17,6 +17,7 @@
   */
import java.io.IOException;
+import org.apache.lucene.util.StringHelper;
/** Abstract base class for output to a file in a Directory.  A  
random-access
   * output stream.  Used for all Lucene index output operations.
@@ -85,7 +86,7 @@
     * @see IndexInput#readString()
     */
    public void writeString(String s) throws IOException {
-    int length = s.length();
+    int length = StringHelper.countUTF8Bytes(s);
      writeVInt(length);
      writeChars(s, 0, length);
    }
@@ -101,15 +102,37 @@
      final int end = start + length;
      for (int i = start; i < end; i++) {
        final int code = (int)s.charAt(i);
-      if (code >= 0x01 && code <= 0x7F)
-       writeByte((byte)code);
-      else if (((code >= 0x80) && (code <= 0x7FF)) || code == 0) {
-       writeByte((byte)(0xC0 | (code >> 6)));
-       writeByte((byte)(0x80 | (code & 0x3F)));
+      if (code < 0x80)
+        writeByte((byte)code);
+      else if (code < 0x800) {
+        writeByte((byte)(0xC0 | (code >> 6)));
+        writeByte((byte)(0x80 | (code & 0x3F)));
+      } else if (code < 0xD800 || code > 0xDFFF) {
+        writeByte((byte)(0xE0 | (code >> 12)));
+        writeByte((byte)(0x80 | ((code >> 6) & 0x3F)));
+        writeByte((byte)(0x80 | (code & 0x3F)));
        } else {
-       writeByte((byte)(0xE0 | (code >>> 12)));
-       writeByte((byte)(0x80 | ((code >> 6) & 0x3F)));
-       writeByte((byte)(0x80 | (code & 0x3F)));
+        // surrogate pair
+        int utf32;
+        // confirm valid high surrogate
+        if (code < 0xDC00 && (i < end-1)) {
+          utf32 = ((int)s.charAt(i+1));
+          // confirm valid low surrogate and write pair
+          if (utf32 >= 0xDC00 && utf32 <= 0xDFFF) {
+            utf32 = ((code - 0xD7C0) << 10) + (utf32 & 0x3FF);
+            i++;
+            writeByte((byte)(0xF0 | (utf32 >> 18)));
+            writeByte((byte)(0x80 | ((utf32 >> 12) & 0x3F)));
+            writeByte((byte)(0x80 | ((utf32 >> 6) & 0x3F)));
+            writeByte((byte)(0x80 | (utf32 & 0x3F)));
+            continue;
+          }
+        }
+        // replace unpaired surrogate or out-of-order low surrogate
+        // with substitution character
+        writeByte((byte)0xEF);
+        writeByte((byte)0xBF);
+        writeByte((byte)0xBD);
        }
      }
    }
Index: src/java/org/apache/lucene/util/StringHelper.java
===================================================================
--- src/java/org/apache/lucene/util/StringHelper.java   (revision  
393239)
+++ src/java/org/apache/lucene/util/StringHelper.java   (working copy)
@@ -16,6 +16,8 @@
   * limitations under the License.
   */
+import java.nio.ByteBuffer;
+import java.nio.CharBuffer;
/**
   * Methods for manipulating strings.
@@ -25,6 +27,64 @@
public abstract class StringHelper {
    /**
+   * Compares two byte[] arrays, element by element, and returns the
+   * number of elements common to both arrays.
+   *
+   * @param bytes1 The first byte[] to compare
+   * @param bytes2 The second byte[] to compare
+   * @return The number of common elements.
+   */
+  public static final int bytesDifference(byte[] bytes1, byte[]  
bytes2) {
+    int len1 = bytes1.length;
+    int len2 = bytes2.length;
+    int len = len1 < len2 ? len1 : len2;
+    for (int i = 0; i < len; i++) {
+      if (bytes1[i] != bytes2[i]) {
+        return i;
+      }
+    }
+    return len;
+  }
+
+  public static final byte[] TRAILING_BYTES_FOR_UTF8 = {
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
+    3,3,3,3,3,3,3,3
+  };
+
+  /**
+   * Count the number of bytes which would be occupied by this string
+   * were it to be converted to UTF8.
+   *
+   * @param s The string to operate against
+   * @return The number of UTF8 bytes
+   */
+  public static final int countUTF8Bytes(String s) {
+    int byteCount = s.length();    // start with 1 byte per char
+    int max = byteCount - 1;
+    for (int i = 0; i < max; i++) {
+      // add the number of trailing bytes for each char
+      byteCount += TRAILING_BYTES_FOR_UTF8[ (int)s.charAt(i) ];
+    }
+    return byteCount;
+  }
+
+
+  /**
     * Compares two strings, character by character, and returns the
     * first position where the two strings differ from one another.
     *




---------------------------------------------------------------------
To unsubscribe, e-mail: java-dev-unsubscribe@lucene.apache.org
For additional commands, e-mail: java-dev-help@lucene.apache.org


Mime
View raw message