lucene-dev mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From "David Balmain" <dbalmain...@gmail.com>
Subject Re: bytecount as prefix
Date Sat, 06 May 2006 08:11:02 GMT
Hi Marvin,

Where are you with this? I also have a vested interest in seeing
Lucene move to using byte counts. I was wondering if I could help out.
Is the patch you pasted here the latest you have?

Cheers,
Dave

On 4/12/06, Marvin Humphrey <marvin@rectangular.com> wrote:
> Greets,
>
> I'm back working on converting Lucene to using a byte count instead
> of a char count at as a prefix at the head of each String.  Three
> tests are failing: TestIndexModifier, TestConstantScoreRangeQuery,
> and TestRangeFilter.
>
> Why those and not others?
>
> Marvin Humphrey
> Rectangular Research
> http://www.rectangular.com/
>
>
>     [junit] Testsuite: org.apache.lucene.index.TestIndexModifier
>     [junit] Tests run: 3, Failures: 0, Errors: 0, Time elapsed:
> 26.784 sec
>
>     [junit] ------------- Standard Error -----------------
>     [junit] java.lang.RuntimeException: Internal error: 2 deleted 0
> documents, term=id:242
>     [junit]     at org.apache.lucene.index.IndexThread.run
> (TestIndexModifier.java:245)
>     [junit] java.lang.RuntimeException: Internal error: 1 deleted 0
> documents, term=id:241
>     [junit]     at org.apache.lucene.index.IndexThread.run
> (TestIndexModifier.java:245)
>     [junit] java.lang.RuntimeException: Internal error: 2 deleted 0
> documents, term=id:287
>     [junit]     at org.apache.lucene.index.IndexThread.run
> (TestIndexModifier.java:245)
>     [junit] java.lang.RuntimeException: Internal error: 1 deleted 0
> documents, term=id:286
>     [junit]     at org.apache.lucene.index.IndexThread.run
> (TestIndexModifier.java:245)
>     [junit] java.lang.RuntimeException: Internal error: 2 deleted 0
> documents, term=id:294
>     [junit]     at org.apache.lucene.index.IndexThread.run
> (TestIndexModifier.java:245)
>     [junit] java.lang.RuntimeException: Internal error: 1 deleted 0
> documents, term=id:291
>     [junit]     at org.apache.lucene.index.IndexThread.run
> (TestIndexModifier.java:245)
>     [junit] ------------- ---------------- ---------------
>
>     [junit] Testsuite:
> org.apache.lucene.search.TestConstantScoreRangeQuery
>     [junit] Tests run: 7, Failures: 2, Errors: 0, Time elapsed: 1.07
> sec
>
>     [junit] Testcase: testRangeQueryId
> (org.apache.lucene.search.TestConstantScoreRangeQuery):   FAILED
>     [junit] find all expected:<10001> but was:<0>
>     [junit] junit.framework.AssertionFailedError: find all
> expected:<10001> but was:<0>
>     [junit]     at
> org.apache.lucene.search.TestConstantScoreRangeQuery.testRangeQueryId
> (TestConstantScoreRangeQuery.java:220)
>     [junit]     at sun.reflect.NativeMethodAccessorImpl.invoke0
> (Native Method)
>     [junit]     at sun.reflect.NativeMethodAccessorImpl.invoke
> (NativeMethodAccessorImpl.java:39)
>     [junit]     at sun.reflect.DelegatingMethodAccessorImpl.invoke
> (DelegatingMethodAccessorImpl.java:25)
>
>
>     [junit] Testcase: testRangeQueryRand
> (org.apache.lucene.search.TestConstantScoreRangeQuery): FAILED
>     [junit] find all expected:<10001> but was:<0>
>     [junit] junit.framework.AssertionFailedError: find all
> expected:<10001> but was:<0>
>     [junit]     at
> org.apache.lucene.search.TestConstantScoreRangeQuery.testRangeQueryRand(
> TestConstantScoreRangeQuery.java:300)
>     [junit]     at sun.reflect.NativeMethodAccessorImpl.invoke0
> (Native Method)
>     [junit]     at sun.reflect.NativeMethodAccessorImpl.invoke
> (NativeMethodAccessorImpl.java:39)
>     [junit]     at sun.reflect.DelegatingMethodAccessorImpl.invoke
> (DelegatingMethodAccessorImpl.java:25)
>
>
>     [junit] Test
> org.apache.lucene.search.TestConstantScoreRangeQuery FAILED
>
>
>     [junit] Testcase: testRangeFilterId
> (org.apache.lucene.search.TestRangeFilter):      FAILED
>     [junit] find all expected:<10001> but was:<0>
>     [junit] junit.framework.AssertionFailedError: find all
> expected:<10001> but was:<0>
>     [junit]     at
> org.apache.lucene.search.TestRangeFilter.testRangeFilterId
> (TestRangeFilter.java:63)
>     [junit]     at sun.reflect.NativeMethodAccessorImpl.invoke0
> (Native Method)
>     [junit]     at sun.reflect.NativeMethodAccessorImpl.invoke
> (NativeMethodAccessorImpl.java:39)
>     [junit]     at sun.reflect.DelegatingMethodAccessorImpl.invoke
> (DelegatingMethodAccessorImpl.java:25)
>
>
>     [junit] Testcase: testRangeFilterRand
> (org.apache.lucene.search.TestRangeFilter):    FAILED
>     [junit] find all expected:<10001> but was:<0>
>     [junit] junit.framework.AssertionFailedError: find all
> expected:<10001> but was:<0>
>     [junit]     at
> org.apache.lucene.search.TestRangeFilter.testRangeFilterRand
> (TestRangeFilter.java:142)
>     [junit]     at sun.reflect.NativeMethodAccessorImpl.invoke0
> (Native Method)
>     [junit]     at sun.reflect.NativeMethodAccessorImpl.invoke
> (NativeMethodAccessorImpl.java:39)
>     [junit]     at sun.reflect.DelegatingMethodAccessorImpl.invoke
> (DelegatingMethodAccessorImpl.java:25)
>
>
>     [junit] Test org.apache.lucene.search.TestRangeFilter FAILED
>
>
> slothbear:~/Desktop/search_engine_stuff/lucene_393239 marvin$ svn diff
> Index: src/java/org/apache/lucene/index/TermBuffer.java
> ===================================================================
> --- src/java/org/apache/lucene/index/TermBuffer.java    (revision
> 393239)
> +++ src/java/org/apache/lucene/index/TermBuffer.java    (working copy)
> @@ -18,42 +18,41 @@
> import java.io.IOException;
> import org.apache.lucene.store.IndexInput;
> +import org.apache.lucene.util.StringHelper;
> final class TermBuffer implements Cloneable {
> -  private static final char[] NO_CHARS = new char[0];
> +  private static final byte[] NO_BYTES = new byte[0];
>    private String field;
> -  private char[] text = NO_CHARS;
> -  private int textLength;
> +  private byte[] bytes = NO_BYTES;
> +  private int bytesLength;
>    private Term term;                            // cached
>    public final int compareTo(TermBuffer other) {
>      if (field == other.field)                    // fields are
> interned
> -      return compareChars(text, textLength, other.text,
> other.textLength);
> +      return compareBytes(bytes, bytesLength, other.bytes,
> other.bytesLength);
>      else
>        return field.compareTo(other.field);
>    }
> -  private static final int compareChars(char[] v1, int len1,
> -                                        char[] v2, int len2) {
> +  private static final int compareBytes(byte[] bytes1, int len1,
> +                                        byte[] bytes2, int len2) {
>      int end = Math.min(len1, len2);
>      for (int k = 0; k < end; k++) {
> -      char c1 = v1[k];
> -      char c2 = v2[k];
> -      if (c1 != c2) {
> -        return c1 - c2;
> +      if (bytes1[k] != bytes2[k]) {
> +        return bytes1[k] - bytes2[k];
>        }
>      }
>      return len1 - len2;
>    }
> -  private final void setTextLength(int newLength) {
> -    if (text.length < newLength) {
> -      char[] newText = new char[newLength];
> -      System.arraycopy(text, 0, newText, 0, textLength);
> -      text = newText;
> +  private final void setBytesLength(int newLength) {
> +    if (bytes.length < newLength) {
> +      byte[] newBytes = new byte[newLength];
> +      System.arraycopy(bytes, 0, newBytes, 0, bytesLength);
> +      bytes = newBytes;
>      }
> -    textLength = newLength;
> +    bytesLength = newLength;
>    }
>    public final void read(IndexInput input, FieldInfos fieldInfos)
> @@ -62,28 +61,29 @@
>      int start = input.readVInt();
>      int length = input.readVInt();
>      int totalLength = start + length;
> -    setTextLength(totalLength);
> -    input.readChars(this.text, start, length);
> +    setBytesLength(totalLength);
> +    input.readBytes(this.bytes, start, length);
>      this.field = fieldInfos.fieldName(input.readVInt());
>    }
> -  public final void set(Term term) {
> -    if (term == null) {
> +  public final void set(Term t) {
> +    if (t == null) {
>        reset();
>        return;
>      }
> -    // copy text into the buffer
> -    setTextLength(term.text().length());
> -    term.text().getChars(0, term.text().length(), text, 0);
> -
> -    this.field = term.field();
> -    this.term = term;
> +    // convert chars into UTF-8 bytes, store in buffer
> +    try {
> +        bytes = t.text().getBytes("UTF-8");
> +    } catch (java.io.UnsupportedEncodingException e) { }
> +    setBytesLength(bytes.length);
> +    this.field = t.field();
> +    this.term = t;
>    }
>    public final void set(TermBuffer other) {
> -    setTextLength(other.textLength);
> -    System.arraycopy(other.text, 0, text, 0, textLength);
> +    setBytesLength(other.bytesLength);
> +    System.arraycopy(other.bytes, 0, bytes, 0, bytesLength);
>      this.field = other.field;
>      this.term = other.term;
> @@ -91,7 +91,7 @@
>    public void reset() {
>      this.field = null;
> -    this.textLength = 0;
> +    this.bytesLength = 0;
>      this.term = null;
>    }
> @@ -100,8 +100,12 @@
>        return null;
>      if (term == null)
> -      term = new Term(field, new String(text, 0, textLength), false);
> +      try {
> +        term = new Term(field,
> +            new String(bytes, 0, bytesLength, "UTF-8"), false );
> +    } catch (java.io.UnsupportedEncodingException e) { }
> +
>      return term;
>    }
> @@ -111,8 +115,8 @@
>        clone = (TermBuffer)super.clone();
>      } catch (CloneNotSupportedException e) {}
> -    clone.text = new char[text.length];
> -    System.arraycopy(text, 0, clone.text, 0, textLength);
> +    clone.bytes = new byte[bytes.length];
> +    System.arraycopy(bytes, 0, clone.bytes, 0, bytesLength);
>      return clone;
>    }
> Index: src/java/org/apache/lucene/index/TermInfosWriter.java
> ===================================================================
> --- src/java/org/apache/lucene/index/TermInfosWriter.java
> (revision 393239)
> +++ src/java/org/apache/lucene/index/TermInfosWriter.java
> (working copy)
> @@ -33,6 +33,8 @@
>    private IndexOutput output;
>    private Term lastTerm = new Term("", "");
>    private TermInfo lastTi = new TermInfo();
> +  private static final byte[] NO_BYTES = new byte[0];
> +  private byte[] lastBytes = NO_BYTES;
>    private long size = 0;
>    // TODO: the default values for these two parameters should be
> settable from
> @@ -121,16 +123,21 @@
>    private final void writeTerm(Term term)
>         throws IOException {
> -    int start = StringHelper.stringDifference(lastTerm.text,
> term.text);
> -    int length = term.text.length() - start;
> +    byte[] bytes = term.text().getBytes("UTF-8");
> +    int totalLength = bytes.length;
> +    int start = StringHelper.bytesDifference(lastBytes, bytes);
> +    int diffLength = totalLength - start;
> +
>      output.writeVInt(start);                   // write shared
> prefix length
> -    output.writeVInt(length);                  // write delta length
> -    output.writeChars(term.text, start, length);  // write delta chars
> +    output.writeVInt(diffLength);                  // write delta
> length
> +    for (int i = start ; i < totalLength; i++) {
> +      output.writeByte(bytes[i]);              // write delta UTF-8
> bytes
> +    }
>      output.writeVInt(fieldInfos.fieldNumber(term.field)); // write
> field num
> -    lastTerm = term;
> +    lastBytes = bytes;
>    }
> Index: src/java/org/apache/lucene/store/IndexInput.java
> ===================================================================
> --- src/java/org/apache/lucene/store/IndexInput.java    (revision
> 393239)
> +++ src/java/org/apache/lucene/store/IndexInput.java    (working copy)
> @@ -17,13 +17,14 @@
>   */
> import java.io.IOException;
> +import org.apache.lucene.util.StringHelper;
> /** Abstract base class for input from a file in a {@link Directory}.  A
>   * random-access input stream.  Used for all Lucene index input
> operations.
>   * @see Directory
>   */
> public abstract class IndexInput implements Cloneable {
> -  private char[] chars;                           // used by
> readString()
> +  private byte[] bytes;                           // used by
> readString()
>    /** Reads and returns a single byte.
>     * @see IndexOutput#writeByte(byte)
> @@ -87,10 +88,10 @@
>     */
>    public String readString() throws IOException {
>      int length = readVInt();
> -    if (chars == null || length > chars.length)
> -      chars = new char[length];
> -    readChars(chars, 0, length);
> -    return new String(chars, 0, length);
> +    if (bytes == null || length > bytes.length)
> +      bytes = new byte[length];
> +    readBytes(bytes, 0, length);
> +    return new String(bytes, 0, length, "UTF-8");
>    }
>    /** Reads UTF-8 encoded characters into an array.
> @@ -104,15 +105,29 @@
>      final int end = start + length;
>      for (int i = start; i < end; i++) {
>        byte b = readByte();
> -      if ((b & 0x80) == 0)
> -       buffer[i] = (char)(b & 0x7F);
> -      else if ((b & 0xE0) != 0xE0) {
> -       buffer[i] = (char)(((b & 0x1F) << 6)
> -                | (readByte() & 0x3F));
> -      } else
> -       buffer[i] = (char)(((b & 0x0F) << 12)
> -               | ((readByte() & 0x3F) << 6)
> -               |  (readByte() & 0x3F));
> +      switch (StringHelper.TRAILING_BYTES_FOR_UTF8[b & 0xFF]) {
> +        case 0:
> +          buffer[i] = (char)(b & 0x7F);
> +          break;
> +        case 1:
> +          buffer[i] = (char)(((b & 0x1F) << 6)
> +            | (readByte() & 0x3F));
> +          break;
> +        case 2:
> +          buffer[i] = (char)(((b & 0x0F) << 12)
> +            | ((readByte() & 0x3F) << 6)
> +            |  (readByte() & 0x3F));
> +          break;
> +        case 3:
> +          int utf32 = (((b & 0x0F) << 18)
> +            | ((readByte() & 0x3F) << 12)
> +            | ((readByte() & 0x3F) << 6)
> +            |  (readByte() & 0x3F));
> +          buffer[i] = (char)((utf32 >> 10) + 0xD7C0);
> +          i++;
> +          buffer[i] = (char)((utf32 & 0x03FF) + 0xDC00);
> +          break;
> +      }
>      }
>    }
> @@ -148,7 +163,7 @@
>        clone = (IndexInput)super.clone();
>      } catch (CloneNotSupportedException e) {}
> -    clone.chars = null;
> +    clone.bytes = null;
>      return clone;
>    }
> Index: src/java/org/apache/lucene/store/IndexOutput.java
> ===================================================================
> --- src/java/org/apache/lucene/store/IndexOutput.java   (revision
> 393239)
> +++ src/java/org/apache/lucene/store/IndexOutput.java   (working copy)
> @@ -17,6 +17,7 @@
>   */
> import java.io.IOException;
> +import org.apache.lucene.util.StringHelper;
> /** Abstract base class for output to a file in a Directory.  A
> random-access
>   * output stream.  Used for all Lucene index output operations.
> @@ -85,7 +86,7 @@
>     * @see IndexInput#readString()
>     */
>    public void writeString(String s) throws IOException {
> -    int length = s.length();
> +    int length = StringHelper.countUTF8Bytes(s);
>      writeVInt(length);
>      writeChars(s, 0, length);
>    }
> @@ -101,15 +102,37 @@
>      final int end = start + length;
>      for (int i = start; i < end; i++) {
>        final int code = (int)s.charAt(i);
> -      if (code >= 0x01 && code <= 0x7F)
> -       writeByte((byte)code);
> -      else if (((code >= 0x80) && (code <= 0x7FF)) || code == 0) {
> -       writeByte((byte)(0xC0 | (code >> 6)));
> -       writeByte((byte)(0x80 | (code & 0x3F)));
> +      if (code < 0x80)
> +        writeByte((byte)code);
> +      else if (code < 0x800) {
> +        writeByte((byte)(0xC0 | (code >> 6)));
> +        writeByte((byte)(0x80 | (code & 0x3F)));
> +      } else if (code < 0xD800 || code > 0xDFFF) {
> +        writeByte((byte)(0xE0 | (code >> 12)));
> +        writeByte((byte)(0x80 | ((code >> 6) & 0x3F)));
> +        writeByte((byte)(0x80 | (code & 0x3F)));
>        } else {
> -       writeByte((byte)(0xE0 | (code >>> 12)));
> -       writeByte((byte)(0x80 | ((code >> 6) & 0x3F)));
> -       writeByte((byte)(0x80 | (code & 0x3F)));
> +        // surrogate pair
> +        int utf32;
> +        // confirm valid high surrogate
> +        if (code < 0xDC00 && (i < end-1)) {
> +          utf32 = ((int)s.charAt(i+1));
> +          // confirm valid low surrogate and write pair
> +          if (utf32 >= 0xDC00 && utf32 <= 0xDFFF) {
> +            utf32 = ((code - 0xD7C0) << 10) + (utf32 & 0x3FF);
> +            i++;
> +            writeByte((byte)(0xF0 | (utf32 >> 18)));
> +            writeByte((byte)(0x80 | ((utf32 >> 12) & 0x3F)));
> +            writeByte((byte)(0x80 | ((utf32 >> 6) & 0x3F)));
> +            writeByte((byte)(0x80 | (utf32 & 0x3F)));
> +            continue;
> +          }
> +        }
> +        // replace unpaired surrogate or out-of-order low surrogate
> +        // with substitution character
> +        writeByte((byte)0xEF);
> +        writeByte((byte)0xBF);
> +        writeByte((byte)0xBD);
>        }
>      }
>    }
> Index: src/java/org/apache/lucene/util/StringHelper.java
> ===================================================================
> --- src/java/org/apache/lucene/util/StringHelper.java   (revision
> 393239)
> +++ src/java/org/apache/lucene/util/StringHelper.java   (working copy)
> @@ -16,6 +16,8 @@
>   * limitations under the License.
>   */
> +import java.nio.ByteBuffer;
> +import java.nio.CharBuffer;
> /**
>   * Methods for manipulating strings.
> @@ -25,6 +27,64 @@
> public abstract class StringHelper {
>    /**
> +   * Compares two byte[] arrays, element by element, and returns the
> +   * number of elements common to both arrays.
> +   *
> +   * @param bytes1 The first byte[] to compare
> +   * @param bytes2 The second byte[] to compare
> +   * @return The number of common elements.
> +   */
> +  public static final int bytesDifference(byte[] bytes1, byte[]
> bytes2) {
> +    int len1 = bytes1.length;
> +    int len2 = bytes2.length;
> +    int len = len1 < len2 ? len1 : len2;
> +    for (int i = 0; i < len; i++) {
> +      if (bytes1[i] != bytes2[i]) {
> +        return i;
> +      }
> +    }
> +    return len;
> +  }
> +
> +  public static final byte[] TRAILING_BYTES_FOR_UTF8 = {
> +    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
> +    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
> +    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
> +    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
> +    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
> +    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
> +    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
> +    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
> +
> +    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
> +    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
> +    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
> +    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
> +    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
> +    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
> +    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
> +    3,3,3,3,3,3,3,3
> +  };
> +
> +  /**
> +   * Count the number of bytes which would be occupied by this string
> +   * were it to be converted to UTF8.
> +   *
> +   * @param s The string to operate against
> +   * @return The number of UTF8 bytes
> +   */
> +  public static final int countUTF8Bytes(String s) {
> +    int byteCount = s.length();    // start with 1 byte per char
> +    int max = byteCount - 1;
> +    for (int i = 0; i < max; i++) {
> +      // add the number of trailing bytes for each char
> +      byteCount += TRAILING_BYTES_FOR_UTF8[ (int)s.charAt(i) ];
> +    }
> +    return byteCount;
> +  }
> +
> +
> +  /**
>     * Compares two strings, character by character, and returns the
>     * first position where the two strings differ from one another.
>     *
>
>
>
>
> ---------------------------------------------------------------------
> To unsubscribe, e-mail: java-dev-unsubscribe@lucene.apache.org
> For additional commands, e-mail: java-dev-help@lucene.apache.org
>
>

---------------------------------------------------------------------
To unsubscribe, e-mail: java-dev-unsubscribe@lucene.apache.org
For additional commands, e-mail: java-dev-help@lucene.apache.org


Mime
View raw message