lucene-java-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From mikemcc...@apache.org
Subject svn commit: r641303 [2/2] - in /lucene/java/trunk: ./ docs/ src/java/org/apache/lucene/document/ src/java/org/apache/lucene/index/ src/java/org/apache/lucene/store/ src/java/org/apache/lucene/util/ src/site/src/documentation/content/xdocs/ src/test/org...
Date Wed, 26 Mar 2008 13:39:29 GMT
Modified: lucene/java/trunk/src/java/org/apache/lucene/index/TermVectorsReader.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/index/TermVectorsReader.java?rev=641303&r1=641302&r2=641303&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/index/TermVectorsReader.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/index/TermVectorsReader.java Wed Mar 26 06:39:25 2008
@@ -32,8 +32,16 @@
   // NOTE: if you make a new format, it must be larger than
   // the current format
   static final int FORMAT_VERSION = 2;
+
+  // Changes to speed up bulk merging of term vectors:
   static final int FORMAT_VERSION2 = 3;
 
+  // Changed strings to UTF8 with length-in-bytes not length-in-chars
+  static final int FORMAT_UTF8_LENGTH_IN_BYTES = 4;
+
+  // NOTE: always change this if you switch to a new format!
+  static final int FORMAT_CURRENT = FORMAT_UTF8_LENGTH_IN_BYTES;
+
   //The size in bytes that the FORMAT_VERSION will take up at the beginning of each file 
   static final int FORMAT_SIZE = 4;
 
@@ -134,7 +142,7 @@
   }
 
   boolean canReadRawDocs() {
-    return format >= FORMAT_VERSION2;
+    return format >= FORMAT_UTF8_LENGTH_IN_BYTES;
   }
 
   /** Retrieve the length (in bytes) of the tvd and tvf
@@ -190,9 +198,9 @@
   private int checkValidFormat(IndexInput in) throws CorruptIndexException, IOException
   {
     int format = in.readInt();
-    if (format > FORMAT_VERSION2) {
+    if (format > FORMAT_CURRENT) {
       throw new CorruptIndexException("Incompatible format version: " + format + " expected " 
-                                      + FORMAT_VERSION2 + " or less");
+                                      + FORMAT_CURRENT + " or less");
     }
     return format;
   }
@@ -434,24 +442,45 @@
     int start = 0;
     int deltaLength = 0;
     int totalLength = 0;
-    char [] buffer = new char[10];    // init the buffer with a length of 10 character
-    char[] previousBuffer = {};
-    
+    byte[] byteBuffer;
+    char[] charBuffer;
+    final boolean preUTF8 = format < FORMAT_UTF8_LENGTH_IN_BYTES;
+
+    // init the buffers
+    if (preUTF8) {
+      charBuffer = new char[10];
+      byteBuffer = null;
+    } else {
+      charBuffer = null;
+      byteBuffer = new byte[20];
+    }
+
     for (int i = 0; i < numTerms; i++) {
       start = tvf.readVInt();
       deltaLength = tvf.readVInt();
       totalLength = start + deltaLength;
-      if (buffer.length < totalLength) {  // increase buffer
-        buffer = null;    // give a hint to garbage collector
-        buffer = new char[totalLength];
-        
-        if (start > 0)  // just copy if necessary
-          System.arraycopy(previousBuffer, 0, buffer, 0, start);
-      }
+
+      final String term;
       
-      tvf.readChars(buffer, start, deltaLength);
-      String term = new String(buffer, 0, totalLength);
-      previousBuffer = buffer;
+      if (preUTF8) {
+        // Term stored as java chars
+        if (charBuffer.length < totalLength) {
+          char[] newCharBuffer = new char[(int) (1.5*totalLength)];
+          System.arraycopy(charBuffer, 0, newCharBuffer, 0, start);
+          charBuffer = newCharBuffer;
+        }
+        tvf.readChars(charBuffer, start, deltaLength);
+        term = new String(charBuffer, 0, totalLength);
+      } else {
+        // Term stored as utf8 bytes
+        if (byteBuffer.length < totalLength) {
+          byte[] newByteBuffer = new byte[(int) (1.5*totalLength)];
+          System.arraycopy(byteBuffer, 0, newByteBuffer, 0, start);
+          byteBuffer = newByteBuffer;
+        }
+        tvf.readBytes(byteBuffer, start, deltaLength);
+        term = new String(byteBuffer, 0, totalLength, "UTF-8");
+      }
       int freq = tvf.readVInt();
       int [] positions = null;
       if (storePositions) { //read in the positions

Modified: lucene/java/trunk/src/java/org/apache/lucene/index/TermVectorsWriter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/index/TermVectorsWriter.java?rev=641303&r1=641302&r2=641303&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/index/TermVectorsWriter.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/index/TermVectorsWriter.java Wed Mar 26 06:39:25 2008
@@ -20,6 +20,7 @@
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.store.IndexOutput;
 import org.apache.lucene.util.StringHelper;
+import org.apache.lucene.util.UnicodeUtil;
 
 import java.io.IOException;
 
@@ -27,17 +28,19 @@
   
   private IndexOutput tvx = null, tvd = null, tvf = null;
   private FieldInfos fieldInfos;
+  final UnicodeUtil.UTF8Result[] utf8Results = new UnicodeUtil.UTF8Result[] {new UnicodeUtil.UTF8Result(),
+                                                                             new UnicodeUtil.UTF8Result()};
 
   public TermVectorsWriter(Directory directory, String segment,
                            FieldInfos fieldInfos)
     throws IOException {
     // Open files for TermVector storage
     tvx = directory.createOutput(segment + "." + IndexFileNames.VECTORS_INDEX_EXTENSION);
-    tvx.writeInt(TermVectorsReader.FORMAT_VERSION2);
+    tvx.writeInt(TermVectorsReader.FORMAT_CURRENT);
     tvd = directory.createOutput(segment + "." + IndexFileNames.VECTORS_DOCUMENTS_EXTENSION);
-    tvd.writeInt(TermVectorsReader.FORMAT_VERSION2);
+    tvd.writeInt(TermVectorsReader.FORMAT_CURRENT);
     tvf = directory.createOutput(segment + "." + IndexFileNames.VECTORS_FIELDS_EXTENSION);
-    tvf.writeInt(TermVectorsReader.FORMAT_VERSION2);
+    tvf.writeInt(TermVectorsReader.FORMAT_CURRENT);
 
     this.fieldInfos = fieldInfos;
   }
@@ -97,15 +100,22 @@
         final String[] terms = vectors[i].getTerms();
         final int[] freqs = vectors[i].getTermFrequencies();
 
-        String lastTermText = "";
+        int utf8Upto = 0;
+        utf8Results[1].length = 0;
+
         for (int j=0; j<numTerms; j++) {
-          final String termText = terms[j];
-          int start = StringHelper.stringDifference(lastTermText, termText);
-          int length = termText.length() - start;
+
+          UnicodeUtil.UTF16toUTF8(terms[j], 0, terms[j].length(), utf8Results[utf8Upto]);
+          
+          int start = StringHelper.bytesDifference(utf8Results[1-utf8Upto].result,
+                                                   utf8Results[1-utf8Upto].length,
+                                                   utf8Results[utf8Upto].result,
+                                                   utf8Results[utf8Upto].length);
+          int length = utf8Results[utf8Upto].length - start;
           tvf.writeVInt(start);       // write shared prefix length
           tvf.writeVInt(length);        // write delta length
-          tvf.writeChars(termText, start, length);  // write delta chars
-          lastTermText = termText;
+          tvf.writeBytes(utf8Results[utf8Upto].result, start, length);  // write delta bytes
+          utf8Upto = 1-utf8Upto;
 
           final int termFreq = freqs[j];
 

Modified: lucene/java/trunk/src/java/org/apache/lucene/store/IndexInput.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/store/IndexInput.java?rev=641303&r1=641302&r2=641303&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/store/IndexInput.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/store/IndexInput.java Wed Mar 26 06:39:25 2008
@@ -24,7 +24,9 @@
  * @see Directory
  */
 public abstract class IndexInput implements Cloneable {
-  private char[] chars;                           // used by readString()
+  private byte[] bytes;                           // used by readString()
+  private char[] chars;                           // used by readModifiedUTF8String()
+  private boolean preUTF8Strings;                 // true if we are reading old (modified UTF8) string format
 
   /** Reads and returns a single byte.
    * @see IndexOutput#writeByte(byte)
@@ -102,10 +104,28 @@
     return i;
   }
 
+  /** Call this if readString should read characters stored
+   *  in the old modified UTF8 format (length in java chars
+   *  and java's modified UTF8 encoding).  This is used for
+   *  indices written pre-2.4.  See LUCENE-510 for details. */
+  public void setModifiedUTF8StringsMode() {
+    preUTF8Strings = true;
+  }
+
   /** Reads a string.
    * @see IndexOutput#writeString(String)
    */
   public String readString() throws IOException {
+    if (preUTF8Strings)
+      return readModifiedUTF8String();
+    int length = readVInt();
+    if (bytes == null || length > bytes.length)
+      bytes = new byte[(int) (length*1.25)];
+    readBytes(bytes, 0, length);
+    return new String(bytes, 0, length, "UTF-8");
+  }
+
+  private String readModifiedUTF8String() throws IOException {
     int length = readVInt();
     if (chars == null || length > chars.length)
       chars = new char[length];
@@ -113,11 +133,15 @@
     return new String(chars, 0, length);
   }
 
-  /** Reads UTF-8 encoded characters into an array.
+  /** Reads Lucene's old "modified UTF-8" encoded
+   *  characters into an array.
    * @param buffer the array to read characters into
    * @param start the offset in the array to start storing characters
    * @param length the number of characters to read
    * @see IndexOutput#writeChars(String,int,int)
+   * @deprecated -- please use readString or readBytes
+   *                instead, and construct the string
+   *                from those utf8 bytes
    */
   public void readChars(char[] buffer, int start, int length)
        throws IOException {
@@ -144,6 +168,8 @@
    * and it does not have to do any of the bitwise operations, since we don't actually care what is in the byte except to determine
    * how many more bytes to read
    * @param length The number of chars to read
+   * @deprecated this method operates on old "modified utf8" encoded
+   *             strings
    */
   public void skipChars(int length) throws IOException{
     for (int i = 0; i < length; i++) {
@@ -194,6 +220,7 @@
       clone = (IndexInput)super.clone();
     } catch (CloneNotSupportedException e) {}
 
+    clone.bytes = null;
     clone.chars = null;
 
     return clone;

Modified: lucene/java/trunk/src/java/org/apache/lucene/store/IndexOutput.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/store/IndexOutput.java?rev=641303&r1=641302&r2=641303&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/store/IndexOutput.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/store/IndexOutput.java Wed Mar 26 06:39:25 2008
@@ -18,6 +18,7 @@
  */
 
 import java.io.IOException;
+import org.apache.lucene.util.UnicodeUtil;
 
 /** Abstract base class for output to a file in a Directory.  A random-access
  * output stream.  Used for all Lucene index output operations.
@@ -26,6 +27,8 @@
  */
 public abstract class IndexOutput {
 
+  private UnicodeUtil.UTF8Result utf8Result = new UnicodeUtil.UTF8Result();
+
   /** Writes a single byte.
    * @see IndexInput#readByte()
    */
@@ -96,16 +99,18 @@
    * @see IndexInput#readString()
    */
   public void writeString(String s) throws IOException {
-    int length = s.length();
-    writeVInt(length);
-    writeChars(s, 0, length);
+    UnicodeUtil.UTF16toUTF8(s, 0, s.length(), utf8Result);
+    writeVInt(utf8Result.length);
+    writeBytes(utf8Result.result, 0, utf8Result.length);
   }
 
-  /** Writes a sequence of UTF-8 encoded characters from a string.
+  /** Writes a sub sequence of characters from s as the old
+   *  format (modified UTF-8 encoded bytes).
    * @param s the source of the characters
    * @param start the first character in the sequence
    * @param length the number of characters in the sequence
-   * @see IndexInput#readChars(char[],int,int)
+   * @deprecated -- please pre-convert to utf8 bytes
+   * instead or use {@link #writeString}
    */
   public void writeChars(String s, int start, int length)
        throws IOException {
@@ -125,11 +130,12 @@
     }
   }
 
-  /** Writes a sequence of UTF-8 encoded characters from a char[].
+  /** Writes a sub sequence of characters from char[] as
+   *  the old format (modified UTF-8 encoded bytes).
    * @param s the source of the characters
    * @param start the first character in the sequence
    * @param length the number of characters in the sequence
-   * @see IndexInput#readChars(char[],int,int)
+   * @deprecated -- please pre-convert to utf8 bytes instead or use {@link #writeString}
    */
   public void writeChars(char[] s, int start, int length)
     throws IOException {

Modified: lucene/java/trunk/src/java/org/apache/lucene/util/StringHelper.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/util/StringHelper.java?rev=641303&r1=641302&r2=641303&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/util/StringHelper.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/util/StringHelper.java Wed Mar 26 06:39:25 2008
@@ -26,6 +26,22 @@
 public abstract class StringHelper {
 
   /**
+   * Compares two byte[] arrays, element by element, and returns the
+   * number of leading elements common to both arrays (the shared prefix length).
+   *
+   * @param bytes1 The first byte[] to compare
+   * @param bytes2 The second byte[] to compare
+   * @return The number of common elements.
+   */
+  public static final int bytesDifference(byte[] bytes1, int len1, byte[] bytes2, int len2) {
+    int len = len1 < len2 ? len1 : len2;
+    for (int i = 0; i < len; i++)
+      if (bytes1[i] != bytes2[i])
+        return i;
+    return len;
+  }
+
+  /**
    * Compares two strings, character by character, and returns the
    * first position where the two strings differ from one another.
    *
@@ -44,7 +60,6 @@
     }
     return len;
   }
-
 
   private StringHelper() {
   }

Added: lucene/java/trunk/src/java/org/apache/lucene/util/UnicodeUtil.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/util/UnicodeUtil.java?rev=641303&view=auto
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/util/UnicodeUtil.java (added)
+++ lucene/java/trunk/src/java/org/apache/lucene/util/UnicodeUtil.java Wed Mar 26 06:39:25 2008
@@ -0,0 +1,447 @@
+package org.apache.lucene.util;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+/*
+ * Some of this code came from the excellent Unicode
+ * conversion examples from:
+ *
+ *   http://www.unicode.org/Public/PROGRAMS/CVTUTF
+ *
+ * Full Copyright for that code follows:
+*/
+
+/*
+ * Copyright 2001-2004 Unicode, Inc.
+ * 
+ * Disclaimer
+ * 
+ * This source code is provided as is by Unicode, Inc. No claims are
+ * made as to fitness for any particular purpose. No warranties of any
+ * kind are expressed or implied. The recipient agrees to determine
+ * applicability of information provided. If this file has been
+ * purchased on magnetic or optical media from Unicode, Inc., the
+ * sole remedy for any claim will be exchange of defective media
+ * within 90 days of receipt.
+ * 
+ * Limitations on Rights to Redistribute This Code
+ * 
+ * Unicode, Inc. hereby grants the right to freely use the information
+ * supplied in this file in the creation of products supporting the
+ * Unicode Standard, and to make copies of this file in any form
+ * for internal or external distribution as long as this notice
+ * remains attached.
+ */
+
+/**
+ * Class to encode java's UTF16 char[] into UTF8 byte[]
+ * without always allocating a new byte[] as
+ * String.getBytes("UTF-8") does.
+ *
+ * <p><b>WARNING</b>: This API is new and experimental and
+ * may suddenly change. </p>
+ */
+
+final public class UnicodeUtil {
+
+  public static final int UNI_SUR_HIGH_START = 0xD800;
+  public static final int UNI_SUR_HIGH_END = 0xDBFF;
+  public static final int UNI_SUR_LOW_START = 0xDC00;
+  public static final int UNI_SUR_LOW_END = 0xDFFF;
+  public static final int UNI_REPLACEMENT_CHAR = 0xFFFD;
+
+  private static final long UNI_MAX_BMP = 0x0000FFFF;
+
+  private static final int HALF_BASE = 0x0010000;
+  private static final long HALF_SHIFT = 10;
+  private static final long HALF_MASK = 0x3FFL;
+
+  public static final class UTF8Result {
+    public byte[] result = new byte[10];
+    public int length;
+
+    public void setLength(int newLength) {
+      if (result.length < newLength) {
+        byte[] newArray = new byte[(int) (1.5*newLength)];
+        System.arraycopy(result, 0, newArray, 0, length);
+        result = newArray;
+      }
+      length = newLength;
+    }
+  }
+
+  public static final class UTF16Result {
+    public char[] result = new char[10];
+    public int[] offsets = new int[10];
+    public int length;
+
+    public void setLength(int newLength) {
+      if (result.length < newLength) {
+        char[] newArray = new char[(int) (1.5*newLength)];
+        System.arraycopy(result, 0, newArray, 0, length);
+        result = newArray;
+      }
+      length = newLength;
+    }
+
+    public void copyText(UTF16Result other) {
+      setLength(other.length);
+      System.arraycopy(other.result, 0, result, 0, length);
+    }
+  }
+
+  /** Encode characters from a char[] source, starting at
+   *  offset and stopping when the character 0xffff is seen.
+   *  The bytes go to result.result, the count to result.length. */
+  public static void UTF16toUTF8(final char[] source, final int offset, UTF8Result result) {
+
+    int upto = 0;
+    int i = offset;
+    byte[] out = result.result;
+
+    while(true) {
+      
+      final int code = (int) source[i++];
+
+      if (upto+4 > out.length) {
+        byte[] newOut = new byte[2*out.length];
+        assert newOut.length >= upto+4;
+        System.arraycopy(out, 0, newOut, 0, upto);
+        result.result = out = newOut;
+      }
+      if (code < 0x80)
+        out[upto++] = (byte) code;
+      else if (code < 0x800) {
+        out[upto++] = (byte) (0xC0 | (code >> 6));
+        out[upto++] = (byte)(0x80 | (code & 0x3F));
+      } else if (code < 0xD800 || code > 0xDFFF) {
+        if (code == 0xffff)
+          // END
+          break;
+        out[upto++] = (byte)(0xE0 | (code >> 12));
+        out[upto++] = (byte)(0x80 | ((code >> 6) & 0x3F));
+        out[upto++] = (byte)(0x80 | (code & 0x3F));
+      } else {
+        // surrogate pair
+        // confirm valid high surrogate
+        if (code < 0xDC00 && source[i] != 0xffff) {
+          int utf32 = (int) source[i];
+          // confirm valid low surrogate and write pair
+          if (utf32 >= 0xDC00 && utf32 <= 0xDFFF) { 
+            utf32 = ((code - 0xD7C0) << 10) + (utf32 & 0x3FF);
+            i++;
+            out[upto++] = (byte)(0xF0 | (utf32 >> 18));
+            out[upto++] = (byte)(0x80 | ((utf32 >> 12) & 0x3F));
+            out[upto++] = (byte)(0x80 | ((utf32 >> 6) & 0x3F));
+            out[upto++] = (byte)(0x80 | (utf32 & 0x3F));
+            continue;
+          }
+        }
+        // replace unpaired surrogate or out-of-order low surrogate
+        // with substitution character
+        out[upto++] = (byte) 0xEF;
+        out[upto++] = (byte) 0xBF;
+        out[upto++] = (byte) 0xBD;
+      }
+    }
+    //assert matches(source, offset, i-offset-1, out, upto);
+    result.length = upto;
+  }
+
+  /** Encode characters from a char[] source, starting at
+   *  offset for length chars.  The bytes are stored in
+   *  result.result and their count in result.length. */
+  public static void UTF16toUTF8(final char[] source, final int offset, final int length, UTF8Result result) {
+
+    int upto = 0;
+    int i = offset;
+    final int end = offset + length;
+    byte[] out = result.result;
+
+    while(i < end) {
+      
+      final int code = (int) source[i++];
+
+      if (upto+4 > out.length) {
+        byte[] newOut = new byte[2*out.length];
+        assert newOut.length >= upto+4;
+        System.arraycopy(out, 0, newOut, 0, upto);
+        result.result = out = newOut;
+      }
+      if (code < 0x80)
+        out[upto++] = (byte) code;
+      else if (code < 0x800) {
+        out[upto++] = (byte) (0xC0 | (code >> 6));
+        out[upto++] = (byte)(0x80 | (code & 0x3F));
+      } else if (code < 0xD800 || code > 0xDFFF) {
+        out[upto++] = (byte)(0xE0 | (code >> 12));
+        out[upto++] = (byte)(0x80 | ((code >> 6) & 0x3F));
+        out[upto++] = (byte)(0x80 | (code & 0x3F));
+      } else {
+        // surrogate pair
+        // confirm valid high surrogate
+        if (code < 0xDC00 && i < end && source[i] != 0xffff) {
+          int utf32 = (int) source[i];
+          // confirm valid low surrogate and write pair
+          if (utf32 >= 0xDC00 && utf32 <= 0xDFFF) { 
+            utf32 = ((code - 0xD7C0) << 10) + (utf32 & 0x3FF);
+            i++;
+            out[upto++] = (byte)(0xF0 | (utf32 >> 18));
+            out[upto++] = (byte)(0x80 | ((utf32 >> 12) & 0x3F));
+            out[upto++] = (byte)(0x80 | ((utf32 >> 6) & 0x3F));
+            out[upto++] = (byte)(0x80 | (utf32 & 0x3F));
+            continue;
+          }
+        }
+        // replace unpaired surrogate or out-of-order low surrogate
+        // with substitution character
+        out[upto++] = (byte) 0xEF;
+        out[upto++] = (byte) 0xBF;
+        out[upto++] = (byte) 0xBD;
+      }
+    }
+    //assert matches(source, offset, length, out, upto);
+    result.length = upto;
+  }
+
+  /** Encode characters from this String, starting at offset
+   *  for length characters.  The bytes are stored in
+   *  result.result and their count in result.length. */
+  public static void UTF16toUTF8(final String s, final int offset, final int length, UTF8Result result) {
+    final int end = offset + length;
+
+    byte[] out = result.result;
+
+    int upto = 0;
+    for(int i=offset;i<end;i++) {
+      final int code = (int) s.charAt(i);
+
+      if (upto+4 > out.length) {
+        byte[] newOut = new byte[2*out.length];
+        assert newOut.length >= upto+4;
+        System.arraycopy(out, 0, newOut, 0, upto);
+        result.result = out = newOut;
+      }
+      if (code < 0x80)
+        out[upto++] = (byte) code;
+      else if (code < 0x800) {
+        out[upto++] = (byte) (0xC0 | (code >> 6));
+        out[upto++] = (byte)(0x80 | (code & 0x3F));
+      } else if (code < 0xD800 || code > 0xDFFF) {
+        out[upto++] = (byte)(0xE0 | (code >> 12));
+        out[upto++] = (byte)(0x80 | ((code >> 6) & 0x3F));
+        out[upto++] = (byte)(0x80 | (code & 0x3F));
+      } else {
+        // surrogate pair
+        // confirm valid high surrogate
+        if (code < 0xDC00 && (i < end-1)) {
+          int utf32 = (int) s.charAt(i+1);
+          // confirm valid low surrogate and write pair
+          if (utf32 >= 0xDC00 && utf32 <= 0xDFFF) { 
+            utf32 = ((code - 0xD7C0) << 10) + (utf32 & 0x3FF);
+            i++;
+            out[upto++] = (byte)(0xF0 | (utf32 >> 18));
+            out[upto++] = (byte)(0x80 | ((utf32 >> 12) & 0x3F));
+            out[upto++] = (byte)(0x80 | ((utf32 >> 6) & 0x3F));
+            out[upto++] = (byte)(0x80 | (utf32 & 0x3F));
+            continue;
+          }
+        }
+        // replace unpaired surrogate or out-of-order low surrogate
+        // with substitution character
+        out[upto++] = (byte) 0xEF;
+        out[upto++] = (byte) 0xBF;
+        out[upto++] = (byte) 0xBD;
+      }
+    }
+    //assert matches(s, offset, length, out, upto);
+    result.length = upto;
+  }
+
+  /** Convert UTF8 bytes into UTF16 characters.  If offset
+   *  is non-zero, conversion starts at that starting point
+   *  in utf8, re-using the results from the previous call
+   *  up until offset. */
+  public static void UTF8toUTF16(final byte[] utf8, final int offset, final int length, final UTF16Result result) {
+
+    final int end = offset + length;
+    char[] out = result.result;
+    if (result.offsets.length <= end) {
+      int[] newOffsets = new int[2*end];
+      System.arraycopy(result.offsets, 0, newOffsets, 0, result.offsets.length);
+      result.offsets  = newOffsets;
+    }
+    final int[] offsets = result.offsets;
+
+    // If incremental decoding fell in the middle of a
+    // single unicode character, rollback to its start:
+    int upto = offset;
+    while(offsets[upto] == -1)
+      upto--;
+
+    int outUpto = offsets[upto];
+
+    // Pre-allocate for worst case 1-for-1
+    if (outUpto+length >= out.length) {
+      char[] newOut = new char[2*(outUpto+length)];
+      System.arraycopy(out, 0, newOut, 0, outUpto);
+      result.result = out = newOut;
+    }
+
+    while (upto < end) {
+
+      final int b = utf8[upto]&0xff;
+      final int ch;
+
+      offsets[upto++] = outUpto;
+
+      if (b < 0xc0) {
+        assert b < 0x80;
+        ch = b;
+      } else if (b < 0xe0) {
+        ch = ((b&0x1f)<<6) + (utf8[upto]&0x3f);
+        offsets[upto++] = -1;
+      } else if (b < 0xf0) {
+        ch = ((b&0xf)<<12) + ((utf8[upto]&0x3f)<<6) + (utf8[upto+1]&0x3f);
+        offsets[upto++] = -1;
+        offsets[upto++] = -1;
+      } else {
+        assert b < 0xf8;
+        ch = ((b&0x7)<<18) + ((utf8[upto]&0x3f)<<12) + ((utf8[upto+1]&0x3f)<<6) + (utf8[upto+2]&0x3f);
+        offsets[upto++] = -1;
+        offsets[upto++] = -1;
+        offsets[upto++] = -1;
+      }
+
+      if (ch <= UNI_MAX_BMP) {
+        // target is a character <= 0xFFFF
+        out[outUpto++] = (char) ch;
+      } else {
+        // target is a character in range 0x10000 - 0x10FFFF
+        final int chHalf = ch - HALF_BASE;
+        out[outUpto++] = (char) ((chHalf >> HALF_SHIFT) + UNI_SUR_HIGH_START);
+        out[outUpto++] = (char) ((chHalf & HALF_MASK) + UNI_SUR_LOW_START);
+      }
+    }
+
+    offsets[upto] = outUpto;
+    result.length = outUpto;
+  }
+
+  // Only called from assert
+  /*
+  private static boolean matches(char[] source, int offset, int length, byte[] result, int upto) {
+    try {
+      String s1 = new String(source, offset, length);
+      String s2 = new String(result, 0, upto, "UTF-8");
+      if (!s1.equals(s2)) {
+        //System.out.println("DIFF: s1 len=" + s1.length());
+        //for(int i=0;i<s1.length();i++)
+        //  System.out.println("    " + i + ": " + (int) s1.charAt(i));
+        //System.out.println("s2 len=" + s2.length());
+        //for(int i=0;i<s2.length();i++)
+        //  System.out.println("    " + i + ": " + (int) s2.charAt(i));
+
+        // If the input string was invalid, then the
+        // difference is OK
+        if (!validUTF16String(s1))
+          return true;
+
+        return false;
+      }
+      return s1.equals(s2);
+    } catch (UnsupportedEncodingException uee) {
+      return false;
+    }
+  }
+
+  // Only called from assert
+  private static boolean matches(String source, int offset, int length, byte[] result, int upto) {
+    try {
+      String s1 = source.substring(offset, offset+length);
+      String s2 = new String(result, 0, upto, "UTF-8");
+      if (!s1.equals(s2)) {
+        // Allow a difference if s1 is not valid UTF-16
+
+        //System.out.println("DIFF: s1 len=" + s1.length());
+        //for(int i=0;i<s1.length();i++)
+        //  System.out.println("    " + i + ": " + (int) s1.charAt(i));
+        //System.out.println("  s2 len=" + s2.length());
+        //for(int i=0;i<s2.length();i++)
+        //  System.out.println("    " + i + ": " + (int) s2.charAt(i));
+
+        // If the input string was invalid, then the
+        // difference is OK
+        if (!validUTF16String(s1))
+          return true;
+
+        return false;
+      }
+      return s1.equals(s2);
+    } catch (UnsupportedEncodingException uee) {
+      return false;
+    }
+  }
+
+  public static final boolean validUTF16String(String s) {
+    final int size = s.length();
+    for(int i=0;i<size;i++) {
+      char ch = s.charAt(i);
+      if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
+        if (i < size-1) {
+          i++;
+          char nextCH = s.charAt(i);
+          if (nextCH >= UNI_SUR_LOW_START && nextCH <= UNI_SUR_LOW_END) {
+            // Valid surrogate pair
+          } else
+            // Unmatched high surrogate
+            return false;
+        } else
+          // Unmatched high surrogate
+          return false;
+      } else if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END)
+        // Unmatched low surrogate
+        return false;
+    }
+
+    return true;
+  }
+
+  public static final boolean validUTF16String(char[] s, int size) {
+    for(int i=0;i<size;i++) {
+      char ch = s[i];
+      if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
+        if (i < size-1) {
+          i++;
+          char nextCH = s[i];
+          if (nextCH >= UNI_SUR_LOW_START && nextCH <= UNI_SUR_LOW_END) {
+            // Valid surrogate pair
+          } else
+            return false;
+        } else
+          return false;
+      } else if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END)
+        // Unmatched low surrogate
+        return false;
+    }
+
+    return true;
+  }
+  */
+}

Propchange: lucene/java/trunk/src/java/org/apache/lucene/util/UnicodeUtil.java
------------------------------------------------------------------------------
    svn:eol-style = native

Modified: lucene/java/trunk/src/site/src/documentation/content/xdocs/fileformats.xml
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/site/src/documentation/content/xdocs/fileformats.xml?rev=641303&r1=641302&r2=641303&view=diff
==============================================================================
--- lucene/java/trunk/src/site/src/documentation/content/xdocs/fileformats.xml (original)
+++ lucene/java/trunk/src/site/src/documentation/content/xdocs/fileformats.xml Wed Mar 26 06:39:25 2008
@@ -736,10 +736,7 @@
 
                 <p>
                     Lucene writes unicode
-                    character sequences using Java's
-                    <a href="http://en.wikipedia.org/wiki/UTF-8#Modified_UTF-8">"modified
-                        UTF-8 encoding"</a>
-                    .
+                    character sequences as UTF-8 encoded bytes.
                 </p>
 
 
@@ -748,8 +745,9 @@
             <section id="String"><title>String</title>
 
                 <p>
-                    Lucene writes strings as a VInt representing the length, followed by
-                    the character data.
+		    Lucene writes strings as UTF-8 encoded bytes.
+                    First the length, in bytes, is written as a VInt,
+                    followed by the bytes.
                 </p>
 
                 <p>
@@ -1233,10 +1231,12 @@
                             <br/>
                             --&gt; VInt
                         </p>
-                        <p>This
-                            file is sorted by Term. Terms are ordered first lexicographically
-                            by the term's field name, and within that lexicographically by the
-                            term's text.
+                        <p>
+			    This file is sorted by Term. Terms are
+                            ordered first lexicographically (by UTF-16
+                            code unit) by the term's field name,
+                            and within that lexicographically (by
+                            UTF-16 code unit) by the term's text.
                         </p>
                         <p>TIVersion names the version of the format
                             of this file and is -2 in Lucene 1.4.

Modified: lucene/java/trunk/src/test/org/apache/lucene/index/TestBackwardsCompatibility.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/test/org/apache/lucene/index/TestBackwardsCompatibility.java?rev=641303&r1=641302&r2=641303&view=diff
==============================================================================
--- lucene/java/trunk/src/test/org/apache/lucene/index/TestBackwardsCompatibility.java (original)
+++ lucene/java/trunk/src/test/org/apache/lucene/index/TestBackwardsCompatibility.java Wed Mar 26 06:39:25 2008
@@ -20,6 +20,7 @@
 import org.apache.lucene.util.LuceneTestCase;
 
 import java.util.Arrays;
+import java.util.List;
 import java.util.Enumeration;
 import java.util.zip.ZipFile;
 import java.util.zip.ZipEntry;
@@ -39,6 +40,7 @@
 import org.apache.lucene.store.FSDirectory;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
+import org.apache.lucene.util._TestUtil;
 
 /*
   Verify we can read the pre-2.1 file format, do searches
@@ -131,7 +133,7 @@
     for(int i=0;i<oldNames.length;i++) {
       String dirName = "src/test/org/apache/lucene/index/index." + oldNames[i];
       unzip(dirName, oldNames[i]);
-      searchIndex(oldNames[i]);
+      searchIndex(oldNames[i], oldNames[i]);
       rmDir(oldNames[i]);
     }
   }
@@ -171,7 +173,7 @@
     }
   }
 
-  public void searchIndex(String dirName) throws IOException {
+  public void searchIndex(String dirName, String oldName) throws IOException {
     //QueryParser parser = new QueryParser("contents", new WhitespaceAnalyzer());
     //Query query = parser.parse("handle:1");
 
@@ -179,6 +181,29 @@
 
     Directory dir = FSDirectory.getDirectory(dirName);
     IndexSearcher searcher = new IndexSearcher(dir);
+    IndexReader reader = searcher.getIndexReader();
+
+    _TestUtil.checkIndex(dir);
+
+    for(int i=0;i<35;i++) {
+      if (!reader.isDeleted(i)) {
+        Document d = reader.document(i);
+        List fields = d.getFields();
+        if (oldName.startsWith("23.")) {
+          assertEquals(3, fields.size());
+          Field f = (Field) d.getField("id");
+          assertEquals(""+i, f.stringValue());
+
+          f = (Field) d.getField("utf8");
+          assertEquals("Lu\uD834\uDD1Ece\uD834\uDD60ne \u0000 \u2620 ab\ud917\udc17cd", f.stringValue());
+        
+          f = (Field) d.getField("content2");
+          assertEquals("here is more content with aaa aaa aaa", f.stringValue());
+        }        
+      } else
+        // Only ID 7 is deleted
+        assertEquals(7, i);
+    }
     
     Hits hits = searcher.search(new TermQuery(new Term("content", "aaa")));
 
@@ -189,6 +214,15 @@
 
     testHits(hits, 34, searcher.getIndexReader());
 
+    if (oldName.startsWith("23.")) {
+      hits = searcher.search(new TermQuery(new Term("utf8", "\u0000")));
+      assertEquals(34, hits.length());
+      hits = searcher.search(new TermQuery(new Term("utf8", "Lu\uD834\uDD1Ece\uD834\uDD60ne")));
+      assertEquals(34, hits.length());
+      hits = searcher.search(new TermQuery(new Term("utf8", "ab\ud917\udc17cd")));
+      assertEquals(34, hits.length());
+    }
+
     searcher.close();
     dir.close();
   }
@@ -421,6 +455,7 @@
     Document doc = new Document();
     doc.add(new Field("content", "aaa", Field.Store.NO, Field.Index.TOKENIZED));
     doc.add(new Field("id", Integer.toString(id), Field.Store.YES, Field.Index.UN_TOKENIZED));
+    doc.add(new Field("utf8", "Lu\uD834\uDD1Ece\uD834\uDD60ne \u0000 \u2620 ab\ud917\udc17cd", Field.Store.YES, Field.Index.TOKENIZED, Field.TermVector.WITH_POSITIONS_OFFSETS));
     doc.add(new Field("content2", "here is more content with aaa aaa aaa", Field.Store.YES, Field.Index.TOKENIZED, Field.TermVector.WITH_POSITIONS_OFFSETS));
     writer.addDocument(doc);
   }

Modified: lucene/java/trunk/src/test/org/apache/lucene/index/TestIndexInput.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/test/org/apache/lucene/index/TestIndexInput.java?rev=641303&r1=641302&r2=641303&view=diff
==============================================================================
--- lucene/java/trunk/src/test/org/apache/lucene/index/TestIndexInput.java (original)
+++ lucene/java/trunk/src/test/org/apache/lucene/index/TestIndexInput.java Wed Mar 26 06:39:25 2008
@@ -24,16 +24,70 @@
 
 public class TestIndexInput extends LuceneTestCase {
   public void testRead() throws IOException {
-    IndexInput is = new MockIndexInput(new byte[]{(byte) 0x80, 0x01,
-            (byte) 0xFF, 0x7F,
-            (byte) 0x80, (byte) 0x80, 0x01,
-            (byte) 0x81, (byte) 0x80, 0x01,
-            0x06, 'L', 'u', 'c', 'e', 'n', 'e'});
-    assertEquals(128, is.readVInt());
-    assertEquals(16383, is.readVInt());
-    assertEquals(16384, is.readVInt());
-    assertEquals(16385, is.readVInt());
-    assertEquals("Lucene", is.readString());
+    IndexInput is = new MockIndexInput(new byte[] { 
+      (byte) 0x80, 0x01,
+      (byte) 0xFF, 0x7F,
+      (byte) 0x80, (byte) 0x80, 0x01,
+      (byte) 0x81, (byte) 0x80, 0x01,
+      0x06, 'L', 'u', 'c', 'e', 'n', 'e',
+
+      // 2-byte UTF-8 (U+00BF "INVERTED QUESTION MARK") 
+      0x02, (byte) 0xC2, (byte) 0xBF,
+      0x0A, 'L', 'u', (byte) 0xC2, (byte) 0xBF, 
+            'c', 'e', (byte) 0xC2, (byte) 0xBF, 
+            'n', 'e',
+
+      // 3-byte UTF-8 (U+2620 "SKULL AND CROSSBONES") 
+      0x03, (byte) 0xE2, (byte) 0x98, (byte) 0xA0,
+      0x0C, 'L', 'u', (byte) 0xE2, (byte) 0x98, (byte) 0xA0,
+            'c', 'e', (byte) 0xE2, (byte) 0x98, (byte) 0xA0,
+            'n', 'e',
+
+      // surrogate pairs
+      // (U+1D11E "MUSICAL SYMBOL G CLEF")
+      // (U+1D160 "MUSICAL SYMBOL EIGHTH NOTE")
+      0x04, (byte) 0xF0, (byte) 0x9D, (byte) 0x84, (byte) 0x9E,
+      0x08, (byte) 0xF0, (byte) 0x9D, (byte) 0x84, (byte) 0x9E, 
+            (byte) 0xF0, (byte) 0x9D, (byte) 0x85, (byte) 0xA0, 
+      0x0E, 'L', 'u',
+            (byte) 0xF0, (byte) 0x9D, (byte) 0x84, (byte) 0x9E,
+            'c', 'e', 
+            (byte) 0xF0, (byte) 0x9D, (byte) 0x85, (byte) 0xA0, 
+            'n', 'e',  
+
+      // null bytes
+      0x01, 0x00,
+      0x08, 'L', 'u', 0x00, 'c', 'e', 0x00, 'n', 'e',
+      
+      // Modified UTF-8 null bytes
+      0x02, (byte) 0xC0, (byte) 0x80,
+      0x0A, 'L', 'u', (byte) 0xC0, (byte) 0x80, 
+            'c', 'e', (byte) 0xC0, (byte) 0x80, 
+            'n', 'e',
+
+    });
+        
+    assertEquals(128,is.readVInt());
+    assertEquals(16383,is.readVInt());
+    assertEquals(16384,is.readVInt());
+    assertEquals(16385,is.readVInt());
+    assertEquals("Lucene",is.readString());
+
+    assertEquals("\u00BF",is.readString());
+    assertEquals("Lu\u00BFce\u00BFne",is.readString());
+
+    assertEquals("\u2620",is.readString());
+    assertEquals("Lu\u2620ce\u2620ne",is.readString());
+
+    assertEquals("\uD834\uDD1E",is.readString());
+    assertEquals("\uD834\uDD1E\uD834\uDD60",is.readString());
+    assertEquals("Lu\uD834\uDD1Ece\uD834\uDD60ne",is.readString());
+    
+    assertEquals("\u0000",is.readString());
+    assertEquals("Lu\u0000ce\u0000ne",is.readString());
+
+    assertEquals("\u0000",is.readString());
+    assertEquals("Lu\u0000ce\u0000ne",is.readString());
   }
 
   /**

Modified: lucene/java/trunk/src/test/org/apache/lucene/index/TestIndexWriter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/test/org/apache/lucene/index/TestIndexWriter.java?rev=641303&r1=641302&r2=641303&view=diff
==============================================================================
--- lucene/java/trunk/src/test/org/apache/lucene/index/TestIndexWriter.java (original)
+++ lucene/java/trunk/src/test/org/apache/lucene/index/TestIndexWriter.java Wed Mar 26 06:39:25 2008
@@ -25,6 +25,7 @@
 import java.util.Random;
 
 import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.util.UnicodeUtil;
 
 import org.apache.lucene.analysis.WhitespaceAnalyzer;
 import org.apache.lucene.analysis.WhitespaceTokenizer;
@@ -3328,5 +3329,224 @@
     assertTrue(failure.fail1 && failure.fail2);
     w.abort();
     dir.close();
+  }
+  
+  final String[] utf8Data = new String[] {
+    // unpaired low surrogate
+    "ab\udc17cd", "ab\ufffdcd",
+    "\udc17abcd", "\ufffdabcd",
+    "\udc17", "\ufffd",
+    "ab\udc17\udc17cd", "ab\ufffd\ufffdcd",
+    "\udc17\udc17abcd", "\ufffd\ufffdabcd",
+    "\udc17\udc17", "\ufffd\ufffd",
+
+    // unpaired high surrogate
+    "ab\ud917cd", "ab\ufffdcd",
+    "\ud917abcd", "\ufffdabcd",
+    "\ud917", "\ufffd",
+    "ab\ud917\ud917cd", "ab\ufffd\ufffdcd",
+    "\ud917\ud917abcd", "\ufffd\ufffdabcd",
+    "\ud917\ud917", "\ufffd\ufffd",
+
+    // backwards surrogates
+    "ab\udc17\ud917cd", "ab\ufffd\ufffdcd",
+    "\udc17\ud917abcd", "\ufffd\ufffdabcd",
+    "\udc17\ud917", "\ufffd\ufffd",
+    "ab\udc17\ud917\udc17\ud917cd", "ab\ufffd\ud917\udc17\ufffdcd",
+    "\udc17\ud917\udc17\ud917abcd", "\ufffd\ud917\udc17\ufffdabcd",
+    "\udc17\ud917\udc17\ud917", "\ufffd\ud917\udc17\ufffd"
+  };
+
+  // LUCENE-510
+  public void testInvalidUTF16() throws Throwable {
+    MockRAMDirectory dir = new MockRAMDirectory();
+    IndexWriter w = new IndexWriter(dir, false, new WhitespaceAnalyzer(), true, IndexWriter.MaxFieldLength.UNLIMITED);
+    Document doc = new Document();
+
+    final int count = utf8Data.length/2;
+    for(int i=0;i<count;i++)
+      doc.add(new Field("f" + i, utf8Data[2*i], Field.Store.YES, Field.Index.TOKENIZED));
+    w.addDocument(doc);
+    w.close();
+
+    IndexReader ir = IndexReader.open(dir);
+    Document doc2 = ir.document(0);
+    for(int i=0;i<count;i++) {
+      assertEquals("field " + i + " was not indexed correctly", 1, ir.docFreq(new Term("f"+i, utf8Data[2*i+1])));
+      assertEquals("field " + i + " is incorrect", utf8Data[2*i+1], doc2.getField("f"+i).stringValue());
+    }
+    ir.close();
+    dir.close();
+  }
+
+  // LUCENE-510: round-trips every code point outside the surrogate range through UnicodeUtil and cross-checks against the JDK's UTF-8 codec
+  public void testAllUnicodeChars() throws Throwable {
+
+    UnicodeUtil.UTF8Result utf8 = new UnicodeUtil.UTF8Result();
+    UnicodeUtil.UTF16Result utf16 = new UnicodeUtil.UTF16Result();
+    char[] chars = new char[2];
+    for(int ch=0;ch<=0x0010FFFF;ch++) {
+      // Upper bound is inclusive: U+10FFFF is the last valid code point
+      if (ch == 0xd800)
+        // Skip the surrogate range U+D800..U+DFFF (not valid code points on their own)
+        ch = 0xe000;
+
+      int len = 0;
+      if (ch <= 0xffff) {
+        chars[len++] = (char) ch;
+      } else {
+        chars[len++] = (char) (((ch-0x0010000) >> 10) + UnicodeUtil.UNI_SUR_HIGH_START);
+        chars[len++] = (char) (((ch-0x0010000) & 0x3FF) + UnicodeUtil.UNI_SUR_LOW_START);
+      }
+
+      UnicodeUtil.UTF16toUTF8(chars, 0, len, utf8);
+      // The JDK must decode our UTF-8 bytes back to the same string
+      String s1 = new String(chars, 0, len);
+      String s2 = new String(utf8.result, 0, utf8.length, "UTF-8");
+      assertEquals("codepoint " + ch, s1, s2);
+
+      UnicodeUtil.UTF8toUTF16(utf8.result, 0, utf8.length, utf16);
+      assertEquals("codepoint " + ch, s1, new String(utf16.result, 0, utf16.length));
+      // The JDK's encoding is the reference (expected); UnicodeUtil's output is the actual value
+      byte[] b = s1.getBytes("UTF-8");
+      assertEquals(b.length, utf8.length);
+      for(int j=0;j<utf8.length;j++)
+        assertEquals(b[j], utf8.result[j]);
+    }
+  }
+
+  Random r = new Random();
+
+  private int nextInt(int lim) {
+    return r.nextInt(lim);
+  }
+
+  private int nextInt(int start, int end) {
+    return start + nextInt(end-start);
+  }
+  // Randomly fills buffer[offset..offset+count) and mirrors it into expected, except that a deliberately unpaired surrogate is recorded as U+FFFD in expected; returns true if one was written
+  private boolean fillUnicode(char[] buffer, char[] expected, int offset, int count) {
+    final int len = offset + count; // exclusive end index, fixed before offset may be moved back below
+    boolean hasIllegal = false;
+
+    if (offset > 0 && buffer[offset] >= 0xdc00 && buffer[offset] < 0xe000)
+      // Don't start in the middle of a valid surrogate pair: also rewrite the char before the low surrogate
+      offset--;
+
+    for(int i=offset;i<len;i++) {
+      int t = nextInt(6);
+      if (0 == t && i < len-1) {
+        // Make a surrogate pair (needs two slots, hence the i < len-1 guard)
+        // High surrogate; both arrays are indexed with the old i before i++ takes effect
+        expected[i] = buffer[i++] = (char) nextInt(0xd800, 0xdc00);
+        // Low surrogate
+        expected[i] = buffer[i] = (char) nextInt(0xdc00, 0xe000);
+      } else if (t <= 1)
+        expected[i] = buffer[i] = (char) nextInt(0x80);
+      else if (2 == t)
+        expected[i] = buffer[i] = (char) nextInt(0x80, 0x800);
+      else if (3 == t)
+        expected[i] = buffer[i] = (char) nextInt(0x800, 0xd800);
+      else if (4 == t)
+        expected[i] = buffer[i] = (char) nextInt(0xe000, 0xffff);
+      else if (5 == t && i < len-1) {
+        // Illegal unpaired surrogate, written only when nextInt(10) == 7; always followed by a non-surrogate char
+        if (nextInt(10) == 7) {
+          if (r.nextBoolean())
+            buffer[i] = (char) nextInt(0xd800, 0xdc00);
+          else
+            buffer[i] = (char) nextInt(0xdc00, 0xe000);
+          expected[i++] = 0xfffd;
+          expected[i] = buffer[i] = (char) nextInt(0x800, 0xd800);
+          hasIllegal = true;
+        } else 
+          expected[i] = buffer[i] = (char) nextInt(0x800, 0xd800);
+      } else {
+        expected[i] = buffer[i] = ' ';
+      }
+    }
+
+    return hasIllegal;
+  }
+
+  // LUCENE-510: round-trips 100000 random 20-char UTF-16 buffers (some with unpaired surrogates) through UnicodeUtil
+  public void testRandomUnicodeStrings() throws Throwable {
+
+    char[] buffer = new char[20];
+    char[] expected = new char[20];
+
+    UnicodeUtil.UTF8Result utf8 = new UnicodeUtil.UTF8Result();
+    UnicodeUtil.UTF16Result utf16 = new UnicodeUtil.UTF16Result();
+
+    for(int iter=0;iter<100000;iter++) {
+      boolean hasIllegal = fillUnicode(buffer, expected, 0, 20);
+
+      UnicodeUtil.UTF16toUTF8(buffer, 0, 20, utf8);
+      if (!hasIllegal) {
+        byte[] b = new String(buffer, 0, 20).getBytes("UTF-8");
+        assertEquals(b.length, utf8.length);
+        for(int i=0;i<b.length;i++)
+          assertEquals(b[i], utf8.result[i]);
+      }
+      // Decoding must give back expected, where fillUnicode stored U+FFFD for each unpaired surrogate
+      UnicodeUtil.UTF8toUTF16(utf8.result, 0, utf8.length, utf16);
+      assertEquals(20, utf16.length);
+      for(int i=0;i<20;i++)
+        assertEquals(expected[i], utf16.result[i]);
+    }
+  }
+
+  // LUCENE-510
+  public void testIncrementalUnicodeStrings() throws Throwable {
+    char[] buffer = new char[20];
+    char[] expected = new char[20];
+
+    UnicodeUtil.UTF8Result utf8 = new UnicodeUtil.UTF8Result();
+    UnicodeUtil.UTF16Result utf16 = new UnicodeUtil.UTF16Result();
+    UnicodeUtil.UTF16Result utf16a = new UnicodeUtil.UTF16Result();
+
+    boolean hasIllegal = false;
+    byte[] last = new byte[60];
+
+    for(int iter=0;iter<100000;iter++) {
+
+      final int prefix;
+
+      if (iter == 0 || hasIllegal)
+        prefix = 0;
+      else
+        prefix = nextInt(20);
+
+      hasIllegal = fillUnicode(buffer, expected, prefix, 20-prefix);
+
+      UnicodeUtil.UTF16toUTF8(buffer, 0, 20, utf8);
+      if (!hasIllegal) {
+        byte[] b = new String(buffer, 0, 20).getBytes("UTF-8");
+        assertEquals(b.length, utf8.length);
+        for(int i=0;i<b.length;i++)
+          assertEquals(b[i], utf8.result[i]);
+      }
+
+      int bytePrefix = 20;
+      if (iter == 0 || hasIllegal)
+        bytePrefix = 0;
+      else
+        for(int i=0;i<20;i++)
+          if (last[i] != utf8.result[i]) {
+            bytePrefix = i;
+            break;
+          }
+      System.arraycopy(utf8.result, 0, last, 0, utf8.length);
+
+      UnicodeUtil.UTF8toUTF16(utf8.result, bytePrefix, utf8.length-bytePrefix, utf16);
+      assertEquals(20, utf16.length);
+      for(int i=0;i<20;i++)
+        assertEquals(expected[i], utf16.result[i]);
+
+      UnicodeUtil.UTF8toUTF16(utf8.result, 0, utf8.length, utf16a);
+      assertEquals(20, utf16a.length);
+      for(int i=0;i<20;i++)
+        assertEquals(expected[i], utf16a.result[i]);
+    }
   }
 }

Modified: lucene/java/trunk/src/test/org/apache/lucene/index/TestStressIndexing2.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/test/org/apache/lucene/index/TestStressIndexing2.java?rev=641303&r1=641302&r2=641303&view=diff
==============================================================================
--- lucene/java/trunk/src/test/org/apache/lucene/index/TestStressIndexing2.java (original)
+++ lucene/java/trunk/src/test/org/apache/lucene/index/TestStressIndexing2.java Wed Mar 26 06:39:25 2008
@@ -415,8 +415,56 @@
       return r.nextInt(lim);
     }
 
+    // start is inclusive and end is exclusive
+    public int nextInt(int start, int end) {
+      return start + r.nextInt(end-start);
+    }
+
+    char[] buffer = new char[100];
+    // Writes a random run of 0..19 chars into buffer beginning at start, followed by one space; returns the index just past that space
+    private int addUTF8Token(int start) {
+      final int end = start + nextInt(20);
+      if (buffer.length < 1+end) {
+        char[] newBuffer = new char[(int) ((1+end)*1.25)];
+        System.arraycopy(buffer, 0, newBuffer, 0, buffer.length);
+        buffer = newBuffer;
+      }
+
+      for(int i=start;i<end;i++) {
+        int t = nextInt(6);
+        if (0 == t && i < end-1) {
+          // Make a surrogate pair (needs two slots, hence the i < end-1 guard)
+          // High surrogate
+          buffer[i++] = (char) nextInt(0xd800, 0xdc00);
+          // Low surrogate
+          buffer[i] = (char) nextInt(0xdc00, 0xe000);
+        } else if (t <= 1)
+          buffer[i] = (char) nextInt(0x80);
+        else if (2 == t)
+          buffer[i] = (char) nextInt(0x80, 0x800);
+        else if (3 == t)
+          buffer[i] = (char) nextInt(0x800, 0xd800);
+        else if (4 == t)
+          buffer[i] = (char) nextInt(0xe000, 0xffff);
+        else if (5 == t) {
+          // Lone surrogate, normally unpaired; NOTE(review): a high one may still pair with a low one written next
+          if (r.nextBoolean())
+            buffer[i] = (char) nextInt(0xd800, 0xdc00);
+          else
+            buffer[i] = (char) nextInt(0xdc00, 0xe000);
+        }
+      }
+      buffer[end] = ' ';
+      return 1+end;
+    }
+
     public String getString(int nTokens) {
       nTokens = nTokens!=0 ? nTokens : r.nextInt(4)+1;
+
+      // Half the time make a random UTF8 string
+      if (r.nextBoolean())
+        return getUTF8String(nTokens);
+
       // avoid StringBuffer because it adds extra synchronization.
       char[] arr = new char[nTokens*2];
       for (int i=0; i<nTokens; i++) {
@@ -424,6 +472,14 @@
         arr[i*2+1] = ' ';
       }
       return new String(arr);
+    }
+    
+    public String getUTF8String(int nTokens) {
+      int upto = 0;
+      Arrays.fill(buffer, (char) 0);
+      for(int i=0;i<nTokens;i++)
+        upto = addUTF8Token(upto);
+      return new String(buffer, 0, upto);
     }
 
     public String getIdString() {

Modified: lucene/java/trunk/src/test/org/apache/lucene/index/index.23.cfs.zip
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/test/org/apache/lucene/index/index.23.cfs.zip?rev=641303&r1=641302&r2=641303&view=diff
==============================================================================
Binary files - no diff available.

Modified: lucene/java/trunk/src/test/org/apache/lucene/index/index.23.nocfs.zip
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/test/org/apache/lucene/index/index.23.nocfs.zip?rev=641303&r1=641302&r2=641303&view=diff
==============================================================================
Binary files - no diff available.



Mime
View raw message