lucene-java-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From rm...@apache.org
Subject svn commit: r887706 - in /lucene/java/branches/flex_1458/src/java/org/apache/lucene: index/SegmentReader.java util/UnicodeUtil.java
Date Sun, 06 Dec 2009 16:20:14 GMT
Author: rmuir
Date: Sun Dec  6 16:20:13 2009
New Revision: 887706

URL: http://svn.apache.org/viewvc?rev=887706&view=rev
Log:
LUCENE-2121: add UnicodeUtil.nextValidUTF16String

Modified:
    lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/SegmentReader.java
    lucene/java/branches/flex_1458/src/java/org/apache/lucene/util/UnicodeUtil.java

Modified: lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/SegmentReader.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/SegmentReader.java?rev=887706&r1=887705&r2=887706&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/SegmentReader.java (original)
+++ lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/SegmentReader.java Sun Dec  6 16:20:13 2009
@@ -38,6 +38,7 @@
 import org.apache.lucene.util.BitVector;
 import org.apache.lucene.util.Bits;
 import org.apache.lucene.util.CloseableThreadLocal;
+import org.apache.lucene.util.UnicodeUtil;
 import org.apache.lucene.index.codecs.Codecs;
 import org.apache.lucene.index.codecs.Codec;
 import org.apache.lucene.index.codecs.preflex.PreFlexFields;
@@ -1356,19 +1357,12 @@
             // We found exactly the requested field; now
             // seek the term text:
             String text = t.text();
-            TermRef tr;
-
-            // this is a hack only for backwards compatibility.
-            // previously you could supply a term ending with a lead surrogate,
+            // this is only for backwards compatibility.
+            // previously you could supply a term with unpaired surrogates,
             // and it would return the next Term.
             // if someone does this, tack on the lowest possible trail surrogate.
             // this emulates the old behavior, and forms "valid UTF-8" unicode.
-            if (text.length() > 0 
-                && Character.isHighSurrogate(text.charAt(text.length() - 1))) {
-              tr = new TermRef(t.text() + "\uDC00");
-            } else {
-              tr = new TermRef(t.text());
-            }
+            TermRef tr = new TermRef(UnicodeUtil.nextValidUTF16String(text));
             TermsEnum.SeekStatus status = terms.seek(tr);
             if (status == TermsEnum.SeekStatus.END) {
               // Rollover to the next field

Modified: lucene/java/branches/flex_1458/src/java/org/apache/lucene/util/UnicodeUtil.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/src/java/org/apache/lucene/util/UnicodeUtil.java?rev=887706&r1=887705&r2=887706&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/src/java/org/apache/lucene/util/UnicodeUtil.java (original)
+++ lucene/java/branches/flex_1458/src/java/org/apache/lucene/util/UnicodeUtil.java Sun Dec  6 16:20:13 2009
@@ -364,6 +364,48 @@
     result.length = outUpto;
   }
 
+  /**
+   * Get the next valid UTF-16 String in UTF-16 order.
+   * <p>
+   * If the input String is already valid, it is returned.
+   * Otherwise the next String in code unit order is returned.
+   * </p>
+   * @param s input String (possibly with unpaired surrogates)
+   * @return next valid UTF-16 String in UTF-16 order
+   */
+  public static String nextValidUTF16String(String s) {
+    final int size = s.length();
+    for (int i = 0; i < size; i++) {
+      char ch = s.charAt(i);
+      if (ch >= UnicodeUtil.UNI_SUR_HIGH_START
+          && ch <= UnicodeUtil.UNI_SUR_HIGH_END) {
+        if (i < size - 1) {
+          i++;
+          char nextCH = s.charAt(i);
+          if (nextCH >= UnicodeUtil.UNI_SUR_LOW_START
+              && nextCH <= UnicodeUtil.UNI_SUR_LOW_END) {
+            // Valid surrogate pair
+          } else
+          // Unmatched high surrogate
+            if (nextCH < UnicodeUtil.UNI_SUR_LOW_START) // SMP not enumerated 
+              return s.substring(0, i) + 
+                (char) UnicodeUtil.UNI_SUR_LOW_START;
+            else // SMP already enumerated
+              return s.substring(0, i - 1) + 
+                (char) (UnicodeUtil.UNI_SUR_LOW_END + 1);
+        } else
+        // Unmatched high surrogate in final position, SMP not yet enumerated
+        return s + (char) UnicodeUtil.UNI_SUR_LOW_START;
+      } else if (ch >= UnicodeUtil.UNI_SUR_LOW_START
+          && ch <= UnicodeUtil.UNI_SUR_LOW_END)
+      // Unmatched low surrogate, SMP already enumerated
+      return s.substring(0, i) + 
+        (char) (UnicodeUtil.UNI_SUR_LOW_END + 1);
+    }
+    
+    return s;
+  }
+  
   // Only called from assert
   /*
   private static boolean matches(char[] source, int offset, int length, byte[] result, int upto) {



Mime
View raw message