lucene-java-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From mikemcc...@apache.org
Subject svn commit: r831036 - in /lucene/java/branches/lucene_2_9: CHANGES.txt src/java/org/apache/lucene/index/TermsHashPerField.java src/test/org/apache/lucene/index/TestIndexWriter.java
Date Thu, 29 Oct 2009 17:15:13 GMT
Author: mikemccand
Date: Thu Oct 29 17:15:12 2009
New Revision: 831036

URL: http://svn.apache.org/viewvc?rev=831036&view=rev
Log:
LUCENE-2016 (on 2.9 branch): remap invalid U+FFFF char during indexing, to prevent silent
corruption

Modified:
    lucene/java/branches/lucene_2_9/CHANGES.txt
    lucene/java/branches/lucene_2_9/src/java/org/apache/lucene/index/TermsHashPerField.java
    lucene/java/branches/lucene_2_9/src/test/org/apache/lucene/index/TestIndexWriter.java

Modified: lucene/java/branches/lucene_2_9/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/java/branches/lucene_2_9/CHANGES.txt?rev=831036&r1=831035&r2=831036&view=diff
==============================================================================
--- lucene/java/branches/lucene_2_9/CHANGES.txt (original)
+++ lucene/java/branches/lucene_2_9/CHANGES.txt Thu Oct 29 17:15:12 2009
@@ -36,6 +36,10 @@
  * LUCENE-2004: Fix Constants.LUCENE_MAIN_VERSION to not be inlined
    by client code.  (Uwe Schindler)
 
+ * LUCENE-2016: Replace illegal U+FFFF character with the replacement
+   char (U+FFFD) during indexing, to prevent silent index corruption.
+   (Peter Keegan, Mike McCandless)
+
 API Changes
 
  * Un-deprecate search(Weight weight, Filter filter, int n) from

Modified: lucene/java/branches/lucene_2_9/src/java/org/apache/lucene/index/TermsHashPerField.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/lucene_2_9/src/java/org/apache/lucene/index/TermsHashPerField.java?rev=831036&r1=831035&r2=831036&view=diff
==============================================================================
--- lucene/java/branches/lucene_2_9/src/java/org/apache/lucene/index/TermsHashPerField.java (original)
+++ lucene/java/branches/lucene_2_9/src/java/org/apache/lucene/index/TermsHashPerField.java Thu Oct 29 17:15:12 2009
@@ -373,9 +373,11 @@
             ch = tokenText[downto] = UnicodeUtil.UNI_REPLACEMENT_CHAR;
           }            
         }
-      } else if (ch >= UnicodeUtil.UNI_SUR_HIGH_START && ch <= UnicodeUtil.UNI_SUR_HIGH_END)
-        // Unpaired
+      } else if (ch >= UnicodeUtil.UNI_SUR_HIGH_START && (ch <= UnicodeUtil.UNI_SUR_HIGH_END ||
+                                                          ch == 0xffff)) {
+        // Unpaired or 0xffff
         ch = tokenText[downto] = UnicodeUtil.UNI_REPLACEMENT_CHAR;
+      }
 
       code = (code*31) + ch;
     }

Modified: lucene/java/branches/lucene_2_9/src/test/org/apache/lucene/index/TestIndexWriter.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/lucene_2_9/src/test/org/apache/lucene/index/TestIndexWriter.java?rev=831036&r1=831035&r2=831036&view=diff
==============================================================================
--- lucene/java/branches/lucene_2_9/src/test/org/apache/lucene/index/TestIndexWriter.java (original)
+++ lucene/java/branches/lucene_2_9/src/test/org/apache/lucene/index/TestIndexWriter.java Thu Oct 29 17:15:12 2009
@@ -4655,4 +4655,20 @@
     w.close();
     d.close();
   }
+
+  public void testEmbeddedFFFF() throws Throwable {
+
+    Directory d = new MockRAMDirectory();
+    IndexWriter w = new IndexWriter(d, new WhitespaceAnalyzer(), IndexWriter.MaxFieldLength.UNLIMITED);
+    Document doc = new Document();
+    doc.add(new Field("field", "a a\uffffb", Field.Store.NO, Field.Index.ANALYZED));
+    w.addDocument(doc);
+    doc = new Document();
+    doc.add(new Field("field", "a", Field.Store.NO, Field.Index.ANALYZED));
+    w.addDocument(doc);
+    w.close();
+
+    _TestUtil.checkIndex(d);
+    d.close();
+  }
 }



Mime
View raw message