lucene-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From mikemcc...@apache.org
Subject svn commit: r1304842 - in /lucene/dev/branches/branch_3x: ./ lucene/ lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java
Date Sat, 24 Mar 2012 16:18:30 GMT
Author: mikemccand
Date: Sat Mar 24 16:18:30 2012
New Revision: 1304842

URL: http://svn.apache.org/viewvc?rev=1304842&view=rev
Log:
LUCENE-3905: if real doc's text is too big, take a random slice (not just the prefix string)

Modified:
    lucene/dev/branches/branch_3x/   (props changed)
    lucene/dev/branches/branch_3x/lucene/   (props changed)
    lucene/dev/branches/branch_3x/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java

Modified: lucene/dev/branches/branch_3x/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java?rev=1304842&r1=1304841&r2=1304842&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java	(original)
+++ lucene/dev/branches/branch_3x/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java	Sat Mar 24 16:18:30 2012
@@ -370,12 +370,19 @@ public abstract class BaseTokenStreamTes
         // real data from linedocs
         text = docs.nextDoc().get("body");
         if (text.length() > maxWordLength) {
-          // Take care not to split up a surrogate pair:
-          if (Character.isHighSurrogate(text.charAt(maxWordLength-1))) {
-            text = text.substring(0, maxWordLength-1);
-          } else {
-            text = text.substring(0, maxWordLength);
+
+          // Take a random slice from the text...:
+          int startPos = random.nextInt(text.length() - maxWordLength);
+          if (startPos > 0 && Character.isLowSurrogate(text.charAt(startPos))) {
+            // Take care not to split up a surrogate pair:
+            startPos--;
+          }
+          int endPos = startPos + maxWordLength - 1;
+          if (Character.isHighSurrogate(text.charAt(endPos))) {
+            // Take care not to split up a surrogate pair:
+            endPos--;
           }
+          text = text.substring(startPos, 1+endPos);
         }
       } else {
         // synthetic



Mime
View raw message