BUG: Lucene.Net.Core.Util.UnicodeUtil: the counter for the char array size was not accurate, causing
Lucene.Net.Misc.Util.Fst.TestFSTsMisc.TestRandomWords() to fail. Changed the initial array size
to count * 2 and removed the resize logic, since we trim off the excess at the end anyway. Also
added a threshold of 1024 - if the count is greater than this, we do a pre-loop to determine
the exact amount of memory to allocate.
Project: http://git-wip-us.apache.org/repos/asf/lucenenet/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucenenet/commit/631cfa7a
Tree: http://git-wip-us.apache.org/repos/asf/lucenenet/tree/631cfa7a
Diff: http://git-wip-us.apache.org/repos/asf/lucenenet/diff/631cfa7a
Branch: refs/heads/api-work
Commit: 631cfa7ad37187c918b068fef77797fffd0bfff8
Parents: f5d02d6
Author: Shad Storhaug <shad@shadstorhaug.com>
Authored: Fri Mar 24 06:24:41 2017 +0700
Committer: Shad Storhaug <shad@shadstorhaug.com>
Committed: Fri Mar 24 06:28:21 2017 +0700
----------------------------------------------------------------------
src/Lucene.Net.Core/Util/UnicodeUtil.cs | 35 +++++++++++++++++-----------
1 file changed, 22 insertions(+), 13 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/631cfa7a/src/Lucene.Net.Core/Util/UnicodeUtil.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Core/Util/UnicodeUtil.cs b/src/Lucene.Net.Core/Util/UnicodeUtil.cs
index c3bb61e..31a998c 100644
--- a/src/Lucene.Net.Core/Util/UnicodeUtil.cs
+++ b/src/Lucene.Net.Core/Util/UnicodeUtil.cs
@@ -715,15 +715,34 @@ namespace Lucene.Net.Util
/// <returns> a char array representing the code points between offset and
count </returns>
// LUCENENET NOTE: This code was originally in the NewString() method (above).
// It has been refactored from the original to remove the exception throw/catch and
- // instead proactively resizes the array instead of relying on excpetions.
+ // instead proactively resizes the array instead of relying on excpetions + copy
operations
public static char[] ToCharArray(int[] codePoints, int offset, int count)
{
if (count < 0)
{
throw new System.ArgumentException();
}
- // LUCENENET: as a first approximation, assume each codepoint is 1 character
- char[] chars = new char[count];
+ int countThreashhold = 1024; // If the number of chars exceeds this, we count
them instead of allocating count * 2
+ // LUCENENET: as a first approximation, assume each codepoint
+ // is 2 characters (since it cannot be longer than this)
+ int arrayLength = count * 2;
+ // LUCENENET: if we go over the threashhold, count the number of
+ // chars we will need so we can allocate the precise amount of memory
+ if (count > countThreashhold)
+ {
+ arrayLength = 0;
+ for (int r = offset, e = offset + count; r < e; ++r)
+ {
+ arrayLength += codePoints[r] < 0x010000 ? 1 : 2;
+ }
+ if (arrayLength < 1)
+ {
+ arrayLength = count * 2;
+ }
+ }
+ // Initialize our array to our exact or oversized length.
+ // It is now safe to assume we have enough space for all of the characters.
+ char[] chars = new char[arrayLength];
int w = 0;
for (int r = offset, e = offset + count; r < e; ++r)
{
@@ -739,18 +758,8 @@ namespace Lucene.Net.Util
else
{
chars[w++] = (char)(LEAD_SURROGATE_OFFSET_ + (cp >> LEAD_SURROGATE_SHIFT_));
- // LUCENENET: resize to the exact length: it's slightly faster to check
if the resize is needed
- if (w >= chars.Length)
- {
- Array.Resize(ref chars, chars.Length + (e - r) * 2 - 1);
- }
chars[w++] = (char)(TRAIL_SURROGATE_MIN_VALUE + (cp & TRAIL_SURROGATE_MASK_));
}
- // LUCENENET: resize to the exact length: it's slightly faster to check if
the resize is needed
- if (w != chars.Length)
- {
- Array.Resize(ref chars, w);
- }
}
var result = new char[w];
|