kudu-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From a...@apache.org
Subject [kudu] 02/02: KUDU-1938 Make UTF-8 truncation faster pt 2
Date Wed, 04 Dec 2019 00:22:11 GMT
This is an automated email from the ASF dual-hosted git repository.

adar pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/kudu.git

commit e45f893981a36d721752f1e0c743cdf18a6f5428
Author: Attila Bukor <abukor@apache.org>
AuthorDate: Tue Nov 12 15:24:16 2019 +0100

    KUDU-1938 Make UTF-8 truncation faster pt 2
    
    Adds Intel intrinsics (up to SSE4.2) to speed up UTF-8 character
    counting for ASCII-only chunks (the fast path) by doubling the chunk
    size processed in a single pass from 64 to 128 bits.
    
    Before:
    
    [ RUN      ] CharUtilTest.StressTestUtf8
    [       OK ] CharUtilTest.StressTestUtf8 (7746 ms)
    [ RUN      ] CharUtilTest.StressTestAscii
    [       OK ] CharUtilTest.StressTestAscii (1028 ms)
    
    After:
    
    [ RUN      ] CharUtilTest.StressTestUtf8
    [       OK ] CharUtilTest.StressTestUtf8 (9285 ms)
    [ RUN      ] CharUtilTest.StressTestAscii
    [       OK ] CharUtilTest.StressTestAscii (708 ms)
    
    Change-Id: I9a491157dd5c8b4815030bbda921a0afc0bafd28
    Reviewed-on: http://gerrit.cloudera.org:8080/14354
    Reviewed-by: Adar Dembo <adar@cloudera.com>
    Tested-by: Kudu Jenkins
---
 src/kudu/util/char_util-test.cc | 18 ++++++++++++++++++
 src/kudu/util/char_util.cc      | 22 ++++++++++++++++++----
 2 files changed, 36 insertions(+), 4 deletions(-)

diff --git a/src/kudu/util/char_util-test.cc b/src/kudu/util/char_util-test.cc
index 88188bd..eb41355 100644
--- a/src/kudu/util/char_util-test.cc
+++ b/src/kudu/util/char_util-test.cc
@@ -20,12 +20,15 @@
 #include <cstdint>
 #include <memory>
 
+#include <glog/logging.h>
 #include <gtest/gtest.h>
 
 #include "kudu/util/env.h"
 #include "kudu/util/faststring.h"
+#include "kudu/util/init.h"
 #include "kudu/util/path_util.h"
 #include "kudu/util/slice.h"
+#include "kudu/util/status.h"
 #include "kudu/util/test_util.h"
 
 using std::unique_ptr;
@@ -38,6 +41,9 @@ class CharUtilTest : public KuduTest {
   Slice data_ascii_;
 
   void SetUp() override {
+    // UTF8Truncate uses SSE4.1 instructions so we need to make sure the CPU
+    // running the test has these opcodes.
+    CHECK_OK(CheckCPUFlags());
     ReadFileToString(env_, JoinPathSegments(GetTestExecutableDirectory(),
                                            "testdata/char_truncate_utf8.txt"),
                      &string_utf8_);
@@ -92,6 +98,18 @@ TEST_F(CharUtilTest, CorrectnessTestIncompleteUtf8) {
   ASSERT_EQ(test_data, result);
 }
 
+TEST_F(CharUtilTest, CorrectnessTestUtf8AndAscii) {
+  Slice result;
+  Slice data = "ááááááááááááááááááááááááááááááááaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa";
+
+  auto ptr = Truncate(data, 64, &result);
+  ASSERT_EQ(data, result);
+
+  data = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaáááááááááááááááááááááááááááááááá";
+  ptr = Truncate(data, 64, &result);
+  ASSERT_EQ(data, result);
+}
+
 TEST_F(CharUtilTest, StressTestUtf8) {
   StressTest(data_utf8_, 9000);
 }
diff --git a/src/kudu/util/char_util.cc b/src/kudu/util/char_util.cc
index e232325..4d798e1 100644
--- a/src/kudu/util/char_util.cc
+++ b/src/kudu/util/char_util.cc
@@ -19,6 +19,8 @@
 
 #include <algorithm>
 #include <cstring>
+#include <emmintrin.h>
+#include <smmintrin.h>
 
 namespace kudu {
 
@@ -29,13 +31,26 @@ Slice UTF8Truncate(Slice val, size_t max_utf8_length) {
   str = start = val.data();
   size_t num_bytes = 0;
   size_t size = val.size();
+
+  // Mask used to determine whether there are any non-ASCII characters in a
+  // 128-bit chunk
+  const __m128i mask = _mm_set1_epi32(0x80808080);
+
   while (num_bytes < size) {
     // If the next chunk of bytes are all ASCII we can fast path them.
-    if (size - num_bytes >= 8 &&
-        max_utf8_length - num_utf8_chars >= 8 &&
-        (*(reinterpret_cast<const int64_t*>(str)) & 0x8080808080808080) == 0) {
+    if (size - num_bytes >= 16 &&
+        max_utf8_length - num_utf8_chars >= 16 &&
+        _mm_test_all_zeros(_mm_loadu_si128(reinterpret_cast<const __m128i*>(str)),
+                           mask) == 1) {
+      num_utf8_chars += 16;
+      num_bytes += 16;
+      str += 16;
+    } else if (size - num_bytes >= 8 &&
+               max_utf8_length - num_utf8_chars >= 8 &&
+               (*(reinterpret_cast<const int64_t*>(str)) & 0x8080808080808080) == 0) {
       num_utf8_chars += 8;
       num_bytes += 8;
+      str += 8;
     } else {
       num_utf8_chars += (*str++ & 0xc0) != 0x80;
       num_bytes++;
@@ -45,7 +60,6 @@ Slice UTF8Truncate(Slice val, size_t max_utf8_length) {
         break;
       }
     }
-    str = start + num_bytes;
   }
   num_bytes = std::min<size_t>(size, num_bytes);
   auto relocated = new uint8_t[num_bytes];


Mime
View raw message