geode-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From jbarr...@apache.org
Subject [geode-native] branch develop updated: GEODE-6218: Improves UTF-8 hashing function performance. (#424)
Date Wed, 19 Dec 2018 21:05:18 GMT
This is an automated email from the ASF dual-hosted git repository.

jbarrett pushed a commit to branch develop
in repository https://gitbox.apache.org/repos/asf/geode-native.git


The following commit(s) were added to refs/heads/develop by this push:
     new f040049  GEODE-6218: Improves UTF-8 hashing function performance. (#424)
f040049 is described below

commit f040049fc4107cd25eeb8f663c53080ce914bfb1
Author: Jacob Barrett <jbarrett@pivotal.io>
AuthorDate: Wed Dec 19 13:05:14 2018 -0800

    GEODE-6218: Improves UTF-8 hashing function performance. (#424)
    
    * Update benchmark to measure different Unicode character widths.
---
 cppcache/benchmark/GeodeHashBM.cpp             | 51 ++++++++++++++++++++------
 cppcache/include/geode/internal/functional.hpp | 43 +++++++++++++++++++++-
 cppcache/src/util/functional.cpp               | 39 --------------------
 cppcache/test/util/functionalTests.cpp         |  8 ++++
 4 files changed, 89 insertions(+), 52 deletions(-)

diff --git a/cppcache/benchmark/GeodeHashBM.cpp b/cppcache/benchmark/GeodeHashBM.cpp
index 9bb5cb0..2aa8c9d 100644
--- a/cppcache/benchmark/GeodeHashBM.cpp
+++ b/cppcache/benchmark/GeodeHashBM.cpp
@@ -22,24 +22,51 @@
 #include "util/string.hpp"
 
 using apache::geode::client::to_utf16;
+using apache::geode::client::to_utf8;
 using apache::geode::client::internal::geode_hash;
 
-class GeodeHashBM : public benchmark::Fixture {};
+template <class ToString, class FromString>
+ToString convert(const FromString& from);
 
-BENCHMARK_DEFINE_F(GeodeHashBM, std_string)(benchmark::State& state) {
-  std::string x(state.range(0), 'x');
-  for (auto _ : state) {
-    int hashcode;
-    benchmark::DoNotOptimize(hashcode = geode_hash<std::string>{}(x));
-  }
+template <>
+std::string convert(const std::u32string& from) {
+  return to_utf8(from);
+}
+
+template <>
+std::u16string convert(const std::u32string& from) {
+  return to_utf16(from);
 }
-BENCHMARK_REGISTER_F(GeodeHashBM, std_string)->Range(8, 8 << 10);
 
-BENCHMARK_DEFINE_F(GeodeHashBM, std_u16string)(benchmark::State& state) {
-  std::u16string x(state.range(0), u'x');
+template <class String, char32_t UnicodeChar>
+void GeodeHashBM(benchmark::State& state) {
+  const std::u32string u32String(state.range(0), UnicodeChar);
+  const String string = convert<String>(u32String);
+
   for (auto _ : state) {
     int hashcode;
-    benchmark::DoNotOptimize(hashcode = geode_hash<std::u16string>{}(x));
+    benchmark::DoNotOptimize(hashcode = geode_hash<String>{}(string));
   }
 }
-BENCHMARK_REGISTER_F(GeodeHashBM, std_u16string)->Range(8, 8 << 10);
+
+constexpr char32_t LATIN_CAPITAL_LETTER_C = U'\U00000043';
+constexpr char32_t INVERTED_EXCLAMATION_MARK = U'\U000000A1';
+constexpr char32_t SAMARITAN_PUNCTUATION_ZIQAA = U'\U00000838';
+constexpr char32_t LINEAR_B_SYLLABLE_B008_A = U'\U00010000';
+
+BENCHMARK_TEMPLATE(GeodeHashBM, std::string, LATIN_CAPITAL_LETTER_C)
+    ->Range(8, 8 << 10);
+BENCHMARK_TEMPLATE(GeodeHashBM, std::u16string, LATIN_CAPITAL_LETTER_C)
+    ->Range(8, 8 << 10);
+BENCHMARK_TEMPLATE(GeodeHashBM, std::string, INVERTED_EXCLAMATION_MARK)
+    ->Range(8, 8 << 10);
+BENCHMARK_TEMPLATE(GeodeHashBM, std::u16string, INVERTED_EXCLAMATION_MARK)
+    ->Range(8, 8 << 10);
+BENCHMARK_TEMPLATE(GeodeHashBM, std::string, SAMARITAN_PUNCTUATION_ZIQAA)
+    ->Range(8, 8 << 10);
+BENCHMARK_TEMPLATE(GeodeHashBM, std::u16string, SAMARITAN_PUNCTUATION_ZIQAA)
+    ->Range(8, 8 << 10);
+BENCHMARK_TEMPLATE(GeodeHashBM, std::string, LINEAR_B_SYLLABLE_B008_A)
+    ->Range(8, 8 << 10);
+BENCHMARK_TEMPLATE(GeodeHashBM, std::u16string, LINEAR_B_SYLLABLE_B008_A)
+    ->Range(8, 8 << 10);
diff --git a/cppcache/include/geode/internal/functional.hpp b/cppcache/include/geode/internal/functional.hpp
index 7cb4377..6fde1d6 100644
--- a/cppcache/include/geode/internal/functional.hpp
+++ b/cppcache/include/geode/internal/functional.hpp
@@ -104,7 +104,48 @@ struct geode_hash<std::u16string> {
  */
 template <>
 struct geode_hash<std::string> {
-  int32_t operator()(const std::string& val);
+  inline int32_t operator()(const std::string& val) {
+    int32_t hash = 0;
+
+    for (auto&& it = val.cbegin(); it < val.cend(); it++) {
+      auto cp = static_cast<uint32_t>(0xff & *it);
+      if (cp < 0x80) {
+        // 1 byte
+      } else if ((cp >> 5) == 0x6) {
+        // 2 bytes
+        ++it;
+        cp = ((cp << 6) & 0x7ff) + ((*it) & 0x3f);
+      } else if ((cp >> 4) == 0xe) {
+        // 3 bytes
+        ++it;
+        cp = ((cp << 12) & 0xffff) + (((0xff & *it) << 6) & 0xfff);
+        ++it;
+        cp += (*it) & 0x3f;
+      } else if ((cp >> 3) == 0x1e) {
+        // 4 bytes
+        ++it;
+        cp = ((cp << 18) & 0x1fffff) + (((0xff & *it) << 12) & 0x3ffff);
+        ++it;
+        cp += ((0xff & *it) << 6) & 0xfff;
+        ++it;
+        cp += (*it) & 0x3f;
+      } else {
+        // TODO throw exception
+      }
+
+      if (cp > 0xffff) {
+        // surrogate pair
+        hash = 31 * hash +
+               static_cast<uint16_t>((cp >> 10) + (0xD800 - (0x10000 >> 10)));
+        hash = 31 * hash + static_cast<uint16_t>((cp & 0x3ff) + 0xdc00u);
+      } else {
+        // single code unit
+        hash = 31 * hash + cp;
+      }
+    }
+
+    return hash;
+  }
 };
 
 }  // namespace internal
diff --git a/cppcache/src/util/functional.cpp b/cppcache/src/util/functional.cpp
deleted file mode 100644
index c526eaf..0000000
--- a/cppcache/src/util/functional.cpp
+++ /dev/null
@@ -1,39 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <codecvt>
-#include <locale>
-#include <string>
-
-#include <geode/internal/functional.hpp>
-
-#include "string.hpp"
-
-namespace apache {
-namespace geode {
-namespace client {
-namespace internal {
-
-int32_t geode_hash<std::string>::operator()(const std::string& val) {
-  // TODO string optimize without conversion to UTF-16
-  return geode_hash<std::u16string>{}(to_utf16(val));
-}
-
-}  // namespace internal
-}  // namespace client
-}  // namespace geode
-}  // namespace apache
diff --git a/cppcache/test/util/functionalTests.cpp b/cppcache/test/util/functionalTests.cpp
index f4e0426..9c18bae 100644
--- a/cppcache/test/util/functionalTests.cpp
+++ b/cppcache/test/util/functionalTests.cpp
@@ -32,4 +32,12 @@ TEST(string, geode_hash) {
   EXPECT_EQ(48, hash("0"));
   EXPECT_EQ(57, hash("9"));
   EXPECT_EQ(1077910243, hash("supercalifragilisticexpialidocious"));
+
+  EXPECT_EQ(1544552287, hash("You had me at meat tornad\u00F6!\U000F0000"));
+
+  auto str = std::string("You had me at");
+  str.push_back(0);
+  str.append("meat tornad\u00F6!\U000F0000");
+
+  EXPECT_EQ(701776767, hash(str));
 }


Mime
View raw message