subversion-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From br...@apache.org
Subject svn commit: r1613540 - in /subversion/branches/svn-auth-x509/subversion: include/private/svn_utf_private.h libsvn_subr/utf.c libsvn_subr/utf8proc.c tests/libsvn_subr/utf-test.c
Date Sat, 26 Jul 2014 00:14:24 GMT
Author: brane
Date: Sat Jul 26 00:14:23 2014
New Revision: 1613540

URL: http://svn.apache.org/r1613540
Log:
On the svn_auth_x509 branch: Implement functions that convert UTF-16 (UCS-2)
and UTF-32 big- and little-endian strings to UTF-8.

* subversion/include/private/svn_utf_private.h
  (svn_utf__encode_ucs4_string,
   svn_utf__utf16_to_utf8,
   svn_utf__utf32_to_utf8): New prototypes.
* subversion/libsvn_subr/utf8proc.c
  (svn_utf__encode_ucs4_string): Renamed from encode_ucs4_string.
  (svn_utf__glob): Update calls to encode_ucs4_string.
* subversion/libsvn_subr/utf.c
  (membuf_insert_ucs4): New private helper function.
  (SWAP_SHORT, SWAP_LONG): New macros for byte swapping.
  (IS_UTF16_LEAD_SURROGATE, IS_UTF16_TRAIL_SURROGATE): New macros.
  (svn_utf__utf16_to_utf8, svn_utf__utf32_to_utf8): Implement here.

* subversion/tests/libsvn_subr/utf-test.c
  (test_utf_conversions): New test case.
  (test_funcs): Add test_utf_conversions.

Modified:
    subversion/branches/svn-auth-x509/subversion/include/private/svn_utf_private.h
    subversion/branches/svn-auth-x509/subversion/libsvn_subr/utf.c
    subversion/branches/svn-auth-x509/subversion/libsvn_subr/utf8proc.c
    subversion/branches/svn-auth-x509/subversion/tests/libsvn_subr/utf-test.c

Modified: subversion/branches/svn-auth-x509/subversion/include/private/svn_utf_private.h
URL: http://svn.apache.org/viewvc/subversion/branches/svn-auth-x509/subversion/include/private/svn_utf_private.h?rev=1613540&r1=1613539&r2=1613540&view=diff
==============================================================================
--- subversion/branches/svn-auth-x509/subversion/include/private/svn_utf_private.h (original)
+++ subversion/branches/svn-auth-x509/subversion/include/private/svn_utf_private.h Sat Jul 26 00:14:23 2014
@@ -159,6 +159,22 @@ svn_utf__normalize(const char **result,
 svn_boolean_t
 svn_utf__is_normalized(const char *string, apr_pool_t *scratch_pool);
 
+/* Encode a UCS-4 string to UTF-8, placing the result into BUFFER.
+ * While utf8proc does have a similar function, it does more checking
+ * and processing than we want here; this function does not attempt
+ * any normalizations but just encodes the individual code points.
+ *
+ * Return the length of the result (excluding the NUL terminator) in
+ * *result_length.
+ *
+ * A returned error indicates that a codepoint is invalid.
+ */
+svn_error_t *
+svn_utf__encode_ucs4_string(svn_membuf_t *buffer,
+                            const apr_int32_t *ucs4str,
+                            apr_size_t length,
+                            apr_size_t *result_length);
+
 /* Pattern matching similar to the the SQLite LIKE and GLOB
  * operators. PATTERN, KEY and ESCAPE must all point to UTF-8
  * strings. Furthermore, ESCAPE, if provided, must be a character from
@@ -191,6 +207,38 @@ svn_utf__glob(svn_boolean_t *match,
 const char *
 svn_utf__utf8proc_version(void);
 
+/* Convert a UTF-16 (or UCS-2) string to UTF-8, returning the pointer
+ * in RESULT. If BIG_ENDIAN is set, then UTF16STR is big-endian;
+ * otherwise, it's little-endian.
+ *
+ * Allocate RESULT in RESULT_POOL and use SCRATCH_POOL for
+ * intermediate allocation.
+ *
+ * This function combines UTF-16 surrogate pairs into single code
+ * points, but will leave single lead or trail surrogates unchanged.
+ */
+svn_error_t *
+svn_utf__utf16_to_utf8(const char **result,
+                       const apr_uint16_t *utf16str,
+                       svn_boolean_t big_endian,
+                       apr_pool_t *result_pool,
+                       apr_pool_t *scratch_pool);
+
+/* Convert a UTF-32 string to UTF-8, returning the pointer in
+ * RESULT. If BIG_ENDIAN is set, then UTF32STR is big-endian;
+ * otherwise, it's little-endian.
+ *
+ * Allocate RESULT in RESULT_POOL and use SCRATCH_POOL for
+ * intermediate allocation.
+ */
+svn_error_t *
+svn_utf__utf32_to_utf8(const char **result,
+                       const apr_int32_t *utf32str,
+                       svn_boolean_t big_endian,
+                       apr_pool_t *result_pool,
+                       apr_pool_t *scratch_pool);
+
+
 #ifdef __cplusplus
 }
 #endif /* __cplusplus */

Modified: subversion/branches/svn-auth-x509/subversion/libsvn_subr/utf.c
URL: http://svn.apache.org/viewvc/subversion/branches/svn-auth-x509/subversion/libsvn_subr/utf.c?rev=1613540&r1=1613539&r2=1613540&view=diff
==============================================================================
--- subversion/branches/svn-auth-x509/subversion/libsvn_subr/utf.c (original)
+++ subversion/branches/svn-auth-x509/subversion/libsvn_subr/utf.c Sat Jul 26 00:14:23 2014
@@ -1052,6 +1052,140 @@ svn_utf_cstring_from_utf8_string(const c
 }
 
 
+/* Insert the given UCS-4 VALUE into BUF at the given INDEX. */
+static void
+membuf_insert_ucs4(svn_membuf_t *buf, apr_size_t index, apr_int32_t value)
+{
+  svn_membuf__resize(buf, (index + 1) * sizeof(value));
+  ((apr_int32_t*)buf->data)[index] = value;
+}
+
+/* TODO: Use compiler intrinsics for byte swaps. */
+#define SWAP_SHORT(x)  ((((x) & 0xff) << 8) | (((x) >> 8) & 0xff))
+#define SWAP_LONG(x)   ((((x) & 0xff) << 24) | (((x) & 0xff00) << 8)    \
+                        | (((x) >> 8) & 0xff00) | (((x) >> 24) & 0xff))
+
+#define IS_UTF16_LEAD_SURROGATE(c)   ((c) >= 0xd800 && (c) <= 0xdbff)
+#define IS_UTF16_TRAIL_SURROGATE(c)  ((c) >= 0xdc00 && (c) <= 0xdfff)
+
+svn_error_t *
+svn_utf__utf16_to_utf8(const char **result,
+                       const apr_uint16_t *utf16str,
+                       svn_boolean_t big_endian,
+                       apr_pool_t *result_pool,
+                       apr_pool_t *scratch_pool)
+{
+  static const apr_uint16_t endiancheck = 0xa55a;
+  const svn_boolean_t arch_big_endian =
+    (((const char*)&endiancheck)[sizeof(endiancheck) - 1] == '\x5a');
+
+  const svn_boolean_t swap_order = (!big_endian != !arch_big_endian);
+  apr_uint16_t lead_surrogate = 0;
+  apr_size_t length = 0;
+
+  svn_membuf_t ucs4buf;
+  svn_membuf_t resultbuf;
+
+  svn_membuf__create(&ucs4buf, 0, scratch_pool);
+
+  while (*utf16str)
+    {
+      const apr_uint16_t code =
+        (swap_order ? SWAP_SHORT(*utf16str) : *utf16str);
+      ++utf16str;
+
+      if (lead_surrogate)
+        {
+          if (IS_UTF16_TRAIL_SURROGATE(code))
+            {
+              /* Combine the lead and trail surrogates into a 32-bit code. */
+              membuf_insert_ucs4(&ucs4buf, length++,
+                                 (0x010000
+                                  + (((lead_surrogate & 0x03ff) << 10)
+                                     | (code & 0x03ff))));
+              lead_surrogate = 0;
+              continue;
+            }
+          else
+            {
+              /* If we didn't find a surrogate pair, just dump the
+                 lead surrogate into the stream. */
+              membuf_insert_ucs4(&ucs4buf, length++, lead_surrogate);
+              lead_surrogate = 0;
+            }
+        }
+
+      if (*utf16str && IS_UTF16_LEAD_SURROGATE(code))
+        {
+          /* Store a lead surrogate that is followed by at least one
+             code for the next iteration. */
+          lead_surrogate = code;
+          continue;
+        }
+      else
+        membuf_insert_ucs4(&ucs4buf, length++, code);
+    }
+
+  /* Convert the UCS-4 buffer to UTF-8, assuming an average of 2 bytes
+     per code point for encoding. The buffer will grow as
+     necessary. */
+  svn_membuf__create(&resultbuf, length * 2, result_pool);
+  SVN_ERR(svn_utf__encode_ucs4_string(
+              &resultbuf, ucs4buf.data, length, &length));
+  *result = resultbuf.data;
+  return SVN_NO_ERROR;
+}
+
+
+svn_error_t *
+svn_utf__utf32_to_utf8(const char **result,
+                       const apr_int32_t *utf32str,
+                       svn_boolean_t big_endian,
+                       apr_pool_t *result_pool,
+                       apr_pool_t *scratch_pool)
+{
+  static const apr_int32_t endiancheck = 0xa5cbbc5a;
+  const svn_boolean_t arch_big_endian =
+    (((const char*)&endiancheck)[sizeof(endiancheck) - 1] == '\x5a');
+
+  const svn_boolean_t swap_order = (!big_endian != !arch_big_endian);
+  svn_membuf_t resultbuf;
+  apr_size_t length;
+
+  if (!swap_order)
+    {
+      /* Just use the source string without copying. */
+      const apr_int32_t *endp = utf32str;
+      while (*endp++)
+        ;
+      length = (endp - utf32str);
+    }
+  else
+    {
+      svn_membuf_t ucs4buf;
+      svn_membuf__create(&ucs4buf, 0, scratch_pool);
+
+      length = 0;
+      while (*utf32str)
+        {
+          const apr_int32_t code = SWAP_LONG(*utf32str);
+          ++utf32str;
+          membuf_insert_ucs4(&ucs4buf, length++, code);
+        }
+      utf32str = ucs4buf.data;
+    }
+
+  /* Convert the UCS-4 buffer to UTF-8, assuming an average of 2 bytes
+     per code point for encoding. The buffer will grow as
+     necessary. */
+  svn_membuf__create(&resultbuf, length * 2, result_pool);
+  SVN_ERR(svn_utf__encode_ucs4_string(
+              &resultbuf, utf32str, length, &length));
+  *result = resultbuf.data;
+  return SVN_NO_ERROR;
+}
+
+
 #ifdef WIN32
 
 

Modified: subversion/branches/svn-auth-x509/subversion/libsvn_subr/utf8proc.c
URL: http://svn.apache.org/viewvc/subversion/branches/svn-auth-x509/subversion/libsvn_subr/utf8proc.c?rev=1613540&r1=1613539&r2=1613540&view=diff
==============================================================================
--- subversion/branches/svn-auth-x509/subversion/libsvn_subr/utf8proc.c (original)
+++ subversion/branches/svn-auth-x509/subversion/libsvn_subr/utf8proc.c Sat Jul 26 00:14:23 2014
@@ -219,20 +219,14 @@ encode_ucs4(svn_membuf_t *buffer, apr_in
   return SVN_NO_ERROR;
 }
 
-/* Decode an UCS-4 string to UTF-8, placing the result into BUFFER.
- * While utf8proc does have a similar function, it does more checking
- * and processing than we want here. Return the length of the result
- * (excluding the NUL terminator) in *result_length.
- *
- * A returned error indicates that the codepoint is invalid.
- */
-static svn_error_t *
-encode_ucs4_string(svn_membuf_t *buffer,
-                   apr_int32_t *ucs4str, apr_size_t len,
-                   apr_size_t *result_length)
+svn_error_t *
+svn_utf__encode_ucs4_string(svn_membuf_t *buffer,
+                            const apr_int32_t *ucs4str,
+                            apr_size_t length,
+                            apr_size_t *result_length)
 {
   *result_length = 0;
-  while (len-- > 0)
+  while (length-- > 0)
     SVN_ERR(encode_ucs4(buffer, *ucs4str++, result_length));
   svn_membuf__resize(buffer, *result_length + 1);
   ((char*)buffer->data)[*result_length] = '\0';
@@ -263,8 +257,8 @@ svn_utf__glob(svn_boolean_t *match,
      because apr_fnmatch can't handle it.*/
   SVN_ERR(decompose_normalized(&tempbuf_len, pattern, pattern_len, temp_buf));
   if (!sql_like)
-    SVN_ERR(encode_ucs4_string(pattern_buf, temp_buf->data, tempbuf_len,
-                               &patternbuf_len));
+    SVN_ERR(svn_utf__encode_ucs4_string(pattern_buf, temp_buf->data,
+                                        tempbuf_len, &patternbuf_len));
   else
     {
       /* Convert a LIKE pattern to a GLOB pattern that apr_fnmatch can use. */
@@ -339,8 +333,8 @@ svn_utf__glob(svn_boolean_t *match,
 
   /* Now normalize the string */
   SVN_ERR(decompose_normalized(&tempbuf_len, string, string_len, temp_buf));
-  SVN_ERR(encode_ucs4_string(string_buf, temp_buf->data,
-                             tempbuf_len, &tempbuf_len));
+  SVN_ERR(svn_utf__encode_ucs4_string(string_buf, temp_buf->data,
+                                      tempbuf_len, &tempbuf_len));
 
   *match = !apr_fnmatch(pattern_buf->data, string_buf->data, 0);
   return SVN_NO_ERROR;

Modified: subversion/branches/svn-auth-x509/subversion/tests/libsvn_subr/utf-test.c
URL: http://svn.apache.org/viewvc/subversion/branches/svn-auth-x509/subversion/tests/libsvn_subr/utf-test.c?rev=1613540&r1=1613539&r2=1613540&view=diff
==============================================================================
--- subversion/branches/svn-auth-x509/subversion/tests/libsvn_subr/utf-test.c (original)
+++ subversion/branches/svn-auth-x509/subversion/tests/libsvn_subr/utf-test.c Sat Jul 26 00:14:23 2014
@@ -737,6 +737,78 @@ test_utf_is_normalized(apr_pool_t *pool)
   return SVN_NO_ERROR;
 }
 
+
+static svn_error_t *
+test_utf_conversions(apr_pool_t *pool)
+{
+  static const struct cvt_test_t
+  {
+    svn_boolean_t sixteenbit;
+    svn_boolean_t bigendian;
+    const char *source;
+    const char *result;
+  } tests[] = {
+
+#define UTF_32_LE FALSE, FALSE
+#define UTF_32_BE FALSE, TRUE
+#define UTF_16_LE TRUE, FALSE
+#define UTF_16_BE TRUE, TRUE
+
+    /* Normal character conversion */
+    { UTF_32_LE, "t\0\0\0" "e\0\0\0" "s\0\0\0" "t\0\0\0" "\0\0\0\0", "test" },
+    { UTF_32_BE, "\0\0\0t" "\0\0\0e" "\0\0\0s" "\0\0\0t" "\0\0\0\0", "test" },
+    { UTF_16_LE, "t\0" "e\0" "s\0" "t\0" "\0\0", "test" },
+    { UTF_16_BE, "\0t" "\0e" "\0s" "\0t" "\0\0", "test" },
+
+    /* Valid surrogate pairs */
+    { UTF_16_LE, "\x00\xD8" "\x00\xDC" "\0\0", "\xf0\x90\x80\x80" }, /* U+010000 */
+    { UTF_16_LE, "\x34\xD8" "\x1E\xDD" "\0\0", "\xf0\x9d\x84\x9e" }, /* U+01D11E */
+    { UTF_16_LE, "\xFF\xDB" "\xFD\xDF" "\0\0", "\xf4\x8f\xbf\xbd" }, /* U+10FFFD */
+
+    { UTF_16_BE, "\xD8\x00" "\xDC\x00" "\0\0", "\xf0\x90\x80\x80" }, /* U+010000 */
+    { UTF_16_BE, "\xD8\x34" "\xDD\x1E" "\0\0", "\xf0\x9d\x84\x9e" }, /* U+01D11E */
+    { UTF_16_BE, "\xDB\xFF" "\xDF\xFD" "\0\0", "\xf4\x8f\xbf\xbd" }, /* U+10FFFD */
+
+    /* Swapped, single and trailing surrogate pairs */
+    { UTF_16_LE, "*\0" "\x00\xDC" "\x00\xD8" "*\0\0\0", "*\xed\xb0\x80" "\xed\xa0\x80*" },
+    { UTF_16_LE, "*\0" "\x1E\xDD" "*\0\0\0", "*\xed\xb4\x9e*" },
+    { UTF_16_LE, "*\0" "\xFF\xDB" "*\0\0\0", "*\xed\xaf\xbf*" },
+    { UTF_16_LE, "\x1E\xDD" "\0\0", "\xed\xb4\x9e" },
+    { UTF_16_LE, "\xFF\xDB" "\0\0", "\xed\xaf\xbf" },
+
+    { UTF_16_BE, "\0*" "\xDC\x00" "\xD8\x00" "\0*\0\0", "*\xed\xb0\x80" "\xed\xa0\x80*" },
+    { UTF_16_BE, "\0*" "\xDD\x1E" "\0*\0\0", "*\xed\xb4\x9e*" },
+    { UTF_16_BE, "\0*" "\xDB\xFF" "\0*\0\0", "*\xed\xaf\xbf*" },
+    { UTF_16_BE, "\xDD\x1E" "\0\0", "\xed\xb4\x9e" },
+    { UTF_16_BE, "\xDB\xFF" "\0\0", "\xed\xaf\xbf" },
+
+#undef UTF_32_LE
+#undef UTF_32_BE
+#undef UTF_16_LE
+#undef UTF_16_BE
+
+    { 0 }
+  };
+
+  const struct cvt_test_t *tc;
+  const char *result;
+  int i;
+
+  for (i = 1, tc = tests; tc->source; ++tc, ++i)
+    {
+      if (tc->sixteenbit)
+        SVN_ERR(svn_utf__utf16_to_utf8(&result, (const void*)tc->source,
+                                       tc->bigendian, pool, pool));
+      else
+        SVN_ERR(svn_utf__utf32_to_utf8(&result, (const void*)tc->source,
+                                       tc->bigendian, pool, pool));
+      SVN_ERR_ASSERT(0 == strcmp(result, tc->result));
+    }
+
+  return SVN_NO_ERROR;
+}
+
+
 
 /* The test table.  */
 
@@ -761,6 +833,8 @@ static struct svn_test_descriptor_t test
                    "test svn_utf__fuzzy_escape"),
     SVN_TEST_PASS2(test_utf_is_normalized,
                    "test svn_utf__is_normalized"),
+    SVN_TEST_PASS2(test_utf_conversions,
+                   "test svn_utf__utf{16,32}_to_utf8"),
     SVN_TEST_NULL
   };
 



Mime
View raw message