subversion-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From br...@apache.org
Subject svn commit: r1511676 - in /subversion/trunk/subversion: include/private/svn_utf_private.h libsvn_subr/utf.c libsvn_subr/utf8proc.c tests/cmdline/merge_tests.py tests/cmdline/svnadmin_tests.py tests/libsvn_subr/utf-test.c
Date Thu, 08 Aug 2013 10:27:26 GMT
Author: brane
Date: Thu Aug  8 10:27:25 2013
New Revision: 1511676

URL: http://svn.apache.org/r1511676
Log:
Reimplement UTF-8 fuzzy conversion using the tools provided by utf8proc.
While the new implementation is slower and uses more memory than the
original, it also gives somewhat better results (in the author's opinion).
Since fuzzy escaping is only used as a last resort to generate error
messages, the extra cost should not be a problem.

* subversion/include/private/svn_utf_private.h
  (svn_utf__fuzzy_escape): Declare new private function.

* subversion/libsvn_subr/utf8proc.c
  (svn_utf__utf8proc_version): utf8proc_codepoint_valid is no longer unused.
  (unicode_decomposition): New; common driver for utf8proc_decompose.
  (decompose_normalized): Reimplement using unicode_decomposition.
  (svn_utf__fuzzy_escape): Implement.

* subversion/libsvn_subr/utf.c (fuzzy_escape): Removed.
  (convert_to_stringbuf, svn_utf__cstring_from_utf8_fuzzy):
   Use svn_utf__fuzzy_escape.

* subversion/tests/libsvn_subr/utf-test.c
  (test_utf_fuzzy_escape): New test case for svn_utf__fuzzy_escape.
  (test_funcs): Add test_utf_fuzzy_escape.
* subversion/tests/cmdline/merge_tests.py
  (simple_property_merges): Adjust expected output.
* subversion/tests/cmdline/svnadmin_tests.py
  (verify_non_utf8_paths): Likewise.

Modified:
    subversion/trunk/subversion/include/private/svn_utf_private.h
    subversion/trunk/subversion/libsvn_subr/utf.c
    subversion/trunk/subversion/libsvn_subr/utf8proc.c
    subversion/trunk/subversion/tests/cmdline/merge_tests.py
    subversion/trunk/subversion/tests/cmdline/svnadmin_tests.py
    subversion/trunk/subversion/tests/libsvn_subr/utf-test.c

Modified: subversion/trunk/subversion/include/private/svn_utf_private.h
URL: http://svn.apache.org/viewvc/subversion/trunk/subversion/include/private/svn_utf_private.h?rev=1511676&r1=1511675&r2=1511676&view=diff
==============================================================================
--- subversion/trunk/subversion/include/private/svn_utf_private.h (original)
+++ subversion/trunk/subversion/include/private/svn_utf_private.h Thu Aug  8 10:27:25 2013
@@ -73,6 +73,18 @@ svn_utf__last_valid(const char *src, apr
 const char *
 svn_utf__last_valid2(const char *src, apr_size_t len);
 
+/* Copy LENGTH bytes of SRC, converting characters as follows:
+    - Pass characters from the ASCII subset to the result
+    - Strip all combining marks from the string
+    - Represent other valid Unicode chars as {U+XXXX}
+    - Replace invalid Unicode chars with {U?XXXX}
+    - Represent chars that are not valid UTF-8 as ?\XX
+    - Replace codes outside the Unicode range with a sequence of ?\XX
+    - Represent the null byte as \0
+   Allocate the result in POOL. */
+const char *
+svn_utf__fuzzy_escape(const char *src, apr_size_t length, apr_pool_t *pool);
+
 const char *
 svn_utf__cstring_from_utf8_fuzzy(const char *src,
                                  apr_pool_t *pool,

Modified: subversion/trunk/subversion/libsvn_subr/utf.c
URL: http://svn.apache.org/viewvc/subversion/trunk/subversion/libsvn_subr/utf.c?rev=1511676&r1=1511675&r2=1511676&view=diff
==============================================================================
--- subversion/trunk/subversion/libsvn_subr/utf.c (original)
+++ subversion/trunk/subversion/libsvn_subr/utf.c Thu Aug  8 10:27:25 2013
@@ -480,58 +480,6 @@ get_uton_xlate_handle_node(xlate_handle_
 }
 
 
-/* Copy LEN bytes of SRC, converting non-ASCII and zero bytes to ?\nnn
-   sequences, allocating the result in POOL. */
-static const char *
-fuzzy_escape(const char *src, apr_size_t len, apr_pool_t *pool)
-{
-  const char *src_orig = src, *src_end = src + len;
-  apr_size_t new_len = 0;
-  char *new;
-  const char *new_orig;
-
-  /* First count how big a dest string we'll need. */
-  while (src < src_end)
-    {
-      if (! svn_ctype_isascii(*src) || *src == '\0')
-        new_len += 5;  /* 5 slots, for "?\XXX" */
-      else
-        new_len += 1;  /* one slot for the 7-bit char */
-
-      src++;
-    }
-
-  /* Allocate that amount, plus one slot for '\0' character. */
-  new = apr_palloc(pool, new_len + 1);
-
-  new_orig = new;
-
-  /* And fill it up. */
-  while (src_orig < src_end)
-    {
-      if (! svn_ctype_isascii(*src_orig) || src_orig == '\0')
-        {
-          /* This is the same format as svn_xml_fuzzy_escape uses, but that
-             function escapes different characters.  Please keep in sync!
-             ### If we add another fuzzy escape somewhere, we should abstract
-             ### this out to a common function. */
-          apr_snprintf(new, 6, "?\\%03u", (unsigned char) *src_orig);
-          new += 5;
-        }
-      else
-        {
-          *new = *src_orig;
-          new += 1;
-        }
-
-      src_orig++;
-    }
-
-  *new = '\0';
-
-  return new_orig;
-}
-
 /* Convert SRC_LENGTH bytes of SRC_DATA in NODE->handle, store the result
    in *DEST, which is allocated in POOL. */
 static svn_error_t *
@@ -609,8 +557,8 @@ convert_to_stringbuf(xlate_handle_node_t
           (pool, _("Can't convert string from '%s' to '%s':"),
            node->frompage, node->topage);
 
-      err = svn_error_create(apr_err, NULL, fuzzy_escape(src_data,
-                                                         src_length, pool));
+      err = svn_error_create(
+          apr_err, NULL, svn_utf__fuzzy_escape(src_data, src_length, pool));
       return svn_error_create(apr_err, err, errstr);
     }
   /* Else, exited due to success.  Trim the result buffer down to the
@@ -1007,7 +955,7 @@ svn_utf__cstring_from_utf8_fuzzy(const c
   const char *escaped, *converted;
   svn_error_t *err;
 
-  escaped = fuzzy_escape(src, strlen(src), pool);
+  escaped = svn_utf__fuzzy_escape(src, strlen(src), pool);
 
   /* Okay, now we have a *new* UTF-8 string, one that's guaranteed to
      contain only 7-bit bytes :-).  Recode to native... */

Modified: subversion/trunk/subversion/libsvn_subr/utf8proc.c
URL: http://svn.apache.org/viewvc/subversion/trunk/subversion/libsvn_subr/utf8proc.c?rev=1511676&r1=1511675&r2=1511676&view=diff
==============================================================================
--- subversion/trunk/subversion/libsvn_subr/utf8proc.c (original)
+++ subversion/trunk/subversion/libsvn_subr/utf8proc.c Thu Aug  8 10:27:25 2013
@@ -37,7 +37,6 @@
 const char *svn_utf__utf8proc_version(void)
 {
   /* Unused static function warning removal hack. */
-  UNUSED(utf8proc_codepoint_valid);
   UNUSED(utf8proc_NFD);
   UNUSED(utf8proc_NFC);
   UNUSED(utf8proc_NFKD);
@@ -48,19 +47,21 @@ const char *svn_utf__utf8proc_version(vo
 
 
 
-/* Fill the given BUFFER with an NFD UCS-4 representation of the UTF-8
- * STRING. If LENGTH is SVN_UTF__UNKNOWN_LENGTH, assume STRING is
- * NUL-terminated; otherwise look only at the first LENGTH bytes in
+/* Fill the given BUFFER with decomposed UCS-4 representation of the
+ * UTF-8 STRING. If LENGTH is SVN_UTF__UNKNOWN_LENGTH, assume STRING
+ * is NUL-terminated; otherwise look only at the first LENGTH bytes in
  * STRING. Upon return, BUFFER->data points at an array of UCS-4
- * characters and *RESULT_LENGTH contains the length of the array.
+ * characters, and return the length of the array. TRANSFORM_FLAGS
+ * define exactly how the decomposition is performed.
  *
- * A returned error may indicate that STRING contains invalid UTF-8 or
- * invalid Unicode codepoints. Any error message comes from utf8proc.
+ * A negative return value is an utf8proc error code and may indicate
+ * that STRING contains invalid UTF-8 or was so long that an overflow
+ * occurred.
  */
-static svn_error_t *
-decompose_normalized(apr_size_t *result_length,
-                     const char *string, apr_size_t length,
-                     svn_membuf_t *buffer)
+static ssize_t
+unicode_decomposition(int transform_flags,
+                      const char *string, apr_size_t length,
+                      svn_membuf_t *buffer)
 {
   const int nullterm = (length == SVN_UTF__UNKNOWN_LENGTH
                         ? UTF8PROC_NULLTERM : 0);
@@ -71,23 +72,38 @@ decompose_normalized(apr_size_t *result_
       const ssize_t ucs4len = buffer->size / sizeof(*ucs4buf);
       const ssize_t result =
         utf8proc_decompose((const void*) string, length, ucs4buf, ucs4len,
-                           UTF8PROC_DECOMPOSE | UTF8PROC_STABLE | nullterm);
+                           UTF8PROC_DECOMPOSE | UTF8PROC_STABLE
+                           | transform_flags | nullterm);
 
-      if (result < 0)
-        return svn_error_create(SVN_ERR_UTF8PROC_ERROR, NULL,
-                                gettext(utf8proc_errmsg(result)));
-
-      if (result <= ucs4len)
-        {
-          *result_length = result;
-          return SVN_NO_ERROR;
-        }
+      if (result < 0 || result <= ucs4len)
+        return result;
 
       /* Increase the decomposition buffer size and retry */
       svn_membuf__ensure(buffer, result * sizeof(*ucs4buf));
     }
 }
 
+/* Fill the given BUFFER with an NFD UCS-4 representation of the UTF-8
+ * STRING. If LENGTH is SVN_UTF__UNKNOWN_LENGTH, assume STRING is
+ * NUL-terminated; otherwise look only at the first LENGTH bytes in
+ * STRING. Upon return, BUFFER->data points at an array of UCS-4
+ * characters and *RESULT_LENGTH contains the length of the array.
+ *
+ * A returned error may indicate that STRING contains invalid UTF-8 or
+ * invalid Unicode codepoints. Any error message comes from utf8proc.
+ */
+static svn_error_t *
+decompose_normalized(apr_size_t *result_length,
+                     const char *string, apr_size_t length,
+                     svn_membuf_t *buffer)
+{
+  ssize_t result = unicode_decomposition(0, string, length, buffer);
+  if (result < 0)
+    return svn_error_create(SVN_ERR_UTF8PROC_ERROR, NULL,
+                            gettext(utf8proc_errmsg(result)));
+  *result_length = result;
+  return SVN_NO_ERROR;
+}
 
 /* Compare two arrays of UCS-4 codes, BUFA of length LENA and BUFB of
  * length LENB. Return 0 if they're equal, a negative value if BUFA is
@@ -288,3 +304,166 @@ svn_utf__glob(svn_boolean_t *match,
   *match = !apr_fnmatch(pattern_buf->data, string_buf->data, 0);
   return SVN_NO_ERROR;
 }
+
+
+const char *
+svn_utf__fuzzy_escape(const char *src, apr_size_t length, apr_pool_t *pool)
+{
+  /* Hexadecimal digits for code conversion. */
+  static const char digits[] = "0123456789ABCDEF";
+
+  /* Flags used for Unicode decomposition. */
+  static const int decomp_flags = (
+      UTF8PROC_COMPAT | UTF8PROC_STABLE | UTF8PROC_LUMP
+      | UTF8PROC_NLF2LF | UTF8PROC_STRIPCC | UTF8PROC_STRIPMARK);
+
+  svn_stringbuf_t *result;
+  svn_membuf_t buffer;
+  ssize_t decomp_length;
+  ssize_t len;
+
+  /* Decompose to a non-reversible compatibility format. */
+  svn_membuf__create(&buffer, length * sizeof(apr_int32_t), pool);
+  decomp_length = unicode_decomposition(decomp_flags, src, length, &buffer);
+  if (decomp_length < 0)
+    {
+      svn_membuf_t part;
+      apr_size_t done, prev;
+
+      /* The only other error we can receive here indicates an integer
+         overflow due to the length of the input string. Not very
+         likely, but we certainly shouldn't continue in that case. */
+      SVN_ERR_ASSERT_NO_RETURN(decomp_length == UTF8PROC_ERROR_INVALIDUTF8);
+
+      /* Break the decomposition into parts that are valid UTF-8, and
+         bytes that are not. Represent the invalid bytes in the target
+         array by their negative value. This works because utf8proc
+         will not generate Unicode code points with values larger than
+         U+10FFFF. */
+      svn_membuf__create(&part, sizeof(apr_int32_t), pool);
+      decomp_length = 0;
+      done = prev = 0;
+      while (done < length)
+        {
+          apr_int32_t uc;
+
+          while (done < length)
+            {
+              len = utf8proc_iterate((uint8_t*)src + done, length - done, &uc);
+              if (len < 0)
+                break;
+              done += len;
+            }
+
+          /* Decompose the valid part */
+          if (done > prev)
+            {
+              len = unicode_decomposition(
+                  decomp_flags, src + prev, done - prev, &part);
+              SVN_ERR_ASSERT_NO_RETURN(len > 0);
+              svn_membuf__resize(
+                  &buffer, (decomp_length + len) * sizeof(apr_int32_t));
+              memcpy((apr_int32_t*)buffer.data + decomp_length,
+                     part.data, len * sizeof(apr_int32_t));
+              decomp_length += len;
+              prev = done;
+            }
+
+          /* What follows could be a valid UTF-8 sequence, but not
+             a valid Unicode character. */
+          if (done < length)
+            {
+              const char *last;
+
+              /* Determine the length of the UTF-8 sequence */
+              const char *const p = src + done;
+              const uint8_t index = (uint8_t)*p;
+              len = utf8proc_utf8class[index];
+
+              /* Check if the multi-byte sequence is valid UTF-8. */
+              if (len > 1 && len <= length - done)
+                last = svn_utf__last_valid(p, len);
+              else
+                last = NULL;
+
+              /* Might not be a valid UTF-8 sequence at all */
+              if (!last || (last && last - p < len))
+                {
+                  uc = -((apr_int32_t)(*p & 0xff));
+                  len = 1;
+                }
+              else
+                {
+                  switch (len)
+                    {
+                      /* Decode the UTF-8 sequence without validation. */
+                    case 2:
+                      uc = ((p[0] & 0x1f) <<  6) + (p[1] & 0x3f);
+                      break;
+                    case 3:
+                      uc = (((p[0] & 0x0f) << 12) + ((p[1] & 0x3f) <<  6)
+                            + (p[2] & 0x3f));
+                      break;
+                    case 4:
+                      uc = (((p[0] & 0x07) << 18) + ((p[1] & 0x3f) << 12)
+                            + ((p[2] & 0x3f) <<  6) + (p[3] & 0x3f));
+                      break;
+                    default:
+                      SVN_ERR_ASSERT_NO_RETURN(
+                          !"Unexpected invalid UTF-8 byte");
+                    }
+
+                }
+
+              svn_membuf__resize(
+                  &buffer, (decomp_length + 1) * sizeof(apr_int32_t));
+              ((apr_int32_t*)buffer.data)[decomp_length++] = uc;
+              done += len;
+              prev = done;
+            }
+        }
+    }
+
+  /* Scan the result, deleting any combining diacriticals and
+     inserting placeholders where any non-ASCII characters remain.  */
+  result = svn_stringbuf_create_ensure(decomp_length, pool);
+  for (len = 0; len < decomp_length; ++len)
+    {
+      const apr_int32_t cp = ((apr_int32_t*)buffer.data)[len];
+      if (cp > 0 && cp < 127)
+        svn_stringbuf_appendbyte(result, (char)cp);
+      else if (cp == 0)
+        svn_stringbuf_appendcstr(result, "\\0");
+      else if (cp < 0)
+        {
+          const apr_int32_t rcp = ((-cp) & 0xff);
+          svn_stringbuf_appendcstr(result, "?\\");
+          svn_stringbuf_appendbyte(result, digits[(rcp & 0x00f0) >> 4]);
+          svn_stringbuf_appendbyte(result, digits[(rcp & 0x000f)]);
+        }
+      else
+        {
+          if (utf8proc_codepoint_valid(cp))
+            {
+              const utf8proc_property_t *prop = utf8proc_get_property(cp);
+              if (prop->combining_class != 0)
+                continue;           /* Combining mark; ignore */
+              svn_stringbuf_appendcstr(result, "{U+");
+            }
+          else
+            svn_stringbuf_appendcstr(result, "{U?");
+          if (cp > 0xffff)
+            {
+              svn_stringbuf_appendbyte(result, digits[(cp & 0xf00000) >> 20]);
+              svn_stringbuf_appendbyte(result, digits[(cp & 0x0f0000) >> 16]);
+            }
+          svn_stringbuf_appendbyte(result, digits[(cp & 0xf000) >> 12]);
+          svn_stringbuf_appendbyte(result, digits[(cp & 0x0f00) >> 8]);
+          svn_stringbuf_appendbyte(result, digits[(cp & 0x00f0) >> 4]);
+          svn_stringbuf_appendbyte(result, digits[(cp & 0x000f)]);
+          svn_stringbuf_appendbyte(result, '}');
+        }
+    }
+
+  return result->data;
+}

Modified: subversion/trunk/subversion/tests/cmdline/merge_tests.py
URL: http://svn.apache.org/viewvc/subversion/trunk/subversion/tests/cmdline/merge_tests.py?rev=1511676&r1=1511675&r2=1511676&view=diff
==============================================================================
--- subversion/trunk/subversion/tests/cmdline/merge_tests.py (original)
+++ subversion/trunk/subversion/tests/cmdline/merge_tests.py Thu Aug  8 10:27:25 2013
@@ -656,7 +656,7 @@ def simple_property_merges(sbox):
     'E/alpha.prej'
     : Item(error_message('foo', 'foo_val', 'mod_foo')),
     'E/beta.prej'
-    : Item(error_message('foo', 'foo?\\129val', 'mod?\\129foo')),
+    : Item(error_message('foo', 'foo?\\81val', 'mod?\\81foo')),
     })
   expected_disk.tweak('E', 'E/alpha', props={'bar' : 'bar_val'})
   expected_disk.tweak('E/beta', props={'bar' : 'bar\201val'})

Modified: subversion/trunk/subversion/tests/cmdline/svnadmin_tests.py
URL: http://svn.apache.org/viewvc/subversion/trunk/subversion/tests/cmdline/svnadmin_tests.py?rev=1511676&r1=1511675&r2=1511676&view=diff
==============================================================================
--- subversion/trunk/subversion/tests/cmdline/svnadmin_tests.py (original)
+++ subversion/trunk/subversion/tests/cmdline/svnadmin_tests.py Thu Aug  8 10:27:25 2013
@@ -1485,8 +1485,8 @@ def verify_non_utf8_paths(sbox):
   expected_stderr = [
     "* Dumped revision 0.\n",
     "WARNING 0x0002: E160005: "
-      "While validating fspath '?\\230': "
-      "Path '?\\230' is not in UTF-8"
+      "While validating fspath '?\\E6': "
+      "Path '?\\E6' is not in UTF-8"
       "\n",
     "* Dumped revision 1.\n",
     ]

Modified: subversion/trunk/subversion/tests/libsvn_subr/utf-test.c
URL: http://svn.apache.org/viewvc/subversion/trunk/subversion/tests/libsvn_subr/utf-test.c?rev=1511676&r1=1511675&r2=1511676&view=diff
==============================================================================
--- subversion/trunk/subversion/tests/libsvn_subr/utf-test.c (original)
+++ subversion/trunk/subversion/tests/libsvn_subr/utf-test.c Thu Aug  8 10:27:25 2013
@@ -618,6 +618,62 @@ test_utf_pattern_match(apr_pool_t *pool)
 }
 
 
+static svn_error_t *
+test_utf_fuzzy_escape(apr_pool_t *pool)
+{
+
+  /* Accented latin, mixed normalization */
+  static const char mixup[] =
+    "S\xcc\x87\xcc\xa3"         /* S with dot above and below */
+    "\xc5\xaf"                  /* u with ring */
+    "b\xcc\xb1"                 /* b with macron below */
+    "\xe1\xb9\xbd"              /* v with tilde */
+    "e\xcc\xa7\xcc\x86"         /* e with breve and cedilla */
+    "\xc8\x91"                  /* r with double grave */
+    "s\xcc\x8c"                 /* s with caron */
+    "\xe1\xb8\xaf"              /* i with diaeresis and acute */
+    "o\xcc\x80\xcc\x9b"         /* o with grave and hook */
+    "\xe1\xb9\x8b";             /* n with circumflex below */
+
+  /* As above, but latin lowercase 'o' replaced with Greek 'omicron' */
+  static const char greekish[] =
+    "S\xcc\x87\xcc\xa3"         /* S with dot above and below */
+    "\xc5\xaf"                  /* u with ring */
+    "b\xcc\xb1"                 /* b with macron below */
+    "\xe1\xb9\xbd"              /* v with tilde */
+    "e\xcc\xa7\xcc\x86"         /* e with breve and cedilla */
+    "\xc8\x91"                  /* r with double grave */
+    "s\xcc\x8c"                 /* s with caron */
+    "\xe1\xb8\xaf"              /* i with diaeresis and acute */
+    "\xce\xbf\xcc\x80\xcc\x9b"  /* omicron with grave and hook */
+    "\xe1\xb9\x8b";             /* n with circumflex below */
+
+  /* More interesting invalid characters. */
+  static const char invalid[] =
+    "Not Unicode: \xef\xb7\x91;"      /* U+FDD1 */
+    "Out of range: \xf4\x90\x80\x81;" /* U+110001 */
+    "Not UTF-8: \xe6;"
+    "Null byte: \0;";
+
+  const char *fuzzy;
+
+  fuzzy = svn_utf__fuzzy_escape(mixup, strlen(mixup), pool);
+  SVN_TEST_ASSERT(0 == strcmp(fuzzy, "Subversion"));
+
+  fuzzy = svn_utf__fuzzy_escape(greekish, strlen(greekish), pool);
+  SVN_TEST_ASSERT(0 == strcmp(fuzzy, "Subversi{U+03BF}n"));
+
+  fuzzy = svn_utf__fuzzy_escape(invalid, sizeof(invalid) - 1, pool);
+  /*fprintf(stderr, "%s\n", fuzzy);*/
+  SVN_TEST_ASSERT(0 == strcmp(fuzzy,
+                              "Not Unicode: {U?FDD1};"
+                              "Out of range: ?\\F4?\\90?\\80?\\81;"
+                              "Not UTF-8: ?\\E6;"
+                              "Null byte: \\0;"));
+
+  return SVN_NO_ERROR;
+}
+
 
 /* The test table.  */
 
@@ -636,5 +692,7 @@ struct svn_test_descriptor_t test_funcs[
                    "test svn_utf__normcmp"),
     SVN_TEST_PASS2(test_utf_pattern_match,
                    "test svn_utf__glob"),
+    SVN_TEST_PASS2(test_utf_fuzzy_escape,
+                   "test svn_utf__fuzzy_escape"),
     SVN_TEST_NULL
   };



Mime
View raw message