cassandra-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From jbel...@apache.org
Subject [2/3] git commit: cqlsh: format output properly, make prettier
Date Fri, 20 Jan 2012 19:24:19 GMT
cqlsh: format output properly, make prettier

most importantly: formats output from queries according to column
storage type, so that e.g. blob data is shown in hex. also formats
timestamp values in a human-readable and cql-readable way, and
shows backslash escapes for text/ascii types when characters can't
be printed. when color is enabled, highlight backslash escapes in
red, so they can easily be distinguished from text that happens to
contain backslashes.

color different types of column data differently: textual data stays
yellow, blob data is red, and other data (types which are shown and
input significantly differently from the byte representation, like
numbers, timestamps, etc) are green.

for static resultsets, add a separator line between header and data
(most helpful for people who don't have colors on). left-justify column
headers instead of right-justifying. for dynamic resultsets, don't
append a final bar separator after the last value.

allow calling into a value renderer/formatter which can make decisions
about output representation and color based on the column type

patch by pcannon; reviewed by jbellis for CASSANDRA-3726


Project: http://git-wip-us.apache.org/repos/asf/cassandra/repo
Commit: http://git-wip-us.apache.org/repos/asf/cassandra/commit/5850178c
Tree: http://git-wip-us.apache.org/repos/asf/cassandra/tree/5850178c
Diff: http://git-wip-us.apache.org/repos/asf/cassandra/diff/5850178c

Branch: refs/heads/trunk
Commit: 5850178cb99dbd1e5e5333442c85470386ec1b4f
Parents: fc61039
Author: paul cannon <paul@datastax.com>
Authored: Wed Jan 18 13:52:52 2012 -0600
Committer: Jonathan Ellis <jbellis@apache.org>
Committed: Fri Jan 20 13:22:38 2012 -0600

----------------------------------------------------------------------
 CHANGES.txt               |    2 +
 bin/cqlsh                 |  311 ++++++++++++++++++++++++++++------
 pylib/cqlshlib/wcwidth.py |  367 ++++++++++++++++++++++++++++++++++++++++
 3 files changed, 631 insertions(+), 49 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/cassandra/blob/5850178c/CHANGES.txt
----------------------------------------------------------------------
diff --git a/CHANGES.txt b/CHANGES.txt
index 4688381..35fef0f 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,5 +1,7 @@
 1.0.8
  * (cqlsh) add DESCRIBE COLUMNFAMILIES (CASSANDRA-3586)
+ * (cqlsh) format blobs correctly and use colors to improve output
+   readability (CASSANDRA-3726)
 
 
 1.0.7

http://git-wip-us.apache.org/repos/asf/cassandra/blob/5850178c/bin/cqlsh
----------------------------------------------------------------------
diff --git a/bin/cqlsh b/bin/cqlsh
index 2a18722..1ef7f3b 100755
--- a/bin/cqlsh
+++ b/bin/cqlsh
@@ -44,6 +44,8 @@ import string
 import time
 import optparse
 import ConfigParser
+import codecs
+import re
 
 # cqlsh should run correctly when run out of a Cassandra source tree,
 # out of an unpacked Cassandra tarball, and after a proper package install.
@@ -51,7 +53,7 @@ cqlshlibdir = os.path.join(os.path.dirname(__file__), '..', 'pylib')
 if os.path.isdir(cqlshlibdir):
     sys.path.insert(0, cqlshlibdir)
 
-from cqlshlib import cqlhandling, pylexotron
+from cqlshlib import cqlhandling, pylexotron, wcwidth
 from cqlshlib.cqlhandling import (token_dequote, cql_dequote, cql_escape,
                                   maybe_cql_escape, cql_typename)
 
@@ -94,12 +96,14 @@ parser.add_option('--debug', action='store_true',
                   help='Show additional debugging information')
 
 
-RED = "\033[1;31m%s\033[0m"
-GREEN = "\033[1;32m%s\033[0m"
-BLUE = "\033[1;34m%s\033[0m"
-YELLOW = "\033[1;33m%s\033[0m"
-CYAN = "\033[1;36m%s\033[0m"
-MAGENTA = "\033[1;35m%s\033[0m"
+RED = '\033[0;1;31m'
+GREEN = '\033[0;1;32m'
+YELLOW = '\033[0;1;33m'
+BLUE = '\033[0;1;34m'
+MAGENTA = '\033[0;1;35m'
+CYAN = '\033[0;1;36m'
+WHITE = '\033[0;1;37m'
+ANSI_RESET = '\033[0m'
 
 CQL_ERRORS = (cql.Error,)
 try:
@@ -215,16 +219,113 @@ def trim_if_present(s, prefix):
         return s[len(prefix):]
     return s
 
+class FormattedValue:
+    def __init__(self, strval, coloredval, displaywidth):
+        self.strval = strval
+        self.coloredval = coloredval
+        self.displaywidth = displaywidth
+
+    def _pad(self, width, fill=' '):
+        if width > self.displaywidth:
+            return fill * (width - self.displaywidth)
+        else:
+            return ''
+
+    def rjust(self, width, fill=' '):
+        """
+        Similar to self.strval.rjust(width), but takes expected terminal
+        display width into account for special characters, and does not
+        take color escape codes into account.
+        """
+        return self._pad(width, fill) + self.strval
+
+    def color_rjust(self, width, fill=' '):
+        """
+        Similar to self.rjust(width), but uses this value's colored
+        representation, and does not take color escape codes into account
+        in determining width.
+        """
+        return self._pad(width, fill) + self.coloredval
+
+controlchars_re = re.compile(r'[\x00-\x31\x7f-\xa0]')
+
+def _show_control_chars(match):
+    txt = repr(match.group(0))
+    if txt.startswith('u'):
+        txt = txt[2:-1]
+    else:
+        txt = txt[1:-1]
+    return txt
+
+bits_to_turn_red_re = re.compile(r'\\([^uUx]|u[0-9a-fA-F]{4}|x[0-9a-fA-F]{2}|U[0-9a-fA-F]{8})')
+
+def _turn_bits_red(match):
+    txt = match.group(0)
+    if txt == '\\\\':
+        return '\\'
+    return RED + txt + YELLOW
+
+def format_value(val, casstype, output_encoding, addcolor=False, time_format='', float_precision=3):
+    color = YELLOW
+    coloredval = None
+    displaywidth = None
+
+    if val is None:
+        bval = 'null'
+        color = RED
+    elif casstype == 'UTF8Type':
+        escapedval = val.replace(u'\\', u'\\\\')
+        escapedval = controlchars_re.sub(_show_control_chars, escapedval)
+        bval = escapedval.encode(output_encoding, errors='backslashreplace')
+        displaywidth = wcwidth.wcswidth(bval.decode(output_encoding))
+        if addcolor:
+            coloredval = YELLOW + bits_to_turn_red_re.sub(_turn_bits_red, bval) + ANSI_RESET
+    elif casstype == 'DateType':
+        timestamp = time.localtime(val)
+        bval = time.strftime(time_format, timestamp)
+        color = GREEN
+    elif casstype in ('LongType', 'Int32Type', 'IntegerType'):
+        # base-10 only for now; support others?
+        bval = str(val)
+        color = GREEN
+    elif casstype in ('FloatType', 'DoubleType'):
+        bval = '%.*g' % (float_precision, val)
+        color = GREEN
+    elif casstype in ('DecimalType', 'UUIDType', 'BooleanType'):
+        # let python do these for us
+        bval = str(val)
+        color = GREEN
+    elif casstype == 'BytesType':
+        bval = ''.join('%02x' % ord(c) for c in val)
+        color = RED
+    else:
+        # AsciiType is the only other one known right now, but handle others
+        escapedval = val.replace('\\', '\\\\')
+        bval = controlchars_re.sub(_show_control_chars, escapedval)
+        if addcolor:
+            coloredval = YELLOW + bits_to_turn_red_re.sub(_turn_bits_red, bval) + ANSI_RESET
+
+    if displaywidth is None:
+        displaywidth = len(bval)
+    if not addcolor:
+        coloredval = bval
+    elif coloredval is None:
+        coloredval = color + bval + ANSI_RESET
+
+    return FormattedValue(bval, coloredval, displaywidth)
+
 class Shell(cmd.Cmd):
     default_prompt  = "cqlsh> "
     continue_prompt = "   ... "
     keyspace_prompt          = "cqlsh:%s> "
     keyspace_continue_prompt = "%s    ... "
+    display_time_format = '%Y-%m-%d %H:%M:%S%z'
+    display_float_precision = 3
     num_retries = 4
     debug = False
 
     def __init__(self, hostname, port, color=False, username=None,
-            password=None, completekey='tab'):
+                 password=None, encoding=None, completekey='tab'):
         cmd.Cmd.__init__(self, completekey=completekey)
         self.hostname = hostname
         self.port = port
@@ -233,8 +334,13 @@ class Shell(cmd.Cmd):
 
         self.current_keyspace = None
 
-        self.statement = StringIO()
         self.color = color
+        if encoding is None:
+            encoding = sys.stdout.encoding
+        self.encoding = encoding
+        self.output_codec = codecs.lookup(encoding)
+
+        self.statement = StringIO()
         self.in_comment = False
         self.schema_overrides = {}
 
@@ -245,6 +351,11 @@ class Shell(cmd.Cmd):
         else:
             self.prompt = ""
 
+    def myformat_value(self, val, casstype):
+        return format_value(val, casstype, self.output_codec.name,
+                            addcolor=self.color, time_format=self.display_time_format,
+                            float_precision=self.display_float_precision)
+
     def report_connection(self):
         self.show_host()
         self.show_version()
@@ -562,43 +673,32 @@ class Shell(cmd.Cmd):
         self.printout("")
 
     def print_static_result(self):
-        # first pass, get widths
-        widths = defaultdict(lambda: 0)
-        for row in self.cursor:
-            for desc, value in zip(self.cursor.description, row):
-                name = desc[0]
-                widths[name] = max(widths[name], len(str(name)), len(str(value)))
-        self.cursor._reset()
+        colnames, coltypes = zip(*self.cursor.description)[:2]
+        formatted_data = [map(self.myformat_value, row, coltypes) for row in self.cursor]
+
+        # determine column widths
+        widths = map(len, colnames)
+        for fmtrow in formatted_data:
+            for num, col in enumerate(fmtrow):
+                widths[num] = max(widths[num], len(col.strval))
 
         # print header
-        for desc in self.cursor.description:
-            name = desc[0]
-            width = widths[name]
-            self.printout(" ", newline=False)
-            self.printout(string.rjust(str(name), width), MAGENTA, False)
-            self.printout(" |", newline=False)
-        self.printout("")
+        header = ' | '.join(self.applycolor(name.ljust(w), MAGENTA) for (name, w) in zip(colnames, widths))
+        print ' ' + header.rstrip()
+        print '-%s-' % '-+-'.join('-' * w for w in widths)
 
         # print row data
-        for row in self.cursor:
-            for desc, value in zip(self.cursor.description, row):
-                name = desc[0]
-                width = widths[desc[0]]
-                self.printout(" ", newline=False)
-                self.printout(string.rjust(str(value), width), YELLOW, False)
-                self.printout(" |", newline=False)
-            self.printout("")
+        for row in formatted_data:
+            line = ' | '.join(col.color_rjust(w) for (col, w) in zip(row, widths))
+            print ' ' + line
 
     def print_dynamic_result(self):
         for row in self.cursor:
-            self.printout(" ", newline=False)
-            for desc, value in zip(self.cursor.description, row):
-                name = desc[0]
-                self.printout(str(name), MAGENTA, False)
-                self.printout(",", newline=False)
-                self.printout(str(value), YELLOW, False)
-                self.printout(" | ", newline=False)
-            self.printout("")
+            colnames, coltypes = zip(*self.cursor.description)[:2]
+            colnames = [self.applycolor(name, MAGENTA) for name in colnames]
+            colvals = [self.myformat_value(val, casstype) for (val, casstype) in zip(row, coltypes)]
+            line = ' | '.join(name + ',' + col.coloredval for (col, name) in zip(colvals, colnames))
+            print ' ' + line
 
     def emptyline(self):
         pass
@@ -946,10 +1046,109 @@ class Shell(cmd.Cmd):
             cmd.Cmd.do_help(self, cql_dequote(t).lower())
 
     def help_types(self):
-        self.printout("CQL types recognized by this version of cqlsh:\n")
+        self.printout("\n        CQL types recognized by this version of cqlsh:\n")
         for t in cqlhandling.cql_types:
-            self.printout('  ' + t)
-        self.printout('')
+            self.printout('          ' + t)
+        print """
+        For information on the various recognizable input formats for these
+        types, or on controlling the formatting of cqlsh query output, see
+        one of the following topics:
+
+          HELP TIMESTAMP_INPUT
+          HELP BLOB_INPUT
+          HELP UUID_INPUT
+          HELP BOOLEAN_INPUT
+
+          HELP TEXT_OUTPUT
+          HELP TIMESTAMP_OUTPUT
+        """
+
+    def help_timestamp_input(self):
+        print """
+        Timestamp input
+
+        CQL supports any of the following ISO 8601 formats for timestamp
+        specification:
+
+          yyyy-mm-dd HH:mm
+          yyyy-mm-dd HH:mm:ss
+          yyyy-mm-dd HH:mmZ
+          yyyy-mm-dd HH:mm:ssZ
+          yyyy-mm-dd'T'HH:mm
+          yyyy-mm-dd'T'HH:mmZ
+          yyyy-mm-dd'T'HH:mm:ss
+          yyyy-mm-dd'T'HH:mm:ssZ
+          yyyy-mm-dd
+          yyyy-mm-ddZ
+
+        The Z in these formats refers to an RFC-822 4-digit time zone,
+        expressing the time zone's difference from UTC. For example, a
+        timestamp in Pacific Standard Time might be given thus:
+
+          2012-01-20 16:14:12-0800
+
+        If no time zone is supplied, the current time zone for the Cassandra
+        server node will be used.
+        """
+
+    def help_blob_input(self):
+        print """
+        Blob input
+
+        CQL blob data must be specified in a string literal as hexidecimal
+        data. Example: to store the ASCII values for the characters in the
+        string "CQL", use '43514c'.
+        """
+
+    def help_uuid_input(self):
+        print """
+        UUID input
+
+        UUIDs may be specified in CQL using 32 hexidecimal characters,
+        split up using dashes in the standard UUID format:
+
+          XXXXXXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX
+        """
+
+    def help_boolean_input(self):
+        print """
+        Boolean input
+
+        CQL accepts the strings 'true' and 'false' (case insensitive)
+        as input for boolean types.
+        """
+
+    def help_text_output(self):
+        print """
+        Textual output
+
+        When control characters, or other characters which can't be encoded
+        in your current locale, are found in values of 'text' or 'ascii'
+        types, it will be shown as a backslash escape. If color is enabled,
+        any such backslash escapes will be shown in a different color from
+        the surrounding text.
+
+        Unicode code points in your data will be output intact, if the
+        encoding for your locale is capable of decoding them. If you prefer
+        that non-ascii characters be shown with Python-style "\\uABCD"
+        escape sequences, invoke cqlsh with an ASCII locale (for example,
+        by setting the $LANG environment variable to "C").
+        """
+
+    help_ascii_output = help_text_output
+
+    def help_timestamp_output(self):
+        print """
+        Timestamp output
+
+        Cqlsh will display timestamps in this format:
+
+          yyyy-mm-dd HH:mm:ssZ
+
+        which is a format acceptable as CQL timestamp input as well. It is
+        planned that cqlsh should allow the user to change that output format
+        if desired, but that feature is not yet available.
+        """
 
     def help_select_expr(self):
         print """
@@ -1451,17 +1650,31 @@ class Shell(cmd.Cmd):
         according to a certain type.
         """
 
-    def printout(self, text, color=None, newline=True, out=sys.stdout):
+    def help_alter_with(self):
+        print """
+        ALTER COLUMNFAMILY: changing column family properties
+
+          ALTER COLUMNFAMILY addamsFamily WITH comment = 'Glad to be here!'
+                                           AND read_repair_chance = 0.2;
+
+        An ALTER COLUMNFAMILY ... WITH statement makes adjustments to the
+        column family properties, as defined when the column family was created
+        (see HELP CREATE_COLUMNFAMILY_OPTIONS, and your Cassandra documentation
+        for information about the supported parameter names and values).
+        """
+
+    def applycolor(self, text, color=None):
         if not color or not self.color:
-            out.write(text)
-        else:
-            out.write(color % text)
+            return text
+        return color + text + ANSI_RESET
 
-        if newline:
-            out.write("\n")
+    def printout(self, text, color=None, newline=True, out=None):
+        if out is None:
+            out = sys.stdout
+        out.write(self.applycolor(str(text), color) + ('\n' if newline else ''))
 
     def printerr(self, text, color=RED, newline=True):
-        self.printout(text, color, newline, sys.stderr)
+        self.printout(text, color, newline=newline, out=sys.stderr)
 
     def add_assumption(self, ksname, cfname, colname, valtype, valclass):
         try:

http://git-wip-us.apache.org/repos/asf/cassandra/blob/5850178c/pylib/cqlshlib/wcwidth.py
----------------------------------------------------------------------
diff --git a/pylib/cqlshlib/wcwidth.py b/pylib/cqlshlib/wcwidth.py
new file mode 100644
index 0000000..43a4aca
--- /dev/null
+++ b/pylib/cqlshlib/wcwidth.py
@@ -0,0 +1,367 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# adapted from http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c
+# -thepaul
+
+# This is an implementation of wcwidth() and wcswidth() (defined in
+# IEEE Std 1002.1-2001) for Unicode.
+#
+# http://www.opengroup.org/onlinepubs/007904975/functions/wcwidth.html
+# http://www.opengroup.org/onlinepubs/007904975/functions/wcswidth.html
+#
+# In fixed-width output devices, Latin characters all occupy a single
+# "cell" position of equal width, whereas ideographic CJK characters
+# occupy two such cells. Interoperability between terminal-line
+# applications and (teletype-style) character terminals using the
+# UTF-8 encoding requires agreement on which character should advance
+# the cursor by how many cell positions. No established formal
+# standards exist at present on which Unicode character shall occupy
+# how many cell positions on character terminals. These routines are
+# a first attempt of defining such behavior based on simple rules
+# applied to data provided by the Unicode Consortium.
+#
+# For some graphical characters, the Unicode standard explicitly
+# defines a character-cell width via the definition of the East Asian
+# FullWidth (F), Wide (W), Half-width (H), and Narrow (Na) classes.
+# In all these cases, there is no ambiguity about which width a
+# terminal shall use. For characters in the East Asian Ambiguous (A)
+# class, the width choice depends purely on a preference of backward
+# compatibility with either historic CJK or Western practice.
+# Choosing single-width for these characters is easy to justify as
+# the appropriate long-term solution, as the CJK practice of
+# displaying these characters as double-width comes from historic
+# implementation simplicity (8-bit encoded characters were displayed
+# single-width and 16-bit ones double-width, even for Greek,
+# Cyrillic, etc.) and not any typographic considerations.
+#
+# Much less clear is the choice of width for the Not East Asian
+# (Neutral) class. Existing practice does not dictate a width for any
+# of these characters. It would nevertheless make sense
+# typographically to allocate two character cells to characters such
+# as for instance EM SPACE or VOLUME INTEGRAL, which cannot be
+# represented adequately with a single-width glyph. The following
+# routines at present merely assign a single-cell width to all
+# neutral characters, in the interest of simplicity. This is not
+# entirely satisfactory and should be reconsidered before
+# establishing a formal standard in this area. At the moment, the
+# decision which Not East Asian (Neutral) characters should be
+# represented by double-width glyphs cannot yet be answered by
+# applying a simple rule from the Unicode database content. Setting
+# up a proper standard for the behavior of UTF-8 character terminals
+# will require a careful analysis not only of each Unicode character,
+# but also of each presentation form, something the author of these
+# routines has avoided to do so far.
+#
+# http://www.unicode.org/unicode/reports/tr11/
+#
+# Markus Kuhn -- 2007-05-26 (Unicode 5.0)
+#
+# Permission to use, copy, modify, and distribute this software
+# for any purpose and without fee is hereby granted. The author
+# disclaims all warranties with regard to this software.
+#
+# Latest C version: http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c
+
+# auxiliary function for binary search in interval table
+def bisearch(ucs, table):
+  min = 0
+  max = len(table) - 1
+  if ucs < table[0][0] or ucs > table[max][1]:
+    return 0
+  while max >= min:
+    mid = (min + max) / 2
+    if ucs > table[mid][1]:
+      min = mid + 1
+    elif ucs < table[mid][0]:
+      max = mid - 1
+    else:
+      return 1
+  return 0
+
+# The following two functions define the column width of an ISO 10646
+# character as follows:
+#
+#    - The null character (U+0000) has a column width of 0.
+#
+#    - Other C0/C1 control characters and DEL will lead to a return
+#      value of -1.
+#
+#    - Non-spacing and enclosing combining characters (general
+#      category code Mn or Me in the Unicode database) have a
+#      column width of 0.
+#
+#    - SOFT HYPHEN (U+00AD) has a column width of 1.
+#
+#    - Other format characters (general category code Cf in the Unicode
+#      database) and ZERO WIDTH SPACE (U+200B) have a column width of 0.
+#
+#    - Hangul Jamo medial vowels and final consonants (U+1160-U+11FF)
+#      have a column width of 0.
+#
+#    - Spacing characters in the East Asian Wide (W) or East Asian
+#      Full-width (F) category as defined in Unicode Technical
+#      Report #11 have a column width of 2.
+#
+#    - All remaining characters (including all printable
+#      ISO 8859-1 and WGL4 characters, Unicode control characters,
+#      etc.) have a column width of 1.
+#
+# This implementation assumes that wchar_t characters are encoded
+# in ISO 10646.
+
+# sorted list of non-overlapping intervals of non-spacing characters
+# generated by "uniset +cat=Me +cat=Mn +cat=Cf -00AD +1160-11FF +200B c"
+combining = (
+  ( 0x0300, 0x036F ), ( 0x0483, 0x0486 ), ( 0x0488, 0x0489 ),
+  ( 0x0591, 0x05BD ), ( 0x05BF, 0x05BF ), ( 0x05C1, 0x05C2 ),
+  ( 0x05C4, 0x05C5 ), ( 0x05C7, 0x05C7 ), ( 0x0600, 0x0603 ),
+  ( 0x0610, 0x0615 ), ( 0x064B, 0x065E ), ( 0x0670, 0x0670 ),
+  ( 0x06D6, 0x06E4 ), ( 0x06E7, 0x06E8 ), ( 0x06EA, 0x06ED ),
+  ( 0x070F, 0x070F ), ( 0x0711, 0x0711 ), ( 0x0730, 0x074A ),
+  ( 0x07A6, 0x07B0 ), ( 0x07EB, 0x07F3 ), ( 0x0901, 0x0902 ),
+  ( 0x093C, 0x093C ), ( 0x0941, 0x0948 ), ( 0x094D, 0x094D ),
+  ( 0x0951, 0x0954 ), ( 0x0962, 0x0963 ), ( 0x0981, 0x0981 ),
+  ( 0x09BC, 0x09BC ), ( 0x09C1, 0x09C4 ), ( 0x09CD, 0x09CD ),
+  ( 0x09E2, 0x09E3 ), ( 0x0A01, 0x0A02 ), ( 0x0A3C, 0x0A3C ),
+  ( 0x0A41, 0x0A42 ), ( 0x0A47, 0x0A48 ), ( 0x0A4B, 0x0A4D ),
+  ( 0x0A70, 0x0A71 ), ( 0x0A81, 0x0A82 ), ( 0x0ABC, 0x0ABC ),
+  ( 0x0AC1, 0x0AC5 ), ( 0x0AC7, 0x0AC8 ), ( 0x0ACD, 0x0ACD ),
+  ( 0x0AE2, 0x0AE3 ), ( 0x0B01, 0x0B01 ), ( 0x0B3C, 0x0B3C ),
+  ( 0x0B3F, 0x0B3F ), ( 0x0B41, 0x0B43 ), ( 0x0B4D, 0x0B4D ),
+  ( 0x0B56, 0x0B56 ), ( 0x0B82, 0x0B82 ), ( 0x0BC0, 0x0BC0 ),
+  ( 0x0BCD, 0x0BCD ), ( 0x0C3E, 0x0C40 ), ( 0x0C46, 0x0C48 ),
+  ( 0x0C4A, 0x0C4D ), ( 0x0C55, 0x0C56 ), ( 0x0CBC, 0x0CBC ),
+  ( 0x0CBF, 0x0CBF ), ( 0x0CC6, 0x0CC6 ), ( 0x0CCC, 0x0CCD ),
+  ( 0x0CE2, 0x0CE3 ), ( 0x0D41, 0x0D43 ), ( 0x0D4D, 0x0D4D ),
+  ( 0x0DCA, 0x0DCA ), ( 0x0DD2, 0x0DD4 ), ( 0x0DD6, 0x0DD6 ),
+  ( 0x0E31, 0x0E31 ), ( 0x0E34, 0x0E3A ), ( 0x0E47, 0x0E4E ),
+  ( 0x0EB1, 0x0EB1 ), ( 0x0EB4, 0x0EB9 ), ( 0x0EBB, 0x0EBC ),
+  ( 0x0EC8, 0x0ECD ), ( 0x0F18, 0x0F19 ), ( 0x0F35, 0x0F35 ),
+  ( 0x0F37, 0x0F37 ), ( 0x0F39, 0x0F39 ), ( 0x0F71, 0x0F7E ),
+  ( 0x0F80, 0x0F84 ), ( 0x0F86, 0x0F87 ), ( 0x0F90, 0x0F97 ),
+  ( 0x0F99, 0x0FBC ), ( 0x0FC6, 0x0FC6 ), ( 0x102D, 0x1030 ),
+  ( 0x1032, 0x1032 ), ( 0x1036, 0x1037 ), ( 0x1039, 0x1039 ),
+  ( 0x1058, 0x1059 ), ( 0x1160, 0x11FF ), ( 0x135F, 0x135F ),
+  ( 0x1712, 0x1714 ), ( 0x1732, 0x1734 ), ( 0x1752, 0x1753 ),
+  ( 0x1772, 0x1773 ), ( 0x17B4, 0x17B5 ), ( 0x17B7, 0x17BD ),
+  ( 0x17C6, 0x17C6 ), ( 0x17C9, 0x17D3 ), ( 0x17DD, 0x17DD ),
+  ( 0x180B, 0x180D ), ( 0x18A9, 0x18A9 ), ( 0x1920, 0x1922 ),
+  ( 0x1927, 0x1928 ), ( 0x1932, 0x1932 ), ( 0x1939, 0x193B ),
+  ( 0x1A17, 0x1A18 ), ( 0x1B00, 0x1B03 ), ( 0x1B34, 0x1B34 ),
+  ( 0x1B36, 0x1B3A ), ( 0x1B3C, 0x1B3C ), ( 0x1B42, 0x1B42 ),
+  ( 0x1B6B, 0x1B73 ), ( 0x1DC0, 0x1DCA ), ( 0x1DFE, 0x1DFF ),
+  ( 0x200B, 0x200F ), ( 0x202A, 0x202E ), ( 0x2060, 0x2063 ),
+  ( 0x206A, 0x206F ), ( 0x20D0, 0x20EF ), ( 0x302A, 0x302F ),
+  ( 0x3099, 0x309A ), ( 0xA806, 0xA806 ), ( 0xA80B, 0xA80B ),
+  ( 0xA825, 0xA826 ), ( 0xFB1E, 0xFB1E ), ( 0xFE00, 0xFE0F ),
+  ( 0xFE20, 0xFE23 ), ( 0xFEFF, 0xFEFF ), ( 0xFFF9, 0xFFFB ),
+  ( 0x10A01, 0x10A03 ), ( 0x10A05, 0x10A06 ), ( 0x10A0C, 0x10A0F ),
+  ( 0x10A38, 0x10A3A ), ( 0x10A3F, 0x10A3F ), ( 0x1D167, 0x1D169 ),
+  ( 0x1D173, 0x1D182 ), ( 0x1D185, 0x1D18B ), ( 0x1D1AA, 0x1D1AD ),
+  ( 0x1D242, 0x1D244 ), ( 0xE0001, 0xE0001 ), ( 0xE0020, 0xE007F ),
+  ( 0xE0100, 0xE01EF )
+)
+
+# sorted list of non-overlapping intervals of East Asian Ambiguous
+# characters, generated by "uniset +WIDTH-A -cat=Me -cat=Mn -cat=Cf c"
+ambiguous = (
+  ( 0x00A1, 0x00A1 ), ( 0x00A4, 0x00A4 ), ( 0x00A7, 0x00A8 ),
+  ( 0x00AA, 0x00AA ), ( 0x00AE, 0x00AE ), ( 0x00B0, 0x00B4 ),
+  ( 0x00B6, 0x00BA ), ( 0x00BC, 0x00BF ), ( 0x00C6, 0x00C6 ),
+  ( 0x00D0, 0x00D0 ), ( 0x00D7, 0x00D8 ), ( 0x00DE, 0x00E1 ),
+  ( 0x00E6, 0x00E6 ), ( 0x00E8, 0x00EA ), ( 0x00EC, 0x00ED ),
+  ( 0x00F0, 0x00F0 ), ( 0x00F2, 0x00F3 ), ( 0x00F7, 0x00FA ),
+  ( 0x00FC, 0x00FC ), ( 0x00FE, 0x00FE ), ( 0x0101, 0x0101 ),
+  ( 0x0111, 0x0111 ), ( 0x0113, 0x0113 ), ( 0x011B, 0x011B ),
+  ( 0x0126, 0x0127 ), ( 0x012B, 0x012B ), ( 0x0131, 0x0133 ),
+  ( 0x0138, 0x0138 ), ( 0x013F, 0x0142 ), ( 0x0144, 0x0144 ),
+  ( 0x0148, 0x014B ), ( 0x014D, 0x014D ), ( 0x0152, 0x0153 ),
+  ( 0x0166, 0x0167 ), ( 0x016B, 0x016B ), ( 0x01CE, 0x01CE ),
+  ( 0x01D0, 0x01D0 ), ( 0x01D2, 0x01D2 ), ( 0x01D4, 0x01D4 ),
+  ( 0x01D6, 0x01D6 ), ( 0x01D8, 0x01D8 ), ( 0x01DA, 0x01DA ),
+  ( 0x01DC, 0x01DC ), ( 0x0251, 0x0251 ), ( 0x0261, 0x0261 ),
+  ( 0x02C4, 0x02C4 ), ( 0x02C7, 0x02C7 ), ( 0x02C9, 0x02CB ),
+  ( 0x02CD, 0x02CD ), ( 0x02D0, 0x02D0 ), ( 0x02D8, 0x02DB ),
+  ( 0x02DD, 0x02DD ), ( 0x02DF, 0x02DF ), ( 0x0391, 0x03A1 ),
+  ( 0x03A3, 0x03A9 ), ( 0x03B1, 0x03C1 ), ( 0x03C3, 0x03C9 ),
+  ( 0x0401, 0x0401 ), ( 0x0410, 0x044F ), ( 0x0451, 0x0451 ),
+  ( 0x2010, 0x2010 ), ( 0x2013, 0x2016 ), ( 0x2018, 0x2019 ),
+  ( 0x201C, 0x201D ), ( 0x2020, 0x2022 ), ( 0x2024, 0x2027 ),
+  ( 0x2030, 0x2030 ), ( 0x2032, 0x2033 ), ( 0x2035, 0x2035 ),
+  ( 0x203B, 0x203B ), ( 0x203E, 0x203E ), ( 0x2074, 0x2074 ),
+  ( 0x207F, 0x207F ), ( 0x2081, 0x2084 ), ( 0x20AC, 0x20AC ),
+  ( 0x2103, 0x2103 ), ( 0x2105, 0x2105 ), ( 0x2109, 0x2109 ),
+  ( 0x2113, 0x2113 ), ( 0x2116, 0x2116 ), ( 0x2121, 0x2122 ),
+  ( 0x2126, 0x2126 ), ( 0x212B, 0x212B ), ( 0x2153, 0x2154 ),
+  ( 0x215B, 0x215E ), ( 0x2160, 0x216B ), ( 0x2170, 0x2179 ),
+  ( 0x2190, 0x2199 ), ( 0x21B8, 0x21B9 ), ( 0x21D2, 0x21D2 ),
+  ( 0x21D4, 0x21D4 ), ( 0x21E7, 0x21E7 ), ( 0x2200, 0x2200 ),
+  ( 0x2202, 0x2203 ), ( 0x2207, 0x2208 ), ( 0x220B, 0x220B ),
+  ( 0x220F, 0x220F ), ( 0x2211, 0x2211 ), ( 0x2215, 0x2215 ),
+  ( 0x221A, 0x221A ), ( 0x221D, 0x2220 ), ( 0x2223, 0x2223 ),
+  ( 0x2225, 0x2225 ), ( 0x2227, 0x222C ), ( 0x222E, 0x222E ),
+  ( 0x2234, 0x2237 ), ( 0x223C, 0x223D ), ( 0x2248, 0x2248 ),
+  ( 0x224C, 0x224C ), ( 0x2252, 0x2252 ), ( 0x2260, 0x2261 ),
+  ( 0x2264, 0x2267 ), ( 0x226A, 0x226B ), ( 0x226E, 0x226F ),
+  ( 0x2282, 0x2283 ), ( 0x2286, 0x2287 ), ( 0x2295, 0x2295 ),
+  ( 0x2299, 0x2299 ), ( 0x22A5, 0x22A5 ), ( 0x22BF, 0x22BF ),
+  ( 0x2312, 0x2312 ), ( 0x2460, 0x24E9 ), ( 0x24EB, 0x254B ),
+  ( 0x2550, 0x2573 ), ( 0x2580, 0x258F ), ( 0x2592, 0x2595 ),
+  ( 0x25A0, 0x25A1 ), ( 0x25A3, 0x25A9 ), ( 0x25B2, 0x25B3 ),
+  ( 0x25B6, 0x25B7 ), ( 0x25BC, 0x25BD ), ( 0x25C0, 0x25C1 ),
+  ( 0x25C6, 0x25C8 ), ( 0x25CB, 0x25CB ), ( 0x25CE, 0x25D1 ),
+  ( 0x25E2, 0x25E5 ), ( 0x25EF, 0x25EF ), ( 0x2605, 0x2606 ),
+  ( 0x2609, 0x2609 ), ( 0x260E, 0x260F ), ( 0x2614, 0x2615 ),
+  ( 0x261C, 0x261C ), ( 0x261E, 0x261E ), ( 0x2640, 0x2640 ),
+  ( 0x2642, 0x2642 ), ( 0x2660, 0x2661 ), ( 0x2663, 0x2665 ),
+  ( 0x2667, 0x266A ), ( 0x266C, 0x266D ), ( 0x266F, 0x266F ),
+  ( 0x273D, 0x273D ), ( 0x2776, 0x277F ), ( 0xE000, 0xF8FF ),
+  ( 0xFFFD, 0xFFFD ), ( 0xF0000, 0xFFFFD ), ( 0x100000, 0x10FFFD )
+)
+
+def mk_wcwidth(ucs):
+  # test for 8-bit control characters
+  if ucs == 0:
+    return 0
+  if ucs < 32 or (ucs >= 0x7f and ucs < 0xa0):
+    return -1
+
+  # binary search in table of non-spacing characters
+  if bisearch(ucs, combining):
+    return 0
+
+  # if we arrive here, ucs is not a combining or C0/C1 control character
+
+  return 1 + \
+    int(ucs >= 0x1100 and
+        (ucs <= 0x115f or                     # Hangul Jamo init. consonants
+         ucs == 0x2329 or ucs == 0x232a or
+         (ucs >= 0x2e80 and ucs <= 0xa4cf and
+          ucs != 0x303f) or                   # CJK ... Yi
+         (ucs >= 0xac00 and ucs <= 0xd7a3) or # Hangul Syllables
+         (ucs >= 0xf900 and ucs <= 0xfaff) or # CJK Compatibility Ideographs
+         (ucs >= 0xfe10 and ucs <= 0xfe19) or # Vertical forms
+         (ucs >= 0xfe30 and ucs <= 0xfe6f) or # CJK Compatibility Forms
+         (ucs >= 0xff00 and ucs <= 0xff60) or # Fullwidth Forms
+         (ucs >= 0xffe0 and ucs <= 0xffe6) or
+         (ucs >= 0x20000 and ucs <= 0x2fffd) or
+         (ucs >= 0x30000 and ucs <= 0x3fffd)))
+
+
+def mk_wcswidth(pwcs):
+  width = 0
+  for c in pwcs:
+    w = mk_wcwidth(c)
+    if w < 0:
+      return -1
+    else:
+      width += w
+
+  return width
+
+
+# The following functions are the same as mk_wcwidth() and
+# mk_wcswidth(), except that spacing characters in the East Asian
+# Ambiguous (A) category as defined in Unicode Technical Report #11
+# have a column width of 2. This variant might be useful for users of
+# CJK legacy encodings who want to migrate to UCS without changing
+# the traditional terminal character-width behaviour. It is not
+# otherwise recommended for general use.
+def mk_wcwidth_cjk(ucs):
+  # binary search in table of non-spacing characters
+  if bisearch(ucs, ambiguous):
+    return 2
+
+  return mk_wcwidth(ucs)
+
+
+def mk_wcswidth_cjk(pwcs):
+  width = 0
+
+  for c in pwcs:
+    w = mk_wcwidth_cjk(c)
+    if w < 0:
+      return -1
+    width += w
+
+  return width
+
+# python-y versions, dealing with unicode objects
+def wcwidth(c):
+    return mk_wcwidth(ord(c))
+
+def wcswidth(s):
+    return mk_wcswidth(map(ord, s))
+
+def wcwidth_cjk(c):
+    return mk_wcwidth_cjk(ord(c))
+
+def wcswidth_cjk(s):
+    return mk_wcswidth_cjk(map(ord, s))
+
+if __name__ == "__main__":
+    samples = (
+        ('MUSIC SHARP SIGN', 1),
+        ('FULLWIDTH POUND SIGN', 2),
+        ('FULLWIDTH LATIN CAPITAL LETTER P', 2),
+        ('CJK RADICAL BOLT OF CLOTH', 2),
+        ('LATIN SMALL LETTER A', 1),
+        ('LATIN SMALL LETTER AE', 1),
+        ('SPACE', 1),
+        ('NO-BREAK SPACE', 1),
+        ('CJK COMPATIBILITY IDEOGRAPH-F920', 2),
+        ('MALAYALAM VOWEL SIGN UU', 0),
+        ('ZERO WIDTH SPACE', 0),
+        ('ZERO WIDTH NO-BREAK SPACE', 0),
+        ('COMBINING PALATALIZED HOOK BELOW', 0),
+        ('COMBINING GRAVE ACCENT', 0),
+    )
+    nonprinting = u'\r\n\t\a\b\f\v\x7f'
+
+    import unicodedata
+
+    for name, printwidth in samples:
+        uchr = unicodedata.lookup(name)
+        calculatedwidth = wcwidth(uchr)
+        assert calculatedwidth == printwidth, \
+                'width for %r should be %d, but is %d?' % (uchr, printwidth, calculatedwidth)
+
+    for c in nonprinting:
+        calculatedwidth = wcwidth(c)
+        assert calculatedwidth < 0, \
+                '%r is a control character, but wcwidth gives %d' % (c, calculatedwidth)
+
+    assert wcwidth('\0') == 0  # special case
+
+    # depending on how python is compiled, code points above U+FFFF may not be
+    # treated as single characters, so ord() won't work. test a few of these
+    # manually.
+
+    assert mk_wcwidth(0xe01ef) == 0
+    assert mk_wcwidth(0x10ffff) == 1
+    assert mk_wcwidth(0x3fffd) == 2
+
+    teststr = u'B\0ig br\u00f8wn moose\ub143\u200b'
+    calculatedwidth = wcswidth(teststr)
+    assert calculatedwidth == 17, 'expected 17, got %d' % calculatedwidth
+
+    calculatedwidth = wcswidth_cjk(teststr)
+    assert calculatedwidth == 18, 'expected 18, got %d' % calculatedwidth
+
+    assert wcswidth(u'foobar\u200b\a') < 0
+
+    print 'tests pass.'


Mime
View raw message