Return-Path: X-Original-To: apmail-cassandra-commits-archive@www.apache.org Delivered-To: apmail-cassandra-commits-archive@www.apache.org Received: from mail.apache.org (hermes.apache.org [140.211.11.3]) by minotaur.apache.org (Postfix) with SMTP id 7EBFD17F76 for ; Thu, 29 Jan 2015 20:38:58 +0000 (UTC) Received: (qmail 64246 invoked by uid 500); 29 Jan 2015 20:38:58 -0000 Delivered-To: apmail-cassandra-commits-archive@cassandra.apache.org Received: (qmail 64205 invoked by uid 500); 29 Jan 2015 20:38:58 -0000 Mailing-List: contact commits-help@cassandra.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: dev@cassandra.apache.org Delivered-To: mailing list commits@cassandra.apache.org Received: (qmail 64191 invoked by uid 99); 29 Jan 2015 20:38:58 -0000 Received: from git1-us-west.apache.org (HELO git1-us-west.apache.org) (140.211.11.23) by apache.org (qpsmtpd/0.29) with ESMTP; Thu, 29 Jan 2015 20:38:58 +0000 Received: by git1-us-west.apache.org (ASF Mail Server at git1-us-west.apache.org, from userid 33) id 7E44CE055B; Thu, 29 Jan 2015 20:38:58 +0000 (UTC) Content-Type: text/plain; charset="us-ascii" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit From: tylerhobbs@apache.org To: commits@cassandra.apache.org Message-Id: <146e5b7d9fef4f4f97580b6528c358f9@git.apache.org> X-Mailer: ASF-Git Admin Mailer Subject: cassandra git commit: cqlsh: handle unicode BOM at the start of files Date: Thu, 29 Jan 2015 20:38:58 +0000 (UTC) Repository: cassandra Updated Branches: refs/heads/cassandra-2.1 1e5a0e19a -> c49f6666e cqlsh: handle unicode BOM at the start of files Patch by Abhishek Gupta; reviewed by Tyler Hobbs for CASSANDRA-8638 Project: http://git-wip-us.apache.org/repos/asf/cassandra/repo Commit: http://git-wip-us.apache.org/repos/asf/cassandra/commit/c49f6666 Tree: http://git-wip-us.apache.org/repos/asf/cassandra/tree/c49f6666 Diff: http://git-wip-us.apache.org/repos/asf/cassandra/diff/c49f6666 Branch: refs/heads/cassandra-2.1 Commit: c49f6666e457debebc3cfe935412a2b5306208a3 Parents: 1e5a0e1 Author: Abhishek Gupta Authored: Thu Jan 29 14:37:55 2015 -0600 Committer: Tyler Hobbs Committed: Thu Jan 29 14:37:55 2015 -0600 ---------------------------------------------------------------------- CHANGES.txt | 1 + bin/cqlsh | 10 +++++++--- pylib/cqlshlib/util.py | 29 +++++++++++++++++++++++++++++ 3 files changed, 37 insertions(+), 3 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/cassandra/blob/c49f6666/CHANGES.txt ---------------------------------------------------------------------- diff --git a/CHANGES.txt b/CHANGES.txt index fce4898..a8f8b87 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -1,4 +1,5 @@ 2.1.3 + * (cqlsh) Handle unicode BOM at start of files (CASSANDRA-8638) * Stop compactions before exiting offline tools (CASSANDRA-8623) * Update tools/stress/README.txt to match current behaviour (CASSANDRA-7933) * Fix schema from Thrift conversion with empty metadata (CASSANDRA-8695) http://git-wip-us.apache.org/repos/asf/cassandra/blob/c49f6666/bin/cqlsh ---------------------------------------------------------------------- diff --git a/bin/cqlsh b/bin/cqlsh index 0254fad..26296a3 100755 --- a/bin/cqlsh +++ b/bin/cqlsh @@ -121,7 +121,7 @@ from cqlshlib import cqlhandling, cql3handling, pylexotron, sslhandling, async_i from cqlshlib.displaying import (RED, BLUE, CYAN, ANSI_RESET, COLUMN_NAME_COLORS, FormattedValue, colorme) from cqlshlib.formatting import format_by_type, formatter_for, format_value_utype -from cqlshlib.util import trim_if_present +from cqlshlib.util import trim_if_present, get_file_encoding_bomsize from cqlshlib.tracing import print_trace_session, print_trace DEFAULT_HOST = '127.0.0.1' @@ -1601,7 +1601,9 @@ class Shell(cmd.Cmd): fname = parsed.get_binding('fname') fname = os.path.expanduser(self.cql_unprotect_value(fname)) try: - f = open(fname, 'r') + encoding, bom_size = get_file_encoding_bomsize(fname) + f = codecs.open(fname, 'r', encoding) + f.seek(bom_size) except IOError, e: self.printerr('Could not open %r: %s' % (fname, e)) return @@ -2013,7 +2015,9 @@ def main(options, hostname, port): stdin = None else: try: - stdin = open(options.file, 'r') + encoding, bom_size = get_file_encoding_bomsize(options.file) + stdin = codecs.open(options.file, 'r', encoding) + stdin.seek(bom_size) except IOError, e: sys.exit("Can't open %r: %s" % (options.file, e)) http://git-wip-us.apache.org/repos/asf/cassandra/blob/c49f6666/pylib/cqlshlib/util.py ---------------------------------------------------------------------- diff --git a/pylib/cqlshlib/util.py b/pylib/cqlshlib/util.py index e62ded4..bc58c8b 100644 --- a/pylib/cqlshlib/util.py +++ b/pylib/cqlshlib/util.py @@ -14,8 +14,10 @@ # See the License for the specific language governing permissions and # limitations under the License. +import codecs from itertools import izip + def split_list(items, pred): """ Split up a list (or other iterable) on the elements which satisfy the @@ -36,6 +38,7 @@ def split_list(items, pred): results.append(thisresult) return results + def find_common_prefix(strs): """ Given a list (iterable) of strings, return the longest common prefix. @@ -54,6 +57,7 @@ def find_common_prefix(strs): break return ''.join(common) + def list_bifilter(pred, iterable): """ Filter an iterable into two output lists: the first containing all @@ -70,10 +74,35 @@ def list_bifilter(pred, iterable): (yes_s if pred(i) else no_s).append(i) return yes_s, no_s + def identity(x): return x + def trim_if_present(s, prefix): if s.startswith(prefix): return s[len(prefix):] return s + + +def get_file_encoding_bomsize(filename): + """ + Checks the beginning of a file for a Unicode BOM. Based on this check, + the encoding that should be used to open the file and the number of + bytes that should be skipped (to skip the BOM) are returned. + """ + bom_encodings = ((codecs.BOM_UTF8, 'utf-8-sig'), + (codecs.BOM_UTF16_LE, 'utf-16le'), + (codecs.BOM_UTF16_BE, 'utf-16be'), + (codecs.BOM_UTF32_LE, 'utf-32be'), + (codecs.BOM_UTF32_BE, 'utf-32be')) + + firstbytes = open(filename, 'rb').read(4) + for bom, encoding in bom_encodings: + if firstbytes.startswith(bom): + file_encoding, size = encoding, len(bom) + break + else: + file_encoding, size = "ascii", 0 + + return (file_encoding, size)