Mailing-List: contact commits-help@cassandra.apache.org; run by ezmlm
Precedence: bulk
Reply-To: dev@cassandra.apache.org
Content-Type: text/plain; charset="us-ascii"
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
From: tylerhobbs@apache.org
To: commits@cassandra.apache.org
Message-Id: <146e5b7d9fef4f4f97580b6528c358f9@git.apache.org>
Subject: cassandra git commit: cqlsh: handle unicode BOM at the start of files
Date: Thu, 29 Jan 2015 20:38:58 +0000 (UTC)

Repository: cassandra
Updated Branches:
  refs/heads/cassandra-2.1 1e5a0e19a -> c49f6666e


cqlsh: handle unicode BOM at the start of files

Patch by Abhishek Gupta; reviewed by Tyler Hobbs for CASSANDRA-8638


Project: http://git-wip-us.apache.org/repos/asf/cassandra/repo
Commit: http://git-wip-us.apache.org/repos/asf/cassandra/commit/c49f6666
Tree: http://git-wip-us.apache.org/repos/asf/cassandra/tree/c49f6666
Diff: http://git-wip-us.apache.org/repos/asf/cassandra/diff/c49f6666

Branch: refs/heads/cassandra-2.1
Commit: c49f6666e457debebc3cfe935412a2b5306208a3
Parents: 1e5a0e1
Author: Abhishek Gupta <abhishek_gupta@persistent.com>
Authored: Thu Jan 29 14:37:55 2015 -0600
Committer: Tyler Hobbs <tyler@datastax.com>
Committed: Thu Jan 29 14:37:55 2015 -0600

----------------------------------------------------------------------
 CHANGES.txt            |  1 +
 bin/cqlsh              | 10 +++++++---
 pylib/cqlshlib/util.py | 29 +++++++++++++++++++++++++++++
 3 files changed, 37 insertions(+), 3 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/cassandra/blob/c49f6666/CHANGES.txt
----------------------------------------------------------------------
diff --git a/CHANGES.txt b/CHANGES.txt
index fce4898..a8f8b87 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,4 +1,5 @@
 2.1.3
+ * (cqlsh) Handle unicode BOM at start of files (CASSANDRA-8638)
  * Stop compactions before exiting offline tools (CASSANDRA-8623)
  * Update tools/stress/README.txt to match current behaviour (CASSANDRA-7933)
  * Fix schema from Thrift conversion with empty metadata (CASSANDRA-8695)

http://git-wip-us.apache.org/repos/asf/cassandra/blob/c49f6666/bin/cqlsh
----------------------------------------------------------------------
diff --git a/bin/cqlsh b/bin/cqlsh
index 0254fad..26296a3 100755
--- a/bin/cqlsh
+++ b/bin/cqlsh
@@ -121,7 +121,7 @@ from cqlshlib import cqlhandling, cql3handling, pylexotron, sslhandling, async_i
 from cqlshlib.displaying import (RED, BLUE, CYAN, ANSI_RESET, COLUMN_NAME_COLORS,
                                  FormattedValue, colorme)
 from cqlshlib.formatting import format_by_type, formatter_for, format_value_utype
-from cqlshlib.util import trim_if_present
+from cqlshlib.util import trim_if_present, get_file_encoding_bomsize
 from cqlshlib.tracing import print_trace_session, print_trace
 
 DEFAULT_HOST = '127.0.0.1'
@@ -1601,7 +1601,9 @@ class Shell(cmd.Cmd):
         fname = parsed.get_binding('fname')
         fname = os.path.expanduser(self.cql_unprotect_value(fname))
         try:
-            f = open(fname, 'r')
+            encoding, bom_size = get_file_encoding_bomsize(fname)
+            f = codecs.open(fname, 'r', encoding)
+            f.seek(bom_size)
         except IOError, e:
             self.printerr('Could not open %r: %s' % (fname, e))
             return
@@ -2013,7 +2015,9 @@ def main(options, hostname, port):
         stdin = None
     else:
         try:
-            stdin = open(options.file, 'r')
+            encoding, bom_size = get_file_encoding_bomsize(options.file)
+            stdin = codecs.open(options.file, 'r', encoding)
+            stdin.seek(bom_size)
         except IOError, e:
             sys.exit("Can't open %r: %s" % (options.file, e))
 

http://git-wip-us.apache.org/repos/asf/cassandra/blob/c49f6666/pylib/cqlshlib/util.py
----------------------------------------------------------------------
diff --git a/pylib/cqlshlib/util.py b/pylib/cqlshlib/util.py
index e62ded4..bc58c8b 100644
--- a/pylib/cqlshlib/util.py
+++ b/pylib/cqlshlib/util.py
@@ -14,8 +14,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import codecs
 from itertools import izip
 
+
 def split_list(items, pred):
     """
     Split up a list (or other iterable) on the elements which satisfy the
@@ -36,6 +38,7 @@ def split_list(items, pred):
             results.append(thisresult)
     return results
 
+
 def find_common_prefix(strs):
     """
     Given a list (iterable) of strings, return the longest common prefix.
@@ -54,6 +57,7 @@ def find_common_prefix(strs):
             break
     return ''.join(common)
 
+
 def list_bifilter(pred, iterable):
     """
     Filter an iterable into two output lists: the first containing all
@@ -70,10 +74,35 @@ def list_bifilter(pred, iterable):
         (yes_s if pred(i) else no_s).append(i)
     return yes_s, no_s
 
+
 def identity(x):
     return x
 
+
 def trim_if_present(s, prefix):
     if s.startswith(prefix):
         return s[len(prefix):]
     return s
+
+
+def get_file_encoding_bomsize(filename):
+    """
+    Checks the beginning of a file for a Unicode BOM.  Returns a tuple of
+    (encoding to open the file with, number of BOM bytes to skip), or
+    ("ascii", 0) if no BOM is found.  IOError from open() propagates.
+    """
+    # UTF-32 comes before UTF-16: BOM_UTF32_LE begins with BOM_UTF16_LE.
+    bom_encodings = ((codecs.BOM_UTF8, 'utf-8-sig'),
+                     (codecs.BOM_UTF32_LE, 'utf-32le'),
+                     (codecs.BOM_UTF32_BE, 'utf-32be'),
+                     (codecs.BOM_UTF16_LE, 'utf-16le'),
+                     (codecs.BOM_UTF16_BE, 'utf-16be'))
+
+    with open(filename, 'rb') as bomfile:  # close the handle promptly
+        firstbytes = bomfile.read(4)
+
+    for bom, encoding in bom_encodings:
+        if firstbytes.startswith(bom):
+            return (encoding, len(bom))
+
+    return ("ascii", 0)