spark-commits mailing list archives

From joshro...@apache.org
Subject git commit: [PySpark] [SPARK-2954] [SPARK-2948] [SPARK-2910] [SPARK-2101] Python 2.6 Fixes
Date Mon, 11 Aug 2014 18:54:53 GMT
Repository: spark
Updated Branches:
  refs/heads/branch-1.1 3def842d9 -> 09b8a3ce0


[PySpark] [SPARK-2954] [SPARK-2948] [SPARK-2910] [SPARK-2101] Python 2.6 Fixes

- Modify python/run-tests to test with Python 2.6.
- Use unittest2 when running on Python 2.6, whose bundled unittest lacks features the tests rely on.
- Fix namedtuple hijacking in serializers.py, which assumed Python 2.7's namedtuple() signature.
- Skip TestOutputFormat.test_newhadoop on Python 2.6 until SPARK-2951 is fixed.
- Fix MLlib _deserialize_double on Python 2.6, where struct.unpack() cannot read from a bytearray.
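
Most of these fixes hinge on the same idiom: detect the interpreter version at runtime and branch, instead of maintaining separate Python 2.6 code paths. A minimal sketch of the idiom (the constant name is ours, not the commit's):

    import sys

    # True on CPython 2.6 and earlier; each workaround below keys off this test.
    RUNNING_ON_PY26 = sys.version_info[:2] <= (2, 6)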

Closes #1868.  Closes #1042.

Author: Josh Rosen <joshrosen@apache.org>

Closes #1874 from JoshRosen/python2.6 and squashes the following commits:

983d259 [Josh Rosen] [SPARK-2954] Fix MLlib _deserialize_double on Python 2.6.
5d18fd7 [Josh Rosen] [SPARK-2948] [SPARK-2910] [SPARK-2101] Python 2.6 fixes

(cherry picked from commit db06a81fb7a413faa3fe0f8c35918f70454cb05d)
Signed-off-by: Josh Rosen <joshrosen@apache.org>


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/09b8a3ce
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/09b8a3ce
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/09b8a3ce

Branch: refs/heads/branch-1.1
Commit: 09b8a3ce0d73915d573e0ebc3e96448736b89bfa
Parents: 3def842
Author: Josh Rosen <joshrosen@apache.org>
Authored: Mon Aug 11 11:54:09 2014 -0700
Committer: Josh Rosen <joshrosen@apache.org>
Committed: Mon Aug 11 11:54:46 2014 -0700

----------------------------------------------------------------------
 python/pyspark/mllib/_common.py | 11 ++++++++++-
 python/pyspark/mllib/tests.py   |  7 ++++++-
 python/pyspark/serializers.py   |  4 ++--
 python/pyspark/tests.py         | 13 ++++++++++---
 python/run-tests                |  8 ++++++++
 5 files changed, 36 insertions(+), 7 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/09b8a3ce/python/pyspark/mllib/_common.py
----------------------------------------------------------------------
diff --git a/python/pyspark/mllib/_common.py b/python/pyspark/mllib/_common.py
index db341da..bb60d3d 100644
--- a/python/pyspark/mllib/_common.py
+++ b/python/pyspark/mllib/_common.py
@@ -16,6 +16,7 @@
 #
 
 import struct
+import sys
 import numpy
 from numpy import ndarray, float64, int64, int32, array_equal, array
 from pyspark import SparkContext, RDD
@@ -78,6 +79,14 @@ DENSE_MATRIX_MAGIC = 3
 LABELED_POINT_MAGIC = 4
 
 
+# Workaround for SPARK-2954: before Python 2.7, struct.unpack couldn't unpack bytearray()s.
+if sys.version_info[:2] <= (2, 6):
+    def _unpack(fmt, string):
+        return struct.unpack(fmt, buffer(string))
+else:
+    _unpack = struct.unpack
+
+
 def _deserialize_numpy_array(shape, ba, offset, dtype=float64):
     """
     Deserialize a numpy array of the given type from an offset in
@@ -191,7 +200,7 @@ def _deserialize_double(ba, offset=0):
         raise TypeError("_deserialize_double called on a %s; wanted bytearray" % type(ba))
     if len(ba) - offset != 8:
         raise TypeError("_deserialize_double called on a %d-byte array; wanted 8 bytes."
% nb)
-    return struct.unpack("d", ba[offset:])[0]
+    return _unpack("d", ba[offset:])[0]
 
 
 def _deserialize_double_vector(ba, offset=0):
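
The _unpack shim above exists because struct.unpack() could not read from a bytearray before Python 2.7; wrapping the bytearray in a read-only buffer() sidesteps the limitation. An illustrative session (ours, not part of the commit):

    import struct

    ba = bytearray(struct.pack("d", 123.0))
    struct.unpack("d", ba)           # (123.0,) on 2.7+; TypeError on 2.6
    struct.unpack("d", buffer(ba))   # (123.0,) on 2.6 and 2.7 alike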

http://git-wip-us.apache.org/repos/asf/spark/blob/09b8a3ce/python/pyspark/mllib/tests.py
----------------------------------------------------------------------
diff --git a/python/pyspark/mllib/tests.py b/python/pyspark/mllib/tests.py
index 6f3ec8a..8a851bd 100644
--- a/python/pyspark/mllib/tests.py
+++ b/python/pyspark/mllib/tests.py
@@ -19,8 +19,13 @@
 Fuller unit tests for Python MLlib.
 """
 
+import sys
 from numpy import array, array_equal
-import unittest
+
+if sys.version_info[:2] <= (2, 6):
+    import unittest2 as unittest
+else:
+    import unittest
 
 from pyspark.mllib._common import _convert_vector, _serialize_double_vector, \
     _deserialize_double_vector, _dot, _squared_distance
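
The conditional import matters because Python 2.6's bundled unittest predates the 2.7 additions (skip decorators, assertIn, and friends) that these tests use; unittest2 backports the 2.7 API for older interpreters. A sketch of the pattern (the test class is hypothetical):

    import sys

    if sys.version_info[:2] <= (2, 6):
        import unittest2 as unittest   # third-party backport: pip install unittest2
    else:
        import unittest

    class VersionGateTests(unittest.TestCase):
        @unittest.skipIf(sys.version_info[:2] <= (2, 6), "exercises a 2.7-only feature")
        def test_py27_only(self):
            self.assertIn(1, [1, 2])   # assertIn is itself a 2.7/unittest2 addition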

http://git-wip-us.apache.org/repos/asf/spark/blob/09b8a3ce/python/pyspark/serializers.py
----------------------------------------------------------------------
diff --git a/python/pyspark/serializers.py b/python/pyspark/serializers.py
index b35558d..df90caf 100644
--- a/python/pyspark/serializers.py
+++ b/python/pyspark/serializers.py
@@ -314,8 +314,8 @@ def _hijack_namedtuple():
 
     _old_namedtuple = _copy_func(collections.namedtuple)
 
-    def namedtuple(name, fields, verbose=False, rename=False):
-        cls = _old_namedtuple(name, fields, verbose, rename)
+    def namedtuple(*args, **kwargs):
+        cls = _old_namedtuple(*args, **kwargs)
         return _hack_namedtuple(cls)
 
     # replace namedtuple with new one
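
Forwarding *args/**kwargs is what makes the hijack version-proof: Python 2.6's collections.namedtuple() accepts only (typename, field_names, verbose), while 2.7 added a fourth rename parameter, so hard-coding four positional arguments raised TypeError on 2.6. A stripped-down illustration (names are ours; the real hijack also post-processes the returned class):

    import collections

    _orig_namedtuple = collections.namedtuple

    def namedtuple(*args, **kwargs):
        # Forward whatever signature this interpreter's namedtuple supports.
        return _orig_namedtuple(*args, **kwargs)

    Point = namedtuple("Point", ["x", "y"])   # works on 2.6 and 2.7 alike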

http://git-wip-us.apache.org/repos/asf/spark/blob/09b8a3ce/python/pyspark/tests.py
----------------------------------------------------------------------
diff --git a/python/pyspark/tests.py b/python/pyspark/tests.py
index 88a6117..22b5111 100644
--- a/python/pyspark/tests.py
+++ b/python/pyspark/tests.py
@@ -29,9 +29,14 @@ import subprocess
 import sys
 import tempfile
 import time
-import unittest
 import zipfile
 
+if sys.version_info[:2] <= (2, 6):
+    import unittest2 as unittest
+else:
+    import unittest
+
+
 from pyspark.context import SparkContext
 from pyspark.files import SparkFiles
 from pyspark.serializers import read_int
@@ -605,6 +610,7 @@ class TestOutputFormat(PySparkTestCase):
             conf=input_conf).collect())
         self.assertEqual(old_dataset, dict_data)
 
+    @unittest.skipIf(sys.version_info[:2] <= (2, 6), "Skipped on 2.6 until SPARK-2951 is fixed")
     def test_newhadoop(self):
         basepath = self.tempdir.name
         # use custom ArrayWritable types and converters to handle arrays
@@ -905,8 +911,9 @@ class TestSparkSubmit(unittest.TestCase):
         pattern = re.compile(r'^ *\|', re.MULTILINE)
         content = re.sub(pattern, '', content.strip())
         path = os.path.join(self.programDir, name + ".zip")
-        with zipfile.ZipFile(path, 'w') as zip:
-            zip.writestr(name, content)
+        zip = zipfile.ZipFile(path, 'w')
+        zip.writestr(name, content)
+        zip.close()
         return path
 
     def test_single_script(self):
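
The ZipFile rewrite is needed because zipfile.ZipFile only became a context manager in Python 2.7; on 2.6, "with zipfile.ZipFile(...)" fails for lack of __enter__/__exit__. An explicit close() works everywhere, and a try/finally (our embellishment, not the commit's) still guarantees the file is closed on error:

    import zipfile

    zf = zipfile.ZipFile("/tmp/example.zip", "w")   # hypothetical path
    try:
        zf.writestr("hello.txt", "hello")
    finally:
        zf.close()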

http://git-wip-us.apache.org/repos/asf/spark/blob/09b8a3ce/python/run-tests
----------------------------------------------------------------------
diff --git a/python/run-tests b/python/run-tests
index 48feba2..1218edc 100755
--- a/python/run-tests
+++ b/python/run-tests
@@ -48,6 +48,14 @@ function run_test() {
 
 echo "Running PySpark tests. Output is in python/unit-tests.log."
 
+# Try to test with Python 2.6, since that's the minimum version that we support:
+if [ $(which python2.6) ]; then
+    export PYSPARK_PYTHON="python2.6"
+fi
+
+echo "Testing with Python version:"
+$PYSPARK_PYTHON --version
+
 run_test "pyspark/rdd.py"
 run_test "pyspark/context.py"
 run_test "pyspark/conf.py"


