spark-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From r...@apache.org
Subject spark git commit: [SPARK-6623][SQL] Alias DataFrame.na.drop and DataFrame.na.fill in Python.
Date Tue, 31 Mar 2015 07:25:32 GMT
Repository: spark
Updated Branches:
  refs/heads/branch-1.3 a97d4e6bf -> cf651a46e


[SPARK-6623][SQL] Alias DataFrame.na.drop and DataFrame.na.fill in Python.

To maintain consistency with the Scala API.

Author: Reynold Xin <rxin@databricks.com>

Closes #5284 from rxin/df-na-alias and squashes the following commits:

19f46b7 [Reynold Xin] Show DataFrameNaFunctions in docs.
6618118 [Reynold Xin] [SPARK-6623][SQL] Alias DataFrame.na.drop and DataFrame.na.fill in Python.

(cherry picked from commit b80a030e90d790e27e89b26f536565c582dbf3d5)
Signed-off-by: Reynold Xin <rxin@databricks.com>


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/cf651a46
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/cf651a46
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/cf651a46

Branch: refs/heads/branch-1.3
Commit: cf651a46e3ac6d8e76b14a05eb395821032f6ad8
Parents: a97d4e6
Author: Reynold Xin <rxin@databricks.com>
Authored: Tue Mar 31 00:25:23 2015 -0700
Committer: Reynold Xin <rxin@databricks.com>
Committed: Tue Mar 31 00:25:29 2015 -0700

----------------------------------------------------------------------
 python/pyspark/sql/__init__.py  | 10 +++++----
 python/pyspark/sql/dataframe.py | 41 ++++++++++++++++++++++++++++++++++--
 2 files changed, 45 insertions(+), 6 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/cf651a46/python/pyspark/sql/__init__.py
----------------------------------------------------------------------
diff --git a/python/pyspark/sql/__init__.py b/python/pyspark/sql/__init__.py
index 54a0163..9d39e5d 100644
--- a/python/pyspark/sql/__init__.py
+++ b/python/pyspark/sql/__init__.py
@@ -22,22 +22,24 @@ public classes of Spark SQL:
       Main entry point for :class:`DataFrame` and SQL functionality.
     - L{DataFrame}
       A distributed collection of data grouped into named columns.
-    - L{GroupedData}
-      Aggregation methods, returned by :func:`DataFrame.groupBy`.
     - L{Column}
       A column expression in a :class:`DataFrame`.
     - L{Row}
       A row of data in a :class:`DataFrame`.
     - L{HiveContext}
       Main entry point for accessing data stored in Apache Hive.
+    - L{GroupedData}
+      Aggregation methods, returned by :func:`DataFrame.groupBy`.
+    - L{DataFrameNaFunctions}
+      Methods for handling missing data (null values).
     - L{functions}
       List of built-in functions available for :class:`DataFrame`.
 """
 
 from pyspark.sql.context import SQLContext, HiveContext
 from pyspark.sql.types import Row
-from pyspark.sql.dataframe import DataFrame, GroupedData, Column, SchemaRDD
+from pyspark.sql.dataframe import DataFrame, GroupedData, Column, SchemaRDD, DataFrameNaFunctions
 
 __all__ = [
-    'SQLContext', 'HiveContext', 'DataFrame', 'GroupedData', 'Column', 'Row',
+    'SQLContext', 'HiveContext', 'DataFrame', 'GroupedData', 'Column', 'Row', 'DataFrameNaFunctions'
 ]

http://git-wip-us.apache.org/repos/asf/spark/blob/cf651a46/python/pyspark/sql/dataframe.py
----------------------------------------------------------------------
diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py
index 4f174de..1550802 100644
--- a/python/pyspark/sql/dataframe.py
+++ b/python/pyspark/sql/dataframe.py
@@ -31,7 +31,7 @@ from pyspark.sql.types import *
 from pyspark.sql.types import _create_cls, _parse_datatype_json_string
 
 
-__all__ = ["DataFrame", "GroupedData", "Column", "SchemaRDD"]
+__all__ = ["DataFrame", "GroupedData", "Column", "SchemaRDD", "DataFrameNaFunctions"]
 
 
 class DataFrame(object):
@@ -86,6 +86,12 @@ class DataFrame(object):
 
         return self._lazy_rdd
 
+    @property
+    def na(self):
+        """Returns a :class:`DataFrameNaFunctions` for handling missing values.
+        """
+        return DataFrameNaFunctions(self)
+
     def toJSON(self, use_unicode=False):
         """Convert a :class:`DataFrame` into a MappedRDD of JSON documents; one document per row.
 
@@ -693,6 +699,8 @@ class DataFrame(object):
     def dropna(self, how='any', thresh=None, subset=None):
         """Returns a new :class:`DataFrame` omitting rows with null values.
 
+        This is an alias for `na.drop`.
+
         :param how: 'any' or 'all'.
             If 'any', drop a row if it contains any nulls.
             If 'all', drop a row only if all its values are null.
@@ -704,6 +712,10 @@ class DataFrame(object):
         >>> df4.dropna().show()
         age height name
         10  80     Alice
+
+        >>> df4.na.drop().show()
+        age height name
+        10  80     Alice
         """
         if how is not None and how not in ['any', 'all']:
             raise ValueError("how ('" + how + "') should be 'any' or 'all'")
@@ -723,7 +735,7 @@ class DataFrame(object):
         return DataFrame(self._jdf.na().drop(thresh, cols), self.sql_ctx)
 
     def fillna(self, value, subset=None):
-        """Replace null values.
+        """Replace null values, alias for `na.fill`.
 
         :param value: int, long, float, string, or dict.
             Value to replace null values with.
@@ -748,6 +760,13 @@ class DataFrame(object):
         5   null   Bob
         50  null   Tom
         50  null   unknown
+
+        >>> df4.na.fill({'age': 50, 'name': 'unknown'}).show()
+        age height name
+        10  80     Alice
+        5   null   Bob
+        50  null   Tom
+        50  null   unknown
         """
         if not isinstance(value, (float, int, long, basestring, dict)):
             raise ValueError("value should be a float, int, long, string, or dict")
@@ -1134,6 +1153,24 @@ class Column(object):
         return 'Column<%s>' % self._jc.toString().encode('utf8')
 
 
+class DataFrameNaFunctions(object):
+    """Functionality for working with missing data in :class:`DataFrame`.
+    """
+
+    def __init__(self, df):
+        self.df = df
+
+    def drop(self, how='any', thresh=None, subset=None):
+        return self.df.dropna(how=how, thresh=thresh, subset=subset)
+
+    drop.__doc__ = DataFrame.dropna.__doc__
+
+    def fill(self, value, subset=None):
+        return self.df.fillna(value=value, subset=subset)
+
+    fill.__doc__ = DataFrame.fillna.__doc__
+
+
 def _test():
     import doctest
     from pyspark.context import SparkContext


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org


Mime
View raw message