spark-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From r...@apache.org
Subject spark git commit: [SPARK-15493][SQL] default QuoteEscapingEnabled flag to true when writing CSV
Date Wed, 25 May 2016 19:40:28 GMT
Repository: spark
Updated Branches:
  refs/heads/branch-2.0 20cc2eb1b -> 8629537cc


[SPARK-15493][SQL] default QuoteEscapingEnabled flag to true when writing CSV

## What changes were proposed in this pull request?

Default QuoteEscapingEnabled flag to true when writing CSV and add an escapeQuotes option
to be able to change this.

See https://github.com/uniVocity/univocity-parsers/blob/f3eb2af26374940e60d91d1703bde54619f50c51/src/main/java/com/univocity/parsers/csv/CsvWriterSettings.java#L231-L247

This change is needed to be able to write RFC 4180 compatible CSV files (https://tools.ietf.org/html/rfc4180#section-2)

https://issues.apache.org/jira/browse/SPARK-15493

## How was this patch tested?

Added a test that verifies the output is quoted correctly.

Author: Jurriaan Pruis <email@jurriaanpruis.nl>

Closes #13267 from jurriaan/quote-escaping.

(cherry picked from commit c875d81a3de3f209b9eb03adf96b7c740b2c7b52)
Signed-off-by: Reynold Xin <rxin@databricks.com>


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/8629537c
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/8629537c
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/8629537c

Branch: refs/heads/branch-2.0
Commit: 8629537cc7d89c97ef2038326f1138d166596315
Parents: 20cc2eb
Author: Jurriaan Pruis <email@jurriaanpruis.nl>
Authored: Wed May 25 12:40:16 2016 -0700
Committer: Reynold Xin <rxin@databricks.com>
Committed: Wed May 25 12:40:26 2016 -0700

----------------------------------------------------------------------
 dev/deps/spark-deps-hadoop-2.2                  |  2 +-
 dev/deps/spark-deps-hadoop-2.3                  |  2 +-
 dev/deps/spark-deps-hadoop-2.4                  |  2 +-
 dev/deps/spark-deps-hadoop-2.6                  |  2 +-
 dev/deps/spark-deps-hadoop-2.7                  |  2 +-
 python/pyspark/sql/readwriter.py                |  7 ++-
 sql/core/pom.xml                                |  2 +-
 .../org/apache/spark/sql/DataFrameWriter.scala  |  3 ++
 .../execution/datasources/csv/CSVOptions.scala  |  2 +
 .../execution/datasources/csv/CSVParser.scala   |  1 +
 .../execution/datasources/csv/CSVSuite.scala    | 51 ++++++++++++++++++++
 11 files changed, 69 insertions(+), 7 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/8629537c/dev/deps/spark-deps-hadoop-2.2
----------------------------------------------------------------------
diff --git a/dev/deps/spark-deps-hadoop-2.2 b/dev/deps/spark-deps-hadoop-2.2
index a9068da..0ac1c00 100644
--- a/dev/deps/spark-deps-hadoop-2.2
+++ b/dev/deps/spark-deps-hadoop-2.2
@@ -159,7 +159,7 @@ stax-api-1.0.1.jar
 stream-2.7.0.jar
 stringtemplate-3.2.1.jar
 super-csv-2.2.0.jar
-univocity-parsers-2.1.0.jar
+univocity-parsers-2.1.1.jar
 validation-api-1.1.0.Final.jar
 xbean-asm5-shaded-4.4.jar
 xmlenc-0.52.jar

http://git-wip-us.apache.org/repos/asf/spark/blob/8629537c/dev/deps/spark-deps-hadoop-2.3
----------------------------------------------------------------------
diff --git a/dev/deps/spark-deps-hadoop-2.3 b/dev/deps/spark-deps-hadoop-2.3
index 7e60a31..fa35fa7 100644
--- a/dev/deps/spark-deps-hadoop-2.3
+++ b/dev/deps/spark-deps-hadoop-2.3
@@ -167,7 +167,7 @@ stax-api-1.0.1.jar
 stream-2.7.0.jar
 stringtemplate-3.2.1.jar
 super-csv-2.2.0.jar
-univocity-parsers-2.1.0.jar
+univocity-parsers-2.1.1.jar
 validation-api-1.1.0.Final.jar
 xbean-asm5-shaded-4.4.jar
 xmlenc-0.52.jar

http://git-wip-us.apache.org/repos/asf/spark/blob/8629537c/dev/deps/spark-deps-hadoop-2.4
----------------------------------------------------------------------
diff --git a/dev/deps/spark-deps-hadoop-2.4 b/dev/deps/spark-deps-hadoop-2.4
index 70d33b4..99dffa9 100644
--- a/dev/deps/spark-deps-hadoop-2.4
+++ b/dev/deps/spark-deps-hadoop-2.4
@@ -167,7 +167,7 @@ stax-api-1.0.1.jar
 stream-2.7.0.jar
 stringtemplate-3.2.1.jar
 super-csv-2.2.0.jar
-univocity-parsers-2.1.0.jar
+univocity-parsers-2.1.1.jar
 validation-api-1.1.0.Final.jar
 xbean-asm5-shaded-4.4.jar
 xmlenc-0.52.jar

http://git-wip-us.apache.org/repos/asf/spark/blob/8629537c/dev/deps/spark-deps-hadoop-2.6
----------------------------------------------------------------------
diff --git a/dev/deps/spark-deps-hadoop-2.6 b/dev/deps/spark-deps-hadoop-2.6
index a80f6bc..a3bee36 100644
--- a/dev/deps/spark-deps-hadoop-2.6
+++ b/dev/deps/spark-deps-hadoop-2.6
@@ -175,7 +175,7 @@ stax-api-1.0.1.jar
 stream-2.7.0.jar
 stringtemplate-3.2.1.jar
 super-csv-2.2.0.jar
-univocity-parsers-2.1.0.jar
+univocity-parsers-2.1.1.jar
 validation-api-1.1.0.Final.jar
 xbean-asm5-shaded-4.4.jar
 xercesImpl-2.9.1.jar

http://git-wip-us.apache.org/repos/asf/spark/blob/8629537c/dev/deps/spark-deps-hadoop-2.7
----------------------------------------------------------------------
diff --git a/dev/deps/spark-deps-hadoop-2.7 b/dev/deps/spark-deps-hadoop-2.7
index c0b53f7..dbd7a8e 100644
--- a/dev/deps/spark-deps-hadoop-2.7
+++ b/dev/deps/spark-deps-hadoop-2.7
@@ -176,7 +176,7 @@ stax-api-1.0.1.jar
 stream-2.7.0.jar
 stringtemplate-3.2.1.jar
 super-csv-2.2.0.jar
-univocity-parsers-2.1.0.jar
+univocity-parsers-2.1.1.jar
 validation-api-1.1.0.Final.jar
 xbean-asm5-shaded-4.4.jar
 xercesImpl-2.9.1.jar

http://git-wip-us.apache.org/repos/asf/spark/blob/8629537c/python/pyspark/sql/readwriter.py
----------------------------------------------------------------------
diff --git a/python/pyspark/sql/readwriter.py b/python/pyspark/sql/readwriter.py
index 6f788cf..73d2b81 100644
--- a/python/pyspark/sql/readwriter.py
+++ b/python/pyspark/sql/readwriter.py
@@ -769,7 +769,7 @@ class DataFrameWriter(object):
 
     @since(2.0)
     def csv(self, path, mode=None, compression=None, sep=None, quote=None, escape=None,
-            header=None, nullValue=None):
+            header=None, nullValue=None, escapeQuotes=None):
         """Saves the content of the [[DataFrame]] in CSV format at the specified path.
 
         :param path: the path in any Hadoop supported file system
@@ -790,6 +790,9 @@ class DataFrameWriter(object):
                       value, ``"``.
         :param escape: sets the single character used for escaping quotes inside an already
                        quoted value. If None is set, it uses the default value, ``\``
+        :param escapeQuotes: A flag indicating whether values containing quotes should always
+                             be enclosed in quotes. If None is set, it uses the default value
+                             ``true``, escaping all values containing a quote character.
         :param header: writes the names of columns as the first line. If None is set, it
uses
                        the default value, ``false``.
         :param nullValue: sets the string representation of a null value. If None is set,
it uses
@@ -810,6 +813,8 @@ class DataFrameWriter(object):
             self.option("header", header)
         if nullValue is not None:
             self.option("nullValue", nullValue)
+        if escapeQuotes is not None:
+            self.option("escapeQuotes", nullValue)
         self._jwrite.csv(path)
 
     @since(1.5)

http://git-wip-us.apache.org/repos/asf/spark/blob/8629537c/sql/core/pom.xml
----------------------------------------------------------------------
diff --git a/sql/core/pom.xml b/sql/core/pom.xml
index 2ea980b..b833b93 100644
--- a/sql/core/pom.xml
+++ b/sql/core/pom.xml
@@ -39,7 +39,7 @@
     <dependency>
       <groupId>com.univocity</groupId>
       <artifactId>univocity-parsers</artifactId>
-      <version>2.1.0</version>
+      <version>2.1.1</version>
       <type>jar</type>
     </dependency>
     <dependency>

http://git-wip-us.apache.org/repos/asf/spark/blob/8629537c/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala
----------------------------------------------------------------------
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala
index 6f5fb69..3aacce7 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala
@@ -684,6 +684,9 @@ final class DataFrameWriter private[sql](df: DataFrame) {
    * the separator can be part of the value.</li>
    * <li>`escape` (default `\`): sets the single character used for escaping quotes
inside
    * an already quoted value.</li>
+   * <li>`escapeQuotes` (default `true`): a flag indicating whether values containing
+   * quotes should always be enclosed in quotes. Default is to escape all values containing
+   * a quote character.</li>
    * <li>`header` (default `false`): writes the names of columns as the first line.</li>
    * <li>`nullValue` (default empty string): sets the string representation of a null
value.</li>
    * <li>`compression` (default `null`): compression codec to use when saving to file.
This can be

http://git-wip-us.apache.org/repos/asf/spark/blob/8629537c/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVOptions.scala
----------------------------------------------------------------------
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVOptions.scala
b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVOptions.scala
index e4fd094..9f4ce83 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVOptions.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVOptions.scala
@@ -111,6 +111,8 @@ private[sql] class CSVOptions(@transient private val parameters: Map[String,
Str
 
   val maxCharsPerColumn = getInt("maxCharsPerColumn", 1000000)
 
+  val escapeQuotes = getBool("escapeQuotes", true)
+
   val inputBufferSize = 128
 
   val isCommentSet = this.comment != '\u0000'

http://git-wip-us.apache.org/repos/asf/spark/blob/8629537c/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVParser.scala
----------------------------------------------------------------------
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVParser.scala
b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVParser.scala
index 111995d..b06f123 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVParser.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVParser.scala
@@ -75,6 +75,7 @@ private[sql] class LineCsvWriter(params: CSVOptions, headers: Seq[String])
exten
   writerSettings.setSkipEmptyLines(true)
   writerSettings.setQuoteAllFields(false)
   writerSettings.setHeaders(headers: _*)
+  writerSettings.setQuoteEscapingEnabled(params.escapeQuotes)
 
   private var buffer = new ByteArrayOutputStream()
   private var writer = new CsvWriter(

http://git-wip-us.apache.org/repos/asf/spark/blob/8629537c/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
----------------------------------------------------------------------
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
index bae2907..ad7c05c 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
@@ -366,6 +366,57 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils
{
     }
   }
 
+  test("save csv with quote escaping enabled") {
+    withTempDir { dir =>
+      val csvDir = new File(dir, "csv").getCanonicalPath
+
+      val data = Seq(("test \"quote\"", 123, "it \"works\"!", "\"very\" well"))
+      val df = spark.createDataFrame(data)
+
+      // escapeQuotes should be true by default
+      df.coalesce(1).write
+        .format("csv")
+        .option("quote", "\"")
+        .option("escape", "\"")
+        .save(csvDir)
+
+      val results = spark.read
+        .format("text")
+        .load(csvDir)
+        .collect()
+
+      val expected = "\"test \"\"quote\"\"\",123,\"it \"\"works\"\"!\",\"\"\"very\"\" well\""
+
+      assert(results.toSeq.map(_.toSeq) === Seq(Seq(expected)))
+    }
+  }
+
+  test("save csv with quote escaping disabled") {
+    withTempDir { dir =>
+      val csvDir = new File(dir, "csv").getCanonicalPath
+
+      val data = Seq(("test \"quote\"", 123, "it \"works\"!", "\"very\" well"))
+      val df = spark.createDataFrame(data)
+
+      // escapeQuotes should be true by default
+      df.coalesce(1).write
+        .format("csv")
+        .option("quote", "\"")
+        .option("escapeQuotes", "false")
+        .option("escape", "\"")
+        .save(csvDir)
+
+      val results = spark.read
+        .format("text")
+        .load(csvDir)
+        .collect()
+
+      val expected = "test \"quote\",123,it \"works\"!,\"\"\"very\"\" well\""
+
+      assert(results.toSeq.map(_.toSeq) === Seq(Seq(expected)))
+    }
+  }
+
   test("commented lines in CSV data") {
     val results = spark.read
       .format("csv")


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org


Mime
View raw message