parquet-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From ga...@apache.org
Subject [parquet-mr] branch master updated: PARQUET-1505: Use Java 7 NIO StandardCharsets (#599)
Date Thu, 07 Feb 2019 12:26:52 GMT
This is an automated email from the ASF dual-hosted git repository.

gabor pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/parquet-mr.git


The following commit(s) were added to refs/heads/master by this push:
     new 714bb45  PARQUET-1505: Use Java 7 NIO StandardCharsets (#599)
714bb45 is described below

commit 714bb450856dc951bd361e0cf4a732775eb3cefd
Author: BELUGABEHR <BELUGABEHR@users.noreply.github.com>
AuthorDate: Thu Feb 7 07:26:47 2019 -0500

    PARQUET-1505: Use Java 7 NIO StandardCharsets (#599)
---
 .../org/apache/parquet/avro/TestReadWrite.java     | 10 ++++----
 .../parquet/avro/TestReadWriteOldListBehavior.java | 12 ++++-----
 .../java/org/apache/parquet/cli/BaseCommand.java   |  8 ++----
 .../java/org/apache/parquet/io/api/Binary.java     | 30 +++++++---------------
 .../column/values/dictionary/TestDictionary.java   |  7 +++--
 .../java/org/apache/parquet/bytes/BytesUtils.java  |  3 +++
 .../apache/parquet/hadoop/ParquetFileWriter.java   |  4 +--
 7 files changed, 30 insertions(+), 44 deletions(-)

diff --git a/parquet-avro/src/test/java/org/apache/parquet/avro/TestReadWrite.java b/parquet-avro/src/test/java/org/apache/parquet/avro/TestReadWrite.java
index 69a73cb..4368938 100644
--- a/parquet-avro/src/test/java/org/apache/parquet/avro/TestReadWrite.java
+++ b/parquet-avro/src/test/java/org/apache/parquet/avro/TestReadWrite.java
@@ -18,7 +18,6 @@
  */
 package org.apache.parquet.avro;
 
-import com.google.common.base.Charsets;
 import com.google.common.collect.ImmutableMap;
 import com.google.common.collect.Lists;
 import com.google.common.io.Resources;
@@ -27,6 +26,7 @@ import java.io.IOException;
 import java.math.BigDecimal;
 import java.math.BigInteger;
 import java.nio.ByteBuffer;
+import java.nio.charset.StandardCharsets;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Collection;
@@ -369,7 +369,7 @@ public class TestReadWrite {
         .set("mylong", 2L)
         .set("myfloat", 3.1f)
         .set("mydouble", 4.1)
-        .set("mybytes", ByteBuffer.wrap("hello".getBytes(Charsets.UTF_8)))
+        .set("mybytes", ByteBuffer.wrap("hello".getBytes(StandardCharsets.UTF_8)))
         .set("mystring", "hello")
         .set("mynestedrecord", nestedRecord)
         .set("myenum", "a")
@@ -398,7 +398,7 @@ public class TestReadWrite {
     assertEquals(2L, nextRecord.get("mylong"));
     assertEquals(3.1f, nextRecord.get("myfloat"));
     assertEquals(4.1, nextRecord.get("mydouble"));
-    assertEquals(ByteBuffer.wrap("hello".getBytes(Charsets.UTF_8)), nextRecord.get("mybytes"));
+    assertEquals(ByteBuffer.wrap("hello".getBytes(StandardCharsets.UTF_8)), nextRecord.get("mybytes"));
     assertEquals(str("hello"), nextRecord.get("mystring"));
     assertEquals(expectedEnumSymbol, nextRecord.get("myenum"));
     assertEquals(nestedRecord, nextRecord.get("mynestedrecord"));
@@ -567,7 +567,7 @@ public class TestReadWrite {
     record.put("mylong", 2L);
     record.put("myfloat", 3.1f);
     record.put("mydouble", 4.1);
-    record.put("mybytes", ByteBuffer.wrap("hello".getBytes(Charsets.UTF_8)));
+    record.put("mybytes", ByteBuffer.wrap("hello".getBytes(StandardCharsets.UTF_8)));
     record.put("mystring", "hello");
     record.put("myenum", "a");
     record.put("mynestedint", 1);
@@ -615,7 +615,7 @@ public class TestReadWrite {
     assertEquals(2L, nextRecord.get("mylong"));
     assertEquals(3.1f, nextRecord.get("myfloat"));
     assertEquals(4.1, nextRecord.get("mydouble"));
-    assertEquals(ByteBuffer.wrap("hello".getBytes(Charsets.UTF_8)), nextRecord.get("mybytes"));
+    assertEquals(ByteBuffer.wrap("hello".getBytes(StandardCharsets.UTF_8)), nextRecord.get("mybytes"));
     assertEquals(str("hello"), nextRecord.get("mystring"));
     assertEquals(str("a"), nextRecord.get("myenum")); // enum symbols are unknown
     assertEquals(nestedRecord, nextRecord.get("mynestedrecord"));
diff --git a/parquet-avro/src/test/java/org/apache/parquet/avro/TestReadWriteOldListBehavior.java b/parquet-avro/src/test/java/org/apache/parquet/avro/TestReadWriteOldListBehavior.java
index af6f938..bcf553e 100644
--- a/parquet-avro/src/test/java/org/apache/parquet/avro/TestReadWriteOldListBehavior.java
+++ b/parquet-avro/src/test/java/org/apache/parquet/avro/TestReadWriteOldListBehavior.java
@@ -18,12 +18,12 @@
  */
 package org.apache.parquet.avro;
 
-import com.google.common.base.Charsets;
 import com.google.common.collect.ImmutableMap;
 import com.google.common.collect.Lists;
 import com.google.common.io.Resources;
 import java.io.File;
 import java.nio.ByteBuffer;
+import java.nio.charset.StandardCharsets;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Collection;
@@ -247,7 +247,7 @@ public class TestReadWriteOldListBehavior {
         .set("mylong", 2L)
         .set("myfloat", 3.1f)
         .set("mydouble", 4.1)
-        .set("mybytes", ByteBuffer.wrap("hello".getBytes(Charsets.UTF_8)))
+        .set("mybytes", ByteBuffer.wrap("hello".getBytes(StandardCharsets.UTF_8)))
         .set("mystring", "hello")
         .set("mynestedrecord", nestedRecord)
         .set("myenum", "a")
@@ -276,7 +276,7 @@ public class TestReadWriteOldListBehavior {
     assertEquals(2L, nextRecord.get("mylong"));
     assertEquals(3.1f, nextRecord.get("myfloat"));
     assertEquals(4.1, nextRecord.get("mydouble"));
-    assertEquals(ByteBuffer.wrap("hello".getBytes(Charsets.UTF_8)), nextRecord.get("mybytes"));
+    assertEquals(ByteBuffer.wrap("hello".getBytes(StandardCharsets.UTF_8)), nextRecord.get("mybytes"));
     assertEquals(str("hello"), nextRecord.get("mystring"));
     assertEquals(expectedEnumSymbol, nextRecord.get("myenum"));
     assertEquals(nestedRecord, nextRecord.get("mynestedrecord"));
@@ -327,7 +327,7 @@ public class TestReadWriteOldListBehavior {
         .set("mylong", 2L)
         .set("myfloat", 3.1f)
         .set("mydouble", 4.1)
-        .set("mybytes", ByteBuffer.wrap("hello".getBytes(Charsets.UTF_8)))
+        .set("mybytes", ByteBuffer.wrap("hello".getBytes(StandardCharsets.UTF_8)))
         .set("mystring", "hello")
         .set("mynestedrecord", nestedRecord)
         .set("myenum", "a")
@@ -512,7 +512,7 @@ public class TestReadWriteOldListBehavior {
     record.put("mylong", 2L);
     record.put("myfloat", 3.1f);
     record.put("mydouble", 4.1);
-    record.put("mybytes", ByteBuffer.wrap("hello".getBytes(Charsets.UTF_8)));
+    record.put("mybytes", ByteBuffer.wrap("hello".getBytes(StandardCharsets.UTF_8)));
     record.put("mystring", "hello");
     record.put("myenum", "a");
     record.put("mynestedint", 1);
@@ -573,7 +573,7 @@ public class TestReadWriteOldListBehavior {
     assertEquals(2L, nextRecord.get("mylong"));
     assertEquals(3.1f, nextRecord.get("myfloat"));
     assertEquals(4.1, nextRecord.get("mydouble"));
-    assertEquals(ByteBuffer.wrap("hello".getBytes(Charsets.UTF_8)), nextRecord.get("mybytes"));
+    assertEquals(ByteBuffer.wrap("hello".getBytes(StandardCharsets.UTF_8)), nextRecord.get("mybytes"));
     assertEquals(str("hello"), nextRecord.get("mystring"));
     assertEquals(str("a"), nextRecord.get("myenum"));
     assertEquals(nestedRecord, nextRecord.get("mynestedrecord"));
diff --git a/parquet-cli/src/main/java/org/apache/parquet/cli/BaseCommand.java b/parquet-cli/src/main/java/org/apache/parquet/cli/BaseCommand.java
index f385fde..96ca5a5 100644
--- a/parquet-cli/src/main/java/org/apache/parquet/cli/BaseCommand.java
+++ b/parquet-cli/src/main/java/org/apache/parquet/cli/BaseCommand.java
@@ -20,7 +20,6 @@
 package org.apache.parquet.cli;
 
 import com.beust.jcommander.internal.Lists;
-import com.google.common.annotations.VisibleForTesting;
 import com.google.common.base.Preconditions;
 import com.google.common.io.CharStreams;
 import com.google.common.io.Resources;
@@ -52,7 +51,7 @@ import java.io.InputStreamReader;
 import java.net.MalformedURLException;
 import java.net.URI;
 import java.net.URL;
-import java.nio.charset.Charset;
+import java.nio.charset.StandardCharsets;
 import java.security.AccessController;
 import java.util.Iterator;
 import java.util.List;
@@ -60,9 +59,6 @@ import java.util.NoSuchElementException;
 
 public abstract class BaseCommand implements Command, Configurable {
 
-  @VisibleForTesting
-  static final Charset UTF8 = Charset.forName("utf8");
-
   private static final String RESOURCE_URI_SCHEME = "resource";
   private static final String STDIN_AS_SOURCE = "stdin";
 
@@ -103,7 +99,7 @@ public abstract class BaseCommand implements Command, Configurable {
     } else {
       FSDataOutputStream outgoing = create(filename);
       try {
-        outgoing.write(content.getBytes(UTF8));
+        outgoing.write(content.getBytes(StandardCharsets.UTF_8));
       } finally {
         outgoing.close();
       }
diff --git a/parquet-column/src/main/java/org/apache/parquet/io/api/Binary.java b/parquet-column/src/main/java/org/apache/parquet/io/api/Binary.java
index 85c82bd..021d171 100644
--- a/parquet-column/src/main/java/org/apache/parquet/io/api/Binary.java
+++ b/parquet-column/src/main/java/org/apache/parquet/io/api/Binary.java
@@ -23,7 +23,6 @@ import java.io.IOException;
 import java.io.ObjectStreamException;
 import java.io.OutputStream;
 import java.io.Serializable;
-import java.io.UnsupportedEncodingException;
 import java.nio.ByteBuffer;
 import java.nio.CharBuffer;
 import java.nio.charset.CharacterCodingException;
@@ -31,12 +30,9 @@ import java.nio.charset.CharsetEncoder;
 import java.nio.charset.StandardCharsets;
 import java.util.Arrays;
 
-import org.apache.parquet.io.ParquetDecodingException;
 import org.apache.parquet.io.ParquetEncodingException;
 import org.apache.parquet.schema.PrimitiveComparator;
 
-import static org.apache.parquet.bytes.BytesUtils.UTF8;
-
 abstract public class Binary implements Comparable<Binary>, Serializable {
 
   protected boolean isBackingBytesReused;
@@ -133,11 +129,10 @@ abstract public class Binary implements Comparable<Binary>, Serializable {
 
     @Override
     public String toStringUsingUTF8() {
-      return UTF8.decode(ByteBuffer.wrap(value, offset, length)).toString();
-      // TODO: figure out why the following line was much slower
-      // rdb: new String(...) is slower because it instantiates a new Decoder,
-      //      while Charset#decode uses a thread-local decoder cache
-      // return new String(value, offset, length, BytesUtils.UTF8);
+      // Charset#decode uses a thread-local decoder cache and is faster than
+      // new String(...) which instantiates a new Decoder per invocation
+      return StandardCharsets.UTF_8
+          .decode(ByteBuffer.wrap(value, offset, length)).toString();
     }
 
     @Override
@@ -220,11 +215,7 @@ abstract public class Binary implements Comparable<Binary>, Serializable {
     }
 
     private static ByteBuffer encodeUTF8(String value) {
-      try {
-        return ByteBuffer.wrap(value.getBytes("UTF-8"));
-      } catch (UnsupportedEncodingException e) {
-        throw new ParquetEncodingException("UTF-8 not supported.", e);
-      }
+      return ByteBuffer.wrap(value.getBytes(StandardCharsets.UTF_8));
     }
   }
 
@@ -284,7 +275,7 @@ abstract public class Binary implements Comparable<Binary>, Serializable {
 
     @Override
     public String toStringUsingUTF8() {
-      return UTF8.decode(ByteBuffer.wrap(value)).toString();
+      return StandardCharsets.UTF_8.decode(ByteBuffer.wrap(value)).toString();
     }
 
     @Override
@@ -393,11 +384,8 @@ abstract public class Binary implements Comparable<Binary>, Serializable {
     public String toStringUsingUTF8() {
       String ret;
       if (value.hasArray()) {
-        try {
-          ret = new String(value.array(), value.arrayOffset() + offset, length, "UTF-8");
-        } catch (UnsupportedEncodingException e) {
-          throw new ParquetDecodingException("UTF-8 not supported");
-        }
+        ret = new String(value.array(), value.arrayOffset() + offset, length,
+            StandardCharsets.UTF_8);
       } else {
         int limit = value.limit();
         value.limit(offset+length);
@@ -406,7 +394,7 @@ abstract public class Binary implements Comparable<Binary>, Serializable {
         // no corresponding interface to read a subset of a buffer, would have to slice it
         // which creates another ByteBuffer object or do what is done here to adjust the
         // limit/offset and set them back after
-        ret = UTF8.decode(value).toString();
+        ret = StandardCharsets.UTF_8.decode(value).toString();
         value.limit(limit);
         value.position(position);
       }
diff --git a/parquet-column/src/test/java/org/apache/parquet/column/values/dictionary/TestDictionary.java b/parquet-column/src/test/java/org/apache/parquet/column/values/dictionary/TestDictionary.java
index ba3f903..2783b69 100644
--- a/parquet-column/src/test/java/org/apache/parquet/column/values/dictionary/TestDictionary.java
+++ b/parquet-column/src/test/java/org/apache/parquet/column/values/dictionary/TestDictionary.java
@@ -27,8 +27,8 @@ import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.FLOAT;
 import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.INT32;
 
 import java.io.IOException;
-import java.io.UnsupportedEncodingException;
 import java.nio.ByteBuffer;
+import java.nio.charset.StandardCharsets;
 
 import org.apache.parquet.bytes.ByteBufferInputStream;
 import org.junit.Assert;
@@ -627,9 +627,8 @@ public class TestDictionary {
     }
   }
 
-  private void writeRepeatedWithReuse(int COUNT, ValuesWriter cw,
-                                      String prefix) throws UnsupportedEncodingException {
-    Binary reused = Binary.fromReusedByteArray((prefix + "0").getBytes("UTF-8"));
+  private void writeRepeatedWithReuse(int COUNT, ValuesWriter cw, String prefix) {
+    Binary reused = Binary.fromReusedByteArray((prefix + "0").getBytes(StandardCharsets.UTF_8));
     for (int i = 0; i < COUNT; i++) {
       Binary content = Binary.fromString(prefix + i % 10);
       System.arraycopy(content.getBytesUnsafe(), 0, reused.getBytesUnsafe(), 0, reused.length());
diff --git a/parquet-common/src/main/java/org/apache/parquet/bytes/BytesUtils.java b/parquet-common/src/main/java/org/apache/parquet/bytes/BytesUtils.java
index 2657c7e..2c8162c 100644
--- a/parquet-common/src/main/java/org/apache/parquet/bytes/BytesUtils.java
+++ b/parquet-common/src/main/java/org/apache/parquet/bytes/BytesUtils.java
@@ -24,6 +24,7 @@ import java.io.InputStream;
 import java.io.OutputStream;
 import java.nio.ByteBuffer;
 import java.nio.charset.Charset;
+import java.nio.charset.StandardCharsets;
 
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -34,6 +35,8 @@ import org.slf4j.LoggerFactory;
 public class BytesUtils {
   private static final Logger LOG = LoggerFactory.getLogger(BytesUtils.class);
 
+  /** @deprecated Use {@link StandardCharsets#UTF_8} instead */
+  @Deprecated
   public static final Charset UTF8 = Charset.forName("UTF-8");
 
   /**
diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetFileWriter.java b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetFileWriter.java
index 6158dad..14e3729 100644
--- a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetFileWriter.java
+++ b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetFileWriter.java
@@ -24,7 +24,7 @@ import static org.apache.parquet.hadoop.ParquetWriter.DEFAULT_BLOCK_SIZE;
 import static org.apache.parquet.hadoop.ParquetWriter.MAX_PADDING_SIZE_DEFAULT;
 
 import java.io.IOException;
-import java.nio.charset.Charset;
+import java.nio.charset.StandardCharsets;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Collections;
@@ -101,7 +101,7 @@ public class ParquetFileWriter {
 
   public static final String PARQUET_METADATA_FILE = "_metadata";
   public static final String MAGIC_STR = "PAR1";
-  public static final byte[] MAGIC = MAGIC_STR.getBytes(Charset.forName("ASCII"));
+  public static final byte[] MAGIC = MAGIC_STR.getBytes(StandardCharsets.US_ASCII);
   public static final String PARQUET_COMMON_METADATA_FILE = "_common_metadata";
   public static final int CURRENT_VERSION = 1;
 


Mime
View raw message