parquet-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From jul...@apache.org
Subject [parquet-mr] branch master updated: PARQUET-1135: upgrade thrift and protobuf dependencies
Date Sat, 10 Mar 2018 00:14:29 GMT
This is an automated email from the ASF dual-hosted git repository.

julien pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/parquet-mr.git


The following commit(s) were added to refs/heads/master by this push:
     new 3d2d4fd  PARQUET-1135: upgrade thrift and protobuf dependencies
3d2d4fd is described below

commit 3d2d4fd1588c8eb3f67f34d75b66967d0c7b06b6
Author: Julien Le Dem <julien.ledem@wework.com>
AuthorDate: Fri Mar 9 16:14:11 2018 -0800

    PARQUET-1135: upgrade thrift and protobuf dependencies
    
    Author: Julien Le Dem <julien.ledem@wework.com>
    Author: Julien Le Dem <julien@ledem.net>
    
    Closes #427 from julienledem/PARQUET_1135_thrift_PB and squashes the following commits:
    
    f23b32d9 [Julien Le Dem] remove double install
    78cbf734 [Julien Le Dem] remove running check on protobuf build
    4bc2b8f7 [Julien Le Dem] add timing; upgrade proto version
    e17ca956 [Julien Le Dem] without-nodejs
    d15e523d [Julien Le Dem] PARQUET-1135: upgrade thrift and protobuf dependencies
---
 .travis.yml                                        | 21 ++++++++------
 parquet-protobuf/pom.xml                           |  2 +-
 .../hadoop/thrift/ThriftBytesWriteSupport.java     | 32 ++++++++++++++++++++--
 pom.xml                                            |  2 +-
 4 files changed, 44 insertions(+), 13 deletions(-)

diff --git a/.travis.yml b/.travis.yml
index 55f6e9a..ef02f9b 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -1,34 +1,37 @@
 language: java
 before_install:
+  - date
   - sudo apt-get update -qq
   - sudo apt-get install build-essential
+  - date
   - mkdir protobuf_install
   - pushd protobuf_install
-  - wget https://github.com/google/protobuf/archive/v3.2.0.tar.gz -O protobuf-3.2.0.tar.gz
-  - tar xzf protobuf-3.2.0.tar.gz
-  - cd protobuf-3.2.0
+  - wget https://github.com/google/protobuf/archive/v3.5.1.tar.gz -O protobuf-3.5.1.tar.gz
+  - tar xzf protobuf-3.5.1.tar.gz
+  - cd protobuf-3.5.1
   - sudo apt-get install autoconf automake libtool curl make g++ unzip
   - ./autogen.sh
   - ./configure
   - make
-  - make check
   - sudo make install
   - sudo ldconfig
   - protoc --version
   - popd
+  - date
   - pwd
   - sudo apt-get install -qq libboost-dev libboost-test-dev libboost-program-options-dev libevent-dev automake libtool flex bison pkg-config g++ libssl-dev
-  - wget -nv http://archive.apache.org/dist/thrift/0.7.0/thrift-0.7.0.tar.gz
-  - tar zxf thrift-0.7.0.tar.gz
-  - cd thrift-0.7.0
+  - wget -nv http://archive.apache.org/dist/thrift/0.9.3/thrift-0.9.3.tar.gz
+  - tar zxf thrift-0.9.3.tar.gz
+  - cd thrift-0.9.3
   - chmod +x ./configure
-  - ./configure --disable-gen-erl --disable-gen-hs --without-ruby --without-haskell --without-erlang --without-php
+  - ./configure --disable-gen-erl --disable-gen-hs --without-ruby --without-haskell --without-erlang --without-php --without-nodejs
   - sudo make install
   - cd ..
+  - date
 
 env:
   - HADOOP_PROFILE=default TEST_CODECS=uncompressed,brotli
   - HADOOP_PROFILE=default TEST_CODECS=gzip,snappy
 
-install: mvn install --batch-mode -DskipTests=true -Dmaven.javadoc.skip=true -Dsource.skip=true > mvn_install.log || mvn install --batch-mode -DskipTests=true -Dmaven.javadoc.skip=true -Dsource.skip=true > mvn_install.log || (cat mvn_install.log && false)
+install: mvn install --batch-mode -DskipTests=true -Dmaven.javadoc.skip=true -Dsource.skip=true > mvn_install.log || (cat mvn_install.log && false)
 script: mvn test -P $HADOOP_PROFILE
diff --git a/parquet-protobuf/pom.xml b/parquet-protobuf/pom.xml
index 979b436..5bab770 100644
--- a/parquet-protobuf/pom.xml
+++ b/parquet-protobuf/pom.xml
@@ -31,7 +31,7 @@
 
   <properties>
     <elephant-bird.version>4.4</elephant-bird.version>
-    <protobuf.version>3.2.0</protobuf.version>
+    <protobuf.version>3.5.1</protobuf.version>
     <!-- allow using protoc from an alternative path -->
     <protoc.path>protoc</protoc.path>
   </properties>
diff --git a/parquet-thrift/src/main/java/org/apache/parquet/hadoop/thrift/ThriftBytesWriteSupport.java b/parquet-thrift/src/main/java/org/apache/parquet/hadoop/thrift/ThriftBytesWriteSupport.java
index ba46c84..6f5d50d 100644
--- a/parquet-thrift/src/main/java/org/apache/parquet/hadoop/thrift/ThriftBytesWriteSupport.java
+++ b/parquet-thrift/src/main/java/org/apache/parquet/hadoop/thrift/ThriftBytesWriteSupport.java
@@ -19,6 +19,8 @@
 package org.apache.parquet.hadoop.thrift;
 
 import java.io.ByteArrayInputStream;
+import java.lang.reflect.InvocationTargetException;
+import java.lang.reflect.Method;
 
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.io.BytesWritable;
@@ -28,7 +30,8 @@ import org.apache.thrift.protocol.TBinaryProtocol;
 import org.apache.thrift.protocol.TProtocol;
 import org.apache.thrift.protocol.TProtocolFactory;
 import org.apache.thrift.transport.TIOStreamTransport;
-
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 import org.apache.parquet.hadoop.BadConfigurationException;
 import org.apache.parquet.hadoop.api.WriteSupport;
 import org.apache.parquet.io.ColumnIOFactory;
@@ -45,6 +48,7 @@ import org.apache.parquet.thrift.ThriftSchemaConverter;
 import org.apache.parquet.thrift.struct.ThriftType.StructType;
 
 public class ThriftBytesWriteSupport extends WriteSupport<BytesWritable> {
+  private static final Logger LOG = LoggerFactory.getLogger(ThriftBytesWriteSupport.class);
   private static final String PARQUET_PROTOCOL_CLASS = "parquet.protocol.class";
 
   public static <U extends TProtocol> void setTProtocolClass(Configuration conf, Class<U> tProtocolClass) {
@@ -123,8 +127,32 @@ public class ThriftBytesWriteSupport extends WriteSupport<BytesWritable> {
     return thriftWriteSupport.init(configuration);
   }
 
+  private static Method SET_READ_LENGTH;
+  static {
+    try {
+      SET_READ_LENGTH = TBinaryProtocol.class.getMethod("setReadLength", int.class);
+    } catch (NoSuchMethodException e) {
+      SET_READ_LENGTH = null;
+    }
+  }
+
   private TProtocol protocol(BytesWritable record) {
-    return protocolFactory.getProtocol(new TIOStreamTransport(new ByteArrayInputStream(record.getBytes())));
+    TProtocol protocol = protocolFactory.getProtocol(new TIOStreamTransport(new ByteArrayInputStream(record.getBytes())));
+
+    /* Reduce the chance of OOM when data is corrupted. When readBinary is called on TBinaryProtocol, it reads the length of the binary first,
+     so if the data is corrupted, it could read a big integer as the length of the binary and therefore causes OOM to happen.
+     Currently this fix only applies to TBinaryProtocol which has the setReadLength defined (thrift 0.7).
+      */
+    if (SET_READ_LENGTH != null && protocol instanceof TBinaryProtocol) {
+      try {
+        SET_READ_LENGTH.invoke(protocol, new Object[]{record.getLength()});
+      } catch (IllegalAccessException | IllegalArgumentException | InvocationTargetException e) {
+        LOG.warn("setReadLength should not throw an exception", e);
+        SET_READ_LENGTH = null;
+      }
+    }
+
+    return protocol;
   }
 
   @Override
diff --git a/pom.xml b/pom.xml
index 8a8e91a..c8c8ccf 100644
--- a/pom.xml
+++ b/pom.xml
@@ -90,8 +90,8 @@
     <scala.maven.test.skip>false</scala.maven.test.skip>
     <pig.version>0.16.0</pig.version>
     <pig.classifier>h2</pig.classifier>
-    <thrift.version>0.7.0</thrift.version>
     <thrift-maven-plugin.version>0.10.0</thrift-maven-plugin.version>
+    <thrift.version>0.9.3</thrift.version>
     <fastutil.version>7.0.13</fastutil.version>
     <semver.api.version>0.9.33</semver.api.version>
     <slf4j.version>1.7.22</slf4j.version>

-- 
To stop receiving notification emails like this one, please contact
julien@apache.org.

Mime
View raw message