impala-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From tarmstr...@apache.org
Subject [1/2] incubator-impala git commit: IMPALA-3223: Supports download of CDH components from S3.
Date Tue, 21 Jun 2016 07:37:59 GMT
Repository: incubator-impala
Updated Branches:
  refs/heads/master 8a04b170d -> fc3ff1c52


IMPALA-3223: Supports download of CDH components from S3.

This change updates the toolchain bootstrapping script
to download the CDH components (hadoop, hbase, hive, llama,
llama-minikdc and sentry) from the toolchain S3 bucket to
the toolchain directory if the environment variable
$DOWNLOAD_CDH_COMPONENTS is true. By default, it is false,
which means the CDH components in the thirdparty directory
will be used instead.

To build the ASF tree (https://git-wip-us.apache.org/repos/asf?p=incubator-impala.git),
set $DOWNLOAD_CDH_COMPONENTS to true. Currently, the CDH
components in S3 are snapshots from the thirdparty directory
at 688d0efcd38731e8e27a8236dbdca21c8fd571a1. Once the integration
jenkins job (impala-cdh5-trunk-core-integration) is modified
to upload the latest stable builds to the S3 buckets, we can
remove the thirdparty directory and always use the CDH components
in the toolchain directory.

Note that bootstrap_toolchain.py will not overwrite existing
directories in the toolchain directory. To force a refresh of
components in the toolchain directory, a user should delete the
cached copy in the toolchain directory and execute
bootstrap_toolchain.py again. This behavior allows users to
develop locally without a network connection once the toolchain
has been bootstrapped.

Change-Id: I16fa79db0005554cc0a116e74775647ba99f8dda
Reviewed-on: http://gerrit.cloudera.org:8080/3333
Reviewed-by: Michael Ho <kwho@cloudera.com>
Tested-by: Internal Jenkins


Project: http://git-wip-us.apache.org/repos/asf/incubator-impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-impala/commit/6e71e903
Tree: http://git-wip-us.apache.org/repos/asf/incubator-impala/tree/6e71e903
Diff: http://git-wip-us.apache.org/repos/asf/incubator-impala/diff/6e71e903

Branch: refs/heads/master
Commit: 6e71e903ff0b7181c487376512c8dc1421642636
Parents: 8a04b17
Author: Michael Ho <kwho@cloudera.com>
Authored: Fri May 27 19:19:36 2016 -0700
Committer: Tim Armstrong <tarmstrong@cloudera.com>
Committed: Tue Jun 21 00:37:53 2016 -0700

----------------------------------------------------------------------
 CMakeLists.txt                                  |  15 ---
 bin/bootstrap_toolchain.py                      | 114 +++++++++++++------
 bin/impala-config.sh                            |  23 +++-
 buildall.sh                                     |  12 +-
 .../cdh5/etc/init.d/llama-application           |   2 +-
 5 files changed, 106 insertions(+), 60 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/6e71e903/CMakeLists.txt
----------------------------------------------------------------------
diff --git a/CMakeLists.txt b/CMakeLists.txt
index c6638bd..1569bd3 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -14,21 +14,6 @@
 
 cmake_minimum_required(VERSION 2.6)
 
-if ("$ENV{SKIP_TOOLCHAIN_BOOTSTRAP}" STREQUAL "true")
-  message(STATUS "SKIP_TOOLCHAIN_BOOTSTRAP is true, skipping toolchain bootstrap.")
-else()
-  # Download any missing toolchain dependencies. If this fails, fail the build.
-  set(BOOTSTRAP_CMD "$ENV{IMPALA_HOME}/bin/bootstrap_toolchain.py")
-  # Download and unpack the dependencies
-  message(STATUS "Downloading and extracting dependencies.")
-  execute_process(COMMAND ${BOOTSTRAP_CMD} RESULT_VARIABLE BOOTSTRAP_RESULT)
-  if (${BOOTSTRAP_RESULT} EQUAL 0)
-    message(STATUS "Toolchain bootstrap complete.")
-  else()
-    message(FATAL_ERROR "Toolchain bootstrap failed.")
-  endif()
-endif()
-
 # Explicitly define project() to allow modifying the compiler before the project is
 # initialized.
 project(Impala)

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/6e71e903/bin/bootstrap_toolchain.py
----------------------------------------------------------------------
diff --git a/bin/bootstrap_toolchain.py b/bin/bootstrap_toolchain.py
index 2589d1f..5a43e1d 100755
--- a/bin/bootstrap_toolchain.py
+++ b/bin/bootstrap_toolchain.py
@@ -11,15 +11,15 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
-# Bootstrapping the native toolchain with prebuilt binaries
 #
-# The purpose of this script is to download prebuilt artifacts of the native toolchain to
-# satisfy the third-party dependencies for Impala. The script checks for the presence of
-# IMPALA_HOME and IMPALA_TOOLCHAIN. IMPALA_HOME indicates that the environment is
-# correctly setup and that we can deduce the version settings of the dependencies from the
-# environment. IMPALA_TOOLCHAIN indicates the location where the prebuilt artifacts should
-# be extracted to.
+# The purpose of this script is to download prebuilt binaries and jar files to satisfy the
+# third-party dependencies for Impala. The script checks for the presence of IMPALA_HOME
+# and IMPALA_TOOLCHAIN. IMPALA_HOME indicates that the environment is correctly setup and
+# that we can deduce the version settings of the dependencies from the environment.
+# IMPALA_TOOLCHAIN indicates the location where the prebuilt artifacts should be extracted
+# to. If DOWNLOAD_CDH_COMPONENTS is set to true, this script will also download and extract
+# the CDH components (i.e. Hadoop, Hive, HBase, Llama, Llama-minikdc and Sentry) into
+# CDH_COMPONENTS_HOME.
 #
 # The script is called as follows without any additional parameters:
 #
@@ -29,6 +29,7 @@ import re
 import sh
 import shutil
 import subprocess
+import sys
 import tempfile
 
 HOST = "https://native-toolchain.s3.amazonaws.com/build"
@@ -67,45 +68,30 @@ def get_platform_release_label(release=None):
 
   raise Exception("Could not find package label for OS version: {0}.".format(release))
 
-def download_package(destination, product, version, compiler, platform_release=None):
-  remove_existing_package(destination, product, version)
-
-  label = get_platform_release_label(release=platform_release)
-  file_name = "{0}-{1}-{2}-{3}.tar.gz".format(product, version, compiler, label)
-  url_path="/{0}/{1}-{2}/{0}-{1}-{2}-{3}.tar.gz".format(product, version, compiler, label)
-  download_path = HOST + url_path
 
+def wget_and_unpack_package(download_path, file_name, destination, wget_no_clobber):
   print "URL {0}".format(download_path)
   print "Downloading {0} to {1}".format(file_name, destination)
   # --no-clobber avoids downloading the file if a file with the name already exists
-  sh.wget(download_path, directory_prefix=destination, no_clobber=True)
+  sh.wget(download_path, directory_prefix=destination, no_clobber=wget_no_clobber)
   print "Extracting {0}".format(file_name)
   sh.tar(z=True, x=True, f=os.path.join(destination, file_name), directory=destination)
   sh.rm(os.path.join(destination, file_name))
 
-def bootstrap(packages):
-  """Validates the presence of $IMPALA_HOME and $IMPALA_TOOLCHAIN in the environment. By
-  checking $IMPALA_HOME is present, we assume that IMPALA_{LIB}_VERSION will be present as
-  well. Will create the directory specified by $IMPALA_TOOLCHAIN if it does not yet
-  exist. Each of the packages specified in `packages` is downloaded and extracted into
-  $IMPALA_TOOLCHAIN.
-
-  """
-  if not os.getenv("IMPALA_HOME"):
-    print("Impala environment not set up correctly, make sure "
-          "impala-config.sh is sourced.")
-    sys.exit(1)
+def download_package(destination, product, version, compiler, platform_release=None):
+  remove_existing_package(destination, product, version)
 
-  # Create the destination directory if necessary
-  toolchain_root = os.getenv("IMPALA_TOOLCHAIN")
-  if not toolchain_root:
-    print("Impala environment not set up correctly, make sure "
-          "$IMPALA_TOOLCHAIN is present.")
-    sys.exit(1)
+  label = get_platform_release_label(release=platform_release)
+  file_name = "{0}-{1}-{2}-{3}.tar.gz".format(product, version, compiler, label)
+  url_path="/{0}/{1}-{2}/{0}-{1}-{2}-{3}.tar.gz".format(product, version, compiler, label)
+  download_path = HOST + url_path
 
-  if not os.path.exists(toolchain_root):
-    os.makedirs(toolchain_root)
+  wget_and_unpack_package(download_path, file_name, destination, True)
 
+def bootstrap(toolchain_root, packages):
+  """Downloads and unpacks each package in the list `packages` into `toolchain_root` if it
+  doesn't exist already.
+  """
   if not try_get_platform_release_label():
     check_custom_toolchain(toolchain_root, packages)
     return
@@ -303,8 +289,62 @@ extern "C" void %s() {
   finally:
     shutil.rmtree(stub_build_dir)
 
+def download_cdh_components(toolchain_root, cdh_components):
+  """Downloads and unpacks the CDH components into $CDH_COMPONENTS_HOME if not found."""
+  cdh_components_home = os.getenv("CDH_COMPONENTS_HOME")
+  if not cdh_components_home:
+    print("Impala environment not set up correctly, make sure "
+          "$CDH_COMPONENTS_HOME is present.")
+    return
+
+  # Create the directory where CDH components live if necessary.
+  if not os.path.exists(cdh_components_home):
+    os.makedirs(cdh_components_home)
+
+  # The URL prefix of where CDH components live in S3.
+  download_path_prefix = HOST + "/cdh_components/"
+
+  for component in cdh_components:
+    pkg_name, pkg_version = unpack_name_and_version(component)
+    pkg_directory = package_directory(cdh_components_home, pkg_name, pkg_version)
+    if os.path.isdir(pkg_directory):
+      continue
+
+    # Download the package if it doesn't exist
+    file_name = "{0}-{1}.tar.gz".format(pkg_name, pkg_version)
+    download_path = download_path_prefix + file_name
+    wget_and_unpack_package(download_path, file_name, cdh_components_home, False)
+
 if __name__ == "__main__":
+  """Validates the presence of $IMPALA_HOME and $IMPALA_TOOLCHAIN in the environment.-
+  By checking $IMPALA_HOME is present, we assume that IMPALA_{LIB}_VERSION will be present
+  as well. Will create the directory specified by $IMPALA_TOOLCHAIN if it doesn't exist
+  yet. Each of the packages specified in `packages` is downloaded and extracted into
+  $IMPALA_TOOLCHAIN. If $DOWNLOAD_CDH_COMPONENTS is true, this function will also download
+  the CDH components (i.e. hadoop, hbase, hive, llama, llama-minikidc and sentry) into the
+  directory specified by $CDH_COMPONENTS_HOME.
+  """
+  if not os.getenv("IMPALA_HOME"):
+    print("Impala environment not set up correctly, make sure "
+          "impala-config.sh is sourced.")
+    sys.exit(1)
+
+  # Create the destination directory if necessary
+  toolchain_root = os.getenv("IMPALA_TOOLCHAIN")
+  if not toolchain_root:
+    print("Impala environment not set up correctly, make sure "
+          "$IMPALA_TOOLCHAIN is present.")
+    sys.exit(1)
+
+  if not os.path.exists(toolchain_root):
+    os.makedirs(toolchain_root)
+
   packages = ["avro", "binutils", "boost", "breakpad", "bzip2", "gcc", "gflags", "glog",
       "gperftools", "gtest", "kudu", "llvm", ("llvm", "3.8.0-asserts-p1"), "lz4",
       "openldap", "rapidjson", "re2", "snappy", "thrift", "zlib"]
-  bootstrap(packages)
+  bootstrap(toolchain_root, packages)
+
+  # Download the CDH components if necessary.
+  if os.getenv("DOWNLOAD_CDH_COMPONENTS", "false") == "true":
+    cdh_components = ["hadoop", "hbase", "hive", "llama", "llama-minikdc", "sentry"]
+    download_cdh_components(toolchain_root, cdh_components)

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/6e71e903/bin/impala-config.sh
----------------------------------------------------------------------
diff --git a/bin/impala-config.sh b/bin/impala-config.sh
index 584e5a2..9db640a 100755
--- a/bin/impala-config.sh
+++ b/bin/impala-config.sh
@@ -46,6 +46,9 @@ if [ -z $IMPALA_TOOLCHAIN ]; then
   return 1
 fi
 
+# If true, will not call $IMPALA_HOME/bin/bootstrap_toolchain.py.
+: ${SKIP_TOOLCHAIN_BOOTSTRAP=false}
+
 # This flag is used in $IMPALA_HOME/cmake_modules/toolchain.cmake.
 # If it's 0, Impala will be built with the compiler in the toolchain directory.
 : ${USE_SYSTEM_GCC=0}
@@ -63,11 +66,17 @@ fi
 # If enabled, debug symbols are added to cross-compiled IR.
 : ${ENABLE_IMPALA_IR_DEBUG_INFO=false}
 
+# If true, download and use the CDH components from S3 instead of the ones
+# in $IMPALA_HOME/thirdparty.
+: ${DOWNLOAD_CDH_COMPONENTS=false}
+
 export IMPALA_TOOLCHAIN
+export SKIP_TOOLCHAIN_BOOTSTRAP
 export USE_SYSTEM_GCC
 export USE_GOLD_LINKER
 export IMPALA_CXX_COMPILER
 export ENABLE_IMPALA_IR_DEBUG_INFO
+export DOWNLOAD_CDH_COMPONENTS
 export IS_OSX=$(if [[ "$OSTYPE" == "darwin"* ]]; then echo true; else echo false; fi)
 
 # To use a local build of Kudu, set KUDU_BUILD_DIR to the path Kudu was built in and
@@ -280,7 +289,7 @@ export IMPALA_HIVE_VERSION=1.1.0-cdh5.9.0-SNAPSHOT
 export IMPALA_SENTRY_VERSION=1.5.1-cdh5.9.0-SNAPSHOT
 export IMPALA_LLAMA_VERSION=1.0.0-cdh5.9.0-SNAPSHOT
 export IMPALA_PARQUET_VERSION=1.5.0-cdh5.9.0-SNAPSHOT
-export IMPALA_MINIKDC_VERSION=1.0.0
+export IMPALA_LLAMA_MINIKDC_VERSION=1.0.0
 
 export IMPALA_FE_DIR=$IMPALA_HOME/fe
 export IMPALA_BE_DIR=$IMPALA_HOME/be
@@ -292,7 +301,11 @@ export IMPALA_COMMON_DIR=$IMPALA_HOME/common
 export PATH=$IMPALA_HOME/bin:$PATH
 
 # The directory in which all the thirdparty CDH components live.
-CDH_COMPONENTS_HOME=$IMPALA_HOME/thirdparty
+if [ "${DOWNLOAD_CDH_COMPONENTS}" = true ]; then
+  export CDH_COMPONENTS_HOME=$IMPALA_TOOLCHAIN/cdh_components
+else
+  export CDH_COMPONENTS_HOME=$IMPALA_HOME/thirdparty
+fi
 
 # Hadoop dependencies are snapshots in the Impala tree
 export HADOOP_HOME=$CDH_COMPONENTS_HOME/hadoop-${IMPALA_HADOOP_VERSION}/
@@ -308,7 +321,7 @@ export MINI_DFS_BASE_DATA_DIR=$IMPALA_HOME/cdh-${CDH_MAJOR_VERSION}-hdfs-data
 export PATH=$HADOOP_HOME/bin:$PATH
 
 export LLAMA_HOME=$CDH_COMPONENTS_HOME/llama-${IMPALA_LLAMA_VERSION}/
-export MINIKDC_HOME=$CDH_COMPONENTS_HOME/llama-minikdc-${IMPALA_MINIKDC_VERSION}
+export MINIKDC_HOME=$CDH_COMPONENTS_HOME/llama-minikdc-${IMPALA_LLAMA_MINIKDC_VERSION}
 export SENTRY_HOME=$CDH_COMPONENTS_HOME/sentry-${IMPALA_SENTRY_VERSION}
 export SENTRY_CONF_DIR=$IMPALA_HOME/fe/src/test/resources
 
@@ -382,10 +395,10 @@ export JAVA_LIBRARY_PATH=${IMPALA_SNAPPY_PATH}
 LIB_JAVA=`find ${JAVA_HOME}/   -name libjava.so | head -1`
 LIB_JSIG=`find ${JAVA_HOME}/   -name libjsig.so | head -1`
 LIB_JVM=` find ${JAVA_HOME}/   -name libjvm.so  | head -1`
-LIB_HDFS=`find ${HADOOP_HOME}/ -name libhdfs.so | head -1`
 LD_LIBRARY_PATH="${LD_LIBRARY_PATH-}"
 LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:`dirname ${LIB_JAVA}`:`dirname ${LIB_JSIG}`"
-LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:`dirname ${LIB_JVM}`:`dirname ${LIB_HDFS}`"
+LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:`dirname ${LIB_JVM}`"
+LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:${HADOOP_HOME}/lib/native"
 LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:${IMPALA_HOME}/be/build/debug/service"
 LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:${IMPALA_SNAPPY_PATH}"
 LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:${IMPALA_LZO}/build"

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/6e71e903/buildall.sh
----------------------------------------------------------------------
diff --git a/buildall.sh b/buildall.sh
index 2c49b6f..b4542d8 100755
--- a/buildall.sh
+++ b/buildall.sh
@@ -244,6 +244,15 @@ if [ $CLEAN_ACTION -eq 1 ]; then
     $IMPALA_HOME/bin/clean.sh
 fi
 
+# Populate necessary thirdparty components unless it's set to be skipped.
+if [ "${SKIP_TOOLCHAIN_BOOTSTRAP}" = true ]; then
+  echo "SKIP_TOOLCHAIN_BOOTSTRAP is true, skipping toolchain bootstrap."
+else
+  echo "Downloading and extracting dependencies."
+  $IMPALA_HOME/bin/bootstrap_toolchain.py
+  echo "Toolchain bootstrap complete."
+fi
+
 MAKE_IMPALA_ARGS="${MAKE_IMPALA_ARGS} -build_type=${CMAKE_BUILD_TYPE}"
 
 if [ $BUILD_FE_ONLY -eq 1 ]; then
@@ -254,8 +263,7 @@ fi
 
 if [ -e $HADOOP_LZO/build/native/Linux-*-*/lib/libgplcompression.so ]
 then
-  cp $HADOOP_LZO/build/native/Linux-*-*/lib/libgplcompression.* \
-    $IMPALA_HOME/thirdparty/hadoop-${IMPALA_HADOOP_VERSION}/lib/native/
+  cp $HADOOP_LZO/build/native/Linux-*-*/lib/libgplcompression.* $HADOOP_HOME/lib/native
 else
   echo "No hadoop-lzo found"
 fi

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/6e71e903/testdata/cluster/node_templates/cdh5/etc/init.d/llama-application
----------------------------------------------------------------------
diff --git a/testdata/cluster/node_templates/cdh5/etc/init.d/llama-application b/testdata/cluster/node_templates/cdh5/etc/init.d/llama-application
index d19a42b..b519528 100755
--- a/testdata/cluster/node_templates/cdh5/etc/init.d/llama-application
+++ b/testdata/cluster/node_templates/cdh5/etc/init.d/llama-application
@@ -7,7 +7,7 @@ DIR=$(dirname $0)
 
 HADOOP_LOG_DIR="$LOG_DIR/llama"
 HADOOP_CLASSPATH="$NODE_DIR/etc/llama/conf:$HADOOP_CLASSPATH"
-for JAR in $(find "$IMPALA_HOME"/thirdparty/llama* -name "*jar"); do
+for JAR in $(find "$LLAMA_HOME" -name "*jar"); do
   HADOOP_CLASSPATH="$JAR:$HADOOP_CLASSPATH"
 done
 export HADOOP_CLASSPATH


Mime
View raw message