impala-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From tarmstr...@apache.org
Subject [4/4] incubator-impala git commit: IMPALA-3227: generate test TPC data sets during data load
Date Thu, 28 Jul 2016 05:05:26 GMT
IMPALA-3227: generate test TPC data sets during data load

The generated data is identical to the pregenerated tpch.tar.gz
and tpcds.tar.gz data that was used previously and was not
publicly accessible.

This adds a "preload" hook to bin/load-data.py that can execute custom
logic for each data set. This is used to call the TPC-H and TPC-DS data
generation utilities that are already available in the Impala toolchain.

Testing:
Ran private test job with loading from snapshot disabled and without
the tpch/tpcds tarballs available.

Change-Id: Ieccfbd7d8d4a91bffddbe35abb7f5572e71a71cf
Reviewed-on: http://gerrit.cloudera.org:8080/3761
Reviewed-by: Tim Armstrong <tarmstrong@cloudera.com>
Tested-by: Internal Jenkins


Project: http://git-wip-us.apache.org/repos/asf/incubator-impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-impala/commit/c1d70f81
Tree: http://git-wip-us.apache.org/repos/asf/incubator-impala/tree/c1d70f81
Diff: http://git-wip-us.apache.org/repos/asf/incubator-impala/diff/c1d70f81

Branch: refs/heads/master
Commit: c1d70f814e777fed22e1804eeea12fc9adcb3163
Parents: 6fbd35f
Author: Tim Armstrong <tarmstrong@cloudera.com>
Authored: Sun Jul 24 23:50:56 2016 -0700
Committer: Internal Jenkins <cloudera-hudson@gerrit.cloudera.org>
Committed: Thu Jul 28 04:56:57 2016 +0000

----------------------------------------------------------------------
 bin/bootstrap_toolchain.py      |  2 +-
 bin/impala-config.sh            |  3 +++
 bin/load-data.py                | 10 ++++++++
 testdata/datasets/tpcds/preload | 49 ++++++++++++++++++++++++++++++++++++
 testdata/datasets/tpch/preload  | 47 ++++++++++++++++++++++++++++++++++
 5 files changed, 110 insertions(+), 1 deletion(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/c1d70f81/bin/bootstrap_toolchain.py
----------------------------------------------------------------------
diff --git a/bin/bootstrap_toolchain.py b/bin/bootstrap_toolchain.py
index 5a43e1d..def2184 100755
--- a/bin/bootstrap_toolchain.py
+++ b/bin/bootstrap_toolchain.py
@@ -341,7 +341,7 @@ if __name__ == "__main__":
 
   packages = ["avro", "binutils", "boost", "breakpad", "bzip2", "gcc", "gflags", "glog",
       "gperftools", "gtest", "kudu", "llvm", ("llvm", "3.8.0-asserts-p1"), "lz4",
-      "openldap", "rapidjson", "re2", "snappy", "thrift", "zlib"]
+      "openldap", "rapidjson", "re2", "snappy", "thrift", "tpc-h", "tpc-ds", "zlib"]
   bootstrap(toolchain_root, packages)
 
   # Download the CDH components if necessary.

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/c1d70f81/bin/impala-config.sh
----------------------------------------------------------------------
diff --git a/bin/impala-config.sh b/bin/impala-config.sh
index 4cb0339..7ad1b9d 100755
--- a/bin/impala-config.sh
+++ b/bin/impala-config.sh
@@ -269,6 +269,9 @@ export IMPALA_RAPIDJSON_VERSION=0.11
 export IMPALA_RE2_VERSION=20130115-p1
 export IMPALA_SNAPPY_VERSION=1.1.3
 export IMPALA_SQUEASEL_VERSION=3.3
+# TPC utilities used for test/benchmark data generation.
+export IMPALA_TPC_DS_VERSION=2.1.0
+export IMPALA_TPC_H_VERSION=2.17.0
 export IMPALA_THRIFT_VERSION=0.9.0-p8
 export IMPALA_THRIFT_JAVA_VERSION=0.9.0
 export IMPALA_ZLIB_VERSION=1.2.8

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/c1d70f81/bin/load-data.py
----------------------------------------------------------------------
diff --git a/bin/load-data.py b/bin/load-data.py
index 7c0bbf6..f62a701 100755
--- a/bin/load-data.py
+++ b/bin/load-data.py
@@ -153,6 +153,15 @@ def exec_bash_script(file_name):
   print 'Executing Bash Command: ' + bash_cmd
   exec_cmd(bash_cmd, 'Error bash script: ' + file_name)
 
+def run_dataset_preload(dataset):
+  """Execute a preload script if present in dataset directory. E.g. to generate data
+  before loading"""
+  dataset_preload_script = os.path.join(DATASET_DIR, dataset, "preload")
+  if os.path.exists(dataset_preload_script):
+    print("Running preload script for " + dataset)
+    exec_cmd(dataset_preload_script, "Error executing preload script for " + dataset,
+        exit_on_error=True)
+
 def generate_schema_statements(workload):
   generate_cmd = GENERATE_SCHEMA_CMD % (options.exploration_strategy, workload,
                                         options.scale_factor)
@@ -259,6 +268,7 @@ if __name__ == "__main__":
   for workload in workloads:
     start_time = time.time()
     dataset = get_dataset_for_workload(workload)
+    run_dataset_preload(dataset)
     generate_schema_statements(workload)
     sql_dir = os.path.join(SQL_OUTPUT_DIR, dataset)
     assert os.path.isdir(sql_dir),\

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/c1d70f81/testdata/datasets/tpcds/preload
----------------------------------------------------------------------
diff --git a/testdata/datasets/tpcds/preload b/testdata/datasets/tpcds/preload
new file mode 100755
index 0000000..0deb745
--- /dev/null
+++ b/testdata/datasets/tpcds/preload
@@ -0,0 +1,49 @@
+#!/bin/bash
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+set -euo pipefail
+trap 'echo Error in $0 at line $LINENO: $(cd "'$PWD'" && awk "NR == $LINENO" $0)' ERR
+
+IMPALA_DATA=${IMPALA_HOME}/testdata/impala-data
+TPC_DS_DATA=${IMPALA_DATA}/tpcds
+TPC_DS_DIRNAME=tpc-ds-${IMPALA_TPC_DS_VERSION}
+TPC_DS_HOME=${IMPALA_TOOLCHAIN}/${TPC_DS_DIRNAME}
+
+echo "Generating TPC-DS data into ${TPC_DS_DATA}"
+# Delete any preexisting data or symlinks
+rm -rf ${TPC_DS_DATA}
+mkdir -p ${TPC_DS_DATA}
+cd ${TPC_DS_DATA}
+
+# dsdgen uses fixed size buffers that cause bizarre issues if the path to the
+# binary is too long. Workaround by symlinking.
+rm -f ${TPC_DS_DIRNAME}
+ln -s ${TPC_DS_HOME} ${TPC_DS_DIRNAME}
+TPC_DS_DSDGEN=${TPC_DS_DIRNAME}/bin/dsdgen
+
+if [ ! -x ${TPC_DS_DSDGEN} ]; then
+  echo "Could not find TPC-DS data generator executable: ${TPC_DS_DSDGEN}"
+  exit 1
+fi
+${TPC_DS_DSDGEN} -force -verbose
+# Impala expects each table to be in its own subdirectory.
+for FILE in *.dat; do
+  FILE_DIR=${FILE%.dat}
+  mkdir -p ${FILE_DIR}
+  mv ${FILE} ${FILE_DIR}
+done

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/c1d70f81/testdata/datasets/tpch/preload
----------------------------------------------------------------------
diff --git a/testdata/datasets/tpch/preload b/testdata/datasets/tpch/preload
new file mode 100755
index 0000000..64553d6
--- /dev/null
+++ b/testdata/datasets/tpch/preload
@@ -0,0 +1,47 @@
+#!/bin/bash
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+set -euo pipefail
+trap 'echo Error in $0 at line $LINENO: $(cd "'$PWD'" && awk "NR == $LINENO" $0)' ERR
+
+IMPALA_DATA=${IMPALA_HOME}/testdata/impala-data
+TPC_H_DATA=${IMPALA_DATA}/tpch
+TPC_H_HOME=${IMPALA_TOOLCHAIN}/tpc-h-${IMPALA_TPC_H_VERSION}
+TPC_H_DBGEN=${TPC_H_HOME}/bin/dbgen
+
+if [ ! -x ${TPC_H_DBGEN} ]; then
+  echo "Could not find TPC-H data generator executable: ${TPC_H_DBGEN}"
+  exit 1
+fi
+
+echo "Generating TPC-H data into ${TPC_H_DATA}"
+# Delete any preexisting data or symlinks
+# Need to change permissions to work around an old TPC-H data tarball that had a
+# non-writable top-level directory when extracted.
+chmod +w ${TPC_H_DATA} || true
+rm -rf ${TPC_H_DATA}
+mkdir -p ${TPC_H_DATA}
+cd ${TPC_H_DATA}
+
+${TPC_H_DBGEN} -v -f
+# Impala expects each table to be in its own subdirectory.
+for FILE in *.tbl; do
+  FILE_DIR=${FILE%.tbl}
+  mkdir -p ${FILE_DIR}
+  mv ${FILE} ${FILE_DIR}
+done


Mime
View raw message