impala-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
Subject [3/3] incubator-impala git commit: IMPALA-5583: [DOCS] Document default_join_distribution_mode query option
Date Mon, 10 Jul 2017 23:09:09 GMT
IMPALA-5583: [DOCS] Document default_join_distribution_mode query option

New page for the query option.

Change-Id: I4ec6213efc46bce0fe07c590841d51c009fb5c84
Reviewed-by: Mostafa Mokhtar <>
Tested-by: Impala Public Jenkins


Branch: refs/heads/master
Commit: 801c32dec3914939c95c2cab07f8628dd627aef5
Parents: db3f323
Author: John Russell <>
Authored: Mon Jun 26 15:49:27 2017 -0700
Committer: Impala Public Jenkins <>
Committed: Mon Jul 10 23:08:12 2017 +0000

 docs/impala.ditamap                             |   1 +
 docs/impala_keydefs.ditamap                     |   1 +
 .../impala_default_join_distribution_mode.xml   | 134 +++++++++++++++++++
 3 files changed, 136 insertions(+)
diff --git a/docs/impala.ditamap b/docs/impala.ditamap
index 574602a..b10ddbf 100644
--- a/docs/impala.ditamap
+++ b/docs/impala.ditamap
@@ -176,6 +176,7 @@ under the License.
           <topicref href="topics/impala_batch_size.xml"/>
           <topicref href="topics/impala_compression_codec.xml"/>
           <topicref href="topics/impala_debug_action.xml"/>
+          <topicref rev="2.9.0 IMPALA-5381" href="topics/impala_default_join_distribution_mode.xml"/>
           <topicref href="topics/impala_default_order_by_limit.xml"/>
           <topicref audience="hidden" href="topics/impala_disable_cached_reads.xml"/>
           <topicref href="topics/impala_disable_codegen.xml"/>
diff --git a/docs/impala_keydefs.ditamap b/docs/impala_keydefs.ditamap
index 7c9bb60..378a5bb 100644
--- a/docs/impala_keydefs.ditamap
+++ b/docs/impala_keydefs.ditamap
@@ -10749,6 +10749,7 @@ under the License.
   <keydef href="topics/impala_batch_size.xml" keys="batch_size"/>
   <keydef href="topics/impala_compression_codec.xml" keys="compression_codec"/>
   <keydef href="topics/impala_debug_action.xml" keys="debug_action"/>
+  <keydef href="topics/impala_default_join_distribution_mode.xml" keys="default_join_distribution_mode"/>
   <keydef href="topics/impala_default_order_by_limit.xml" keys="default_order_by_limit"/>
   <keydef href="topics/impala_disable_cached_reads.xml" keys="disable_cached_reads"/>
   <keydef href="topics/impala_disable_codegen.xml" keys="disable_codegen"/>
diff --git a/docs/topics/impala_default_join_distribution_mode.xml b/docs/topics/impala_default_join_distribution_mode.xml
new file mode 100644
index 0000000..1b17d50
--- /dev/null
+++ b/docs/topics/impala_default_join_distribution_mode.xml
@@ -0,0 +1,134 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+Licensed to the Apache Software Foundation (ASF) under one
+or more contributor license agreements.  See the NOTICE file
+distributed with this work for additional information
+regarding copyright ownership.  The ASF licenses this file
+to you under the Apache License, Version 2.0 (the
+"License"); you may not use this file except in compliance
+with the License.  You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing,
+software distributed under the License is distributed on an
+"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+KIND, either express or implied.  See the License for the
+specific language governing permissions and limitations
+under the License.
+-->
+<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd">
+<concept id="default_join_distribution_mode" rev="2.9.0 IMPALA-5381 IMPALA-5583">
+  <title>DEFAULT_JOIN_DISTRIBUTION_MODE Query Option</title>
+  <titlealts audience="PDF"><navtitle>DEFAULT_JOIN_DISTRIBUTION_MODE</navtitle></titlealts>
+  <prolog>
+    <metadata>
+      <data name="Category" value="Impala"/>
+      <data name="Category" value="Impala Query Options"/>
+      <data name="Category" value="Performance"/>
+      <data name="Category" value="Querying"/>
+      <data name="Category" value="Developers"/>
+      <data name="Category" value="Data Analysts"/>
+    </metadata>
+  </prolog>
+  <conbody>
+    <p>
+      <indexterm audience="hidden">DEFAULT_JOIN_DISTRIBUTION_MODE query option</indexterm>
+      This option determines the join distribution that Impala uses when any of the tables
+      involved in a join query is missing statistics.
+    </p>
+    <p>
+      Impala optimizes join queries based on the presence of table statistics,
+      which are produced by the Impala <codeph>COMPUTE STATS</codeph> statement.
+      By default, when a table involved in the join query does not have statistics,
+      Impala uses the <q>broadcast</q> technique that transmits the entire contents
+      of the table to all executor nodes participating in the query. If one table
+      involved in a join has statistics and the other does not, the table without
+      statistics is broadcast. If both tables are missing statistics, the table
+      that is referenced second in the join order is broadcast. This behavior
+      is appropriate when the table involved is relatively small, but can lead to
+      excessive network, memory, and CPU overhead if the table being broadcast is
+      large.
+    </p>
+    <p>
+      Because Impala queries frequently involve very large tables, and suboptimal
+      joins for such tables could result in spilling or out-of-memory errors,
+      the setting <codeph>DEFAULT_JOIN_DISTRIBUTION_MODE=SHUFFLE</codeph> lets
+      you override the default behavior. The shuffle join mechanism divides the corresponding
+      rows of each table involved in a join query using a hashing algorithm, and transmits
+      subsets of the rows to other nodes for processing. Typically, this kind of join is
+      more efficient for joins between large tables of similar size.
+    </p>
+    <p>
+      The setting <codeph>DEFAULT_JOIN_DISTRIBUTION_MODE=SHUFFLE</codeph> is
+      recommended when setting up and deploying new clusters, because it is less likely
+      to result in serious consequences such as spilling or out-of-memory errors if
+      the query plan is based on incomplete information. This setting is not the default,
+      to avoid changing the performance characteristics of join queries for clusters that
+      are already tuned for their existing workloads.
+    </p>
+    <p conref="../shared/impala_common.xml#common/type_integer"/>
+    <p>
+      The allowed values are <codeph>BROADCAST</codeph> (equivalent to 0)
+      or <codeph>SHUFFLE</codeph> (equivalent to 1).
+    </p>
+    <p conref="../shared/impala_common.xml#common/example_blurb"/>
+    <p>
+      The following examples demonstrate appropriate scenarios for each
+      setting of this query option.
+    </p>
+<codeblock>
+-- Create a billion-row table.
+create table big_table stored as parquet
+  as select * from huge_table limit 1e9;
+-- For a big table with no statistics, the
+-- shuffle join mechanism is appropriate.
+set default_join_distribution_mode=shuffle;
+...join queries involving the big table...
+</codeblock>
+<codeblock>
+-- Create a hundred-row table.
+create table tiny_table stored as parquet
+  as select * from huge_table limit 100;
+-- For a tiny table with no statistics, the
+-- broadcast join mechanism is appropriate.
+set default_join_distribution_mode=broadcast;
+...join queries involving the tiny table...
+</codeblock>
+<codeblock>
+compute stats tiny_table;
+compute stats big_table;
+-- Once the stats are computed, the query option has
+-- no effect on join queries involving these tables.
+-- Impala can determine the absolute and relative sizes
+-- of each side of the join query by examining the
+-- row size, cardinality, and so on of each table.
+...join queries involving both of these tables...
+</codeblock>
+    <p conref="../shared/impala_common.xml#common/related_info"/>
+    <p>
+      <xref keyref="compute_stats"/>,
+      <xref keyref="joins"/>,
+      <xref keyref="perf_joins"/>
+    </p>
+  </conbody>
+</concept>

View raw message