impala-commits mailing list archives

From jbap...@apache.org
Subject [19/51] [partial] incubator-impala git commit: IMPALA-3398: Add docs to main Impala branch.
Date Thu, 17 Nov 2016 23:11:57 GMT
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/3be0f122/docs/topics/impala_max_block_mgr_memory.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_max_block_mgr_memory.xml b/docs/topics/impala_max_block_mgr_memory.xml
new file mode 100644
index 0000000..3bf8ac8
--- /dev/null
+++ b/docs/topics/impala_max_block_mgr_memory.xml
@@ -0,0 +1,30 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd">
+<concept rev="2.1.0" id="max_block_mgr_memory">
+
+  <title>MAX_BLOCK_MGR_MEMORY</title>
+  <prolog>
+    <metadata>
+      <data name="Category" value="Impala"/>
+      <data name="Category" value="Impala Query Options"/>
+      <data name="Category" value="Memory"/>
+      <data name="Category" value="Developers"/>
+      <data name="Category" value="Data Analysts"/>
+    </metadata>
+  </prolog>
+
+  <conbody>
+
+    <p rev="2.1.0">
+      <indexterm audience="Cloudera">MAX_BLOCK_MGR_MEMORY query option</indexterm>
+    </p>
+
+    <p></p>
+
+    <p>
+      <b>Default:</b>
+    </p>
+
+    <p conref="../shared/impala_common.xml#common/added_in_20"/>
+  </conbody>
+</concept>

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/3be0f122/docs/topics/impala_max_errors.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_max_errors.xml b/docs/topics/impala_max_errors.xml
new file mode 100644
index 0000000..c6eb971
--- /dev/null
+++ b/docs/topics/impala_max_errors.xml
@@ -0,0 +1,47 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd">
+<concept id="max_errors">
+
+  <title>MAX_ERRORS Query Option</title>
+  <titlealts audience="PDF"><navtitle>MAX_ERRORS</navtitle></titlealts>
+  <prolog>
+    <metadata>
+      <data name="Category" value="Impala"/>
+      <data name="Category" value="Impala Query Options"/>
+      <data name="Category" value="Troubleshooting"/>
+      <data name="Category" value="Logs"/>
+      <data name="Category" value="Developers"/>
+      <data name="Category" value="Data Analysts"/>
+    </metadata>
+  </prolog>
+
+  <conbody>
+
+    <p>
+      <indexterm audience="Cloudera">MAX_ERRORS query option</indexterm>
+      Maximum number of non-fatal errors for any particular query that are recorded in the Impala log file. For
+      example, if a billion-row table had a non-fatal data error in every row, you could diagnose the problem
+      without all billion errors being logged. Unspecified or 0 indicates the built-in default value of 1000.
+    </p>
+
+    <p>
+      This option only controls how many errors are reported. To specify whether Impala continues
or halts when it
+      encounters such errors, use the <codeph>ABORT_ON_ERROR</codeph> option.
+    </p>
+
+    <p>
+      <b>Type:</b> numeric
+    </p>
+
+    <p>
+      <b>Default:</b> 0 (meaning 1000 errors)
+    </p>
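+
+    <p>
+      For illustration, the following <cmdname>impala-shell</cmdname> excerpt raises the limit before
+      running a query against a table suspected of containing bad records. (The table name is a
+      placeholder and the session output is abbreviated.)
+    </p>
+
+<codeblock>[localhost:21000] > set max_errors=100000;
+MAX_ERRORS set to 100000
+[localhost:21000] > select count(*) from suspect_table;  -- placeholder table name
+</codeblock>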
+
+    <p conref="../shared/impala_common.xml#common/related_info"/>
+    <p>
+      <xref href="impala_abort_on_error.xml#abort_on_error"/>,
+      <xref href="impala_logging.xml#logging"/>
+    </p>
+
+  </conbody>
+</concept>

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/3be0f122/docs/topics/impala_max_io_buffers.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_max_io_buffers.xml b/docs/topics/impala_max_io_buffers.xml
new file mode 100644
index 0000000..477c833
--- /dev/null
+++ b/docs/topics/impala_max_io_buffers.xml
@@ -0,0 +1,31 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd">
+<concept id="max_io_buffers">
+
+  <title>MAX_IO_BUFFERS Query Option</title>
+  <titlealts audience="PDF"><navtitle>MAX_IO_BUFFERS</navtitle></titlealts>
+  <prolog>
+    <metadata>
+      <data name="Category" value="Impala"/>
+      <data name="Category" value="Impala Query Options"/>
+      <data name="Category" value="Deprecated Features"/>
+      <data name="Category" value="Developers"/>
+      <data name="Category" value="Data Analysts"/>
+    </metadata>
+  </prolog>
+
+  <conbody>
+
+    <p>
+      Deprecated query option. Currently has no effect.
+    </p>
+
+    <p>
+      <b>Type:</b> numeric
+    </p>
+
+    <p>
+      <b>Default:</b> 0
+    </p>
+  </conbody>
+</concept>

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/3be0f122/docs/topics/impala_max_num_runtime_filters.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_max_num_runtime_filters.xml b/docs/topics/impala_max_num_runtime_filters.xml
new file mode 100644
index 0000000..edf4058
--- /dev/null
+++ b/docs/topics/impala_max_num_runtime_filters.xml
@@ -0,0 +1,61 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd">
+<concept id="max_num_runtime_filters" rev="2.5.0">
+
+  <title>MAX_NUM_RUNTIME_FILTERS Query Option (<keyword keyref="impala25"/> or higher only)</title>
+  <titlealts audience="PDF"><navtitle>MAX_NUM_RUNTIME_FILTERS</navtitle></titlealts>
+  <prolog>
+    <metadata>
+      <data name="Category" value="Impala"/>
+      <data name="Category" value="Impala Query Options"/>
+      <data name="Category" value="Performance"/>
+      <data name="Category" value="Developers"/>
+      <data name="Category" value="Data Analysts"/>
+    </metadata>
+  </prolog>
+
+  <conbody>
+
+    <p rev="2.5.0">
+      <indexterm audience="Cloudera">MAX_NUM_RUNTIME_FILTERS query option</indexterm>
+      The <codeph>MAX_NUM_RUNTIME_FILTERS</codeph> query option
+      sets an upper limit on the number of runtime filters that can be produced for each query.
+    </p>
+
+    <p conref="../shared/impala_common.xml#common/type_integer"/>
+
+    <p>
+      <b>Default:</b> 10
+    </p>
+
+    <p conref="../shared/impala_common.xml#common/added_in_250"/>
+
+    <p conref="../shared/impala_common.xml#common/usage_notes_blurb"/>
+
+    <p>
+      Each runtime filter imposes some memory overhead on the query.
+      Depending on the setting of the <codeph>RUNTIME_BLOOM_FILTER_SIZE</codeph>
+      query option, each filter might consume between 1 and 16 megabytes
+      per plan fragment. There are typically 5 or fewer filters per plan fragment.
+    </p>
+
+    <p>
+      Impala evaluates the effectiveness of each filter, and keeps the
+      ones that eliminate the largest number of partitions or rows.
+      Therefore, this setting can protect against
+      potential problems due to excessive memory overhead for filter production,
+      while still allowing a high level of optimization for suitable queries.
+    </p>
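+
+    <p>
+      For example, the following <cmdname>impala-shell</cmdname> excerpt (with a placeholder join query)
+      lowers the limit for a session where the memory overhead of filter production is a concern:
+    </p>
+
+<codeblock>[localhost:21000] > set max_num_runtime_filters=4;
+MAX_NUM_RUNTIME_FILTERS set to 4
+[localhost:21000] > select count(*) from big_fact f join small_dim d on f.dim_id = d.id;  -- placeholder tables
+</codeblock>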
+
+    <p conref="../shared/impala_common.xml#common/runtime_filtering_option_caveat"/>
+
+    <p conref="../shared/impala_common.xml#common/related_info"/>
+    <p>
+      <xref href="impala_runtime_filtering.xml"/>,
+      <!-- <xref href="impala_partitioning.xml#dynamic_partition_pruning"/>, -->
+      <xref href="impala_runtime_bloom_filter_size.xml#runtime_bloom_filter_size"/>,
+      <xref href="impala_runtime_filter_mode.xml#runtime_filter_mode"/>
+    </p>
+
+  </conbody>
+</concept>

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/3be0f122/docs/topics/impala_max_scan_range_length.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_max_scan_range_length.xml b/docs/topics/impala_max_scan_range_length.xml
new file mode 100644
index 0000000..a99b92b
--- /dev/null
+++ b/docs/topics/impala_max_scan_range_length.xml
@@ -0,0 +1,55 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd">
+<concept id="max_scan_range_length">
+
+  <title>MAX_SCAN_RANGE_LENGTH Query Option</title>
+  <titlealts audience="PDF"><navtitle>MAX_SCAN_RANGE_LENGTH</navtitle></titlealts>
+  <prolog>
+    <metadata>
+      <data name="Category" value="Impala"/>
+      <data name="Category" value="Impala Query Options"/>
+      <data name="Category" value="Performance"/>
+      <data name="Category" value="Developers"/>
+      <data name="Category" value="Data Analysts"/>
+    </metadata>
+  </prolog>
+
+  <conbody>
+
+    <p>
+      <indexterm audience="Cloudera">MAX_SCAN_RANGE_LENGTH query option</indexterm>
+      Maximum length of the scan range. Interacts with the number of HDFS blocks in the table to determine how many
+      CPU cores across the cluster are involved with the processing for a query. (Each core processes one scan
+      range.)
+    </p>
+
+    <p>
+      Lowering the value can sometimes increase parallelism if you have unused CPU capacity, but a too-small value
+      can limit query performance because each scan range involves extra overhead.
+    </p>
+
+    <p>
+      Only applicable to HDFS tables. Has no effect on Parquet tables. Unspecified or 0 indicates backend default,
+      which is the same as the HDFS block size for each table.
+    </p>
+
+    <p>
+      Although the scan range can be arbitrarily long, Impala internally uses an 8 MB read buffer so that it can
+      query tables with huge block sizes without allocating equivalent blocks of memory.
+    </p>
+
+    <p>
+      <b>Type:</b> numeric
+    </p>
+
+    <p rev="IMPALA-3799 CDH-43746">
+      In <keyword keyref="impala27_full"/> and higher, the argument value can include unit specifiers,
+      such as <codeph>100m</codeph> or <codeph>100mb</codeph>. In previous versions,
+      Impala interpreted such formatted values as 0, leading to query failures.
+    </p>
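+
+    <p>
+      For illustration, either of the following statements sets a scan range of roughly 128 megabytes,
+      the first as a plain byte count and the second using a unit suffix (the suffix form requires
+      <keyword keyref="impala27_full"/> or higher):
+    </p>
+
+<codeblock>set max_scan_range_length=134217728;
+set max_scan_range_length=128mb;
+</codeblock>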
+
+    <p>
+      <b>Default:</b> 0
+    </p>
+  </conbody>
+</concept>

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/3be0f122/docs/topics/impala_mem_limit.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_mem_limit.xml b/docs/topics/impala_mem_limit.xml
new file mode 100644
index 0000000..8ed6fbe
--- /dev/null
+++ b/docs/topics/impala_mem_limit.xml
@@ -0,0 +1,212 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd">
+<concept id="mem_limit">
+
+  <title>MEM_LIMIT Query Option</title>
+  <titlealts audience="PDF"><navtitle>MEM_LIMIT</navtitle></titlealts>
+  <prolog>
+    <metadata>
+      <data name="Category" value="Impala"/>
+      <data name="Category" value="Impala Query Options"/>
+      <data name="Category" value="Scalability"/>
+      <data name="Category" value="Memory"/>
+      <data name="Category" value="Troubleshooting"/>
+      <data name="Category" value="Developers"/>
+      <data name="Category" value="Data Analysts"/>
+    </metadata>
+  </prolog>
+
+  <conbody>
+
+    <p>
+      <indexterm audience="Cloudera">MEM_LIMIT query option</indexterm>
+      When resource management is not enabled, defines the maximum amount of memory a query can allocate on each node.
+      Therefore, the total memory that can be used by a query is the <codeph>MEM_LIMIT</codeph> times the number of nodes.
+    </p>
+
+    <p rev="CDH-32135">
+      There are two levels of memory limit for Impala.
+      The <codeph>-mem_limit</codeph> startup option sets an overall limit for the <cmdname>impalad</cmdname> process
+      (which handles multiple queries concurrently).
+      That limit is typically expressed in terms of a percentage of the RAM available on the host, such as <codeph>-mem_limit=70%</codeph>.
+      The <codeph>MEM_LIMIT</codeph> query option, which you set through <cmdname>impala-shell</cmdname>
+      or the <codeph>SET</codeph> statement in a JDBC or ODBC application, applies to each individual query.
+      The <codeph>MEM_LIMIT</codeph> query option is usually expressed as a fixed size such as <codeph>10gb</codeph>,
+      and must always be less than the <cmdname>impalad</cmdname> memory limit.
+    </p>
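+
+    <p>
+      For illustration, the process-wide limit is set when the daemon starts, while the per-query limit
+      can change from session to session (the values below are placeholders):
+    </p>
+
+<codeblock>-- Startup flag for the impalad daemon (not a SQL statement):
+--   -mem_limit=70%
+
+-- Per-query limit, set through impala-shell or a SET statement:
+set mem_limit=10gb;
+</codeblock>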
+
+    <p rev="CDH-32135">
+      If query processing exceeds the specified memory limit on any node, either the per-query limit or the
+      <cmdname>impalad</cmdname> limit, Impala cancels the query automatically.
+      Memory limits are checked periodically during query processing, so the actual memory in use
+      might briefly exceed the limit without the query being cancelled.
+    </p>
+
+    <p>
+      When resource management is enabled in CDH 5, the mechanism for this option changes. If set, it overrides the
+      automatic memory estimate from Impala. Impala requests this amount of memory from YARN on each node, and the
+      query does not proceed until that much memory is available. The actual memory used by the query could be
+      lower, since some queries use much less memory than others. With resource management, the
+      <codeph>MEM_LIMIT</codeph> setting acts both as a hard limit on the amount of memory a query can use on any
+      node (enforced by YARN) and a guarantee that that much memory will be available on each node while the query
+      is being executed. When resource management is enabled but no <codeph>MEM_LIMIT</codeph> setting is
+      specified, Impala estimates the amount of memory needed on each node for each query, requests that much
+      memory from YARN before starting the query, and then internally sets the <codeph>MEM_LIMIT</codeph> on each
+      node to the requested amount of memory during the query. Thus, if the query takes more memory than was
+      originally estimated, Impala detects that the <codeph>MEM_LIMIT</codeph> is exceeded and cancels the query
+      itself.
+    </p>
+
+    <p>
+      <b>Type:</b> numeric
+    </p>
+
+    <p rev="CDH-32135">
+      <b>Units:</b> A numeric argument represents memory size in bytes; you can also use a suffix of <codeph>m</codeph> or <codeph>mb</codeph>
+      for megabytes, or more commonly <codeph>g</codeph> or <codeph>gb</codeph> for gigabytes. If you specify a value
+      in an unrecognized format, subsequent queries fail with an error.
+    </p>
+
+    <p rev="CDH-32135">
+      <b>Default:</b> 0 (unlimited)
+    </p>
+
+    <p conref="../shared/impala_common.xml#common/usage_notes_blurb"/>
+
+    <p rev="CDH-32135">
+      The <codeph>MEM_LIMIT</codeph> setting is primarily useful in a high-concurrency setting,
+      or on a cluster with a workload shared between Impala and other data processing components.
+      You can prevent any query from accidentally using much more memory than expected,
+      which could negatively impact other Impala queries.
+    </p>
+
+    <p rev="CDH-32135">
+      Use the output of the <codeph>SUMMARY</codeph> command in <cmdname>impala-shell</cmdname>
+      to get a report of memory used for each phase of your most heavyweight queries on each node,
+      and then set a <codeph>MEM_LIMIT</codeph> somewhat higher than that.
+      See <xref href="impala_explain_plan.xml#perf_summary"/> for usage information about
+      the <codeph>SUMMARY</codeph> command.
+    </p>
+
+    <p conref="../shared/impala_common.xml#common/example_blurb" rev="CDH-32135"/>
+
+    <p rev="CDH-32135">
+      The following examples show how to set the <codeph>MEM_LIMIT</codeph> query option
+      using a fixed number of bytes, or suffixes representing gigabytes or megabytes.
+    </p>
+
+<codeblock rev="CDH-32135">
+[localhost:21000] > set mem_limit=3000000000;
+MEM_LIMIT set to 3000000000
+[localhost:21000] > select 5;
+Query: select 5
++---+
+| 5 |
++---+
+| 5 |
++---+
+
+[localhost:21000] > set mem_limit=3g;
+MEM_LIMIT set to 3g
+[localhost:21000] > select 5;
+Query: select 5
++---+
+| 5 |
++---+
+| 5 |
++---+
+
+[localhost:21000] > set mem_limit=3gb;
+MEM_LIMIT set to 3gb
+[localhost:21000] > select 5;
++---+
+| 5 |
++---+
+| 5 |
++---+
+
+[localhost:21000] > set mem_limit=3m;
+MEM_LIMIT set to 3m
+[localhost:21000] > select 5;
++---+
+| 5 |
++---+
+| 5 |
++---+
+[localhost:21000] > set mem_limit=3mb;
+MEM_LIMIT set to 3mb
+[nightly55-2.vpc.cloudera.com:21000] > select 5;
++---+
+| 5 |
++---+
+</codeblock>
+
+    <p rev="CDH-32135">
+      The following examples show how unrecognized <codeph>MEM_LIMIT</codeph>
+      values lead to errors for subsequent queries.
+    </p>
+
+<codeblock rev="CDH-32135">
+[localhost:21000] > set mem_limit=3tb;
+MEM_LIMIT set to 3tb
+[localhost:21000] > select 5;
+ERROR: Failed to parse query memory limit from '3tb'.
+
+[localhost:21000] > set mem_limit=xyz;
+MEM_LIMIT set to xyz
+[localhost:21000] > select 5;
+Query: select 5
+ERROR: Failed to parse query memory limit from 'xyz'.
+</codeblock>
+
+    <p rev="CDH-32135">
+      The following example shows the automatic query cancellation
+      when the <codeph>MEM_LIMIT</codeph> value is exceeded
+      on any host involved in the Impala query. First it runs a
+      successful query and checks the largest amount of memory
+      used on any node for any stage of the query.
+      Then it sets an artificially low <codeph>MEM_LIMIT</codeph>
+      setting so that the same query cannot run.
+    </p>
+
+<codeblock rev="CDH-32135">
+[localhost:21000] > select count(*) from customer;
+Query: select count(*) from customer
++----------+
+| count(*) |
++----------+
+| 150000   |
++----------+
+
+[localhost:21000] > select count(distinct c_name) from customer;
+Query: select count(distinct c_name) from customer
++------------------------+
+| count(distinct c_name) |
++------------------------+
+| 150000                 |
++------------------------+
+
+[localhost:21000] > summary;
++--------------+--------+----------+----------+---------+------------+----------+---------------+---------------+
+| Operator     | #Hosts | Avg Time | Max Time | #Rows   | Est. #Rows | Peak Mem | Est. Peak Mem | Detail        |
++--------------+--------+----------+----------+---------+------------+----------+---------------+---------------+
+| 06:AGGREGATE | 1      | 230.00ms | 230.00ms | 1       | 1          | 16.00 KB | -1 B          | FINALIZE      |
+| 05:EXCHANGE  | 1      | 43.44us  | 43.44us  | 1       | 1          | 0 B      | -1 B          | UNPARTITIONED |
+| 02:AGGREGATE | 1      | 227.14ms | 227.14ms | 1       | 1          | 12.00 KB | 10.00 MB      |               |
+| 04:AGGREGATE | 1      | 126.27ms | 126.27ms | 150.00K | 150.00K    | 15.17 MB | 10.00 MB      |               |
+| 03:EXCHANGE  | 1      | 44.07ms  | 44.07ms  | 150.00K | 150.00K    | 0 B      | 0 B           | HASH(c_name)  |
+<b>| 01:AGGREGATE | 1      | 361.94ms | 361.94ms | 150.00K | 150.00K    | 23.04 MB | 10.00 MB      |               |</b>
+| 00:SCAN HDFS | 1      | 43.64ms  | 43.64ms  | 150.00K | 150.00K    | 24.19 MB | 64.00 MB      | tpch.customer |
++--------------+--------+----------+----------+---------+------------+----------+---------------+---------------+
+
+[localhost:21000] > set mem_limit=15mb;
+MEM_LIMIT set to 15mb
+[localhost:21000] > select count(distinct c_name) from customer;
+Query: select count(distinct c_name) from customer
+ERROR:
+Memory limit exceeded
+Query did not have enough memory to get the minimum required buffers in the block manager.
+</codeblock>
+
+  </conbody>
+</concept>

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/3be0f122/docs/topics/impala_min.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_min.xml b/docs/topics/impala_min.xml
new file mode 100644
index 0000000..047b4ef
--- /dev/null
+++ b/docs/topics/impala_min.xml
@@ -0,0 +1,197 @@
+<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd">
+<concept id="min">
+
+  <title>MIN Function</title>
+  <titlealts audience="PDF"><navtitle>MIN</navtitle></titlealts>
+  <prolog>
+    <metadata>
+      <data name="Category" value="Impala"/>
+      <data name="Category" value="SQL"/>
+      <data name="Category" value="Impala Functions"/>
+      <data name="Category" value="Analytic Functions"/>
+      <data name="Category" value="Aggregate Functions"/>
+      <data name="Category" value="Querying"/>
+      <data name="Category" value="Developers"/>
+      <data name="Category" value="Data Analysts"/>
+    </metadata>
+  </prolog>
+
+  <conbody>
+
+    <p>
+      <indexterm audience="Cloudera">min() function</indexterm>
+      An aggregate function that returns the minimum value from a set of numbers. Opposite of the
+      <codeph>MAX</codeph> function. Its single argument can be a numeric column, or the numeric result of a function
+      or expression applied to the column value. Rows with a <codeph>NULL</codeph> value for the specified column
+      are ignored. If the table is empty, or all the values supplied to <codeph>MIN</codeph> are
+      <codeph>NULL</codeph>, <codeph>MIN</codeph> returns <codeph>NULL</codeph>.
+    </p>
+
+    <p conref="../shared/impala_common.xml#common/syntax_blurb"/>
+
+<codeblock>MIN([DISTINCT | ALL] <varname>expression</varname>) [OVER (<varname>analytic_clause</varname>)]</codeblock>
+
+    <p>
+      When the query contains a <codeph>GROUP BY</codeph> clause, returns one value for each combination of
+      grouping values.
+    </p>
+
+    <p conref="../shared/impala_common.xml#common/restrictions_sliding_window"/>
+
+    <p conref="../shared/impala_common.xml#common/return_type_same_except_string"/>
+
+    <p conref="../shared/impala_common.xml#common/usage_notes_blurb"/>
+
+    <p conref="../shared/impala_common.xml#common/partition_key_optimization"/>
+
+    <p conref="../shared/impala_common.xml#common/complex_types_blurb"/>
+
+    <p conref="../shared/impala_common.xml#common/complex_types_aggregation_explanation"/>
+
+    <p conref="../shared/impala_common.xml#common/complex_types_aggregation_example"/>
+
+    <p conref="../shared/impala_common.xml#common/example_blurb"/>
+
+<codeblock>-- Find the smallest value for this column in the table.
+select min(c1) from t1;
+-- Find the smallest value for this column from a subset of the table.
+select min(c1) from t1 where month = 'January' and year = '2013';
+-- Find the smallest value from a set of numeric function results.
+select min(length(s)) from t1;
+-- Can also be used in combination with DISTINCT and/or GROUP BY.
+-- Return more than one result.
+select month, year, min(purchase_price) from store_stats group by month, year;
+-- Filter the input to eliminate duplicates before performing the calculation.
+select min(distinct x) from t1;
+</codeblock>
+
+    <p rev="2.0.0">
+      The following examples show how to use <codeph>MIN()</codeph> in an analytic context. They use a table
+      containing integers from 1 to 10. Notice how the <codeph>MIN()</codeph> is reported for each input value, as
+      opposed to the <codeph>GROUP BY</codeph> clause which condenses the result set.
+<codeblock>select x, property, min(x) over (partition by property) as min from int_t where property in ('odd','even');
++----+----------+-----+
+| x  | property | min |
++----+----------+-----+
+| 2  | even     | 2   |
+| 4  | even     | 2   |
+| 6  | even     | 2   |
+| 8  | even     | 2   |
+| 10 | even     | 2   |
+| 1  | odd      | 1   |
+| 3  | odd      | 1   |
+| 5  | odd      | 1   |
+| 7  | odd      | 1   |
+| 9  | odd      | 1   |
++----+----------+-----+
+</codeblock>
+
+Adding an <codeph>ORDER BY</codeph> clause lets you experiment with results that are cumulative or apply to a moving
+set of rows (the <q>window</q>). The following examples use <codeph>MIN()</codeph> in an analytic context
+(that is, with an <codeph>OVER()</codeph> clause) to display the smallest value of <codeph>X</codeph>
+encountered up to each row in the result set. The examples use two columns in the <codeph>ORDER BY</codeph>
+clause to produce a sequence of values that rises and falls, to illustrate how the <codeph>MIN()</codeph>
+result only decreases or stays the same throughout each partition within the result set.
+The basic <codeph>ORDER BY x</codeph> clause implicitly
+activates a window clause of <codeph>RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW</codeph>,
+which is effectively the same as <codeph>ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW</codeph>,
+therefore all of these examples produce the same results:
+
+<codeblock>select x, property, min(x) <b>over (order by property, x desc)</b> as 'minimum to this point'
+  from int_t where property in ('prime','square');
++---+----------+-----------------------+
+| x | property | minimum to this point |
++---+----------+-----------------------+
+| 7 | prime    | 7                     |
+| 5 | prime    | 5                     |
+| 3 | prime    | 3                     |
+| 2 | prime    | 2                     |
+| 9 | square   | 2                     |
+| 4 | square   | 2                     |
+| 1 | square   | 1                     |
++---+----------+-----------------------+
+
+select x, property,
+  min(x) over
+  (
+    <b>order by property, x desc</b>
+    <b>range between unbounded preceding and current row</b>
+  ) as 'minimum to this point'
+from int_t where property in ('prime','square');
++---+----------+-----------------------+
+| x | property | minimum to this point |
++---+----------+-----------------------+
+| 7 | prime    | 7                     |
+| 5 | prime    | 5                     |
+| 3 | prime    | 3                     |
+| 2 | prime    | 2                     |
+| 9 | square   | 2                     |
+| 4 | square   | 2                     |
+| 1 | square   | 1                     |
++---+----------+-----------------------+
+
+select x, property,
+  min(x) over
+  (
+    <b>order by property, x desc</b>
+    <b>rows between unbounded preceding and current row</b>
+  ) as 'minimum to this point'
+from int_t where property in ('prime','square');
++---+----------+-----------------------+
+| x | property | minimum to this point |
++---+----------+-----------------------+
+| 7 | prime    | 7                     |
+| 5 | prime    | 5                     |
+| 3 | prime    | 3                     |
+| 2 | prime    | 2                     |
+| 9 | square   | 2                     |
+| 4 | square   | 2                     |
+| 1 | square   | 1                     |
++---+----------+-----------------------+
+</codeblock>
+
+The following examples show how to construct a moving window, with a running minimum taking into account all rows before
+and 1 row after the current row.
+Because of a restriction in the Impala <codeph>RANGE</codeph> syntax, this type of
+moving window is possible with the <codeph>ROWS BETWEEN</codeph> clause but not the <codeph>RANGE BETWEEN</codeph> clause.
+Because of an extra Impala restriction on the <codeph>MAX()</codeph> and <codeph>MIN()</codeph> functions in an
+analytic context, the lower bound must be <codeph>UNBOUNDED PRECEDING</codeph>.
+<codeblock>select x, property,
+  min(x) over
+  (
+    <b>order by property, x desc</b>
+    <b>rows between unbounded preceding and 1 following</b>
+  ) as 'local minimum'
+from int_t where property in ('prime','square');
++---+----------+---------------+
+| x | property | local minimum |
++---+----------+---------------+
+| 7 | prime    | 5             |
+| 5 | prime    | 3             |
+| 3 | prime    | 2             |
+| 2 | prime    | 2             |
+| 9 | square   | 2             |
+| 4 | square   | 1             |
+| 1 | square   | 1             |
++---+----------+---------------+
+
+-- Doesn't work because of syntax restriction on RANGE clause.
+select x, property,
+  min(x) over
+  (
+    <b>order by property, x desc</b>
+    <b>range between unbounded preceding and 1 following</b>
+  ) as 'local minimum'
+from int_t where property in ('prime','square');
+ERROR: AnalysisException: RANGE is only supported with both the lower and upper bounds UNBOUNDED or one UNBOUNDED and the other CURRENT ROW.
+</codeblock>
+    </p>
+
+    <p conref="../shared/impala_common.xml#common/related_info"/>
+
+    <p>
+      <xref href="impala_analytic_functions.xml#analytic_functions"/>, <xref href="impala_max.xml#max"/>,
+      <xref href="impala_avg.xml#avg"/>
+    </p>
+  </conbody>
+</concept>

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/3be0f122/docs/topics/impala_misc_functions.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_misc_functions.xml b/docs/topics/impala_misc_functions.xml
new file mode 100644
index 0000000..196f3c2
--- /dev/null
+++ b/docs/topics/impala_misc_functions.xml
@@ -0,0 +1,196 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd">
+<concept id="misc_functions">
+
+  <title>Impala Miscellaneous Functions</title>
+  <titlealts audience="PDF"><navtitle>Miscellaneous Functions</navtitle></titlealts>
+  <prolog>
+    <metadata>
+      <data name="Category" value="Impala"/>
+      <data name="Category" value="Impala Functions"/>
+      <data name="Category" value="SQL"/>
+      <data name="Category" value="Data Analysts"/>
+      <data name="Category" value="Developers"/>
+      <data name="Category" value="Querying"/>
+    </metadata>
+  </prolog>
+
+  <conbody>
+
+    <p>
+      Impala supports the following utility functions that do not operate on a particular column or data type:
+    </p>
+
+    <dl>
+      <dlentry rev="1.3.0" id="current_database">
+
+        <dt>
+          <codeph>current_database()</codeph>
+        </dt>
+
+        <dd>
+          <indexterm audience="Cloudera">current_database() function</indexterm>
+          <b>Purpose:</b> Returns the database that the session is currently using, either <codeph>default</codeph>
+          if no database has been selected, or whatever database the session switched to through a
+          <codeph>USE</codeph> statement or the <cmdname>impalad</cmdname> <codeph>-d</codeph> option.
+          <p>
+            <b>Return type:</b> <codeph>string</codeph>
+          </p>
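+          <p>
+            For example (output illustrative):
+          </p>
+<codeblock>[localhost:21000] > select current_database();
++--------------------+
+| current_database() |
++--------------------+
+| default            |
++--------------------+
+</codeblock>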
+        </dd>
+
+      </dlentry>
+
+      <dlentry rev="5.4.5" id="effective_user">
+
+        <dt>
+          <codeph>effective_user()</codeph>
+        </dt>
+
+        <dd>
+          <indexterm audience="Cloudera">effective_user() function</indexterm>
+          <b>Purpose:</b> Typically returns the same value as <codeph>user()</codeph>,
+          except if delegation is enabled, in which case it returns the ID of the delegated user.
+          <p>
+            <b>Return type:</b> <codeph>string</codeph>
+          </p>
+          <p>
+            <b>Added in:</b> CDH 5.4.5
+          </p>
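+          <p>
+            For example (the name shown is a placeholder; the result depends on your delegation setup):
+          </p>
+<codeblock>[localhost:21000] > select effective_user();
++------------------+
+| effective_user() |
++------------------+
+| etl_service      |
++------------------+
+</codeblock>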
+        </dd>
+
+      </dlentry>
+
+      <dlentry rev="1.3.0" id="pid">
+
+        <dt>
+          <codeph>pid()</codeph>
+        </dt>
+
+        <dd>
+          <indexterm audience="Cloudera">pid() function</indexterm>
+          <b>Purpose:</b> Returns the process ID of the <cmdname>impalad</cmdname> daemon that the session is
+          connected to. You can use it during low-level debugging, to issue Linux commands that trace, show the
+          arguments, and so on, for the <cmdname>impalad</cmdname> process.
+          <p>
+            <b>Return type:</b> <codeph>int</codeph>
+          </p>
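+          <p>
+            For example (the process ID shown is a placeholder):
+          </p>
+<codeblock>[localhost:21000] > select pid();
++-------+
+| pid() |
++-------+
+| 12345 |
++-------+
+</codeblock>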
+        </dd>
+
+      </dlentry>
+
+      <dlentry audience="Cloudera" id="sleep">
+
+        <dt>
+          <codeph>sleep(int ms)</codeph>
+        </dt>
+
+        <dd>
+          <indexterm audience="Cloudera">sleep() function</indexterm>
+          <b>Purpose:</b> Pauses the query for a specified number of milliseconds. For slowing down queries with
+          small result sets enough to monitor runtime execution, memory usage, or other factors that otherwise
+          would be difficult to capture during the brief interval of query execution. When used in the
+          <codeph>SELECT</codeph> list, it is called once for each row in the result set; adjust the number of
+          milliseconds accordingly. For example, a query <codeph>SELECT *, sleep(5) FROM
+          table_with_1000_rows</codeph> would take at least 5 seconds to complete (5 milliseconds * 1000 rows in
+          result set). To avoid an excessive number of concurrent queries, use this function for troubleshooting on
+          test and development systems, not for production queries.
+          <p>
+            <b>Return type:</b> N/A
+          </p>
+        </dd>
+
+      </dlentry>
+
+      <dlentry rev="1.1" id="user">
+
+        <dt>
+          <codeph>user()</codeph>
+        </dt>
+
+        <dd>
+          <indexterm audience="Cloudera">user() function</indexterm>
+          <b>Purpose:</b> Returns the username of the Linux user who is connected to the <cmdname>impalad</cmdname>
+          daemon. Typically called a single time, in a query without any <codeph>FROM</codeph> clause, to
+          understand how authorization settings apply in a security context; once you know the logged-in username,
+          you can check which groups that user belongs to, and from the list of groups you can check which roles
+          are available to those groups through the authorization policy file.
+          <p conref="../shared/impala_common.xml#common/user_kerberized"/>
+          <p>
+            When delegation is enabled, consider calling the <codeph>effective_user()</codeph> function instead.
+          </p>
+          <p>
+            <b>Return type:</b> <codeph>string</codeph>
+          </p>
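+          <p>
+            For example (the username shown is a placeholder for a non-Kerberized session):
+          </p>
+<codeblock>[localhost:21000] > select user();
++----------+
+| user()   |
++----------+
+| jrussell |
++----------+
+</codeblock>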
+        </dd>
+
+      </dlentry>
+
+      <dlentry rev="2.5.0 IMPALA-1477" id="uuid">
+
+        <dt>
+          <codeph>uuid()</codeph>
+        </dt>
+
+        <dd>
+          <indexterm audience="Cloudera">uuid() function</indexterm>
+          <b>Purpose:</b> Returns a <xref href="https://en.wikipedia.org/wiki/Universally_unique_identifier"
+          scope="external" format="html">universally unique identifier</xref>, a 128-bit value
+          encoded as a string with groups of hexadecimal digits separated by dashes.
+          <p>
+            <b>Return type:</b> <codeph>string</codeph>
+          </p>
+          <p conref="../shared/impala_common.xml#common/added_in_250"/>
+          <p conref="../shared/impala_common.xml#common/usage_notes_blurb"/>
+          <p>
+            Ascending numeric sequences of type <codeph>BIGINT</codeph> are often used
+            as identifiers within a table, and as join keys across multiple tables.
+            The <codeph>uuid()</codeph> value is a convenient alternative that does not
+            require storing or querying the highest sequence number. For example, you
+            can use it to quickly construct new unique identifiers during a data import job,
+            or to combine data from different tables without the likelihood of ID collisions.
+          </p>
+          <p conref="../shared/impala_common.xml#common/example_blurb"/>
+<codeblock>
+-- Each call to uuid() produces a new arbitrary value.
+select uuid();
++--------------------------------------+
+| uuid()                               |
++--------------------------------------+
+| c7013e25-1455-457f-bf74-a2046e58caea |
++--------------------------------------+
+
+-- If you get a UUID for each row of a result set, you can use it as a
+-- unique identifier within a table, or even a unique ID across tables.
+select uuid() from four_row_table;
++--------------------------------------+
+| uuid()                               |
++--------------------------------------+
+| 51d3c540-85e5-4cb9-9110-604e53999e2e |
+| 0bb40071-92f6-4a59-a6a4-60d46e9703e2 |
+| 5e9d7c36-9842-4a96-862d-c13cd0457c02 |
+| cae29095-0cc0-4053-a5ea-7fcd3c780861 |
++--------------------------------------+
+</codeblock>
+        </dd>
+
+      </dlentry>
+
+      <dlentry id="version">
+
+        <dt>
+          <codeph>version()</codeph>
+        </dt>
+
+        <dd>
+          <indexterm audience="Cloudera">version() function</indexterm>
+          <b>Purpose:</b> Returns information such as the precise version number and build date for the
+          <codeph>impalad</codeph> daemon that you are currently connected to. Typically used to confirm that you
+          are connected to the expected level of Impala to use a particular feature, or to connect to several nodes
+          and confirm they are all running the same level of <cmdname>impalad</cmdname>.
+          <p>
+            <b>Return type:</b> <codeph>string</codeph> (with one or more embedded newlines)
+          </p>
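+          <p>
+            For example (the version and build details are placeholders; the exact string varies by release):
+          </p>
+<codeblock>[localhost:21000] > select version();
++-------------------------------------------+
+| version()                                 |
++-------------------------------------------+
+| impalad version ... RELEASE (build ...)   |
+| Built on ...                              |
++-------------------------------------------+
+</codeblock>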
+        </dd>
+
+      </dlentry>
+    </dl>
+  </conbody>
+</concept>

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/3be0f122/docs/topics/impala_mixed_security.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_mixed_security.xml b/docs/topics/impala_mixed_security.xml
new file mode 100644
index 0000000..b9e6933
--- /dev/null
+++ b/docs/topics/impala_mixed_security.xml
@@ -0,0 +1,46 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd">
+<concept id="mixed_security">
+
+  <title>Using Multiple Authentication Methods with Impala</title>
+  <prolog>
+    <metadata>
+      <data name="Category" value="Security"/>
+      <data name="Category" value="Impala"/>
+      <data name="Category" value="Authentication"/>
+      <data name="Category" value="Kerberos"/>
+      <data name="Category" value="LDAP"/>
+      <data name="Category" value="Administrators"/>
+    </metadata>
+  </prolog>
+
+  <conbody>
+
+    <p>
+      Impala 2.0 and later automatically handles both Kerberos and LDAP authentication. Each
+      <cmdname>impalad</cmdname> daemon can accept both Kerberos and LDAP requests through the same port. No
+      special actions need to be taken if some users authenticate through Kerberos and some through LDAP.
+    </p>
+
+    <p>
+      Prior to Impala 2.0, you had to configure each <cmdname>impalad</cmdname> to listen on a specific port
+      depending on the kind of authentication, then configure your network load balancer to forward each kind of
+      request to a DataNode that was set up with the appropriate authentication type. Once the initial request was
+      made using either Kerberos or LDAP authentication, Impala automatically handled the process of coordinating
+      the work across multiple nodes and transmitting intermediate results back to the coordinator node.
+    </p>
+
+<!--
+    <p>
+    This technique is most suitable for larger clusters, where
+    you are already using load balancing software for high availability.
+    You configure Impala to run on a different port on the nodes configured for LDAP.
+    Then you configure the load balancing software to forward Kerberos
+    connection requests to nodes using the default port, and LDAP connection requests
+    to nodes using an alternative port for LDAP.
+    Consult the documentation for your load balancing software for how to
+    configure that type of forwarding.
+    </p>
+-->
+  </conbody>
+</concept>

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/3be0f122/docs/topics/impala_ndv.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_ndv.xml b/docs/topics/impala_ndv.xml
new file mode 100644
index 0000000..fcdae77
--- /dev/null
+++ b/docs/topics/impala_ndv.xml
@@ -0,0 +1,135 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd">
+<concept rev="1.2.1" id="ndv">
+
+  <title>NDV Function</title>
+  <titlealts audience="PDF"><navtitle>NDV</navtitle></titlealts>
+  <prolog>
+    <metadata>
+      <data name="Category" value="Impala"/>
+      <data name="Category" value="SQL"/>
+      <data name="Category" value="Impala Functions"/>
+      <data name="Category" value="Aggregate Functions"/>
+      <data name="Category" value="Querying"/>
+      <data name="Category" value="Developers"/>
+      <data name="Category" value="Data Analysts"/>
+    </metadata>
+  </prolog>
+
+  <conbody>
+
+    <p>
+      <indexterm audience="Cloudera">ndv() function</indexterm>
+      An aggregate function that returns an approximate value similar to the result of <codeph>COUNT(DISTINCT
+      <varname>col</varname>)</codeph>, the <q>number of distinct values</q>. It is much faster than the
+      combination of <codeph>COUNT</codeph> and <codeph>DISTINCT</codeph>, and uses a constant amount of memory and
+      thus is less memory-intensive for columns with high cardinality.
+    </p>
+
+    <p conref="../shared/impala_common.xml#common/syntax_blurb"/>
+
+<codeblock>NDV([DISTINCT | ALL] <varname>expression</varname>)</codeblock>
+
+    <p conref="../shared/impala_common.xml#common/usage_notes_blurb"/>
+
+    <p rev="1.2.2">
+      This is the mechanism used internally by the <codeph>COMPUTE STATS</codeph> statement for computing the
+      number of distinct values in a column.
+    </p>
+
+    <p>
+      Because this number is an estimate, it might not reflect the precise number of different values in the
+      column, especially if the cardinality is very low or very high. If the estimated number is higher than the
+      number of rows in the table, Impala adjusts the value internally during query planning.
+    </p>
+
+    <p conref="../shared/impala_common.xml#common/former_odd_return_type_string"/>
+
+<!-- <p conref="../shared/impala_common.xml#common/restrictions_sliding_window"/> -->
+
+    <p conref="../shared/impala_common.xml#common/complex_types_blurb"/>
+
+    <p conref="../shared/impala_common.xml#common/complex_types_aggregation_explanation"/>
+
+    <p conref="../shared/impala_common.xml#common/complex_types_aggregation_example"/>
+
+    <p conref="../shared/impala_common.xml#common/restrictions_blurb"/>
+
+    <p conref="../shared/impala_common.xml#common/analytic_not_allowed_caveat"/>
+
+    <p conref="../shared/impala_common.xml#common/example_blurb"/>
+
+    <p>
+      The following example queries a billion-row table to illustrate the relative performance of
+      <codeph>COUNT(DISTINCT)</codeph> and <codeph>NDV()</codeph>. It shows how <codeph>COUNT(DISTINCT)</codeph>
+      gives a precise answer, but is inefficient for large-scale data where an approximate result is sufficient.
+      The <codeph>NDV()</codeph> function gives an approximate result but is much faster.
+    </p>
+
+<codeblock>select count(distinct col1) from sample_data;
++---------------------+
+| count(distinct col1)|
++---------------------+
+| 100000              |
++---------------------+
+Fetched 1 row(s) in 20.13s
+
+select cast(ndv(col1) as bigint) as col1 from sample_data;
++----------+
+| col1     |
++----------+
+| 139017   |
++----------+
+Fetched 1 row(s) in 8.91s
+</codeblock>
+
+    <p>
+      The following example shows how you can code multiple <codeph>NDV()</codeph> calls in a single query, to
+      easily learn which columns have substantially more or fewer distinct values. This technique is faster than
+      running a sequence of queries with <codeph>COUNT(DISTINCT)</codeph> calls.
+    </p>
+
+<codeblock>select cast(ndv(col1) as bigint) as col1, cast(ndv(col2) as bigint) as col2,
+    cast(ndv(col3) as bigint) as col3, cast(ndv(col4) as bigint) as col4
+  from sample_data;
++----------+-----------+------------+-----------+
+| col1     | col2      | col3       | col4      |
++----------+-----------+------------+-----------+
+| 139017   | 282       | 46         | 145636240 |
++----------+-----------+------------+-----------+
+Fetched 1 row(s) in 34.97s
+
+select count(distinct col1) from sample_data;
++---------------------+
+| count(distinct col1)|
++---------------------+
+| 100000              |
++---------------------+
+Fetched 1 row(s) in 20.13s
+
+select count(distinct col2) from sample_data;
++----------------------+
+| count(distinct col2) |
++----------------------+
+| 278                  |
++----------------------+
+Fetched 1 row(s) in 20.09s
+
+select count(distinct col3) from sample_data;
++-----------------------+
+| count(distinct col3)  |
++-----------------------+
+| 46                    |
++-----------------------+
+Fetched 1 row(s) in 19.12s
+
+select count(distinct col4) from sample_data;
++----------------------+
+| count(distinct col4) |
++----------------------+
+| 147135880            |
++----------------------+
+Fetched 1 row(s) in 266.95s
+</codeblock>
+  </conbody>
+</concept>


