drill-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From tshi...@apache.org
Subject [06/14] drill git commit: BB adds query exec, Aman's review
Date Wed, 20 May 2015 06:06:23 GMT
BB adds query exec, Aman's review


Project: http://git-wip-us.apache.org/repos/asf/drill/repo
Commit: http://git-wip-us.apache.org/repos/asf/drill/commit/f4968c30
Tree: http://git-wip-us.apache.org/repos/asf/drill/tree/f4968c30
Diff: http://git-wip-us.apache.org/repos/asf/drill/diff/f4968c30

Branch: refs/heads/gh-pages
Commit: f4968c30841d86c4178974e172c09d854a31f42f
Parents: 9c24b34
Author: Kristine Hahn <khahn@maprtech.com>
Authored: Tue May 19 13:24:16 2015 -0700
Committer: Kristine Hahn <khahn@maprtech.com>
Committed: Tue May 19 13:24:16 2015 -0700

----------------------------------------------------------------------
 _data/docs.json                                 |  75 ++++++++++++++++---
 .../010-architecture-introduction.md            |  15 +---
 _docs/architecture/015-drill-query-execution.md |  65 ++++++++++++++++
 _docs/img/client-phys-plan.png                  | Bin 0 -> 13083 bytes
 _docs/img/ex-operator.png                       | Bin 0 -> 8582 bytes
 _docs/img/execution-tree.PNG                    | Bin 0 -> 13849 bytes
 _docs/img/leaf-frag.png                         | Bin 0 -> 13577 bytes
 _docs/img/min-frag.png                          | Bin 0 -> 14425 bytes
 _docs/img/operators.png                         | Bin 0 -> 45966 bytes
 _docs/img/query-flow-client.png                 | Bin 0 -> 13734 bytes
 .../040-modifying-query-planning-options.md     |   8 +-
 ...esson-3-run-queries-on-complex-data-types.md |   3 +-
 12 files changed, 132 insertions(+), 34 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/drill/blob/f4968c30/_data/docs.json
----------------------------------------------------------------------
diff --git a/_data/docs.json b/_data/docs.json
index eefb5b8..8185d14 100644
--- a/_data/docs.json
+++ b/_data/docs.json
@@ -417,8 +417,8 @@
                         }
                     ], 
                     "children": [], 
-                    "next_title": "Core Modules", 
-                    "next_url": "/docs/core-modules/", 
+                    "next_title": "Drill Query Execution", 
+                    "next_url": "/docs/drill-query-execution/", 
                     "parent": "Architecture", 
                     "previous_title": "Architecture", 
                     "previous_url": "/docs/architecture/", 
@@ -434,11 +434,28 @@
                         }
                     ], 
                     "children": [], 
-                    "next_title": "Performance", 
-                    "next_url": "/docs/performance/", 
+                    "next_title": "Core Modules", 
+                    "next_url": "/docs/core-modules/", 
                     "parent": "Architecture", 
                     "previous_title": "Architecture Introduction", 
                     "previous_url": "/docs/architecture-introduction/", 
+                    "relative_path": "_docs/architecture/015-drill-query-execution.md", 
+                    "title": "Drill Query Execution", 
+                    "url": "/docs/drill-query-execution/"
+                }, 
+                {
+                    "breadcrumbs": [
+                        {
+                            "title": "Architecture", 
+                            "url": "/docs/architecture/"
+                        }
+                    ], 
+                    "children": [], 
+                    "next_title": "Performance", 
+                    "next_url": "/docs/performance/", 
+                    "parent": "Architecture", 
+                    "previous_title": "Drill Query Execution", 
+                    "previous_url": "/docs/drill-query-execution/", 
                     "relative_path": "_docs/architecture/020-core-modules.md", 
                     "title": "Core Modules", 
                     "url": "/docs/core-modules/"
@@ -478,8 +495,8 @@
                 }
             ], 
             "children": [], 
-            "next_title": "Core Modules", 
-            "next_url": "/docs/core-modules/", 
+            "next_title": "Drill Query Execution", 
+            "next_url": "/docs/drill-query-execution/", 
             "parent": "Architecture", 
             "previous_title": "Architecture", 
             "previous_url": "/docs/architecture/", 
@@ -1716,8 +1733,8 @@
             "next_title": "Performance", 
             "next_url": "/docs/performance/", 
             "parent": "Architecture", 
-            "previous_title": "Architecture Introduction", 
-            "previous_url": "/docs/architecture-introduction/", 
+            "previous_title": "Drill Query Execution", 
+            "previous_url": "/docs/drill-query-execution/", 
             "relative_path": "_docs/architecture/020-core-modules.md", 
             "title": "Core Modules", 
             "url": "/docs/core-modules/"
@@ -2763,6 +2780,23 @@
             "title": "Drill Plan Syntax", 
             "url": "/docs/drill-plan-syntax/"
         }, 
+        "Drill Query Execution": {
+            "breadcrumbs": [
+                {
+                    "title": "Architecture", 
+                    "url": "/docs/architecture/"
+                }
+            ], 
+            "children": [], 
+            "next_title": "Core Modules", 
+            "next_url": "/docs/core-modules/", 
+            "parent": "Architecture", 
+            "previous_title": "Architecture Introduction", 
+            "previous_url": "/docs/architecture-introduction/", 
+            "relative_path": "_docs/architecture/015-drill-query-execution.md", 
+            "title": "Drill Query Execution", 
+            "url": "/docs/drill-query-execution/"
+        }, 
         "Drill in 10 Minutes": {
             "breadcrumbs": [
                 {
@@ -10371,8 +10405,8 @@
                         }
                     ], 
                     "children": [], 
-                    "next_title": "Core Modules", 
-                    "next_url": "/docs/core-modules/", 
+                    "next_title": "Drill Query Execution", 
+                    "next_url": "/docs/drill-query-execution/", 
                     "parent": "Architecture", 
                     "previous_title": "Architecture", 
                     "previous_url": "/docs/architecture/", 
@@ -10388,11 +10422,28 @@
                         }
                     ], 
                     "children": [], 
-                    "next_title": "Performance", 
-                    "next_url": "/docs/performance/", 
+                    "next_title": "Core Modules", 
+                    "next_url": "/docs/core-modules/", 
                     "parent": "Architecture", 
                     "previous_title": "Architecture Introduction", 
                     "previous_url": "/docs/architecture-introduction/", 
+                    "relative_path": "_docs/architecture/015-drill-query-execution.md", 
+                    "title": "Drill Query Execution", 
+                    "url": "/docs/drill-query-execution/"
+                }, 
+                {
+                    "breadcrumbs": [
+                        {
+                            "title": "Architecture", 
+                            "url": "/docs/architecture/"
+                        }
+                    ], 
+                    "children": [], 
+                    "next_title": "Performance", 
+                    "next_url": "/docs/performance/", 
+                    "parent": "Architecture", 
+                    "previous_title": "Drill Query Execution", 
+                    "previous_url": "/docs/drill-query-execution/", 
                     "relative_path": "_docs/architecture/020-core-modules.md", 
                     "title": "Core Modules", 
                     "url": "/docs/core-modules/"

http://git-wip-us.apache.org/repos/asf/drill/blob/f4968c30/_docs/architecture/010-architecture-introduction.md
----------------------------------------------------------------------
diff --git a/_docs/architecture/010-architecture-introduction.md b/_docs/architecture/010-architecture-introduction.md
old mode 100644
new mode 100755
index e80af26..7c06469
--- a/_docs/architecture/010-architecture-introduction.md
+++ b/_docs/architecture/010-architecture-introduction.md
@@ -29,20 +29,7 @@ Though Drill works in a Hadoop cluster environment, Drill is not tied to
 Hadoop and can run in any distributed cluster environment. The only pre-
 requisite for Drill is Zookeeper.
 
-## Query Flow in Drill
-
-The following image represents the flow of a Drill query:
- 
-![drill query flow]({{ site.baseurl }}/docs/img/queryFlow.png)
-
-The flow of a Drill query typically involves the following steps:
-
-  1. The Drill client issues a query. Any Drillbit in the cluster can accept queries from
clients. There is no master-slave concept.
-  2. The Drillbit then parses the query, optimizes it, and generates an optimized distributed
query plan for fast and efficient execution.
-  3. The Drillbit that accepts the query becomes the driving Drillbit node for the request.
It gets a list of available Drillbit nodes in the cluster from ZooKeeper. The driving Drillbit
determines the appropriate nodes to execute various query plan fragments to maximize data
locality.
-  4. The Drillbit schedules the execution of query fragments on individual nodes according
to the execution plan.
-  5. The individual nodes finish their execution and return data to the driving Drillbit.
-  6. The driving Drillbit returns results back to the client.
+See [Drill Query Execution]({{ site.baseurl }}/docs/drill-query-execution/).
 
 ## Drill Clients
 

http://git-wip-us.apache.org/repos/asf/drill/blob/f4968c30/_docs/architecture/015-drill-query-execution.md
----------------------------------------------------------------------
diff --git a/_docs/architecture/015-drill-query-execution.md b/_docs/architecture/015-drill-query-execution.md
new file mode 100755
index 0000000..730460a
--- /dev/null
+++ b/_docs/architecture/015-drill-query-execution.md
@@ -0,0 +1,65 @@
+---
+title: "Drill Query Execution"
+parent: "Architecture"
+---
+
+When you submit a Drill query, a client or an application sends the query in the form of
an SQL statement to a Drillbit in the Drill cluster. A Drillbit is the process running on
each active Drill node that coordinates, plans, and executes queries, as well as distributes
query work across the cluster to maximize data locality.
+
+The following image represents the communication between clients, applications, and Drillbits:
+
+![]({{ site.baseurl }}/docs/img/query-flow-client.png)
+
+The Drillbit that receives the query from a client or application becomes the Foreman for
the query and drives the entire query. A parser in the Foreman parses the SQL, applying custom
rules to convert specific SQL operators into a specific logical operator syntax that Drill
understands. This collection of logical operators forms a logical plan. The logical plan describes
the work required to generate the query results and defines what data sources and operations
to apply.
+
+The Foreman sends the logical plan into a cost-based optimizer to optimize the order of SQL
operators in a statement and read the logical plan. The optimizer applies various types of
rules to rearrange operators and functions into an optimal plan. The optimizer converts the
logical plan into a physical plan that describes how to execute the query.
+
+![]({{ site.baseurl }}/docs/img/client-phys-plan.png)
+
+A parallelizer in the Foreman transforms the physical plan into multiple phases, called major
and minor fragments. These fragments create a multi-level execution tree that rewrites the
query and executes it in parallel against the configured data sources, sending the results
back to the client or application.
+
+![]({{ site.baseurl }}/docs/img/execution-tree.PNG)  
+
+
+## Major Fragments
+A major fragment is an abstract concept that represents a phase of the query execution. A
phase can consist of one or multiple operations that Drill must perform to execute the query.
Drill assigns each major fragment a MajorFragmentID.
+
+For example, to perform a hash aggregation of two files, Drill may create a plan with two
major phases (major fragments) where the first phase is dedicated to scanning the two files
and the second phase is dedicated to the aggregation of the data.  
+
+![]({{ site.baseurl }}/docs/img/ex-operator.png)
+
+Drill separates major fragments by an exchange operator. An exchange is a change in data
location and/or parallelization of the physical plan. An exchange is composed of a sender
and a receiver to allow data to move between nodes. 
+
+Major fragments do not actually perform any query tasks. Each major fragment is divided into
one or multiple minor fragments (discussed in the next section) that actually execute the
operations required to complete the query and return results back to the client.
+
+You can interact with major fragments within the physical plan by capturing a JSON representation
of the plan in a file, manually modifying it, and then submitting it back to Drill using the
SUBMIT PLAN command. You can also view major fragments in the query profile, which is visible
in the Drill Web UI. See [EXPLAIN]({{ site.baseurl }}/docs/explain/) and [Query Profiles]({{
site.baseurl }}/docs/query-profiles/) for more information.
+
+## Minor Fragments
+Each major fragment is parallelized into minor fragments. A minor fragment is a logical unit
of work that runs inside of a thread. A logical unit of work in Drill is also referred to
as a slice. The execution plan that Drill creates is composed of minor fragments. Drill assigns
each minor fragment a MinorFragmentID.  
+
+![]({{ site.baseurl }}/docs/img/min-frag.png)
+
+The parallelizer in the Foreman creates one or more minor fragments from a major fragment
at execution time, by breaking a major fragment into as many minor fragments as it can run
simultaneously on the cluster.
+
+Drill executes each minor fragment in its own thread as quickly as possible based on its
upstream data requirements. Drill schedules the minor fragments on nodes with data locality.
Otherwise, Drill schedules them in a round-robin fashion on the existing, available Drillbits.
+
+Minor fragments contain one or more relational operators. An operator performs a relational
operation, such as scan, filter, join, or group by. Each operator has a particular operator
type and an OperatorID. Each OperatorID defines its relationship within the minor fragment
to which it belongs.  
+
+![]({{ site.baseurl }}/docs/img/operators.png)
+
+For example, when performing a hash aggregation of two files, Drill breaks the first phase
dedicated to scanning into two minor fragments. Each minor fragment contains scan operators
that scan the files. Drill breaks the second phase dedicated to aggregation into four minor
fragments. Each of the four minor fragments contains hash aggregate operators that perform
the hash aggregation operations on the data. 
+
+You cannot modify the number of minor fragments within the execution plan. However, you can
view the query profile in the Drill Web UI and modify some configuration options that change
the behavior of minor fragments, such as the maximum number of slices. See [Configuration
Options]({{ site.baseurl }}/docs/configuration-options-introduction/) for more information.
+
+### Execution of Minor Fragments
+Minor fragments can run as root, intermediate, or leaf fragments. An execution tree contains
only one root fragment. The coordinates of the execution tree are numbered from the root,
with the root being zero. Data flows downstream from the leaf fragments to the root fragment.
+ 
+The root fragment runs in the Foreman and receives incoming queries, reads metadata from
tables, rewrites the queries and routes them to the next level in the serving tree. The other
fragments become intermediate or leaf fragments.  
+
+Intermediate fragments start work when data is available or fed to them from other fragments.
They perform operations on the data and then send the data downstream. They also pass the
aggregated results to the root fragment, which performs further aggregation and provides the
query results to the client or application.
+
+The leaf fragments scan tables in parallel and communicate with the storage layer or access
data on local disk. The leaf fragments pass partial results to the intermediate fragments,
which perform parallel operations on intermediate results.
+
+![]({{ site.baseurl }}/docs/img/leaf-frag.png)
+
+Drill only plans queries that have concurrently running fragments. For example, if 20 available
slices exist in the cluster, Drill plans a query that runs no more than 20 minor fragments
in a particular major fragment. Drill is optimistic and assumes that it can complete all of
the work in parallel. All minor fragments for a particular major fragment start at the same
time based on their upstream data dependency.
+

http://git-wip-us.apache.org/repos/asf/drill/blob/f4968c30/_docs/img/client-phys-plan.png
----------------------------------------------------------------------
diff --git a/_docs/img/client-phys-plan.png b/_docs/img/client-phys-plan.png
new file mode 100755
index 0000000..2314c8c
Binary files /dev/null and b/_docs/img/client-phys-plan.png differ

http://git-wip-us.apache.org/repos/asf/drill/blob/f4968c30/_docs/img/ex-operator.png
----------------------------------------------------------------------
diff --git a/_docs/img/ex-operator.png b/_docs/img/ex-operator.png
new file mode 100755
index 0000000..8a04af8
Binary files /dev/null and b/_docs/img/ex-operator.png differ

http://git-wip-us.apache.org/repos/asf/drill/blob/f4968c30/_docs/img/execution-tree.PNG
----------------------------------------------------------------------
diff --git a/_docs/img/execution-tree.PNG b/_docs/img/execution-tree.PNG
new file mode 100755
index 0000000..9fb8026
Binary files /dev/null and b/_docs/img/execution-tree.PNG differ

http://git-wip-us.apache.org/repos/asf/drill/blob/f4968c30/_docs/img/leaf-frag.png
----------------------------------------------------------------------
diff --git a/_docs/img/leaf-frag.png b/_docs/img/leaf-frag.png
new file mode 100755
index 0000000..5e3e973
Binary files /dev/null and b/_docs/img/leaf-frag.png differ

http://git-wip-us.apache.org/repos/asf/drill/blob/f4968c30/_docs/img/min-frag.png
----------------------------------------------------------------------
diff --git a/_docs/img/min-frag.png b/_docs/img/min-frag.png
new file mode 100755
index 0000000..20b13e4
Binary files /dev/null and b/_docs/img/min-frag.png differ

http://git-wip-us.apache.org/repos/asf/drill/blob/f4968c30/_docs/img/operators.png
----------------------------------------------------------------------
diff --git a/_docs/img/operators.png b/_docs/img/operators.png
new file mode 100755
index 0000000..12a7b3e
Binary files /dev/null and b/_docs/img/operators.png differ

http://git-wip-us.apache.org/repos/asf/drill/blob/f4968c30/_docs/img/query-flow-client.png
----------------------------------------------------------------------
diff --git a/_docs/img/query-flow-client.png b/_docs/img/query-flow-client.png
new file mode 100755
index 0000000..10fe24f
Binary files /dev/null and b/_docs/img/query-flow-client.png differ

http://git-wip-us.apache.org/repos/asf/drill/blob/f4968c30/_docs/performance-tuning/query-plans-and-tuning/040-modifying-query-planning-options.md
----------------------------------------------------------------------
diff --git a/_docs/performance-tuning/query-plans-and-tuning/040-modifying-query-planning-options.md
b/_docs/performance-tuning/query-plans-and-tuning/040-modifying-query-planning-options.md
old mode 100755
new mode 100644
index fed5c25..f6fdb5d
--- a/_docs/performance-tuning/query-plans-and-tuning/040-modifying-query-planning-options.md
+++ b/_docs/performance-tuning/query-plans-and-tuning/040-modifying-query-planning-options.md
@@ -7,9 +7,8 @@ Planner options affect how Drill plans a query. You can use the ALTER SYSTEM|SES
  
 The following planning options affect query planning and performance:
 
-* **planner.width.max\_per_node** 
-
-     Default is 3. Configure this option to achieve fine grained, absolute control over parallelization.
+* **planner.width.max\_per_node**  
+     Configure this option to achieve fine-grained, absolute control over parallelization.
 
      In this context width refers to fan out or distribution potential: the ability to run
a query in parallel across the cores on a node and the nodes on a cluster. A physical plan
consists of intermediate operations, known as query "fragments," that run concurrently, yielding
opportunities for parallelism above and below each exchange operator in the plan. An exchange
operator represents a breakpoint in the execution flow where processing can be distributed.
For example, a single-process scan of a file may flow into an exchange operator, followed
by a multi-process aggregation fragment.
  
@@ -19,15 +18,12 @@ The following planning options affect query planning and performance:
      When you modify the default setting, you can supply any meaningful number. The system
does not automatically scale down your setting.  
 
 * **planner.width\_max\_per_query**  
-
      Default is 1000. The maximum number of threads that can run in parallel for a query
across all nodes. Only change this setting when Drill over-parallelizes on very large clusters.
  
 * **planner.slice_target**  
-
      Default is 100000. The minimum number of estimated records to work with in a major fragment
before applying additional parallelization.
  
 * **planner.broadcast_threshold**  
-
      Default is 10000000. The maximum number of records allowed to be broadcast as part of
a join. After ten million records, Drill reshuffles data rather than doing a broadcast to
one side of the join. To improve performance you can increase this number, especially on 10GB
Ethernet clusters.
  
 

http://git-wip-us.apache.org/repos/asf/drill/blob/f4968c30/_docs/tutorials/learn-drill-with-the-mapr-sandbox/050-lesson-3-run-queries-on-complex-data-types.md
----------------------------------------------------------------------
diff --git a/_docs/tutorials/learn-drill-with-the-mapr-sandbox/050-lesson-3-run-queries-on-complex-data-types.md
b/_docs/tutorials/learn-drill-with-the-mapr-sandbox/050-lesson-3-run-queries-on-complex-data-types.md
index a41b4a4..17b904b 100644
--- a/_docs/tutorials/learn-drill-with-the-mapr-sandbox/050-lesson-3-run-queries-on-complex-data-types.md
+++ b/_docs/tutorials/learn-drill-with-the-mapr-sandbox/050-lesson-3-run-queries-on-complex-data-types.md
@@ -289,8 +289,7 @@ in descending order. Only clicks that have resulted in a purchase are
counted.
   
 ## Store a Result Set in a Table for Reuse and Analysis
 
-Finally, run another correlated subquery that returns a fairly large result
-set. To facilitate additional analysis on this result set, you can easily and
+To facilitate additional analysis on this result set, you can easily and
 quickly create a Drill table from the results of the query.
 
 ### Continue to use the dfs.clicks workspace


Mime
View raw message