From tus...@apache.org
Subject [08/13] apex-site git commit: Adding apex-3.6.0 documentation
Date Mon, 01 May 2017 10:19:25 GMT
http://git-wip-us.apache.org/repos/asf/apex-site/blob/fb75848f/docs/apex-3.6/css/theme_extra.css
----------------------------------------------------------------------
diff --git a/docs/apex-3.6/css/theme_extra.css b/docs/apex-3.6/css/theme_extra.css
new file mode 100644
index 0000000..9845d00
--- /dev/null
+++ b/docs/apex-3.6/css/theme_extra.css
@@ -0,0 +1,154 @@
+/*
+ * Sphinx doesn't have support for section dividers like we do in
+ * MkDocs; this styles the section titles in the nav
+ *
+ * https://github.com/mkdocs/mkdocs/issues/175
+ */
+.wy-menu-vertical span {
+    line-height: 18px;
+    padding: 0.4045em 1.618em;
+    display: block;
+    position: relative;
+    font-size: 90%;
+    color: #838383;
+}
+
+.wy-menu-vertical .subnav a {
+    padding: 0.4045em 2.427em;
+}
+
+/*
+ * Long navigations run off the bottom of the screen as the nav
+ * area doesn't scroll.
+ *
+ * https://github.com/mkdocs/mkdocs/pull/202
+ */
+.wy-nav-side {
+    height: 100%;
+    overflow-y: auto;
+}
+
+/*
+ * readthedocs theme hides nav items when the window height is
+ * too small to contain them.
+ *
+ * https://github.com/mkdocs/mkdocs/issues/348
+ */
+.wy-menu-vertical ul {
+  margin-bottom: 2em;
+}
+
+/*
+ * Fix wrapping in the code highlighting
+ *
+ * https://github.com/mkdocs/mkdocs/issues/233
+ */
+code {
+    white-space: pre;
+    padding: 2px 5px;
+}
+
+/*
+ * Wrap inline code samples, otherwise they shoot off the side and
+ * can't be read at all.
+ *
+ * https://github.com/mkdocs/mkdocs/issues/313
+ */
+p code {
+    word-wrap: break-word;
+}
+
+/**
+ * Make code blocks display as blocks and give them the appropriate
+ * font size and padding.
+ *
+ * https://github.com/mkdocs/mkdocs/issues/855
+ */
+pre code {
+  display: block;
+  padding: 12px;
+  font-size: 12px;
+}
+
+/*
+ * Fix link colors when the link text is inline code.
+ *
+ * https://github.com/mkdocs/mkdocs/issues/718
+ */
+a code {
+    color: #2980B9;
+}
+a:hover code {
+    color: #3091d1;
+}
+a:visited code {
+    color: #9B59B6;
+}
+
+/*
+ * The CSS classes from highlight.js seem to clash with the
+ * ReadTheDocs theme causing some code to be incorrectly made
+ * bold and italic.
+ *
+ * https://github.com/mkdocs/mkdocs/issues/411
+ */
+code.cs, code.c {
+    font-weight: inherit;
+    font-style: inherit;
+}
+
+/*
+ * Fix some issues with the theme and non-highlighted code
+ * samples. Without any highlighting styles attached, the
+ * formatting is broken.
+ *
+ * https://github.com/mkdocs/mkdocs/issues/319
+ */
+.no-highlight {
+  display: block;
+  padding: 0.5em;
+  color: #333;
+}
+
+
+/*
+ * Additions specific to the search functionality provided by MkDocs
+ */
+
+#mkdocs-search-results article h3
+{
+    margin-top: 23px;
+    border-top: 1px solid #E1E4E5;
+    padding-top: 24px;
+}
+
+#mkdocs-search-results article:first-child h3 {
+    border-top: none;
+}
+
+#mkdocs-search-query{
+    width: 100%;
+    border-radius: 50px;
+    padding: 6px 12px;
+    border-color: #D1D4D5;
+}
+
+.wy-menu-vertical li ul {
+    display: inherit;
+}
+
+.wy-menu-vertical li ul.subnav ul.subnav{
+    padding-left: 1em;
+}
+
+
+/*
+ * Improve inline code blocks within admonitions.
+ *
+ * https://github.com/mkdocs/mkdocs/issues/656
+ */
+ div.admonition code {
+  color: #404040;
+  border: 1px solid rgba(0, 0, 0, 0.2);
+  background: rgba(255, 255, 255, 0.7);
+}

http://git-wip-us.apache.org/repos/asf/apex-site/blob/fb75848f/docs/apex-3.6/development_best_practices/index.html
----------------------------------------------------------------------
diff --git a/docs/apex-3.6/development_best_practices/index.html b/docs/apex-3.6/development_best_practices/index.html
new file mode 100644
index 0000000..bbbff1b
--- /dev/null
+++ b/docs/apex-3.6/development_best_practices/index.html
@@ -0,0 +1,383 @@
+<!DOCTYPE html>
+<!--[if IE 8]><html class="no-js lt-ie9" lang="en" > <![endif]-->
+<!--[if gt IE 8]><!--> <html class="no-js" lang="en" > <!--<![endif]-->
+<head>
+  <meta charset="utf-8">
+  <meta http-equiv="X-UA-Compatible" content="IE=edge">
+  <meta name="viewport" content="width=device-width, initial-scale=1.0">
+  
+  
+  
+  <title>Best Practices - Apache Apex Documentation</title>
+  
+
+  <link rel="shortcut icon" href="../favicon.ico">
+  
+
+  
+  <link href='https://fonts.googleapis.com/css?family=Lato:400,700|Roboto+Slab:400,700|Inconsolata:400,700'
rel='stylesheet' type='text/css'>
+
+  <link rel="stylesheet" href="../css/theme.css" type="text/css" />
+  <link rel="stylesheet" href="../css/theme_extra.css" type="text/css" />
+  <link rel="stylesheet" href="../css/highlight.css">
+
+  
+  <script>
+    // Current page data
+    var mkdocs_page_name = "Best Practices";
+    var mkdocs_page_input_path = "development_best_practices.md";
+    var mkdocs_page_url = "/development_best_practices/";
+  </script>
+  
+  <script src="../js/jquery-2.1.1.min.js"></script>
+  <script src="../js/modernizr-2.8.3.min.js"></script>
+  <script type="text/javascript" src="../js/highlight.pack.js"></script>
+  <script src="../js/theme.js"></script> 
+
+  
+</head>
+
+<body class="wy-body-for-nav" role="document">
+
+  <div class="wy-grid-for-nav">
+
+    
+    <nav data-toggle="wy-nav-shift" class="wy-nav-side stickynav">
+      <div class="wy-side-nav-search">
+        <a href=".." class="icon icon-home"> Apache Apex Documentation</a>
+        <div role="search">
+  <form id ="rtd-search-form" class="wy-form" action="../search.html" method="get">
+    <input type="text" name="q" placeholder="Search docs" />
+  </form>
+</div>
+      </div>
+
+      <div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="main
navigation">
+        <ul class="current">
+          
+            <li>
+    <li class="toctree-l1 ">
+        <a class="" href="..">Apache Apex</a>
+        
+    </li>
+<li>
+          
+            <li>
+    <ul class="subnav">
+    <li><span>Development</span></li>
+
+        
+            
+    <li class="toctree-l1 ">
+        <a class="" href="../apex_development_setup/">Development Setup</a>
+        
+    </li>
+
+        
+            
+    <li class="toctree-l1 ">
+        <a class="" href="../application_development/">Applications</a>
+        
+    </li>
+
+        
+            
+    <li class="toctree-l1 ">
+        <a class="" href="../application_packages/">Packages</a>
+        
+    </li>
+
+        
+            
+    <li class="toctree-l1 ">
+        <a class="" href="../operator_development/">Operators</a>
+        
+    </li>
+
+        
+            
+    <li class="toctree-l1 ">
+        <a class="" href="../autometrics/">AutoMetric API</a>
+        
+    </li>
+
+        
+            
+    <li class="toctree-l1 ">
+        <a class="" href="../control_tuples/">Custom Control Tuples</a>
+        
+    </li>
+
+        
+            
+    <li class="toctree-l1 current">
+        <a class="current" href="./">Best Practices</a>
+        
+            <ul>
+            
+                <li class="toctree-l3"><a href="#development-best-practices">Development
Best Practices</a></li>
+                
+                    <li><a class="toctree-l4" href="#operators">Operators</a></li>
+                
+                    <li><a class="toctree-l4" href="#input-operators">Input Operators</a></li>
+                
+                    <li><a class="toctree-l4" href="#output-operators">Output
Operators</a></li>
+                
+                    <li><a class="toctree-l4" href="#partitioning">Partitioning</a></li>
+                
+                    <li><a class="toctree-l4" href="#threads">Threads</a></li>
+                
+            
+            </ul>
+        
+    </li>
+
+        
+    </ul>
+<li>
+          
+            <li>
+    <ul class="subnav">
+    <li><span>Operations</span></li>
+
+        
+            
+    <li class="toctree-l1 ">
+        <a class="" href="../apex_cli/">Apex CLI</a>
+        
+    </li>
+
+        
+            
+    <li class="toctree-l1 ">
+        <a class="" href="../security/">Security</a>
+        
+    </li>
+
+        
+    </ul>
+<li>
+          
+            <li>
+    <li class="toctree-l1 ">
+        <a class="" href="../compatibility/">Compatibility</a>
+        
+    </li>
+<li>
+          
+        </ul>
+      </div>
+      &nbsp;
+    </nav>
+
+    <section data-toggle="wy-nav-shift" class="wy-nav-content-wrap">
+
+      
+      <nav class="wy-nav-top" role="navigation" aria-label="top navigation">
+        <i data-toggle="wy-nav-top" class="fa fa-bars"></i>
+        <a href="..">Apache Apex Documentation</a>
+      </nav>
+
+      
+      <div class="wy-nav-content">
+        <div class="rst-content">
+          <div role="navigation" aria-label="breadcrumbs navigation">
+  <ul class="wy-breadcrumbs">
+    <li><a href="..">Docs</a> &raquo;</li>
+    
+      
+        
+          <li>Development &raquo;</li>
+        
+      
+    
+    <li>Best Practices</li>
+    <li class="wy-breadcrumbs-aside">
+      
+    </li>
+  </ul>
+  <hr/>
+</div>
+          <div role="main">
+            <div class="section">
+              
+                <h1 id="development-best-practices">Development Best Practices</h1>
+<p>This document describes the best practices to follow when developing operators and
other application components, such as partitioners and stream codecs, on the Apache Apex platform.</p>
+<h2 id="operators">Operators</h2>
+<p>This section covers general guidelines that apply to all operators; the subsequent
sections discuss special considerations for input and output operators. A sketch illustrating
these conventions follows the list below.</p>
+<ul>
+<li>When writing a new operator to be used in an application, consider breaking it
down into<ul>
+<li>An abstract operator that encompasses the core functionality but leaves application
specific schemas and logic to the implementation.</li>
+<li>An optional concrete operator also in the library that extends the abstract operator
and provides commonly used schema types such as strings, byte[] or POJOs.</li>
+</ul>
+</li>
+<li>Follow these conventions for the life cycle methods:<ul>
+<li>Do one time initialization of entities that apply for the entire lifetime of the
operator in the <strong>setup</strong> method, e.g., factory initializations.
Initializations in <strong>setup</strong> are done in the container where the
operator is deployed. Allocating memory for fields in the constructor is not efficient, as
it leads to extra garbage in memory for the following reason. The operator is instantiated
on the client from where the application is launched, serialized, and started on one of the
Hadoop nodes in a container. So the constructor is first called on the client, and if it were
to initialize any of the fields, that state would be saved during serialization. In the Hadoop
container the operator is deserialized and started. This invokes the constructor again, which
initializes the fields, but their state gets overwritten by the serialized state and the
initial values become garbage in memory.</li>
+<li>Do one time initialization for live entities in the <strong>activate</strong>
method, e.g., opening connections to a database server or starting a thread for asynchronous
operations. The <strong>activate</strong> method is called right before processing
starts, so it is a better place for these initializations than <strong>setup</strong>,
where they could introduce a delay before data from the live entity is processed.</li>
+<li>Perform periodic tasks based on processing time at application window boundaries.</li>
+<li>Perform initializations needed for each application window in <strong>beginWindow</strong>.</li>
+<li>Perform aggregations needed for each application window in <strong>endWindow</strong>.</li>
+<li>Teardown of live entities (inverse of tasks performed during activate) should be
in the <strong>deactivate</strong> method.</li>
+<li>Teardown of lifetime entities (those initialized in setup method) should happen
in the <strong>teardown</strong> method.</li>
+<li>If the operator implementation is not finalized, mark it with the <strong>@Evolving</strong>
annotation.</li>
+</ul>
+</li>
+<li>If the operator needs to perform operations based on the event time of the individual
tuples and not the processing time, extend and use the <strong>WindowedOperator</strong>.
Refer to the documentation of that operator for details on how to use it.</li>
+<li>If an operator needs to do some work when it is not receiving any input, it should
implement the <strong>IdleTimeHandler</strong> interface. This interface contains
the <strong>handleIdleTime</strong> method, which will be called whenever the platform
isn’t doing anything else, and the operator can do the work in this method. If for any reason
the operator does not have any work to do when this method is called, it should sleep for
a small amount of time, such as that specified by the <strong>SPIN_MILLIS</strong>
attribute, so that it does not cause a busy wait when called repeatedly by the platform. Also,
the method should not block; it should return in a reasonable amount of time that is less
than the streaming window size (which is 500ms by default).</li>
+<li>Often operators have customizable parameters such as information about locations
of external systems or parameters that modify the behavior of the operator. Users should be
able to specify these easily without having to change source code. This can be done by making
them properties of the operator because they can then be initialized from external properties
files.<ul>
+<li>Where possible default values should be provided for the properties in the source
code.</li>
+<li>Validation rules should be specified for the properties using javax constraint
validations that check whether the values specified for the properties are in the correct
format or range, or meet other operator requirements. Required properties should have at least a <strong>@NotNull</strong>
validation specifying that they have to be specified by the user.</li>
+</ul>
+</li>
+</ul>
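+<p>A minimal sketch illustrating these conventions is shown below. The class and member
names (<code>SampleOperator</code>, <code>databaseUrl</code>) are hypothetical; the lifecycle
methods, the transient markings, and the <code>@NotNull</code> validated property reflect the
guidelines above.</p>
+<pre><code class="java">import java.sql.Connection;
+import java.sql.DriverManager;
+import java.sql.SQLException;
+import java.text.SimpleDateFormat;
+
+import javax.validation.constraints.NotNull;
+
+import com.datatorrent.api.Context.OperatorContext;
+import com.datatorrent.api.DefaultOutputPort;
+import com.datatorrent.api.Operator;
+import com.datatorrent.common.util.BaseOperator;
+
+public class SampleOperator extends BaseOperator
+    implements Operator.ActivationListener&lt;OperatorContext&gt;
+{
+  @NotNull
+  private String databaseUrl;                    // user-configurable property, validated at launch
+
+  private transient SimpleDateFormat formatter;  // lifetime entity, created in setup
+  private transient Connection connection;       // live entity, opened in activate
+
+  public final transient DefaultOutputPort&lt;String&gt; output = new DefaultOutputPort&lt;&gt;();
+
+  @Override
+  public void setup(OperatorContext context)
+  {
+    formatter = new SimpleDateFormat("yyyy-MM-dd");  // one-time initialization, not in the constructor
+  }
+
+  @Override
+  public void activate(OperatorContext context)
+  {
+    try {
+      connection = DriverManager.getConnection(databaseUrl);  // live entity, right before processing
+    } catch (SQLException e) {
+      throw new RuntimeException(e);
+    }
+  }
+
+  @Override
+  public void beginWindow(long windowId) { /* per-window initializations */ }
+
+  @Override
+  public void endWindow() { /* per-window aggregations */ }
+
+  @Override
+  public void deactivate()
+  {
+    try {
+      connection.close();                        // inverse of activate
+    } catch (SQLException e) {
+      throw new RuntimeException(e);
+    }
+  }
+
+  @Override
+  public void teardown()
+  {
+    formatter = null;                            // inverse of setup
+  }
+
+  public void setDatabaseUrl(String databaseUrl) { this.databaseUrl = databaseUrl; }
+  public String getDatabaseUrl() { return databaseUrl; }
+}
+</code></pre>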
+<h3 id="checkpointing">Checkpointing</h3>
+<p>Checkpointing is a process of snapshotting the state of an operator and saving it
so that in case of failure the state can be used to restore the operator to a prior state
and continue processing. It is automatically performed by the platform at a configurable interval.
All operators in the application are checkpointed in a distributed fashion, thus allowing
the entire state of the application to be saved and available for recovery if needed. Here
are some things to remember when it comes to checkpointing:</p>
+<ul>
+<li>The process of checkpointing involves snapshotting the state by serializing the
operator and saving it to a store. This is done using a <strong>StorageAgent</strong>.
By default the platform provides a <em>StorageAgent</em> called <strong>AsyncFSStorageAgent</strong>,
which serializes the operator using Kryo and saves the serialized state asynchronously to a
filesystem such as HDFS. There are other implementations of <em>StorageAgent</em>
available, such as <strong>GeodeKeyValueStorageAgent</strong>, which stores the
serialized state in Geode, an in-memory replicated data grid.</li>
+<li>All variables in the operator that are neither transient nor final are saved, so any
variables that are not part of the state should be marked transient. Specifically, variables
like connection objects, I/O streams, and ports should be transient, because they need
to be set up again on failure recovery.</li>
+<li>If the operator does not keep any state between windows, mark it with the <strong>@Stateless</strong>
annotation. This results in efficiencies during checkpointing and recovery: the operator will
not be checkpointed and is always restored to the initial state.</li>
+<li>The checkpoint interval can be set using the <strong>CHECKPOINT_WINDOW_COUNT</strong>
attribute, which specifies the interval in terms of the number of streaming windows.</li>
+<li>If the correct functioning of the operator requires that the <strong>endWindow</strong>
method be called before checkpointing happens, then the checkpoint interval should align
with the application window interval, i.e., it should be a multiple of the application window
interval. In this case the operator should be marked with <strong>OperatorAnnotation</strong>
and <strong>checkpointableWithinAppWindow</strong> set to false. If the window
intervals are configured by the user and they don’t align, it will result in a DAG validation
error and the application won’t launch.</li>
+<li>In some cases the operator state related to a piece of data needs to be purged
once that data is no longer required by the application, otherwise the state will continue
to build up indefinitely. The platform provides a way to let the operator know about this
using a callback listener called <strong>CheckpointNotificationListener</strong>.
This listener has a callback method called <strong>committed</strong>, which is
called by the platform from time to time with a window id that has been processed successfully
by all the operators in the DAG and whose state is hence no longer needed. The operator can
delete all the state corresponding to window ids less than or equal to the provided window id.</li>
+<li>Sometimes operators need to perform some tasks just before checkpointing. For example,
filesystem operators may want to flush the files just before a checkpoint so they can be sure
that all pending data is written to disk and no data is lost if there is an operator failure
just after the checkpoint and the operator restarts from the checkpoint. To do this the operator
would implement the same <em>CheckpointNotificationListener</em> interface and
implement the <strong>beforeCheckpoint</strong> method where it can do these tasks
(see the sketch after this list).</li>
+<li>If the operator is going to have a large state, checkpointing the entire state
each time becomes unviable. Furthermore, the amount of memory needed to hold the state could
be larger than the amount of physical memory available. In these cases the operator should
checkpoint the state incrementally and also manage the memory for the state more efficiently.
The platform provides a utility called <strong>ManagedState</strong> that uses
a combination of in-memory and disk caches to efficiently store and retrieve data in a performant,
fault-tolerant way and also checkpoint it in an incremental fashion. There are operators in
the platform that use <em>ManagedState</em>, such as the Dedup and Join operators,
which can be used as a reference on how to use this utility.</li>
+</ul>
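+<p>Below is a minimal sketch of the checkpoint callbacks described above, assuming a
hypothetical operator that keeps per-window state in a map. The callback names come from the
<em>CheckpointNotificationListener</em> interface; the purging logic is illustrative only.</p>
+<pre><code class="java">import java.util.HashMap;
+import java.util.Iterator;
+import java.util.Map;
+
+import com.datatorrent.api.Operator;
+import com.datatorrent.common.util.BaseOperator;
+
+public class PurgingOperator extends BaseOperator
+    implements Operator.CheckpointNotificationListener
+{
+  private Map&lt;Long, String&gt; stateByWindow = new HashMap&lt;&gt;();  // checkpointed state
+
+  @Override
+  public void beforeCheckpoint(long windowId)
+  {
+    // flush any pending work (e.g. buffered file writes) so the saved state is consistent
+  }
+
+  @Override
+  public void checkpointed(long windowId)
+  {
+    // state up to windowId has been saved by the StorageAgent
+  }
+
+  @Override
+  public void committed(long windowId)
+  {
+    // windowId has been fully processed by all operators in the DAG,
+    // so state for windows &lt;= windowId is no longer needed
+    for (Iterator&lt;Long&gt; it = stateByWindow.keySet().iterator(); it.hasNext(); ) {
+      if (it.next() &lt;= windowId) {
+        it.remove();
+      }
+    }
+  }
+}
+</code></pre>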
+<h2 id="input-operators">Input Operators</h2>
+<p>Input operators have additional requirements:</p>
+<ul>
+<li>The <strong>emitTuples</strong> method, implemented by the operator,
is called by the platform to give the operator an opportunity to emit some data. This method
is always called within a window boundary but can be called multiple times within the same
window. There are some important guidelines on how to implement this method:<ul>
+<li>This should not be a blocking method; it should return in a reasonable time that
is less than the streaming window size (which is 500ms by default). This also applies to other
callback methods called by the platform, such as <em>beginWindow</em>, <em>endWindow</em>,
etc., but is more important here since this method is called continuously by the platform.</li>
+<li>If the operator needs to interact with external systems to obtain data and this
can potentially take a long time, then this should be performed asynchronously in a different
thread. Refer to the threading section below for the guidelines when using threading.</li>
+<li>In each invocation, the method can emit any number of data tuples. A sketch of a
non-blocking implementation follows this list.</li>
+</ul>
+</li>
+</ul>
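+<p>A hedged sketch of a non-blocking <em>emitTuples</em> implementation that drains a
thread-safe queue (filled elsewhere, e.g. by an asynchronous reader thread as discussed in
the Threads section) and sleeps briefly in <em>handleIdleTime</em> when there is no work.
The operator and queue names are hypothetical.</p>
+<pre><code class="java">import java.util.Queue;
+import java.util.concurrent.ConcurrentLinkedQueue;
+
+import com.datatorrent.api.Context.OperatorContext;
+import com.datatorrent.api.DefaultOutputPort;
+import com.datatorrent.api.InputOperator;
+import com.datatorrent.api.Operator.IdleTimeHandler;
+import com.datatorrent.common.util.BaseOperator;
+
+public class QueueDrainInputOperator extends BaseOperator
+    implements InputOperator, IdleTimeHandler
+{
+  private transient int spinMillis;
+  // filled by an asynchronous reader thread (not shown)
+  private final transient Queue&lt;String&gt; buffer = new ConcurrentLinkedQueue&lt;&gt;();
+
+  public final transient DefaultOutputPort&lt;String&gt; output = new DefaultOutputPort&lt;&gt;();
+
+  @Override
+  public void setup(OperatorContext context)
+  {
+    spinMillis = context.getValue(OperatorContext.SPIN_MILLIS);
+  }
+
+  @Override
+  public void emitTuples()
+  {
+    // non-blocking: emit whatever is available and return quickly
+    String tuple;
+    while ((tuple = buffer.poll()) != null) {
+      output.emit(tuple);
+    }
+  }
+
+  @Override
+  public void handleIdleTime()
+  {
+    try {
+      Thread.sleep(spinMillis);  // avoid a busy wait when there is no work
+    } catch (InterruptedException e) {
+      Thread.currentThread().interrupt();
+    }
+  }
+}
+</code></pre>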
+<h3 id="idempotence">Idempotence</h3>
+<p>Many applications write data to external systems using output operators. To ensure
that data is present exactly once in the external system, even in a failure recovery scenario,
the output operators expect that the replayed windows during recovery contain the same data as
before the failure. This is called idempotency. Since operators within the DAG are merely
responding to input data provided to them by the upstream operators and the input operator
has no upstream operator, the responsibility of idempotent replay falls on the input operators.</p>
+<ul>
+<li>For idempotent replay of data, the operator needs to store some meta-information
for every window that would allow it to identify what data was sent in that window. This is
called the idempotent state.<ul>
+<li>If the external source of the input operator allows replayability, this could be
information such as the offset of the last piece of data in the window, an identifier of the
last piece of data itself, or the number of data tuples sent.</li>
+<li>However, if the external source does not allow replayability from an operator-specified
point, then the entire data sent within the window may need to be persisted by the operator.</li>
+</ul>
+</li>
+<li>The platform provides a utility called <em>WindowDataManager</em> to
allow operators to save and retrieve idempotent state every window. Operators should use this
to implement idempotency, as in the sketch following this list.</li>
+</ul>
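+<p>The following is a hedged sketch of idempotent replay using <em>WindowDataManager</em>.
The import paths and method names such as <code>getLargestCompletedWindow</code> and the
<code>save</code>/<code>retrieve</code> signatures reflect one version of the Malhar library
and may differ in others; the offset bookkeeping is hypothetical.</p>
+<pre><code class="java">import java.io.IOException;
+
+import org.apache.apex.malhar.lib.wal.FSWindowDataManager;
+import org.apache.apex.malhar.lib.wal.WindowDataManager;
+
+import com.datatorrent.api.Context.OperatorContext;
+import com.datatorrent.api.DefaultOutputPort;
+import com.datatorrent.api.InputOperator;
+import com.datatorrent.common.util.BaseOperator;
+
+public class IdempotentInputOperator extends BaseOperator implements InputOperator
+{
+  private WindowDataManager windowDataManager = new FSWindowDataManager();
+  private transient long currentWindowId;
+  private transient boolean replaying;
+  private long offset;  // position in the external source; part of the checkpointed state
+
+  public final transient DefaultOutputPort&lt;String&gt; output = new DefaultOutputPort&lt;&gt;();
+
+  @Override
+  public void setup(OperatorContext context)
+  {
+    windowDataManager.setup(context);
+  }
+
+  @Override
+  public void beginWindow(long windowId)
+  {
+    currentWindowId = windowId;
+    replaying = windowId &lt;= windowDataManager.getLargestCompletedWindow();
+    if (replaying) {
+      try {
+        // re-read the source up to the offset recorded for this window and emit
+        Long savedOffset = (Long)windowDataManager.retrieve(windowId);
+        // ... replay the range [offset, savedOffset) from the source ...
+        offset = savedOffset;
+      } catch (IOException e) {
+        throw new RuntimeException(e);
+      }
+    }
+  }
+
+  @Override
+  public void emitTuples()
+  {
+    if (!replaying) {
+      // fetch new data from the source, emit it, and advance 'offset'
+    }
+  }
+
+  @Override
+  public void endWindow()
+  {
+    if (!replaying) {
+      try {
+        windowDataManager.save(offset, currentWindowId);  // idempotent state for this window
+      } catch (IOException e) {
+        throw new RuntimeException(e);
+      }
+    }
+  }
+}
+</code></pre>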
+<h2 id="output-operators">Output Operators</h2>
+<p>Output operators typically connect to external storage systems such as filesystems,
databases or key value stores to store data.</p>
+<ul>
+<li>In some situations, the external systems may not be functioning in a reliable fashion.
They may have prolonged outages or performance problems. If the operator is being designed
to work in such environments, it needs to be able to handle these problems gracefully and
not block the DAG or fail. In these scenarios the operator should cache the data into a local
store such as HDFS and interact with the external systems in a separate thread so as to not have
problems in the operator lifecycle thread. This pattern is called the <strong>Reconciler</strong>
pattern and there are operators that implement this pattern available in the library for reference.</li>
+</ul>
+<h3 id="end-to-end-exactly-once">End-to-End Exactly Once</h3>
+<p>When output operators store data in external systems, it is important that they
do not lose data or write duplicate data when there is a failure event and the DAG recovers
from that failure. In failure recovery, the windows from the previous checkpoint are replayed
and the operator receives this data again. The operator should ensure that it does not write
this data again. Operator developers should determine how to do this for the specific operators
they are developing, depending on the logic of those operators. Below are examples
of how a couple of existing output operators do this for reference.</p>
+<ul>
+<li>The file output operator that writes data to files keeps track of the file lengths
in its state. These lengths are checkpointed and restored on failure recovery. On restart,
the operator truncates the file to the length recorded in the recovered state.
This makes the data in the file the same as it was at the time of the checkpoint before the
failure. The operator then writes the replayed data from the checkpoint in the regular fashion,
as any other data. This ensures no data is lost or duplicated in the file.</li>
+<li>The JDBC output operator that writes data to a database table writes the data in
a window in a single transaction. It also writes the current window id into a meta table along
with the data as part of the same transaction, and commits the transaction at the end of the
window. When there is an operator failure before the final commit, the database contains
the data from the previous fully processed window and its window id, since the current
window’s transaction isn’t yet committed. On recovery, the operator reads this
window id back from the meta table. It ignores all the replayed windows whose window id is
less than or equal to the recovered window id and thus ensures that it does not duplicate
data already present in the database. It starts writing data normally again when the window
id of the data becomes greater than the recovered window id, thus ensuring no data is lost.
A simplified sketch of this bookkeeping follows the list.</li>
+</ul>
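+<p>A hedged, simplified sketch of the window-id bookkeeping described above (not the actual
JDBC operator implementation; the meta-table helpers and the transaction handling are
hypothetical placeholders):</p>
+<pre><code class="java">import com.datatorrent.api.Context.OperatorContext;
+import com.datatorrent.api.DefaultInputPort;
+import com.datatorrent.common.util.BaseOperator;
+
+public class ExactlyOnceOutputSketch extends BaseOperator
+{
+  private transient long committedWindowId;  // highest window id found in the meta table
+  private transient long currentWindowId;
+  private transient boolean replaying;
+
+  public final transient DefaultInputPort&lt;String&gt; input = new DefaultInputPort&lt;String&gt;()
+  {
+    @Override
+    public void process(String tuple)
+    {
+      if (!replaying) {
+        // add the tuple to the current transaction (hypothetical)
+      }
+    }
+  };
+
+  @Override
+  public void setup(OperatorContext context)
+  {
+    committedWindowId = readWindowIdFromMetaTable();  // hypothetical helper
+  }
+
+  @Override
+  public void beginWindow(long windowId)
+  {
+    currentWindowId = windowId;
+    // replayed windows are already in the database; skip them
+    replaying = windowId &lt;= committedWindowId;
+  }
+
+  @Override
+  public void endWindow()
+  {
+    if (!replaying) {
+      // write currentWindowId to the meta table and commit it together
+      // with the window's data as a single transaction (hypothetical)
+    }
+  }
+
+  private long readWindowIdFromMetaTable()
+  {
+    return 0;  // placeholder; would query the meta table
+  }
+}
+</code></pre>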
+<h2 id="partitioning">Partitioning</h2>
+<p>Partitioning allows an operation to be scaled to handle more pieces of data than
before but with a similar SLA. This is done by creating multiple instances of an operator
and distributing the data among them. Input operators can also be partitioned to stream more
pieces of data into the application. The platform provides a lot of flexibility and options
for partitioning. Partitioning can happen once at startup or can be dynamically changed anytime
while the application is running, and it can be done in a stateless or stateful way by distributing
state from the old partitions to new partitions.</p>
+<p>In the platform, the responsibility for partitioning is shared among different entities.
These are:</p>
+<ol>
+<li>A <strong>partitioner</strong> that specifies <em>how</em>
to partition the operator; specifically, it takes an old set of partitions and creates a new
set of partitions. At the start of the application the old set has one partition, and the
partitioner can return more than one partition to start the application with multiple partitions.
The partitioner can have any custom Java logic to determine the number of new partitions, set
their initial state as a brand new state, or derive it from the state of the old partitions.
It also specifies how the data gets distributed among the new partitions. The new set doesn't
have to contain only new partitions; it can carry over some old partitions if desired.</li>
+<li>An optional <strong>statistics (stats) listener</strong> that specifies
<em>when</em> to partition. It is optional because it is needed only
when dynamic partitioning is needed; the operator statistics it receives can be used to
determine when to repartition.</li>
+<li>In some cases the <em>operator</em> itself should be aware of partitioning
and would need to provide supporting code.<ul>
+<li>In case of input operators each partition should have a property or a set of properties
that allow it to distinguish itself from the other partitions and fetch unique data.</li>
+</ul>
+</li>
+<li>When an operator that was originally a single instance is split into multiple partitions
with each partition working on a subset of data, the results of the partitions may need to
be combined to compute the final result. The combining logic would depend on the
logic of the operator. This is specified by the developer using a <strong>Unifier</strong>,
which is deployed as another operator by the platform. If no <em>Unifier</em>
is specified, the platform inserts a <strong>default unifier</strong> that merges
the results of the multiple partition streams into a single stream. Each output port can have
a different <em>Unifier</em>, specified by returning the corresponding
<em>Unifier</em> in the <strong>getUnifier</strong> method of the
output port. The operator developer should provide a custom <em>Unifier</em> wherever
applicable; a sketch follows this list.</li>
+<li>The Apex <em>engine</em> that brings everything together and effects
the partitioning.</li>
+</ol>
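+<p>A minimal sketch of a custom <em>Unifier</em>, assuming partitions that emit partial
sums which must be added together; the class and port names are hypothetical:</p>
+<pre><code class="java">import com.datatorrent.api.DefaultOutputPort;
+import com.datatorrent.api.Operator.Unifier;
+import com.datatorrent.common.util.BaseOperator;
+
+public class SumUnifier extends BaseOperator implements Unifier&lt;Long&gt;
+{
+  private long total;
+
+  public final transient DefaultOutputPort&lt;Long&gt; output = new DefaultOutputPort&lt;&gt;();
+
+  @Override
+  public void beginWindow(long windowId)
+  {
+    total = 0;
+  }
+
+  @Override
+  public void process(Long partialSum)
+  {
+    total += partialSum;  // combine results from all partitions
+  }
+
+  @Override
+  public void endWindow()
+  {
+    output.emit(total);  // emit the unified result once per window
+  }
+}
+
+// On the upstream operator's output port, the unifier would be supplied like this:
+// public final transient DefaultOutputPort&lt;Long&gt; sums = new DefaultOutputPort&lt;Long&gt;()
+// {
+//   @Override
+//   public Unifier&lt;Long&gt; getUnifier()
+//   {
+//     return new SumUnifier();
+//   }
+// };
+</code></pre>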
+<p>Since partitioning is critical for the scalability of applications, operators must support
it. There should be a strong reason for an operator to not support partitioning, such as
the logic performed by the operator not lending itself to parallelism. In order to support
partitioning, an operator developer, apart from developing the functionality of the operator,
may also need to provide a partitioner, stats listener and supporting code in the operator
as described in the steps above. The next sections delve into this.</p>
+<h3 id="out-of-the-box-partitioning">Out of the box partitioning</h3>
+<p>The platform comes with some built-in partitioning utilities that can be used in
certain scenarios.</p>
+<ul>
+<li>
+<p><strong>StatelessPartitioner</strong> provides a default partitioner
that can be used for an operator under certain conditions. If the operator satisfies these
conditions, the partitioner can be specified for the operator with a simple setting (a
configuration sketch appears at the end of this section) and no other partitioning
code is needed. The conditions are:</p>
+<ul>
+<li>No dynamic partitioning is needed (see the next point about dynamic partitioning).</li>
+<li>There is no distinct initial state for the partitions, i.e., all partitions start
with the same initial state submitted during application launch.</li>
+</ul>
+<p>Typically input or output operators do not fall into this category, although there
are some exceptions. This partitioner is mainly used with operators that are in the middle
of the DAG, after the input and before the output operators. When used with non-input operators,
only the data for the first declared input port is distributed among the different partitions.
All other input ports are treated as broadcast and all partitions receive all the data for
that port.</p>
+</li>
+<li>
+<p><strong>StatelessThroughputBasedPartitioner</strong> in Malhar provides
a dynamic partitioner based on throughput thresholds. Similarly, <strong>StatelessLatencyBasedPartitioner</strong>
provides a latency-based dynamic partitioner in RTS. If these partitioners can be used, then
separate partitioning-related code is not needed. The conditions under which these can be
used are:</p>
+<ul>
+<li>There is no distinct initial state for the partitions.</li>
+<li>There is no state being carried over by the operator from one window to the next,
i.e., the operator is stateless.</li>
+</ul>
+</li>
+</ul>
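+<p>A sketch of enabling the built-in partitioner when populating the DAG;
<code>GenericOperator</code> is a placeholder for an operator class satisfying the above
conditions, and the partition count is illustrative:</p>
+<pre><code class="java">import org.apache.hadoop.conf.Configuration;
+
+import com.datatorrent.api.Context.OperatorContext;
+import com.datatorrent.api.DAG;
+import com.datatorrent.api.StreamingApplication;
+import com.datatorrent.common.partitioner.StatelessPartitioner;
+
+public class SampleApplication implements StreamingApplication
+{
+  @Override
+  public void populateDAG(DAG dag, Configuration conf)
+  {
+    GenericOperator op = dag.addOperator("op", GenericOperator.class);
+    // start the operator with 4 partitions
+    dag.setAttribute(op, OperatorContext.PARTITIONER,
+        new StatelessPartitioner&lt;GenericOperator&gt;(4));
+  }
+}
+</code></pre>
+<p>The same attribute can typically also be set from a configuration file, so the partition
count can be changed without recompiling.</p>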
+<h3 id="custom-partitioning">Custom partitioning</h3>
+<p>In many cases, operators don’t satisfy the above conditions and a built-in partitioner
cannot be used. Custom partitioning code needs to be written by the operator developer. Below
are guidelines for it.</p>
+<ul>
+<li>Since the operator developer is providing a <em>partitioner</em> for
the operator, the partitioning code should be added to the operator itself by making the operator
implement the Partitioner interface and implementing the required methods, rather than creating
a separate partitioner. The advantage is that the user of the operator does not have to explicitly
figure out the partitioner and set it for the operator, but still has the option to override
this built-in partitioner with a different one.</li>
+<li>The <em>partitioner</em> is responsible for setting the initial state
of the new partitions, whether at the start of the application or while the application is
running, as in the dynamic partitioning case. In the dynamic partitioning scenario, the
partitioner needs to take the state from the old partitions and distribute it among the new
partitions. It is important to note that apart from the checkpointed state, the partitioner
also needs to distribute the idempotent state.</li>
+<li>The <em>Partitioner</em> interface has two methods, <strong>definePartitions</strong>
and <strong>partitioned</strong>. The method <em>definePartitions</em>
is first called to determine the new partitions, and if enough resources are available on
the cluster, the <em>partitioned</em> method is called passing in the new partitions.
This happens both during initial partitioning and dynamic partitioning. If resources are not
available, partitioning is abandoned and existing partitions continue to run untouched. This
means that any processing-intensive operations should be deferred to the <em>partitioned</em>
call instead of doing them in <em>definePartitions</em>, as they may not be needed
if there are not enough resources available in the cluster. A sketch of both methods follows
this list.</li>
+<li>The <em>partitioner</em>, along with creating the new partitions, should
also specify how the data gets distributed across the new partitions. It should do this by
specifying a mapping called <strong>PartitionKeys</strong> for each partition
that maps the data to that partition. This mapping needs to be specified for every input port
in the operator. If the <em>partitioner</em> wants to use the standard mapping
it can use a utility method called <strong>DefaultPartition.assignPartitionKeys</strong>.</li>
+<li>When the partitioner is scaling the operator up to more partitions, try to reuse
the existing partitions and create new partitions to augment the current set. The reuse can
be achieved by the partitioner returning the current partitions unchanged. This will result
in the current partitions continuing to run untouched.</li>
+<li>In case of dynamic partitioning, as mentioned earlier, a stats listener is also
needed to determine when to re-partition. Like the <em>Partitioner</em> interface,
the operator can also implement the <em>StatsListener</em> interface to provide
a stats listener implementation that will be automatically used.</li>
+<li>The <em>StatsListener</em> has access to all operator statistics to
make its decision on partitioning. Apart from the statistics that the platform computes for
the operators, such as throughput and latency, operator developers can include their own business
metrics by using the AutoMetric feature.</li>
+<li>If the operator is not partitionable, mark it so with <em>OperatorAnnotation</em>
and the <em>partitionable</em> element set to false.</li>
+</ul>
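+<p>A hedged sketch of an operator that provides its own partitioner, under the assumption
of a single input port and a fixed target partition count; state redistribution and error
handling are elided:</p>
+<pre><code class="java">import java.util.ArrayList;
+import java.util.Collection;
+import java.util.List;
+import java.util.Map;
+
+import com.datatorrent.api.DefaultPartition;
+import com.datatorrent.api.Partitioner;
+import com.datatorrent.common.util.BaseOperator;
+
+public class SelfPartitioningOperator extends BaseOperator
+    implements Partitioner&lt;SelfPartitioningOperator&gt;
+{
+  @Override
+  public Collection&lt;Partition&lt;SelfPartitioningOperator&gt;&gt; definePartitions(
+      Collection&lt;Partition&lt;SelfPartitioningOperator&gt;&gt; partitions, PartitioningContext context)
+  {
+    int newCount = 4;  // could instead be derived from configuration or stats
+    List&lt;Partition&lt;SelfPartitioningOperator&gt;&gt; newPartitions = new ArrayList&lt;&gt;(newCount);
+    for (int i = 0; i &lt; newCount; i++) {
+      SelfPartitioningOperator instance = new SelfPartitioningOperator();
+      // ... set distinct initial state here, e.g. redistribute checkpointed and
+      // idempotent state from the old partitions ...
+      newPartitions.add(new DefaultPartition&lt;&gt;(instance));
+    }
+    // standard mapping of tuples to the new partitions for the first input port
+    DefaultPartition.assignPartitionKeys(newPartitions, context.getInputPorts().get(0));
+    return newPartitions;
+  }
+
+  @Override
+  public void partitioned(Map&lt;Integer, Partition&lt;SelfPartitioningOperator&gt;&gt; partitions)
+  {
+    // resources have been allocated; do any processing-intensive work here
+  }
+}
+</code></pre>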
+<h3 id="streamcodecs">StreamCodecs</h3>
+<p>A <strong>StreamCodec</strong> is used in partitioning to distribute
the data tuples among the partitions. The <em>StreamCodec</em> computes an integer
hashcode for a data tuple, and this is used along with the <em>PartitionKeys</em> mapping
to determine which partition or partitions receive the data tuple. If a <em>StreamCodec</em>
is not specified, then a default one is used by the platform, which returns the Java hashcode
of the tuple.</p>
+<p><em>StreamCodec</em> is also useful in another aspect of the application:
it is used to serialize and deserialize the tuple to transfer it between operators. The default
<em>StreamCodec</em> uses the Kryo library for serialization.</p>
+<p>The following guidelines are useful when considering a custom <em>StreamCodec</em>:</p>
+<ul>
+<li>A custom <em>StreamCodec</em> is needed if the tuples need to be distributed
based on criteria different from the hashcode of the tuple. If the correct working of an
operator depends on the data from the upstream operator being distributed using custom criteria,
such as being sticky on a “key” field within the tuple, then a custom <em>StreamCodec</em>
should be provided by the operator developer. This codec can implement the custom criteria.
The operator should also return this custom codec in the <strong>getStreamCodec</strong>
method of the input port.</li>
+<li>When implementing a custom <em>StreamCodec</em> for the purpose of
using different criteria to distribute the tuples, the codec can extend an existing <em>StreamCodec</em>
and implement the hashcode method, so that the codec does not have to worry about the serialization
and deserialization functionality. The Apex platform provides two pre-built <em>StreamCodec</em>
implementations for this purpose: <strong>KryoSerializableStreamCodec</strong>,
which uses Kryo for serialization, and <strong>JavaSerializationStreamCodec</strong>,
which uses Java serialization. A sketch follows this list.</li>
+<li>Different <em>StreamCodec</em> implementations can be used for different
inputs of a stream with multiple inputs, when different criteria for distributing the tuples
are desired among those inputs.</li>
+</ul>
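+<p>A hedged sketch of a key-sticky codec extending the Kryo-based codec; the tuple class
<code>KeyedTuple</code> and its <code>getKey</code> method are hypothetical, and
<code>getPartition</code> is how the hashcode is computed in the <em>StreamCodec</em>
interface:</p>
+<pre><code class="java">import com.datatorrent.api.DefaultInputPort;
+import com.datatorrent.api.StreamCodec;
+import com.datatorrent.lib.codec.KryoSerializableStreamCodec;
+
+public class KeyStickyCodec extends KryoSerializableStreamCodec&lt;KeyedTuple&gt;
+{
+  @Override
+  public int getPartition(KeyedTuple tuple)
+  {
+    // all tuples with the same key map to the same partition
+    return tuple.getKey().hashCode();
+  }
+}
+
+// On the receiving operator's input port, the codec would be returned like this:
+// public final transient DefaultInputPort&lt;KeyedTuple&gt; input = new DefaultInputPort&lt;KeyedTuple&gt;()
+// {
+//   @Override
+//   public void process(KeyedTuple tuple) { /* ... */ }
+//
+//   @Override
+//   public StreamCodec&lt;KeyedTuple&gt; getStreamCodec()
+//   {
+//     return new KeyStickyCodec();
+//   }
+// };
+</code></pre>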
+<h2 id="threads">Threads</h2>
+<p>The operator lifecycle methods such as <strong>setup</strong>, <strong>beginWindow</strong>,
<strong>endWindow</strong>, and <strong>process</strong> in <em>InputPorts</em>
are all called by the platform from a single operator lifecycle thread, so the user does not
have to worry about dealing with the issues arising from multi-threaded code. Use of separate
threads in an operator is discouraged because in most cases the motivation
for this is parallelism, but parallelism can already be achieved by using multiple partitions,
and furthermore mistakes are easily made when writing multi-threaded code. When dealing
with high volume and velocity data, the corner cases with incorrectly written multi-threaded
code are encountered more easily and exposed. However, there are times when separate threads
are needed; for example, when interacting with external systems the delay in retrieving or
sending data can be large at times, blocking the operator and other DAG processing such as
committed windows. In these cases the following guidelines must be followed strictly.</p>
+<ul>
+<li>Threads should be started in <strong>activate</strong> and stopped
in <strong>deactivate</strong>. In <em>deactivate</em> the operator
should wait until any threads it launched have finished execution. It can do so by calling
<strong>join</strong> on the threads or, if using an <strong>ExecutorService</strong>,
calling <strong>awaitTermination</strong> on the service (see the sketch after this
list).</li>
+<li>Threads should not call any methods on the ports directly as this can cause concurrency
exceptions and also result in invalid states.</li>
+<li>Threads can share state with the lifecycle methods using data structures that are
either explicitly protected by synchronization or are inherently thread safe such as thread
safe queues.</li>
+<li>If this shared state needs to be protected against failure, then it needs to be
persisted during checkpoint. To have a consistent checkpoint, the state should not be modified
by the thread while it is being serialized and saved by the operator lifecycle thread during
checkpoint. Since the checkpoint process happens outside the window boundary, the thread should
be quiesced between <strong>endWindow</strong> and <strong>beginWindow</strong>,
or more efficiently between the <strong>beforeCheckpoint</strong> and <strong>checkpointed</strong>
callbacks.</li>
+</ul>
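+<p>A hedged sketch of these guidelines: a single worker thread started in <em>activate</em>,
sharing data with the lifecycle thread through a thread-safe queue, and stopped in
<em>deactivate</em>. The external-fetch helper is a hypothetical placeholder.</p>
+<pre><code class="java">import java.util.concurrent.BlockingQueue;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+import java.util.concurrent.LinkedBlockingQueue;
+import java.util.concurrent.TimeUnit;
+
+import com.datatorrent.api.Context.OperatorContext;
+import com.datatorrent.api.DefaultOutputPort;
+import com.datatorrent.api.InputOperator;
+import com.datatorrent.api.Operator;
+import com.datatorrent.common.util.BaseOperator;
+
+public class AsyncFetchInputOperator extends BaseOperator
+    implements InputOperator, Operator.ActivationListener&lt;OperatorContext&gt;
+{
+  // thread-safe structure shared between the worker and the lifecycle thread
+  private final transient BlockingQueue&lt;String&gt; queue = new LinkedBlockingQueue&lt;&gt;();
+  private transient ExecutorService executor;
+  private transient volatile boolean running;
+
+  public final transient DefaultOutputPort&lt;String&gt; output = new DefaultOutputPort&lt;&gt;();
+
+  @Override
+  public void activate(OperatorContext context)
+  {
+    running = true;
+    executor = Executors.newSingleThreadExecutor();
+    executor.submit(new Runnable()
+    {
+      @Override
+      public void run()
+      {
+        while (running) {
+          queue.offer(fetchFromExternalSystem());  // never touch ports from this thread
+        }
+      }
+    });
+  }
+
+  @Override
+  public void emitTuples()
+  {
+    // only the operator lifecycle thread emits on ports
+    String tuple;
+    while ((tuple = queue.poll()) != null) {
+      output.emit(tuple);
+    }
+  }
+
+  @Override
+  public void deactivate()
+  {
+    running = false;
+    executor.shutdown();
+    try {
+      // wait for the worker to finish before returning from deactivate
+      executor.awaitTermination(10, TimeUnit.SECONDS);
+    } catch (InterruptedException e) {
+      Thread.currentThread().interrupt();
+    }
+  }
+
+  private String fetchFromExternalSystem()
+  {
+    return "...";  // placeholder for external I/O
+  }
+}
+</code></pre>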
+              
+            </div>
+          </div>
+          <footer>
+  
+    <div class="rst-footer-buttons" role="navigation" aria-label="footer navigation">
+      
+        <a href="../apex_cli/" class="btn btn-neutral float-right" title="Apex CLI">Next
<span class="icon icon-circle-arrow-right"></span></a>
+      
+      
+        <a href="../control_tuples/" class="btn btn-neutral" title="Custom Control Tuples"><span
class="icon icon-circle-arrow-left"></span> Previous</a>
+      
+    </div>
+  
+
+  <hr/>
+
+  <div role="contentinfo">
+    <!-- Copyright etc -->
+    
+  </div>
+
+  Built with <a href="http://www.mkdocs.org">MkDocs</a> using a <a href="https://github.com/snide/sphinx_rtd_theme">theme</a>
provided by <a href="https://readthedocs.org">Read the Docs</a>.
+</footer>
+	  
+        </div>
+      </div>
+
+    </section>
+
+  </div>
+
+<div class="rst-versions" role="note" style="cursor: pointer">
+    <span class="rst-current-version" data-toggle="rst-current-version">
+      
+      
+        <span><a href="../control_tuples/" style="color: #fcfcfc;">&laquo;
Previous</a></span>
+      
+      
+        <span style="margin-left: 15px"><a href="../apex_cli/" style="color: #fcfcfc">Next
&raquo;</a></span>
+      
+    </span>
+</div>
+
+</body>
+</html>

http://git-wip-us.apache.org/repos/asf/apex-site/blob/fb75848f/docs/apex-3.6/favicon.ico
----------------------------------------------------------------------
diff --git a/docs/apex-3.6/favicon.ico b/docs/apex-3.6/favicon.ico
new file mode 100644
index 0000000..c0b3dae
Binary files /dev/null and b/docs/apex-3.6/favicon.ico differ

http://git-wip-us.apache.org/repos/asf/apex-site/blob/fb75848f/docs/apex-3.6/fonts/fontawesome-webfont.eot
----------------------------------------------------------------------
diff --git a/docs/apex-3.6/fonts/fontawesome-webfont.eot b/docs/apex-3.6/fonts/fontawesome-webfont.eot
new file mode 100755
index 0000000..0662cb9
Binary files /dev/null and b/docs/apex-3.6/fonts/fontawesome-webfont.eot differ

