flink-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From rmetz...@apache.org
Subject [55/78] [abbrv] flink-web git commit: Rebuild site
Date Wed, 18 Jan 2017 14:11:02 GMT
http://git-wip-us.apache.org/repos/asf/flink-web/blob/9ec0a879/content/news/2015/05/11/Juggling-with-Bits-and-Bytes.html
----------------------------------------------------------------------
diff --git a/content/news/2015/05/11/Juggling-with-Bits-and-Bytes.html b/content/news/2015/05/11/Juggling-with-Bits-and-Bytes.html
new file mode 100644
index 0000000..a7a5436
--- /dev/null
+++ b/content/news/2015/05/11/Juggling-with-Bits-and-Bytes.html
@@ -0,0 +1,383 @@
+<!DOCTYPE html>
+<html lang="en">
+  <head>
+    <meta charset="utf-8">
+    <meta http-equiv="X-UA-Compatible" content="IE=edge">
+    <meta name="viewport" content="width=device-width, initial-scale=1">
+    <!-- The above 3 meta tags *must* come first in the head; any other head content must come *after* these tags -->
+    <title>Apache Flink: Juggling with Bits and Bytes</title>
+    <link rel="shortcut icon" href="/favicon.ico" type="image/x-icon">
+    <link rel="icon" href="/favicon.ico" type="image/x-icon">
+
+    <!-- Bootstrap -->
+    <link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.4/css/bootstrap.min.css">
+    <link rel="stylesheet" href="/css/flink.css">
+    <link rel="stylesheet" href="/css/syntax.css">
+
+    <!-- Blog RSS feed -->
+    <link href="/blog/feed.xml" rel="alternate" type="application/rss+xml" title="Apache Flink Blog: RSS feed" />
+
+    <!-- jQuery (necessary for Bootstrap's JavaScript plugins) -->
+    <!-- We need to load Jquery in the header for custom google analytics event tracking-->
+    <script src="https://ajax.googleapis.com/ajax/libs/jquery/1.11.2/jquery.min.js"></script>
+
+    <!-- HTML5 shim and Respond.js for IE8 support of HTML5 elements and media queries -->
+    <!-- WARNING: Respond.js doesn't work if you view the page via file:// -->
+    <!--[if lt IE 9]>
+      <script src="https://oss.maxcdn.com/html5shiv/3.7.2/html5shiv.min.js"></script>
+      <script src="https://oss.maxcdn.com/respond/1.4.2/respond.min.js"></script>
+    <![endif]-->
+  </head>
+  <body>  
+    
+
+    <!-- Main content. -->
+    <div class="container">
+    <div class="row">
+
+      
+     <div id="sidebar" class="col-sm-3">
+          <!-- Top navbar. -->
+    <nav class="navbar navbar-default">
+        <!-- The logo. -->
+        <div class="navbar-header">
+          <button type="button" class="navbar-toggle collapsed" data-toggle="collapse" data-target="#bs-example-navbar-collapse-1">
+            <span class="icon-bar"></span>
+            <span class="icon-bar"></span>
+            <span class="icon-bar"></span>
+          </button>
+          <div class="navbar-logo">
+            <a href="/">
+              <img alt="Apache Flink" src="/img/navbar-brand-logo.png" width="147px" height="73px">
+            </a>
+          </div>
+        </div><!-- /.navbar-header -->
+
+        <!-- The navigation links. -->
+        <div class="collapse navbar-collapse" id="bs-example-navbar-collapse-1">
+          <ul class="nav navbar-nav navbar-main">
+
+            <!-- Downloads -->
+            <li class=""><a class="btn btn-info" href="/downloads.html">Download Flink</a></li>
+
+            <!-- Overview -->
+            <li><a href="/index.html">Home</a></li>
+
+            <!-- Intro -->
+            <li><a href="/introduction.html">Introduction to Flink</a></li>
+
+            <!-- Use cases -->
+            <li><a href="/usecases.html">Flink Use Cases</a></li>
+
+            <!-- Powered by -->
+            <li><a href="/poweredby.html">Powered by Flink</a></li>
+
+            <!-- Ecosystem -->
+            <li><a href="/ecosystem.html">Ecosystem</a></li>
+
+            <!-- Community -->
+            <li><a href="/community.html">Community &amp; Project Info</a></li>
+
+            <!-- Contribute -->
+            <li><a href="/how-to-contribute.html">How to Contribute</a></li>
+
+            <!-- Blog -->
+            <li class=" active hidden-md hidden-sm"><a href="/blog/"><b>Flink Blog</b></a></li>
+
+            <hr />
+
+
+
+            <!-- Documentation -->
+            <!-- <li>
+              <a href="http://ci.apache.org/projects/flink/flink-docs-release-1.1" target="_blank">Documentation <small><span class="glyphicon glyphicon-new-window"></span></small></a>
+            </li> -->
+            <li class="dropdown">
+              <a class="dropdown-toggle" data-toggle="dropdown" href="#">Documentation
+                <span class="caret"></span></a>
+                <ul class="dropdown-menu">
+                  <li><a href="http://ci.apache.org/projects/flink/flink-docs-release-1.1" target="_blank">1.1 (Latest stable release) <small><span class="glyphicon glyphicon-new-window"></span></small></a></li>
+                  <li><a href="http://ci.apache.org/projects/flink/flink-docs-release-1.2" target="_blank">1.2 (Snapshot) <small><span class="glyphicon glyphicon-new-window"></span></small></a></li>
+                </ul>
+              </li>
+
+            <!-- Quickstart -->
+            <li>
+              <a href="http://ci.apache.org/projects/flink/flink-docs-release-1.1/quickstart/setup_quickstart.html" target="_blank">Quickstart <small><span class="glyphicon glyphicon-new-window"></span></small></a>
+            </li>
+
+            <!-- GitHub -->
+            <li>
+              <a href="https://github.com/apache/flink" target="_blank">Flink on GitHub <small><span class="glyphicon glyphicon-new-window"></span></small></a>
+            </li>
+
+
+
+
+
+
+          </ul>
+
+
+
+          <ul class="nav navbar-nav navbar-bottom">
+          <hr />
+
+            <!-- FAQ -->
+            <li ><a href="/faq.html">Project FAQ</a></li>
+
+            <!-- Twitter -->
+            <li><a href="https://twitter.com/apacheflink" target="_blank">@ApacheFlink <small><span class="glyphicon glyphicon-new-window"></span></small></a></li>
+
+            <!-- Visualizer -->
+            <li class=" hidden-md hidden-sm"><a href="/visualizer/" target="_blank">Plan Visualizer <small><span class="glyphicon glyphicon-new-window"></span></small></a></li>
+
+          </ul>
+        </div><!-- /.navbar-collapse -->
+    </nav>
+
+      </div>
+      <div class="col-sm-9">
+      <div class="row-fluid">
+  <div class="col-sm-12">
+    <div class="row">
+      <h1>Juggling with Bits and Bytes</h1>
+
+      <article>
+        <p>11 May 2015 by Fabian Hüske (<a href="https://twitter.com/fhueske">@fhueske</a>)</p>
+
+<h2 id="how-apache-flink-operates-on-binary-data">How Apache Flink operates on binary data</h2>
+
+<p>Nowadays, a lot of open-source systems for analyzing large data sets are implemented in Java or other JVM-based programming languages. The most well-known example is Apache Hadoop, but also newer frameworks such as Apache Spark, Apache Drill, and also Apache Flink run on JVMs. A common challenge that JVM-based data analysis engines face is to store large amounts of data in memory - both for caching and for efficient processing such as sorting and joining of data. Managing the JVM memory well makes the difference between a system that is hard to configure and has unpredictable reliability and performance and a system that behaves robustly with few configuration knobs.</p>
+
+<p>In this blog post we discuss how Apache Flink manages memory, talk about its custom data de/serialization stack, and show how it operates on binary data.</p>
+
+<h2 id="data-objects-lets-put-them-on-the-heap">Data Objects? Let’s put them on the heap!</h2>
+
+<p>The most straight-forward approach to process lots of data in a JVM is to put it as objects on the heap and operate on these objects. Caching a data set as objects would be as simple as maintaining a list containing an object for each record. An in-memory sort would simply sort the list of objects.
+However, this approach has a few notable drawbacks. First of all it is not trivial to watch and control heap memory usage when a lot of objects are created and invalidated constantly. Memory overallocation instantly kills the JVM with an <code>OutOfMemoryError</code>. Another aspect is garbage collection on multi-GB JVMs which are flooded with new objects. The overhead of garbage collection in such environments can easily reach 50% and more. Finally, Java objects come with a certain space overhead depending on the JVM and platform. For data sets with many small objects this can significantly reduce the effectively usable amount of memory. Given proficient system design and careful, use-case specific system parameter tuning, heap memory usage can be more or less controlled and <code>OutOfMemoryErrors</code> avoided. However, such setups are rather fragile especially if data characteristics or the execution environment change.</p>
+
+<h2 id="what-is-flink-doing-about-that">What is Flink doing about that?</h2>
+
+<p>Apache Flink has its roots in a research project which aimed to combine the best technologies of MapReduce-based systems and parallel database systems. Coming from this background, Flink has always had its own way of processing data in-memory. Instead of putting lots of objects on the heap, Flink serializes objects into a fixed number of pre-allocated memory segments. Its DBMS-style sort and join algorithms operate as much as possible on this binary data to keep the de/serialization overhead at a minimum. If more data needs to be processed than can be kept in memory, Flink’s operators partially spill data to disk. In fact, a lot of Flink’s internal implementations look more like C/C++ rather than common Java. The following figure gives a high-level overview of how Flink stores data serialized in memory segments and spills to disk if necessary.</p>
+
+<center>
+<img src="/img/blog/memory-mgmt.png" style="width:90%;margin:15px" />
+</center>
+
+<p>Flink’s style of active memory management and operating on binary data has several benefits:</p>
+
+<ol>
+  <li><strong>Memory-safe execution &amp; efficient out-of-core algorithms.</strong> Due to the fixed amount of allocated memory segments, it is trivial to monitor remaining memory resources. In case of memory shortage, processing operators can efficiently write larger batches of memory segments to disk and later read them back. Consequently, <code>OutOfMemoryErrors</code> are effectively prevented.</li>
+  <li><strong>Reduced garbage collection pressure.</strong> Because all long-lived data is in binary representation in Flink’s managed memory, all data objects are short-lived or even mutable and can be reused. Short-lived objects can be more efficiently garbage-collected, which significantly reduces garbage collection pressure. Right now, the pre-allocated memory segments are long-lived objects on the JVM heap, but the Flink community is actively working on allocating off-heap memory for this purpose. This effort will result in much smaller JVM heaps and facilitate even faster garbage collection cycles.</li>
+  <li><strong>Space efficient data representation.</strong> Java objects have a storage overhead which can be avoided if the data is stored in a binary representation.</li>
+  <li><strong>Efficient binary operations &amp; cache sensitivity.</strong> Binary data can be efficiently compared and operated on given a suitable binary representation. Furthermore, the binary representations can put related values, as well as hash codes, keys, and pointers, adjacently into memory. This gives data structures with usually more cache efficient access patterns.</li>
+</ol>
+
+<p>These properties of active memory management are very desirable in a data processing system for large-scale data analytics but have a significant price tag attached. Active memory management and operating on binary data is not trivial to implement, e.g., using <code>java.util.HashMap</code> is much easier than implementing a spillable hash-table backed by byte arrays and a custom serialization stack. Of course Apache Flink is not the only JVM-based data processing system that operates on serialized binary data. Projects such as <a href="http://drill.apache.org/">Apache Drill</a>, <a href="http://ignite.incubator.apache.org/">Apache Ignite (incubating)</a> or <a href="http://projectgeode.org/">Apache Geode (incubating)</a> apply similar techniques and it was recently announced that also <a href="http://spark.apache.org/">Apache Spark</a> will evolve in this direction with <a href="https://databricks.com/blog/2015/04/28/project-tungsten-bringing-spark-closer-to-bare-metal.html">Project Tungsten</a>.</p>
+
+<p>In the following we discuss in detail how Flink allocates memory, de/serializes objects, and operates on binary data. We will also show some performance numbers comparing processing objects on the heap and operating on binary data.</p>
+
+<h2 id="how-does-flink-allocate-memory">How does Flink allocate memory?</h2>
+
+<p>A Flink worker, called TaskManager, is composed of several internal components such as an actor system for coordination with the Flink master, an IOManager that takes care of spilling data to disk and reading it back, and a MemoryManager that coordinates memory usage. In the context of this blog post, the MemoryManager is of most interest.</p>
+
+<p>The MemoryManager takes care of allocating, accounting, and distributing MemorySegments to data processing operators such as sort and join operators. A <a href="https://github.com/apache/flink/blob/release-0.9.0-milestone-1/flink-core/src/main/java/org/apache/flink/core/memory/MemorySegment.java">MemorySegment</a> is Flink’s distribution unit of memory and is backed by a regular Java byte array (size is 32 KB by default). A MemorySegment provides very efficient write and read access to its backed byte array using Java’s unsafe methods. You can think of a MemorySegment as a custom-tailored version of Java’s NIO ByteBuffer. In order to operate on multiple MemorySegments like on a larger chunk of consecutive memory, Flink uses logical views that implement Java’s <code>java.io.DataOutput</code> and <code>java.io.DataInput</code> interfaces.</p>
+
+<p>MemorySegments are allocated once at TaskManager start-up time and are destroyed when the TaskManager is shut down. Hence, they are reused and not garbage-collected over the whole lifetime of a TaskManager. After all internal data structures of a TaskManager have been initialized and all core services have been started, the MemoryManager starts creating MemorySegments. By default 70% of the JVM heap that is available after service initialization is allocated by the MemoryManager. It is also possible to configure an absolute amount of managed memory. The remaining JVM heap is used for objects that are instantiated during task processing, including objects created by user-defined functions. The following figure shows the memory distribution in the TaskManager JVM after startup.</p>
+
+<center>
+<img src="/img/blog/memory-alloc.png" style="width:60%;margin:15px" />
+</center>
+
+<h2 id="how-does-flink-serialize-objects">How does Flink serialize objects?</h2>
+
+<p>The Java ecosystem offers several libraries to convert objects into a binary representation and back. Common alternatives are standard Java serialization, <a href="https://github.com/EsotericSoftware/kryo">Kryo</a>, <a href="http://avro.apache.org/">Apache Avro</a>, <a href="http://thrift.apache.org/">Apache Thrift</a>, or Google’s <a href="https://github.com/google/protobuf">Protobuf</a>. Flink includes its own custom serialization framework in order to control the binary representation of data. This is important because operating on binary data such as comparing or even manipulating binary data requires exact knowledge of the serialization layout. Further, configuring the serialization layout with respect to operations that are performed on binary data can yield a significant performance boost. Flink’s serialization stack also leverages the fact that the types of the objects which are going through de/serialization are exactly known before a program is executed.</p>
+
+<p>Flink programs can process data represented as arbitrary Java or Scala objects. Before a program is optimized, the data types at each processing step of the program’s data flow need to be identified. For Java programs, Flink features a reflection-based type extraction component to analyze the return types of user-defined functions. Scala programs are analyzed with help of the Scala compiler. Flink represents each data type with a <a href="https://github.com/apache/flink/blob/release-0.9.0-milestone-1/flink-core/src/main/java/org/apache/flink/api/common/typeinfo/TypeInformation.java">TypeInformation</a>. Flink has TypeInformations for several kinds of data types, including:</p>
+
+<ul>
+  <li>BasicTypeInfo: Any (boxed) Java primitive type or java.lang.String.</li>
+  <li>BasicArrayTypeInfo: Any array of a (boxed) Java primitive type or java.lang.String.</li>
+  <li>WritableTypeInfo: Any implementation of Hadoop’s Writable interface.</li>
+  <li>TupleTypeInfo: Any Flink tuple (Tuple1 to Tuple25). Flink tuples are Java representations for fixed-length tuples with typed fields.</li>
+  <li>CaseClassTypeInfo: Any Scala CaseClass (including Scala tuples).</li>
+  <li>PojoTypeInfo: Any POJO (Java or Scala), i.e., an object with all fields either being public or accessible through getters and setters that follow the common naming conventions.</li>
+  <li>GenericTypeInfo: Any data type that cannot be identified as another type.</li>
+</ul>
+
+<p>Each TypeInformation provides a serializer for the data type it represents. For example, a BasicTypeInfo returns a serializer that writes the respective primitive type, the serializer of a WritableTypeInfo delegates de/serialization to the write() and readFields() methods of the object implementing Hadoop’s Writable interface, and a GenericTypeInfo returns a serializer that delegates serialization to Kryo. Object serialization to a DataOutput which is backed by Flink MemorySegments goes automatically through Java’s efficient unsafe operations. For data types that can be used as keys, i.e., compared and hashed, the TypeInformation provides TypeComparators. TypeComparators compare and hash objects and can - depending on the concrete data type - also efficiently compare binary representations and extract fixed-length binary key prefixes.</p>
+
+<p>Tuple, Pojo, and CaseClass types are composite types, i.e., containers for one or more possibly nested data types. As such, their serializers and comparators are also composite and delegate the serialization and comparison of their member data types to the respective serializers and comparators. The following figure illustrates the serialization of a (nested) <code>Tuple3&lt;Integer, Double, Person&gt;</code> object where <code>Person</code> is a POJO and defined as follows:</p>
+
+<div class="highlight"><pre><code class="language-java"><span class="kd">public</span> <span class="kd">class</span> <span class="nc">Person</span> <span class="o">{</span>
+    <span class="kd">public</span> <span class="kt">int</span> <span class="n">id</span><span class="o">;</span>
+    <span class="kd">public</span> <span class="n">String</span> <span class="n">name</span><span class="o">;</span>
+<span class="o">}</span></code></pre></div>
+
+<center>
+<img src="/img/blog/data-serialization.png" style="width:80%;margin:15px" />
+</center>
+
+<p>Flink’s type system can be easily extended by providing custom TypeInformations, Serializers, and Comparators to improve the performance of serializing and comparing custom data types.</p>
+
+<h2 id="how-does-flink-operate-on-binary-data">How does Flink operate on binary data?</h2>
+
+<p>Similar to many other data processing APIs (including SQL), Flink’s APIs provide transformations to group, sort, and join data sets. These transformations operate on potentially very large data sets. Relational database systems have featured very efficient algorithms for these purposes for several decades, including external merge-sort, merge-join, and hybrid hash-join. Flink builds on this technology, but generalizes it to handle arbitrary objects using its custom serialization and comparison stack. In the following, we show how Flink operates with binary data by the example of Flink’s in-memory sort algorithm.</p>
+
+<p>Flink assigns a memory budget to its data processing operators. Upon initialization, a sort algorithm requests its memory budget from the MemoryManager and receives a corresponding set of MemorySegments. The set of MemorySegments becomes the memory pool of a so-called sort buffer which collects the data that is to be sorted. The following figure illustrates how data objects are serialized into the sort buffer.</p>
+
+<center>
+<img src="/img/blog/sorting-binary-data-1.png" style="width:90%;margin:15px" />
+</center>
+
+<p>The sort buffer is internally organized into two memory regions. The first region holds the full binary data of all objects. The second region contains pointers to the full binary object data and - depending on the key data type - fixed-length sort keys. When an object is added to the sort buffer, its binary data is appended to the first region, and a pointer (and possibly a key) is appended to the second region. The separation of actual data and pointers plus fixed-length keys is done for two purposes. It enables efficient swapping of fixed-length entries (key+pointer) and also reduces the data that needs to be moved when sorting. If the sort key is a variable length data type such as a String, the fixed-length sort key must be a prefix key such as the first n characters of a String. Note, not all data types provide a fixed-length (prefix) sort key. When serializing objects into the sort buffer, both memory regions are extended with MemorySegments from the memory pool. Once the memory pool is empty and no more objects can be added, the sort buffer is completely filled and can be sorted. Flink’s sort buffer provides methods to compare and swap elements. This makes the actual sort algorithm pluggable. By default, Flink uses a Quicksort implementation which can fall back to HeapSort.
+The following figure shows how two objects are compared.</p>
+
+<center>
+<img src="/img/blog/sorting-binary-data-2.png" style="width:80%;margin:15px" />
+</center>
+
+<p>The sort buffer compares two elements by comparing their binary fixed-length sort keys. The comparison is successful if it is either done on a full key (not a prefix key) or if the binary prefix keys are not equal. If the prefix keys are equal (or the sort key data type does not provide a binary prefix key), the sort buffer follows the pointers to the actual object data, deserializes both objects and compares the objects. Depending on the result of the comparison, the sort algorithm decides whether to swap the compared elements or not. The sort buffer swaps two elements by moving their fixed-length keys and pointers. The actual data is not moved. Once the sort algorithm finishes, the pointers in the sort buffer are correctly ordered. The following figure shows how the sorted data is returned from the sort buffer.</p>
+
+<center>
+<img src="/img/blog/sorting-binary-data-3.png" style="width:80%;margin:15px" />
+</center>
+
+<p>The sorted data is returned by sequentially reading the pointer region of the sort buffer, skipping the sort keys and following the sorted pointers to the actual data. This data is either deserialized and returned as objects or the binary representation is copied and written to disk in case of an external merge-sort (see this <a href="http://flink.apache.org/news/2015/03/13/peeking-into-Apache-Flinks-Engine-Room.html">blog post on joins in Flink</a>).</p>
+
+<h2 id="show-me-numbers">Show me numbers!</h2>
+
+<p>So, what does operating on binary data mean for performance? We’ll run a benchmark that sorts 10 million <code>Tuple2&lt;Integer, String&gt;</code> objects to find out. The values of the Integer field are sampled from a uniform distribution. The String field values have a length of 12 characters and are sampled from a long-tail distribution. The input data is provided by an iterator that returns a mutable object, i.e., the same tuple object instance is returned with different field values. Flink uses this technique when reading data from memory, network, or disk to avoid unnecessary object instantiations. The benchmarks are run in a JVM with 900 MB heap size which is approximately the required amount of memory to store and sort 10 million tuple objects on the heap without dying of an <code>OutOfMemoryError</code>. We sort the tuples on the Integer field and on the String field using three sorting methods:</p>
+
+<ol>
+  <li><strong>Object-on-heap.</strong> The tuples are stored in a regular <code>java.util.ArrayList</code> with initial capacity set to 10 million entries and sorted using Java’s regular collection sort.</li>
+  <li><strong>Flink-serialized.</strong> The tuple fields are serialized into a sort buffer of 600 MB size using Flink’s custom serializers, sorted as described above, and finally deserialized again. When sorting on the Integer field, the full Integer is used as sort key such that the sort happens entirely on binary data (no deserialization of objects required). For sorting on the String field an 8-byte prefix key is used and tuple objects are deserialized if the prefix keys are equal.</li>
+  <li><strong>Kryo-serialized.</strong> The tuple fields are serialized into a sort buffer of 600 MB size using Kryo serialization and sorted without binary sort keys. This means that each pair-wise comparison requires two objects to be deserialized.</li>
+</ol>
+
+<p>All sort methods are implemented using a single thread. The reported times are averaged over ten runs. After each run, we call <code>System.gc()</code> to request a garbage collection run which does not go into measured execution time. The following figure shows the time to store the input data in memory, sort it, and read it back as objects.</p>
+
+<center>
+<img src="/img/blog/sort-benchmark.png" style="width:90%;margin:15px" />
+</center>
+
+<p>We see that Flink’s sort on binary data using its own serializers significantly outperforms the other two methods. Compared to the object-on-heap method, we see that loading the data into memory is much faster. Since we actually collect the objects, there is no opportunity to reuse the object instances, but have to re-create every tuple. This is less efficient than Flink’s serializers (or Kryo serialization). On the other hand, reading objects from the heap comes for free compared to deserialization. In our benchmark, object cloning was more expensive than serialization and deserialization combined. Looking at the sorting time, we see that also sorting on the binary representation is faster than Java’s collection sort. Sorting data that was serialized using Kryo without a binary sort key is much slower than both other methods. This is due to the heavy deserialization overhead. Sorting the tuples on their String field is faster than sorting on the Integer field due to the long-tailed value distribution which significantly reduces the number of pair-wise comparisons. To get a better feeling of what is happening during sorting we monitored the executing JVM using VisualVM. The following screenshots show heap memory usage, garbage collection activity and CPU usage over the execution of 10 runs.</p>
+
+<table width="100%">
+  <tr>
+    <th></th>
+    <th><center><b>Garbage Collection</b></center></th>
+    <th><center><b>Memory Usage</b></center></th>
+  </tr>
+  <tr>
+    <td><b>Object-on-Heap (int)</b></td>
+    <td><img src="/img/blog/objHeap-int-gc.png" style="width:80%" /></td>
+    <td><img src="/img/blog/objHeap-int-mem.png" style="width:80%" /></td>
+  </tr>
+  <tr>
+    <td><b>Flink-Serialized (int)</b></td>
+    <td><img src="/img/blog/flinkSer-int-gc.png" style="width:80%" /></td>
+    <td><img src="/img/blog/flinkSer-int-mem.png" style="width:80%" /></td>
+  </tr>
+  <tr>
+    <td><b>Kryo-Serialized (int)</b></td>
+    <td><img src="/img/blog/kryoSer-int-gc.png" style="width:80%" /></td>
+    <td><img src="/img/blog/kryoSer-int-mem.png" style="width:80%" /></td>
+  </tr>
+</table>
+
+<p>The experiments run single-threaded on an 8-core machine, so full utilization of one core only corresponds to a 12.5% overall utilization. The screenshots show that operating on binary data significantly reduces garbage collection activity. For the object-on-heap approach, the garbage collector runs in very short intervals while filling the sort buffer and causes a lot of CPU usage even for a single processing thread (sorting itself does not trigger the garbage collector). The JVM garbage collects with multiple parallel threads, explaining the high overall CPU utilization. On the other hand, the methods that operate on serialized data rarely trigger the garbage collector and have a much lower CPU utilization. In fact the garbage collector does not run at all if the tuples are sorted on the Integer field using the flink-serialized method because no objects need to be deserialized for pair-wise comparisons. The kryo-serialized method requires slightly more garbage collection since it does not use binary sort keys and deserializes two objects for each comparison.</p>
+
+<p>The memory usage charts show that the flink-serialized and kryo-serialized methods constantly occupy a high amount of memory (plus some objects for operation). This is due to the pre-allocation of MemorySegments. The actual memory usage is much lower, because the sort buffers are not completely filled. The following table shows the memory consumption of each method. 10 million records result in about 280 MB of binary data (object data plus pointers and sort keys) depending on the used serializer and presence and size of a binary sort key. Comparing this to the memory requirements of the object-on-heap approach we see that operating on binary data can significantly improve memory efficiency. In our benchmark more than twice as much data can be sorted in-memory if serialized into a sort buffer instead of holding it as objects on the heap.</p>
+
+<table width="100%">
+  <tr>
+  	<th>Occupied Memory</th>
+    <th>Object-on-Heap</th>
+    <th>Flink-Serialized</th>
+    <th>Kryo-Serialized</th>
+  </tr>
+  <tr>
+    <td><b>Sort on Integer</b></td>
+    <td>approx. 700 MB (heap)</td>
+    <td>277 MB (sort buffer)</td>
+    <td>266 MB (sort buffer)</td>
+  </tr>
+  <tr>
+    <td><b>Sort on String</b></td>
+    <td>approx. 700 MB (heap)</td>
+    <td>315 MB (sort buffer)</td>
+    <td>266 MB (sort buffer)</td>
+  </tr>
+</table>
+
+<p><br /></p>
+
+<p>To summarize, the experiments verify the previously stated benefits of operating on binary data.</p>
+
+<h2 id="were-not-done-yet">We’re not done yet!</h2>
+
+<p>Apache Flink features quite a few advanced techniques to safely and efficiently process huge amounts of data with limited memory resources. However, there are a few points that could make Flink even more efficient. The Flink community is working on moving the managed memory to off-heap memory. This will allow for smaller JVMs, lower garbage collection overhead, and also easier system configuration. With Flink’s Table API, the semantics of all operations such as aggregations and projections are known (in contrast to black-box user-defined functions). Hence we can generate code for Table API operations that directly operates on binary data. Further improvements include serialization layouts which are tailored towards the operations that are applied on the binary data and code generation for serializers and comparators.</p>
+
+<p>The groundwork (and a lot more) for operating on binary data is done but there is still some room for making Flink even better and faster. If you are crazy about performance and like to juggle with a lot of bits and bytes, join the Flink community!</p>
+
+<h2 id="tldr-give-me-three-things-to-remember">TL;DR; Give me three things to remember!</h2>
+
+<ul>
+  <li>Flink’s active memory management avoids nasty <code>OutOfMemoryErrors</code> that kill your JVMs and reduces garbage collection overhead.</li>
+  <li>Flink features a highly efficient data de/serialization stack that facilitates operations on binary data and makes more data fit into memory.</li>
+  <li>Flink’s DBMS-style operators operate natively on binary data yielding high performance in-memory and destage gracefully to disk if necessary.</li>
+</ul>
+
+      </article>
+    </div>
+
+    <div class="row">
+      <div id="disqus_thread"></div>
+      <script type="text/javascript">
+        /* * * CONFIGURATION VARIABLES: EDIT BEFORE PASTING INTO YOUR WEBPAGE * * */
+        var disqus_shortname = 'stratosphere-eu'; // required: replace example with your forum shortname
+
+        /* * * DON'T EDIT BELOW THIS LINE * * */
+        (function() {
+            var dsq = document.createElement('script'); dsq.type = 'text/javascript'; dsq.async = true;
+            dsq.src = '//' + disqus_shortname + '.disqus.com/embed.js';
+             (document.getElementsByTagName('head')[0] || document.getElementsByTagName('body')[0]).appendChild(dsq);
+        })();
+      </script>
+    </div>
+  </div>
+</div>
+      </div>
+    </div>
+
+    <hr />
+
+    <div class="row">
+      <div class="footer text-center col-sm-12">
+        <p>Copyright © 2014-2016 <a href="http://apache.org">The Apache Software Foundation</a>. All Rights Reserved.</p>
+        <p>Apache Flink, Apache, and the Apache feather logo are either registered trademarks or trademarks of The Apache Software Foundation.</p>
+        <p><a href="/privacy-policy.html">Privacy Policy</a> &middot; <a href="/blog/feed.xml">RSS feed</a></p>
+      </div>
+    </div>
+    </div><!-- /.container -->
+
+    <!-- Include all compiled plugins (below), or include individual files as needed -->
+    <script src="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.4/js/bootstrap.min.js"></script>
+    <script src="/js/codetabs.js"></script>
+    <script src="/js/stickysidebar.js"></script>
+
+
+    <!-- Google Analytics -->
+    <script>
+      (function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){
+      (i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new Date();a=s.createElement(o),
+      m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)
+      })(window,document,'script','//www.google-analytics.com/analytics.js','ga');
+
+      ga('create', 'UA-52545728-1', 'auto');
+      ga('send', 'pageview');
+    </script>
+  </body>
+</html>

http://git-wip-us.apache.org/repos/asf/flink-web/blob/9ec0a879/content/news/2015/05/14/Community-update-April.html
----------------------------------------------------------------------
diff --git a/content/news/2015/05/14/Community-update-April.html b/content/news/2015/05/14/Community-update-April.html
new file mode 100644
index 0000000..0e58fe8
--- /dev/null
+++ b/content/news/2015/05/14/Community-update-April.html
@@ -0,0 +1,231 @@
+<!DOCTYPE html>
+<html lang="en">
+  <head>
+    <meta charset="utf-8">
+    <meta http-equiv="X-UA-Compatible" content="IE=edge">
+    <meta name="viewport" content="width=device-width, initial-scale=1">
+    <!-- The above 3 meta tags *must* come first in the head; any other head content must come *after* these tags -->
+    <title>Apache Flink: April 2015 in the Flink community</title>
+    <link rel="shortcut icon" href="/favicon.ico" type="image/x-icon">
+    <link rel="icon" href="/favicon.ico" type="image/x-icon">
+
+    <!-- Bootstrap -->
+    <link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.4/css/bootstrap.min.css">
+    <link rel="stylesheet" href="/css/flink.css">
+    <link rel="stylesheet" href="/css/syntax.css">
+
+    <!-- Blog RSS feed -->
+    <link href="/blog/feed.xml" rel="alternate" type="application/rss+xml" title="Apache Flink Blog: RSS feed" />
+
+    <!-- jQuery (necessary for Bootstrap's JavaScript plugins) -->
+    <!-- We need to load Jquery in the header for custom google analytics event tracking-->
+    <script src="https://ajax.googleapis.com/ajax/libs/jquery/1.11.2/jquery.min.js"></script>
+
+    <!-- HTML5 shim and Respond.js for IE8 support of HTML5 elements and media queries -->
+    <!-- WARNING: Respond.js doesn't work if you view the page via file:// -->
+    <!--[if lt IE 9]>
+      <script src="https://oss.maxcdn.com/html5shiv/3.7.2/html5shiv.min.js"></script>
+      <script src="https://oss.maxcdn.com/respond/1.4.2/respond.min.js"></script>
+    <![endif]-->
+  </head>
+  <body>  
+    
+
+    <!-- Main content. -->
+    <div class="container">
+    <div class="row">
+
+      
+     <div id="sidebar" class="col-sm-3">
+          <!-- Top navbar. -->
+    <nav class="navbar navbar-default">
+        <!-- The logo. -->
+        <div class="navbar-header">
+          <button type="button" class="navbar-toggle collapsed" data-toggle="collapse" data-target="#bs-example-navbar-collapse-1">
+            <span class="icon-bar"></span>
+            <span class="icon-bar"></span>
+            <span class="icon-bar"></span>
+          </button>
+          <div class="navbar-logo">
+            <a href="/">
+              <img alt="Apache Flink" src="/img/navbar-brand-logo.png" width="147px" height="73px">
+            </a>
+          </div>
+        </div><!-- /.navbar-header -->
+
+        <!-- The navigation links. -->
+        <div class="collapse navbar-collapse" id="bs-example-navbar-collapse-1">
+          <ul class="nav navbar-nav navbar-main">
+
+            <!-- Downloads -->
+            <li class=""><a class="btn btn-info" href="/downloads.html">Download Flink</a></li>
+
+            <!-- Overview -->
+            <li><a href="/index.html">Home</a></li>
+
+            <!-- Intro -->
+            <li><a href="/introduction.html">Introduction to Flink</a></li>
+
+            <!-- Use cases -->
+            <li><a href="/usecases.html">Flink Use Cases</a></li>
+
+            <!-- Powered by -->
+            <li><a href="/poweredby.html">Powered by Flink</a></li>
+
+            <!-- Ecosystem -->
+            <li><a href="/ecosystem.html">Ecosystem</a></li>
+
+            <!-- Community -->
+            <li><a href="/community.html">Community &amp; Project Info</a></li>
+
+            <!-- Contribute -->
+            <li><a href="/how-to-contribute.html">How to Contribute</a></li>
+
+            <!-- Blog -->
+            <li class=" active hidden-md hidden-sm"><a href="/blog/"><b>Flink Blog</b></a></li>
+
+            <hr />
+
+
+
+            <!-- Documentation -->
+            <!-- <li>
+              <a href="http://ci.apache.org/projects/flink/flink-docs-release-1.1" target="_blank">Documentation <small><span class="glyphicon glyphicon-new-window"></span></small></a>
+            </li> -->
+            <li class="dropdown">
+              <a class="dropdown-toggle" data-toggle="dropdown" href="#">Documentation
+                <span class="caret"></span></a>
+                <ul class="dropdown-menu">
+                  <li><a href="http://ci.apache.org/projects/flink/flink-docs-release-1.1" target="_blank">1.1 (Latest stable release) <small><span class="glyphicon glyphicon-new-window"></span></small></a></li>
+                  <li><a href="http://ci.apache.org/projects/flink/flink-docs-release-1.2" target="_blank">1.2 (Snapshot) <small><span class="glyphicon glyphicon-new-window"></span></small></a></li>
+                </ul>
+              </li>
+
+            <!-- Quickstart -->
+            <li>
+              <a href="http://ci.apache.org/projects/flink/flink-docs-release-1.1/quickstart/setup_quickstart.html" target="_blank">Quickstart <small><span class="glyphicon glyphicon-new-window"></span></small></a>
+            </li>
+
+            <!-- GitHub -->
+            <li>
+              <a href="https://github.com/apache/flink" target="_blank">Flink on GitHub <small><span class="glyphicon glyphicon-new-window"></span></small></a>
+            </li>
+
+
+
+
+
+
+          </ul>
+
+
+
+          <ul class="nav navbar-nav navbar-bottom">
+          <hr />
+
+            <!-- FAQ -->
+            <li ><a href="/faq.html">Project FAQ</a></li>
+
+            <!-- Twitter -->
+            <li><a href="https://twitter.com/apacheflink" target="_blank">@ApacheFlink <small><span class="glyphicon glyphicon-new-window"></span></small></a></li>
+
+            <!-- Visualizer -->
+            <li class=" hidden-md hidden-sm"><a href="/visualizer/" target="_blank">Plan Visualizer <small><span class="glyphicon glyphicon-new-window"></span></small></a></li>
+
+          </ul>
+        </div><!-- /.navbar-collapse -->
+    </nav>
+
+      </div>
+      <div class="col-sm-9">
+      <div class="row-fluid">
+  <div class="col-sm-12">
+    <div class="row">
+      <h1>April 2015 in the Flink community</h1>
+
+      <article>
+        <p>14 May 2015 by Kostas Tzoumas (<a href="https://twitter.com/kostas_tzoumas">@kostas_tzoumas</a>)</p>
+
+<p>April was a packed month for Apache Flink.</p>
+
+<h2 id="flink-runner-for-google-cloud-dataflow">Flink runner for Google Cloud Dataflow</h2>
+
+<p>A Flink runner for Google Cloud Dataflow was announced. See the blog
+posts by <a href="http://data-artisans.com/announcing-google-cloud-dataflow-on-flink-and-easy-flink-deployment-on-google-cloud/">data Artisans</a> and
+the <a href="http://googlecloudplatform.blogspot.de/2015/03/announcing-Google-Cloud-Dataflow-runner-for-Apache-Flink.html">Google Cloud Platform Blog</a>.
+Google Cloud Dataflow programs can be written using an open-source
+SDK and run in multiple backends, either as a managed service inside
+Google’s infrastructure, or leveraging open source runners,
+including Apache Flink.</p>
+
+<h2 id="flink-090-milestone1-release">Flink 0.9.0-milestone1 release</h2>
+
+<p>The highlight of April was of course the availability of <a href="/news/2015/04/13/release-0.9.0-milestone1.html">Flink 0.9-milestone1</a>. This was a release packed with new features, including a Python DataSet API, the new SQL-like Table API, FlinkML, a machine learning library on Flink, Gelly, Flink’s Graph API, as well as a mode to run Flink on YARN leveraging Tez. In case you missed it, check out the <a href="/news/2015/04/13/release-0.9.0-milestone1.html">release announcement blog post</a> for details.</p>
+
+<h2 id="conferences-and-meetups">Conferences and meetups</h2>
+
+<p>April kicked off the conference season. Apache Flink was presented at ApacheCon in Texas (<a href="http://www.slideshare.net/fhueske/apache-flink">slides</a>), the Hadoop Summit in Brussels featured two talks on Flink (see slides <a href="http://www.slideshare.net/AljoschaKrettek/data-analysis-with-apache-flink-hadoop-summit-2015">here</a> and <a href="http://www.slideshare.net/GyulaFra/flink-streaming-hadoopsummit">here</a>), as well as at the Hadoop User Groups of the Netherlands (<a href="http://www.slideshare.net/stephanewen1/apache-flink-overview-and-use-cases-at-prehadoop-summit-meetups">slides</a>) and Stockholm. The brand new <a href="http://www.meetup.com/Apache-Flink-Stockholm/">Apache Flink meetup Stockholm</a> was also established.</p>
+
+<h2 id="google-summer-of-code">Google Summer of Code</h2>
+
+<p>Three students will work on Flink during Google’s <a href="https://www.google-melange.com/gsoc/homepage/google/gsoc2015">Summer of Code program</a> on distributed pattern matching, exact and approximate statistics for data streams and windows, as well as asynchronous iterations and updates.</p>
+
+<h2 id="flink-on-the-web">Flink on the web</h2>
+
+<p>Fabian Hueske gave an <a href="http://www.infoq.com/news/2015/04/hueske-apache-flink?utm_campaign=infoq_content&amp;utm_source=infoq&amp;utm_medium=feed&amp;utm_term=global">interview at InfoQ</a> on Apache Flink.</p>
+
+<h2 id="upcoming-events">Upcoming events</h2>
+
+<p>Stay tuned for a wealth of upcoming events! Two Flink talks will be presented at <a href="http://berlinbuzzwords.de/15/sessions">Berlin Buzzwords</a>, and Flink will be presented at the <a href="http://2015.hadoopsummit.org/san-jose/">Hadoop Summit in San Jose</a>. A <a href="http://www.meetup.com/Apache-Flink-Meetup/events/220557545/">training workshop on Apache Flink</a> is being organized in Berlin. Finally, <a href="http://2015.flink-forward.org/">Flink Forward</a>, the first conference to bring together the whole Flink community, is taking place in Berlin in October 2015.</p>
+
+      </article>
+    </div>
+
+    <div class="row">
+      <div id="disqus_thread"></div>
+      <script type="text/javascript">
+        /* * * CONFIGURATION VARIABLES: EDIT BEFORE PASTING INTO YOUR WEBPAGE * * */
+        var disqus_shortname = 'stratosphere-eu'; // required: replace example with your forum shortname
+
+        /* * * DON'T EDIT BELOW THIS LINE * * */
+        (function() {
+            var dsq = document.createElement('script'); dsq.type = 'text/javascript'; dsq.async = true;
+            dsq.src = '//' + disqus_shortname + '.disqus.com/embed.js';
+             (document.getElementsByTagName('head')[0] || document.getElementsByTagName('body')[0]).appendChild(dsq);
+        })();
+      </script>
+    </div>
+  </div>
+</div>
+      </div>
+    </div>
+
+    <hr />
+
+    <div class="row">
+      <div class="footer text-center col-sm-12">
+        <p>Copyright © 2014-2016 <a href="http://apache.org">The Apache Software Foundation</a>. All Rights Reserved.</p>
+        <p>Apache Flink, Apache, and the Apache feather logo are either registered trademarks or trademarks of The Apache Software Foundation.</p>
+        <p><a href="/privacy-policy.html">Privacy Policy</a> &middot; <a href="/blog/feed.xml">RSS feed</a></p>
+      </div>
+    </div>
+    </div><!-- /.container -->
+
+    <!-- Include all compiled plugins (below), or include individual files as needed -->
+    <script src="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.4/js/bootstrap.min.js"></script>
+    <script src="/js/codetabs.js"></script>
+    <script src="/js/stickysidebar.js"></script>
+
+
+    <!-- Google Analytics -->
+    <script>
+      (function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){
+      (i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new Date();a=s.createElement(o),
+      m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)
+      })(window,document,'script','//www.google-analytics.com/analytics.js','ga');
+
+      ga('create', 'UA-52545728-1', 'auto');
+      ga('send', 'pageview');
+    </script>
+  </body>
+</html>

http://git-wip-us.apache.org/repos/asf/flink-web/blob/9ec0a879/content/news/2015/06/24/announcing-apache-flink-0.9.0-release.html
----------------------------------------------------------------------
diff --git a/content/news/2015/06/24/announcing-apache-flink-0.9.0-release.html b/content/news/2015/06/24/announcing-apache-flink-0.9.0-release.html
new file mode 100644
index 0000000..828783b
--- /dev/null
+++ b/content/news/2015/06/24/announcing-apache-flink-0.9.0-release.html
@@ -0,0 +1,431 @@
+<!DOCTYPE html>
+<html lang="en">
+  <head>
+    <meta charset="utf-8">
+    <meta http-equiv="X-UA-Compatible" content="IE=edge">
+    <meta name="viewport" content="width=device-width, initial-scale=1">
+    <!-- The above 3 meta tags *must* come first in the head; any other head content must come *after* these tags -->
+    <title>Apache Flink: Announcing Apache Flink 0.9.0</title>
+    <link rel="shortcut icon" href="/favicon.ico" type="image/x-icon">
+    <link rel="icon" href="/favicon.ico" type="image/x-icon">
+
+    <!-- Bootstrap -->
+    <link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.4/css/bootstrap.min.css">
+    <link rel="stylesheet" href="/css/flink.css">
+    <link rel="stylesheet" href="/css/syntax.css">
+
+    <!-- Blog RSS feed -->
+    <link href="/blog/feed.xml" rel="alternate" type="application/rss+xml" title="Apache Flink Blog: RSS feed" />
+
+    <!-- jQuery (necessary for Bootstrap's JavaScript plugins) -->
+    <!-- We need to load Jquery in the header for custom google analytics event tracking-->
+    <script src="https://ajax.googleapis.com/ajax/libs/jquery/1.11.2/jquery.min.js"></script>
+
+    <!-- HTML5 shim and Respond.js for IE8 support of HTML5 elements and media queries -->
+    <!-- WARNING: Respond.js doesn't work if you view the page via file:// -->
+    <!--[if lt IE 9]>
+      <script src="https://oss.maxcdn.com/html5shiv/3.7.2/html5shiv.min.js"></script>
+      <script src="https://oss.maxcdn.com/respond/1.4.2/respond.min.js"></script>
+    <![endif]-->
+  </head>
+  <body>  
+    
+
+    <!-- Main content. -->
+    <div class="container">
+    <div class="row">
+
+      
+     <div id="sidebar" class="col-sm-3">
+          <!-- Top navbar. -->
+    <nav class="navbar navbar-default">
+        <!-- The logo. -->
+        <div class="navbar-header">
+          <button type="button" class="navbar-toggle collapsed" data-toggle="collapse" data-target="#bs-example-navbar-collapse-1">
+            <span class="icon-bar"></span>
+            <span class="icon-bar"></span>
+            <span class="icon-bar"></span>
+          </button>
+          <div class="navbar-logo">
+            <a href="/">
+              <img alt="Apache Flink" src="/img/navbar-brand-logo.png" width="147px" height="73px">
+            </a>
+          </div>
+        </div><!-- /.navbar-header -->
+
+        <!-- The navigation links. -->
+        <div class="collapse navbar-collapse" id="bs-example-navbar-collapse-1">
+          <ul class="nav navbar-nav navbar-main">
+
+            <!-- Downloads -->
+            <li class=""><a class="btn btn-info" href="/downloads.html">Download Flink</a></li>
+
+            <!-- Overview -->
+            <li><a href="/index.html">Home</a></li>
+
+            <!-- Intro -->
+            <li><a href="/introduction.html">Introduction to Flink</a></li>
+
+            <!-- Use cases -->
+            <li><a href="/usecases.html">Flink Use Cases</a></li>
+
+            <!-- Powered by -->
+            <li><a href="/poweredby.html">Powered by Flink</a></li>
+
+            <!-- Ecosystem -->
+            <li><a href="/ecosystem.html">Ecosystem</a></li>
+
+            <!-- Community -->
+            <li><a href="/community.html">Community &amp; Project Info</a></li>
+
+            <!-- Contribute -->
+            <li><a href="/how-to-contribute.html">How to Contribute</a></li>
+
+            <!-- Blog -->
+            <li class=" active hidden-md hidden-sm"><a href="/blog/"><b>Flink Blog</b></a></li>
+
+            <hr />
+
+
+
+            <!-- Documentation -->
+            <!-- <li>
+              <a href="http://ci.apache.org/projects/flink/flink-docs-release-1.1" target="_blank">Documentation <small><span class="glyphicon glyphicon-new-window"></span></small></a>
+            </li> -->
+            <li class="dropdown">
+              <a class="dropdown-toggle" data-toggle="dropdown" href="#">Documentation
+                <span class="caret"></span></a>
+                <ul class="dropdown-menu">
+                  <li><a href="http://ci.apache.org/projects/flink/flink-docs-release-1.1" target="_blank">1.1 (Latest stable release) <small><span class="glyphicon glyphicon-new-window"></span></small></a></li>
+                  <li><a href="http://ci.apache.org/projects/flink/flink-docs-release-1.2" target="_blank">1.2 (Snapshot) <small><span class="glyphicon glyphicon-new-window"></span></small></a></li>
+                </ul>
+              </li>
+
+            <!-- Quickstart -->
+            <li>
+              <a href="http://ci.apache.org/projects/flink/flink-docs-release-1.1/quickstart/setup_quickstart.html" target="_blank">Quickstart <small><span class="glyphicon glyphicon-new-window"></span></small></a>
+            </li>
+
+            <!-- GitHub -->
+            <li>
+              <a href="https://github.com/apache/flink" target="_blank">Flink on GitHub <small><span class="glyphicon glyphicon-new-window"></span></small></a>
+            </li>
+
+
+
+
+
+
+          </ul>
+
+
+
+          <ul class="nav navbar-nav navbar-bottom">
+          <hr />
+
+            <!-- FAQ -->
+            <li ><a href="/faq.html">Project FAQ</a></li>
+
+            <!-- Twitter -->
+            <li><a href="https://twitter.com/apacheflink" target="_blank">@ApacheFlink <small><span class="glyphicon glyphicon-new-window"></span></small></a></li>
+
+            <!-- Visualizer -->
+            <li class=" hidden-md hidden-sm"><a href="/visualizer/" target="_blank">Plan Visualizer <small><span class="glyphicon glyphicon-new-window"></span></small></a></li>
+
+          </ul>
+        </div><!-- /.navbar-collapse -->
+    </nav>
+
+      </div>
+      <div class="col-sm-9">
+      <div class="row-fluid">
+  <div class="col-sm-12">
+    <div class="row">
+      <h1>Announcing Apache Flink 0.9.0</h1>
+
+      <article>
+        <p>24 Jun 2015</p>
+
+<p>The Apache Flink community is pleased to announce the availability of the 0.9.0 release. The release is the result of many months of hard work within the Flink community. It contains many new features and improvements which were previewed in the 0.9.0-milestone1 release and have been polished since then. This is the largest Flink release so far.</p>
+
+<p><a href="http://flink.apache.org/downloads.html">Download the release</a> and check out <a href="http://ci.apache.org/projects/flink/flink-docs-release-0.9/">the documentation</a>. Feedback through the Flink <a href="http://flink.apache.org/community.html#mailing-lists">mailing lists</a> is, as always, very welcome!</p>
+
+<h2 id="new-features">New Features</h2>
+
+<h3 id="exactly-once-fault-tolerance-for-streaming-programs">Exactly-once Fault Tolerance for streaming programs</h3>
+
+<p>This release introduces a new fault tolerance mechanism for streaming dataflows. The new checkpointing algorithm takes data sources and also user-defined state into account and recovers failures such that all records are reflected exactly once in the operator states.</p>
+
+<p>The checkpointing algorithm is lightweight and driven by barriers that are periodically injected into the data streams at the sources. As such, it has an extremely low coordination overhead and is able to sustain very high throughput rates. User-defined state can be automatically backed up to configurable storage by the fault tolerance mechanism.</p>
+
+<p>Please refer to <a href="http://ci.apache.org/projects/flink/flink-docs-release-0.9/apis/streaming_guide.html#stateful-computation">the documentation on stateful computation</a> for details on how to use fault tolerant data streams with Flink.</p>
+
+<p>The fault tolerance mechanism requires data sources that can replay recent parts of the stream, such as <a href="http://kafka.apache.org">Apache Kafka</a>. Read more <a href="http://ci.apache.org/projects/flink/flink-docs-release-0.9/apis/streaming_guide.html#apache-kafka">about how to use the persistent Kafka source</a>.</p>
+
+<h3 id="table-api">Table API</h3>
+
+<p>Flink’s new Table API offers a higher-level abstraction for interacting with structured data sources. The Table API allows users to execute logical, SQL-like queries on distributed data sets while allowing them to freely mix declarative queries with regular Flink operators. Here is an example that groups and joins two tables:</p>
+
+<div class="highlight"><pre><code class="language-scala"><span class="k">val</span> <span class="n">clickCounts</span> <span class="k">=</span> <span class="n">clicks</span>
+  <span class="o">.</span><span class="n">groupBy</span><span class="o">(</span><span class="-Symbol">&#39;user</span><span class="o">).</span><span class="n">select</span><span class="o">(</span><span class="-Symbol">&#39;userId</span><span class="o">,</span> <span class="-Symbol">&#39;url</span><span class="o">.</span><span class="n">count</span> <span class="n">as</span> <span class="-Symbol">&#39;count</span><span class="o">)</span>
+
+<span class="k">val</span> <span class="n">activeUsers</span> <span class="k">=</span> <span class="n">users</span><span class="o">.</span><span class="n">join</span><span class="o">(</span><span class="n">clickCounts</span><span class="o">)</span>
+  <span class="o">.</span><span class="n">where</span><span class="o">(</span><span class="-Symbol">&#39;id</span> <span class="o">===</span> <span class="-Symbol">&#39;userId</span> <span class="o">&amp;&amp;</span> <span class="-Symbol">&#39;count</span> <span class="o">&gt;</span> <span class="mi">10</span><span class="o">).</span><span class="n">select</span><span class="o">(</span><span class="-Symbol">&#39;username</span><span class="o">,</span> <span class="-Symbol">&#39;count</span><span class="o">,</span> <span class="o">...)</span></code></pre></div>
+
+<p>Tables consist of logical attributes that can be selected by name rather than physical Java and Scala data types. This alleviates a lot of boilerplate code for common ETL tasks and raises the abstraction for Flink programs. Tables are available for both static and streaming data sources (DataSet and DataStream APIs).</p>
+
+<p><a href="http://ci.apache.org/projects/flink/flink-docs-release-0.9/libs/table.html">Check out the Table guide for Java and Scala</a>.</p>
+
+<h3 id="gelly-graph-processing-api">Gelly Graph Processing API</h3>
+
+<p>Gelly is a Java Graph API for Flink. It contains a set of utilities for graph analysis, support for iterative graph processing and a library of graph algorithms. Gelly exposes a Graph data structure that wraps DataSets for vertices and edges, as well as methods for creating graphs from DataSets, graph transformations and utilities (e.g., in- and out- degrees of vertices), neighborhood aggregations, iterative vertex-centric graph processing, as well as a library of common graph algorithms, including PageRank, SSSP, label propagation, and community detection.</p>
+
+<p>Gelly internally builds on top of Flink’s <a href="http://ci.apache.org/projects/flink/flink-docs-release-0.9/apis/iterations.html">delta iterations</a>. Iterative graph algorithms are executed leveraging mutable state, achieving similar performance to specialized graph processing systems.</p>
+
+<p>Gelly will eventually subsume Spargel, Flink’s Pregel-like API.</p>
+
+<p>Note: The Gelly library is still in beta status and subject to improvements and heavy performance tuning.</p>
+
+<p><a href="http://ci.apache.org/projects/flink/flink-docs-release-0.9/libs/gelly_guide.html">Check out the Gelly guide</a>.</p>
+
+<h3 id="flink-machine-learning-library">Flink Machine Learning Library</h3>
+
+<p>This release includes the first version of Flink’s Machine Learning library. The library’s pipeline approach, which has been strongly inspired by scikit-learn’s abstraction of transformers and predictors, makes it easy to quickly set up a data processing pipeline and to get your job done.</p>
+
+<p>Flink distinguishes between transformers and predictors. Transformers are components which transform your input data into a new format allowing you to extract features, cleanse your data or to sample from it. Predictors on the other hand constitute the components which take your input data and train a model on it. The model you obtain from the learner can then be evaluated and used to make predictions on unseen data.</p>
+
+<p>Currently, the machine learning library contains transformers and predictors to do multiple tasks. The library supports multiple linear regression using stochastic gradient descent to scale to large data sizes. Furthermore, it includes an alternating least squares (ALS) implementation to factorize large matrices. The matrix factorization can be used to do collaborative filtering. An implementation of the communication efficient distributed dual coordinate ascent (CoCoA) algorithm is the latest addition to the library. The CoCoA algorithm can be used to train distributed soft-margin SVMs.</p>
+
+<p>Note: The ML library is still in beta status and subject to improvements and heavy performance tuning.</p>
+
+<p><a href="http://ci.apache.org/projects/flink/flink-docs-release-0.9/libs/ml/">Check out FlinkML</a></p>
+
+<h3 id="flink-on-yarn-leveraging-apache-tez">Flink on YARN leveraging Apache Tez</h3>
+
+<p>We are introducing a new execution mode for Flink to be able to run restricted Flink programs on top of <a href="http://tez.apache.org">Apache Tez</a>. This mode retains Flink’s APIs, optimizer, as well as Flink’s runtime operators, but instead of wrapping those in Flink tasks that are executed by Flink TaskManagers, it wraps them in Tez runtime tasks and builds a Tez DAG that represents the program.</p>
+
+<p>By using Flink on Tez, users have an additional choice for an execution platform for Flink programs. While Flink’s distributed runtime favors low latency, streaming shuffles, and iterative algorithms, Tez focuses on scalability and elastic resource usage in shared YARN clusters.</p>
+
+<p><a href="http://ci.apache.org/projects/flink/flink-docs-release-0.9/setup/flink_on_tez.html">Get started with Flink on Tez</a>.</p>
+
+<h3 id="reworked-distributed-runtime-on-akka">Reworked Distributed Runtime on Akka</h3>
+
+<p>Flink’s RPC system has been replaced by the widely adopted <a href="http://akka.io">Akka</a> framework. Akka’s concurrency model offers the right abstraction to develop a fast as well as robust distributed system. By using Akka’s own failure detection mechanism the stability of Flink’s runtime is significantly improved, because the system can now react in proper form to node outages. Furthermore, Akka improves Flink’s scalability by introducing asynchronous messages to the system. These asynchronous messages allow Flink to be run on many more nodes than before.</p>
+
+<h3 id="improved-yarn-support">Improved YARN support</h3>
+
+<p>Flink’s YARN client contains several improvements, such as a detached mode for starting a YARN session in the background, the ability to submit a single Flink job to a YARN cluster without starting a session, including a “fire and forget” mode. Flink is now also able to reallocate failed YARN containers to maintain the size of the requested cluster. This feature allows users to implement fault-tolerant setups on top of YARN. There is also an internal Java API to deploy and control a running YARN cluster. This is being used by system integrators to easily control Flink on YARN within their Hadoop 2 cluster.</p>
+
+<p><a href="http://ci.apache.org/projects/flink/flink-docs-release-0.9/setup/yarn_setup.html">See the YARN docs</a>.</p>
+
+<h3 id="static-code-analysis-for-the-flink-optimizer-opening-the-udf-blackboxes">Static Code Analysis for the Flink Optimizer: Opening the UDF blackboxes</h3>
+
+<p>This release introduces a first version of a static code analyzer that pre-interprets functions written by the user to get information about the function’s internal dataflow. The code analyzer can provide useful information about <a href="http://ci.apache.org/projects/flink/flink-docs-release-0.9/apis/programming_guide.html#semantic-annotations">forwarded fields</a> to Flink’s optimizer and thus speed up job executions. It also informs if the code contains obvious mistakes. For stability reasons, the code analyzer is initially disabled by default. It can be activated through</p>
+
+<p>ExecutionEnvironment.getExecutionConfig().setCodeAnalysisMode(…)</p>
+
+<p>either as an assistant that gives hints during the implementation or by directly applying the optimizations that have been found.</p>
+
+<h2 id="more-improvements-and-fixes">More Improvements and Fixes</h2>
+
+<ul>
+  <li>
+    <p><a href="https://issues.apache.org/jira/browse/FLINK-1605">FLINK-1605</a>: Flink is not exposing its Guava and ASM dependencies to Maven projects depending on Flink. We use the maven-shade-plugin to relocate these dependencies into our own namespace. This allows users to use any Guava or ASM version.</p>
+  </li>
+  <li>
+    <p><a href="https://issues.apache.org/jira/browse/FLINK-1417">FLINK-1417</a>: Automatic recognition and registration of Java Types at Kryo and the internal serializers: Flink has its own type handling and serialization framework falling back to Kryo for types that it cannot handle. To get the best performance Flink is automatically registering all types a user is using in their program with Kryo. Flink also registers serializers for Protocol Buffers, Thrift, Avro and Joda-Time automatically. Users can also manually register serializers to Kryo (https://issues.apache.org/jira/browse/FLINK-1399)</p>
+  </li>
+  <li>
+    <p><a href="https://issues.apache.org/jira/browse/FLINK-1296">FLINK-1296</a>: Add support for sorting very large records</p>
+  </li>
+  <li>
+    <p><a href="https://issues.apache.org/jira/browse/FLINK-1679">FLINK-1679</a>: “degreeOfParallelism” methods renamed to “parallelism”</p>
+  </li>
+  <li>
+    <p><a href="https://issues.apache.org/jira/browse/FLINK-1501">FLINK-1501</a>: Add metrics library for monitoring TaskManagers</p>
+  </li>
+  <li>
+    <p><a href="https://issues.apache.org/jira/browse/FLINK-1760">FLINK-1760</a>: Add support for building Flink with Scala 2.11</p>
+  </li>
+  <li>
+    <p><a href="https://issues.apache.org/jira/browse/FLINK-1648">FLINK-1648</a>: Add a mode where the system automatically sets the parallelism to the available task slots</p>
+  </li>
+  <li>
+    <p><a href="https://issues.apache.org/jira/browse/FLINK-1622">FLINK-1622</a>: Add groupCombine operator</p>
+  </li>
+  <li>
+    <p><a href="https://issues.apache.org/jira/browse/FLINK-1589">FLINK-1589</a>: Add option to pass Configuration to LocalExecutor</p>
+  </li>
+  <li>
+    <p><a href="https://issues.apache.org/jira/browse/FLINK-1504">FLINK-1504</a>: Add support for accessing secured HDFS clusters in standalone mode</p>
+  </li>
+  <li>
+    <p><a href="https://issues.apache.org/jira/browse/FLINK-1478">FLINK-1478</a>: Add strictly local input split assignment</p>
+  </li>
+  <li>
+    <p><a href="https://issues.apache.org/jira/browse/FLINK-1512">FLINK-1512</a>: Add CsvReader for reading into POJOs.</p>
+  </li>
+  <li>
+    <p><a href="https://issues.apache.org/jira/browse/FLINK-1461">FLINK-1461</a>: Add sortPartition operator</p>
+  </li>
+  <li>
+    <p><a href="https://issues.apache.org/jira/browse/FLINK-1450">FLINK-1450</a>: Add Fold operator to the Streaming api</p>
+  </li>
+  <li>
+    <p><a href="https://issues.apache.org/jira/browse/FLINK-1389">FLINK-1389</a>: Allow setting custom file extensions for files created by the FileOutputFormat</p>
+  </li>
+  <li>
+    <p><a href="https://issues.apache.org/jira/browse/FLINK-1236">FLINK-1236</a>: Add support for localization of Hadoop Input Splits</p>
+  </li>
+  <li>
+    <p><a href="https://issues.apache.org/jira/browse/FLINK-1179">FLINK-1179</a>: Add button to JobManager web interface to request stack trace of a TaskManager</p>
+  </li>
+  <li>
+    <p><a href="https://issues.apache.org/jira/browse/FLINK-1105">FLINK-1105</a>: Add support for locally sorted output</p>
+  </li>
+  <li>
+    <p><a href="https://issues.apache.org/jira/browse/FLINK-1688">FLINK-1688</a>: Add socket sink</p>
+  </li>
+  <li>
+    <p><a href="https://issues.apache.org/jira/browse/FLINK-1436">FLINK-1436</a>: Improve usability of command line interface</p>
+  </li>
+  <li>
+    <p><a href="https://issues.apache.org/jira/browse/FLINK-2174">FLINK-2174</a>: Allow comments in ‘slaves’ file</p>
+  </li>
+  <li>
+    <p><a href="https://issues.apache.org/jira/browse/FLINK-1698">FLINK-1698</a>: Add polynomial base feature mapper to ML library</p>
+  </li>
+  <li>
+    <p><a href="https://issues.apache.org/jira/browse/FLINK-1697">FLINK-1697</a>: Add alternating least squares algorithm for matrix factorization to ML library</p>
+  </li>
+  <li>
+    <p><a href="https://issues.apache.org/jira/browse/FLINK-1792">FLINK-1792</a>: FLINK-456 Improve TM Monitoring: CPU utilization, hide graphs by default and show summary only</p>
+  </li>
+  <li>
+    <p><a href="https://issues.apache.org/jira/browse/FLINK-1672">FLINK-1672</a>: Refactor task registration/unregistration</p>
+  </li>
+  <li>
+    <p><a href="https://issues.apache.org/jira/browse/FLINK-2001">FLINK-2001</a>: DistanceMetric cannot be serialized</p>
+  </li>
+  <li>
+    <p><a href="https://issues.apache.org/jira/browse/FLINK-1676">FLINK-1676</a>: enableForceKryo() is not working as expected</p>
+  </li>
+  <li>
+    <p><a href="https://issues.apache.org/jira/browse/FLINK-1959">FLINK-1959</a>: Accumulators BROKEN after Partitioning</p>
+  </li>
+  <li>
+    <p><a href="https://issues.apache.org/jira/browse/FLINK-1696">FLINK-1696</a>: Add multiple linear regression to ML library</p>
+  </li>
+  <li>
+    <p><a href="https://issues.apache.org/jira/browse/FLINK-1820">FLINK-1820</a>: Bug in DoubleParser and FloatParser - empty String is not cast to 0</p>
+  </li>
+  <li>
+    <p><a href="https://issues.apache.org/jira/browse/FLINK-1985">FLINK-1985</a>: Streaming does not correctly forward ExecutionConfig to runtime</p>
+  </li>
+  <li>
+    <p><a href="https://issues.apache.org/jira/browse/FLINK-1828">FLINK-1828</a>: Impossible to output data to an HBase table</p>
+  </li>
+  <li>
+    <p><a href="https://issues.apache.org/jira/browse/FLINK-1952">FLINK-1952</a>: Cannot run ConnectedComponents example: Could not allocate a slot on instance</p>
+  </li>
+  <li>
+    <p><a href="https://issues.apache.org/jira/browse/FLINK-1848">FLINK-1848</a>: Paths containing a Windows drive letter cannot be used in FileOutputFormats</p>
+  </li>
+  <li>
+    <p><a href="https://issues.apache.org/jira/browse/FLINK-1954">FLINK-1954</a>: Task Failures and Error Handling</p>
+  </li>
+  <li>
+    <p><a href="https://issues.apache.org/jira/browse/FLINK-2004">FLINK-2004</a>: Memory leak in presence of failed checkpoints in KafkaSource</p>
+  </li>
+  <li>
+    <p><a href="https://issues.apache.org/jira/browse/FLINK-2132">FLINK-2132</a>: Java version parsing is not working for OpenJDK</p>
+  </li>
+  <li>
+    <p><a href="https://issues.apache.org/jira/browse/FLINK-2098">FLINK-2098</a>: Checkpoint barrier initiation at source is not aligned with snapshotting</p>
+  </li>
+  <li>
+    <p><a href="https://issues.apache.org/jira/browse/FLINK-2069">FLINK-2069</a>: writeAsCSV function in DataStream Scala API creates no file</p>
+  </li>
+  <li>
+    <p><a href="https://issues.apache.org/jira/browse/FLINK-2092">FLINK-2092</a>: Document (new) behavior of print() and execute()</p>
+  </li>
+  <li>
+    <p><a href="https://issues.apache.org/jira/browse/FLINK-2177">FLINK-2177</a>: NullPointer in task resource release</p>
+  </li>
+  <li>
+    <p><a href="https://issues.apache.org/jira/browse/FLINK-2054">FLINK-2054</a>: StreamOperator rework removed copy calls when passing output to a chained operator</p>
+  </li>
+  <li>
+    <p><a href="https://issues.apache.org/jira/browse/FLINK-2196">FLINK-2196</a>: Misplaced Class in flink-java SortPartitionOperator</p>
+  </li>
+  <li>
+    <p><a href="https://issues.apache.org/jira/browse/FLINK-2191">FLINK-2191</a>: Inconsistent use of Closure Cleaner in Streaming API</p>
+  </li>
+  <li>
+    <p><a href="https://issues.apache.org/jira/browse/FLINK-2206">FLINK-2206</a>: JobManager webinterface shows 5 finished jobs at most</p>
+  </li>
+  <li>
+    <p><a href="https://issues.apache.org/jira/browse/FLINK-2188">FLINK-2188</a>: Reading from big HBase Tables</p>
+  </li>
+  <li>
+    <p><a href="https://issues.apache.org/jira/browse/FLINK-1781">FLINK-1781</a>: Quickstarts broken due to Scala Version Variables</p>
+  </li>
+</ul>
+
+<h2 id="notice">Notice</h2>
+
+<p>The 0.9 series of Flink is the last version to support Java 6. If you are still using Java 6, please consider upgrading to Java 8 (Java 7 ended its free support in April 2015).</p>
+
+<p>Flink will require at least Java 7 in major releases after 0.9.0.</p>
+
+      </article>
+    </div>
+
+    <div class="row">
+      <div id="disqus_thread"></div>
+      <script type="text/javascript">
+        /* * * CONFIGURATION VARIABLES: EDIT BEFORE PASTING INTO YOUR WEBPAGE * * */
+        var disqus_shortname = 'stratosphere-eu'; // required: replace example with your forum shortname
+        // The embed below is requested over explicit HTTPS: a protocol-relative URL resolves to file:// for local views and to http:// on insecure pages.
+        /* * * DON'T EDIT BELOW THIS LINE * * */
+        (function() {
+            var dsq = document.createElement('script'); dsq.type = 'text/javascript'; dsq.async = true;
+            dsq.src = 'https://' + disqus_shortname + '.disqus.com/embed.js';
+             (document.getElementsByTagName('head')[0] || document.getElementsByTagName('body')[0]).appendChild(dsq);
+        })();
+      </script>
+    </div>
+  </div>
+</div>
+      </div>
+    </div>
+
+    <hr />
+
+    <div class="row">
+      <div class="footer text-center col-sm-12">
+        <p>Copyright © 2014-2016 <a href="http://apache.org">The Apache Software Foundation</a>. All Rights Reserved.</p>
+        <p>Apache Flink, Apache, and the Apache feather logo are either registered trademarks or trademarks of The Apache Software Foundation.</p>
+        <p><a href="/privacy-policy.html">Privacy Policy</a> &middot; <a href="/blog/feed.xml">RSS feed</a></p>
+      </div>
+    </div>
+    </div><!-- /.container -->
+
+    <!-- Include all compiled plugins (below), or include individual files as needed -->
+    <script src="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.4/js/bootstrap.min.js"></script>
+    <script src="/js/codetabs.js"></script>
+    <script src="/js/stickysidebar.js"></script>
+
+
+    <!-- Google Analytics -->
+    <script>
+      (function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){
+      (i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new Date();a=s.createElement(o),
+      m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)
+      })(window,document,'script','https://www.google-analytics.com/analytics.js','ga');
+      // analytics.js is requested over explicit HTTPS (not protocol-relative) so it never loads via http:// or file://.
+      ga('create', 'UA-52545728-1', 'auto');
+      ga('send', 'pageview');
+    </script>
+  </body>
+</html>


Mime
View raw message