flink-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From trohrm...@apache.org
Subject flink git commit: [FLINK-2073] [ml] [docs] Adds contribution guide. Adds links to FlinkML main site.
Date Wed, 27 May 2015 13:43:52 GMT
Repository: flink
Updated Branches:
  refs/heads/master 63edeca85 -> c77947e94


[FLINK-2073] [ml] [docs] Adds contribution guide. Adds links to FlinkML main site.

Adds html titles. Adds explicit operation section to all operators.

This closes #727.


Project: http://git-wip-us.apache.org/repos/asf/flink/repo
Commit: http://git-wip-us.apache.org/repos/asf/flink/commit/c77947e9
Tree: http://git-wip-us.apache.org/repos/asf/flink/tree/c77947e9
Diff: http://git-wip-us.apache.org/repos/asf/flink/diff/c77947e9

Branch: refs/heads/master
Commit: c77947e94767565cc4f947a4ba86a960258a052a
Parents: 63edeca
Author: Till Rohrmann <trohrmann@apache.org>
Authored: Tue May 26 18:45:01 2015 +0200
Committer: Till Rohrmann <trohrmann@apache.org>
Committed: Wed May 27 15:43:31 2015 +0200

----------------------------------------------------------------------
 docs/_layouts/base.html                         |   4 +
 docs/libs/ml/als.md                             |  21 +-
 docs/libs/ml/cocoa.md                           |  20 +-
 docs/libs/ml/contribution_guide.md              | 347 ++++++++++++++++++-
 docs/libs/ml/distance_metrics.md                |   3 +-
 docs/libs/ml/index.md                           |  15 +-
 docs/libs/ml/multiple_linear_regression.md      |  22 +-
 docs/libs/ml/optimization.md                    |   3 +-
 docs/libs/ml/pipelines.md                       |   3 +-
 docs/libs/ml/polynomial_base_feature_mapper.md  |  91 -----
 docs/libs/ml/polynomial_features.md             | 108 ++++++
 docs/libs/ml/quickstart.md                      |   3 +-
 docs/libs/ml/standard_scaler.md                 |  26 +-
 docs/libs/ml/vision_roadmap.md                  |   3 +-
 docs/page/css/flink.css                         |   7 +-
 flink-staging/flink-ml/pom.xml                  |   4 +-
 .../src/test/resources/log4j-test.properties    |   2 +-
 17 files changed, 563 insertions(+), 119 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/flink/blob/c77947e9/docs/_layouts/base.html
----------------------------------------------------------------------
diff --git a/docs/_layouts/base.html b/docs/_layouts/base.html
index d4e74ef..dfd0f65 100644
--- a/docs/_layouts/base.html
+++ b/docs/_layouts/base.html
@@ -23,7 +23,11 @@ under the License.
     <meta http-equiv="X-UA-Compatible" content="IE=edge">
     <meta name="viewport" content="width=device-width, initial-scale=1">
     <!-- The above 3 meta tags *must* come first in the head; any other head content must
come *after* these tags -->
+    {% if page.htmlTitle %}
+    <title>Apache Flink {{ site.version}} Documentation: {{ page.htmlTitle }}</title>
+    {% else %}
     <title>Apache Flink {{ site.version}} Documentation: {{ page.title }}</title>
+    {% endif %}
     <link rel="shortcut icon" href="{{ site.baseurl }}/page/favicon.ico" type="image/x-icon">
     <link rel="icon" href="{{ site.baseurl }}/page/favicon.ico" type="image/x-icon">
 

http://git-wip-us.apache.org/repos/asf/flink/blob/c77947e9/docs/libs/ml/als.md
----------------------------------------------------------------------
diff --git a/docs/libs/ml/als.md b/docs/libs/ml/als.md
index 6b9e10f..1b06fd7 100644
--- a/docs/libs/ml/als.md
+++ b/docs/libs/ml/als.md
@@ -1,6 +1,7 @@
 ---
 mathjax: include
-title: Alternating Least Squares
+htmlTitle: FlinkML - Alternating Least Squares
+title: <a href="/libs/ml">FlinkML</a> - Alternating Least Squares
 ---
 <!--
 Licensed to the Apache Software Foundation (ASF) under one
@@ -47,6 +48,22 @@ By applying this step alternately to the matrices $U$ and $V$, we can iterativel
 
 The matrix $R$ is given in its sparse representation as a tuple of $(i, j, r)$ where $i$
denotes the row index, $j$ the column index and $r$ is the matrix value at position $(i,j)$.
 
+## Operations
+
+`ALS` is a `Predictor`.
+As such, it supports the `fit` and `predict` operations.
+
+### Fit
+
+ALS is trained on the sparse representation of the rating matrix: 
+
+* `fit: DataSet[(Int, Int, Double)] => Unit` 
+
+### Predict
+
+ALS predicts for each tuple of row and column index the rating: 
+
+* `predict: DataSet[(Int, Int)] => DataSet[(Int, Int, Double)]`
 
 ## Parameters
 
@@ -97,7 +114,7 @@ The alternating least squares implementation can be controlled by the following
             The fewer blocks one uses, the less data is sent redundantly. 
             However, bigger blocks entail bigger update messages which have to be stored
on the heap. 
             If the algorithm fails because of an OutOfMemoryException, then try to increase
the number of blocks. 
-            (Default value: '''None''')
+            (Default value: <strong>None</strong>)
           </p>
         </td>
       </tr>

http://git-wip-us.apache.org/repos/asf/flink/blob/c77947e9/docs/libs/ml/cocoa.md
----------------------------------------------------------------------
diff --git a/docs/libs/ml/cocoa.md b/docs/libs/ml/cocoa.md
index cd13806..8aea886 100644
--- a/docs/libs/ml/cocoa.md
+++ b/docs/libs/ml/cocoa.md
@@ -1,6 +1,7 @@
 ---
 mathjax: include
-title: Communication efficient distributed dual coordinate ascent (CoCoA)
+htmlTitle: FlinkML - Communication efficient distributed dual coordinate ascent (CoCoA)
+title: <a href="/libs/ml">FlinkML</a> - Communication efficient distributed dual
coordinate ascent (CoCoA)
 ---
 <!--
 Licensed to the Apache Software Foundation (ASF) under one
@@ -56,6 +57,23 @@ distributed across the cluster.
 The implementation of this algorithm is based on the work of 
 [Jaggi et al.](http://arxiv.org/abs/1409.1458)
 
+## Operations
+
+`CoCoA` is a `Predictor`.
+As such, it supports the `fit` and `predict` operations.
+
+### Fit
+
+CoCoA is trained given a set of `LabeledVector`: 
+
+* `fit: DataSet[LabeledVector] => Unit`
+
+### Predict
+
+CoCoA predicts for all subtypes of `Vector` the corresponding class label: 
+
+* `predict[T <: Vector]: DataSet[T] => DataSet[LabeledVector]`
+
 ## Parameters
 
 The CoCoA implementation can be controlled by the following parameters:

http://git-wip-us.apache.org/repos/asf/flink/blob/c77947e9/docs/libs/ml/contribution_guide.md
----------------------------------------------------------------------
diff --git a/docs/libs/ml/contribution_guide.md b/docs/libs/ml/contribution_guide.md
index e0db10a..6b139a6 100644
--- a/docs/libs/ml/contribution_guide.md
+++ b/docs/libs/ml/contribution_guide.md
@@ -1,5 +1,7 @@
 ---
-title: "FlinkML - Contribution guide"
+mathjax: include
+htmlTitle: FlinkML - How to Contribute 
+title: <a href="/libs/ml">FlinkML</a> - How to Contribute
 ---
 <!--
 Licensed to the Apache Software Foundation (ASF) under one
@@ -20,7 +22,348 @@ specific language governing permissions and limitations
 under the License.
 -->
 
+The Flink community highly appreciates all sorts of contributions to FlinkML.
+FlinkML offers people interested in machine learning the opportunity to work on a highly active open source
project which makes scalable ML a reality.
+The following document describes how to contribute to FlinkML.
+
 * This will be replaced by the TOC
 {:toc}
 
-Coming soon. In the meantime, check our list of [open issues on JIRA](https://issues.apache.org/jira/browse/FLINK-1748?jql=component%20%3D%20%22Machine%20Learning%20Library%22%20AND%20project%20%3D%20FLINK%20AND%20resolution%20%3D%20Unresolved%20ORDER%20BY%20priority%20DESC)
+## Getting Started
+
+In order to get started first read Flink's [contribution guide](http://flink.apache.org/how-to-contribute.html).
+Everything from this guide also applies to FlinkML.
+
+## Pick a Topic
+
+If you are looking for some new ideas, then you should check out the list of [unresolved
issues on JIRA](https://issues.apache.org/jira/issues/?jql=component%20%3D%20%22Machine%20Learning%20Library%22%20AND%20project%20%3D%20FLINK%20AND%20resolution%20%3D%20Unresolved%20ORDER%20BY%20priority%20DESC).
+Once you decide to contribute to one of these issues, you should take ownership of it and
track your progress with this issue.
+That way, the other contributors know the state of the different issues and redundant work
is avoided.
+
+If you already know what you want to contribute to FlinkML all the better.
+It is still advisable to create a JIRA issue for your idea to tell the Flink community what
you want to do, though.
+
+## Testing
+
+New contributions should come with tests to verify the correct behavior of the algorithm.
+The tests help to maintain the algorithm's correctness throughout code changes, e.g. refactorings.
+
+We distinguish between unit tests, which are executed during Maven's test phase, and integration
tests, which are executed during maven's verify phase.
+Maven automatically makes this distinction by using the following naming rules:
+All test cases whose class name ends with a suffix fulfilling the regular expression `(IT|Integration)(Test|Suite|Case)`,
are considered integration tests.
+The rest are considered unit tests and should only test behavior which is local to the component
under test.
+
+An integration test is a test which requires the full Flink system to be started.
+In order to do that properly, all integration test cases have to mix in the trait `FlinkTestBase`.
+This trait will set the right `ExecutionEnvironment` so that the test will be executed on
a special `FlinkMiniCluster` designated for testing purposes.
+Thus, an integration test could look like the following:
+
+{% highlight scala %}
+class ExampleITSuite extends FlatSpec with FlinkTestBase {
+  behavior of "An example algorithm"
+  
+  it should "do something" in {
+    ...
+  }
+}
+{% endhighlight %}
+
+The test style does not have to be `FlatSpec` but can be any other scalatest `Suite` subclass.
+See [ScalaTest testing styles](http://scalatest.org/user_guide/selecting_a_style) for more
information.
+
+## Documentation
+
+When contributing new algorithms, it is required to add code comments describing the way
the algorithm works and its parameters with which the user can control its behavior.
+Additionally, we would like to encourage contributors to add this information to the online
documentation.
+The online documentation for FlinkML's components can be found in the directory `docs/libs/ml`.
+
+Every new algorithm is described by a single markdown file.
+This file should contain at least the following points:
+
+1. What does the algorithm do
+2. How does the algorithm work (or reference to description) 
+3. Parameter description with default values
+4. Code snippet showing how the algorithm is used
+
+In order to use latex syntax in the markdown file, you have to include `mathjax: include`
in the YAML front matter.
+ 
+{% highlight java %}
+---
+mathjax: include
+htmlTitle: FlinkML - Example title
+title: <a href="/libs/ml">FlinkML</a> - Example title
+---
+{% endhighlight %}
+
+In order to use displayed mathematics, you have to put your latex code in `$$ ... $$`.
+For in-line mathematics, use `$ ... $`.
+Additionally some predefined latex commands are included into the scope of your markdown
file.
+See `docs/_include/latex_commands.html` for the complete list of predefined latex commands.
+
+## Contributing
+
+Once you have implemented the algorithm with adequate test coverage and added documentation,
you are ready to open a pull request.
+Details of how to open a pull request can be found [here](http://flink.apache.org/how-to-contribute.html#contributing-code--documentation).

+
+## How to Implement a Pipeline Operator
+
+FlinkML follows the principle to make machine learning as easy and accessible as possible.
+Therefore, it supports a flexible pipelining mechanism which allows users to quickly define
their analysis pipelines consisting of a multitude of different components.
+The base trait of all pipeline operators is the `Estimator`.
+The `Estimator` defines a `fit` method which is used to fit the operator to training data.
+Every operator which learns from data has to implement this method.
+Two sub-classes of `Estimator` are the `Transformer` and the `Predictor`.
+A pipeline operator is an implementation of one of these sub-classes.
+
+A `Transformer` defines a `transform` method which takes input data and transforms it into
output data.
+`Transformer`s can be stateful or stateless depending on whether they learn from training
data or not.
+A scaling transformer which changes the mean and variance of its input data according to
the mean and variance of some training data is an example of a stateful `Transformer`.
+In contrast, a transformer which maps feature vectors into the polynomial space is an example
of a stateless `Transformer`.
+In general, `Transformer` can be thought of as constituting the pre-processing steps in your
data pipeline.
+
+A `Predictor` defines a `predict` method which takes testing data and calculates predictions
for this data.
+In order to do this calculation, a `Predictor` first has to fit a model to training data.
+This happens in the `fit` method which it inherits from `Estimator`.
+The trained model is then used in the `predict` method to make the predictions for unseen
data.
+Thus, a `Predictor` is defined by the trinity of model, training logic and prediction logic.
+A support vector machine, which is first trained to obtain the support vectors and then used
to classify data points, is an example of a `Predictor`.
+
+An arbitrary number of `Transformer`s with an optionally trailing `Predictor` can be chained
together to form a pipeline if the input and output types are compatible.
+Each operator of such a pipeline constitutes a certain task of your analysis, e.g. data centering,
feature selection or model training.
+Therefore, the pipeline abstraction gives you a convenient abstraction to solve complex analysis
tasks.
+An in-depth description of FlinkML's pipelining mechanism can be found [here]({{site.baseurl}}/libs/ml/pipelines.html).
+In order to support FlinkML's pipelining, algorithms have to adhere to a certain design pattern,
which we will describe next.
+
+Let's assume that we want to implement a pipeline operator which changes the mean of your
data.
+Since centering data is a common pre-processing step in many analysis pipelines, we will implement
it as a `Transformer`.
+Therefore, we first create a `MeanTransformer` class which inherits from `Transformer`
+
+{% highlight scala %}
+class MeanTransformer extends Transformer[MeanTransformer] {}
+{% endhighlight %}
+
+Since we want to be able to configure the mean of the resulting data, we have to add a configuration
parameter.
+
+{% highlight scala %}
+class MeanTransformer extends Transformer[MeanTransformer] {
+  def setMean(mean: Double): this.type = {
+    parameters.add(MeanTransformer.Mean, mean)
+    this
+  }
+}
+
+object MeanTransformer {
+  case object Mean extends Parameter[Double] {
+    override val defaultValue: Option[Double] = Some(0.0)
+  }
+  
+  def apply(): MeanTransformer = new MeanTransformer
+}
+{% endhighlight %}
+
+Parameters are defined in the companion object of the transformer class and extend the `Parameter`
class.
+Since the parameter instances are supposed to act as immutable keys for a parameter map,
they should be implemented as `case objects`.
+The default value will be used if no other value has been set by the user of this component.
+If no default value has been specified, meaning that `defaultValue = None`, then the algorithm
has to handle this situation accordingly.
+
+We can now instantiate a `MeanTransformer` object and set the mean value of the transformed
data.
+But we still have to implement how the transformation works.
+The workflow can be separated into two phases.
+Within the first phase, the transformer learns the mean of the given training data.
+This knowledge can then be used in the second phase to transform the provided data with respect
to the configured resulting mean value.
+
+The learning of the mean can be implemented within the `fit` operation of our `Transformer`,
which it inherited from `Estimator`.
+Within the `fit` operation, a pipeline component is trained with respect to the given training
data.
+The algorithm is, however, **not** implemented by overriding the `fit` method but by providing
an implementation of a corresponding `FitOperation` for the correct type.
+Taking a look at the definition of the `fit` method in `Estimator`, which is the parent class
of `Transformer`, reveals why this is the case.
+
+{% highlight scala %}
+trait Estimator[Self] extends WithParameters with Serializable {
+  that: Self =>
+
+  def fit[Training](
+      training: DataSet[Training],
+      fitParameters: ParameterMap = ParameterMap.Empty)
+      (implicit fitOperation: FitOperation[Self, Training]): Unit = {
+    FlinkMLTools.registerFlinkMLTypes(training.getExecutionEnvironment)
+    fitOperation.fit(this, fitParameters, training)
+  }
+}
+{% endhighlight %}
+
+We see that the `fit` method is called with an input data set of type `Training`, an optional
parameter list and in the second parameter list with an implicit parameter of type `FitOperation`.
+Within the body of the function, first some machine learning types are registered and then
the `fit` method of the `FitOperation` parameter is called.
+The instance gives itself, the parameter map and the training data set as parameters to
the method.
+Thus, all the program logic takes place within the `FitOperation`.
+
+The `FitOperation` has two type parameters.
+The first defines the pipeline operator type for which this `FitOperation` shall work and
the second type parameter defines the type of the data set elements.
+If we first wanted to implement the `MeanTransformer` to work on `DenseVector`, we would,
thus, have to provide an implementation for `FitOperation[MeanTransformer, DenseVector]`.
+ 
+{% highlight scala %}
+val denseVectorMeanFitOperation = new FitOperation[MeanTransformer, DenseVector] {
+  override def fit(instance: MeanTransformer, fitParameters: ParameterMap, input: DataSet[DenseVector])
: Unit = {
+    import org.apache.flink.ml.math.Breeze._
+    val meanTrainingData: DataSet[DenseVector] = input
+      .map{ x => (x.asBreeze, 1) }
+      .reduce{
+        (left, right) => 
+          (left._1 + right._1, left._2 + right._2) 
+      }
+      .map{ p => (p._1/p._2).fromBreeze }
+  }
+}
+{% endhighlight %}
+
+A `FitOperation[T, I]` has a `fit` method which is called with an instance of type `T`, a
parameter map and an input `DataSet[I]`.
+In our case `T=MeanTransformer` and `I=DenseVector`.
+The parameter map is necessary if our fit step depends on some parameter values which were
not given directly at creation time of the `Transformer`.
+The `FitOperation` of the `MeanTransformer` sums the `DenseVector` instances of the given
input data set up and divides the result by the total number of vectors.
+That way, we obtain a `DataSet[DenseVector]` with a single element which is the mean value.
+
+But if we look closely at the implementation, we see that the result of the mean computation
is never stored anywhere.
+If we want to use this knowledge in a later step to adjust the mean of some other input,
we have to keep it around.
+And here is where the parameter of type `MeanTransformer` which is given to the `fit` method
comes into play.
+We can use this instance to store state, which is used by a subsequent `transform` operation
which works on the same object.
+But first we have to extend `MeanTransformer` by a member field and then adjust the `FitOperation`
implementation.
+
+{% highlight scala %}
+class MeanTransformer extends Transformer[MeanTransformer] {
+  var meanOption: Option[DataSet[DenseVector]] = None
+
+  def setMean(mean: Double): this.type = {
+    parameters.add(MeanTransformer.Mean, mean)
+    this
+  }
+}
+
+val denseVectorMeanFitOperation = new FitOperation[MeanTransformer, DenseVector] {
+  override def fit(instance: MeanTransformer, fitParameters: ParameterMap, input: DataSet[DenseVector])
: Unit = {
+    import org.apache.flink.ml.math.Breeze._
+    
+    instance.meanOption = Some(input
+      .map{ x => (x.asBreeze, 1) }
+      .reduce{
+        (left, right) => 
+          (left._1 + right._1, left._2 + right._2) 
+      }
+      .map{ p => (p._1/p._2).fromBreeze })
+  }
+}
+{% endhighlight %}
+
+If we look at the `transform` method in `Transformer`, we will see that we also need an implementation
of `TransformOperation`.
+A possible mean transforming implementation could look like the following.
+
+{% highlight scala %}
+
+val denseVectorMeanTransformOperation = new TransformOperation[MeanTransformer, DenseVector,
DenseVector] {
+  override def transform(
+      instance: MeanTransformer, 
+      transformParameters: ParameterMap, 
+      input: DataSet[DenseVector]) 
+    : DataSet[DenseVector] = {
+    val resultingParameters = parameters ++ transformParameters
+    
+    val resultingMean = resultingParameters(MeanTransformer.Mean)
+    
+    instance.meanOption match {
+      case Some(trainingMean) => {
+        input.map{ new MeanTransformMapper(resultingMean) }.withBroadcastSet(trainingMean,
"trainingMean")
+      }
+      case None => throw new RuntimeException("MeanTransformer has not been fitted to
data.")
+    }
+  }
+}
+
+class MeanTransformMapper(resultingMean: Double) extends RichMapFunction[DenseVector, DenseVector]
{
+  var trainingMean: DenseVector = null
+
+  override def open(parameters: Configuration): Unit = {
+    trainingMean = getRuntimeContext().getBroadcastVariable[DenseVector]("trainingMean").get(0)
+  }
+  
+  override def map(vector: DenseVector): DenseVector = {
+    import org.apache.flink.ml.math.Breeze._
+    
+    val result = vector.asBreeze - trainingMean.asBreeze + resultingMean
+    
+    result.fromBreeze
+  }
+}
+{% endhighlight %}
+
+Now we have everything implemented to fit our `MeanTransformer` to a training data set of
`DenseVector` instances and to transform them.
+However, when we execute the `fit` operation
+
+{% highlight scala %}
+val trainingData: DataSet[DenseVector] = ...
+val meanTransformer = MeanTransformer()
+
+meanTransformer.fit(trainingData)
+{% endhighlight %}
+
+we receive the following error at runtime: `"There is no FitOperation defined for class MeanTransformer
which trains on a DataSet[org.apache.flink.ml.math.DenseVector]"`.
+The reason is that the Scala compiler could not find a fitting `FitOperation` value with
the right type parameters for the implicit parameter of the `fit` method.
+Therefore, it chose a fallback implicit value which gives you this error message at runtime.
+In order to make the compiler aware of our implementation, we have to define it as an implicit
value and put it in the scope of the `MeanTransformer`'s companion object.
+
+{% highlight scala %}
+object MeanTransformer{
+  implicit val denseVectorMeanFitOperation = new FitOperation[MeanTransformer, DenseVector]
...
+  
+  implicit val denseVectorMeanTransformOperation = new TransformOperation[MeanTransformer,
DenseVector, DenseVector] ...
+}
+{% endhighlight %}
+
+Now we can call `fit` and `transform` of our `MeanTransformer` with `DataSet[DenseVector]`
as input.
+Furthermore, we can now use this transformer as part of an analysis pipeline where we have
a `DenseVector` as input and expected output.
+
+{% highlight scala %}
+val trainingData: DataSet[DenseVector] = ...
+
+val mean = MeanTransformer().setMean(1.0)
+val polyFeatures = PolynomialFeatures().setDegree(3)
+
+val pipeline = mean.chainTransformer(polyFeatures)
+
+pipeline.fit(trainingData)
+{% endhighlight %}
+
+It is noteworthy that there is no additional code needed to enable chaining.
+The system automatically constructs the pipeline logic using the operations of the individual
components.
+
+So far everything works fine with `DenseVector`.
+But what happens, if we call our transformer with `LabeledVector` instead?
+{% highlight scala %}
+val trainingData: DataSet[LabeledVector] = ...
+
+val mean = MeanTransformer()
+
+mean.fit(trainingData)
+{% endhighlight %}
+
+As before we see the following exception upon execution of the program: `"There is no FitOperation
defined for class MeanTransformer which trains on a DataSet[org.apache.flink.ml.common.LabeledVector]"`.
+It is noteworthy that this exception is thrown in the pre-flight phase, which means that
the job has not been submitted to the runtime system.
+This has the advantage that you won't see a job which runs for a couple of days and then
fails because of an incompatible pipeline component.
+Type compatibility is, thus, checked at the very beginning for the complete job.
+
+In order to make the `MeanTransformer` work on `LabeledVector` as well, we have to provide
the corresponding operations.
+Consequently, we have to define a `FitOperation[MeanTransformer, LabeledVector]` and `TransformOperation[MeanTransformer,
LabeledVector, LabeledVector]` as implicit values in the scope of `MeanTransformer`'s companion
object.
+
+{% highlight scala %}
+object MeanTransformer {
+  implicit val labeledVectorFitOperation = new FitOperation[MeanTransformer, LabeledVector]
...
+  
+  implicit val labeledVectorTransformOperation = new TransformOperation[MeanTransformer,
LabeledVector, LabeledVector] ...
+}
+{% endhighlight %}
+
+If we wanted to implement a `Predictor` instead of a `Transformer`, then we would have to
provide a `FitOperation`, too.
+Moreover, a `Predictor` requires a `PredictOperation` which implements how predictions are
calculated from testing data.  
+
+
+
+
+
+
+

http://git-wip-us.apache.org/repos/asf/flink/blob/c77947e9/docs/libs/ml/distance_metrics.md
----------------------------------------------------------------------
diff --git a/docs/libs/ml/distance_metrics.md b/docs/libs/ml/distance_metrics.md
index 6f8b64d..f329864 100644
--- a/docs/libs/ml/distance_metrics.md
+++ b/docs/libs/ml/distance_metrics.md
@@ -1,6 +1,7 @@
 ---
 mathjax: include
-title: Distance Metrics
+htmlTitle: FlinkML - Distance Metrics
+title: <a href="/libs/ml">FlinkML</a> - Distance Metrics
 ---
 <!--
 Licensed to the Apache Software Foundation (ASF) under one

http://git-wip-us.apache.org/repos/asf/flink/blob/c77947e9/docs/libs/ml/index.md
----------------------------------------------------------------------
diff --git a/docs/libs/ml/index.md b/docs/libs/ml/index.md
index 344492d..d172bda 100644
--- a/docs/libs/ml/index.md
+++ b/docs/libs/ml/index.md
@@ -41,7 +41,7 @@ FlinkML currently supports the following algorithms:
 
 ### Data Preprocessing
 
-* [Polynomial Base Feature Mapper](polynomial_base_feature_mapper.html)
+* [Polynomial Features](polynomial_features.html)
 * [Standard Scaler](standard_scaler.html)
 
 ### Recommendation
@@ -65,7 +65,7 @@ Next, you have to add the FlinkML dependency to the `pom.xml` of your project.
 </dependency>
 {% endhighlight %}
 
-Now you can start defining your ML pipelines.
+Now you can start solving your analysis task.
 The following code snippet shows how easy it is to train a multiple linear regression model.
 
 {% highlight scala %}
@@ -101,7 +101,7 @@ val scaler = StandardScaler()
 val polyFeatures = PolynomialFeatures().setDegree(3)
 val mlr = MultipleLinearRegression()
 
-// Construct pipeline
+// Construct pipeline of standard scaler, polynomial features and multiple linear regression
 val pipeline = scaler.chainTransformer(polyFeatures).chainPredictor(mlr)
 
 // Train pipeline
@@ -111,10 +111,11 @@ pipeline.fit(trainingData)
 val predictions: DataSet[LabeledVector] = pipeline.predict(testingData)
 {% endhighlight %} 
 
-An in-depth description of FlinkML's pipelines and their internal workings can be found [here](pipelines.html)
+One can chain a `Transformer` to another `Transformer` or a set of chained `Transformers`
by calling the method `chainTransformer`.
+If one wants to chain a `Predictor` to a `Transformer` or a set of chained `Transformers`,
one has to call the method `chainPredictor`. 
+An in-depth description of FlinkML's pipelines and their internal workings can be found [here](pipelines.html).
 
 ## How to contribute
 
-Please check our [roadmap](vision_roadmap.html#roadmap) and [contribution guide](contribution_guide.html).

-You can also check out our list of
-[unresolved issues on JIRA](https://issues.apache.org/jira/browse/FLINK-1748?jql=component%20%3D%20%22Machine%20Learning%20Library%22%20AND%20project%20%3D%20FLINK%20AND%20resolution%20%3D%20Unresolved%20ORDER%20BY%20priority%20DESC)
+The Flink community welcomes all contributors who want to get involved in the development
of Flink and its libraries.
+In order to get quickly started with contributing to FlinkML, please read first the official
[contribution guide]({{site.baseurl}}/libs/ml/contribution_guide.html).
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/flink/blob/c77947e9/docs/libs/ml/multiple_linear_regression.md
----------------------------------------------------------------------
diff --git a/docs/libs/ml/multiple_linear_regression.md b/docs/libs/ml/multiple_linear_regression.md
index 85a925d..da7bbde 100644
--- a/docs/libs/ml/multiple_linear_regression.md
+++ b/docs/libs/ml/multiple_linear_regression.md
@@ -1,6 +1,7 @@
 ---
 mathjax: include
-title: "Multiple linear regression"
+htmlTitle: FlinkML - Multiple linear regression
+title: <a href="/libs/ml">FlinkML</a> - Multiple linear regression
 ---
 <!--
 Licensed to the Apache Software Foundation (ASF) under one
@@ -48,7 +49,7 @@ under the License.
   $$\nabla_{\mathbf{w}} S(\mathbf{w}, \mathbf{x_i}) = 2\left(\mathbf{w}^T\mathbf{x_i} -
     y\right)\mathbf{x_i}$$
 
-  The gradients are averaged and scaled. The scaling is defined by $\gamma = s/\sqrt{j}}$
+  The gradients are averaged and scaled. The scaling is defined by $\gamma = \frac{s}{\sqrt{j}}$
   with $s$ being the initial step size and $j$ being the current iteration number. The resulting
gradient is subtracted from the
   current weight vector giving the new weight vector for the next iteration:
 
@@ -58,6 +59,23 @@ under the License.
   The convergence criterion is the relative change in the sum of squared residuals:
 
   $$\frac{S_{k-1} - S_k}{S_{k-1}} < \rho$$
+  
+## Operations
+
+`MultipleLinearRegression` is a `Predictor`.
+As such, it supports the `fit` and `predict` operations.
+
+### Fit
+
+MultipleLinearRegression is trained on a set of `LabeledVector`: 
+
+* `fit: DataSet[LabeledVector] => Unit`
+
+### Predict
+
+MultipleLinearRegression predicts for all subtypes of `Vector` the corresponding regression
value: 
+
+* `predict[T <: Vector]: DataSet[T] => DataSet[LabeledVector]`
 
 ## Parameters
 

http://git-wip-us.apache.org/repos/asf/flink/blob/c77947e9/docs/libs/ml/optimization.md
----------------------------------------------------------------------
diff --git a/docs/libs/ml/optimization.md b/docs/libs/ml/optimization.md
index d3ee9fb..9495b3f 100644
--- a/docs/libs/ml/optimization.md
+++ b/docs/libs/ml/optimization.md
@@ -1,6 +1,7 @@
 ---
 mathjax: include
-title: "FlinkML - Optimization"
+htmlTitle: FlinkML - Optimization
+title: <a href="/libs/ml">FlinkML</a> - Optimization
 ---
 <!--
 Licensed to the Apache Software Foundation (ASF) under one

http://git-wip-us.apache.org/repos/asf/flink/blob/c77947e9/docs/libs/ml/pipelines.md
----------------------------------------------------------------------
diff --git a/docs/libs/ml/pipelines.md b/docs/libs/ml/pipelines.md
index 9e121c1..3bcd22a 100644
--- a/docs/libs/ml/pipelines.md
+++ b/docs/libs/ml/pipelines.md
@@ -1,6 +1,7 @@
 ---
 mathjax: include
-title: "Pipelines - In-depth Description"
+htmlTitle: FlinkML - Looking under the hood of pipelines
+title: <a href="/libs/ml">FlinkML</a> - Looking under the hood of pipelines
 ---
 <!--
 Licensed to the Apache Software Foundation (ASF) under one

http://git-wip-us.apache.org/repos/asf/flink/blob/c77947e9/docs/libs/ml/polynomial_base_feature_mapper.md
----------------------------------------------------------------------
diff --git a/docs/libs/ml/polynomial_base_feature_mapper.md b/docs/libs/ml/polynomial_base_feature_mapper.md
deleted file mode 100644
index 801e3cf..0000000
--- a/docs/libs/ml/polynomial_base_feature_mapper.md
+++ /dev/null
@@ -1,91 +0,0 @@
----
-mathjax: include
-title: Polynomial Base Feature Mapper
----
-<!--
-Licensed to the Apache Software Foundation (ASF) under one
-or more contributor license agreements.  See the NOTICE file
-distributed with this work for additional information
-regarding copyright ownership.  The ASF licenses this file
-to you under the Apache License, Version 2.0 (the
-"License"); you may not use this file except in compliance
-with the License.  You may obtain a copy of the License at
-
-  http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing,
-software distributed under the License is distributed on an
-"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-KIND, either express or implied.  See the License for the
-specific language governing permissions and limitations
-under the License.
--->
-
-* This will be replaced by the TOC
-{:toc}
-
-## Description
-
-The polynomial base feature mapper maps a vector into the polynomial feature space of degree
$d$.
-The dimension of the input vector determines the number of polynomial factors whose values
are the respective vector entries.
-Given a vector $(x, y, z, \ldots)^T$ the resulting feature vector looks like:
-
-$$\left(x, y, z, x^2, xy, y^2, yz, z^2, x^3, x^2y, x^2z, xy^2, xyz, xz^2, y^3, \ldots\right)^T$$
-
-Flink's implementation orders the polynomials in decreasing order of their degree.
-
-Given the vector $\left(3,2\right)^T$, the polynomial base feature vector of degree 3 would
look like
- 
- $$\left(3^3, 3^2\cdot2, 3\cdot2^2, 2^3, 3^2, 3\cdot2, 2^2, 3, 2\right)^T$$
-
-This transformer can be prepended to all `Transformer` and `Learner` implementations which
expect an input of type `LabeledVector`.
-
-## Parameters
-
-The polynomial base feature mapper can be controlled by the following parameters:
-
-<table class="table table-bordered">
-    <thead>
-      <tr>
-        <th class="text-left" style="width: 20%">Parameters</th>
-        <th class="text-center">Description</th>
-      </tr>
-    </thead>
-
-    <tbody>
-      <tr>
-        <td><strong>Degree</strong></td>
-        <td>
-          <p>
-            The maximum polynomial degree. 
-            (Default value: <strong>10</strong>)
-          </p>
-        </td>
-      </tr>
-    </tbody>
-  </table>
-
-## Examples
-
-{% highlight scala %}
-// Obtain the training data set
-val trainingDS: DataSet[LabeledVector] = ...
-
-// Setup polynomial base feature extractor of degree 3
-val polyBase = PolynomialBase()
-.setDegree(3)
-
-// Setup the multiple linear regression learner
-val mlr = MultipleLinearRegression()
-
-// Control the learner via the parameter map
-val parameters = ParameterMap()
-.add(MultipleLinearRegression.Iterations, 20)
-.add(MultipleLinearRegression.Stepsize, 0.5)
-
-// Create pipeline PolynomialBase -> MultipleLinearRegression
-val pipeline = polyBase.chainPredictor(mlr)
-
-// Learn the model
-pipeline.fit(trainingDS)
-{% endhighlight %}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/flink/blob/c77947e9/docs/libs/ml/polynomial_features.md
----------------------------------------------------------------------
diff --git a/docs/libs/ml/polynomial_features.md b/docs/libs/ml/polynomial_features.md
new file mode 100644
index 0000000..d25141f
--- /dev/null
+++ b/docs/libs/ml/polynomial_features.md
@@ -0,0 +1,108 @@
+---
+mathjax: include
+htmlTitle: FlinkML - Polynomial Features
+title: <a href="/libs/ml">FlinkML</a> - Polynomial Features
+---
+<!--
+Licensed to the Apache Software Foundation (ASF) under one
+or more contributor license agreements.  See the NOTICE file
+distributed with this work for additional information
+regarding copyright ownership.  The ASF licenses this file
+to you under the Apache License, Version 2.0 (the
+"License"); you may not use this file except in compliance
+with the License.  You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing,
+software distributed under the License is distributed on an
+"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+KIND, either express or implied.  See the License for the
+specific language governing permissions and limitations
+under the License.
+-->
+
+* This will be replaced by the TOC
+{:toc}
+
+## Description
+
+The polynomial features transformer maps a vector into the polynomial feature space of degree
$d$.
+The dimension of the input vector determines the number of polynomial factors whose values
are the respective vector entries.
+Given a vector $(x, y, z, \ldots)^T$ the resulting feature vector looks like:
+
+$$\left(x, y, z, x^2, xy, y^2, yz, z^2, x^3, x^2y, x^2z, xy^2, xyz, xz^2, y^3, \ldots\right)^T$$
+
+Flink's implementation orders the polynomials in decreasing order of their degree.
+
+Given the vector $\left(3,2\right)^T$, the polynomial features vector of degree 3 would look
like
+ 
+ $$\left(3^3, 3^2\cdot2, 3\cdot2^2, 2^3, 3^2, 3\cdot2, 2^2, 3, 2\right)^T$$
+
+This transformer can be prepended to all `Transformer` and `Predictor` implementations which
expect an input of type `LabeledVector` or any sub-type of `Vector`.
+
+## Operations
+
+`PolynomialFeatures` is a `Transformer`.
+As such, it supports the `fit` and `transform` operations.
+
+### Fit
+
+PolynomialFeatures is not trained on data and, thus, supports all types of input data.
+
+### Transform
+
+PolynomialFeatures transforms all subtypes of `Vector` and `LabeledVector` into their respective
types: 
+
+* `transform[T <: Vector]: DataSet[T] => DataSet[T]`
+* `transform: DataSet[LabeledVector] => DataSet[LabeledVector]`
+
+## Parameters
+
+The polynomial features transformer can be controlled by the following parameters:
+
+<table class="table table-bordered">
+    <thead>
+      <tr>
+        <th class="text-left" style="width: 20%">Parameters</th>
+        <th class="text-center">Description</th>
+      </tr>
+    </thead>
+
+    <tbody>
+      <tr>
+        <td><strong>Degree</strong></td>
+        <td>
+          <p>
+            The maximum polynomial degree. 
+            (Default value: <strong>10</strong>)
+          </p>
+        </td>
+      </tr>
+    </tbody>
+  </table>
+
+## Examples
+
+{% highlight scala %}
+// Obtain the training data set
+val trainingDS: DataSet[LabeledVector] = ...
+
+// Setup polynomial feature transformer of degree 3
+val polyFeatures = PolynomialFeatures()
+.setDegree(3)
+
+// Setup the multiple linear regression learner
+val mlr = MultipleLinearRegression()
+
+// Control the learner via the parameter map
+val parameters = ParameterMap()
+.add(MultipleLinearRegression.Iterations, 20)
+.add(MultipleLinearRegression.Stepsize, 0.5)
+
+// Create pipeline PolynomialFeatures -> MultipleLinearRegression
+val pipeline = polyFeatures.chainPredictor(mlr)
+
+// train the model
+pipeline.fit(trainingDS)
+{% endhighlight %}

http://git-wip-us.apache.org/repos/asf/flink/blob/c77947e9/docs/libs/ml/quickstart.md
----------------------------------------------------------------------
diff --git a/docs/libs/ml/quickstart.md b/docs/libs/ml/quickstart.md
index 43a3144..b354b73 100644
--- a/docs/libs/ml/quickstart.md
+++ b/docs/libs/ml/quickstart.md
@@ -1,5 +1,6 @@
 ---
-title: "FlinkML - Quickstart guide"
+htmlTitle: FlinkML - Quickstart Guide
+title: <a href="/libs/ml">FlinkML</a> - Quickstart Guide
 ---
 <!--
 Licensed to the Apache Software Foundation (ASF) under one

http://git-wip-us.apache.org/repos/asf/flink/blob/c77947e9/docs/libs/ml/standard_scaler.md
----------------------------------------------------------------------
diff --git a/docs/libs/ml/standard_scaler.md b/docs/libs/ml/standard_scaler.md
index 2ea21d6..86fbd03 100644
--- a/docs/libs/ml/standard_scaler.md
+++ b/docs/libs/ml/standard_scaler.md
@@ -1,6 +1,7 @@
 ---
 mathjax: include
-title: "Standard Scaler"
+htmlTitle: FlinkML - Standard Scaler
+title: <a href="/libs/ml">FlinkML</a> - Standard Scaler
 ---
 <!--
 Licensed to the Apache Software Foundation (ASF) under one
@@ -28,7 +29,7 @@ under the License.
 
  The standard scaler scales the given data set, so that all features will have a user specified
mean and variance. 
  In case the user does not provide a specific mean and standard deviation, the standard scaler
transforms the features of the input data set to have mean equal to 0 and standard deviation
equal to 1.
- Given a set of input data $x_{1}, x_{2},... x_{n}$, with mean: 
+ Given a set of input data $x_1, x_2,... x_n$, with mean: 
  
  $$\bar{x} = \frac{1}{n}\sum_{i=1}^{n}x_{i}$$ 
  
@@ -36,12 +37,31 @@ under the License.
  
  $$\sigma_{x}=\sqrt{ \frac{1}{n} \sum_{i=1}^{n}(x_{i}-\bar{x})^{2}}$$
 
-The scaled data set $z_{1}, z_{2},...,z_{n}$ will be:
+The scaled data set $z_1, z_2,...,z_n$ will be:
 
  $$z_{i}= std \left (\frac{x_{i} - \bar{x}  }{\sigma_{x}}\right ) + mean$$
 
 where $\textit{std}$ and $\textit{mean}$ are the user specified values for the standard deviation
and mean.
 
+## Operations
+
+`StandardScaler` is a `Transformer`.
+As such, it supports the `fit` and `transform` operations.
+
+### Fit
+
+StandardScaler is trained on all subtypes of `Vector` or `LabeledVector`: 
+
+* `fit[T <: Vector]: DataSet[T] => Unit` 
+* `fit: DataSet[LabeledVector] => Unit`
+
+### Transform
+
+StandardScaler transforms all subtypes of `Vector` or `LabeledVector` into the respective
type: 
+
+* `transform[T <: Vector]: DataSet[T] => DataSet[T]` 
+* `transform: DataSet[LabeledVector] => DataSet[LabeledVector]`
+
 ## Parameters
 
 The standard scaler implementation can be controlled by the following two parameters:

http://git-wip-us.apache.org/repos/asf/flink/blob/c77947e9/docs/libs/ml/vision_roadmap.md
----------------------------------------------------------------------
diff --git a/docs/libs/ml/vision_roadmap.md b/docs/libs/ml/vision_roadmap.md
index 1e319b6..7fcfada 100644
--- a/docs/libs/ml/vision_roadmap.md
+++ b/docs/libs/ml/vision_roadmap.md
@@ -1,5 +1,6 @@
 ---
-title: "FlinkML - Vision and Roadmap"
+htmlTitle: FlinkML - Vision and Roadmap
+title: <a href="/libs/ml">FlinkML</a> - Vision and Roadmap
 ---
 <!--
 Licensed to the Apache Software Foundation (ASF) under one

http://git-wip-us.apache.org/repos/asf/flink/blob/c77947e9/docs/page/css/flink.css
----------------------------------------------------------------------
diff --git a/docs/page/css/flink.css b/docs/page/css/flink.css
index 2a32910..9074e23 100644
--- a/docs/page/css/flink.css
+++ b/docs/page/css/flink.css
@@ -111,10 +111,11 @@ h2, h3 {
 	border-bottom: 1px solid #E5E5E5;
 }
 
-
 code {
-	background: none;
-	color: black;
+	background: #f5f5f5;
+  padding: 0;
+  color: #333333;
+  font-family: "Menlo", "Lucida Console", monospace;
 }
 
 pre {

http://git-wip-us.apache.org/repos/asf/flink/blob/c77947e9/flink-staging/flink-ml/pom.xml
----------------------------------------------------------------------
diff --git a/flink-staging/flink-ml/pom.xml b/flink-staging/flink-ml/pom.xml
index 43e6605..bc61e2e 100644
--- a/flink-staging/flink-ml/pom.xml
+++ b/flink-staging/flink-ml/pom.xml
@@ -117,7 +117,7 @@
 						</goals>
 						<configuration>
 							<suffixes>(?&lt;!(IT|Integration))(Test|Suite|Case)</suffixes>
-							<argLine>-Xms256m -Xmx800m -Dlog4j.configuration=${log4j.configuration} -Dmvn.forkNumber=${surefire.forkNumber}
-XX:-UseGCOverheadLimit</argLine>
+							<argLine>-Xms256m -Xmx800m -Dlog4j.configuration=${log4j.configuration} -XX:-UseGCOverheadLimit</argLine>
 						</configuration>
 					</execution>
 					<execution>
@@ -128,7 +128,7 @@
 						</goals>
 						<configuration>
 							<suffixes>(IT|Integration)(Test|Suite|Case)</suffixes>
-							<argLine>-Xms256m -Xmx800m -Dlog4j.configuration=${log4j.configuration} -Dmvn.forkNumber=${surefire.forkNumber}
-XX:-UseGCOverheadLimit</argLine>
+							<argLine>-Xms256m -Xmx800m -Dlog4j.configuration=${log4j.configuration} -XX:-UseGCOverheadLimit</argLine>
 						</configuration>
 					</execution>
 				</executions>

http://git-wip-us.apache.org/repos/asf/flink/blob/c77947e9/flink-staging/flink-ml/src/test/resources/log4j-test.properties
----------------------------------------------------------------------
diff --git a/flink-staging/flink-ml/src/test/resources/log4j-test.properties b/flink-staging/flink-ml/src/test/resources/log4j-test.properties
index 76b237e..023b23a 100644
--- a/flink-staging/flink-ml/src/test/resources/log4j-test.properties
+++ b/flink-staging/flink-ml/src/test/resources/log4j-test.properties
@@ -29,7 +29,7 @@ log4j.appender.console.layout.ConversionPattern=%d{HH:mm:ss,SSS} %-5p %-60c
%x -
 # File (use 'file')
 # -----------------------------------------------------------------------------
 log4j.appender.file=org.apache.log4j.FileAppender
-log4j.appender.file.file=${log.dir}/{$mvn.forkNumber}.log
+log4j.appender.file.file=${log.dir}/flinkML.log
 log4j.appender.file.append=false
 log4j.appender.file.layout=org.apache.log4j.PatternLayout
 log4j.appender.file.layout.ConversionPattern=%d{HH:mm:ss,SSS} %-5p %-60c %x - %m%n


Mime
View raw message