crunch-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From build...@apache.org
Subject svn commit: r927238 - in /websites/staging/crunch/trunk/content: ./ user-guide.html
Date Wed, 29 Oct 2014 03:38:43 GMT
Author: buildbot
Date: Wed Oct 29 03:38:42 2014
New Revision: 927238

Log:
Staging update by buildbot for crunch

Modified:
    websites/staging/crunch/trunk/content/   (props changed)
    websites/staging/crunch/trunk/content/user-guide.html

Propchange: websites/staging/crunch/trunk/content/
------------------------------------------------------------------------------
--- cms:source-revision (original)
+++ cms:source-revision Wed Oct 29 03:38:42 2014
@@ -1 +1 @@
-1635032
+1635033

Modified: websites/staging/crunch/trunk/content/user-guide.html
==============================================================================
--- websites/staging/crunch/trunk/content/user-guide.html (original)
+++ websites/staging/crunch/trunk/content/user-guide.html Wed Oct 29 03:38:42 2014
@@ -579,104 +579,100 @@ can be used to kick off a shuffle on the
   }
 </pre>
 
-<p>If you find yourself in a situation where you have a PCollection&lt;Pair&lt;K,
V&gt;&gt; and you need a PTable&lt;K, V&gt;, the
+<p>If you find yourself in a situation where you have a <code>PCollection&lt;Pair&lt;K,
V&gt;&gt;</code> and you need a <code>PTable&lt;K, V&gt;</code>,
the
 <a href="apidocs/0.10.0/org/apache/crunch/lib/PTables.html">PTables</a> library
class has methods that will do the conversion for you.</p>
 <p>Let's look at some more example PTypes created using the common primitive and collection
types. For most of your pipelines,
 you will use one type family exclusively, and so you can cut down on some of the boilerplate
in your classes by importing
 all of the methods from the <code>Writables</code> or <code>Avros</code>
classes into your class:</p>
-<pre>
-// Import all of the PType factory methods from Avros
-import static org.apache.crunch.types.avro.Avros.*;
+<div class="codehilite"><pre><span class="c1">// Import all of the PType
factory methods from Avros</span>
+<span class="kn">import</span> <span class="nn">static</span> <span
class="n">org</span><span class="p">.</span><span class="n">apache</span><span
class="p">.</span><span class="n">crunch</span><span class="p">.</span><span
class="n">types</span><span class="p">.</span><span class="n">avro</span><span
class="p">.</span><span class="n">Avros</span><span class="p">.</span><span
class="o">*</span><span class="p">;</span>
+
+<span class="kn">import</span> <span class="nn">org</span><span
class="p">.</span><span class="n">apache</span><span class="p">.</span><span
class="n">crunch</span><span class="p">.</span><span class="n">Pair</span><span
class="p">;</span>
+<span class="kn">import</span> <span class="nn">org</span><span
class="p">.</span><span class="n">apache</span><span class="p">.</span><span
class="n">crunch</span><span class="p">.</span><span class="n">Tuple3</span><span
class="p">;</span>
+<span class="kn">import</span> <span class="nn">org</span><span
class="p">.</span><span class="n">apache</span><span class="p">.</span><span
class="n">crunch</span><span class="p">.</span><span class="n">TupleN</span><span
class="p">;</span>
+
+<span class="kn">import</span> <span class="nn">java</span><span
class="p">.</span><span class="n">nio</span><span class="p">.</span><span
class="n">ByteBuffer</span><span class="p">;</span>
+<span class="kn">import</span> <span class="nn">java</span><span
class="p">.</span><span class="n">util</span><span class="p">.</span><span
class="n">Collection</span><span class="p">;</span>
+<span class="kn">import</span> <span class="nn">java</span><span
class="p">.</span><span class="n">util</span><span class="p">.</span><span
class="n">Map</span><span class="p">;</span>
+
+<span class="n">public</span> <span class="k">class</span> <span
class="n">MyPipeline</span> <span class="p">{</span>
+
+  <span class="c1">// Common primitive types</span>
+  <span class="n">PType</span><span class="o">&lt;</span><span
class="n">Integer</span><span class="o">&gt;</span> <span class="n">intType</span>
<span class="o">=</span> <span class="n">ints</span><span class="p">();</span>
+  <span class="n">PType</span><span class="o">&lt;</span><span
class="n">Long</span><span class="o">&gt;</span> <span class="n">longType</span>
<span class="o">=</span> <span class="n">longs</span><span class="p">();</span>
+  <span class="n">PType</span><span class="o">&lt;</span><span
class="n">Double</span><span class="o">&gt;</span> <span class="n">doubleType</span>
<span class="o">=</span> <span class="n">doubles</span><span class="p">();</span>
+  <span class="c1">// Bytes are represented by java.nio.ByteBuffer</span>
+  <span class="n">PType</span><span class="o">&lt;</span><span
class="n">ByteBuffer</span><span class="o">&gt;</span> <span class="n">bytesType</span>
<span class="o">=</span> <span class="n">bytes</span><span class="p">();</span>
+
+  <span class="c1">// A PTableType: using tableOf will return a PTable instead of a</span>
+  <span class="c1">// PCollection from a parallelDo call.</span>
+  <span class="n">PTableType</span><span class="o">&lt;</span><span
class="n">String</span><span class="p">,</span> <span class="n">Boolean</span><span
class="o">&gt;</span> <span class="n">tableType</span> <span class="o">=</span>
<span class="n">tableOf</span><span class="p">(</span><span class="n">strings</span><span
class="p">(),</span> <span class="n">booleans</span><span class="p">());</span>
+
+  <span class="c1">// Pair types: </span>
+  <span class="n">PType</span><span class="o">&lt;</span><span
class="n">Pair</span><span class="o">&lt;</span><span class="n">String</span><span
class="p">,</span> <span class="n">Boolean</span><span class="o">&gt;&gt;</span>
<span class="n">pairType</span> <span class="o">=</span> <span
class="n">pairs</span><span class="p">(</span><span class="n">strings</span><span
class="p">(),</span> <span class="n">booleans</span><span class="p">());</span>

+  <span class="n">PType</span><span class="o">&lt;</span><span
class="n">Pair</span><span class="o">&lt;</span><span class="n">String</span><span
class="p">,</span> <span class="n">Pair</span><span class="o">&lt;</span><span
class="n">Long</span><span class="p">,</span> <span class="n">Long</span><span
class="o">&gt;&gt;&gt;</span> <span class="n">nestedPairType</span>
<span class="o">=</span> <span class="n">pairs</span><span class="p">(</span><span
class="n">strings</span><span class="p">(),</span> <span class="n">pairs</span><span
class="p">(</span><span class="n">longs</span><span class="p">(),</span>
<span class="n">longs</span><span class="p">()));</span>
+
+  <span class="c1">// A triple</span>
+  <span class="n">PType</span><span class="o">&lt;</span><span
class="n">Tuple3</span><span class="o">&lt;</span><span class="n">Long</span><span
class="p">,</span> <span class="n">Float</span><span class="p">,</span>
<span class="n">Float</span><span class="o">&gt;&gt;</span>
<span class="n">tripType</span> <span class="o">=</span> <span
class="n">trips</span><span class="p">(</span><span class="n">longs</span><span
class="p">(),</span> <span class="n">floats</span><span class="p">(),</span>
<span class="n">floats</span><span class="p">());</span>
+  <span class="c1">// An arbitrary length tuple-- note that we lose the generic type
information</span>
+  <span class="n">PType</span><span class="o">&lt;</span><span
class="n">TupleN</span><span class="o">&gt;</span> <span class="n">tupleType</span>
<span class="o">=</span> <span class="n">tupleN</span><span class="p">(</span><span
class="n">ints</span><span class="p">(),</span> <span class="n">ints</span><span
class="p">(),</span> <span class="n">floats</span><span class="p">(),</span>
<span class="n">strings</span><span class="p">(),</span> <span
class="n">strings</span><span class="p">(),</span> <span class="n">ints</span><span
class="p">());</span>
+
+  <span class="c1">// A Collection type</span>
+  <span class="n">PType</span><span class="o">&lt;</span><span
class="n">Collection</span><span class="o">&lt;</span><span class="n">Long</span><span
class="o">&gt;&gt;</span> <span class="n">longsType</span> <span
class="o">=</span> <span class="n">collections</span><span class="p">(</span><span
class="n">longs</span><span class="p">());</span>
+  <span class="c1">// A Map Type-- note that the keys are always strings, we only specify
the value.</span>
+  <span class="n">PType</span><span class="o">&lt;</span><span
class="n">Map</span><span class="o">&lt;</span><span class="n">String</span><span
class="p">,</span> <span class="n">Boolean</span><span class="o">&gt;&gt;</span>
<span class="n">mapType</span> <span class="o">=</span> <span class="n">maps</span><span
class="p">(</span><span class="n">booleans</span><span class="p">());</span>
+
+  <span class="c1">// A Pair of collections</span>
+  <span class="n">PType</span><span class="o">&lt;</span><span
class="n">Pair</span><span class="o">&lt;</span><span class="n">Collection</span><span
class="o">&lt;</span><span class="n">String</span><span class="o">&gt;</span><span
class="p">,</span> <span class="n">Collection</span><span class="o">&lt;</span><span
class="n">Long</span><span class="o">&gt;&gt;&gt;</span>
<span class="n">pairColType</span> <span class="o">=</span> <span
class="n">pairs</span><span class="p">(</span>
+      <span class="n">collections</span><span class="p">(</span><span
class="n">strings</span><span class="p">()),</span>
+      <span class="n">collections</span><span class="p">(</span><span
class="n">longs</span><span class="p">()));</span>
+<span class="p">}</span>
+</pre></div>
 
-import org.apache.crunch.Pair;
-import org.apache.crunch.Tuple3;
-import org.apache.crunch.TupleN;
-
-import java.nio.ByteBuffer;
-import java.util.Collection;
-import java.util.Map;
-
-public class MyPipeline {
-
-  // Common primitive types
-  PType&lt;Integer&gt; intType = ints();
-  PType&lt;Long&gt; longType = longs();
-  PType&lt;Double&gt; doubleType = doubles();
-  // Bytes are represented by java.nio.ByteBuffer
-  PType&lt;ByteBuffer&gt; bytesType = bytes();
-
-  // A PTableType: using tableOf will return a PTable instead of a
-  // PCollection from a parallelDo call.
-  PTableType&lt;String, Boolean&gt; tableType = tableOf(strings(), booleans());
-
-  // Pair types: 
-  PType&lt;Pair&lt;String, Boolean&gt;&gt; pairType = pairs(strings(), booleans());

-  PType&lt;Pair&lt;String, Pair&lt;Long, Long&gt;&gt; nestedPairType
= pairs(strings(), pairs(longs(), longs()));
-
-  // A triple
-  PType&lt;Tuple3&lt;Long, Float, Float&gt;&gt; tripType = trips(longs(),
floats(), floats());
-  // An arbitrary length tuple-- note that we lose the generic type information
-  PType&lt;TupleN&gt; tupleType = tupleN(ints(), ints(), floats(), strings(), strings(),
ints());
-
-  // A Collection type
-  PType&lt;Collection&lt;Long&gt;&gt; longsType = collections(longs());
-  // A Map Type-- note that the keys are always strings, we only specify the value.
-  PType&lt;Map&lt;String, Boolean&gt;&gt; mapType = maps(booleans());
-
-  // A Pair of collections
-  PType&lt;Pair&lt;Collection&lt;String&gt;, Collection&lt;Long&gt;&gt;&gt;
pairColType = pairs(
-      collections(strings()),
-      collections(longs()));
-}
-</pre>
 
 <p>Both type families also have a method named <code>PType&lt;T&gt; records(Class&lt;T&gt;
clazz)</code> that can be used to create PTypes that support the common
 record format for each type family. For the WritableTypeFamily, the records method supports
PTypes for implementations of the <code>Writable</code>
 interface, and for the AvroTypeFamily, the records method supports PTypes for implementations
of Avro's <code>IndexedRecord</code> interface, which
 includes both Avro generic and specific records:</p>
-<pre>
-  PType&lt;FooWritable&gt; fwType1 = Writables.records(FooWritable.class);
-  // The more obvious "writables" method also works.
-  PType&lt;FooWritable&gt; fwType = Writables.writables(FooWritable.class);
-
-  // For a generated Avro class, this works:
-  PType&lt;Person&gt; personType1 = Avros.records(Person.class);
-  // As does this:
-  PType&lt;Person&gt; personType2 = Avros.containers(Person.class); 
-  // If you only have a schema, you can create a generic type, like this:
-  org.apache.avro.Schema schema = ...;
-  PType&lt;Record&gt; avroGenericType = Avros.generics(schema);
-</pre>
+<div class="codehilite"><pre><span class="n">PType</span><span
class="o">&lt;</span><span class="n">FooWritable</span><span class="o">&gt;</span>
<span class="n">fwType1</span> <span class="p">=</span> <span class="n">Writables</span><span
class="p">.</span><span class="n">records</span><span class="p">(</span><span
class="n">FooWritable</span><span class="p">.</span><span class="n">class</span><span
class="p">);</span>
+<span class="o">//</span> <span class="n">The</span> <span class="n">more</span>
<span class="n">obvious</span> &quot;<span class="n">writables</span>&quot;
<span class="n">method</span> <span class="n">also</span> <span
class="n">works</span><span class="p">.</span>
+<span class="n">PType</span><span class="o">&lt;</span><span
class="n">FooWritable</span><span class="o">&gt;</span> <span
class="n">fwType</span> <span class="p">=</span> <span class="n">Writables</span><span
class="p">.</span><span class="n">writables</span><span class="p">(</span><span
class="n">FooWritable</span><span class="p">.</span><span class="n">class</span><span
class="p">);</span>
+
+<span class="o">//</span> <span class="n">For</span> <span class="n">a</span>
<span class="n">generated</span> <span class="n">Avro</span> <span
class="n">class</span><span class="p">,</span> <span class="n">this</span>
<span class="n">works</span><span class="p">:</span>
+<span class="n">PType</span><span class="o">&lt;</span><span
class="n">Person</span><span class="o">&gt;</span> <span class="n">personType1</span>
<span class="p">=</span> <span class="n">Avros</span><span class="p">.</span><span
class="n">records</span><span class="p">(</span><span class="n">Person</span><span
class="p">.</span><span class="n">class</span><span class="p">);</span>
+<span class="o">//</span> <span class="n">As</span> <span class="n">does</span>
<span class="n">this</span><span class="p">:</span>
+<span class="n">PType</span><span class="o">&lt;</span><span
class="n">Person</span><span class="o">&gt;</span> <span class="n">personType2</span>
<span class="p">=</span> <span class="n">Avros</span><span class="p">.</span><span
class="n">containers</span><span class="p">(</span><span class="n">Person</span><span
class="p">.</span><span class="n">class</span><span class="p">);</span>

+<span class="o">//</span> <span class="n">If</span> <span class="n">you</span>
<span class="n">only</span> <span class="n">have</span> <span class="n">a</span>
<span class="n">schema</span><span class="p">,</span> <span class="n">you</span>
<span class="n">can</span> <span class="n">create</span> <span
class="n">a</span> <span class="n">generic</span> <span class="n">type</span><span
class="p">,</span> <span class="n">like</span> <span class="n">this</span><span
class="p">:</span>
+<span class="n">org</span><span class="p">.</span><span class="n">apache</span><span
class="p">.</span><span class="n">avro</span><span class="p">.</span><span
class="n">Schema</span> <span class="n">schema</span> <span class="p">=</span>
<span class="p">...;</span>
+<span class="n">PType</span><span class="o">&lt;</span><span
class="n">Record</span><span class="o">&gt;</span> <span class="n">avroGenericType</span>
<span class="p">=</span> <span class="n">Avros</span><span class="p">.</span><span
class="n">generics</span><span class="p">(</span><span class="n">schema</span><span
class="p">);</span>
+</pre></div>
+
 
 <p>The <a href="apidocs/0.10.0/org/apache/crunch/types/avro/Avros.html">Avros</a>
class also has a <code>reflects</code> method for creating PTypes
 for POJOs using Avro's reflection-based serialization mechanism. There are a couple of restrictions
on the structure of
 the POJO:</p>
 <ol>
 <li>It must have a default, no-arg constructor.</li>
-<li>All of its fields must be Avro primitive types or collection types that have Avro
equivalents, like <code>ArrayList</code> and
-<code>HashMap&lt;String, T&gt;</code>. You may also have arrays of Avro
primitive types.</li>
-</ol>
-<pre>
-  // Declare an inline data type and use it for Crunch serialization
-  public static class UrlData {
-    // The fields don't have to be public, just doing this for the example.
-    double curPageRank;
-    String[] outboundUrls;
-
-    // Remember: you must have a no-arg constructor. 
-    public UrlData() { this(0.0, new String[0]); }
-
-    // The regular constructor
-    public UrlData(double pageRank, String[] outboundUrls) {
-      this.curPageRank = pageRank;
-      this.outboundUrls = outboundUrls;
-    }
+<li>
+<p>All of its fields must be Avro primitive types or collection types that have Avro
equivalents, like <code>ArrayList</code> and
+<code>HashMap&lt;String, T&gt;</code>. You may also have arrays of Avro
primitive types.</p>
+<p>// Declare an inline data type and use it for Crunch serialization
+public static class UrlData {
+  // The fields don't have to be public, just doing this for the example.
+  double curPageRank;
+  String[] outboundUrls;</p>
+<p>// Remember: you must have a no-arg constructor. 
+  public UrlData() { this(0.0, new String[0]); }</p>
+<p>// The regular constructor
+  public UrlData(double pageRank, String[] outboundUrls) {
+    this.curPageRank = pageRank;
+    this.outboundUrls = outboundUrls;
   }
-
-  PType&lt;UrlData&gt; urlDataType = Avros.reflects(UrlData.class);
-  PTableType&lt;String, UrlData&gt; pageRankType = Avros.tableOf(Avros.strings(),
urlDataType);
-</pre>
-
+}</p>
+<p>PType&lt;UrlData&gt; urlDataType = Avros.reflects(UrlData.class);
+PTableType&lt;String, UrlData&gt; pageRankType = Avros.tableOf(Avros.strings(), urlDataType);</p>
+</li>
+</ol>
 <p>Avro reflection is a great way to define intermediate types for your Crunch pipelines;
not only is your logic clear
 and easy to test, but the fact that the data is written out as Avro records means that you
can use tools like Hive and Pig
 to query intermediate results to aid in debugging pipeline failures.</p>



Mime
View raw message