hadoop-mapreduce-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From acmur...@apache.org
Subject svn commit: r1397183 - in /hadoop/common/branches/branch-2/hadoop-mapreduce-project: ./ hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/
Date Thu, 11 Oct 2012 17:22:27 GMT
Author: acmurthy
Date: Thu Oct 11 17:22:27 2012
New Revision: 1397183

URL: http://svn.apache.org/viewvc?rev=1397183&view=rev
Log:
Merge -c 1397182 from trunk to branch-2 to fix MAPREDUCE-4616. Improve javadoc for MultipleOutputs.
Contributed by Tony Burton.

Modified:
    hadoop/common/branches/branch-2/hadoop-mapreduce-project/CHANGES.txt
    hadoop/common/branches/branch-2/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/LazyOutputFormat.java
    hadoop/common/branches/branch-2/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/MultipleOutputs.java

Modified: hadoop/common/branches/branch-2/hadoop-mapreduce-project/CHANGES.txt
URL: http://svn.apache.org/viewvc/hadoop/common/branches/branch-2/hadoop-mapreduce-project/CHANGES.txt?rev=1397183&r1=1397182&r2=1397183&view=diff
==============================================================================
--- hadoop/common/branches/branch-2/hadoop-mapreduce-project/CHANGES.txt (original)
+++ hadoop/common/branches/branch-2/hadoop-mapreduce-project/CHANGES.txt Thu Oct 11 17:22:27
2012
@@ -14,6 +14,9 @@ Release 2.0.3-alpha - Unreleased
     MAPREDUCE-3678. The Map tasks logs should have the value of input
     split it processed. (harsh)
 
+    MAPREDUCE-4616. Improve javadoc for MultipleOutputs. (Tony Burton via
+    acmurthy) 
+
   OPTIMIZATIONS
 
   BUG FIXES

Modified: hadoop/common/branches/branch-2/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/LazyOutputFormat.java
URL: http://svn.apache.org/viewvc/hadoop/common/branches/branch-2/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/LazyOutputFormat.java?rev=1397183&r1=1397182&r2=1397183&view=diff
==============================================================================
--- hadoop/common/branches/branch-2/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/LazyOutputFormat.java
(original)
+++ hadoop/common/branches/branch-2/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/LazyOutputFormat.java
Thu Oct 11 17:22:27 2012
@@ -32,7 +32,10 @@ import org.apache.hadoop.mapreduce.TaskA
 import org.apache.hadoop.util.ReflectionUtils;
 
 /**
- * A Convenience class that creates output lazily.  
+ * A Convenience class that creates output lazily.
+ * Use in conjuction with org.apache.hadoop.mapreduce.lib.output.MultipleOutputs to recreate
the
+ * behaviour of org.apache.hadoop.mapred.lib.MultipleTextOutputFormat (etc) of the old Hadoop
API.
+ * See {@link MultipleOutputs} documentation for more information.
  */
 @InterfaceAudience.Public
 @InterfaceStability.Stable

Modified: hadoop/common/branches/branch-2/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/MultipleOutputs.java
URL: http://svn.apache.org/viewvc/hadoop/common/branches/branch-2/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/MultipleOutputs.java?rev=1397183&r1=1397182&r2=1397183&view=diff
==============================================================================
--- hadoop/common/branches/branch-2/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/MultipleOutputs.java
(original)
+++ hadoop/common/branches/branch-2/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/MultipleOutputs.java
Thu Oct 11 17:22:27 2012
@@ -20,7 +20,10 @@ package org.apache.hadoop.mapreduce.lib.
 import org.apache.hadoop.classification.InterfaceAudience;
 import org.apache.hadoop.classification.InterfaceStability;
 import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
 import org.apache.hadoop.mapreduce.*;
+import org.apache.hadoop.mapreduce.Reducer.Context;
+import org.apache.hadoop.mapreduce.lib.output.MultipleOutputs;
 import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl;
 import org.apache.hadoop.util.ReflectionUtils;
 
@@ -37,6 +40,7 @@ import java.util.*;
  * Each additional output, or named output, may be configured with its own
  * <code>OutputFormat</code>, with its own key class and with its own value
  * class.
+ * </p>
  * 
  * <p>
  * Case two: to write data to different files provided by user
@@ -107,6 +111,64 @@ import java.util.*;
  *
  * }
  * </pre>
+ * 
+ * <p>
+ * When used in conjuction with org.apache.hadoop.mapreduce.lib.output.LazyOutputFormat,
+ * MultipleOutputs can mimic the behaviour of MultipleTextOutputFormat and MultipleSequenceFileOutputFormat
+ * from the old Hadoop API - ie, output can be written from the Reducer to more than one
location.
+ * </p>
+ * 
+ * <p>
+ * Use <code>MultipleOutputs.write(KEYOUT key, VALUEOUT value, String baseOutputPath)</code>
to write key and 
+ * value to a path specified by <code>baseOutputPath</code>, with no need to
specify a named output:
+ * </p>
+ * 
+ * <pre>
+ * private MultipleOutputs<Text, Text> out;
+ * 
+ * public void setup(Context context) {
+ *   out = new MultipleOutputs<Text, Text>(context);
+ *   ...
+ * }
+ * 
+ * public void reduce(Text key, Iterable<Text> values, Context context) throws IOException,
InterruptedException {
+ * for (Text t : values) {
+ *   out.write(key, t, generateFileName(<<i>parameter list...</i>>));
+ *   }
+ * }
+ * 
+ * protected void cleanup(Context context) throws IOException, InterruptedException {
+ *   out.close();
+ * }
+ * </pre>
+ * 
+ * <p>
+ * Use your own code in <code>generateFileName()</code> to create a custom path
to your results. 
+ * '/' characters in <code>baseOutputPath</code> will be translated into directory
levels in your file system. 
+ * Also, append your custom-generated path with "part" or similar, otherwise your output
will be -00000, -00001 etc. 
+ * No call to <code>context.write()</code> is necessary. See example <code>generateFileName()</code>
code below. 
+ * </p>
+ * 
+ * <pre>
+ * private String generateFileName(Text k) {
+ *   // expect Text k in format "Surname|Forename"
+ *   String[] kStr = k.toString().split("\\|");
+ *   
+ *   String sName = kStr[0];
+ *   String fName = kStr[1];
+ *
+ *   // example for k = Smith|John
+ *   // output written to /user/hadoop/path/to/output/Smith/John-r-00000 (etc)
+ *   return sName + "/" + fName;
+ * }
+ * </pre>
+ * 
+ * <p>
+ * Using MultipleOutputs in this way will still create zero-sized default output, eg part-00000.
+ * To prevent this use <code>LazyOutputFormat.setOutputFormatClass(job, TextOutputFormat.class);</code>
+ * instead of <code>job.setOutputFormatClass(TextOutputFormat.class);</code>
in your Hadoop job configuration.
+ * </p> 
+ * 
  */
 @InterfaceAudience.Public
 @InterfaceStability.Stable



Mime
View raw message