hadoop-common-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From aw@apache.org
Subject [12/31] hadoop git commit: HADOOP-13110. add a streaming subcommand to mapred
Date Sun, 15 May 2016 14:50:52 GMT
HADOOP-13110. add a streaming subcommand to mapred


Project: http://git-wip-us.apache.org/repos/asf/hadoop/repo
Commit: http://git-wip-us.apache.org/repos/asf/hadoop/commit/584a9156
Tree: http://git-wip-us.apache.org/repos/asf/hadoop/tree/584a9156
Diff: http://git-wip-us.apache.org/repos/asf/hadoop/diff/584a9156

Branch: refs/heads/HADOOP-12930
Commit: 584a915611fada6779bb74cb15b511aeed9c3a36
Parents: 1dcd9a9
Author: Allen Wittenauer <aw@apache.org>
Authored: Fri May 6 14:00:56 2016 -0700
Committer: Allen Wittenauer <aw@apache.org>
Committed: Sun May 15 07:50:15 2016 -0700

----------------------------------------------------------------------
 .../main/resources/assemblies/hadoop-tools.xml  |  8 +++
 .../apache/hadoop/streaming/DumpTypedBytes.java |  3 +-
 .../hadoop/streaming/HadoopStreaming.java       |  3 +-
 .../apache/hadoop/streaming/LoadTypedBytes.java |  3 +-
 .../src/main/shellprofile.d/hadoop-streaming.sh | 55 ++++++++++++++++++++
 .../src/site/markdown/HadoopStreaming.md.vm     | 30 +++++------
 6 files changed, 81 insertions(+), 21 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/hadoop/blob/584a9156/hadoop-assemblies/src/main/resources/assemblies/hadoop-tools.xml
----------------------------------------------------------------------
diff --git a/hadoop-assemblies/src/main/resources/assemblies/hadoop-tools.xml b/hadoop-assemblies/src/main/resources/assemblies/hadoop-tools.xml
index 8606e23..3909277 100644
--- a/hadoop-assemblies/src/main/resources/assemblies/hadoop-tools.xml
+++ b/hadoop-assemblies/src/main/resources/assemblies/hadoop-tools.xml
@@ -148,6 +148,14 @@
       </includes>
     </fileSet>
     <fileSet>
+      <directory>../hadoop-streaming/src/main/shellprofile.d</directory>
+      <includes>
+        <include>*</include>
+      </includes>
+      <outputDirectory>/libexec/shellprofile.d</outputDirectory>
+      <fileMode>0755</fileMode>
+    </fileSet>
+    <fileSet>
       <directory>../hadoop-sls/target</directory>
       <outputDirectory>/share/hadoop/${hadoop.component}/sources</outputDirectory>
       <includes>

http://git-wip-us.apache.org/repos/asf/hadoop/blob/584a9156/hadoop-tools/hadoop-streaming/src/main/java/org/apache/hadoop/streaming/DumpTypedBytes.java
----------------------------------------------------------------------
diff --git a/hadoop-tools/hadoop-streaming/src/main/java/org/apache/hadoop/streaming/DumpTypedBytes.java
b/hadoop-tools/hadoop-streaming/src/main/java/org/apache/hadoop/streaming/DumpTypedBytes.java
index 5a07cc3..ffddc7c 100644
--- a/hadoop-tools/hadoop-streaming/src/main/java/org/apache/hadoop/streaming/DumpTypedBytes.java
+++ b/hadoop-tools/hadoop-streaming/src/main/java/org/apache/hadoop/streaming/DumpTypedBytes.java
@@ -91,8 +91,7 @@ public class DumpTypedBytes implements Tool {
   }
 
   private void printUsage() {
-    System.out.println("Usage: $HADOOP_HOME/bin/hadoop jar hadoop-streaming.jar"
-        + " dumptb <glob-pattern>");
+    System.out.println("Usage: mapred streaming dumptb <glob-pattern>");
     System.out.println("  Dumps all files that match the given pattern to " +
         "standard output as typed bytes.");
     System.out.println("  The files can be text or sequence files");

http://git-wip-us.apache.org/repos/asf/hadoop/blob/584a9156/hadoop-tools/hadoop-streaming/src/main/java/org/apache/hadoop/streaming/HadoopStreaming.java
----------------------------------------------------------------------
diff --git a/hadoop-tools/hadoop-streaming/src/main/java/org/apache/hadoop/streaming/HadoopStreaming.java
b/hadoop-tools/hadoop-streaming/src/main/java/org/apache/hadoop/streaming/HadoopStreaming.java
index eabf46c..92f9d03 100644
--- a/hadoop-tools/hadoop-streaming/src/main/java/org/apache/hadoop/streaming/HadoopStreaming.java
+++ b/hadoop-tools/hadoop-streaming/src/main/java/org/apache/hadoop/streaming/HadoopStreaming.java
@@ -56,8 +56,7 @@ public class HadoopStreaming {
   }
   
   private static void printUsage() {
-    System.out.println("Usage: $HADOOP_HOME/bin/hadoop jar hadoop-streaming.jar"
-        + " [options]");
+    System.out.println("Usage: mapred streaming [options]");
     System.out.println("Options:");
     System.out.println("  dumptb <glob-pattern> Dumps all files that match the" 
         + " given pattern to ");

http://git-wip-us.apache.org/repos/asf/hadoop/blob/584a9156/hadoop-tools/hadoop-streaming/src/main/java/org/apache/hadoop/streaming/LoadTypedBytes.java
----------------------------------------------------------------------
diff --git a/hadoop-tools/hadoop-streaming/src/main/java/org/apache/hadoop/streaming/LoadTypedBytes.java
b/hadoop-tools/hadoop-streaming/src/main/java/org/apache/hadoop/streaming/LoadTypedBytes.java
index a7a001c..838cfa1 100644
--- a/hadoop-tools/hadoop-streaming/src/main/java/org/apache/hadoop/streaming/LoadTypedBytes.java
+++ b/hadoop-tools/hadoop-streaming/src/main/java/org/apache/hadoop/streaming/LoadTypedBytes.java
@@ -89,8 +89,7 @@ public class LoadTypedBytes implements Tool {
   }
 
   private void printUsage() {
-    System.out.println("Usage: $HADOOP_HOME/bin/hadoop jar hadoop-streaming.jar"
-        + " loadtb <path>");
+    System.out.println("Usage: mapred streaming loadtb <path>");
     System.out.println("  Reads typed bytes from standard input" +
     " and stores them in a sequence file in");
     System.out.println("  the specified path");

http://git-wip-us.apache.org/repos/asf/hadoop/blob/584a9156/hadoop-tools/hadoop-streaming/src/main/shellprofile.d/hadoop-streaming.sh
----------------------------------------------------------------------
diff --git a/hadoop-tools/hadoop-streaming/src/main/shellprofile.d/hadoop-streaming.sh b/hadoop-tools/hadoop-streaming/src/main/shellprofile.d/hadoop-streaming.sh
new file mode 100755
index 0000000..cca016d
--- /dev/null
+++ b/hadoop-tools/hadoop-streaming/src/main/shellprofile.d/hadoop-streaming.sh
@@ -0,0 +1,55 @@
+#!/usr/bin/env bash
+
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+if ! declare -f mapred_subcommand_streaming >/dev/null 2>/dev/null; then
+
+  if [[ "${HADOOP_SHELL_EXECNAME}" = mapred ]]; then
+    hadoop_add_subcommand "streaming" "launch a mapreduce streaming job"
+  fi
+
+## @description  streaming command for mapred
+## @audience     public
+## @stability    stable
+## @replaceable  yes
+function mapred_subcommand_streaming
+{
+  declare jarname
+  declare oldifs
+
+  # shellcheck disable=SC2034
+  HADOOP_CLASSNAME=org.apache.hadoop.util.RunJar
+  hadoop_add_to_classpath_tools hadoop-streaming
+
+  # locate the streaming jar so we have something to
+  # give to RunJar
+  oldifs=${IFS}
+  IFS=:
+  for jarname in ${CLASSPATH}; do
+    if [[ "${jarname}" =~ hadoop-streaming-[0-9] ]]; then
+      HADOOP_SUBCMD_ARGS=("${jarname}" "${HADOOP_SUBCMD_ARGS[@]}")
+      break
+    fi
+  done
+
+  IFS=${oldifs}
+
+  hadoop_debug "Appending HADOOP_CLIENT_OPTS onto HADOOP_OPTS"
+  HADOOP_OPTS="${HADOOP_OPTS} ${HADOOP_CLIENT_OPTS}"
+
+}
+
+fi

http://git-wip-us.apache.org/repos/asf/hadoop/blob/584a9156/hadoop-tools/hadoop-streaming/src/site/markdown/HadoopStreaming.md.vm
----------------------------------------------------------------------
diff --git a/hadoop-tools/hadoop-streaming/src/site/markdown/HadoopStreaming.md.vm b/hadoop-tools/hadoop-streaming/src/site/markdown/HadoopStreaming.md.vm
index cc8ed69..072a68b 100644
--- a/hadoop-tools/hadoop-streaming/src/site/markdown/HadoopStreaming.md.vm
+++ b/hadoop-tools/hadoop-streaming/src/site/markdown/HadoopStreaming.md.vm
@@ -62,7 +62,7 @@ Hadoop Streaming
 
 Hadoop streaming is a utility that comes with the Hadoop distribution. The utility allows
you to create and run Map/Reduce jobs with any executable or script as the mapper and/or the
reducer. For example:
 
-    hadoop jar hadoop-streaming-${project.version}.jar \
+    mapred streaming \
       -input myInputDirs \
       -output myOutputDir \
       -mapper /bin/cat \
@@ -88,7 +88,7 @@ Streaming supports streaming command options as well as [generic command
options
 
 **Note:** Be sure to place the generic options before the streaming options, otherwise the
command will fail. For an example, see [Making Archives Available to Tasks](#Making_Archives_Available_to_Tasks).
 
-    hadoop command [genericOptions] [streamingOptions]
+    mapred streaming [genericOptions] [streamingOptions]
 
 The Hadoop streaming command options are listed here:
 
@@ -115,7 +115,7 @@ $H3 Specifying a Java Class as the Mapper/Reducer
 
 You can supply a Java class as the mapper and/or the reducer.
 
-    hadoop jar hadoop-streaming-${project.version}.jar \
+    mapred streaming \
       -input myInputDirs \
       -output myOutputDir \
       -inputformat org.apache.hadoop.mapred.KeyValueTextInputFormat \
@@ -128,7 +128,7 @@ $H3 Packaging Files With Job Submissions
 
 You can specify any executable as the mapper and/or the reducer. The executables do not need
to pre-exist on the machines in the cluster; however, if they don't, you will need to use
"-file" option to tell the framework to pack your executable files as a part of job submission.
For example:
 
-    hadoop jar hadoop-streaming-${project.version}.jar \
+    mapred streaming \
       -input myInputDirs \
       -output myOutputDir \
       -mapper myPythonScript.py \
@@ -139,7 +139,7 @@ The above example specifies a user defined Python executable as the mapper.
The
 
 In addition to executable files, you can also package other auxiliary files (such as dictionaries,
configuration files, etc) that may be used by the mapper and/or the reducer. For example:
 
-    hadoop jar hadoop-streaming-${project.version}.jar \
+    mapred streaming \
       -input myInputDirs \
       -output myOutputDir \
       -mapper myPythonScript.py \
@@ -216,7 +216,7 @@ $H4 Specifying the Number of Reducers
 
 To specify the number of reducers, for example two, use:
 
-    hadoop jar hadoop-streaming-${project.version}.jar \
+    mapred streaming \
       -D mapreduce.job.reduces=2 \
       -input myInputDirs \
       -output myOutputDir \
@@ -229,7 +229,7 @@ As noted earlier, when the Map/Reduce framework reads a line from the
stdout of
 
 However, you can customize this default. You can specify a field separator other than the
tab character (the default), and you can specify the nth (n \>= 1) character rather than
the first character in a line (the default) as the separator between the key and value. For
example:
 
-    hadoop jar hadoop-streaming-${project.version}.jar \
+    mapred streaming \
       -D stream.map.output.field.separator=. \
       -D stream.num.map.output.key.fields=4 \
       -input myInputDirs \
@@ -279,7 +279,7 @@ User can specify a different symlink name for -archives using \#.
 
 In this example, the input.txt file has two lines specifying the names of the two files:
cachedir.jar/cache.txt and cachedir.jar/cache2.txt. "cachedir.jar" is a symlink to the archived
directory, which has the files "cache.txt" and "cache2.txt".
 
-    hadoop jar hadoop-streaming-${project.version}.jar \
+    mapred streaming \
                     -archives 'hdfs://hadoop-nn1.example.com/user/me/samples/cachefile/cachedir.jar'
\
                     -D mapreduce.job.maps=1 \
                     -D mapreduce.job.reduces=1 \
@@ -325,7 +325,7 @@ $H3 Hadoop Partitioner Class
 
 Hadoop has a library class, [KeyFieldBasedPartitioner](../api/org/apache/hadoop/mapred/lib/KeyFieldBasedPartitioner.html),
that is useful for many applications. This class allows the Map/Reduce framework to partition
the map outputs based on certain key fields, not the whole keys. For example:
 
-    hadoop jar hadoop-streaming-${project.version}.jar \
+    mapred streaming \
       -D stream.map.output.field.separator=. \
       -D stream.num.map.output.key.fields=4 \
       -D map.output.key.field.separator=. \
@@ -375,7 +375,7 @@ $H3 Hadoop Comparator Class
 
 Hadoop has a library class, [KeyFieldBasedComparator](../api/org/apache/hadoop/mapreduce/lib/partition/KeyFieldBasedComparator.html),
that is useful for many applications. This class provides a subset of features provided by
the Unix/GNU Sort. For example:
 
-    hadoop jar hadoop-streaming-${project.version}.jar \
+    mapred streaming \
       -D mapreduce.job.output.key.comparator.class=org.apache.hadoop.mapreduce.lib.partition.KeyFieldBasedComparator
\
       -D stream.map.output.field.separator=. \
       -D stream.num.map.output.key.fields=4 \
@@ -411,7 +411,7 @@ Hadoop has a library package called [Aggregate](../api/org/apache/hadoop/mapred/
 
 To use Aggregate, simply specify "-reducer aggregate":
 
-    hadoop jar hadoop-streaming-${project.version}.jar \
+    mapred streaming \
       -input myInputDirs \
       -output myOutputDir \
       -mapper myAggregatorForKeyCount.py \
@@ -444,7 +444,7 @@ $H3 Hadoop Field Selection Class
 
 Hadoop has a library class, [FieldSelectionMapReduce](../api/org/apache/hadoop/mapred/lib/FieldSelectionMapReduce.html),
that effectively allows you to process text data like the unix "cut" utility. The map function
defined in the class treats each input key/value pair as a list of fields. You can specify
the field separator (the default is the tab character). You can select an arbitrary list of
fields as the map output key, and an arbitrary list of fields as the map output value. Similarly,
the reduce function defined in the class treats each input key/value pair as a list of fields.
You can select an arbitrary list of fields as the reduce output key, and an arbitrary list
of fields as the reduce output value. For example:
 
-    hadoop jar hadoop-streaming-${project.version}.jar \
+    mapred streaming \
       -D mapreduce.map.output.key.field.separator=. \
       -D mapreduce.partition.keypartitioner.options=-k1,2 \
       -D mapreduce.fieldsel.data.field.separator=. \
@@ -495,7 +495,7 @@ Using an alias will not work, but variable substitution is allowed as
shown in t
     charlie 80
     dan     75
 
-    $ c2='cut -f2'; hadoop jar hadoop-streaming-${project.version}.jar \
+    $ c2='cut -f2'; mapred streaming \
       -D mapreduce.job.name='Experiment' \
       -input /user/me/samples/student_marks \
       -output /user/me/samples/student_out \
@@ -525,7 +525,7 @@ $H3 How do I specify multiple input directories?
 
 You can specify multiple input directories with multiple '-input' options:
 
-    hadoop jar hadoop-streaming-${project.version}.jar \
+    mapred streaming \
       -input '/user/foo/dir1' -input '/user/foo/dir2' \
         (rest of the command)
 
@@ -541,7 +541,7 @@ $H3 How do I parse XML documents using streaming?
 
 You can use the record reader StreamXmlRecordReader to process XML documents.
 
-    hadoop jar hadoop-streaming-${project.version}.jar \
+    mapred streaming \
       -inputreader "StreamXmlRecord,begin=BEGIN_STRING,end=END_STRING" \
         (rest of the command)
 


---------------------------------------------------------------------
To unsubscribe, e-mail: common-commits-unsubscribe@hadoop.apache.org
For additional commands, e-mail: common-commits-help@hadoop.apache.org


Mime
View raw message