hadoop-common-commits mailing list archives

From cnaur...@apache.org
Subject svn commit: r1495297 [16/46] - in /hadoop/common/branches/branch-1-win: ./ bin/ conf/ ivy/ lib/jdiff/ src/c++/libhdfs/docs/ src/c++/libhdfs/tests/conf/ src/contrib/capacity-scheduler/ivy/ src/contrib/capacity-scheduler/src/java/org/apache/hadoop/mapred...
Date Fri, 21 Jun 2013 06:37:39 GMT
Modified: hadoop/common/branches/branch-1-win/src/docs/src/documentation/content/xdocs/HttpAuthentication.xml
URL: http://svn.apache.org/viewvc/hadoop/common/branches/branch-1-win/src/docs/src/documentation/content/xdocs/HttpAuthentication.xml?rev=1495297&r1=1495296&r2=1495297&view=diff
==============================================================================
--- hadoop/common/branches/branch-1-win/src/docs/src/documentation/content/xdocs/HttpAuthentication.xml (original)
+++ hadoop/common/branches/branch-1-win/src/docs/src/documentation/content/xdocs/HttpAuthentication.xml Fri Jun 21 06:37:27 2013
@@ -110,7 +110,7 @@
 
       <p><code>hadoop.http.authentication.kerberos.principal</code>: Indicates the Kerberos 
       principal to be used for HTTP endpoint when using 'kerberos' authentication.
-      The principal short name must be <code>HTTP</code> per Kerberos HTTP SPENGO specification.
+      The principal short name must be <code>HTTP</code> per Kerberos HTTP SPNEGO specification.
       The default value is <code>HTTP/localhost@$LOCALHOST</code>.
       </p>
 

Modified: hadoop/common/branches/branch-1-win/src/docs/src/documentation/content/xdocs/Secure_Impersonation.xml
URL: http://svn.apache.org/viewvc/hadoop/common/branches/branch-1-win/src/docs/src/documentation/content/xdocs/Secure_Impersonation.xml?rev=1495297&r1=1495296&r2=1495297&view=diff
==============================================================================
--- hadoop/common/branches/branch-1-win/src/docs/src/documentation/content/xdocs/Secure_Impersonation.xml (original)
+++ hadoop/common/branches/branch-1-win/src/docs/src/documentation/content/xdocs/Secure_Impersonation.xml Fri Jun 21 06:37:27 2013
@@ -88,6 +88,9 @@
         <p>
            If these configurations are not present, impersonation will not be allowed and connection will fail.
         </p>
+        <p>
+           If more lax security is preferred, the wildcard value <code>*</code> may be used to allow impersonation from any host or of any user.
+        </p>
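+        <p>
+           As a minimal sketch of such a relaxed setup (the superuser name
+           <code>super</code> is only illustrative; substitute the account that
+           actually performs the impersonation), the corresponding entries in
+           <code>core-site.xml</code> would be:
+        </p>
+        <p>
+           <code>&lt;property&gt;&lt;name&gt;hadoop.proxyuser.super.hosts&lt;/name&gt;&lt;value&gt;*&lt;/value&gt;&lt;/property&gt;</code><br/>
+           <code>&lt;property&gt;&lt;name&gt;hadoop.proxyuser.super.groups&lt;/name&gt;&lt;value&gt;*&lt;/value&gt;&lt;/property&gt;</code>
+        </p>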
       </section>
 
  

Added: hadoop/common/branches/branch-1-win/src/docs/src/documentation/content/xdocs/cli_minicluster.xml
URL: http://svn.apache.org/viewvc/hadoop/common/branches/branch-1-win/src/docs/src/documentation/content/xdocs/cli_minicluster.xml?rev=1495297&view=auto
==============================================================================
--- hadoop/common/branches/branch-1-win/src/docs/src/documentation/content/xdocs/cli_minicluster.xml (added)
+++ hadoop/common/branches/branch-1-win/src/docs/src/documentation/content/xdocs/cli_minicluster.xml Fri Jun 21 06:37:27 2013
@@ -0,0 +1,94 @@
+<?xml version="1.0"?>
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+-->
+
+<!DOCTYPE document PUBLIC "-//APACHE//DTD Documentation V2.0//EN" "http://forrest.apache.org/dtd/document-v20.dtd">
+
+<document>
+
+  <header>
+    <title>CLI MiniCluster</title>
+  </header>
+  <body>
+
+    <section>
+      <title>Purpose</title>
+      <p>
+         Using the CLI MiniCluster, users can simply start and stop
+         a single-node Hadoop cluster with a single command, and without the
+         need to set any environment variables or manage configuration files.
+         The CLI MiniCluster starts both a MapReduce and an HDFS cluster. This is
+         useful for cases where users want to quickly experiment with a real
+         Hadoop cluster or test non-Java programs that rely on significant
+         Hadoop functionality.
+      </p>
+    </section>
+
+    <section>
+      <title>Hadoop Tarball</title>
+      <p>
+         To get a Hadoop distribution, download a recent
+         <a href="ext:releases">stable release</a> from one of the Apache
+         Download Mirrors. Unpack the downloaded Hadoop distribution. In the
+         distribution, edit the file <code>conf/hadoop-env.sh</code> to define at
+         least <code>JAVA_HOME</code> to be the root of your Java installation.
+      </p>
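+      <p>
+         For example, assuming the JDK is installed under
+         <code>/usr/java/default</code> (the path is only illustrative), the
+         corresponding line in <code>conf/hadoop-env.sh</code> would be:
+      </p>
+      <p>
+         <code>export JAVA_HOME=/usr/java/default</code>
+      </p>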
+    </section>
+
+    <section>
+      <title>Running the MiniCluster</title>
+      <p>
+         From inside the root directory of the extracted tarball, you can start the
+         CLI MiniCluster using the following command:
+      </p>
+      <p>
+         <code>$ bin/hadoop jar hadoop-test-*.jar minicluster -jtport JT_PORT -nnport NN_PORT</code>
+      </p>
+      <p>
+         In the example command above, JT_PORT and NN_PORT should be replaced by
+         the user's choice of these port numbers. If not specified, random free
+         ports will be used.
+      </p>
+      <p>
+         There are a number of command line arguments that users can use to
+         control which services to start, and to pass other configuration
+         properties. The available command line arguments are:
+      </p>
+      <table>
+        <tr><th>Argument</th><th>Description</th></tr>
+        <tr><td><code>-D &lt;property=value&gt;</code></td><td>Options to pass into configuration object</td></tr>
+        <tr><td><code>-datanodes &lt;arg&gt;</code></td><td>How many datanodes to start (default 1)</td></tr>
+        <tr><td><code>-format</code></td><td>Format the DFS (default false)</td></tr>
+        <tr><td><code>-help</code></td><td>Prints option help.</td></tr>
+        <tr><td><code>-jhsport &lt;arg&gt;</code></td><td>JobHistoryServer port (default 0--we choose)</td></tr>
+        <tr><td><code>-namenode &lt;arg&gt;</code></td><td>URL of the namenode (default is either the DFS cluster or a temporary dir)</td></tr>
+        <tr><td><code>-nnport &lt;arg&gt;</code></td><td>NameNode port (default 0--we choose)</td></tr>
+        <tr><td><code>-nodemanagers &lt;arg&gt;</code></td><td>How many nodemanagers to start (default 1)</td></tr>
+        <tr><td><code>-nodfs</code></td><td>Don't start a mini DFS cluster</td></tr>
+        <tr><td><code>-nomr</code></td><td>Don't start a mini MR cluster</td></tr>
+        <tr><td><code>-rmport &lt;arg&gt;</code></td><td>ResourceManager port (default 0--we choose)</td></tr>
+        <tr><td><code>-writeConfig &lt;path&gt;</code></td><td>Save configuration to this XML file.</td></tr>
+        <tr><td><code>-writeDetails &lt;path&gt;</code></td><td>Write basic information to this JSON file.</td></tr>
+      </table>
+      <p>
+         To display this full list of available arguments, the user can pass the
+         <code>-help</code> argument to the above command.
+      </p>
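+      <p>
+         As a further illustration (the port number is arbitrary), the following
+         starts a MiniCluster with only HDFS, two datanodes, and a freshly
+         formatted namespace:
+      </p>
+      <p>
+         <code>$ bin/hadoop jar hadoop-test-*.jar minicluster -nomr -format -datanodes 2 -nnport 8020</code>
+      </p>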
+    </section>
+
+  </body>
+</document>

Modified: hadoop/common/branches/branch-1-win/src/docs/src/documentation/content/xdocs/cluster_setup.xml
URL: http://svn.apache.org/viewvc/hadoop/common/branches/branch-1-win/src/docs/src/documentation/content/xdocs/cluster_setup.xml?rev=1495297&r1=1495296&r2=1495297&view=diff
==============================================================================
--- hadoop/common/branches/branch-1-win/src/docs/src/documentation/content/xdocs/cluster_setup.xml (original)
+++ hadoop/common/branches/branch-1-win/src/docs/src/documentation/content/xdocs/cluster_setup.xml Fri Jun 21 06:37:27 2013
@@ -122,6 +122,11 @@
           <p>At the very least you should specify the
           <code>JAVA_HOME</code> so that it is correctly defined on each
           remote node.</p>
+
+          <p>In most cases you should also specify <code>HADOOP_PID_DIR</code>
+          to point to a directory that can only be written to by the users that
+          are going to run the Hadoop daemons.  Otherwise there is the
+          potential for a symlink attack.</p>
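+
+          <p>For example, a line such as the following in
+          <code>conf/hadoop-env.sh</code> (the path is only illustrative) keeps
+          the PID files in a directory owned by the daemon user:</p>
+
+          <p><code>export HADOOP_PID_DIR=/var/hadoop/pids</code></p>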
           
           <p>Administrators can configure individual daemons using the
           configuration options <code>HADOOP_*_OPTS</code>. Various options 
@@ -716,7 +721,7 @@
         A TT ensures that a task is killed if it, and 
         its descendants, use VMEM over the task's per-task limit. It also 
         ensures that one or more tasks are killed if the sum total of VMEM 
-        usage by all tasks, and their descendents, cross the node-limit.</p>
+        usage by all tasks, and their descendants, cross the node-limit.</p>
         
         <p>Users can, optionally, specify the VMEM task-limit per job. If no
         such limit is provided, a default limit is used. A node-limit can be 
@@ -728,20 +733,25 @@
 
         <table>
           <tr><th>Name</th><th>Type</th><th>Description</th></tr>
-          <tr><td>mapred.tasktracker.vmem.reserved</td><td>long</td>
-            <td>A number, in bytes, that represents an offset. The total VMEM on 
-            the machine, minus this offset, is the VMEM node-limit for all 
-            tasks, and their descendants, spawned by the TT. 
+          <tr><td><code>mapred.cluster.map.memory.mb</code>, <code>mapred.cluster.reduce.memory.mb</code></td><td>long</td>
+            <td>The size, in terms of virtual memory, of a single map/reduce slot
+                in the Map-Reduce framework, used by the scheduler.
+                A job can ask for multiple slots for a single task via
+                <code>mapred.job.map.memory.mb</code>/<code>mapred.job.reduce.memory.mb</code>, up to the limit specified by
+                <code>mapred.cluster.max.map.memory.mb</code>/<code>mapred.cluster.max.reduce.memory.mb</code>, if the scheduler supports the feature.
+                The value of -1 indicates that this feature is turned off.
           </td></tr>
-          <tr><td>mapred.task.default.maxvmem</td><td>long</td>
+          <tr><td><code>mapred.job.map.memory.mb</code>, <code>mapred.job.reduce.memory.mb</code></td><td>long</td>
             <td>A number, in bytes, that represents the default VMEM task-limit 
-            associated with a task. Unless overridden by a job's setting, 
-            this number defines the VMEM task-limit.   
+            associated with a map/reduce task. Unless overridden by a job's setting,
+            this number defines the VMEM task-limit. These properties replace the old deprecated property,
+                <code>mapred.task.default.maxvmem</code>.
           </td></tr>
-          <tr><td>mapred.task.limit.maxvmem</td><td>long</td>
+          <tr><td><code>mapred.cluster.max.map.memory.mb</code>, <code>mapred.cluster.max.reduce.memory.mb</code></td><td>long</td>
             <td>A number, in bytes, that represents the upper VMEM task-limit 
-            associated with a task. Users, when specifying a VMEM task-limit 
-            for their tasks, should not specify a limit which exceeds this amount. 
+            associated with a map/reduce task. Users, when specifying a VMEM task-limit
+            for their tasks, should not specify a limit which exceeds this amount. These properties replace the old deprecated property,
+                <code>mapred.task.limit.maxvmem</code>.
           </td></tr>
         </table>
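+
+        <p>As an illustrative sketch (the values are arbitrary), consider a
+        cluster configured with:</p>
+        <p><code>mapred.cluster.map.memory.mb=2048</code><br/>
+           <code>mapred.cluster.max.map.memory.mb=4096</code></p>
+        <p>A job submitted with <code>mapred.job.map.memory.mb=4096</code> would
+        then occupy two map slots per map task, provided the scheduler supports
+        the feature.</p>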
         
@@ -749,7 +759,7 @@
 
     <table>
           <tr><th>Name</th><th>Type</th><th>Description</th></tr>
-          <tr><td>mapred.tasktracker.taskmemorymanager.monitoring-interval</td>
+          <tr><td><code>mapred.tasktracker.taskmemorymanager.monitoring-interval</code></td>
             <td>long</td>
             <td>The time interval, in milliseconds, between which the TT 
             checks for any memory violation. The default value is 5000 msec
@@ -763,14 +773,6 @@
           above are missing or -1 is specified , memory monitoring is 
           disabled for the TT.
           </li>
-          <li>In addition, monitoring is disabled if 
-          <code>mapred.task.default.maxvmem</code> is greater than 
-          <code>mapred.task.limit.maxvmem</code>. 
-          </li>
-          <li>If a TT receives a task whose task-limit is set by the user
-          to a value larger than <code>mapred.task.limit.maxvmem</code>, it 
-          logs a warning but executes the task.
-          </li> 
           <li>Periodically, the TT checks the following: 
           <ul>
             <li>If any task's current VMEM usage is greater than that task's
@@ -781,7 +783,7 @@
             <li>If the sum total of VMEM used by all tasks and descendants is 
             greater than the node-limit, the TT kills enough tasks, in the
             order of least progress made, till the overall VMEM usage falls
-            below the node-limt. Such killed tasks are not considered failed
+            below the node-limit. Such killed tasks are not considered failed
             and their killing does not count towards the tasks' failure counts.
             </li>
           </ul>
@@ -793,31 +795,15 @@
         tasks only if the TT has enough VMEM free. In addition, Schedulers may 
         choose to consider the physical memory (RAM) available on the node
         as well. To enable Scheduler support, TTs report their memory settings 
-        to the JobTracker in every heartbeat. Before getting into details, 
-        consider the following additional memory-related parameters than can be 
-        configured to enable better scheduling:</p> 
-
-        <table>
-          <tr><th>Name</th><th>Type</th><th>Description</th></tr>
-          <tr><td>mapred.tasktracker.pmem.reserved</td><td>int</td>
-            <td>A number, in bytes, that represents an offset. The total 
-            physical memory (RAM) on the machine, minus this offset, is the 
-            recommended RAM node-limit. The RAM node-limit is a hint to a
-            Scheduler to scheduler only so many tasks such that the sum 
-            total of their RAM requirements does not exceed this limit. 
-            RAM usage is not monitored by a TT.   
-          </td></tr>
-        </table>
+        to the JobTracker in every heartbeat.</p>
         
         <p>A TT reports the following memory-related numbers in every 
         heartbeat:</p>
         <ul>
           <li>The total VMEM available on the node.</li>
-          <li>The value of <code>mapred.tasktracker.vmem.reserved</code>,
-           if set.</li>
+          <li>The remaining VMEM available on the node.</li>
           <li>The total RAM available on the node.</li> 
-          <li>The value of <code>mapred.tasktracker.pmem.reserved</code>,
-           if set.</li>
+          <li>The remaining RAM available on the node.</li>
          </ul>
         </section>
         
@@ -841,7 +827,9 @@
           log4j</a> via the <a href="http://commons.apache.org/logging/">Apache 
           Commons Logging</a> framework for logging. Edit the 
           <code>conf/log4j.properties</code> file to customize the Hadoop 
-          daemons' logging configuration (log-formats and so on).</p>
+          daemons' logging configuration (log-formats and so on). Edit the
+          <code>conf/task-log4j.properties</code> file to customize the logging
+          configuration for MapReduce tasks.</p>
           
           <section>
             <title>History Logging</title>

Modified: hadoop/common/branches/branch-1-win/src/docs/src/documentation/content/xdocs/commands_manual.xml
URL: http://svn.apache.org/viewvc/hadoop/common/branches/branch-1-win/src/docs/src/documentation/content/xdocs/commands_manual.xml?rev=1495297&r1=1495296&r2=1495297&view=diff
==============================================================================
--- hadoop/common/branches/branch-1-win/src/docs/src/documentation/content/xdocs/commands_manual.xml (original)
+++ hadoop/common/branches/branch-1-win/src/docs/src/documentation/content/xdocs/commands_manual.xml Fri Jun 21 06:37:27 2013
@@ -631,14 +631,16 @@
 					<a href="hdfs_user_guide.html#Upgrade+and+Rollback">Upgrade Rollback</a>
 				</p>
 				<p>
-					<code>Usage: hadoop namenode [-format] | [-upgrade] | [-rollback] | [-finalize] | [-importCheckpoint]</code>
+					<code>Usage: hadoop namenode [-format [-force] [-nonInteractive]] | [-upgrade] | [-rollback] | [-finalize] | [-importCheckpoint]</code>
 				</p>
 				<table>
 			          <tr><th> COMMAND_OPTION </th><th> Description </th></tr>
 			
 			           <tr>
-			          	<td><code>-format</code></td>
-			            <td>Formats the namenode. It starts the namenode, formats it and then shut it down.</td>
+			          	<td><code>-format [-force] [-nonInteractive]</code></td>
+			            <td>Formats the namenode. It starts the namenode, formats it and then shuts it down. The user will be prompted for input if the name directories exist on the local filesystem.<br/>
+                                    -nonInteractive: The user will not be prompted for input if the name directories exist on the local filesystem; instead, the format will fail.<br/>
+                                    -force: Formats the namenode without prompting the user to confirm formatting of the name directories on the local filesystem. If the -nonInteractive option is also specified, it is ignored.</td>
 			           </tr>
 			           <tr>
 			          	<td><code>-upgrade</code></td>

Added: hadoop/common/branches/branch-1-win/src/docs/src/documentation/content/xdocs/distcp2.xml
URL: http://svn.apache.org/viewvc/hadoop/common/branches/branch-1-win/src/docs/src/documentation/content/xdocs/distcp2.xml?rev=1495297&view=auto
==============================================================================
--- hadoop/common/branches/branch-1-win/src/docs/src/documentation/content/xdocs/distcp2.xml (added)
+++ hadoop/common/branches/branch-1-win/src/docs/src/documentation/content/xdocs/distcp2.xml Fri Jun 21 06:37:27 2013
@@ -0,0 +1,673 @@
+<?xml version="1.0"?>
+<!--
+  Copyright 2002-2004 The Apache Software Foundation
+
+  Licensed under the Apache License, Version 2.0 (the "License");
+  you may not use this file except in compliance with the License.
+  You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+-->
+
+<!DOCTYPE document PUBLIC "-//APACHE//DTD Documentation V2.0//EN" "http://forrest.apache.org/dtd/document-v20.dtd">
+
+<document>
+
+  <header>
+    <title>DistCp Version 2 Guide</title>
+  </header>
+
+  <body>
+
+    <section id="Overview">
+      <title>Overview</title>
+
+      <p>DistCp Version 2 (distributed copy) is a tool used for large inter/intra-cluster
+      copying. It uses MapReduce to effect its distribution, error
+      handling and recovery, and reporting. It expands a list of files and
+      directories into input to map tasks, each of which will copy a partition
+      of the files specified in the source list.
+      </p>
+      <p>
+       The erstwhile implementation of DistCp has its share of quirks and
+       drawbacks, both in its usage, as well as its extensibility and
+       performance. The purpose of the DistCp refactor was to fix these shortcomings,
+       enabling it to be used and extended programmatically. New paradigms have
+       been introduced to improve runtime and setup performance, while simultaneously
+       retaining the legacy behaviour as default.
+      </p>
+      <p>
+       This document aims to describe the design of the new DistCp, its new
+       features, their optimal use, and any deviations from the legacy
+       implementation.
+      </p>
+    </section>
+
+    <section id="Usage">
+      <title>Usage</title>
+      <section id="BasicUsage">
+        <title>Basic Usage</title>
+        <p>The most common invocation of DistCp is an inter-cluster copy:</p>
+        <p><code>bash$ hadoop distcp2 hdfs://nn1:8020/foo/bar \</code><br/>
+           <code>                    hdfs://nn2:8020/bar/foo</code></p>
+
+        <p>This will expand the namespace under <code>/foo/bar</code> on nn1
+        into a temporary file, partition its contents among a set of map
+        tasks, and start a copy on each TaskTracker from nn1 to nn2.</p>
+
+        <p>One can also specify multiple source directories on the command
+        line:</p>
+        <p><code>bash$ hadoop distcp2 hdfs://nn1:8020/foo/a \</code><br/>
+           <code> hdfs://nn1:8020/foo/b \</code><br/>
+           <code> hdfs://nn2:8020/bar/foo</code></p>
+
+        <p>Or, equivalently, from a file using the <code>-f</code> option:<br/>
+        <code>bash$ hadoop distcp2 -f hdfs://nn1:8020/srclist \</code><br/>
+        <code> hdfs://nn2:8020/bar/foo</code><br/></p>
+
+        <p>Where <code>srclist</code> contains<br/>
+        <code>hdfs://nn1:8020/foo/a</code><br/>
+        <code>hdfs://nn1:8020/foo/b</code></p>
+
+        <p>When copying from multiple sources, DistCp will abort the copy with
+        an error message if two sources collide, but collisions at the
+        destination are resolved per the <a href="#CommandLineOptions">options</a>
+        specified. By default, files already existing at the destination are
+        skipped (i.e. not replaced by the source file). A count of skipped
+        files is reported at the end of each job, but it may be inaccurate if a
+        copier failed for some subset of its files, but succeeded on a later
+        attempt.</p>
+
+        <p>It is important that each TaskTracker can reach and communicate with
+        both the source and destination file systems. For HDFS, both the source
+        and destination must be running the same version of the protocol or use
+        a backwards-compatible protocol; 
+        see <a href="#CopyingBetweenVersionsOfHDFS">Copying Between Versions</a>.
+        </p>
+
+        <p>After a copy, it is recommended that one generates and cross-checks
+        a listing of the source and destination to verify that the copy was
+        truly successful. Since DistCp employs both Map/Reduce and the
+        FileSystem API, issues in or between any of the three could adversely
+        and silently affect the copy. Some have had success running with
+        <code>-update</code> enabled to perform a second pass, but users should
+        be acquainted with its semantics before attempting this.</p>
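+
+        <p>One way to generate such listings (a sketch; substitute the actual
+        source and target paths) is to run a recursive listing against each
+        cluster and compare the two results:</p>
+        <p><code>bash$ hadoop fs -lsr hdfs://nn1:8020/foo/bar &gt; source.lst</code><br/>
+           <code>bash$ hadoop fs -lsr hdfs://nn2:8020/bar/foo &gt; target.lst</code></p>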
+
+        <p>It's also worth noting that if another client is still writing to a
+        source file, the copy will likely fail. Attempting to overwrite a file
+        being written at the destination should also fail on HDFS. If a source
+        file is (re)moved before it is copied, the copy will fail with a
+        FileNotFoundException.</p>
+
+        <p>Please refer to the detailed Command Line Reference for information
+        on all the options available in DistCp.</p>
+      </section>
+        
+      <section id="UpdateAndOverwrite">
+        <title>Update and Overwrite</title>
+        <p><code>-update</code> is used to copy files from source that don't
+        exist at the target, or have different contents. <code>-overwrite</code>
+        overwrites files that already exist at the target, even if they have the
+        same contents as the source.</p>
+
+        <p><br/>Update and Overwrite options warrant special attention, since their
+        handling of source-paths varies from the defaults in a very subtle manner.
+        Consider a copy from <code>/source/first/</code> and
+        <code>/source/second/</code> to <code>/target/</code>, where the source
+        paths have the following contents:</p>
+
+        <p><code>hdfs://nn1:8020/source/first/1</code><br/>
+           <code>hdfs://nn1:8020/source/first/2</code><br/>
+           <code>hdfs://nn1:8020/source/second/10</code><br/>
+           <code>hdfs://nn1:8020/source/second/20</code><br/></p>
+
+        <p><br/>When DistCp is invoked without <code>-update</code> or
+        <code>-overwrite</code>, the DistCp defaults would create directories
+        <code>first/</code> and <code>second/</code>, under <code>/target</code>.
+        Thus:<br/></p>
+
+        <p><code>distcp2 hdfs://nn1:8020/source/first hdfs://nn1:8020/source/second hdfs://nn2:8020/target</code></p>
+        <p><br/>would yield the following contents in <code>/target</code>: </p>
+
+        <p><code>hdfs://nn2:8020/target/first/1</code><br/>
+           <code>hdfs://nn2:8020/target/first/2</code><br/>
+           <code>hdfs://nn2:8020/target/second/10</code><br/>
+           <code>hdfs://nn2:8020/target/second/20</code><br/></p>
+
+        <p><br/>When either <code>-update</code> or <code>-overwrite</code> is
+            specified, the <strong>contents</strong> of the source-directories
+            are copied to target, and not the source directories themselves. Thus: </p>
+
+        <p><code>distcp2 -update hdfs://nn1:8020/source/first hdfs://nn1:8020/source/second hdfs://nn2:8020/target</code></p>
+
+        <p><br/>would yield the following contents in <code>/target</code>: </p>
+
+        <p><code>hdfs://nn2:8020/target/1</code><br/>
+           <code>hdfs://nn2:8020/target/2</code><br/>
+           <code>hdfs://nn2:8020/target/10</code><br/>
+           <code>hdfs://nn2:8020/target/20</code><br/></p>
+
+        <p><br/>By extension, if both source folders contained a file with the same
+        name (say, <code>0</code>), then both sources would map an entry to
+        <code>/target/0</code> at the destination. Rather than permit this
+        conflict, DistCp will abort.</p>
+
+        <p><br/>Now, consider the following copy operation:</p>
+
+        <p><code>distcp2 hdfs://nn1:8020/source/first hdfs://nn1:8020/source/second hdfs://nn2:8020/target</code></p>
+
+        <p><br/>With sources/sizes:</p>
+
+        <p><code>hdfs://nn1:8020/source/first/1     32</code><br/>
+           <code>hdfs://nn1:8020/source/first/2     32</code><br/>
+           <code>hdfs://nn1:8020/source/second/10   64</code><br/>
+           <code>hdfs://nn1:8020/source/second/20   32</code><br/></p>
+
+        <p><br/>And destination/sizes:</p>
+
+        <p><code>hdfs://nn2:8020/target/1   32</code><br/>
+           <code>hdfs://nn2:8020/target/10  32</code><br/>
+           <code>hdfs://nn2:8020/target/20  64</code><br/></p>
+
+        <p><br/>Will effect: </p>
+
+        <p><code>hdfs://nn2:8020/target/1   32</code><br/>
+           <code>hdfs://nn2:8020/target/2   32</code><br/>
+           <code>hdfs://nn2:8020/target/10  64</code><br/>
+           <code>hdfs://nn2:8020/target/20  32</code><br/></p>
+
+        <p><br/><code>1</code> is skipped because the file-length and contents match.
+        <code>2</code> is copied because it doesn't exist at the target.
+        <code>10</code> and <code>20</code> are overwritten since the contents
+        don't match the source. </p>
+
+        <p>If <code>-overwrite</code> is used, <code>1</code> is overwritten as well.</p>
+      </section>
+    </section>
+
+    <section id="CommandLineOptions">
+      <title>Command Line Options</title>
+      <table>
+        <tr><th> Flag </th><th> Description </th><th> Notes </th></tr>
+
+        <tr><td><code>-p[rbugp]</code></td>
+            <td>Preserve<br/>
+                r: replication number<br/>
+                b: block size<br/>
+                u: user<br/>
+                g: group<br/>
+                p: permission<br/></td>
+            <td>Modification times are not preserved. Also, when
+            <code>-update</code> is specified, status updates will
+            <strong>not</strong> be synchronized unless the file sizes
+            also differ (i.e. unless the file is re-created).
+            </td></tr>
+        <tr><td><code>-i</code></td>
+            <td>Ignore failures</td>
+            <td>As explained in the Appendix, this option
+            will keep more accurate statistics about the copy than the
+            default case. It also preserves logs from failed copies, which
+            can be valuable for debugging. Finally, a failing map will not
+            cause the job to fail before all splits are attempted.
+            </td></tr>
+        <tr><td><code>-log &lt;logdir&gt;</code></td>
+            <td>Write logs to &lt;logdir&gt;</td>
+            <td>DistCp keeps logs of each file it attempts to copy as map
+            output. If a map fails, the log output will not be retained if
+            it is re-executed.
+            </td></tr>
+        <tr><td><code>-m &lt;num_maps&gt;</code></td>
+            <td>Maximum number of simultaneous copies</td>
+            <td>Specify the number of maps to copy data. Note that more maps
+            may not necessarily improve throughput.
+            </td></tr>
+        <tr><td><code>-overwrite</code></td>
+            <td>Overwrite destination</td>
+            <td>If a map fails and <code>-i</code> is not specified, all the
+            files in the split, not only those that failed, will be recopied.
+            As discussed in the Usage documentation, it also changes
+            the semantics for generating destination paths, so users should
+            use this carefully.
+            </td></tr>
+        <tr><td><code>-update</code></td>
+            <td>Overwrite if src size different from dst size</td>
+            <td>As noted in the preceding, this is not a &quot;sync&quot;
+            operation. The only criterion examined is the source and
+            destination file sizes; if they differ, the source file
+            replaces the destination file. As discussed in the
+            Usage documentation, it also changes the semantics for
+            generating destination paths, so users should use this carefully.
+            </td></tr>
+        <tr><td><code>-f &lt;urilist_uri&gt;</code></td>
+            <td>Use list at &lt;urilist_uri&gt; as src list</td>
+            <td>This is equivalent to listing each source on the command
+            line. The <code>urilist_uri</code> list should be a fully
+            qualified URI.
+            </td></tr>
+        <tr><td><code>-filelimit &lt;n&gt;</code></td>
+            <td>Limit the total number of files to be &lt;= n</td>
+            <td><strong>Deprecated!</strong> Ignored in the new DistCp.
+            </td></tr>
+        <tr><td><code>-sizelimit &lt;n&gt;</code></td>
+            <td>Limit the total size to be &lt;= n bytes</td>
+            <td><strong>Deprecated!</strong> Ignored in the new DistCp.
+            </td></tr>
+        <tr><td><code>-delete</code></td>
+            <td>Delete the files existing in the dst but not in src</td>
+            <td>The deletion is done by the FS Shell, so the trash will be used,
+                if it is enabled.
+            </td></tr>
+        <tr><td><code>-strategy {dynamic|uniformsize}</code></td>
+            <td>Choose the copy-strategy to be used in DistCp.</td>
+            <td>By default, uniformsize is used. (i.e. Maps are balanced on the
+                total size of files copied by each map. Similar to legacy.)
+                If "dynamic" is specified, <code>DynamicInputFormat</code> is
+                used instead. (This is described in the Architecture section,
+                under InputFormats.)
+            </td></tr>
+        <tr><td><code>-bandwidth</code></td>
+              <td>Specify bandwidth per map, in MB/second.</td>
+              <td>Each map will be restricted to consume only the specified
+                  bandwidth. This is not always exact. The map throttles back
+                  its bandwidth consumption during a copy, such that the
+                  <strong>net</strong> bandwidth used tends towards the
+                  specified value.
+              </td></tr>
+        <tr><td><code>-atomic {-tmp &lt;tmp_dir&gt;}</code></td>
+              <td>Specify atomic commit, with optional tmp directory.</td>
+              <td><code>-atomic</code> instructs DistCp to copy the source
+                  data to a temporary target location, and then move the
+                  temporary target to the final-location atomically. Data will
+                  either be available at final target in a complete and consistent
+                  form, or not at all.
+                  Optionally, <code>-tmp</code> may be used to specify the
+                  location of the tmp-target. If not specified, a default is
+                  chosen. <strong>Note:</strong> tmp_dir must be on the final
+                  target cluster.
+              </td></tr>
+        <tr><td><code>-mapredSslConf &lt;ssl_conf_file&gt;</code></td>
+              <td>Specify SSL Config file, to be used with HSFTP source</td>
+              <td>When using the hsftp protocol with a source, the
+                  security-related properties may be specified in a config-file and
+                  passed to DistCp. &lt;ssl_conf_file&gt; needs to be in
+                  the classpath.
+              </td></tr>
+        <tr><td><code>-async</code></td>
+              <td>Run DistCp asynchronously. Quits as soon as the Hadoop
+              Job is launched.</td>
+              <td>The Hadoop Job-id is logged, for tracking.
+              </td></tr>
+      </table>
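+      <p>As an illustration (paths are arbitrary), an update-copy that preserves
+      user, group and permission attributes, uses 20 maps, and writes per-file
+      logs might be invoked as:</p>
+      <p><code>bash$ hadoop distcp2 -update -pugp -m 20 -log /tmp/distcp-logs \</code><br/>
+         <code>                     hdfs://nn1:8020/src hdfs://nn2:8020/dst</code></p>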
+    </section>
+
+    <section id="ArchitectureOfDistCp">
+      <title>Architecture of DistCp</title>
+      <p>The components of the new DistCp may be classified into the following
+         categories: </p>
+
+      <ul>
+        <li>DistCp Driver</li>
+        <li>Copy-listing generator</li>
+        <li>Input-formats and Map-Reduce components</li>
+      </ul>
+
+      <section id="DistCpDriver">
+        <title>DistCp Driver</title>
+
+        <p>The DistCp Driver components are responsible for:</p>
+
+        <ul>
+          <li>Parsing the arguments passed to the DistCp command on the
+              command-line, via:
+            <ul>
+              <li>OptionsParser, and</li>
+              <li>DistCpOptionsSwitch</li>
+            </ul>
+          </li>
+          <li>Assembling the command arguments into an appropriate
+              DistCpOptions object, and initializing DistCp. These arguments
+              include:
+            <ul>
+              <li>Source-paths</li>
+              <li>Target location</li>
+              <li>Copy options (e.g. whether to update-copy, overwrite, which
+                  file-attributes to preserve, etc.)</li>
+            </ul>
+          </li>
+          <li>Orchestrating the copy operation by:
+            <ul>
+              <li>Invoking the copy-listing-generator to create the list of
+                  files to be copied.</li>
+              <li>Setting up and launching the Hadoop Map-Reduce Job to carry
+                  out the copy.</li>
+              <li>Based on the options, either returning a handle to the
+                  Hadoop MR Job immediately, or waiting till completion.</li>
+            </ul>
+          </li>
+        </ul>
+
+        <p>The parser-elements are exercised only from the command-line (or if
+           DistCp::run() is invoked). The DistCp class may also be used
+           programmatically, by constructing the DistCpOptions object, and
+           initializing a DistCp object appropriately.</p>
+      </section>
+
+      <section id="Copy-listingGenerator">
+        <title>Copy-listing Generator</title>
+
+        <p>The copy-listing-generator classes are responsible for creating the
+           list of files/directories to be copied from source. They examine
+           the contents of the source-paths (files/directories, including
+           wild-cards), and record all paths that need copy into a sequence-
+           file, for consumption by the DistCp Hadoop Job. The main classes in
+           this module include:</p>
+
+        <ol>
+          <li>CopyListing: The interface that should be implemented by any 
+              copy-listing-generator implementation. Also provides the factory
+              method by which the concrete CopyListing implementation is
+              chosen.</li>
+
+          <li>SimpleCopyListing: An implementation of CopyListing that accepts
+              multiple source paths (files/directories), and recursively lists
+              all the individual files and directories under each, for
+              copy.</li>
+
+          <li>GlobbedCopyListing: Another implementation of CopyListing that
+              expands wild-cards in the source paths.</li>
+
+          <li>FileBasedCopyListing: An implementation of CopyListing that
+              reads the source-path list from a specified file.</li>
+        </ol>
+
+        <p>Based on whether a source-file-list is specified in the
+           DistCpOptions, the source-listing is generated in one of the
+           following ways:</p>
+
+        <ol>
+          <li>If there's no source-file-list, the GlobbedCopyListing is used.
+              All wild-cards are expanded, and all the expansions are
+              forwarded to the SimpleCopyListing, which in turn constructs the
+              listing (via recursive descent of each path). </li>
+
+          <li>If a source-file-list is specified, the FileBasedCopyListing is
+              used. Source-paths are read from the specified file, and then
+              forwarded to the GlobbedCopyListing. The listing is then
+              constructed as described above.</li>
+        </ol>
+
+        <p>One may customize the method by which the copy-listing is
+           constructed by providing a custom implementation of the CopyListing
+           interface. The behaviour of DistCp differs here from the legacy
+           DistCp, in how paths are considered for copy. </p>
+
+        <p>The legacy implementation only lists those paths that must
+           definitely be copied on to target.
+           E.g. if a file already exists at the target (and -overwrite isn't
+           specified), the file isn't even considered in the Map-Reduce Copy
+           Job. Determining this during setup (i.e. before the Map-Reduce Job)
+           involves file-size and checksum-comparisons that are potentially
+           time-consuming.</p>
+
+        <p>The new DistCp postpones such checks until the Map-Reduce Job, thus
+           reducing setup time. Performance is enhanced further since these
+           checks are parallelized across multiple maps.</p>
+      </section>
+
+      <section id="Input-formatsAndMap-ReduceComponents">
+        <title>Input-formats and Map-Reduce Components</title>
+
+        <p> The Input-formats and Map-Reduce components are responsible for
+            the actual copy of files and directories from the source to the
+            destination path. The listing-file created during copy-listing
+            generation is consumed at this point, when the copy is carried
+            out. The classes of interest here include:</p>
+
+        <ul>
+          <li><strong>UniformSizeInputFormat:</strong> This implementation of
+              org.apache.hadoop.mapreduce.InputFormat provides equivalence
+              with Legacy DistCp in balancing load across maps.
+              The aim of the UniformSizeInputFormat is to make each map copy
+              roughly the same number of bytes. Apropos, the listing file is
+              split into groups of paths, such that the sum of file-sizes in
+              each InputSplit is nearly equal to every other map. The splitting
+              isn't always perfect, but its trivial implementation keeps the
+              setup-time low.</li>
+
+          <li><strong>DynamicInputFormat and DynamicRecordReader:</strong>
+              <p> The DynamicInputFormat implements org.apache.hadoop.mapreduce.InputFormat,
+              and is new to DistCp. The listing-file is split into several
+              "chunk-files", the exact number of chunk-files being a multiple
+              of the number of maps requested for in the Hadoop Job. Each map
+              task is "assigned" one of the chunk-files (by renaming the chunk
+              to the task's id), before the Job is launched.</p>
+
+              <p>Paths are read from each chunk using the DynamicRecordReader,
+              and processed in the CopyMapper. After all the paths in a chunk
+              are processed, the current chunk is deleted and a new chunk is
+              acquired. The process continues until no more chunks are
+              available.</p>
+              <p>This "dynamic" approach allows faster map-tasks to consume
+              more paths than slower ones, thus speeding up the DistCp job
+              overall. </p>
+          </li>
+
+          <li><strong>CopyMapper:</strong> This class implements the physical
+              file-copy. The input-paths are checked against the input-options
+              (specified in the Job's Configuration), to determine whether a
+              file needs copy. A file will be copied only if at least one of
+              the following is true:
+            <ul>
+              <li>A file with the same name doesn't exist at target.</li>
+              <li>A file with the same name exists at target, but has a
+                  different file size.</li>
+              <li>A file with the same name exists at target, but has a
+                  different checksum, and -skipcrccheck isn't mentioned.</li>
+              <li>A file with the same name exists at target, but -overwrite
+                  is specified.</li>
+              <li>A file with the same name exists at target, but differs in
+                  block-size (and block-size needs to be preserved).</li>
+            </ul>
+          </li>
+
+          <li><strong>CopyCommitter:</strong>
+              This class is responsible for the commit-phase of the DistCp
+              job, including:
+            <ul>
+              <li>Preservation of directory-permissions (if specified in the
+                  options)</li>
+              <li>Clean-up of temporary-files, work-directories, etc.</li>
+            </ul>
+          </li>
+        </ul>
+      </section>
+    </section>
+
+    <section id="Appendix">
+    <title>Appendix</title>
+
+      <section id="MapSizing">
+        <title>Map sizing</title>
+ 
+        <p> By default, DistCp makes an attempt to size each map comparably so
+        that each copies roughly the same number of bytes. Note that files are the
+        finest level of granularity, so increasing the number of simultaneous
+        copiers (i.e. maps) may not always increase the number of
+        simultaneous copies nor the overall throughput.</p>
+
+        <p> The new DistCp also provides a strategy to "dynamically" size maps,
+        allowing faster data-nodes to copy more bytes than slower nodes. Using
+        <code>-strategy dynamic</code> (explained in the Architecture section), rather
+        than assigning a fixed set of source-files to each map-task, files are
+        instead split into several sets. The number of sets exceeds the number of
+        maps, usually by a factor of 2-3. Each map picks up and copies all files
+        listed in a chunk. When a chunk is exhausted, a new chunk is acquired and
+        processed, until no more chunks remain.</p>
+
+        <p> By not assigning a source-path to a fixed map, faster map-tasks (i.e.
+        data-nodes) are able to consume more chunks, and thus copy more data,
+        than slower nodes. While this distribution isn't uniform, it is
+        <strong>fair</strong> with regard to each mapper's capacity.</p>
+
+        <p>The dynamic-strategy is implemented by the DynamicInputFormat. It
+        provides superior performance under most conditions. </p>
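+
+        <p>A sketch of such an invocation (paths and the map count are
+        arbitrary) would be:</p>
+        <p><code>bash$ hadoop distcp2 -strategy dynamic -m 128 hdfs://nn1:8020/src hdfs://nn2:8020/dst</code></p>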
+
+        <p>Tuning the number of maps to the size of the source and
+        destination clusters, the size of the copy, and the available
+        bandwidth is recommended for long-running and regularly run jobs.</p>
+      </section>
+
+      <section id="CopyingBetweenVersionsOfHDFS">
+        <title>Copying Between Versions of HDFS</title>
+
+        <p>For copying between two different versions of Hadoop, one will
+        usually use HftpFileSystem. This is a read-only FileSystem, so DistCp
+        must be run on the destination cluster (more specifically, on
+        TaskTrackers that can write to the destination cluster). Each source is
+        specified as <code>hftp://&lt;dfs.http.address&gt;/&lt;path&gt;</code>
+        (the default <code>dfs.http.address</code> is
+        &lt;namenode&gt;:50070).</p>
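+
+        <p>A sketch of such a copy, run from the destination cluster (host
+        names and paths are illustrative), would be:</p>
+        <p><code>bash$ hadoop distcp2 hftp://nn1.example.com:50070/foo/bar \</code><br/>
+           <code>                     hdfs://nn2.example.com:8020/bar/foo</code></p>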
+      </section>
+ 
+      <section id="MapReduceAndOtherSide-effects">
+        <title>Map/Reduce and other side-effects</title>
+
+        <p>As has been mentioned in the preceding, should a map fail to copy
+        one of its inputs, there will be several side-effects.</p>
+
+        <ul>
+          <li>Unless <code>-overwrite</code> is specified, files successfully
+          copied by a previous map on a re-execution will be marked as
+          &quot;skipped&quot;.</li>
+
+          <li>If a map fails <code>mapred.map.max.attempts</code> times, the
+          remaining map tasks will be killed (unless <code>-i</code> is
+          set).</li>
+
+          <li>If <code>mapred.speculative.execution</code> is set
+          <code>final</code> and <code>true</code>, the result of the copy is
+          undefined.</li>
+        </ul>
+      </section>
+
+      <section id="SSLConfigurationsForHSFTPSources">
+        <title>SSL Configurations for HSFTP sources</title>
+
+        <p>To use an HSFTP source (i.e. using the hsftp protocol), a Map-Red SSL
+        configuration file needs to be specified (via the <code>-mapredSslConf</code>
+        option). This must specify 3 parameters:</p>
+
+        <ul>
+          <li><code>ssl.client.truststore.location</code>: The local-filesystem
+           location of the trust-store file, containing the certificate for
+           the namenode.</li>
+
+          <li><code>ssl.client.truststore.type</code>: (Optional) The format of
+          the trust-store file.</li>
+
+          <li><code>ssl.client.truststore.password</code>: (Optional) Password
+          for the trust-store file.</li>
+        </ul>
+
+        <p>The following is an example of the contents of
+        a Map-Red SSL configuration file:</p>
+
+        <p> <br/> <code> &lt;configuration&gt; </code> </p>
+
+        <p> <br/> <code>&lt;property&gt; </code> </p>
+        <p> <code>&lt;name&gt;ssl.client.truststore.location&lt;/name&gt; </code> </p>
+        <p> <code>&lt;value&gt;/work/keystore.jks&lt;/value&gt; </code> </p>
+        <p> <code>&lt;description&gt;Truststore to be used by clients like distcp. Must be specified. &lt;/description&gt;</code> </p>
+        <p> <br/> <code>&lt;/property&gt; </code> </p>
+
+        <p><code> &lt;property&gt; </code> </p>
+        <p> <code>&lt;name&gt;ssl.client.truststore.password&lt;/name&gt; </code> </p>
+        <p> <code>&lt;value&gt;changeme&lt;/value&gt; </code> </p>
+        <p> <code>&lt;description&gt;Optional. Default value is "". &lt;/description&gt;  </code> </p>
+        <p> <code>&lt;/property&gt; </code>  </p>
+
+        <p> <br/> <code> &lt;property&gt; </code> </p>
+        <p> <code> &lt;name&gt;ssl.client.truststore.type&lt;/name&gt;</code>  </p>
+        <p> <code> &lt;value&gt;jks&lt;/value&gt;</code>  </p>
+        <p> <code> &lt;description&gt;Optional. Default value is "jks". &lt;/description&gt;</code>  </p>
+        <p> <code> &lt;/property&gt; </code> </p>
+
+        <p> <code> &lt;/configuration&gt; </code> </p>
+
+        <p><br/>The SSL configuration file must be in the class-path of the 
+        DistCp program.</p>
+      </section>
+    </section>
+
+    <section id="FrequentlyAskedQuestions">
+    <title>Frequently Asked Questions</title>
+  
+    <ol>
+      <li><strong>Why does -update not create the parent source-directory under
+          a pre-existing target directory?</strong>
+
+        <p>The behaviour of <code>-update</code> and <code>-overwrite</code>
+        is described in detail in the Usage section of this document. In short,
+        if either option is used with a pre-existing destination directory, the
+        <strong>contents</strong> of each source directory are copied over, rather
+        than the source-directory itself.
+        This behaviour is consistent with the legacy DistCp implementation as well.
+        </p>
+      </li>
+
+      <li><strong>How does the new DistCp differ in semantics from the Legacy
+      DistCp?</strong>
+
+        <ul>
+          <li>Files that are skipped during copy used to also have their
+          file-attributes (permissions, owner/group info, etc.) unchanged,
+          when copied with Legacy DistCp. These are now updated, even if
+          the file-copy is skipped.</li>
+          <li>Empty root directories among the source-path inputs were not
+          created at the target, in Legacy DistCp. These are now created.</li>
+        </ul>
+      </li>
+
+      <li><strong>Why does the new DistCp use more maps than legacy DistCp?</strong>
+        <p>Legacy DistCp works by figuring out what files need to be actually
+        copied to target <strong>before</strong> the copy-job is launched, and then
+        launching as many maps as required for copy. So if a majority of the files
+        need to be skipped (because they already exist, for example), fewer maps
+        will be needed. As a consequence, the time spent in setup (i.e. before the
+        M/R job) is higher.</p>
+        <p>The new DistCp calculates only the contents of the source-paths. It
+        doesn't try to filter out what files can be skipped. That decision is put
+        off till the M/R job runs. This is much faster (vis-a-vis execution-time),
+        but the number of maps launched will be as specified in the <code>-m</code>
+        option, or 20 (default) if unspecified.</p>
+      </li>
+
+      <li><strong>Why does DistCp not run faster when more maps are specified?</strong>
+        <p>At present, the smallest unit of work for DistCp is a file. i.e.,
+        a file is processed by only one map. Increasing the number of maps to
+        a value exceeding the number of files would yield no performance
+        benefit. The number of maps launched would equal the number of files.</p>
+      </li>
+
+      <li><strong>Why does DistCp run out of memory?</strong>
+        <p>If the number of individual files/directories being copied from
+        the source path(s) is extremely large (e.g. 1,000,000 paths), DistCp might
+        run out of memory while determining the list of paths for copy. This is
+        not unique to the new DistCp implementation.</p>
+        <p>To get around this, consider changing the <code>-Xmx</code> JVM
+        heap-size parameters, as follows:</p>
+        <p><code>bash$ export HADOOP_CLIENT_OPTS="-Xms64m -Xmx1024m"</code></p>
+        <p><code>bash$ hadoop distcp2 /source /target</code></p>
+      </li>
+    </ol>
+    </section>
+
+  </body>
+
+</document>

Modified: hadoop/common/branches/branch-1-win/src/docs/src/documentation/content/xdocs/file_system_shell.xml
URL: http://svn.apache.org/viewvc/hadoop/common/branches/branch-1-win/src/docs/src/documentation/content/xdocs/file_system_shell.xml?rev=1495297&r1=1495296&r2=1495297&view=diff
==============================================================================
--- hadoop/common/branches/branch-1-win/src/docs/src/documentation/content/xdocs/file_system_shell.xml (original)
+++ hadoop/common/branches/branch-1-win/src/docs/src/documentation/content/xdocs/file_system_shell.xml Fri Jun 21 06:37:27 2013
@@ -412,7 +412,7 @@
 				<code>Usage: hdfs dfs -rm [-skipTrash] URI [URI &#x2026;] </code>
 			</p>
 			<p>
-	   Delete files specified as args. Only deletes non empty directory and files. If the <code>-skipTrash</code> option
+	   Delete files specified as args. Only deletes files. If the <code>-skipTrash</code> option
 	   is specified, the trash, if enabled, will be bypassed and the specified file(s) deleted immediately.  	This can be
 		   useful when it is necessary to delete files from an over-quota directory.
 	   Refer to rmr for recursive deletes.<br/>
@@ -420,7 +420,7 @@
 	   </p>
 			<ul>
 				<li>
-					<code> hdfs dfs -rm hdfs://nn.example.com/file /user/hadoop/emptydir </code>
+					<code> hdfs dfs -rm hdfs://nn.example.com/file </code>
 				</li>
 			</ul>
 			<p>Exit Code:</p>
@@ -436,7 +436,7 @@
 			<p>
 				<code>Usage: hdfs dfs -rmr [-skipTrash] URI [URI &#x2026;]</code>
 			</p>
-			<p>Recursive version of delete. If the <code>-skipTrash</code> option
+			<p>Recursive version of delete. The rmr command recursively deletes the directory and any content under it. If the <code>-skipTrash</code> option
 		   is specified, the trash, if enabled, will be bypassed and the specified file(s) deleted immediately. This can be
 		   useful when it is necessary to delete files from an over-quota directory.<br/>
 	   Example:

Added: hadoop/common/branches/branch-1-win/src/docs/src/documentation/content/xdocs/hdfs_imageviewer.xml
URL: http://svn.apache.org/viewvc/hadoop/common/branches/branch-1-win/src/docs/src/documentation/content/xdocs/hdfs_imageviewer.xml?rev=1495297&view=auto
==============================================================================
--- hadoop/common/branches/branch-1-win/src/docs/src/documentation/content/xdocs/hdfs_imageviewer.xml (added)
+++ hadoop/common/branches/branch-1-win/src/docs/src/documentation/content/xdocs/hdfs_imageviewer.xml Fri Jun 21 06:37:27 2013
@@ -0,0 +1,427 @@
+<?xml version="1.0"?>
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+-->
+
+<!DOCTYPE document PUBLIC "-//APACHE//DTD Documentation V2.0//EN" "http://forrest.apache.org/dtd/document-v20.dtd">
+
+<document>
+
+  <header>
+    <title>Offline Image Viewer Guide</title>
+  </header>
+
+  <body>
+
+    <section>
+      <title>Overview</title>
+
+      <p>The Offline Image Viewer is a tool to dump the contents of HDFS
+      fsimage files to human-readable formats in order to allow offline analysis
+      and examination of a Hadoop cluster's namespace. The tool is able to
+      process very large image files relatively quickly, converting them to
+      one of several output formats. The tool handles the layout formats that
+      were included with Hadoop versions 16 and up. If the tool is not able to
+      process an image file, it will exit cleanly. The Offline Image Viewer does not require
+      a Hadoop cluster to be running; it is entirely offline in its operation.</p>
+
+      <p>The Offline Image Viewer provides several output processors:</p>
+        <ol>
+        <li><strong>Ls</strong> is the default output processor. It closely mimics the format of
+          the <code>lsr </code> command. It includes the same fields, in the same order, as
+          <code>lsr </code>: directory or file flag, permissions, replication, owner, group,
+          file size, modification date, and full path. Unlike the <code>lsr </code> command,
+          the root path is included. One important difference between the output
+          of the <code>lsr</code> command and this processor is that this output is not sorted
+          by directory name and contents. Rather, the files are listed in the
+          order in which they are stored in the fsimage file. Therefore, it is
+          not possible to directly compare the output of the <code>lsr</code> command and
+          this tool. The Ls processor uses information contained within the Inode blocks to
+          calculate file sizes and ignores the <code>-skipBlocks</code> option.</li>
+        <li><strong>Indented</strong> provides a more complete view of the fsimage's contents,
+          including all of the information included in the image, such as image
+          version, generation stamp and inode- and block-specific listings. This
+          processor uses indentation to organize the output in a hierarchical manner.
+          This format is suitable for easy human comprehension.</li>
+        <li><strong>Delimited</strong> provides one file per line consisting of the path,
+        replication, modification time, access time, block size, number of blocks, file size,
+        namespace quota, diskspace quota, permissions, username and group name. If run against
+        an fsimage that does not contain any of these fields, the field's column will be included,
+        but no data recorded. The default record delimiter is a tab, but this may be changed
+        via the <code>-delimiter</code> command line argument. This processor is designed to
+        create output that is easily analyzed by other tools, such as <a href="http://hadoop.apache.org/pig/">Apache Pig</a>. 
+        See the <a href="#analysis">Analyzing Results</a> section
+        for further information on using this processor to analyze the contents of fsimage files.</li>
+        <li><strong>XML</strong> creates an XML document of the fsimage and includes all of the
+          information within the fsimage, similar to the <code>Indented</code> processor. The output
+          of this processor is amenable to automated processing and analysis with XML tools.
+          Due to the verbosity of the XML syntax, this processor will also generate
+          the largest amount of output.</li>
+        <li><strong>FileDistribution</strong> is the tool for analyzing file 
+          sizes in the namespace image. In order to run the tool one should 
+          define a range of integers <code>[0, maxSize]</code> by specifying
+          <code>maxSize</code> and a <code>step</code>.
+          The range of integers is divided into segments of size
+          <code>step</code>:
+          <code>[0, s</code><sub>1</sub><code>, ..., s</code><sub>n-1</sub><code>, maxSize]</code>, 
+          and the processor calculates how many files in the system fall into 
+          each segment <code>[s</code><sub>i-1</sub><code>, s</code><sub>i</sub><code>)</code>.
+          Note that files larger than <code>maxSize</code> always fall into 
+          the very last segment.
+          The output file is formatted as a tab-separated two-column table,
+          Size and NumFiles, where Size is the start of the segment and
+          NumFiles is the number of files from the image whose size falls
+          into that segment. (A sample invocation is shown after this list.)</li>
+        </ol>
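+
+      <p>As a quick sketch of a FileDistribution run (the <code>-maxSize</code> and
+      <code>-step</code> flags are assumed from the description above and do not appear in the
+      option table later in this document; the values here are illustrative):</p>
+
+      <p><code>bash$ bin/hadoop oiv -i fsimage -o fsimage.dist -p FileDistribution -maxSize 1073741824 -step 1048576</code><br/></p>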
+
+    </section> <!-- overview -->
+
+    <section>
+      <title>Usage</title>
+
+      <section>
+        <title>Basic</title>
+        <p>The simplest usage of the Offline Image Viewer is to provide just an input and output
+          file, via the <code>-i</code> and <code>-o</code> command-line switches:</p>
+
+        <p><code>bash$ bin/hadoop oiv -i fsimage -o fsimage.txt</code><br/></p>
+
+        <p>This will create a file named fsimage.txt in the current directory using
+        the Ls output processor.  For very large image files, this process may take
+        several minutes.</p>
+
+        <p>One can specify which output processor to use via the command-line switch <code>-p</code>.
+        For instance:</p>
+        <p><code>bash$ bin/hadoop oiv -i fsimage -o fsimage.xml -p XML</code><br/></p>
+
+        <p>or</p>
+
+        <p><code>bash$ bin/hadoop oiv -i fsimage -o fsimage.txt -p Indented</code><br/></p>
+
+        <p>This will run the tool using either the XML or Indented output processor,
+        respectively.</p>
+
+        <p>One command-line option worth considering is <code>-skipBlocks</code>, which
+        prevents the tool from explicitly enumerating all of the blocks that make up
+        a file in the namespace. This is useful for file systems that have very large
+        files. Enabling this option can significantly decrease the size of the resulting
+        output, as individual blocks are not included. Note, however, that the Ls processor
+        needs to enumerate the blocks and so overrides this option.</p>
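+
+        <p>For example, one way to combine <code>-skipBlocks</code> with the XML processor
+        (file names are illustrative) is:</p>
+
+        <p><code>bash$ bin/hadoop oiv -i fsimage -o fsimage.xml -p XML -skipBlocks</code><br/></p>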
+
+      </section> <!-- Basic -->
+      <section id="Example">
+        <title>Example</title>
+
+<p>Consider the following contrived namespace:</p>
+<source>
+drwxr-xr-x   - theuser supergroup          0 2009-03-16 21:17 /anotherDir 
+
+-rw-r--r--   3 theuser supergroup  286631664 2009-03-16 21:15 /anotherDir/biggerfile 
+
+-rw-r--r--   3 theuser supergroup       8754 2009-03-16 21:17 /anotherDir/smallFile 
+
+drwxr-xr-x   - theuser supergroup          0 2009-03-16 21:11 /mapredsystem 
+
+drwxr-xr-x   - theuser supergroup          0 2009-03-16 21:11 /mapredsystem/theuser 
+
+drwxr-xr-x   - theuser supergroup          0 2009-03-16 21:11 /mapredsystem/theuser/mapredsystem 
+
+drwx-wx-wx   - theuser supergroup          0 2009-03-16 21:11 /mapredsystem/theuser/mapredsystem/ip.redacted.com 
+
+drwxr-xr-x   - theuser supergroup          0 2009-03-16 21:12 /one 
+
+drwxr-xr-x   - theuser supergroup          0 2009-03-16 21:12 /one/two 
+
+drwxr-xr-x   - theuser supergroup          0 2009-03-16 21:16 /user 
+
+drwxr-xr-x   - theuser supergroup          0 2009-03-16 21:19 /user/theuser 
+</source>          
+
+<p>Applying the Offline Image Viewer to this file with default options would result in the following output:</p>
+<source>
+machine:hadoop-0.21.0-dev theuser$ bin/hadoop oiv -i fsimagedemo -o fsimage.txt 
+
+drwxr-xr-x  -   theuser supergroup            0 2009-03-16 14:16 / 
+
+drwxr-xr-x  -   theuser supergroup            0 2009-03-16 14:17 /anotherDir 
+
+drwxr-xr-x  -   theuser supergroup            0 2009-03-16 14:11 /mapredsystem 
+
+drwxr-xr-x  -   theuser supergroup            0 2009-03-16 14:12 /one 
+
+drwxr-xr-x  -   theuser supergroup            0 2009-03-16 14:16 /user 
+
+-rw-r--r--  3   theuser supergroup    286631664 2009-03-16 14:15 /anotherDir/biggerfile 
+
+-rw-r--r--  3   theuser supergroup         8754 2009-03-16 14:17 /anotherDir/smallFile 
+
+drwxr-xr-x  -   theuser supergroup            0 2009-03-16 14:11 /mapredsystem/theuser 
+
+drwxr-xr-x  -   theuser supergroup            0 2009-03-16 14:11 /mapredsystem/theuser/mapredsystem 
+
+drwx-wx-wx  -   theuser supergroup            0 2009-03-16 14:11 /mapredsystem/theuser/mapredsystem/ip.redacted.com 
+
+drwxr-xr-x  -   theuser supergroup            0 2009-03-16 14:12 /one/two 
+
+drwxr-xr-x  -   theuser supergroup            0 2009-03-16 14:19 /user/theuser 
+</source>
+
+<p>Similarly, applying the Indented processor would generate output that begins with:</p>
+<source>
+machine:hadoop-0.21.0-dev theuser$ bin/hadoop oiv -i fsimagedemo -p Indented -o fsimage.txt 
+
+FSImage 
+
+  ImageVersion = -19 
+
+  NamespaceID = 2109123098 
+
+  GenerationStamp = 1003 
+
+  INodes [NumInodes = 12] 
+
+    Inode 
+
+      INodePath =  
+
+      Replication = 0 
+
+      ModificationTime = 2009-03-16 14:16 
+
+      AccessTime = 1969-12-31 16:00 
+
+      BlockSize = 0 
+
+      Blocks [NumBlocks = -1] 
+
+      NSQuota = 2147483647 
+
+      DSQuota = -1 
+
+      Permissions 
+
+        Username = theuser 
+
+        GroupName = supergroup 
+
+        PermString = rwxr-xr-x 
+
+   remaining output omitted
+</source>          
+          
+      </section> <!-- example-->
+
+    </section>
+
+    <section id="options">
+        <title>Options</title>
+
+        <section>
+        <title>Option Index</title>
+        <table>
+          <tr><th> Flag </th><th> Description </th></tr>
+          <tr><td><code>[-i|--inputFile] &lt;input file&gt;</code></td>
+              <td>Specify the input fsimage file to process. Required.</td></tr>
+          <tr><td><code>[-o|--outputFile] &lt;output file&gt;</code></td>
+              <td>Specify the output filename, if the specified output processor
+              generates one. If the specified file already exists, it is silently overwritten. Required.
+              </td></tr>
+          <tr><td><code>[-p|--processor] &lt;processor&gt;</code></td>
+                  <td>Specify the image processor to apply against the image file. Currently
+                    valid options are Ls (default), XML, Delimited, Indented, and FileDistribution.
+                  </td></tr>
+          <tr><td><code>-skipBlocks</code></td>
+              <td>Do not enumerate individual blocks within files. This may save processing time
+              and output file space on namespaces with very large files. The <code>Ls</code> processor reads
+              the blocks to correctly determine file sizes and ignores this option.</td></tr>
+          <tr><td><code>-printToScreen</code></td>
+              <td>Pipe output of processor to console as well as specified file. On extremely 
+              large namespaces, this may increase processing time by an order of magnitude.</td></tr>
+           <tr><td><code>-delimiter &lt;arg&gt;</code></td>
+                  <td>When used in conjunction with the Delimited processor, replaces the default
+	                    tab delimiter with the string specified by <code>arg</code>.</td></tr>
+          <tr><td><code>[-h|--help]</code></td>
+              <td>Display the tool usage and help information and exit.</td></tr>
+            </table>
+          </section> <!-- options -->
+    </section>
+   
+    <section id="analysis">
+      <title>Analyzing Results</title>
+      <p>The Offline Image Viewer makes it easy to gather large amounts of data about the HDFS namespace.
+        This information can then be used to explore file system usage patterns, to find
+        specific files that match arbitrary criteria, and for other kinds of namespace analysis. The Delimited
+        image processor in particular creates
+        output that is amenable to further processing by tools such as <a href="http://hadoop.apache.org/pig/">Apache Pig</a>. Pig is a particularly
+        good choice for analyzing these data, as it can handle the output generated from a small fsimage
+        but also scales up to consume data from extremely large file systems.</p>
+      <p>The Delimited image processor generates one line of text per file, with fields separated, by default, by tabs, and includes
+        all of the fields that are common between completed files and files that were still under construction
+        when the fsimage was generated. Example scripts are provided demonstrating how to use this output to
+        accomplish three tasks: determine the number of files each user has created on the file system,
+        find files that were created but never accessed, and find probable duplicates of large files by comparing
+        the size of each file.</p>
+      <p>Each of the following scripts assumes you have used the Delimited processor to generate an output file named
+        <code>foo</code> and will store the results of the Pig analysis in a file named <code>results</code>.</p>
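+      <p>Such a file can be generated with the Delimited processor, for example:</p>
+      <p><code>bash$ bin/hadoop oiv -i fsimage -o foo -p Delimited</code><br/></p>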
+      <section>
+      <title>Total Number of Files for Each User</title>
+      <p>This script processes each path within the namespace, groups the paths by file owner, and determines the total
+      number of files each user owns.</p>
+      <p><strong>numFilesOfEachUser.pig:</strong></p>
+        <source>
+-- This script determines the total number of files each user has in
+-- the namespace. Its output is of the form:
+--   username, totalNumFiles
+
+-- Load all of the fields from the file
+A = LOAD '$inputFile' USING PigStorage('\t') AS (path:chararray,
+                                                 replication:int,
+                                                 modTime:chararray,
+                                                 accessTime:chararray,
+                                                 blockSize:long,
+                                                 numBlocks:int,
+                                                 fileSize:long,
+                                                 NamespaceQuota:int,
+                                                 DiskspaceQuota:int,
+                                                 perms:chararray,
+                                                 username:chararray,
+                                                 groupname:chararray);
+
+
+-- Grab just the path and username
+B = FOREACH A GENERATE path, username;
+
+-- Generate the sum of the number of paths for each user
+C = FOREACH (GROUP B BY username) GENERATE group, COUNT(B.path);
+
+-- Save results
+STORE C INTO '$outputFile';
+        </source>
+      <p>This script can be run with Pig using the following command:</p>
+      <p><code>bin/pig -x local -param inputFile=../foo -param outputFile=../results ../numFilesOfEachUser.pig</code><br/></p>
+      <p>The output file's content will be similar to that below:</p>
+      <p>
+        <code>bart  1</code><br/>
+        <code>lisa  16</code><br/>
+        <code>homer 28</code><br/>
+        <code>marge 2456</code><br/>
+      </p>
+      </section>
+      
+      <section><title>Files That Have Never Been Accessed</title>
+      <p>This script finds files that were created but whose access times were never changed, meaning they were never opened or viewed.</p>
+            <p><strong>neverAccessed.pig:</strong></p>
+      <source>
+-- This script generates a list of files that were created but never
+-- accessed, based on their AccessTime
+
+-- Load all of the fields from the file
+A = LOAD '$inputFile' USING PigStorage('\t') AS (path:chararray,
+                                                 replication:int,
+                                                 modTime:chararray,
+                                                 accessTime:chararray,
+                                                 blockSize:long,
+                                                 numBlocks:int,
+                                                 fileSize:long,
+                                                 NamespaceQuota:int,
+                                                 DiskspaceQuota:int,
+                                                 perms:chararray,
+                                                 username:chararray,
+                                                 groupname:chararray);
+
+-- Grab just the path and last time the file was accessed
+B = FOREACH A GENERATE path, accessTime;
+
+-- Drop all the paths that don't have the default assigned last-access time
+C = FILTER B BY accessTime == '1969-12-31 16:00';
+
+-- Drop the accessTimes, since they're all the same
+D = FOREACH C GENERATE path;
+
+-- Save results
+STORE D INTO '$outputFile';
+      </source>
+      <p>This script can be run with Pig using the following command; the output file will contain a list of files that were created but never read afterwards.</p>
+      <p><code>bin/pig -x local -param inputFile=../foo -param outputFile=../results ../neverAccessed.pig</code><br/></p>
+      </section>
+      <section><title>Probable Duplicated Files Based on File Size</title>
+      <p>This script groups files together based on their size, drops any that are smaller than 100 MB, and returns a list of the file size, the number of files found, and a tuple of the file paths.  This can be used to find likely duplicates within the filesystem namespace.</p>
+      
+            <p><strong>probableDuplicates.pig:</strong></p>
+      <source>
+-- This script finds probable duplicate files greater than 100 MB by
+-- grouping together files based on their byte size. Files of this size
+-- with exactly the same number of bytes can be considered probable
+-- duplicates, but should be checked further, either by comparing the
+-- contents directly or by another proxy, such as a hash of the contents.
+-- The script's output is of the form:
+--    fileSize numProbableDuplicates {(probableDup1), (probableDup2)}
+
+-- Load all of the fields from the file
+A = LOAD '$inputFile' USING PigStorage('\t') AS (path:chararray,
+                                                 replication:int,
+                                                 modTime:chararray,
+                                                 accessTime:chararray,
+                                                 blockSize:long,
+                                                 numBlocks:int,
+                                                 fileSize:long,
+                                                 NamespaceQuota:int,
+                                                 DiskspaceQuota:int,
+                                                 perms:chararray,
+                                                 username:chararray,
+                                                 groupname:chararray);
+
+-- Grab the pathname and filesize
+B = FOREACH A generate path, fileSize;
+
+-- Drop files smaller than 100 MB
+C = FILTER B by fileSize > 100L  * 1024L * 1024L;
+
+-- Gather all the files of the same byte size
+D = GROUP C by fileSize;
+
+-- Generate path, num of duplicates, list of duplicates
+E = FOREACH D generate group AS fileSize, COUNT(C) as numDupes, C.path AS files;
+
+-- Drop all the groups that contain only one file
+F = FILTER E by numDupes > 1L;
+
+-- Sort by the size of the files
+G = ORDER F by fileSize;
+
+-- Save results
+STORE G INTO '$outputFile';
+      </source>
+      <p>This script can be run with Pig using the following command:</p>
+      <p><code>bin/pig -x local -param inputFile=../foo -param outputFile=../results ../probableDuplicates.pig</code><br/></p>
+      <p> The output file's content will be similar to that below:</p>
+      
+<source>
+1077288632 2 {(/user/tennant/work1/part-00501),(/user/tennant/work1/part-00993)} 
+1077288664 4 {(/user/tennant/work0/part-00567),(/user/tennant/work0/part-03980),(/user/tennant/work1/part-00725),(/user/eccelston/output/part-03395)} 
+1077288668 3 {(/user/tennant/work0/part-03705),(/user/tennant/work0/part-04242),(/user/tennant/work1/part-03839)} 
+1077288698 2 {(/user/tennant/work0/part-00435),(/user/eccelston/output/part-01382)} 
+1077288702 2 {(/user/tennant/work0/part-03864),(/user/eccelston/output/part-03234)} 
+</source>      
+      <p>Each line includes the file size in bytes that was found to be duplicated, the number of duplicates found, and a list of the duplicated paths. 
+      Files smaller than 100 MB are ignored; at these larger sizes, files with exactly the same byte count are reasonably likely to be duplicates.</p>
+      </section>
+    </section>
+
+
+  </body>
+
+</document>

Modified: hadoop/common/branches/branch-1-win/src/docs/src/documentation/content/xdocs/hdfs_user_guide.xml
URL: http://svn.apache.org/viewvc/hadoop/common/branches/branch-1-win/src/docs/src/documentation/content/xdocs/hdfs_user_guide.xml?rev=1495297&r1=1495296&r2=1495297&view=diff
==============================================================================
--- hadoop/common/branches/branch-1-win/src/docs/src/documentation/content/xdocs/hdfs_user_guide.xml (original)
+++ hadoop/common/branches/branch-1-win/src/docs/src/documentation/content/xdocs/hdfs_user_guide.xml Fri Jun 21 06:37:27 2013
@@ -403,8 +403,32 @@
       the delegation token file. 
       For command usage, see <a href="commands_manual.html#fetchdt"><code>fetchdt</code> command</a>. 
      </p>
-     
-   </section> <section> <title> Upgrade and Rollback </title>
+     </section>
+     <section> <title>Recovery Mode</title>
+       <p>Typically, you will configure multiple metadata storage locations.
+       Then, if one storage location is corrupt, you can read the
+       metadata from one of the other storage locations.</p>
+
+       <p>However, what can you do if the only storage locations available are
+       corrupt?  In this case, there is a special NameNode startup mode called
+       Recovery mode that may allow you to recover most of your data.</p>
+
+       <p>You can start the NameNode in recovery mode like so:
+        <code>namenode -recover</code></p>
+
+        <p>When in recovery mode, the NameNode will interactively prompt you at
+       the command line about possible courses of action you can take to
+       recover your data.</p>
+
+       <p>If you don't want to be prompted, you can give the
+       <code>-force</code> option.  This option will force
+       recovery mode to always select the first choice.  Normally, this
+       will be the most reasonable choice.</p>
+
+       <p>Because Recovery mode can cause you to lose data, you should always
+       back up your edit log and fsimage before using it.</p>
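+
+       <p>A minimal sketch of this workflow is shown below; the metadata directory path is an
+       illustrative assumption, so substitute your configured <code>dfs.name.dir</code>
+       locations:</p>
+<source>
+# Back up the existing NameNode metadata before attempting recovery
+cp -r /data/dfs/name /data/dfs/name.bak
+
+# Start the NameNode in Recovery mode and answer the prompts
+bin/hadoop namenode -recover
+</source>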
+     </section>
+     <section> <title> Upgrade and Rollback </title>
      <p>
       When Hadoop is upgraded on an existing cluster, as with any
       software upgrade, it is possible there are new bugs or

Added: hadoop/common/branches/branch-1-win/src/docs/src/documentation/content/xdocs/hftp.xml
URL: http://svn.apache.org/viewvc/hadoop/common/branches/branch-1-win/src/docs/src/documentation/content/xdocs/hftp.xml?rev=1495297&view=auto
==============================================================================
--- hadoop/common/branches/branch-1-win/src/docs/src/documentation/content/xdocs/hftp.xml (added)
+++ hadoop/common/branches/branch-1-win/src/docs/src/documentation/content/xdocs/hftp.xml Fri Jun 21 06:37:27 2013
@@ -0,0 +1,69 @@
+<?xml version="1.0"?>
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+-->
+
+<!DOCTYPE document PUBLIC "-//APACHE//DTD Documentation V2.0//EN" "http://forrest.apache.org/dtd/document-v20.dtd">
+
+
+<document>
+
+  <header>
+    <title> HFTP Guide</title>
+  </header>
+
+  <body>
+    <section>
+      <title> Introduction </title>
+      <p> HFTP is a Hadoop filesystem implementation that lets you read data from a remote Hadoop HDFS cluster. 
+      The reads are done via HTTP, and data is sourced from DataNodes. 
+      HFTP is a read-only filesystem, and will throw exceptions if you try to use it to write data or modify
+      the filesystem state.</p>     
+
+      <p>HFTP is primarily useful if you have multiple HDFS clusters with different versions and you need to move data from one to another. HFTP is wire-compatible even between different versions of HDFS. For example, you can do things like:
+      <code>hadoop distcp -i hftp://sourceFS:50070/src hdfs://destFS:50070/dest</code>. Note that HFTP is read-only, so the destination must be an HDFS filesystem. (Also, in this example, the <code>distcp</code> should be run using the configuration of the new filesystem.)</p>
+      
+      <p>An extension, HSFTP, uses HTTPS by default. This means that data will be encrypted in transit.</p>
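+
+      <p>For instance, the same copy over HSFTP might look like
+      <code>hadoop distcp -i hsftp://sourceFS:50470/src hdfs://destFS:50070/dest</code>,
+      assuming the remote cluster uses the default HTTPS port of 50470.</p>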
+    </section>
+    
+    <section>
+      <title>Implementation</title>
+      <p>The code for HFTP lives in the Java class <code>org.apache.hadoop.hdfs.HftpFileSystem</code>. Likewise, 
+      HSFTP is implemented in <code>org.apache.hadoop.hdfs.HsftpFileSystem</code>.
+      </p>
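+      <p>As an illustrative sketch (the host, port, and path below are assumptions, not part of
+      the Hadoop sources), an HFTP URI can be used through the standard <code>FileSystem</code>
+      API like any other filesystem scheme:</p>
+<source>
+import java.net.URI;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FSDataInputStream;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.IOUtils;
+
+public class HftpCat {
+  public static void main(String[] args) throws Exception {
+    Configuration conf = new Configuration();
+    // The hftp:// scheme selects HftpFileSystem; the port is the remote NameNode's HTTP port.
+    FileSystem fs = FileSystem.get(URI.create("hftp://namenode.example.com:50070/"), conf);
+    // Open a file for reading; writes would fail because HFTP is read-only.
+    FSDataInputStream in = fs.open(new Path("/user/theuser/data.txt"));
+    // Copy the remote file's bytes to standard output and close the stream when done.
+    IOUtils.copyBytes(in, System.out, conf, true);
+  }
+}
+</source>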
+    </section>
+    
+    <section>
+      <title> Configuration Options </title>
+      <table>
+        <tr>
+          <th>Name</th>
+          <th>Description</th>
+        </tr>
+        <tr>
+          <td>dfs.hftp.https.port</td>
+          <td>The HTTPS port on the remote cluster. If not set, HFTP will fall back on
+          <code>dfs.https.port</code>.</td>
+        </tr>
+        <tr>
+          <td>hdfs.service.host_<strong>ip:port</strong></td>
+          <td>Specifies the service name (for the security subsystem) associated with the HFTP filesystem
+          running at <strong>ip:port</strong>.</td>
+        </tr>
+      </table>     
+    </section>
+  </body>
+</document>

Modified: hadoop/common/branches/branch-1-win/src/docs/src/documentation/content/xdocs/index.xml
URL: http://svn.apache.org/viewvc/hadoop/common/branches/branch-1-win/src/docs/src/documentation/content/xdocs/index.xml?rev=1495297&r1=1495296&r2=1495297&view=diff
==============================================================================
--- hadoop/common/branches/branch-1-win/src/docs/src/documentation/content/xdocs/index.xml (original)
+++ hadoop/common/branches/branch-1-win/src/docs/src/documentation/content/xdocs/index.xml Fri Jun 21 06:37:27 2013
@@ -30,6 +30,8 @@
 <p>The Hadoop documentation includes the information you need to get started using Hadoop. 
 Begin with the <a href="single_node_setup.html">Single Node Setup</a> which shows you how to set up a single-node Hadoop installation. 
 Then move on to the <a href="cluster_setup.html">Cluster Setup</a> to learn how to set up a multi-node Hadoop installation. 
+Users interested in quickly setting up a Hadoop cluster for experimentation and testing may also want to see the
+<a href="cli_minicluster.html">CLI MiniCluster</a> guide.
 </p>
    </section>
   

Modified: hadoop/common/branches/branch-1-win/src/docs/src/documentation/content/xdocs/site.xml
URL: http://svn.apache.org/viewvc/hadoop/common/branches/branch-1-win/src/docs/src/documentation/content/xdocs/site.xml?rev=1495297&r1=1495296&r2=1495297&view=diff
==============================================================================
--- hadoop/common/branches/branch-1-win/src/docs/src/documentation/content/xdocs/site.xml (original)
+++ hadoop/common/branches/branch-1-win/src/docs/src/documentation/content/xdocs/site.xml Fri Jun 21 06:37:27 2013
@@ -35,6 +35,7 @@ See http://forrest.apache.org/docs/linki
     <overview label="Overview"  href="index.html" />  
     <single   label="Single Node Setup"  href="single_node_setup.html" />
     <cluster  label="Cluster Setup"  href="cluster_setup.html" />
+    <cluster  label="CLI MiniCluster"  href="cli_minicluster.html" />
   </docs>  
 
    <docs label="Guides">
@@ -46,6 +47,7 @@ See http://forrest.apache.org/docs/linki
     <streaming  label="Hadoop Streaming"  href="streaming.html" />
     <commands label="Hadoop Commands"  href="commands_manual.html" />
     <distcp         label="DistCp"  href="distcp.html" />
+    <distcp2        label="DistCp Version 2"  href="distcp2.html" />
     <vaidya         label="Vaidya"  href="vaidya.html"/>
     <archives     label="Hadoop Archives" href="hadoop_archives.html"/>
     <gridmix       label="Gridmix"  href="gridmix.html"/>
@@ -61,6 +63,8 @@ See http://forrest.apache.org/docs/linki
     <hdfs_perm        label="Permissions" href="hdfs_permissions_guide.html" />
     <hdfs_quotas      label="Quotas" href="hdfs_quota_admin_guide.html" />
     <hdfs_SLG         label="Synthetic Load Generator"  href="SLG_user_guide.html" />
+    <hdfs_imageviewer	label="Offline Image Viewer"	href="hdfs_imageviewer.html" />
+    <hftp label="HFTP" href="hftp.html"/>
     <webhdfs label="WebHDFS REST API" href="webhdfs.html" />
     <hdfs_libhdfs       label="C API libhdfs" href="libhdfs.html" />
   </docs>
@@ -164,6 +168,7 @@ See http://forrest.apache.org/docs/linki
                 <setTimes href="#setTimes(org.apache.hadoop.fs.Path,%20long,%20long)" />
 
                 <append href="#append(org.apache.hadoop.fs.Path,%20int,%20org.apache.hadoop.util.Progressable)" />
+                <concat href="#concat(org.apache.hadoop.fs.Path,%20org.apache.hadoop.fs.Path[])" />
                 <delete href="#delete(org.apache.hadoop.fs.Path,%20boolean)" />
               </filesystem>
             </fs>

Modified: hadoop/common/branches/branch-1-win/src/docs/src/documentation/content/xdocs/webhdfs.xml
URL: http://svn.apache.org/viewvc/hadoop/common/branches/branch-1-win/src/docs/src/documentation/content/xdocs/webhdfs.xml?rev=1495297&r1=1495296&r2=1495297&view=diff
==============================================================================
--- hadoop/common/branches/branch-1-win/src/docs/src/documentation/content/xdocs/webhdfs.xml (original)
+++ hadoop/common/branches/branch-1-win/src/docs/src/documentation/content/xdocs/webhdfs.xml Fri Jun 21 06:37:27 2013
@@ -104,6 +104,9 @@
     <li><a href="#APPEND"><code>APPEND</code></a>
         (see <a href="ext:api/org/apache/hadoop/fs/filesystem/append">FileSystem.append</a>)
     </li>
+    <li><a href="#CONCAT"><code>CONCAT</code></a>
+        (see <a href="ext:api/org/apache/hadoop/fs/filesystem/concat">FileSystem.concat</a>)
+    </li>
   </ul></li>
   <li>HTTP DELETE
   <ul>
@@ -152,7 +155,7 @@
 <tr><td><code>dfs.web.authentication.kerberos.principal</code></td>
 <td>The HTTP Kerberos principal used by Hadoop-Auth in the HTTP endpoint.
     The HTTP Kerberos principal MUST start with 'HTTP/' per Kerberos
-    HTTP SPENGO specification.
+    HTTP SPNEGO specification.
 </td></tr>
 <tr><td><code>dfs.web.authentication.kerberos.keytab</code></td>
 <td>The Kerberos keytab file with the credentials for the
@@ -311,6 +314,28 @@ Content-Length: 0
 </p>
       </section>
 <!-- ***************************************************************************** -->
+      <section id="CONCAT">
+        <title>Concatenate Files</title>
+<ul>
+  <li>Submit an HTTP POST request.
+    <source>
+        curl -i -X POST "http://&lt;HOST&gt;:&lt;PORT&gt;/webhdfs/v1/&lt;PATH&gt;?op=CONCAT&amp;sources=&lt;PATHS&gt;"
+    </source>
+The client receives a response with zero content length:
+    <source>
+HTTP/1.1 200 OK
+Content-Length: 0
+    </source>
+  </li>
+</ul>
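+<p>
+  For example, a hypothetical request concatenating <code>/user/a/part-1</code> and
+  <code>/user/a/part-2</code> into <code>/user/a/part-0</code> could look like:
+</p>
+    <source>
+        curl -i -X POST "http://namenode.example.com:50070/webhdfs/v1/user/a/part-0?op=CONCAT&amp;sources=/user/a/part-1,/user/a/part-2"
+    </source>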
+<p>
+  See also:
+  <a href="#sources"><code>sources</code></a>,
+  <a href="ext:api/org/apache/hadoop/fs/filesystem/concat">FileSystem.concat</a>
+</p>
+      </section>
+
+<!-- ***************************************************************************** -->
       <section id="OPEN">
         <title>Open and Read a File</title>
 <ul>
@@ -1535,6 +1560,22 @@ var fileStatusProperties =
 </p>
       </section>
 <!-- ***************************************************************************** -->
+      <section id="sources">
+        <title>Sources</title>
+<table>
+  <tr><td>Name</td><td><code>sources</code></td></tr>
+  <tr><td>Description</td><td>A list of source paths.</td></tr>
+  <tr><td>Type</td><td>String</td></tr>
+  <tr><td>Default Value</td><td>&lt;empty&gt;</td></tr>
+  <tr><td>Valid Values</td><td>A list of comma-separated absolute FileSystem paths without scheme and authority.</td></tr>
+  <tr><td>Syntax</td><td>Any string.</td></tr>
+</table>
+<p>
+  See also:
+  <a href="#CONCAT"><code>CONCAT</code></a>,
+</p>
+      </section>
+<!-- ***************************************************************************** -->
       <section id="token">
         <title>Token</title>
 <table>

Modified: hadoop/common/branches/branch-1-win/src/examples/org/apache/hadoop/examples/AggregateWordCount.java
URL: http://svn.apache.org/viewvc/hadoop/common/branches/branch-1-win/src/examples/org/apache/hadoop/examples/AggregateWordCount.java?rev=1495297&r1=1495296&r2=1495297&view=diff
==============================================================================
--- hadoop/common/branches/branch-1-win/src/examples/org/apache/hadoop/examples/AggregateWordCount.java (original)
+++ hadoop/common/branches/branch-1-win/src/examples/org/apache/hadoop/examples/AggregateWordCount.java Fri Jun 21 06:37:27 2013
@@ -69,7 +69,7 @@ public class AggregateWordCount {
   @SuppressWarnings("unchecked")
   public static void main(String[] args) throws IOException {
     JobConf conf = ValueAggregatorJob.createValueAggregatorJob(args
-        , new Class[] {WordCountPlugInClass.class});
+        , new Class[] {WordCountPlugInClass.class}, AggregateWordCount.class);
    
     JobClient.runJob(conf);
   }



Mime
View raw message