hadoop-hdfs-commits mailing list archives

From s..@apache.org
Subject svn commit: r817449 [7/8] - in /hadoop/hdfs/branches/HDFS-265: ./ .eclipse.templates/.launches/ lib/ src/contrib/block_forensics/ src/contrib/block_forensics/client/ src/contrib/block_forensics/ivy/ src/contrib/block_forensics/src/java/org/apache/hadoo...
Date Mon, 21 Sep 2009 22:33:12 GMT
Propchange: hadoop/hdfs/branches/HDFS-265/src/docs/src/documentation/content/xdocs/mapred_tutorial.xml
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: hadoop/hdfs/branches/HDFS-265/src/docs/src/documentation/content/xdocs/native_libraries.xml
URL: http://svn.apache.org/viewvc/hadoop/hdfs/branches/HDFS-265/src/docs/src/documentation/content/xdocs/native_libraries.xml?rev=817449&view=auto
==============================================================================
--- hadoop/hdfs/branches/HDFS-265/src/docs/src/documentation/content/xdocs/native_libraries.xml (added)
+++ hadoop/hdfs/branches/HDFS-265/src/docs/src/documentation/content/xdocs/native_libraries.xml Mon Sep 21 22:33:09 2009
@@ -0,0 +1,212 @@
+<?xml version="1.0"?>
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+-->
+
+<!DOCTYPE document PUBLIC "-//APACHE//DTD Documentation V2.0//EN" "http://forrest.apache.org/dtd/document-v20.dtd">
+
+<document>
+  
+  <header>
+    <title>Native Libraries Guide</title>
+  </header>
+  
+  <body>
+  
+    <section>
+      <title>Purpose</title>
+      
+      <p>Hadoop has native implementations of certain components, both for 
+      performance and because Java implementations are not available. These 
+      components are packaged in a single, dynamically-linked native library. 
+      On *nix platforms it is <em>libhadoop.so</em>. This document describes 
+      how to use the native libraries and how to build them.</p>
+    </section>
+    
+    <section>
+      <title>Components</title>
+      
+      <p>Hadoop currently has the following 
+      <a href="ext:api/org/apache/hadoop/io/compress/compressioncodec">
+      compression codecs</a> as the native components:</p>
+      <ul>
+        <li><a href="ext:zlib">zlib</a></li>
+        <li><a href="ext:gzip">gzip</a></li>
+        <li><a href="ext:bzip">bzip2</a></li>
+      </ul>
+      
+      <p>Of the above, the native hadoop library is required for the 
+      gzip and bzip2 compression codecs to work.</p>
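+
+      <p>As an illustration (the file name here is a placeholder), a codec can 
+      be resolved through the codec factory; the native implementation, when 
+      available, is picked up without any change to client code:</p>
+
+<source>
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.compress.CompressionCodec;
+import org.apache.hadoop.io.compress.CompressionCodecFactory;
+
+public class CodecLookup {
+  public static void main(String[] args) {
+    Configuration conf = new Configuration();
+    // Resolve the codec from the file extension (.gz maps to the gzip codec).
+    CompressionCodec codec =
+        new CompressionCodecFactory(conf).getCodec(new Path("data/input.gz"));
+    System.out.println("Using codec: " + codec.getClass().getName());
+  }
+}
+</source>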
+    </section>
+
+    <section>
+      <title>Usage</title>
+      
+      <p>It is fairly simple to use the native hadoop libraries:</p>
+
+      <ul>
+        <li>
+          Take a look at the 
+          <a href="#Supported+Platforms">supported platforms</a>.
+        </li>
+        <li>
+          Either <a href="ext:releases/download">download</a> the pre-built 
+          32-bit i386-Linux native hadoop libraries (available as part of the 
+          hadoop distribution in the <code>lib/native</code> directory) or 
+          <a href="#Building+Native+Hadoop+Libraries">build</a> them yourself.
+        </li>
+        <li>
+          Make sure you have the <strong>zlib-1.2</strong> (or later),
+          <strong>gzip-1.2</strong> (or later), and/or <strong>bzip2-1.0</strong>
+          (or later) packages installed on your platform, 
+          depending on your needs.
+        </li>
+      </ul>
+      
+      <p>The <code>bin/hadoop</code> script ensures that the native hadoop 
+      library is on the library path via the system property 
+      <em>-Djava.library.path=&lt;path&gt;</em>.</p>
+
+      <p>To verify that everything went all right, check the hadoop log files for:</p>
+      
+      <p>
+        <code>
+          DEBUG util.NativeCodeLoader - Trying to load the custom-built 
+          native-hadoop library... 
+        </code><br/>
+        <code>
+          INFO  util.NativeCodeLoader - Loaded the native-hadoop library
+        </code>
+      </p>
+
+      <p>If something goes wrong, you will instead see:</p>
+      <p>
+        <code>
+          INFO util.NativeCodeLoader - Unable to load native-hadoop library for 
+          your platform... using builtin-java classes where applicable
+        </code>
+      </p>
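+
+      <p>A program can also check at runtime whether the native library was 
+      picked up; a minimal sketch (the class name is illustrative) using the 
+      <code>NativeCodeLoader</code> utility:</p>
+
+<source>
+import org.apache.hadoop.util.NativeCodeLoader;
+
+public class NativeLibCheck {
+  public static void main(String[] args) {
+    // True only if libhadoop.so was found on java.library.path and loaded.
+    System.out.println("native-hadoop loaded: "
+        + NativeCodeLoader.isNativeCodeLoaded());
+  }
+}
+</source>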
+    </section>
+    
+    <section>
+      <title>Supported Platforms</title>
+      
+      <p>The native hadoop library is supported on *nix platforms only.
+      It is known not to work on <a href="ext:cygwin">Cygwin</a> 
+      or <a href="ext:osx">Mac OS X</a>, and has mainly been used on the 
+      GNU/Linux platform.</p>
+
+      <p>It has been tested on the following GNU/Linux distributions:</p>
+      <ul>
+        <li>
+          <a href="http://www.redhat.com/rhel/">RHEL4</a>/<a href="http://fedora.redhat.com/">Fedora</a>
+        </li>
+        <li><a href="http://www.ubuntu.com/">Ubuntu</a></li>
+        <li><a href="http://www.gentoo.org/">Gentoo</a></li>
+      </ul>
+
+      <p>On all of the above platforms, a 32/64-bit native hadoop library will 
+      work with a respective 32/64-bit JVM.</p>
+    </section>
+    
+    <section>
+      <title>Building Native Hadoop Libraries</title>
+      
+      <p>The native hadoop library is written in 
+      <a href="http://en.wikipedia.org/wiki/ANSI_C">ANSI C</a> and built using 
+      the GNU autotools chain (autoconf, autoheader, automake, autoscan, libtool). 
+      This means it should be straightforward to build it on any platform with 
+      a standards-compliant C compiler and the GNU autotools chain. 
+      See the <a href="#Supported+Platforms">supported platforms</a>.</p>
+
+      <p>In particular, you will need the following packages on the target 
+      platform:</p>
+      <ul>
+        <li>
+          C compiler (e.g. <a href="http://gcc.gnu.org/">GNU C Compiler</a>)
+        </li>
+        <li>
+          GNU Autotools Chain: 
+          <a href="http://www.gnu.org/software/autoconf/">autoconf</a>, 
+          <a href="http://www.gnu.org/software/automake/">automake</a>, 
+          <a href="http://www.gnu.org/software/libtool/">libtool</a>
+        </li>
+        <li> 
+          zlib-development package (stable version >= 1.2.0)
+        </li>
+      </ul>
+
+      <p>Once you have the prerequisites, use the standard <code>build.xml</code> 
+      and pass the <code>compile.native</code> flag (set to 
+      <code>true</code>) to build the native hadoop library:</p>
+
+      <p><code>$ ant -Dcompile.native=true &lt;target&gt;</code></p>
+
+      <p>The native hadoop library is not built by default, since not everyone 
+      is interested in building it.</p>
+
+      <p>You should see the newly-built native hadoop library in:</p>
+
+      <p><code>$ build/native/&lt;platform&gt;/lib</code></p>
+
+      <p>where &lt;platform&gt; is the combination of the system properties 
+      <code>${os.name}-${os.arch}-${sun.arch.data.model}</code>; for example, 
+      Linux-i386-32.</p>
+
+      <section>
+        <title>Notes</title>
+        
+        <ul>
+          <li>
+            It is <strong>mandatory</strong> to have the 
+            zlib, gzip, and bzip2
+            development packages on the target platform for building the 
+            native hadoop library; however, for deployment it is sufficient to 
+            install only the packages for the codecs you wish to use.
+          </li>
+          <li>
+            It is necessary to have the correct 32/64-bit zlib libraries, 
+            matching the 32/64-bit JVM on the target platform, for both 
+            building and deploying the native hadoop library.
+          </li>
+        </ul>
+      </section>
+    </section>
+    <section>
+      <title> Loading native libraries through DistributedCache </title>
+      <p>Users can load native shared libraries through the 
+      <a href="mapred_tutorial.html#DistributedCache">DistributedCache</a>, 
+      which <em>distributes</em> and <em>symlinks</em> the library files.</p>
+      
+      <p>Here is an example describing how to distribute a library and
+      load it from a map/reduce task. </p>
+      <ol>
+      <li> First copy the library to HDFS. <br/>
+      <code>bin/hadoop fs -copyFromLocal mylib.so.1 /libraries/mylib.so.1</code>
+      </li>
+      <li> The job launching program should contain the following
+      (see also the consolidated sketch after this list): <br/>
+      <code> DistributedCache.createSymlink(conf); </code> <br/>
+      <code> DistributedCache.addCacheFile(new URI("hdfs://host:port/libraries/mylib.so.1#mylib.so"), conf);
+      </code>
+      </li>
+      <li> The map/reduce task can contain: <br/>
+      <code> System.loadLibrary("mylib.so"); </code>
+      </li>
+      </ol>
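+      <p>Putting the driver-side steps together, a minimal sketch (the class 
+      name and the HDFS host, port and path are illustrative) might look like:</p>
+<source>
+import java.net.URI;
+import org.apache.hadoop.filecache.DistributedCache;
+import org.apache.hadoop.mapred.JobConf;
+
+public class NativeLibJobDriver {
+  public static void main(String[] args) throws Exception {
+    JobConf conf = new JobConf();
+    // Ask the framework to create symlinks in each task's working directory.
+    DistributedCache.createSymlink(conf);
+    // Distribute the library; the fragment after '#' becomes the symlink name.
+    DistributedCache.addCacheFile(
+        new URI("hdfs://host:port/libraries/mylib.so.1#mylib.so"), conf);
+    // ... set the mapper/reducer classes and submit the job as usual ...
+  }
+}
+</source>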
+    </section>
+  </body>
+  
+</document>

Propchange: hadoop/hdfs/branches/HDFS-265/src/docs/src/documentation/content/xdocs/native_libraries.xml
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: hadoop/hdfs/branches/HDFS-265/src/docs/src/documentation/content/xdocs/quickstart.xml
URL: http://svn.apache.org/viewvc/hadoop/hdfs/branches/HDFS-265/src/docs/src/documentation/content/xdocs/quickstart.xml?rev=817449&view=auto
==============================================================================
--- hadoop/hdfs/branches/HDFS-265/src/docs/src/documentation/content/xdocs/quickstart.xml (added)
+++ hadoop/hdfs/branches/HDFS-265/src/docs/src/documentation/content/xdocs/quickstart.xml Mon Sep 21 22:33:09 2009
@@ -0,0 +1,296 @@
+<?xml version="1.0"?>
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+-->
+
+<!DOCTYPE document PUBLIC "-//APACHE//DTD Documentation V2.0//EN" "http://forrest.apache.org/dtd/document-v20.dtd">
+
+<document>
+  
+  <header>
+    <title>Quick Start</title>
+  </header>
+  
+  <body>
+  
+    <section>
+      <title>Purpose</title>
+      
+      <p>The purpose of this document is to help you get a single-node Hadoop 
+      installation up and running very quickly so that you can get a flavour
+      of the Hadoop Distributed File System 
+      (see <a href="hdfs_design.html"> <acronym title="Hadoop Distributed File System">HDFS</acronym> Architecture</a>) and 
+      the Map/Reduce framework; that is, perform simple operations on HDFS and 
+      run example jobs.</p>
+    </section>
+    
+    <section id="PreReqs">
+      <title>Pre-requisites</title>
+      
+      <section>
+        <title>Supported Platforms</title>
+        
+        <ul>
+          <li>
+            GNU/Linux is supported as a development and production platform. 
+            Hadoop has been demonstrated on GNU/Linux clusters with 2000 nodes.
+          </li>
+          <li>
+            Win32 is supported as a <em>development platform</em>. Distributed 
+            operation has not been well tested on Win32, so it is not 
+            supported as a <em>production platform</em>.
+          </li>
+        </ul>        
+      </section>
+      
+      <section>
+        <title>Required Software</title>
+        <p>Required software for Linux and Windows includes:</p>
+        <ol>
+          <li>
+            Java<sup>TM</sup> 1.6.x, preferably from Sun, must be installed.
+          </li>
+          <li>
+            <strong>ssh</strong> must be installed and <strong>sshd</strong> must 
+            be running to use the Hadoop scripts that manage remote Hadoop 
+            daemons.
+          </li>
+        </ol>
+        <p>Additional requirements for Windows include:</p>
+        <ol>
+          <li>
+            <a href="http://www.cygwin.com/">Cygwin</a> - Required for shell 
+            support in addition to the required software above. 
+          </li>
+        </ol>
+      </section>
+
+      <section>
+        <title>Installing Software</title>
+          
+        <p>If your cluster doesn't have the requisite software, you will need 
+        to install it.</p>
+          
+        <p>For example on Ubuntu Linux:</p>
+        <p>
+          <code>$ sudo apt-get install ssh</code><br/>
+          <code>$ sudo apt-get install rsync</code>
+        </p>
+          
+        <p>On Windows, if you did not install the required software when you 
+        installed Cygwin, start the Cygwin installer and select the packages:</p>
+        <ul>
+          <li>openssh - the <em>Net</em> category</li>
+        </ul>
+      </section>
+      
+    </section>
+    
+    <section>
+      <title>Download</title>
+      
+      <p>
+        To get a Hadoop distribution, download a recent 
+        <a href="ext:releases">stable release</a> from one of the Apache Download
+        Mirrors.
+      </p>
+    </section>
+
+    <section>
+      <title>Prepare to Start the Hadoop Cluster</title>
+      <p>
+        Unpack the downloaded Hadoop distribution. In the distribution, edit the
+        file <code>conf/hadoop-env.sh</code> to define at least 
+        <code>JAVA_HOME</code> to be the root of your Java installation.
+      </p>
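+
+      <p>
+        For example (the exact path depends on your Java installation), 
+        <code>conf/hadoop-env.sh</code> might contain a line such as:<br/>
+        <code>export JAVA_HOME=/usr/lib/jvm/java-6-sun</code>
+      </p>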
+
+	  <p>
+	    Try the following command:<br/>
+        <code>$ bin/hadoop</code><br/>
+        This will display the usage documentation for the <strong>hadoop</strong> 
+        script.
+      </p>
+      
+      <p>Now you are ready to start your Hadoop cluster in one of the three supported
+      modes:
+      </p>
+      <ul>
+        <li>Local (Standalone) Mode</li>
+        <li>Pseudo-Distributed Mode</li>
+        <li>Fully-Distributed Mode</li>
+      </ul>
+    </section>
+    
+    <section id="Local">
+      <title>Standalone Operation</title>
+      
+      <p>By default, Hadoop is configured to run in a non-distributed 
+      mode, as a single Java process. This is useful for debugging.</p>
+      
+      <p>
+        The following example copies the unpacked <code>conf</code> directory to 
+        use as input and then finds and displays every match of the given regular 
+        expression. Output is written to the given <code>output</code> directory.
+        <br/>
+        <code>$ mkdir input</code><br/>
+        <code>$ cp conf/*.xml input</code><br/>
+        <code>
+          $ bin/hadoop jar hadoop-*-examples.jar grep input output 'dfs[a-z.]+'
+        </code><br/>
+        <code>$ cat output/*</code>
+      </p>
+    </section>
+    
+    <section id="PseudoDistributed">
+      <title>Pseudo-Distributed Operation</title>
+
+	  <p>Hadoop can also be run on a single-node in a pseudo-distributed mode 
+	  where each Hadoop daemon runs in a separate Java process.</p>
+	  
+      <section>
+        <title>Configuration</title>
+        <p>Use the following:
+        <br/>
+        <code>conf/core-site.xml</code>:</p>
+        <table>
+        <tr><td>&lt;configuration&gt;</td></tr>
+
+          <tr><td>&nbsp;&nbsp;&lt;property&gt;</td></tr>
+            <tr><td>&nbsp;&nbsp;&nbsp;&nbsp;&lt;name&gt;fs.default.name&lt;/name&gt;</td></tr>
+            <tr><td>&nbsp;&nbsp;&nbsp;&nbsp;&lt;value&gt;hdfs://localhost:9000&lt;/value&gt;</td></tr>
+          <tr><td>&nbsp;&nbsp;&lt;/property&gt;</td></tr>
+
+        <tr><td>&lt;/configuration&gt;</td></tr>
+        </table>
+      
+        <p><br/><code>conf/hdfs-site.xml</code>:</p>
+        <table>
+        <tr><td>&lt;configuration&gt;</td></tr>
+
+          <tr><td>&nbsp;&nbsp;&lt;property&gt;</td></tr>
+            <tr><td>&nbsp;&nbsp;&nbsp;&nbsp;&lt;name&gt;dfs.replication&lt;/name&gt;</td></tr>
+            <tr><td>&nbsp;&nbsp;&nbsp;&nbsp;&lt;value&gt;1&lt;/value&gt;</td></tr>
+          <tr><td>&nbsp;&nbsp;&lt;/property&gt;</td></tr>
+
+        <tr><td>&lt;/configuration&gt;</td></tr>
+        </table>
+      
+        <p><br/><code>conf/mapred-site.xml</code>:</p>
+        <table>
+        <tr><td>&lt;configuration&gt;</td></tr>
+
+          <tr><td>&nbsp;&nbsp;&lt;property&gt;</td></tr>
+            <tr><td>&nbsp;&nbsp;&nbsp;&nbsp;&lt;name&gt;mapred.job.tracker&lt;/name&gt;</td></tr>
+            <tr><td>&nbsp;&nbsp;&nbsp;&nbsp;&lt;value&gt;localhost:9001&lt;/value&gt;</td></tr>
+          <tr><td>&nbsp;&nbsp;&lt;/property&gt;</td></tr>
+
+        <tr><td>&lt;/configuration&gt;</td></tr>
+        </table>
+      </section>
+
+      <section>
+        <title>Setup passphraseless <em>ssh</em></title>
+        
+        <p>
+          Now check that you can ssh to the localhost without a passphrase:<br/>
+          <code>$ ssh localhost</code>
+        </p>
+        
+        <p>
+          If you cannot ssh to localhost without a passphrase, execute the 
+          following commands:<br/>
+   		  <code>$ ssh-keygen -t dsa -P '' -f ~/.ssh/id_dsa</code><br/>
+		  <code>$ cat ~/.ssh/id_dsa.pub >> ~/.ssh/authorized_keys</code>
+		</p>
+      </section>
+    
+      <section>
+        <title>Execution</title>
+        
+        <p>
+          Format a new distributed filesystem:<br/>
+          <code>$ bin/hadoop namenode -format</code>
+        </p>
+
+		<p>
+		  Start the hadoop daemons:<br/>
+          <code>$ bin/start-all.sh</code>
+        </p>
+
+        <p>The hadoop daemon log output is written to the 
+        <code>${HADOOP_LOG_DIR}</code> directory (defaults to 
+        <code>${HADOOP_HOME}/logs</code>).</p>
+
+        <p>Browse the web interface for the NameNode and the JobTracker; by
+        default they are available at:</p>
+        <ul>
+          <li>
+            <code>NameNode</code> - 
+            <a href="http://localhost:50070/">http://localhost:50070/</a>
+          </li>
+          <li>
+            <code>JobTracker</code> - 
+            <a href="http://localhost:50030/">http://localhost:50030/</a>
+          </li>
+        </ul>
+        
+        <p>
+          Copy the input files into the distributed filesystem:<br/>
+		  <code>$ bin/hadoop fs -put conf input</code>
+		</p>
+		
+        <p>
+          Run some of the examples provided:<br/>
+          <code>
+            $ bin/hadoop jar hadoop-*-examples.jar grep input output 'dfs[a-z.]+'
+          </code>
+        </p>
+        
+        <p>Examine the output files:</p>
+        <p>
+          Copy the output files from the distributed filesystem to the local 
+          filesystem and examine them:<br/>
+          <code>$ bin/hadoop fs -get output output</code><br/>
+          <code>$ cat output/*</code>
+        </p>
+        <p> or </p>
+        <p>
+          View the output files on the distributed filesystem:<br/>
+          <code>$ bin/hadoop fs -cat output/*</code>
+        </p>
+
+		<p>
+		  When you're done, stop the daemons with:<br/>
+		  <code>$ bin/stop-all.sh</code>
+		</p>
+      </section>
+    </section>
+    
+    <section id="FullyDistributed">
+      <title>Fully-Distributed Operation</title>
+      
+	  <p>For information on setting up fully-distributed, non-trivial clusters
+	  see <a href="cluster_setup.html">Hadoop Cluster Setup</a>.</p>  
+    </section>
+    
+    <p>
+      <em>Java and JNI are trademarks or registered trademarks of 
+      Sun Microsystems, Inc. in the United States and other countries.</em>
+    </p>
+    
+  </body>
+  
+</document>

Propchange: hadoop/hdfs/branches/HDFS-265/src/docs/src/documentation/content/xdocs/quickstart.xml
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: hadoop/hdfs/branches/HDFS-265/src/docs/src/documentation/content/xdocs/service_level_auth.xml
URL: http://svn.apache.org/viewvc/hadoop/hdfs/branches/HDFS-265/src/docs/src/documentation/content/xdocs/service_level_auth.xml?rev=817449&view=auto
==============================================================================
--- hadoop/hdfs/branches/HDFS-265/src/docs/src/documentation/content/xdocs/service_level_auth.xml (added)
+++ hadoop/hdfs/branches/HDFS-265/src/docs/src/documentation/content/xdocs/service_level_auth.xml Mon Sep 21 22:33:09 2009
@@ -0,0 +1,234 @@
+<?xml version="1.0"?>
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+-->
+
+<!DOCTYPE document PUBLIC "-//APACHE//DTD Documentation V2.0//EN" "http://forrest.apache.org/dtd/document-v20.dtd">
+
+<document>
+  
+  <header>
+    <title>Service Level Authorization Guide</title>
+  </header>
+  
+  <body>
+  
+    <section>
+      <title>Purpose</title>
+      
+      <p>This document describes how to configure and manage <em>Service Level
+      Authorization</em> for Hadoop.</p>
+    </section>
+    
+    <section>
+      <title>Pre-requisites</title>
+      
+      <p>Ensure that Hadoop is installed, configured and set up correctly. For
+      more details, see:</p> 
+      <ul>
+        <li>
+          <a href="quickstart.html">Hadoop Quick Start</a> for first-time users.
+        </li>
+        <li>
+          <a href="cluster_setup.html">Hadoop Cluster Setup</a> for large, 
+          distributed clusters.
+        </li>
+      </ul>
+    </section>
+    
+    <section>
+      <title>Overview</title>
+      
+      <p>Service Level Authorization is the initial authorization mechanism that
+      ensures clients connecting to a particular Hadoop <em>service</em> have the
+      necessary, pre-configured permissions and are authorized to access the given
+      service. For example, a Map/Reduce cluster can use this mechanism to allow a
+      configured list of users/groups to submit jobs.</p>
+      
+      <p>The <code>${HADOOP_CONF_DIR}/hadoop-policy.xml</code> configuration file 
+      is used to define the access control lists for various Hadoop services.</p>
+      
+      <p>Service Level Authorization is performed well before other access 
+      control checks such as file-permission checks, access control on job queues,
+      etc.</p>
+    </section>
+    
+    <section>
+      <title>Configuration</title>
+      
+      <p>This section describes how to configure service-level authorization
+      via the configuration file <code>${HADOOP_CONF_DIR}/hadoop-policy.xml</code>.
+      </p>
+      
+      <section>
+        <title>Enable Service Level Authorization</title>
+        
+        <p>By default, service-level authorization is disabled for Hadoop. To
+        enable it set the configuration property 
+        <code>hadoop.security.authorization</code> to <strong>true</strong>
+        in <code>${HADOOP_CONF_DIR}/core-site.xml</code>.</p>
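+        
+        <p>For example, in <code>${HADOOP_CONF_DIR}/core-site.xml</code>:</p>
+        <table>
+          <tr><td>&nbsp;&nbsp;&lt;property&gt;</td></tr>
+            <tr><td>&nbsp;&nbsp;&nbsp;&nbsp;&lt;name&gt;hadoop.security.authorization&lt;/name&gt;</td></tr>
+            <tr><td>&nbsp;&nbsp;&nbsp;&nbsp;&lt;value&gt;true&lt;/value&gt;</td></tr>
+          <tr><td>&nbsp;&nbsp;&lt;/property&gt;</td></tr>
+        </table>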
+      </section>
+
+      <section>
+        <title>Hadoop Services and Configuration Properties</title>
+        
+        <p>This section lists the various Hadoop services and their configuration
+        knobs:</p>
+        
+        <table>
+          <tr>
+            <th>Property</th>
+            <th>Service</th>
+          </tr>
+          <tr>
+            <td><code>security.client.protocol.acl</code></td>
+            <td>ACL for ClientProtocol, which is used by user code via the 
+            DistributedFileSystem.</td>
+          </tr>
+          <tr>
+            <td><code>security.client.datanode.protocol.acl</code></td>
+            <td>ACL for ClientDatanodeProtocol, the client-to-datanode protocol
+            for block recovery.</td>
+          </tr>
+          <tr>
+            <td><code>security.datanode.protocol.acl</code></td>
+            <td>ACL for DatanodeProtocol, which is used by datanodes to 
+            communicate with the namenode.</td>
+          </tr>
+          <tr>
+            <td><code>security.inter.datanode.protocol.acl</code></td>
+            <td>ACL for InterDatanodeProtocol, the inter-datanode protocol
+            for updating generation timestamp.</td>
+          </tr>
+          <tr>
+            <td><code>security.namenode.protocol.acl</code></td>
+            <td>ACL for NamenodeProtocol, the protocol used by the secondary
+            namenode to communicate with the namenode.</td>
+          </tr>
+          <tr>
+            <td><code>security.inter.tracker.protocol.acl</code></td>
+            <td>ACL for InterTrackerProtocol, used by the tasktrackers to 
+            communicate with the jobtracker.</td>
+          </tr>
+          <tr>
+            <td><code>security.job.submission.protocol.acl</code></td>
+            <td>ACL for JobSubmissionProtocol, used by job clients to 
+            communicate with the jobtracker for job submission, querying job status 
+            etc.</td>
+          </tr>
+          <tr>
+            <td><code>security.task.umbilical.protocol.acl</code></td>
+            <td>ACL for TaskUmbilicalProtocol, used by the map and reduce 
+            tasks to communicate with the parent tasktracker.</td>
+          </tr>
+          <tr>
+            <td><code>security.refresh.policy.protocol.acl</code></td>
+            <td>ACL for RefreshAuthorizationPolicyProtocol, used by the 
+            dfsadmin and mradmin commands to refresh the security policy in-effect.
+            </td>
+          </tr>
+        </table>
+      </section>
+      
+      <section>
+        <title>Access Control Lists</title>
+        
+        <p><code>${HADOOP_CONF_DIR}/hadoop-policy.xml</code> defines an access 
+        control list for each Hadoop service. Every access control list has a 
+        simple format: a comma-separated list of user names, followed by a 
+        space, followed by a comma-separated list of group names.</p>
+        
+        <p>Example: <code>user1,user2 group1,group2</code>.</p> 
+        
+        <p>If only a list of groups is to be provided, start the line with a 
+        blank. Equivalently, a comma-separated list of users followed by a 
+        space or by nothing at all implies only the given set of users.</p>
+        
+        <p>A special value of <strong>*</strong> implies that all users are
+        allowed to access the service.</p>
+      </section>
+      
+      <section>
+        <title>Refreshing Service Level Authorization Configuration</title>
+        
+        <p>The service-level authorization configuration for the NameNode and 
+        JobTracker can be changed without restarting either of the Hadoop master
+        daemons. The cluster administrator can change 
+        <code>${HADOOP_CONF_DIR}/hadoop-policy.xml</code> on the master nodes and 
+        instruct the NameNode and JobTracker to reload their respective 
+        configurations via the <em>-refreshServiceAcl</em> switch to the 
+        <em>dfsadmin</em> and <em>mradmin</em> commands respectively.</p>
+        
+        <p>Refresh the service-level authorization configuration for the
+        NameNode:</p>
+        <p>
+          <code>$ bin/hadoop dfsadmin -refreshServiceAcl</code>
+        </p>
+
+        <p>Refresh the service-level authorization configuration for the 
+        JobTracker:</p>
+        <p>  
+          <code>$ bin/hadoop mradmin -refreshServiceAcl</code>
+        </p>
+        
+        <p>Of course, one can use the 
+        <code>security.refresh.policy.protocol.acl</code> property in 
+        <code>${HADOOP_CONF_DIR}/hadoop-policy.xml</code> to restrict the 
+        ability to refresh the service-level authorization configuration to 
+        certain users/groups.</p>
+         
+      </section>
+      
+      <section>
+        <title>Examples</title>
+        
+        <p>Allow only users <code>alice</code>, <code>bob</code> and users in the 
+        <code>mapreduce</code> group to submit jobs to the Map/Reduce cluster:</p>
+        
+        <table>
+          <tr><td>&nbsp;&nbsp;&lt;property&gt;</td></tr>
+            <tr><td>&nbsp;&nbsp;&nbsp;&nbsp;&lt;name&gt;security.job.submission.protocol.acl&lt;/name&gt;</td></tr>
+            <tr><td>&nbsp;&nbsp;&nbsp;&nbsp;&lt;value&gt;alice,bob mapreduce&lt;/value&gt;</td></tr>
+          <tr><td>&nbsp;&nbsp;&lt;/property&gt;</td></tr>
+        </table>
+        
+        <p></p><p>Allow only DataNodes running as the users who belong to the 
+        group <code>datanodes</code> to communicate with the NameNode:</p> 
+        
+        <table>
+          <tr><td>&nbsp;&nbsp;&lt;property&gt;</td></tr>
+            <tr><td>&nbsp;&nbsp;&nbsp;&nbsp;&lt;name&gt;security.datanode.protocol.acl&lt;/name&gt;</td></tr>
+            <tr><td>&nbsp;&nbsp;&nbsp;&nbsp;&lt;value&gt; datanodes&lt;/value&gt;</td></tr>
+          <tr><td>&nbsp;&nbsp;&lt;/property&gt;</td></tr>
+        </table>
+        
+        <p></p><p>Allow any user to talk to the HDFS cluster as a DFSClient:</p>
+        
+        <table>
+          <tr><td>&nbsp;&nbsp;&lt;property&gt;</td></tr>
+            <tr><td>&nbsp;&nbsp;&nbsp;&nbsp;&lt;name&gt;security.client.protocol.acl&lt;/name&gt;</td></tr>
+            <tr><td>&nbsp;&nbsp;&nbsp;&nbsp;&lt;value&gt;*&lt;/value&gt;</td></tr>
+          <tr><td>&nbsp;&nbsp;&lt;/property&gt;</td></tr>
+        </table>
+        
+      </section>
+    </section>
+    
+  </body>
+  
+</document>

Propchange: hadoop/hdfs/branches/HDFS-265/src/docs/src/documentation/content/xdocs/service_level_auth.xml
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Modified: hadoop/hdfs/branches/HDFS-265/src/docs/src/documentation/content/xdocs/site.xml
URL: http://svn.apache.org/viewvc/hadoop/hdfs/branches/HDFS-265/src/docs/src/documentation/content/xdocs/site.xml?rev=817449&r1=817448&r2=817449&view=diff
==============================================================================
--- hadoop/hdfs/branches/HDFS-265/src/docs/src/documentation/content/xdocs/site.xml (original)
+++ hadoop/hdfs/branches/HDFS-265/src/docs/src/documentation/content/xdocs/site.xml Mon Sep 21 22:33:09 2009
@@ -33,20 +33,44 @@
 <site label="Hadoop" href="" xmlns="http://apache.org/forrest/linkmap/1.0">
   
    <docs label="Getting Started"> 
-     <hdfsproxy 			label="HDFS Proxy" 					href="hdfsproxy.html"/>
-     <hdfs_user      				label="User Guide"    							href="hdfs_user_guide.html" />
-     <hdfs_arch     				label="Architecture"  								href="hdfs_design.html" />	
+		<overview   				label="Overview" 					href="index.html" />
+		<quickstart 				label="Quick Start"        		href="quickstart.html" />
+		<setup     					label="Cluster Setup"      		href="cluster_setup.html" />
+		<mapred    				label="Map/Reduce Tutorial" 	href="mapred_tutorial.html" />
+  </docs>	
+		
+ <docs label="Programming Guides">
+		<commands 				label="Commands"     					href="commands_manual.html" />
+		<distcp    					label="DistCp"       						href="distcp.html" />
+		<native_lib    				label="Native Libraries" 					href="native_libraries.html" />
+		<streaming 				label="Streaming"          				href="streaming.html" />
+		<fair_scheduler 			label="Fair Scheduler" 					href="fair_scheduler.html"/>
+        <hdfsproxy 			label="HDFS Proxy" 					href="hdfsproxy.html"/>
+		<cap_scheduler 		label="Capacity Scheduler" 			href="capacity_scheduler.html"/>
+		<SLA					 	label="Service Level Authorization" 	href="service_level_auth.html"/>
+		<vaidya    					label="Vaidya" 								href="vaidya.html"/>
+		<archives  				label="Archives"     						href="hadoop_archives.html"/>
    </docs>
-   <docs label="Guides">
-      <hdfs_perm      				label="Permissions Guide"    					href="hdfs_permissions_guide.html" />
-      <hdfs_quotas     			label="Quotas Guide" 							href="hdfs_quota_admin_guide.html" />
-      <hdfs_SLG        			label="Synthetic Load Generator Guide"  href="SLG_user_guide.html" />
-      <hdfs_imageviewer						label="Offline Image Viewer Guide"	href="hdfs_imageviewer.html" />
-      <hdfs_libhdfs   				label="C API libhdfs"         						href="libhdfs.html" /> 
-    </docs>
-    <docs label="Testing">
-      <faultinject_framework              label="Fault Injection"                                                     href="faultinject_framework.html" />
-    </docs>
+   
+   <docs label="HDFS">
+		<hdfs_user      				label="User Guide"    							href="hdfs_user_guide.html" />
+		<hdfs_arch     				label="Architecture"  								href="hdfs_design.html" />	
+		<hdfs_fs       	 				label="File System Shell Guide"     		href="hdfs_shell.html" />
+		<hdfs_perm      				label="Permissions Guide"    					href="hdfs_permissions_guide.html" />
+		<hdfs_quotas     			label="Quotas Guide" 							href="hdfs_quota_admin_guide.html" />
+		<hdfs_SLG        			label="Synthetic Load Generator Guide"  href="SLG_user_guide.html" />
+		<hdfs_imageviewer						label="Offline Image Viewer Guide"	href="hdfs_imageviewer.html" />
+		<hdfs_libhdfs   				label="C API libhdfs"         						href="libhdfs.html" /> 
+                <docs label="Testing">
+                    <faultinject_framework              label="Fault Injection"                                                     href="faultinject_framework.html" />
+                </docs>
+   </docs> 
+   
+   <docs label="HOD">
+		<hod_user 	label="User Guide" 	href="hod_user_guide.html"/>
+		<hod_admin 	label="Admin Guide" 	href="hod_admin_guide.html"/>
+		<hod_config 	label="Config Guide" 	href="hod_config_guide.html"/> 
+   </docs> 
    
    <docs label="Miscellaneous"> 
 		<api       	label="API Docs"           href="ext:api/index" />
@@ -58,20 +82,19 @@
    </docs> 
    
   <external-refs>
-    <site      href="http://hadoop.apache.org/hdfs/"/>
-    <lists     href="http://hadoop.apache.org/hdfs/mailing_lists.html"/>
-    <archive   href="http://mail-archives.apache.org/mod_mbox/hadoop-hdfs-commits/"/>
-    <releases  href="http://hadoop.apache.org/hdfs/releases.html">
-              <download href="#Download" />
+    <site      href="http://hadoop.apache.org/core/"/>
+    <lists     href="http://hadoop.apache.org/core/mailing_lists.html"/>
+    <archive   href="http://mail-archives.apache.org/mod_mbox/hadoop-core-commits/"/>
+    <releases  href="http://hadoop.apache.org/core/releases.html">
+      <download href="#Download" />
     </releases>
-    <jira      href="http://hadoop.apache.org/hdfs/issue_tracking.html"/>
-    <wiki      href="http://wiki.apache.org/hadoop/HDFS" />
-    <faq       href="http://wiki.apache.org/hadoop/HDFS/FAQ" />
-    
-    <common-default href="http://hadoop.apache.org/common/docs/current/common-default.html" />
-    <hdfs-default href="http://hadoop.apache.org/hdfs/docs/current/hdfs-default.html" />
-    <mapred-default href="http://hadoop.apache.org/mapreduce/docs/current/mapred-default.html" />
-    
+    <jira      href="http://hadoop.apache.org/core/issue_tracking.html"/>
+    <wiki      href="http://wiki.apache.org/hadoop/" />
+    <faq       href="http://wiki.apache.org/hadoop/FAQ" />
+    <hadoop-default href="http://hadoop.apache.org/core/docs/current/hadoop-default.html" />
+    <core-default href="http://hadoop.apache.org/core/docs/current/core-default.html" />
+    <hdfs-default href="http://hadoop.apache.org/core/docs/current/hdfs-default.html" />
+    <mapred-default href="http://hadoop.apache.org/core/docs/current/mapred-default.html" />
     <zlib      href="http://www.zlib.net/" />
     <gzip      href="http://www.gzip.org/" />
     <bzip      href="http://www.bzip.org/" />

Added: hadoop/hdfs/branches/HDFS-265/src/docs/src/documentation/content/xdocs/streaming.xml
URL: http://svn.apache.org/viewvc/hadoop/hdfs/branches/HDFS-265/src/docs/src/documentation/content/xdocs/streaming.xml?rev=817449&view=auto
==============================================================================
--- hadoop/hdfs/branches/HDFS-265/src/docs/src/documentation/content/xdocs/streaming.xml (added)
+++ hadoop/hdfs/branches/HDFS-265/src/docs/src/documentation/content/xdocs/streaming.xml Mon Sep 21 22:33:09 2009
@@ -0,0 +1,670 @@
+<?xml version="1.0"?>
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+
+<!DOCTYPE document PUBLIC "-//APACHE//DTD Documentation V2.0//EN"
+          "http://forrest.apache.org/dtd/document-v20.dtd">
+
+
+<document>
+<header>
+<title>Hadoop Streaming</title>
+<meta name="http-equiv">Content-Type</meta>
+<meta name="content">text/html;</meta>
+<meta name="charset">utf-8</meta>
+</header>
+<body>
+<section>
+<title>Hadoop Streaming</title>
+
+<p>
+Hadoop streaming is a utility that comes with the Hadoop distribution. The utility allows you to create and run Map/Reduce jobs with any executable or script as the mapper and/or the reducer. For example:
+</p>
+<source>
+$HADOOP_HOME/bin/hadoop  jar $HADOOP_HOME/hadoop-streaming.jar \
+    -input myInputDirs \
+    -output myOutputDir \
+    -mapper /bin/cat \
+    -reducer /bin/wc
+</source>
+</section>
+
+<section>
+<title>How Does Streaming Work </title>
+<p>
+In the above example, both the mapper and the reducer are executables that read the input from stdin (line by line) and emit the output to stdout. The utility will create a Map/Reduce job, submit the job to an appropriate cluster, and monitor the progress of the job until it completes.
+</p><p>
+  When an executable is specified for mappers, each mapper task will launch the executable as a separate process when the mapper is initialized. As the mapper task runs, it converts its inputs into lines and feeds the lines to the stdin of the process. In the meantime, the mapper collects the line-oriented outputs from the stdout of the process and converts each line into a key/value pair, which is collected as the output of the mapper. By default, the 
+  <em>prefix of a line up to the first tab character</em> is the <strong>key</strong> and the rest of the line (excluding the tab character) is the <strong>value</strong>. 
+  If there is no tab character in the line, then the entire line is considered the key and the value is null. However, this can be customized, as discussed later.
+</p>
+<p>
+When an executable is specified for reducers, each reducer task will launch the executable as a separate process when the reducer is initialized. As the reducer task runs, it converts its input key/value pairs into lines and feeds the lines to the stdin of the process. In the meantime, the reducer collects the line-oriented outputs from the stdout of the process, converts each line into a key/value pair, which is collected as the output of the reducer. By default, the prefix of a line up to the first tab character is the key and the rest of the line (excluding the tab character) is the value. However, this can be customized, as discussed later.
+</p><p>
+This is the basis for the communication protocol between the Map/Reduce framework and the streaming mapper/reducer.
+</p><p>
+You can supply a Java class as the mapper and/or the reducer. The above example is equivalent to:
+</p>
+<source>
+$HADOOP_HOME/bin/hadoop  jar $HADOOP_HOME/hadoop-streaming.jar \
+    -input myInputDirs \
+    -output myOutputDir \
+    -mapper org.apache.hadoop.mapred.lib.IdentityMapper \
+    -reducer /bin/wc
+</source>
+<p>You can set <code>stream.non.zero.exit.is.failure</code> to 
+<code>true</code> or <code>false</code> to make a streaming task that exits 
+with a non-zero status be treated as <code>Failure</code> 
+or <code>Success</code> respectively. By default, streaming tasks exiting 
+with non-zero status are considered to be failed tasks.</p>
+
+</section>
+
+<section>
+<title>Package Files With Job Submissions</title>
+<p>
+You can specify any executable as the mapper and/or the reducer. The executables do not need to pre-exist on the machines in the cluster; however, if they don't, you will need to use the "-file" option to tell the framework to pack your executable files as part of the job submission. For example:
+</p>
+<source>
+$HADOOP_HOME/bin/hadoop  jar $HADOOP_HOME/hadoop-streaming.jar \
+    -input myInputDirs \
+    -output myOutputDir \
+    -mapper myPythonScript.py \
+    -reducer /bin/wc \
+    -file myPythonScript.py 
+</source>
+<p>
+The above example specifies a user defined Python executable as the mapper. The option "-file myPythonScript.py" causes the python executable to be shipped to the cluster machines as part of the job submission.
+</p>
+<p>
+In addition to executable files, you can also package other auxiliary files (such as dictionaries, configuration files, etc) that may be used by the mapper and/or the reducer. For example:
+</p>
+<source>
+$HADOOP_HOME/bin/hadoop  jar $HADOOP_HOME/hadoop-streaming.jar \
+    -input myInputDirs \
+    -output myOutputDir \
+    -mapper myPythonScript.py \
+    -reducer /bin/wc \
+    -file myPythonScript.py \
+    -file myDictionary.txt
+</source>
+</section>
+
+<section>
+<title>Streaming Options and Usage </title>
+
+<section>
+<title>Mapper-Only Jobs </title>
+<p>
+Often, you may want to process input data using a map function only. To do this, simply set mapred.reduce.tasks to zero. The Map/Reduce framework will not create any reducer tasks. Rather, the outputs of the mapper tasks will be the final output of the job.
+</p><p>
+To be backward compatible, Hadoop Streaming also supports the "-reducer NONE" option, which is equivalent to "-D mapred.reduce.tasks=0".
+</p>
+</section>
+
+<section>
+<title>Specifying Other Plugins for Jobs </title>
+<p>
+Just as with a normal Map/Reduce job, you can specify other plugins for a streaming job:
+</p>
+<source>
+   -inputformat JavaClassName
+   -outputformat JavaClassName
+   -partitioner JavaClassName
+   -combiner streamingCommand or JavaClassName
+</source>
+<p>
+The class you supply for the input format should return key/value pairs of Text class. If you do not specify an input format class, the TextInputFormat is used as the default. Since the TextInputFormat returns keys of LongWritable class, which are actually not part of the input data, the keys will be discarded; only the values will be piped to the streaming mapper.
+</p><p>
+The class you supply for the output format is expected to take key/value pairs of Text class. If you do not specify an output format class, the TextOutputFormat is used as the default.
+</p>
+</section>
+
+<section>
+<title>Large files and archives in Hadoop Streaming </title>
+
+<p>
+The -files and -archives options allow you to make files and archives available to the tasks. The argument is a URI to the file or archive that you have already uploaded to HDFS. These files and archives are cached across jobs. You can retrieve the host and fs_port values from the fs.default.name config variable.
+</p>
+<p>
+Here are examples of the -files option:
+</p> 
+<source>
+-files hdfs://host:fs_port/user/testfile.txt#testlink
+</source>
+<p>
+In the above example, the part of the url after # is used as the symlink name that is created in the current working directory of tasks. So the tasks will have a symlink called testlink in the cwd that points to a local copy of testfile.txt. Multiple entries can be specified as: 
+</p>
+<source>
+-files hdfs://host:fs_port/user/testfile1.txt#testlink1 -files hdfs://host:fs_port/user/testfile2.txt#testlink2
+</source>
+<p>
+The -archives option allows you to copy jars locally to the cwd of tasks and automatically unjar the files. For example:
+</p>
+<source>
+-archives hdfs://host:fs_port/user/testfile.jar#testlink3
+</source>
+<p>
+In the example above, a symlink testlink3 is created in the current working directory of tasks. This symlink points to the directory that stores the unjarred contents of the uploaded jar file.
+</p>
+<p>
+Here's another example of the -archives option. Here, the input.txt file has two lines specifying the names of the two files: testlink/cache.txt and testlink/cache2.txt. "testlink" is a symlink to the archived directory, which has the files "cache.txt" and "cache2.txt".
+</p>
+<source>
+$HADOOP_HOME/bin/hadoop  jar $HADOOP_HOME/hadoop-streaming.jar \
+                  -input "/user/me/samples/cachefile/input.txt"  \
+                  -mapper "xargs cat"  \
+                  -reducer "cat"  \
+                  -output "/user/me/samples/cachefile/out" \  
+                  -archives 'hdfs://hadoop-nn1.example.com/user/me/samples/cachefile/cachedir.jar#testlink' \  
+                  -D mapred.map.tasks=1 \
+                  -D mapred.reduce.tasks=1 \ 
+                  -D mapred.job.name="Experiment"
+
+$ ls test_jar/
+cache.txt  cache2.txt
+
+$ jar cvf cachedir.jar -C test_jar/ .
+added manifest
+adding: cache.txt(in = 30) (out= 29)(deflated 3%)
+adding: cache2.txt(in = 37) (out= 35)(deflated 5%)
+
+$ hadoop dfs -put cachedir.jar samples/cachefile
+
+$ hadoop dfs -cat /user/me/samples/cachefile/input.txt
+testlink/cache.txt
+testlink/cache2.txt
+
+$ cat test_jar/cache.txt 
+This is just the cache string
+
+$ cat test_jar/cache2.txt 
+This is just the second cache string
+
+$ hadoop dfs -ls /user/me/samples/cachefile/out      
+Found 1 items
+/user/me/samples/cachefile/out/part-00000  &lt;r 3&gt;   69
+
+$ hadoop dfs -cat /user/me/samples/cachefile/out/part-00000
+This is just the cache string   
+This is just the second cache string
+
+</source>
+</section>
+
+<section>
+<title>Specifying Additional Configuration Variables for Jobs </title>
+<p>
+You can specify additional configuration variables by using "-D  &lt;n&gt;=&lt;v&gt;". For example: 
+</p>
+<source>
+$HADOOP_HOME/bin/hadoop  jar $HADOOP_HOME/hadoop-streaming.jar \
+    -input myInputDirs \
+    -output myOutputDir \
+    -mapper org.apache.hadoop.mapred.lib.IdentityMapper\
+    -reducer /bin/wc \
+    -D mapred.reduce.tasks=2
+</source>
+<p>
+The -D mapred.reduce.tasks=2 in the above example specifies to use two reducers for the job.
+</p>
+<p>
+For more details on the jobconf parameters see:
+<a href="ext:mapred-default">mapred-default.html</a></p>
+</section>
+
+<section>
+<title>Other Supported Options </title>
+<p>
+Other options you may specify for a streaming job are described here:
+</p>
+<table>
+<tr><th>Parameter</th><th>Optional/Required </th><th>Description </th></tr>
+
+<tr><td> -cmdenv   name=value </td><td> Optional </td><td> Pass env var to streaming commands </td></tr>
+
+<tr><td> -inputreader JavaClassName </td><td> Optional </td><td> For backwards-compatibility: specifies a record reader class (instead of an input format class) </td></tr>
+<tr><td> -verbose </td><td> Optional </td><td> Verbose output </td></tr>
+<tr><td> -lazyOutput </td><td> Optional </td><td> Create output lazily. For example, if the output format is based on FileOutputFormat, the output file is created only on the first call to output.collect (or Context.write)</td></tr>
+</table>
+<p>
+Streaming supports the Hadoop generic command line options. 
+
+The supported parameters are listed below. 
+The general command line syntax is:
+<br/>    bin/hadoop command [genericOptions] [commandOptions]
+</p>
+
+<table>
+<tr><th>Parameter</th><th>Optional/Required </th><th>Description </th></tr>
+
+<tr><td> -conf  configuration_file </td><td> Optional </td><td> specify an application configuration file </td></tr>
+<tr><td> -D  property=value </td><td> Optional </td><td> use value for given property </td></tr>
+<tr><td> -fs host:port or local </td><td> Optional </td><td> specify a namenode </td></tr>
+<tr><td> -jt host:port or local </td><td> Optional </td><td> specify a job tracker </td></tr>
+<tr><td> -files </td><td> Optional </td><td> specify comma separated files to be copied to the map reduce cluster </td></tr>
+<tr><td> -archives </td><td> Optional </td><td> specify comma separated archives to be unarchived on the compute machines </td></tr>
+</table>
+
+<p>
+To change the local temp directory use:
+</p>
+<source>
+  -D dfs.data.dir=/tmp
+</source>
+<p>
+To specify additional local temp directories use:
+</p>
+<source>
+   -D mapred.local.dir=/tmp/local
+   -D mapred.system.dir=/tmp/system
+   -D mapred.temp.dir=/tmp/temp
+</source>
+<p>
+For more details on jobconf parameters see:
+<a href="ext:mapred-default">mapred-default.html</a></p>
+<p>
+To set an environment variable in a streaming command use:
+</p>
+<source>
+-cmdenv EXAMPLE_DIR=/home/example/dictionaries/
+</source>
+</section>
+</section>
+
+<section>
+<title>More usage examples </title>
+
+<section>
+<title>Customizing the Way to Split Lines into Key/Value Pairs </title>
+<p>
+As noted earlier, when the Map/Reduce framework reads a line from the stdout of the mapper, it splits the line into a key/value pair. By default, the prefix of the line up to the first tab character is the key and the rest of the line (excluding the tab character) is the value.
+</p>
+<p>
+However, you can customize this default. You can specify a field separator other than the tab character (the default), and you can specify the nth (n >= 1) character rather than the first character in a line (the default) as the separator between the key and value. For example:
+</p>
+
+<source>
+$HADOOP_HOME/bin/hadoop  jar $HADOOP_HOME/hadoop-streaming.jar \
+    -input myInputDirs \
+    -output myOutputDir \
+    -mapper org.apache.hadoop.mapred.lib.IdentityMapper \
+    -reducer org.apache.hadoop.mapred.lib.IdentityReducer \
+    -D stream.map.output.field.separator=. \
+    -D stream.num.map.output.key.fields=4 
+</source>
+<p>
+In the above example, "-D stream.map.output.field.separator=." specifies "." as the field separator for the map outputs, and the prefix up to the fourth "." in a line will be the key and the rest of the line (excluding the fourth ".") will be the value. If a line has less than four "."s, then the whole line will be the key and the value will be an empty Text object (like the one created by new Text("")).
+</p><p>
+Similarly, you can use "-D stream.reduce.output.field.separator=SEP" and "-D stream.num.reduce.output.fields=NUM" to specify the nth field separator in a line of the reduce outputs as the separator between the key and the value.
+</p>
+<p> Similarly, you can specify "stream.map.input.field.separator" and 
+"stream.reduce.input.field.separator" as the input separator for map/reduce 
+inputs. By default the separator is the tab character.</p>
+</section>
+
+
+<section>
+<title>A Useful Partitioner Class (secondary sort, the -partitioner org.apache.hadoop.mapred.lib.KeyFieldBasedPartitioner option) </title>
+<p>
+Hadoop has a library class, 
+<a href="ext:api/org/apache/hadoop/mapred/lib/keyfieldbasedpartitioner">KeyFieldBasedPartitioner</a>, 
+that is useful for many applications. This class allows the Map/Reduce 
+framework to partition the map outputs based on certain key fields, not
+the whole keys. For example:
+</p>
+<source>
+$HADOOP_HOME/bin/hadoop  jar $HADOOP_HOME/hadoop-streaming.jar \
+    -input myInputDirs \
+    -output myOutputDir \
+    -mapper org.apache.hadoop.mapred.lib.IdentityMapper \
+    -reducer org.apache.hadoop.mapred.lib.IdentityReducer \
+    -partitioner org.apache.hadoop.mapred.lib.KeyFieldBasedPartitioner \
+    -D stream.map.output.field.separator=. \
+    -D stream.num.map.output.key.fields=4 \
+    -D map.output.key.field.separator=. \
+    -D mapred.text.key.partitioner.options=-k1,2\
+    -D mapred.reduce.tasks=12
+</source>
+<p>
+Here, <em>-D stream.map.output.field.separator=.</em> and <em>-D stream.num.map.output.key.fields=4</em> are as explained in the previous example. The two variables are used by streaming to identify the key/value pair of the mapper. 
+</p><p>
+The map output keys of the above Map/Reduce job normally have four fields
+separated by ".". However, the Map/Reduce framework will partition the map
+outputs by the first two fields of the keys using the 
+<em>-D mapred.text.key.partitioner.options=-k1,2</em> option. 
+Here, <em>-D map.output.key.field.separator=.</em> specifies the separator 
+for the partition. This guarantees that all the key/value pairs with the 
+same first two fields in the keys will be partitioned into the same reducer.
+</p><p>
+<em>This is effectively equivalent to specifying the first two fields as the primary key and the next two fields as the secondary. The primary key is used for partitioning, and the combination of the primary and secondary keys is used for sorting.</em> A simple illustration is shown here:
+</p>
+<p>
+Output of map (the keys)</p><source>
+11.12.1.2
+11.14.2.3
+11.11.4.1
+11.12.1.1
+11.14.2.2
+
+</source>
+<p>
+Partition into 3 reducers (the first 2 fields are used as keys for partition)</p><source>
+11.11.4.1
+-----------
+11.12.1.2
+11.12.1.1
+-----------
+11.14.2.3
+11.14.2.2
+</source>
+<p>
+Sorting within each partition for the reducer (all 4 fields used for sorting)</p><source>
+11.11.4.1
+-----------
+11.12.1.1
+11.12.1.2
+-----------
+11.14.2.2
+11.14.2.3
+</source>
+</section>
+<section>
+<title>A Useful Comparator Class</title>
+<p>
+Hadoop has a library class, 
+<a href="ext:api/org/apache/hadoop/mapred/lib/keyfieldbasedcomparator">KeyFieldBasedComparator</a>, 
+that is useful for many applications. This class provides a subset of features
+provided by the Unix/GNU Sort. For example:
+</p>
+<source>
+$HADOOP_HOME/bin/hadoop  jar $HADOOP_HOME/hadoop-streaming.jar \
+    -input myInputDirs \
+    -output myOutputDir \
+    -mapper org.apache.hadoop.mapred.lib.IdentityMapper \
+    -reducer org.apache.hadoop.mapred.lib.IdentityReducer \
+    -D mapred.output.key.comparator.class=org.apache.hadoop.mapred.lib.KeyFieldBasedComparator \
+    -D stream.map.output.field.separator=. \
+    -D stream.num.map.output.key.fields=4 \
+    -D map.output.key.field.separator=. \
+    -D mapred.text.key.comparator.options=-k2,2nr\
+    -D mapred.reduce.tasks=12
+</source>
+<p>
+The map output keys of the above Map/Reduce job normally have four fields
+separated by ".". However, the Map/Reduce framework will sort the 
+outputs by the second field of the keys using the 
+<em>-D mapred.text.key.comparator.options=-k2,2nr</em> option. 
+Here, <em>-n</em> specifies that the sorting is numerical sorting and 
+<em>-r</em> specifies that the result should be reversed. A simple illustration
+is shown below:
+</p>
+<p>
+Output of map (the keys)</p>
+<source>
+11.12.1.2
+11.14.2.3
+11.11.4.1
+11.12.1.1
+11.14.2.2
+</source>
+<p>
+Sorting output for the reducer (where the second field is used for sorting)</p>
+<source>
+11.14.2.3
+11.14.2.2
+11.12.1.2
+11.12.1.1
+11.11.4.1
+</source>
+</section>
+
+<section>
+<title>Working with the Hadoop Aggregate Package (the -reducer aggregate option) </title>
+<p>
+Hadoop has a library package called 
+<a href="ext:api/org/apache/hadoop/mapred/lib/aggregate/package-summary">Aggregate</a>.
+Aggregate provides a special reducer class and a special combiner class, and
+a list of simple aggregators that perform aggregations such as "sum", "max",
+"min" and so on  over a sequence of values. Aggregate allows you to define a
+mapper plugin class that is expected to generate "aggregatable items" for each
+input key/value pair of the mappers. The combiner/reducer will aggregate those
+aggregatable items by invoking the appropriate aggregators.
+</p><p>
+To use Aggregate, simply specify "-reducer aggregate":
+</p>
+<source>
+$HADOOP_HOME/bin/hadoop  jar $HADOOP_HOME/hadoop-streaming.jar \
+    -input myInputDirs \
+    -output myOutputDir \
+    -mapper myAggregatorForKeyCount.py \
+    -reducer aggregate \
+    -file myAggregatorForKeyCount.py \
+    -D mapred.reduce.tasks=12
+</source>
+<p>
+The Python program myAggregatorForKeyCount.py looks like this:
+</p>
+<source>
+#!/usr/bin/python
+
+import sys
+
+def generateLongCountToken(id):
+    # Emit an aggregatable item: "LongValueSum" sums the trailing "1"s per key.
+    return "LongValueSum:" + id + "\t" + "1"
+
+def main(argv):
+    # For each input line, emit a count token keyed on the first tab-separated field.
+    for line in sys.stdin:
+        line = line[:-1]
+        fields = line.split("\t")
+        print generateLongCountToken(fields[0])
+
+if __name__ == "__main__":
+    main(sys.argv)
+</source>
+</section>
+
+<section>
+<title>Field Selection (similar to the Unix 'cut' command) </title>
+<p>
+Hadoop has a library class, org.apache.hadoop.mapred.lib.FieldSelectionMapReduce, that effectively allows you to process text data like the Unix "cut" utility. The map function defined in the class treats each input key/value pair as a list of fields. You can specify the field separator (the default is the tab character), select an arbitrary list of fields as the map output key, and select an arbitrary list of fields as the map output value. Similarly, the reduce function defined in the class treats each input key/value pair as a list of fields, and you can select arbitrary lists of fields as the reduce output key and value. For example:
+</p>
+<source>
+$HADOOP_HOME/bin/hadoop  jar $HADOOP_HOME/hadoop-streaming.jar \
+    -input myInputDirs \
+    -output myOutputDir \
+    -mapper org.apache.hadoop.mapred.lib.FieldSelectionMapReduce \
+    -reducer org.apache.hadoop.mapred.lib.FieldSelectionMapReduce \
+    -partitioner org.apache.hadoop.mapred.lib.KeyFieldBasedPartitioner \
+    -D map.output.key.field.separator=. \
+    -D mapred.text.key.partitioner.options=-k1,2 \
+    -D mapred.data.field.separator=. \
+    -D map.output.key.value.fields.spec=6,5,1-3:0- \
+    -D reduce.output.key.value.fields.spec=0-2:5- \
+    -D mapred.reduce.tasks=12
+</source>
+<p>
+The option "-D map.output.key.value.fields.spec=6,5,1-3:0-" specifies key/value selection for the map outputs. Key selection spec and value selection spec are separated by ":". In this case, the map output key will consist of fields 6, 5, 1, 2, and 3. The map output value will consist of all fields (0- means field 0 and all 
+the subsequent fields). 
+</p><p>
+The option "-D reduce.output.key.value.fields.spec=0-2:5-" specifies 
+key/value selection for the reduce outputs. In this case, the reduce 
+output key will consist of fields 0, 1, 2 (corresponding to the original 
+fields 6, 5, 1). The reduce output value will consist of all fields starting
+from field 5 (corresponding to all the original fields).  
+</p>
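+<p>
+As a rough illustration of the selection-spec semantics only (this sketch is not the actual FieldSelectionMapReduce implementation, and the sample record is hypothetical), the following Python snippet shows how a key spec such as "6,5,1-3" and a value spec such as "0-" would be applied to one dot-separated record:
+</p>
+<source>
+#!/usr/bin/python
+# Sketch of the field-selection semantics: a spec is a comma-separated
+# list of zero-based field indices and inclusive ranges; "N-" means
+# field N and all subsequent fields.
+
+def select_fields(fields, spec):
+    out = []
+    for part in spec.split(","):
+        if "-" in part:
+            start, end = part.split("-")
+            if end == "":
+                out.extend(fields[int(start):])
+            else:
+                out.extend(fields[int(start):int(end) + 1])
+        else:
+            out.append(fields[int(part)])
+    return out
+
+record = "0.1.2.3.4.5.6.7".split(".")
+print(".".join(select_fields(record, "6,5,1-3")))   # key:   6.5.1.2.3
+print(".".join(select_fields(record, "0-")))        # value: 0.1.2.3.4.5.6.7
+</source>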
+</section>
+</section>
+
+<section>
+<title>Frequently Asked Questions </title>
+
+<section>
+<title>How do I use Hadoop Streaming to run an arbitrary set of (semi-)independent tasks? </title>
+<p>
+Often you do not need the full power of Map/Reduce, but only need to run multiple instances of the same program, either on different parts of the data or on the same data with different parameters. You can use Hadoop Streaming to do this, as the sketch below shows.
+</p>
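+<p>
+For example (a minimal sketch; my_program is a hypothetical executable shipped to the tasks with the -file option), list one work item per input line, run the job with zero reduces (e.g. -D mapred.reduce.tasks=0), and let each map task invoke the program once per line:
+</p>
+<source>
+#!/usr/bin/python
+# Sketch of a "run my program on every work item" mapper. Each input
+# line is one work item (a file name, a parameter set, and so on).
+
+import subprocess
+import sys
+
+for line in sys.stdin:
+    item = line.strip()
+    if not item:
+        continue
+    # Run the (hypothetical) program once per work item.
+    rc = subprocess.call(["./my_program", item])
+    # Emit the item and its exit status so failures are easy to spot.
+    print("%s\t%d" % (item, rc))
+</source>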
+
+</section>
+
+<section>
+<title>How do I process files, one per map? </title>
+<p>
+As an example, consider the problem of zipping (compressing) a set of files across the Hadoop cluster. You can achieve this using either of these methods:
+</p><ol>
+<li> Hadoop Streaming and custom mapper script:<ul>
+  <li> Generate a file containing the full HDFS paths of the input files. Each map task would get one file name as input.</li>
+  <li> Create a mapper script which, given a filename, fetches the file to the local disk, gzips it, and puts it back in the desired output directory (a sketch of such a script is shown after this list).</li>
+</ul></li>
+<li>The existing Hadoop Framework:<ul>
+   <li>Add these commands to your main function:
+<source>
+       FileOutputFormat.setCompressOutput(conf, true);
+       FileOutputFormat.setOutputCompressorClass(conf, org.apache.hadoop.io.compress.GzipCodec.class);
+       // Use a non-splittable input format (a custom TextInputFormat whose
+       // isSplitable() returns false) so each whole file goes to one map task.
+       conf.setInputFormat(NonSplitableTextInputFormat.class);
+       conf.setNumReduceTasks(0);  // map-only job
+</source></li>
+   <li>Write your map function:
+<source>
+
+       public void map(WritableComparable key, Writable value, 
+                               OutputCollector output, 
+                               Reporter reporter) throws IOException {
+            // Emit the input value as the output key; no output value is needed.
+            output.collect((Text)value, null);
+       }
+</source></li>
+  <li>Note that the output filename will not be the same as the original filename</li>
+</ul></li>
+</ol>
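+<p>
+For the first method, a mapper script along the following lines could work. This is a rough sketch only: it assumes each input line is a full HDFS path, that the hadoop command is on the task's PATH, and that the output directory /user/me/zipped (a hypothetical example) already exists.
+</p>
+<source>
+#!/usr/bin/python
+# Sketch of a per-file compression mapper (method 1 above).
+# Assumptions: one full HDFS path per input line; "hadoop" is on the
+# PATH of the task; /user/me/zipped is a hypothetical output directory.
+
+import os
+import subprocess
+import sys
+
+OUTPUT_DIR = "/user/me/zipped"
+
+for line in sys.stdin:
+    path = line.strip()
+    if not path:
+        continue
+    name = os.path.basename(path)
+    # Copy the file to the local disk, gzip it, and put it back into HDFS.
+    subprocess.check_call(["hadoop", "fs", "-get", path, name])
+    subprocess.check_call(["gzip", name])
+    subprocess.check_call(["hadoop", "fs", "-put", name + ".gz",
+                           OUTPUT_DIR + "/" + name + ".gz"])
+    print("%s\tdone" % path)
+</source>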
+</section>
+
+<section>
+<title>How many reducers should I use? </title>
+<p>
+See the <a href="mapred_tutorial.html#Reducer">Reducer</a> section of the Map/Reduce Tutorial for details.
+</p>
+</section>
+
+<section>
+<title>If I set up an alias in my shell script, will that work after -mapper? For example, if I do alias c1='cut -f1', will -mapper "c1" work? </title>
+<p>
+Using an alias will not work, but variable substitution is allowed as shown in this example:
+</p>
+<source>
+$ hadoop dfs -cat samples/student_marks
+alice   50
+bruce   70
+charlie 80
+dan     75
+
+$ c2='cut -f2'; $HADOOP_HOME/bin/hadoop jar $HADOOP_HOME/hadoop-streaming.jar \
+    -input /user/me/samples/student_marks \
+    -mapper \"$c2\" -reducer 'cat' \
+    -output /user/me/samples/student_out \
+    -D mapred.job.name='Experiment'
+
+$ hadoop dfs -ls samples/student_out
+Found 1 items
+/user/me/samples/student_out/part-00000    &lt;r 3&gt;   16
+
+$ hadoop dfs -cat samples/student_out/part-00000
+50
+70
+75
+80
+</source>
+</section>
+
+<section>
+<title>Can I use UNIX pipes? For example, will -mapper "cut -f1 | sed s/foo/bar/g" work?</title>
+<p>
+Currently this does not work and gives a "java.io.IOException: Broken pipe" error. This is probably a bug that needs to be investigated.
+</p>
+</section>
+
+<section>
+<title>When I run a streaming job by <strong>distributing large executables</strong> (for example, 3.6G) through the -file option, I get a "No space left on device" error. What do I do? </title>
+<p>
+The jar packaging happens in a directory pointed to by the configuration variable stream.tmpdir. The default value of stream.tmpdir is /tmp. Set the value to a directory with more space:
+</p>
+<source>
+-D stream.tmpdir=/export/bigspace/...
+</source>
+</section>
+
+<section>
+<title>How do I specify multiple input directories? </title>
+<p>
+You can specify multiple input directories with multiple '-input' options:
+</p><source>
+ hadoop jar hadoop-streaming.jar -input '/user/foo/dir1' -input '/user/foo/dir2' 
+</source>
+</section>
+
+<section>
+<title>How do I generate output files with gzip format? </title>
+<p>
+Instead of plain text files, you can generate gzip files as your output. Pass '-D mapred.output.compress=true -D mapred.output.compression.codec=org.apache.hadoop.io.compress.GzipCodec' as options to your streaming job.
+</p>
+</section>
+
+<section>
+<title>How do I provide my own input/output format with streaming? </title>
+<p>
+At least as late as version 0.14, Hadoop does not support multiple jar files. So, when specifying your own custom classes, you will have to pack them along with the streaming jar and use this custom jar instead of the default Hadoop streaming jar. 
+</p>
+</section>
+
+<section>
+<title>How do I parse XML documents using streaming? </title>
+<p>
+You can use the record reader StreamXmlRecordReader to process XML documents. 
+</p>
+<source>
+hadoop jar hadoop-streaming.jar -inputreader "StreamXmlRecord,begin=BEGIN_STRING,end=END_STRING" ..... (rest of the command)
+</source>
+<p>
+Anything found between BEGIN_STRING and END_STRING would be treated as one record for map tasks.
+</p>
+</section>
+
+<section>
+<title>How do I update counters in streaming applications? </title>
+<p>
+A streaming process can use stderr to emit counter information.
+To update a counter, send a line of the form
+<code>reporter:counter:&lt;group&gt;,&lt;counter&gt;,&lt;amount&gt;</code> to stderr.
+</p>
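+<p>
+For example, a minimal Python mapper sketch that increments a hypothetical counter (group "MyJob", counter "MalformedLines") for every malformed input line while passing well-formed lines through:
+</p>
+<source>
+#!/usr/bin/python
+# "MyJob" and "MalformedLines" are hypothetical names chosen for illustration.
+
+import sys
+
+for line in sys.stdin:
+    fields = line.rstrip("\n").split("\t")
+    if len(fields) != 2:
+        # Bump the counter by 1 for each malformed record.
+        sys.stderr.write("reporter:counter:MyJob,MalformedLines,1\n")
+        continue
+    print("%s\t%s" % (fields[0], fields[1]))
+</source>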
+</section>
+
+<section>
+<title>How do I update status in streaming applications? </title>
+<p>
+A streaming process can use stderr to emit status information.
+To set the task status, send a line of the form
+<code>reporter:status:&lt;message&gt;</code> to stderr.
+</p>
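+<p>
+For example, a long-running mapper might report its progress periodically (a minimal sketch; the message text and the reporting interval are arbitrary):
+</p>
+<source>
+#!/usr/bin/python
+import sys
+
+count = 0
+for line in sys.stdin:
+    count += 1
+    if count % 10000 == 0:
+        # Update the task status so progress is visible in the web UI.
+        sys.stderr.write("reporter:status:processed %d records\n" % count)
+    # ... do the real per-record work and print the output here ...
+</source>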
+</section>
+
+</section>
+</body>
+</document>

Propchange: hadoop/hdfs/branches/HDFS-265/src/docs/src/documentation/content/xdocs/streaming.xml
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Modified: hadoop/hdfs/branches/HDFS-265/src/docs/src/documentation/content/xdocs/tabs.xml
URL: http://svn.apache.org/viewvc/hadoop/hdfs/branches/HDFS-265/src/docs/src/documentation/content/xdocs/tabs.xml?rev=817449&r1=817448&r2=817449&view=diff
==============================================================================
--- hadoop/hdfs/branches/HDFS-265/src/docs/src/documentation/content/xdocs/tabs.xml (original)
+++ hadoop/hdfs/branches/HDFS-265/src/docs/src/documentation/content/xdocs/tabs.xml Mon Sep 21 22:33:09 2009
@@ -30,8 +30,8 @@
     directory (ends in '/'), in which case /index.html will be added
   -->
 
-  <tab label="Project" href="http://hadoop.apache.org/hdfs/" />
-  <tab label="Wiki" href="http://wiki.apache.org/hadoop/hdfs" />
-  <tab label="HDFS 0.21 Documentation" dir="" />  
+  <tab label="Project" href="http://hadoop.apache.org/core/" />
+  <tab label="Wiki" href="http://wiki.apache.org/hadoop" />
+  <tab label="Hadoop 0.21 Documentation" dir="" />  
   
 </tabs>

Added: hadoop/hdfs/branches/HDFS-265/src/docs/src/documentation/content/xdocs/vaidya.xml
URL: http://svn.apache.org/viewvc/hadoop/hdfs/branches/HDFS-265/src/docs/src/documentation/content/xdocs/vaidya.xml?rev=817449&view=auto
==============================================================================
--- hadoop/hdfs/branches/HDFS-265/src/docs/src/documentation/content/xdocs/vaidya.xml (added)
+++ hadoop/hdfs/branches/HDFS-265/src/docs/src/documentation/content/xdocs/vaidya.xml Mon Sep 21 22:33:09 2009
@@ -0,0 +1,172 @@
+<?xml version="1.0"?>
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+-->
+
+<!DOCTYPE document PUBLIC "-//APACHE//DTD Documentation V2.0//EN" "http://forrest.apache.org/dtd/document-v20.dtd">
+
+<document>
+  
+  <header>
+    <title>Vaidya Guide</title>
+  </header>
+  
+  <body>
+  
+    <section>
+      <title>Purpose</title>
+      
+      <p>This document describes various user-facing facets of Hadoop Vaidya, a performance diagnostic tool for map/reduce jobs. It
+         describes how to execute a default set of rules against your map/reduce job counters and
+         how to write and execute new rules to detect specific performance problems. 
+      </p>
+      <p>A few sample test rules are provided with the tool, with the objective of growing the rules database over time. 
+         You are welcome to contribute new rules for everyone's benefit; to do so, follow the 
+         <a href="http://wiki.apache.org/hadoop/HowToContribute">How to Contribute</a> procedure
+         specified on the Apache Hadoop website.
+      </p>
+    </section>
+    
+    <section>
+      <title>Pre-requisites</title>
+      
+      <p>Ensure that Hadoop is installed and configured. More details:</p> 
+      <ul>
+        <li>
+          Make sure the HADOOP_HOME environment variable is set.
+        </li>
+        <li>
+          Make sure Java is installed and configured as a part of the Hadoop installation.
+        </li>
+      </ul>
+    </section>
+    
+    <section>
+      <title>Overview</title>
+      
+      <p>Hadoop Vaidya (Vaidya in Sanskrit means "one who knows", or "a physician") 
+        is a rule-based performance diagnostic tool for 
+        Map/Reduce jobs. It performs a post-execution analysis of a map/reduce 
+        job by parsing and collecting execution statistics through the job history 
+        and job configuration files. It runs a set of predefined tests/rules 
+        against the job execution statistics to diagnose various performance problems. 
+        Each test rule detects a specific performance problem with the Map/Reduce job and provides 
+        targeted advice to the user. The tool generates an XML report based on 
+        the evaluation results of the individual test rules.
+      </p>
+      
+    </section>
+  
+    <section>
+	 <title>Terminology</title>
+	 
+	 <p>This section describes the main concepts and terminology involved with Hadoop Vaidya:</p>
+		<ul>
+			<li> <em>PostExPerformanceDiagnoser</em>: This class extends the base Diagnoser class and acts as a driver for post execution performance analysis of Map/Reduce Jobs. 
+                       It detects performance inefficiencies by executing a set of performance diagnosis rules against the job execution statistics.</li>
+			<li> <em>Job Statistics</em>: This includes the job configuration information (job.xml) and the various counters logged by the Map/Reduce job as a part of the job history log
+		           file. The counters are parsed and collected into the Job Statistics data structures, which contain global job-level aggregate counters and 
+			     a set of counters for each Map and Reduce task.</li>
+			<li> <em>Diagnostic Test/Rule</em>: This is program logic that detects inefficiencies in an M/R job based on the job statistics. The
+				 description of the test is specified as an XML element (DiagnosticTest) in a test description file, e.g. the
+				 default test description file, <em>$HADOOP_HOME/contrib/vaidya/conf/postex_diagnosis_tests.xml</em>. The actual logic is coded as
+				 a Java class and referenced from the DiagnosticTest XML element. </li>
+		</ul>
+	<p></p>
+	<p>The following list describes the <em>DiagnosticTest</em> XML element in a diagnostic test description file:</p>
+		<ul>
+			<li> <em>DiagnosticTest{Title}</em>: Specifies a short name/description of the test.</li>
+			<li> <em>DiagnosticTest{ClassName}</em>: Specifies fully qualified class name that implements the test logic.</li>
+			<li> <em>DiagnosticTest{Description}</em>: Specifies a full description of the test rule.</li>
+			<li> <em>DiagnosticTest{Importance}</em>: Specifies a declarative value for overall importance of the test rule. (Values: High, Medium, Low)</li>
+			<li> <em>DiagnosticTest{SuccessThreshold}</em>: This is a threshold value specified by the test case writer; if the impact level of the test case
+				 is lower than this threshold, the test is declared PASSED (or NEGATIVE). The impact level is calculated and returned
+				 by each test's evaluate function and indicates the degree to which the job suffers from the condition being evaluated.</li>
+			<li> <em>DiagnosticTest{Prescription}</em>: This is targeted advice written by the test case adviser for the user to follow when the test is not PASSED. </li>
+			<li> <em>DiagnosticTest{InputElement}</em>: This is optional, test-specific input that the test writer can provide. It is supplied to the individual test case
+                       class so that the test writer can use it within the test case. Typically this is test configuration information, so that the test writer can
+                       configure the test case through these input values rather than changing the Java code.</li>
+		</ul>
+	<p></p>
+	<p>The following list describes the elements of the performance analysis report generated by the tool in XML format:</p>
+		<ul>
+			<li> <em>PostExPerformanceDiagnosticReport</em>: This is the document (root) element of the XML report generated by the tool. </li>
+			<li> <em>TestReportElement</em>: This is an XML element of the test report document; there is one for each individual test specified in the test description file. </li>
+			<li> <em>TestReportElement{TestTitle}</em>: Will be included from DiagnosticTest{Title} </li>
+			<li> <em>TestReportElement{TestDescription}</em>: Will be included from DiagnosticTest{Description} </li>
+			<li> <em>TestReportElement{TestImportance}</em>: Will be included from DiagnosticTest{Importance} </li>
+			<li> <em>TestReportElement{TestSeverity}</em>: This is the product of the test impact level and the test importance. It indicates the overall severity of the test.</li>
+			<li> <em>TestReportElement{ReferenceDetails}</em>: This is test-specific runtime information provided by the test case to support the test result and severity. Typically
+				 the test writer should print the test impact level in this section. </li>
+			<li> <em>TestReportElement{TestResults}</em>: This is the boolean outcome of the test based on the SuccessThreshold specified by the test writer in the DiagnosticTest description.
+				 PASSED (NEGATIVE) indicates no problem, while FAILED (POSITIVE) indicates a potential problem with the job for the given test case. </li>
+			<li> <em>TestReportElement{TestPrescription}</em>: This is included from DiagnosticTest{Prescription}, unless the test case writer overrides it in the test case class through the getPrescription()
+				 method. </li>
+		</ul>	 
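+	<p>
+	For example, a small script can pull the failed tests and their prescriptions out of a report. The following Python sketch is illustrative only: it assumes the TestReportElement elements are direct children of the report root and that a FAILED result string contains the word "POSITIVE"; check an actual report for the exact structure and strings.
+	</p>
+<source>
+#!/usr/bin/python
+# Illustrative sketch: print the failed tests from a Vaidya XML report.
+# Assumptions (not verified against the tool): TestReportElement elements
+# sit directly under the report root, and a FAILED result contains "POSITIVE".
+# Usage: report_summary.py report.xml
+
+import sys
+import xml.etree.ElementTree as ET
+
+def text(elem, name):
+    child = elem.find(name)
+    if child is None or child.text is None:
+        return ""
+    return child.text.strip()
+
+root = ET.parse(sys.argv[1]).getroot()
+for test in root.findall("TestReportElement"):
+    if "POSITIVE" in text(test, "TestResults"):
+        print("%s [severity: %s]" % (text(test, "TestTitle"), text(test, "TestSeverity")))
+        print("  prescription: %s" % text(test, "TestPrescription"))
+</source>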
+	</section>
+	
+	<section>
+		<title>How to Execute the Hadoop Vaidya Tool</title>
+		  
+      	<p>The script to execute Hadoop Vaidya is in the <code>$HADOOP_HOME/contrib/vaidya/bin/</code> directory.
+		   It comes with a default set of rules defined in the file 
+           <code>$HADOOP_HOME/contrib/vaidya/conf/postex_diagnosis_tests.xml</code>.</p>
+		  <ul>
+			<li>Make sure the HADOOP_HOME environment variable is set and that Java is installed and configured.</li>
+			<li>Execute the Hadoop Vaidya script with -help (or without any arguments) to get the command line help, e.g. 
+                       <code>sh $HADOOP_HOME/contrib/vaidya/bin/vaidya.sh -help</code></li>
+			<li>Supply the job's configuration file (<code>-jobconf job_conf.xml</code>), the job history log file (<code>-joblog job_history_log_file</code>), and optionally a test description
+				 file (<code>-testconf postex_diagnostic_tests.xml</code>). If a test description file is not specified, the default one is picked up from the Hadoop Vaidya jar (<code>$HADOOP_HOME/contrib/vaidya/hadoop-{version}-vaidya.jar</code>).
+				 This default test description file is also available at the following location, so that users can make a local copy, modify it, and add new test rules: 
+			     <code>$HADOOP_HOME/contrib/vaidya/conf/postex_diagnostic_tests.xml</code></li>
+			<li>Use the <code>-report report_file</code> option to store the XML report in the specified report_file.</li>  
+		 </ul>
+	</section>
+	
+    <section>
+		<title>How to Write and Execute your own Tests</title>
+		<p>Writing and executing your own test rules is not very hard. You can take a look at the Hadoop Vaidya source code for the existing set of tests. 
+		   The source code is at this <a href="http://svn.apache.org/viewvc/hadoop/core/trunk/src/contrib/vaidya/src/java/org/apache/hadoop/vaidya/">Hadoop svn repository location</a>.
+		   The default set of tests is under the <code>"postexdiagnosis/tests/"</code> folder.</p>
+		<ul>
+		  <li>The test class for your new test case should extend the <code>org.apache.hadoop.vaidya.DiagnosticTest</code> class and 
+		       override the following three methods from the base class: 
+              <ul> 
+				<li> evaluate() </li>
+				<li> getPrescription() </li> 
+ 				<li> getReferenceDetails() </li> 
+              </ul>
+          </li>
+		  <li>Make a local copy of the <code>$HADOOP_HOME/contrib/vaidya/conf/postex_diagnostic_tests.xml</code> file or create a new test description XML file.</li>
+		  <li>Add the test description element for your new test case to this test description file.</li>
+		  <li>Compile your new test class (or classes), archive them into a jar file, and add it to the CLASSPATH, e.g. <code>export CLASSPATH=$CLASSPATH:newtests.jar</code>.</li>
+		  <li>Execute the Hadoop Vaidya script with the job configuration, the job history log, and a reference to the newly created test description file, using the <em>-testconf</em> option: 
+		  <code>sh $HADOOP_HOME/contrib/vaidya/bin/vaidya.sh -joblog job_history_log_file -jobconf job.xml -testconf new_test_description_file -report report.xml</code></li>
+		</ul>
+	</section>
+	
+    <p> </p>
+    <p> </p>
+    <p>
+      <em>Java and JNI are trademarks or registered trademarks of 
+      Sun Microsystems, Inc. in the United States and other countries.</em>
+    </p>
+    
+  </body>
+  
+</document>

Propchange: hadoop/hdfs/branches/HDFS-265/src/docs/src/documentation/content/xdocs/vaidya.xml
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Modified: hadoop/hdfs/branches/HDFS-265/src/docs/src/documentation/skinconf.xml
URL: http://svn.apache.org/viewvc/hadoop/hdfs/branches/HDFS-265/src/docs/src/documentation/skinconf.xml?rev=817449&r1=817448&r2=817449&view=diff
==============================================================================
--- hadoop/hdfs/branches/HDFS-265/src/docs/src/documentation/skinconf.xml (original)
+++ hadoop/hdfs/branches/HDFS-265/src/docs/src/documentation/skinconf.xml Mon Sep 21 22:33:09 2009
@@ -67,8 +67,8 @@
   <!-- project logo -->
   <project-name>Hadoop</project-name>
   <project-description>Scalable Computing Platform</project-description>
-  <project-url>http://hadoop.apache.org/hdfs/</project-url>
-  <project-logo>images/hdfs-logo.jpg</project-logo>
+  <project-url>http://hadoop.apache.org/core/</project-url>
+  <project-logo>images/core-logo.gif</project-logo>
 
   <!-- group logo -->
   <group-name>Hadoop</group-name>
@@ -146,13 +146,13 @@
     <!--Headers -->
 	#content h1 {
 	  margin-bottom: .5em;
-	  font-size: 185%; color: black;
+	  font-size: 200%; color: black;
 	  font-family: arial;
 	}  
-    h2, .h3 { font-size: 175%; color: black; font-family: arial; }
-	h3, .h4 { font-size: 135%; color: black; font-family: arial; margin-bottom: 0.5em; }
+    h2, .h3 { font-size: 195%; color: black; font-family: arial; }
+	h3, .h4 { font-size: 140%; color: black; font-family: arial; margin-bottom: 0.5em; }
 	h4, .h5 { font-size: 125%; color: black;  font-style: italic; font-weight: bold; font-family: arial; }
-	h5, h6 { font-size: 110%; color: #363636; font-weight: bold; }    
+	h5, h6 { font-size: 110%; color: #363636; font-weight: bold; } 
    
    <!--Code Background -->
     pre.code {

Modified: hadoop/hdfs/branches/HDFS-265/src/java/org/apache/hadoop/hdfs/DFSClient.java
URL: http://svn.apache.org/viewvc/hadoop/hdfs/branches/HDFS-265/src/java/org/apache/hadoop/hdfs/DFSClient.java?rev=817449&r1=817448&r2=817449&view=diff
==============================================================================
--- hadoop/hdfs/branches/HDFS-265/src/java/org/apache/hadoop/hdfs/DFSClient.java (original)
+++ hadoop/hdfs/branches/HDFS-265/src/java/org/apache/hadoop/hdfs/DFSClient.java Mon Sep 21 22:33:09 2009
@@ -575,35 +575,6 @@
     leasechecker.put(src, result);
     return result;
   }
-  
-  /**
-   * Same as {{@link #create(String, FsPermission, EnumSet, short, long,
-   *  Progressable, int)}   except that the permission
-   *   is absolute (ie has already been masked with umask.
-   * 
-   */
-  public OutputStream primitiveCreate(String src, 
-                             FsPermission absPermission,
-                             EnumSet<CreateFlag> flag,
-                             boolean createParent,
-                             short replication,
-                             long blockSize,
-                             Progressable progress,
-                             int buffersize,
-                             int bytesPerChecksum)
-    throws IOException {
-    checkOpen();
-    if (absPermission == null) {
-      absPermission = 
-        FsPermission.getDefault().applyUMask(FsPermission.getUMask(conf));
-    } 
-    LOG.debug(src + ": masked=" + absPermission);
-    OutputStream result = new DFSOutputStream(src, absPermission,
-        flag, createParent, replication, blockSize, progress, buffersize,
-        bytesPerChecksum);
-    leasechecker.put(src, result);
-    return result;
-  } 
 
   /**
    * Append to an existing HDFS file.  
@@ -1032,28 +1003,6 @@
                                      FileAlreadyExistsException.class);
     }
   }
-  
-  /**
-   * Same {{@link #mkdirs(String, FsPermission, boolean)} except
-   * that the permissions has already been masked against umask.
-   */
-  public boolean primitiveMkdir(String src, FsPermission absPermission)
-    throws IOException{
-    checkOpen();
-    if (absPermission == null) {
-      absPermission = 
-        FsPermission.getDefault().applyUMask(FsPermission.getUMask(conf));
-    } 
-
-    LOG.debug(src + ": masked=" + absPermission);
-    try {
-      return namenode.mkdirs(src, absPermission, true);
-    } catch(RemoteException re) {
-      throw re.unwrapRemoteException(AccessControlException.class,
-                                     NSQuotaExceededException.class,
-                                     DSQuotaExceededException.class);
-    }
-  }
 
   ContentSummary getContentSummary(String src) throws IOException {
     try {

Modified: hadoop/hdfs/branches/HDFS-265/src/java/org/apache/hadoop/hdfs/DistributedFileSystem.java
URL: http://svn.apache.org/viewvc/hadoop/hdfs/branches/HDFS-265/src/java/org/apache/hadoop/hdfs/DistributedFileSystem.java?rev=817449&r1=817448&r2=817449&view=diff
==============================================================================
--- hadoop/hdfs/branches/HDFS-265/src/java/org/apache/hadoop/hdfs/DistributedFileSystem.java (original)
+++ hadoop/hdfs/branches/HDFS-265/src/java/org/apache/hadoop/hdfs/DistributedFileSystem.java Mon Sep 21 22:33:09 2009
@@ -177,13 +177,6 @@
     }
     return dfs.getBlockLocations(getPathName(file.getPath()), start, len);
   }
-  
-  @Override
-  public BlockLocation[] getFileBlockLocations(Path p, 
-      long start, long len) throws IOException {
-    return dfs.getBlockLocations(getPathName(p), start, len);
-
-  }
 
   @Override
   public void setVerifyChecksum(boolean verifyChecksum) {
@@ -210,21 +203,11 @@
     EnumSet<CreateFlag> flag, int bufferSize, short replication, long blockSize,
     Progressable progress) throws IOException {
 
-    return new FSDataOutputStream(dfs.create(getPathName(f), permission,
+    return new FSDataOutputStream
+       (dfs.create(getPathName(f), permission,
                    flag, replication, blockSize, progress, bufferSize),
         statistics);
   }
-  
-  @SuppressWarnings("deprecation")
-  @Override
-  protected FSDataOutputStream primitiveCreate(Path f,
-    FsPermission absolutePermission, EnumSet<CreateFlag> flag, int bufferSize,
-    short replication, long blockSize, Progressable progress,
-    int bytesPerChecksum) throws IOException {
-    return new FSDataOutputStream(dfs.primitiveCreate(getPathName(f),
-        absolutePermission, flag, true, replication, blockSize,
-        progress, bufferSize, bytesPerChecksum),statistics);
-   } 
 
   /**
    * Same as create(), except fails if parent directory doesn't already exist.
@@ -310,13 +293,6 @@
     return dfs.mkdirs(getPathName(f), permission, true);
   }
 
-  @SuppressWarnings("deprecation")
-  @Override
-  protected boolean primitiveMkdir(Path f, FsPermission absolutePermission)
-    throws IOException {
-    return dfs.primitiveMkdir(getPathName(f), absolutePermission);
-  }
-
   /** {@inheritDoc} */
   @Override
   public void close() throws IOException {

Modified: hadoop/hdfs/branches/HDFS-265/src/java/org/apache/hadoop/hdfs/HftpFileSystem.java
URL: http://svn.apache.org/viewvc/hadoop/hdfs/branches/HDFS-265/src/java/org/apache/hadoop/hdfs/HftpFileSystem.java?rev=817449&r1=817448&r2=817449&view=diff
==============================================================================
--- hadoop/hdfs/branches/HDFS-265/src/java/org/apache/hadoop/hdfs/HftpFileSystem.java (original)
+++ hadoop/hdfs/branches/HDFS-265/src/java/org/apache/hadoop/hdfs/HftpFileSystem.java Mon Sep 21 22:33:09 2009
@@ -39,12 +39,14 @@
 import org.apache.hadoop.fs.CreateFlag;
 import org.apache.hadoop.fs.FSDataInputStream;
 import org.apache.hadoop.fs.FSDataOutputStream;
+import org.apache.hadoop.fs.FSInputStream;
 import org.apache.hadoop.fs.FileChecksum;
 import org.apache.hadoop.fs.FileStatus;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.MD5MD5CRC32FileChecksum;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.fs.permission.FsPermission;
+import org.apache.hadoop.hdfs.server.namenode.ListPathsServlet;
 import org.apache.hadoop.ipc.RemoteException;
 import org.apache.hadoop.net.NetUtils;
 import org.apache.hadoop.security.UnixUserGroupInformation;
@@ -57,6 +59,7 @@
 import org.xml.sax.XMLReader;
 import org.xml.sax.helpers.DefaultHandler;
 import org.xml.sax.helpers.XMLReaderFactory;
+import org.apache.hadoop.hdfs.ByteRangeInputStream;
 
 
 
@@ -295,7 +298,7 @@
 
   @Override
   public Path getWorkingDirectory() {
-    return new Path("/").makeQualified(getUri(), null);
+    return new Path("/").makeQualified(this);
   }
 
   @Override

Modified: hadoop/hdfs/branches/HDFS-265/src/java/org/apache/hadoop/hdfs/server/namenode/BlockPlacementPolicy.java
URL: http://svn.apache.org/viewvc/hadoop/hdfs/branches/HDFS-265/src/java/org/apache/hadoop/hdfs/server/namenode/BlockPlacementPolicy.java?rev=817449&r1=817448&r2=817449&view=diff
==============================================================================
--- hadoop/hdfs/branches/HDFS-265/src/java/org/apache/hadoop/hdfs/server/namenode/BlockPlacementPolicy.java (original)
+++ hadoop/hdfs/branches/HDFS-265/src/java/org/apache/hadoop/hdfs/server/namenode/BlockPlacementPolicy.java Mon Sep 21 22:33:09 2009
@@ -19,8 +19,10 @@
 
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.hdfs.protocol.Block;
+import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
 import org.apache.hadoop.hdfs.protocol.LocatedBlock;
 import org.apache.hadoop.net.NetworkTopology;
+import org.apache.hadoop.net.Node; 
 import org.apache.hadoop.util.ReflectionUtils;
 import java.util.*;
 

Modified: hadoop/hdfs/branches/HDFS-265/src/java/org/apache/hadoop/hdfs/server/namenode/BlockPlacementPolicyDefault.java
URL: http://svn.apache.org/viewvc/hadoop/hdfs/branches/HDFS-265/src/java/org/apache/hadoop/hdfs/server/namenode/BlockPlacementPolicyDefault.java?rev=817449&r1=817448&r2=817449&view=diff
==============================================================================
--- hadoop/hdfs/branches/HDFS-265/src/java/org/apache/hadoop/hdfs/server/namenode/BlockPlacementPolicyDefault.java (original)
+++ hadoop/hdfs/branches/HDFS-265/src/java/org/apache/hadoop/hdfs/server/namenode/BlockPlacementPolicyDefault.java Mon Sep 21 22:33:09 2009
@@ -23,6 +23,7 @@
 import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
 import org.apache.hadoop.hdfs.protocol.FSConstants;
 import org.apache.hadoop.hdfs.protocol.LocatedBlock;
+import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
 import org.apache.hadoop.net.NetworkTopology;
 import org.apache.hadoop.net.Node;
 import org.apache.hadoop.net.NodeBase;


