hbase-commits mailing list archives

From mi...@apache.org
Subject [09/17] hbase git commit: HBASE-12858 - remove extraneous Docbook files
Date Thu, 15 Jan 2015 04:40:26 GMT
http://git-wip-us.apache.org/repos/asf/hbase/blob/e80b3092/src/main/docbkx/hbase_apis.xml
----------------------------------------------------------------------
diff --git a/src/main/docbkx/hbase_apis.xml b/src/main/docbkx/hbase_apis.xml
deleted file mode 100644
index bc35aba..0000000
--- a/src/main/docbkx/hbase_apis.xml
+++ /dev/null
@@ -1,133 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<chapter
-  version="5.0"
-  xml:id="hbase_apis"
-  xmlns="http://docbook.org/ns/docbook"
-  xmlns:xlink="http://www.w3.org/1999/xlink"
-  xmlns:xi="http://www.w3.org/2001/XInclude"
-  xmlns:svg="http://www.w3.org/2000/svg"
-  xmlns:m="http://www.w3.org/1998/Math/MathML"
-  xmlns:html="http://www.w3.org/1999/xhtml"
-  xmlns:db="http://docbook.org/ns/docbook">
-  <!--
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
--->
-  <title>Apache HBase APIs</title>
-  <para>This chapter provides information about performing operations using HBase native APIs. This
-    information is not exhaustive, and provides a quick reference in addition to the <link
-      xlink:href="http://hbase.apache.org/apidocs/index.html">User API
-    Reference</link>. The examples here are not comprehensive or complete, and should be used for
-    purposes of illustration only.</para>
-  <para>Apache HBase also works with multiple external APIs. See <xref linkend="external_apis" />
-    for more information.</para>
-
-  <example>
-    <title>Create a Table Using Java</title>
-    <para>This example has been tested on HBase 0.96.1.1.</para>
-    <programlisting language="java">
-package com.example.hbase.admin;
-
-import java.io.IOException;
-
-import org.apache.hadoop.hbase.HBaseConfiguration;
-import org.apache.hadoop.hbase.HColumnDescriptor;
-import org.apache.hadoop.hbase.HTableDescriptor;
-import org.apache.hadoop.hbase.TableName;
-import org.apache.hadoop.hbase.client.HBaseAdmin;
-import org.apache.hadoop.hbase.io.compress.Compression.Algorithm;
-import org.apache.hadoop.conf.Configuration;
-
-import static com.example.hbase.Constants.*;
-
-public class CreateSchema {
-
- public static void createOrOverwrite(HBaseAdmin admin, HTableDescriptor table) throws IOException {
-    if (admin.tableExists(table.getName())) {
-      admin.disableTable(table.getName());
-      admin.deleteTable(table.getName());
-    }
-    admin.createTable(table);
-  }
-
-  public static void createSchemaTables (Configuration config) {
-    try {
-      final HBaseAdmin admin = new HBaseAdmin(config);
-      HTableDescriptor table = new HTableDescriptor(TableName.valueOf(TABLE_NAME));
-      table.addFamily(new HColumnDescriptor(CF_DEFAULT).setCompressionType(Algorithm.SNAPPY));
-
-      System.out.print("Creating table. ");
-      createOrOverwrite(admin, table);
-      System.out.println(" Done.");
-
-      admin.close();
-    } catch (Exception e) {
-      e.printStackTrace();
-      System.exit(-1);
-    }
-  }
-
-
-}      
-      
-    </programlisting>
-  </example>
-  <example>
-    <title>Add, Modify, and Delete a Table</title>
-    <para>This example has been tested on HBase 0.96.1.1.</para>
-    <programlisting language="java">
-public static void upgradeFrom0 (Configuration config) {
-
-    try {
-      final HBaseAdmin admin = new HBaseAdmin(config);
-      TableName tableName = TableName.valueOf(TABLE_ASSETMETA);
-      HTableDescriptor table_assetmeta = new HTableDescriptor(tableName);
-      table_assetmeta.addFamily(new HColumnDescriptor(CF_DEFAULT).setCompressionType(Algorithm.SNAPPY));
-
-      // Create a new table.
-
-      System.out.print("Creating table_assetmeta. ");
-      admin.createTable(table_assetmeta);
-      System.out.println(" Done.");
-
-      // Update existing table
-      HColumnDescriptor newColumn = new HColumnDescriptor("NEWCF");
-      newColumn.setCompactionCompressionType(Algorithm.GZ);
-      newColumn.setMaxVersions(HConstants.ALL_VERSIONS);
-      admin.addColumn(tableName, newColumn);
-
-      // Disable an existing table
-      admin.disableTable(tableName);
-
-      // Delete an existing column family
-      admin.deleteColumn(tableName, CF_DEFAULT);
-
-      // Delete a table (Need to be disabled first)
-      admin.deleteTable(tableName);
-
-
-      admin.close();
-    } catch (Exception e) {
-      e.printStackTrace();
-      System.exit(-1);
-    }
-  }      
-    </programlisting>
-  </example>
-
-</chapter>

http://git-wip-us.apache.org/repos/asf/hbase/blob/e80b3092/src/main/docbkx/hbase_history.xml
----------------------------------------------------------------------
diff --git a/src/main/docbkx/hbase_history.xml b/src/main/docbkx/hbase_history.xml
deleted file mode 100644
index f7b9064..0000000
--- a/src/main/docbkx/hbase_history.xml
+++ /dev/null
@@ -1,41 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<appendix
-    xml:id="hbase.history"
-    version="5.0"
-    xmlns="http://docbook.org/ns/docbook"
-    xmlns:xlink="http://www.w3.org/1999/xlink"
-    xmlns:xi="http://www.w3.org/2001/XInclude"
-    xmlns:svg="http://www.w3.org/2000/svg"
-    xmlns:m="http://www.w3.org/1998/Math/MathML"
-    xmlns:html="http://www.w3.org/1999/xhtml"
-    xmlns:db="http://docbook.org/ns/docbook">
-    <!--/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
--->
-    <title>HBase History</title>
-    <itemizedlist>
-        <listitem><para>2006:  <link xlink:href="http://research.google.com/archive/bigtable.html">BigTable</link> paper published by Google.
-        </para></listitem>
-        <listitem><para>2006 (end of year):  HBase development starts.
-        </para></listitem>
-        <listitem><para>2008:  HBase becomes Hadoop sub-project.
-        </para></listitem>
-        <listitem><para>2010:  HBase becomes Apache top-level project.
-        </para></listitem>
-    </itemizedlist>
-</appendix>

http://git-wip-us.apache.org/repos/asf/hbase/blob/e80b3092/src/main/docbkx/hbck_in_depth.xml
----------------------------------------------------------------------
diff --git a/src/main/docbkx/hbck_in_depth.xml b/src/main/docbkx/hbck_in_depth.xml
deleted file mode 100644
index e2ee34f..0000000
--- a/src/main/docbkx/hbck_in_depth.xml
+++ /dev/null
@@ -1,237 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<appendix
-    xml:id="hbck.in.depth"
-    version="5.0"
-    xmlns="http://docbook.org/ns/docbook"
-    xmlns:xlink="http://www.w3.org/1999/xlink"
-    xmlns:xi="http://www.w3.org/2001/XInclude"
-    xmlns:svg="http://www.w3.org/2000/svg"
-    xmlns:m="http://www.w3.org/1998/Math/MathML"
-    xmlns:html="http://www.w3.org/1999/xhtml"
-    xmlns:db="http://docbook.org/ns/docbook">
-    <!--/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
--->
-
-        <title>hbck In Depth</title>
-        <para>HBaseFsck (hbck) is a tool for checking for region consistency and table integrity problems
-            and repairing a corrupted HBase. It works in two basic modes -- a read-only inconsistency
-            identifying mode and a multi-phase read-write repair mode.
-        </para>
-        <section>
-            <title>Running hbck to identify inconsistencies</title>
-            <para>To check whether your HBase cluster has corruptions, run hbck against your HBase cluster:</para>
-            <programlisting language="bourne">
-$ ./bin/hbase hbck
-</programlisting>
-            <para>
-                At the end of the command's output it prints OK or tells you the number of INCONSISTENCIES
-                present. You may also want to run hbck a few times because some inconsistencies can be
-                transient (e.g. the cluster is starting up or a region is splitting). Operationally you may want to run
-                hbck regularly and set up alerts (e.g. via Nagios) if it repeatedly reports inconsistencies.
-                A run of hbck reports a list of inconsistencies along with a brief description of the regions and
-                tables affected. Using the <code>-details</code> option reports more details, including a representative
-                listing of all the splits present in all the tables.
-            </para>
-            <programlisting language="bourne">
-$ ./bin/hbase hbck -details
-</programlisting>
-            <para>If you just want to know whether some tables are corrupted, you can limit hbck to identifying inconsistencies
-                in only specific tables. For example, the following command would only attempt to check tables
-                TableFoo and TableBar. The benefit is that hbck will run in less time.</para>
-            <programlisting language="bourne">
-$ ./bin/hbase hbck TableFoo TableBar
-</programlisting>
-        </section>
-        <section><title>Inconsistencies</title>
-            <para>
-                If, after several runs, inconsistencies continue to be reported, you may have encountered a
-                corruption. These should be rare, but in the event they occur, newer versions of HBase include
-                the hbck tool with automatic repair options.
-            </para>
-            <para>
-                There are two invariants that when violated create inconsistencies in HBase:
-            </para>
-            <itemizedlist>
-                <listitem><para>HBase’s region consistency invariant is satisfied if every region is assigned and
-                    deployed on exactly one region server, and all places where this state is kept are in
-                    agreement.</para>
-                </listitem>
-                <listitem><para>HBase’s table integrity invariant is satisfied if for each table, every possible row key
-                    resolves to exactly one region.</para>
-                </listitem>
-            </itemizedlist>
-            <para>
-                Repairs generally work in three phases -- a read-only information gathering phase that identifies
-                inconsistencies, a table integrity repair phase that restores the table integrity invariant, and then
-                finally a region consistency repair phase that restores the region consistency invariant.
-                Starting from version 0.90.0, hbck could detect region consistency problems and report on a subset
-                of possible table integrity problems. It also included the ability to automatically fix the most
-                common class of inconsistency: region assignment and deployment consistency problems. This repair
-                could be done by using the <code>-fix</code> command line option. These fixes close regions if they are
-                open on the wrong server or on multiple region servers, and also assign regions to region
-                servers if they are not open.
-            </para>
-            <para>
-                Starting from HBase versions 0.90.7, 0.92.2 and 0.94.0, several new command line options were
-                introduced to aid in repairing a corrupted HBase. This hbck sometimes goes by the nickname
-                “uberhbck”. Each particular version of uberhbck is compatible with HBase releases of the same
-                major version (the 0.90.7 uberhbck can repair a 0.90.4 cluster). However, versions &lt;=0.90.6 and versions
-                &lt;=0.92.1 may require restarting the master or failing over to a backup master.
-            </para>
-        </section>
-        <section><title>Localized repairs</title>
-            <para>
-                When repairing a corrupted HBase, it is best to repair the lowest risk inconsistencies first.
-                These are generally region consistency repairs -- localized single-region repairs that only modify
-                in-memory data, ephemeral ZooKeeper data, or patch holes in the META table.
-                Region consistency requires that the state of a region’s data in HDFS
-                (.regioninfo files), the region’s row in the hbase:meta table, and the region’s deployment/assignments on
-                region servers and the master are all in agreement. Options for repairing region consistency include:
-                <itemizedlist>
-                    <listitem><para><code>-fixAssignments</code> (equivalent to the 0.90 <code>-fix</code> option) repairs unassigned, incorrectly
-                        assigned or multiply assigned regions.</para>
-                    </listitem>
-                    <listitem><para><code>-fixMeta</code> which removes meta rows when the corresponding regions are not present in
-                        HDFS and adds new meta rows if the regions are present in HDFS but not in META.</para>
-                    </listitem>
-                </itemizedlist>
-                To fix deployment and assignment problems you can run this command:
-            </para>
-            <programlisting language="bourne">
-$ ./bin/hbase hbck -fixAssignments
-</programlisting>
-            <para>To fix deployment and assignment problems as well as repairing incorrect meta rows you can
-                run this command:</para>
-            <programlisting language="bourne">
-$ ./bin/hbase hbck -fixAssignments -fixMeta
-</programlisting>
-            <para>There are a few classes of table integrity problems that are low-risk repairs. The first two are
-                degenerate (startkey == endkey) regions and backwards regions (startkey > endkey). These are
-                automatically handled by sidelining the data to a temporary directory (/hbck/xxxx).
-                The third low-risk class is HDFS region holes. This can be repaired by using the following option:</para>
-            <itemizedlist>
-                <listitem><para><code>-fixHdfsHoles</code> option for fabricating new empty regions on the file system.
-                    If holes are detected, you can use -fixHdfsHoles, and should include -fixMeta and -fixAssignments to make the new region consistent.</para>
-                </listitem>
-            </itemizedlist>
-            <programlisting language="bourne">
-$ ./bin/hbase hbck -fixAssignments -fixMeta -fixHdfsHoles
-</programlisting>
-            <para>Since this is a common operation, we’ve added the <code>-repairHoles</code> flag that is equivalent to the
-                previous command:</para>
-            <programlisting language="bourne">
-$ ./bin/hbase hbck -repairHoles
-</programlisting>
-            <para>If inconsistencies still remain after these steps, you most likely have table integrity problems
-                related to orphaned or overlapping regions.</para>
-        </section>
-        <section><title>Region Overlap Repairs</title>
-            <para>Table integrity problems can require repairs that deal with overlaps. This is a riskier operation
-                because it requires modifications to the file system, requires some decision making, and may
-                require some manual steps. For these repairs it is best to analyze the output of an <code>hbck -details</code>
-                run so that you restrict repair attempts to only the problems the checks identify. Because this is
-                riskier, there are safeguards that should be used to limit the scope of the repairs.
-                WARNING: These repairs are relatively new and have only been tested on online but idle HBase instances
-                (no reads/writes). Use at your own risk in an active production environment!
-                The options for repairing table integrity violations include:</para>
-            <itemizedlist>
-                <listitem><para><code>-fixHdfsOrphans</code> option for “adopting” a region directory that is missing a region
-                    metadata file (the .regioninfo file).</para>
-                </listitem>
-                <listitem><para><code>-fixHdfsOverlaps</code> option for fixing overlapping regions</para>
-                </listitem>
-            </itemizedlist>
-            <para>When repairing overlapping regions, a region’s data can be modified on the file system in two
-                ways: 1) by merging regions into a larger region or 2) by sidelining regions by moving data to a
-                “sideline” directory where it can be restored later. Merging a large number of regions is
-                technically correct but could result in an extremely large region that requires a series of costly
-                compactions and splitting operations. In these cases, it is probably better to sideline the regions
-                that overlap with the most other regions (likely the largest ranges) so that merges can happen on
-                a more reasonable scale. Since these sidelined regions are already laid out in HBase’s native
-                directory and HFile format, they can be restored by using HBase’s bulk load mechanism.
-                The default safeguard thresholds are conservative. These options let you override the default
-                thresholds and enable the large region sidelining feature.</para>
-            <itemizedlist>
-                <listitem><para><code>-maxMerge &lt;n&gt;</code> maximum number of overlapping regions to merge</para>
-                </listitem>
-                <listitem><para><code>-sidelineBigOverlaps</code> if more than maxMerge regions are overlapping, attempt
-                    to sideline the regions that overlap with the most other regions.</para>
-                </listitem>
-                <listitem><para><code>-maxOverlapsToSideline &lt;n&gt;</code> if sidelining large overlapping regions, sideline at most n
-                    regions.</para>
-                </listitem>
-            </itemizedlist>
-            
-            <para>Since often you just want to get the tables repaired, you can use this option to turn
-                on all repair options:</para>
-            <itemizedlist>
-                <listitem><para><code>-repair</code> includes all the region consistency options and only the hole repairing table
-                    integrity options.</para>
-                </listitem>
-            </itemizedlist>
-            <para>Finally, there are safeguards to limit repairs to only specific tables. For example, the following
-                command would only attempt to check and repair tables TableFoo and TableBar.</para>
-            <screen language="bourne">
-$ ./bin/hbase hbck -repair TableFoo TableBar
-</screen>
-            <section><title>Special cases: Meta is not properly assigned</title>
-                <para>There are a few special cases that hbck can handle as well.
-                    Sometimes the meta table’s only region is inconsistently assigned or deployed. In this case
-                    there is a special <code>-fixMetaOnly</code> option that can try to fix meta assignments.</para>
-                <screen language="bourne">
-$ ./bin/hbase hbck -fixMetaOnly -fixAssignments
-</screen>
-            </section>
-            <section><title>Special cases: HBase version file is missing</title>
-                <para>HBase’s data on the file system requires a version file in order to start. If this file is missing, you
-                    can use the <code>-fixVersionFile</code> option to fabricate a new HBase version file. This assumes that
-                    the version of hbck you are running is the appropriate version for the HBase cluster.</para>
-            </section>
-            <section><title>Special case: Root and META are corrupt.</title>
-                <para>The most drastic corruption scenario is the case where the ROOT or META is corrupted and
-                    HBase will not start. In this case you can use the OfflineMetaRepair tool to create new ROOT
-                    and META regions and tables.
-                    This tool assumes that HBase is offline. It then marches through the existing HBase home
-                    directory and loads as much information from region metadata files (.regioninfo files) as possible
-                    from the file system. If the region metadata has proper table integrity, it sidelines the original root
-                    and meta table directories, and builds new ones with pointers to the region directories and their
-                    data.</para>
-                <screen language="bourne">
-$ ./bin/hbase org.apache.hadoop.hbase.util.hbck.OfflineMetaRepair
-</screen>
-                <para>NOTE: This tool is not as clever as uberhbck but can be used to bootstrap repairs that uberhbck
-                    can complete.
-                    If the tool succeeds, you should be able to start HBase and run online repairs if necessary.</para>
-            </section>
-            <section><title>Special cases: Offline split parent</title>
-                <para>
-                    Once a region is split, the offline parent will be cleaned up automatically. Sometimes, daughter regions
-                    are split again before their parents are cleaned up. HBase can clean up parents in the right order. However,
-                    there can sometimes be lingering offline split parents: they are in META and in HDFS, but not deployed,
-                    and HBase cannot clean them up. In this case, you can use the <code>-fixSplitParents</code> option to reset
-                    them in META to be online and not split, so that hbck can merge them with other regions if the option for
-                    fixing overlapping regions is used.
-                </para>
-                <para>
-                    This option should not normally be used, and it is not in <code>-fixAll</code>.
-                </para>
-            </section>
-        </section>
-    
-</appendix>

http://git-wip-us.apache.org/repos/asf/hbase/blob/e80b3092/src/main/docbkx/mapreduce.xml
----------------------------------------------------------------------
diff --git a/src/main/docbkx/mapreduce.xml b/src/main/docbkx/mapreduce.xml
deleted file mode 100644
index 9e9e474..0000000
--- a/src/main/docbkx/mapreduce.xml
+++ /dev/null
@@ -1,630 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<chapter
-    xml:id="mapreduce"
-    version="5.0"
-    xmlns="http://docbook.org/ns/docbook"
-    xmlns:xlink="http://www.w3.org/1999/xlink"
-    xmlns:xi="http://www.w3.org/2001/XInclude"
-    xmlns:svg="http://www.w3.org/2000/svg"
-    xmlns:m="http://www.w3.org/1998/Math/MathML"
-    xmlns:html="http://www.w3.org/1999/xhtml"
-    xmlns:db="http://docbook.org/ns/docbook">
-    <!--/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
--->
-
-    <title>HBase and MapReduce</title>
-    <para>Apache MapReduce is a software framework used to analyze large amounts of data, and is
-      the framework used most often with <link
-        xlink:href="http://hadoop.apache.org/">Apache Hadoop</link>. MapReduce itself is out of the
-      scope of this document. A good place to get started with MapReduce is <link
-        xlink:href="http://hadoop.apache.org/docs/r1.2.1/mapred_tutorial.html" />. MapReduce version
-      2 (MR2) is now part of <link
-        xlink:href="http://hadoop.apache.org/docs/r2.3.0/hadoop-yarn/hadoop-yarn-site/">YARN</link>. </para>
-
-    <para> This chapter discusses specific configuration steps you need to take to use MapReduce on
-      data within HBase. In addition, it discusses other interactions and issues between HBase and
-      MapReduce jobs.
-      <note> 
-      <title>mapred and mapreduce</title>
-      <para>There are two mapreduce packages in HBase as in MapReduce itself: <filename>org.apache.hadoop.hbase.mapred</filename>
-      and <filename>org.apache.hadoop.hbase.mapreduce</filename>. The former uses the old-style API and the latter
-      the new style.  The latter has more facilities, though you can usually find an equivalent in the older
-      package.  Pick the package that goes with your MapReduce deployment.  When in doubt or starting over, pick
-      <filename>org.apache.hadoop.hbase.mapreduce</filename>.  In the notes below, we refer to
-      o.a.h.h.mapreduce, but replace it with o.a.h.h.mapred if that is what you are using.
-      </para>
-      </note> 
-    </para>
-
-    <section
-      xml:id="hbase.mapreduce.classpath">
-      <title>HBase, MapReduce, and the CLASSPATH</title>
-      <para>By default, MapReduce jobs deployed to a MapReduce cluster do not have access to either
-        the HBase configuration under <envar>$HBASE_CONF_DIR</envar> or the HBase classes.</para>
-      <para>To give the MapReduce jobs the access they need, you could add
-          <filename>hbase-site.xml</filename> to the
-            <filename><replaceable>$HADOOP_HOME</replaceable>/conf/</filename> directory and add the
-        HBase JARs to the <filename><replaceable>$HADOOP_HOME</replaceable>/lib/</filename>
-        directory, then copy these changes across your cluster, or edit
-          <filename><replaceable>$HADOOP_HOME</replaceable>/conf/hadoop-env.sh</filename> and add
-        them to the <envar>HADOOP_CLASSPATH</envar> variable. However, this approach is not
-        recommended because it will pollute your Hadoop install with HBase references. It also
-        requires you to restart the Hadoop cluster before Hadoop can use the HBase data.</para>
-      <para> Since HBase 0.90.x, HBase adds its dependency JARs to the job configuration itself. The
-        dependencies only need to be available on the local CLASSPATH. The following example runs
-        the bundled HBase <link
-          xlink:href="http://hbase.apache.org/apidocs/org/apache/hadoop/hbase/mapreduce/RowCounter.html">RowCounter</link>
-        MapReduce job against a table named <systemitem>usertable</systemitem>. If you have not set
-        the environment variables expected in the command (the parts prefixed by a
-          <literal>$</literal> sign and curly braces), you can use the actual system paths instead.
-        Be sure to use the correct version of the HBase JAR for your system. The backticks
-          (<literal>`</literal> symbols) cause the shell to execute the sub-commands, setting the
-        CLASSPATH as part of the command. This example assumes you use a BASH-compatible shell. </para>
-      <screen language="bourne">$ <userinput>HADOOP_CLASSPATH=`${HBASE_HOME}/bin/hbase classpath` ${HADOOP_HOME}/bin/hadoop jar ${HBASE_HOME}/hbase-server-VERSION.jar rowcounter usertable</userinput></screen>
-      <para>When the command runs, internally, the HBase JAR finds the dependencies it needs, such
-        as ZooKeeper and Guava, on the passed <envar>HADOOP_CLASSPATH</envar>
-        and adds those JARs to the MapReduce job configuration. See the source at
-        TableMapReduceUtil#addDependencyJars(org.apache.hadoop.mapreduce.Job) for how this is done. </para>
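-      <para>If you configure jobs in your own driver code rather than through the bundled driver, a
-        minimal, untested sketch of shipping the HBase dependency JARs with the job looks like the
-        following (the job name is a placeholder):</para>
-      <programlisting language="java">
-Configuration config = HBaseConfiguration.create();
-Job job = new Job(config, "MyHBaseJob");   // placeholder job name
-
-// ship the HBase JARs (and zookeeper, guava, and so on) with the job rather than
-// requiring them to be pre-installed on every node; initTableMapperJob does this for you
-TableMapReduceUtil.addDependencyJars(job);
-      </programlisting>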
-      <note>
-        <para> The example may not work if you are running HBase from its build directory rather
-          than an installed location. You may see an error like the following:</para>
-        <screen>java.lang.RuntimeException: java.lang.ClassNotFoundException: org.apache.hadoop.hbase.mapreduce.RowCounter$RowCounterMapper</screen>
-        <para>If this occurs, try modifying the command as follows, so that it uses the HBase JARs
-          from the <filename>target/</filename> directory within the build environment.</para>
-        <screen language="bourne">$ <userinput>HADOOP_CLASSPATH=${HBASE_HOME}/hbase-server/target/hbase-server-VERSION-SNAPSHOT.jar:`${HBASE_HOME}/bin/hbase classpath` ${HADOOP_HOME}/bin/hadoop jar ${HBASE_HOME}/hbase-server/target/hbase-server-VERSION-SNAPSHOT.jar rowcounter usertable</userinput></screen>
-      </note>
-      <caution>
-        <title>Notice to MapReduce users of HBase 0.96.1 and above</title>
-        <para>Some mapreduce jobs that use HBase fail to launch. The symptom is an exception similar
-          to the following:</para>
-        <screen>
-Exception in thread "main" java.lang.IllegalAccessError: class
-    com.google.protobuf.ZeroCopyLiteralByteString cannot access its superclass
-    com.google.protobuf.LiteralByteString
-    at java.lang.ClassLoader.defineClass1(Native Method)
-    at java.lang.ClassLoader.defineClass(ClassLoader.java:792)
-    at java.security.SecureClassLoader.defineClass(SecureClassLoader.java:142)
-    at java.net.URLClassLoader.defineClass(URLClassLoader.java:449)
-    at java.net.URLClassLoader.access$100(URLClassLoader.java:71)
-    at java.net.URLClassLoader$1.run(URLClassLoader.java:361)
-    at java.net.URLClassLoader$1.run(URLClassLoader.java:355)
-    at java.security.AccessController.doPrivileged(Native Method)
-    at java.net.URLClassLoader.findClass(URLClassLoader.java:354)
-    at java.lang.ClassLoader.loadClass(ClassLoader.java:424)
-    at java.lang.ClassLoader.loadClass(ClassLoader.java:357)
-    at
-    org.apache.hadoop.hbase.protobuf.ProtobufUtil.toScan(ProtobufUtil.java:818)
-    at
-    org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil.convertScanToString(TableMapReduceUtil.java:433)
-    at
-    org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil.initTableMapperJob(TableMapReduceUtil.java:186)
-    at
-    org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil.initTableMapperJob(TableMapReduceUtil.java:147)
-    at
-    org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil.initTableMapperJob(TableMapReduceUtil.java:270)
-    at
-    org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil.initTableMapperJob(TableMapReduceUtil.java:100)
-...
-</screen>
-        <para>This is caused by an optimization introduced in <link
-            xlink:href="https://issues.apache.org/jira/browse/HBASE-9867">HBASE-9867</link> that
-          inadvertently introduced a classloader dependency. </para>
-        <para>This affects both jobs using the <code>-libjars</code> option and "fat jar" jobs, those
-          which package their runtime dependencies in a nested <code>lib</code> folder.</para>
-        <para>In order to satisfy the new classloader requirements, hbase-protocol.jar must be
-          included in Hadoop's classpath. See <xref
-            linkend="hbase.mapreduce.classpath" /> for current recommendations for resolving
-          classpath errors. The following is included for historical purposes.</para>
-        <para>This can be resolved system-wide by including a reference to the hbase-protocol.jar in
-          hadoop's lib directory, via a symlink or by copying the jar into the new location.</para>
-        <para>This can also be achieved on a per-job launch basis by including it in the
-            <code>HADOOP_CLASSPATH</code> environment variable at job submission time. When
-          launching jobs that package their dependencies, all three of the following job launching
-          commands satisfy this requirement:</para>
-        <screen language="bourne">
-$ <userinput>HADOOP_CLASSPATH=/path/to/hbase-protocol.jar:/path/to/hbase/conf hadoop jar MyJob.jar MyJobMainClass</userinput>
-$ <userinput>HADOOP_CLASSPATH=$(hbase mapredcp):/path/to/hbase/conf hadoop jar MyJob.jar MyJobMainClass</userinput>
-$ <userinput>HADOOP_CLASSPATH=$(hbase classpath) hadoop jar MyJob.jar MyJobMainClass</userinput>
-        </screen>
-        <para>For jars that do not package their dependencies, the following command structure is
-          necessary:</para>
-        <screen language="bourne">
-$ <userinput>HADOOP_CLASSPATH=$(hbase mapredcp):/etc/hbase/conf hadoop jar MyApp.jar MyJobMainClass -libjars $(hbase mapredcp | tr ':' ',')</userinput> ...
-        </screen>
-        <para>See also <link
-            xlink:href="https://issues.apache.org/jira/browse/HBASE-10304">HBASE-10304</link> for
-          further discussion of this issue.</para>
-      </caution>
-    </section>
-
-    <section>
-      <title>MapReduce Scan Caching</title>
-      <para>TableMapReduceUtil now restores the option to set scanner caching (the number of rows
-        which are cached before returning the result to the client) on the Scan object that is
-        passed in. This functionality was lost due to a bug in HBase 0.95 (<link
-          xlink:href="https://issues.apache.org/jira/browse/HBASE-11558">HBASE-11558</link>), which
-        is fixed for HBase 0.98.5 and 0.96.3. The priority order for choosing the scanner caching is
-        as follows:</para>
-      <orderedlist>
-        <listitem>
-          <para>Caching settings which are set on the scan object.</para>
-        </listitem>
-        <listitem>
-          <para>Caching settings which are specified via the configuration option
-              <option>hbase.client.scanner.caching</option>, which can either be set manually in
-              <filename>hbase-site.xml</filename> or via the helper method
-              <code>TableMapReduceUtil.setScannerCaching()</code>.</para>
-        </listitem>
-        <listitem>
-          <para>The default value <code>HConstants.DEFAULT_HBASE_CLIENT_SCANNER_CACHING</code>, which is set to
-            <literal>100</literal>.</para>
-        </listitem>
-      </orderedlist>
-      <para>Optimizing the caching settings is a balance between the time the client waits for a
-        result and the number of sets of results the client needs to receive. If the caching setting
-        is too large, the client could end up waiting for a long time or the request could even time
-        out. If the setting is too small, the scan needs to return results in several pieces.
-        If you think of the scan as a shovel, a bigger cache setting is analogous to a bigger
-        shovel, and a smaller cache setting is equivalent to more shoveling in order to fill the
-        bucket.</para>
-      <para>The list of priorities mentioned above allows you to set a reasonable default, and
-        override it for specific operations.</para>
-      <para>See the API documentation for <link
-          xlink:href="https://hbase.apache.org/apidocs/org/apache/hadoop/hbase/client/Scan.html"
-          >Scan</link> for more details.</para>
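-      <para>As a rough sketch (the table and mapper names here are placeholders), the caching can be
-        set either directly on the <classname>Scan</classname> or through the helper method mentioned
-        above:</para>
-      <programlisting language="java">
-Configuration config = HBaseConfiguration.create();
-Job job = new Job(config, "ExampleScanCaching");
-
-Scan scan = new Scan();
-scan.setCaching(500);   // highest priority: set directly on the Scan object
-
-// alternative: set hbase.client.scanner.caching on the job configuration
-TableMapReduceUtil.setScannerCaching(job, 500);
-
-TableMapReduceUtil.initTableMapperJob(
-  "myTable",        // placeholder input table
-  scan,             // Scan instance carrying the caching setting
-  MyMapper.class,   // placeholder mapper
-  null,             // mapper output key
-  null,             // mapper output value
-  job);
-      </programlisting>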
-    </section>
-
-    <section>
-      <title>Bundled HBase MapReduce Jobs</title>
-      <para>The HBase JAR also serves as a Driver for some bundled mapreduce jobs. To learn about
-        the bundled MapReduce jobs, run the following command.</para>
-
-      <screen language="bourne">$ <userinput>${HADOOP_HOME}/bin/hadoop jar ${HBASE_HOME}/hbase-server-VERSION.jar</userinput>
-<computeroutput>An example program must be given as the first argument.
-Valid program names are:
-  copytable: Export a table from local cluster to peer cluster
-  completebulkload: Complete a bulk data load.
-  export: Write table data to HDFS.
-  import: Import data written by Export.
-  importtsv: Import data in TSV format.
-  rowcounter: Count rows in HBase table</computeroutput>
-    </screen>
-      <para>Each of the valid program names are bundled MapReduce jobs. To run one of the jobs,
-        model your command after the following example.</para>
-      <screen language="bourne">$ <userinput>${HADOOP_HOME}/bin/hadoop jar ${HBASE_HOME}/hbase-server-VERSION.jar rowcounter myTable</userinput></screen>
-    </section>
-
-    <section>
-      <title>HBase as a MapReduce Job Data Source and Data Sink</title>
-      <para>HBase can be used as a data source, <link
-          xlink:href="http://hbase.apache.org/apidocs/org/apache/hadoop/hbase/mapreduce/TableInputFormat.html">TableInputFormat</link>,
-        and data sink, <link
-          xlink:href="http://hbase.apache.org/apidocs/org/apache/hadoop/hbase/mapreduce/TableOutputFormat.html">TableOutputFormat</link>
-        or <link
-          xlink:href="http://hbase.apache.org/apidocs/org/apache/hadoop/hbase/mapreduce/MultiTableOutputFormat.html">MultiTableOutputFormat</link>,
-        for MapReduce jobs. When writing MapReduce jobs that read or write HBase, it is advisable to
-        subclass <link
-          xlink:href="http://hbase.apache.org/apidocs/org/apache/hadoop/hbase/mapreduce/TableMapper.html">TableMapper</link>
-        and/or <link
-          xlink:href="http://hbase.apache.org/apidocs/org/apache/hadoop/hbase/mapreduce/TableReducer.html">TableReducer</link>.
-        See the do-nothing pass-through classes <link
-          xlink:href="http://hbase.apache.org/apidocs/org/apache/hadoop/hbase/mapreduce/IdentityTableMapper.html">IdentityTableMapper</link>
-        and <link
-          xlink:href="http://hbase.apache.org/apidocs/org/apache/hadoop/hbase/mapreduce/IdentityTableReducer.html">IdentityTableReducer</link>
-        for basic usage. For a more involved example, see <link
-          xlink:href="http://hbase.apache.org/apidocs/org/apache/hadoop/hbase/mapreduce/RowCounter.html">RowCounter</link>
-        or review the <code>org.apache.hadoop.hbase.mapreduce.TestTableMapReduce</code> unit test. </para>
-      <para>If you run MapReduce jobs that use HBase as a source or sink, you need to specify the source and
-        sink table and column names in your configuration.</para>
-
-      <para>When you read from HBase, the <code>TableInputFormat</code> requests the list of regions
-        from HBase and creates one map task per region, or <code>mapreduce.job.maps</code> map tasks,
-        whichever is smaller. If your job only has two maps,
-        raise <code>mapreduce.job.maps</code> to a number greater than the number of regions. Maps
-        will run on the adjacent TaskTracker if you are running a TaskTracker and RegionServer per
-        node. When writing to HBase, it may make sense to avoid the Reduce step and write back into
-        HBase from within your map. This approach works when your job does not need the sort and
-        collation that MapReduce does on the map-emitted data. On insert, HBase 'sorts', so there is
-        no point double-sorting (and shuffling data around your MapReduce cluster) unless you need
-        to. If you do not need the Reduce, your map might emit counts of records processed for
-        reporting at the end of the job, or set the number of Reduces to zero and use
-        TableOutputFormat. If running the Reduce step makes sense in your case, you should typically
-        use multiple reducers so that load is spread across the HBase cluster.</para>
-
-      <para>A new HBase partitioner, the <link
-          xlink:href="http://hbase.apache.org/apidocs/org/apache/hadoop/hbase/mapreduce/HRegionPartitioner.html">HRegionPartitioner</link>,
-        can run as many reducers as there are existing regions. The HRegionPartitioner is suitable
-        when your table is large and your upload will not greatly alter the number of existing
-        regions upon completion. Otherwise use the default partitioner. </para>
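-      <para>As a rough sketch (the table and reducer names are placeholders), an overload of
-        <code>initTableReducerJob</code> accepts a partitioner class, so the
-        <classname>HRegionPartitioner</classname> can be plugged in when the reducer is set up:</para>
-      <programlisting language="java">
-// assumes the mapper has already been configured on 'job' via initTableMapperJob(...)
-TableMapReduceUtil.initTableReducerJob(
-  "targetTable",             // placeholder output table
-  MyTableReducer.class,      // placeholder reducer
-  job,
-  HRegionPartitioner.class); // partition reduce output by the target table's regions
-      </programlisting>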
-    </section>
-
-    <section>
-      <title>Writing HFiles Directly During Bulk Import</title>
-      <para>If you are importing into a new table, you can bypass the HBase API and write your
-        content directly to the filesystem, formatted into HBase data files (HFiles). Your import
-        will run faster, perhaps an order of magnitude faster. For more on how this mechanism works,
-        see <xref
-          linkend="arch.bulk.load" />.</para>
-    </section>
-
-    <section>
-      <title>RowCounter Example</title>
-      <para>The included <link
-        xlink:href="http://hbase.apache.org/apidocs/org/apache/hadoop/hbase/mapreduce/RowCounter.html">RowCounter</link>
-        MapReduce job uses <code>TableInputFormat</code> and does a count of all rows in the specified
-        table. To run it, use the following command: </para>
-      <screen language="bourne">$ <userinput>./bin/hadoop jar hbase-X.X.X.jar</userinput></screen> 
-      <para>This will
-        invoke the HBase MapReduce Driver class. Select <literal>rowcounter</literal> from the choice of jobs
-        offered. This will print rowcounter usage advice to standard output. Specify the table name,
-        column to count, and output
-        directory. If you have classpath errors, see <xref linkend="hbase.mapreduce.classpath" />.</para>
-    </section>
-
-    <section
-      xml:id="splitter">
-      <title>Map-Task Splitting</title>
-      <section
-        xml:id="splitter.default">
-        <title>The Default HBase MapReduce Splitter</title>
-        <para>When <link
-            xlink:href="http://hbase.apache.org/apidocs/org/apache/hadoop/hbase/mapreduce/TableInputFormat.html">TableInputFormat</link>
-          is used to source an HBase table in a MapReduce job, its splitter will make a map task for
-          each region of the table. Thus, if there are 100 regions in the table, there will be 100
-          map-tasks for the job - regardless of how many column families are selected in the
-          Scan.</para>
-      </section>
-      <section
-        xml:id="splitter.custom">
-        <title>Custom Splitters</title>
-        <para>For those interested in implementing custom splitters, see the method
-            <code>getSplits</code> in <link
-            xlink:href="http://hbase.apache.org/apidocs/org/apache/hadoop/hbase/mapreduce/TableInputFormatBase.html">TableInputFormatBase</link>.
-          That is where the logic for map-task assignment resides. </para>
-      </section>
-    </section>
-    <section
-      xml:id="mapreduce.example">
-      <title>HBase MapReduce Examples</title>
-      <section
-        xml:id="mapreduce.example.read">
-        <title>HBase MapReduce Read Example</title>
-        <para>The following is an example of using HBase as a MapReduce source in a read-only manner.
-          Specifically, there is a Mapper instance but no Reducer, and nothing is being emitted from
-          the Mapper. The job would be defined as follows...</para>
-        <programlisting language="java">
-Configuration config = HBaseConfiguration.create();
-Job job = new Job(config, "ExampleRead");
-job.setJarByClass(MyReadJob.class);     // class that contains mapper
-
-Scan scan = new Scan();
-scan.setCaching(500);        // 1 is the default in Scan, which will be bad for MapReduce jobs
-scan.setCacheBlocks(false);  // don't set to true for MR jobs
-// set other scan attrs
-...
-
-TableMapReduceUtil.initTableMapperJob(
-  tableName,        // input HBase table name
-  scan,             // Scan instance to control CF and attribute selection
-  MyMapper.class,   // mapper
-  null,             // mapper output key
-  null,             // mapper output value
-  job);
-job.setOutputFormatClass(NullOutputFormat.class);   // because we aren't emitting anything from mapper
-
-boolean b = job.waitForCompletion(true);
-if (!b) {
-  throw new IOException("error with job!");
-}
-  </programlisting>
-        <para>...and the mapper instance would extend <link
-            xlink:href="http://hbase.apache.org/apidocs/org/apache/hadoop/hbase/mapreduce/TableMapper.html">TableMapper</link>...</para>
-        <programlisting language="java">
-public static class MyMapper extends TableMapper&lt;Text, Text&gt; {
-
-  public void map(ImmutableBytesWritable row, Result value, Context context) throws InterruptedException, IOException {
-    // process data for the row from the Result instance.
-   }
-}
-    </programlisting>
-      </section>
-      <section
-        xml:id="mapreduce.example.readwrite">
-        <title>HBase MapReduce Read/Write Example</title>
-        <para>The following is an example of using HBase both as a source and as a sink with
-          MapReduce. This example will simply copy data from one table to another.</para>
-        <programlisting language="java">
-Configuration config = HBaseConfiguration.create();
-Job job = new Job(config,"ExampleReadWrite");
-job.setJarByClass(MyReadWriteJob.class);    // class that contains mapper
-
-Scan scan = new Scan();
-scan.setCaching(500);        // 1 is the default in Scan, which will be bad for MapReduce jobs
-scan.setCacheBlocks(false);  // don't set to true for MR jobs
-// set other scan attrs
-
-TableMapReduceUtil.initTableMapperJob(
-	sourceTable,      // input table
-	scan,	          // Scan instance to control CF and attribute selection
-	MyMapper.class,   // mapper class
-	null,	          // mapper output key
-	null,	          // mapper output value
-	job);
-TableMapReduceUtil.initTableReducerJob(
-	targetTable,      // output table
-	null,             // reducer class
-	job);
-job.setNumReduceTasks(0);
-
-boolean b = job.waitForCompletion(true);
-if (!b) {
-    throw new IOException("error with job!");
-}
-    </programlisting>
-        <para>An explanation is required of what <classname>TableMapReduceUtil</classname> is doing,
-          especially with the reducer. <link
-            xlink:href="http://hbase.apache.org/apidocs/org/apache/hadoop/hbase/mapreduce/TableOutputFormat.html">TableOutputFormat</link>
-          is being used as the outputFormat class, and several parameters are being set on the
-          config (e.g., TableOutputFormat.OUTPUT_TABLE), as well as setting the reducer output key
-          to <classname>ImmutableBytesWritable</classname> and reducer value to
-            <classname>Writable</classname>. These could be set by the programmer on the job and
-          conf, but <classname>TableMapReduceUtil</classname> tries to make things easier.</para>
-        <para>The following is the example mapper, which creates a <classname>Put</classname>
-          matching the input <classname>Result</classname> and emits it. Note: this is what the
-          CopyTable utility does. </para>
-        <programlisting language="java">
-public static class MyMapper extends TableMapper&lt;ImmutableBytesWritable, Put&gt;  {
-
-	public void map(ImmutableBytesWritable row, Result value, Context context) throws IOException, InterruptedException {
-		// this example is just copying the data from the source table...
-   		context.write(row, resultToPut(row,value));
-   	}
-
-  	private static Put resultToPut(ImmutableBytesWritable key, Result result) throws IOException {
-  		Put put = new Put(key.get());
- 		for (KeyValue kv : result.raw()) {
-			put.add(kv);
-		}
-		return put;
-   	}
-}
-    </programlisting>
-        <para>There isn't actually a reducer step, so <classname>TableOutputFormat</classname> takes
-          care of sending the <classname>Put</classname> to the target table. </para>
-        <para>This is just an example, developers could choose not to use
-            <classname>TableOutputFormat</classname> and connect to the target table themselves.
-        </para>
-      </section>
-      <section
-        xml:id="mapreduce.example.readwrite.multi">
-        <title>HBase MapReduce Read/Write Example With Multi-Table Output</title>
-        <para>TODO: example for <classname>MultiTableOutputFormat</classname>. </para>
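-        <para>In the meantime, the following rough, untested sketch illustrates the general pattern;
-          the table names, mapper, and driver class are placeholders. With
-          <classname>MultiTableOutputFormat</classname>, the <classname>ImmutableBytesWritable</classname>
-          key passed to <code>context.write()</code> names the destination table for each
-          <classname>Put</classname> or <classname>Delete</classname>.</para>
-        <programlisting language="java">
-Configuration config = HBaseConfiguration.create();
-Job job = new Job(config, "ExampleMultiTableWrite");
-job.setJarByClass(MyMultiWriteJob.class);   // placeholder driver class
-
-Scan scan = new Scan();
-scan.setCaching(500);        // 1 is the default in Scan, which will be bad for MapReduce jobs
-scan.setCacheBlocks(false);  // don't set to true for MR jobs
-
-TableMapReduceUtil.initTableMapperJob(
-  sourceTable,                   // placeholder input table
-  scan,
-  MyMultiTableMapper.class,      // placeholder mapper
-  ImmutableBytesWritable.class,  // mapper output key: destination table name
-  Put.class,                     // mapper output value
-  job);
-job.setOutputFormatClass(MultiTableOutputFormat.class);
-job.setNumReduceTasks(0);
-
-boolean b = job.waitForCompletion(true);
-if (!b) {
-  throw new IOException("error with job!");
-}
-    </programlisting>
-        <para>...and the mapper would name the destination table in the key of each write:</para>
-        <programlisting language="java">
-public static class MyMultiTableMapper extends TableMapper&lt;ImmutableBytesWritable, Put&gt; {
-
-  public void map(ImmutableBytesWritable row, Result value, Context context) throws IOException, InterruptedException {
-    Put put = new Put(row.get());
-    for (KeyValue kv : value.raw()) {
-      put.add(kv);
-    }
-    // the key passed to write() is the name of the table this Put goes to
-    context.write(new ImmutableBytesWritable(Bytes.toBytes("targetTable1")), put);
-  }
-}
-    </programlisting>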
-      </section>
-      <section
-        xml:id="mapreduce.example.summary">
-        <title>HBase MapReduce Summary to HBase Example</title>
-        <para>The following example uses HBase as a MapReduce source and sink with a summarization
-          step. This example will count the number of distinct instances of a value in a table and
-          write those summarized counts in another table.
-          <programlisting language="java">
-Configuration config = HBaseConfiguration.create();
-Job job = new Job(config,"ExampleSummary");
-job.setJarByClass(MySummaryJob.class);     // class that contains mapper and reducer
-
-Scan scan = new Scan();
-scan.setCaching(500);        // 1 is the default in Scan, which will be bad for MapReduce jobs
-scan.setCacheBlocks(false);  // don't set to true for MR jobs
-// set other scan attrs
-
-TableMapReduceUtil.initTableMapperJob(
-	sourceTable,        // input table
-	scan,               // Scan instance to control CF and attribute selection
-	MyMapper.class,     // mapper class
-	Text.class,         // mapper output key
-	IntWritable.class,  // mapper output value
-	job);
-TableMapReduceUtil.initTableReducerJob(
-	targetTable,        // output table
-	MyTableReducer.class,    // reducer class
-	job);
-job.setNumReduceTasks(1);   // at least one, adjust as required
-
-boolean b = job.waitForCompletion(true);
-if (!b) {
-	throw new IOException("error with job!");
-}
-    </programlisting>
-          In this example mapper a column with a String-value is chosen as the value to summarize
-          upon. This value is used as the key to emit from the mapper, and an
-            <classname>IntWritable</classname> represents an instance counter.
-          <programlisting language="java">
-public static class MyMapper extends TableMapper&lt;Text, IntWritable&gt;  {
-	public static final byte[] CF = "cf".getBytes();
-	public static final byte[] ATTR1 = "attr1".getBytes();
-
-	private final IntWritable ONE = new IntWritable(1);
-   	private Text text = new Text();
-
-   	public void map(ImmutableBytesWritable row, Result value, Context context) throws IOException, InterruptedException {
-        	String val = new String(value.getValue(CF, ATTR1));
-          	text.set(val);     // we can only emit Writables...
-
-        	context.write(text, ONE);
-   	}
-}
-    </programlisting>
-          In the reducer, the "ones" are counted (just like any other MR example that does this),
-          and then emits a <classname>Put</classname>.
-          <programlisting language="java">
-public static class MyTableReducer extends TableReducer&lt;Text, IntWritable, ImmutableBytesWritable&gt;  {
-	public static final byte[] CF = "cf".getBytes();
-	public static final byte[] COUNT = "count".getBytes();
-
- 	public void reduce(Text key, Iterable&lt;IntWritable&gt; values, Context context) throws IOException, InterruptedException {
-    		int i = 0;
-    		for (IntWritable val : values) {
-    			i += val.get();
-    		}
-    		Put put = new Put(Bytes.toBytes(key.toString()));
-    		put.add(CF, COUNT, Bytes.toBytes(i));
-
-    		context.write(null, put);
-   	}
-}
-    </programlisting>
-        </para>
-      </section>
-      <section
-        xml:id="mapreduce.example.summary.file">
-        <title>HBase MapReduce Summary to File Example</title>
-        <para>This is very similar to the summary example above, with the exception that it uses
-          HBase as a MapReduce source but HDFS as the sink. The differences are in the job setup and
-          in the reducer. The mapper remains the same. </para>
-        <programlisting language="java">
-Configuration config = HBaseConfiguration.create();
-Job job = new Job(config,"ExampleSummaryToFile");
-job.setJarByClass(MySummaryFileJob.class);     // class that contains mapper and reducer
-
-Scan scan = new Scan();
-scan.setCaching(500);        // 1 is the default in Scan, which will be bad for MapReduce jobs
-scan.setCacheBlocks(false);  // don't set to true for MR jobs
-// set other scan attrs
-
-TableMapReduceUtil.initTableMapperJob(
-	sourceTable,        // input table
-	scan,               // Scan instance to control CF and attribute selection
-	MyMapper.class,     // mapper class
-	Text.class,         // mapper output key
-	IntWritable.class,  // mapper output value
-	job);
-job.setReducerClass(MyReducer.class);    // reducer class
-job.setNumReduceTasks(1);    // at least one, adjust as required
-FileOutputFormat.setOutputPath(job, new Path("/tmp/mr/mySummaryFile"));  // adjust directories as required
-
-boolean b = job.waitForCompletion(true);
-if (!b) {
-	throw new IOException("error with job!");
-}
-    </programlisting>
-        <para>As stated above, the previous Mapper can run unchanged with this example. As for the
-          Reducer, it is a "generic" Reducer instead of extending TableReducer and emitting
-          Puts.</para>
-        <programlisting language="java">
- public static class MyReducer extends Reducer&lt;Text, IntWritable, Text, IntWritable&gt;  {
-
-	public void reduce(Text key, Iterable&lt;IntWritable&gt; values, Context context) throws IOException, InterruptedException {
-		int i = 0;
-		for (IntWritable val : values) {
-			i += val.get();
-		}
-		context.write(key, new IntWritable(i));
-	}
-}
-    </programlisting>
-      </section>
-      <section
-        xml:id="mapreduce.example.summary.noreducer">
-        <title>HBase MapReduce Summary to HBase Without Reducer</title>
-        <para>It is also possible to perform summaries without a reducer - if you use HBase as the
-          reducer. </para>
-        <para>An HBase target table would need to exist for the job summary. The Table method
-            <code>incrementColumnValue</code> would be used to atomically increment values. From a
-          performance perspective, it might make sense to keep a Map of values with the counts to
-          be incremented for each map-task, and make one update per key during the <code>
-            cleanup</code> method of the mapper. However, your mileage may vary depending on the
-          number of rows to be processed and the number of unique keys. </para>
-        <para>In the end, the summary results are in HBase. </para>
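-        <para>As a rough, untested sketch (the target table <literal>summary</literal> and source
-          column <literal>cf:attr1</literal> are placeholders), the mapper could accumulate counts in
-          a Map and flush them with <code>incrementColumnValue</code> during <code>cleanup</code>;
-          because the mapper emits nothing, the job would use <classname>NullOutputFormat</classname>
-          as in the read example above.</para>
-        <programlisting language="java">
-public static class MySummaryMapper extends TableMapper&lt;ImmutableBytesWritable, Put&gt; {
-  public static final byte[] CF = "cf".getBytes();        // placeholder column family
-  public static final byte[] ATTR1 = "attr1".getBytes();  // placeholder qualifier
-  public static final byte[] COUNT = "count".getBytes();
-
-  private Table summaryTable;                              // placeholder target table
-  private Map&lt;String, Long&gt; counts = new HashMap&lt;String, Long&gt;();
-
-  public void setup(Context context) throws IOException {
-    // as in the lookup-table example below, create a Connection here and keep it (details elided)
-    summaryTable = connection.getTable(TableName.valueOf("summary"));
-  }
-
-  public void map(ImmutableBytesWritable row, Result value, Context context) {
-    String val = new String(value.getValue(CF, ATTR1));
-    Long current = counts.get(val);
-    counts.put(val, current == null ? 1L : current + 1L);
-  }
-
-  public void cleanup(Context context) throws IOException {
-    // one atomic increment per distinct key instead of one per input row
-    for (Map.Entry&lt;String, Long&gt; e : counts.entrySet()) {
-      summaryTable.incrementColumnValue(Bytes.toBytes(e.getKey()), CF, COUNT, e.getValue());
-    }
-    summaryTable.close();
-  }
-}
-    </programlisting>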
-      </section>
-      <section
-        xml:id="mapreduce.example.summary.rdbms">
-        <title>HBase MapReduce Summary to RDBMS</title>
-        <para>Sometimes it is more appropriate to generate summaries to an RDBMS. For these cases,
-          it is possible to generate summaries directly to an RDBMS via a custom reducer. The
-            <code>setup</code> method can connect to an RDBMS (the connection information can be
-          passed via custom parameters in the context) and the cleanup method can close the
-          connection. </para>
-        <para>It is critical to understand that the number of reducers for the job affects the
-          summarization implementation, and you'll have to design this into your reducer.
-          Specifically, whether it is designed to run as a singleton (one reducer) or multiple
-          reducers. Neither is right or wrong, it depends on your use-case. Recognize that the more
-          reducers that are assigned to the job, the more simultaneous connections to the RDBMS will
-          be created - this will scale, but only to a point. </para>
-        <programlisting language="java">
- public static class MyRdbmsReducer extends Reducer&lt;Text, IntWritable, Text, IntWritable&gt;  {
-
-	private Connection c = null;
-
-	public void setup(Context context) {
-  		// create DB connection...
-  	}
-
-	public void reduce(Text key, Iterable&lt;IntWritable&gt; values, Context context) throws IOException, InterruptedException {
-		// do summarization
-		// in this example the keys are Text, but this is just an example
-	}
-
-	public void cleanup(Context context) {
-  		// close db connection
-  	}
-
-}
-    </programlisting>
-        <para>In the end, the summary results are written to your RDBMS table/s. </para>
-      </section>
-
-    </section>
-    <!--  mr examples -->
-    <section
-      xml:id="mapreduce.htable.access">
-      <title>Accessing Other HBase Tables in a MapReduce Job</title>
-      <para>Although the framework currently allows one HBase table as input to a MapReduce job,
-        other HBase tables can be accessed as lookup tables, etc., in a MapReduce job by creating
-        a Table instance in the setup method of the Mapper.
-        <programlisting language="java">public class MyMapper extends TableMapper&lt;Text, LongWritable&gt; {
-  private Table myOtherTable;
-
-  public void setup(Context context) {
-    // In here create a Connection to the cluster and save it or use the Connection
-    // from the existing table
-    myOtherTable = connection.getTable("myOtherTable");
-  }
-
-  public void map(ImmutableBytesWritable row, Result value, Context context) throws IOException, InterruptedException {
-	// process Result...
-	// use 'myOtherTable' for lookups
-  }
-
-  </programlisting>
-      </para>
-    </section>
-    <section
-      xml:id="mapreduce.specex">
-      <title>Speculative Execution</title>
-      <para>It is generally advisable to turn off speculative execution for MapReduce jobs that use
-        HBase as a source. This can either be done on a per-Job basis through properties, or on the
-        entire cluster. Especially for longer running jobs, speculative execution will create
-        duplicate map-tasks which will double-write your data to HBase; this is probably not what
-        you want. </para>
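-      <para>As a sketch, assuming the MR2 property names (MR1 used
-        <code>mapred.map.tasks.speculative.execution</code> and
-        <code>mapred.reduce.tasks.speculative.execution</code>), speculative execution can be turned
-        off for a single job on its configuration before submission:</para>
-      <programlisting language="java">
-Configuration config = HBaseConfiguration.create();
-// disable speculative execution for this job only
-config.setBoolean("mapreduce.map.speculative", false);
-config.setBoolean("mapreduce.reduce.speculative", false);
-Job job = new Job(config, "ExampleNoSpeculation");
-      </programlisting>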
-      <para>See <xref
-          linkend="spec.ex" /> for more information. </para>
-    </section>
-  
-</chapter>

