accumulo-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From mjw...@apache.org
Subject [03/19] accumulo-website git commit: Jekyll build from master:c9398c5
Date Tue, 20 Jun 2017 16:24:33 GMT
http://git-wip-us.apache.org/repos/asf/accumulo-website/blob/eab65f94/docs/unreleased/troubleshooting/advanced.html
----------------------------------------------------------------------
diff --git a/docs/unreleased/troubleshooting/advanced.html b/docs/unreleased/troubleshooting/advanced.html
new file mode 100644
index 0000000..65a9352
--- /dev/null
+++ b/docs/unreleased/troubleshooting/advanced.html
@@ -0,0 +1,745 @@
+<!DOCTYPE html>
+<html lang="en">
+<head>
+<!--
+    Licensed to the Apache Software Foundation (ASF) under one or more
+    contributor license agreements.  See the NOTICE file distributed with
+    this work for additional information regarding copyright ownership.
+    The ASF licenses this file to You under the Apache License, Version 2.0
+    (the "License"); you may not use this file except in compliance with
+    the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
+-->
+<meta charset="utf-8">
+<meta http-equiv="X-UA-Compatible" content="IE=edge">
+<meta name="viewport" content="width=device-width, initial-scale=1">
+<link href="https://maxcdn.bootstrapcdn.com/bootswatch/3.3.7/paper/bootstrap.min.css" rel="stylesheet" integrity="sha384-awusxf8AUojygHf2+joICySzB780jVvQaVCAt1clU3QsyAitLGul28Qxb2r1e5g+" crossorigin="anonymous">
+<link href="//netdna.bootstrapcdn.com/font-awesome/4.0.3/css/font-awesome.css" rel="stylesheet">
+<link rel="stylesheet" type="text/css" href="https://cdn.datatables.net/v/bs/jq-2.2.3/dt-1.10.12/datatables.min.css">
+<link href="/css/accumulo.css" rel="stylesheet" type="text/css">
+
+<title>Accumulo Documentation - Advanced Troubleshooting</title>
+
+<script src="https://ajax.googleapis.com/ajax/libs/jquery/2.2.4/jquery.min.js"></script>
+<script src="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.7/js/bootstrap.min.js" integrity="sha384-Tc5IQib027qvyjSMfHjOMaLkfuWVxZxUPnCJA7l2mCWNIpG9mGCD8wGNIcPD7Txa" crossorigin="anonymous"></script>
+<script type="text/javascript" src="https://cdn.datatables.net/v/bs/jq-2.2.3/dt-1.10.12/datatables.min.js"></script>
+<script>
+  // On DOM ready: reveal the hidden #non-canonical banner whenever the page is served from a host other than accumulo.apache.org
+  $(function() {
+    var host = window.location.host;
+    if (typeof host !== 'undefined' && host !== 'accumulo.apache.org') {
+      $('#non-canonical').show();
+    }
+  });
+
+  $(function() {
+    // On DOM ready: for every h2-h6 that has an id, append an <a class="header-link" href="#id"> holding a Font Awesome link icon; headings without an id are skipped
+    return $("h2, h3, h4, h5, h6").each(function(i, el) {
+      var $el, icon, id;
+      $el = $(el);
+      id = $el.attr('id');
+      icon = '<i class="fa fa-link"></i>';
+      if (id) {
+        return $el.append($("<a />").addClass("header-link").attr("href", "#" + id).html(icon));
+      }
+    });
+  });
+
+  // On DOM ready: set the width of the affixed sidebar (div[data-spy="affix"]) to the width of its parent element
+  $(function() {
+    var $affixElement = $('div[data-spy="affix"]');
+    $affixElement.width($affixElement.parent().width());
+  });
+
+  // Google Analytics: define the `ga` command-queue stub and inject analytics.js as an async script. NOTE(review): the guard below only issues create/pageview when ga.loaded === true; presumably analytics.js sets that only after it finishes loading, so the guard may always be false here - verify whether that is intended
+  (function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){
+  (i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new Date();a=s.createElement(o),
+  m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)
+  })(window,document,'script','//www.google-analytics.com/analytics.js','ga');
+
+  if (ga.hasOwnProperty('loaded') && ga.loaded === true) {
+    ga('create', 'UA-50934829-1', 'apache.org');
+    ga('send', 'pageview');
+  }
+</script>
+
+</head>
+<body style="padding-top: 100px">
+
+  <nav class="navbar navbar-default navbar-fixed-top">
+  <div class="container">
+    <div class="navbar-header">
+      <button type="button" class="navbar-toggle" data-toggle="collapse" data-target="#navbar-items">
+        <span class="sr-only">Toggle navigation</span>
+        <span class="icon-bar"></span>
+        <span class="icon-bar"></span>
+        <span class="icon-bar"></span>
+      </button>
+      <a href="/"><img id="nav-logo" alt="Apache Accumulo" class="img-responsive" src="/images/accumulo-logo.png" width="200"
+        /></a>
+    </div>
+    <div class="collapse navbar-collapse" id="navbar-items">
+      <ul class="nav navbar-nav">
+        <li class="nav-link"><a href="/downloads">Download</a></li>
+        <li class="dropdown">
+          <a class="dropdown-toggle" data-toggle="dropdown" href="#">Releases<span class="caret"></span></a>
+          <ul class="dropdown-menu">
+            <li><a href="/release/accumulo-1.8.1/">1.8.1 (Latest)</a></li>
+            <li><a href="/release/accumulo-1.7.3/">1.7.3</a></li>
+            <li><a href="/release/accumulo-1.6.6/">1.6.6</a></li>
+            <li><a href="/release/">Archive</a></li>
+          </ul>
+        </li>
+        <li class="dropdown">
+          <a class="dropdown-toggle" data-toggle="dropdown" href="#">Documentation<span class="caret"></span></a>
+          <ul class="dropdown-menu">
+            <li><a href="/1.8/accumulo_user_manual.html">User Manual (1.8)</a></li>
+            <li><a href="/1.8/apidocs">Javadocs (1.8)</a></li>
+            <li><a href="/1.8/examples">Examples (1.8)</a></li>
+            <li><a href="/features">Features</a></li>
+            <li><a href="/glossary">Glossary</a></li>
+            <li><a href="/external-docs">External Docs</a></li>
+            <li><a href="/docs-archive/">Archive</a></li>
+          </ul>
+        </li>
+        <li class="dropdown">
+          <a class="dropdown-toggle" data-toggle="dropdown" href="#">Community<span class="caret"></span></a>
+          <ul class="dropdown-menu">
+            <li><a href="/get_involved">Get Involved</a></li>
+            <li><a href="/mailing_list">Mailing Lists</a></li>
+            <li><a href="/people">People</a></li>
+            <li><a href="/related-projects">Related Projects</a></li>
+            <li><a href="/contributor/">Contributor Guide</a></li>
+          </ul>
+        </li>
+      </ul>
+      <ul class="nav navbar-nav navbar-right">
+        <li class="dropdown">
+          <a class="dropdown-toggle" data-toggle="dropdown" href="#">Apache Software Foundation<span class="caret"></span></a>
+          <ul class="dropdown-menu">
+            <li><a href="https://www.apache.org">Apache Homepage <i class="fa fa-external-link"></i></a></li>
+            <li><a href="https://www.apache.org/licenses/LICENSE-2.0">License <i class="fa fa-external-link"></i></a></li>
+            <li><a href="https://www.apache.org/foundation/sponsorship">Sponsorship <i class="fa fa-external-link"></i></a></li>
+            <li><a href="https://www.apache.org/security">Security <i class="fa fa-external-link"></i></a></li>
+            <li><a href="https://www.apache.org/foundation/thanks">Thanks <i class="fa fa-external-link"></i></a></li>
+            <li><a href="https://www.apache.org/foundation/policies/conduct">Code of Conduct <i class="fa fa-external-link"></i></a></li>
+          </ul>
+        </li>
+      </ul>
+    </div>
+  </div>
+</nav>
+
+  <div class="container">
+    <div class="row">
+      <div class="col-md-12">
+
+        <div id="non-canonical" style="display: none; background-color: #F0E68C; padding-left: 1em;">
+          Visit the official site at: <a href="https://accumulo.apache.org">https://accumulo.apache.org</a>
+        </div>
+        <div id="content">
+          
+          <div class="row">
+  <div class="col-md-3">
+    <div class="panel-group" id="accordion" role="tablist" aria-multiselectable="true" data-spy="affix">
+      <div class="panel panel-default">
+      
+      
+      
+        
+          
+        
+          
+        
+          
+            <div class="panel-heading" role="tab" id="headingOne">
+              <h4 class="panel-title">
+                <a role="button" data-toggle="collapse" data-parent="#accordion" href="#collapsegetting-started" aria-expanded="false" aria-controls="collapsegetting-started">
+                  Getting started
+                </a>
+              </h4>
+            </div>
+            <div id="collapsegetting-started" class="panel-collapse collapse" role="tabpanel" aria-labelledby="headingOne">
+              <div class="panel-body">
+                
+                
+                <div class="row doc-sidebar-link"><a href="/docs/unreleased/getting-started/design">Accumulo Design</a></div>
+                
+                <div class="row doc-sidebar-link"><a href="/docs/unreleased/getting-started/quick-install">Quick Installation</a></div>
+                
+                <div class="row doc-sidebar-link"><a href="/docs/unreleased/getting-started/clients">Accumulo Clients</a></div>
+                
+                <div class="row doc-sidebar-link"><a href="/docs/unreleased/getting-started/shell">Accumulo Shell</a></div>
+                
+                <div class="row doc-sidebar-link"><a href="/docs/unreleased/getting-started/table_design">Table Design</a></div>
+                
+                <div class="row doc-sidebar-link"><a href="/docs/unreleased/getting-started/table_configuration">Table Configuration</a></div>
+                
+              </div>
+            </div>
+          
+        
+          
+        
+          
+        
+      
+        
+          
+        
+          
+            <div class="panel-heading" role="tab" id="headingdevelopment">
+              <h4 class="panel-title">
+                <a role="button" data-toggle="collapse" data-parent="#accordion" href="#collapsedevelopment" aria-expanded="false" aria-controls="collapsedevelopment">
+                  Development
+                </a>
+              </h4>
+            </div>
+            <div id="collapsedevelopment" class="panel-collapse collapse" role="tabpanel" aria-labelledby="headingdevelopment">
+              <div class="panel-body">
+                
+                
+                <div class="row doc-sidebar-link"><a href="/docs/unreleased/development/iterators">Iterators</a></div>
+                
+                <div class="row doc-sidebar-link"><a href="/docs/unreleased/development/mapreduce">MapReduce</a></div>
+                
+                <div class="row doc-sidebar-link"><a href="/docs/unreleased/development/proxy">Proxy</a></div>
+                
+                <div class="row doc-sidebar-link"><a href="/docs/unreleased/development/development_tools">Development Tools</a></div>
+                
+                <div class="row doc-sidebar-link"><a href="/docs/unreleased/development/sampling">Sampling</a></div>
+                
+                <div class="row doc-sidebar-link"><a href="/docs/unreleased/development/summaries">Summary Statistics</a></div>
+                
+                <div class="row doc-sidebar-link"><a href="/docs/unreleased/development/security">Security</a></div>
+                
+                <div class="row doc-sidebar-link"><a href="/docs/unreleased/development/high_speed_ingest">High-Speed Ingest</a></div>
+                
+              </div>
+            </div>
+          
+        
+          
+        
+          
+        
+          
+        
+      
+        
+          
+            <div class="panel-heading" role="tab" id="headingadministration">
+              <h4 class="panel-title">
+                <a role="button" data-toggle="collapse" data-parent="#accordion" href="#collapseadministration" aria-expanded="false" aria-controls="collapseadministration">
+                  Administration
+                </a>
+              </h4>
+            </div>
+            <div id="collapseadministration" class="panel-collapse collapse" role="tabpanel" aria-labelledby="headingadministration">
+              <div class="panel-body">
+                
+                
+                <div class="row doc-sidebar-link"><a href="/docs/unreleased/administration/in-depth-install">In-depth Installation</a></div>
+                
+                <div class="row doc-sidebar-link"><a href="/docs/unreleased/administration/configuration-management">Configuration Management</a></div>
+                
+                <div class="row doc-sidebar-link"><a href="/docs/unreleased/administration/configuration-properties">Configuration Properties</a></div>
+                
+                <div class="row doc-sidebar-link"><a href="/docs/unreleased/administration/monitoring-metrics">Monitoring & Metrics</a></div>
+                
+                <div class="row doc-sidebar-link"><a href="/docs/unreleased/administration/tracing">Tracing</a></div>
+                
+                <div class="row doc-sidebar-link"><a href="/docs/unreleased/administration/fate">FATE</a></div>
+                
+                <div class="row doc-sidebar-link"><a href="/docs/unreleased/administration/multivolume">Multi-Volume Installations</a></div>
+                
+                <div class="row doc-sidebar-link"><a href="/docs/unreleased/administration/ssl">SSL</a></div>
+                
+                <div class="row doc-sidebar-link"><a href="/docs/unreleased/administration/kerberos">Kerberos</a></div>
+                
+                <div class="row doc-sidebar-link"><a href="/docs/unreleased/administration/replication">Replication</a></div>
+                
+              </div>
+            </div>
+          
+        
+          
+        
+          
+        
+          
+        
+          
+        
+      
+        
+          
+        
+          
+        
+          
+        
+          
+        
+          
+            <div class="panel-heading" role="tab" id="headingtroubleshooting">
+              <h4 class="panel-title">
+                <a role="button" data-toggle="collapse" data-parent="#accordion" href="#collapsetroubleshooting" aria-expanded="true" aria-controls="collapsetroubleshooting">
+                  Troubleshooting
+                </a>
+              </h4>
+            </div>
+            <div id="collapsetroubleshooting" class="panel-collapse collapse in" role="tabpanel" aria-labelledby="headingtroubleshooting">
+              <div class="panel-body">
+                
+                
+                <div class="row doc-sidebar-link"><a href="/docs/unreleased/troubleshooting/basic">Basic Troubleshooting</a></div>
+                
+                <div class="row doc-sidebar-link"><a href="/docs/unreleased/troubleshooting/advanced">Advanced Troubleshooting</a></div>
+                
+                <div class="row doc-sidebar-link"><a href="/docs/unreleased/troubleshooting/tools">Troubleshooting Tools</a></div>
+                
+                <div class="row doc-sidebar-link"><a href="/docs/unreleased/troubleshooting/system-metadata-tables">System Metadata Tables</a></div>
+                
+              </div>
+            </div>
+          
+        
+      
+      </div>
+    </div>
+  </div>
+  <div class="col-md-9">
+    
+    <p><a href="/docs/unreleased/">Accumulo unreleased docs</a> &nbsp;&gt;&gt;&nbsp; Troubleshooting &nbsp;&gt;&gt;&nbsp; Advanced Troubleshooting</p>
+    
+    
+
+    <div class="alert alert-danger" style="margin-bottom: 0px;" role="alert">This documentation is for a future release of Accumulo! <a href="/1.8/accumulo_user_manual.html">View documentation for the latest release</a>.</div>
+
+    <div class="row">
+      <div class="col-md-10"><h1>Advanced Troubleshooting</h1></div>
+      <div class="col-md-2"><a class="pull-right" style="margin-top: 25px;" href="https://github.com/apache/accumulo-website/edit/master/_docs-unreleased/troubleshooting/advanced.md" role="button"><i class="glyphicon glyphicon-pencil"></i> <small>Edit this page</small></a></div>
+    </div>  
+    
+    <h2 id="tablet-server-locks">Tablet server locks</h2>
+
+<p><strong>My tablet server lost its lock.  Why?</strong></p>
+
+<p>The primary reason a tablet server loses its lock is that it has been pushed into swap.</p>
+
+<p>A large java program (like the tablet server) may have a large portion
+of its memory image unused.  The operating system will favor pushing
+this allocated, but unused memory into swap so that the memory can be
+re-used as a disk buffer.  When the java virtual machine decides to
+access this memory, the OS will begin flushing disk buffers to return that
+memory to the VM.  This can cause the entire process to block long
+enough for the zookeeper lock to be lost.</p>
+
+<p>Configure your system to reduce the kernel parameter <em>swappiness</em> from the default (60) to zero.</p>
+
+<p><strong>My tablet server lost its lock, and I have already set swappiness to zero.  Why?</strong></p>
+
+<p>Be careful not to over-subscribe memory.  This can be easy to do if
+your accumulo processes run on the same nodes as hadoop’s map-reduce
+framework.  Remember to add up:</p>
+
+<ul>
+  <li>size of the JVM for the tablet server</li>
+  <li>size of the in-memory map, if using the native map implementation</li>
+  <li>size of the JVM for the data node</li>
+  <li>size of the JVM for the task tracker</li>
+  <li>size of the JVM times the maximum number of mappers and reducers</li>
+  <li>size of the kernel and any support processes</li>
+</ul>
+
+<p>If a 16G node can run 2 mappers and 2 reducers, and each can be 2G,
+then there is only 8G for the data node, tserver, task tracker and OS.</p>
+
+<p>Reduce the memory footprint of each component until it fits comfortably.</p>
+
+<p><strong>My tablet server lost its lock, swappiness is zero, and my node has lots of unused memory!</strong></p>
+
+<p>The JVM memory garbage collector may fall behind and cause a
+“stop-the-world” garbage collection. On a large memory virtual
+machine, this collection can take a long time.  This happens more
+frequently when the JVM is getting low on free memory.  Check the logs
+of the tablet server.  You will see lines like this:</p>
+
+<div class="highlighter-rouge"><pre class="highlight"><code>2013-06-20 13:43:20,607 [tabletserver.TabletServer] DEBUG: gc ParNew=0.00(+0.00) secs
+    ConcurrentMarkSweep=0.00(+0.00) secs freemem=1,868,325,952(+1,868,325,952) totalmem=2,040,135,680
+</code></pre>
+</div>
+
+<p>When <code class="highlighter-rouge">freemem</code> becomes small relative to the amount of memory
+needed, the JVM will spend more time finding free memory than
+performing work.  This can cause long delays in sending keep-alive
+messages to zookeeper.</p>
+
+<p>Ensure the tablet server JVM is not running low on memory.</p>
+
+<p><strong>I’m seeing errors in tablet server logs that include the words “MutationsRejectedException” and “# constraint violations: 1”. Moments after that the server died.</strong></p>
+
+<p>The error you are seeing is part of a failing tablet server scenario.
+This is a bit complicated, so name two of your tablet servers A and B.</p>
+
+<p>Tablet server A is hosting a tablet, let’s call it a-tablet.</p>
+
+<p>Tablet server B is hosting a metadata tablet, let’s call it m-tablet.</p>
+
+<p>m-tablet records the information about a-tablet, for example, the names of the files it is using to store data.</p>
+
+<p>When A ingests some data, it eventually flushes the updates from memory to a file.</p>
+
+<p>Tablet server A then writes this new information to m-tablet, on Tablet server B.</p>
+
+<p>Here’s a likely failure scenario:</p>
+
+<p>Tablet server A does not have enough memory for all the processes running on it.
+The operating system sees a large chunk of the tablet server being unused, and swaps it out to disk to make room for other processes.
+Tablet server A does a java memory garbage collection, which causes it to start using all the memory allocated to it.
+As the server starts pulling data from swap, it runs very slowly.
+It fails to send the keep-alive messages to zookeeper in a timely fashion, and it loses its zookeeper session.</p>
+
+<p>But, it’s running so slowly, that it takes a moment to realize it should no longer be hosting tablets.</p>
+
+<p>The thread that is flushing a-tablet memory attempts to update m-tablet with the new file information.</p>
+
+<p>Fortunately there’s a constraint on m-tablet.
+Mutations to the metadata table must contain a valid zookeeper session.
+This prevents tablet server A from making updates to m-tablet when it no longer has the right to host the tablet.</p>
+
+<p>The “MutationsRejectedException” error is from tablet server A making an update to tablet server B’s m-tablet.
+It’s getting a constraint violation: tablet server A has lost its zookeeper session, and will fail momentarily.</p>
+
+<p>Ensure that memory is not over-allocated.  Monitor swap usage, or turn swap off.</p>
+
+<p><strong>My accumulo client is getting a MutationsRejectedException. The monitor is displaying “No Such SessionID” errors.</strong></p>
+
+<p>When your client starts sending mutations to accumulo, it creates a session. Once the session is created,
+mutations are streamed to accumulo, without acknowledgement, against this session.  Once the client is done,
+it will close the session, and get an acknowledgement.</p>
+
+<p>If the client fails to communicate with accumulo, accumulo will release the session, assuming that the client has died.
+If the client then attempts to send more mutations against the session, you will see “No Such SessionID” errors on
+the server, and MutationsRejectedExceptions in the client.</p>
+
+<p>The client library should be either actively using the connection to the tablet servers,
+or closing the connection and sessions. If the session times out, something is causing your client
+to pause.</p>
+
+<p>The most frequent sources of these pauses are java garbage collection pauses
+due to the JVM running out of memory, or being swapped out to disk.</p>
+
+<p>Ensure your client has adequate memory and is not being swapped out to disk.</p>
+
+<h2 id="hdfs-failures">HDFS Failures</h2>
+
+<p><strong>I had a disastrous HDFS failure.  After bringing everything back up, several tablets refuse to go online.</strong></p>
+
+<p>Data written to tablets is written into memory before being written into indexed files.  In case the server
+is lost before the data is saved into an indexed file, all data stored in memory is first written into a
+write-ahead log (WAL).  When a tablet is re-assigned to a new tablet server, the write-ahead logs are read to
+recover any mutations that were in memory when the tablet was last hosted.</p>
+
+<p>If a write-ahead log cannot be read, then the tablet is not re-assigned.  All it takes is for one of
+the blocks in the write-ahead log to be missing.  This is unlikely unless multiple data nodes in HDFS have been
+lost.</p>
+
+<p>Get the WAL files online and healthy.  Restore any data nodes that may be down.</p>
+
+<p><strong>How do I find out which tablets are offline?</strong></p>
+
+<p>Use <code class="highlighter-rouge">accumulo admin checkTablets</code></p>
+
+<div class="highlighter-rouge"><pre class="highlight"><code>$ accumulo admin checkTablets
+</code></pre>
+</div>
+
+<p><strong>I lost three data nodes, and I’m missing blocks in a WAL.  I don’t care about data loss, how
+can I get those tablets online?</strong></p>
+
+<p>See the <a href="/docs/unreleased/troubleshooting/system-metadata-tables">system metadata table page</a> which shows a typical metadata table listing.
+The entries with a column family of <code class="highlighter-rouge">log</code> are references to the WAL for that tablet.
+If you know what WAL is bad, you can find all the references with a grep in the shell:</p>
+
+<div class="highlighter-rouge"><pre class="highlight"><code>shell&gt; grep 0cb7ce52-ac46-4bf7-ae1d-acdcfaa97995
+3&lt; log:127.0.0.1+9997/0cb7ce52-ac46-4bf7-ae1d-acdcfaa97995 []    127.0.0.1+9997/0cb7ce52-ac46-4bf7-ae1d-acdcfaa97995|6
+</code></pre>
+</div>
+
+<p>You can remove the WAL references in the metadata table.</p>
+
+<div class="highlighter-rouge"><pre class="highlight"><code>shell&gt; grant -u root Table.WRITE -t accumulo.metadata
+shell&gt; delete 3&lt; log 127.0.0.1+9997/0cb7ce52-ac46-4bf7-ae1d-acdcfaa97995
+</code></pre>
+</div>
+
+<p>Note: the colon (<code class="highlighter-rouge">:</code>) is omitted when specifying the <em>row cf cq</em> for the delete command.</p>
+
+<p>The master will automatically discover the tablet no longer has a bad WAL reference and will
+assign the tablet.  You will need to remove the reference from all the tablets to get them
+online.</p>
+
+<p><strong>The metadata (or root) table has references to a corrupt WAL.</strong></p>
+
+<p>This is a much more serious state, since losing updates to the metadata table will result
+in references to old files which may not exist, or lost references to new files, resulting
+in tablets that cannot be read, or large amounts of data loss.</p>
+
+<p>The best hope is to restore the WAL by fixing HDFS data nodes and bringing the data back online.
+If this is not possible, the best approach is to re-create the instance and bulk import all files from
+the old instance into new tables.</p>
+
+<p>A complete set of instructions for doing this is outside the scope of this guide,
+but the basic approach is:</p>
+
+<ul>
+  <li>Use <code class="highlighter-rouge">tables -l</code> in the shell to discover the table name to table id mapping</li>
+  <li>Stop all accumulo processes on all nodes</li>
+  <li>Move the accumulo directory in HDFS out of the way:
+     <code class="highlighter-rouge">$ hadoop fs -mv /accumulo /corrupt</code></li>
+  <li>Re-initialize accumulo</li>
+  <li>Recreate tables, users and permissions</li>
+  <li>Import the directories under <code class="highlighter-rouge">/corrupt/tables/&lt;id&gt;</code> into the new instance</li>
+</ul>
+
+<p><strong>One or more HDFS Files under /accumulo/tables are corrupt</strong></p>
+
+<p>Accumulo maintains multiple references into the tablet files in the metadata
+tables and within the tablet server hosting the file; this makes it difficult to
+reliably just remove those references.</p>
+
+<p>The directory structure in HDFS for tables will follow the general structure:</p>
+
+<div class="highlighter-rouge"><pre class="highlight"><code>/accumulo
+/accumulo/tables/
+/accumulo/tables/!0
+/accumulo/tables/!0/default_tablet/A000001.rf
+/accumulo/tables/!0/t-00001/A000002.rf
+/accumulo/tables/1
+/accumulo/tables/1/default_tablet/A000003.rf
+/accumulo/tables/1/t-00001/A000004.rf
+/accumulo/tables/1/t-00001/A000005.rf
+/accumulo/tables/2/default_tablet/A000006.rf
+/accumulo/tables/2/t-00001/A000007.rf
+</code></pre>
+</div>
+
+<p>If files under <code class="highlighter-rouge">/accumulo/tables</code> are corrupt, the best course of action is to
+recover those files in HDFS (see the section on HDFS). Once these recovery efforts
+have been exhausted, the next step depends on where the missing file(s) are
+located. Different actions are required when the bad files are in Accumulo data
+table files or if they are metadata table files.</p>
+
+<p><em>Data File Corruption</em></p>
+
+<p>When an Accumulo data file is corrupt, the most reliable way to restore Accumulo
+operations is to replace the missing file with an “empty” file so that
+references to the file in the METADATA table and within the tablet server
+hosting the file can be resolved by Accumulo. An empty file can be created using
+the CreateEmpty utility:</p>
+
+<div class="highlighter-rouge"><pre class="highlight"><code>$ accumulo org.apache.accumulo.core.file.rfile.CreateEmpty /path/to/empty/file/empty.rf
+</code></pre>
+</div>
+
+<p>The process is to delete the corrupt file and then move the empty file into its
+place. (The generated empty file can be copied and used multiple times if necessary and does not need
+to be regenerated each time.)</p>
+
+<div class="highlighter-rouge"><pre class="highlight"><code>$ hadoop fs -rm /accumulo/tables/corrupt/file/thename.rf; \
+hadoop fs -mv /path/to/empty/file/empty.rf /accumulo/tables/corrupt/file/thename.rf
+</code></pre>
+</div>
+
+<p><em>Metadata File Corruption</em></p>
+
+<p>If the corrupt files are metadata files, read the <a href="/docs/unreleased/troubleshooting/system-metadata-tables">system metadata tables</a>
+(under the path <code class="highlighter-rouge">/accumulo/tables/!0</code>). Then, you will need to rebuild
+the metadata table by initializing a new instance of Accumulo and then importing
+all of the existing data into the new instance.  This is the same procedure as
+recovering from a zookeeper failure (see next section), except that
+you will have the benefit of having the existing user and table authorizations
+that are maintained in zookeeper.</p>
+
+<p>You can use the DumpZookeeper utility to save this information for reference
+before creating the new instance.  You will not be able to use RestoreZookeeper
+because the table names and references are likely to be different between the
+original and the new instances, but it can serve as a reference.</p>
+
+<p>If the files cannot be recovered, replace corrupt data files with empty
+rfiles to allow references in the metadata table and in the tablet servers to be
+resolved. Rebuild the metadata table if the corrupt files are metadata files.</p>
+
+<p><em>Write-Ahead Log (WAL) File Corruption</em></p>
+
+<p>In certain versions of Accumulo, a corrupt WAL file (caused by HDFS corruption
+or a bug in Accumulo that created the file) can block the successful recovery
+of one to many Tablets. Accumulo can be stuck in a loop trying to recover the
+WAL file, never being able to succeed.</p>
+
+<p>In the cases where the WAL file’s original contents are unrecoverable or some degree
+of data loss is acceptable (beware if the WAL file contains updates to the Accumulo
+metadata table!), the following process can be followed to create a valid, empty
+WAL file. Run the following commands as the Accumulo unix user (to ensure
+the proper file permissions in HDFS):</p>
+
+<div class="highlighter-rouge"><pre class="highlight"><code>$ echo -n -e '--- Log File Header (v2) ---\x00\x00\x00\x00' &gt; empty.wal
+</code></pre>
+</div>
+
+<p>The above creates a file with the text <code class="highlighter-rouge">--- Log File Header (v2) ---</code> followed by
+four zero bytes. You should verify the contents of the file with a hexdump tool.</p>
+
+<p>Then, place this empty WAL in HDFS and then replace the corrupt WAL file in HDFS
+with the empty WAL.</p>
+
+<div class="highlighter-rouge"><pre class="highlight"><code>$ hdfs dfs -moveFromLocal empty.wal /user/accumulo/empty.wal
+$ hdfs dfs -mv /user/accumulo/empty.wal /accumulo/wal/tserver-4.example.com+10011/26abec5b-63e7-40dd-9fa1-b8ad2436606e
+</code></pre>
+</div>
+
+<p>After the corrupt WAL file has been replaced, the system should automatically recover.
+It may be necessary to restart the Accumulo Master process as an exponential
+backoff policy is used which could lead to a long wait before Accumulo will
+try to re-load the WAL file.</p>
+
+<h2 id="zookeeper-failures">Zookeeper Failures</h2>
+
+<p><strong>I lost my ZooKeeper quorum (hardware failure), but HDFS is still intact. How can I recover my Accumulo instance?</strong></p>
+
+<p>ZooKeeper, in addition to its lock-service capabilities, also serves to bootstrap an Accumulo
+instance from some location in HDFS. It contains the pointers to the root tablet in HDFS which
+is then used to load the Accumulo metadata tablets, which then loads all user tables. ZooKeeper
+also stores all namespace and table configuration, the user database, the mapping of table IDs to
+table names, and more across Accumulo restarts.</p>
+
+<p>Presently, the only way to recover such an instance is to initialize a new instance and import all
+of the old data into the new instance. The easiest way to tackle this problem is to first recreate
+the mapping of table ID to table name and then recreate each of those tables in the new instance.
+Set any necessary configuration on the new tables and add some split points to the tables to close
+the gap between how many splits the old table had and no splits.</p>
+
+<p>The directory structure in HDFS for tables will follow the general structure:</p>
+
+<div class="highlighter-rouge"><pre class="highlight"><code>/accumulo
+/accumulo/tables/
+/accumulo/tables/1
+/accumulo/tables/1/default_tablet/A000001.rf
+/accumulo/tables/1/t-00001/A000002.rf
+/accumulo/tables/1/t-00001/A000003.rf
+/accumulo/tables/2/default_tablet/A000004.rf
+/accumulo/tables/2/t-00001/A000005.rf
+</code></pre>
+</div>
+
+<p>For each table, make a new directory that you can move (or copy if you have the HDFS space to do so)
+all of the rfiles for a given table into. For example, to process the table with an ID of <code class="highlighter-rouge">1</code>, make a new directory,
+say <code class="highlighter-rouge">/new-table-1</code> and then copy all files from <code class="highlighter-rouge">/accumulo/tables/1/*/*.rf</code> into that directory. Additionally,
+make a directory, <code class="highlighter-rouge">/new-table-1-failures</code>, for any failures during the import process. Then, issue the import
+command using the Accumulo shell into the new table, telling Accumulo to not re-set the timestamp:</p>
+
+<div class="highlighter-rouge"><pre class="highlight"><code>user@instance new_table&gt; importdirectory /new-table-1 /new-table-1-failures false
+</code></pre>
+</div>
+
+<p>Any RFiles which failed to load will be placed in <code class="highlighter-rouge">/new-table-1-failures</code>. RFiles that were successfully
+imported will no longer exist in <code class="highlighter-rouge">/new-table-1</code>. For failures, move them back to the import directory and retry
+the <code class="highlighter-rouge">importdirectory</code> command.</p>
+
+<p>It is <em>extremely</em> important to note that this approach may introduce stale data back into
+the tables. For a few reasons, RFiles may exist in the table directory which are candidates for deletion but have
+not yet been deleted. Additionally, deleted data which was not compacted away, but still exists in write-ahead logs if
+the original instance was somehow recoverable, will be re-introduced in the new instance. Table splits and merges
+(which also include the deleteRows API call on TableOperations) are also vulnerable to this problem. This process should
+<em>not</em> be used if these are unacceptable risks. It is possible to try to re-create a view of the <code class="highlighter-rouge">accumulo.metadata</code>
+table to prune out files that are candidates for deletion, but this is a difficult task that also may not be entirely accurate.</p>
+
+<p>Likewise, it is also possible that data loss may occur from write-ahead log (WAL) files which existed on the old table but
+were not minor-compacted into an RFile. Again, it may be possible to reconstruct the state of these WAL files to
+replay data not yet in an RFile; however, this is a difficult task and is not implemented in any automated fashion.</p>
+
+<p>The <code class="highlighter-rouge">importdirectory</code> shell command can be used to import RFiles from the old instance into a newly created instance,
+but extreme care should go into the decision to do this as it may result in reintroduction of stale data or the
+omission of new data.</p>
+
+<h2 id="upgrade-issues">Upgrade Issues</h2>
+
+<p><strong>I upgraded from 1.4 to 1.5 to 1.6 but still have some WAL files on local disk. Do I have any way to recover them?</strong></p>
+
+<p>Yes, you can recover them by running the LocalWALRecovery utility (not available in 1.8 and later) on each node that needs recovery performed. The utility
+will default to using the directory specified by <code class="highlighter-rouge">logger.dir.walog</code> in your configuration, or can be
+overridden by using the <code class="highlighter-rouge">--local-wal-directories</code> option on the tool. It can be invoked as follows:</p>
+
+<div class="highlighter-rouge"><pre class="highlight"><code>accumulo org.apache.accumulo.tserver.log.LocalWALRecovery
+</code></pre>
+</div>
+
+<h2 id="file-naming-conventions">File Naming Conventions</h2>
+
+<p><strong>Why are files named like they are? Why do some start with <code class="highlighter-rouge">C</code> and others with <code class="highlighter-rouge">F</code>?</strong></p>
+
+<p>The file names give you a basic idea for the source of the file.</p>
+
+<p>The base of the filename is a base-36 unique number. All filenames in accumulo are coordinated
+with a counter in zookeeper, so they are always unique, which is useful for debugging.</p>
+
+<p>The leading letter gives you an idea of how the file was created:</p>
+
+<ul>
+  <li><code class="highlighter-rouge">F</code> - Flush: entries in memory were written to a file (Minor Compaction)</li>
+  <li><code class="highlighter-rouge">M</code> - Merging compaction: entries in memory were combined with the smallest file to create one new file</li>
+  <li><code class="highlighter-rouge">C</code> - Several files, but not all files, were combined to produce this file (Major Compaction)</li>
+  <li><code class="highlighter-rouge">A</code> - All files were compacted, delete entries were dropped</li>
+  <li><code class="highlighter-rouge">I</code> - Bulk import, complete, sorted index files. Always in a directory starting with <code class="highlighter-rouge">b-</code></li>
+</ul>
+
+<p>This simple file naming convention allows you to see the basic structure of the files from just
+their filenames, and reason about what should be happening to them next, just
+by scanning their entries in the metadata tables.</p>
+
+<p>For example, if you see multiple files with <code class="highlighter-rouge">M</code> prefixes, the tablet is, or was, up against its
+maximum file limit, so it began merging memory updates with files to keep the file count reasonable.  This
+slows down ingest performance, so knowing there are many files like this tells you that the system
+is struggling to keep up with ingest vs the compaction strategy which reduces the number of files.</p>
+
+<h2 id="hdfs-decommissioning-issues">HDFS Decommissioning Issues</h2>
+
+<p><strong>My Hadoop DataNode is hung for hours trying to decommission.</strong></p>
+
+<p>Write Ahead Logs stay open until they hit the size threshold, which could be many hours or days in some cases. These open files will prevent a DN from finishing its decommissioning process (HDFS-3599) in some versions of Hadoop 2. If you stop the DN, then the WALog file will not be closed and you could lose data. To work around this issue, we now close WALogs on a time period specified by the property <code class="highlighter-rouge">tserver.walog.max.age</code> with a default period of 24 hours.</p>
+
+
+
+    <div class="row" style="margin-top: 20px;">
+      <div class="col-md-10"><strong>Find documentation for all releases in the <a href="/docs-archive">archive</a></strong></div>
+      <div class="col-md-2"><a class="pull-right" href="https://github.com/apache/accumulo-website/edit/master/_docs-unreleased/troubleshooting/advanced.md" role="button"><i class="glyphicon glyphicon-pencil"></i> <small>Edit this page</small></a></div>
+    </div>  
+  </div>
+</div>
+
+        </div>
+
+        
+<footer>
+
+  <p><a href="https://www.apache.org/foundation/contributing"><img src="https://www.apache.org/images/SupportApache-small.png" alt="Support the ASF" id="asf-logo" height="100" /></a></p>
+
+  <p>Copyright © 2011-2017 The Apache Software Foundation. Licensed under the <a href="https://www.apache.org/licenses/LICENSE-2.0">Apache License, Version 2.0</a>.</p>
+
+</footer>
+
+
+      </div>
+    </div>
+  </div>
+</body>
+</html>

http://git-wip-us.apache.org/repos/asf/accumulo-website/blob/eab65f94/docs/unreleased/troubleshooting/basic.html
----------------------------------------------------------------------
diff --git a/docs/unreleased/troubleshooting/basic.html b/docs/unreleased/troubleshooting/basic.html
new file mode 100644
index 0000000..43c508b
--- /dev/null
+++ b/docs/unreleased/troubleshooting/basic.html
@@ -0,0 +1,593 @@
+<!DOCTYPE html>
+<html lang="en">
+<head>
+<!--
+    Licensed to the Apache Software Foundation (ASF) under one or more
+    contributor license agreements.  See the NOTICE file distributed with
+    this work for additional information regarding copyright ownership.
+    The ASF licenses this file to You under the Apache License, Version 2.0
+    (the "License"); you may not use this file except in compliance with
+    the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
+-->
+<meta charset="utf-8">
+<meta http-equiv="X-UA-Compatible" content="IE=edge">
+<meta name="viewport" content="width=device-width, initial-scale=1">
+<link href="https://maxcdn.bootstrapcdn.com/bootswatch/3.3.7/paper/bootstrap.min.css" rel="stylesheet" integrity="sha384-awusxf8AUojygHf2+joICySzB780jVvQaVCAt1clU3QsyAitLGul28Qxb2r1e5g+" crossorigin="anonymous">
+<link href="//netdna.bootstrapcdn.com/font-awesome/4.0.3/css/font-awesome.css" rel="stylesheet">
+<link rel="stylesheet" type="text/css" href="https://cdn.datatables.net/v/bs/jq-2.2.3/dt-1.10.12/datatables.min.css">
+<link href="/css/accumulo.css" rel="stylesheet" type="text/css">
+
+<title>Accumulo Documentation - Basic Troubleshooting</title>
+
+<script src="https://ajax.googleapis.com/ajax/libs/jquery/2.2.4/jquery.min.js"></script>
+<script src="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.7/js/bootstrap.min.js" integrity="sha384-Tc5IQib027qvyjSMfHjOMaLkfuWVxZxUPnCJA7l2mCWNIpG9mGCD8wGNIcPD7Txa" crossorigin="anonymous"></script>
+<script type="text/javascript" src="https://cdn.datatables.net/v/bs/jq-2.2.3/dt-1.10.12/datatables.min.js"></script>
+<script>
+  // show location of canonical site if not currently on the canonical site
+  $(function() {
+    var host = window.location.host;
+    if (typeof host !== 'undefined' && host !== 'accumulo.apache.org') {
+      $('#non-canonical').show();
+    }
+  });
+
+  $(function() {
+    // decorate section headers with anchors
+    return $("h2, h3, h4, h5, h6").each(function(i, el) {
+      var $el, icon, id;
+      $el = $(el);
+      id = $el.attr('id');
+      icon = '<i class="fa fa-link"></i>';
+      if (id) {
+        return $el.append($("<a />").addClass("header-link").attr("href", "#" + id).html(icon));
+      }
+    });
+  });
+
+  // fix sidebar width in documentation
+  $(function() {
+    var $affixElement = $('div[data-spy="affix"]');
+    $affixElement.width($affixElement.parent().width());
+  });
+
+  // configure Google Analytics
+  (function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){
+  (i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new Date();a=s.createElement(o),
+  m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)
+  })(window,document,'script','//www.google-analytics.com/analytics.js','ga');
+
+  if (ga.hasOwnProperty('loaded') && ga.loaded === true) {
+    ga('create', 'UA-50934829-1', 'apache.org');
+    ga('send', 'pageview');
+  }
+</script>
+
+</head>
+<body style="padding-top: 100px">
+
+  <nav class="navbar navbar-default navbar-fixed-top">
+  <div class="container">
+    <div class="navbar-header">
+      <button type="button" class="navbar-toggle" data-toggle="collapse" data-target="#navbar-items">
+        <span class="sr-only">Toggle navigation</span>
+        <span class="icon-bar"></span>
+        <span class="icon-bar"></span>
+        <span class="icon-bar"></span>
+      </button>
+      <a href="/"><img id="nav-logo" alt="Apache Accumulo" class="img-responsive" src="/images/accumulo-logo.png" width="200"
+        /></a>
+    </div>
+    <div class="collapse navbar-collapse" id="navbar-items">
+      <ul class="nav navbar-nav">
+        <li class="nav-link"><a href="/downloads">Download</a></li>
+        <li class="dropdown">
+          <a class="dropdown-toggle" data-toggle="dropdown" href="#">Releases<span class="caret"></span></a>
+          <ul class="dropdown-menu">
+            <li><a href="/release/accumulo-1.8.1/">1.8.1 (Latest)</a></li>
+            <li><a href="/release/accumulo-1.7.3/">1.7.3</a></li>
+            <li><a href="/release/accumulo-1.6.6/">1.6.6</a></li>
+            <li><a href="/release/">Archive</a></li>
+          </ul>
+        </li>
+        <li class="dropdown">
+          <a class="dropdown-toggle" data-toggle="dropdown" href="#">Documentation<span class="caret"></span></a>
+          <ul class="dropdown-menu">
+            <li><a href="/1.8/accumulo_user_manual.html">User Manual (1.8)</a></li>
+            <li><a href="/1.8/apidocs">Javadocs (1.8)</a></li>
+            <li><a href="/1.8/examples">Examples (1.8)</a></li>
+            <li><a href="/features">Features</a></li>
+            <li><a href="/glossary">Glossary</a></li>
+            <li><a href="/external-docs">External Docs</a></li>
+            <li><a href="/docs-archive/">Archive</a></li>
+          </ul>
+        </li>
+        <li class="dropdown">
+          <a class="dropdown-toggle" data-toggle="dropdown" href="#">Community<span class="caret"></span></a>
+          <ul class="dropdown-menu">
+            <li><a href="/get_involved">Get Involved</a></li>
+            <li><a href="/mailing_list">Mailing Lists</a></li>
+            <li><a href="/people">People</a></li>
+            <li><a href="/related-projects">Related Projects</a></li>
+            <li><a href="/contributor/">Contributor Guide</a></li>
+          </ul>
+        </li>
+      </ul>
+      <ul class="nav navbar-nav navbar-right">
+        <li class="dropdown">
+          <a class="dropdown-toggle" data-toggle="dropdown" href="#">Apache Software Foundation<span class="caret"></span></a>
+          <ul class="dropdown-menu">
+            <li><a href="https://www.apache.org">Apache Homepage <i class="fa fa-external-link"></i></a></li>
+            <li><a href="https://www.apache.org/licenses/LICENSE-2.0">License <i class="fa fa-external-link"></i></a></li>
+            <li><a href="https://www.apache.org/foundation/sponsorship">Sponsorship <i class="fa fa-external-link"></i></a></li>
+            <li><a href="https://www.apache.org/security">Security <i class="fa fa-external-link"></i></a></li>
+            <li><a href="https://www.apache.org/foundation/thanks">Thanks <i class="fa fa-external-link"></i></a></li>
+            <li><a href="https://www.apache.org/foundation/policies/conduct">Code of Conduct <i class="fa fa-external-link"></i></a></li>
+          </ul>
+        </li>
+      </ul>
+    </div>
+  </div>
+</nav>
+
+  <div class="container">
+    <div class="row">
+      <div class="col-md-12">
+
+        <div id="non-canonical" style="display: none; background-color: #F0E68C; padding-left: 1em;">
+          Visit the official site at: <a href="https://accumulo.apache.org">https://accumulo.apache.org</a>
+        </div>
+        <div id="content">
+          
+          <div class="row">
+  <div class="col-md-3">
+    <div class="panel-group" id="accordion" role="tablist" aria-multiselectable="true" data-spy="affix">
+      <div class="panel panel-default">
+      
+      
+      
+        
+          
+        
+          
+        
+          
+            <div class="panel-heading" role="tab" id="headingOne">
+              <h4 class="panel-title">
+                <a role="button" data-toggle="collapse" data-parent="#accordion" href="#collapsegetting-started" aria-expanded="false" aria-controls="collapsegetting-started">
+                  Getting started
+                </a>
+              </h4>
+            </div>
+            <div id="collapsegetting-started" class="panel-collapse collapse" role="tabpanel" aria-labelledby="headingOne">
+              <div class="panel-body">
+                
+                
+                <div class="row doc-sidebar-link"><a href="/docs/unreleased/getting-started/design">Accumulo Design</a></div>
+                
+                <div class="row doc-sidebar-link"><a href="/docs/unreleased/getting-started/quick-install">Quick Installation</a></div>
+                
+                <div class="row doc-sidebar-link"><a href="/docs/unreleased/getting-started/clients">Accumulo Clients</a></div>
+                
+                <div class="row doc-sidebar-link"><a href="/docs/unreleased/getting-started/shell">Accumulo Shell</a></div>
+                
+                <div class="row doc-sidebar-link"><a href="/docs/unreleased/getting-started/table_design">Table Design</a></div>
+                
+                <div class="row doc-sidebar-link"><a href="/docs/unreleased/getting-started/table_configuration">Table Configuration</a></div>
+                
+              </div>
+            </div>
+          
+        
+          
+        
+          
+        
+      
+        
+          
+        
+          
+            <div class="panel-heading" role="tab" id="headingDevelopment">
+              <h4 class="panel-title">
+                <a role="button" data-toggle="collapse" data-parent="#accordion" href="#collapsedevelopment" aria-expanded="false" aria-controls="collapsedevelopment">
+                  Development
+                </a>
+              </h4>
+            </div>
+            <div id="collapsedevelopment" class="panel-collapse collapse" role="tabpanel" aria-labelledby="headingDevelopment">
+              <div class="panel-body">
+                
+                
+                <div class="row doc-sidebar-link"><a href="/docs/unreleased/development/iterators">Iterators</a></div>
+                
+                <div class="row doc-sidebar-link"><a href="/docs/unreleased/development/mapreduce">MapReduce</a></div>
+                
+                <div class="row doc-sidebar-link"><a href="/docs/unreleased/development/proxy">Proxy</a></div>
+                
+                <div class="row doc-sidebar-link"><a href="/docs/unreleased/development/development_tools">Development Tools</a></div>
+                
+                <div class="row doc-sidebar-link"><a href="/docs/unreleased/development/sampling">Sampling</a></div>
+                
+                <div class="row doc-sidebar-link"><a href="/docs/unreleased/development/summaries">Summary Statistics</a></div>
+                
+                <div class="row doc-sidebar-link"><a href="/docs/unreleased/development/security">Security</a></div>
+                
+                <div class="row doc-sidebar-link"><a href="/docs/unreleased/development/high_speed_ingest">High-Speed Ingest</a></div>
+                
+              </div>
+            </div>
+          
+        
+          
+        
+          
+        
+          
+        
+      
+        
+          
+            <div class="panel-heading" role="tab" id="headingAdministration">
+              <h4 class="panel-title">
+                <a role="button" data-toggle="collapse" data-parent="#accordion" href="#collapseadministration" aria-expanded="false" aria-controls="collapseadministration">
+                  Administration
+                </a>
+              </h4>
+            </div>
+            <div id="collapseadministration" class="panel-collapse collapse" role="tabpanel" aria-labelledby="headingAdministration">
+              <div class="panel-body">
+                
+                
+                <div class="row doc-sidebar-link"><a href="/docs/unreleased/administration/in-depth-install">In-depth Installation</a></div>
+                
+                <div class="row doc-sidebar-link"><a href="/docs/unreleased/administration/configuration-management">Configuration Management</a></div>
+                
+                <div class="row doc-sidebar-link"><a href="/docs/unreleased/administration/configuration-properties">Configuration Properties</a></div>
+                
+                <div class="row doc-sidebar-link"><a href="/docs/unreleased/administration/monitoring-metrics">Monitoring &amp; Metrics</a></div>
+                
+                <div class="row doc-sidebar-link"><a href="/docs/unreleased/administration/tracing">Tracing</a></div>
+                
+                <div class="row doc-sidebar-link"><a href="/docs/unreleased/administration/fate">FATE</a></div>
+                
+                <div class="row doc-sidebar-link"><a href="/docs/unreleased/administration/multivolume">Multi-Volume Installations</a></div>
+                
+                <div class="row doc-sidebar-link"><a href="/docs/unreleased/administration/ssl">SSL</a></div>
+                
+                <div class="row doc-sidebar-link"><a href="/docs/unreleased/administration/kerberos">Kerberos</a></div>
+                
+                <div class="row doc-sidebar-link"><a href="/docs/unreleased/administration/replication">Replication</a></div>
+                
+              </div>
+            </div>
+          
+        
+          
+        
+          
+        
+          
+        
+          
+        
+      
+        
+          
+        
+          
+        
+          
+        
+          
+        
+          
+            <div class="panel-heading" role="tab" id="headingTroubleshooting">
+              <h4 class="panel-title">
+                <a role="button" data-toggle="collapse" data-parent="#accordion" href="#collapsetroubleshooting" aria-expanded="true" aria-controls="collapsetroubleshooting">
+                  Troubleshooting
+                </a>
+              </h4>
+            </div>
+            <div id="collapsetroubleshooting" class="panel-collapse collapse in" role="tabpanel" aria-labelledby="headingTroubleshooting">
+              <div class="panel-body">
+                
+                
+                <div class="row doc-sidebar-link"><a href="/docs/unreleased/troubleshooting/basic">Basic Troubleshooting</a></div>
+                
+                <div class="row doc-sidebar-link"><a href="/docs/unreleased/troubleshooting/advanced">Advanced Troubleshooting</a></div>
+                
+                <div class="row doc-sidebar-link"><a href="/docs/unreleased/troubleshooting/tools">Troubleshooting Tools</a></div>
+                
+                <div class="row doc-sidebar-link"><a href="/docs/unreleased/troubleshooting/system-metadata-tables">System Metadata Tables</a></div>
+                
+              </div>
+            </div>
+          
+        
+      
+      </div>
+    </div>
+  </div>
+  <div class="col-md-9">
+    
+    <p><a href="/docs/unreleased/">Accumulo unreleased docs</a> &nbsp;&gt;&gt;&nbsp; Troubleshooting &nbsp;&gt;&gt;&nbsp; Basic Troubleshooting</p>
+    
+    
+
+    <div class="alert alert-danger" style="margin-bottom: 0px;" role="alert">This documentation is for a future release of Accumulo! <a href="/1.8/accumulo_user_manual.html">View documentation for the latest release</a>.</div>
+
+    <div class="row">
+      <div class="col-md-10"><h1>Basic Troubleshooting</h1></div>
+      <div class="col-md-2"><a class="pull-right" style="margin-top: 25px;" href="https://github.com/apache/accumulo-website/edit/master/_docs-unreleased/troubleshooting/basic.md" role="button"><i class="glyphicon glyphicon-pencil"></i> <small>Edit this page</small></a></div>
+    </div>  
+    
+    <h2 id="general">General</h2>
+
+<p><strong>The tablet server does not seem to be running!? What happened?</strong></p>
+
+<p>Accumulo is a distributed system.  It is supposed to run on remote
+equipment, across hundreds of computers.  Each program that runs on
+these remote computers writes down events as they occur, into a local
+file. By default, this is defined in <code class="highlighter-rouge">conf/accumulo-env.sh</code> as <code class="highlighter-rouge">ACCUMULO_LOG_DIR</code>.
+Look in the <code class="highlighter-rouge">$ACCUMULO_LOG_DIR/tserver*.log</code> file.  Specifically, check the end of the file.</p>
+
+<p><strong>The tablet server did not start and the debug log does not exist!  What happened?</strong></p>
+
+<p>When the individual programs are started, the stdout and stderr output
+of these programs are stored in <code class="highlighter-rouge">.out</code> and <code class="highlighter-rouge">.err</code> files in
+<code class="highlighter-rouge">$ACCUMULO_LOG_DIR</code>.  Often, when there are missing configuration
+options, files or permissions, messages will be left in these files.
+Probably a start-up problem.  Look in <code class="highlighter-rouge">$ACCUMULO_LOG_DIR/tserver*.err</code></p>
+
+<p><strong>Accumulo is not working, what’s wrong?</strong></p>
+
+<p>There’s a small web server that collects information about all the
+components that make up a running Accumulo instance. It will highlight
+unusual or unexpected conditions.</p>
+
+<p>Point your browser to the monitor (typically the master host, on port 9995).  Is anything red or yellow?</p>
+
+<p><strong>My browser is reporting connection refused, and I cannot get to the monitor</strong></p>
+
+<p>The monitor program’s output is also written to .err and .out files in
+the <code class="highlighter-rouge">$ACCUMULO_LOG_DIR</code>. Look for problems in this file if the
+<code class="highlighter-rouge">$ACCUMULO_LOG_DIR/monitor*.log</code> file does not exist.</p>
+
+<p>The monitor program is probably not running.  Check the log files for errors.</p>
+
+<p><strong>My browser hangs trying to talk to the monitor.</strong></p>
+
+<p>Your browser needs to be able to reach the monitor program.  Often
+large clusters are firewalled, or use a VPN for internal
+communications. You can use SSH to proxy your browser to the cluster,
+or consult with your system administrator to gain access to the server
+from your browser.</p>
+
+<p>It is sometimes helpful to use a text-only browser to sanity-check the
+monitor while on the machine running the monitor:</p>
+
+<div class="highlighter-rouge"><pre class="highlight"><code>$ links http://localhost:9995
+</code></pre>
+</div>
+
+<p>Verify that you are not firewalled from the monitor if it is running on a remote host.</p>
+
+<p><strong>The monitor responds, but there are no numbers for tservers and tables.  The summary page says the master is down.</strong></p>
+
+<p>The monitor program gathers all the details about the master and the
+tablet servers through the master. It will be mostly blank if the
+master is down. Check for a running master.</p>
+
+<h2 id="accumulo-processes">Accumulo Processes</h2>
+
+<p><strong>My tablet server crashed!  The logs say that it lost its zookeeper lock.</strong></p>
+
+<p>Tablet servers reserve a lock in zookeeper to maintain their ownership
+over the tablets that have been assigned to them.  Part of their
+responsibility for keeping the lock is to send zookeeper a keep-alive
+message periodically.  If the tablet server fails to send a message in
+a timely fashion, zookeeper will remove the lock and notify the tablet
+server.  If the tablet server does not receive a message from
+zookeeper, it will assume its lock has been lost, too.  If a tablet
+server loses its lock, it kills itself: everything assumes it is dead
+already.</p>
+
+<p>Investigate why the tablet server did not send a timely message to
+zookeeper.</p>
+
+<p><strong>I need to decommission a node.  How do I stop the tablet server on it?</strong></p>
+
+<p>Use the admin command:</p>
+
+<div class="highlighter-rouge"><pre class="highlight"><code>$ accumulo admin stop hostname:9997
+2013-07-16 13:15:38,403 [util.Admin] INFO : Stopping server 12.34.56.78:9997
+</code></pre>
+</div>
+
+<p><strong>I cannot login to a tablet server host, and the tablet server will not shut down.  How can I kill the server?</strong></p>
+
+<p>Sometimes you can kill a “stuck” tablet server by deleting its lock in zookeeper:</p>
+
+<div class="highlighter-rouge"><pre class="highlight"><code>$ accumulo org.apache.accumulo.server.util.TabletServerLocks -list
+                  127.0.0.1:9997 TSERV_CLIENT=127.0.0.1:9997
+$ accumulo org.apache.accumulo.server.util.TabletServerLocks -delete 127.0.0.1:9997
+$ accumulo org.apache.accumulo.server.util.TabletServerLocks -list
+                  127.0.0.1:9997             null
+</code></pre>
+</div>
+
+<p>You can find the master and instance id for any accumulo instances using the same zookeeper instance:</p>
+
+<div class="highlighter-rouge"><pre class="highlight"><code>$ accumulo org.apache.accumulo.server.util.ListInstances
+INFO : Using ZooKeepers localhost:2181
+
+ Instance Name       | Instance ID                          | Master
+---------------------+--------------------------------------+-------------------------------
+              "test" | 6140b72e-edd8-4126-b2f5-e74a8bbe323b |                127.0.0.1:9999
+</code></pre>
+</div>
+
+<p><strong>One of my Accumulo processes died. How do I bring it back?</strong></p>
+
+<p>The easiest way to bring all services online for an Accumulo instance is to run the <code class="highlighter-rouge">accumulo-cluster</code> script.</p>
+
+<div class="highlighter-rouge"><pre class="highlight"><code>$ accumulo-cluster start
+</code></pre>
+</div>
+
+<p>This process will check the process listing, using <code class="highlighter-rouge">jps</code> on each host before attempting to restart a service on the given host.
+Typically, this check is sufficient except in the face of a hung/zombie process. For large clusters, it may be
+undesirable to ssh to every node in the cluster to ensure that all hosts are running the appropriate processes and <code class="highlighter-rouge">accumulo-service</code> may be of use.</p>
+
+<div class="highlighter-rouge"><pre class="highlight"><code>$ ssh host_with_dead_process
+$ accumulo-service tserver start
+</code></pre>
+</div>
+
+<p><strong>My process died again. Should I restart it via <code class="highlighter-rouge">cron</code> or tools like <code class="highlighter-rouge">supervisord</code>?</strong></p>
+
+<p>A repeatedly dying Accumulo process is a sign of a larger problem. Typically these problems are due to a
+misconfiguration of Accumulo or over-saturation of resources. Blind automation of any service restart inside of Accumulo
+is generally an undesirable situation as it is indicative of a problem that is being masked and ignored. Accumulo
+processes should be stable on the order of months and not require frequent restart.</p>
+
+<h2 id="accumulo-clients">Accumulo Clients</h2>
+
+<p><strong>Accumulo is not showing me any data!</strong></p>
+
+<p>Do you have your auths set so that it matches your visibilities?</p>
+
+<p><strong>What are my visibilities?</strong></p>
+
+<p>Use the <a href="/docs/unreleased/troubleshooting/tools#PrintInfo">PrintInfo</a> tool on a representative file to get some idea
+of the visibilities in the underlying data.</p>
+
+<p>Note that <code class="highlighter-rouge">PrintInfo</code> is an administrative tool and can only
+be used by someone who can access the underlying Accumulo data. It
+does not provide the normal access controls in Accumulo.</p>
+
+<h2 id="hdfs">HDFS</h2>
+
+<p>Accumulo reads and writes to the Hadoop Distributed File System.
+Accumulo needs this file system available at all times for normal operations.</p>
+
+<p><strong>Accumulo is having problems “getting a block blk_1234567890123”. How do I fix it?</strong></p>
+
+<p>This troubleshooting guide does not cover HDFS, but in general, you
+want to make sure that all the datanodes are running and an fsck check
+finds the file system clean:</p>
+
+<div class="highlighter-rouge"><pre class="highlight"><code>$ hadoop fsck /accumulo
+</code></pre>
+</div>
+
+<p>You can use:</p>
+
+<div class="highlighter-rouge"><pre class="highlight"><code>$ hadoop fsck /accumulo/path/to/corrupt/file -locations -blocks -files
+</code></pre>
+</div>
+
+<p>to locate the block references of individual corrupt files and use those
+references to search the name node and individual data node logs to determine which
+servers those blocks have been assigned to, and then try to fix any underlying file
+system issues on those nodes.</p>
+
+<p>On a larger cluster, you may need to increase the number of Xcievers for HDFS DataNodes:</p>
+
+<div class="language-xml highlighter-rouge"><pre class="highlight"><code><span class="nt">&lt;property&gt;</span>
+    <span class="nt">&lt;name&gt;</span>dfs.datanode.max.xcievers<span class="nt">&lt;/name&gt;</span>
+    <span class="nt">&lt;value&gt;</span>4096<span class="nt">&lt;/value&gt;</span>
+<span class="nt">&lt;/property&gt;</span>
+</code></pre>
+</div>
+
+<p>Verify that HDFS is healthy and check the datanode logs.</p>
+
+<h2 id="zookeeper">Zookeeper</h2>
+
+<p><strong>The <code class="highlighter-rouge">accumulo init</code> command is hanging. It says something about talking to zookeeper.</strong></p>
+
+<p>Zookeeper is also a distributed service.  You will need to ensure that
+it is up.  You can run the zookeeper command line tool to connect to
+any one of the zookeeper servers:</p>
+
+<div class="highlighter-rouge"><pre class="highlight"><code>$ zkCli.sh -server zoohost
+...
+[zk: zoohost:2181(CONNECTED) 0]
+</code></pre>
+</div>
+
+<p>It is important to see the word <code class="highlighter-rouge">CONNECTED</code>!  If you only see
+<code class="highlighter-rouge">CONNECTING</code> you will need to diagnose zookeeper errors.</p>
+
+<p>Check to make sure that zookeeper is up, and that
+<code class="highlighter-rouge">accumulo-site.xml</code> has been pointed to
+your zookeeper server(s).</p>
+
+<p><strong>Zookeeper is running, but it does not say CONNECTED</strong></p>
+
+<p>Zookeeper processes talk to each other to elect a leader.  All updates
+go through the leader and propagate to a majority of all the other
+nodes.  If a majority of the nodes cannot be reached, zookeeper will
+not allow updates.  Zookeeper also limits the number of connections to a
+server from any other single host.  By default, this limit can be as small as 10
+and can be reached in some everything-on-one-machine test configurations.</p>
+
+<p>You can check the election status and connection status of clients by
+asking the zookeeper nodes for their status.  You connect to zookeeper
+and ask it with the four-letter <code class="highlighter-rouge">stat</code> command:</p>
+
+<div class="highlighter-rouge"><pre class="highlight"><code>$ nc zoohost 2181
+stat
+Zookeeper version: 3.4.5-1392090, built on 09/30/2012 17:52 GMT
+Clients:
+ /127.0.0.1:58289[0](queued=0,recved=1,sent=0)
+ /127.0.0.1:60231[1](queued=0,recved=53910,sent=53915)
+
+Latency min/avg/max: 0/5/3008
+Received: 1561459
+Sent: 1561592
+Connections: 2
+Outstanding: 0
+Zxid: 0x621a3b
+Mode: standalone
+Node count: 22524
+</code></pre>
+</div>
+
+<p>Check zookeeper status, verify that it has a quorum, and has not exceeded maxClientCnxns.</p>
+
+
+
+    <div class="row" style="margin-top: 20px;">
+      <div class="col-md-10"><strong>Find documentation for all releases in the <a href="/docs-archive">archive</a></strong></div>
+      <div class="col-md-2"><a class="pull-right" href="https://github.com/apache/accumulo-website/edit/master/_docs-unreleased/troubleshooting/basic.md" role="button"><i class="glyphicon glyphicon-pencil"></i> <small>Edit this page</small></a></div>
+    </div>  
+  </div>
+</div>
+
+        </div>
+
+        
+<footer>
+
+  <p><a href="https://www.apache.org/foundation/contributing"><img src="https://www.apache.org/images/SupportApache-small.png" alt="Support the ASF" id="asf-logo" height="100" /></a></p>
+
+  <p>Copyright © 2011-2017 The Apache Software Foundation. Licensed under the <a href="https://www.apache.org/licenses/LICENSE-2.0">Apache License, Version 2.0</a>.</p>
+
+</footer>
+
+
+      </div>
+    </div>
+  </div>
+</body>
+</html>


Mime
View raw message