accumulo-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From els...@apache.org
Subject [2/3] accumulo git commit: ACCUMULO-3963 Greatly loosen HDFS-unavailable killing properties.
Date Sat, 31 Oct 2015 19:39:29 GMT
ACCUMULO-3963 Greatly loosen HDFS-unavailable killing properties.

Reuse the Retry logic instead of the Cache (which was also broken,
as cache.size() is not guaranteed to be accurate with expiration).


Project: http://git-wip-us.apache.org/repos/asf/accumulo/repo
Commit: http://git-wip-us.apache.org/repos/asf/accumulo/commit/0eaece7f
Tree: http://git-wip-us.apache.org/repos/asf/accumulo/tree/0eaece7f
Diff: http://git-wip-us.apache.org/repos/asf/accumulo/diff/0eaece7f

Branch: refs/heads/master
Commit: 0eaece7f2ea96c80e47c6ae40b0116bc3857cb95
Parents: 3ae4336
Author: Josh Elser <elserj@apache.org>
Authored: Fri Oct 30 17:41:37 2015 -0400
Committer: Josh Elser <elserj@apache.org>
Committed: Sat Oct 31 14:48:06 2015 -0400

----------------------------------------------------------------------
 .../org/apache/accumulo/core/conf/Property.java | 11 +++--
 .../apache/accumulo/tserver/TabletServer.java   | 16 +++++--
 .../tserver/log/TabletServerLogger.java         | 46 +++++++++++++++-----
 .../accumulo/test/TabletServerGivesUpIT.java    |  2 +
 4 files changed, 56 insertions(+), 19 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/accumulo/blob/0eaece7f/core/src/main/java/org/apache/accumulo/core/conf/Property.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/org/apache/accumulo/core/conf/Property.java b/core/src/main/java/org/apache/accumulo/core/conf/Property.java
index 3e2b2e7..055e7dd 100644
--- a/core/src/main/java/org/apache/accumulo/core/conf/Property.java
+++ b/core/src/main/java/org/apache/accumulo/core/conf/Property.java
@@ -245,11 +245,16 @@ public enum Property {
           + "must be made, which is slower. However opening too many files at once can cause
problems."),
   TSERV_WALOG_MAX_SIZE("tserver.walog.max.size", "1G", PropertyType.MEMORY,
       "The maximum size for each write-ahead log. See comment for property tserver.memory.maps.max"),
-  TSERV_WALOG_TOLERATED_CREATION_FAILURES("tserver.walog.tolerated.creation.failures", "15",
PropertyType.COUNT,
+
+  TSERV_WALOG_TOLERATED_CREATION_FAILURES("tserver.walog.tolerated.creation.failures", "50",
PropertyType.COUNT,
       "The maximum number of failures tolerated when creating a new WAL file within the period
specified by tserver.walog.failures.period."
           + " Exceeding this number of failures in the period causes the TabletServer to
exit."),
-  TSERV_WALOG_TOLERATED_CREATION_FAILURES_PERIOD("tserver.walog.tolerated.creation.failures.period",
"10s", PropertyType.TIMEDURATION,
-      "The period in which the number of failures to create a WAL file in HDFS causes the
TabletServer to exit."),
+  TSERV_WALOG_TOLERATED_WAIT_INCREMENT("tserver.walog.tolerated.wait.increment", "1000ms",
PropertyType.TIMEDURATION,
+      "The amount of time to wait between failures to create a WALog."),
+  // Never wait longer than 5 mins for a retry
+  TSERV_WALOG_TOLERATED_MAXIMUM_WAIT_DURATION("tserver.walog.maximum.wait.duration", "5m",
PropertyType.TIMEDURATION,
+      "The maximum amount of time to wait after a failure to create a WAL file."),
+
   TSERV_MAJC_DELAY("tserver.compaction.major.delay", "30s", PropertyType.TIMEDURATION,
       "Time a tablet server will sleep between checking which tablets need compaction."),
   TSERV_MAJC_THREAD_MAXOPEN("tserver.compaction.major.thread.files.open.max", "10", PropertyType.COUNT,

http://git-wip-us.apache.org/repos/asf/accumulo/blob/0eaece7f/server/tserver/src/main/java/org/apache/accumulo/tserver/TabletServer.java
----------------------------------------------------------------------
diff --git a/server/tserver/src/main/java/org/apache/accumulo/tserver/TabletServer.java b/server/tserver/src/main/java/org/apache/accumulo/tserver/TabletServer.java
index 4be001a..034cb16 100644
--- a/server/tserver/src/main/java/org/apache/accumulo/tserver/TabletServer.java
+++ b/server/tserver/src/main/java/org/apache/accumulo/tserver/TabletServer.java
@@ -141,6 +141,7 @@ import org.apache.accumulo.core.util.UtilWaitThread;
 import org.apache.accumulo.core.zookeeper.ZooUtil;
 import org.apache.accumulo.fate.util.LoggingRunnable;
 import org.apache.accumulo.fate.zookeeper.IZooReaderWriter;
+import org.apache.accumulo.fate.zookeeper.RetryFactory;
 import org.apache.accumulo.fate.zookeeper.ZooLock.LockLossReason;
 import org.apache.accumulo.fate.zookeeper.ZooLock.LockWatcher;
 import org.apache.accumulo.fate.zookeeper.ZooUtil.NodeExistsPolicy;
@@ -343,14 +344,21 @@ public class TabletServer extends AccumuloServerContext implements Runnable
{
       }
     }, 5000, 5000);
 
-    final long walogMaxSize = getConfiguration().getMemoryInBytes(Property.TSERV_WALOG_MAX_SIZE);
+    final long walogMaxSize = aconf.getMemoryInBytes(Property.TSERV_WALOG_MAX_SIZE);
     final long minBlockSize = CachedConfiguration.getInstance().getLong("dfs.namenode.fs-limits.min-block-size",
0);
     if (minBlockSize != 0 && minBlockSize > walogMaxSize)
       throw new RuntimeException("Unable to start TabletServer. Logger is set to use blocksize
" + walogMaxSize + " but hdfs minimum block size is "
           + minBlockSize + ". Either increase the " + Property.TSERV_WALOG_MAX_SIZE + " or
decrease dfs.namenode.fs-limits.min-block-size in hdfs-site.xml.");
-    final long toleratedWalCreationFailures = getConfiguration().getCount(Property.TSERV_WALOG_TOLERATED_CREATION_FAILURES);
-    final long toleratedWalCreationFailuresPeriod = getConfiguration().getTimeInMillis(Property.TSERV_WALOG_TOLERATED_CREATION_FAILURES_PERIOD);
-    logger = new TabletServerLogger(this, walogMaxSize, syncCounter, flushCounter, toleratedWalCreationFailures,
toleratedWalCreationFailuresPeriod);
+
+    final long toleratedWalCreationFailures = aconf.getCount(Property.TSERV_WALOG_TOLERATED_CREATION_FAILURES);
+    final long walCreationFailureRetryIncrement = aconf.getTimeInMillis(Property.TSERV_WALOG_TOLERATED_WAIT_INCREMENT);
+    final long walCreationFailureRetryMax = aconf.getTimeInMillis(Property.TSERV_WALOG_TOLERATED_MAXIMUM_WAIT_DURATION);
+    // Tolerate `toleratedWalCreationFailures` failures, waiting `walCreationFailureRetryIncrement`
milliseconds after the first failure,
+    // incrementing the next wait period by the same value, up to a maximum wait of `walCreationFailureRetryMax`
milliseconds.
+    final RetryFactory walCreationRetryFactory = new RetryFactory(toleratedWalCreationFailures,
walCreationFailureRetryIncrement,
+        walCreationFailureRetryIncrement, walCreationFailureRetryMax);
+
+    logger = new TabletServerLogger(this, walogMaxSize, syncCounter, flushCounter, walCreationRetryFactory);
     this.resourceManager = new TabletServerResourceManager(this, fs);
     this.security = AuditedSecurityOperation.getInstance(this);
 

http://git-wip-us.apache.org/repos/asf/accumulo/blob/0eaece7f/server/tserver/src/main/java/org/apache/accumulo/tserver/log/TabletServerLogger.java
----------------------------------------------------------------------
diff --git a/server/tserver/src/main/java/org/apache/accumulo/tserver/log/TabletServerLogger.java
b/server/tserver/src/main/java/org/apache/accumulo/tserver/log/TabletServerLogger.java
index a1921c2..13c742a 100644
--- a/server/tserver/src/main/java/org/apache/accumulo/tserver/log/TabletServerLogger.java
+++ b/server/tserver/src/main/java/org/apache/accumulo/tserver/log/TabletServerLogger.java
@@ -26,7 +26,6 @@ import java.util.List;
 import java.util.Map;
 import java.util.Map.Entry;
 import java.util.Set;
-import java.util.concurrent.TimeUnit;
 import java.util.concurrent.atomic.AtomicInteger;
 import java.util.concurrent.atomic.AtomicLong;
 import java.util.concurrent.locks.ReadWriteLock;
@@ -38,6 +37,8 @@ import org.apache.accumulo.core.data.impl.KeyExtent;
 import org.apache.accumulo.core.protobuf.ProtobufUtil;
 import org.apache.accumulo.core.replication.ReplicationConfigurationUtil;
 import org.apache.accumulo.core.util.UtilWaitThread;
+import org.apache.accumulo.fate.zookeeper.Retry;
+import org.apache.accumulo.fate.zookeeper.RetryFactory;
 import org.apache.accumulo.server.conf.TableConfiguration;
 import org.apache.accumulo.server.fs.VolumeManager;
 import org.apache.accumulo.server.replication.StatusUtil;
@@ -53,9 +54,6 @@ import org.apache.hadoop.fs.Path;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
-import com.google.common.cache.Cache;
-import com.google.common.cache.CacheBuilder;
-
 /**
  * Central logging facility for the TServerInfo.
  *
@@ -92,8 +90,8 @@ public class TabletServerLogger {
   private final AtomicLong syncCounter;
   private final AtomicLong flushCounter;
 
-  private final long toleratedFailures;
-  private final Cache<Long,Object> walErrors;
+  private final RetryFactory retryFactory;
+  private Retry retry = null;
 
   static private abstract class TestCallWithWriteLock {
     abstract boolean test();
@@ -138,14 +136,13 @@ public class TabletServerLogger {
     }
   }
 
-  public TabletServerLogger(TabletServer tserver, long maxSize, AtomicLong syncCounter, AtomicLong
flushCounter, long toleratedWalCreationFailures,
-      long toleratedFailuresPeriodMillis) {
+  public TabletServerLogger(TabletServer tserver, long maxSize, AtomicLong syncCounter, AtomicLong
flushCounter, RetryFactory retryFactory) {
     this.tserver = tserver;
     this.maxSize = maxSize;
     this.syncCounter = syncCounter;
     this.flushCounter = flushCounter;
-    this.toleratedFailures = toleratedWalCreationFailures;
-    this.walErrors = CacheBuilder.newBuilder().maximumSize(toleratedFailures).expireAfterWrite(toleratedFailuresPeriodMillis,
TimeUnit.MILLISECONDS).build();
+    this.retryFactory = retryFactory;
+    this.retry = null;
   }
 
   private int initializeLoggers(final List<DfsLogger> copy) throws IOException {
@@ -203,12 +200,37 @@ public class TabletServerLogger {
       alog.open(tserver.getClientAddressString());
       loggers.add(alog);
       logSetId.incrementAndGet();
+
+      // When we successfully create a WAL, make sure to reset the Retry.
+      if (null != retry) {
+        retry = null;
+      }
+
       return;
     } catch (Exception t) {
-      walErrors.put(System.currentTimeMillis(), "");
-      if (walErrors.size() > toleratedFailures) {
+      if (null == retry) {
+        retry = retryFactory.create();
+      }
+
+      // We have more retries or we exceeded the maximum number of accepted failures
+      if (retry.canRetry()) {
+        // Use the retry and record the time in which we did so
+        retry.useRetry();
+
+        try {
+          // Backoff
+          retry.waitForNextAttempt();
+        } catch (InterruptedException e) {
+          Thread.currentThread().interrupt();
+          throw new RuntimeException(e);
+        }
+      } else {
+        log.error("Repeatedly failed to create WAL. Going to exit tabletserver.", t);
+        // We didn't have retries or we failed too many times.
         Halt.halt("Experienced too many errors creating WALs, giving up");
       }
+
+      // The exception will trigger the log creation to be re-attempted.
       throw new RuntimeException(t);
     }
   }

http://git-wip-us.apache.org/repos/asf/accumulo/blob/0eaece7f/test/src/test/java/org/apache/accumulo/test/TabletServerGivesUpIT.java
----------------------------------------------------------------------
diff --git a/test/src/test/java/org/apache/accumulo/test/TabletServerGivesUpIT.java b/test/src/test/java/org/apache/accumulo/test/TabletServerGivesUpIT.java
index 5da0629..081ee85 100644
--- a/test/src/test/java/org/apache/accumulo/test/TabletServerGivesUpIT.java
+++ b/test/src/test/java/org/apache/accumulo/test/TabletServerGivesUpIT.java
@@ -38,6 +38,8 @@ public class TabletServerGivesUpIT extends ConfigurableMacIT {
     cfg.useMiniDFS(true);
     cfg.setNumTservers(1);
     cfg.setProperty(Property.INSTANCE_ZK_TIMEOUT, "5s");
+    cfg.setProperty(Property.TSERV_WALOG_TOLERATED_CREATION_FAILURES, "15");
+    cfg.setProperty(Property.TSERV_WALOG_TOLERATED_MAXIMUM_WAIT_DURATION, "0s");
   }
 
   @Test(timeout = 30 * 1000)


Mime
View raw message