hbase-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From te...@apache.org
Subject svn commit: r1450000 - in /hbase/branches/0.94/src/main/java/org/apache/hadoop/hbase: master/SplitLogManager.java zookeeper/RecoverableZooKeeper.java
Date Tue, 26 Feb 2013 02:20:09 GMT
Author: tedyu
Date: Tue Feb 26 02:20:08 2013
New Revision: 1450000

URL: http://svn.apache.org/r1450000
Log:
HBASE-7914 Port the fix of HBASE-6748 into 0.94 branch (Jeffrey Zhong)


Modified:
    hbase/branches/0.94/src/main/java/org/apache/hadoop/hbase/master/SplitLogManager.java
    hbase/branches/0.94/src/main/java/org/apache/hadoop/hbase/zookeeper/RecoverableZooKeeper.java

Modified: hbase/branches/0.94/src/main/java/org/apache/hadoop/hbase/master/SplitLogManager.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.94/src/main/java/org/apache/hadoop/hbase/master/SplitLogManager.java?rev=1450000&r1=1449999&r2=1450000&view=diff
==============================================================================
--- hbase/branches/0.94/src/main/java/org/apache/hadoop/hbase/master/SplitLogManager.java (original)
+++ hbase/branches/0.94/src/main/java/org/apache/hadoop/hbase/master/SplitLogManager.java Tue Feb 26 02:20:08 2013
@@ -23,10 +23,11 @@ import static org.apache.hadoop.hbase.zo
 
 import java.io.IOException;
 import java.util.ArrayList;
+import java.util.Collections;
 import java.util.HashSet;
-import java.util.Set;
 import java.util.List;
 import java.util.Map;
+import java.util.Set;
 import java.util.concurrent.ConcurrentHashMap;
 import java.util.concurrent.ConcurrentMap;
 
@@ -44,11 +45,11 @@ import org.apache.hadoop.hbase.monitorin
 import org.apache.hadoop.hbase.monitoring.TaskMonitor;
 import org.apache.hadoop.hbase.regionserver.SplitLogWorker;
 import org.apache.hadoop.hbase.regionserver.wal.HLogSplitter;
-import org.apache.hadoop.hbase.regionserver.wal.OrphanHLogAfterSplitException;
 import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
 import org.apache.hadoop.hbase.util.FSUtils;
 import org.apache.hadoop.hbase.util.Threads;
 import org.apache.hadoop.hbase.zookeeper.ZKSplitLog;
+import org.apache.hadoop.hbase.zookeeper.ZKSplitLog.TaskState;
 import org.apache.hadoop.hbase.zookeeper.ZKUtil;
 import org.apache.hadoop.hbase.zookeeper.ZooKeeperListener;
 import org.apache.hadoop.hbase.zookeeper.ZooKeeperWatcher;
@@ -59,7 +60,6 @@ import org.apache.zookeeper.KeeperExcept
 import org.apache.zookeeper.KeeperException.NoNodeException;
 import org.apache.zookeeper.ZooDefs.Ids;
 import org.apache.zookeeper.data.Stat;
-import org.apache.hadoop.hbase.zookeeper.ZKSplitLog.TaskState;
 
 import static org.apache.hadoop.hbase.master.SplitLogManager.ResubmitDirective.*;
 import static org.apache.hadoop.hbase.master.SplitLogManager.TerminationStatus.*;
@@ -116,6 +116,8 @@ public class SplitLogManager extends Zoo
   private Set<String> deadWorkers = null;
   private Object deadWorkersLock = new Object();
 
+  private Set<String> failedDeletions = null;
+
   /**
    * Wrapper around {@link #SplitLogManager(ZooKeeperWatcher, Configuration,
    * Stoppable, String, TaskFinisher)} that provides a task finisher for
@@ -178,6 +180,8 @@ public class SplitLogManager extends Zoo
         conf.getInt("hbase.splitlog.manager.timeoutmonitor.period",
             1000),
         stopper);
+
+    this.failedDeletions = Collections.synchronizedSet(new HashSet<String>());
   }
 
   public void finishInitialization(boolean masterRecovery) {
@@ -416,11 +420,12 @@ public class SplitLogManager extends Zoo
         }
       }
     }
-    // delete the task node in zk. Keep trying indefinitely - its an async
+    // delete the task node in zk. It's an async
     // call and no one is blocked waiting for this node to be deleted. All
     // task names are unique (log.<timestamp>) there is no risk of deleting
     // a future task.
-    deleteNode(path, Long.MAX_VALUE);
+    // if a deletion fails, TimeoutMonitor will retry the same deletion later
+    deleteNode(path, zkretries);
     return;
   }
 
@@ -532,6 +537,21 @@ public class SplitLogManager extends Zoo
     }
   }
 
+  /**
+   * Helper function to check whether to abandon retries in ZooKeeper AsyncCallback functions
+   * @param statusCode integer value of a ZooKeeper exception code
+   * @param action description message about the retried action
+   * @return true when need to abandon retries, otherwise false
+   */
+  private boolean shouldAbandonRetries(int statusCode, String action) {
+    if (statusCode == KeeperException.Code.SESSIONEXPIRED.intValue()) {
+      LOG.error("ZK session expired. Master is expected to shut down. Abandoning retries for "
+          + "action=" + action);
+      return true;
+    }
+    return false;
+  }
+
   private void heartbeat(String path, int new_version,
       String workerName) {
     Task task = findOrCreateOrphanTask(path);
@@ -653,8 +673,7 @@ public class SplitLogManager extends Zoo
   }
 
   private void deleteNodeFailure(String path) {
-    LOG.fatal("logic failure, failing to delete a node should never happen " +
-        "because delete has infinite retries");
+    LOG.info("Failed to delete node " + path + " and will retry soon.");
     return;
   }
 
@@ -1000,6 +1019,16 @@ public class SplitLogManager extends Zoo
         tot_mgr_resubmit_unassigned.incrementAndGet();
         LOG.debug("resubmitting unassigned task(s) after timeout");
       }
+
+      // Retry previously failed deletes
+      if (failedDeletions.size() > 0) {
+        List<String> tmpPaths = new ArrayList<String>(failedDeletions);
+        failedDeletions.removeAll(tmpPaths);
+        for (String tmpPath : tmpPaths) {
+          // deleteNode is an async call
+          deleteNode(tmpPath, zkretries);
+        }
+      }
     }
   }
 
@@ -1014,6 +1043,10 @@ public class SplitLogManager extends Zoo
     public void processResult(int rc, String path, Object ctx, String name) {
       tot_mgr_node_create_result.incrementAndGet();
       if (rc != 0) {
+        if (shouldAbandonRetries(rc, "Create znode " + path)) {
+          createNodeFailure(path);
+          return;
+        }
         if (rc == KeeperException.Code.NODEEXISTS.intValue()) {
           // What if there is a delete pending against this pre-existing
           // znode? Then this soon-to-be-deleted task znode must be in TASK_DONE
@@ -1053,6 +1086,9 @@ public class SplitLogManager extends Zoo
         Stat stat) {
       tot_mgr_get_data_result.incrementAndGet();
       if (rc != 0) {
+        if (shouldAbandonRetries(rc, "GetData from znode " + path)) {
+          return;
+        }
         if (rc == KeeperException.Code.NONODE.intValue()) {
           tot_mgr_get_data_nonode.incrementAndGet();
           // The task znode has been deleted. Must be some pending delete
@@ -1096,6 +1132,10 @@ public class SplitLogManager extends Zoo
     public void processResult(int rc, String path, Object ctx) {
       tot_mgr_node_delete_result.incrementAndGet();
       if (rc != 0) {
+        if (shouldAbandonRetries(rc, "Delete znode " + path)) {
+          failedDeletions.add(path);
+          return;
+        }
         if (rc != KeeperException.Code.NONODE.intValue()) {
           tot_mgr_node_delete_err.incrementAndGet();
           Long retry_count = (Long) ctx;
@@ -1103,6 +1143,7 @@ public class SplitLogManager extends Zoo
               path + " remaining retries=" + retry_count);
           if (retry_count == 0) {
             LOG.warn("delete failed " + path);
+            failedDeletions.add(path);
             deleteNodeFailure(path);
           } else {
             deleteNode(path, retry_count - 1);
@@ -1134,6 +1175,9 @@ public class SplitLogManager extends Zoo
     @Override
     public void processResult(int rc, String path, Object ctx, String name) {
       if (rc != 0) {
+        if (shouldAbandonRetries(rc, "CreateRescan znode " + path)) {
+          return;
+        }
         Long retry_count = (Long)ctx;
         LOG.warn("rc=" + KeeperException.Code.get(rc) + " for "+ path +
             " remaining retries=" + retry_count);

Modified: hbase/branches/0.94/src/main/java/org/apache/hadoop/hbase/zookeeper/RecoverableZooKeeper.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.94/src/main/java/org/apache/hadoop/hbase/zookeeper/RecoverableZooKeeper.java?rev=1450000&r1=1449999&r2=1450000&view=diff
==============================================================================
--- hbase/branches/0.94/src/main/java/org/apache/hadoop/hbase/zookeeper/RecoverableZooKeeper.java (original)
+++ hbase/branches/0.94/src/main/java/org/apache/hadoop/hbase/zookeeper/RecoverableZooKeeper.java Tue Feb 26 02:20:08 2013
@@ -70,7 +70,7 @@ import org.apache.zookeeper.proto.SetDat
 public class RecoverableZooKeeper {
   private static final Log LOG = LogFactory.getLog(RecoverableZooKeeper.class);
   // the actual ZooKeeper client instance
-  private ZooKeeper zk;
+  private volatile ZooKeeper zk;
   private final RetryCounterFactory retryCounterFactory;
   // An identifier of this process in the cluster
   private final String identifier;



Mime
View raw message