Return-Path: X-Original-To: apmail-hbase-commits-archive@www.apache.org Delivered-To: apmail-hbase-commits-archive@www.apache.org Received: from mail.apache.org (hermes.apache.org [140.211.11.3]) by minotaur.apache.org (Postfix) with SMTP id 83E8BD10B for ; Thu, 19 Jul 2012 22:20:13 +0000 (UTC) Received: (qmail 16022 invoked by uid 500); 19 Jul 2012 22:20:13 -0000 Delivered-To: apmail-hbase-commits-archive@hbase.apache.org Received: (qmail 15986 invoked by uid 500); 19 Jul 2012 22:20:13 -0000 Mailing-List: contact commits-help@hbase.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: dev@hbase.apache.org Delivered-To: mailing list commits@hbase.apache.org Received: (qmail 15978 invoked by uid 99); 19 Jul 2012 22:20:13 -0000 Received: from nike.apache.org (HELO nike.apache.org) (192.87.106.230) by apache.org (qpsmtpd/0.29) with ESMTP; Thu, 19 Jul 2012 22:20:13 +0000 X-ASF-Spam-Status: No, hits=-2000.0 required=5.0 tests=ALL_TRUSTED X-Spam-Check-By: apache.org Received: from [140.211.11.4] (HELO eris.apache.org) (140.211.11.4) by apache.org (qpsmtpd/0.29) with ESMTP; Thu, 19 Jul 2012 22:20:10 +0000 Received: from eris.apache.org (localhost [127.0.0.1]) by eris.apache.org (Postfix) with ESMTP id 56A332388860 for ; Thu, 19 Jul 2012 22:19:50 +0000 (UTC) Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit Subject: svn commit: r1363570 - in /hbase/branches/0.94/src/main/java/org/apache/hadoop/hbase/replication/regionserver: ReplicationSource.java ReplicationSourceManager.java Date: Thu, 19 Jul 2012 22:19:50 -0000 To: commits@hbase.apache.org From: jdcryans@apache.org X-Mailer: svnmailer-1.0.8-patched Message-Id: <20120719221950.56A332388860@eris.apache.org> Author: jdcryans Date: Thu Jul 19 22:19:49 2012 New Revision: 1363570 URL: http://svn.apache.org/viewvc?rev=1363570&view=rev Log: HBASE-6319 ReplicationSource can call terminate on itself and deadlock HBASE-6325 [replication] Race in ReplicationSourceManager.init can initiate a failover even if the node is alive Modified: hbase/branches/0.94/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSource.java hbase/branches/0.94/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSourceManager.java Modified: hbase/branches/0.94/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSource.java URL: http://svn.apache.org/viewvc/hbase/branches/0.94/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSource.java?rev=1363570&r1=1363569&r2=1363570&view=diff ============================================================================== --- hbase/branches/0.94/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSource.java (original) +++ hbase/branches/0.94/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSource.java Thu Jul 19 22:19:49 2012 @@ -717,7 +717,10 @@ public class ReplicationSource extends T + " because an error occurred: " + reason, cause); } this.running = false; - Threads.shutdown(this, this.sleepForRetries); + // Only wait for the thread to die if it's not us + if (!Thread.currentThread().equals(this)) { + Threads.shutdown(this, this.sleepForRetries); + } } /** Modified: hbase/branches/0.94/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSourceManager.java URL: http://svn.apache.org/viewvc/hbase/branches/0.94/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSourceManager.java?rev=1363570&r1=1363569&r2=1363570&view=diff ============================================================================== --- hbase/branches/0.94/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSourceManager.java (original) +++ hbase/branches/0.94/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSourceManager.java Thu Jul 19 22:19:49 2012 @@ -79,7 +79,7 @@ public class ReplicationSourceManager { // The path to the latest log we saw, for new coming sources private Path latestPath; // List of all the other region servers in this cluster - private final List otherRegionServers; + private final List otherRegionServers = new ArrayList(); // Path to the hlogs directories private final Path logDir; // Path to the hlog archive @@ -120,12 +120,9 @@ public class ReplicationSourceManager { this.sleepBeforeFailover = conf.getLong("replication.sleep.before.failover", 2000); this.zkHelper.registerRegionServerListener( new OtherRegionServerWatcher(this.zkHelper.getZookeeperWatcher())); - List otherRSs = - this.zkHelper.getRegisteredRegionServers(); this.zkHelper.registerRegionServerListener( new PeersWatcher(this.zkHelper.getZookeeperWatcher())); this.zkHelper.listPeersIdsAndWatch(); - this.otherRegionServers = otherRSs == null ? new ArrayList() : otherRSs; // It's preferable to failover 1 RS at a time, but with good zk servers // more could be processed at the same time. int nbWorkers = conf.getInt("replication.executor.workers", 1); @@ -180,6 +177,7 @@ public class ReplicationSourceManager { return; } synchronized (otherRegionServers) { + refreshOtherRegionServersList(); LOG.info("Current list of replicators: " + currentReplicators + " other RSs: " + otherRegionServers); } @@ -397,6 +395,27 @@ public class ReplicationSourceManager { } /** + * Reads the list of region servers from ZK and atomically clears our + * local view of it and replaces it with the updated list. + * + * @return true if the local list of the other region servers was updated + * with the ZK data (even if it was empty), + * false if the data was missing in ZK + */ + private boolean refreshOtherRegionServersList() { + List newRsList = zkHelper.getRegisteredRegionServers(); + if (newRsList == null) { + return false; + } else { + synchronized (otherRegionServers) { + otherRegionServers.clear(); + otherRegionServers.addAll(newRsList); + } + } + return true; + } + + /** * Watcher used to be notified of the other region server's death * in the local cluster. It initiates the process to transfer the queues * if it is able to grab the lock. @@ -415,7 +434,7 @@ public class ReplicationSourceManager { * @param path full path of the new node */ public void nodeCreated(String path) { - refreshRegionServersList(path); + refreshListIfRightPath(path); } /** @@ -426,7 +445,7 @@ public class ReplicationSourceManager { if (stopper.isStopped()) { return; } - boolean cont = refreshRegionServersList(path); + boolean cont = refreshListIfRightPath(path); if (!cont) { return; } @@ -442,23 +461,14 @@ public class ReplicationSourceManager { if (stopper.isStopped()) { return; } - refreshRegionServersList(path); + refreshListIfRightPath(path); } - private boolean refreshRegionServersList(String path) { + private boolean refreshListIfRightPath(String path) { if (!path.startsWith(zkHelper.getZookeeperWatcher().rsZNode)) { return false; } - List newRsList = (zkHelper.getRegisteredRegionServers()); - if (newRsList == null) { - return false; - } else { - synchronized (otherRegionServers) { - otherRegionServers.clear(); - otherRegionServers.addAll(newRsList); - } - } - return true; + return refreshOtherRegionServersList(); } }