hbase-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From apurt...@apache.org
Subject svn commit: r1511424 - in /hbase/branches/0.92/src: main/java/org/apache/hadoop/hbase/replication/ main/java/org/apache/hadoop/hbase/replication/regionserver/ test/java/org/apache/hadoop/hbase/replication/
Date Wed, 07 Aug 2013 18:04:14 GMT
Author: apurtell
Date: Wed Aug  7 18:04:14 2013
New Revision: 1511424

URL: http://svn.apache.org/r1511424
Log:
HBASE-9154. [0.92] Backport HBASE-8207 Replication could have data loss when machine name
contains hyphen "-" (Jeffrey)

Modified:
    hbase/branches/0.92/src/main/java/org/apache/hadoop/hbase/replication/ReplicationZookeeper.java
    hbase/branches/0.92/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSource.java
    hbase/branches/0.92/src/test/java/org/apache/hadoop/hbase/replication/TestMasterReplication.java
    hbase/branches/0.92/src/test/java/org/apache/hadoop/hbase/replication/TestMultiSlaveReplication.java
    hbase/branches/0.92/src/test/java/org/apache/hadoop/hbase/replication/TestReplication.java

Modified: hbase/branches/0.92/src/main/java/org/apache/hadoop/hbase/replication/ReplicationZookeeper.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.92/src/main/java/org/apache/hadoop/hbase/replication/ReplicationZookeeper.java?rev=1511424&r1=1511423&r2=1511424&view=diff
==============================================================================
--- hbase/branches/0.92/src/main/java/org/apache/hadoop/hbase/replication/ReplicationZookeeper.java
(original)
+++ hbase/branches/0.92/src/main/java/org/apache/hadoop/hbase/replication/ReplicationZookeeper.java
Wed Aug  7 18:04:14 2013
@@ -596,7 +596,8 @@ public class ReplicationZookeeper {
         List<String> hlogs = ZKUtil.listChildrenNoWatch(this.zookeeper, clusterPath);
         // That region server didn't have anything to replicate for this cluster
         if (hlogs == null || hlogs.size() == 0) {
-          continue;
+          ZKUtil.deleteNodeFailSilent(this.zookeeper, clusterPath);
+          continue; // empty log queue.
         }
         SortedSet<String> logQueue = new TreeSet<String>();
         queues.put(newCluster, logQueue);

Modified: hbase/branches/0.92/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSource.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.92/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSource.java?rev=1511424&r1=1511423&r2=1511424&view=diff
==============================================================================
--- hbase/branches/0.92/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSource.java
(original)
+++ hbase/branches/0.92/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSource.java
Wed Aug  7 18:04:14 2013
@@ -25,6 +25,7 @@ import java.io.IOException;
 import java.net.SocketTimeoutException;
 import java.util.ArrayList;
 import java.util.Arrays;
+import java.util.Collections;
 import java.util.Comparator;
 import java.util.HashSet;
 import java.util.List;
@@ -56,7 +57,6 @@ import org.apache.hadoop.hbase.regionser
 import org.apache.hadoop.hbase.replication.ReplicationZookeeper;
 import org.apache.hadoop.hbase.util.Bytes;
 import org.apache.hadoop.hbase.util.Threads;
-import org.apache.hadoop.hbase.zookeeper.ClusterId;
 import org.apache.hadoop.ipc.RemoteException;
 import org.apache.zookeeper.KeeperException;
 
@@ -123,7 +123,7 @@ public class ReplicationSource extends T
   // Indicates if this queue is recovered (and will be deleted when depleted)
   private boolean queueRecovered;
   // List of all the dead region servers that had this queue (if recovered)
-  private String[] deadRegionServers;
+  private List<String> deadRegionServers = new ArrayList<String>();
   // Maximum number of retries before taking bold actions
   private int maxRetriesMultiplier;
   // Socket timeouts require even bolder actions since we don't want to DDOS
@@ -199,17 +199,71 @@ public class ReplicationSource extends T
 
   // The passed znode will be either the id of the peer cluster or
   // the handling story of that queue in the form of id-servername-*
-  private void checkIfQueueRecovered(String peerClusterZnode) {
-    String[] parts = peerClusterZnode.split("-");
+  //
+  // package access for testing
+  void checkIfQueueRecovered(String peerClusterZnode) {
+    String[] parts = peerClusterZnode.split("-", 2);
     this.queueRecovered = parts.length != 1;
     this.peerId = this.queueRecovered ?
         parts[0] : peerClusterZnode;
     this.peerClusterZnode = peerClusterZnode;
-    this.deadRegionServers = new String[parts.length-1];
-    // Extract all the places where we could find the hlogs
-    for (int i = 1; i < parts.length; i++) {
-      this.deadRegionServers[i-1] = parts[i];
+
+    if (parts.length < 2) {
+      // not a recovered queue; there are no dead servers to extract
+      return;
     }
+
+    // extract dead servers
+    extractDeadServersFromZNodeString(parts[1], this.deadRegionServers);
+  }
+
+  /**
+   * for tests only
+   */
+  List<String> getDeadRegionServers() {
+    return Collections.unmodifiableList(this.deadRegionServers);
+  }
+
+  /**
+   * Parse dead server names from a znode string. A server name can contain "-", such as
+   * "ip-10-46-221-101.ec2.internal", so we need to skip some "-" during parsing in cases like
+   * the following: 2-ip-10-46-221-101.ec2.internal,52170,1364333181125-<server name>-...
+   */
+  private static void extractDeadServersFromZNodeString(String deadServerListStr,
+      List<String> result) {
+
+    if (deadServerListStr == null || result == null || deadServerListStr.isEmpty()) return;
+    
+    // a "-" is a valid server name delimiter only after at least two "," have been seen
+    int seenCommaCnt = 0;
+    int startIndex = 0;
+    int len = deadServerListStr.length();
+  
+    for (int i = 0; i < len; i++) {
+      switch (deadServerListStr.charAt(i)) {
+      case ',':
+        seenCommaCnt += 1;
+        break;
+      case '-':
+        if (seenCommaCnt >= 2) {
+          if (i > startIndex) {
+            result.add(deadServerListStr.substring(startIndex, i));
+            startIndex = i + 1;
+          }
+          seenCommaCnt = 0;
+        }
+        break;
+      default:
+        break;
+      }
+    }
+
+    // add tail
+    if (startIndex < len - 1) {
+      result.add(deadServerListStr.substring(startIndex, len));
+    }
+
+    LOG.debug("Found dead servers:" + result);
   }
 
   /**
@@ -497,11 +551,10 @@ public class ReplicationSource extends T
           // We didn't find the log in the archive directory, look if it still
           // exists in the dead RS folder (there could be a chain of failures
           // to look at)
-          LOG.info("NB dead servers : " + deadRegionServers.length);
-          for (int i = this.deadRegionServers.length - 1; i >= 0; i--) {
-
+          LOG.info("NB dead servers : " + deadRegionServers.size());
+          for (String curDeadServerName : deadRegionServers) {
             Path deadRsDirectory =
-                new Path(manager.getLogDir().getParent(), this.deadRegionServers[i]);
+                new Path(manager.getLogDir().getParent(), curDeadServerName);
             Path[] locs = new Path[] {
                 new Path(deadRsDirectory, currentPath.getName()),
                 new Path(deadRsDirectory.suffix(HLog.SPLITTING_EXT),

Modified: hbase/branches/0.92/src/test/java/org/apache/hadoop/hbase/replication/TestMasterReplication.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.92/src/test/java/org/apache/hadoop/hbase/replication/TestMasterReplication.java?rev=1511424&r1=1511423&r2=1511424&view=diff
==============================================================================
--- hbase/branches/0.92/src/test/java/org/apache/hadoop/hbase/replication/TestMasterReplication.java
(original)
+++ hbase/branches/0.92/src/test/java/org/apache/hadoop/hbase/replication/TestMasterReplication.java
Wed Aug  7 18:04:14 2013
@@ -95,7 +95,7 @@ public class TestMasterReplication {
     // smaller block size and capacity to trigger more operations
     // and test them
     conf1.setInt("hbase.regionserver.hlog.blocksize", 1024*20);
-    conf1.setInt("replication.source.size.capacity", 1024);
+    conf1.setInt("replication.source.size.capacity", 10240);
     conf1.setLong("replication.source.sleepforretries", 100);
     conf1.setInt("hbase.regionserver.maxlogs", 10);
     conf1.setLong("hbase.master.logcleaner.ttl", 10);

Modified: hbase/branches/0.92/src/test/java/org/apache/hadoop/hbase/replication/TestMultiSlaveReplication.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.92/src/test/java/org/apache/hadoop/hbase/replication/TestMultiSlaveReplication.java?rev=1511424&r1=1511423&r2=1511424&view=diff
==============================================================================
--- hbase/branches/0.92/src/test/java/org/apache/hadoop/hbase/replication/TestMultiSlaveReplication.java
(original)
+++ hbase/branches/0.92/src/test/java/org/apache/hadoop/hbase/replication/TestMultiSlaveReplication.java
Wed Aug  7 18:04:14 2013
@@ -82,7 +82,7 @@ public class TestMultiSlaveReplication {
     // smaller block size and capacity to trigger more operations
     // and test them
     conf1.setInt("hbase.regionserver.hlog.blocksize", 1024*20);
-    conf1.setInt("replication.source.size.capacity", 1024);
+    conf1.setInt("replication.source.size.capacity", 10240);
     conf1.setLong("replication.source.sleepforretries", 100);
     conf1.setInt("hbase.regionserver.maxlogs", 10);
     conf1.setLong("hbase.master.logcleaner.ttl", 10);

Modified: hbase/branches/0.92/src/test/java/org/apache/hadoop/hbase/replication/TestReplication.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.92/src/test/java/org/apache/hadoop/hbase/replication/TestReplication.java?rev=1511424&r1=1511423&r2=1511424&view=diff
==============================================================================
--- hbase/branches/0.92/src/test/java/org/apache/hadoop/hbase/replication/TestReplication.java
(original)
+++ hbase/branches/0.92/src/test/java/org/apache/hadoop/hbase/replication/TestReplication.java
Wed Aug  7 18:04:14 2013
@@ -94,7 +94,7 @@ public class TestReplication {
     // smaller block size and capacity to trigger more operations
     // and test them
     conf1.setInt("hbase.regionserver.hlog.blocksize", 1024*20);
-    conf1.setInt("replication.source.size.capacity", 1024);
+    conf1.setInt("replication.source.size.capacity", 10240);
     conf1.setLong("replication.source.sleepforretries", 100);
     conf1.setInt("hbase.regionserver.maxlogs", 10);
     conf1.setLong("hbase.master.logcleaner.ttl", 10);



Mime
View raw message