hbase-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From jdcry...@apache.org
Subject svn commit: r1243736 - in /hbase/branches/0.92: CHANGES.txt src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSource.java
Date Mon, 13 Feb 2012 23:16:45 GMT
Author: jdcryans
Date: Mon Feb 13 23:16:44 2012
New Revision: 1243736

URL: http://svn.apache.org/viewvc?rev=1243736&view=rev
Log:
HBASE-5197  [replication] Handle socket timeouts in ReplicationSource
            to prevent DDOS

Modified:
    hbase/branches/0.92/CHANGES.txt
    hbase/branches/0.92/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSource.java

Modified: hbase/branches/0.92/CHANGES.txt
URL: http://svn.apache.org/viewvc/hbase/branches/0.92/CHANGES.txt?rev=1243736&r1=1243735&r2=1243736&view=diff
==============================================================================
--- hbase/branches/0.92/CHANGES.txt (original)
+++ hbase/branches/0.92/CHANGES.txt Mon Feb 13 23:16:44 2012
@@ -19,6 +19,10 @@ Release 0.92.1 - Unreleased
    HBASE-5364  Fix source files missing licenses in 0.92 and trunk
    HBASE-5363  Automatically run rat check on mvn release builds
 
+  IMPROVEMENTS
+   HBASE-5197  [replication] Handle socket timeouts in ReplicationSource
+               to prevent DDOS
+
   TESTS
    HBASE-5223  TestMetaReaderEditor is missing call to CatalogTracker.stop()
 

Modified: hbase/branches/0.92/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSource.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.92/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSource.java?rev=1243736&r1=1243735&r2=1243736&view=diff
==============================================================================
--- hbase/branches/0.92/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSource.java (original)
+++ hbase/branches/0.92/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSource.java Mon Feb 13 23:16:44 2012
@@ -22,6 +22,7 @@ package org.apache.hadoop.hbase.replicat
 import java.io.EOFException;
 import java.io.FileNotFoundException;
 import java.io.IOException;
+import java.net.SocketTimeoutException;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Comparator;
@@ -124,7 +125,9 @@ public class ReplicationSource extends T
   // List of all the dead region servers that had this queue (if recovered)
   private String[] deadRegionServers;
   // Maximum number of retries before taking bold actions
-  private long maxRetriesMultiplier;
+  private int maxRetriesMultiplier;
+  // Socket timeouts require even bolder actions since we don't want to DDOS
+  private int socketTimeoutMultiplier;
   // Current number of entries that we need to replicate
   private int currentNbEntries = 0;
   // Current number of operations (Put/Delete) that we need to replicate
@@ -166,7 +169,8 @@ public class ReplicationSource extends T
       this.entriesArray[i] = new HLog.Entry();
     }
     this.maxRetriesMultiplier =
-        this.conf.getLong("replication.source.maxretriesmultiplier", 10);
+        this.conf.getInt("replication.source.maxretriesmultiplier", 10);
+    this.socketTimeoutMultiplier = maxRetriesMultiplier * maxRetriesMultiplier;
     this.queue =
         new PriorityBlockingQueue<Path>(
             conf.getInt("hbase.regionserver.maxlogs", 32),
@@ -620,8 +624,19 @@ public class ReplicationSource extends T
           ioe = ((RemoteException) ioe).unwrapRemoteException();
           LOG.warn("Can't replicate because of an error on the remote cluster: ", ioe);
         } else {
-          LOG.warn("Can't replicate because of a local or network error: ", ioe);
+          if (ioe instanceof SocketTimeoutException) {
+            // This exception means we waited for more than 60s and nothing
+            // happened, the cluster is alive and calling it right away
+            // even for a test just makes things worse.
+            sleepForRetries("Encountered a SocketTimeoutException. Since the" +
+              "call to the remote cluster timed out, which is usually " +
+              "caused by a machine failure or a massive slowdown",
+              this.socketTimeoutMultiplier);
+          } else {
+            LOG.warn("Can't replicate because of a local or network error: ", ioe);
+          }
         }
+
         try {
           boolean down;
           // Spin while the slave is down and we're not asked to shutdown/close



Mime
View raw message