accumulo-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From e..@apache.org
Subject [1/4] git commit: ACCUMULO-2261 cherry pick back 1.5
Date Tue, 28 Jan 2014 19:22:53 GMT
Updated Branches:
  refs/heads/master 25948e6b1 -> 8681f07ca


ACCUMULO-2261 cherry pick back 1.5


Project: http://git-wip-us.apache.org/repos/asf/accumulo/repo
Commit: http://git-wip-us.apache.org/repos/asf/accumulo/commit/772ca16b
Tree: http://git-wip-us.apache.org/repos/asf/accumulo/tree/772ca16b
Diff: http://git-wip-us.apache.org/repos/asf/accumulo/diff/772ca16b

Branch: refs/heads/master
Commit: 772ca16bc8cbaef6db4bbd440145dc744947fedb
Parents: ffd7232
Author: Eric Newton <eric.newton@gmail.com>
Authored: Tue Jan 28 13:46:31 2014 -0500
Committer: Eric Newton <eric.newton@gmail.com>
Committed: Tue Jan 28 14:15:53 2014 -0500

----------------------------------------------------------------------
 .../apache/accumulo/server/master/Master.java   | 58 +++++++++++++++++++-
 .../master/state/MetaDataTableScanner.java      | 11 +---
 .../master/state/TabletLocationState.java       |  8 ++-
 3 files changed, 65 insertions(+), 12 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/accumulo/blob/772ca16b/server/src/main/java/org/apache/accumulo/server/master/Master.java
----------------------------------------------------------------------
diff --git a/server/src/main/java/org/apache/accumulo/server/master/Master.java b/server/src/main/java/org/apache/accumulo/server/master/Master.java
index 100a351..33919f7 100644
--- a/server/src/main/java/org/apache/accumulo/server/master/Master.java
+++ b/server/src/main/java/org/apache/accumulo/server/master/Master.java
@@ -116,6 +116,7 @@ import org.apache.accumulo.server.master.state.TServerInstance;
 import org.apache.accumulo.server.master.state.TableCounts;
 import org.apache.accumulo.server.master.state.TableStats;
 import org.apache.accumulo.server.master.state.TabletLocationState;
+import org.apache.accumulo.server.master.state.TabletLocationState.BadLocationStateException;
 import org.apache.accumulo.server.master.state.TabletMigration;
 import org.apache.accumulo.server.master.state.TabletServerState;
 import org.apache.accumulo.server.master.state.TabletState;
@@ -1438,11 +1439,66 @@ public class Master implements LiveTServerSet.Listener, TableObserver, CurrentSt
           eventListener.waitForEvents(TIME_TO_WAIT_BETWEEN_SCANS);
         } catch (Exception ex) {
           log.error("Error processing table state for store " + store.name(), ex);
-          UtilWaitThread.sleep(WAIT_BETWEEN_ERRORS);
+          if (ex.getCause() != null && ex.getCause() instanceof BadLocationStateException) {
+            repairMetadata(((BadLocationStateException) ex.getCause()).getEncodedEndRow());
+          } else {
+            UtilWaitThread.sleep(WAIT_BETWEEN_ERRORS);
+          }
         }
       }
     }
     
+  private void repairMetadata(Text row) {
+    Master.log.debug("Attempting repair on " + row);
+    // ACCUMULO-2261 if a dying tserver writes a location before its lock information propagates, it may cause duplicate assignment.
+    // Attempt to find the dead server entry and remove it.
+    try {
+      Map<Key, Value> future = new HashMap<Key, Value>();
+      Map<Key, Value> assigned = new HashMap<Key, Value>();
+      KeyExtent extent = new KeyExtent(row, new Value(new byte[]{0}));
+      String table = Constants.METADATA_TABLE_NAME;
+      Scanner scanner = getConnector().createScanner(table, Constants.NO_AUTHS);
+      scanner.fetchColumnFamily(Constants.METADATA_CURRENT_LOCATION_COLUMN_FAMILY);
+      scanner.fetchColumnFamily(Constants.METADATA_FUTURE_LOCATION_COLUMN_FAMILY);
+      scanner.setRange(new Range(row));
+      for (Entry<Key,Value> entry : scanner) {
+        if (entry.getKey().getColumnFamily().equals(Constants.METADATA_CURRENT_LOCATION_COLUMN_FAMILY)) {
+          assigned.put(entry.getKey(), entry.getValue());
+        } else if (entry.getKey().getColumnFamily().equals(Constants.METADATA_FUTURE_LOCATION_COLUMN_FAMILY)) {
+          future.put(entry.getKey(), entry.getValue());
+        }
+      }
+      if (future.size() > 0 && assigned.size() > 0) {
+        Master.log.warn("Found a tablet assigned and hosted, attempting to repair");
+      } else if (future.size() > 1 && assigned.size() == 0) {
+        Master.log.warn("Found a tablet assigned to multiple servers, attempting to repair");
+      } else if (future.size() == 0 && assigned.size() > 1) {
+        Master.log.warn("Found a tablet hosted on multiple servers, attempting to repair");
+      } else {
+        Master.log.info("Attempted a repair, but nothing seems to be obviously wrong. " + assigned + " " + future);
+        return;
+      }
+      Map<Key, Value> all = new HashMap<Key, Value>();
+      all.putAll(future);
+      all.putAll(assigned);
+      for (Entry<Key, Value> entry : all.entrySet()) {
+        TServerInstance alive = tserverSet.find(entry.getValue().toString());
+        if (alive == null) {
+          Master.log.info("Removing entry " + entry);
+          BatchWriter bw = getConnector().createBatchWriter(table, new BatchWriterConfig());
+          Mutation m = new Mutation(entry.getKey().getRow());
+          m.putDelete(entry.getKey().getColumnFamily(), entry.getKey().getColumnQualifier());
+          bw.addMutation(m);
+          bw.close();
+          return;
+        }
+      }
+      Master.log.error("Metadata table is inconsistent at " + row + " and all assigned/future tservers are still online.");
+    } catch (Throwable e) {
+      Master.log.error("Error attempting repair of metadata " + row + ": " + e, e);
+    }
+  }
+
     private int assignedOrHosted() {
       int result = 0;
       for (TableCounts counts : stats.getLast().values()) {

http://git-wip-us.apache.org/repos/asf/accumulo/blob/772ca16b/server/src/main/java/org/apache/accumulo/server/master/state/MetaDataTableScanner.java
----------------------------------------------------------------------
diff --git a/server/src/main/java/org/apache/accumulo/server/master/state/MetaDataTableScanner.java b/server/src/main/java/org/apache/accumulo/server/master/state/MetaDataTableScanner.java
index 2458a07..1bddb4b 100644
--- a/server/src/main/java/org/apache/accumulo/server/master/state/MetaDataTableScanner.java
+++ b/server/src/main/java/org/apache/accumulo/server/master/state/MetaDataTableScanner.java
@@ -108,14 +108,7 @@ public class MetaDataTableScanner implements Iterator<TabletLocationState> {
   
   @Override
   public TabletLocationState next() {
-    try {
       return fetch();
-    } catch (RuntimeException ex) {
-      // something is wrong with the records in the !METADATA table, just skip over it
-      log.error(ex, ex);
-      mdScanner.close();
-      return null;
-    } 
   }
   
   public static TabletLocationState createTabletLocationState(Key k, Value v) throws IOException, BadLocationStateException {
@@ -137,13 +130,13 @@ public class MetaDataTableScanner implements Iterator<TabletLocationState> {
       if (cf.compareTo(Constants.METADATA_FUTURE_LOCATION_COLUMN_FAMILY) == 0) {
         TServerInstance location = new TServerInstance(entry.getValue(), cq);
         if (future != null) {
-          throw new BadLocationStateException("found two assignments for the same extent " + key.getRow() + ": " + future + " and " + location);
+          throw new BadLocationStateException("found two assignments for the same extent " + key.getRow() + ": " + future + " and " + location, entry.getKey().getRow());
         }
         future = location;
       } else if (cf.compareTo(Constants.METADATA_CURRENT_LOCATION_COLUMN_FAMILY) == 0) {
         TServerInstance location = new TServerInstance(entry.getValue(), cq);
         if (current != null) {
-          throw new BadLocationStateException("found two locations for the same extent " + key.getRow() + ": " + current + " and " + location);
+          throw new BadLocationStateException("found two locations for the same extent " + key.getRow() + ": " + current + " and " + location, entry.getKey().getRow());
         }
         current = location;
       } else if (cf.compareTo(Constants.METADATA_LOG_COLUMN_FAMILY) == 0) {

http://git-wip-us.apache.org/repos/asf/accumulo/blob/772ca16b/server/src/main/java/org/apache/accumulo/server/master/state/TabletLocationState.java
----------------------------------------------------------------------
diff --git a/server/src/main/java/org/apache/accumulo/server/master/state/TabletLocationState.java b/server/src/main/java/org/apache/accumulo/server/master/state/TabletLocationState.java
index bcfaead..5432d32 100644
--- a/server/src/main/java/org/apache/accumulo/server/master/state/TabletLocationState.java
+++ b/server/src/main/java/org/apache/accumulo/server/master/state/TabletLocationState.java
@@ -21,6 +21,7 @@ import java.util.Collections;
 import java.util.Set;
 
 import org.apache.accumulo.core.data.KeyExtent;
+import org.apache.hadoop.io.Text;
 
 /**
  * When a tablet is assigned, we mark its future location. When the tablet is opened, we set its current location. A tablet should never have both a future and
@@ -33,8 +34,11 @@ public class TabletLocationState {
   
   static public class BadLocationStateException extends Exception {
     private static final long serialVersionUID = 1L;
+    private Text metadataTableEntry;
 
-    BadLocationStateException(String msg) { super(msg); }
+    BadLocationStateException(String msg, Text row) { super(msg); this.metadataTableEntry = row; }
+
+    public Text getEncodedEndRow() { return metadataTableEntry; }
   }
   
   public TabletLocationState(KeyExtent extent, TServerInstance future, TServerInstance current, TServerInstance last, Collection<Collection<String>> walogs,
@@ -48,7 +52,7 @@ public class TabletLocationState {
     this.walogs = walogs;
     this.chopped = chopped;
     if (current != null && future != null) {
-      throw new BadLocationStateException(extent + " is both assigned and hosted, which should never happen: " + this);
+      throw new BadLocationStateException(extent + " is both assigned and hosted, which should never happen: " + this, extent.getMetadataEntry());
     }
   }
   


Mime
View raw message