Return-Path: X-Original-To: apmail-hadoop-common-commits-archive@www.apache.org Delivered-To: apmail-hadoop-common-commits-archive@www.apache.org Received: from mail.apache.org (hermes.apache.org [140.211.11.3]) by minotaur.apache.org (Postfix) with SMTP id 6799918DBB for ; Tue, 19 Apr 2016 00:20:05 +0000 (UTC) Received: (qmail 92273 invoked by uid 500); 19 Apr 2016 00:20:02 -0000 Delivered-To: apmail-hadoop-common-commits-archive@hadoop.apache.org Received: (qmail 92164 invoked by uid 500); 19 Apr 2016 00:20:02 -0000 Mailing-List: contact common-commits-help@hadoop.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: common-dev@hadoop.apache.org Delivered-To: mailing list common-commits@hadoop.apache.org Received: (qmail 91151 invoked by uid 99); 19 Apr 2016 00:20:01 -0000 Received: from git1-us-west.apache.org (HELO git1-us-west.apache.org) (140.211.11.23) by apache.org (qpsmtpd/0.29) with ESMTP; Tue, 19 Apr 2016 00:20:01 +0000 Received: by git1-us-west.apache.org (ASF Mail Server at git1-us-west.apache.org, from userid 33) id 67545E0BAC; Tue, 19 Apr 2016 00:20:01 +0000 (UTC) Content-Type: text/plain; charset="us-ascii" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit From: aengineer@apache.org To: common-commits@hadoop.apache.org Date: Tue, 19 Apr 2016 00:20:09 -0000 Message-Id: <956e07fa646c439d9f4edb234bb1a23c@git.apache.org> In-Reply-To: <9ed1b04d772d4f8e8cfe494e3a268d28@git.apache.org> References: <9ed1b04d772d4f8e8cfe494e3a268d28@git.apache.org> X-Mailer: ASF-Git Admin Mailer Subject: [09/27] hadoop git commit: YARN-4924. NM recovery race can lead to container not cleaned up. Contributed by sandflee YARN-4924. NM recovery race can lead to container not cleaned up. Contributed by sandflee Project: http://git-wip-us.apache.org/repos/asf/hadoop/repo Commit: http://git-wip-us.apache.org/repos/asf/hadoop/commit/3150ae81 Tree: http://git-wip-us.apache.org/repos/asf/hadoop/tree/3150ae81 Diff: http://git-wip-us.apache.org/repos/asf/hadoop/diff/3150ae81 Branch: refs/heads/HDFS-7240 Commit: 3150ae8108a1fc40a67926be6254824c1e37cb38 Parents: a74580a Author: Jason Lowe Authored: Thu Apr 14 19:17:14 2016 +0000 Committer: Jason Lowe Committed: Thu Apr 14 19:17:14 2016 +0000 ---------------------------------------------------------------------- .../containermanager/ContainerManagerImpl.java | 17 ----- .../recovery/NMLeveldbStateStoreService.java | 80 ++++++++++++-------- .../recovery/NMNullStateStoreService.java | 4 - .../recovery/NMStateStoreService.java | 12 --- .../TestContainerManagerRecovery.java | 4 + .../recovery/NMMemoryStateStoreService.java | 10 --- .../TestNMLeveldbStateStoreService.java | 10 +-- 7 files changed, 54 insertions(+), 83 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/hadoop/blob/3150ae81/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/ContainerManagerImpl.java ---------------------------------------------------------------------- diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/ContainerManagerImpl.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/ContainerManagerImpl.java index 8d09aa7..b8cca28 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/ContainerManagerImpl.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/ContainerManagerImpl.java @@ -296,20 +296,8 @@ public class ContainerManagerImpl extends CompositeService implements if (LOG.isDebugEnabled()) { LOG.debug("Recovering container with state: " + rcs); } - recoverContainer(rcs); } - - String diagnostic = "Application marked finished during recovery"; - for (ApplicationId appId : appsState.getFinishedApplications()) { - - if (LOG.isDebugEnabled()) { - LOG.debug("Application marked finished during recovery: " + appId); - } - - dispatcher.getEventHandler().handle( - new ApplicationFinishEvent(appId, diagnostic)); - } } else { LOG.info("Not a recoverable state store. Nothing to recover."); } @@ -1332,11 +1320,6 @@ public class ContainerManagerImpl extends CompositeService implements } else if (appsFinishedEvent.getReason() == CMgrCompletedAppsEvent.Reason.BY_RESOURCEMANAGER) { diagnostic = "Application killed by ResourceManager"; } - try { - this.context.getNMStateStore().storeFinishedApplication(appID); - } catch (IOException e) { - LOG.error("Unable to update application state in store", e); - } this.dispatcher.getEventHandler().handle( new ApplicationFinishEvent(appID, diagnostic)); http://git-wip-us.apache.org/repos/asf/hadoop/blob/3150ae81/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/recovery/NMLeveldbStateStoreService.java ---------------------------------------------------------------------- diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/recovery/NMLeveldbStateStoreService.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/recovery/NMLeveldbStateStoreService.java index 81d6c57..26dea2d 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/recovery/NMLeveldbStateStoreService.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/recovery/NMLeveldbStateStoreService.java @@ -84,6 +84,7 @@ public class NMLeveldbStateStoreService extends NMStateStoreService { private static final String APPLICATIONS_KEY_PREFIX = "ContainerManager/applications/"; + @Deprecated private static final String FINISHED_APPS_KEY_PREFIX = "ContainerManager/finishedApps/"; @@ -392,20 +393,6 @@ public class NMLeveldbStateStoreService extends NMStateStoreService { state.applications.add( ContainerManagerApplicationProto.parseFrom(entry.getValue())); } - - state.finishedApplications = new ArrayList(); - keyPrefix = FINISHED_APPS_KEY_PREFIX; - iter.seek(bytes(keyPrefix)); - while (iter.hasNext()) { - Entry entry = iter.next(); - String key = asString(entry.getKey()); - if (!key.startsWith(keyPrefix)) { - break; - } - ApplicationId appId = - ConverterUtils.toApplicationId(key.substring(keyPrefix.length())); - state.finishedApplications.add(appId); - } } catch (DBException e) { throw new IOException(e); } finally { @@ -414,6 +401,8 @@ public class NMLeveldbStateStoreService extends NMStateStoreService { } } + cleanupDeprecatedFinishedApps(); + return state; } @@ -434,21 +423,6 @@ public class NMLeveldbStateStoreService extends NMStateStoreService { } @Override - public void storeFinishedApplication(ApplicationId appId) - throws IOException { - if (LOG.isDebugEnabled()) { - LOG.debug("storeFinishedApplication.appId: " + appId); - } - - String key = FINISHED_APPS_KEY_PREFIX + appId; - try { - db.put(bytes(key), new byte[0]); - } catch (DBException e) { - throw new IOException(e); - } - } - - @Override public void removeApplication(ApplicationId appId) throws IOException { if (LOG.isDebugEnabled()) { @@ -460,8 +434,6 @@ public class NMLeveldbStateStoreService extends NMStateStoreService { try { String key = APPLICATIONS_KEY_PREFIX + appId; batch.delete(bytes(key)); - key = FINISHED_APPS_KEY_PREFIX + appId; - batch.delete(bytes(key)); db.write(batch); } finally { batch.close(); @@ -979,6 +951,52 @@ public class NMLeveldbStateStoreService extends NMStateStoreService { } } + @SuppressWarnings("deprecation") + private void cleanupDeprecatedFinishedApps() { + try { + cleanupKeysWithPrefix(FINISHED_APPS_KEY_PREFIX); + } catch (Exception e) { + LOG.warn("cleanup keys with prefix " + FINISHED_APPS_KEY_PREFIX + + " from leveldb failed", e); + } + } + + private void cleanupKeysWithPrefix(String prefix) throws IOException { + WriteBatch batch = null; + LeveldbIterator iter = null; + try { + iter = new LeveldbIterator(db); + try { + batch = db.createWriteBatch(); + iter.seek(bytes(prefix)); + while (iter.hasNext()) { + byte[] key = iter.next().getKey(); + String keyStr = asString(key); + if (!keyStr.startsWith(prefix)) { + break; + } + batch.delete(key); + if (LOG.isDebugEnabled()) { + LOG.debug("cleanup " + keyStr + " from leveldb"); + } + } + db.write(batch); + } catch (DBException e) { + throw new IOException(e); + } finally { + if (batch != null) { + batch.close(); + } + } + } catch (DBException e) { + throw new IOException(e); + } finally { + if (iter != null) { + iter.close(); + } + } + } + private String getLogDeleterKey(ApplicationId appId) { return LOG_DELETER_KEY_PREFIX + appId; } http://git-wip-us.apache.org/repos/asf/hadoop/blob/3150ae81/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/recovery/NMNullStateStoreService.java ---------------------------------------------------------------------- diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/recovery/NMNullStateStoreService.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/recovery/NMNullStateStoreService.java index d5dce9b..a887e71 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/recovery/NMNullStateStoreService.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/recovery/NMNullStateStoreService.java @@ -59,10 +59,6 @@ public class NMNullStateStoreService extends NMStateStoreService { } @Override - public void storeFinishedApplication(ApplicationId appId) { - } - - @Override public void removeApplication(ApplicationId appId) throws IOException { } http://git-wip-us.apache.org/repos/asf/hadoop/blob/3150ae81/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/recovery/NMStateStoreService.java ---------------------------------------------------------------------- diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/recovery/NMStateStoreService.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/recovery/NMStateStoreService.java index 84c5aa9..463815e 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/recovery/NMStateStoreService.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/recovery/NMStateStoreService.java @@ -52,15 +52,11 @@ public abstract class NMStateStoreService extends AbstractService { public static class RecoveredApplicationsState { List applications; - List finishedApplications; public List getApplications() { return applications; } - public List getFinishedApplications() { - return finishedApplications; - } } public enum RecoveredContainerStatus { @@ -259,14 +255,6 @@ public abstract class NMStateStoreService extends AbstractService { ContainerManagerApplicationProto p) throws IOException; /** - * Record that an application has finished - * @param appId the application ID - * @throws IOException - */ - public abstract void storeFinishedApplication(ApplicationId appId) - throws IOException; - - /** * Remove records corresponding to an application * @param appId the application ID * @throws IOException http://git-wip-us.apache.org/repos/asf/hadoop/blob/3150ae81/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/TestContainerManagerRecovery.java ---------------------------------------------------------------------- diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/TestContainerManagerRecovery.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/TestContainerManagerRecovery.java index 2e014de..9fa3fcc 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/TestContainerManagerRecovery.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/TestContainerManagerRecovery.java @@ -259,6 +259,10 @@ public class TestContainerManagerRecovery extends BaseContainerManagerTest { assertEquals(1, context.getApplications().size()); app = context.getApplications().get(appId); assertNotNull(app); + // no longer saving FINISH_APP event in NM stateStore, + // simulate by resending FINISH_APP event + cm.handle(new CMgrCompletedAppsEvent(finishedApps, + CMgrCompletedAppsEvent.Reason.BY_RESOURCEMANAGER)); waitForAppState(app, ApplicationState.APPLICATION_RESOURCES_CLEANINGUP); assertTrue(context.getApplicationACLsManager().checkAccess( UserGroupInformation.createRemoteUser(modUser), http://git-wip-us.apache.org/repos/asf/hadoop/blob/3150ae81/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/recovery/NMMemoryStateStoreService.java ---------------------------------------------------------------------- diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/recovery/NMMemoryStateStoreService.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/recovery/NMMemoryStateStoreService.java index a1c95ab..1279896 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/recovery/NMMemoryStateStoreService.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/recovery/NMMemoryStateStoreService.java @@ -44,7 +44,6 @@ import org.apache.hadoop.yarn.server.api.records.impl.pb.MasterKeyPBImpl; public class NMMemoryStateStoreService extends NMStateStoreService { private Map apps; - private Set finishedApps; private Map containerStates; private Map trackerStates; private Map deleteTasks; @@ -59,7 +58,6 @@ public class NMMemoryStateStoreService extends NMStateStoreService { @Override protected void initStorage(Configuration conf) { apps = new HashMap(); - finishedApps = new HashSet(); containerStates = new HashMap(); nmTokenState = new RecoveredNMTokensState(); nmTokenState.applicationMasterKeys = @@ -86,7 +84,6 @@ public class NMMemoryStateStoreService extends NMStateStoreService { RecoveredApplicationsState state = new RecoveredApplicationsState(); state.applications = new ArrayList( apps.values()); - state.finishedApplications = new ArrayList(finishedApps); return state; } @@ -99,15 +96,9 @@ public class NMMemoryStateStoreService extends NMStateStoreService { } @Override - public synchronized void storeFinishedApplication(ApplicationId appId) { - finishedApps.add(appId); - } - - @Override public synchronized void removeApplication(ApplicationId appId) throws IOException { apps.remove(appId); - finishedApps.remove(appId); } @Override @@ -393,7 +384,6 @@ public class NMMemoryStateStoreService extends NMStateStoreService { logDeleterState.remove(appId); } - private static class TrackerState { Map inProgressMap = new HashMap(); http://git-wip-us.apache.org/repos/asf/hadoop/blob/3150ae81/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/recovery/TestNMLeveldbStateStoreService.java ---------------------------------------------------------------------- diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/recovery/TestNMLeveldbStateStoreService.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/recovery/TestNMLeveldbStateStoreService.java index 08b49e7..47468d6 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/recovery/TestNMLeveldbStateStoreService.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/recovery/TestNMLeveldbStateStoreService.java @@ -174,7 +174,6 @@ public class TestNMLeveldbStateStoreService { // test empty when no state RecoveredApplicationsState state = stateStore.loadApplicationsState(); assertTrue(state.getApplications().isEmpty()); - assertTrue(state.getFinishedApplications().isEmpty()); // store an application and verify recovered final ApplicationId appId1 = ApplicationId.newInstance(1234, 1); @@ -188,10 +187,8 @@ public class TestNMLeveldbStateStoreService { state = stateStore.loadApplicationsState(); assertEquals(1, state.getApplications().size()); assertEquals(appProto1, state.getApplications().get(0)); - assertTrue(state.getFinishedApplications().isEmpty()); - // finish an application and add a new one - stateStore.storeFinishedApplication(appId1); + // add a new app final ApplicationId appId2 = ApplicationId.newInstance(1234, 2); builder = ContainerManagerApplicationProto.newBuilder(); builder.setId(((ApplicationIdPBImpl) appId2).getProto()); @@ -203,18 +200,13 @@ public class TestNMLeveldbStateStoreService { assertEquals(2, state.getApplications().size()); assertTrue(state.getApplications().contains(appProto1)); assertTrue(state.getApplications().contains(appProto2)); - assertEquals(1, state.getFinishedApplications().size()); - assertEquals(appId1, state.getFinishedApplications().get(0)); // test removing an application - stateStore.storeFinishedApplication(appId2); stateStore.removeApplication(appId2); restartStateStore(); state = stateStore.loadApplicationsState(); assertEquals(1, state.getApplications().size()); assertEquals(appProto1, state.getApplications().get(0)); - assertEquals(1, state.getFinishedApplications().size()); - assertEquals(appId1, state.getFinishedApplications().get(0)); } @Test