ambari-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From mpapirkovs...@apache.org
Subject [1/2] ambari git commit: AMBARI-20269. Failed task during EU is not reported upfront causing Upgrade to show 'Aborted' after Finalize step. (mpapirkovskyy)
Date Fri, 03 Mar 2017 09:46:01 GMT
Repository: ambari
Updated Branches:
  refs/heads/branch-2.5 97c29893c -> d567127ae
  refs/heads/trunk aebe216d0 -> 0e4819ae7


AMBARI-20269. Failed task during EU is not reported upfront causing Upgrade to show 'Aborted'
after Finalize step. (mpapirkovskyy)


Project: http://git-wip-us.apache.org/repos/asf/ambari/repo
Commit: http://git-wip-us.apache.org/repos/asf/ambari/commit/0e4819ae
Tree: http://git-wip-us.apache.org/repos/asf/ambari/tree/0e4819ae
Diff: http://git-wip-us.apache.org/repos/asf/ambari/diff/0e4819ae

Branch: refs/heads/trunk
Commit: 0e4819ae7b45fc659067b83d1cba0f97435528bd
Parents: aebe216
Author: Myroslav Papirkovskyi <mpapyrkovskyy@hortonworks.com>
Authored: Thu Mar 2 21:12:05 2017 +0200
Committer: Myroslav Papirkovskyi <mpapyrkovskyy@hortonworks.com>
Committed: Fri Mar 3 11:45:25 2017 +0200

----------------------------------------------------------------------
 .../server/actionmanager/ActionDBAccessor.java  |  4 +-
 .../actionmanager/ActionDBAccessorImpl.java     | 11 +++--
 .../server/actionmanager/ActionScheduler.java   | 30 ++----------
 .../actionmanager/TestActionScheduler.java      | 51 +++++++++-----------
 4 files changed, 37 insertions(+), 59 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/ambari/blob/0e4819ae/ambari-server/src/main/java/org/apache/ambari/server/actionmanager/ActionDBAccessor.java
----------------------------------------------------------------------
diff --git a/ambari-server/src/main/java/org/apache/ambari/server/actionmanager/ActionDBAccessor.java b/ambari-server/src/main/java/org/apache/ambari/server/actionmanager/ActionDBAccessor.java
index 217fe0a..9325d03 100644
--- a/ambari-server/src/main/java/org/apache/ambari/server/actionmanager/ActionDBAccessor.java
+++ b/ambari-server/src/main/java/org/apache/ambari/server/actionmanager/ActionDBAccessor.java
@@ -72,8 +72,8 @@ public interface ActionDBAccessor {
   /**
    * Mark the task as to have timed out
    */
-  void timeoutHostRole(String host, long requestId, long stageId,
-                       String role, boolean skipSupported);
+  void timeoutHostRole(String host, long requestId, long stageId, String role,
+                       boolean skipSupported, boolean hostUnknownState);
 
   /**
    * Returns all the pending stages, including queued and not-queued. A stage is

http://git-wip-us.apache.org/repos/asf/ambari/blob/0e4819ae/ambari-server/src/main/java/org/apache/ambari/server/actionmanager/ActionDBAccessorImpl.java
----------------------------------------------------------------------
diff --git a/ambari-server/src/main/java/org/apache/ambari/server/actionmanager/ActionDBAccessorImpl.java b/ambari-server/src/main/java/org/apache/ambari/server/actionmanager/ActionDBAccessorImpl.java
index b813fe6..14f02d2 100644
--- a/ambari-server/src/main/java/org/apache/ambari/server/actionmanager/ActionDBAccessorImpl.java
+++ b/ambari-server/src/main/java/org/apache/ambari/server/actionmanager/ActionDBAccessorImpl.java
@@ -241,20 +241,21 @@ public class ActionDBAccessorImpl implements ActionDBAccessor {
   @Override
   public void timeoutHostRole(String host, long requestId, long stageId,
                               String role) {
-    timeoutHostRole(host, requestId, stageId, role, false);
+    timeoutHostRole(host, requestId, stageId, role, false, false);
   }
 
   @Override
-  public void timeoutHostRole(String host, long requestId, long stageId,
-                              String role, boolean skipSupported) {
+  public void timeoutHostRole(String host, long requestId, long stageId, String role,
+                              boolean skipSupported, boolean hostUnknownState) {
     long now = System.currentTimeMillis();
     List<HostRoleCommandEntity> commands =
-            hostRoleCommandDAO.findByHostRole(host, requestId, stageId, role);
+      hostRoleCommandDAO.findByHostRole(host, requestId, stageId, role);
     for (HostRoleCommandEntity command : commands) {
       if (skipSupported) {
         command.setStatus(HostRoleStatus.SKIPPED_FAILED);
       } else {
-        command.setStatus(command.isRetryAllowed() ? HostRoleStatus.HOLDING_TIMEDOUT : HostRoleStatus.TIMEDOUT);
+        command.setStatus(command.isRetryAllowed() ? HostRoleStatus.HOLDING_TIMEDOUT :
+          hostUnknownState ? HostRoleStatus.ABORTED : HostRoleStatus.TIMEDOUT);
       }
 
       command.setEndTime(now);

http://git-wip-us.apache.org/repos/asf/ambari/blob/0e4819ae/ambari-server/src/main/java/org/apache/ambari/server/actionmanager/ActionScheduler.java
----------------------------------------------------------------------
diff --git a/ambari-server/src/main/java/org/apache/ambari/server/actionmanager/ActionScheduler.java b/ambari-server/src/main/java/org/apache/ambari/server/actionmanager/ActionScheduler.java
index a92c03c..c8e4532 100644
--- a/ambari-server/src/main/java/org/apache/ambari/server/actionmanager/ActionScheduler.java
+++ b/ambari-server/src/main/java/org/apache/ambari/server/actionmanager/ActionScheduler.java
@@ -817,6 +817,7 @@ class ActionScheduler implements Runnable {
         }
 
         // Check that service host component is not deleted
+        boolean isHostStateUnknown = false;
         if (hostDeleted) {
 
           String message = String.format(
@@ -834,9 +835,10 @@ class ActionScheduler implements Runnable {
             processActionDeath(cluster.getClusterName(), c.getHostname(), roleStr);
           }
           status = HostRoleStatus.ABORTED;
-        } else if (timeOutActionNeeded(status, s, hostObj, roleStr, now, commandTimeout)) {
+        } else if (timeOutActionNeeded(status, s, hostObj, roleStr, now, commandTimeout)
+          || (isHostStateUnknown = isHostStateUnknown(s, hostObj, roleStr))) {
           // Process command timeouts
-          if (s.getAttemptCount(host, roleStr) >= maxAttempts) {
+          if (s.getAttemptCount(host, roleStr) >= maxAttempts || isHostStateUnknown) {
             LOG.warn("Host: {}, role: {}, actionId: {} expired and will be failed", host, roleStr,
               s.getActionId());
 
@@ -847,7 +849,7 @@ class ActionScheduler implements Runnable {
               isSkipSupported = hostRoleCommand.isFailureAutoSkipped();
             }
 
-            db.timeoutHostRole(host, s.getRequestId(), s.getStageId(), c.getRole(), isSkipSupported);
+            db.timeoutHostRole(host, s.getRequestId(), s.getStageId(), c.getRole(), isSkipSupported, isHostStateUnknown);
             //Reinitialize status
             status = s.getHostRoleStatus(host, roleStr);
 
@@ -876,28 +878,6 @@ class ActionScheduler implements Runnable {
             commandsToSchedule.add(c);
             LOG.trace("===> commandsToSchedule(reschedule)=" + commandsToSchedule.size());
           }
-        } else if (isHostStateUnknown(s, hostObj, roleStr)) {
-          String message = "Action was aborted due agent is not heartbeating or was restarted.";
-          LOG.warn("Host: {}, role: {}, actionId: {} . {}", host, roleStr,
-            s.getActionId(), message);
-
-          db.abortHostRole(host, s.getRequestId(), s.getStageId(), c.getRole(), message);
-
-          if (null != cluster) {
-            if (!RoleCommand.CUSTOM_COMMAND.equals(c.getRoleCommand())
-              && !RoleCommand.SERVICE_CHECK.equals(c.getRoleCommand())
-              && !RoleCommand.ACTIONEXECUTE.equals(c.getRoleCommand())) {
-              //commands above don't affect host component state (e.g. no in_progress state in process), transition will fail
-              transitionToFailedState(cluster.getClusterName(), c.getServiceName(), roleStr, host, now, false);
-            }
-            if (c.getRoleCommand().equals(RoleCommand.ACTIONEXECUTE)) {
-              processActionDeath(cluster.getClusterName(), c.getHostname(), roleStr);
-            }
-          }
-
-          // Dequeue command
-          LOG.info("Removing command from queue, host={}, commandId={} ", host, c.getCommandId());
-          actionQueue.dequeue(host, c.getCommandId());
         } else if (status.equals(HostRoleStatus.PENDING)) {
           // in case of DEPENDENCY_ORDERED stage command can be scheduled only if all of it's dependencies are
           // already finished

http://git-wip-us.apache.org/repos/asf/ambari/blob/0e4819ae/ambari-server/src/test/java/org/apache/ambari/server/actionmanager/TestActionScheduler.java
----------------------------------------------------------------------
diff --git a/ambari-server/src/test/java/org/apache/ambari/server/actionmanager/TestActionScheduler.java b/ambari-server/src/test/java/org/apache/ambari/server/actionmanager/TestActionScheduler.java
index 526ca7c..718c7ce 100644
--- a/ambari-server/src/test/java/org/apache/ambari/server/actionmanager/TestActionScheduler.java
+++ b/ambari-server/src/test/java/org/apache/ambari/server/actionmanager/TestActionScheduler.java
@@ -335,7 +335,7 @@ public class TestActionScheduler {
         command.setStatus(HostRoleStatus.TIMEDOUT);
         return null;
       }
-    }).when(db).timeoutHostRole(anyString(), anyLong(), anyLong(), anyString(), anyBoolean());
+    }).when(db).timeoutHostRole(anyString(), anyLong(), anyLong(), anyString(), anyBoolean(), eq(false));
 
 
     //Small action timeout to test rescheduling
@@ -416,32 +416,18 @@ public class TestActionScheduler {
     doAnswer(new Answer<Void>() {
       @Override
       public Void answer(InvocationOnMock invocation) throws Throwable {
-        Long requestId = (Long) invocation.getArguments()[1];
-        for (Stage stage : stages) {
-          if (requestId.equals(stage.getRequestId())) {
-            for (HostRoleCommand command : stage.getOrderedHostRoleCommands()) {
-              if (command.getStatus() == HostRoleStatus.QUEUED ||
-                command.getStatus() == HostRoleStatus.IN_PROGRESS ||
-                command.getStatus() == HostRoleStatus.PENDING) {
-                command.setStatus(HostRoleStatus.ABORTED);
-              }
-            }
-          }
-        }
-
+        String host = (String) invocation.getArguments()[0];
+        String role = (String) invocation.getArguments()[3];
+        HostRoleCommand command = s.getHostRoleCommand(host, role);
+        command.setStatus(HostRoleStatus.ABORTED);
         return null;
       }
-    }).when(db).abortHostRole(anyString(), anyLong(), anyLong(), anyString(), anyString());
+    }).when(db).timeoutHostRole(anyString(), anyLong(), anyLong(), anyString(), anyBoolean(), eq(true));
 
     //Small action timeout to test rescheduling
     AmbariEventPublisher aep = EasyMock.createNiceMock(AmbariEventPublisher.class);
-    ActionScheduler scheduler = EasyMock.createMockBuilder(ActionScheduler.class).
-        withConstructor((long) 100, (long) 50, db, aq, fsm, 3,
-            new HostsMap((String) null), unitOfWork, aep, conf, entityManagerProviderMock,
-            mock(HostRoleCommandDAO.class), mock(HostRoleCommandFactory.class)).
-        addMockedMethod("cancelHostRoleCommands").
-        createMock();
-    EasyMock.replay(scheduler);
+    ActionScheduler scheduler = new ActionScheduler(100, 0, db, aq, fsm, 3,
+      new HostsMap((String) null), unitOfWork, null, conf, entityManagerProviderMock, hostRoleCommandDAOMock, null);
     scheduler.setTaskTimeoutAdjustment(false);
 
     int cycleCount=0;
@@ -452,7 +438,7 @@ public class TestActionScheduler {
 
     Assert.assertEquals(HostRoleStatus.ABORTED,stages.get(0).getHostRoleStatus(hostname, "NAMENODE"));
 
-    EasyMock.verify(scheduler, entityManagerProviderMock);
+    EasyMock.verify(entityManagerProviderMock);
   }
 
   @Test
@@ -499,7 +485,7 @@ public class TestActionScheduler {
     when(serviceObj.getCluster()).thenReturn(oneClusterMock);
 
     final List<Stage> stages = new ArrayList<Stage>();
-    Stage stage = stageFactory.createNew(1, "/tmp", "cluster1", 1L, "stageWith2Tasks",
+    final Stage stage = stageFactory.createNew(1, "/tmp", "cluster1", 1L, "stageWith2Tasks",
       CLUSTER_HOST_INFO, "{\"command_param\":\"param_value\"}", "{\"host_param\":\"param_value\"}");
     addInstallTaskToStage(stage, hostname1, "cluster1", Role.DATANODE,
       RoleCommand.INSTALL, Service.Type.HDFS, 1);
@@ -518,10 +504,21 @@ public class TestActionScheduler {
     HostRoleCommandDAO hostRoleCommandDAOMock = mock(HostRoleCommandDAO.class);
     Mockito.doNothing().when(hostRoleCommandDAOMock).publishTaskCreateEvent(anyListOf(HostRoleCommand.class));
 
+    doAnswer(new Answer<Void>() {
+      @Override
+      public Void answer(InvocationOnMock invocation) throws Throwable {
+        String host = (String) invocation.getArguments()[0];
+        String role = (String) invocation.getArguments()[3];
+        HostRoleCommand command = stage.getHostRoleCommand(host, role);
+        command.setStatus(HostRoleStatus.ABORTED);
+        return null;
+      }
+    }).when(db).timeoutHostRole(anyString(), anyLong(), anyLong(), anyString(), anyBoolean(), eq(true));
+
     doAnswer(new Answer<Collection<HostRoleCommandEntity>>() {
       @Override
       public Collection<HostRoleCommandEntity> answer(InvocationOnMock invocation) throws Throwable {
-        Long requestId = (Long) invocation.getArguments()[1];
+        Long requestId = (Long) invocation.getArguments()[0];
         List<HostRoleCommandEntity> abortedCommands = Lists.newArrayList();
         for (Stage stage : stages) {
           if (requestId.equals(stage.getRequestId())) {
@@ -542,7 +539,7 @@ public class TestActionScheduler {
 
         return abortedCommands;
       }
-    }).when(db).abortHostRole(anyString(), anyLong(), anyLong(), anyString(), anyString());
+    }).when(db).abortOperation(anyLong());
 
     ArgumentCaptor<ServiceComponentHostEvent> eventsCapture1 =
       ArgumentCaptor.forClass(ServiceComponentHostEvent.class);
@@ -954,7 +951,7 @@ public class TestActionScheduler {
 
     boolean taskShouldBeSkipped = stageSupportsAutoSkip && autoSkipFailedTask;
     db.timeoutHostRole(EasyMock.anyString(), EasyMock.anyLong(), EasyMock.anyLong(),
-        EasyMock.anyString(), EasyMock.eq(taskShouldBeSkipped));
+        EasyMock.anyString(), EasyMock.eq(taskShouldBeSkipped), EasyMock.anyBoolean());
 
     EasyMock.expectLastCall();
 


Mime
View raw message