accumulo-notifications mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From GitBox <...@apache.org>
Subject [GitHub] keith-turner commented on a change in pull request #402: ACCUMULO-4615: Updated get status for thread safety and with a per-task timeout
Date Tue, 27 Mar 2018 16:00:16 GMT
keith-turner commented on a change in pull request #402: ACCUMULO-4615: Updated get status
for thread safety and with a per-task timeout
URL: https://github.com/apache/accumulo/pull/402#discussion_r177479703
 
 

 ##########
 File path: server/master/src/main/java/org/apache/accumulo/master/Master.java
 ##########
 @@ -1074,60 +1077,92 @@ private long balanceTablets() {
 
   private SortedMap<TServerInstance,TabletServerStatus> gatherTableInformation(Set<TServerInstance>
currentServers) {
     long start = System.currentTimeMillis();
+
     int threads = Math.max(getConfiguration().getCount(Property.MASTER_STATUS_THREAD_POOL_SIZE),
1);
-    ExecutorService tp = Executors.newFixedThreadPool(threads);
-    final SortedMap<TServerInstance,TabletServerStatus> result = new TreeMap<>();
-    for (TServerInstance serverInstance : currentServers) {
-      final TServerInstance server = serverInstance;
-      tp.submit(new Runnable() {
+    long timeout = getConfiguration().getTimeInMillis(Property.MASTER_STATUS_THREAD_TIMEOUT);
+    final SortedMap<TServerInstance,TabletServerStatus> results = new ConcurrentSkipListMap<>();
+
+    try (TimeoutTaskExecutor<TabletServerStatus,GetTServerStatus> executor = new TimeoutTaskExecutor<>(threads,
timeout, currentServers.size())) {
+      executor.onSuccess(new SuccessCallback<TabletServerStatus,GetTServerStatus>()
{
         @Override
-        public void run() {
-          try {
-            Thread t = Thread.currentThread();
-            String oldName = t.getName();
-            try {
-              t.setName("Getting status from " + server);
-              TServerConnection connection = tserverSet.getConnection(server);
-              if (connection == null)
-                throw new IOException("No connection to " + server);
-              TabletServerStatus status = connection.getTableMap(false);
-              result.put(server, status);
-            } finally {
-              t.setName(oldName);
-            }
-          } catch (Exception ex) {
-            log.error("unable to get tablet server status " + server + " " + ex.toString());
-            log.debug("unable to get tablet server status " + server, ex);
-            if (badServers.get(server).incrementAndGet() > MAX_BAD_STATUS_COUNT) {
-              log.warn("attempting to stop " + server);
-              try {
-                TServerConnection connection = tserverSet.getConnection(server);
-                if (connection != null) {
-                  connection.halt(masterLock);
-                }
-              } catch (TTransportException e) {
-                // ignore: it's probably down
-              } catch (Exception e) {
-                log.info("error talking to troublesome tablet server ", e);
-              }
-              badServers.remove(server);
-            }
-          }
+        public void accept(GetTServerStatus task, TabletServerStatus result) {
+          results.put(task.server, result);
         }
       });
-    }
-    tp.shutdown();
-    try {
-      tp.awaitTermination(getConfiguration().getTimeInMillis(Property.TSERV_CLIENT_TIMEOUT)
* 2, TimeUnit.MILLISECONDS);
+
+      executor.onException(new ExceptionCallback<GetTServerStatus>() {
+        @Override
+        public void accept(GetTServerStatus task, Exception e) {
+          log.error("Exception occurred while getting status from " + task.server, e);
+          handleBadServer(task.server);
+        }
+      });
+
+      executor.onTimeout(new TimeoutCallback<GetTServerStatus>() {
+        @Override
+        public void accept(GetTServerStatus task) {
+          log.warn("Timed out while fetching status from " + task.server);
+          handleBadServer(task.server);
 
 Review comment:
   This seems like a new behavior, where if tablet servers do not respond within 6 seconds
for a few times they will be killed??  Not sure if this is desirable.

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
users@infra.apache.org


With regards,
Apache Git Services

Mime
View raw message