hadoop-common-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From asur...@apache.org
Subject [13/50] [abbrv] hadoop git commit: YARN-3254. HealthReport should include disk full information. Contributed by Suma Shivaprasad.
Date Thu, 24 Aug 2017 19:36:06 GMT
YARN-3254. HealthReport should include disk full information. Contributed by Suma Shivaprasad.


Project: http://git-wip-us.apache.org/repos/asf/hadoop/repo
Commit: http://git-wip-us.apache.org/repos/asf/hadoop/commit/f9a0e233
Tree: http://git-wip-us.apache.org/repos/asf/hadoop/tree/f9a0e233
Diff: http://git-wip-us.apache.org/repos/asf/hadoop/diff/f9a0e233

Branch: refs/heads/YARN-5972
Commit: f9a0e2338150f1bd3ba2c29f76979183fd3ed80c
Parents: 1f04cb4
Author: Sunil G <sunilg@apache.org>
Authored: Thu Aug 17 15:07:15 2017 +0530
Committer: Sunil G <sunilg@apache.org>
Committed: Thu Aug 17 15:07:15 2017 +0530

----------------------------------------------------------------------
 .../server/nodemanager/DirectoryCollection.java | 61 +++++++++++++++++++-
 .../nodemanager/LocalDirsHandlerService.java    | 59 +++++++++++++++----
 .../nodemanager/TestDirectoryCollection.java    | 23 ++++++++
 3 files changed, 130 insertions(+), 13 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/hadoop/blob/f9a0e233/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/DirectoryCollection.java
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/DirectoryCollection.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/DirectoryCollection.java
index ae2a4ef..502485f 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/DirectoryCollection.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/DirectoryCollection.java
@@ -38,6 +38,7 @@ import org.apache.commons.io.FileUtils;
 import org.apache.commons.lang.RandomStringUtils;
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.classification.InterfaceStability;
 import org.apache.hadoop.fs.FileAlreadyExistsException;
 import org.apache.hadoop.fs.FileContext;
 import org.apache.hadoop.fs.Path;
@@ -99,6 +100,7 @@ public class DirectoryCollection {
   private List<String> localDirs;
   private List<String> errorDirs;
   private List<String> fullDirs;
+  private Map<String, DiskErrorInformation> directoryErrorInfo;
 
   // read/write lock for accessing above directories.
   private final ReadLock readLock;
@@ -192,6 +194,7 @@ public class DirectoryCollection {
     localDirs = new CopyOnWriteArrayList<>(dirs);
     errorDirs = new CopyOnWriteArrayList<>();
     fullDirs = new CopyOnWriteArrayList<>();
+    directoryErrorInfo = new ConcurrentHashMap<>();
 
     ReentrantReadWriteLock lock = new ReentrantReadWriteLock();
     this.readLock = lock.readLock();
@@ -248,11 +251,25 @@ public class DirectoryCollection {
   /**
    * @return the directories that have used all disk space
    */
-
   List<String> getFullDirs() {
     this.readLock.lock();
     try {
-      return fullDirs;
+      return Collections.unmodifiableList(fullDirs);
+    } finally {
+      this.readLock.unlock();
+    }
+  }
+
+  /**
+   * @return the directories that have errors - may not have appropriate permissions
+   * or other disk validation checks might have failed in {@link DiskValidator}
+   *
+   */
+  @InterfaceStability.Evolving
+  List<String> getErroredDirs() {
+    this.readLock.lock();
+    try {
+      return Collections.unmodifiableList(errorDirs);
     } finally {
       this.readLock.unlock();
     }
@@ -271,6 +288,39 @@ public class DirectoryCollection {
   }
 
   /**
+   *
+   * @param dirName Absolute path of Directory for which error diagnostics are needed
+   * @return DiskErrorInformation - disk error diagnostics for the specified directory
+   *         null - the disk associated with the directory has passed disk utilization checks
+   *         /error validations in {@link DiskValidator}
+   *
+   */
+  @InterfaceStability.Evolving
+  DiskErrorInformation getDirectoryErrorInfo(String dirName) {
+    this.readLock.lock();
+    try {
+      return directoryErrorInfo.get(dirName);
+    } finally {
+      this.readLock.unlock();
+    }
+  }
+
+  /**
+   *
+   * @param dirName Absolute path of Directory for which the disk has been marked as unhealthy
+   * @return true if the disk associated with the directory is unhealthy
+   */
+  @InterfaceStability.Evolving
+  boolean isDiskUnHealthy(String dirName) {
+    this.readLock.lock();
+    try {
+      return directoryErrorInfo.containsKey(dirName);
+    } finally {
+      this.readLock.unlock();
+    }
+  }
+
+  /**
    * Create any non-existent directories and parent directories, updating the
    * list of valid directories if necessary.
    * @param localFs local file system to use
@@ -297,6 +347,9 @@ public class DirectoryCollection {
         try {
           localDirs.remove(dir);
           errorDirs.add(dir);
+          directoryErrorInfo.put(dir,
+              new DiskErrorInformation(DiskErrorCause.OTHER,
+                  "Cannot create directory : " + dir + ", error " + e.getMessage()));
           numFailures++;
         } finally {
           this.writeLock.unlock();
@@ -343,11 +396,13 @@ public class DirectoryCollection {
       localDirs.clear();
       errorDirs.clear();
       fullDirs.clear();
+      directoryErrorInfo.clear();
 
       for (Map.Entry<String, DiskErrorInformation> entry : dirsFailedCheck
           .entrySet()) {
         String dir = entry.getKey();
         DiskErrorInformation errorInformation = entry.getValue();
+
         switch (entry.getValue().cause) {
         case DISK_FULL:
           fullDirs.add(entry.getKey());
@@ -359,6 +414,8 @@ public class DirectoryCollection {
           LOG.warn(entry.getValue().cause + " is unknown for disk error.");
           break;
         }
+        directoryErrorInfo.put(entry.getKey(), errorInformation);
+
         if (preCheckGoodDirs.contains(dir)) {
           LOG.warn("Directory " + dir + " error, " + errorInformation.message
               + ", removing from list of valid directories");

http://git-wip-us.apache.org/repos/asf/hadoop/blob/f9a0e233/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/LocalDirsHandlerService.java
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/LocalDirsHandlerService.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/LocalDirsHandlerService.java
index f8cb4ee..6e00808 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/LocalDirsHandlerService.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/LocalDirsHandlerService.java
@@ -53,6 +53,8 @@ public class LocalDirsHandlerService extends AbstractService {
 
   private static Log LOG = LogFactory.getLog(LocalDirsHandlerService.class);
 
+  private static final String diskCapacityExceededErrorMsg =  "usable space is below configured utilization percentage/no more usable space";
+
   /**
    * Good local directories, use internally,
    * initial value is the same as NM_LOCAL_DIRS.
@@ -344,21 +346,36 @@ public class LocalDirsHandlerService extends AbstractService {
     }
 
     StringBuilder report = new StringBuilder();
-    List<String> failedLocalDirsList = localDirs.getFailedDirs();
-    List<String> failedLogDirsList = logDirs.getFailedDirs();
+    List<String> erroredLocalDirsList = localDirs.getErroredDirs();
+    List<String> erroredLogDirsList = logDirs.getErroredDirs();
+    List<String> diskFullLocalDirsList = localDirs.getFullDirs();
+    List<String> diskFullLogDirsList = logDirs.getFullDirs();
     List<String> goodLocalDirsList = localDirs.getGoodDirs();
     List<String> goodLogDirsList = logDirs.getGoodDirs();
-    int numLocalDirs = goodLocalDirsList.size() + failedLocalDirsList.size();
-    int numLogDirs = goodLogDirsList.size() + failedLogDirsList.size();
+
+    int numLocalDirs = goodLocalDirsList.size() + erroredLocalDirsList.size() + diskFullLocalDirsList.size();
+    int numLogDirs = goodLogDirsList.size() + erroredLogDirsList.size() + diskFullLogDirsList.size();
     if (!listGoodDirs) {
-      if (!failedLocalDirsList.isEmpty()) {
-        report.append(failedLocalDirsList.size() + "/" + numLocalDirs
-            + " local-dirs are bad: "
-            + StringUtils.join(",", failedLocalDirsList) + "; ");
+      if (!erroredLocalDirsList.isEmpty()) {
+        report.append(erroredLocalDirsList.size() + "/" + numLocalDirs
+            + " local-dirs have errors: "
+            + buildDiskErrorReport(erroredLocalDirsList, localDirs));
+      }
+      if (!diskFullLocalDirsList.isEmpty()) {
+        report.append(diskFullLocalDirsList.size() + "/" + numLocalDirs
+            + " local-dirs " + diskCapacityExceededErrorMsg
+            + buildDiskErrorReport(diskFullLocalDirsList, localDirs) + "; ");
       }
-      if (!failedLogDirsList.isEmpty()) {
-        report.append(failedLogDirsList.size() + "/" + numLogDirs
-            + " log-dirs are bad: " + StringUtils.join(",", failedLogDirsList));
+
+      if (!erroredLogDirsList.isEmpty()) {
+        report.append(erroredLogDirsList.size() + "/" + numLogDirs
+            + " log-dirs have errors: "
+            + buildDiskErrorReport(erroredLogDirsList, logDirs));
+      }
+      if (!diskFullLogDirsList.isEmpty()) {
+        report.append(diskFullLogDirsList.size() + "/" + numLogDirs
+            + " log-dirs " + diskCapacityExceededErrorMsg
+            + buildDiskErrorReport(diskFullLogDirsList, logDirs));
       }
     } else {
       report.append(goodLocalDirsList.size() + "/" + numLocalDirs
@@ -620,4 +637,24 @@ public class LocalDirsHandlerService extends AbstractService {
           logDirs.getGoodDirsDiskUtilizationPercentage());
     }
   }
+
+  private String buildDiskErrorReport(List<String> dirs, DirectoryCollection directoryCollection) {
+    StringBuilder sb = new StringBuilder();
+
+    sb.append(" [ ");
+    for (int i = 0; i < dirs.size(); i++) {
+      final String dirName = dirs.get(i);
+      if ( directoryCollection.isDiskUnHealthy(dirName)) {
+        sb.append(dirName + " : " + directoryCollection.getDirectoryErrorInfo(dirName).message);
+      } else {
+        sb.append(dirName + " : " + "Unknown cause for disk error");
+      }
+
+      if ( i != (dirs.size() - 1)) {
+        sb.append(" , ");
+      }
+    }
+    sb.append(" ] ");
+    return sb.toString();
+  }
 }

http://git-wip-us.apache.org/repos/asf/hadoop/blob/f9a0e233/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestDirectoryCollection.java
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestDirectoryCollection.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestDirectoryCollection.java
index e529628..095f21a 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestDirectoryCollection.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestDirectoryCollection.java
@@ -128,8 +128,12 @@ public class TestDirectoryCollection {
     DirectoryCollection dc = new DirectoryCollection(dirs, 0.0F);
     dc.checkDirs();
     Assert.assertEquals(0, dc.getGoodDirs().size());
+    Assert.assertEquals(0, dc.getErroredDirs().size());
     Assert.assertEquals(1, dc.getFailedDirs().size());
     Assert.assertEquals(1, dc.getFullDirs().size());
+    Assert.assertNotNull(dc.getDirectoryErrorInfo(dirA));
+    Assert.assertEquals(DirectoryCollection.DiskErrorCause.DISK_FULL, dc.getDirectoryErrorInfo(dirA).cause);
+
     // no good dirs
     Assert.assertEquals(0, dc.getGoodDirsDiskUtilizationPercentage());
 
@@ -139,16 +143,21 @@ public class TestDirectoryCollection {
             testDir.getTotalSpace());
     dc.checkDirs();
     Assert.assertEquals(1, dc.getGoodDirs().size());
+    Assert.assertEquals(0, dc.getErroredDirs().size());
     Assert.assertEquals(0, dc.getFailedDirs().size());
     Assert.assertEquals(0, dc.getFullDirs().size());
+    Assert.assertNull(dc.getDirectoryErrorInfo(dirA));
+
     Assert.assertEquals(utilizedSpacePerc,
       dc.getGoodDirsDiskUtilizationPercentage());
 
     dc = new DirectoryCollection(dirs, testDir.getTotalSpace() / (1024 * 1024));
     dc.checkDirs();
     Assert.assertEquals(0, dc.getGoodDirs().size());
+    Assert.assertEquals(0, dc.getErroredDirs().size());
     Assert.assertEquals(1, dc.getFailedDirs().size());
     Assert.assertEquals(1, dc.getFullDirs().size());
+    Assert.assertNotNull(dc.getDirectoryErrorInfo(dirA));
     // no good dirs
     Assert.assertEquals(0, dc.getGoodDirsDiskUtilizationPercentage());
 
@@ -158,8 +167,11 @@ public class TestDirectoryCollection {
             testDir.getTotalSpace());
     dc.checkDirs();
     Assert.assertEquals(1, dc.getGoodDirs().size());
+    Assert.assertEquals(0, dc.getErroredDirs().size());
     Assert.assertEquals(0, dc.getFailedDirs().size());
     Assert.assertEquals(0, dc.getFullDirs().size());
+    Assert.assertNull(dc.getDirectoryErrorInfo(dirA));
+
     Assert.assertEquals(utilizedSpacePerc,
       dc.getGoodDirsDiskUtilizationPercentage());
   }
@@ -209,12 +221,17 @@ public class TestDirectoryCollection {
     Assert.assertEquals(0, dc.getGoodDirs().size());
     Assert.assertEquals(1, dc.getFailedDirs().size());
     Assert.assertEquals(1, dc.getFullDirs().size());
+    Assert.assertEquals(0, dc.getErroredDirs().size());
+    Assert.assertNotNull(dc.getDirectoryErrorInfo(dirA));
+    Assert.assertEquals(DirectoryCollection.DiskErrorCause.DISK_FULL, dc.getDirectoryErrorInfo(dirA).cause);
 
     dc.setDiskUtilizationPercentageCutoff(100.0F, 100.0F);
     dc.checkDirs();
     Assert.assertEquals(1, dc.getGoodDirs().size());
     Assert.assertEquals(0, dc.getFailedDirs().size());
     Assert.assertEquals(0, dc.getFullDirs().size());
+    Assert.assertEquals(0, dc.getErroredDirs().size());
+    Assert.assertNull(dc.getDirectoryErrorInfo(dirA));
 
     conf.set(CommonConfigurationKeys.FS_PERMISSIONS_UMASK_KEY, "077");
 
@@ -232,12 +249,18 @@ public class TestDirectoryCollection {
     Assert.assertEquals(0, dc.getGoodDirs().size());
     Assert.assertEquals(1, dc.getFailedDirs().size());
     Assert.assertEquals(0, dc.getFullDirs().size());
+    Assert.assertEquals(1, dc.getErroredDirs().size());
+    Assert.assertNotNull(dc.getDirectoryErrorInfo(dirB));
+    Assert.assertEquals(DirectoryCollection.DiskErrorCause.OTHER, dc.getDirectoryErrorInfo(dirB).cause);
+
     permDirB = new FsPermission((short) 0700);
     localFs.setPermission(pathB, permDirB);
     dc.checkDirs();
     Assert.assertEquals(1, dc.getGoodDirs().size());
     Assert.assertEquals(0, dc.getFailedDirs().size());
     Assert.assertEquals(0, dc.getFullDirs().size());
+    Assert.assertEquals(0, dc.getErroredDirs().size());
+    Assert.assertNull(dc.getDirectoryErrorInfo(dirA));
   }
 
   @Test


---------------------------------------------------------------------
To unsubscribe, e-mail: common-commits-unsubscribe@hadoop.apache.org
For additional commands, e-mail: common-commits-help@hadoop.apache.org


Mime
View raw message