hadoop-common-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From ka...@apache.org
Subject [05/50] [abbrv] hadoop git commit: Incorrect locking in FsVolumeList#checkDirs can hang datanodes (Noah Lorang via Colin P. McCabe)
Date Mon, 15 Dec 2014 18:35:56 GMT
Incorrect locking in FsVolumeList#checkDirs can hang datanodes (Noah Lorang via Colin P. McCabe)


Project: http://git-wip-us.apache.org/repos/asf/hadoop/repo
Commit: http://git-wip-us.apache.org/repos/asf/hadoop/commit/d8352b9b
Tree: http://git-wip-us.apache.org/repos/asf/hadoop/tree/d8352b9b
Diff: http://git-wip-us.apache.org/repos/asf/hadoop/diff/d8352b9b

Branch: refs/heads/YARN-2139
Commit: d8352b9b2b99aa46679c5880a724ba3f0ceb41ff
Parents: be86237
Author: Colin Patrick Mccabe <cmccabe@cloudera.com>
Authored: Tue Dec 9 10:55:17 2014 -0800
Committer: Colin Patrick Mccabe <cmccabe@cloudera.com>
Committed: Tue Dec 9 10:56:46 2014 -0800

----------------------------------------------------------------------
 hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt     |  3 ++
 .../datanode/fsdataset/impl/FsVolumeList.java   | 56 ++++++++++----------
 2 files changed, 31 insertions(+), 28 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/hadoop/blob/d8352b9b/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt
----------------------------------------------------------------------
diff --git a/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt b/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt
index 55026a2..626d90a 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt
+++ b/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt
@@ -574,6 +574,9 @@ Release 2.6.1 - UNRELEASED
     HDFS-4882. Prevent the Namenode's LeaseManager from looping forever in
     checkLeases (Ravi Prakash via Colin P. McCabe)
 
+    HDFS-7489. Incorrect locking in FsVolumeList#checkDirs can hang datanodes
+    (Noah Lorang via Colin P. McCabe)
+
 Release 2.6.0 - 2014-11-18
 
   INCOMPATIBLE CHANGES

http://git-wip-us.apache.org/repos/asf/hadoop/blob/d8352b9b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/fsdataset/impl/FsVolumeList.java
----------------------------------------------------------------------
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/fsdataset/impl/FsVolumeList.java
b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/fsdataset/impl/FsVolumeList.java
index 837ddf7..55329ae 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/fsdataset/impl/FsVolumeList.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/fsdataset/impl/FsVolumeList.java
@@ -36,6 +36,7 @@ class FsVolumeList {
    * This list is replaced on modification holding "this" lock.
    */
   volatile List<FsVolumeImpl> volumes = null;
+  private Object checkDirsMutex = new Object();
 
   private final VolumeChoosingPolicy<FsVolumeImpl> blockChooser;
   private volatile int numFailedVolumes;
@@ -167,40 +168,39 @@ class FsVolumeList {
    * Calls {@link FsVolumeImpl#checkDirs()} on each volume, removing any
    * volumes from the active list that result in a DiskErrorException.
    * 
-   * This method is synchronized to allow only one instance of checkDirs() 
-   * call
+   * Use checkDirsMutext to allow only one instance of checkDirs() call
+   *
    * @return list of all the removed volumes.
    */
-  synchronized List<FsVolumeImpl> checkDirs() {
-    ArrayList<FsVolumeImpl> removedVols = null;
-    
-    // Make a copy of volumes for performing modification 
-    final List<FsVolumeImpl> volumeList = new ArrayList<FsVolumeImpl>(volumes);
+  List<FsVolumeImpl> checkDirs() {
+    synchronized(checkDirsMutex) {
+      ArrayList<FsVolumeImpl> removedVols = null;
+      
+      // Make a copy of volumes for performing modification 
+      final List<FsVolumeImpl> volumeList = new ArrayList<FsVolumeImpl>(volumes);
 
-    for(Iterator<FsVolumeImpl> i = volumeList.iterator(); i.hasNext(); ) {
-      final FsVolumeImpl fsv = i.next();
-      try {
-        fsv.checkDirs();
-      } catch (DiskErrorException e) {
-        FsDatasetImpl.LOG.warn("Removing failed volume " + fsv + ": ",e);
-        if (removedVols == null) {
-          removedVols = new ArrayList<FsVolumeImpl>(1);
+      for(Iterator<FsVolumeImpl> i = volumeList.iterator(); i.hasNext(); ) {
+        final FsVolumeImpl fsv = i.next();
+        try {
+          fsv.checkDirs();
+        } catch (DiskErrorException e) {
+          FsDatasetImpl.LOG.warn("Removing failed volume " + fsv + ": ",e);
+          if (removedVols == null) {
+            removedVols = new ArrayList<FsVolumeImpl>(1);
+          }
+          removedVols.add(fsv);
+          removeVolume(fsv.getBasePath());
+          numFailedVolumes++;
         }
-        removedVols.add(fsv);
-        fsv.shutdown(); 
-        i.remove(); // Remove the volume
-        numFailedVolumes++;
       }
-    }
-    
-    if (removedVols != null && removedVols.size() > 0) {
-      // Replace volume list
-      volumes = Collections.unmodifiableList(volumeList);
-      FsDatasetImpl.LOG.warn("Completed checkDirs. Removed " + removedVols.size()
-          + " volumes. Current volumes: " + this);
-    }
+      
+      if (removedVols != null && removedVols.size() > 0) {
+        FsDatasetImpl.LOG.warn("Completed checkDirs. Removed " + removedVols.size()
+            + " volumes. Current volumes: " + this);
+      }
 
-    return removedVols;
+      return removedVols;
+    }
   }
 
   @Override


Mime
View raw message