hadoop-common-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From cutt...@apache.org
Subject svn commit: r395067 - in /lucene/hadoop/trunk: ./ src/java/org/apache/hadoop/mapred/
Date Tue, 18 Apr 2006 22:04:10 GMT
Author: cutting
Date: Tue Apr 18 15:04:09 2006
New Revision: 395067

URL: http://svn.apache.org/viewcvs?rev=395067&view=rev
Log:
Fix for HADOOP-133.  Retry pings from child to parent, in case of (local) communication problems.
 Also log exit status, so that one can distinguish patricide from other deaths.  Contributed
by Owen.

Modified:
    lucene/hadoop/trunk/CHANGES.txt
    lucene/hadoop/trunk/src/java/org/apache/hadoop/mapred/LocalJobRunner.java
    lucene/hadoop/trunk/src/java/org/apache/hadoop/mapred/TaskRunner.java
    lucene/hadoop/trunk/src/java/org/apache/hadoop/mapred/TaskTracker.java
    lucene/hadoop/trunk/src/java/org/apache/hadoop/mapred/TaskUmbilicalProtocol.java

Modified: lucene/hadoop/trunk/CHANGES.txt
URL: http://svn.apache.org/viewcvs/lucene/hadoop/trunk/CHANGES.txt?rev=395067&r1=395066&r2=395067&view=diff
==============================================================================
--- lucene/hadoop/trunk/CHANGES.txt (original)
+++ lucene/hadoop/trunk/CHANGES.txt Tue Apr 18 15:04:09 2006
@@ -54,6 +54,10 @@
 
 15. Fix HADOOP-115.  Correct an error message.  (Stack via cutting)
 
+16. Fix HADOOP-133.  Retry pings from child to parent, in case of
+    (local) communication problems.  Also log exit status, so that one
+    can distinguish patricide from other deaths.  (omalley via cutting)
+
 
 Release 0.1.1 - 2006-04-08
 

Modified: lucene/hadoop/trunk/src/java/org/apache/hadoop/mapred/LocalJobRunner.java
URL: http://svn.apache.org/viewcvs/lucene/hadoop/trunk/src/java/org/apache/hadoop/mapred/LocalJobRunner.java?rev=395067&r1=395066&r2=395067&view=diff
==============================================================================
--- lucene/hadoop/trunk/src/java/org/apache/hadoop/mapred/LocalJobRunner.java (original)
+++ lucene/hadoop/trunk/src/java/org/apache/hadoop/mapred/LocalJobRunner.java Tue Apr 18 15:04:09
2006
@@ -154,7 +154,9 @@
       // Ignore for now
     }
 
-    public void ping(String taskid) throws IOException {}
+    public boolean ping(String taskid) throws IOException {
+      return true;
+    }
 
     public void done(String taskId) throws IOException {
       int taskIndex = mapIds.indexOf(taskId);

Modified: lucene/hadoop/trunk/src/java/org/apache/hadoop/mapred/TaskRunner.java
URL: http://svn.apache.org/viewcvs/lucene/hadoop/trunk/src/java/org/apache/hadoop/mapred/TaskRunner.java?rev=395067&r1=395066&r2=395067&view=diff
==============================================================================
--- lucene/hadoop/trunk/src/java/org/apache/hadoop/mapred/TaskRunner.java (original)
+++ lucene/hadoop/trunk/src/java/org/apache/hadoop/mapred/TaskRunner.java Tue Apr 18 15:04:09
2006
@@ -260,7 +260,6 @@
   private void runChild(String[] args, File dir) throws IOException {
     this.process = Runtime.getRuntime().exec(args, null, dir);
     try {
-      StringBuffer errorBuf = new StringBuffer();
       new Thread() {
         public void run() {
           logStream(process.getErrorStream());    // copy log output
@@ -269,8 +268,10 @@
         
       logStream(process.getInputStream());        // normally empty
       
-      if (this.process.waitFor() != 0) {
-        throw new IOException("Task process exit with nonzero status.");
+      int exit_code = process.waitFor();
+      if (exit_code != 0) {
+        throw new IOException("Task process exit with nonzero status of " +
+                              exit_code + ".");
       }
       
     } catch (InterruptedException e) {

Modified: lucene/hadoop/trunk/src/java/org/apache/hadoop/mapred/TaskTracker.java
URL: http://svn.apache.org/viewcvs/lucene/hadoop/trunk/src/java/org/apache/hadoop/mapred/TaskTracker.java?rev=395067&r1=395066&r2=395067&view=diff
==============================================================================
--- lucene/hadoop/trunk/src/java/org/apache/hadoop/mapred/TaskTracker.java (original)
+++ lucene/hadoop/trunk/src/java/org/apache/hadoop/mapred/TaskTracker.java Tue Apr 18 15:04:09
2006
@@ -653,10 +653,8 @@
     }
 
     /** Child checking to see if we're alive.  Normally does nothing.*/
-    public synchronized void ping(String taskid) throws IOException {
-      if (tasks.get(taskid) == null) {
-        throw new IOException("No such task id."); // force child exit
-      }
+    public synchronized boolean ping(String taskid) throws IOException {
+      return tasks.get(taskid) != null;
     }
 
     /**
@@ -748,12 +746,23 @@
                                          final String taskid) {
           Thread thread = new Thread(new Runnable() {
               public void run() {
+                final int MAX_RETRIES = 3;
+                int remainingRetries = MAX_RETRIES;
                 while (true) {
                   try {
-                    umbilical.ping(taskid);
+                    if (!umbilical.ping(taskid)) {
+                      LOG.log(Level.WARNING, "Parent died.  Exiting "+taskid);
+                      System.exit(66);
+                    }
+                    remainingRetries = MAX_RETRIES;
                   } catch (Throwable t) {
-                    LOG.log(Level.WARNING, "Parent died.  Exiting "+taskid, t);
-                    System.exit(1);
+                    String msg = StringUtils.stringifyException(t);
+                    LOG.info("Ping exception: " + msg);
+                    remainingRetries -=1;
+                    if (remainingRetries == 0) {
+                      LOG.log(Level.WARNING, "Last retry, killing "+taskid);
+                      System.exit(65);
+                    }
                   }
                   try {
                     Thread.sleep(1000);

Modified: lucene/hadoop/trunk/src/java/org/apache/hadoop/mapred/TaskUmbilicalProtocol.java
URL: http://svn.apache.org/viewcvs/lucene/hadoop/trunk/src/java/org/apache/hadoop/mapred/TaskUmbilicalProtocol.java?rev=395067&r1=395066&r2=395067&view=diff
==============================================================================
--- lucene/hadoop/trunk/src/java/org/apache/hadoop/mapred/TaskUmbilicalProtocol.java (original)
+++ lucene/hadoop/trunk/src/java/org/apache/hadoop/mapred/TaskUmbilicalProtocol.java Tue Apr
18 15:04:09 2006
@@ -42,8 +42,10 @@
    */
   void reportDiagnosticInfo(String taskid, String trace) throws IOException;
 
-  /** Periodically called by child to check if parent is still alive. */
-  void ping(String taskid) throws IOException;
+  /** Periodically called by child to check if parent is still alive. 
+   * @return True if the task is known
+   */
+  boolean ping(String taskid) throws IOException;
 
   /** Report that the task is successfully completed.  Failure is assumed if
    * the task process exits without calling this. */



Mime
View raw message