airavata-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From lah...@apache.org
Subject git commit: adding retry during failures
Date Wed, 01 Oct 2014 18:59:19 GMT
Repository: airavata
Updated Branches:
  refs/heads/master 55285038f -> 9372890ca


adding retry during failures


Project: http://git-wip-us.apache.org/repos/asf/airavata/repo
Commit: http://git-wip-us.apache.org/repos/asf/airavata/commit/9372890c
Tree: http://git-wip-us.apache.org/repos/asf/airavata/tree/9372890c
Diff: http://git-wip-us.apache.org/repos/asf/airavata/diff/9372890c

Branch: refs/heads/master
Commit: 9372890cab9aedd3c08ecf6b0d10dfe6ecd51892
Parents: 5528503
Author: lahiru <lahiru@apache.org>
Authored: Wed Oct 1 14:59:11 2014 -0400
Committer: lahiru <lahiru@apache.org>
Committed: Wed Oct 1 14:59:11 2014 -0400

----------------------------------------------------------------------
 .../monitor/impl/pull/qstat/HPCPullMonitor.java |  12 +-
 .../gsi/ssh/impl/GSISSHAbstractCluster.java     | 192 ++++++++++++++-----
 2 files changed, 151 insertions(+), 53 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/airavata/blob/9372890c/modules/gfac/gfac-monitor/src/main/java/org/apache/airavata/gfac/monitor/impl/pull/qstat/HPCPullMonitor.java
----------------------------------------------------------------------
diff --git a/modules/gfac/gfac-monitor/src/main/java/org/apache/airavata/gfac/monitor/impl/pull/qstat/HPCPullMonitor.java b/modules/gfac/gfac-monitor/src/main/java/org/apache/airavata/gfac/monitor/impl/pull/qstat/HPCPullMonitor.java
index b588894..da98edb 100644
--- a/modules/gfac/gfac-monitor/src/main/java/org/apache/airavata/gfac/monitor/impl/pull/qstat/HPCPullMonitor.java
+++ b/modules/gfac/gfac-monitor/src/main/java/org/apache/airavata/gfac/monitor/impl/pull/qstat/HPCPullMonitor.java
@@ -41,6 +41,7 @@ import org.apache.airavata.gfac.monitor.impl.push.amqp.SimpleJobFinishConsumer;
 import org.apache.airavata.gfac.monitor.util.CommonUtils;
 import org.apache.airavata.gsi.ssh.api.SSHApiException;
 import org.apache.airavata.gsi.ssh.api.authentication.AuthenticationInfo;
+import org.apache.airavata.gsi.ssh.api.job.JobDescriptor;
 import org.apache.airavata.model.workspace.experiment.JobState;
 import org.apache.airavata.model.workspace.experiment.TaskState;
 import org.apache.airavata.schemas.gfac.GsisshHostType;
@@ -191,8 +192,8 @@ public class HPCPullMonitor extends PullMonitor {
                             if (cancelMId.equals(iMonitorID.getExperimentID() + "+" + iMonitorID.getTaskID()))
{
                                 logger.info("Found a match in monitoring Queue, so marking
this job to remove from monitor queue " + cancelMId);
                                 logger.info("ExperimentID: " + cancelMId.split("\\+")[0]
+ ",TaskID: " + cancelMId.split("\\+")[1] + "JobID" + iMonitorID.getJobID());
-                                completedJobs.put(iMonitorID.getJobName(), iMonitorID);
                                 iMonitorID.setStatus(JobState.CANCELED);
+                                completedJobs.put(iMonitorID.getJobName(), iMonitorID);
                                 iterator1.remove();
                                 break;
                             }
@@ -242,8 +243,13 @@ public class HPCPullMonitor extends PullMonitor {
                             logger.error("Tried to monitor the job with ID " + iMonitorID.getJobID()
+ " But failed" + iMonitorID.getFailedCount() +
                                     " 3 times, so skip this Job from Monitor");
                             iMonitorID.setLastMonitored(new Timestamp((new Date()).getTime()));
-                            completedJobs.put(iMonitorID.getJobName(), iMonitorID);
-                        } else {
+                            JobDescriptor jobDescriptor = JobDescriptor.fromXML(iMonitorID.getJobExecutionContext().getJobDetails().getJobDescription());
+                            List<String> stdErr = connection.getCluster().listDirectory(jobDescriptor.getStandardErrorFile());
+                            List<String> stdOut = connection.getCluster().listDirectory(jobDescriptor.getStandardOutFile());
+                            if (stdErr.size() > 0 && stdOut.size() > 0) {
+                                completedJobs.put(iMonitorID.getJobName(), iMonitorID);
+                            }
+                            } else {
                             // Evey
                             iMonitorID.setLastMonitored(new Timestamp((new Date()).getTime()));
                             // if the job is complete we remove it from the Map, if any of
these maps

http://git-wip-us.apache.org/repos/asf/airavata/blob/9372890c/tools/gsissh/src/main/java/org/apache/airavata/gsi/ssh/impl/GSISSHAbstractCluster.java
----------------------------------------------------------------------
diff --git a/tools/gsissh/src/main/java/org/apache/airavata/gsi/ssh/impl/GSISSHAbstractCluster.java b/tools/gsissh/src/main/java/org/apache/airavata/gsi/ssh/impl/GSISSHAbstractCluster.java
index 0572069..6238c5c 100644
--- a/tools/gsissh/src/main/java/org/apache/airavata/gsi/ssh/impl/GSISSHAbstractCluster.java
+++ b/tools/gsissh/src/main/java/org/apache/airavata/gsi/ssh/impl/GSISSHAbstractCluster.java
@@ -79,6 +79,10 @@ public class GSISSHAbstractCluster implements Cluster {
 
     public  GSISSHAbstractCluster(ServerInfo serverInfo, AuthenticationInfo authenticationInfo) throws SSHApiException {
 
+        reconnect(serverInfo, authenticationInfo);
+    }
+
+    private void reconnect(ServerInfo serverInfo, AuthenticationInfo authenticationInfo) throws SSHApiException {
         this.serverInfo = serverInfo;
 
         this.authenticationInfo = authenticationInfo;
@@ -293,12 +297,28 @@ public class GSISSHAbstractCluster implements Cluster {
             FileUtils.writeStringToFile(tempPBSFile, scriptContent);
 
             //reusing submitBatchJobWithScript method to submit a job
-
-            String jobID = this.submitBatchJobWithScript(tempPBSFile.getAbsolutePath(),
-                    jobDescriptor.getWorkingDirectory());
+            int retry = 3;
+            String jobID = null;
+            while (retry > 0) {
+                try {
+                     jobID = this.submitBatchJobWithScript(tempPBSFile.getAbsolutePath(),
+                            jobDescriptor.getWorkingDirectory());
+                     retry = 0;
+                } catch (SSHApiException e) {
+                    retry--;
+                    reconnect(serverInfo,authenticationInfo);
+                    if(retry==0) {
+                        throw e;
+                    }
+                }
+            }
             log.debug("Job has successfully submitted, JobID : " + jobID);
-            return jobID.replace("\n", "");
-        } catch (TransformerConfigurationException e) {
+            if (jobID != null) {
+                return jobID.replace("\n", "");
+            } else {
+                return null;
+            }
+            } catch (TransformerConfigurationException e) {
             throw new SSHApiException("Error parsing PBS transformation", e);
         } catch (TransformerException e) {
             throw new SSHApiException("Error generating PBS script", e);
@@ -349,34 +369,58 @@ public class GSISSHAbstractCluster implements Cluster {
     }
 
     public void scpTo(String remoteFile, String localFile) throws SSHApiException {
-        try {
-            if(!session.isConnected()){
-                session.connect();
+        int retry = 3;
+        while (retry > 0) {
+            try {
+                if (!session.isConnected()) {
+                    session.connect();
+                }
+                log.info("Transfering file:/" + localFile + " To:" + serverInfo.getHost()
+ ":" + remoteFile);
+                SSHUtils.scpTo(remoteFile, localFile, session);
+                retry = 0;
+            } catch (IOException e) {
+                retry--;
+                reconnect(serverInfo, authenticationInfo);
+                if (retry == 0) {
+                    throw new SSHApiException("Failed during scping local file:" + localFile
+ " to remote file "
+                            + serverInfo.getHost() + ":rFile", e);
+                }
+            } catch (JSchException e) {
+                retry--;
+                reconnect(serverInfo, authenticationInfo);
+                if (retry == 0) {
+                    throw new SSHApiException("Failed during scping local file:" + localFile
+ " to remote file "
+                            + serverInfo.getHost() + ":rFile", e);
+                }
             }
-            log.info("Transfering file:/" + localFile + " To:" + serverInfo.getHost() + ":"
+ remoteFile);
-            SSHUtils.scpTo(remoteFile, localFile, session);
-        } catch (IOException e) {
-            throw new SSHApiException("Failed during scping local file:" + localFile + "
to remote file "
-                    + serverInfo.getHost() + ":rFile", e);
-        } catch (JSchException e) {
-            throw new SSHApiException("Failed during scping local file:" + localFile + "
to remote file "
-                    + serverInfo.getHost() + ":rFile", e);
         }
     }
 
     public void scpFrom(String remoteFile, String localFile) throws SSHApiException {
-        try {
-            if(!session.isConnected()){
-                session.connect();
+        int retry = 3;
+        while(retry>0) {
+            try {
+                if (!session.isConnected()) {
+                    session.connect();
+                }
+                log.info("Transfering from:" + serverInfo.getHost() + ":" + remoteFile +
" To:" + "file:/" + localFile);
+                SSHUtils.scpFrom(remoteFile, localFile, session);
+                retry=0;
+            } catch (IOException e) {
+                retry--;
+                reconnect(serverInfo, authenticationInfo);
+                if (retry == 0) {
+                    throw new SSHApiException("Failed during scping local file:" + localFile
+ " to remote file "
+                            + serverInfo.getHost() + ":rFile", e);
+                }
+            } catch (JSchException e) {
+                retry--;
+                reconnect(serverInfo, authenticationInfo);
+                if(retry==0) {
+                    throw new SSHApiException("Failed during scping local file:" + localFile
+ " to remote file "
+                            + serverInfo.getHost() + ":rFile", e);
+                }
             }
-            log.info("Transfering from:"+ serverInfo.getHost() + ":" + remoteFile + " To:"
+ "file:/" + localFile);
-            SSHUtils.scpFrom(remoteFile, localFile, session);
-        } catch (IOException e) {
-            throw new SSHApiException("Failed during scping local file:" + localFile + "
to remote file "
-                    + serverInfo.getHost() + ":rFile", e);
-        } catch (JSchException e) {
-            throw new SSHApiException("Failed during scping local file:" + localFile + "
to remote file "
-                    + serverInfo.getHost() + ":rFile", e);
         }
     }
     
@@ -397,41 +441,89 @@ public class GSISSHAbstractCluster implements Cluster {
     }
 
     public void makeDirectory(String directoryPath) throws SSHApiException {
-        try {
-            if(!session.isConnected()){
-                session.connect();
+        int retry = 3;
+        while (retry > 0) {
+            try {
+                if (!session.isConnected()) {
+                    session.connect();
+                }
+                log.info("Creating directory: " + serverInfo.getHost() + ":" + directoryPath);
+                SSHUtils.makeDirectory(directoryPath, session);
+                retry = 0;
+            } catch (IOException e) {
+                throw new SSHApiException("Failed during creating directory:" + directoryPath
+ " to remote file "
+                        + serverInfo.getHost() + ":rFile", e);
+            } catch (JSchException e) {
+                retry--;
+                reconnect(serverInfo, authenticationInfo);
+                if (retry == 0) {
+                    throw new SSHApiException("Failed during creating directory :" + directoryPath
+ " to remote file "
+                            + serverInfo.getHost() + ":rFile", e);
+                }
+            } catch (SSHApiException e) {
+                retry--;
+                reconnect(serverInfo, authenticationInfo);
+                if (retry == 0) {
+                    throw new SSHApiException("Failed during creating directory :" + directoryPath
+ " to remote file "
+                            + serverInfo.getHost() + ":rFile", e);
+                }
             }
-            log.info("Creating directory: " + serverInfo.getHost() + ":" + directoryPath);
-            SSHUtils.makeDirectory(directoryPath, session);
-        } catch (IOException e) {
-            throw new SSHApiException("Failed during creating directory:" + directoryPath
+ " to remote file "
-                    + serverInfo.getHost() + ":rFile", e);
-        } catch (JSchException e) {
-            throw new SSHApiException("Failed during creating directory :" + directoryPath
+ " to remote file "
-                    + serverInfo.getHost() + ":rFile", e);
         }
     }
 
     public List<String> listDirectory(String directoryPath) throws SSHApiException {
-        try {
-            if(!session.isConnected()){
-                session.connect();
+        int retry = 3;
+        List<String> files = null;
+        while (retry > 0) {
+            try {
+                if (!session.isConnected()) {
+                    session.connect();
+                }
+                log.info("Listing directory: " + serverInfo.getHost() + ":" + directoryPath);
+                files = SSHUtils.listDirectory(directoryPath, session);
+                retry=0;
+            } catch (IOException e) {
+                retry--;
+                reconnect(serverInfo, authenticationInfo);
+                if (retry == 0) {
+                    throw new SSHApiException("Failed during creating directory:" + directoryPath
+ " to remote file "
+                            + serverInfo.getHost() + ":rFile", e);
+                }
+            } catch (JSchException e) {
+                retry--;
+                reconnect(serverInfo, authenticationInfo);
+                if (retry == 0) {
+                    throw new SSHApiException("Failed during creating directory :" + directoryPath
+ " to remote file "
+                            + serverInfo.getHost() + ":rFile", e);
+                }
+            }catch (SSHApiException e) {
+                retry--;
+                reconnect(serverInfo, authenticationInfo);
+                if (retry == 0) {
+                    throw new SSHApiException("Failed during creating directory :" + directoryPath
+ " to remote file "
+                            + serverInfo.getHost() + ":rFile", e);
+                }
             }
-            log.info("Listing directory: " + serverInfo.getHost() + ":" + directoryPath);
-            return SSHUtils.listDirectory(directoryPath, session);
-        } catch (IOException e) {
-            throw new SSHApiException("Failed during creating directory:" + directoryPath
+ " to remote file "
-                    + serverInfo.getHost() + ":rFile", e);
-        } catch (JSchException e) {
-            throw new SSHApiException("Failed during creating directory :" + directoryPath
+ " to remote file "
-                    + serverInfo.getHost() + ":rFile", e);
         }
+        return files;
     }
 
     public void getJobStatuses(String userName, Map<String,JobStatus> jobIDs)throws SSHApiException {
+        int retry = 3;
         RawCommandInfo rawCommandInfo = jobManagerConfiguration.getUserBasedMonitorCommand(userName);
         StandardOutReader stdOutReader = new StandardOutReader();
-        CommandExecutor.executeCommand(rawCommandInfo, this.getSession(), stdOutReader);
+        while (retry > 0){
+            try {
+                CommandExecutor.executeCommand(rawCommandInfo, this.getSession(), stdOutReader);
+                retry=0;
+            } catch (SSHApiException e) {
+                retry--;
+                reconnect(serverInfo, authenticationInfo);
+                if (retry == 0) {
+                    throw new SSHApiException("Failed Getting statuses  to remote file",
e);
+                }
+            }
+        }
         String result = getOutputifAvailable(stdOutReader, "Error getting job information
from the resource !", rawCommandInfo.getBaseCommand(jobManagerConfiguration.getInstalledPath()));
         jobManagerConfiguration.getParser().parse(userName,jobIDs, result);
     }


Mime
View raw message