trafodion-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From sure...@apache.org
Subject [41/50] [abbrv] trafodion git commit: More Name Server enabled process management fixes.
Date Sat, 16 Jun 2018 17:10:11 GMT
More Name Server enabled process management fixes.


Project: http://git-wip-us.apache.org/repos/asf/trafodion/repo
Commit: http://git-wip-us.apache.org/repos/asf/trafodion/commit/65ac5563
Tree: http://git-wip-us.apache.org/repos/asf/trafodion/tree/65ac5563
Diff: http://git-wip-us.apache.org/repos/asf/trafodion/diff/65ac5563

Branch: refs/heads/master
Commit: 65ac55633c34dc8ee12de5da4c6fbc91cd6b9093
Parents: 8e4f2c7
Author: Zalo Correa <zalo.correa@esgyn.com>
Authored: Fri May 25 14:44:54 2018 -0700
Committer: Zalo Correa <zalo.correa@esgyn.com>
Committed: Fri May 25 14:44:54 2018 -0700

----------------------------------------------------------------------
 .../export/include/common/evl_sqlog_eventnum.h  |  34 ++
 core/sqf/monitor/linux/cluster.cxx              |  11 +-
 core/sqf/monitor/linux/notice.cxx               |  13 +-
 core/sqf/monitor/linux/nsreqprocinfons.cxx      |   6 +-
 core/sqf/monitor/linux/pnode.cxx                | 105 +++--
 core/sqf/monitor/linux/pnode.h                  |   2 +-
 core/sqf/monitor/linux/process.cxx              | 406 +++++++++++++++++--
 core/sqf/monitor/linux/process.h                |  11 +
 core/sqf/monitor/linux/reqkill.cxx              |  23 +-
 core/sqf/monitor/linux/reqnewproc.cxx           |  21 +-
 core/sqf/monitor/linux/reqnotify.cxx            |  23 +-
 core/sqf/monitor/linux/reqprocinfo.cxx          |  26 +-
 core/sqf/monitor/linux/reqqueue.cxx             | 117 +++---
 core/sqf/monitor/linux/tmsync.cxx               |  17 +
 14 files changed, 679 insertions(+), 136 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/trafodion/blob/65ac5563/core/sqf/export/include/common/evl_sqlog_eventnum.h
----------------------------------------------------------------------
diff --git a/core/sqf/export/include/common/evl_sqlog_eventnum.h b/core/sqf/export/include/common/evl_sqlog_eventnum.h
index 1f4d166..de3e60e 100644
--- a/core/sqf/export/include/common/evl_sqlog_eventnum.h
+++ b/core/sqf/export/include/common/evl_sqlog_eventnum.h
@@ -254,6 +254,8 @@
 #define MON_CLUSTER_ASSIGNMONITORLEADER_3   101015303
 #define MON_CLUSTER_ASSIGNMONITORLEADER_4   101015304
 
+#define MON_CLUSTER_CHECKIFDONE_1           101015401
+
 /* Module: monitor.cxx = 02 */
 
 #define MON_MONITOR_MAIN_1                  101020101
@@ -298,6 +300,7 @@
 #define MON_MONITOR_STARTPROCESS_12         101020512
 #define MON_MONITOR_STARTPROCESS_13         101020513
 #define MON_MONITOR_STARTPROCESS_14         101020514
+#define MON_MONITOR_STARTPROCESS_15         101020515
 #define MON_MONITOR_PROCESSINFO             101020601
 #define MON_MONITOR_PROCESSREQUEST_1        101020701
 #define MON_MONITOR_PROCESSREQUEST_2        101020702
@@ -382,6 +385,8 @@
 #define MON_PROCESS_CREATE_9                101031409
 #define MON_PROCESS_CREATE_10               101031410
 #define MON_PROCESS_CREATE_11               101031411
+#define MON_PROCESS_CREATE_12               101031412
+#define MON_PROCESS_CREATE_13               101031413
 #define MON_PROCESS_SETPROCESSSTATE_1       101031501
 #define MON_PROCESS_PIDHANGUPCHECK_1        101031601
 #define MON_PROCESS_PIDHANGUPCHECK_2        101031602
@@ -416,6 +421,14 @@
 #define MON_PROCESSCONT_KILLALLDOWN_1       101032301
 #define MON_PROCESS_SETSTATE_1              101032401
 #define MON_PROCESS_SETSTATE_2              101032402
+#define MON_PROCESS_COMPLETESTARTUP_1       101032501
+#define MON_PROCESS_COMPLETESTARTUP_2       101032502
+#define MON_PROCESS_COMPLETESTARTUP_3       101032503
+#define MON_PROCESSCONT_CHILDEXIT_1         101032601
+#define MON_PROCESS_PROCEXITNOTIFIERNODES_1 101032701
+#define MON_PROCESS_PROCEXIT_1              101032801
+#define MON_PROCESS_PROCEXIT_2              101032802
+#define MON_PROCESS_PROCEXITUNREGALL_1      101032901
 
 /* Module: pnode.cxx = 04 */
 
@@ -450,6 +463,19 @@
 #define MON_NODE_GETPROCESSNS_2             101041502
 #define MON_NODE_GETPROCESSNS_3             101041503
 #define MON_NODE_GETPROCESSNS_4             101041504
+#define MON_NODE_GETPROCESSNS_5             101041505
+#define MON_NODE_GETPROCESSNS_6             101041506
+#define MON_NODE_GETSTRINGID_1              101041601
+#define MON_NODE_GETSTRINGID_2              101041602
+#define MON_NODE_CLONEPROCESSNS_1           101041701
+#define MON_NODE_CLONEPROCESSNS_2           101041702
+#define MON_NODE_CLONEPROCESSNS_3           101041703
+#define MON_NODE_CLONEPROCESSNS_4           101041704
+#define MON_NODE_CLONEPROCESSNS_5           101041705
+#define MON_NODE_CLONEPROCESSNS_6           101041706
+#define MON_NODE_GETPROCESSLBYTYPENS_1      101041801
+#define MON_NODE_GETPROCESSLBYTYPENS_2      101041802
+#define MON_NODE_GETPROCESSLBYTYPENS_3      101041803
 
 /* Module: config.cxx = 05 */
 
@@ -747,6 +773,11 @@
 #define MON_REQ_IODATA_1                    101182201
 #define MON_REQ_STDIN_1                     101182301
 #define MON_REQ_STDIN_2                     101182302
+#define MON_REQ_KILL_1                      101182401
+#define MON_REQ_NOTIFY_1                    101182501
+#define MON_REQ_PROCINFO_1                  101182601
+#define MON_REQ_PROCINFOCONT_1              101182701
+#define MON_INTREQ_CHILDDEATH_1             101182801
 
 /* Module: clio.cxx = 19 */
 #define MON_CLIO_ACQUIRE_MSG_1              101190101
@@ -1048,6 +1079,9 @@
 #define PTP_COMMACCEPT_7                    101940107
 #define PTP_COMMACCEPT_8                    101940108
 
+/* Module notice.cxx = 95 */
+#define NOTICE_NOTIFYREMOTE_1               101950101
+
 /**********************************************/
 
 /*********** Seabed ***********/

http://git-wip-us.apache.org/repos/asf/trafodion/blob/65ac5563/core/sqf/monitor/linux/cluster.cxx
----------------------------------------------------------------------
diff --git a/core/sqf/monitor/linux/cluster.cxx b/core/sqf/monitor/linux/cluster.cxx
index 8768d2e..69647ec 100644
--- a/core/sqf/monitor/linux/cluster.cxx
+++ b/core/sqf/monitor/linux/cluster.cxx
@@ -7786,7 +7786,16 @@ bool CCluster::checkIfDone (  )
                                     Nodes->ProcessCount(), MyNode->ProcessCount());
     
                     waitForNameServerExit_ = true;
-                    NameServer->ProcessShutdown();
+                    int rc = NameServer->ProcessShutdown();
+                    if (rc)
+                    {
+                        char la_buf[MON_STRING_BUF_SIZE];
+                        snprintf( la_buf, sizeof(la_buf)
+                                , "[%s] - Shutdown request to Name Server failed, node going down\n"
+                                , method_name );
+                        mon_log_write( MON_CLUSTER_CHECKIFDONE_1, SQ_LOG_ERR, la_buf );
+                        ReqQueue.enqueueDownReq( MyPNID );
+                    }
                 }
             }
             else

http://git-wip-us.apache.org/repos/asf/trafodion/blob/65ac5563/core/sqf/monitor/linux/notice.cxx
----------------------------------------------------------------------
diff --git a/core/sqf/monitor/linux/notice.cxx b/core/sqf/monitor/linux/notice.cxx
index f478f45..64778d7 100644
--- a/core/sqf/monitor/linux/notice.cxx
+++ b/core/sqf/monitor/linux/notice.cxx
@@ -421,7 +421,18 @@ void CNotice::NotifyRemote( void )
                                    , targetLNode->GetNode()->GetName() );
         if (rc)
         {
-            // TODO: Error handling
+            char la_buf[MON_STRING_BUF_SIZE];
+            snprintf( la_buf, sizeof(la_buf)
+                    , "[%s] - Can't send process exit "
+                      "for process %s (%d, %d) "
+                      "to target node %s, nid=%d\n"
+                    , method_name
+                    , Process->GetName()
+                    , Process->GetNid()
+                    , Process->GetPid()
+                    , targetLNode->GetNode()->GetName()
+                    , targetLNode->GetNid() );
+            mon_log_write(NOTICE_NOTIFYREMOTE_1, SQ_LOG_ERR, la_buf);
         }
         nidQueue->pop();
     }

http://git-wip-us.apache.org/repos/asf/trafodion/blob/65ac5563/core/sqf/monitor/linux/nsreqprocinfons.cxx
----------------------------------------------------------------------
diff --git a/core/sqf/monitor/linux/nsreqprocinfons.cxx b/core/sqf/monitor/linux/nsreqprocinfons.cxx
index 562fdb6..aa53437 100644
--- a/core/sqf/monitor/linux/nsreqprocinfons.cxx
+++ b/core/sqf/monitor/linux/nsreqprocinfons.cxx
@@ -59,7 +59,7 @@ CExtProcInfoNsReq::~CExtProcInfoNsReq()
 // Copy information for a specific process into the reply message buffer.
 void CExtProcInfoNsReq::copyInfo(CProcess *process, ProcessInfoNs_reply_def &process_info_ns)
 {
-    const char method_name[] = "CNameServer::SendReceive";
+    const char method_name[] = "CExtProcInfoNsReq::copyInfo";
     TRACE_ENTRY;
 
     CProcess *parent;
@@ -92,8 +92,8 @@ void CExtProcInfoNsReq::copyInfo(CProcess *process, ProcessInfoNs_reply_def &pro
     process_info_ns.unhooked = process->IsUnhooked();
     process_info_ns.event_messages = process->IsEventMessages();
     process_info_ns.system_messages = process->IsSystemMessages();
-    strncpy( process_info_ns.path, process->path(), MAX_PROCESS_PATH );
-    strncpy( process_info_ns.ldpath, process->ldpath(), MAX_PROCESS_PATH );
+    strncpy( process_info_ns.path, process->path(), MAX_SEARCH_PATH );
+    strncpy( process_info_ns.ldpath, process->ldpath(), MAX_SEARCH_PATH );
     strncpy( process_info_ns.program, process->program(), MAX_PROCESS_PATH );
 //    process_info_ns.pathStrId = process->pathStrId();
 //    process_info_ns.ldpathStrId = process->ldPathStrId();

http://git-wip-us.apache.org/repos/asf/trafodion/blob/65ac5563/core/sqf/monitor/linux/pnode.cxx
----------------------------------------------------------------------
diff --git a/core/sqf/monitor/linux/pnode.cxx b/core/sqf/monitor/linux/pnode.cxx
index c044100..73b0246 100644
--- a/core/sqf/monitor/linux/pnode.cxx
+++ b/core/sqf/monitor/linux/pnode.cxx
@@ -1035,7 +1035,7 @@ bool CNode::GetSchedulingData( void )
 }
 
 
-strId_t CNode::GetStringId( char *candidate, CLNode *targetLNode )
+strId_t CNode::GetStringId( char *candidate, CLNode *targetLNode, bool clone )
 {
     const char method_name[] = "CNode::GetStringId";
     strId_t id;
@@ -1059,21 +1059,33 @@ strId_t CNode::GetStringId( char *candidate, CLNode *targetLNode )
 #ifndef NAMESERVER_PROCESS
         if (NameServerEnabled)
         {
-            if (targetLNode != NULL &&
+            if (targetLNode != NULL && !clone &&
                 !MyNode->IsMyNode(targetLNode->GetNid()))
             {
                 // Forward the unique string to the target node
-                PtpClient->AddUniqStr( id.nid
-                                     , id.id
-                                     , candidate
-                                     , targetLNode->GetNid()
-                                     , targetLNode->GetNode()->GetName());
+                int rc = PtpClient->AddUniqStr( id.nid
+                                              , id.id
+                                              , candidate
+                                              , targetLNode->GetNid()
+                                              , targetLNode->GetNode()->GetName() );
+                if (rc)
+                {
+                    char la_buf[MON_STRING_BUF_SIZE];
+                    snprintf( la_buf, sizeof(la_buf)
+                            , "[%s] - Can't send unique string "
+                              "to target node %s, nid=%d\n"
+                            , method_name
+                            , targetLNode->GetNode()->GetName()
+                            , targetLNode->GetNid() );
+                    mon_log_write(MON_NODE_GETSTRINGID_1, SQ_LOG_ERR, la_buf);
+                }
             }
         }
         else
 #endif
         {
 #ifdef NAMESERVER_PROCESS
+            clone = clone;  // Make compiler happy!
             targetLNode = targetLNode;  // Make compiler happy!
 #endif
             CReplUniqStr *repl = new CReplUniqStr ( id.nid, id.id, candidate );
@@ -1092,15 +1104,26 @@ strId_t CNode::GetStringId( char *candidate, CLNode *targetLNode )
 #ifndef NAMESERVER_PROCESS
         if (NameServerEnabled)
         {
-            if (targetLNode != NULL &&
+            if (targetLNode != NULL && !clone &&
                 !MyNode->IsMyNode(targetLNode->GetNid()))
             {
                 // Forward the unique string to the target node
-                PtpClient->AddUniqStr( id.nid
-                                     , id.id
-                                     , candidate
-                                     , targetLNode->GetNid()
-                                     , targetLNode->GetNode()->GetName());
+                int rc = PtpClient->AddUniqStr( id.nid
+                                              , id.id
+                                              , candidate
+                                              , targetLNode->GetNid()
+                                              , targetLNode->GetNode()->GetName());
+                if (rc)
+                {
+                    char la_buf[MON_STRING_BUF_SIZE];
+                    snprintf( la_buf, sizeof(la_buf)
+                            , "[%s] - Can't send unique string "
+                              "to target node %s, nid=%d\n"
+                            , method_name
+                            , targetLNode->GetNode()->GetName()
+                            , targetLNode->GetNid() );
+                    mon_log_write(MON_NODE_GETSTRINGID_2, SQ_LOG_ERR, la_buf);
+                }
             }
         }
 #endif
@@ -1721,9 +1744,9 @@ CProcess *CNodeContainer::AddCloneProcess( ProcessInfoNs_reply_def *processInfo
     CLNode   *lnode = Nodes->GetLNode(processInfo->nid);
     CNode    *node = lnode->GetNode();
 
-    strId_t pathStrId = MyNode->GetStringId ( processInfo->path, lnode );
-    strId_t ldpathStrId = MyNode->GetStringId (processInfo->ldpath, lnode );
-    strId_t programStrId = MyNode->GetStringId ( processInfo->program, lnode );
+    strId_t pathStrId = MyNode->GetStringId ( processInfo->path, lnode, true );
+    strId_t ldpathStrId = MyNode->GetStringId (processInfo->ldpath, lnode, true );
+    strId_t programStrId = MyNode->GetStringId ( processInfo->program, lnode, true );
 
     CProcess *process = node->CloneProcess( processInfo->nid
                                           , processInfo->type
@@ -2535,7 +2558,7 @@ CProcess *CNodeContainer::CloneProcessNs( int nid
                 snprintf( buf, sizeof(buf),
                           "[%s] ProcessInfo failed, rc=%d\n"
                         , method_name, msg.u.reply.u.process_info_ns.return_code );
-                mon_log_write( MON_NODE_GETPROCESSNS_1, SQ_LOG_ERR, buf );
+                mon_log_write( MON_NODE_CLONEPROCESSNS_1, SQ_LOG_ERR, buf );
             }
         }
         else
@@ -2545,9 +2568,17 @@ CProcess *CNodeContainer::CloneProcessNs( int nid
                       "[%s], Invalid MsgType(%d)/ReplyType(%d) for "
                       "ProcessInfoNs\n"
                     , method_name, msg.type, msg.u.reply.type );
-            mon_log_write( MON_NODE_GETPROCESSNS_2, SQ_LOG_ERR, buf );
+            mon_log_write( MON_NODE_CLONEPROCESSNS_2, SQ_LOG_ERR, buf );
         }
     }
+    else
+    {
+        char la_buf[MON_STRING_BUF_SIZE];
+        snprintf( la_buf, sizeof(la_buf)
+                , "[%s] - Process info request to Name Server failed\n"
+                , method_name );
+        mon_log_write( MON_NODE_CLONEPROCESSNS_3, SQ_LOG_ERR, la_buf );
+    }
 
     TRACE_EXIT;
     return( process );
@@ -2596,7 +2627,7 @@ CProcess *CNodeContainer::CloneProcessNs( const char *name, Verifier_t verifier
                 snprintf( buf, sizeof(buf),
                           "[%s] ProcessInfo failed, rc=%d\n"
                         , method_name, msg.u.reply.u.process_info_ns.return_code );
-                mon_log_write( MON_NODE_GETPROCESSNS_3, SQ_LOG_ERR, buf );
+                mon_log_write( MON_NODE_CLONEPROCESSNS_4, SQ_LOG_ERR, buf );
             }
         }
         else
@@ -2606,9 +2637,17 @@ CProcess *CNodeContainer::CloneProcessNs( const char *name, Verifier_t verifier
                       "[%s], Invalid MsgType(%d)/ReplyType(%d) for "
                       "ProcessInfo\n"
                     , method_name, msg.type, msg.u.reply.type );
-            mon_log_write( MON_NODE_GETPROCESSNS_4, SQ_LOG_ERR, buf );
+            mon_log_write( MON_NODE_CLONEPROCESSNS_5, SQ_LOG_ERR, buf );
         }
     }
+    else
+    {
+        char la_buf[MON_STRING_BUF_SIZE];
+        snprintf( la_buf, sizeof(la_buf)
+                , "[%s] - Process info request to Name Server failed\n"
+                , method_name );
+        mon_log_write( MON_NODE_CLONEPROCESSNS_6, SQ_LOG_ERR, la_buf );
+    }
 
     TRACE_EXIT;
     return( process );
@@ -3243,6 +3282,11 @@ int CNodeContainer::GetProcessInfoNs( int nid
     }
     else
     {
+        char la_buf[MON_STRING_BUF_SIZE];
+        snprintf( la_buf, sizeof(la_buf)
+                , "[%s] - Process info request to Name Server failed\n"
+                , method_name );
+        mon_log_write( MON_NODE_GETPROCESSNS_3, SQ_LOG_ERR, la_buf );
         rc = MPI_ERR_OP;
     }
 
@@ -3293,7 +3337,7 @@ int CNodeContainer::GetProcessInfoNs( const char *name
                 snprintf( buf, sizeof(buf),
                           "[%s] ProcessInfo failed, rc=%d\n"
                         , method_name, msg.u.reply.u.process_info_ns.return_code );
-                mon_log_write( MON_NODE_GETPROCESSNS_3, SQ_LOG_ERR, buf );
+                mon_log_write( MON_NODE_GETPROCESSNS_4, SQ_LOG_ERR, buf );
             }
             rc = msg.u.reply.u.process_info_ns.return_code;
         }
@@ -3304,12 +3348,17 @@ int CNodeContainer::GetProcessInfoNs( const char *name
                       "[%s], Invalid MsgType(%d)/ReplyType(%d) for "
                       "ProcessInfo\n"
                     , method_name, msg.type, msg.u.reply.type );
-            mon_log_write( MON_NODE_GETPROCESSNS_4, SQ_LOG_ERR, buf );
+            mon_log_write( MON_NODE_GETPROCESSNS_5, SQ_LOG_ERR, buf );
             rc = MPI_ERR_OP;
         }
     }
     else
     {
+        char la_buf[MON_STRING_BUF_SIZE];
+        snprintf( la_buf, sizeof(la_buf)
+                , "[%s] - Process info request to Name Server failed\n"
+                , method_name );
+        mon_log_write( MON_NODE_GETPROCESSNS_6, SQ_LOG_ERR, la_buf );
         rc = MPI_ERR_OP;
     }
 
@@ -3383,7 +3432,7 @@ CProcess *CNodeContainer::GetProcessLByTypeNs( int nid, PROCESSTYPE type )
                 snprintf( buf, sizeof(buf),
                           "[%s] ProcessInfo failed, rc=%d\n"
                         , method_name, msg.u.reply.u.process_info_ns.return_code );
-                mon_log_write( MON_NODE_GETPROCESSNS_3, SQ_LOG_ERR, buf );
+                mon_log_write( MON_NODE_GETPROCESSLBYTYPENS_1, SQ_LOG_ERR, buf );
             }
         }
         else
@@ -3393,9 +3442,17 @@ CProcess *CNodeContainer::GetProcessLByTypeNs( int nid, PROCESSTYPE type )
                       "[%s], Invalid MsgType(%d)/ReplyType(%d) for "
                       "ProcessInfo\n"
                     , method_name, msg.type, msg.u.reply.type );
-            mon_log_write( MON_NODE_GETPROCESSNS_4, SQ_LOG_ERR, buf );
+            mon_log_write( MON_NODE_GETPROCESSLBYTYPENS_2, SQ_LOG_ERR, buf );
         }
     }
+    else
+    {
+        char la_buf[MON_STRING_BUF_SIZE];
+        snprintf( la_buf, sizeof(la_buf)
+                , "[%s] - Process info request to Name Server failed\n"
+                , method_name );
+        mon_log_write( MON_NODE_GETPROCESSLBYTYPENS_3, SQ_LOG_ERR, la_buf );
+    }
 
     TRACE_EXIT;
     return( process );

http://git-wip-us.apache.org/repos/asf/trafodion/blob/65ac5563/core/sqf/monitor/linux/pnode.h
----------------------------------------------------------------------
diff --git a/core/sqf/monitor/linux/pnode.h b/core/sqf/monitor/linux/pnode.h
index fbfddf4..b04e387 100644
--- a/core/sqf/monitor/linux/pnode.h
+++ b/core/sqf/monitor/linux/pnode.h
@@ -269,7 +269,7 @@ public:
     // If candidate string has not been seen before assign a unique
     // id and store it in the config database.   In either case return
     // the unique id as the value of the method.
-    strId_t GetStringId( char *candidate, CLNode *targetLNode = NULL );
+    strId_t GetStringId( char *candidate, CLNode *targetLNode = NULL, bool clone = false );
 
     inline int   GetTmSyncNid( void ) { return( tmSyncNid_ ); }
     inline SyncState GetTmSyncState( void ) { return( tmSyncState_ ); }

http://git-wip-us.apache.org/repos/asf/trafodion/blob/65ac5563/core/sqf/monitor/linux/process.cxx
----------------------------------------------------------------------
diff --git a/core/sqf/monitor/linux/process.cxx b/core/sqf/monitor/linux/process.cxx
index 5e7b792..3a3e699 100644
--- a/core/sqf/monitor/linux/process.cxx
+++ b/core/sqf/monitor/linux/process.cxx
@@ -651,14 +651,24 @@ void CProcess::procExitNotifierNodes( void )
         {
             if (NameServerEnabled && targetNode->GetPNid() != MyPNID)
             {
-                int rc = -1;
                 // Forward the process exit to the target node
-                rc = PtpClient->ProcessExit( this 
-                                           , targetLNode->GetNid()
-                                           , targetNode->GetName() ); 
+                int rc = PtpClient->ProcessExit( this 
+                                               , targetLNode->GetNid()
+                                               , targetNode->GetName() ); 
                 if (rc)
                 {
-                    // TODO: Error handling
+                    char la_buf[MON_STRING_BUF_SIZE];
+                    snprintf( la_buf, sizeof(la_buf)
+                            , "[%s] - Can't send process exit "
+                              "for process %s (%d, %d) "
+                              "to target node %s, nid=%d\n"
+                            , method_name
+                            , GetName()
+                            , GetNid()
+                            , GetPid()
+                            , targetLNode->GetNode()->GetName()
+                            , targetLNode->GetNid() );
+                    mon_log_write(MON_PROCESS_PROCEXITNOTIFIERNODES_1, SQ_LOG_ERR, la_buf);
                 }
             }
         }
@@ -709,7 +719,18 @@ void CProcess::procExitUnregAll ( _TM_Txid_External transId )
                                              , targetLNode->GetNode()->GetName() ); 
                 if (rc)
                 {
-                    // TODO: Error handling
+                    char la_buf[MON_STRING_BUF_SIZE];
+                    snprintf( la_buf, sizeof(la_buf)
+                            , "[%s] - Can't send process notify request "
+                              "for process %s (%d, %d) "
+                              "to target node %s, nid=%d\n"
+                            , method_name
+                            , targetProcess->GetName()
+                            , targetProcess->GetNid()
+                            , targetProcess->GetPid()
+                            , targetLNode->GetNode()->GetName()
+                            , targetLNode->GetNid() );
+                    mon_log_write(MON_PROCESS_PROCEXITUNREGALL_1, SQ_LOG_ERR, la_buf);
                 }
             }
             
@@ -726,6 +747,7 @@ void CProcess::procExitUnregAll ( _TM_Txid_External transId )
 }
 #endif
 
+#ifndef NAMESERVER_PROCESS
 void CProcess::childAdd ( int nid, int pid )
 {
     const char method_name[] = "CProcess::childAdd";
@@ -795,6 +817,81 @@ bool CProcess::childRemoveFirst ( nidPid_t & child)
     return result;
 }
 
+void CProcess::childUnHookedAdd( int nid, int pid )
+{
+    const char method_name[] = "CProcess::childUnHookedAdd";
+    TRACE_ENTRY;
+
+    if (trace_settings & (TRACE_REQUEST_DETAIL | TRACE_PROCESS_DETAIL))
+        trace_printf( "%s@%d adding unhooked child (%d:%d)\n"
+                    , method_name, __LINE__
+                    , nid, pid );
+
+    nidPid_t child = { nid, pid };
+    childrenListLock_.lock();
+    childrenUnHooked_.push_back ( child );
+    childrenListLock_.unlock();
+
+    TRACE_EXIT;
+}
+
+int CProcess::childUnHookedCount( void )
+{
+    const char method_name[] = "CProcess::childUnHookedCount";
+    TRACE_ENTRY;
+
+    childrenListLock_.lock();
+    int count = childrenUnHooked_.size();
+    childrenListLock_.unlock();
+
+    TRACE_EXIT;
+    return(count);
+}
+
+void CProcess::childUnHookedRemove( int nid, int pid )
+{
+    const char method_name[] = "CProcess::childUnHookedRemove";
+    TRACE_ENTRY;
+
+    nidPidList_t::iterator it;
+
+    childrenListLock_.lock();
+    for ( it = childrenUnHooked_.begin(); it != childrenUnHooked_.end(); ++it)
+    {
+        if (it->nid == nid && it->pid == pid )
+        {
+            childrenUnHooked_.erase ( it );
+            break;
+        }
+    }
+    childrenListLock_.unlock();
+
+    TRACE_EXIT;
+}
+
+bool CProcess::childUnHookedRemoveFirst( nidPid_t & child)
+{
+    const char method_name[] = "CProcess::childUnHookedRemoveFirst";
+    TRACE_ENTRY;
+
+    bool result = false;
+
+    childrenListLock_.lock();
+    if ( !childrenUnHooked_.empty() )
+    {
+        child = childrenUnHooked_.front ();
+        childrenUnHooked_.pop_front ();
+        result = true;
+
+    }
+    childrenListLock_.unlock();
+
+    TRACE_EXIT;
+
+    return result;
+}
+#endif
+
 #ifndef NAMESERVER_PROCESS
 void CProcess::CompleteDump(DUMPSTATUS status, char *core_file)
 {
@@ -882,7 +979,16 @@ void CProcess::CompleteProcessStartup (char *port, int os_pid, bool event_messag
                     rc = NameServer->ProcessNew(this); // in reqQueue thread (CExtStartupReq)
                     if (rc)
                     {
-                        // TODO: Error handling
+                        char la_buf[MON_STRING_BUF_SIZE];
+                        snprintf( la_buf, sizeof(la_buf)
+                                , "[%s] - Can't register new process "
+                                  "%s (%d, %d) "
+                                  "to Name Server process\n"
+                                , method_name
+                                , GetName()
+                                , GetNid()
+                                , GetPid() );
+                        mon_log_write(MON_PROCESS_COMPLETESTARTUP_1, SQ_LOG_ERR, la_buf);
                     }
 
                     if (Parent_Nid != -1)
@@ -893,14 +999,22 @@ void CProcess::CompleteProcessStartup (char *port, int os_pid, bool event_messag
                             rc = PtpClient->ProcessClone(this);
                             if (rc)
                             {
-                                // TODO: Error handling
+                                char la_buf[MON_STRING_BUF_SIZE];
+                                CLNode *parentLNode = NULL;
+                                parentLNode = Nodes->GetLNode( GetParentNid() );
+                                snprintf( la_buf, sizeof(la_buf)
+                                        , "[%s] - Can't send process clone request"
+                                          "for process %s (%d, %d) "
+                                          "to parent node %s, nid=%d\n"
+                                        , method_name
+                                        , GetName()
+                                        , GetNid()
+                                        , GetPid()
+                                        , parentLNode->GetNode()->GetName()
+                                        , parentLNode->GetNid() );
+                                mon_log_write(MON_PROCESS_COMPLETESTARTUP_2, SQ_LOG_ERR, la_buf);
                             }
                         }
-                        else
-                        {
-                            // TODO: Generate internal clone request?
-                            //       to update local parent?  
-                        }
                     }
                 }
                 else
@@ -929,14 +1043,22 @@ void CProcess::CompleteProcessStartup (char *port, int os_pid, bool event_messag
                         rc = PtpClient->ProcessClone(this);
                         if (rc)
                         {
-                            // TODO: Error handling
+                            char la_buf[MON_STRING_BUF_SIZE];
+                            CLNode *parentLNode = NULL;
+                            parentLNode = Nodes->GetLNode( GetParentNid() );
+                            snprintf( la_buf, sizeof(la_buf)
+                                    , "[%s] - Can't send process clone request"
+                                      "for process %s (%d, %d) "
+                                      "to parent node %s, nid=%d\n"
+                                    , method_name
+                                    , GetName()
+                                    , GetNid()
+                                    , GetPid()
+                                    , parentLNode->GetNode()->GetName()
+                                    , parentLNode->GetNid() );
+                            mon_log_write(MON_PROCESS_COMPLETESTARTUP_3, SQ_LOG_ERR, la_buf);
                         }
                     }
-                    else
-                    {
-                        // TODO: Generate internal clone request?
-                        //       to update local parent?  
-                    }
                 }
             }
             else
@@ -1616,6 +1738,7 @@ bool CProcess::Create (CProcess *parent, void* tag, int & result)
     int i;
     int j;
     int rc = -1;
+    int rc2 = -1;
     char *env;
     char **argv;
     char *childEnv[MAX_CHILD_ENV_VARS + 1];
@@ -2339,10 +2462,27 @@ bool CProcess::Create (CProcess *parent, void* tag, int & result)
             // Send actual pid and process name back to parent
             // STDIO Redirection requires that clone process in parent node
             // have the actual pid
-            PtpClient->ProcessInit( this
-                                  , tag
-                                  , 0
-                                  , parent->Nid );
+            rc2 = PtpClient->ProcessInit( this
+                                        , tag
+                                        , 0
+                                        , parent->Nid );
+            if (rc2)
+            {
+                char la_buf[MON_STRING_BUF_SIZE];
+                CLNode *parentLNode = NULL;
+                parentLNode = Nodes->GetLNode( parent->Nid );
+                snprintf( la_buf, sizeof(la_buf)
+                        , "[%s] - Can't send process create success "
+                          "for process %s (%d, %d) "
+                          "to parent node %s, nid=%d\n"
+                        , method_name
+                        , GetName()
+                        , GetNid()
+                        , GetPid()
+                        , parentLNode->GetNode()->GetName()
+                        , parentLNode->GetNid() );
+                mon_log_write(MON_PROCESS_CREATE_12, SQ_LOG_ERR, la_buf);
+            }
         }
 
         if (trace_settings & (TRACE_PROCESS | TRACE_REDIRECTION))
@@ -2708,6 +2848,31 @@ bool CProcess::Create (CProcess *parent, void* tag, int & result)
         successful = false;
         result = MPI_ERR_SPAWN;
 
+        if (NameServerEnabled)
+        {
+            rc2 = PtpClient->ProcessInit( this
+                                        , tag
+                                        , result
+                                        , parent->Nid );
+            if (rc2)
+            {
+                char la_buf[MON_STRING_BUF_SIZE];
+                CLNode *parentLNode = NULL;
+                parentLNode = Nodes->GetLNode( parent->Nid );
+                snprintf( la_buf, sizeof(la_buf)
+                        , "[%s] - Can't send process create failure "
+                          "for process %s (%d, %d) "
+                          "result to parent node %s, nid=%d, result=%d\n"
+                        , method_name
+                        , GetName()
+                        , GetNid()
+                        , GetPid()
+                        , parentLNode->GetNode()->GetName()
+                        , parentLNode->GetNid(), result );
+                mon_log_write(MON_PROCESS_CREATE_13, SQ_LOG_ERR, la_buf);
+            }
+        }
+
         char buf[MON_STRING_BUF_SIZE];
         snprintf(buf, sizeof(buf), "[CProcess::Create], Failed to start process %s path= %s.\n", Name, path.c_str());
         mon_log_write(MON_PROCESS_CREATE_11, SQ_LOG_ERR, buf);
@@ -3231,6 +3396,10 @@ void CProcess::Exit( CProcess *parent )
         if ( (parent != NULL) && (parent->GetState() == State_Up) )
         {
             parent->childRemove( Nid, Pid);
+            if (NameServerEnabled)
+            {
+                parent->childUnHookedRemove( Nid, Pid);
+            }
         }
 
         // Check if we need to output a entry into the process id map log file
@@ -3364,7 +3533,18 @@ void CProcess::Exit( CProcess *parent )
                                                , targetLNode->GetNode()->GetName() );
                 if (rc)
                 {
-                    // TODO: Error handling
+                    char la_buf[MON_STRING_BUF_SIZE];
+                    snprintf( la_buf, sizeof(la_buf)
+                            , "[%s] - Can't send process exit "
+                              "for process %s (%d, %d) "
+                              "to parent node %s, nid=%d\n"
+                            , method_name
+                            , GetName()
+                            , GetNid()
+                            , GetPid()
+                            , targetLNode->GetNode()->GetName()
+                            , targetLNode->GetNid() );
+                    mon_log_write(MON_PROCESS_PROCEXIT_1, SQ_LOG_ERR, la_buf);
                 }
             }
         }
@@ -3380,7 +3560,18 @@ void CProcess::Exit( CProcess *parent )
                                                , targetLNode->GetNode()->GetName() );
                 if (rc)
                 {
-                    // TODO: Error handling
+                    char la_buf[MON_STRING_BUF_SIZE];
+                    snprintf( la_buf, sizeof(la_buf)
+                            , "[%s] - Can't send process exit "
+                              "for process %s (%d, %d) "
+                              "to parent node %s, nid=%d\n"
+                            , method_name
+                            , GetName()
+                            , GetNid()
+                            , GetPid()
+                            , targetLNode->GetNode()->GetName()
+                            , targetLNode->GetNid() );
+                    mon_log_write(MON_PROCESS_PROCEXIT_2, SQ_LOG_ERR, la_buf);
                 }
             }
         }
@@ -4703,18 +4894,36 @@ void CProcessContainer::Child_Exit ( CProcess * parent )
                 {
                     if (NameServerEnabled)
                     {
-                        CNode  *childNode = NULL;
-                        childNode = childNode->GetNode();
-                        // Forward the process create to the target node
-                        PtpClient->ProcessKill( process
-                                              , process->GetAbort()
-                                              , childLNode->GetNid()
-                                              , childNode->GetName());
+                        CNode* childNode = childLNode->GetNode();
+                        // Forward the process kill to the target node
+                        int rc = PtpClient->ProcessKill( process
+                                                       , process->GetAbort()
+                                                       , childLNode->GetNid()
+                                                       , childNode->GetName() );
+                        if (rc)
+                        {
+                            char la_buf[MON_STRING_BUF_SIZE];
+                            snprintf( la_buf, sizeof(la_buf)
+                                    , "[%s] - Can't send process kill "
+                                      "request for child process %s (%d, %d) "
+                                      "to child node %s, nid=%d\n"
+                                    , method_name
+                                    , process->GetName()
+                                    , process->GetNid()
+                                    , process->GetPid()
+                                    , childNode->GetName()
+                                    , childLNode->GetNid() );
+                            mon_log_write(MON_PROCESSCONT_CHILDEXIT_1, SQ_LOG_ERR, la_buf);
+                        }
                     }
                 }
                 
                 if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS))
-                    trace_printf("%s@%d - Completed kill for child process %s (%d, %d)\n", method_name, __LINE__, process->GetName(), process->GetNid(), process->GetPid());
+                    trace_printf( "%s@%d - Completed kill for child process %s (%d, %d)\n"
+                                , method_name, __LINE__
+                                , process->GetName()
+                                , process->GetNid()
+                                , process->GetPid());
             }
             else
             {
@@ -4736,6 +4945,90 @@ void CProcessContainer::Child_Exit ( CProcess * parent )
     }
     TRACE_EXIT;
 }
+
+// Tells the node of every unhooked child of 'parent' that the parent exited,
+// by forwarding the parent's exit with PtpClient->ProcessExit(). Does nothing
+// unless the Name Server is enabled, 'parent' is a non-clone process and
+// MyNode is not in shutdown. Children with no process object are skipped.
+void CProcessContainer::ChildUnHooked_Exit( CProcess* parent )
+{
+    const char method_name[] = "CProcessContainer::ChildUnHooked_Exit";
+    TRACE_ENTRY;
+
+    // 'parent' is null-checked further down, so guard this trace as well
+    if (parent && (trace_settings & (TRACE_REQUEST | TRACE_PROCESS)))
+        trace_printf( "%s@%d with parent %s (%d,%d:%d)\n"
+                    , method_name, __LINE__
+                    , parent->GetName()
+                    , parent->GetNid()
+                    , parent->GetPid()
+                    , parent->GetVerifier() );
+
+    if (NameServerEnabled)
+    {
+        if ( parent && !parent->IsClone()
+           && ((MyNode->GetState() != State_Shutdown
+             && MyNode->GetShutdownLevel() == ShutdownLevel_Undefined)) )
+        {
+            CProcess::nidPid_t child;
+            while ( parent->childUnHookedRemoveFirst( child ))
+            {
+                CLNode* childLNode = Nodes->GetLNode( child.nid );
+                CProcess* process = (childLNode != NULL )
+                             ? childLNode->GetNode()->GetProcess( child.pid ) : NULL;
+                if (process)
+                {
+                    if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS))
+                        trace_printf( "%s@%d - Telling unhooked child process %s (%d,%d:%d) "
+                                      "of parent death %s (%d,%d:%d)\n"
+                                    , method_name, __LINE__
+                                    , process->GetName()
+                                    , process->GetNid()
+                                    , process->GetPid()
+                                    , process->GetVerifier()
+                                    , parent->GetName()
+                                    , parent->GetNid()
+                                    , parent->GetPid()
+                                    , parent->GetVerifier() );
+
+                    CNode* childNode = childLNode->GetNode();
+                    // Forward the parent's process exit to the child's node
+                    int rc = PtpClient->ProcessExit( parent
+                                                   , childLNode->GetNid()
+                                                   , childNode->GetName() );
+                    if (rc)
+                    {
+                        // NOTE(review): event id is also used by Child_Exit - confirm intended
+                        char la_buf[MON_STRING_BUF_SIZE];
+                        snprintf( la_buf, sizeof(la_buf)
+                                , "[%s] - Can't send process exit "
+                                  "request for parent process %s (%d,%d:%d) "
+                                  "to child's node %s, nid=%d\n"
+                                , method_name
+                                , parent->GetName()
+                                , parent->GetNid()
+                                , parent->GetPid()
+                                , parent->GetVerifier()
+                                , childNode->GetName()
+                                , childLNode->GetNid() );
+                        mon_log_write(MON_PROCESSCONT_CHILDEXIT_1, SQ_LOG_ERR, la_buf);
+                    }
+                    else
+                    {
+                        if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS))
+                            trace_printf( "%s@%d - Completed kill for parent process %s (%d,%d:%d)\n"
+                                        , method_name, __LINE__
+                                        , parent->GetName()
+                                        , parent->GetNid()
+                                        , parent->GetPid()
+                                        , parent->GetVerifier() );
+                    }
+                }
+            }
+        }
+    }
+    TRACE_EXIT;
+}
 #endif
 
 void CProcessContainer::CleanUpProcesses( void )
@@ -4922,7 +5215,6 @@ CProcess *CProcessContainer::CompleteProcessStartup (char *process_name,
                 // exits abnormally.
                 int parentNid;
                 int parentPid;
-                CProcess * parent;
                 if ( ! process->IsBackup() )
                 {
                     parentNid = process->GetParentNid();
@@ -4934,8 +5226,10 @@ CProcess *CProcessContainer::CompleteProcessStartup (char *process_name,
                     parentPid = process->GetPairParentPid();
                 }
 
+#ifndef NAMESERVER_PROCESS
                 if ( parentNid != -1 && parentPid != -1 )
                 {
+                    CProcess* parent;
                     parent = Nodes->GetLNode ( parentNid )
                                 ->GetProcessL( parentPid );
                     if ( parent && !process->IsBackup() )
@@ -4945,7 +5239,43 @@ CProcess *CProcessContainer::CompleteProcessStartup (char *process_name,
                         parent->childAdd ( process->GetNid(), os_pid );
                     }
                 }
+#endif
+            }
+#ifndef NAMESERVER_PROCESS
+            if (NameServerEnabled)
+            {
+                if (process->IsUnhooked())
+                {   // Parent process object keeps track of child processes
+                    // created. Needed when parent process exits to clean up
+                    // parent clone process object in remote nodes.
+                    int parentNid;
+                    int parentPid;
+                    CProcess* parent;
+                    if ( !process->IsBackup() )
+                    {
+                        parentNid = process->GetParentNid();
+                        parentPid = process->GetParentPid();
+                    }
+                    else
+                    {
+                        parentNid = process->GetPairParentNid();
+                        parentPid = process->GetPairParentPid();
+                    }
+    
+                    if ( parentNid != -1 && parentPid != -1 )
+                    {
+                        parent = Nodes->GetLNode(parentNid)->GetProcessL(parentPid);
+                        if ( parent && !parent->IsClone() && !process->IsBackup() )
+                        {
+                            parent->childUnHookedRemove( process->GetNid()
+                                                       , process->GetPid() );
+                            parent->childUnHookedAdd( process->GetNid()
+                                                    , os_pid );
+                        }
+                    }
+                }
             }
+#endif
             // Process id changed from when we started the process.  So
             // remap using the new pid.  [This could happen if, for example,
             // a shell script was the originally started process and it
@@ -5366,6 +5696,14 @@ void CProcessContainer::Exit_Process (CProcess *process, bool abend, int downNod
             Child_Exit(process);
         }
 
+        if (!process->IsClone() && NameServerEnabled)
+        {
+            if (process->childUnHookedCount() > 0)
+            {
+                ChildUnHooked_Exit(process);
+            }
+        }
+    
         if ( parent == NULL)
         {
             parent = Nodes->GetProcess( process->GetParentNid(),

http://git-wip-us.apache.org/repos/asf/trafodion/blob/65ac5563/core/sqf/monitor/linux/process.h
----------------------------------------------------------------------
diff --git a/core/sqf/monitor/linux/process.h b/core/sqf/monitor/linux/process.h
index 3c813bb..3cde3e5 100644
--- a/core/sqf/monitor/linux/process.h
+++ b/core/sqf/monitor/linux/process.h
@@ -73,6 +73,7 @@ class CProcessContainer
                                 , _TM_Txid_External trans_id );
 #ifndef NAMESERVER_PROCESS
     void Child_Exit ( CProcess * parent );
+    void ChildUnHooked_Exit ( CProcess * parent );
 #endif
     void CleanUpProcesses( void );
     CProcess *CloneProcess( int nid, 
@@ -428,11 +429,18 @@ class CProcess
     void SetHangupTime () { clock_gettime(CLOCK_REALTIME, &hangupTime_); }
     time_t GetHangupTime () { return hangupTime_.tv_sec; }
 
+#ifndef NAMESERVER_PROCESS
     void childAdd ( int nid, int pid );
     int childCount ( void );
     void childRemove ( int nid, int pid );
     bool childRemoveFirst ( nidPid_t & child );
 
+    void childUnHookedAdd( int nid, int pid );
+    int childUnHookedCount( void );
+    void childUnHookedRemove( int nid, int pid );
+    bool childUnHookedRemoveFirst( nidPid_t & child );
+#endif
+
     struct message_def * GetDeathNotice ( void );
     void PutDeathNotice( struct message_def * );
 
@@ -558,17 +566,20 @@ private:
 
     enum  { MAX_CHILD_ENV_VARS = 300 };
 
+#ifndef NAMESERVER_PROCESS
     // Container to keep track of this process' children created on
     // the local node.  Needed because if this process abornmally terminates
     // the children will be terminated too.
     typedef list<nidPid_t> nidPidList_t;
     nidPidList_t children_;
+    nidPidList_t childrenUnHooked_;   // only used with Name Server enabled
 
     // Lock for children_ list.   Temporarily using a lock but should 
     // be able to eliminate for better performance.   Once lioCleanupThread
     // and syncThread uniformly queue requests to be processed by worker
     // thread this lock should not be necessary.
     CLock       childrenListLock_;
+#endif
 
     // Container to hold dead process info to be sent as death notices
     // to an ssmp process.   This is a NULL list except when the CProcess

http://git-wip-us.apache.org/repos/asf/trafodion/blob/65ac5563/core/sqf/monitor/linux/reqkill.cxx
----------------------------------------------------------------------
diff --git a/core/sqf/monitor/linux/reqkill.cxx b/core/sqf/monitor/linux/reqkill.cxx
index e8cad71..b59cae2 100644
--- a/core/sqf/monitor/linux/reqkill.cxx
+++ b/core/sqf/monitor/linux/reqkill.cxx
@@ -94,10 +94,25 @@ void CExtKillReq::Kill( CProcess *process )
         if (NameServerEnabled)
         {
             // Forward the process create to the target node
-            PtpClient->ProcessKill( process
-                                  , process->GetAbort()
-                                  , lnode->GetNid()
-                                  , node->GetName());
+            int rc = PtpClient->ProcessKill( process
+                                           , process->GetAbort()
+                                           , lnode->GetNid()
+                                           , node->GetName());
+            if (rc)
+            {
+                char la_buf[MON_STRING_BUF_SIZE];
+                snprintf( la_buf, sizeof(la_buf)
+                        , "[%s] - Can't send process kill "
+                          "request for child process %s (%d, %d) "
+                          "to child node %s, nid=%d\n"
+                        , method_name
+                        , process->GetName()
+                        , process->GetNid()
+                        , process->GetPid()
+                        , node->GetName()
+                        , lnode->GetNid() );
+                mon_log_write(MON_REQ_KILL_1, SQ_LOG_ERR, la_buf);
+            }
         }
         else
         {

http://git-wip-us.apache.org/repos/asf/trafodion/blob/65ac5563/core/sqf/monitor/linux/reqnewproc.cxx
----------------------------------------------------------------------
diff --git a/core/sqf/monitor/linux/reqnewproc.cxx b/core/sqf/monitor/linux/reqnewproc.cxx
index a6da01e..972d785 100644
--- a/core/sqf/monitor/linux/reqnewproc.cxx
+++ b/core/sqf/monitor/linux/reqnewproc.cxx
@@ -532,9 +532,24 @@ void CExtNewProcReq::performRequest()
                     if (NameServerEnabled)
                     {
                         // Forward the process create to the target node
-                        PtpClient->ProcessNew( process
-                                             , lnode->GetNid()
-                                             , lnode->GetNode()->GetName());
+                        int rc = PtpClient->ProcessNew( process
+                                                      , lnode->GetNid()
+                                                      , lnode->GetNode()->GetName());
+                        if (rc)
+                        {
+                            char la_buf[MON_STRING_BUF_SIZE];
+                            snprintf( la_buf, sizeof(la_buf)
+                                    , "[%s] - Can't send process create "
+                                      "request for process %s (%d, %d) "
+                                      "to target node %s, nid=%d\n"
+                                    , method_name
+                                    , process->GetName()
+                                    , process->GetNid()
+                                    , process->GetPid()
+                                    , lnode->GetNode()->GetName()
+                                    , lnode->GetNid() );
+                            mon_log_write(MON_MONITOR_STARTPROCESS_15, SQ_LOG_ERR, la_buf);
+                        }
                     }
                     else
 #endif

http://git-wip-us.apache.org/repos/asf/trafodion/blob/65ac5563/core/sqf/monitor/linux/reqnotify.cxx
----------------------------------------------------------------------
diff --git a/core/sqf/monitor/linux/reqnotify.cxx b/core/sqf/monitor/linux/reqnotify.cxx
index 5900f01..4d278ce 100644
--- a/core/sqf/monitor/linux/reqnotify.cxx
+++ b/core/sqf/monitor/linux/reqnotify.cxx
@@ -274,16 +274,6 @@ void CExtNotifyReq::performRequest()
                 } 
                 else
                 {
-#if 0
-                    if ( msg_->u.request.u.notify.cancel )
-                    {   // Unregister interest in death of target process 
-                        status = targetProcess->CancelDeathNotification( nid_
-                                                                       , pid
-                                                                       , verifier_
-                                                                       , msg_->u.request.u.notify.trans_id);
-                    }
-                    else if (sourceProcess)
-#endif
                     if (sourceProcess)
                     {   // Register interest in death of target process 
                         if (NameServerEnabled && targetProcess->IsClone())
@@ -304,7 +294,18 @@ void CExtNotifyReq::performRequest()
                                                          , targetLNode->GetNode()->GetName() );
                             if (rc)
                             {
-                                // TODO: Error handling
+                                char la_buf[MON_STRING_BUF_SIZE];
+                                snprintf( la_buf, sizeof(la_buf)
+                                        , "[%s] - Can't send process notify request "
+                                          "for process %s (%d, %d) "
+                                          "to target node %s, nid=%d\n"
+                                        , method_name
+                                        , sourceProcess->GetName()
+                                        , sourceProcess->GetNid()
+                                        , sourceProcess->GetPid()
+                                        , targetLNode->GetNode()->GetName()
+                                        , targetLNode->GetNid() );
+                                mon_log_write(MON_REQ_NOTIFY_1, SQ_LOG_ERR, la_buf);
                             }
                         }
                         

http://git-wip-us.apache.org/repos/asf/trafodion/blob/65ac5563/core/sqf/monitor/linux/reqprocinfo.cxx
----------------------------------------------------------------------
diff --git a/core/sqf/monitor/linux/reqprocinfo.cxx b/core/sqf/monitor/linux/reqprocinfo.cxx
index 84dc3a7..d3f04e2 100644
--- a/core/sqf/monitor/linux/reqprocinfo.cxx
+++ b/core/sqf/monitor/linux/reqprocinfo.cxx
@@ -44,7 +44,7 @@ extern CNameServer *NameServer;
 // Copy information for a specific process into the reply message buffer.
 void CExtProcInfoBase::ProcessInfo_CopyData(CProcess *process, ProcessInfoState &procState)
 {
-    const char method_name[] = "CNameServer::SendReceive";
+    const char method_name[] = "CExtProcInfoBase::ProcessInfo_CopyData";
     CProcess *parent;
 
     TRACE_ENTRY;
@@ -356,7 +356,17 @@ void CExtProcInfoReq::performRequest()
     }
 
     if ( NameServerEnabled && !getMonitorInfo )
-        NameServer->ProcessInfo(msg_); // in reqQueue thread (CExternalReq)
+    {
+        int rc = NameServer->ProcessInfo(msg_); // in reqQueue thread (CExternalReq)
+        if (rc)
+        {
+            char la_buf[MON_STRING_BUF_SIZE];
+            snprintf( la_buf, sizeof(la_buf)
+                    , "[%s] - Process info request to Name Server failed\n"
+                    , method_name );
+            mon_log_write(MON_REQ_PROCINFO_1, SQ_LOG_ERR, la_buf);
+        }
+    }
 #endif
 
 #ifndef NAMESERVER_PROCESS
@@ -642,7 +652,17 @@ void CExtProcInfoContReq::performRequest()
     }
 
     if ( NameServerEnabled && !getMonitorInfo )
-        NameServer->ProcessInfoCont(msg_); // in reqQueue thread (CExternalReq)
+    {
+        int rc = NameServer->ProcessInfoCont(msg_); // in reqQueue thread (CExternalReq)
+        if (rc)
+        {
+            char la_buf[MON_STRING_BUF_SIZE];
+            snprintf( la_buf, sizeof(la_buf)
+                    , "[%s] - Process info continue request to Name Server failed\n"
+                    , method_name );
+            mon_log_write(MON_REQ_PROCINFOCONT_1, SQ_LOG_ERR, la_buf);
+        }
+    }
 #endif
 
 #ifndef NAMESERVER_PROCESS

http://git-wip-us.apache.org/repos/asf/trafodion/blob/65ac5563/core/sqf/monitor/linux/reqqueue.cxx
----------------------------------------------------------------------
diff --git a/core/sqf/monitor/linux/reqqueue.cxx b/core/sqf/monitor/linux/reqqueue.cxx
index 508a325..c7af4ca 100644
--- a/core/sqf/monitor/linux/reqqueue.cxx
+++ b/core/sqf/monitor/linux/reqqueue.cxx
@@ -1577,7 +1577,7 @@ void CIntNewProcReq::performRequest()
         else
         {
             if (NameServerEnabled)
-            { // Name Server find by nid,pid:verifier
+            {
                 if (trace_settings & TRACE_REQUEST)
                     trace_printf( "%s@%d" " - Getting parent process from Name Server (%d,%d:%d)\n"
                                 , method_name, __LINE__
@@ -1656,14 +1656,7 @@ void CIntNewProcReq::performRequest()
             {
                 // Process creation failure, relay error code to node
                 // that requested process creation.
-                if (NameServerEnabled)
-                {
-                    PtpClient->ProcessInit( newProcess
-                                          , reqTag_
-                                          , result
-                                          , parentNid_ );
-                }
-                else
+                if (!NameServerEnabled)
                 {
                     CReplProcInit *repl = new CReplProcInit(newProcess, reqTag_,
                                                             result, parentNid_);
@@ -2285,9 +2278,10 @@ void CIntProcInitReq::performRequest()
         Nodes->GetLNode( process_->GetNid() )->GetNode()->AddToPidMap(process_->GetPid(), process_);
         Nodes->GetLNode( process_->GetNid() )->GetNode()->AddToNameMap(process_);
 
+        CProcess* parent;
+
         if (process_->IsBackup())
         {
-            CProcess * parent;
             parent = Nodes->GetProcess(process_->GetParentNid(),
                                        process_->GetParentPid(), false);
             if (parent)
@@ -2295,53 +2289,59 @@ void CIntProcInitReq::performRequest()
                 // this backup process object.
                 if (trace_settings & (TRACE_SYNC | TRACE_PROCESS))
                 {
-                    trace_printf("%s@%d - For backup process (%d, %d)"
-                                 ", for parent (%d, %d) setting "
-                                 "parent's Parent_Nid/Parent_Pid="
-                                 "(%d, %d).\n",
-                                 method_name, __LINE__,  process_->GetNid(),
-                                 process_->GetPid(), parent->GetNid(),
-                                 parent->GetPid(),
-                                 process_->GetNid(), process_->GetPid());
+                    trace_printf( "%s@%d - For backup process %s (%d,%d:%d)"
+                                  ", for parent %s (%d,%d:%d) setting "
+                                  "parent's Parent_Nid/Parent_Pid="
+                                  "(%d,%d).\n"
+                                , method_name, __LINE__
+                                , process_->GetName()
+                                , process_->GetNid()
+                                , process_->GetPid()
+                                , process_->GetVerifier()
+                                , parent->GetName()
+                                , parent->GetNid()
+                                , parent->GetPid()
+                                , parent->GetVerifier()
+                                , process_->GetNid()
+                                , process_->GetPid());
                 }
                 parent->SetParentNid ( process_->GetNid() );
                 parent->SetParentPid ( process_->GetPid() );
             }
         }
-
-
-#ifdef QUICK_WAITED_NEWPROCESS_REPLY
-// Following allows reply to a "waited" new process request before we
-// get the "startup" message from the process.   This make the process
-// creation appear to complete more quickly.   However there are potential
-// problems if the requester immediately tries to open the new process
-// because it is not ready yet.   So need to handle quick "open" of this
-// type before re-enabling this code section.
-                if (!process->IsNowait())
-                {   // new process request was a "waited" request
-                    if (process->GetParentNid() == -1)
-                    {
-                        parent = NULL;
-                    }
-                    else
-                    {
-                        parent =
-                            LNode[process->GetParentNid()]->
-                            GetProcessL(process->GetParentPid());
-                    }
-
-                    if (parent)
-                    {
-                        reply_msg = process->parentContext();
-                        if ( reply_msg )
-                        {
-                            // the parent gets a new_process reply
-                            parent->ReplyNewProcess ( reply_msg, process );
-
-                            process->parentContext (NULL);
-                        }
+#ifndef NAMESERVER_PROCESS
+        if (NameServerEnabled)
+        {
+            if (process_->IsUnhooked())
+            {
+                if ( process_->GetParentNid() != -1 && process_->GetParentPid() != -1 )
+                {
+                    parent = Nodes->GetProcess(process_->GetParentNid(),
+                                               process_->GetParentPid(), false);
+                    if (parent && !parent->IsClone())
+                    {   // Parent process object keeps track of child processes
+                        // created. Needed when parent process exits to clean up
+                        // parent clone process object in remote nodes.
+                        if (trace_settings & (TRACE_SYNC_DETAIL | TRACE_REQUEST_DETAIL
+                                              | TRACE_PROCESS_DETAIL))
+                            trace_printf( "%s@%d - Adding unhooked child process %s (%d,%d:%d) to "
+                                          "parent %s (%d,%d:%d)\n"
+                                        , method_name, __LINE__
+                                        , process_->GetName()
+                                        , process_->GetNid()
+                                        , process_->GetPid()
+                                        , process_->GetVerifier()
+                                        , parent->GetName()
+                                        , parent->GetNid()
+                                        , parent->GetPid()
+                                        , parent->GetVerifier() );
+            
+                        parent->childUnHookedAdd( process_->GetNid()
+                                                , process_->GetPid() );
                     }
                 }
+            }
+        }
 #endif
     }
 
@@ -2599,7 +2599,22 @@ void CIntChildDeathReq::performRequest()
         }
 #ifndef NAMESERVER_PROCESS
         if ( NameServerEnabled )
-            NameServer->ProcessDelete(process_); // in reqQueue thread (CIntChildDeathReq)
+        {
+            int rc = NameServer->ProcessDelete(process_); // in reqQueue thread (CIntChildDeathReq)
+            if (rc)
+            {
+                char la_buf[MON_STRING_BUF_SIZE];
+                snprintf( la_buf, sizeof(la_buf)
+                        , "[%s] - Process delete request to Name Server failed"
+                          "for child process %s (%d, %d:%d)\n"
+                        , method_name
+                        , process_->GetName()
+                        , process_->GetNid()
+                        , process_->GetPid()
+                        , process_->GetVerifier() );
+                mon_log_write(MON_INTREQ_CHILDDEATH_1, SQ_LOG_ERR, la_buf);
+            }
+        }
 #endif
         MyNode->DelFromNameMap ( process_ );
         MyNode->DelFromPidMap ( process_ );

http://git-wip-us.apache.org/repos/asf/trafodion/blob/65ac5563/core/sqf/monitor/linux/tmsync.cxx
----------------------------------------------------------------------
diff --git a/core/sqf/monitor/linux/tmsync.cxx b/core/sqf/monitor/linux/tmsync.cxx
index e6e3a76..548ae81 100644
--- a/core/sqf/monitor/linux/tmsync.cxx
+++ b/core/sqf/monitor/linux/tmsync.cxx
@@ -1010,6 +1010,23 @@ void CTmSync_Container::SendUnsolicitedMessages (void)
                     delete msg;
                     msg = NULL;
                 }
+                if (NameServerEnabled)
+                {
+                    if (!MyNode->IsMyNode( tm->GetNid() ))
+                    {
+                        if (trace_settings & (TRACE_INIT | TRACE_RECOVERY | TRACE_REQUEST | TRACE_SYNC | TRACE_TMSYNC))
+                        {
+                            trace_printf( "%s@%d - Deleting clone process %s, (%d,%d:%d)\n"
+                                        , method_name, __LINE__
+                                        , tm->GetName()
+                                        , tm->GetNid()
+                                        , tm->GetPid()
+                                        , tm->GetVerifier() );
+                        }
+                        Nodes->DeleteCloneProcess( tm );
+                    }
+                
+                }
             }
             else
             {


Mime
View raw message