hawq-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From y...@apache.org
Subject incubator-hawq git commit: HAWQ-363. Add guc variables for controlling FTS heartbeat interval and timeout
Date Tue, 26 Jan 2016 05:21:32 GMT
Repository: incubator-hawq
Updated Branches:
  refs/heads/master fd95c34b1 -> 379cb11eb


HAWQ-363. Add guc variables for controlling FTS heartbeat interval and timeout


Project: http://git-wip-us.apache.org/repos/asf/incubator-hawq/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-hawq/commit/379cb11e
Tree: http://git-wip-us.apache.org/repos/asf/incubator-hawq/tree/379cb11e
Diff: http://git-wip-us.apache.org/repos/asf/incubator-hawq/diff/379cb11e

Branch: refs/heads/master
Commit: 379cb11eb685674695efa7fdbdc5ae581af224e5
Parents: fd95c34
Author: YI JIN <yjin@pivotal.io>
Authored: Tue Jan 26 16:21:18 2016 +1100
Committer: YI JIN <yjin@pivotal.io>
Committed: Tue Jan 26 16:21:18 2016 +1100

----------------------------------------------------------------------
 src/backend/cdb/cdbvars.c                       | 16 ++++++++-
 .../communication/rmcomm_AsyncComm.c            |  1 +
 src/backend/resourcemanager/include/dynrm.h     |  1 +
 src/backend/resourcemanager/resourcemanager.c   | 15 +++++---
 .../resourcemanager/resourcemanager_RMSEG.c     | 25 +++++++------
 src/backend/utils/misc/guc.c                    | 38 ++++++++++++++++++++
 src/include/cdb/cdbvars.h                       |  5 +++
 7 files changed, 85 insertions(+), 16 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-hawq/blob/379cb11e/src/backend/cdb/cdbvars.c
----------------------------------------------------------------------
diff --git a/src/backend/cdb/cdbvars.c b/src/backend/cdb/cdbvars.c
index 8f2d8c8..ea5b466 100644
--- a/src/backend/cdb/cdbvars.c
+++ b/src/backend/cdb/cdbvars.c
@@ -346,7 +346,21 @@ int		rm_session_lease_heartbeat_interval;/* How many seconds to wait before
 int		rm_nocluster_timeout;				/* How many seconds to wait before
 											   getting enough number of available
 											   segments registered. */
-
+int		rm_segment_heartbeat_interval;		/* How many seconds to wait before
+											   sending another heart-beat
+											   from a segment to resource
+											   manager. */
+int		rm_segment_heartbeat_timeout;		/* How many seconds to wait before
+											   setting down a segment that does
+											   not have heart-beat sent
+											   successfully to resource
+											   manager. */
+int		rm_segment_config_refresh_interval; /* How many seconds to wait before
+											   refreshing the local segment
+											   configuration again. */
+int		rm_segment_tmpdir_detect_interval;	/* How many seconds to wait before
+											   checking the local temporary
+											   directories again. */
 int		rm_tolerate_nseg_limit;
 int		rm_rejectrequest_nseg_limit;
 int		rm_nvseg_variance_among_seg_limit;

http://git-wip-us.apache.org/repos/asf/incubator-hawq/blob/379cb11e/src/backend/resourcemanager/communication/rmcomm_AsyncComm.c
----------------------------------------------------------------------
diff --git a/src/backend/resourcemanager/communication/rmcomm_AsyncComm.c b/src/backend/resourcemanager/communication/rmcomm_AsyncComm.c
index 039a1cb..772adef 100644
--- a/src/backend/resourcemanager/communication/rmcomm_AsyncComm.c
+++ b/src/backend/resourcemanager/communication/rmcomm_AsyncComm.c
@@ -713,6 +713,7 @@ int registerAsyncConnectionFileDesc(const char				*sockpath,
 				   fd,
 				   errno);
 		 close(fd);
+		 res = UTIL_NETWORK_FAIL_CONNECT;
 	}
 
 exit:

http://git-wip-us.apache.org/repos/asf/incubator-hawq/blob/379cb11e/src/backend/resourcemanager/include/dynrm.h
----------------------------------------------------------------------
diff --git a/src/backend/resourcemanager/include/dynrm.h b/src/backend/resourcemanager/include/dynrm.h
index 103adbf..0cba28b 100644
--- a/src/backend/resourcemanager/include/dynrm.h
+++ b/src/backend/resourcemanager/include/dynrm.h
@@ -337,4 +337,5 @@ int  initializeSocketServer_RMSEG(void);
 int  MainHandlerLoop_RMSEG(void);
 int  MainHandler_RMSEGDummyLoop(void);
 
+void checkAndBuildFailedTmpDirList(void);
 #endif //DYNAMIC_RESOURCE_MANAGEMENT_H

http://git-wip-us.apache.org/repos/asf/incubator-hawq/blob/379cb11e/src/backend/resourcemanager/resourcemanager.c
----------------------------------------------------------------------
diff --git a/src/backend/resourcemanager/resourcemanager.c b/src/backend/resourcemanager/resourcemanager.c
index 6b8a9e2..819da83 100644
--- a/src/backend/resourcemanager/resourcemanager.c
+++ b/src/backend/resourcemanager/resourcemanager.c
@@ -613,7 +613,8 @@ int MainHandlerLoop(void)
         uint64_t curtime = gettime_microsec();
 		if ((rm_resourcepool_test_filename == NULL ||
 			rm_resourcepool_test_filename[0] == '\0') &&
-			(curtime - PRESPOOL->LastCheckTime > 10LL * SEGMENT_HEARTBEAT_INTERVAL))
+			(curtime - PRESPOOL->LastCheckTime >
+        	 1000000LL * rm_segment_heartbeat_timeout))
 		{
 			updateStatusOfAllNodes();
 			PRESPOOL->LastCheckTime = curtime;
@@ -2605,17 +2606,21 @@ void sendResponseToClients(void)
  * Check and set the nodes down that are not updated by IMAlive heart-beat for a
  * long time.
  */
-void updateStatusOfAllNodes() {
+void updateStatusOfAllNodes()
+{
 	SegResource node = NULL;
 	uint64_t curtime = 0;
 
 	bool changedstatus = false;
 	curtime = gettime_microsec();
-	for(uint32_t idx = 0; idx < PRESPOOL->SegmentIDCounter; idx++) {
+	for(uint32_t idx = 0; idx < PRESPOOL->SegmentIDCounter; idx++)
+	{
 	    node = getSegResource(idx);
         if (node != NULL &&
-            curtime - node->LastUpdateTime > 10LL * SEGMENT_HEARTBEAT_INTERVAL &&
-			IS_SEGSTAT_FTSAVAILABLE(node->Stat) ) {
+            (curtime - node->LastUpdateTime >
+			 1000000LL * rm_segment_heartbeat_timeout) &&
+			IS_SEGSTAT_FTSAVAILABLE(node->Stat) )
+        {
         	/*
         	 * This call makes resource manager able to adjust queue and mem/core
         	 * trackers' capacity.

http://git-wip-us.apache.org/repos/asf/incubator-hawq/blob/379cb11e/src/backend/resourcemanager/resourcemanager_RMSEG.c
----------------------------------------------------------------------
diff --git a/src/backend/resourcemanager/resourcemanager_RMSEG.c b/src/backend/resourcemanager/resourcemanager_RMSEG.c
index 9cbf63e..9a89199 100644
--- a/src/backend/resourcemanager/resourcemanager_RMSEG.c
+++ b/src/backend/resourcemanager/resourcemanager_RMSEG.c
@@ -150,9 +150,7 @@ int  initializeSocketServer_RMSEG(void)
 	return res;
 
 }
-#define SEGMENT_HEARTBEAT_INTERVAL (3LL * 1000000LL)
-#define SEGMENT_HOSTCHECK_INTERVAL (5LL * 1000000LL)
-#define SEGMENT_TMPDIRCHECK_INTERVAL (5 * 60LL * 1000000LL)
+
 int MainHandlerLoop_RMSEG(void)
 {
 	int 		res 	  = FUNC_RETURN_OK;
@@ -185,7 +183,8 @@ int MainHandlerLoop_RMSEG(void)
 		processSubmittedRequests();
 
 		if ( curtime - DRMGlobalInstance->TmpDirLastCheckTime >
-			SEGMENT_TMPDIRCHECK_INTERVAL ) {
+			1000000LL * rm_segment_tmpdir_detect_interval )
+		{
 			checkAndBuildFailedTmpDirList();
 			DRMGlobalInstance->TmpDirLastCheckTime = gettime_microsec();
 		}
@@ -195,15 +194,18 @@ int MainHandlerLoop_RMSEG(void)
 		curtime = gettime_microsec();
 		if ( DRMGlobalInstance->LocalHostStat == NULL ||
 			 curtime - DRMGlobalInstance->LocalHostLastUpdateTime >
-			 SEGMENT_HOSTCHECK_INTERVAL ) {
+			 1000000LL * rm_segment_config_refresh_interval )
+		{
 			refreshLocalHostInstance();
 			checkLocalPostmasterStatus();
 		}
 
-		if ( DRMGlobalInstance->SendIMAlive ) {
+		if ( DRMGlobalInstance->SendIMAlive )
+		{
 			 if (DRMGlobalInstance->LocalHostStat != NULL &&
 			     curtime - DRMGlobalInstance->HeartBeatLastSentTime >
-			     SEGMENT_HEARTBEAT_INTERVAL ) {
+				 1000000LL * rm_segment_heartbeat_interval )
+			 {
 				 sendIMAlive(&errorcode, errorbuf, sizeof(errorbuf));
 				 DRMGlobalInstance->HeartBeatLastSentTime = gettime_microsec();
 			 }
@@ -284,8 +286,9 @@ bool CheckTmpDirAvailable(char *path)
  * Check the status of each temporary directory,
  * and build a list of failed temporary directories.
  */
-void checkAndBuildFailedTmpDirList()
+void checkAndBuildFailedTmpDirList(void)
 {
+	uint64_t starttime = gettime_microsec();
 	destroyTmpDirList(DRMGlobalInstance->LocalHostFailedTmpDirList);
 	DRMGlobalInstance->LocalHostFailedTmpDirList = NULL;
 
@@ -297,6 +300,8 @@ void checkAndBuildFailedTmpDirList()
 					lappend(DRMGlobalInstance->LocalHostFailedTmpDirList, failedDir);
 		}
 	DQUEUE_LOOP_END
-
-	elog(LOG, "checkAndBuildFailedTmpDirList finish!");
+	uint64_t endtime = gettime_microsec();
+	elog(LOG, "checkAndBuildFailedTmpDirList finished checking temporary "
+			  "directory, which costs " UINT64_FORMAT " us",
+			  endtime - starttime);
 }

http://git-wip-us.apache.org/repos/asf/incubator-hawq/blob/379cb11e/src/backend/utils/misc/guc.c
----------------------------------------------------------------------
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c
index b75dcb0..605a29d 100644
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -6393,6 +6393,16 @@ static struct config_int ConfigureNamesInt[] =
 	},
 
 	{
+		{"hawq_rm_segment_heartbeat_timeout", PGC_POSTMASTER, RESOURCES_MGM,
+			gettext_noop("timeout for setting one segment down that having no heart-beat "
+						 "successfully received by resource manager."),
+			NULL
+		},
+		&rm_segment_heartbeat_timeout,
+		300, 1, 65535, NULL, NULL
+	},
+
+	{
 		{"hawq_rm_session_lease_heartbeat_interval", PGC_POSTMASTER, RESOURCES_MGM,
 			gettext_noop("interval for sending heart-beat to resource manager to keep "
 						 "resource context alive."),
@@ -6413,6 +6423,34 @@ static struct config_int ConfigureNamesInt[] =
 	},
 
 	{
+		{"hawq_rm_segment_heartbeat_interval", PGC_POSTMASTER, RESOURCES_MGM,
+			gettext_noop("interval for sending heart-beat to resource manager to keep "
+						 "segment alive and to present latest segment status."),
+			NULL
+		},
+		&rm_segment_heartbeat_interval,
+		30, 1, 65535, NULL, NULL
+	},
+
+	{
+		{"hawq_rm_segment_tmpdir_detect_interval", PGC_POSTMASTER, RESOURCES_MGM,
+			gettext_noop("interval for detecting segment local temporary directories."),
+			NULL
+		},
+		&rm_segment_tmpdir_detect_interval,
+		300, 60, 65535, NULL, NULL
+	},
+
+	{
+		{"hawq_rm_segment_config_refresh_interval", PGC_POSTMASTER, RESOURCES_MGM,
+			gettext_noop("interval for refreshing segment local host config."),
+			NULL
+		},
+		&rm_segment_config_refresh_interval,
+		30, 5, 65535, NULL, NULL
+	},
+
+	{
 		{"hawq_rm_tolerate_nseg_limit", PGC_POSTMASTER, RESOURCES_MGM,
 			gettext_noop("resource manager re-allocates resource if the number of exclusive "
 						 "segments is greater than this limit value when there is at least "

http://git-wip-us.apache.org/repos/asf/incubator-hawq/blob/379cb11e/src/include/cdb/cdbvars.h
----------------------------------------------------------------------
diff --git a/src/include/cdb/cdbvars.h b/src/include/cdb/cdbvars.h
index 076374c..bcd1d81 100644
--- a/src/include/cdb/cdbvars.h
+++ b/src/include/cdb/cdbvars.h
@@ -1183,8 +1183,13 @@ extern bool    rm_session_lease_heartbeat_enable;
 
 extern int 	   rm_resource_allocation_timeout;
 extern int	   rm_resource_timeout;
+extern int	   rm_segment_heartbeat_timeout;
 extern int	   rm_request_timeoutcheck_interval;
 extern int	   rm_session_lease_heartbeat_interval;
+extern int	   rm_segment_heartbeat_interval;
+extern int	   rm_segment_config_refresh_interval;
+extern int	   rm_segment_tmpdir_detect_interval;
+
 extern int	   rm_nocluster_timeout;
 extern int	   rm_tolerate_nseg_limit;
 extern int	   rm_rejectrequest_nseg_limit;


Mime
View raw message