hawq-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From w...@apache.org
Subject incubator-hawq git commit: HAWQ-362. Master RM should recalculate segment's allocation status if segment's RM process is restarted. Add a timestamp for segment's RM process and reports to Master RM.
Date Mon, 25 Jan 2016 03:24:06 GMT
Repository: incubator-hawq
Updated Branches:
  refs/heads/master 43ab8a35e -> df0261e51


HAWQ-362. Master RM should recalculate segment's allocation status if segment's RM process
is restarted.
          Add a timestamp for segment's RM process and report it to Master RM.


Project: http://git-wip-us.apache.org/repos/asf/incubator-hawq/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-hawq/commit/df0261e5
Tree: http://git-wip-us.apache.org/repos/asf/incubator-hawq/tree/df0261e5
Diff: http://git-wip-us.apache.org/repos/asf/incubator-hawq/diff/df0261e5

Branch: refs/heads/master
Commit: df0261e51053d1c90cfa427ef49957d668bda0a9
Parents: 43ab8a3
Author: Wen Lin <wlin@pivotal.io>
Authored: Mon Jan 25 11:21:01 2016 +0800
Committer: Wen Lin <wlin@pivotal.io>
Committed: Mon Jan 25 11:21:01 2016 +0800

----------------------------------------------------------------------
 .../communication/rmcomm_RMSEG2RM.c             |  1 +
 .../communication/rmcomm_RMSEG_RM_Protocol.h    |  1 +
 .../resourcemanager/include/resourcepool.h      |  1 +
 src/backend/resourcemanager/requesthandler.c    |  7 ++--
 .../resourcemanager/requesthandler_RMSEG.c      |  2 +-
 .../resourcemanager/resourcemanager_RMSEG.c     |  1 +
 src/backend/resourcemanager/resourcepool.c      | 34 ++++++++++++++++++--
 7 files changed, 40 insertions(+), 7 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-hawq/blob/df0261e5/src/backend/resourcemanager/communication/rmcomm_RMSEG2RM.c
----------------------------------------------------------------------
diff --git a/src/backend/resourcemanager/communication/rmcomm_RMSEG2RM.c b/src/backend/resourcemanager/communication/rmcomm_RMSEG2RM.c
index 9d4eca3..30256a4 100644
--- a/src/backend/resourcemanager/communication/rmcomm_RMSEG2RM.c
+++ b/src/backend/resourcemanager/communication/rmcomm_RMSEG2RM.c
@@ -118,6 +118,7 @@ int sendIMAlive(int  *errorcode,
 	requesthead.TmpDirCount 	  = getDQueueLength(&DRMGlobalInstance->LocalHostTempDirectories);
 	requesthead.TmpDirBrokenCount = DRMGlobalInstance->LocalHostStat->FailedTmpDirNum;
 	requesthead.Reserved		  = 0;
+	requesthead.RMStartTimestamp  = DRMGlobalInstance->ResourceManagerStartTime;
 
 	appendSMBVar(&tosend, requesthead);
 	appendSelfMaintainBuffer(&tosend,

http://git-wip-us.apache.org/repos/asf/incubator-hawq/blob/df0261e5/src/backend/resourcemanager/include/communication/rmcomm_RMSEG_RM_Protocol.h
----------------------------------------------------------------------
diff --git a/src/backend/resourcemanager/include/communication/rmcomm_RMSEG_RM_Protocol.h b/src/backend/resourcemanager/include/communication/rmcomm_RMSEG_RM_Protocol.h
index f866c9c..90a6843 100644
--- a/src/backend/resourcemanager/include/communication/rmcomm_RMSEG_RM_Protocol.h
+++ b/src/backend/resourcemanager/include/communication/rmcomm_RMSEG_RM_Protocol.h
@@ -46,6 +46,7 @@ RPC_PROTOCOL_STRUCT_BEGIN(RPCRequestHeadIMAlive)
 	uint16_t	TmpDirCount;
 	uint16_t	TmpDirBrokenCount;
 	uint32_t	Reserved;
+	uint64_t	RMStartTimestamp;
 RPC_PROTOCOL_STRUCT_END(RPCRequestHeadIMAlive)
 
 RPC_PROTOCOL_STRUCT_BEGIN(RPCResponseIMAlive)

http://git-wip-us.apache.org/repos/asf/incubator-hawq/blob/df0261e5/src/backend/resourcemanager/include/resourcepool.h
----------------------------------------------------------------------
diff --git a/src/backend/resourcemanager/include/resourcepool.h b/src/backend/resourcemanager/include/resourcepool.h
index 1829a64..d63a6cb 100644
--- a/src/backend/resourcemanager/include/resourcepool.h
+++ b/src/backend/resourcemanager/include/resourcepool.h
@@ -151,6 +151,7 @@ struct SegStatData {
 	uint32_t		FTSTotalCore;			/* FTS reports core capacity.	  */
 	uint32_t		GRMTotalMemoryMB;		/* GRM reports memory capacity.	  */
 	uint32_t		GRMTotalCore;			/* GRM reports core capacity. 	  */
+	uint64_t		RMStartTimestamp;		/* RM process reset timestamp */
 	SegInfoData		Info;					/* 64-bit aligned.				  */
 };
 

http://git-wip-us.apache.org/repos/asf/incubator-hawq/blob/df0261e5/src/backend/resourcemanager/requesthandler.c
----------------------------------------------------------------------
diff --git a/src/backend/resourcemanager/requesthandler.c b/src/backend/resourcemanager/requesthandler.c
index 6f5eba2..c6e9a34 100644
--- a/src/backend/resourcemanager/requesthandler.c
+++ b/src/backend/resourcemanager/requesthandler.c
@@ -747,12 +747,13 @@ bool handleRMSEGRequestIMAlive(void **arg)
 
 	destroySelfMaintainBuffer(&newseginfo);
 
-	newsegstat->ID 				= SEGSTAT_ID_INVALID;
-	newsegstat->GRMAvailable 	= RESOURCE_SEG_STATUS_UNSET;
+	newsegstat->ID 				 = SEGSTAT_ID_INVALID;
+	newsegstat->GRMAvailable 	 = RESOURCE_SEG_STATUS_UNSET;
 
 	RPCRequestHeadIMAlive header = SMBUFF_HEAD(RPCRequestHeadIMAlive,
 												&(conntrack->MessageBuff));
-	newsegstat->FailedTmpDirNum = header->TmpDirBrokenCount;
+	newsegstat->FailedTmpDirNum  = header->TmpDirBrokenCount;
+	newsegstat->RMStartTimestamp = header->RMStartTimestamp;
 
 	/*
 	 * Check if the there is any failed temporary directory on this segment.

http://git-wip-us.apache.org/repos/asf/incubator-hawq/blob/df0261e5/src/backend/resourcemanager/requesthandler_RMSEG.c
----------------------------------------------------------------------
diff --git a/src/backend/resourcemanager/requesthandler_RMSEG.c b/src/backend/resourcemanager/requesthandler_RMSEG.c
index b96b448..67424e9 100644
--- a/src/backend/resourcemanager/requesthandler_RMSEG.c
+++ b/src/backend/resourcemanager/requesthandler_RMSEG.c
@@ -162,7 +162,7 @@ int refreshLocalHostInstance(void)
 			if (strcmp(GET_SEGINFO_FAILEDTMPDIR(info), failedTmpDirStr.Str) != 0)
 			{
 				elog(LOG, "Segment resource manager finds failed temporary directory change "
-						  "from %s to %s", GET_SEGINFO_FAILEDTMPDIR(info), failedTmpDirStr.Str);
+						  "from '%s' to '%s'", GET_SEGINFO_FAILEDTMPDIR(info), failedTmpDirStr.Str);
 				shouldupdate = true;
 			}
 		}

http://git-wip-us.apache.org/repos/asf/incubator-hawq/blob/df0261e5/src/backend/resourcemanager/resourcemanager_RMSEG.c
----------------------------------------------------------------------
diff --git a/src/backend/resourcemanager/resourcemanager_RMSEG.c b/src/backend/resourcemanager/resourcemanager_RMSEG.c
index ffac70a..9cbf63e 100644
--- a/src/backend/resourcemanager/resourcemanager_RMSEG.c
+++ b/src/backend/resourcemanager/resourcemanager_RMSEG.c
@@ -160,6 +160,7 @@ int MainHandlerLoop_RMSEG(void)
 	int			errorcode = FUNC_RETURN_OK;
 	char		errorbuf[1024];
 
+	DRMGlobalInstance->ResourceManagerStartTime = gettime_microsec();
 	while( DRMGlobalInstance->ResManagerMainKeepRun ) {
 
 		if (!PostmasterIsAlive(true)) {

http://git-wip-us.apache.org/repos/asf/incubator-hawq/blob/df0261e5/src/backend/resourcemanager/resourcepool.c
----------------------------------------------------------------------
diff --git a/src/backend/resourcemanager/resourcepool.c b/src/backend/resourcemanager/resourcepool.c
index 38cacfa..918f401 100644
--- a/src/backend/resourcemanager/resourcepool.c
+++ b/src/backend/resourcemanager/resourcepool.c
@@ -893,6 +893,35 @@ int addHAWQSegWithSegStat(SegStat segstat, bool *capstatchanged)
 	else {
 		segresource = getSegResource(segid);
 		Assert(segresource != NULL);
+		uint8_t oldStatus = segresource->Stat->FTSAvailable;
+		bool statusChanged = oldStatus != segstat->FTSAvailable;
+
+		/*
+		 * Check if RM process is restarted in this segment.
+		 * If the latest reported RM process startup timestamp doesn't
+		 * match the previous, master RM consider segment's RM process
+		 * has restarted.
+		 * In rare case, the system's time is reset and segment's RM process
+		 * happen to get a same timestamp with previous one.
+		 */
+		if (segresource->Stat->RMStartTimestamp != segstat->RMStartTimestamp)
+		{
+			/*
+			 * This segment's RM process has restarted,
+			 * we should clean up old status, so mark it down.
+			 */
+			if (oldStatus == RESOURCE_SEG_STATUS_AVAILABLE && !statusChanged)
+			{
+				segstat->FTSAvailable = RESOURCE_SEG_STATUS_UNAVAILABLE;
+				statusChanged = true;
+			}
+			segresource->Stat->RMStartTimestamp = segstat->RMStartTimestamp;
+			elog(LOG, "Master RM finds segment:%s 's RM process has restarted. "
+					  "old status:%d, new status:%d",
+					  GET_SEGRESOURCE_HOSTNAME(segresource),
+					  oldStatus,
+					  segstat->FTSAvailable);
+		}
 
 		/* Check if temporary directory path is changed */
 		bool tmpDirChanged = false;
@@ -921,8 +950,6 @@ int addHAWQSegWithSegStat(SegStat segstat, bool *capstatchanged)
 		 * Either the FTSAvailable or the failed temporary directory
 		 * of this segment is changed.
 		 */
-		uint8_t oldStatus = segresource->Stat->FTSAvailable;
-		bool statusChanged = oldStatus != segstat->FTSAvailable;
 		if (statusChanged || tmpDirChanged)
 		{
 			if (statusChanged && !tmpDirChanged)
@@ -934,12 +961,12 @@ int addHAWQSegWithSegStat(SegStat segstat, bool *capstatchanged)
 											SEGMENT_STATUS_UP:SEGMENT_STATUS_DOWN);
 				}
 
+				setSegResHAWQAvailability(segresource, segstat->FTSAvailable);
 				/*
 				 * Segment is set from up to down, return resource.
 				 */
 				if (oldStatus == RESOURCE_SEG_STATUS_AVAILABLE)
 				{
-					/* The segment is up again, its capacity should be considered again. */
 					*capstatchanged = true;
 					returnAllGRMResourceFromSegment(segresource);
 				}
@@ -1090,6 +1117,7 @@ int addHAWQSegWithSegStat(SegStat segstat, bool *capstatchanged)
 		res = RESOURCEPOOL_DUPLICATE_HOST;
 	}
 
+
 	/*
 	 * If host capacity is changed, update the cluster level memory/core ratio.
 	 * The expectation is that more than 50% cluster nodes has the same memory/


Mime
View raw message