hawq-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From w...@apache.org
Subject incubator-hawq git commit: HAWQ-503. Fix the bug of failed temporary directory and GRM host/rack conflicts in resource pool in YARN mode
Date Thu, 10 Mar 2016 06:19:43 GMT
Repository: incubator-hawq
Updated Branches:
  refs/heads/master a81ae771e -> cb7caf540


HAWQ-503. Fix the bug of failed temporary directory and GRM host/rack conflicts in resource
pool in YARN mode


Project: http://git-wip-us.apache.org/repos/asf/incubator-hawq/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-hawq/commit/cb7caf54
Tree: http://git-wip-us.apache.org/repos/asf/incubator-hawq/tree/cb7caf54
Diff: http://git-wip-us.apache.org/repos/asf/incubator-hawq/diff/cb7caf54

Branch: refs/heads/master
Commit: cb7caf5408b8677bfd189fdea9654825c6ce370d
Parents: a81ae77
Author: Wen Lin <wlin@pivotal.io>
Authored: Thu Mar 10 14:17:04 2016 +0800
Committer: Wen Lin <wlin@pivotal.io>
Committed: Thu Mar 10 14:17:04 2016 +0800

----------------------------------------------------------------------
 .../resourcebroker_LIBYARN_proc.c               |   2 +
 src/backend/resourcemanager/resourcepool.c      | 112 ++++++++++++++-----
 2 files changed, 87 insertions(+), 27 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-hawq/blob/cb7caf54/src/backend/resourcemanager/resourcebroker/resourcebroker_LIBYARN_proc.c
----------------------------------------------------------------------
diff --git a/src/backend/resourcemanager/resourcebroker/resourcebroker_LIBYARN_proc.c b/src/backend/resourcemanager/resourcebroker/resourcebroker_LIBYARN_proc.c
index d3028c3..3afae7b 100644
--- a/src/backend/resourcemanager/resourcebroker/resourcebroker_LIBYARN_proc.c
+++ b/src/backend/resourcemanager/resourcebroker/resourcebroker_LIBYARN_proc.c
@@ -1500,6 +1500,8 @@ int RB2YARN_getClusterReport(DQueue hosts)
     		segstat->Info.GRMRackNameLen         = racknamelen;
     		segstat->Info.GRMRackNameOffset 	 = segstat->Info.GRMHostNameOffset +
     											   __SIZE_ALIGN64(hostnamelen+1);
+    		segstat->Info.FailedTmpDirOffset	 = 0;
+    		segstat->Info.FailedTmpDirLen		 = 0;
     		segstat->Info.Size 		 		 	 = segsize;
 
     		memcpy((char *)&(segstat->Info) + sizeof(SegInfoData),

http://git-wip-us.apache.org/repos/asf/incubator-hawq/blob/cb7caf54/src/backend/resourcemanager/resourcepool.c
----------------------------------------------------------------------
diff --git a/src/backend/resourcemanager/resourcepool.c b/src/backend/resourcemanager/resourcepool.c
index 6cc53eb..a0f64eb 100644
--- a/src/backend/resourcemanager/resourcepool.c
+++ b/src/backend/resourcemanager/resourcepool.c
@@ -1004,47 +1004,69 @@ int addHAWQSegWithSegStat(SegStat segstat, bool *capstatchanged)
 
 				int old = segresource->Stat->Info.FailedTmpDirLen == 0 ?
 										0 :__SIZE_ALIGN64(segresource->Stat->Info.FailedTmpDirLen+1);
-				int new =  segstat->Info.FailedTmpDirLen == 0 ?
+				int new = segstat->Info.FailedTmpDirLen == 0 ?
 										0 : __SIZE_ALIGN64(segstat->Info.FailedTmpDirLen+1);
-				if (new > old &&
-					segresource->Stat->Info.Size -
-					(segresource->Stat->Info.HostNameOffset + __SIZE_ALIGN64(segresource->Stat->Info.HostNameLen+1))
-					< new)
+
+				int current = segresource->Stat->Info.Size -
+						(segresource->Stat->Info.HostNameOffset + __SIZE_ALIGN64(segresource->Stat->Info.HostNameLen+1));
+				if (segresource->Stat->Info.GRMHostNameLen != 0 && segresource->Stat->Info.GRMHostNameOffset != 0)
+					current -= __SIZE_ALIGN64(segresource->Stat->Info.GRMHostNameLen+1);
+				if (segresource->Stat->Info.GRMRackNameLen != 0 && segresource->Stat->Info.GRMRackNameOffset != 0)
+					current -= __SIZE_ALIGN64(segresource->Stat->Info.GRMRackNameLen+1);
+
+				/*
+				 * repalloc memory if new size exceeds the old one.
+				 * we don't shrink memory size if new size is less than the old one.
+				 */
+				if (new > old && current < new)
 				{
 					SegStat newSegStat = rm_repalloc(PCONTEXT,
 													 segresource->Stat,
 													 offsetof(SegStatData, Info) +
 													 segresource->Stat->Info.Size + (new - old));
 					segresource->Stat = newSegStat;
-					memset((char*)&segresource->Stat->Info + segresource->Stat->Info.Size, 0, (new - old));
 					segresource->Stat->Info.Size += (new - old);
 				}
 
-				if (segstat->FailedTmpDirNum != 0)
+				if (segresource->Stat->Info.FailedTmpDirOffset == 0)
 				{
+					Assert(segresource->Stat->FailedTmpDirNum == 0);
 					segresource->Stat->Info.FailedTmpDirOffset = segresource->Stat->Info.HostNameOffset +
 																	__SIZE_ALIGN64(segresource->Stat->Info.HostNameLen+1);
+					if (segresource->Stat->Info.GRMHostNameLen != 0 && segresource->Stat->Info.GRMHostNameOffset != 0)
+						segresource->Stat->Info.FailedTmpDirOffset += __SIZE_ALIGN64(segresource->Stat->Info.GRMHostNameLen+1);
+					if (segresource->Stat->Info.GRMRackNameLen != 0 && segresource->Stat->Info.GRMRackNameOffset != 0)
+						segresource->Stat->Info.FailedTmpDirOffset += __SIZE_ALIGN64(segresource->Stat->Info.GRMRackNameLen+1);
+				}
+
+				/* clear old failed temporary directory string in SegInfoData */
+				memset((char *)&segresource->Stat->Info +
+						segresource->Stat->Info.FailedTmpDirOffset,
+						0,
+						segresource->Stat->Info.Size -
+						segresource->Stat->Info.FailedTmpDirOffset);
+
+				if (segstat->FailedTmpDirNum != 0)
+				{
 					memcpy((char *)&segresource->Stat->Info + segresource->Stat->Info.FailedTmpDirOffset,
 							GET_SEGINFO_FAILEDTMPDIR(&segstat->Info),
 							strlen(GET_SEGINFO_FAILEDTMPDIR(&segstat->Info)));
-					memset((char *)&segresource->Stat->Info +
-							 segresource->Stat->Info.FailedTmpDirOffset +
-							 segstat->Info.FailedTmpDirLen,
-							 0,
-							 segresource->Stat->Info.Size -
-							 segresource->Stat->Info.FailedTmpDirOffset -
-							 segstat->Info.FailedTmpDirLen);
 				}
 				else
 				{
-					memset((char *)&segresource->Stat->Info + segresource->Stat->Info.FailedTmpDirOffset,
-							0,
-							segresource->Stat->Info.Size - segresource->Stat->Info.FailedTmpDirOffset);
 					segresource->Stat->Info.FailedTmpDirOffset = 0;
 				}
 				segresource->Stat->Info.FailedTmpDirLen = segstat->Info.FailedTmpDirLen;
 				segresource->Stat->FailedTmpDirNum = segstat->FailedTmpDirNum;
 
+				elog(RMLOG, "After resource manager "
+							"updates segment failed temporary directory, "
+							"GRM hostname:%s, GRM rackname:%s",
+							segresource->Stat->Info.GRMHostNameLen == 0 ?
+								"":GET_SEGINFO_GRMHOSTNAME(&(segresource->Stat->Info)),
+							segresource->Stat->Info.GRMRackNameLen == 0 ?
+								"":GET_SEGINFO_GRMRACKNAME(&(segresource->Stat->Info)));
+
 				setSegResHAWQAvailability(segresource, segstat->FTSAvailable);
 				if (Gp_role != GP_ROLE_UTILITY)
 				{
@@ -1205,27 +1227,60 @@ int updateHAWQSegWithGRMSegStat( SegStat segstat)
 	int oldgracklen = segres->Stat->Info.GRMRackNameLen == 0 ?
 					  0 :
 					  __SIZE_ALIGN64(segres->Stat->Info.GRMRackNameLen+1);
+
+	Assert(segres->Stat->Info.HostNameOffset != 0);
+	int current = segres->Stat->Info.Size -
+					(segres->Stat->Info.HostNameOffset +
+					__SIZE_ALIGN64(segres->Stat->Info.HostNameLen+1));
+	if (segres->Stat->FailedTmpDirNum != 0)
+		current -= __SIZE_ALIGN64(segres->Stat->Info.FailedTmpDirLen +1);
+
+	/*
+	 * If new GRM hostname and rackname length exceeds the old one,
+	 * repalloc memory. But never shrink memory.
+	 */
 	int change = ghostlen + gracklen - oldghostlen - oldgracklen;
-	if (change > 0)
+	if (change > 0 && current < (ghostlen + gracklen))
 	{
 		newSegStat = rm_repalloc(PCONTEXT,
 								 segres->Stat,
 								 offsetof(SegStatData, Info) +
 								 	 segres->Stat->Info.Size + change);
 		segres->Stat = newSegStat;
+		segres->Stat->Info.Size += change;
 	}
 	else
 		newSegStat = segres->Stat;
 
-	Assert(newSegStat != NULL);
-	/* Reset the memory area for GRM host and rack name zero filled. */
-	memset((char*)newSegStat +
-		   offsetof(SegStatData, Info) + segres->Stat->Info.Size -
-		   (oldghostlen + oldgracklen),
-		   '\0',
-		   ghostlen + gracklen);
+	/* Refill failed temporary directory string */
+	if (segres->Stat->FailedTmpDirNum != 0 && change > 0
+			&& current < (ghostlen + gracklen))
+	{
+		Assert(newSegStat->Info.FailedTmpDirOffset != 0 &&
+				newSegStat->Info.FailedTmpDirLen != 0);
+		memmove((char*)newSegStat + offsetof(SegStatData, Info)
+				+ newSegStat->Info.FailedTmpDirOffset + change,
+				(char*)newSegStat + offsetof(SegStatData, Info)
+				+ newSegStat->Info.FailedTmpDirOffset,
+				__SIZE_ALIGN64(segres->Stat->Info.FailedTmpDirLen + 1));
+		memset((char*)newSegStat + offsetof(SegStatData, Info)
+				+ newSegStat->Info.FailedTmpDirOffset,
+				0,
+				change);
+		newSegStat->Info.FailedTmpDirOffset += change;
+	}
 
 	Assert(newSegStat != NULL);
+	/* Reset the memory area for GRM host and rack name zero filled. */
+	Assert(newSegStat->Info.HostNameLen != 0);
+	memset((char *)&newSegStat->Info + newSegStat->Info.HostNameOffset +
+			__SIZE_ALIGN64(newSegStat->Info.HostNameLen + 1),
+			0,
+			segres->Stat->Info.Size -
+			newSegStat->Info.HostNameOffset -
+			__SIZE_ALIGN64(newSegStat->Info.HostNameLen + 1) -
+			(segres->Stat->Info.FailedTmpDirLen == 0) ?
+				0:__SIZE_ALIGN64(segres->Stat->Info.FailedTmpDirLen + 1));
 
 	newSegStat->Info.GRMHostNameLen    = segstat->Info.GRMHostNameLen;
 	newSegStat->Info.GRMHostNameOffset = newSegStat->Info.HostNameOffset +
@@ -1233,8 +1288,6 @@ int updateHAWQSegWithGRMSegStat( SegStat segstat)
 	newSegStat->Info.GRMRackNameLen    = segstat->Info.GRMRackNameLen;
 	newSegStat->Info.GRMRackNameOffset = newSegStat->Info.GRMHostNameOffset +
 										 __SIZE_ALIGN64(newSegStat->Info.GRMHostNameLen+1);
-	newSegStat->Info.Size = newSegStat->Info.GRMRackNameOffset +
-						    __SIZE_ALIGN64(newSegStat->Info.GRMRackNameLen+1);
 
 	strcpy(GET_SEGINFO_GRMHOSTNAME(&(newSegStat->Info)),
 		   GET_SEGINFO_GRMHOSTNAME(&(segstat->Info)));
@@ -1249,6 +1302,11 @@ int updateHAWQSegWithGRMSegStat( SegStat segstat)
 			  GET_SEGINFO_GRMHOSTNAME(&(newSegStat->Info)),
 			  GET_SEGINFO_GRMRACKNAME(&(newSegStat->Info)));
 
+	elog(RMLOG, "After resource manager "
+				"updates segment info's GRM host name and rack name, "
+				"failed temporary directory: %s",
+				segres->Stat->FailedTmpDirNum == 0 ? "":GET_SEGINFO_FAILEDTMPDIR(&(segres->Stat->Info)));
+
 	/* Always set segment global resource manager available. */
 	setSegResGLOBAvailability(segres, RESOURCE_SEG_STATUS_AVAILABLE);
 


Mime
View raw message