hawq-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From y...@apache.org
Subject incubator-hawq git commit: HAWQ-357. Track how many times a segment can not get expected containers from global resource manager
Date Mon, 25 Jan 2016 02:17:23 GMT
Repository: incubator-hawq
Updated Branches:
  refs/heads/master 6488b1757 -> 43ab8a35e


HAWQ-357. Track how many times a segment can not get expected containers from global resource
manager


Project: http://git-wip-us.apache.org/repos/asf/incubator-hawq/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-hawq/commit/43ab8a35
Tree: http://git-wip-us.apache.org/repos/asf/incubator-hawq/tree/43ab8a35
Diff: http://git-wip-us.apache.org/repos/asf/incubator-hawq/diff/43ab8a35

Branch: refs/heads/master
Commit: 43ab8a35e7b08effab7cb2aa618b362faade0512
Parents: 6488b17
Author: YI JIN <yjin@pivotal.io>
Authored: Mon Jan 25 13:17:06 2016 +1100
Committer: YI JIN <yjin@pivotal.io>
Committed: Mon Jan 25 13:17:06 2016 +1100

----------------------------------------------------------------------
 .../include/resourcebroker/resourcebroker_API.h |  3 +
 .../resourcemanager/include/resourcepool.h      |  3 +
 .../resourcebroker/resourcebroker_API.c         | 99 +++++++++++++++++++-
 .../resourcebroker/resourcebroker_LIBYARN.c     | 86 +++++++++++++++--
 src/backend/resourcemanager/resourcemanager.c   | 11 +--
 src/backend/resourcemanager/resourcepool.c      | 18 +++-
 6 files changed, 196 insertions(+), 24 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-hawq/blob/43ab8a35/src/backend/resourcemanager/include/resourcebroker/resourcebroker_API.h
----------------------------------------------------------------------
diff --git a/src/backend/resourcemanager/include/resourcebroker/resourcebroker_API.h b/src/backend/resourcemanager/include/resourcebroker/resourcebroker_API.h
index 0f35086..f499490 100644
--- a/src/backend/resourcemanager/include/resourcebroker/resourcebroker_API.h
+++ b/src/backend/resourcemanager/include/resourcebroker/resourcebroker_API.h
@@ -94,6 +94,9 @@ bool isCleanGRMResourceStatus(void);
 
 void RB_clearResource(List **ctnl);
 
+void RB_freePreferedHostsForGRMContainers(void);
+void RB_updateSegmentsHavingNoExpectedGRMContainers(HASHTABLE segments);
+
 /* Error message */
 extern bool				ResourceManagerIsForked;
 #endif /* HAWQ_RESOURCE_MANAGER_RESOURCE_BROKER_API_H */

http://git-wip-us.apache.org/repos/asf/incubator-hawq/blob/43ab8a35/src/backend/resourcemanager/include/resourcepool.h
----------------------------------------------------------------------
diff --git a/src/backend/resourcemanager/include/resourcepool.h b/src/backend/resourcemanager/include/resourcepool.h
index 12ee5ae..1829a64 100644
--- a/src/backend/resourcemanager/include/resourcepool.h
+++ b/src/backend/resourcemanager/include/resourcepool.h
@@ -281,6 +281,7 @@ struct SegResourceData {
 
 	/* Total GRM container size. */
 	int				GRMContainerCount;
+	int				GRMContainerFailAllocCount;
 
 	/*
 	 * When resource manager has resource allocated from resource broker, the
@@ -633,6 +634,8 @@ int getOrderedResourceAllocTreeIndexByRatio(uint32_t ratio, BBST *tree);
 
 void setAllSegResourceGRMUnavailable(void);
 
+void resetAllSegmentsGRMContainerFailAllocCount(void);
+
 struct RB_GRMContainerStatData
 {
 	int64_t		ContainerID;

http://git-wip-us.apache.org/repos/asf/incubator-hawq/blob/43ab8a35/src/backend/resourcemanager/resourcebroker/resourcebroker_API.c
----------------------------------------------------------------------
diff --git a/src/backend/resourcemanager/resourcebroker/resourcebroker_API.c b/src/backend/resourcemanager/resourcebroker/resourcebroker_API.c
index 323b13a..7a577fd 100644
--- a/src/backend/resourcemanager/resourcebroker/resourcebroker_API.c
+++ b/src/backend/resourcemanager/resourcebroker/resourcebroker_API.c
@@ -22,8 +22,9 @@
 #include "resourcebroker/resourcebroker_NONE.h"
 #include "resourcebroker/resourcebroker_LIBYARN.h"
 
-bool					 ResourceManagerIsForked;
-RB_FunctionEntriesData 	 CurrentRBImp;
+bool					ResourceManagerIsForked;
+RB_FunctionEntriesData	CurrentRBImp;
+List				   *PreferedHostsForGRMContainers;
 
 void RB_prepareImplementation(enum RB_IMP_TYPE imptype)
 {
@@ -37,6 +38,8 @@ void RB_prepareImplementation(enum RB_IMP_TYPE imptype)
 	CurrentRBImp.start				= NULL;
 	CurrentRBImp.stop				= NULL;
 
+	PreferedHostsForGRMContainers	= NULL;
+
 	switch(imptype) {
 	case NONE_HAWQ2:
 		RB_NONE_createEntries(&CurrentRBImp);
@@ -77,7 +80,20 @@ int RB_getClusterReport(const char *queuename, List **machines, double *maxcapac
 int RB_acquireResource(uint32_t memorymb, uint32_t core, List *preferred)
 {
 	Assert(CurrentRBImp.acquireResource != NULL);
-	return CurrentRBImp.acquireResource(memorymb, core, preferred);
+	int res = CurrentRBImp.acquireResource(memorymb, core, preferred);
+
+	/*--------------------------------------------------------------------------
+	 * We hold preferred host list here, when resource broker gets the resource
+	 * allocation result, this is the reference for updating the counter in each
+	 * segment for failing of getting GRM containers from GRM.
+	 *--------------------------------------------------------------------------
+	 */
+	if ( PreferedHostsForGRMContainers != NULL )
+	{
+		RB_freePreferedHostsForGRMContainers();
+	}
+	PreferedHostsForGRMContainers = preferred;
+	return res;
 }
 
 int RB_returnResource(List **containers)
@@ -173,7 +189,9 @@ void RB_clearResource(List **ctnl)
 
 		if ( ctn->CalcDecPending )
 		{
-			minusResourceBundleData(&(ctn->Resource->DecPending), ctn->MemoryMB, ctn->Core);
+			minusResourceBundleData(&(ctn->Resource->DecPending),
+									ctn->MemoryMB,
+									ctn->Core);
 			Assert( ctn->Resource->DecPending.Core >= 0 );
 			Assert( ctn->Resource->DecPending.MemoryMB >= 0 );
 		}
@@ -183,3 +201,76 @@ void RB_clearResource(List **ctnl)
 		PRESPOOL->RetPendingContainerCount--;
 	}
 }
+
+void RB_freePreferedHostsForGRMContainers(void)
+{
+	ListCell *cell = NULL;
+	foreach(cell, PreferedHostsForGRMContainers)
+	{
+		PAIR pair = (PAIR)lfirst(cell);
+		rm_pfree(PCONTEXT, pair->Value);
+		rm_pfree(PCONTEXT, pair);
+	}
+	MEMORY_CONTEXT_SWITCH_TO(PCONTEXT)
+	list_free(PreferedHostsForGRMContainers);
+	MEMORY_CONTEXT_SWITCH_BACK
+
+	PreferedHostsForGRMContainers = NULL;
+}
+
+void RB_updateSegmentsHavingNoExpectedGRMContainers(HASHTABLE segments)
+{
+	ListCell *cell = NULL;
+	foreach(cell, PreferedHostsForGRMContainers)
+	{
+		PAIR pair = (PAIR)lfirst(cell);
+		SegResource 	segres = (SegResource)(pair->Key);
+		ResourceBundle	expres = (ResourceBundle)(pair->Value);
+
+		bool failed = false;
+		/* Check if the segment exists in the hash table. */
+		PAIR pair2 = getHASHTABLENode(segments, segres);
+		if ( pair2 == NULL )
+		{
+			elog(RMLOG, "Resource manager finds segment %s has no resource "
+						"container allocated from global resource manager, "
+						"expected resource quota (%d MB, %lf CORE)",
+						GET_SEGRESOURCE_HOSTNAME(segres),
+						expres->MemoryMB,
+						expres->Core);
+			failed = true;
+		}
+		else
+		{
+			ResourceBundle allocres = (ResourceBundle)(pair->Value);
+			if ( allocres->MemoryMB < expres->MemoryMB )
+			{
+				elog(RMLOG, "Resource manager finds segment %s hasn't sufficient "
+							"resource containers allocated from global resource "
+							"manager, expected resource quota (%d MB, %lf CORE), "
+							"actual allocated resource (%d MB, %lf CORE)",
+							GET_SEGRESOURCE_HOSTNAME(segres),
+							expres->MemoryMB,
+							expres->Core,
+							allocres->MemoryMB,
+							allocres->Core);
+				failed = true;
+			}
+		}
+
+		if ( failed )
+		{
+			segres->GRMContainerFailAllocCount++;
+
+			elog(WARNING, "Resource manager detects segment %s hasn't gotten "
+						  "expected quantity of global resource containers for "
+						  "%d times.",
+						  GET_SEGRESOURCE_HOSTNAME(segres),
+						  segres->GRMContainerFailAllocCount);
+		}
+		else
+		{
+			segres->GRMContainerFailAllocCount = 0;
+		}
+	}
+}

http://git-wip-us.apache.org/repos/asf/incubator-hawq/blob/43ab8a35/src/backend/resourcemanager/resourcebroker/resourcebroker_LIBYARN.c
----------------------------------------------------------------------
diff --git a/src/backend/resourcemanager/resourcebroker/resourcebroker_LIBYARN.c b/src/backend/resourcemanager/resourcebroker/resourcebroker_LIBYARN.c
index 980202c..c6d26af 100644
--- a/src/backend/resourcemanager/resourcebroker/resourcebroker_LIBYARN.c
+++ b/src/backend/resourcemanager/resourcebroker/resourcebroker_LIBYARN.c
@@ -33,6 +33,8 @@ int handleRB2RM_AllocatedResource(void);
 int handleRB2RM_ReturnedResource(void);
 int handleRB2RM_ContainerReport(void);
 void buildToReturnNotTrackedGRMContainers(RB_GRMContainerStat ctnstats, int size);
+
+void freeResourceBundle(void *resbundle);
 /*
  *------------------------------------------------------------------------------
  * Global variables.
@@ -787,7 +789,7 @@ int handleRB2RM_AllocatedResource(void)
 			return RESBROK_PIPE_ERROR;
 		}
 
-		/* Build result and add into the resource pool. */
+		/* Build result and add into the resource pool as a pending container. */
 		char *phostname     = buffer;
 		for ( int i = 0 ; i < response.ContainerCount ; ++i )
 		{
@@ -815,13 +817,10 @@ int handleRB2RM_AllocatedResource(void)
 	/* We always update resource broker work time stamp here. */
 	DRMGlobalInstance->ResBrokerAppTimeStamp = response.SystemStartTimestamp;
 
-	while( list_length(newcontainers) > 0 )
+	ListCell *cell = NULL;
+	foreach(cell, newcontainers)
 	{
-		GRMContainer newcontainer = (GRMContainer)
-									lfirst(list_head(newcontainers));
-		MEMORY_CONTEXT_SWITCH_TO(PCONTEXT)
-		newcontainers = list_delete_first(newcontainers);
-		MEMORY_CONTEXT_SWITCH_BACK
+		GRMContainer newcontainer = (GRMContainer)lfirst(cell);
 
 		if ( isCleanGRMResourceStatus() )
 		{
@@ -853,14 +852,78 @@ int handleRB2RM_AllocatedResource(void)
 		response.Core     * (response.ExpectedContainerCount - acceptedcount),
 		response.Result == FUNC_RETURN_OK);
 
-	elog(LOG, "Accepted (%d MB, %d CORE) x %d from resource broker, "
-			  "Expected %d containers, skipped %d containers.",
+	elog(LOG, "Resource manager accepted YARN containers (%d MB, %d CORE) x %d "
+			  "from resource broker, expected %d containers, skipped %d containers.",
 			  response.MemoryMB,
 			  response.Core,
 			  acceptedcount,
 			  response.ExpectedContainerCount,
 			  response.ContainerCount - acceptedcount);
 
+	/*
+	 * Buildup hash table for updating each segment if it can not have expected
+	 * resources allocated.
+	 */
+	HASHTABLEData	seghavingres;
+
+	initializeHASHTABLE(&seghavingres,
+						PCONTEXT,
+						HASHTABLE_SLOT_VOLUME_DEFAULT,
+						HASHTABLE_SLOT_VOLUME_DEFAULT_MAX,
+						HASHTABLE_KEYTYPE_VOIDPT,
+						freeResourceBundle);
+
+	while( list_length(newcontainers) > 0 )
+	{
+		GRMContainer newcontainer = (GRMContainer)
+									lfirst(list_head(newcontainers));
+		MEMORY_CONTEXT_SWITCH_TO(PCONTEXT)
+		newcontainers = list_delete_first(newcontainers);
+		MEMORY_CONTEXT_SWITCH_BACK
+
+		if ( isCleanGRMResourceStatus() )
+		{
+			continue;
+		}
+
+		int32_t segid = -1;
+		int res = getSegIDByGRMHostName(newcontainer->HostName,
+										strlen(newcontainer->HostName),
+										&segid);
+		if ( res != FUNC_RETURN_OK )
+		{
+			elog(WARNING, "Resource manager finds not recognized YARN "
+						  "container on host %s, container id is "INT64_FORMAT,
+						  newcontainer->HostName,
+						  newcontainer->ID);
+			continue;
+		}
+
+		SegResource 	segres	   = getSegResource(segid);
+		PAIR			oldsegpair = getHASHTABLENode(&seghavingres, segres);
+		ResourceBundle	resbundle  = NULL;
+		if ( oldsegpair != NULL )
+		{
+			resbundle = (ResourceBundle)(oldsegpair->Value);
+		}
+		else
+		{
+			resbundle = rm_palloc0(PCONTEXT, sizeof(ResourceBundleData));
+			resbundle->MemoryMB = 0;
+			resbundle->Core     = 0;
+			setHASHTABLENode(&seghavingres, segres, resbundle, false);
+		}
+		addResourceBundleData(resbundle,
+							  newcontainer->MemoryMB,
+							  newcontainer->Core);
+	}
+
+	if ( !isCleanGRMResourceStatus() )
+	{
+		RB_updateSegmentsHavingNoExpectedGRMContainers(&seghavingres);
+	}
+	cleanHASHTABLE(&seghavingres);
+
 	if ( containerids != NULL )
 	{
 		rm_pfree(PCONTEXT, containerids);
@@ -872,6 +935,11 @@ int handleRB2RM_AllocatedResource(void)
 	return response.Result;
 }
 
+void freeResourceBundle(void *resbundle)
+{
+	rm_pfree(PCONTEXT, resbundle);
+}
+
 int handleRB2RM_ReturnedResource(void)
 {
 	int fd		= ResBrokerNotifyPipe[0];

http://git-wip-us.apache.org/repos/asf/incubator-hawq/blob/43ab8a35/src/backend/resourcemanager/resourcemanager.c
----------------------------------------------------------------------
diff --git a/src/backend/resourcemanager/resourcemanager.c b/src/backend/resourcemanager/resourcemanager.c
index fcdc2df..6b8a9e2 100644
--- a/src/backend/resourcemanager/resourcemanager.c
+++ b/src/backend/resourcemanager/resourcemanager.c
@@ -567,6 +567,8 @@ int MainHandlerLoop(void)
 			/* Refresh resource queue resource usage and request quota. */
 			refreshMemoryCoreRatioLevelUsage(gettime_microsec());
 
+			resetAllSegmentsGRMContainerFailAllocCount();
+
 			/* Check if can resume using new available global resource manager.*/
 			if ( cleanedAllGRMContainers() )
 			{
@@ -2268,15 +2270,6 @@ int generateAllocRequestToBroker(void)
 							reqcore);
 			}
 		}
-
-		/* Free preferred host list. */
-		foreach(cell, preferred)
-		{
-			PAIR pair = (PAIR)lfirst(cell);
-			rm_pfree(PCONTEXT, pair->Value);
-			rm_pfree(PCONTEXT, pair);
-		}
-		list_free(preferred);
 	}
 
 	return res;

http://git-wip-us.apache.org/repos/asf/incubator-hawq/blob/43ab8a35/src/backend/resourcemanager/resourcepool.c
----------------------------------------------------------------------
diff --git a/src/backend/resourcemanager/resourcepool.c b/src/backend/resourcemanager/resourcepool.c
index fdc90f5..38cacfa 100644
--- a/src/backend/resourcemanager/resourcepool.c
+++ b/src/backend/resourcemanager/resourcepool.c
@@ -1311,6 +1311,20 @@ void setAllSegResourceGRMUnavailable(void)
 	freePAIRRefList(&(PRESPOOL->Segments), &allsegres);
 }
 
+void resetAllSegmentsGRMContainerFailAllocCount(void)
+{
+	List 	 *allsegres = NULL;
+	ListCell *cell		= NULL;
+	getAllPAIRRefIntoList(&(PRESPOOL->Segments), &allsegres);
+
+	foreach(cell, allsegres)
+	{
+		SegResource segres = (SegResource)(((PAIR)lfirst(cell))->Value);
+		segres->GRMContainerFailAllocCount = 0;
+	}
+	freePAIRRefList(&(PRESPOOL->Segments), &allsegres);
+}
+
 /*
  * Check index to get host id based on host name string.
  */
@@ -1379,8 +1393,8 @@ SegResource createSegResource(SegStat segstat)
 	resetResourceBundleData(&(res->DecPending), 0, 0.0, 0);
 	resetResourceBundleData(&(res->OldInuse)  , 0, 0.0, 0);
 
-	res->GRMContainerCount = 0;
-
+	res->GRMContainerCount 			= 0;
+	res->GRMContainerFailAllocCount	= 0;
 	return res;
 }
 


Mime
View raw message