hawq-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From iw...@apache.org
Subject incubator-hawq git commit: HAWQ-445. Support large strings (up to a GB) in text_to_array()
Date Thu, 10 Mar 2016 01:23:09 GMT
Repository: incubator-hawq
Updated Branches:
  refs/heads/master 87d13b673 -> e29e13345


HAWQ-445. Support large strings (up to a GB) in text_to_array()


Project: http://git-wip-us.apache.org/repos/asf/incubator-hawq/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-hawq/commit/e29e1334
Tree: http://git-wip-us.apache.org/repos/asf/incubator-hawq/tree/e29e1334
Diff: http://git-wip-us.apache.org/repos/asf/incubator-hawq/diff/e29e1334

Branch: refs/heads/master
Commit: e29e13345f14b70d4193eb16e4b904737871c486
Parents: 87d13b6
Author: ivan <iweng@pivotal.io>
Authored: Thu Mar 10 09:22:49 2016 +0800
Committer: ivan <iweng@pivotal.io>
Committed: Thu Mar 10 09:22:49 2016 +0800

----------------------------------------------------------------------
 src/backend/utils/adt/test/Makefile       |  42 ++++
 src/backend/utils/adt/test/varlena_test.c | 251 +++++++++++++++++++
 src/backend/utils/adt/varlena.c           | 334 +++++++++++++++++++------
 3 files changed, 550 insertions(+), 77 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-hawq/blob/e29e1334/src/backend/utils/adt/test/Makefile
----------------------------------------------------------------------
diff --git a/src/backend/utils/adt/test/Makefile b/src/backend/utils/adt/test/Makefile
new file mode 100644
index 0000000..a0cd950
--- /dev/null
+++ b/src/backend/utils/adt/test/Makefile
@@ -0,0 +1,42 @@
+# Relative path from src/backend/utils/adt/test back up to the top of the tree.
+top_builddir=../../../../..
+# Backend subdirectory whose sources are exercised by these tests.
+subdir=src/backend/utils/adt
+
+# Test target name(s); presumably consumed by the included Makefile.mock -- confirm there.
+TARGETS=varlena
+
+# Objects from backend, which don't need to be mocked but need to be linked.
+common_REAL_OBJS=\
+    $(top_srcdir)/src/backend/access/hash/hashfunc.o \
+    $(top_srcdir)/src/backend/bootstrap/bootparse.o \
+    $(top_srcdir)/src/backend/lib/stringinfo.o \
+    $(top_srcdir)/src/backend/nodes/bitmapset.o \
+    $(top_srcdir)/src/backend/nodes/equalfuncs.o \
+    $(top_srcdir)/src/backend/nodes/list.o \
+    $(top_srcdir)/src/backend/parser/gram.o \
+    $(top_srcdir)/src/backend/regex/regcomp.o \
+    $(top_srcdir)/src/backend/regex/regerror.o \
+    $(top_srcdir)/src/backend/regex/regexec.o \
+    $(top_srcdir)/src/backend/regex/regfree.o \
+    $(top_srcdir)/src/backend/storage/page/itemptr.o \
+    $(top_srcdir)/src/backend/utils/adt/datum.o \
+    $(top_srcdir)/src/backend/utils/adt/like.o \
+    $(top_srcdir)/src/backend/utils/hash/dynahash.o \
+    $(top_srcdir)/src/backend/utils/hash/hashfn.o \
+    $(top_srcdir)/src/backend/utils/misc/guc.o \
+    $(top_srcdir)/src/backend/utils/init/globals.o \
+    $(top_srcdir)/src/backend/utils/mmgr/mcxt.o \
+    $(top_srcdir)/src/backend/utils/mmgr/aset.o \
+    $(top_srcdir)/src/backend/utils/mmgr/memprot.o \
+    $(top_srcdir)/src/port/exec.o \
+    $(top_srcdir)/src/port/path.o \
+    $(top_srcdir)/src/port/pgsleep.o \
+    $(top_srcdir)/src/port/pgstrcasecmp.o \
+    $(top_srcdir)/src/port/qsort.o \
+    $(top_srcdir)/src/port/strlcpy.o \
+    $(top_srcdir)/src/port/thread.o \
+    $(top_srcdir)/src/timezone/localtime.o \
+    $(top_srcdir)/src/timezone/pgtz.o    
+
+varlena_REAL_OBJS=$(common_REAL_OBJS)
+
+include ../../../../Makefile.mock
+

http://git-wip-us.apache.org/repos/asf/incubator-hawq/blob/e29e1334/src/backend/utils/adt/test/varlena_test.c
----------------------------------------------------------------------
diff --git a/src/backend/utils/adt/test/varlena_test.c b/src/backend/utils/adt/test/varlena_test.c
new file mode 100644
index 0000000..46035fa
--- /dev/null
+++ b/src/backend/utils/adt/test/varlena_test.c
@@ -0,0 +1,251 @@
+#include <stdarg.h>
+#include <stddef.h>
+#include <setjmp.h>
+#include "cmockery.h"
+
+#include "c.h"
+#include "postgres.h"
+#include "nodes/nodes.h"
+#include "../varlena.c"
+
+#define MEMORY_LIMIT 8 /* 8 bytes memory limit */
+
+#ifdef USE_ASSERT_CHECKING
+/*
+ * Side-effect routine installed on the mocked ExceptionalCondition() by the
+ * tests below (via will_be_called_with_sideeffect). It re-throws with
+ * PG_RE_THROW() so that the enclosing PG_TRY block in the test lands in its
+ * PG_CATCH branch. Only compiled in assert-enabled builds.
+ */
+void
+_ExceptionalCondition( )
+{
+     PG_RE_THROW();
+}
+#endif
+
+/*
+ * Checks that a string which already fits within the memory limit
+ * (totalByteLength == MEMORY_LIMIT) makes find_memory_limited_substring()
+ * fail its argument assertion. The check only runs in assert-enabled builds.
+ */
+void
+test__find_memory_limited_substring__small_string(void **state)
+{
+	int subStringByteLength = -1;
+	int subStringCharLength = -1;
+	int totalByteLength = MEMORY_LIMIT;
+
+	/*
+	 * Fictitious address used only as a non-NULL marker. The explicit cast is
+	 * required: C does not allow initializing a pointer from a non-zero
+	 * integer constant without one.
+	 */
+	char *strStart = (char *) 0xabcdefab;
+
+#ifdef USE_ASSERT_CHECKING
+	expect_any(ExceptionalCondition,conditionName);
+	expect_any(ExceptionalCondition,errorType);
+	expect_any(ExceptionalCondition,fileName);
+	expect_any(ExceptionalCondition,lineNumber);
+	will_be_called_with_sideeffect(ExceptionalCondition,&_ExceptionalCondition,NULL);
+
+	/* Test if within memory-limit strings cause assertion failure */
+	PG_TRY();
+	{
+		find_memory_limited_substring(strStart, totalByteLength, MEMORY_LIMIT, &subStringByteLength, &subStringCharLength);
+		/* Reaching this line means the assertion did not fire */
+		assert_true(false);
+	}
+	PG_CATCH();
+	{
+		/* Expected path: the mocked ExceptionalCondition re-threw */
+	}
+	PG_END_TRY();
+#endif
+}
+
+/*
+ * Checks that a NULL input string makes find_memory_limited_substring()
+ * fail its argument assertion. totalByteLength is set one above MEMORY_LIMIT
+ * so that the byteLen > memoryLimit assertion passes and the NULL check is
+ * the one that fires. The check only runs in assert-enabled builds.
+ */
+void
+test__find_memory_limited_substring__null_string(void **state)
+{
+	int subStringByteLength = -1;
+	int subStringCharLength = -1;
+	int totalByteLength = MEMORY_LIMIT + 1;
+	char *strStart = NULL;
+
+#ifdef USE_ASSERT_CHECKING
+	/* Accept any arguments on the mocked ExceptionalCondition and make it throw */
+	expect_any(ExceptionalCondition,conditionName);
+	expect_any(ExceptionalCondition,errorType);
+	expect_any(ExceptionalCondition,fileName);
+	expect_any(ExceptionalCondition,lineNumber);
+	will_be_called_with_sideeffect(ExceptionalCondition,&_ExceptionalCondition,NULL);
+
+	/* Test if null strings cause assertion failure */
+	PG_TRY();
+	{
+		find_memory_limited_substring(strStart, totalByteLength, MEMORY_LIMIT, &subStringByteLength,
&subStringCharLength);
+		/* Reaching this line means the assertion did not fire */
+		assert_true(false);
+	}
+	PG_CATCH();
+	{
+		/* Expected path: nothing to do */
+	}
+	PG_END_TRY();
+#endif
+}
+
+/*
+ * Checks that, for a single-byte encoding, every returned segment is exactly
+ * MEMORY_LIMIT bytes and MEMORY_LIMIT characters long while more than
+ * MEMORY_LIMIT bytes remain. In assert-enabled builds it additionally checks
+ * that the left-over tail (which fits in memory) and a NULL string both trip
+ * the argument assertions.
+ */
+void
+test__find_memory_limited_substring__ascii_chars_within_memory_limit(void **state)
+{
+	int subStringByteLength = -1;
+	int subStringCharLength = -1;
+	int cumulativeLengthConsidered = 0;
+
+	/*
+	 * Fictitious address, never dereferenced on the single-byte path. The
+	 * explicit cast is required: C does not allow initializing a pointer from
+	 * a non-zero integer constant without one.
+	 */
+	char *strStart = (char *) 0xabcdefab;
+
+	int totalByteLength = 25;
+
+	/* Keep carving segments while the remainder is still larger than the limit */
+	while (cumulativeLengthConsidered < totalByteLength - MEMORY_LIMIT)
+	{
+		/* Mocked encoding width of 1 selects the single-byte fast path */
+		will_return(pg_database_encoding_max_length, 1);
+		find_memory_limited_substring(strStart, totalByteLength - cumulativeLengthConsidered, MEMORY_LIMIT, &subStringByteLength, &subStringCharLength);
+		cumulativeLengthConsidered += subStringByteLength;
+		assert_true(subStringByteLength == MEMORY_LIMIT);
+		assert_true(subStringByteLength == subStringCharLength);
+	}
+
+#ifdef USE_ASSERT_CHECKING
+	expect_any(ExceptionalCondition,conditionName);
+	expect_any(ExceptionalCondition,errorType);
+	expect_any(ExceptionalCondition,fileName);
+	expect_any(ExceptionalCondition,lineNumber);
+	will_be_called_with_sideeffect(ExceptionalCondition,&_ExceptionalCondition,NULL);
+
+	/* Test if the left-over string that fits in memory cause assertion failure */
+	PG_TRY();
+	{
+		find_memory_limited_substring(strStart, totalByteLength - cumulativeLengthConsidered, MEMORY_LIMIT, &subStringByteLength, &subStringCharLength);
+		/* Reaching this line means the assertion did not fire */
+		assert_true(false);
+	}
+	PG_CATCH();
+	{
+		/* Expected path: nothing to do */
+	}
+	PG_END_TRY();
+
+	expect_any(ExceptionalCondition,conditionName);
+	expect_any(ExceptionalCondition,errorType);
+	expect_any(ExceptionalCondition,fileName);
+	expect_any(ExceptionalCondition,lineNumber);
+	will_be_called_with_sideeffect(ExceptionalCondition,&_ExceptionalCondition,NULL);
+
+	/* Test if null strings cause assertion failure */
+	PG_TRY();
+	{
+		find_memory_limited_substring(NULL, totalByteLength, MEMORY_LIMIT, &subStringByteLength, &subStringCharLength);
+	}
+	PG_CATCH();
+	{
+		/* Expected path: the assertion fired, so the test passes */
+		return;
+	}
+	PG_END_TRY();
+	/* No exception was raised: fail the test */
+	assert_true(false);
+#endif
+}
+
+
+/*
+ * Checks if the returned string segments are within memory limit for multi-bytes chars.
+ *
+ * The test walks a table of per-character byte lengths, queues one mocked
+ * pg_mblen() result per character, and calls find_memory_limited_substring()
+ * each time the running segment would exceed MEMORY_LIMIT. The character that
+ * overflowed is re-queued ("carried over") because the next call starts on it.
+ */
+void
+test__find_memory_limited_substring__mb_chars_within_memory_limit(void **state)
+{
+	int subStringByteLength = -1;
+	int subStringCharLength = -1;
+	int cumulativeLengthConsidered = 0;
+
+	/* Lengths of the multi-byte characters at different positions */
+	int stringByteLengths[] = {3, 3, 3 /* seg1 */, 2, 2, 1, 2 /* seg2 */, 2, 1, 1, 1, 2, /* seg3 */ 5, 4 /* seg4 */, 4};
+
+	/* Total length in terms of number of characters */
+	int stringCharLength = sizeof(stringByteLengths) / sizeof(int);
+
+	/* Total byte lengths of all the characters */
+	int totalByteLength = 0;
+	for (int charIndex = 0; charIndex < stringCharLength; charIndex++)
+	{
+		totalByteLength += stringByteLengths[charIndex];
+	}
+
+	int segmentByteLength = 0; /* Number of bytes in current segment */
+	int segmentCharLength = 0; /* Number of characters in current segment */
+
+	/* Length of the char that spilled over from one partition to another */
+	int carryoverLength = 0;
+
+	/*
+	 * Fictitious multi-byte string to segment. pg_mblen() is mocked, so the
+	 * address is never dereferenced by this test. The explicit cast is
+	 * required: C does not allow initializing a pointer from a non-zero
+	 * integer constant without one.
+	 */
+	char *strStart = (char *) 0xabcdefab;
+
+	for (int charIndex = 0; charIndex < stringCharLength; charIndex++)
+	{
+		if (carryoverLength > 0)
+		{
+			/* Re-queue the character that overflowed the previous segment */
+			expect_any(pg_mblen, mbstr);
+			will_return(pg_mblen, carryoverLength);
+			carryoverLength = 0;
+		}
+
+		expect_any(pg_mblen, mbstr);
+		will_return(pg_mblen, stringByteLengths[charIndex]);
+		segmentByteLength += stringByteLengths[charIndex];
+		segmentCharLength++;
+
+		if (segmentByteLength > MEMORY_LIMIT)
+		{
+			/* Mocked encoding width > 1 selects the multi-byte path */
+			will_return(pg_database_encoding_max_length, 6);
+			find_memory_limited_substring(strStart, totalByteLength - cumulativeLengthConsidered, MEMORY_LIMIT, &subStringByteLength, &subStringCharLength);
+			/* The segment must stop just before the overflowing character */
+			assert_true(subStringByteLength == (segmentByteLength - stringByteLengths[charIndex]));
+			assert_true(subStringCharLength == (segmentCharLength - 1));
+			assert_true(subStringByteLength <= MEMORY_LIMIT);
+			assert_true(subStringCharLength <= MEMORY_LIMIT);
+
+			cumulativeLengthConsidered += subStringByteLength;
+
+			/* The overflowing character opens the next segment */
+			segmentByteLength = stringByteLengths[charIndex];
+			segmentCharLength = 1;
+			carryoverLength = stringByteLengths[charIndex];
+		}
+	}
+
+	/* Now purge any unused pg_mblen call because of the suffix that does not exceed MEMORY_LIMIT */
+	for (int partitionCharIndex = 0; partitionCharIndex < segmentCharLength; partitionCharIndex++)
+	{
+		pg_mblen("a");
+	}
+
+#ifdef USE_ASSERT_CHECKING
+	expect_any(ExceptionalCondition,conditionName);
+	expect_any(ExceptionalCondition,errorType);
+	expect_any(ExceptionalCondition,fileName);
+	expect_any(ExceptionalCondition,lineNumber);
+	will_be_called_with_sideeffect(ExceptionalCondition,&_ExceptionalCondition,NULL);
+
+	/* Test if the left-over string that fits in memory cause assertion failure */
+	PG_TRY();
+	{
+		find_memory_limited_substring(strStart, totalByteLength - cumulativeLengthConsidered, MEMORY_LIMIT, &subStringByteLength, &subStringCharLength);
+	}
+	PG_CATCH();
+	{
+		/* Expected path: the assertion fired, so the test passes */
+		return;
+	}
+	PG_END_TRY();
+
+	/* No exception was raised: fail the test */
+	assert_true(false);
+#endif
+}
+
+/*
+ * Test driver: hands the command line to cmockery, then runs every
+ * find_memory_limited_substring() test case defined in this file and
+ * returns the value produced by run_tests().
+ */
+int
+main(int argc, char *argv[])
+{
+	cmockery_parse_arguments(argc, argv);
+
+	/* One entry per test function, in the order they are defined above */
+	const UnitTest suite[] =
+	{
+		unit_test(test__find_memory_limited_substring__small_string),
+		unit_test(test__find_memory_limited_substring__null_string),
+		unit_test(test__find_memory_limited_substring__ascii_chars_within_memory_limit),
+		unit_test(test__find_memory_limited_substring__mb_chars_within_memory_limit)
+	};
+
+	return run_tests(suite);
+}
+
+

http://git-wip-us.apache.org/repos/asf/incubator-hawq/blob/e29e1334/src/backend/utils/adt/varlena.c
----------------------------------------------------------------------
diff --git a/src/backend/utils/adt/varlena.c b/src/backend/utils/adt/varlena.c
index 68aa810..21c4afb 100644
--- a/src/backend/utils/adt/varlena.c
+++ b/src/backend/utils/adt/varlena.c
@@ -28,6 +28,7 @@
 #include "utils/lsyscache.h"
 #include "utils/pg_locale.h"
 #include "utils/string_wrapper.h"
+#include "utils/memutils.h"
 
 typedef struct varlena unknown;
 
@@ -55,6 +56,13 @@ typedef struct
 #define PG_STR_GET_TEXT(str_) \
 	DatumGetTextP(DirectFunctionCall1(textin, CStringGetDatum(str_)))
 
+/*
+ * Max considered sub-string size is set to (MaxAllocSize - 4MB).
+ * The 4MB is saved aside for memory allocation overhead such
+ * as allocation set headers.
+ */
+#define MAX_STRING_BYTES	((Size) (MaxAllocSize - 0x400000))
+
 static int	text_position_ptr_len(char* p1, int len1, char *p2, int len2); 
 static void text_position_setup_ptr_len(char* p1, int len1, char* p2, int len2, TextPositionState
*state);
 
@@ -617,6 +625,65 @@ charlen_to_bytelen(const char *p, int n)
 	}
 }
 
+/* find_memory_limited_substring()
+ *	Computes the sub-string length in number of characters and number
+ *	of bytes where the sub-string consumes up to "memoryLimit" amount of memory.
+ *	In a multi-byte encoding the sub-string always ends on a character
+ *	boundary: the character that would cross the limit is left out.
+ *
+ *	Parameters:
+ *		strStart: starting pointer in the string
+ * 		byteLen: number of bytes in the string, starting from strStart;
+ * 			must be greater than memoryLimit
+ * 		memoryLimit: max string size in terms of bytes
+ *
+ * 	Out parameters:
+ *		subStringByteLen: length of chosen sub-string in bytes
+ *		subStringCharLen: length of chosen sub-string in character count
+ *
+ * It is caller's responsibility that there actually are byteLen bytes
+ * starting from strStart; the string needs not be null-terminated.
+ *
+ * Raises an error (via Insist) if not even the first character fits within
+ * memoryLimit.
+ */
+static void
+find_memory_limited_substring(const char *strStart, int byteLen, int memoryLimit, int *subStringByteLen, int *subStringCharLen)
+{
+	AssertArg(byteLen > memoryLimit);
+	AssertArg(NULL != strStart);
+	AssertArg(NULL != subStringByteLen);
+	AssertArg(NULL != subStringCharLen);
+
+	if (pg_database_encoding_max_length() == 1)
+	{
+		/* Optimization for single-byte encodings */
+		*subStringByteLen = byteLen < memoryLimit ? byteLen : memoryLimit;
+		*subStringCharLen = *subStringByteLen;
+
+		return;
+	}
+	else
+	{
+		const char *strCurPointer = strStart;
+
+		int consumedBytes = 0;
+		int consumedChars = 0;
+
+		/*
+		 * Strictly less-than: once consumedBytes reaches byteLen there is no
+		 * further character to inspect, and calling pg_mblen() there would
+		 * read past the end of the caller's buffer.
+		 */
+		while (consumedBytes < byteLen)
+		{
+			int curCharBytes = pg_mblen(strCurPointer);
+			strCurPointer += curCharBytes;
+			consumedChars++;
+			consumedBytes += curCharBytes;
+
+			if (consumedBytes > memoryLimit)
+			{
+				/* Back off the character that crossed the limit */
+				*subStringByteLen = consumedBytes - curCharBytes;
+				*subStringCharLen = consumedChars - 1;
+
+				Insist((*subStringByteLen > 0) && (*subStringCharLen > 0));
+
+				return;
+			}
+		}
+
+		/*
+		 * Only reachable when byteLen <= memoryLimit, i.e. the precondition
+		 * was violated in a build without assertions. The whole string fits,
+		 * so report it in full rather than leaving the out parameters unset
+		 * (this mirrors what the single-byte branch does).
+		 */
+		*subStringByteLen = byteLen;
+		*subStringCharLen = consumedChars;
+	}
+}
+
+
 /*
  * text_substr()
  * Return a substring starting at the specified position.
@@ -2559,24 +2626,36 @@ split_text(PG_FUNCTION_ARGS)
 	PG_RETURN_TEXT_P(result_text);
 }
 
+
 /*
- * text_to_array
- * parse input string
- * return text array of elements
- * based on provided field separator
+ * text_to_array_impl
+ *		Carries out the actual tokenization and array conversion of an input string.
+ *
+ * Parameters:
+ * 		string: Where to start in the input string
+ * 		stringByteLen: Length of current string
+ * 		delimiter: Which delimiter to use
+ * 		delimiterByteLen: Length of delimiter in bytes
+ * 		delimiterCharLen: Length of delimiter in chars
+ * 		arrayState: State of the output array where we accumulate results
+ * 		endOfString: Do we expect any more chunk of the main input string?
+ *
+ * Returns the pointer where the last match was found. Successively the
+ * caller can splice more data starting from this address to find further
+ * array elements.
  */
-Datum
-text_to_array(PG_FUNCTION_ARGS)
+static char* text_to_array_impl(char *string, int stringByteLen, char *delimiter,
+		int delimiterByteLen, int delimiterCharLen, ArrayBuildState **arrayState, bool endOfString)
 {
-	Datum d0 = PG_GETARG_DATUM(0);
-	char *p0; void *tofree0; int len0;
+	int start_posn = 1;
+	int fldnum = 1;
+	int end_posn = 0;
+	int chunk_len = 0;
+	text	   *result_text;
 
-	Datum d1 = PG_GETARG_DATUM(1);
-	char *p1; void *tofree1; int len1;
+	char* cur_ptr = string;
 
-	int			inputstring_len;
-	int			fldsep_len; 
-	TextPositionState state = 		
+	TextPositionState state =
 		{
 		0, /* use_wchar */
 		NULL, /* str1 */
@@ -2587,79 +2666,32 @@ text_to_array(PG_FUNCTION_ARGS)
 		0, /* len2 */
 		};
 
-	int			fldnum;
-	int			start_posn;
-	int			end_posn;
-	int			chunk_len;
-	char	   *start_ptr;
-	text	   *result_text;
-	ArrayBuildState *astate = NULL;
-
-	varattrib_untoast_ptr_len(d0, &p0, &len0, &tofree0);
-	varattrib_untoast_ptr_len(d1, &p1, &len1, &tofree1);
-
-	if(pg_database_encoding_max_length() == 1)
-	{
-		inputstring_len = len0;
-		fldsep_len = len1;
-	}
-	else
-	{
-		inputstring_len = pg_mbstrlen_with_len(p0, len0);
-		fldsep_len = pg_mbstrlen_with_len(p1, len1);
-	}
-
-	/* return NULL for empty input string */
-	if (inputstring_len < 1)
-	{
-		if(tofree0)
-			pfree(tofree0);
-		if(tofree1)
-			pfree(tofree1);
-
-		PG_RETURN_NULL();
-	}
-
-	/*
-	 * empty field separator return one element, 1D, array using the input
-	 * string
-	 */
-	if (fldsep_len < 1)
-	{
-		if(tofree0)
-			pfree(tofree0);
-		if(tofree1)
-			pfree(tofree1);
-
-		PG_RETURN_ARRAYTYPE_P(create_singleton_array(fcinfo, TEXTOID, d0, 1));
-	}
-
-	text_position_setup_ptr_len(p0, len0, p1, len1, &state);
-
-	start_posn = 1;
-	/* start_ptr points to the start_posn'th character of inputstring */
-	start_ptr = p0; 
+	text_position_setup_ptr_len(string, stringByteLen, delimiter, delimiterByteLen, &state);
 
 	for (fldnum = 1;; fldnum++) /* field number is 1 based */
 	{
 		end_posn = text_position_next(start_posn, &state);
 
-		if (end_posn == 0)
+		if (end_posn == 0 && !endOfString)
+		{
+			break;
+		}
+		else if (end_posn == 0)
 		{
 			/* fetch last field */
-			chunk_len = (p0 + len0) - start_ptr;
+			chunk_len = (string + stringByteLen) - cur_ptr;
 		}
 		else
 		{
 			/* fetch non-last field */
-			chunk_len = charlen_to_bytelen(start_ptr, end_posn - start_posn);
+			chunk_len = charlen_to_bytelen(cur_ptr, end_posn - start_posn);
 		}
 
 		/* must build a temp text datum to pass to accumArrayResult */
-		result_text = cstring_to_text_with_len(start_ptr, chunk_len);
+		result_text = cstring_to_text_with_len(cur_ptr, chunk_len);
 
 		/* stash away this field */
-		astate = accumArrayResult(astate,
+		*arrayState = accumArrayResult(*arrayState,
 								  PointerGetDatum(result_text),
 								  false,
 								  TEXTOID,
@@ -2668,20 +2700,168 @@ text_to_array(PG_FUNCTION_ARGS)
 		pfree(result_text);
 
 		if (end_posn == 0)
+		{
+			/* Process next sub-string if any */
 			break;
+		}
 
 		start_posn = end_posn;
-		start_ptr += chunk_len;
-		start_posn += fldsep_len;
-		start_ptr += charlen_to_bytelen(start_ptr, fldsep_len);
+		cur_ptr += chunk_len;
+		start_posn += delimiterCharLen;
+		cur_ptr += charlen_to_bytelen(cur_ptr, delimiterCharLen);
 	}
 
 	text_position_cleanup(&state);
 
-	if(tofree0)
-		pfree(tofree0);
-	if(tofree1)
-		pfree(tofree1);
+	return cur_ptr;
+}
+
+
+/*
+ * text_to_array_multi_pass
+ *		Carries out the actual tokenization and array conversion of input string
+ *		in multiple passes, where each pass is restricted to a sub-string of at
+ *		most MAX_STRING_BYTES / sizeof(pg_wchar) bytes when the database
+ *		encoding is multi-byte.
+ *
+ * Parameters:
+ * 		string: The start of the input string
+ * 		stringByteLen: Length of the whole input string in bytes
+ * 		delimiter: Which delimiter to use
+ * 		delimiterByteLen: Length of delimiter in bytes
+ * 		delimiterCharLen: Length of delimiter in chars
+ *
+ * Returns the ArrayBuildState containing all the array elements.
+ *
+ * Raises an error if a non-final pass finds no delimiter at all, because
+ * then no progress can be made through the string.
+ */
+static ArrayBuildState* text_to_array_multi_pass(char *string, int stringByteLen, char *delimiter,
int delimiterByteLen, int delimiterCharLen)
+{
+	ArrayBuildState *astate = NULL;
+
+	/* Start with full string. If it is too big then we chunk it later */
+	char	   *start_ptr = string;
+	int curSubStringByteLen = stringByteLen;
+
+	/* Becomes true once the current pass reaches the last byte of the input */
+	bool endOfString = false;
+
+	/* More bytes to consider? */
+	while (!endOfString)
+	{
+		/*
+		 * Give the rest of the string to the current pass; may be chunked if
+		 * the rest still doesn't fit in the memory
+		 */
+		curSubStringByteLen = (string + stringByteLen) - start_ptr;
+
+		/* Will this MBCS become too big to fit in memory once converted to wchar? */
+		if (pg_database_encoding_max_length() > 1 && curSubStringByteLen > ((MAX_STRING_BYTES)/
sizeof(pg_wchar)))
+		{
+			int curSubStringCharLen = 0;
+			/* We need multi-pass. So find the sub-string boundary for the current pass */
+			find_memory_limited_substring(start_ptr, string + stringByteLen - start_ptr,
+				(MAX_STRING_BYTES) / sizeof(pg_wchar), &curSubStringByteLen, &curSubStringCharLen);
+		}
+
+		/* The chosen sub-string must never extend past the end of the input */
+		Insist(start_ptr + curSubStringByteLen <= string + stringByteLen);
+
+		/* This is the final pass iff the sub-string ends exactly at the end of the input */
+		endOfString = ((start_ptr + curSubStringByteLen) == (string + stringByteLen));
+
+		/* Tokenize this sub-string; the result is where the next pass resumes */
+		char *nextStartPtr = text_to_array_impl(start_ptr, curSubStringByteLen, delimiter, delimiterByteLen,
delimiterCharLen, &astate, endOfString);
+
+		Insist(nextStartPtr >= start_ptr);
+
+		/*
+		 * No delimiter was found in a non-final sub-string: the next pass would
+		 * start at the same place and loop forever, so give up.
+		 */
+		if (!endOfString && nextStartPtr == start_ptr)
+		{
+			elog(ERROR, "String size not supported.");
+		}
+
+		start_ptr = nextStartPtr;
+	}
+
+	return astate;
+}
+
+
+/*
+ * text_to_array
+ *		Parses the input string and returns a text array of elements,
+ *		split on the provided field separator.
+ *
+ * Arguments (fmgr): 0 = input string, 1 = field separator.
+ *
+ * Returns NULL for an empty input string, and a one-element array holding
+ * the input string when the field separator is empty.
+ */
+Datum
+text_to_array(PG_FUNCTION_ARGS)
+{
+	Datum stringDatum = PG_GETARG_DATUM(0);
+	char *string = NULL;
+	void *toFreeString = NULL;
+	int stringByteLen = 0;
+
+	Datum delimiterDatum = PG_GETARG_DATUM(1);
+	char *delimiter = NULL;
+	void *toFreeDelimiter = NULL;
+	int delimiterByteLen = 0;
+
+	int stringCharLen = 0;
+	int	delimiterCharLen = 0;
+
+	/* Obtain pointer/length for both arguments; toFree* is set when a copy must be released */
+	varattrib_untoast_ptr_len(stringDatum, &string, &stringByteLen, &toFreeString);
+	varattrib_untoast_ptr_len(delimiterDatum, &delimiter, &delimiterByteLen, &toFreeDelimiter);
+
+	/* Character counts equal byte counts in single-byte encodings */
+	if(pg_database_encoding_max_length() == 1)
+	{
+		stringCharLen = stringByteLen;
+		delimiterCharLen = delimiterByteLen;
+	}
+	else
+	{
+		stringCharLen = pg_mbstrlen_with_len(string, stringByteLen);
+		delimiterCharLen = pg_mbstrlen_with_len(delimiter, delimiterByteLen);
+	}
+
+	/* return NULL for empty input string */
+	if (stringCharLen < 1)
+	{
+		if(toFreeString)
+		{
+			pfree(toFreeString);
+		}
+
+		if(toFreeDelimiter)
+		{
+			pfree(toFreeDelimiter);
+		}
+
+		PG_RETURN_NULL();
+	}
+
+	/*
+	 * empty field separator return one element, 1D, array using the input
+	 * string
+	 */
+	if (delimiterCharLen < 1)
+	{
+		if(toFreeString)
+		{
+			pfree(toFreeString);
+		}
+
+		if(toFreeDelimiter)
+		{
+			pfree(toFreeDelimiter);
+		}
+
+		/* Built from the original datum, not from the copy released above */
+		PG_RETURN_ARRAYTYPE_P(create_singleton_array(fcinfo, TEXTOID, stringDatum, 1));
+	}
+
+	/* General case: tokenize, in several passes if the string is too large for one */
+	ArrayBuildState *astate = text_to_array_multi_pass(string, stringByteLen, delimiter, delimiterByteLen,
delimiterCharLen);
+
+	if(toFreeString)
+	{
+		pfree(toFreeString);
+	}
+	if(toFreeDelimiter)
+	{
+		pfree(toFreeDelimiter);
+	}
+
 	PG_RETURN_DATUM(makeArrayResult(astate, CurrentMemoryContext));
 }
 


Mime
View raw message