Github user njayaram2 commented on a diff in the pull request:
https://github.com/apache/madlib/pull/230#discussion_r166056096
--- Diff: src/ports/postgres/modules/sample/balance_sample.py_in ---
@@ -0,0 +1,748 @@
+# coding=utf-8
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+m4_changequote(`<!', `!>')
+
+import math
+
+if __name__ != "__main__":
+ import plpy
+ from utilities.control import MinWarning
+ from utilities.utilities import _assert
+ from utilities.utilities import extract_keyvalue_params
+ from utilities.utilities import unique_string
+ from utilities.validate_args import columns_exist_in_table
+ from utilities.validate_args import get_cols
+ from utilities.validate_args import table_exists
+ from utilities.validate_args import table_is_empty
+else:
+ # Used only for Unit Testing
+ # FIXME: repeating a function from utilities that is needed by the unit test.
+ # This should be removed once a unittest framework is used for testing.
+ import random
+ import time
+
+ def unique_string(desp='', **kwargs):
+ """
+ Generate a random temporary name for temp tables and other objects.
+ It has a SQL interface so both SQL and Python functions can call it.
+
+ @param desp Optional text placed right after the "__madlib_temp_" prefix
+ @param kwargs Accepted but not read in this copy of the function
+ @returns:
+ Str. "__madlib_temp_" + desp + a random integer, then "_" + the
+ current time in whole seconds, then "_" + that time modulo a
+ second random integer.
+ """
+ # Random component drawn from [1, 100000000].
+ r1 = random.randint(1, 100000000)
+ # Current time truncated to whole seconds.
+ r2 = int(time.time())
+ # Current time reduced modulo a second, independent random draw.
+ r3 = int(time.time()) % random.randint(1, 100000000)
+ # NOTE(review): the line after this assignment looks like a wrapped
+ # continuation carrying the trailing "__" suffix; as quoted it is not part
+ # of the assignment, so the suffix is not appended -- confirm against the
+ # original patch.
+ u_string = "__madlib_temp_" + desp + str(r1) + "_" + str(r2) + "_" + str(r3)
+ "__"
+ return u_string
+# ------------------------------------------------------------------------------
+
+# Names of the sampling strategies. UNIFORM, UNDERSAMPLE and OVERSAMPLE form the
+# default supported set in _validate_and_get_sampling_strategy, and UNIFORM is
+# that function's default.
+UNIFORM = 'uniform'
+UNDERSAMPLE = 'undersample'
+OVERSAMPLE = 'oversample'
+# NOTE(review): NOSAMPLE is not referenced in the part of the file visible
+# here -- confirm where it is used.
+NOSAMPLE = 'nosample'
+
+# NOTE(review): presumably the name of a generated row-id column and the
+# placeholder used for NULL class levels -- confirm against their uses.
+NEW_ID_COLUMN = '__madlib_id__'
+NULL_IDENTIFIER = '__madlib_null_id__'
+
+def _get_frequency_distribution(source_table, class_col):
+ """ Returns a dict containing the number of rows associated with each class
+ level. Each class level value is converted to a string using ::text.
+ @param source_table Name of the table to query; interpolated into the
+ SQL text as-is
+ @param class_col Column to group by; interpolated into the SQL text
+ as-is
+ @returns:
+ Dict. Maps each class level (as text, with surrounding whitespace
+ stripped when the value is non-empty) to its row count.
+ """
+ # The {class_col} and {source_table} placeholders are filled from this
+ # function's arguments via locals().
+ query_result = plpy.execute("""
+ SELECT {class_col}::text AS classes,
+ count(*) AS class_count
+ FROM {source_table}
+ GROUP BY {class_col}
+ """.format(**locals()))
+ actual_level_counts = {}
+ for each_row in query_result:
+ level = each_row['classes']
+ # Falsy levels (None or '') are used as keys unchanged.
+ # NOTE(review): presumably a SQL NULL class arrives here as None -- confirm.
+ if level:
+ level = level.strip()
+ # Levels that are equal after stripping share one key; a later row
+ # overwrites the count stored by an earlier one.
+ actual_level_counts[level] = each_row['class_count']
+ return actual_level_counts
+
+
+def _validate_and_get_sampling_strategy(sampling_strategy_str, output_table_size,
+ supported_strategies=None, default=UNIFORM):
+ """ Returns the sampling strategy based on the class_sizes input param.
+ @param sampling_strategy_str The sampling strategy specified by the
+ user (class_sizes param)
+ @returns:
+ Str. One of [UNIFORM, UNDERSAMPLE, OVERSAMPLE]. Default is UNIFORM.
+ """
+ if not sampling_strategy_str:
+ sampling_strategy_str = default
+ else:
+ if len(sampling_strategy_str) < 3:
+ # Require at least 3 characters since UNIFORM and UNDERSAMPLE have
+ # common prefix substring
+ plpy.error("Sample: Invalid class_sizes parameter")
+
+ if not supported_strategies:
+ supported_strategies = [UNIFORM, UNDERSAMPLE, OVERSAMPLE]
+ try:
+ # allow user to specify a prefix substring of
--- End diff --
There is precedent for supporting prefixes of parameter values in other modules, such as SVM.
Yes, the error messages could be similar.
---
|