Github user orhankislal commented on a diff in the pull request:
https://github.com/apache/madlib/pull/223#discussion_r161296957
--- Diff: src/ports/postgres/modules/sample/balance_sample.py_in ---
@@ -0,0 +1,994 @@
+# coding=utf-8
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+import math
+import plpy
+import re
+from collections import defaultdict
+from fractions import Fraction
+from utilities.control import MinWarning
+from utilities.utilities import _assert
+from utilities.utilities import unique_string
+from utilities.validate_args import table_exists
+from utilities.validate_args import columns_exist_in_table
+from utilities.validate_args import table_is_empty
+from utilities.validate_args import get_cols
+from utilities.utilities import py_list_to_sql_string
+
+
+m4_changequote(`<!', `!>')
+
+def balance_sample(schema_madlib, source_table, output_table, class_col,
+ class_sizes, output_table_size, grouping_cols, with_replacement, **kwargs):
+
+ """
+ Balance sampling function
+ Args:
+ @param source_table Input table name.
+ @param output_table Output table name.
+ @param class_col Name of the column containing the class to be
+ balanced.
+ @param class_sizes Parameter to define the size of the different
+ class values.
+ @param output_table_size Desired size of the output data set.
+ @param grouping_cols The columns that define the grouping.
+ @param with_replacement The sampling method.
+
+ """
+ with MinWarning("warning"):
+
+ class_counts = unique_string(desp='class_counts')
+ desired_sample_per_class = unique_string(desp='desired_sample_per_class')
+ desired_counts = unique_string(desp='desired_counts')
+
+ if not class_sizes or class_sizes.strip().lower() in ('null', ''):
+ class_sizes = 'uniform'
+
+ _validate_strs(source_table, output_table, class_col, class_sizes,
+ output_table_size, grouping_cols, with_replacement)
+
+ source_table_columns = ','.join(get_cols(source_table))
+ grp_by = "GROUP BY {0}".format(class_col)
+
+ _create_frequency_distribution(class_counts, source_table, class_col)
+ temp_views = [class_counts]
+
+ if class_sizes.lower() == 'undersample' and not with_replacement:
+ """
+ Random undersample without replacement.
+ Randomly order the rows and give a unique (per class)
+ identifier to each one.
+ Select rows that have identifiers under the target limit.
+ """
+ _undersampling_with_no_replacement(source_table, output_table, class_col,
+ class_sizes, output_table_size, grouping_cols, with_replacement,
+ class_counts, source_table_columns)
+
+ _delete_temp_views(temp_views)
+ return
+
+ """
+ Create views for true and desired sample sizes of classes
+ """
+ """
+ include_unsampled_classes tracks is unsampled classes are desired or not.
+ include_unsampled_classes is always true in output_table_size Null cases
but changes given values of desired sample class sizes in comma-delimited class_sizes parameter.
--- End diff --
Typo in "include_unsampled_classes tracks is unsampled classes ...": is -> if ?
---
|