madlib-dev mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From orhankislal <...@git.apache.org>
Subject [GitHub] madlib pull request #223: Balance datasets : re-sampling technique
Date Tue, 16 Jan 2018 19:28:39 GMT
Github user orhankislal commented on a diff in the pull request:

    https://github.com/apache/madlib/pull/223#discussion_r161296957
  
    --- Diff: src/ports/postgres/modules/sample/balance_sample.py_in ---
    @@ -0,0 +1,994 @@
    +# coding=utf-8
    +#
    +# Licensed to the Apache Software Foundation (ASF) under one
    +# or more contributor license agreements.  See the NOTICE file
    +# distributed with this work for additional information
    +# regarding copyright ownership.  The ASF licenses this file
    +# to you under the Apache License, Version 2.0 (the
    +# "License"); you may not use this file EXCEPT in compliance
    +# with the License.  You may obtain a copy of the License at
    +#
    +#   http://www.apache.org/licenses/LICENSE-2.0
    +#
    +# Unless required by applicable law or agreed to in writing,
    +# software distributed under the License is distributed on an
    +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
    +# KIND, either express or implied.  See the License for the
    +# specific language governing permissions and limitations
    +# under the License.
    +import math
    +import plpy
    +import re
    +from collections import defaultdict
    +from fractions import Fraction
    +from utilities.control import MinWarning
    +from utilities.utilities import _assert
    +from utilities.utilities import unique_string
    +from utilities.validate_args import table_exists
    +from utilities.validate_args import columns_exist_in_table
    +from utilities.validate_args import table_is_empty
    +from utilities.validate_args import get_cols
    +from utilities.utilities import py_list_to_sql_string
    +
    +
    +m4_changequote(`<!', `!>')
    +
    +def balance_sample(schema_madlib, source_table, output_table, class_col,
    +    class_sizes, output_table_size, grouping_cols, with_replacement, **kwargs):
    +
    +    """
    +    Balance sampling function
    +    Args:
    +        @param source_table       Input table name.
    +        @param output_table       Output table name.
    +        @param class_col          Name of the column containing the class to be
    +                                  balanced.
    +        @param class_size         Parameter to define the size of the different
    +                                  class values.
    +        @param output_table_size  Desired size of the output data set.
    +        @param grouping_cols      The columns columns that defines the grouping.
    +        @param with_replacement   The sampling method.
    +
    +    """
    +    with MinWarning("warning"):
    +
    +        class_counts = unique_string(desp='class_counts')
    +        desired_sample_per_class = unique_string(desp='desired_sample_per_class')
    +        desired_counts = unique_string(desp='desired_counts')
    +
    +        if not class_sizes or class_sizes.strip().lower() in ('null', ''):
    +            class_sizes = 'uniform'
    +
    +        _validate_strs(source_table, output_table, class_col, class_sizes,
    +            output_table_size, grouping_cols, with_replacement)
    +
    +        source_table_columns = ','.join(get_cols(source_table))
    +        grp_by = "GROUP BY {0}".format(class_col)
    +
    +        _create_frequency_distribution(class_counts, source_table, class_col)
    +        temp_views = [class_counts]
    +
    +        if class_sizes.lower() == 'undersample' and not with_replacement:
    +            """
    +                Random undersample without replacement.
    +                Randomly order the rows and give a unique (per class)
    +                identifier to each one.
    +                Select rows that have identifiers under the target limit.
    +            """
    +            _undersampling_with_no_replacement(source_table, output_table, class_col,
    +                    class_sizes, output_table_size, grouping_cols, with_replacement,
    +                    class_counts, source_table_columns)
    +
    +            _delete_temp_views(temp_views)
    +            return
    +
    +        """
    +            Create views for true and desired sample sizes of classes
    +        """
    +        """
    +            include_unsampled_classes tracks is unsampled classes are desired or not.
    +            include_unsampled_classes is always true in output_table_size Null cases
but changes given values of desired sample class sizes in comma-delimited classsize paramter.
    --- End diff --
    
    is -> if ?


---

Mime
View raw message