From commits-return-2427-archive-asf-public=cust-asf.ponee.io@superset.incubator.apache.org Thu Mar 21 00:14:25 2019 Return-Path: X-Original-To: archive-asf-public@cust-asf.ponee.io Delivered-To: archive-asf-public@cust-asf.ponee.io Received: from mail.apache.org (hermes.apache.org [140.211.11.3]) by mx-eu-01.ponee.io (Postfix) with SMTP id 71E3718062C for ; Thu, 21 Mar 2019 01:14:24 +0100 (CET) Received: (qmail 59168 invoked by uid 500); 21 Mar 2019 00:14:23 -0000 Mailing-List: contact commits-help@superset.incubator.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: dev@superset.incubator.apache.org Delivered-To: mailing list commits@superset.incubator.apache.org Received: (qmail 59159 invoked by uid 99); 21 Mar 2019 00:14:23 -0000 Received: from ec2-52-202-80-70.compute-1.amazonaws.com (HELO gitbox.apache.org) (52.202.80.70) by apache.org (qpsmtpd/0.29) with ESMTP; Thu, 21 Mar 2019 00:14:23 +0000 Received: by gitbox.apache.org (ASF Mail Server at gitbox.apache.org, from userid 33) id 0332185074; Thu, 21 Mar 2019 00:14:23 +0000 (UTC) Date: Thu, 21 Mar 2019 00:14:22 +0000 To: "commits@superset.apache.org" Subject: [incubator-superset] branch master updated: [missing values] Removing replacing missing values (#4905) MIME-Version: 1.0 Content-Type: text/plain; charset=utf-8 Content-Transfer-Encoding: 8bit Message-ID: <155312726258.6230.5426256851050049353@gitbox.apache.org> From: johnbodley@apache.org X-Git-Host: gitbox.apache.org X-Git-Repo: incubator-superset X-Git-Refname: refs/heads/master X-Git-Reftype: branch X-Git-Oldrev: 30f88caf552fa6fcf96c3232a3876181981bf8e9 X-Git-Newrev: 61add606ca16a6ba981ccde864b121f5464b697a X-Git-Rev: 61add606ca16a6ba981ccde864b121f5464b697a X-Git-NotificationType: ref_changed_plus_diff X-Git-Multimail-Version: 1.5.dev Auto-Submitted: auto-generated This is an automated email from the ASF dual-hosted git repository. johnbodley pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/incubator-superset.git The following commit(s) were added to refs/heads/master by this push: new 61add60 [missing values] Removing replacing missing values (#4905) 61add60 is described below commit 61add606ca16a6ba981ccde864b121f5464b697a Author: John Bodley <4567245+john-bodley@users.noreply.github.com> AuthorDate: Wed Mar 20 17:14:15 2019 -0700 [missing values] Removing replacing missing values (#4905) --- superset/assets/backendSync.json | 3 +- superset/assets/src/explore/controls.jsx | 2 +- superset/common/query_context.py | 23 ------- superset/viz.py | 102 +++++++++++++------------------ tests/core_tests.py | 10 --- tests/viz_tests.py | 12 ---- 6 files changed, 46 insertions(+), 106 deletions(-) diff --git a/superset/assets/backendSync.json b/superset/assets/backendSync.json index ee44c70..3dfb573 100644 --- a/superset/assets/backendSync.json +++ b/superset/assets/backendSync.json @@ -3729,4 +3729,5 @@ "default": false } } -} \ No newline at end of file +} + diff --git a/superset/assets/src/explore/controls.jsx b/superset/assets/src/explore/controls.jsx index d5fd1ca..9bb8198 100644 --- a/superset/assets/src/explore/controls.jsx +++ b/superset/assets/src/explore/controls.jsx @@ -1351,7 +1351,7 @@ export const controls = { 'mean', 'min', 'max', - 'stdev', + 'std', 'var', ]), default: 'sum', diff --git a/superset/common/query_context.py b/superset/common/query_context.py index 5053372..f989dad 100644 --- a/superset/common/query_context.py +++ b/superset/common/query_context.py @@ -41,7 +41,6 @@ class QueryContext: to retrieve the data payload for a given viz. """ - default_fillna = 0 cache_type = 'df' enforce_numerical_metrics = True @@ -103,7 +102,6 @@ class QueryContext: self.df_metrics_to_num(df, query_object) df.replace([np.inf, -np.inf], np.nan) - df = self.handle_nulls(df) return { 'query': result.query, 'status': result.status, @@ -118,27 +116,6 @@ class QueryContext: if dtype.type == np.object_ and col in metrics: df[col] = pd.to_numeric(df[col], errors='coerce') - def handle_nulls(self, df): - fillna = self.get_fillna_for_columns(df.columns) - return df.fillna(fillna) - - def get_fillna_for_col(self, col): - """Returns the value to use as filler for a specific Column.type""" - if col and col.is_string: - return ' NULL' - return self.default_fillna - - def get_fillna_for_columns(self, columns=None): - """Returns a dict or scalar that can be passed to DataFrame.fillna""" - if columns is None: - return self.default_fillna - columns_dict = {col.column_name: col for col in self.datasource.columns} - fillna = { - c: self.get_fillna_for_col(columns_dict.get(c)) - for c in columns - } - return fillna - def get_data(self, df): return df.to_dict(orient='records') diff --git a/superset/viz.py b/superset/viz.py index ef7ee45..1a93adb 100644 --- a/superset/viz.py +++ b/superset/viz.py @@ -75,7 +75,6 @@ class BaseViz(object): verbose_name = 'Base Viz' credits = '' is_timeseries = False - default_fillna = 0 cache_type = 'df' enforce_numerical_metrics = True @@ -164,28 +163,6 @@ class BaseViz(object): """ pass - def handle_nulls(self, df): - fillna = self.get_fillna_for_columns(df.columns) - return df.fillna(fillna) - - def get_fillna_for_col(self, col): - """Returns the value to use as filler for a specific Column.type""" - if col: - if col.is_string: - return ' NULL' - return self.default_fillna - - def get_fillna_for_columns(self, columns=None): - """Returns a dict or scalar that can be passed to DataFrame.fillna""" - if columns is None: - return self.default_fillna - columns_dict = {col.column_name: col for col in self.datasource.columns} - fillna = { - c: self.get_fillna_for_col(columns_dict.get(c)) - for c in columns - } - return fillna - def get_samples(self): query_obj = self.query_obj() query_obj.update({ @@ -254,8 +231,7 @@ class BaseViz(object): if self.enforce_numerical_metrics: self.df_metrics_to_num(df) - df.replace([np.inf, -np.inf], np.nan) - df = self.handle_nulls(df) + df.replace([np.inf, -np.inf], np.nan, inplace=True) return df def df_metrics_to_num(self, df): @@ -653,7 +629,9 @@ class TimeTableViz(BaseViz): pt = df.pivot_table( index=DTTM_ALIAS, columns=columns, - values=values) + values=values, + dropna=False, + ) pt.index = pt.index.map(str) pt = pt.sort_index() return dict( @@ -696,12 +674,20 @@ class PivotTableViz(BaseViz): self.form_data.get('granularity') == 'all' and DTTM_ALIAS in df): del df[DTTM_ALIAS] + + aggfunc = self.form_data.get('pandas_aggfunc') + + # Ensure that Pandas's sum function mimics that of SQL. + if aggfunc == 'sum': + aggfunc = lambda x: x.sum(min_count=1) # noqa: E731 + df = df.pivot_table( index=self.form_data.get('groupby'), columns=self.form_data.get('columns'), values=[utils.get_metric_name(m) for m in self.form_data.get('metrics')], - aggfunc=self.form_data.get('pandas_aggfunc'), + aggfunc=aggfunc, margins=self.form_data.get('pivot_margins'), + dropna=False, ) # Display metrics side by side with each column if self.form_data.get('combine_metric'): @@ -709,7 +695,7 @@ class PivotTableViz(BaseViz): return dict( columns=list(df.columns), html=df.to_html( - na_rep='', + na_rep='null', classes=( 'dataframe table table-striped table-bordered ' 'table-condensed table-hover').split(' ')), @@ -877,7 +863,7 @@ class BoxPlotViz(NVD3Viz): index_value = label_sep.join(index_value) boxes = defaultdict(dict) for (label, key), value in row.items(): - if key == 'median': + if key == 'nanmedian': key = 'Q2' boxes[label][key] = value for label, box in boxes.items(): @@ -894,28 +880,24 @@ class BoxPlotViz(NVD3Viz): def get_data(self, df): form_data = self.form_data - df = df.fillna(0) # conform to NVD3 names def Q1(series): # need to be named functions - can't use lambdas - return np.percentile(series, 25) + return np.nanpercentile(series, 25) def Q3(series): - return np.percentile(series, 75) + return np.nanpercentile(series, 75) whisker_type = form_data.get('whisker_options') if whisker_type == 'Tukey': def whisker_high(series): upper_outer_lim = Q3(series) + 1.5 * (Q3(series) - Q1(series)) - series = series[series <= upper_outer_lim] - return series[np.abs(series - upper_outer_lim).argmin()] + return series[series <= upper_outer_lim].max() def whisker_low(series): lower_outer_lim = Q1(series) - 1.5 * (Q3(series) - Q1(series)) - # find the closest value above the lower outer limit - series = series[series >= lower_outer_lim] - return series[np.abs(series - lower_outer_lim).argmin()] + return series[series >= lower_outer_lim].min() elif whisker_type == 'Min/max (no outliers)': @@ -929,10 +911,10 @@ class BoxPlotViz(NVD3Viz): low, high = whisker_type.replace(' percentiles', '').split('/') def whisker_high(series): - return np.percentile(series, int(high)) + return np.nanpercentile(series, int(high)) def whisker_low(series): - return np.percentile(series, int(low)) + return np.nanpercentile(series, int(low)) else: raise ValueError('Unknown whisker type: {}'.format(whisker_type)) @@ -943,7 +925,7 @@ class BoxPlotViz(NVD3Viz): # pandas sometimes doesn't like getting lists back here return set(above.tolist() + below.tolist()) - aggregate = [Q1, np.median, Q3, whisker_high, whisker_low, outliers] + aggregate = [Q1, np.nanmedian, Q3, whisker_high, whisker_low, outliers] df = df.groupby(form_data.get('groupby')).agg(aggregate) chart_data = self.to_series(df) return chart_data @@ -1034,7 +1016,6 @@ class BulletViz(NVD3Viz): return d def get_data(self, df): - df = df.fillna(0) df['metric'] = df[[utils.get_metric_name(self.metric)]] values = df['metric'].values return { @@ -1152,7 +1133,6 @@ class NVD3TimeSeriesViz(NVD3Viz): def process_data(self, df, aggregate=False): fd = self.form_data - df = df.fillna(0) if fd.get('granularity') == 'all': raise Exception(_('Pick a time granularity for your time series')) @@ -1160,14 +1140,18 @@ class NVD3TimeSeriesViz(NVD3Viz): df = df.pivot_table( index=DTTM_ALIAS, columns=fd.get('groupby'), - values=self.metric_labels) + values=self.metric_labels, + dropna=False, + ) else: df = df.pivot_table( index=DTTM_ALIAS, columns=fd.get('groupby'), values=self.metric_labels, fill_value=0, - aggfunc=sum) + aggfunc=sum, + dropna=False, + ) fm = fd.get('resample_fillmethod') if not fm: @@ -1176,8 +1160,6 @@ class NVD3TimeSeriesViz(NVD3Viz): rule = fd.get('resample_rule') if how and rule: df = df.resample(rule, how=how, fill_method=fm) - if not fm: - df = df.fillna(0) if self.sort_series: dfs = df.sum() @@ -1241,7 +1223,6 @@ class NVD3TimeSeriesViz(NVD3Viz): fd = self.form_data comparison_type = fd.get('comparison_type') or 'values' df = self.process_data(df) - if comparison_type == 'values': chart_data = self.to_series(df) for i, (label, df2) in enumerate(self._extra_chart_data): @@ -1368,7 +1349,6 @@ class NVD3DualLineViz(NVD3Viz): def get_data(self, df): fd = self.form_data - df = df.fillna(0) if self.form_data.get('granularity') == 'all': raise Exception(_('Pick a time granularity for your time series')) @@ -1377,7 +1357,9 @@ class NVD3DualLineViz(NVD3Viz): metric_2 = utils.get_metric_name(fd.get('metric_2')) df = df.pivot_table( index=DTTM_ALIAS, - values=[metric, metric_2]) + values=[metric, metric_2], + dropna=False, + ) chart_data = self.to_series(df) return chart_data @@ -1425,7 +1407,9 @@ class NVD3TimePivotViz(NVD3TimeSeriesViz): df = df.pivot_table( index=DTTM_ALIAS, columns='series', - values=utils.get_metric_name(fd.get('metric'))) + values=utils.get_metric_name(fd.get('metric')), + dropna=False, + ) chart_data = self.to_series(df) for serie in chart_data: serie['rank'] = rank_lookup[serie['key']] @@ -1462,7 +1446,9 @@ class DistributionPieViz(NVD3Viz): metric = self.metric_labels[0] df = df.pivot_table( index=self.groupby, - values=[metric]) + values=[metric], + dropna=False, + ) df.sort_values(by=metric, ascending=False, inplace=True) df = df.reset_index() df.columns = ['x', 'y'] @@ -1549,9 +1535,10 @@ class DistributionBarViz(DistributionPieViz): pt = df.pivot_table( index=self.groupby, columns=columns, - values=metrics) + values=metrics, + dropna=False, + ) if fd.get('contribution'): - pt = pt.fillna(0) pt = pt.T pt = (pt / pt.sum()).T pt = pt.reindex(row.index) @@ -2117,9 +2104,6 @@ class BaseDeckGLViz(BaseViz): credits = 'deck.gl' spatial_control_keys = [] - def handle_nulls(self, df): - return df - def get_metrics(self): self.metric = self.form_data.get('size') return [self.metric] if self.metric else [] @@ -2572,11 +2556,11 @@ class PairedTTestViz(BaseViz): fd = self.form_data groups = fd.get('groupby') metrics = fd.get('metrics') - df.fillna(0) df = df.pivot_table( index=DTTM_ALIAS, columns=groups, - values=metrics) + values=metrics, + ) cols = [] # Be rid of falsey keys for col in df.columns: @@ -2699,7 +2683,7 @@ class PartitionViz(NVD3TimeSeriesViz): for i in range(0, len(groups) + 1): self.form_data['groupby'] = groups[:i] df_drop = df.drop(groups[i:], 1) - procs[i] = self.process_data(df_drop, aggregate=True).fillna(0) + procs[i] = self.process_data(df_drop, aggregate=True) self.form_data['groupby'] = groups return procs diff --git a/tests/core_tests.py b/tests/core_tests.py index 326025e..14a6ed8 100644 --- a/tests/core_tests.py +++ b/tests/core_tests.py @@ -622,16 +622,6 @@ class CoreTests(SupersetTestCase): assert 'language' in resp self.logout() - def test_viz_get_fillna_for_columns(self): - slc = self.get_slice('Girls', db.session) - q = slc.viz.query_obj() - results = slc.viz.datasource.query(q) - fillna_columns = slc.viz.get_fillna_for_columns(results.df.columns) - self.assertDictEqual( - fillna_columns, - {'name': ' NULL', 'sum__num': 0}, - ) - def test_import_csv(self): self.login(username='admin') filename = 'testCSV.csv' diff --git a/tests/viz_tests.py b/tests/viz_tests.py index b23cacd..b84c661 100644 --- a/tests/viz_tests.py +++ b/tests/viz_tests.py @@ -84,18 +84,6 @@ class BaseVizTestCase(SupersetTestCase): self.assertEqual(test_viz.metric_labels, expect_metric_labels) self.assertEqual(test_viz.all_metrics, expect_metric_labels) - def test_get_fillna_returns_default_on_null_columns(self): - form_data = { - 'viz_type': 'table', - 'token': '12345', - } - datasource = self.get_datasource_mock() - test_viz = viz.BaseViz(datasource, form_data) - self.assertEqual( - test_viz.default_fillna, - test_viz.get_fillna_for_columns(), - ) - def test_get_df_returns_empty_df(self): form_data = {'dummy': 123} query_obj = {'granularity': 'day'}