Return-Path: X-Original-To: archive-asf-public-internal@cust-asf2.ponee.io Delivered-To: archive-asf-public-internal@cust-asf2.ponee.io Received: from cust-asf.ponee.io (cust-asf.ponee.io [163.172.22.183]) by cust-asf2.ponee.io (Postfix) with ESMTP id B2E2C200BFB for ; Wed, 11 Jan 2017 11:58:14 +0100 (CET) Received: by cust-asf.ponee.io (Postfix) id B19CB160B4E; Wed, 11 Jan 2017 10:58:14 +0000 (UTC) Delivered-To: archive-asf-public@cust-asf.ponee.io Received: from mail.apache.org (hermes.apache.org [140.211.11.3]) by cust-asf.ponee.io (Postfix) with SMTP id ACA23160B2E for ; Wed, 11 Jan 2017 11:58:13 +0100 (CET) Received: (qmail 22188 invoked by uid 500); 11 Jan 2017 10:58:12 -0000 Mailing-List: contact issues-help@flink.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: dev@flink.apache.org Delivered-To: mailing list issues@flink.apache.org Received: (qmail 22179 invoked by uid 99); 11 Jan 2017 10:58:12 -0000 Received: from pnap-us-west-generic-nat.apache.org (HELO spamd2-us-west.apache.org) (209.188.14.142) by apache.org (qpsmtpd/0.29) with ESMTP; Wed, 11 Jan 2017 10:58:12 +0000 Received: from localhost (localhost [127.0.0.1]) by spamd2-us-west.apache.org (ASF Mail Server at spamd2-us-west.apache.org) with ESMTP id 4DDF41A05B5 for ; Wed, 11 Jan 2017 10:58:12 +0000 (UTC) X-Virus-Scanned: Debian amavisd-new at spamd2-us-west.apache.org X-Spam-Flag: NO X-Spam-Score: -7.019 X-Spam-Level: X-Spam-Status: No, score=-7.019 tagged_above=-999 required=6.31 tests=[KAM_LAZY_DOMAIN_SECURITY=1, RCVD_IN_DNSWL_HI=-5, RCVD_IN_MSPIKE_H3=-0.01, RCVD_IN_MSPIKE_WL=-0.01, RP_MATCHES_RCVD=-2.999] autolearn=disabled Received: from mx1-lw-us.apache.org ([10.40.0.8]) by localhost (spamd2-us-west.apache.org [10.40.0.9]) (amavisd-new, port 10024) with ESMTP id McCTEEdmkZtv for ; Wed, 11 Jan 2017 10:58:10 +0000 (UTC) Received: from mail.apache.org (hermes.apache.org [140.211.11.3]) by mx1-lw-us.apache.org (ASF Mail Server at 
mx1-lw-us.apache.org) with SMTP id 2C9D15FCD0 for ; Wed, 11 Jan 2017 10:58:10 +0000 (UTC) Received: (qmail 19115 invoked by uid 99); 11 Jan 2017 10:57:08 -0000 Received: from git1-us-west.apache.org (HELO git1-us-west.apache.org) (140.211.11.23) by apache.org (qpsmtpd/0.29) with ESMTP; Wed, 11 Jan 2017 10:57:08 +0000 Received: by git1-us-west.apache.org (ASF Mail Server at git1-us-west.apache.org, from userid 33) id 1D081DFB79; Wed, 11 Jan 2017 10:57:08 +0000 (UTC) From: twalthr To: issues@flink.incubator.apache.org Reply-To: issues@flink.incubator.apache.org References: In-Reply-To: Subject: [GitHub] flink pull request #2938: [FLINK-4692] [tableApi] Add tumbling group-windows... Content-Type: text/plain Message-Id: <20170111105708.1D081DFB79@git1-us-west.apache.org> Date: Wed, 11 Jan 2017 10:57:08 +0000 (UTC) archived-at: Wed, 11 Jan 2017 10:58:14 -0000 Github user twalthr commented on a diff in the pull request: https://github.com/apache/flink/pull/2938#discussion_r95549888 --- Diff: flink-libraries/flink-table/src/main/scala/org/apache/flink/table/plan/nodes/dataset/DataSetWindowAggregate.scala --- @@ -0,0 +1,244 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.flink.table.plan.nodes.dataset + +import org.apache.calcite.plan.{RelOptCluster, RelOptCost, RelOptPlanner, RelTraitSet} +import org.apache.calcite.rel.`type`.RelDataType +import org.apache.calcite.rel.core.AggregateCall +import org.apache.calcite.rel.metadata.RelMetadataQuery +import org.apache.calcite.rel.{RelNode, RelWriter, SingleRel} +import org.apache.flink.api.common.operators.Order +import org.apache.flink.api.common.typeinfo.TypeInformation +import org.apache.flink.api.java.DataSet +import org.apache.flink.api.java.typeutils.{ResultTypeQueryable, RowTypeInfo} +import org.apache.flink.table.api.BatchTableEnvironment +import org.apache.flink.table.calcite.FlinkRelBuilder.NamedWindowProperty +import org.apache.flink.table.calcite.FlinkTypeFactory +import org.apache.flink.table.plan.logical._ +import org.apache.flink.table.plan.nodes.FlinkAggregate +import org.apache.flink.table.runtime.aggregate.AggregateUtil.{CalcitePair, _} +import org.apache.flink.table.typeutils.TypeCheckUtils.isTimeInterval +import org.apache.flink.table.typeutils.TypeConverter +import org.apache.flink.types.Row + +import scala.collection.JavaConversions._ + +/** + * Flink RelNode which matches along with a LogicalWindowAggregate. 
+ */ +class DataSetWindowAggregate( + window: LogicalWindow, + namedProperties: Seq[NamedWindowProperty], + cluster: RelOptCluster, + traitSet: RelTraitSet, + inputNode: RelNode, + namedAggregates: Seq[CalcitePair[AggregateCall, String]], + rowRelDataType: RelDataType, + inputType: RelDataType, + grouping: Array[Int]) + extends SingleRel(cluster, traitSet, inputNode) + with FlinkAggregate + with DataSetRel { + + override def deriveRowType() = rowRelDataType + + override def copy(traitSet: RelTraitSet, inputs: java.util.List[RelNode]): RelNode = { + new DataSetWindowAggregate( + window, + namedProperties, + cluster, + traitSet, + inputs.get(0), + namedAggregates, + getRowType, + inputType, + grouping) + } + + override def toString: String = { + s"Aggregate(${ + if (!grouping.isEmpty) { + s"groupBy: (${groupingToString(inputType, grouping)}), " + } else { + "" + } + }window: ($window), " + + s"select: (${ + aggregationToString( + inputType, + grouping, + getRowType, + namedAggregates, + namedProperties) + }))" + } + + override def explainTerms(pw: RelWriter): RelWriter = { + super.explainTerms(pw) + .itemIf("groupBy", groupingToString(inputType, grouping), !grouping.isEmpty) + .item("window", window) + .item( + "select", aggregationToString( + inputType, + grouping, + getRowType, + namedAggregates, + namedProperties)) + } + + override def computeSelfCost (planner: RelOptPlanner, metadata: RelMetadataQuery): RelOptCost = { + val child = this.getInput + val rowCnt = metadata.getRowCount(child) + val rowSize = this.estimateRowSize(child.getRowType) + val aggCnt = this.namedAggregates.size + planner.getCostFactory.makeCost(rowCnt, rowCnt * aggCnt, rowCnt * rowSize) + } + + override def translateToPlan( + tableEnv: BatchTableEnvironment, + expectedType: Option[TypeInformation[Any]]): DataSet[Any] = { + + val config = tableEnv.getConfig + + val inputDS = getInput.asInstanceOf[DataSetRel].translateToPlan( + tableEnv, + // tell the input operator that this operator currently 
only supports Rows as input + Some(TypeConverter.DEFAULT_ROW_TYPE)) + + val result = window match { + case EventTimeTumblingGroupWindow(_, _, size) => + createEventTimeTumblingWindowDataSet(inputDS, isTimeInterval(size.resultType)) + case EventTimeSessionGroupWindow(_, _, _) => + throw new UnsupportedOperationException( + "Event-time session windows on batch are currently not supported") + case EventTimeSlidingGroupWindow(_, _, _, _) => + throw new UnsupportedOperationException( + "Event-time sliding windows on batch are currently not supported") + case _: ProcessingTimeGroupWindow => + throw new UnsupportedOperationException( + "Processing-time tumbling windows are not supported on batch tables, " + + "window on batch must declare a time attribute over which the query is evaluated.") + } + + // if the expected type is not a Row, inject a mapper to convert to the expected type + expectedType match { + case Some(typeInfo) if typeInfo.getTypeClass != classOf[Row] => + val mapName = s"convert: (${getRowType.getFieldNames.toList.mkString(", ")})" + result.map( + getConversionMapper( + config = config, + nullableInput = false, + inputType = resultRowTypeInfo.asInstanceOf[TypeInformation[Any]], + expectedType = expectedType.get, + conversionOperatorName = "DataSetWindowAggregateConversion", + fieldNames = getRowType.getFieldNames + )) + .name(mapName) + case _ => result + } + } + + + private def createEventTimeTumblingWindowDataSet( + inputDS: DataSet[Any], + isTimeWindow: Boolean) + : DataSet[Any] = { + val mapFunction = createDataSetWindowPrepareMapFunction( + window, + namedAggregates, + grouping, + inputType) + val groupReduceFunction = createDataSetWindowAggGroupReduceFunction( + window, + namedAggregates, + inputType, + getRowType, + grouping, + namedProperties) + + val mappedInput = inputDS + .map(mapFunction) + .name(prepareOperatorName) + + if (isTimeWindow) { + // grouped time window aggregation + val mapReturnType = 
mapFunction.asInstanceOf[ResultTypeQueryable[Row]].getProducedType + // group by grouping keys and rowtime field (the last field in the row) + val groupingKeys = grouping.indices ++ Seq(mapReturnType.getArity - 1) + mappedInput.asInstanceOf[DataSet[Row]] + .groupBy(groupingKeys: _*) + .reduceGroup(groupReduceFunction) + .returns(resultRowTypeInfo) + .name(aggregateOperatorName) + .asInstanceOf[DataSet[Any]] + } else { + // count window + val groupingKeys = grouping.indices.toArray + if (groupingKeys.length > 0) { + // grouped aggregation + mappedInput.asInstanceOf[DataSet[Row]] + .groupBy(groupingKeys: _*) + // sort on time field, it's the one after grouping keys + .sortGroup(groupingKeys.length, Order.ASCENDING) --- End diff -- Shouldn't this be `mapReturnType.getArity - 1`? According to the docs of `AggregateUtil#createDataSetWindowPrepareMapFunction`, the time field should be at the end. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes to enable it, or if the feature is enabled but not working, please contact infrastructure at infrastructure@apache.org or file a JIRA ticket with INFRA. ---