Return-Path: X-Original-To: archive-asf-public-internal@cust-asf2.ponee.io Delivered-To: archive-asf-public-internal@cust-asf2.ponee.io Received: from cust-asf.ponee.io (cust-asf.ponee.io [163.172.22.183]) by cust-asf2.ponee.io (Postfix) with ESMTP id E0CA1200CD3 for ; Fri, 14 Jul 2017 00:21:20 +0200 (CEST) Received: by cust-asf.ponee.io (Postfix) id DF16A16CFF2; Thu, 13 Jul 2017 22:21:20 +0000 (UTC) Delivered-To: archive-asf-public@cust-asf.ponee.io Received: from mail.apache.org (hermes.apache.org [140.211.11.3]) by cust-asf.ponee.io (Postfix) with SMTP id 0A70A16CFEF for ; Fri, 14 Jul 2017 00:21:19 +0200 (CEST) Received: (qmail 54445 invoked by uid 500); 13 Jul 2017 22:21:17 -0000 Mailing-List: contact dev-help@drill.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: dev@drill.apache.org Delivered-To: mailing list dev@drill.apache.org Received: (qmail 53736 invoked by uid 99); 13 Jul 2017 22:21:17 -0000 Received: from git1-us-west.apache.org (HELO git1-us-west.apache.org) (140.211.11.23) by apache.org (qpsmtpd/0.29) with ESMTP; Thu, 13 Jul 2017 22:21:17 +0000 Received: by git1-us-west.apache.org (ASF Mail Server at git1-us-west.apache.org, from userid 33) id 1E76DF5530; Thu, 13 Jul 2017 22:21:17 +0000 (UTC) From: Ben-Zvi To: dev@drill.apache.org Reply-To: dev@drill.apache.org References: In-Reply-To: Subject: [GitHub] drill pull request #860: DRILL-5601: Rollup of external sort fixes and impro... Content-Type: text/plain Message-Id: <20170713222117.1E76DF5530@git1-us-west.apache.org> Date: Thu, 13 Jul 2017 22:21:17 +0000 (UTC) archived-at: Thu, 13 Jul 2017 22:21:21 -0000 Github user Ben-Zvi commented on a diff in the pull request: https://github.com/apache/drill/pull/860#discussion_r123844716 --- Diff: exec/java-exec/src/main/java/org/apache/drill/exec/physical/impl/spill/RecordBatchSizer.java --- @@ -19,117 +19,162 @@ import java.util.ArrayList; import java.util.List; +import java.util.Set; +import org.apache.drill.common.types.TypeProtos.DataMode; import org.apache.drill.common.types.TypeProtos.MinorType; import org.apache.drill.exec.expr.TypeHelper; +import org.apache.drill.exec.memory.AllocationManager.BufferLedger; import org.apache.drill.exec.memory.BaseAllocator; import org.apache.drill.exec.record.BatchSchema; import org.apache.drill.exec.record.MaterializedField; import org.apache.drill.exec.record.RecordBatch; +import org.apache.drill.exec.record.SmartAllocationHelper; import org.apache.drill.exec.record.VectorAccessible; import org.apache.drill.exec.record.VectorWrapper; import org.apache.drill.exec.record.selection.SelectionVector2; +import org.apache.drill.exec.vector.UInt4Vector; import org.apache.drill.exec.vector.ValueVector; import org.apache.drill.exec.vector.complex.AbstractMapVector; +import org.apache.drill.exec.vector.complex.RepeatedValueVector; import org.apache.drill.exec.vector.VariableWidthVector; +import com.google.common.collect.Sets; + /** * Given a record batch or vector container, determines the actual memory * consumed by each column, the average row, and the entire record batch. */ public class RecordBatchSizer { -// private static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(RecordBatchSizer.class); /** * Column size information. */ public static class ColumnSize { + public final String prefix; public final MaterializedField metadata; /** - * Assumed size from Drill metadata. + * Assumed size from Drill metadata. Note that this information is + * 100% bogus. Do not use it. */ + @Deprecated public int stdSize; /** - * Actual memory consumed by all the vectors associated with this column. - */ - - public int totalSize; - - /** * Actual average column width as determined from actual memory use. This * size is larger than the actual data size since this size includes per- * column overhead such as any unused vector space, etc. */ - public int estSize; - public int capacity; - public int density; - public int dataSize; - public boolean variableWidth; - - public ColumnSize(ValueVector vv) { - metadata = vv.getField(); - stdSize = TypeHelper.getSize(metadata.getType()); - - // Can't get size estimates if this is an empty batch. - int rowCount = vv.getAccessor().getValueCount(); - if (rowCount == 0) { - estSize = stdSize; - return; - } + public final int estSize; + public final int valueCount; + public final int entryCount; + public final int dataSize; + public final int estElementCount; + public final boolean isVariableWidth; - // Total size taken by all vectors (and underlying buffers) - // associated with this vector. + public ColumnSize(ValueVector v, String prefix, int valueCount) { + this.prefix = prefix; + this.valueCount = valueCount; + metadata = v.getField(); + boolean isMap = v.getField().getType().getMinorType() == MinorType.MAP; - totalSize = vv.getAllocatedByteCount(); + // The amount of memory consumed by the payload: the actual + // data stored in the vectors. - // Capacity is the number of values that the vector could - // contain. This is useful only for fixed-length vectors. + int mapSize = 0; + if (v.getField().getDataMode() == DataMode.REPEATED) { - capacity = vv.getValueCapacity(); + // Repeated vectors are special: they have an associated offset vector + // that changes the value count of the contained vectors. - // The amount of memory consumed by the payload: the actual - // data stored in the vectors. + @SuppressWarnings("resource") + UInt4Vector offsetVector = ((RepeatedValueVector) v).getOffsetVector(); + entryCount = offsetVector.getAccessor().get(valueCount); + estElementCount = roundUp(entryCount, valueCount); + if (isMap) { - dataSize = vv.getPayloadByteCount(); + // For map, the only data associated with the map vector + // itself is the offset vector, if any. - // Determine "density" the number of rows compared to potential - // capacity. Low-density batches occur at block boundaries, ends - // of files and so on. Low-density batches throw off our estimates - // for Varchar columns because we don't know the actual number of - // bytes consumed (that information is hidden behind the Varchar - // implementation where we can't get at it.) + mapSize = offsetVector.getPayloadByteCount(valueCount); + } + } else { + entryCount = valueCount; + estElementCount = 1; + } - density = roundUp(dataSize * 100, totalSize); - estSize = roundUp(dataSize, rowCount); - variableWidth = vv instanceof VariableWidthVector ; + if (isMap) { + dataSize = mapSize; + stdSize = 0; + } else { + dataSize = v.getPayloadByteCount(valueCount); + stdSize = TypeHelper.getSize(metadata.getType()); + } + estSize = roundUp(dataSize, valueCount); + isVariableWidth = v instanceof VariableWidthVector ; } @Override public String toString() { StringBuilder buf = new StringBuilder() + .append(prefix) .append(metadata.getName()) .append("(type: ") + .append(metadata.getType().getMode().name()) + .append(" ") .append(metadata.getType().getMinorType().name()) - .append(", std col. size: ") + .append(", count: ") + .append(valueCount); + if (metadata.getDataMode() == DataMode.REPEATED) { + buf.append(", total entries: ") + .append(entryCount) + .append(", per-array: ") + .append(estElementCount); + } + buf .append(", std size: ") .append(stdSize) - .append(", actual col. size: ") + .append(", actual size: ") .append(estSize) - .append(", total size: ") - .append(totalSize) .append(", data size: ") .append(dataSize) - .append(", row capacity: ") - .append(capacity) - .append(", density: ") - .append(density) .append(")"); return buf.toString(); } + + public void buildAllocHelper(SmartAllocationHelper allocHelper) { --- End diff -- "Smart" and "Helper" are vague terms useful for marketing; we better use a more descriptive terminology to explain what exactly it does. E.g., "batch-allocator" instead of "alloc-helper"; for "smart" -- maybe "deep-check" , or "hint-based" ?? .... Also add a (javadoc) comment here. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastructure@apache.org or file a JIRA ticket with INFRA. ---