drill-issues mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From "ASF GitHub Bot (JIRA)" <j...@apache.org>
Subject [jira] [Commented] (DRILL-5601) Rollup of External Sort memory management fixes
Date Thu, 13 Jul 2017 22:21:00 GMT

    [ https://issues.apache.org/jira/browse/DRILL-5601?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16086491#comment-16086491
] 

ASF GitHub Bot commented on DRILL-5601:
---------------------------------------

Github user Ben-Zvi commented on a diff in the pull request:

    https://github.com/apache/drill/pull/860#discussion_r123834281
  
    --- Diff: exec/java-exec/src/main/java/org/apache/drill/exec/physical/impl/spill/RecordBatchSizer.java
---
    @@ -19,117 +19,162 @@
     
     import java.util.ArrayList;
     import java.util.List;
    +import java.util.Set;
     
    +import org.apache.drill.common.types.TypeProtos.DataMode;
     import org.apache.drill.common.types.TypeProtos.MinorType;
     import org.apache.drill.exec.expr.TypeHelper;
    +import org.apache.drill.exec.memory.AllocationManager.BufferLedger;
     import org.apache.drill.exec.memory.BaseAllocator;
     import org.apache.drill.exec.record.BatchSchema;
     import org.apache.drill.exec.record.MaterializedField;
     import org.apache.drill.exec.record.RecordBatch;
    +import org.apache.drill.exec.record.SmartAllocationHelper;
     import org.apache.drill.exec.record.VectorAccessible;
     import org.apache.drill.exec.record.VectorWrapper;
     import org.apache.drill.exec.record.selection.SelectionVector2;
    +import org.apache.drill.exec.vector.UInt4Vector;
     import org.apache.drill.exec.vector.ValueVector;
     import org.apache.drill.exec.vector.complex.AbstractMapVector;
    +import org.apache.drill.exec.vector.complex.RepeatedValueVector;
     import org.apache.drill.exec.vector.VariableWidthVector;
     
    +import com.google.common.collect.Sets;
    +
     /**
      * Given a record batch or vector container, determines the actual memory
      * consumed by each column, the average row, and the entire record batch.
      */
     
     public class RecordBatchSizer {
    -//  private static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(RecordBatchSizer.class);
     
       /**
        * Column size information.
        */
       public static class ColumnSize {
    +    public final String prefix;
         public final MaterializedField metadata;
     
         /**
    -     * Assumed size from Drill metadata.
    +     * Assumed size from Drill metadata. Note that this information is
    +     * 100% bogus. Do not use it.
          */
     
    +    @Deprecated
         public int stdSize;
     
         /**
    -     * Actual memory consumed by all the vectors associated with this column.
    -     */
    -
    -    public int totalSize;
    -
    -    /**
          * Actual average column width as determined from actual memory use. This
          * size is larger than the actual data size since this size includes per-
          * column overhead such as any unused vector space, etc.
          */
     
    -    public int estSize;
    -    public int capacity;
    -    public int density;
    -    public int dataSize;
    -    public boolean variableWidth;
    -
    -    public ColumnSize(ValueVector vv) {
    -      metadata = vv.getField();
    -      stdSize = TypeHelper.getSize(metadata.getType());
    -
    -      // Can't get size estimates if this is an empty batch.
    -      int rowCount = vv.getAccessor().getValueCount();
    -      if (rowCount == 0) {
    -        estSize = stdSize;
    -        return;
    -      }
    +    public final int estSize;
    +    public final int valueCount;
    +    public final int entryCount;
    +    public final int dataSize;
    +    public final int estElementCount;
    +    public final boolean isVariableWidth;
     
    -      // Total size taken by all vectors (and underlying buffers)
    -      // associated with this vector.
    +    public ColumnSize(ValueVector v, String prefix, int valueCount) {
    +      this.prefix = prefix;
    +      this.valueCount = valueCount;
    +      metadata = v.getField();
    +      boolean isMap = v.getField().getType().getMinorType() == MinorType.MAP;
     
    -      totalSize = vv.getAllocatedByteCount();
    +      // The amount of memory consumed by the payload: the actual
    +      // data stored in the vectors.
     
    -      // Capacity is the number of values that the vector could
    -      // contain. This is useful only for fixed-length vectors.
    +      int mapSize = 0;
    +      if (v.getField().getDataMode() == DataMode.REPEATED) {
     
    -      capacity = vv.getValueCapacity();
    +        // Repeated vectors are special: they have an associated offset vector
    +        // that changes the value count of the contained vectors.
     
    -      // The amount of memory consumed by the payload: the actual
    -      // data stored in the vectors.
    +        @SuppressWarnings("resource")
    +        UInt4Vector offsetVector = ((RepeatedValueVector) v).getOffsetVector();
    +        entryCount = offsetVector.getAccessor().get(valueCount);
    +        estElementCount = roundUp(entryCount, valueCount);
    +        if (isMap) {
     
    -      dataSize = vv.getPayloadByteCount();
    +          // For map, the only data associated with the map vector
    +          // itself is the offset vector, if any.
     
    -      // Determine "density" the number of rows compared to potential
    -      // capacity. Low-density batches occur at block boundaries, ends
    -      // of files and so on. Low-density batches throw off our estimates
    -      // for Varchar columns because we don't know the actual number of
    -      // bytes consumed (that information is hidden behind the Varchar
    -      // implementation where we can't get at it.)
    +          mapSize = offsetVector.getPayloadByteCount(valueCount);
    +        }
    +      } else {
    +        entryCount = valueCount;
    +        estElementCount = 1;
    +      }
     
    -      density = roundUp(dataSize * 100, totalSize);
    -      estSize = roundUp(dataSize, rowCount);
    -      variableWidth = vv instanceof VariableWidthVector ;
    +      if (isMap) {
    +        dataSize = mapSize;
    +        stdSize = 0;
    +     } else {
    +        dataSize = v.getPayloadByteCount(valueCount);
    +        stdSize = TypeHelper.getSize(metadata.getType());
    +     }
    +      estSize = roundUp(dataSize, valueCount);
    +      isVariableWidth = v instanceof VariableWidthVector ;
         }
     
         @Override
         public String toString() {
           StringBuilder buf = new StringBuilder()
    +          .append(prefix)
               .append(metadata.getName())
               .append("(type: ")
    +          .append(metadata.getType().getMode().name())
    +          .append(" ")
               .append(metadata.getType().getMinorType().name())
    -          .append(", std col. size: ")
    +          .append(", count: ")
    +          .append(valueCount);
    +      if (metadata.getDataMode() == DataMode.REPEATED) {
    +        buf.append(", total entries: ")
    +           .append(entryCount)
    +           .append(", per-array: ")
    +           .append(estElementCount);
    +      }
    +      buf .append(", std size: ")
               .append(stdSize)
    -          .append(", actual col. size: ")
    +          .append(", actual size: ")
               .append(estSize)
    -          .append(", total size: ")
    -          .append(totalSize)
               .append(", data size: ")
               .append(dataSize)
    -          .append(", row capacity: ")
    -          .append(capacity)
    -          .append(", density: ")
    -          .append(density)
               .append(")");
           return buf.toString();
         }
    +
    +    public void buildAllocHelper(SmartAllocationHelper allocHelper) {
    +      int width = 0;
    +      switch(metadata.getType().getMinorType()) {
    --- End diff --
    
    Could be nicer by adding a method to MinorType called **isVariable()** (i.e. check if
any of those 3), then this switch statement can be replaced by a simple if statement.  (Maybe
that proposed method is the same as checking the local variable **isVariableWidth** ???)



> Rollup of External Sort memory management fixes
> -----------------------------------------------
>
>                 Key: DRILL-5601
>                 URL: https://issues.apache.org/jira/browse/DRILL-5601
>             Project: Apache Drill
>          Issue Type: Task
>    Affects Versions: 1.11.0
>            Reporter: Paul Rogers
>            Assignee: Paul Rogers
>             Fix For: 1.12.0
>
>
> Rollup of a set of specific JIRA entries that all relate to the very difficult problem
of managing memory within Drill in order for the external sort to stay within a memory budget.
In general, the fixes relate to better estimating memory used by the three ways that Drill
allocates vector memory (see DRILL-5522) and to predicting the size of vectors that the sort
will create, to avoid repeated realloc-copy cycles (see DRILL-5594).



--
This message was sent by Atlassian JIRA
(v6.4.14#64029)

Mime
View raw message