drill-dev mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From paul-rogers <...@git.apache.org>
Subject [GitHub] drill pull request #860: DRILL-5601: Rollup of external sort fixes and impro...
Date Wed, 19 Jul 2017 17:59:47 GMT
Github user paul-rogers commented on a diff in the pull request:

    https://github.com/apache/drill/pull/860#discussion_r128126935
  
    --- Diff: exec/java-exec/src/main/java/org/apache/drill/exec/physical/impl/spill/RecordBatchSizer.java
---
    @@ -19,117 +19,162 @@
     
     import java.util.ArrayList;
     import java.util.List;
    +import java.util.Set;
     
    +import org.apache.drill.common.types.TypeProtos.DataMode;
     import org.apache.drill.common.types.TypeProtos.MinorType;
     import org.apache.drill.exec.expr.TypeHelper;
    +import org.apache.drill.exec.memory.AllocationManager.BufferLedger;
     import org.apache.drill.exec.memory.BaseAllocator;
     import org.apache.drill.exec.record.BatchSchema;
     import org.apache.drill.exec.record.MaterializedField;
     import org.apache.drill.exec.record.RecordBatch;
    +import org.apache.drill.exec.record.SmartAllocationHelper;
     import org.apache.drill.exec.record.VectorAccessible;
     import org.apache.drill.exec.record.VectorWrapper;
     import org.apache.drill.exec.record.selection.SelectionVector2;
    +import org.apache.drill.exec.vector.UInt4Vector;
     import org.apache.drill.exec.vector.ValueVector;
     import org.apache.drill.exec.vector.complex.AbstractMapVector;
    +import org.apache.drill.exec.vector.complex.RepeatedValueVector;
     import org.apache.drill.exec.vector.VariableWidthVector;
     
    +import com.google.common.collect.Sets;
    +
     /**
      * Given a record batch or vector container, determines the actual memory
      * consumed by each column, the average row, and the entire record batch.
      */
     
     public class RecordBatchSizer {
    -//  private static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(RecordBatchSizer.class);
     
       /**
        * Column size information.
        */
       public static class ColumnSize {
    +    public final String prefix;
         public final MaterializedField metadata;
     
         /**
    -     * Assumed size from Drill metadata.
    +     * Assumed size from Drill metadata. Note that this information is
    +     * 100% bogus. Do not use it.
          */
     
    +    @Deprecated
         public int stdSize;
     
         /**
    -     * Actual memory consumed by all the vectors associated with this column.
    -     */
    -
    -    public int totalSize;
    -
    -    /**
          * Actual average column width as determined from actual memory use. This
          * size is larger than the actual data size since this size includes per-
          * column overhead such as any unused vector space, etc.
          */
     
    -    public int estSize;
    -    public int capacity;
    -    public int density;
    -    public int dataSize;
    -    public boolean variableWidth;
    -
    -    public ColumnSize(ValueVector vv) {
    -      metadata = vv.getField();
    -      stdSize = TypeHelper.getSize(metadata.getType());
    -
    -      // Can't get size estimates if this is an empty batch.
    -      int rowCount = vv.getAccessor().getValueCount();
    -      if (rowCount == 0) {
    -        estSize = stdSize;
    -        return;
    -      }
    +    public final int estSize;
    +    public final int valueCount;
    +    public final int entryCount;
    +    public final int dataSize;
    +    public final int estElementCount;
    +    public final boolean isVariableWidth;
     
    -      // Total size taken by all vectors (and underlying buffers)
    -      // associated with this vector.
    +    public ColumnSize(ValueVector v, String prefix, int valueCount) {
    +      this.prefix = prefix;
    +      this.valueCount = valueCount;
    +      metadata = v.getField();
    +      boolean isMap = v.getField().getType().getMinorType() == MinorType.MAP;
     
    -      totalSize = vv.getAllocatedByteCount();
    +      // The amount of memory consumed by the payload: the actual
    +      // data stored in the vectors.
     
    -      // Capacity is the number of values that the vector could
    -      // contain. This is useful only for fixed-length vectors.
    +      int mapSize = 0;
    +      if (v.getField().getDataMode() == DataMode.REPEATED) {
     
    -      capacity = vv.getValueCapacity();
    +        // Repeated vectors are special: they have an associated offset vector
    +        // that changes the value count of the contained vectors.
     
    -      // The amount of memory consumed by the payload: the actual
    -      // data stored in the vectors.
    +        @SuppressWarnings("resource")
    +        UInt4Vector offsetVector = ((RepeatedValueVector) v).getOffsetVector();
    +        entryCount = offsetVector.getAccessor().get(valueCount);
    +        estElementCount = roundUp(entryCount, valueCount);
    +        if (isMap) {
     
    -      dataSize = vv.getPayloadByteCount();
    +          // For map, the only data associated with the map vector
    +          // itself is the offset vector, if any.
     
    -      // Determine "density" the number of rows compared to potential
    -      // capacity. Low-density batches occur at block boundaries, ends
    -      // of files and so on. Low-density batches throw off our estimates
    -      // for Varchar columns because we don't know the actual number of
    -      // bytes consumed (that information is hidden behind the Varchar
    -      // implementation where we can't get at it.)
    +          mapSize = offsetVector.getPayloadByteCount(valueCount);
    +        }
    +      } else {
    +        entryCount = valueCount;
    +        estElementCount = 1;
    +      }
     
    -      density = roundUp(dataSize * 100, totalSize);
    -      estSize = roundUp(dataSize, rowCount);
    -      variableWidth = vv instanceof VariableWidthVector ;
    +      if (isMap) {
    +        dataSize = mapSize;
    +        stdSize = 0;
    +     } else {
    +        dataSize = v.getPayloadByteCount(valueCount);
    +        stdSize = TypeHelper.getSize(metadata.getType());
    +     }
    +      estSize = roundUp(dataSize, valueCount);
    +      isVariableWidth = v instanceof VariableWidthVector ;
         }
     
         @Override
         public String toString() {
           StringBuilder buf = new StringBuilder()
    +          .append(prefix)
               .append(metadata.getName())
               .append("(type: ")
    +          .append(metadata.getType().getMode().name())
    +          .append(" ")
               .append(metadata.getType().getMinorType().name())
    -          .append(", std col. size: ")
    +          .append(", count: ")
    +          .append(valueCount);
    +      if (metadata.getDataMode() == DataMode.REPEATED) {
    +        buf.append(", total entries: ")
    +           .append(entryCount)
    +           .append(", per-array: ")
    +           .append(estElementCount);
    +      }
    +      buf .append(", std size: ")
               .append(stdSize)
    -          .append(", actual col. size: ")
    +          .append(", actual size: ")
               .append(estSize)
    -          .append(", total size: ")
    -          .append(totalSize)
               .append(", data size: ")
               .append(dataSize)
    -          .append(", row capacity: ")
    -          .append(capacity)
    -          .append(", density: ")
    -          .append(density)
               .append(")");
           return buf.toString();
         }
    +
    +    public void buildAllocHelper(SmartAllocationHelper allocHelper) {
    +      int width = 0;
    +      switch(metadata.getType().getMinorType()) {
    --- End diff --
    
    Would definitely be nicer. Unfortunately, MajorType and MinorType are protobuf-generated
classes, so we can't muck about with them -- alas.
    
    The isVariableWidth (which, I believe, you added) handles a subset of the cases. It handles
required and nullable modes, but does not consider repeated vectors to be variable width.
(That is REQUIRED or OPTIONAL VARCHAR is variable width; REPEATED VARCHAR is not. Is this
a bug?)
    
    What we need is a better metadata system that abstracts out all of this stuff.


---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at infrastructure@apache.org or file a JIRA ticket
with INFRA.
---

Mime
View raw message