drill-issues mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From "ASF GitHub Bot (JIRA)" <j...@apache.org>
Subject [jira] [Commented] (DRILL-6126) Allocate memory for value vectors upfront in flatten operator
Date Mon, 05 Mar 2018 07:32:00 GMT

    [ https://issues.apache.org/jira/browse/DRILL-6126?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16385715#comment-16385715
] 

ASF GitHub Bot commented on DRILL-6126:
---------------------------------------

Github user paul-rogers commented on a diff in the pull request:

    https://github.com/apache/drill/pull/1125#discussion_r172104808
  
    --- Diff: exec/java-exec/src/main/java/org/apache/drill/exec/record/RecordBatchSizer.java
---
    @@ -76,110 +82,327 @@
          * greater than (but unlikely) same as the row count.
          */
     
    -    public final int valueCount;
    +    private final int valueCount;
     
         /**
    -     * Total number of elements for a repeated type, or 1 if this is
    -     * a non-repeated type. That is, a batch of 100 rows may have an
    -     * array with 10 elements per row. In this case, the element count
    -     * is 1000.
    +     * Total number of elements for a repeated type, or same as
    +     * valueCount if this is a non-repeated type. That is, a batch
    +     * of 100 rows may have an array with 10 elements per row.
    +     * In this case, the element count is 1000.
          */
     
    -    public final int elementCount;
    +    private int elementCount;
     
         /**
    -     * Size of the top level value vector. For map and repeated list,
    -     * this is just size of offset vector.
    +     * The estimated, average number of elements per parent value.
    +     * Always 1 for a non-repeated type. For a repeated type,
    +     * this is the average entries per array (per repeated element).
          */
    -    public int dataSize;
    +
    +    private float estElementCountPerArray;
     
         /**
    -     * Total size of the column includes the sum total of memory for all
    -     * value vectors representing the column.
    +     * Indicates if it is variable width column.
    +     * For map columns, this is true if any of the children is variable
    +     * width column.
          */
    -    public int netSize;
    +
    +    private boolean isVariableWidth;
     
         /**
    -     * The estimated, average number of elements per parent value.
    -     * Always 1 for a non-repeated type. For a repeated type,
    -     * this is the average entries per array (per repeated element).
    +     * Indicates if cardinality is repeated(top level only).
    +     */
    +
    +    private boolean isRepeated;
    +
    +    /**
    +     * Indicates if cardinality is optional i.e. nullable(top level only).
    +     */
    +    private boolean isOptional;
    +
    +    /**
    +     * Child columns if this is a map column.
    +     */
    +    private Map<String, ColumnSize> children = CaseInsensitiveMap.newHashMap();
    +
    +    /**
    +     * std pure data size per entry from Drill metadata, based on type.
    +     * Does not include metadata vector overhead we add for cardinality,
    +     * variable length etc.
    +     * For variable-width columns, we use 50 as std size for entry width.
    +     * For repeated column, we assume repetition of 10.
    +     */
    +    public int getStdDataSizePerEntry() {
    +      int stdDataSize;
    +
    +      try {
    +        stdDataSize = TypeHelper.getSize(metadata.getType());
    +
    +        // For variable width, typeHelper includes offset vector width. Adjust for that.
    +        if (isVariableWidth) {
    +          stdDataSize -= OFFSET_VECTOR_WIDTH;
    +        }
    +
    +        if (isRepeated) {
    +          stdDataSize = stdDataSize * STD_REPETITION_FACTOR;
    +        }
    +      } catch (Exception e) {
    +        // For unsupported types, just set stdSize to 0.
    +        // Map, Union, List etc.
    +        stdDataSize = 0;
    +      }
    +
    +      // Add sizes of children.
    +      for (ColumnSize columnSize : children.values()) {
    +        stdDataSize += columnSize.getStdDataSizePerEntry();
    +      }
    +
    +      if (isRepeatedList()) {
    +        stdDataSize = stdDataSize * STD_REPETITION_FACTOR;
    +      }
    +
    +      return stdDataSize;
    +    }
    +
    +    /**
    +     * std net size per entry taking into account additional metadata vectors
    +     * we add on top for variable length, cardinality etc.
    +     * For variable-width columns, we use 50 as std data size for entry width.
    +     * For repeated column, we assume repetition of 10.
    +     */
    +    public int getStdNetSizePerEntry() {
    +      int stdNetSize;
    +      try {
    +        stdNetSize = TypeHelper.getSize(metadata.getType());
    +      } catch (Exception e) {
    +        stdNetSize = 0;
    +      }
    +
    +      if (isOptional) {
    +        stdNetSize += BIT_VECTOR_WIDTH;
    +      }
    +
    +      if (isRepeated) {
    +        stdNetSize = (stdNetSize * STD_REPETITION_FACTOR) + OFFSET_VECTOR_WIDTH;
    +      }
    +
    +      for (ColumnSize columnSize : children.values()) {
    +        stdNetSize += columnSize.getStdNetSizePerEntry();
    +      }
    +
    +      if (isRepeatedList()) {
    +        stdNetSize = (stdNetSize * STD_REPETITION_FACTOR) + OFFSET_VECTOR_WIDTH;
    +      }
    +
    +      return stdNetSize;
    +    }
    +
    +    /**
    +     * This is the average actual per entry data size in bytes. Does not
    +     * include any overhead of metadata vectors.
    +     * For repeated columns, it is average for the repeated array, not
    +     * individual entry in the array.
    +     */
    +    public int getDataSizePerEntry() {
    +      return safeDivide(getTotalDataSize(), getValueCount());
    +    }
    +
    +    /**
    +     * This is the average per entry size of just pure data plus
    +     * overhead of additional vectors we add on top like bits vector,
    +     * offset vector etc. This
    +     * size is larger than the actual data size since this size includes per-
    +     * column overhead for additional vectors we add for
    +     * cardinality, variable length etc.
    +     */
    +    public int getNetSizePerEntry() {
    +      return safeDivide(getTotalNetSize(), getValueCount());
    +    }
    +
    +    /**
    +     * This is the total data size for the column, including children for map
    +     * columns. Does not include any overhead of metadata vectors.
    +     */
    +    public int getTotalDataSize() {
    +      int dataSize = this.totalDataSize;
    +      for (ColumnSize columnSize : children.values()) {
    +        dataSize += columnSize.getTotalDataSize();
    +      }
    +      return dataSize;
    +    }
    +
    +    /**
    +     * This is the total net size for the column, including children for map
    +     * columns. Includes overhead of metadata vectors.
          */
    +    public int getTotalNetSize() {
    +      return this.totalNetSize;
    +    }
    +
    +    public int getValueCount() {
    +      return valueCount;
    +    }
     
    -    public final float estElementCountPerArray;
    -    public final boolean isVariableWidth;
    +    public int getElementCount() {
    +      return elementCount;
    +    }
    +
    +    public float getEstElementCountPerArray() {
    +      return estElementCountPerArray;
    +    }
     
    -    public Map<String, ColumnSize> children = CaseInsensitiveMap.newHashMap();
    +    public boolean isVariableWidth() {
    +      return isVariableWidth;
    +    }
     
         public Map<String, ColumnSize> getChildren() {
           return children;
         }
     
    +    public boolean isComplex() {
    +      if (metadata.getType().getMinorType() == MinorType.MAP ||
    +        metadata.getType().getMinorType() == MinorType.UNION ||
    +        metadata.getType().getMinorType() == MinorType.LIST) {
    +        return true;
    +      }
    +      return false;
    --- End diff --
    
    Nit, but this can be collapsed into a single return statement:
    ```
    return metadata.getType().getMinorType() == MinorType.MAP ||
           metadata.getType().getMinorType() == MinorType.UNION ||
           metadata.getType().getMinorType() == MinorType.LIST;
    ```
    
    The same applies to the similar code below.


> Allocate memory for value vectors upfront in flatten operator
> -------------------------------------------------------------
>
>                 Key: DRILL-6126
>                 URL: https://issues.apache.org/jira/browse/DRILL-6126
>             Project: Apache Drill
>          Issue Type: Improvement
>            Reporter: Padma Penumarthy
>            Assignee: Padma Penumarthy
>            Priority: Critical
>             Fix For: 1.12.0
>
>
> With recent changes to control batch size for the flatten operator, we figure out the row count
in the output batch based on memory. Since we know how many rows we are going to include in
the batch, we can also allocate the memory needed upfront instead of starting with the initial
value (4096) and then doubling and copying every time we need more.



--
This message was sent by Atlassian JIRA
(v7.6.3#76005)

Mime
View raw message