spark-reviews mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From nongli <...@git.apache.org>
Subject [GitHub] spark pull request: [SPARK-12635][SQL] Add ColumnarBatch, an in me...
Date Mon, 11 Jan 2016 20:44:32 GMT
Github user nongli commented on a diff in the pull request:

    https://github.com/apache/spark/pull/10628#discussion_r49375889
  
    --- Diff: sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ColumnVector.java ---
    @@ -0,0 +1,181 @@
    +/*
    + * Licensed to the Apache Software Foundation (ASF) under one or more
    + * contributor license agreements.  See the NOTICE file distributed with
    + * this work for additional information regarding copyright ownership.
    + * The ASF licenses this file to You under the Apache License, Version 2.0
    + * (the "License"); you may not use this file except in compliance with
    + * the License.  You may obtain a copy of the License at
    + *
    + *    http://www.apache.org/licenses/LICENSE-2.0
    + *
    + * Unless required by applicable law or agreed to in writing, software
    + * distributed under the License is distributed on an "AS IS" BASIS,
    + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    + * See the License for the specific language governing permissions and
    + * limitations under the License.
    + */
    +package org.apache.spark.sql.execution.vectorized;
    +
    +import org.apache.spark.sql.types.DataType;
    +
    +/**
    + * This class represents a column of values and provides the main APIs to access the data
    + * values. It supports all the types and contains get/put APIs as well as their batched versions.
    + * The batched versions are preferable whenever possible.
    + *
    + * Most of the APIs take the rowId as a parameter. This is the local 0-based row id for values
    + * in the current RowBatch.
    + *
    + * A ColumnVector should be considered immutable once originally created. In other words, it is not
    + * valid to call put APIs after reads until reset() is called.
    + */
    +public abstract class ColumnVector {
    +  /**
    +   * Allocates a column with each element of size `width` either on or off heap.
    +   */
    +  public static ColumnVector allocate(int capacity, DataType type, boolean offHeap) {
    +    if (offHeap) {
    +      return new OffHeapColumnVector(capacity, type);
    +    } else {
    +      return new OnHeapColumnVector(capacity, type);
    +    }
    +  }
    +
    +  public final DataType dataType() { return type; }
    +
    +  /**
    +   * Resets this column for writing. The currently stored values are no longer accessible.
    +   */
    +  public void reset() {
    +    numNulls = 0;
    +    if (anyNullsSet) {
    +      putNotNulls(0, capacity);
    +      anyNullsSet = false;
    +    }
    +  }
    +
    +  /**
    +   * Cleans up memory for this column. The column is not usable after this.
    +   * TODO: this should probably have ref-counted semantics.
    +   */
    +  public abstract void close();
    +
    +  /**
    +   * Returns the number of nulls in this column.
    +   */
    +  public final int numNulls() { return numNulls; }
    +
    +  /**
    +   * Returns true if any of the nulls indicator are set for this column. This can be used
    +   * as an optimization to prevent setting nulls.
    +   */
    +  public final boolean anyNullsSet() { return anyNullsSet; }
    +
    +  /**
    +   * Returns the off heap ptr for the arrays backing the NULLs and values buffer. Only valid
    +   * to call for off heap columns.
    +   */
    +  public abstract long nullsNativeAddress();
    +  public abstract long valuesNativeAddress();
    +
    +  /**
    +   * Sets the value at rowId to null/not null.
    +   */
    +  public abstract void putNotNull(int rowId);
    +  public abstract void putNull(int rowId);
    +
    +  /**
    +   * Sets the values from [rowId, rowId + count) to null/not null.
    +   */
    +  public abstract void putNulls(int rowId, int count);
    +  public abstract void putNotNulls(int rowId, int count);
    +
    +  /**
    +   * Returns whether the value at rowId is NULL.
    +   */
    +  public abstract boolean getIsNull(int rowId);
    +
    +  /**
    +   * Sets the value at rowId to `value`.
    +   */
    +  public abstract void putInt(int rowId, int value);
    +
    +  /**
    +   * Sets values from [rowId, rowId + count) to value.
    +   */
    +  public abstract void putInts(int rowId, int count, int value);
    +
    +  /**
    +   * Sets values from [rowId, rowId + count) to [src + srcIndex, src + srcIndex + count)
    +   */
    +  public abstract void putInts(int rowId, int count, int[] src, int srcIndex);
    +
    +  /**
    +   * Sets values from [rowId, rowId + count) to [src + srcIndex, src + srcIndex + count)
    +   * The data in src must be 4-byte little endian ints.
    +   */
    +  public abstract void putIntsLittleEndian(int rowId, int count, byte[] src, int srcIndex);
    +
    +  /**
    +   * Returns the integer for rowId.
    +   */
    +  public abstract int getInt(int rowId);
    +
    +  /**
    +   * Sets the value at rowId to `value`.
    +   */
    +  public abstract void putDouble(int rowId, double value);
    +
    +  /**
    +   * Sets values from [rowId, rowId + count) to value.
    +   */
    +  public abstract void putDoubles(int rowId, int count, double value);
    +
    +  /**
    +   * Sets values from [rowId, rowId + count) to [src + srcIndex, src + srcIndex + count)
    +   * src should contain `count` doubles written as ieee format.
    +   */
    +  public abstract void putDoubles(int rowId, int count, double[] src, int srcIndex);
    +
    +  /**
    +   * Sets values from [rowId, rowId + count) to [src + srcIndex, src + srcIndex + count)
    +   * The data in src must be ieee formatted doubles.
    +   */
    +  public abstract void putDoubles(int rowId, int count, byte[] src, int srcIndex);
    +
    +  /**
    +   * Returns the double for rowId.
    +   */
    +  public abstract double getDouble(int rowId);
    +
    +  /**
    +   * Maximum number of rows that can be stored in this column.
    +   */
    +  protected final int capacity;
    +
    +  /**
    +   * Byte width of this column.
    +   */
    +  protected final int width;
    +
    +  /**
    +   * Number of nulls in this column.
    +   */
    +  protected int numNulls;
    +
    +  /**
    +   * True if there is at least one NULL byte set.
    +   */
    +  protected boolean anyNullsSet;
    +
    +  /**
    +   * Data type for this column.
    +   */
    +  protected final DataType type;
    +
    +  protected ColumnVector(int capacity, DataType type) {
    +    this.capacity = capacity;
    +    this.type = type;
    +    this.width = type.defaultSize();
    --- End diff --
    
    I don't use this right now. Removed.


---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at infrastructure@apache.org or file a JIRA ticket
with INFRA.
---

---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org


Mime
View raw message