spark-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From wenc...@apache.org
Subject spark git commit: [SPARK-22775][SQL] move dictionary related APIs from ColumnVector to WritableColumnVector
Date Thu, 14 Dec 2017 11:34:32 GMT
Repository: spark
Updated Branches:
  refs/heads/master c3dd2a26d -> 7d8e2ca7f


[SPARK-22775][SQL] move dictionary related APIs from ColumnVector to WritableColumnVector

## What changes were proposed in this pull request?

These dictionary related APIs are special to `WritableColumnVector` and should not be in `ColumnVector`,
which will be public soon.

## How was this patch tested?

existing tests

Author: Wenchen Fan <wenchen@databricks.com>

Closes #19970 from cloud-fan/final.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/7d8e2ca7
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/7d8e2ca7
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/7d8e2ca7

Branch: refs/heads/master
Commit: 7d8e2ca7f8667d809034073aad2ea67b3b082fc2
Parents: c3dd2a2
Author: Wenchen Fan <wenchen@databricks.com>
Authored: Thu Dec 14 19:33:54 2017 +0800
Committer: Wenchen Fan <wenchen@databricks.com>
Committed: Thu Dec 14 19:33:54 2017 +0800

----------------------------------------------------------------------
 .../parquet/VectorizedColumnReader.java         |  2 +-
 .../execution/vectorized/ArrowColumnVector.java |  5 --
 .../sql/execution/vectorized/ColumnVector.java  | 66 ++++-----------
 .../vectorized/WritableColumnVector.java        | 88 +++++++++++++-------
 4 files changed, 73 insertions(+), 88 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/7d8e2ca7/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedColumnReader.java
----------------------------------------------------------------------
diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedColumnReader.java b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedColumnReader.java
index b764696..3ba1808 100644
--- a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedColumnReader.java
+++ b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedColumnReader.java
@@ -239,7 +239,7 @@ public class VectorizedColumnReader {
       int rowId,
       int num,
       WritableColumnVector column,
-      ColumnVector dictionaryIds) {
+      WritableColumnVector dictionaryIds) {
     switch (descriptor.getType()) {
       case INT32:
         if (column.dataType() == DataTypes.IntegerType ||

http://git-wip-us.apache.org/repos/asf/spark/blob/7d8e2ca7/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ArrowColumnVector.java
----------------------------------------------------------------------
diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ArrowColumnVector.java b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ArrowColumnVector.java
index 1f1347c..e99201f 100644
--- a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ArrowColumnVector.java
+++ b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ArrowColumnVector.java
@@ -159,11 +159,6 @@ public final class ArrowColumnVector extends ColumnVector {
     return array;
   }
 
-  @Override
-  public int getDictId(int rowId) {
-    throw new UnsupportedOperationException();
-  }
-
   //
   // APIs dealing with Longs
   //

http://git-wip-us.apache.org/repos/asf/spark/blob/7d8e2ca7/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ColumnVector.java
----------------------------------------------------------------------
diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ColumnVector.java b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ColumnVector.java
index e6b8751..fd5caf3 100644
--- a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ColumnVector.java
+++ b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ColumnVector.java
@@ -22,24 +22,22 @@ import org.apache.spark.sql.types.Decimal;
 import org.apache.spark.unsafe.types.UTF8String;
 
 /**
- * This class represents a column of values and provides the main APIs to access the data
- * values. It supports all the types and contains get APIs as well as their batched versions.
- * The batched versions are preferable whenever possible.
+ * This class represents in-memory values of a column and provides the main APIs to access the data.
+ * It supports all the types and contains get APIs as well as their batched versions. The batched
+ * versions are considered to be faster and preferable whenever possible.
  *
 * To handle nested schemas, ColumnVector has two types: Arrays and Structs. In both cases these
- * columns have child columns. All of the data is stored in the child columns and the parent column
- * contains nullability, and in the case of Arrays, the lengths and offsets into the child column.
- * Lengths and offsets are encoded identically to INTs.
+ * columns have child columns. All of the data are stored in the child columns and the parent column
+ * only contains nullability. In the case of Arrays, the lengths and offsets are saved in the child
+ * column and are encoded identically to INTs.
+ *
  * Maps are just a special case of a two field struct.
  *
 * Most of the APIs take the rowId as a parameter. This is the batch local 0-based row id for values
- * in the current RowBatch.
- *
- * A ColumnVector should be considered immutable once originally created.
- *
- * ColumnVectors are intended to be reused.
+ * in the current batch.
  */
 public abstract class ColumnVector implements AutoCloseable {
+
   /**
    * Returns the data type of this column.
    */
@@ -47,7 +45,6 @@ public abstract class ColumnVector implements AutoCloseable {
 
   /**
    * Cleans up memory for this column. The column is not usable after this.
-   * TODO: this should probably have ref-counted semantics.
    */
   public abstract void close();
 
@@ -108,13 +105,6 @@ public abstract class ColumnVector implements AutoCloseable {
   public abstract int[] getInts(int rowId, int count);
 
   /**
-   * Returns the dictionary Id for rowId.
-   * This should only be called when the ColumnVector is dictionaryIds.
-   * We have this separate method for dictionaryIds as per SPARK-16928.
-   */
-  public abstract int getDictId(int rowId);
-
-  /**
    * Returns the value for rowId.
    */
   public abstract long getLong(int rowId);
@@ -145,39 +135,39 @@ public abstract class ColumnVector implements AutoCloseable {
   public abstract double[] getDoubles(int rowId, int count);
 
   /**
-   * Returns the length of the array at rowid.
+   * Returns the length of the array for rowId.
    */
   public abstract int getArrayLength(int rowId);
 
   /**
-   * Returns the offset of the array at rowid.
+   * Returns the offset of the array for rowId.
    */
   public abstract int getArrayOffset(int rowId);
 
   /**
-   * Returns a utility object to get structs.
+   * Returns the struct for rowId.
    */
   public final ColumnarRow getStruct(int rowId) {
     return new ColumnarRow(this, rowId);
   }
 
   /**
-   * Returns a utility object to get structs.
-   * provided to keep API compatibility with InternalRow for code generation
+   * A special version of {@link #getStruct(int)}, which is only used as an adapter for Spark
+   * codegen framework, the second parameter is totally ignored.
    */
   public final ColumnarRow getStruct(int rowId, int size) {
     return getStruct(rowId);
   }
 
   /**
-   * Returns the array at rowid.
+   * Returns the array for rowId.
    */
   public final ColumnarArray getArray(int rowId) {
     return new ColumnarArray(arrayData(), getArrayOffset(rowId), getArrayLength(rowId));
   }
 
   /**
-   * Returns the value for rowId.
+   * Returns the map for rowId.
    */
   public MapData getMap(int ordinal) {
     throw new UnsupportedOperationException();
@@ -215,30 +205,6 @@ public abstract class ColumnVector implements AutoCloseable {
   protected DataType type;
 
   /**
-   * The Dictionary for this column.
-   *
-   * If it's not null, will be used to decode the value in getXXX().
-   */
-  protected Dictionary dictionary;
-
-  /**
-   * Reusable column for ids of dictionary.
-   */
-  protected ColumnVector dictionaryIds;
-
-  /**
-   * Returns true if this column has a dictionary.
-   */
-  public boolean hasDictionary() { return this.dictionary != null; }
-
-  /**
-   * Returns the underlying integer column for ids of dictionary.
-   */
-  public ColumnVector getDictionaryIds() {
-    return dictionaryIds;
-  }
-
-  /**
    * Sets up the common state and also handles creating the child columns if this is a nested
    * type.
    */

http://git-wip-us.apache.org/repos/asf/spark/blob/7d8e2ca7/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/WritableColumnVector.java
----------------------------------------------------------------------
diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/WritableColumnVector.java b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/WritableColumnVector.java
index 7c053b5..63cf608 100644
--- a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/WritableColumnVector.java
+++ b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/WritableColumnVector.java
@@ -36,8 +36,10 @@ import org.apache.spark.unsafe.types.UTF8String;
  * elements. This means that the put() APIs do not check as in common cases (i.e. flat schemas),
  * the lengths are known up front.
  *
- * A ColumnVector should be considered immutable once originally created. In other words, it is not
- * valid to call put APIs after reads until reset() is called.
+ * A WritableColumnVector should be considered immutable once originally created. In other words,
+ * it is not valid to call put APIs after reads until reset() is called.
+ *
+ * WritableColumnVector are intended to be reused.
  */
 public abstract class WritableColumnVector extends ColumnVector {
 
@@ -106,6 +108,58 @@ public abstract class WritableColumnVector extends ColumnVector {
   public boolean anyNullsSet() { return anyNullsSet; }
 
   /**
+   * Returns the dictionary Id for rowId.
+   *
+   * This should only be called when this `WritableColumnVector` represents dictionaryIds.
+   * We have this separate method for dictionaryIds as per SPARK-16928.
+   */
+  public abstract int getDictId(int rowId);
+
+  /**
+   * The Dictionary for this column.
+   *
+   * If it's not null, will be used to decode the value in getXXX().
+   */
+  protected Dictionary dictionary;
+
+  /**
+   * Reusable column for ids of dictionary.
+   */
+  protected WritableColumnVector dictionaryIds;
+
+  /**
+   * Returns true if this column has a dictionary.
+   */
+  public boolean hasDictionary() { return this.dictionary != null; }
+
+  /**
+   * Returns the underlying integer column for ids of dictionary.
+   */
+  public WritableColumnVector getDictionaryIds() {
+    return dictionaryIds;
+  }
+
+  /**
+   * Update the dictionary.
+   */
+  public void setDictionary(Dictionary dictionary) {
+    this.dictionary = dictionary;
+  }
+
+  /**
+   * Reserve a integer column for ids of dictionary.
+   */
+  public WritableColumnVector reserveDictionaryIds(int capacity) {
+    if (dictionaryIds == null) {
+      dictionaryIds = reserveNewColumn(capacity, DataTypes.IntegerType);
+    } else {
+      dictionaryIds.reset();
+      dictionaryIds.reserve(capacity);
+    }
+    return dictionaryIds;
+  }
+
+  /**
   * Ensures that there is enough storage to store capacity elements. That is, the put() APIs
    * must work for all rowIds < capacity.
    */
@@ -614,36 +668,6 @@ public abstract class WritableColumnVector extends ColumnVector {
   protected WritableColumnVector[] childColumns;
 
   /**
-   * Update the dictionary.
-   */
-  public void setDictionary(Dictionary dictionary) {
-    this.dictionary = dictionary;
-  }
-
-  /**
-   * Reserve a integer column for ids of dictionary.
-   */
-  public WritableColumnVector reserveDictionaryIds(int capacity) {
-    WritableColumnVector dictionaryIds = (WritableColumnVector) this.dictionaryIds;
-    if (dictionaryIds == null) {
-      dictionaryIds = reserveNewColumn(capacity, DataTypes.IntegerType);
-      this.dictionaryIds = dictionaryIds;
-    } else {
-      dictionaryIds.reset();
-      dictionaryIds.reserve(capacity);
-    }
-    return dictionaryIds;
-  }
-
-  /**
-   * Returns the underlying integer column for ids of dictionary.
-   */
-  @Override
-  public WritableColumnVector getDictionaryIds() {
-    return (WritableColumnVector) dictionaryIds;
-  }
-
-  /**
    * Reserve a new column.
    */
   protected abstract WritableColumnVector reserveNewColumn(int capacity, DataType type);


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org


Mime
View raw message