drill-dev mailing list archives

From sohami <...@git.apache.org>
Subject [GitHub] drill pull request #785: DRILL-5323: Test tools for row sets
Date Thu, 30 Mar 2017 01:32:25 GMT
Github user sohami commented on a diff in the pull request:

    https://github.com/apache/drill/pull/785#discussion_r108758552
  
    --- Diff: exec/java-exec/src/test/java/org/apache/drill/test/rowSet/HyperRowSetImpl.java ---
    @@ -0,0 +1,221 @@
    +/*
    + * Licensed to the Apache Software Foundation (ASF) under one
    + * or more contributor license agreements.  See the NOTICE file
    + * distributed with this work for additional information
    + * regarding copyright ownership.  The ASF licenses this file
    + * to you under the Apache License, Version 2.0 (the
    + * "License"); you may not use this file except in compliance
    + * with the License.  You may obtain a copy of the License at
    + *
    + * http://www.apache.org/licenses/LICENSE-2.0
    + *
    + * Unless required by applicable law or agreed to in writing, software
    + * distributed under the License is distributed on an "AS IS" BASIS,
    + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    + * See the License for the specific language governing permissions and
    + * limitations under the License.
    + */
    +package org.apache.drill.test.rowSet;
    +
    +import java.util.ArrayList;
    +import java.util.List;
    +
    +import org.apache.drill.common.types.TypeProtos.MinorType;
    +import org.apache.drill.exec.memory.BufferAllocator;
    +import org.apache.drill.exec.record.BatchSchema.SelectionVectorMode;
    +import org.apache.drill.exec.record.HyperVectorWrapper;
    +import org.apache.drill.exec.record.VectorContainer;
    +import org.apache.drill.exec.record.VectorWrapper;
    +import org.apache.drill.exec.record.selection.SelectionVector4;
    +import org.apache.drill.exec.vector.ValueVector;
    +import org.apache.drill.exec.vector.accessor.AccessorUtilities;
    +import org.apache.drill.exec.vector.complex.AbstractMapVector;
    +import org.apache.drill.test.rowSet.AbstractRowSetAccessor.BoundedRowIndex;
    +import org.apache.drill.test.rowSet.RowSet.HyperRowSet;
    +import org.apache.drill.test.rowSet.RowSetSchema.LogicalColumn;
    +import org.apache.drill.test.rowSet.RowSetSchema.PhysicalSchema;
    +
    +public class HyperRowSetImpl extends AbstractRowSet implements HyperRowSet {
    +
    +  public static class HyperRowIndex extends BoundedRowIndex {
    +
    +    private final SelectionVector4 sv4;
    +
    +    public HyperRowIndex(SelectionVector4 sv4) {
    +      super(sv4.getCount());
    +      this.sv4 = sv4;
    +    }
    +
    +    @Override
    +    public int index() {
    +      return AccessorUtilities.sv4Index(sv4.get(rowIndex));
    +    }
    +
    +    @Override
    +    public int batch( ) {
    +      return AccessorUtilities.sv4Batch(sv4.get(rowIndex));
    +    }
    +  }
    +
    +  /**
    +   * Build a hyper row set by restructuring a hyper vector bundle into a uniform
    +   * shape. Consider this schema: <pre><code>
    +   * { a: 10, b: { c: 20, d: { e: 30 } } }</code></pre>
    +   * <p>
    +   * The hyper container, with two batches, has this structure:
    +   * <table border="1">
    +   * <tr><th>Batch</th><th>a</th><th>b</th></tr>
    +   * <tr><td>0</td><td>Int vector</td><td>Map Vector(Int vector, Map Vector(Int vector))</td></tr>
    +   * <tr><td>1</td><td>Int vector</td><td>Map Vector(Int vector, Map Vector(Int vector))</td></tr>
    +   * </table>
    +   * <p>
    +   * The above table shows that top-level scalar vectors (such as the Int Vector for column
    +   * a) appear "end-to-end" as a hyper-vector. Maps also appear end-to-end. But the
    +   * contents of the map (column c) do not appear end-to-end. Instead, they appear as
    +   * contents in the map vector. To get to c, one indexes into the map vector, steps inside
    +   * the map to find c and indexes to the right row.
    +   * <p>
    +   * Similarly, the maps for d do not appear end-to-end; one must step to the right batch
    +   * in b, then step to d.
    +   * <p>
    +   * Finally, to get to e, one must step
    +   * into the hyper vector for b, then step to the proper batch, step to d, step to e,
    +   * and finally step to the row within e. This is a very complex, costly indexing scheme
    +   * that differs depending on map nesting depth.
    +   * <p>
    +   * To simplify access, this class restructures the maps to flatten the scalar vectors
    +   * into end-to-end hyper vectors. For example, for the above:
    +   * <p>
    +   * <table border="1">
    +   * <tr><th>Batch</th><th>a</th><th>c</th><th>d</th></tr>
    +   * <tr><td>0</td><td>Int vector</td><td>Int vector</td><td>Int vector</td></tr>
    +   * <tr><td>1</td><td>Int vector</td><td>Int vector</td><td>Int vector</td></tr>
    +   * </table>
    +   *
    +   * The maps are still available as hyper vectors, but separated into map fields.
    +   * (Scalar access no longer needs to access the maps.) The result is a uniform
    +   * addressing scheme for both top-level and nested vectors.
    +   */
    +
    +  public static class HyperVectorBuilder {
    +
    +    protected final HyperVectorWrapper<?> valueVectors[];
    +    protected final HyperVectorWrapper<AbstractMapVector> mapVectors[];
    +    private final List<ValueVector> nestedScalars[];
    +    private int vectorIndex;
    +    private int mapIndex;
    +    private final PhysicalSchema physicalSchema;
    +
    +    @SuppressWarnings("unchecked")
    +    public HyperVectorBuilder(RowSetSchema schema) {
    +      physicalSchema = schema.physical();
    +      valueVectors = new HyperVectorWrapper<?>[schema.access().count()];
    +      if (schema.access().mapCount() == 0) {
    +        mapVectors = null;
    +        nestedScalars = null;
    +      } else {
    +        mapVectors = (HyperVectorWrapper<AbstractMapVector>[])
    +            new HyperVectorWrapper<?>[schema.access().mapCount()];
    +        nestedScalars = new ArrayList[schema.access().count()];
    +      }
    +    }
    +
    +    @SuppressWarnings("unchecked")
    +    public HyperVectorWrapper<ValueVector>[] mapContainer(VectorContainer container) {
    +      int i = 0;
    +      for (VectorWrapper<?> w : container) {
    +        HyperVectorWrapper<?> hvw = (HyperVectorWrapper<?>) w;
    +        if (w.getField().getType().getMinorType() == MinorType.MAP) {
    +          HyperVectorWrapper<AbstractMapVector> mw = (HyperVectorWrapper<AbstractMapVector>) hvw;
    +          mapVectors[mapIndex++] = mw;
    +          buildHyperMap(physicalSchema.column(i).mapSchema(), mw);
    --- End diff --
    
    We are not checking whether physicalSchema has any columns before accessing it. When a
    RowSetSchema is constructed from a batchSchema that has no fields, physicalSchema still
    references a valid object, but its column count will be zero.

    Hence the access here, _physicalSchema.column(i)_, can throw an IndexOutOfBoundsException?
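
    A minimal sketch of the kind of guard being suggested, showing only the top of the loop in
    mapContainer(). It assumes PhysicalSchema exposes a count() accessor (the exact accessor name
    in this PR may differ) and fails fast instead of letting physicalSchema.column(i) throw:

    ```java
    int i = 0;
    for (VectorWrapper<?> w : container) {
      // Fail fast when the schema reports fewer columns than the container holds,
      // rather than letting physicalSchema.column(i) throw IndexOutOfBoundsException.
      if (i >= physicalSchema.count()) {   // count() is an assumed accessor
        throw new IllegalStateException(
            "Container holds more vectors than the schema has columns: " + i);
      }
      HyperVectorWrapper<?> hvw = (HyperVectorWrapper<?>) w;
      // ... remainder of the loop body unchanged from the diff above
    }
    ```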


