flink-issues mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From twalthr <...@git.apache.org>
Subject [GitHub] flink pull request #5043: [FLINK-2170] [connectors] Add OrcRowInputFormat an...
Date Wed, 22 Nov 2017 11:12:56 GMT
Github user twalthr commented on a diff in the pull request:

    https://github.com/apache/flink/pull/5043#discussion_r152510004
  
    --- Diff: flink-connectors/flink-orc/src/main/java/org/apache/flink/orc/OrcUtils.java ---
    @@ -0,0 +1,1511 @@
    +/*
    + * Licensed to the Apache Software Foundation (ASF) under one
    + * or more contributor license agreements.  See the NOTICE file
    + * distributed with this work for additional information
    + * regarding copyright ownership.  The ASF licenses this file
    + * to you under the Apache License, Version 2.0 (the
    + * "License"); you may not use this file except in compliance
    + * with the License.  You may obtain a copy of the License at
    + *
    + *     http://www.apache.org/licenses/LICENSE-2.0
    + *
    + * Unless required by applicable law or agreed to in writing, software
    + * distributed under the License is distributed on an "AS IS" BASIS,
    + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    + * See the License for the specific language governing permissions and
    + * limitations under the License.
    + */
    +
    +package org.apache.flink.orc;
    +
    +import org.apache.flink.api.common.typeinfo.BasicTypeInfo;
    +import org.apache.flink.api.common.typeinfo.PrimitiveArrayTypeInfo;
    +import org.apache.flink.api.common.typeinfo.SqlTimeTypeInfo;
    +import org.apache.flink.api.common.typeinfo.TypeInformation;
    +import org.apache.flink.api.java.typeutils.MapTypeInfo;
    +import org.apache.flink.api.java.typeutils.ObjectArrayTypeInfo;
    +import org.apache.flink.api.java.typeutils.RowTypeInfo;
    +import org.apache.flink.types.Row;
    +
    +import org.apache.hadoop.hive.common.type.HiveDecimal;
    +import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
    +import org.apache.hadoop.hive.ql.exec.vector.ColumnVector;
    +import org.apache.hadoop.hive.ql.exec.vector.DecimalColumnVector;
    +import org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector;
    +import org.apache.hadoop.hive.ql.exec.vector.ListColumnVector;
    +import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
    +import org.apache.hadoop.hive.ql.exec.vector.MapColumnVector;
    +import org.apache.hadoop.hive.ql.exec.vector.StructColumnVector;
    +import org.apache.hadoop.hive.ql.exec.vector.TimestampColumnVector;
    +import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
    +import org.apache.hadoop.hive.serde2.io.HiveDecimalWritable;
    +
    +import org.apache.orc.TypeDescription;
    +
    +import java.lang.reflect.Array;
    +import java.math.BigDecimal;
    +import java.sql.Date;
    +import java.sql.Timestamp;
    +import java.util.Arrays;
    +import java.util.HashMap;
    +import java.util.List;
    +import java.util.TimeZone;
    +import java.util.function.DoubleFunction;
    +import java.util.function.IntFunction;
    +import java.util.function.LongFunction;
    +
    +/**
    + * A class that provides utility methods for orc file reading.
    + */
    +class OrcUtils {
    +
    +	private static final long MILLIS_PER_DAY = 86400000; // = 24 * 60 * 60 * 1000
    +	private static final TimeZone LOCAL_TZ = TimeZone.getDefault();
    +
    +	/**
    +	 * Converts an ORC schema to a Flink TypeInformation.
    +	 *
    +	 * <p>BOOLEAN, BYTE, SHORT, INT, LONG, FLOAT, DOUBLE and DECIMAL map to the corresponding
    +	 * BasicTypeInfo constant. STRING, CHAR and VARCHAR all map to BasicTypeInfo.STRING_TYPE_INFO.
    +	 * DATE and TIMESTAMP map to the SqlTimeTypeInfo constants and BINARY maps to a primitive
    +	 * byte array type. STRUCT, LIST and MAP are converted recursively: a STRUCT becomes a
    +	 * RowTypeInfo that keeps the ORC field names, a LIST becomes an object array of its element
    +	 * type, and a MAP becomes a MapTypeInfo of its key and value types.
    +	 *
    +	 * @param schema The ORC schema.
    +	 * @return The TypeInformation that corresponds to the ORC schema.
    +	 * @throws UnsupportedOperationException If the schema (or one of its nested children) is a UNION.
    +	 * @throws IllegalArgumentException If the schema category is none of the handled categories.
    +	 */
    +	static TypeInformation schemaToTypeInfo(TypeDescription schema) {
    +		switch (schema.getCategory()) {
    +			case BOOLEAN:
    +				return BasicTypeInfo.BOOLEAN_TYPE_INFO;
    +			case BYTE:
    +				return BasicTypeInfo.BYTE_TYPE_INFO;
    +			case SHORT:
    +				return BasicTypeInfo.SHORT_TYPE_INFO;
    +			case INT:
    +				return BasicTypeInfo.INT_TYPE_INFO;
    +			case LONG:
    +				return BasicTypeInfo.LONG_TYPE_INFO;
    +			case FLOAT:
    +				return BasicTypeInfo.FLOAT_TYPE_INFO;
    +			case DOUBLE:
    +				return BasicTypeInfo.DOUBLE_TYPE_INFO;
    +			case DECIMAL:
    +				return BasicTypeInfo.BIG_DEC_TYPE_INFO;
    +			case STRING:
    +			case CHAR:
    +			case VARCHAR:
    +				return BasicTypeInfo.STRING_TYPE_INFO;
    +			case DATE:
    +				return SqlTimeTypeInfo.DATE;
    +			case TIMESTAMP:
    +				return SqlTimeTypeInfo.TIMESTAMP;
    +			case BINARY:
    +				return PrimitiveArrayTypeInfo.BYTE_PRIMITIVE_ARRAY_TYPE_INFO;
    +			case STRUCT:
    +				List<TypeDescription> fieldSchemas = schema.getChildren();
    +				TypeInformation[] fieldTypes = new TypeInformation[fieldSchemas.size()];
    +				for (int i = 0; i < fieldSchemas.size(); i++) {
    +					fieldTypes[i] = schemaToTypeInfo(fieldSchemas.get(i)); // recurse into each struct field
    +				}
    +				String[] fieldNames = schema.getFieldNames().toArray(new String[]{});
    +				return new RowTypeInfo(fieldTypes, fieldNames);
    +			case LIST:
    +				TypeDescription elementSchema = schema.getChildren().get(0); // a LIST has its element type as first child
    +				TypeInformation<?> elementType = schemaToTypeInfo(elementSchema);
    +				// arrays of primitive types are handled as object arrays to support null values
    +				return ObjectArrayTypeInfo.getInfoFor(elementType);
    +			case MAP:
    +				TypeDescription keySchema = schema.getChildren().get(0); // child 0 is the key type
    +				TypeDescription valSchema = schema.getChildren().get(1); // child 1 is the value type
    +				TypeInformation<?> keyType = schemaToTypeInfo(keySchema);
    +				TypeInformation<?> valType = schemaToTypeInfo(valSchema);
    +				return new MapTypeInfo<>(keyType, valType);
    +			case UNION:
    +				throw new UnsupportedOperationException("UNION type is not supported yet.");
    +			default:
    +				throw new IllegalArgumentException("Unknown type " + schema);
    +		}
    +	}
    +
    +	/**
    +	 * Fills an ORC batch into an array of Row.
    +	 *
    +	 * <p>The number of rows read is the smaller of the batch's row count and the length of the
    +	 * rows array. The i-th entry of selectedFields names the ORC column that is written into
    +	 * field i of every Row.
    +	 *
    +	 * @param rows The array of rows to fill. NOTE(review): the entries are presumably
    +	 *             pre-allocated by the caller, since the readers set fields on existing rows.
    +	 * @param schema The schema of the ORC data.
    +	 * @param batch The ORC data.
    +	 * @param selectedFields The list of selected ORC fields.
    +	 * @return The number of rows that were filled.
    +	 */
    +	static int fillRows(Row[] rows, TypeDescription schema, VectorizedRowBatch batch, int[]
selectedFields) {
    +
    +		int rowsToRead = Math.min((int) batch.count(), rows.length); // never read more rows than the target array holds
    +
    +		List<TypeDescription> fieldTypes = schema.getChildren();
    +		// read each selected field
    +		for (int rowIdx = 0; rowIdx < selectedFields.length; rowIdx++) {
    +			int orcIdx = selectedFields[rowIdx]; // position of the column in the ORC schema and batch
    +			readField(rows, rowIdx, fieldTypes.get(orcIdx), batch.cols[orcIdx], null, rowsToRead); // null lengthVector: top-level fields are single values
    +		}
    +		return rowsToRead;
    +	}
    +
    +	/**
    +	 * Reads a vector of data into an array of objects.
    +	 *
    +	 * <p>The method dispatches on the category of the schema. For every supported category the
    +	 * vector is cast to the matching ColumnVector subclass and handed to a reader; if
    +	 * vector.noNulls is set, the "NonNull" variant of the reader is used, otherwise the
    +	 * variant without that prefix.
    +	 *
    +	 * @param vals The array that needs to be filled.
    +	 * @param fieldIdx If the vals array is an array of Row, the index of the field that needs to be filled.
    +	 *                 Otherwise a -1 must be passed and the data is directly filled into the array.
    +	 * @param schema The schema of the vector to read.
    +	 * @param vector The vector to read.
    +	 * @param lengthVector If the vector is of type List or Map, the number of sub-elements to read
    +	 *                     for each field. Otherwise, it must be null.
    +	 * @param childCount The number of vector entries to read.
    +	 * @throws UnsupportedOperationException If the schema category is UNION.
    +	 * @throws IllegalArgumentException If the schema category is none of the handled categories.
    +	 */
    +	private static void readField(Object[] vals, int fieldIdx, TypeDescription schema, ColumnVector
vector, long[] lengthVector, int childCount) {
    +
    +		// check the type of the vector to decide how to read it.
    +		switch (schema.getCategory()) {
    +			case BOOLEAN: // BOOLEAN, BYTE, SHORT, INT and LONG are all read from a LongColumnVector
    +				if (vector.noNulls) {
    +					readNonNullLongColumn(vals, fieldIdx, (LongColumnVector) vector, lengthVector, childCount,
OrcUtils::readBoolean, OrcUtils::boolArray);
    +				} else {
    +					readLongColumn(vals, fieldIdx, (LongColumnVector) vector, lengthVector, childCount,
OrcUtils::readBoolean, OrcUtils::boolArray);
    +				}
    +				break;
    +			case BYTE:
    +				if (vector.noNulls) {
    +					readNonNullLongColumn(vals, fieldIdx, (LongColumnVector) vector, lengthVector, childCount,
OrcUtils::readByte, OrcUtils::byteArray);
    +				} else {
    +					readLongColumn(vals, fieldIdx, (LongColumnVector) vector, lengthVector, childCount,
OrcUtils::readByte, OrcUtils::byteArray);
    +				}
    +				break;
    +			case SHORT:
    +				if (vector.noNulls) {
    +					readNonNullLongColumn(vals, fieldIdx, (LongColumnVector) vector, lengthVector, childCount,
OrcUtils::readShort, OrcUtils::shortArray);
    +				} else {
    +					readLongColumn(vals, fieldIdx, (LongColumnVector) vector, lengthVector, childCount,
OrcUtils::readShort, OrcUtils::shortArray);
    +				}
    +				break;
    +			case INT:
    +				if (vector.noNulls) {
    +					readNonNullLongColumn(vals, fieldIdx, (LongColumnVector) vector, lengthVector, childCount,
OrcUtils::readInt, OrcUtils::intArray);
    +				} else {
    +					readLongColumn(vals, fieldIdx, (LongColumnVector) vector, lengthVector, childCount,
OrcUtils::readInt, OrcUtils::intArray);
    +				}
    +				break;
    +			case LONG:
    +				if (vector.noNulls) {
    +					readNonNullLongColumn(vals, fieldIdx, (LongColumnVector) vector, lengthVector, childCount,
OrcUtils::readLong, OrcUtils::longArray);
    +				} else {
    +					readLongColumn(vals, fieldIdx, (LongColumnVector) vector, lengthVector, childCount,
OrcUtils::readLong, OrcUtils::longArray);
    +				}
    +				break;
    +			case FLOAT: // FLOAT and DOUBLE are both read from a DoubleColumnVector
    +				if (vector.noNulls) {
    +					readNonNullDoubleColumn(vals, fieldIdx, (DoubleColumnVector) vector, lengthVector,
childCount, OrcUtils::readFloat, OrcUtils::floatArray);
    +				} else {
    +					readDoubleColumn(vals, fieldIdx, (DoubleColumnVector) vector, lengthVector, childCount,
OrcUtils::readFloat, OrcUtils::floatArray);
    +				}
    +				break;
    +			case DOUBLE:
    +				if (vector.noNulls) {
    +					readNonNullDoubleColumn(vals, fieldIdx, (DoubleColumnVector) vector, lengthVector,
childCount, OrcUtils::readDouble, OrcUtils::doubleArray);
    +				} else {
    +					readDoubleColumn(vals, fieldIdx, (DoubleColumnVector) vector, lengthVector, childCount,
OrcUtils::readDouble, OrcUtils::doubleArray);
    +				}
    +				break;
    +			case CHAR:
    +			case VARCHAR:
    +			case STRING: // all character types are read from a BytesColumnVector as String
    +				if (vector.noNulls) {
    +					readNonNullBytesColumnAsString(vals, fieldIdx, (BytesColumnVector) vector, lengthVector,
childCount);
    +				} else {
    +					readBytesColumnAsString(vals, fieldIdx, (BytesColumnVector) vector, lengthVector,
childCount);
    +				}
    +				break;
    +			case DATE: // dates are read from a LongColumnVector
    +				if (vector.noNulls) {
    +					readNonNullLongColumnAsDate(vals, fieldIdx, (LongColumnVector) vector, lengthVector,
childCount);
    +				} else {
    +					readLongColumnAsDate(vals, fieldIdx, (LongColumnVector) vector, lengthVector, childCount);
    +				}
    +				break;
    +			case TIMESTAMP:
    +				if (vector.noNulls) {
    +					readNonNullTimestampColumn(vals, fieldIdx, (TimestampColumnVector) vector, lengthVector,
childCount);
    +				} else {
    +					readTimestampColumn(vals, fieldIdx, (TimestampColumnVector) vector, lengthVector,
childCount);
    +				}
    +				break;
    +			case BINARY: // binary data shares the BytesColumnVector with the character types
    +				if (vector.noNulls) {
    +					readNonNullBytesColumnAsBinary(vals, fieldIdx, (BytesColumnVector) vector, lengthVector,
childCount);
    +				} else {
    +					readBytesColumnAsBinary(vals, fieldIdx, (BytesColumnVector) vector, lengthVector,
childCount);
    +				}
    +				break;
    +			case DECIMAL:
    +				if (vector.noNulls) {
    +					readNonNullDecimalColumn(vals, fieldIdx, (DecimalColumnVector) vector, lengthVector,
childCount);
    +				}
    +				else {
    +					readDecimalColumn(vals, fieldIdx, (DecimalColumnVector) vector, lengthVector, childCount);
    +				}
    +				break;
    +			case STRUCT: // nested types additionally receive the schema to resolve their children
    +				if (vector.noNulls) {
    +					readNonNullStructColumn(vals, fieldIdx, (StructColumnVector) vector, schema, lengthVector,
childCount);
    +				} else {
    +					readStructColumn(vals, fieldIdx, (StructColumnVector) vector, schema, lengthVector,
childCount);
    +				}
    +				break;
    +			case LIST:
    +				if (vector.noNulls) {
    +					readNonNullListColumn(vals, fieldIdx, (ListColumnVector) vector, schema, lengthVector,
childCount);
    +				}
    +				else {
    +					readListColumn(vals, fieldIdx, (ListColumnVector) vector, schema, lengthVector,
childCount);
    +				}
    +				break;
    +			case MAP:
    +				if (vector.noNulls) {
    +					readNonNullMapColumn(vals, fieldIdx, (MapColumnVector) vector, schema, lengthVector,
childCount);
    +				}
    +				else {
    +					readMapColumn(vals, fieldIdx, (MapColumnVector) vector, schema, lengthVector, childCount);
    +				}
    +				break;
    +			case UNION:
    +				throw new UnsupportedOperationException("UNION type not supported yet");
    +			default:
    +				throw new IllegalArgumentException("Unknown type " + schema);
    +		}
    +	}
    +
    +	/**
    +	 * Reads a LongColumnVector without checking for null entries and converts every long
    +	 * value with the given reader function.
    +	 *
    +	 * <p>If lengthVector is null, one converted value is produced per vector entry and stored
    +	 * either directly in vals (fieldIdx == -1) or in field fieldIdx of the Row at the same
    +	 * index. If lengthVector is not null, the values are grouped into arrays: entry i of
    +	 * lengthVector is the length of the i-th array, and arrays are produced until childCount
    +	 * values have been consumed.
    +	 *
    +	 * @param vals The array that is filled, holding either plain objects or Rows.
    +	 * @param fieldIdx The Row field to set, or -1 to store results directly in vals.
    +	 * @param vector The vector to read.
    +	 * @param lengthVector The array lengths when reading into lists, or null for single values.
    +	 * @param childCount The number of vector entries to read.
    +	 * @param reader Converts a raw long of the vector into the result type.
    +	 * @param array Creates a result array of the requested length.
    +	 * @param <T> The type of the converted values.
    +	 */
    +	private static <T> void readNonNullLongColumn(Object[] vals, int fieldIdx, LongColumnVector
vector, long[] lengthVector, int childCount,
    +													LongFunction<T> reader, IntFunction<T[]> array) {
    +
    +		// check if the values need to be read into lists or as single values
    +		if (lengthVector == null) {
    +			if (vector.isRepeating) { // fill complete column with first value
    +				T repeatingValue = reader.apply(vector.vector[0]); // converted once and shared by all entries
    +				fillColumnWithRepeatingValue(vals, fieldIdx, repeatingValue, childCount);
    +			} else {
    +				if (fieldIdx == -1) { // set as an object
    +					for (int i = 0; i < childCount; i++) {
    +						vals[i] = reader.apply(vector.vector[i]);
    +					}
    +				} else { // set as a field of Row
    +					Row[] rows = (Row[]) vals;
    +					for (int i = 0; i < childCount; i++) {
    +						rows[i].setField(fieldIdx, reader.apply(vector.vector[i]));
    +					}
    +				}
    +			}
    +		} else { // in a list
    +			T[] temp;
    +			int offset = 0; // number of child values consumed so far
    +			if (vector.isRepeating) { // fill complete list with first value
    +				T repeatingValue = reader.apply(vector.vector[0]);
    +				for (int i = 0; offset < childCount; i++) { // i indexes the target entry, the loop ends on consumed values
    +					temp = array.apply((int) lengthVector[i]);
    +					Arrays.fill(temp, repeatingValue);
    +					offset += temp.length;
    +					if (fieldIdx == -1) {
    +						vals[i] = temp;
    +					} else {
    +						((Row) vals[i]).setField(fieldIdx, temp);
    +					}
    +				}
    +			} else {
    +				for (int i = 0; offset < childCount; i++) { // i indexes the target entry, the loop ends on consumed values
    +					temp = array.apply((int) lengthVector[i]);
    +					for (int j = 0; j < temp.length; j++) {
    +						temp[j] = reader.apply(vector.vector[offset++]); // child values of all lists are stored back to back
    +					}
    +					if (fieldIdx == -1) {
    +						vals[i] = temp;
    +					} else {
    +						((Row) vals[i]).setField(fieldIdx, temp);
    +					}
    +				}
    +			}
    +		}
    +	}
    +
    +	/**
    +	 * Reads a DoubleColumnVector without checking for null entries and converts every double
    +	 * value with the given reader function.
    +	 *
    +	 * <p>If lengthVector is null, one converted value is produced per vector entry and stored
    +	 * either directly in vals (fieldIdx == -1) or in field fieldIdx of the Row at the same
    +	 * index. If lengthVector is not null, the values are grouped into arrays: entry i of
    +	 * lengthVector is the length of the i-th array, and arrays are produced until childCount
    +	 * values have been consumed.
    +	 *
    +	 * @param vals The array that is filled, holding either plain objects or Rows.
    +	 * @param fieldIdx The Row field to set, or -1 to store results directly in vals.
    +	 * @param vector The vector to read.
    +	 * @param lengthVector The array lengths when reading into lists, or null for single values.
    +	 * @param childCount The number of vector entries to read.
    +	 * @param reader Converts a raw double of the vector into the result type.
    +	 * @param array Creates a result array of the requested length.
    +	 * @param <T> The type of the converted values.
    +	 */
    +	private static <T> void readNonNullDoubleColumn(Object[] vals, int fieldIdx, DoubleColumnVector
vector, long[] lengthVector, int childCount,
    +													DoubleFunction<T> reader, IntFunction<T[]> array) {
    +
    +		// check if the values need to be read into lists or as single values
    +		if (lengthVector == null) {
    +			if (vector.isRepeating) { // fill complete column with first value
    +				T repeatingValue = reader.apply(vector.vector[0]); // converted once and shared by all entries
    +				fillColumnWithRepeatingValue(vals, fieldIdx, repeatingValue, childCount);
    +			} else {
    +				if (fieldIdx == -1) { // set as an object
    +					for (int i = 0; i < childCount; i++) {
    +						vals[i] = reader.apply(vector.vector[i]);
    +					}
    +				} else { // set as a field of Row
    +					Row[] rows = (Row[]) vals;
    +					for (int i = 0; i < childCount; i++) {
    +						rows[i].setField(fieldIdx, reader.apply(vector.vector[i]));
    +					}
    +				}
    +			}
    +		} else { // in a list
    +			T[] temp;
    +			int offset = 0; // number of child values consumed so far
    +			if (vector.isRepeating) { // fill complete list with first value
    +				T repeatingValue = reader.apply(vector.vector[0]);
    +				for (int i = 0; offset < childCount; i++) { // i indexes the target entry, the loop ends on consumed values
    +					temp = array.apply((int) lengthVector[i]);
    +					Arrays.fill(temp, repeatingValue);
    +					offset += temp.length;
    +					if (fieldIdx == -1) {
    +						vals[i] = temp;
    +					} else {
    +						((Row) vals[i]).setField(fieldIdx, temp);
    +					}
    +				}
    +			} else {
    +				for (int i = 0; offset < childCount; i++) { // i indexes the target entry, the loop ends on consumed values
    +					temp = array.apply((int) lengthVector[i]);
    +					for (int j = 0; j < temp.length; j++) {
    +						temp[j] = reader.apply(vector.vector[offset++]); // child values of all lists are stored back to back
    +					}
    +					if (fieldIdx == -1) {
    +						vals[i] = temp;
    +					} else {
    +						((Row) vals[i]).setField(fieldIdx, temp);
    +					}
    +				}
    +			}
    +		}
    +	}
    +
    +	private static void readNonNullBytesColumnAsString(Object[] vals, int fieldIdx, BytesColumnVector
bytes, long[] lengthVector, int childCount) {
    +		// check if the values need to be read into lists or as single values
    +		if (lengthVector == null) {
    +			if (bytes.isRepeating) { // fill complete column with first value
    +				String repeatingValue = new String(bytes.vector[0], bytes.start[0], bytes.length[0]);
    --- End diff --
    
    Do we need to consider the Charset here?


---

Mime
View raw message