Return-Path: X-Original-To: archive-asf-public-internal@cust-asf2.ponee.io Delivered-To: archive-asf-public-internal@cust-asf2.ponee.io Received: from cust-asf.ponee.io (cust-asf.ponee.io [163.172.22.183]) by cust-asf2.ponee.io (Postfix) with ESMTP id 75E12200C40 for ; Thu, 23 Mar 2017 17:30:56 +0100 (CET) Received: by cust-asf.ponee.io (Postfix) id 74007160B83; Thu, 23 Mar 2017 16:30:56 +0000 (UTC) Delivered-To: archive-asf-public@cust-asf.ponee.io Received: from mail.apache.org (hermes.apache.org [140.211.11.3]) by cust-asf.ponee.io (Postfix) with SMTP id 43CCB160B68 for ; Thu, 23 Mar 2017 17:30:55 +0100 (CET) Received: (qmail 26447 invoked by uid 500); 23 Mar 2017 16:30:52 -0000 Mailing-List: contact commits-help@arrow.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: dev@arrow.apache.org Delivered-To: mailing list commits@arrow.apache.org Received: (qmail 26438 invoked by uid 99); 23 Mar 2017 16:30:52 -0000 Received: from git1-us-west.apache.org (HELO git1-us-west.apache.org) (140.211.11.23) by apache.org (qpsmtpd/0.29) with ESMTP; Thu, 23 Mar 2017 16:30:52 +0000 Received: by git1-us-west.apache.org (ASF Mail Server at git1-us-west.apache.org, from userid 33) id 907FBDFB31; Thu, 23 Mar 2017 16:30:52 +0000 (UTC) Content-Type: text/plain; charset="us-ascii" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit From: wesm@apache.org To: commits@arrow.apache.org Message-Id: <3172cf1cf1fc4411a8603daafee9c387@git.apache.org> X-Mailer: ASF-Git Admin Mailer Subject: arrow git commit: ARROW-662: [Format] Move Schema flatbuffers into their own file that can be included Date: Thu, 23 Mar 2017 16:30:52 +0000 (UTC) archived-at: Thu, 23 Mar 2017 16:30:56 -0000 Repository: arrow Updated Branches: refs/heads/master 7594492d5 -> 2a568f093 ARROW-662: [Format] Move Schema flatbuffers into their own file that can be included @julienledem for some reason the Java build is failing for me locally (also on master): ``` [ERROR] Failed to 
execute goal org.apache.maven.plugins:maven-compiler-plugin:3.2:testCompile (default-testCompile) on project arrow-vector: Compilation failure: Compilation failure: [ERROR] /home/wesm/code/arrow/java/vector/src/test/java/org/apache/arrow/vector/types/pojo/TestSchema.java:[38] error: a type with the same simple name is already defined by the single-type-import of List [ERROR] /home/wesm/code/arrow/java/vector/src/test/java/org/apache/arrow/vector/types/pojo/TestSchema.java:[64,19] error: List is abstract; cannot be instantiated [ERROR] /home/wesm/code/arrow/java/vector/src/test/java/org/apache/arrow/vector/types/pojo/TestSchema.java:[81,19] error: List is abstract; cannot be instantiated [ERROR] -> [Help 1] ``` Author: Wes McKinney Closes #429 from wesm/ARROW-662 and squashes the following commits: b588f81 [Wes McKinney] Move Schema flatbuffers into their own file that can be included Project: http://git-wip-us.apache.org/repos/asf/arrow/repo Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/2a568f09 Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/2a568f09 Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/2a568f09 Branch: refs/heads/master Commit: 2a568f093670daba7b5dab8c096669bcfdd09a5f Parents: 7594492 Author: Wes McKinney Authored: Thu Mar 23 12:30:44 2017 -0400 Committer: Wes McKinney Committed: Thu Mar 23 12:30:44 2017 -0400 ---------------------------------------------------------------------- cpp/src/arrow/ipc/CMakeLists.txt | 1 + format/File.fbs | 2 +- format/Message.fbs | 264 +------------------------------- format/Schema.fbs | 280 ++++++++++++++++++++++++++++++++++ java/format/pom.xml | 20 +-- 5 files changed, 295 insertions(+), 272 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/arrow/blob/2a568f09/cpp/src/arrow/ipc/CMakeLists.txt ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/ipc/CMakeLists.txt 
b/cpp/src/arrow/ipc/CMakeLists.txt index 3a98a38..629cc5b 100644 --- a/cpp/src/arrow/ipc/CMakeLists.txt +++ b/cpp/src/arrow/ipc/CMakeLists.txt @@ -113,6 +113,7 @@ set(FBS_OUTPUT_FILES set(FBS_SRC ${CMAKE_SOURCE_DIR}/../format/Message.fbs ${CMAKE_SOURCE_DIR}/../format/File.fbs + ${CMAKE_SOURCE_DIR}/../format/Schema.fbs ${CMAKE_CURRENT_SOURCE_DIR}/feather.fbs) foreach(FIL ${FBS_SRC}) http://git-wip-us.apache.org/repos/asf/arrow/blob/2a568f09/format/File.fbs ---------------------------------------------------------------------- diff --git a/format/File.fbs b/format/File.fbs index e8d6da4..3a27ca6 100644 --- a/format/File.fbs +++ b/format/File.fbs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -include "Message.fbs"; +include "Schema.fbs"; namespace org.apache.arrow.flatbuf; http://git-wip-us.apache.org/repos/asf/arrow/blob/2a568f09/format/Message.fbs ---------------------------------------------------------------------- diff --git a/format/Message.fbs b/format/Message.fbs index ff30ace..2cb6095 100644 --- a/format/Message.fbs +++ b/format/Message.fbs @@ -15,272 +15,14 @@ // specific language governing permissions and limitations // under the License. -namespace org.apache.arrow.flatbuf; - -enum MetadataVersion:short { - V1, - V2 -} - -/// ---------------------------------------------------------------------- -/// Logical types and their metadata (if any) -/// -/// These are stored in the flatbuffer in the Type union below - -table Null { -} - -/// A Struct_ in the flatbuffer metadata is the same as an Arrow Struct -/// (according to the physical memory layout). 
We used Struct_ here as -/// Struct is a reserved word in Flatbuffers -table Struct_ { -} - -table List { -} - -enum UnionMode:short { Sparse, Dense } - -/// A union is a complex type with children in Field -/// By default ids in the type vector refer to the offsets in the children -/// optionally typeIds provides an indirection between the child offset and the type id -/// for each child typeIds[offset] is the id used in the type vector -table Union { - mode: UnionMode; - typeIds: [ int ]; // optional, describes typeid of each child. -} - -table Int { - bitWidth: int; // restricted to 8, 16, 32, and 64 in v1 - is_signed: bool; -} - -enum Precision:short {HALF, SINGLE, DOUBLE} - -table FloatingPoint { - precision: Precision; -} - -/// Unicode with UTF-8 encoding -table Utf8 { -} - -table Binary { -} - -table FixedWidthBinary { - /// Number of bytes per value - byteWidth: int; -} - -table Bool { -} - -table Decimal { - precision: int; - scale: int; -} - -enum DateUnit: short { - DAY, - MILLISECOND -} - -/// Date is either a 32-bit or 64-bit type representing elapsed time since UNIX -/// epoch (1970-01-01), stored in either of two units: -/// -/// * Milliseconds (64 bits) indicating UNIX time elapsed since the epoch (no -/// leap seconds), where the values are evenly divisible by 86400000 -/// * Days (32 bits) since the UNIX epoch -table Date { - unit: DateUnit; -} - -enum TimeUnit: short { SECOND, MILLISECOND, MICROSECOND, NANOSECOND } - -/// Time type. The physical storage type depends on the unit -/// - SECOND and MILLISECOND: 32 bits -/// - MICROSECOND and NANOSECOND: 64 bits -table Time { - unit: TimeUnit; - bitWidth: int; -} - -/// Time elapsed from the Unix epoch, 00:00:00.000 on 1 January 1970, UTC. 
-table Timestamp { - unit: TimeUnit; - - /// The time zone is a string indicating the name of a time zone, one of: - /// - /// * As used in the Olson time zone database (the "tz database" or - /// "tzdata"), such as "America/New_York" - /// * An absolute time zone offset of the form +XX:XX or -XX:XX, such as +07:30 - /// - /// Whether a timezone string is present indicates different semantics about - /// the data: - /// - /// * If the time zone is null or equal to an empty string, the data is "time - /// zone naive" and shall be displayed *as is* to the user, not localized - /// to the locale of the user. This data can be though of as UTC but - /// without having "UTC" as the time zone, it is not considered to be - /// localized to any time zone - /// - /// * If the time zone is set to a valid value, values can be displayed as - /// "localized" to that time zone, even though the underlying 64-bit - /// integers are identical to the same data stored in UTC. Converting - /// between time zones is a metadata-only operation and does not change the - /// underlying values - timezone: string; -} - -enum IntervalUnit: short { YEAR_MONTH, DAY_TIME} -table Interval { - unit: IntervalUnit; -} - -/// ---------------------------------------------------------------------- -/// Top-level Type value, enabling extensible type-specific metadata. 
We can -/// add new logical types to Type without breaking backwards compatibility - -union Type { - Null, - Int, - FloatingPoint, - Binary, - Utf8, - Bool, - Decimal, - Date, - Time, - Timestamp, - Interval, - List, - Struct_, - Union, - FixedWidthBinary -} - -/// ---------------------------------------------------------------------- -/// The possible types of a vector - -enum VectorType: short { - /// used in List type, Dense Union and variable length primitive types (String, Binary) - OFFSET, - /// actual data, either wixed width primitive types in slots or variable width delimited by an OFFSET vector - DATA, - /// Bit vector indicating if each value is null - VALIDITY, - /// Type vector used in Union type - TYPE -} - -/// ---------------------------------------------------------------------- -/// represents the physical layout of a buffer -/// buffers have fixed width slots of a given type - -table VectorLayout { - /// the width of a slot in the buffer (typically 1, 8, 16, 32 or 64) - bit_width: short; - /// the purpose of the vector - type: VectorType; -} - - -/// ---------------------------------------------------------------------- -/// user defined key value pairs to add custom metadata to arrow -/// key namespacing is the responsibility of the user - -table KeyValue { - key: string; - value: [ubyte]; -} - -/// ---------------------------------------------------------------------- -/// Dictionary encoding metadata - -table DictionaryEncoding { - /// The known dictionary id in the application where this data is used. In - /// the file or streaming formats, the dictionary ids are found in the - /// DictionaryBatch messages - id: long; - - /// The dictionary indices are constrained to be positive integers. If this - /// field is null, the indices must be signed int32 - indexType: Int; +include "Schema.fbs"; - /// By default, dictionaries are not ordered, or the order does not have - /// semantic meaning. 
In some statistical, applications, dictionary-encoding - /// is used to represent ordered categorical data, and we provide a way to - /// preserve that metadata here - isOrdered: bool; -} - -/// ---------------------------------------------------------------------- -/// A field represents a named column in a record / row batch or child of a -/// nested type. -/// -/// - children is only for nested Arrow arrays -/// - For primitive types, children will have length 0 -/// - nullable should default to true in general - -table Field { - // Name is not required, in i.e. a List - name: string; - nullable: bool; - type: Type; - - // Present only if the field is dictionary encoded - dictionary: DictionaryEncoding; - - // children apply only to Nested data types like Struct, List and Union - children: [Field]; - /// layout of buffers produced for this type (as derived from the Type) - /// does not include children - /// each recordbatch will return instances of those Buffers. - layout: [ VectorLayout ]; - // User-defined metadata - custom_metadata: [ KeyValue ]; -} - -/// ---------------------------------------------------------------------- -/// Endianness of the platform that produces the RecordBatch - -enum Endianness:short { Little, Big } - -/// ---------------------------------------------------------------------- -/// A Schema describes the columns in a row batch - -table Schema { - - /// endianness of the buffer - /// it is Little Endian by default - /// if endianness doesn't match the underlying system then the vectors need to be converted - endianness: Endianness=Little; - - fields: [Field]; - // User-defined metadata - custom_metadata: [ KeyValue ]; -} +namespace org.apache.arrow.flatbuf; /// ---------------------------------------------------------------------- /// Data structures for describing a table row batch (a collection of /// equal-length Arrow arrays) -/// A Buffer represents a single contiguous memory segment -struct Buffer { - /// The shared memory 
page id where this buffer is located. Currently this is - /// not used - page: int; - - /// The relative offset into the shared memory page where the bytes for this - /// buffer starts - offset: long; - - /// The absolute length (in bytes) of the memory buffer. The memory is found - /// from offset (inclusive) to offset + length (non-inclusive). - length: long; -} - /// Metadata about a field at some level of a nested type tree (but not /// its children). /// @@ -349,4 +91,4 @@ table Message { bodyLength: long; } -root_type Message; +root_type Message; \ No newline at end of file http://git-wip-us.apache.org/repos/asf/arrow/blob/2a568f09/format/Schema.fbs ---------------------------------------------------------------------- diff --git a/format/Schema.fbs b/format/Schema.fbs new file mode 100644 index 0000000..5268bf9 --- /dev/null +++ b/format/Schema.fbs @@ -0,0 +1,280 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +/// Logical types, vector layouts, and schemas + +namespace org.apache.arrow.flatbuf; + +enum MetadataVersion:short { + V1, + V2 +} + +/// These are stored in the flatbuffer in the Type union below + +table Null { +} + +/// A Struct_ in the flatbuffer metadata is the same as an Arrow Struct +/// (according to the physical memory layout). We used Struct_ here as +/// Struct is a reserved word in Flatbuffers +table Struct_ { +} + +table List { +} + +enum UnionMode:short { Sparse, Dense } + +/// A union is a complex type with children in Field +/// By default ids in the type vector refer to the offsets in the children +/// optionally typeIds provides an indirection between the child offset and the type id +/// for each child typeIds[offset] is the id used in the type vector +table Union { + mode: UnionMode; + typeIds: [ int ]; // optional, describes typeid of each child. +} + +table Int { + bitWidth: int; // restricted to 8, 16, 32, and 64 in v1 + is_signed: bool; +} + +enum Precision:short {HALF, SINGLE, DOUBLE} + +table FloatingPoint { + precision: Precision; +} + +/// Unicode with UTF-8 encoding +table Utf8 { +} + +table Binary { +} + +table FixedWidthBinary { + /// Number of bytes per value + byteWidth: int; +} + +table Bool { +} + +table Decimal { + precision: int; + scale: int; +} + +enum DateUnit: short { + DAY, + MILLISECOND +} + +/// Date is either a 32-bit or 64-bit type representing elapsed time since UNIX +/// epoch (1970-01-01), stored in either of two units: +/// +/// * Milliseconds (64 bits) indicating UNIX time elapsed since the epoch (no +/// leap seconds), where the values are evenly divisible by 86400000 +/// * Days (32 bits) since the UNIX epoch +table Date { + unit: DateUnit; +} + +enum TimeUnit: short { SECOND, MILLISECOND, MICROSECOND, NANOSECOND } + +/// Time type. 
The physical storage type depends on the unit +/// - SECOND and MILLISECOND: 32 bits +/// - MICROSECOND and NANOSECOND: 64 bits +table Time { + unit: TimeUnit; + bitWidth: int; +} + +/// Time elapsed from the Unix epoch, 00:00:00.000 on 1 January 1970, UTC. +table Timestamp { + unit: TimeUnit; + + /// The time zone is a string indicating the name of a time zone, one of: + /// + /// * As used in the Olson time zone database (the "tz database" or + /// "tzdata"), such as "America/New_York" + /// * An absolute time zone offset of the form +XX:XX or -XX:XX, such as +07:30 + /// + /// Whether a timezone string is present indicates different semantics about + /// the data: + /// + /// * If the time zone is null or equal to an empty string, the data is "time + /// zone naive" and shall be displayed *as is* to the user, not localized + /// to the locale of the user. This data can be thought of as UTC but + /// without having "UTC" as the time zone, it is not considered to be + /// localized to any time zone + /// + /// * If the time zone is set to a valid value, values can be displayed as + /// "localized" to that time zone, even though the underlying 64-bit + /// integers are identical to the same data stored in UTC. Converting + /// between time zones is a metadata-only operation and does not change the + /// underlying values + timezone: string; +} + +enum IntervalUnit: short { YEAR_MONTH, DAY_TIME} +table Interval { + unit: IntervalUnit; +} + +/// ---------------------------------------------------------------------- +/// Top-level Type value, enabling extensible type-specific metadata. 
We can +/// add new logical types to Type without breaking backwards compatibility + +union Type { + Null, + Int, + FloatingPoint, + Binary, + Utf8, + Bool, + Decimal, + Date, + Time, + Timestamp, + Interval, + List, + Struct_, + Union, + FixedWidthBinary +} + +/// ---------------------------------------------------------------------- +/// The possible types of a vector + +enum VectorType: short { + /// used in List type, Dense Union and variable length primitive types (String, Binary) + OFFSET, + /// actual data, either fixed width primitive types in slots or variable width delimited by an OFFSET vector + DATA, + /// Bit vector indicating if each value is null + VALIDITY, + /// Type vector used in Union type + TYPE +} + +/// ---------------------------------------------------------------------- +/// represents the physical layout of a buffer +/// buffers have fixed width slots of a given type + +table VectorLayout { + /// the width of a slot in the buffer (typically 1, 8, 16, 32 or 64) + bit_width: short; + /// the purpose of the vector + type: VectorType; +} + + +/// ---------------------------------------------------------------------- +/// user defined key value pairs to add custom metadata to arrow +/// key namespacing is the responsibility of the user + +table KeyValue { + key: string; + value: [ubyte]; +} + +/// ---------------------------------------------------------------------- +/// Dictionary encoding metadata + +table DictionaryEncoding { + /// The known dictionary id in the application where this data is used. In + /// the file or streaming formats, the dictionary ids are found in the + /// DictionaryBatch messages + id: long; + + /// The dictionary indices are constrained to be positive integers. If this + /// field is null, the indices must be signed int32 + indexType: Int; + + /// By default, dictionaries are not ordered, or the order does not have + /// semantic meaning. 
In some statistical applications, dictionary-encoding + /// is used to represent ordered categorical data, and we provide a way to + /// preserve that metadata here + isOrdered: bool; +} + +/// ---------------------------------------------------------------------- +/// A field represents a named column in a record / row batch or child of a +/// nested type. +/// +/// - children is only for nested Arrow arrays +/// - For primitive types, children will have length 0 +/// - nullable should default to true in general + +table Field { + // Name is not required, e.g. in a List + name: string; + nullable: bool; + type: Type; + + // Present only if the field is dictionary encoded + dictionary: DictionaryEncoding; + + // children apply only to Nested data types like Struct, List and Union + children: [Field]; + /// layout of buffers produced for this type (as derived from the Type) + /// does not include children + /// each recordbatch will return instances of those Buffers. + layout: [ VectorLayout ]; + // User-defined metadata + custom_metadata: [ KeyValue ]; +} + +/// ---------------------------------------------------------------------- +/// Endianness of the platform producing the data + +enum Endianness:short { Little, Big } + +/// ---------------------------------------------------------------------- +/// A Buffer represents a single contiguous memory segment +struct Buffer { + /// The shared memory page id where this buffer is located. Currently this is + /// not used + page: int; + + /// The relative offset into the shared memory page where the bytes for this + /// buffer starts + offset: long; + + /// The absolute length (in bytes) of the memory buffer. The memory is found + /// from offset (inclusive) to offset + length (non-inclusive). 
+ length: long; +} + +/// ---------------------------------------------------------------------- +/// A Schema describes the columns in a row batch + +table Schema { + + /// endianness of the buffer + /// it is Little Endian by default + /// if endianness doesn't match the underlying system then the vectors need to be converted + endianness: Endianness=Little; + + fields: [Field]; + // User-defined metadata + custom_metadata: [ KeyValue ]; +} + +root_type Schema; http://git-wip-us.apache.org/repos/asf/arrow/blob/2a568f09/java/format/pom.xml ---------------------------------------------------------------------- diff --git a/java/format/pom.xml b/java/format/pom.xml index c65a7bc..e7a58a4 100644 --- a/java/format/pom.xml +++ b/java/format/pom.xml @@ -1,13 +1,13 @@ - 4.0.0 @@ -109,6 +109,7 @@ -j -o ${flatc.generated.files} + ../../format/Schema.fbs ../../format/Message.fbs ../../format/File.fbs @@ -165,4 +166,3 @@ -