arrow-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From w...@apache.org
Subject arrow git commit: ARROW-662: [Format] Move Schema flatbuffers into their own file that can be included
Date Thu, 23 Mar 2017 16:30:52 GMT
Repository: arrow
Updated Branches:
  refs/heads/master 7594492d5 -> 2a568f093


ARROW-662: [Format] Move Schema flatbuffers into their own file that can be included

@julienledem for some reason the Java build is failing for me locally (also on master):

```
[ERROR] Failed to execute goal org.apache.maven.plugins:maven-compiler-plugin:3.2:testCompile
(default-testCompile) on project arrow-vector: Compilation failure: Compilation failure:
[ERROR] /home/wesm/code/arrow/java/vector/src/test/java/org/apache/arrow/vector/types/pojo/TestSchema.java:[38]
error: a type with the same simple name is already defined by the single-type-import of List
[ERROR] /home/wesm/code/arrow/java/vector/src/test/java/org/apache/arrow/vector/types/pojo/TestSchema.java:[64,19]
error: List is abstract; cannot be instantiated
[ERROR] /home/wesm/code/arrow/java/vector/src/test/java/org/apache/arrow/vector/types/pojo/TestSchema.java:[81,19]
error: List is abstract; cannot be instantiated
[ERROR] -> [Help 1]
```

Author: Wes McKinney <wes.mckinney@twosigma.com>

Closes #429 from wesm/ARROW-662 and squashes the following commits:

b588f81 [Wes McKinney] Move Schema flatbuffers into their own file that can be included


Project: http://git-wip-us.apache.org/repos/asf/arrow/repo
Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/2a568f09
Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/2a568f09
Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/2a568f09

Branch: refs/heads/master
Commit: 2a568f093670daba7b5dab8c096669bcfdd09a5f
Parents: 7594492
Author: Wes McKinney <wes.mckinney@twosigma.com>
Authored: Thu Mar 23 12:30:44 2017 -0400
Committer: Wes McKinney <wes.mckinney@twosigma.com>
Committed: Thu Mar 23 12:30:44 2017 -0400

----------------------------------------------------------------------
 cpp/src/arrow/ipc/CMakeLists.txt |   1 +
 format/File.fbs                  |   2 +-
 format/Message.fbs               | 264 +-------------------------------
 format/Schema.fbs                | 280 ++++++++++++++++++++++++++++++++++
 java/format/pom.xml              |  20 +--
 5 files changed, 295 insertions(+), 272 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/arrow/blob/2a568f09/cpp/src/arrow/ipc/CMakeLists.txt
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/ipc/CMakeLists.txt b/cpp/src/arrow/ipc/CMakeLists.txt
index 3a98a38..629cc5b 100644
--- a/cpp/src/arrow/ipc/CMakeLists.txt
+++ b/cpp/src/arrow/ipc/CMakeLists.txt
@@ -113,6 +113,7 @@ set(FBS_OUTPUT_FILES
 set(FBS_SRC
   ${CMAKE_SOURCE_DIR}/../format/Message.fbs
   ${CMAKE_SOURCE_DIR}/../format/File.fbs
+  ${CMAKE_SOURCE_DIR}/../format/Schema.fbs
   ${CMAKE_CURRENT_SOURCE_DIR}/feather.fbs)
 
 foreach(FIL ${FBS_SRC})

http://git-wip-us.apache.org/repos/asf/arrow/blob/2a568f09/format/File.fbs
----------------------------------------------------------------------
diff --git a/format/File.fbs b/format/File.fbs
index e8d6da4..3a27ca6 100644
--- a/format/File.fbs
+++ b/format/File.fbs
@@ -15,7 +15,7 @@
 // specific language governing permissions and limitations
 // under the License.
 
-include "Message.fbs";
+include "Schema.fbs";
 
 namespace org.apache.arrow.flatbuf;
 

http://git-wip-us.apache.org/repos/asf/arrow/blob/2a568f09/format/Message.fbs
----------------------------------------------------------------------
diff --git a/format/Message.fbs b/format/Message.fbs
index ff30ace..2cb6095 100644
--- a/format/Message.fbs
+++ b/format/Message.fbs
@@ -15,272 +15,14 @@
 // specific language governing permissions and limitations
 // under the License.
 
-namespace org.apache.arrow.flatbuf;
-
-enum MetadataVersion:short {
-  V1,
-  V2
-}
-
-/// ----------------------------------------------------------------------
-/// Logical types and their metadata (if any)
-///
-/// These are stored in the flatbuffer in the Type union below
-
-table Null {
-}
-
-/// A Struct_ in the flatbuffer metadata is the same as an Arrow Struct
-/// (according to the physical memory layout). We used Struct_ here as
-/// Struct is a reserved word in Flatbuffers
-table Struct_ {
-}
-
-table List {
-}
-
-enum UnionMode:short { Sparse, Dense }
-
-/// A union is a complex type with children in Field
-/// By default ids in the type vector refer to the offsets in the children
-/// optionally typeIds provides an indirection between the child offset and the type id
-/// for each child typeIds[offset] is the id used in the type vector
-table Union {
-  mode: UnionMode;
-  typeIds: [ int ]; // optional, describes typeid of each child.
-}
-
-table Int {
-  bitWidth: int; // restricted to 8, 16, 32, and 64 in v1
-  is_signed: bool;
-}
-
-enum Precision:short {HALF, SINGLE, DOUBLE}
-
-table FloatingPoint {
-  precision: Precision;
-}
-
-/// Unicode with UTF-8 encoding
-table Utf8 {
-}
-
-table Binary {
-}
-
-table FixedWidthBinary {
-  /// Number of bytes per value
-  byteWidth: int;
-}
-
-table Bool {
-}
-
-table Decimal {
-  precision: int;
-  scale: int;
-}
-
-enum DateUnit: short {
-  DAY,
-  MILLISECOND
-}
-
-/// Date is either a 32-bit or 64-bit type representing elapsed time since UNIX
-/// epoch (1970-01-01), stored in either of two units:
-///
-/// * Milliseconds (64 bits) indicating UNIX time elapsed since the epoch (no
-///   leap seconds), where the values are evenly divisible by 86400000
-/// * Days (32 bits) since the UNIX epoch
-table Date {
-  unit: DateUnit;
-}
-
-enum TimeUnit: short { SECOND, MILLISECOND, MICROSECOND, NANOSECOND }
-
-/// Time type. The physical storage type depends on the unit
-/// - SECOND and MILLISECOND: 32 bits
-/// - MICROSECOND and NANOSECOND: 64 bits
-table Time {
-  unit: TimeUnit;
-  bitWidth: int;
-}
-
-/// Time elapsed from the Unix epoch, 00:00:00.000 on 1 January 1970, UTC.
-table Timestamp {
-  unit: TimeUnit;
-
-  /// The time zone is a string indicating the name of a time zone, one of:
-  ///
-  /// * As used in the Olson time zone database (the "tz database" or
-  ///   "tzdata"), such as "America/New_York"
-  /// * An absolute time zone offset of the form +XX:XX or -XX:XX, such as +07:30
-  ///
-  /// Whether a timezone string is present indicates different semantics about
-  /// the data:
-  ///
-  /// * If the time zone is null or equal to an empty string, the data is "time
-  ///   zone naive" and shall be displayed *as is* to the user, not localized
-  ///   to the locale of the user. This data can be though of as UTC but
-  ///   without having "UTC" as the time zone, it is not considered to be
-  ///   localized to any time zone
-  ///
-  /// * If the time zone is set to a valid value, values can be displayed as
-  ///   "localized" to that time zone, even though the underlying 64-bit
-  ///   integers are identical to the same data stored in UTC. Converting
-  ///   between time zones is a metadata-only operation and does not change the
-  ///   underlying values
-  timezone: string;
-}
-
-enum IntervalUnit: short { YEAR_MONTH, DAY_TIME}
-table Interval {
-  unit: IntervalUnit;
-}
-
-/// ----------------------------------------------------------------------
-/// Top-level Type value, enabling extensible type-specific metadata. We can
-/// add new logical types to Type without breaking backwards compatibility
-
-union Type {
-  Null,
-  Int,
-  FloatingPoint,
-  Binary,
-  Utf8,
-  Bool,
-  Decimal,
-  Date,
-  Time,
-  Timestamp,
-  Interval,
-  List,
-  Struct_,
-  Union,
-  FixedWidthBinary
-}
-
-/// ----------------------------------------------------------------------
-/// The possible types of a vector
-
-enum VectorType: short {
-  /// used in List type, Dense Union and variable length primitive types (String, Binary)
-  OFFSET,
-  /// actual data, either wixed width primitive types in slots or variable width delimited by an OFFSET vector
-  DATA,
-  /// Bit vector indicating if each value is null
-  VALIDITY,
-  /// Type vector used in Union type
-  TYPE
-}
-
-/// ----------------------------------------------------------------------
-/// represents the physical layout of a buffer
-/// buffers have fixed width slots of a given type
-
-table VectorLayout {
-  /// the width of a slot in the buffer (typically 1, 8, 16, 32 or 64)
-  bit_width: short;
-  /// the purpose of the vector
-  type: VectorType;
-}
-
-
-/// ----------------------------------------------------------------------
-/// user defined key value pairs to add custom metadata to arrow
-/// key namespacing is the responsibility of the user
-
-table KeyValue {
-  key: string;
-  value: [ubyte];
-}
-
-/// ----------------------------------------------------------------------
-/// Dictionary encoding metadata
-
-table DictionaryEncoding {
-  /// The known dictionary id in the application where this data is used. In
-  /// the file or streaming formats, the dictionary ids are found in the
-  /// DictionaryBatch messages
-  id: long;
-
-  /// The dictionary indices are constrained to be positive integers. If this
-  /// field is null, the indices must be signed int32
-  indexType: Int;
+include "Schema.fbs";
 
-  /// By default, dictionaries are not ordered, or the order does not have
-  /// semantic meaning. In some statistical, applications, dictionary-encoding
-  /// is used to represent ordered categorical data, and we provide a way to
-  /// preserve that metadata here
-  isOrdered: bool;
-}
-
-/// ----------------------------------------------------------------------
-/// A field represents a named column in a record / row batch or child of a
-/// nested type.
-///
-/// - children is only for nested Arrow arrays
-/// - For primitive types, children will have length 0
-/// - nullable should default to true in general
-
-table Field {
-  // Name is not required, in i.e. a List
-  name: string;
-  nullable: bool;
-  type: Type;
-
-  // Present only if the field is dictionary encoded
-  dictionary: DictionaryEncoding;
-
-  // children apply only to Nested data types like Struct, List and Union
-  children: [Field];
-  /// layout of buffers produced for this type (as derived from the Type)
-  /// does not include children
-  /// each recordbatch will return instances of those Buffers.
-  layout: [ VectorLayout ];
-  // User-defined metadata
-  custom_metadata: [ KeyValue ];
-}
-
-/// ----------------------------------------------------------------------
-/// Endianness of the platform that produces the RecordBatch
-
-enum Endianness:short { Little, Big }
-
-/// ----------------------------------------------------------------------
-/// A Schema describes the columns in a row batch
-
-table Schema {
-
-  /// endianness of the buffer
-  /// it is Little Endian by default
-  /// if endianness doesn't match the underlying system then the vectors need to be converted
-  endianness: Endianness=Little;
-
-  fields: [Field];
-  // User-defined metadata
-  custom_metadata: [ KeyValue ];
-}
+namespace org.apache.arrow.flatbuf;
 
 /// ----------------------------------------------------------------------
 /// Data structures for describing a table row batch (a collection of
 /// equal-length Arrow arrays)
 
-/// A Buffer represents a single contiguous memory segment
-struct Buffer {
-  /// The shared memory page id where this buffer is located. Currently this is
-  /// not used
-  page: int;
-
-  /// The relative offset into the shared memory page where the bytes for this
-  /// buffer starts
-  offset: long;
-
-  /// The absolute length (in bytes) of the memory buffer. The memory is found
-  /// from offset (inclusive) to offset + length (non-inclusive).
-  length: long;
-}
-
 /// Metadata about a field at some level of a nested type tree (but not
 /// its children).
 ///
@@ -349,4 +91,4 @@ table Message {
   bodyLength: long;
 }
 
-root_type Message;
+root_type Message;
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/arrow/blob/2a568f09/format/Schema.fbs
----------------------------------------------------------------------
diff --git a/format/Schema.fbs b/format/Schema.fbs
new file mode 100644
index 0000000..5268bf9
--- /dev/null
+++ b/format/Schema.fbs
@@ -0,0 +1,280 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+/// Logical types, vector layouts, and schemas
+
+namespace org.apache.arrow.flatbuf;
+
+enum MetadataVersion:short {
+  V1,
+  V2
+}
+
+/// These are stored in the flatbuffer in the Type union below
+
+table Null {
+}
+
+/// A Struct_ in the flatbuffer metadata is the same as an Arrow Struct
+/// (according to the physical memory layout). We used Struct_ here as
+/// Struct is a reserved word in Flatbuffers
+table Struct_ {
+}
+
+table List {
+}
+
+enum UnionMode:short { Sparse, Dense }
+
+/// A union is a complex type with children in Field
+/// By default ids in the type vector refer to the offsets in the children
+/// optionally typeIds provides an indirection between the child offset and the type id
+/// for each child typeIds[offset] is the id used in the type vector
+table Union {
+  mode: UnionMode;
+  typeIds: [ int ]; // optional, describes typeid of each child.
+}
+
+table Int {
+  bitWidth: int; // restricted to 8, 16, 32, and 64 in v1
+  is_signed: bool;
+}
+
+enum Precision:short {HALF, SINGLE, DOUBLE}
+
+table FloatingPoint {
+  precision: Precision;
+}
+
+/// Unicode with UTF-8 encoding
+table Utf8 {
+}
+
+table Binary {
+}
+
+table FixedWidthBinary {
+  /// Number of bytes per value
+  byteWidth: int;
+}
+
+table Bool {
+}
+
+table Decimal {
+  precision: int;
+  scale: int;
+}
+
+enum DateUnit: short {
+  DAY,
+  MILLISECOND
+}
+
+/// Date is either a 32-bit or 64-bit type representing elapsed time since UNIX
+/// epoch (1970-01-01), stored in either of two units:
+///
+/// * Milliseconds (64 bits) indicating UNIX time elapsed since the epoch (no
+///   leap seconds), where the values are evenly divisible by 86400000
+/// * Days (32 bits) since the UNIX epoch
+table Date {
+  unit: DateUnit;
+}
+
+enum TimeUnit: short { SECOND, MILLISECOND, MICROSECOND, NANOSECOND }
+
+/// Time type. The physical storage type depends on the unit
+/// - SECOND and MILLISECOND: 32 bits
+/// - MICROSECOND and NANOSECOND: 64 bits
+table Time {
+  unit: TimeUnit;
+  bitWidth: int;
+}
+
+/// Time elapsed from the Unix epoch, 00:00:00.000 on 1 January 1970, UTC.
+table Timestamp {
+  unit: TimeUnit;
+
+  /// The time zone is a string indicating the name of a time zone, one of:
+  ///
+  /// * As used in the Olson time zone database (the "tz database" or
+  ///   "tzdata"), such as "America/New_York"
+  /// * An absolute time zone offset of the form +XX:XX or -XX:XX, such as +07:30
+  ///
+  /// Whether a timezone string is present indicates different semantics about
+  /// the data:
+  ///
+  /// * If the time zone is null or equal to an empty string, the data is "time
+  ///   zone naive" and shall be displayed *as is* to the user, not localized
+  ///   to the locale of the user. This data can be though of as UTC but
+  ///   without having "UTC" as the time zone, it is not considered to be
+  ///   localized to any time zone
+  ///
+  /// * If the time zone is set to a valid value, values can be displayed as
+  ///   "localized" to that time zone, even though the underlying 64-bit
+  ///   integers are identical to the same data stored in UTC. Converting
+  ///   between time zones is a metadata-only operation and does not change the
+  ///   underlying values
+  timezone: string;
+}
+
+enum IntervalUnit: short { YEAR_MONTH, DAY_TIME}
+table Interval {
+  unit: IntervalUnit;
+}
+
+/// ----------------------------------------------------------------------
+/// Top-level Type value, enabling extensible type-specific metadata. We can
+/// add new logical types to Type without breaking backwards compatibility
+
+union Type {
+  Null,
+  Int,
+  FloatingPoint,
+  Binary,
+  Utf8,
+  Bool,
+  Decimal,
+  Date,
+  Time,
+  Timestamp,
+  Interval,
+  List,
+  Struct_,
+  Union,
+  FixedWidthBinary
+}
+
+/// ----------------------------------------------------------------------
+/// The possible types of a vector
+
+enum VectorType: short {
+  /// used in List type, Dense Union and variable length primitive types (String, Binary)
+  OFFSET,
+  /// actual data, either wixed width primitive types in slots or variable width delimited by an OFFSET vector
+  DATA,
+  /// Bit vector indicating if each value is null
+  VALIDITY,
+  /// Type vector used in Union type
+  TYPE
+}
+
+/// ----------------------------------------------------------------------
+/// represents the physical layout of a buffer
+/// buffers have fixed width slots of a given type
+
+table VectorLayout {
+  /// the width of a slot in the buffer (typically 1, 8, 16, 32 or 64)
+  bit_width: short;
+  /// the purpose of the vector
+  type: VectorType;
+}
+
+
+/// ----------------------------------------------------------------------
+/// user defined key value pairs to add custom metadata to arrow
+/// key namespacing is the responsibility of the user
+
+table KeyValue {
+  key: string;
+  value: [ubyte];
+}
+
+/// ----------------------------------------------------------------------
+/// Dictionary encoding metadata
+
+table DictionaryEncoding {
+  /// The known dictionary id in the application where this data is used. In
+  /// the file or streaming formats, the dictionary ids are found in the
+  /// DictionaryBatch messages
+  id: long;
+
+  /// The dictionary indices are constrained to be positive integers. If this
+  /// field is null, the indices must be signed int32
+  indexType: Int;
+
+  /// By default, dictionaries are not ordered, or the order does not have
+  /// semantic meaning. In some statistical, applications, dictionary-encoding
+  /// is used to represent ordered categorical data, and we provide a way to
+  /// preserve that metadata here
+  isOrdered: bool;
+}
+
+/// ----------------------------------------------------------------------
+/// A field represents a named column in a record / row batch or child of a
+/// nested type.
+///
+/// - children is only for nested Arrow arrays
+/// - For primitive types, children will have length 0
+/// - nullable should default to true in general
+
+table Field {
+  // Name is not required, in i.e. a List
+  name: string;
+  nullable: bool;
+  type: Type;
+
+  // Present only if the field is dictionary encoded
+  dictionary: DictionaryEncoding;
+
+  // children apply only to Nested data types like Struct, List and Union
+  children: [Field];
+  /// layout of buffers produced for this type (as derived from the Type)
+  /// does not include children
+  /// each recordbatch will return instances of those Buffers.
+  layout: [ VectorLayout ];
+  // User-defined metadata
+  custom_metadata: [ KeyValue ];
+}
+
+/// ----------------------------------------------------------------------
+/// Endianness of the platform producing the data
+
+enum Endianness:short { Little, Big }
+
+/// ----------------------------------------------------------------------
+/// A Buffer represents a single contiguous memory segment
+struct Buffer {
+  /// The shared memory page id where this buffer is located. Currently this is
+  /// not used
+  page: int;
+
+  /// The relative offset into the shared memory page where the bytes for this
+  /// buffer starts
+  offset: long;
+
+  /// The absolute length (in bytes) of the memory buffer. The memory is found
+  /// from offset (inclusive) to offset + length (non-inclusive).
+  length: long;
+}
+
+/// ----------------------------------------------------------------------
+/// A Schema describes the columns in a row batch
+
+table Schema {
+
+  /// endianness of the buffer
+  /// it is Little Endian by default
+  /// if endianness doesn't match the underlying system then the vectors need to be converted
+  endianness: Endianness=Little;
+
+  fields: [Field];
+  // User-defined metadata
+  custom_metadata: [ KeyValue ];
+}
+
+root_type Schema;

http://git-wip-us.apache.org/repos/asf/arrow/blob/2a568f09/java/format/pom.xml
----------------------------------------------------------------------
diff --git a/java/format/pom.xml b/java/format/pom.xml
index c65a7bc..e7a58a4 100644
--- a/java/format/pom.xml
+++ b/java/format/pom.xml
@@ -1,13 +1,13 @@
 <?xml version="1.0"?>
-<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor 
-  license agreements. See the NOTICE file distributed with this work for additional 
-  information regarding copyright ownership. The ASF licenses this file to 
-  You under the Apache License, Version 2.0 (the "License"); you may not use 
-  this file except in compliance with the License. You may obtain a copy of 
-  the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required 
-  by applicable law or agreed to in writing, software distributed under the 
-  License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS 
-  OF ANY KIND, either express or implied. See the License for the specific 
+<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor
+  license agreements. See the NOTICE file distributed with this work for additional
+  information regarding copyright ownership. The ASF licenses this file to
+  You under the Apache License, Version 2.0 (the "License"); you may not use
+  this file except in compliance with the License. You may obtain a copy of
+  the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required
+  by applicable law or agreed to in writing, software distributed under the
+  License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS
+  OF ANY KIND, either express or implied. See the License for the specific
   language governing permissions and limitations under the License. -->
 <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
 <modelVersion>4.0.0</modelVersion>
@@ -109,6 +109,7 @@
               <argument>-j</argument>
               <argument>-o</argument>
               <argument>${flatc.generated.files}</argument>
+              <argument>../../format/Schema.fbs</argument>
               <argument>../../format/Message.fbs</argument>
               <argument>../../format/File.fbs</argument>
             </arguments>
@@ -165,4 +166,3 @@
 
 </build>
 </project>
-


Mime
View raw message