Repository: incubator-parquet-cpp
Updated Branches:
refs/heads/master bce89a021 -> db20bae08
Add "parquet_reader.cc" to the "example" folder.
The "parquet_reader" program is a simple example that reads the contents
of a Parquet-format file and prints them in a human-readable form.
Project: http://git-wip-us.apache.org/repos/asf/incubator-parquet-cpp/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-parquet-cpp/commit/db20bae0
Tree: http://git-wip-us.apache.org/repos/asf/incubator-parquet-cpp/tree/db20bae0
Diff: http://git-wip-us.apache.org/repos/asf/incubator-parquet-cpp/diff/db20bae0
Branch: refs/heads/master
Commit: db20bae08d6377211630019d5593ef9f08bd3cb4
Parents: bce89a0
Author: Yue Chen <ychen.contact@gmail.com>
Authored: Mon Sep 8 00:18:29 2014 -0400
Committer: Nong Li <nong@cloudera.com>
Committed: Tue Oct 28 13:08:10 2014 -0700
----------------------------------------------------------------------
example/CMakeLists.txt | 3 +
example/parquet_reader.cc | 294 +++++++++++++++++++++++++++++++++++++++++
2 files changed, 297 insertions(+)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-parquet-cpp/blob/db20bae0/example/CMakeLists.txt
----------------------------------------------------------------------
diff --git a/example/CMakeLists.txt b/example/CMakeLists.txt
index 8eed603..1f59856 100644
--- a/example/CMakeLists.txt
+++ b/example/CMakeLists.txt
@@ -32,3 +32,6 @@ target_link_libraries(compute_stats ${LINK_LIBS})
add_executable(decode_benchmark decode_benchmark.cc)
target_link_libraries(decode_benchmark ${LINK_LIBS})
+
+add_executable(parquet_reader parquet_reader.cc)  # example program built from parquet_reader.cc
+target_link_libraries(parquet_reader ${LINK_LIBS})
http://git-wip-us.apache.org/repos/asf/incubator-parquet-cpp/blob/db20bae0/example/parquet_reader.cc
----------------------------------------------------------------------
diff --git a/example/parquet_reader.cc b/example/parquet_reader.cc
new file mode 100644
index 0000000..c02ffb0
--- /dev/null
+++ b/example/parquet_reader.cc
@@ -0,0 +1,294 @@
+// Copyright 2012 Cloudera Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <parquet/parquet.h>
+
+#include <assert.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <iostream>
+#include <string>
+#include <vector>
+
+#include "example_util.h"
+
+// INIT_SIZE: element count allocated up front for each column buffer (example value)
+#define INIT_SIZE 100
+#define COL_WIDTH "17"  // printf field width; a string so it can be spliced into format strings
+
+using namespace parquet;
+using namespace parquet_cpp;
+using namespace std;
+
+struct AnyType {  // One value of any of the column types this example reads.
+ union {  // Anonymous union: members share storage; the struct does not record which one is set.
+ bool bool_val;
+ int32_t int32_val;
+ int64_t int64_val;
+ float float_val;
+ double double_val;
+ ByteArray byte_array_val;  // accessed elsewhere in this file through its ptr / len members
+ };
+};
+
+// Builds a std::string holding a copy of the a.len bytes that start at a.ptr.
+static string ByteArrayToString(const ByteArray& a) {
+  const char* first_byte = reinterpret_cast<const char*>(a.ptr);
+  string text;
+  text.assign(first_byte, a.len);
+  return text;
+}
+
+// Defined further down in this file: reads the Parquet file at `filename`,
+// prints its content and returns the column data it read (NULL on failure).
+void* read_parquet(char* filename);
+
+// Entry point. Expects the path of a Parquet file as the first command-line
+// argument and prints the content of that file.
+int main(int argc, char** argv) {
+  const bool has_file_argument = (argc >= 2);
+  if (!has_file_argument) {
+    cerr << "Usage: parquet_reader <file>" << endl;
+    return -1;
+  }
+
+  char* parquet_path = argv[1];
+  void* columns = read_parquet(parquet_path);
+
+  // Example use of the returned pointer: print the first value of the first
+  // column, assuming that column holds INT32 data.
+  // printf("%-" COL_WIDTH "d\n", ((int32_t**)columns)[0][0]);
+
+  return 0;
+}
+
+
+// Returns the size in bytes of one in-memory value of `type`, or 0 when this
+// example cannot read the type (INT96, FIXED_LEN_BYTE_ARRAY, unknown values).
+static size_t ValueSize(Type::type type) {
+  switch (type) {
+    case Type::BOOLEAN: return sizeof(bool);
+    case Type::INT32: return sizeof(int32_t);
+    case Type::INT64: return sizeof(int64_t);
+    case Type::FLOAT: return sizeof(float);
+    case Type::DOUBLE: return sizeof(double);
+    case Type::BYTE_ARRAY: return sizeof(ByteArray);
+    default: return 0;
+  }
+}
+
+// Returns the label printed in the type row of the table for `type`.
+static const char* TypeName(Type::type type) {
+  switch (type) {
+    case Type::BOOLEAN: return "BOOLEAN";
+    case Type::INT32: return "INT32";
+    case Type::INT64: return "INT64";
+    case Type::FLOAT: return "FLOAT";
+    case Type::DOUBLE: return "DOUBLE";
+    case Type::BYTE_ARRAY: return "BYTE_ARRAY";
+    default: return "UNSUPPORTED";
+  }
+}
+
+// Makes sure `*values` has room for at least `needed` elements of `elem_size`
+// bytes, doubling the capacity as required. Newly added slots are zero-filled
+// so that entries which are never written (nulls) still hold a defined value.
+// Returns false, leaving the buffer untouched, when the allocation fails.
+static bool EnsureCapacity(void** values, size_t* capacity, size_t needed,
+                           size_t elem_size) {
+  if (needed <= *capacity) return true;
+
+  size_t new_capacity = (*capacity == 0) ? INIT_SIZE : *capacity;
+  while (new_capacity < needed) new_capacity *= 2;
+
+  void* grown = realloc(*values, new_capacity * elem_size);
+  if (grown == NULL) return false;
+
+  memset(static_cast<char*>(grown) + *capacity * elem_size, 0,
+         (new_capacity - *capacity) * elem_size);
+  *values = grown;
+  *capacity = new_capacity;
+  return true;
+}
+
+// Releases one column buffer produced by ReadColumnValues, including the
+// private byte copies referenced by the first `count` BYTE_ARRAY entries.
+static void FreeColumnValues(void* values, Type::type type, size_t count) {
+  if (values == NULL) return;
+  if (type == Type::BYTE_ARRAY) {
+    ByteArray* arrays = static_cast<ByteArray*>(values);
+    for (size_t k = 0; k < count; ++k) {
+      free(const_cast<uint8_t*>(arrays[k].ptr));
+    }
+  }
+  free(values);
+}
+
+// Drains `reader` into a freshly allocated, growable buffer of `type` values.
+// On success stores the buffer in `*values_out` and the number of entries
+// (nulls included) in `*count_out` and returns true. Returns false when the
+// type is unsupported or memory runs out; nothing is handed back in that case.
+static bool ReadColumnValues(ColumnReader* reader, Type::type type,
+                             void** values_out, size_t* count_out) {
+  const size_t elem_size = ValueSize(type);
+  if (elem_size == 0) return false;
+
+  void* values = NULL;
+  size_t capacity = 0;
+  size_t count = 0;
+  // Allocate INIT_SIZE elements up front, as before, so that even an empty
+  // column hands a valid buffer back to the caller.
+  if (!EnsureCapacity(&values, &capacity, INIT_SIZE, elem_size)) return false;
+
+  int def_level = 0, rep_level = 0;
+  while (reader->HasNext()) {
+    if (!EnsureCapacity(&values, &capacity, count + 1, elem_size)) {
+      FreeColumnValues(values, type, count);
+      return false;
+    }
+
+    // NOTE(review): `def_level < rep_level` is the null test this example has
+    // always used; comparing against the column's maximum definition level
+    // looks more likely to be correct -- confirm against the ColumnReader API.
+    // A null keeps its zero-filled slot so rows stay aligned across columns.
+    switch (type) {
+      case Type::BOOLEAN: {
+        const bool val = reader->GetBool(&def_level, &rep_level);
+        if (def_level >= rep_level) static_cast<bool*>(values)[count] = val;
+        break;
+      }
+      case Type::INT32: {
+        const int32_t val = reader->GetInt32(&def_level, &rep_level);
+        if (def_level >= rep_level) static_cast<int32_t*>(values)[count] = val;
+        break;
+      }
+      case Type::INT64: {
+        const int64_t val = reader->GetInt64(&def_level, &rep_level);
+        if (def_level >= rep_level) static_cast<int64_t*>(values)[count] = val;
+        break;
+      }
+      case Type::FLOAT: {
+        const float val = reader->GetFloat(&def_level, &rep_level);
+        if (def_level >= rep_level) static_cast<float*>(values)[count] = val;
+        break;
+      }
+      case Type::DOUBLE: {
+        const double val = reader->GetDouble(&def_level, &rep_level);
+        if (def_level >= rep_level) static_cast<double*>(values)[count] = val;
+        break;
+      }
+      case Type::BYTE_ARRAY: {
+        const ByteArray val = reader->GetByteArray(&def_level, &rep_level);
+        if (def_level >= rep_level && val.len > 0) {
+          // A ByteArray is only a pointer/length pair. The bytes it points at
+          // presumably live in the column buffer or inside the reader, both of
+          // which are gone before the table is printed, so keep a private copy.
+          uint8_t* bytes = static_cast<uint8_t*>(malloc(val.len));
+          if (bytes == NULL) {
+            FreeColumnValues(values, type, count);
+            return false;
+          }
+          memcpy(bytes, val.ptr, val.len);
+          ByteArray* slot = static_cast<ByteArray*>(values) + count;
+          slot->ptr = bytes;
+          slot->len = val.len;
+        }
+        break;
+      }
+      default:
+        // Not reachable: unsupported types were rejected above.
+        break;
+    }
+    ++count;
+  }
+
+  *values_out = values;
+  *count_out = count;
+  return true;
+}
+
+// Prints entry `row` of a column buffer as one left-aligned table cell.
+static void PrintValue(const void* values, Type::type type, size_t row) {
+  switch (type) {
+    case Type::BOOLEAN:
+      printf("%-" COL_WIDTH "d",
+             static_cast<int>(static_cast<const bool*>(values)[row]));
+      break;
+    case Type::INT32:
+      printf("%-" COL_WIDTH "d",
+             static_cast<int>(static_cast<const int32_t*>(values)[row]));
+      break;
+    case Type::INT64:
+      // int64_t is not `long` on every platform; print through long long.
+      printf("%-" COL_WIDTH "lld",
+             static_cast<long long>(static_cast<const int64_t*>(values)[row]));
+      break;
+    case Type::FLOAT:
+      printf("%-" COL_WIDTH "f",
+             static_cast<double>(static_cast<const float*>(values)[row]));
+      break;
+    case Type::DOUBLE:
+      printf("%-" COL_WIDTH "f", static_cast<const double*>(values)[row]);
+      break;
+    case Type::BYTE_ARRAY: {
+      const ByteArray& bytes = static_cast<const ByteArray*>(values)[row];
+      const string text =
+          (bytes.ptr == NULL) ? string() : ByteArrayToString(bytes);
+      printf("%-" COL_WIDTH "s", text.c_str());
+      break;
+    }
+    default:
+      printf("%-" COL_WIDTH "s", "");
+      break;
+  }
+}
+
+// Prints one row group as a table: a separator line, the column names, the
+// column types, then one line per row. Columns without data (unsupported type
+// or failed read) and rows past the end of a shorter column are printed as
+// blank cells so the remaining columns stay aligned.
+static void PrintRowGroup(const FileMetaData& metadata, void* const* columns,
+                          const vector<Type::type>& types,
+                          const vector<size_t>& counts) {
+  const size_t num_columns = types.size();
+
+  cout << "=========================================================================\n";
+
+  for (size_t j = 0; j < num_columns; ++j) {
+    // schema[0] is skipped, as everywhere else in this example.
+    const char* name = (j + 1 < metadata.schema.size())
+                           ? metadata.schema[j + 1].name.c_str()
+                           : "";
+    printf("%-" COL_WIDTH "s", name);
+  }
+  cout << "\n";
+
+  for (size_t j = 0; j < num_columns; ++j) {
+    printf("%-" COL_WIDTH "s", TypeName(types[j]));
+  }
+  cout << "\n";
+
+  // Columns may hold different numbers of entries; print as many rows as the
+  // longest column needs.
+  size_t num_rows = 0;
+  for (size_t j = 0; j < num_columns; ++j) {
+    if (columns[j] != NULL && counts[j] > num_rows) num_rows = counts[j];
+  }
+
+  for (size_t k = 0; k < num_rows; ++k) {
+    for (size_t j = 0; j < num_columns; ++j) {
+      if (columns[j] == NULL || k >= counts[j]) {
+        printf("%-" COL_WIDTH "s", "");
+      } else {
+        PrintValue(columns[j], types[j], k);
+      }
+    }
+    cout << "\n";
+  }
+}
+
+// Reads the Parquet file `filename` and prints every row group as a table on
+// standard output.
+//
+// Returns NULL when the metadata cannot be read, the file cannot be opened,
+// the file has no row groups, or memory runs out. Otherwise returns a
+// malloc'ed array with one entry per column of the FIRST row group. Entry c
+// is either NULL (unsupported type or failed read) or a malloc'ed array of
+// that column's values: bool, int32_t, int64_t, float, double or ByteArray,
+// depending on the column type. ByteArray entries point at private malloc'ed
+// copies of their bytes. The caller owns all of this memory.
+//
+// Columns of type INT96 and FIXED_LEN_BYTE_ARRAY are skipped. Column c is
+// matched with metadata.schema[c + 1].
+void* read_parquet(char* filename) {
+  FileMetaData metadata;
+  if (!GetFileMetadata(filename, &metadata)) return NULL;
+
+  // Binary mode: the file content must not go through newline translation.
+  FILE* file = fopen(filename, "rb");
+  if (file == NULL) {
+    cerr << "Could not open file: " << filename << endl;
+    return NULL;
+  }
+
+  void** first_group_columns = NULL;
+
+  for (size_t i = 0; i < metadata.row_groups.size(); ++i) {
+    const RowGroup& row_group = metadata.row_groups[i];
+    const size_t num_columns = row_group.columns.size();
+
+    // malloc'ed (not a vector) because the first row group's array is handed
+    // to the caller as a plain pointer.
+    void** columns = static_cast<void**>(
+        malloc((num_columns > 0 ? num_columns : 1) * sizeof(void*)));
+    if (columns == NULL) {
+      cerr << "Out of memory." << endl;
+      break;
+    }
+    for (size_t c = 0; c < num_columns; ++c) columns[c] = NULL;
+
+    vector<Type::type> types(num_columns);
+    vector<size_t> counts(num_columns, 0);
+
+    for (size_t c = 0; c < num_columns; ++c) {
+      const ColumnChunk& col = row_group.columns[c];
+      // Recorded before any `continue` so printing never sees an unset type.
+      types[c] = col.meta_data.type;
+
+      if (ValueSize(types[c]) == 0) {
+        cout << " Skipping unsupported column" << endl;
+        continue;
+      }
+      if (c + 1 >= metadata.schema.size()) {
+        cerr << "Missing schema element for column." << endl;
+        continue;
+      }
+
+      // The chunk starts at whichever comes first: dictionary or data page.
+      int64_t col_start = col.meta_data.data_page_offset;
+      if (col.meta_data.__isset.dictionary_page_offset &&
+          col.meta_data.dictionary_page_offset < col_start) {
+        col_start = col.meta_data.dictionary_page_offset;
+      }
+
+      const int64_t chunk_size = col.meta_data.total_compressed_size;
+      if (col_start < 0 || chunk_size <= 0 ||
+          fseek(file, static_cast<long>(col_start), SEEK_SET) != 0) {
+        cerr << "Could not read column data." << endl;
+        continue;
+      }
+
+      vector<uint8_t> column_buffer;
+      column_buffer.resize(static_cast<size_t>(chunk_size));
+      const size_t num_read =
+          fread(&column_buffer[0], 1, column_buffer.size(), file);
+      if (num_read != column_buffer.size()) {
+        cerr << "Could not read column data." << endl;
+        continue;
+      }
+
+      InMemoryInputStream input(&column_buffer[0], column_buffer.size());
+      ColumnReader reader(&col.meta_data, &metadata.schema[c + 1], &input);
+
+      if (!ReadColumnValues(&reader, types[c], &columns[c], &counts[c])) {
+        cerr << "Out of memory while reading column." << endl;
+        columns[c] = NULL;
+        counts[c] = 0;
+      }
+    }
+
+    PrintRowGroup(metadata, columns, types, counts);
+
+    if (i == 0) {
+      // The interface can only hand back one row group: keep the first.
+      first_group_columns = columns;
+    } else {
+      for (size_t c = 0; c < num_columns; ++c) {
+        FreeColumnValues(columns[c], types[c], counts[c]);
+      }
+      free(columns);
+    }
+  }
+
+  fclose(file);
+  return first_group_columns;
+}
|