Return-Path: X-Original-To: archive-asf-public-internal@cust-asf2.ponee.io Delivered-To: archive-asf-public-internal@cust-asf2.ponee.io Received: from cust-asf.ponee.io (cust-asf.ponee.io [163.172.22.183]) by cust-asf2.ponee.io (Postfix) with ESMTP id 15A1C200D57 for ; Mon, 11 Dec 2017 08:48:53 +0100 (CET) Received: by cust-asf.ponee.io (Postfix) id 142EE160C10; Mon, 11 Dec 2017 07:48:53 +0000 (UTC) Delivered-To: archive-asf-public@cust-asf.ponee.io Received: from mail.apache.org (hermes.apache.org [140.211.11.3]) by cust-asf.ponee.io (Postfix) with SMTP id 1253D160C22 for ; Mon, 11 Dec 2017 08:48:51 +0100 (CET) Received: (qmail 67279 invoked by uid 500); 11 Dec 2017 07:48:51 -0000 Mailing-List: contact dev-help@orc.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: dev@orc.apache.org Delivered-To: mailing list dev@orc.apache.org Received: (qmail 67166 invoked by uid 99); 11 Dec 2017 07:48:50 -0000 Received: from git1-us-west.apache.org (HELO git1-us-west.apache.org) (140.211.11.23) by apache.org (qpsmtpd/0.29) with ESMTP; Mon, 11 Dec 2017 07:48:50 +0000 Received: by git1-us-west.apache.org (ASF Mail Server at git1-us-west.apache.org, from userid 33) id 14F33DFB6E; Mon, 11 Dec 2017 07:48:50 +0000 (UTC) From: majetideepak To: dev@orc.apache.org Reply-To: dev@orc.apache.org References: In-Reply-To: Subject: [GitHub] orc pull request #199: ORC-276: [C++] Create a simple tool to import CSV fil... Content-Type: text/plain Message-Id: <20171211074850.14F33DFB6E@git1-us-west.apache.org> Date: Mon, 11 Dec 2017 07:48:50 +0000 (UTC) archived-at: Mon, 11 Dec 2017 07:48:53 -0000 Github user majetideepak commented on a diff in the pull request: https://github.com/apache/orc/pull/199#discussion_r155995744 --- Diff: tools/src/CSVFileImport.cc --- @@ -0,0 +1,436 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "orc/Exceptions.hh" +#include "orc/OrcFile.hh" + +#include +#include +#include +#include +#include +#include +#include + +static char gDelimiter = ','; + +std::string extractColumn(std::string s, uint64_t colIndex) { + uint64_t col = 0; + size_t start = 0; + size_t end = s.find(gDelimiter); + while (col < colIndex && end != std::string::npos) { + start = end + 1; + end = s.find(gDelimiter, start); + ++col; + } + return col == colIndex ? s.substr(start, end - start) : ""; +} + +static const char* GetDate(void) +{ + static char buf[200]; + time_t t = time(NULL); + struct tm* p = localtime(&t); + strftime(buf, sizeof(buf), "[%Y-%m-%d %H:%M:%S]", p); + return buf; +} + +void fillLongValues(const std::vector& data, + orc::ColumnVectorBatch* batch, + uint64_t numValues, + uint64_t colIndex) { + orc::LongVectorBatch* longBatch = + dynamic_cast(batch); + bool hasNull = false; + for (uint64_t i = 0; i < numValues; ++i) { + std::string col = extractColumn(data[i], colIndex); + if (col.empty()) { + batch->notNull[i] = 0; + hasNull = true; + } else { + batch->notNull[i] = 1; + longBatch->data[i] = atoll(col.c_str()); + } + } + longBatch->hasNulls = hasNull; + longBatch->numElements = numValues; +} + +void fillStringValues(const std::vector& data, + orc::ColumnVectorBatch* batch, + uint64_t numValues, + uint64_t colIndex, + orc::DataBuffer& buffer, + uint64_t& offset) { + orc::StringVectorBatch* stringBatch = + dynamic_cast(batch); + bool hasNull = false; + for (uint64_t i = 0; i < numValues; ++i) { + std::string col = extractColumn(data[i], colIndex); + if (col.empty()) { + batch->notNull[i] = 0; + hasNull = true; + } else { + batch->notNull[i] = 1; + if (buffer.size() - offset < col.size()) { + buffer.reserve(buffer.size() * 2); + } + memcpy(buffer.data() + offset, + col.c_str(), + col.size()); + stringBatch->data[i] = buffer.data() + offset; + stringBatch->length[i] = static_cast(col.size()); + offset += col.size(); + } + } + stringBatch->hasNulls = hasNull; + stringBatch->numElements = numValues; +} + +void fillDoubleValues(const std::vector& data, + orc::ColumnVectorBatch* batch, + uint64_t numValues, + uint64_t colIndex) { + orc::DoubleVectorBatch* dblBatch = + dynamic_cast(batch); + bool hasNull = false; + for (uint64_t i = 0; i < numValues; ++i) { + std::string col = extractColumn(data[i], colIndex); + if (col.empty()) { + batch->notNull[i] = 0; + hasNull = true; + } else { + batch->notNull[i] = 1; + dblBatch->data[i] = atof(col.c_str()); + } + } + dblBatch->hasNulls = hasNull; + dblBatch->numElements = numValues; +} + +// parse fixed point decimal numbers +void fillDecimalValues(const std::vector& data, + orc::ColumnVectorBatch* batch, + uint64_t numValues, + uint64_t colIndex, + size_t scale, + size_t precision) { + + + orc::Decimal128VectorBatch* d128Batch = ORC_NULLPTR; + orc::Decimal64VectorBatch* d64Batch = ORC_NULLPTR; + if (precision <= 18) { + d64Batch = dynamic_cast(batch); + d64Batch->scale = static_cast(scale); + } else { + d128Batch = dynamic_cast(batch); + d128Batch->scale = static_cast(scale); + } + bool hasNull = false; + for (uint64_t i = 0; i < numValues; ++i) { + std::string col = extractColumn(data[i], colIndex); + if (col.empty()) { + batch->notNull[i] = 0; + hasNull = true; + } else { + batch->notNull[i] = 1; + size_t ptPos = col.find('.'); + size_t curScale = 0; + std::string num = col; + if (ptPos != std::string::npos) { + curScale = col.length() - ptPos - 1; + num = col.substr(0, ptPos) + col.substr(ptPos + 1); + } + orc::Int128 decimal(num); + while (curScale != scale) { + curScale++; + decimal *= 10; + } + if (precision <= 18) { + d64Batch->values[i] = decimal.toLong(); + } else { + d128Batch->values[i] = decimal; + } + } + } + batch->hasNulls = hasNull; + batch->numElements = numValues; +} + +void fillBoolValues(const std::vector& data, + orc::ColumnVectorBatch* batch, + uint64_t numValues, + uint64_t colIndex) { + orc::LongVectorBatch* boolBatch = + dynamic_cast(batch); + bool hasNull = false; + for (uint64_t i = 0; i < numValues; ++i) { + std::string col = extractColumn(data[i], colIndex); + if (col.empty()) { + batch->notNull[i] = 0; + hasNull = true; + } else { + batch->notNull[i] = 1; + std::transform(col.begin(), col.end(), col.begin(), ::tolower); + if (col == "true" || col == "t") { + boolBatch->data[i] = true; + } else { + boolBatch->data[i] = false; + } + } + } + boolBatch->hasNulls = hasNull; + boolBatch->numElements = numValues; +} + +// parse date string from format YYYY-MM-dd +void fillDateValues(const std::vector& data, + orc::ColumnVectorBatch* batch, + uint64_t numValues, + uint64_t colIndex) { + orc::LongVectorBatch* longBatch = + dynamic_cast(batch); + bool hasNull = false; + for (uint64_t i = 0; i < numValues; ++i) { + std::string col = extractColumn(data[i], colIndex); + if (col.empty()) { + batch->notNull[i] = 0; + hasNull = true; + } else { + batch->notNull[i] = 1; + struct tm tm; + memset(&tm, 0, sizeof(struct tm)); + strptime(col.c_str(), "%Y-%m-%d", &tm); + time_t t = mktime(&tm); + time_t t1970 = 0; + double seconds = difftime(t, t1970); + int64_t days = static_cast(seconds / (60*60*24)); + longBatch->data[i] = days; + } + } + longBatch->hasNulls = hasNull; + longBatch->numElements = numValues; +} + +// parse timestamp values in seconds +void fillTimestampValues(const std::vector& data, + orc::ColumnVectorBatch* batch, + uint64_t numValues, + uint64_t colIndex) { + orc::TimestampVectorBatch* tsBatch = + dynamic_cast(batch); + bool hasNull = false; + for (uint64_t i = 0; i < numValues; ++i) { + std::string col = extractColumn(data[i], colIndex); + if (col.empty()) { + batch->notNull[i] = 0; + hasNull = true; + } else { + batch->notNull[i] = 1; + tsBatch->data[i] = atoll(col.c_str()); + tsBatch->nanoseconds[i] = 0; + } + } + tsBatch->hasNulls = hasNull; + tsBatch->numElements = numValues; +} + +void usage() { + std::cout << "Usage: csv-import --schema=" + << " [--delimiter=]\n" + << "Import CSV file into an Orc file using the specified schema.\n" + << "Compound types are not supported at the moment.\n"; +} + +int main(int argc, char* argv[]) { + if (argc < 4) { + std::cout << "Invalid number of arguments." << std::endl; + usage(); + return 1; + } + + std::string input = argv[1]; + std::string output = argv[2]; + std::string schema = argv[3]; + + const std::string SCHEMA_PREFIX = "--schema="; + ORC_UNIQUE_PTR fileType = ORC_NULLPTR; + if (schema.find(SCHEMA_PREFIX) != 0) { + std::cout << "Cannot find " << SCHEMA_PREFIX << " argument." << std::endl; + usage(); + return 1; + } else { + fileType = orc::Type::buildTypeFromString(schema.substr(SCHEMA_PREFIX.size())); + } + + if (argc > 4) { + std::string delimiter = argv[4]; + const std::string DELIMITER_PREFIX = "--delimiter="; + if (delimiter.find(DELIMITER_PREFIX) != 0) { + std::cout << "Cannot find " << DELIMITER_PREFIX << " argument." << std::endl; + usage(); + return 1; + } else { + gDelimiter = delimiter.substr(DELIMITER_PREFIX.size())[0]; + } + } + + std::cout << GetDate() << "Start importing Orc file..." << std::endl; + + double totalElapsedTime = 0.0; + clock_t totalCPUTime = 0; + + orc::DataBuffer buffer(*orc::getDefaultPool()); + buffer.resize(4 * 1024 * 1024); + + // set ORC writer options here + uint64_t stripeSize = (128 << 20); // 128M + uint64_t blockSize = 64 << 10; // 64K + uint64_t batchSize = 1024; + orc::CompressionKind compression = orc::CompressionKind_ZLIB; + + orc::WriterOptions options; + options.setStripeSize(stripeSize); + options.setCompressionBlockSize(blockSize); + options.setCompression(compression); + + ORC_UNIQUE_PTR outStream = orc::writeLocalFile(output); + ORC_UNIQUE_PTR writer = + orc::createWriter(*fileType, outStream.get(), options); + ORC_UNIQUE_PTR rowBatch = + writer->createRowBatch(batchSize); + + bool eof = false; + std::string line; + std::vector data; + std::ifstream finput(input.c_str()); + while (!eof) { --- End diff -- Some code comments will definitely help to extend this in the future. ---