Mailing-List: contact dev-help@quickstep.incubator.apache.org; run by ezmlm
Precedence: bulk
Reply-To: dev@quickstep.incubator.apache.org
From: zuyu <git@git.apache.org>
To: dev@quickstep.incubator.apache.org
Reply-To: dev@quickstep.incubator.apache.org
References: <git-pr-19-incubator-quickstep@git.apache.org>
In-Reply-To: <git-pr-19-incubator-quickstep@git.apache.org>
Subject: [GitHub] incubator-quickstep pull request #19: Improve text scan operator
Content-Type: text/plain
Message-Id: <20160609145821.1011FDFC61@git1-us-west.apache.org>
Date: Thu,  9 Jun 2016 14:58:21 +0000 (UTC)
archived-at: Thu, 09 Jun 2016 14:58:26 -0000

Github user zuyu commented on a diff in the pull request:

    https://github.com/apache/incubator-quickstep/pull/19#discussion_r66457297
  
    --- Diff: relational_operators/TextScanOperator.cpp ---
    @@ -155,116 +63,50 @@ bool TextScanOperator::getAllWorkOrders(
       InsertDestination *output_destination =
           query_context->getInsertDestination(output_destination_index_);
     
    -  if (parallelize_load_) {
    -    // Parallel implementation: Split work orders are generated for each file
    -    // being bulk-loaded. (More than one file can be loaded, because we support
    -    // glob() semantics in file name.) These work orders read the input file,
    -    // and split them in the blobs that can be parsed independently.
    -    if (blocking_dependencies_met_) {
    -      if (!work_generated_) {
    -        // First, generate text-split work orders.
    -        for (const auto &file : files) {
    -          container->addNormalWorkOrder(
    -              new TextSplitWorkOrder(query_id_,
    -                                     file,
    -                                     process_escape_sequences_,
    -                                     storage_manager,
    -                                     op_index_,
    -                                     scheduler_client_id,
    -                                     bus),
    -              op_index_);
    -          ++num_split_work_orders_;
    -        }
    -        work_generated_ = true;
    -        return false;
    -      } else {
    -        // Check if there are blobs to parse.
    -        while (!text_blob_queue_.empty()) {
    -          const TextBlob blob_work = text_blob_queue_.popOne();
    -          container->addNormalWorkOrder(
    -              new TextScanWorkOrder(query_id_,
    -                                    blob_work.blob_id,
    -                                    blob_work.size,
    -                                    field_terminator_,
    -                                    process_escape_sequences_,
    -                                    output_destination,
    -                                    storage_manager),
    -              op_index_);
    -        }
    -        // Done if all split work orders are completed, and no blobs are left to
    -        // process.
    -        return num_done_split_work_orders_.load(std::memory_order_acquire) == num_split_work_orders_ &&
    -               text_blob_queue_.empty();
    -      }
    -    }
    -    return false;
    -  } else {
    -    // Serial implementation.
    -    if (blocking_dependencies_met_ && !work_generated_) {
    -      for (const auto &file : files) {
    +  // Text segment size set to 256KB.
    +  constexpr std::size_t kTextSegmentSize = 0x40000u;
    +
    +  if (blocking_dependencies_met_ && !work_generated_) {
    +    for (const std::string &file : files) {
    +      // Use standard C libary to retrieve the file size.
    +      FILE *fp = std::fopen(file.c_str(), "rb");
    +      std::fseek(fp, 0, SEEK_END);
    +      const std::size_t file_size = std::ftell(fp);
    +      std::fclose(fp);
    +
    +      std::size_t text_offset = 0;
    +      while (text_offset < file_size) {
             container->addNormalWorkOrder(
                 new TextScanWorkOrder(query_id_,
                                       file,
    +                                  text_offset,
    +                                  std::min(kTextSegmentSize, file_size - text_offset),
                                       field_terminator_,
                                       process_escape_sequences_,
                                       output_destination,
                                       storage_manager),
                 op_index_);
    +        text_offset += kTextSegmentSize;
    --- End diff --
    
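    This isn't a bug, since the `text_offset < file_size` loop condition still terminates the loop when the final increment overshoots the end of the file, but I think what we really mean is the following: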
    
    ```
      const size_t text_actual_segment_size = std::min(kTextSegmentSize, file_size - text_offset);
      text_offset += text_actual_segment_size;
    ```


---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and would like it, or if the feature is enabled but not working,
please contact infrastructure at infrastructure@apache.org or file a JIRA
ticket with INFRA.
---