Return-Path: X-Original-To: archive-asf-public-internal@cust-asf2.ponee.io Delivered-To: archive-asf-public-internal@cust-asf2.ponee.io Received: from cust-asf.ponee.io (cust-asf.ponee.io [163.172.22.183]) by cust-asf2.ponee.io (Postfix) with ESMTP id E8B9E200BB6 for ; Fri, 4 Nov 2016 23:12:48 +0100 (CET) Received: by cust-asf.ponee.io (Postfix) id E7679160B07; Fri, 4 Nov 2016 22:12:48 +0000 (UTC) Delivered-To: archive-asf-public@cust-asf.ponee.io Received: from mail.apache.org (hermes.apache.org [140.211.11.3]) by cust-asf.ponee.io (Postfix) with SMTP id 1666E160B04 for ; Fri, 4 Nov 2016 23:12:47 +0100 (CET) Received: (qmail 27713 invoked by uid 500); 4 Nov 2016 22:12:46 -0000 Mailing-List: contact dev-help@drill.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: dev@drill.apache.org Delivered-To: mailing list dev@drill.apache.org Received: (qmail 27494 invoked by uid 99); 4 Nov 2016 22:12:46 -0000 Received: from git1-us-west.apache.org (HELO git1-us-west.apache.org) (140.211.11.23) by apache.org (qpsmtpd/0.29) with ESMTP; Fri, 04 Nov 2016 22:12:46 +0000 Received: by git1-us-west.apache.org (ASF Mail Server at git1-us-west.apache.org, from userid 33) id 489A2E3813; Fri, 4 Nov 2016 22:12:46 +0000 (UTC) From: parthchandra To: dev@drill.apache.org Reply-To: dev@drill.apache.org References: In-Reply-To: Subject: [GitHub] drill pull request #611: Drill-4800: Improve parquet reader performance Content-Type: text/plain Message-Id: <20161104221246.489A2E3813@git1-us-west.apache.org> Date: Fri, 4 Nov 2016 22:12:46 +0000 (UTC) archived-at: Fri, 04 Nov 2016 22:12:49 -0000 Github user parthchandra commented on a diff in the pull request: https://github.com/apache/drill/pull/611#discussion_r86627781 --- Diff: exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/columnreaders/VarLenBinaryReader.java --- @@ -43,43 +51,153 @@ public VarLenBinaryReader(ParquetRecordReader parentReader, List firstColumnStatus) throws 
IOException { long recordsReadInCurrentPass = 0; - int lengthVarFieldsInCurrentRecord; - long totalVariableLengthData = 0; - boolean exitLengthDeterminingLoop = false; + // write the first 0 offset for (VarLengthColumn columnReader : columns) { columnReader.reset(); } + //if(useAsyncTasks){ + // recordsReadInCurrentPass = determineSizesParallel(recordsToReadInThisPass); + //} else { + recordsReadInCurrentPass = determineSizesSerial(recordsToReadInThisPass); + //} + if(useAsyncTasks){ + readRecordsParallel(recordsReadInCurrentPass); + }else{ + readRecordsSerial(recordsReadInCurrentPass); + } + return recordsReadInCurrentPass; + } + + + private long determineSizesSerial(long recordsToReadInThisPass) throws IOException { + int lengthVarFieldsInCurrentRecord = 0; + boolean exitLengthDeterminingLoop = false; + long totalVariableLengthData = 0; + long recordsReadInCurrentPass = 0; do { - lengthVarFieldsInCurrentRecord = 0; for (VarLengthColumn columnReader : columns) { - if ( !exitLengthDeterminingLoop ) { - exitLengthDeterminingLoop = columnReader.determineSize(recordsReadInCurrentPass, lengthVarFieldsInCurrentRecord); + if (!exitLengthDeterminingLoop) { + exitLengthDeterminingLoop = + columnReader.determineSize(recordsReadInCurrentPass, lengthVarFieldsInCurrentRecord); } else { break; } } // check that the next record will fit in the batch - if (exitLengthDeterminingLoop || (recordsReadInCurrentPass + 1) * parentReader.getBitWidthAllFixedFields() + totalVariableLengthData - + lengthVarFieldsInCurrentRecord > parentReader.getBatchSize()) { + if (exitLengthDeterminingLoop || + (recordsReadInCurrentPass + 1) * parentReader.getBitWidthAllFixedFields() + + totalVariableLengthData + lengthVarFieldsInCurrentRecord > parentReader.getBatchSize()) { + break; + } + for (VarLengthColumn columnReader : columns) { + columnReader.updateReadyToReadPosition(); + columnReader.currDefLevel = -1; + } + recordsReadInCurrentPass++; + totalVariableLengthData += 
lengthVarFieldsInCurrentRecord; + } while (recordsReadInCurrentPass < recordsToReadInThisPass); + + return recordsReadInCurrentPass; + } + + + public long determineSizesParallel(long recordsToReadInThisPass ) throws IOException { + boolean doneReading = false; + int lengthVarFieldsInCurrentRecord = 0; + boolean exitLengthDeterminingLoop = false; + long totalVariableLengthData = 0; + long recordsReadInCurrentPass = 0; + + do { + doneReading = readPagesParallel(); + + if (!doneReading) { + lengthVarFieldsInCurrentRecord = 0; + for (VarLengthColumn columnReader : columns) { + doneReading = columnReader.processPageData((int) recordsReadInCurrentPass); + if(doneReading) { + break; + } + lengthVarFieldsInCurrentRecord += columnReader.dataTypeLengthInBits; + doneReading = columnReader.checkVectorCapacityReached(); + if(doneReading) { + break; + } + } + } + + exitLengthDeterminingLoop = doneReading; + + // check that the next record will fit in the batch + if (exitLengthDeterminingLoop || + (recordsReadInCurrentPass + 1) * parentReader.getBitWidthAllFixedFields() + + totalVariableLengthData + lengthVarFieldsInCurrentRecord > parentReader.getBatchSize()) { break; } - for (VarLengthColumn columnReader : columns ) { + for (VarLengthColumn columnReader : columns) { columnReader.updateReadyToReadPosition(); columnReader.currDefLevel = -1; } recordsReadInCurrentPass++; totalVariableLengthData += lengthVarFieldsInCurrentRecord; } while (recordsReadInCurrentPass < recordsToReadInThisPass); + return recordsReadInCurrentPass; + } + + public boolean readPagesParallel() { + + boolean isDone = false; + ArrayList<Future<Boolean>> futures = Lists.newArrayList(); + for (VarLengthColumn columnReader : columns) { + Future<Boolean> f = columnReader.readPageAsync(); + futures.add(f); + } + for (Future<Boolean> f : futures) { + try { + isDone = isDone || f.get().booleanValue(); + } catch (Exception e) { + f.cancel(true); + handleAndRaise(null, e); + } + } + return isDone; + } + + + private void readRecordsSerial(long 
recordsReadInCurrentPass) { for (VarLengthColumn columnReader : columns) { columnReader.readRecords(columnReader.pageReader.valuesReadyToRead); } for (VarLengthColumn columnReader : columns) { - columnReader.valueVec.getMutator().setValueCount((int) recordsReadInCurrentPass); + columnReader.valueVec.getMutator().setValueCount((int)recordsReadInCurrentPass); } - return recordsReadInCurrentPass; + } + + private void readRecordsParallel(long recordsReadInCurrentPass){ + ArrayList> futures = Lists.newArrayList(); + for (VarLengthColumn columnReader : columns) { + Future f = columnReader.readRecordsAsync(columnReader.pageReader.valuesReadyToRead); + futures.add(f); + } + for (Future f : futures) { + try { + f.get(); + } catch (Exception e) { + f.cancel(true); --- End diff -- Done --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastructure@apache.org or file a JIRA ticket with INFRA. ---