From: gunther@apache.org
To: commits@hive.apache.org
Reply-To: hive-dev@hive.apache.org
Subject: svn commit: r1616379 [1/4] - in /hive/branches/cbo: ./ common/src/java/org/apache/hadoop/hive/common/ common/src/java/org/apache/hadoop/hive/conf/ conf/ data/files/ hcatalog/webhcat/java-client/src/main/java/org/apache/hive/hcatalog/api/ hcatalog/webhc...
Date: Thu, 07 Aug 2014 00:21:48 -0000
Message-Id: <20140807002151.8C7052389393@eris.apache.org>

Author: gunther
Date: Thu Aug 7 00:21:45 2014
New Revision: 1616379
URL: http://svn.apache.org/r1616379

Log: Merge latest trunk into cbo branch (Gunther Hagleitner)

Added:
hive/branches/cbo/data/files/parquet_mixed_case - copied unchanged from r1616375, hive/trunk/data/files/parquet_mixed_case hive/branches/cbo/itests/hive-unit-hadoop2/src/test/java/org/apache/hadoop/hive/ql/security/TestStorageBasedMetastoreAuthorizationProviderWithACL.java - copied unchanged from r1616375, hive/trunk/itests/hive-unit-hadoop2/src/test/java/org/apache/hadoop/hive/ql/security/TestStorageBasedMetastoreAuthorizationProviderWithACL.java hive/branches/cbo/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorExtractOperator.java - copied unchanged from r1616375, hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorExtractOperator.java hive/branches/cbo/ql/src/test/queries/clientnegative/char_pad_convert_fail0.q - copied unchanged from r1616375, hive/trunk/ql/src/test/queries/clientnegative/char_pad_convert_fail0.q hive/branches/cbo/ql/src/test/queries/clientnegative/char_pad_convert_fail1.q - copied unchanged from r1616375, hive/trunk/ql/src/test/queries/clientnegative/char_pad_convert_fail1.q hive/branches/cbo/ql/src/test/queries/clientnegative/char_pad_convert_fail2.q - copied unchanged from r1616375, hive/trunk/ql/src/test/queries/clientnegative/char_pad_convert_fail2.q hive/branches/cbo/ql/src/test/queries/clientnegative/char_pad_convert_fail3.q - copied unchanged from r1616375, hive/trunk/ql/src/test/queries/clientnegative/char_pad_convert_fail3.q hive/branches/cbo/ql/src/test/queries/clientpositive/char_pad_convert.q - copied unchanged
from r1616375, hive/trunk/ql/src/test/queries/clientpositive/char_pad_convert.q hive/branches/cbo/ql/src/test/queries/clientpositive/parquet_mixed_case.q - copied unchanged from r1616375, hive/trunk/ql/src/test/queries/clientpositive/parquet_mixed_case.q hive/branches/cbo/ql/src/test/results/clientnegative/char_pad_convert_fail0.q.out - copied unchanged from r1616375, hive/trunk/ql/src/test/results/clientnegative/char_pad_convert_fail0.q.out hive/branches/cbo/ql/src/test/results/clientnegative/char_pad_convert_fail1.q.out - copied unchanged from r1616375, hive/trunk/ql/src/test/results/clientnegative/char_pad_convert_fail1.q.out hive/branches/cbo/ql/src/test/results/clientnegative/char_pad_convert_fail2.q.out - copied unchanged from r1616375, hive/trunk/ql/src/test/results/clientnegative/char_pad_convert_fail2.q.out hive/branches/cbo/ql/src/test/results/clientnegative/char_pad_convert_fail3.q.out - copied unchanged from r1616375, hive/trunk/ql/src/test/results/clientnegative/char_pad_convert_fail3.q.out hive/branches/cbo/ql/src/test/results/clientpositive/char_pad_convert.q.out - copied unchanged from r1616375, hive/trunk/ql/src/test/results/clientpositive/char_pad_convert.q.out hive/branches/cbo/ql/src/test/results/clientpositive/parquet_mixed_case.q.out - copied unchanged from r1616375, hive/trunk/ql/src/test/results/clientpositive/parquet_mixed_case.q.out hive/branches/cbo/ql/src/test/results/clientpositive/tez/vector_decimal_aggregate.q.out - copied unchanged from r1616375, hive/trunk/ql/src/test/results/clientpositive/tez/vector_decimal_aggregate.q.out hive/branches/cbo/ql/src/test/results/clientpositive/tez/vector_left_outer_join.q.out - copied unchanged from r1616375, hive/trunk/ql/src/test/results/clientpositive/tez/vector_left_outer_join.q.out hive/branches/cbo/ql/src/test/results/clientpositive/tez/vectorization_12.q.out - copied unchanged from r1616375, hive/trunk/ql/src/test/results/clientpositive/tez/vectorization_12.q.out hive/branches/cbo/ql/src/test/results/clientpositive/tez/vectorization_13.q.out - copied unchanged from r1616375, hive/trunk/ql/src/test/results/clientpositive/tez/vectorization_13.q.out hive/branches/cbo/ql/src/test/results/clientpositive/tez/vectorization_14.q.out - copied unchanged from r1616375, hive/trunk/ql/src/test/results/clientpositive/tez/vectorization_14.q.out hive/branches/cbo/ql/src/test/results/clientpositive/tez/vectorization_9.q.out - copied unchanged from r1616375, hive/trunk/ql/src/test/results/clientpositive/tez/vectorization_9.q.out hive/branches/cbo/ql/src/test/results/clientpositive/tez/vectorization_part_project.q.out - copied unchanged from r1616375, hive/trunk/ql/src/test/results/clientpositive/tez/vectorization_part_project.q.out hive/branches/cbo/ql/src/test/results/clientpositive/tez/vectorization_short_regress.q.out - copied unchanged from r1616375, hive/trunk/ql/src/test/results/clientpositive/tez/vectorization_short_regress.q.out hive/branches/cbo/ql/src/test/results/clientpositive/tez/vectorized_mapjoin.q.out - copied unchanged from r1616375, hive/trunk/ql/src/test/results/clientpositive/tez/vectorized_mapjoin.q.out hive/branches/cbo/ql/src/test/results/clientpositive/tez/vectorized_nested_mapjoin.q.out - copied unchanged from r1616375, hive/trunk/ql/src/test/results/clientpositive/tez/vectorized_nested_mapjoin.q.out hive/branches/cbo/ql/src/test/results/clientpositive/tez/vectorized_shufflejoin.q.out - copied unchanged from r1616375, hive/trunk/ql/src/test/results/clientpositive/tez/vectorized_shufflejoin.q.out 
hive/branches/cbo/ql/src/test/results/clientpositive/tez/vectorized_timestamp_funcs.q.out - copied unchanged from r1616375, hive/trunk/ql/src/test/results/clientpositive/tez/vectorized_timestamp_funcs.q.out hive/branches/cbo/shims/common/src/main/java/org/apache/hadoop/fs/DefaultFileAccess.java - copied unchanged from r1616375, hive/trunk/shims/common/src/main/java/org/apache/hadoop/fs/DefaultFileAccess.java Modified: hive/branches/cbo/ (props changed) hive/branches/cbo/common/src/java/org/apache/hadoop/hive/common/FileUtils.java hive/branches/cbo/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java hive/branches/cbo/conf/hive-default.xml.template hive/branches/cbo/data/files/dept.txt hive/branches/cbo/data/files/emp.txt hive/branches/cbo/data/files/loc.txt hive/branches/cbo/hcatalog/webhcat/java-client/src/main/java/org/apache/hive/hcatalog/api/HCatCreateTableDesc.java hive/branches/cbo/hcatalog/webhcat/java-client/src/test/java/org/apache/hive/hcatalog/api/TestHCatClient.java hive/branches/cbo/itests/hive-unit/src/test/java/org/apache/hadoop/hive/ql/security/TestMetastoreAuthorizationProvider.java hive/branches/cbo/itests/hive-unit/src/test/java/org/apache/hadoop/hive/ql/security/TestStorageBasedMetastoreAuthorizationProvider.java hive/branches/cbo/itests/qtest/testconfiguration.properties hive/branches/cbo/ql/src/java/org/apache/hadoop/hive/ql/exec/CommonJoinOperator.java hive/branches/cbo/ql/src/java/org/apache/hadoop/hive/ql/exec/OperatorFactory.java hive/branches/cbo/ql/src/java/org/apache/hadoop/hive/ql/exec/ReduceSinkOperator.java hive/branches/cbo/ql/src/java/org/apache/hadoop/hive/ql/exec/mr/ExecMapper.java hive/branches/cbo/ql/src/java/org/apache/hadoop/hive/ql/exec/mr/ExecReducer.java hive/branches/cbo/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/MapRecordProcessor.java hive/branches/cbo/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/ReduceRecordProcessor.java hive/branches/cbo/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/read/DataWritableReadSupport.java hive/branches/cbo/ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java hive/branches/cbo/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/AnnotateWithStatistics.java hive/branches/cbo/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java hive/branches/cbo/ql/src/java/org/apache/hadoop/hive/ql/parse/ColumnStatsSemanticAnalyzer.java hive/branches/cbo/ql/src/java/org/apache/hadoop/hive/ql/plan/BaseWork.java hive/branches/cbo/ql/src/java/org/apache/hadoop/hive/ql/plan/MapWork.java hive/branches/cbo/ql/src/java/org/apache/hadoop/hive/ql/plan/ReduceWork.java hive/branches/cbo/ql/src/java/org/apache/hadoop/hive/ql/security/authorization/StorageBasedAuthorizationProvider.java hive/branches/cbo/ql/src/java/org/apache/hadoop/hive/ql/security/authorization/plugin/sqlstd/SQLAuthorizationUtils.java hive/branches/cbo/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java hive/branches/cbo/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFBasePad.java hive/branches/cbo/ql/src/test/org/apache/hadoop/hive/ql/optimizer/physical/TestVectorizer.java hive/branches/cbo/ql/src/test/queries/clientpositive/annotate_stats_filter.q hive/branches/cbo/ql/src/test/queries/clientpositive/annotate_stats_groupby.q hive/branches/cbo/ql/src/test/queries/clientpositive/annotate_stats_join.q hive/branches/cbo/ql/src/test/queries/clientpositive/annotate_stats_limit.q hive/branches/cbo/ql/src/test/queries/clientpositive/annotate_stats_part.q 
hive/branches/cbo/ql/src/test/queries/clientpositive/annotate_stats_select.q hive/branches/cbo/ql/src/test/queries/clientpositive/annotate_stats_table.q hive/branches/cbo/ql/src/test/queries/clientpositive/annotate_stats_union.q hive/branches/cbo/ql/src/test/queries/clientpositive/columnstats_partlvl.q hive/branches/cbo/ql/src/test/queries/clientpositive/parquet_columnar.q hive/branches/cbo/ql/src/test/queries/clientpositive/vectorization_14.q hive/branches/cbo/ql/src/test/queries/clientpositive/vectorization_15.q hive/branches/cbo/ql/src/test/queries/clientpositive/vectorization_16.q hive/branches/cbo/ql/src/test/queries/clientpositive/vectorization_9.q hive/branches/cbo/ql/src/test/results/clientpositive/annotate_stats_filter.q.out hive/branches/cbo/ql/src/test/results/clientpositive/annotate_stats_groupby.q.out hive/branches/cbo/ql/src/test/results/clientpositive/annotate_stats_join.q.out hive/branches/cbo/ql/src/test/results/clientpositive/annotate_stats_limit.q.out hive/branches/cbo/ql/src/test/results/clientpositive/annotate_stats_part.q.out hive/branches/cbo/ql/src/test/results/clientpositive/annotate_stats_select.q.out hive/branches/cbo/ql/src/test/results/clientpositive/annotate_stats_table.q.out hive/branches/cbo/ql/src/test/results/clientpositive/annotate_stats_union.q.out hive/branches/cbo/ql/src/test/results/clientpositive/columnstats_partlvl.q.out hive/branches/cbo/ql/src/test/results/clientpositive/combine2.q.out hive/branches/cbo/ql/src/test/results/clientpositive/groupby_sort_11.q.out hive/branches/cbo/ql/src/test/results/clientpositive/input24.q.out hive/branches/cbo/ql/src/test/results/clientpositive/input25.q.out hive/branches/cbo/ql/src/test/results/clientpositive/metadataonly1.q.out hive/branches/cbo/ql/src/test/results/clientpositive/nullgroup3.q.out hive/branches/cbo/ql/src/test/results/clientpositive/parquet_columnar.q.out hive/branches/cbo/ql/src/test/results/clientpositive/tez/metadataonly1.q.out hive/branches/cbo/ql/src/test/results/clientpositive/tez/union5.q.out hive/branches/cbo/ql/src/test/results/clientpositive/tez/union7.q.out hive/branches/cbo/ql/src/test/results/clientpositive/tez/vectorization_15.q.out hive/branches/cbo/ql/src/test/results/clientpositive/udf_explode.q.out hive/branches/cbo/ql/src/test/results/clientpositive/udtf_explode.q.out hive/branches/cbo/ql/src/test/results/clientpositive/union11.q.out hive/branches/cbo/ql/src/test/results/clientpositive/union14.q.out hive/branches/cbo/ql/src/test/results/clientpositive/union15.q.out hive/branches/cbo/ql/src/test/results/clientpositive/union17.q.out hive/branches/cbo/ql/src/test/results/clientpositive/union19.q.out hive/branches/cbo/ql/src/test/results/clientpositive/union20.q.out hive/branches/cbo/ql/src/test/results/clientpositive/union21.q.out hive/branches/cbo/ql/src/test/results/clientpositive/union5.q.out hive/branches/cbo/ql/src/test/results/clientpositive/union7.q.out hive/branches/cbo/ql/src/test/results/clientpositive/vectorization_14.q.out hive/branches/cbo/ql/src/test/results/clientpositive/vectorization_15.q.out hive/branches/cbo/ql/src/test/results/clientpositive/vectorization_16.q.out hive/branches/cbo/ql/src/test/results/clientpositive/vectorization_9.q.out hive/branches/cbo/service/src/java/org/apache/hive/service/cli/thrift/ThriftCLIService.java hive/branches/cbo/shims/0.20/src/main/java/org/apache/hadoop/hive/shims/Hadoop20Shims.java hive/branches/cbo/shims/0.23/src/main/java/org/apache/hadoop/hive/shims/Hadoop23Shims.java 
hive/branches/cbo/shims/common-secure/src/main/java/org/apache/hadoop/hive/shims/HadoopShimsSecure.java hive/branches/cbo/shims/common/src/main/java/org/apache/hadoop/hive/shims/HadoopShims.java Propchange: hive/branches/cbo/ ------------------------------------------------------------------------------ Merged /hive/trunk:r1615870-1616375 Modified: hive/branches/cbo/common/src/java/org/apache/hadoop/hive/common/FileUtils.java URL: http://svn.apache.org/viewvc/hive/branches/cbo/common/src/java/org/apache/hadoop/hive/common/FileUtils.java?rev=1616379&r1=1616378&r2=1616379&view=diff ============================================================================== --- hive/branches/cbo/common/src/java/org/apache/hadoop/hive/common/FileUtils.java (original) +++ hive/branches/cbo/common/src/java/org/apache/hadoop/hive/common/FileUtils.java Thu Aug 7 00:21:45 2014 @@ -22,6 +22,8 @@ import java.io.FileNotFoundException; import java.io.IOException; import java.net.URI; import java.net.URISyntaxException; +import java.security.AccessControlException; +import java.security.PrivilegedExceptionAction; import java.util.ArrayList; import java.util.BitSet; import java.util.List; @@ -352,35 +354,47 @@ public final class FileUtils { } /** - * Check if the given FileStatus indicates that the action is allowed for - * userName. It checks the group and other permissions also to determine this. - * - * @param userName - * @param fsStatus - * @param action - * @return true if it is writable for userName - */ - public static boolean isActionPermittedForUser(String userName, FileStatus fsStatus, FsAction action) { - FsPermission permissions = fsStatus.getPermission(); - // check user perm - if (fsStatus.getOwner().equals(userName) - && permissions.getUserAction().implies(action)) { - return true; - } - // check other perm - if (permissions.getOtherAction().implies(action)) { - return true; - } - // check group perm after ensuring user belongs to the file owner group - String fileGroup = fsStatus.getGroup(); - String[] userGroups = UserGroupInformation.createRemoteUser(userName).getGroupNames(); - for (String group : userGroups) { - if (group.equals(fileGroup)) { - // user belongs to the file group - return permissions.getGroupAction().implies(action); + * Perform a check to determine if the user is able to access the file passed in. + * If the user name passed in is different from the current user, this method will + * attempt to impersonate the user to do the check; the current user should be + * able to create proxy users in this case. + * @param fs FileSystem of the path to check + * @param stat FileStatus representing the file + * @param action FsAction that will be checked + * @param user User name of the user that will be checked for access. If the user name + * is null or the same as the current user, no user impersonation will be done + * and the check will be done as the current user. Otherwise the file access + * check will be performed within a doAs() block to use the access privileges + * of this user. In this case the user must be configured to impersonate other + * users, otherwise this check will fail with an error.
+ * @param groups List of groups for the user + * @throws IOException + * @throws AccessControlException + * @throws InterruptedException + * @throws Exception + */ + public static void checkFileAccessWithImpersonation(final FileSystem fs, + final FileStatus stat, final FsAction action, final String user) + throws IOException, AccessControlException, InterruptedException, Exception { + UserGroupInformation ugi = ShimLoader.getHadoopShims().getUGIForConf(fs.getConf()); + String currentUser = ShimLoader.getHadoopShims().getShortUserName(ugi); + + if (user == null || currentUser.equals(user)) { + // No need to impersonate user, do the checks as the currently configured user. + ShimLoader.getHadoopShims().checkFileAccess(fs, stat, action); + return; + } + + // Otherwise, try user impersonation. Current user must be configured to do user impersonation. + UserGroupInformation proxyUser = ShimLoader.getHadoopShims().createProxyUser(user); + ShimLoader.getHadoopShims().doAs(proxyUser, new PrivilegedExceptionAction() { + @Override + public Object run() throws Exception { + FileSystem fsAsUser = FileSystem.get(fs.getUri(), fs.getConf()); + ShimLoader.getHadoopShims().checkFileAccess(fsAsUser, stat, action); + return null; } - } - return false; + }); } /** @@ -395,7 +409,7 @@ public final class FileUtils { * @throws IOException */ public static boolean isActionPermittedForFileHierarchy(FileSystem fs, FileStatus fileStatus, - String userName, FsAction action) throws IOException { + String userName, FsAction action) throws Exception { boolean isDir = fileStatus.isDir(); FsAction dirActionNeeded = action; @@ -403,7 +417,11 @@ public final class FileUtils { // for dirs user needs execute privileges as well dirActionNeeded.and(FsAction.EXECUTE); } - if (!isActionPermittedForUser(userName, fileStatus, dirActionNeeded)) { + + try { + checkFileAccessWithImpersonation(fs, fileStatus, action, userName); + } catch (AccessControlException err) { + // Action not permitted for user return false; } Modified: hive/branches/cbo/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java URL: http://svn.apache.org/viewvc/hive/branches/cbo/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java?rev=1616379&r1=1616378&r2=1616379&view=diff ============================================================================== --- hive/branches/cbo/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java (original) +++ hive/branches/cbo/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java Thu Aug 7 00:21:45 2014 @@ -1104,7 +1104,9 @@ public class HiveConf extends Configurat "Whether queries will fail because stats cannot be collected completely accurately. \n" + "If this is set to true, reading/writing from/into a partition may fail because the stats\n" + "could not be computed accurately."), - + HIVE_STATS_COLLECT_PART_LEVEL_STATS("hive.analyze.stmt.collect.partlevel.stats", true, + "analyze table T compute statistics for columns. 
Queries like these should compute partition" + + "level stats for partitioned table even when no part spec is specified."), HIVE_STATS_GATHER_NUM_THREADS("hive.stats.gather.num.threads", 10, "Number of threads used by partialscan/noscan analyze command for partitioned tables.\n" + "This is applicable only for file formats that implement StatsProvidingRecordReader (like ORC)."), Modified: hive/branches/cbo/conf/hive-default.xml.template URL: http://svn.apache.org/viewvc/hive/branches/cbo/conf/hive-default.xml.template?rev=1616379&r1=1616378&r2=1616379&view=diff ============================================================================== --- hive/branches/cbo/conf/hive-default.xml.template (original) +++ hive/branches/cbo/conf/hive-default.xml.template Thu Aug 7 00:21:45 2014 @@ -1957,6 +1957,11 @@ + hive.analyze.stmt.collect.partlevel.stats + true + analyze table T compute statistics for columns. Queries like these should compute partitionlevel stats for partitioned table even when no part spec is specified. + + hive.stats.gather.num.threads 10 Modified: hive/branches/cbo/data/files/dept.txt URL: http://svn.apache.org/viewvc/hive/branches/cbo/data/files/dept.txt?rev=1616379&r1=1616378&r2=1616379&view=diff ============================================================================== --- hive/branches/cbo/data/files/dept.txt (original) +++ hive/branches/cbo/data/files/dept.txt Thu Aug 7 00:21:45 2014 @@ -2,3 +2,5 @@ 33|engineering 34|clerical 35|marketing +36|transport +37|hr Modified: hive/branches/cbo/data/files/emp.txt URL: http://svn.apache.org/viewvc/hive/branches/cbo/data/files/emp.txt?rev=1616379&r1=1616378&r2=1616379&view=diff ============================================================================== --- hive/branches/cbo/data/files/emp.txt (original) +++ hive/branches/cbo/data/files/emp.txt Thu Aug 7 00:21:45 2014 @@ -1,6 +1,48 @@ -Rafferty|31 -Jones|33 -Steinberg|33 -Robinson|34 -Smith|34 -John| +Rafferty|31|1 +Jones|33|2 +Steinberg|33|3 +Robinson|34|4 +Smith|34|5 +John|31|6 +Rafferty|31|1 +Jones|33|2 +Steinberg|33|3 +Robinson|34|4 +Smith|34|5 +John|31|6 +Rafferty|31|1 +Jones|33|2 +Steinberg|33|3 +Robinson|34|4 +Smith|34|5 +John|31|6 +Rafferty|31|1 +Jones|33|2 +Steinberg|33|3 +Robinson|34|4 +Smith|34|5 +John|31|6 +Rafferty|31|1 +Jones|33|2 +Steinberg|33|3 +Robinson|34|4 +Smith|34|5 +John|31|6 +Rafferty|31|1 +Jones|33|2 +Steinberg|33|3 +Robinson|34|4 +Smith|34|5 +John|31|6 +Rafferty|31|1 +Jones|33|2 +Steinberg|33|3 +Robinson|34|4 +Smith|34|5 +John|31|6 +Rafferty|31|1 +Jones|33|2 +Steinberg|33|3 +Robinson|34|4 +Smith|34|5 +John|31|6 Modified: hive/branches/cbo/data/files/loc.txt URL: http://svn.apache.org/viewvc/hive/branches/cbo/data/files/loc.txt?rev=1616379&r1=1616378&r2=1616379&view=diff ============================================================================== --- hive/branches/cbo/data/files/loc.txt (original) +++ hive/branches/cbo/data/files/loc.txt Thu Aug 7 00:21:45 2014 @@ -1,8 +1,8 @@ -OH|31|43201|2001 -IO|32|43202|2001 -CA|35|43809|2001 -FL|33|54342|2001 -UT|35||2001 -CA|35|43809|2001 -|34|40000| -FL|33|54342|2001 +OH|1|43201|2001 +IO|2|43202|2001 +CA|5|43809|2001 +FL|3|54342|2001 +UT|5||2001 +CA|5|43809|2001 +|4|40000| +FL|6|54342|2001 Modified: hive/branches/cbo/hcatalog/webhcat/java-client/src/main/java/org/apache/hive/hcatalog/api/HCatCreateTableDesc.java URL: http://svn.apache.org/viewvc/hive/branches/cbo/hcatalog/webhcat/java-client/src/main/java/org/apache/hive/hcatalog/api/HCatCreateTableDesc.java?rev=1616379&r1=1616378&r2=1616379&view=diff 
============================================================================== --- hive/branches/cbo/hcatalog/webhcat/java-client/src/main/java/org/apache/hive/hcatalog/api/HCatCreateTableDesc.java (original) +++ hive/branches/cbo/hcatalog/webhcat/java-client/src/main/java/org/apache/hive/hcatalog/api/HCatCreateTableDesc.java Thu Aug 7 00:21:45 2014 @@ -32,7 +32,11 @@ import org.apache.hadoop.hive.metastore. import org.apache.hadoop.hive.metastore.api.SerDeInfo; import org.apache.hadoop.hive.metastore.api.StorageDescriptor; import org.apache.hadoop.hive.metastore.api.Table; -import org.apache.hadoop.hive.ql.io.IgnoreKeyTextOutputFormat; +import org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat; +import org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat; +import org.apache.hadoop.hive.ql.io.orc.OrcInputFormat; +import org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat; +import org.apache.hadoop.hive.ql.io.orc.OrcSerde; import org.apache.hadoop.hive.ql.io.RCFileInputFormat; import org.apache.hadoop.hive.ql.io.RCFileOutputFormat; import org.apache.hadoop.hive.ql.metadata.HiveException; @@ -41,7 +45,6 @@ import org.apache.hadoop.hive.ql.metadat import org.apache.hadoop.hive.serde.serdeConstants; import org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe; import org.apache.hadoop.mapred.SequenceFileInputFormat; -import org.apache.hadoop.mapred.SequenceFileOutputFormat; import org.apache.hadoop.mapred.TextInputFormat; import org.apache.hive.hcatalog.common.HCatException; import org.apache.hive.hcatalog.data.schema.HCatFieldSchema; @@ -568,12 +571,16 @@ public class HCatCreateTableDesc { desc.fileFormat = fileFormat; if ("SequenceFile".equalsIgnoreCase(fileFormat)) { desc.inputformat = SequenceFileInputFormat.class.getName(); - desc.outputformat = SequenceFileOutputFormat.class + desc.outputformat = HiveSequenceFileOutputFormat.class .getName(); } else if ("RCFile".equalsIgnoreCase(fileFormat)) { desc.inputformat = RCFileInputFormat.class.getName(); desc.outputformat = RCFileOutputFormat.class.getName(); desc.serde = ColumnarSerDe.class.getName(); + } else if ("orcfile".equalsIgnoreCase(fileFormat)) { + desc.inputformat = OrcInputFormat.class.getName(); + desc.outputformat = OrcOutputFormat.class.getName(); + desc.serde = OrcSerde.class.getName(); } desc.storageHandler = StringUtils.EMPTY; } else if (!StringUtils.isEmpty(storageHandler)) { @@ -583,7 +590,7 @@ public class HCatCreateTableDesc { LOG.info("Using text file format for the table."); desc.inputformat = TextInputFormat.class.getName(); LOG.info("Table input format:" + desc.inputformat); - desc.outputformat = IgnoreKeyTextOutputFormat.class + desc.outputformat = HiveIgnoreKeyTextOutputFormat.class .getName(); LOG.info("Table output format:" + desc.outputformat); } Modified: hive/branches/cbo/hcatalog/webhcat/java-client/src/test/java/org/apache/hive/hcatalog/api/TestHCatClient.java URL: http://svn.apache.org/viewvc/hive/branches/cbo/hcatalog/webhcat/java-client/src/test/java/org/apache/hive/hcatalog/api/TestHCatClient.java?rev=1616379&r1=1616378&r2=1616379&view=diff ============================================================================== --- hive/branches/cbo/hcatalog/webhcat/java-client/src/test/java/org/apache/hive/hcatalog/api/TestHCatClient.java (original) +++ hive/branches/cbo/hcatalog/webhcat/java-client/src/test/java/org/apache/hive/hcatalog/api/TestHCatClient.java Thu Aug 7 00:21:45 2014 @@ -30,9 +30,12 @@ import org.apache.hadoop.conf.Configurat import org.apache.hadoop.hive.conf.HiveConf; import 
org.apache.hadoop.hive.metastore.HiveMetaStore; import org.apache.hadoop.hive.metastore.api.PartitionEventType; -import org.apache.hadoop.hive.ql.io.IgnoreKeyTextOutputFormat; +import org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat; import org.apache.hadoop.hive.ql.io.RCFileInputFormat; import org.apache.hadoop.hive.ql.io.RCFileOutputFormat; +import org.apache.hadoop.hive.ql.io.orc.OrcInputFormat; +import org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat; +import org.apache.hadoop.hive.ql.io.orc.OrcSerde; import org.apache.hadoop.hive.ql.metadata.Table; import org.apache.hadoop.hive.serde.serdeConstants; import org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe; @@ -119,6 +122,7 @@ public class TestHCatClient { String db = "testdb"; String tableOne = "testTable1"; String tableTwo = "testTable2"; + String tableThree = "testTable3"; HCatClient client = HCatClient.create(new Configuration(hcatConf)); client.dropDatabase(db, true, HCatClient.DropDBMode.CASCADE); @@ -170,7 +174,7 @@ public class TestHCatClient { assertTrue(table2.getInputFileFormat().equalsIgnoreCase( TextInputFormat.class.getName())); assertTrue(table2.getOutputFileFormat().equalsIgnoreCase( - IgnoreKeyTextOutputFormat.class.getName())); + HiveIgnoreKeyTextOutputFormat.class.getName())); assertTrue("SerdeParams not found", table2.getSerdeParams() != null); assertEquals("checking " + serdeConstants.FIELD_DELIM, Character.toString('\001'), table2.getSerdeParams().get(serdeConstants.FIELD_DELIM)); @@ -186,6 +190,19 @@ public class TestHCatClient { table2.getSerdeParams().get(serdeConstants.SERIALIZATION_NULL_FORMAT)); assertEquals((expectedDir + "/" + db + ".db/" + tableTwo).toLowerCase(), table2.getLocation().toLowerCase()); + + HCatCreateTableDesc tableDesc3 = HCatCreateTableDesc.create(db, + tableThree, cols).fileFormat("orcfile").build(); + client.createTable(tableDesc3); + HCatTable table3 = client.getTable(db, tableThree); + assertTrue(table3.getInputFileFormat().equalsIgnoreCase( + OrcInputFormat.class.getName())); + assertTrue(table3.getOutputFileFormat().equalsIgnoreCase( + OrcOutputFormat.class.getName())); + assertTrue(table3.getSerdeLib().equalsIgnoreCase( + OrcSerde.class.getName())); + assertTrue(table1.getCols().equals(cols)); + client.close(); } Modified: hive/branches/cbo/itests/hive-unit/src/test/java/org/apache/hadoop/hive/ql/security/TestMetastoreAuthorizationProvider.java URL: http://svn.apache.org/viewvc/hive/branches/cbo/itests/hive-unit/src/test/java/org/apache/hadoop/hive/ql/security/TestMetastoreAuthorizationProvider.java?rev=1616379&r1=1616378&r2=1616379&view=diff ============================================================================== --- hive/branches/cbo/itests/hive-unit/src/test/java/org/apache/hadoop/hive/ql/security/TestMetastoreAuthorizationProvider.java (original) +++ hive/branches/cbo/itests/hive-unit/src/test/java/org/apache/hadoop/hive/ql/security/TestMetastoreAuthorizationProvider.java Thu Aug 7 00:21:45 2014 @@ -72,6 +72,9 @@ public class TestMetastoreAuthorizationP return DefaultHiveMetastoreAuthorizationProvider.class.getName(); } + protected HiveConf createHiveConf() throws Exception { + return new HiveConf(this.getClass()); + } @Override protected void setUp() throws Exception { @@ -92,7 +95,7 @@ public class TestMetastoreAuthorizationP MetaStoreUtils.startMetaStore(port, ShimLoader.getHadoopThriftAuthBridge()); - clientHiveConf = new HiveConf(this.getClass()); + clientHiveConf = createHiveConf(); // Turn off client-side authorization 
clientHiveConf.setBoolVar(HiveConf.ConfVars.HIVE_AUTHORIZATION_ENABLED,false); @@ -134,10 +137,23 @@ public class TestMetastoreAuthorizationP return "smp_ms_tbl"; } + protected boolean isTestEnabled() { + return true; + } + + protected String setupUser() { + return ugi.getUserName(); + } + public void testSimplePrivileges() throws Exception { + if (!isTestEnabled()) { + System.out.println("Skipping test " + this.getClass().getName()); + return; + } + String dbName = getTestDbName(); String tblName = getTestTableName(); - String userName = ugi.getUserName(); + String userName = setupUser(); allowCreateDatabase(userName); Modified: hive/branches/cbo/itests/hive-unit/src/test/java/org/apache/hadoop/hive/ql/security/TestStorageBasedMetastoreAuthorizationProvider.java URL: http://svn.apache.org/viewvc/hive/branches/cbo/itests/hive-unit/src/test/java/org/apache/hadoop/hive/ql/security/TestStorageBasedMetastoreAuthorizationProvider.java?rev=1616379&r1=1616378&r2=1616379&view=diff ============================================================================== --- hive/branches/cbo/itests/hive-unit/src/test/java/org/apache/hadoop/hive/ql/security/TestStorageBasedMetastoreAuthorizationProvider.java (original) +++ hive/branches/cbo/itests/hive-unit/src/test/java/org/apache/hadoop/hive/ql/security/TestStorageBasedMetastoreAuthorizationProvider.java Thu Aug 7 00:21:45 2014 @@ -19,6 +19,7 @@ package org.apache.hadoop.hive.ql.security; import java.net.URI; +import java.security.AccessControlException; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; @@ -81,7 +82,7 @@ public class TestStorageBasedMetastoreAu setPermissions(location,"-rwxr--r--"); } - private void setPermissions(String locn, String permissions) throws Exception { + protected void setPermissions(String locn, String permissions) throws Exception { FileSystem fs = FileSystem.get(new URI(locn), clientHiveConf); fs.setPermission(new Path(locn), FsPermission.valueOf(permissions)); } @@ -89,7 +90,7 @@ public class TestStorageBasedMetastoreAu @Override protected void assertNoPrivileges(MetaException me){ assertNotNull(me); - assertTrue(me.getMessage().indexOf("not permitted") != -1); + assertTrue(me.getMessage().indexOf("AccessControlException") != -1); } @Override Modified: hive/branches/cbo/itests/qtest/testconfiguration.properties URL: http://svn.apache.org/viewvc/hive/branches/cbo/itests/qtest/testconfiguration.properties?rev=1616379&r1=1616378&r2=1616379&view=diff ============================================================================== --- hive/branches/cbo/itests/qtest/testconfiguration.properties (original) +++ hive/branches/cbo/itests/qtest/testconfiguration.properties Thu Aug 7 00:21:45 2014 @@ -1,5 +1,5 @@ 
minimr.query.files=stats_counter_partitioned.q,list_bucket_dml_10.q,input16_cc.q,scriptfile1.q,scriptfile1_win.q,bucket4.q,bucketmapjoin6.q,disable_merge_for_bucketing.q,reduce_deduplicate.q,smb_mapjoin_8.q,join1.q,groupby2.q,bucketizedhiveinputformat.q,bucketmapjoin7.q,optrstat_groupby.q,bucket_num_reducers.q,bucket5.q,load_fs2.q,bucket_num_reducers2.q,infer_bucket_sort_merge.q,infer_bucket_sort_reducers_power_two.q,infer_bucket_sort_dyn_part.q,infer_bucket_sort_bucketed_table.q,infer_bucket_sort_map_operators.q,infer_bucket_sort_num_buckets.q,leftsemijoin_mr.q,schemeAuthority.q,schemeAuthority2.q,truncate_column_buckets.q,remote_script.q,,load_hdfs_file_with_space_in_the_name.q,parallel_orderby.q,import_exported_table.q,stats_counter.q,auto_sortmerge_join_16.q,quotedid_smb.q,file_with_header_footer.q,external_table_with_space_in_location_path.q,root_dir_external_table.q,index_bitmap3.q,ql_rewrite_gbtoidx.q,index_bitmap_auto.q,udf_using.q,empty_dir_in_table.q,temp_table_external.q minimr.query.negative.files=cluster_tasklog_retrieval.q,minimr_broken_pipe.q,mapreduce_stack_trace.q,mapreduce_stack_trace_turnoff.q,mapreduce_stack_trace_hadoop20.q,mapreduce_stack_trace_turnoff_hadoop20.q,file_with_header_footer_negative.q,udf_local_resource.q minitez.query.files=tez_fsstat.q,mapjoin_decimal.q,tez_join_tests.q,tez_joins_explain.q,mrr.q,tez_dml.q,tez_insert_overwrite_local_directory_1.q,tez_union.q,bucket_map_join_tez1.q,bucket_map_join_tez2.q,tez_schema_evolution.q,tez_join_hash.q -minitez.query.files.shared=orc_merge1.q,orc_merge2.q,orc_merge3.q,orc_merge4.q,alter_merge_orc.q,alter_merge_2_orc.q,alter_merge_stats_orc.q,cross_product_check_1.q,cross_product_check_2.q,dynpart_sort_opt_vectorization.q,dynpart_sort_optimization.q,orc_analyze.q,join0.q,join1.q,auto_join0.q,auto_join1.q,bucket2.q,bucket3.q,bucket4.q,count.q,create_merge_compressed.q,cross_join.q,ctas.q,custom_input_output_format.q,disable_merge_for_bucketing.q,enforce_order.q,filter_join_breaktask.q,filter_join_breaktask2.q,groupby1.q,groupby2.q,groupby3.q,having.q,insert1.q,insert_into1.q,insert_into2.q,leftsemijoin.q,limit_pushdown.q,load_dyn_part1.q,load_dyn_part2.q,load_dyn_part3.q,mapjoin_mapjoin.q,mapreduce1.q,mapreduce2.q,merge1.q,merge2.q,metadata_only_queries.q,sample1.q,subquery_in.q,subquery_exists.q,vectorization_15.q,ptf.q,stats_counter.q,stats_noscan_1.q,stats_counter_partitioned.q,union2.q,union3.q,union4.q,union5.q,union6.q,union7.q,union8.q,union9.q,transform1.q,transform2.q,transf orm_ppr1.q,transform_ppr2.q,script_env_var1.q,script_env_var2.q,script_pipe.q,scriptfile1.q,metadataonly1.q,temp_table.q,vectorized_ptf.q,optimize_nullscan.q,vector_cast_constant.q,vector_string_concat.q,cbo_correctness.q 
+minitez.query.files.shared=orc_merge1.q,orc_merge2.q,orc_merge3.q,orc_merge4.q,alter_merge_orc.q,alter_merge_2_orc.q,alter_merge_stats_orc.q,cross_product_check_1.q,cross_product_check_2.q,dynpart_sort_opt_vectorization.q,dynpart_sort_optimization.q,orc_analyze.q,join0.q,join1.q,auto_join0.q,auto_join1.q,bucket2.q,bucket3.q,bucket4.q,count.q,create_merge_compressed.q,cross_join.q,ctas.q,custom_input_output_format.q,disable_merge_for_bucketing.q,enforce_order.q,filter_join_breaktask.q,filter_join_breaktask2.q,groupby1.q,groupby2.q,groupby3.q,having.q,insert1.q,insert_into1.q,insert_into2.q,leftsemijoin.q,limit_pushdown.q,load_dyn_part1.q,load_dyn_part2.q,load_dyn_part3.q,mapjoin_mapjoin.q,mapreduce1.q,mapreduce2.q,merge1.q,merge2.q,metadata_only_queries.q,sample1.q,subquery_in.q,subquery_exists.q,vectorization_15.q,ptf.q,stats_counter.q,stats_noscan_1.q,stats_counter_partitioned.q,union2.q,union3.q,union4.q,union5.q,union6.q,union7.q,union8.q,union9.q,transform1.q,transform2.q,transf orm_ppr1.q,transform_ppr2.q,script_env_var1.q,script_env_var2.q,script_pipe.q,scriptfile1.q,metadataonly1.q,temp_table.q,vectorized_ptf.q,optimize_nullscan.q,vector_cast_constant.q,vector_string_concat.q,vector_decimal_aggregate.q,vector_left_outer_join.q,vectorization_12.q,vectorization_13.q,vectorization_14.q,vectorization_9.q,vectorization_part_project.q,vectorization_short_regress.q,vectorized_mapjoin.q,vectorized_nested_mapjoin.q,vectorized_shufflejoin.q,vectorized_timestamp_funcs.q,cbo_correctness.q beeline.positive.exclude=add_part_exist.q,alter1.q,alter2.q,alter4.q,alter5.q,alter_rename_partition.q,alter_rename_partition_authorization.q,archive.q,archive_corrupt.q,archive_multi.q,archive_mr_1806.q,archive_multi_mr_1806.q,authorization_1.q,authorization_2.q,authorization_4.q,authorization_5.q,authorization_6.q,authorization_7.q,ba_table1.q,ba_table2.q,ba_table3.q,ba_table_udfs.q,binary_table_bincolserde.q,binary_table_colserde.q,cluster.q,columnarserde_create_shortcut.q,combine2.q,constant_prop.q,create_nested_type.q,create_or_replace_view.q,create_struct_table.q,create_union_table.q,database.q,database_location.q,database_properties.q,ddltime.q,describe_database_json.q,drop_database_removes_partition_dirs.q,escape1.q,escape2.q,exim_00_nonpart_empty.q,exim_01_nonpart.q,exim_02_00_part_empty.q,exim_02_part.q,exim_03_nonpart_over_compat.q,exim_04_all_part.q,exim_04_evolved_parts.q,exim_05_some_part.q,exim_06_one_part.q,exim_07_all_part_over_nonoverlap.q,exim_08_nonpart_rename.q, exim_09_part_spec_nonoverlap.q,exim_10_external_managed.q,exim_11_managed_external.q,exim_12_external_location.q,exim_13_managed_location.q,exim_14_managed_location_over_existing.q,exim_15_external_part.q,exim_16_part_external.q,exim_17_part_managed.q,exim_18_part_external.q,exim_19_00_part_external_location.q,exim_19_part_external_location.q,exim_20_part_managed_location.q,exim_21_export_authsuccess.q,exim_22_import_exist_authsuccess.q,exim_23_import_part_authsuccess.q,exim_24_import_nonexist_authsuccess.q,global_limit.q,groupby_complex_types.q,groupby_complex_types_multi_single_reducer.q,index_auth.q,index_auto.q,index_auto_empty.q,index_bitmap.q,index_bitmap1.q,index_bitmap2.q,index_bitmap3.q,index_bitmap_auto.q,index_bitmap_rc.q,index_compact.q,index_compact_1.q,index_compact_2.q,index_compact_3.q,index_stale_partitioned.q,init_file.q,input16.q,input16_cc.q,input46.q,input_columnarserde.q,input_dynamicserde.q,input_lazyserde.q,input_testxpath3.q,input_testxpath4.q,insert2_overwr 
ite_partitions.q,insertexternal1.q,join_thrift.q,lateral_view.q,load_binary_data.q,load_exist_part_authsuccess.q,load_nonpart_authsuccess.q,load_part_authsuccess.q,loadpart_err.q,lock1.q,lock2.q,lock3.q,lock4.q,merge_dynamic_partition.q,multi_insert.q,multi_insert_move_tasks_share_dependencies.q,null_column.q,ppd_clusterby.q,query_with_semi.q,rename_column.q,sample6.q,sample_islocalmode_hook.q,set_processor_namespaces.q,show_tables.q,source.q,split_sample.q,str_to_map.q,transform1.q,udaf_collect_set.q,udaf_context_ngrams.q,udaf_histogram_numeric.q,udaf_ngrams.q,udaf_percentile_approx.q,udf_array.q,udf_bitmap_and.q,udf_bitmap_or.q,udf_explode.q,udf_format_number.q,udf_map.q,udf_map_keys.q,udf_map_values.q,udf_max.q,udf_min.q,udf_named_struct.q,udf_percentile.q,udf_printf.q,udf_sentences.q,udf_sort_array.q,udf_split.q,udf_struct.q,udf_substr.q,udf_translate.q,udf_union.q,udf_xpath.q,udtf_stack.q,view.q,virtual_column.q Modified: hive/branches/cbo/ql/src/java/org/apache/hadoop/hive/ql/exec/CommonJoinOperator.java URL: http://svn.apache.org/viewvc/hive/branches/cbo/ql/src/java/org/apache/hadoop/hive/ql/exec/CommonJoinOperator.java?rev=1616379&r1=1616378&r2=1616379&view=diff ============================================================================== --- hive/branches/cbo/ql/src/java/org/apache/hadoop/hive/ql/exec/CommonJoinOperator.java (original) +++ hive/branches/cbo/ql/src/java/org/apache/hadoop/hive/ql/exec/CommonJoinOperator.java Thu Aug 7 00:21:45 2014 @@ -705,13 +705,9 @@ public abstract class CommonJoinOperator } if (allOne) { - LOG.info("calling genAllOneUniqueJoinObject"); genAllOneUniqueJoinObject(); - LOG.info("called genAllOneUniqueJoinObject"); } else { - LOG.trace("calling genUniqueJoinObject"); genUniqueJoinObject(0, 0); - LOG.trace("called genUniqueJoinObject"); } } else { // does any result need to be emitted @@ -753,17 +749,11 @@ public abstract class CommonJoinOperator } if (!hasEmpty && !mayHasMoreThanOne) { - LOG.trace("calling genAllOneUniqueJoinObject"); genAllOneUniqueJoinObject(); - LOG.trace("called genAllOneUniqueJoinObject"); } else if (!hasEmpty && !hasLeftSemiJoin) { - LOG.trace("calling genUniqueJoinObject"); genUniqueJoinObject(0, 0); - LOG.trace("called genUniqueJoinObject"); } else { - LOG.trace("calling genObject"); genJoinObject(); - LOG.trace("called genObject"); } } Arrays.fill(aliasFilterTags, (byte)0xff); Modified: hive/branches/cbo/ql/src/java/org/apache/hadoop/hive/ql/exec/OperatorFactory.java URL: http://svn.apache.org/viewvc/hive/branches/cbo/ql/src/java/org/apache/hadoop/hive/ql/exec/OperatorFactory.java?rev=1616379&r1=1616378&r2=1616379&view=diff ============================================================================== --- hive/branches/cbo/ql/src/java/org/apache/hadoop/hive/ql/exec/OperatorFactory.java (original) +++ hive/branches/cbo/ql/src/java/org/apache/hadoop/hive/ql/exec/OperatorFactory.java Thu Aug 7 00:21:45 2014 @@ -22,6 +22,7 @@ import java.util.ArrayList; import java.util.List; import java.util.Map; +import org.apache.hadoop.hive.ql.exec.vector.VectorExtractOperator; import org.apache.hadoop.hive.ql.exec.vector.VectorFileSinkOperator; import org.apache.hadoop.hive.ql.exec.vector.VectorFilterOperator; import org.apache.hadoop.hive.ql.exec.vector.VectorGroupByOperator; @@ -64,23 +65,9 @@ import org.apache.hadoop.hive.ql.plan.Un * */ public final class OperatorFactory { + private static final List opvec; + private static final List vectorOpvec; - /** - * OpTuple. 
- * - * @param - */ - public static final class OpTuple { - public Class descClass; - public Class> opClass; - - public OpTuple(Class descClass, Class> opClass) { - this.descClass = descClass; - this.opClass = opClass; - } - } - - public static ArrayList opvec; static { opvec = new ArrayList(); opvec.add(new OpTuple(FilterDesc.class, FilterOperator.class)); @@ -116,7 +103,6 @@ public final class OperatorFactory { MuxOperator.class)); } - public static ArrayList vectorOpvec; static { vectorOpvec = new ArrayList(); vectorOpvec.add(new OpTuple(SelectDesc.class, VectorSelectOperator.class)); @@ -128,8 +114,20 @@ public final class OperatorFactory { vectorOpvec.add(new OpTuple(FileSinkDesc.class, VectorFileSinkOperator.class)); vectorOpvec.add(new OpTuple(FilterDesc.class, VectorFilterOperator.class)); vectorOpvec.add(new OpTuple(LimitDesc.class, VectorLimitOperator.class)); + vectorOpvec.add(new OpTuple(ExtractDesc.class, VectorExtractOperator.class)); } + private static final class OpTuple { + private final Class descClass; + private final Class> opClass; + + public OpTuple(Class descClass, Class> opClass) { + this.descClass = descClass; + this.opClass = opClass; + } + } + + public static Operator getVectorOperator(T conf, VectorizationContext vContext) throws HiveException { Class descClass = (Class) conf.getClass(); Modified: hive/branches/cbo/ql/src/java/org/apache/hadoop/hive/ql/exec/ReduceSinkOperator.java URL: http://svn.apache.org/viewvc/hive/branches/cbo/ql/src/java/org/apache/hadoop/hive/ql/exec/ReduceSinkOperator.java?rev=1616379&r1=1616378&r2=1616379&view=diff ============================================================================== --- hive/branches/cbo/ql/src/java/org/apache/hadoop/hive/ql/exec/ReduceSinkOperator.java (original) +++ hive/branches/cbo/ql/src/java/org/apache/hadoop/hive/ql/exec/ReduceSinkOperator.java Thu Aug 7 00:21:45 2014 @@ -25,6 +25,8 @@ import java.util.Arrays; import java.util.List; import java.util.Random; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.hive.conf.HiveConf; import org.apache.hadoop.hive.ql.io.HiveKey; @@ -61,9 +63,20 @@ public class ReduceSinkOperator extends PTFUtils.makeTransient(ReduceSinkOperator.class, "inputAliases", "valueIndex"); } + private static final Log LOG = LogFactory.getLog(ReduceSinkOperator.class.getName()); private static final long serialVersionUID = 1L; - protected transient OutputCollector out; + private static final MurmurHash hash = (MurmurHash) MurmurHash.getInstance(); + + private transient ObjectInspector[] partitionObjectInspectors; + private transient ObjectInspector[] bucketObjectInspectors; + private transient int buckColIdxInKey; + private boolean firstRow; + private transient int tag; + private boolean skipTag = false; + private transient InspectableObject tempInspectableObject = new InspectableObject(); + private transient int[] valueIndex; // index for value(+ from keys, - from values) + protected transient OutputCollector out; /** * The evaluators for the key columns. Key columns decide the sort order on * the reducer side. Key columns are passed to the reducer in the "key". @@ -84,38 +97,40 @@ public class ReduceSinkOperator extends * Evaluators for bucketing columns. This is used to compute bucket number. 
*/ protected transient ExprNodeEvaluator[] bucketEval = null; - - // TODO: we use MetadataTypedColumnsetSerDe for now, till DynamicSerDe is - // ready + // TODO: we use MetadataTypedColumnsetSerDe for now, till DynamicSerDe is ready protected transient Serializer keySerializer; protected transient boolean keyIsText; protected transient Serializer valueSerializer; - transient int tag; protected transient byte[] tagByte = new byte[1]; - transient protected int numDistributionKeys; - transient protected int numDistinctExprs; - transient String[] inputAliases; // input aliases of this RS for join (used for PPD) - private boolean skipTag = false; + protected transient int numDistributionKeys; + protected transient int numDistinctExprs; + protected transient String[] inputAliases; // input aliases of this RS for join (used for PPD) protected transient boolean autoParallel = false; - - protected static final MurmurHash hash = (MurmurHash)MurmurHash.getInstance(); - - private transient int[] valueIndex; // index for value(+ from keys, - from values) - - public void setInputAliases(String[] inputAliases) { - this.inputAliases = inputAliases; - } - - public String[] getInputAliases() { - return inputAliases; - } - - public void setOutputCollector(OutputCollector _out) { - this.out = _out; - } - // picks topN K:V pairs from input. protected transient TopNHash reducerHash = new TopNHash(); + protected transient HiveKey keyWritable = new HiveKey(); + protected transient ObjectInspector keyObjectInspector; + protected transient ObjectInspector valueObjectInspector; + protected transient Object[] cachedValues; + protected transient List> distinctColIndices; + protected transient Random random; + /** + * This two dimensional array holds key data and a corresponding Union object + * which contains the tag identifying the aggregate expression for distinct columns. + * + * If there is no distict expression, cachedKeys is simply like this. + * cachedKeys[0] = [col0][col1] + * + * with two distict expression, union(tag:key) is attatched for each distinct expression + * cachedKeys[0] = [col0][col1][0:dist1] + * cachedKeys[1] = [col0][col1][1:dist2] + * + * in this case, child GBY evaluates distict values with expression like KEY.col2:0.dist1 + * see {@link ExprNodeColumnEvaluator} + */ + // TODO: we only ever use one row of these at a time. Why do we need to cache multiple? + protected transient Object[][] cachedKeys; + @Override protected void initializeOp(Configuration hconf) throws HiveException { try { @@ -184,40 +199,12 @@ public class ReduceSinkOperator extends firstRow = true; initializeChildren(hconf); } catch (Exception e) { - e.printStackTrace(); + String msg = "Error initializing ReduceSinkOperator: " + e.getMessage(); + LOG.error(msg, e); throw new RuntimeException(e); } } - transient InspectableObject tempInspectableObject = new InspectableObject(); - protected transient HiveKey keyWritable = new HiveKey(); - - protected transient ObjectInspector keyObjectInspector; - protected transient ObjectInspector valueObjectInspector; - transient ObjectInspector[] partitionObjectInspectors; - transient ObjectInspector[] bucketObjectInspectors = null; - transient int buckColIdxInKey; - - protected transient Object[] cachedValues; - protected transient List> distinctColIndices; - /** - * This two dimensional array holds key data and a corresponding Union object - * which contains the tag identifying the aggregate expression for distinct columns. 
- * - * If there is no distict expression, cachedKeys is simply like this. - * cachedKeys[0] = [col0][col1] - * - * with two distict expression, union(tag:key) is attatched for each distinct expression - * cachedKeys[0] = [col0][col1][0:dist1] - * cachedKeys[1] = [col0][col1][1:dist2] - * - * in this case, child GBY evaluates distict values with expression like KEY.col2:0.dist1 - * see {@link ExprNodeColumnEvaluator} - */ - // TODO: we only ever use one row of these at a time. Why do we need to cache multiple? - protected transient Object[][] cachedKeys; - boolean firstRow; - protected transient Random random; /** * Initializes array of ExprNodeEvaluator. Adds Union field for distinct @@ -509,4 +496,16 @@ public class ReduceSinkOperator extends public int[] getValueIndex() { return valueIndex; } + + public void setInputAliases(String[] inputAliases) { + this.inputAliases = inputAliases; + } + + public String[] getInputAliases() { + return inputAliases; + } + + public void setOutputCollector(OutputCollector _out) { + this.out = _out; + } } Modified: hive/branches/cbo/ql/src/java/org/apache/hadoop/hive/ql/exec/mr/ExecMapper.java URL: http://svn.apache.org/viewvc/hive/branches/cbo/ql/src/java/org/apache/hadoop/hive/ql/exec/mr/ExecMapper.java?rev=1616379&r1=1616378&r2=1616379&view=diff ============================================================================== --- hive/branches/cbo/ql/src/java/org/apache/hadoop/hive/ql/exec/mr/ExecMapper.java (original) +++ hive/branches/cbo/ql/src/java/org/apache/hadoop/hive/ql/exec/mr/ExecMapper.java Thu Aug 7 00:21:45 2014 @@ -250,7 +250,7 @@ public class ExecMapper extends MapReduc + used_memory); } - reportStats rps = new reportStats(rp); + ReportStats rps = new ReportStats(rp); mo.preorderMap(rps); return; } catch (Exception e) { @@ -285,10 +285,10 @@ public class ExecMapper extends MapReduc * reportStats. 
* */ - public static class reportStats implements Operator.OperatorFunc { - Reporter rp; + public static class ReportStats implements Operator.OperatorFunc { + private final Reporter rp; - public reportStats(Reporter rp) { + public ReportStats(Reporter rp) { this.rp = rp; } Modified: hive/branches/cbo/ql/src/java/org/apache/hadoop/hive/ql/exec/mr/ExecReducer.java URL: http://svn.apache.org/viewvc/hive/branches/cbo/ql/src/java/org/apache/hadoop/hive/ql/exec/mr/ExecReducer.java?rev=1616379&r1=1616378&r2=1616379&view=diff ============================================================================== --- hive/branches/cbo/ql/src/java/org/apache/hadoop/hive/ql/exec/mr/ExecReducer.java (original) +++ hive/branches/cbo/ql/src/java/org/apache/hadoop/hive/ql/exec/mr/ExecReducer.java Thu Aug 7 00:21:45 2014 @@ -34,7 +34,7 @@ import org.apache.hadoop.hive.ql.exec.Ob import org.apache.hadoop.hive.ql.exec.ObjectCacheFactory; import org.apache.hadoop.hive.ql.exec.Operator; import org.apache.hadoop.hive.ql.exec.Utilities; -import org.apache.hadoop.hive.ql.exec.mr.ExecMapper.reportStats; +import org.apache.hadoop.hive.ql.exec.mr.ExecMapper.ReportStats; import org.apache.hadoop.hive.ql.metadata.HiveException; import org.apache.hadoop.hive.ql.plan.ReduceWork; import org.apache.hadoop.hive.ql.plan.TableDesc; @@ -306,7 +306,7 @@ public class ExecReducer extends MapRedu } reducer.close(abort); - reportStats rps = new reportStats(rp); + ReportStats rps = new ReportStats(rp); reducer.preorderMap(rps); } catch (Exception e) { Modified: hive/branches/cbo/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/MapRecordProcessor.java URL: http://svn.apache.org/viewvc/hive/branches/cbo/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/MapRecordProcessor.java?rev=1616379&r1=1616378&r2=1616379&view=diff ============================================================================== --- hive/branches/cbo/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/MapRecordProcessor.java (original) +++ hive/branches/cbo/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/MapRecordProcessor.java Thu Aug 7 00:21:45 2014 @@ -33,7 +33,7 @@ import org.apache.hadoop.hive.ql.exec.Ob import org.apache.hadoop.hive.ql.exec.Operator; import org.apache.hadoop.hive.ql.exec.OperatorUtils; import org.apache.hadoop.hive.ql.exec.Utilities; -import org.apache.hadoop.hive.ql.exec.mr.ExecMapper.reportStats; +import org.apache.hadoop.hive.ql.exec.mr.ExecMapper.ReportStats; import org.apache.hadoop.hive.ql.exec.mr.ExecMapperContext; import org.apache.hadoop.hive.ql.exec.tez.TezProcessor.TezKVOutputCollector; import org.apache.hadoop.hive.ql.exec.vector.VectorMapOperator; @@ -225,7 +225,7 @@ public class MapRecordProcessor extends if (isLogInfoEnabled) { logCloseInfo(); } - reportStats rps = new reportStats(reporter); + ReportStats rps = new ReportStats(reporter); mapOp.preorderMap(rps); return; } catch (Exception e) { Modified: hive/branches/cbo/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/ReduceRecordProcessor.java URL: http://svn.apache.org/viewvc/hive/branches/cbo/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/ReduceRecordProcessor.java?rev=1616379&r1=1616378&r2=1616379&view=diff ============================================================================== --- hive/branches/cbo/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/ReduceRecordProcessor.java (original) +++ hive/branches/cbo/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/ReduceRecordProcessor.java Thu Aug 7 00:21:45 2014 @@ -35,7 +35,7 @@ import org.apache.hadoop.hive.ql.exec.Ob import 
org.apache.hadoop.hive.ql.exec.Operator; import org.apache.hadoop.hive.ql.exec.OperatorUtils; import org.apache.hadoop.hive.ql.exec.Utilities; -import org.apache.hadoop.hive.ql.exec.mr.ExecMapper.reportStats; +import org.apache.hadoop.hive.ql.exec.mr.ExecMapper.ReportStats; import org.apache.hadoop.hive.ql.exec.mr.ExecMapperContext; import org.apache.hadoop.hive.ql.exec.tez.TezProcessor.TezKVOutputCollector; import org.apache.hadoop.hive.ql.exec.tez.tools.InputMerger; @@ -136,7 +136,7 @@ public class ReduceRecordProcessor exte reducer.setParentOperators(null); // clear out any parents as reducer is the // root isTagged = redWork.getNeedsTagging(); - vectorized = redWork.getVectorModeOn() != null; + vectorized = redWork.getVectorMode(); try { keyTableDesc = redWork.getKeyDesc(); @@ -519,7 +519,7 @@ public class ReduceRecordProcessor exte dummyOp.close(abort); } } - reportStats rps = new reportStats(reporter); + ReportStats rps = new ReportStats(reporter); reducer.preorderMap(rps); } catch (Exception e) { Modified: hive/branches/cbo/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/read/DataWritableReadSupport.java URL: http://svn.apache.org/viewvc/hive/branches/cbo/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/read/DataWritableReadSupport.java?rev=1616379&r1=1616378&r2=1616379&view=diff ============================================================================== --- hive/branches/cbo/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/read/DataWritableReadSupport.java (original) +++ hive/branches/cbo/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/read/DataWritableReadSupport.java Thu Aug 7 00:21:45 2014 @@ -26,6 +26,7 @@ import org.apache.hadoop.hive.serde2.Col import org.apache.hadoop.io.ArrayWritable; import org.apache.hadoop.util.StringUtils; +import parquet.column.ColumnDescriptor; import parquet.hadoop.api.ReadSupport; import parquet.io.api.RecordMaterializer; import parquet.schema.MessageType; @@ -46,8 +47,8 @@ public class DataWritableReadSupport ext private static final String TABLE_SCHEMA = "table_schema"; public static final String HIVE_SCHEMA_KEY = "HIVE_TABLE_SCHEMA"; - public static final String PARQUET_COLUMN_INDEX_ACCESS = "parquet.column.index.access"; - + public static final String PARQUET_COLUMN_INDEX_ACCESS = "parquet.column.index.access"; + /** * From a string which columns names (including hive column), return a list * of string columns @@ -75,12 +76,16 @@ public class DataWritableReadSupport ext final Map contextMetadata = new HashMap(); if (columns != null) { final List listColumns = getColumns(columns); - + final Map lowerCaseFileSchemaColumns = new HashMap(); + for (ColumnDescriptor c : fileSchema.getColumns()) { + lowerCaseFileSchemaColumns.put(c.getPath()[0].toLowerCase(), c.getPath()[0]); + } final List typeListTable = new ArrayList(); - for (final String col : listColumns) { + for (String col : listColumns) { + col = col.toLowerCase(); // listColumns contains partition columns which are metadata only - if (fileSchema.containsField(col)) { - typeListTable.add(fileSchema.getType(col)); + if (lowerCaseFileSchemaColumns.containsKey(col)) { + typeListTable.add(fileSchema.getType(lowerCaseFileSchemaColumns.get(col))); } else { // below allows schema evolution typeListTable.add(new PrimitiveType(Repetition.OPTIONAL, PrimitiveTypeName.BINARY, col)); @@ -93,10 +98,24 @@ public class DataWritableReadSupport ext final List indexColumnsWanted = ColumnProjectionUtils.getReadColumnIDs(configuration); final List typeListWanted = new ArrayList(); + final boolean indexAccess 
= configuration.getBoolean(PARQUET_COLUMN_INDEX_ACCESS, false); for (final Integer idx : indexColumnsWanted) { - typeListWanted.add(tableSchema.getType(listColumns.get(idx))); + String col = listColumns.get(idx); + if (indexAccess) { + typeListWanted.add(tableSchema.getType(col)); + } else { + col = col.toLowerCase(); + if (lowerCaseFileSchemaColumns.containsKey(col)) { + typeListWanted.add(tableSchema.getType(lowerCaseFileSchemaColumns.get(col))); + } else { + // should never occur? + String msg = "Column " + col + " at index " + idx + " does not exist in " + + lowerCaseFileSchemaColumns; + throw new IllegalStateException(msg); + } + } } - requestedSchemaByUser = resolveSchemaAccess(new MessageType(fileSchema.getName(), + requestedSchemaByUser = resolveSchemaAccess(new MessageType(fileSchema.getName(), typeListWanted), fileSchema, configuration); return new ReadContext(requestedSchemaByUser, contextMetadata); @@ -127,29 +146,24 @@ public class DataWritableReadSupport ext } final MessageType tableSchema = resolveSchemaAccess(MessageTypeParser. parseMessageType(metadata.get(HIVE_SCHEMA_KEY)), fileSchema, configuration); - return new DataWritableRecordConverter(readContext.getRequestedSchema(), tableSchema); } - + /** - * Determine the file column names based on the position within the requested columns and + * Determine the file column names based on the position within the requested columns and * use that as the requested schema. */ - private MessageType resolveSchemaAccess(MessageType requestedSchema, MessageType fileSchema, + private MessageType resolveSchemaAccess(MessageType requestedSchema, MessageType fileSchema, Configuration configuration) { - if(configuration.getBoolean(PARQUET_COLUMN_INDEX_ACCESS, false)) { + if (configuration.getBoolean(PARQUET_COLUMN_INDEX_ACCESS, false)) { final List listColumns = getColumns(configuration.get(IOConstants.COLUMNS)); - List requestedTypes = new ArrayList(); - for(Type t : requestedSchema.getFields()) { int index = listColumns.indexOf(t.getName()); requestedTypes.add(fileSchema.getType(index)); } - requestedSchema = new MessageType(requestedSchema.getName(), requestedTypes); } - return requestedSchema; } } Modified: hive/branches/cbo/ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java URL: http://svn.apache.org/viewvc/hive/branches/cbo/ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java?rev=1616379&r1=1616378&r2=1616379&view=diff ============================================================================== --- hive/branches/cbo/ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java (original) +++ hive/branches/cbo/ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java Thu Aug 7 00:21:45 2014 @@ -36,6 +36,7 @@ import org.apache.hadoop.hive.conf.HiveC import org.apache.hadoop.hive.ql.exec.*; import org.apache.hadoop.hive.ql.exec.mr.MapRedTask; import org.apache.hadoop.hive.ql.exec.tez.TezTask; +import org.apache.hadoop.hive.ql.exec.vector.VectorExtractOperator; import org.apache.hadoop.hive.ql.exec.vector.VectorExpressionDescriptor; import org.apache.hadoop.hive.ql.exec.vector.VectorizationContext; import org.apache.hadoop.hive.ql.exec.vector.VectorizationContextRegion; @@ -62,9 +63,12 @@ import org.apache.hadoop.hive.ql.plan.Ex import org.apache.hadoop.hive.ql.plan.ExprNodeGenericFuncDesc; import org.apache.hadoop.hive.ql.plan.MapJoinDesc; import org.apache.hadoop.hive.ql.plan.MapWork; +import org.apache.hadoop.hive.ql.plan.MapredWork; import 
org.apache.hadoop.hive.ql.plan.OperatorDesc; import org.apache.hadoop.hive.ql.plan.PartitionDesc; +import org.apache.hadoop.hive.ql.plan.ReduceWork; import org.apache.hadoop.hive.ql.plan.SMBJoinDesc; +import org.apache.hadoop.hive.ql.plan.TableDesc; import org.apache.hadoop.hive.ql.plan.TableScanDesc; import org.apache.hadoop.hive.ql.plan.TezWork; import org.apache.hadoop.hive.ql.plan.api.OperatorType; @@ -107,6 +111,12 @@ import org.apache.hadoop.hive.ql.udf.UDF import org.apache.hadoop.hive.ql.udf.UDFWeekOfYear; import org.apache.hadoop.hive.ql.udf.UDFYear; import org.apache.hadoop.hive.ql.udf.generic.*; +import org.apache.hadoop.hive.serde2.Deserializer; +import org.apache.hadoop.hive.serde2.SerDe; +import org.apache.hadoop.hive.serde2.SerDeUtils; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector; +import org.apache.hadoop.util.ReflectionUtils; public class Vectorizer implements PhysicalPlanResolver { @@ -256,7 +266,15 @@ public class Vectorizer implements Physi class VectorizationDispatcher implements Dispatcher { + private PhysicalContext pctx; + + private int keyColCount; + private int valueColCount; + public VectorizationDispatcher(PhysicalContext pctx) { + this.pctx = pctx; + keyColCount = 0; + valueColCount = 0; } @Override @@ -270,6 +288,9 @@ public class Vectorizer implements Physi for (BaseWork w: work.getAllWork()) { if (w instanceof MapWork) { convertMapWork((MapWork)w); + } else if (w instanceof ReduceWork) { + // We are only vectorizing Reduce under Tez. + convertReduceWork((ReduceWork)w); } } } @@ -283,6 +304,13 @@ public class Vectorizer implements Physi } } + private void addMapWorkRules(Map opRules, NodeProcessor np) { + opRules.put(new RuleRegExp("R1", TableScanOperator.getOperatorName() + ".*" + + FileSinkOperator.getOperatorName()), np); + opRules.put(new RuleRegExp("R2", TableScanOperator.getOperatorName() + ".*" + + ReduceSinkOperator.getOperatorName()), np); + } + private boolean validateMapWork(MapWork mapWork) throws SemanticException { // Validate the input format @@ -297,11 +325,8 @@ public class Vectorizer implements Physi } } Map opRules = new LinkedHashMap(); - ValidationNodeProcessor vnp = new ValidationNodeProcessor(); - opRules.put(new RuleRegExp("R1", TableScanOperator.getOperatorName() + ".*" - + FileSinkOperator.getOperatorName()), vnp); - opRules.put(new RuleRegExp("R2", TableScanOperator.getOperatorName() + ".*" - + ReduceSinkOperator.getOperatorName()), vnp); + MapWorkValidationNodeProcessor vnp = new MapWorkValidationNodeProcessor(); + addMapWorkRules(opRules, vnp); Dispatcher disp = new DefaultRuleDispatcher(vnp, opRules, null); GraphWalker ogw = new DefaultGraphWalker(disp); // iterator the mapper operator tree @@ -320,14 +345,11 @@ public class Vectorizer implements Physi } private void vectorizeMapWork(MapWork mapWork) throws SemanticException { - LOG.info("Vectorizing task..."); + LOG.info("Vectorizing MapWork..."); mapWork.setVectorMode(true); Map opRules = new LinkedHashMap(); - VectorizationNodeProcessor vnp = new VectorizationNodeProcessor(mapWork); - opRules.put(new RuleRegExp("R1", TableScanOperator.getOperatorName() + ".*" + - ReduceSinkOperator.getOperatorName()), vnp); - opRules.put(new RuleRegExp("R2", TableScanOperator.getOperatorName() + ".*" - + FileSinkOperator.getOperatorName()), vnp); + MapWorkVectorizationNodeProcessor vnp = new MapWorkVectorizationNodeProcessor(mapWork); + addMapWorkRules(opRules, vnp); Dispatcher disp = new 
DefaultRuleDispatcher(vnp, opRules, null); GraphWalker ogw = new PreOrderWalker(disp); // iterator the mapper operator tree @@ -348,9 +370,114 @@ public class Vectorizer implements Physi return; } + + private void convertReduceWork(ReduceWork reduceWork) throws SemanticException { + boolean ret = validateReduceWork(reduceWork); + if (ret) { + vectorizeReduceWork(reduceWork); + } + } + + private boolean getOnlyStructObjectInspectors(ReduceWork reduceWork) throws SemanticException { + try { + // Check key ObjectInspector. + ObjectInspector keyObjectInspector = reduceWork.getKeyObjectInspector(); + if (keyObjectInspector == null || !(keyObjectInspector instanceof StructObjectInspector)) { + return false; + } + StructObjectInspector keyStructObjectInspector = (StructObjectInspector)keyObjectInspector; + keyColCount = keyStructObjectInspector.getAllStructFieldRefs().size(); + + // Tez doesn't use tagging... + if (reduceWork.getNeedsTagging()) { + return false; + } + + // Check value ObjectInspector. + ObjectInspector valueObjectInspector = reduceWork.getValueObjectInspector(); + if (valueObjectInspector == null || !(valueObjectInspector instanceof StructObjectInspector)) { + return false; + } + StructObjectInspector valueStructObjectInspector = (StructObjectInspector)valueObjectInspector; + valueColCount = valueStructObjectInspector.getAllStructFieldRefs().size(); + } catch (Exception e) { + throw new SemanticException(e); + } + return true; + } + + private void addReduceWorkRules(Map opRules, NodeProcessor np) { + opRules.put(new RuleRegExp("R1", ExtractOperator.getOperatorName() + ".*"), np); + opRules.put(new RuleRegExp("R2", GroupByOperator.getOperatorName() + ".*"), np); + } + + private boolean validateReduceWork(ReduceWork reduceWork) throws SemanticException { + // Validate input to ReduceWork. + if (!getOnlyStructObjectInspectors(reduceWork)) { + return false; + } + // Now check the reduce operator tree. + Map opRules = new LinkedHashMap(); + ReduceWorkValidationNodeProcessor vnp = new ReduceWorkValidationNodeProcessor(); + addReduceWorkRules(opRules, vnp); + Dispatcher disp = new DefaultRuleDispatcher(vnp, opRules, null); + GraphWalker ogw = new DefaultGraphWalker(disp); + // iterator the reduce operator tree + ArrayList topNodes = new ArrayList(); + topNodes.add(reduceWork.getReducer()); + HashMap nodeOutput = new HashMap(); + ogw.startWalking(topNodes, nodeOutput); + for (Node n : nodeOutput.keySet()) { + if (nodeOutput.get(n) != null) { + if (!((Boolean)nodeOutput.get(n)).booleanValue()) { + return false; + } + } + } + return true; + } + + private void vectorizeReduceWork(ReduceWork reduceWork) throws SemanticException { + LOG.info("Vectorizing ReduceWork..."); + reduceWork.setVectorMode(true); + + // For some reason, the DefaultGraphWalker does not descend down from the reducer Operator as expected. + // We need to descend down, otherwise it breaks our algorithm that determines VectorizationContext... + // Do we use PreOrderWalker instead of DefaultGraphWalker. 
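The comments above motivate switching to a pre-order walk: the reducer root must be visited before its descendants so that each downstream operator can later find the VectorizationContext registered by an ancestor. Below is a minimal, self-contained sketch of that pre-order property only; it does not use Hive's walker classes, and the Op interface and preOrder method are hypothetical names introduced purely for illustration over a simple operator tree.

    import java.util.ArrayDeque;
    import java.util.Deque;
    import java.util.List;
    import java.util.function.Consumer;

    final class PreOrderSketch {
      // Hypothetical stand-in for an operator node; not a Hive type.
      interface Op { List<Op> children(); }

      // Visit the root first, then descend: a parent is always visited before
      // any of its children, which is the ordering the vectorizer relies on.
      static void preOrder(Op root, Consumer<Op> visit) {
        Deque<Op> stack = new ArrayDeque<>();
        stack.push(root);
        while (!stack.isEmpty()) {
          Op op = stack.pop();
          visit.accept(op);
          List<Op> children = op.children();   // assumed non-null, possibly empty
          for (int i = children.size() - 1; i >= 0; i--) {
            stack.push(children.get(i));       // reverse push keeps left-to-right child order
          }
        }
      }
    }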
+ Map opRules = new LinkedHashMap(); + ReduceWorkVectorizationNodeProcessor vnp = new ReduceWorkVectorizationNodeProcessor(reduceWork, keyColCount, valueColCount); + addReduceWorkRules(opRules, vnp); + Dispatcher disp = new DefaultRuleDispatcher(vnp, opRules, null); + GraphWalker ogw = new PreOrderWalker(disp); + // iterator the reduce operator tree + ArrayList topNodes = new ArrayList(); + topNodes.add(reduceWork.getReducer()); + LOG.info("vectorizeReduceWork reducer Operator: " + reduceWork.getReducer().getName() + "..."); + HashMap nodeOutput = new HashMap(); + ogw.startWalking(topNodes, nodeOutput); + + // Necessary since we are vectorizing the root operator in reduce. + reduceWork.setReducer(vnp.getRootVectorOp()); + + Operator reducer = reduceWork.getReducer(); + if (reducer.getType().equals(OperatorType.EXTRACT)) { + ((VectorExtractOperator)reducer).setKeyAndValueColCounts(keyColCount, valueColCount); + } + + Map> columnVectorTypes = vnp.getScratchColumnVectorTypes(); + reduceWork.setScratchColumnVectorTypes(columnVectorTypes); + Map> columnMap = vnp.getScratchColumnMap(); + reduceWork.setScratchColumnMap(columnMap); + + + if (LOG.isDebugEnabled()) { + LOG.debug(String.format("vectorTypes: %s", columnVectorTypes.toString())); + LOG.debug(String.format("columnMap: %s", columnMap.toString())); + } + } } - class ValidationNodeProcessor implements NodeProcessor { + class MapWorkValidationNodeProcessor implements NodeProcessor { @Override public Object process(Node nd, Stack stack, NodeProcessorCtx procCtx, @@ -361,9 +488,9 @@ public class Vectorizer implements Physi op.getParentOperators().get(0).getType().equals(OperatorType.GROUPBY)) { return new Boolean(true); } - boolean ret = validateOperator(op); + boolean ret = validateMapWorkOperator(op); if (!ret) { - LOG.info("Operator: " + op.getName() + " could not be vectorized."); + LOG.info("MapWork Operator: " + op.getName() + " could not be vectorized."); return new Boolean(false); } } @@ -371,24 +498,37 @@ public class Vectorizer implements Physi } } - class VectorizationNodeProcessor implements NodeProcessor { + class ReduceWorkValidationNodeProcessor implements NodeProcessor { - private final MapWork mWork; + @Override + public Object process(Node nd, Stack stack, NodeProcessorCtx procCtx, + Object... nodeOutputs) throws SemanticException { + for (Node n : stack) { + Operator op = (Operator) n; + boolean ret = validateReduceWorkOperator(op); + if (!ret) { + LOG.info("ReduceWork Operator: " + op.getName() + " could not be vectorized."); + return new Boolean(false); + } + } + return new Boolean(true); + } + } + + // This class has common code used by both MapWorkVectorizationNodeProcessor and + // ReduceWorkVectorizationNodeProcessor. 
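The shared processor declared just below also centralizes the stack-based context lookup (walkStackToFindVectorizationContext) used by both the map-side and reduce-side processors: starting from the current operator's immediate parent, walk toward the root and take the first VectorizationContext an ancestor has registered. A simplified sketch of that lookup follows; it assumes the traversal stack holds the root at index 0 and the current operator on top, and the class and method names are hypothetical rather than Hive API.

    import java.util.List;
    import java.util.Map;

    final class ContextLookupSketch {
      // Returns the context registered by the nearest ancestor on the stack.
      // N is the operator type, C the context type; both are placeholders.
      static <N, C> C findNearestContext(List<N> stack, Map<N, C> contextsByOp) {
        // stack.get(stack.size() - 1) is the current operator, so start at its parent.
        for (int i = stack.size() - 2; i >= 0; i--) {
          C ctx = contextsByOp.get(stack.get(i));
          if (ctx != null) {
            return ctx;
          }
        }
        throw new IllegalStateException("no ancestor on the stack registered a context");
      }
    }

As the code below notes, the bottom of the stack is the root operator (a TableScan on the map side, the reducer root on the reduce side), which is guaranteed to have a context, so the search terminates before running off the stack.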
+ class VectorizationNodeProcessor implements NodeProcessor { // This is used to extract scratch column types for each file key - private final Map scratchColumnContext = + protected final Map scratchColumnContext = new HashMap(); - private final Map, VectorizationContext> vContextsByTSOp = + protected final Map, VectorizationContext> vContextsByTSOp = new HashMap, VectorizationContext>(); - private final Set> opsDone = + protected final Set> opsDone = new HashSet>(); - public VectorizationNodeProcessor(MapWork mWork) { - this.mWork = mWork; - } - public Map> getScratchColumnVectorTypes() { Map> scratchColumnVectorTypes = new HashMap>(); @@ -411,16 +551,90 @@ public class Vectorizer implements Physi return scratchColumnMap; } + public VectorizationContext walkStackToFindVectorizationContext(Stack stack, Operator op) + throws SemanticException { + VectorizationContext vContext = null; + if (stack.size() <= 1) { + throw new SemanticException(String.format("Expected operator stack for operator %s to have at least 2 operators", op.getName())); + } + // Walk down the stack of operators until we found one willing to give us a context. + // At the bottom will be the root operator, guaranteed to have a context + int i= stack.size()-2; + while (vContext == null) { + if (i < 0) { + throw new SemanticException(String.format("Did not find vectorization context for operator %s in operator stack", op.getName())); + } + Operator opParent = (Operator) stack.get(i); + vContext = vContextsByTSOp.get(opParent); + --i; + } + return vContext; + } + + public Boolean nonVectorizableChildOfGroupBy(Operator op) { + Operator currentOp = op; + while (currentOp.getParentOperators().size() > 0) { + currentOp = currentOp.getParentOperators().get(0); + if (currentOp.getType().equals(OperatorType.GROUPBY)) { + // No need to vectorize + if (!opsDone.contains(op)) { + opsDone.add(op); + } + return true; + } + } + return false; + } + + public Operator doVectorize(Operator op, VectorizationContext vContext) + throws SemanticException { + Operator vectorOp = op; + try { + if (!opsDone.contains(op)) { + vectorOp = vectorizeOperator(op, vContext); + opsDone.add(op); + if (vectorOp != op) { + opsDone.add(vectorOp); + } + if (vectorOp instanceof VectorizationContextRegion) { + VectorizationContextRegion vcRegion = (VectorizationContextRegion) vectorOp; + VectorizationContext vOutContext = vcRegion.getOuputVectorizationContext(); + vContextsByTSOp.put(op, vOutContext); + scratchColumnContext.put(vOutContext.getFileKey(), vOutContext); + } + } + } catch (HiveException e) { + throw new SemanticException(e); + } + return vectorOp; + } + + @Override + public Object process(Node nd, Stack stack, NodeProcessorCtx procCtx, + Object... nodeOutputs) throws SemanticException { + throw new SemanticException("Must be overridden"); + } + } + + class MapWorkVectorizationNodeProcessor extends VectorizationNodeProcessor { + + private final MapWork mWork; + + public MapWorkVectorizationNodeProcessor(MapWork mWork) { + this.mWork = mWork; + } + @Override public Object process(Node nd, Stack stack, NodeProcessorCtx procCtx, Object... 
nodeOutputs) throws SemanticException { Operator op = (Operator) nd; + LOG.info("MapWorkVectorizationNodeProcessor processing Operator: " + op.getName() + "..."); VectorizationContext vContext = null; if (op instanceof TableScanOperator) { - vContext = getVectorizationContext((TableScanOperator) op, physicalContext); + vContext = getVectorizationContext(op, physicalContext); for (String onefile : mWork.getPathToAliases().keySet()) { List aliases = mWork.getPathToAliases().get(onefile); for (String alias : aliases) { @@ -438,45 +652,76 @@ public class Vectorizer implements Physi } vContextsByTSOp.put(op, vContext); } else { - assert stack.size() > 1; - // Walk down the stack of operators until we found one willing to give us a context. - // At the bottom will be the TS operator, guaranteed to have a context - int i= stack.size()-2; - while (vContext == null) { - Operator opParent = (Operator) stack.get(i); - vContext = vContextsByTSOp.get(opParent); - --i; - } + vContext = walkStackToFindVectorizationContext(stack, op); } assert vContext != null; - if ((op.getType().equals(OperatorType.REDUCESINK) || op.getType().equals(OperatorType.FILESINK)) && - op.getParentOperators().get(0).getType().equals(OperatorType.GROUPBY)) { - // No need to vectorize - if (!opsDone.contains(op)) { - opsDone.add(op); - } + // Currently, Vectorized GROUPBY outputs rows, not vectorized row batchs. So, don't vectorize + // any operators below GROUPBY. + if (nonVectorizableChildOfGroupBy(op)) { + return null; + } + + doVectorize(op, vContext); + + return null; + } + } + + class ReduceWorkVectorizationNodeProcessor extends VectorizationNodeProcessor { + + private final ReduceWork rWork; + private int keyColCount; + private int valueColCount; + private Map reduceColumnNameMap; + + private Operator rootVectorOp; + + public Operator getRootVectorOp() { + return rootVectorOp; + } + + public ReduceWorkVectorizationNodeProcessor(ReduceWork rWork, int keyColCount, int valueColCount) { + this.rWork = rWork; + reduceColumnNameMap = rWork.getReduceColumnNameMap(); + this.keyColCount = keyColCount; + this.valueColCount = valueColCount; + rootVectorOp = null; + } + + @Override + public Object process(Node nd, Stack stack, NodeProcessorCtx procCtx, + Object... nodeOutputs) throws SemanticException { + + Operator op = (Operator) nd; + LOG.info("ReduceWorkVectorizationNodeProcessor processing Operator: " + op.getName() + "..."); + + VectorizationContext vContext = null; + + boolean saveRootVectorOp = false; + + if (op.getParentOperators().size() == 0) { + vContext = getReduceVectorizationContext(reduceColumnNameMap); + vContextsByTSOp.put(op, vContext); + saveRootVectorOp = true; } else { - try { - if (!opsDone.contains(op)) { - Operator vectorOp = - vectorizeOperator(op, vContext); - opsDone.add(op); - if (vectorOp != op) { - opsDone.add(vectorOp); - } - if (vectorOp instanceof VectorizationContextRegion) { - VectorizationContextRegion vcRegion = (VectorizationContextRegion) vectorOp; - VectorizationContext vOutContext = vcRegion.getOuputVectorizationContext(); - vContextsByTSOp.put(op, vOutContext); - scratchColumnContext.put(vOutContext.getFileKey(), vOutContext); - } - } - } catch (HiveException e) { - throw new SemanticException(e); - } + vContext = walkStackToFindVectorizationContext(stack, op); + } + + assert vContext != null; + + // Currently, Vectorized GROUPBY outputs rows, not vectorized row batchs. So, don't vectorize + // any operators below GROUPBY. 
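As the comment above explains, a vectorized GROUPBY currently emits plain rows rather than vectorized row batches, so any operator sitting below a GROUPBY is deliberately left un-vectorized. A simplified sketch of that guard, mirroring nonVectorizableChildOfGroupBy; the Op interface and method name here are hypothetical, not Hive types.

    import java.util.List;

    final class GroupByGuardSketch {
      // Hypothetical stand-in for an operator node; not a Hive type.
      interface Op { String type(); List<Op> parents(); }

      // True when some ancestor along the first-parent chain is a GROUPBY,
      // i.e. this operator receives rows, not batches, and should stay row-mode.
      static boolean isBelowGroupBy(Op op) {
        Op current = op;
        while (current.parents() != null && !current.parents().isEmpty()) {
          current = current.parents().get(0);
          if ("GROUPBY".equals(current.type())) {
            return true;
          }
        }
        return false;
      }
    }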
+ if (nonVectorizableChildOfGroupBy(op)) { + return null; } + + Operator vectorOp = doVectorize(op, vContext); + if (saveRootVectorOp && op != vectorOp) { + rootVectorOp = vectorOp; + } + return null; } } @@ -519,7 +764,7 @@ public class Vectorizer implements Physi return pctx; } - boolean validateOperator(Operator op) { + boolean validateMapWorkOperator(Operator op) { boolean ret = false; switch (op.getType()) { case MAPJOIN: @@ -555,6 +800,32 @@ public class Vectorizer implements Physi return ret; } + boolean validateReduceWorkOperator(Operator op) { + boolean ret = false; + switch (op.getType()) { + case EXTRACT: + ret = validateExtractOperator((ExtractOperator) op); + break; + case FILTER: + ret = validateFilterOperator((FilterOperator) op); + break; + case SELECT: + ret = validateSelectOperator((SelectOperator) op); + break; + case REDUCESINK: + ret = validateReduceSinkOperator((ReduceSinkOperator) op); + break; + case FILESINK: + case LIMIT: + ret = true; + break; + default: + ret = false; + break; + } + return ret; + } + private boolean validateSMBMapJoinOperator(SMBMapJoinOperator op) { SMBJoinDesc desc = op.getConf(); // Validation is the same as for map join, since the 'small' tables are not vectorized @@ -617,6 +888,15 @@ public class Vectorizer implements Physi return validateAggregationDesc(op.getConf().getAggregators()); } + private boolean validateExtractOperator(ExtractOperator op) { + ExprNodeDesc expr = op.getConf().getCol(); + boolean ret = validateExprNodeDesc(expr); + if (!ret) { + return false; + } + return true; + } + private boolean validateExprNodeDesc(List descs) { return validateExprNodeDesc(descs, VectorExpressionDescriptor.Mode.PROJECTION); } @@ -728,7 +1008,7 @@ public class Vectorizer implements Physi return supportedDataTypesPattern.matcher(type.toLowerCase()).matches(); } - private VectorizationContext getVectorizationContext(TableScanOperator op, + private VectorizationContext getVectorizationContext(Operator op, PhysicalContext pctx) { RowSchema rs = op.getSchema(); @@ -741,8 +1021,26 @@ public class Vectorizer implements Physi } } - VectorizationContext vc = new VectorizationContext(cmap, columnCount); - return vc; + return new VectorizationContext(cmap, columnCount); + } + + private VectorizationContext getReduceVectorizationContext(Map reduceColumnNameMap) { + return new VectorizationContext(reduceColumnNameMap, reduceColumnNameMap.size()); + } + + private void fixupParentChildOperators(Operator op, Operator vectorOp) { + if (op.getParentOperators() != null) { + vectorOp.setParentOperators(op.getParentOperators()); + for (Operator p : op.getParentOperators()) { + p.replaceChild(op, vectorOp); + } + } + if (op.getChildOperators() != null) { + vectorOp.setChildOperators(op.getChildOperators()); + for (Operator c : op.getChildOperators()) { + c.replaceParent(op, vectorOp); + } + } } Operator vectorizeOperator(Operator op, @@ -757,6 +1055,7 @@ public class Vectorizer implements Physi case FILESINK: case REDUCESINK: case LIMIT: + case EXTRACT: vectorOp = OperatorFactory.getVectorOperator(op.getConf(), vContext); break; default: @@ -765,18 +1064,7 @@ public class Vectorizer implements Physi } if (vectorOp != op) { - if (op.getParentOperators() != null) { - vectorOp.setParentOperators(op.getParentOperators()); - for (Operator p : op.getParentOperators()) { - p.replaceChild(op, vectorOp); - } - } - if (op.getChildOperators() != null) { - vectorOp.setChildOperators(op.getChildOperators()); - for (Operator c : op.getChildOperators()) { - c.replaceParent(op, 
vectorOp); - } - } + fixupParentChildOperators(op, vectorOp); ((AbstractOperatorDesc) vectorOp.getConf()).setVectorMode(true); } return vectorOp;
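
Finally, the refactor above moves the DAG re-wiring out of vectorizeOperator into fixupParentChildOperators: when an operator is replaced by its vectorized counterpart, the replacement takes over the original's parent and child lists, and every neighbour's pointer to the original is redirected to the replacement. A minimal, self-contained sketch of that splice, using hypothetical Node/splice names rather than Hive's Operator API:

    import java.util.ArrayList;
    import java.util.List;

    final class SpliceSketch {
      static final class Node {
        List<Node> parents = new ArrayList<>();
        List<Node> children = new ArrayList<>();

        void replaceChild(Node oldChild, Node newChild) {
          int i = children.indexOf(oldChild);
          if (i >= 0) children.set(i, newChild);
        }

        void replaceParent(Node oldParent, Node newParent) {
          int i = parents.indexOf(oldParent);
          if (i >= 0) parents.set(i, newParent);
        }
      }

      // Splice 'replacement' into the position 'original' occupied in the DAG.
      static void splice(Node original, Node replacement) {
        replacement.parents = original.parents;       // inherit upstream links
        for (Node p : replacement.parents) {
          p.replaceChild(original, replacement);      // parents now point at the replacement
        }
        replacement.children = original.children;     // inherit downstream links
        for (Node c : replacement.children) {
          c.replaceParent(original, replacement);     // children now point back at the replacement
        }
      }
    }

Keeping this splice in one helper means the map-side and reduce-side vectorization paths rewire the operator graph the same way before the vectorized descriptor is flagged with setVectorMode(true).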