From: prasanthj@apache.org
To: commits@hive.apache.org
Subject: hive git commit: HIVE-11120: Generic interface for file format validation (Prasanth Jayachandran reviewed by Xuefu Zhang)
Date: Fri, 13 Nov 2015 20:49:56 +0000 (UTC)

Repository: hive
Updated Branches:
  refs/heads/branch-1 9a86cad54 -> 9fc7442ec


HIVE-11120: Generic interface for file format validation (Prasanth Jayachandran reviewed by Xuefu Zhang)

Project: http://git-wip-us.apache.org/repos/asf/hive/repo
Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/9fc7442e
Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/9fc7442e
Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/9fc7442e

Branch: refs/heads/branch-1
Commit: 9fc7442ec231f5b97ffe2bdc5cf9642aced7ff68
Parents: 9a86cad
Author: Prasanth Jayachandran
Authored: Fri Nov 13 14:49:44 2015 -0600
Committer: Prasanth Jayachandran
Committed: Fri Nov 13 14:49:44 2015 -0600

----------------------------------------------------------------------
 .../apache/hadoop/hive/ql/exec/MoveTask.java    |  59 +++++--
 .../hadoop/hive/ql/io/HiveFileFormatUtils.java  | 162 +++++++++----------
 .../ql/io/SequenceFileInputFormatChecker.java   |   3 +-
 .../hadoop/hive/ql/io/orc/OrcInputFormat.java   |   4 +
 .../hive/ql/parse/LoadSemanticAnalyzer.java     |  49 +++---
 .../ql/parse/LoadSemanticAnalyzer.java.orig     |  48 +++---
 .../clientnegative/archive_corrupt.q.out        |  14 +-
 .../clientnegative/load_orc_negative1.q.out     |   2 +-
 .../clientnegative/load_orc_negative2.q.out     |   2 +-
 .../clientnegative/load_orc_negative3.q.out     |   2 +-
 .../clientnegative/load_orc_negative_part.q.out |   2 +-
 .../clientnegative/load_wrong_fileformat.q.out  |   7 +-
 .../load_wrong_fileformat_rc_seq.q.out          |   7 +-
 .../load_wrong_fileformat_txt_seq.q.out         |   7 +-
 14 files changed, 170 insertions(+), 198 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/hive/blob/9fc7442e/ql/src/java/org/apache/hadoop/hive/ql/exec/MoveTask.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/MoveTask.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/MoveTask.java
index 6b8cfd8..c428812 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/exec/MoveTask.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/MoveTask.java
@@ -18,6 +18,15 @@
 package org.apache.hadoop.hive.ql.exec;
 
+import java.io.IOException;
+import java.io.Serializable;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.LinkedHashMap;
+import java.util.LinkedHashSet;
+import java.util.List;
+import java.util.Map;
+
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.hadoop.fs.FileStatus;
@@ -61,19 +70,9 @@ import org.apache.hadoop.hive.ql.plan.api.StageType;
 import org.apache.hadoop.hive.ql.session.SessionState;
 import org.apache.hadoop.hive.shims.HadoopShims;
 import org.apache.hadoop.hive.shims.ShimLoader;
+import org.apache.hadoop.security.AccessControlException;
 import org.apache.hadoop.util.StringUtils;
 
-import java.io.IOException;
-import java.io.Serializable;
-import java.security.AccessControlException;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.HashSet;
-import java.util.LinkedHashSet;
-import java.util.LinkedHashMap;
-import java.util.List;
-import java.util.Map;
-
 /**
  * MoveTask implementation.
  **/
@@ -292,13 +291,39 @@ public class MoveTask extends Task implements Serializable {
         throw new HiveException(
             "addFiles: filesystem error in check phase", e);
       }
+
+      // handle file format check at the table level
       if (HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVECHECKFILEFORMAT)) {
-        // Check if the file format of the file matches that of the table.
-        boolean flag = HiveFileFormatUtils.checkInputFormat(
-            srcFs, conf, tbd.getTable().getInputFileFormatClass(), files);
-        if (!flag) {
-          throw new HiveException(
-              "Wrong file format. Please check the file's format.");
+        boolean flag = true;
+        // work.checkFileFormat is set to true only for a Load Task, so the assumption
+        // here is that the dynamic partition context is null
+        if (tbd.getDPCtx() == null) {
+          if (tbd.getPartitionSpec() == null || tbd.getPartitionSpec().isEmpty()) {
+            // Check if the file format of the file matches that of the table.
+            flag = HiveFileFormatUtils.checkInputFormat(
+                srcFs, conf, tbd.getTable().getInputFileFormatClass(), files);
+          } else {
+            // Check if the file format of the file matches that of the partition
+            Partition oldPart = db.getPartition(table, tbd.getPartitionSpec(), false);
+            if (oldPart == null) {
+              // this means we have just created a table and are specifying a partition in the
+              // load statement (without pre-creating the partition), in which case let's use
+              // the table's input format class. inheritTableSpecs defaults to true, so when a
+              // new partition is created later it will automatically inherit the input format
+              // from the table object
+              flag = HiveFileFormatUtils.checkInputFormat(
+                  srcFs, conf, tbd.getTable().getInputFileFormatClass(), files);
+            } else {
+              flag = HiveFileFormatUtils.checkInputFormat(
+                  srcFs, conf, oldPart.getInputFormatClass(), files);
+            }
+          }
+          if (!flag) {
+            throw new HiveException(
+                "Wrong file format. Please check the file's format.");
+          }
+        } else {
+          LOG.warn("Skipping file format check as dpCtx is not null");
         }
       }
     }

http://git-wip-us.apache.org/repos/asf/hive/blob/9fc7442e/ql/src/java/org/apache/hadoop/hive/ql/io/HiveFileFormatUtils.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/HiveFileFormatUtils.java b/ql/src/java/org/apache/hadoop/hive/ql/io/HiveFileFormatUtils.java
index 06d3df7..f6a0cb9 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/io/HiveFileFormatUtils.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/io/HiveFileFormatUtils.java
@@ -31,7 +31,6 @@ import java.util.Map;
 import java.util.Map.Entry;
 import java.util.Properties;
 import java.util.Set;
-import java.util.concurrent.ConcurrentHashMap;
 
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
@@ -44,7 +43,7 @@ import org.apache.hadoop.hive.common.JavaUtils;
 import org.apache.hadoop.hive.conf.HiveConf;
 import org.apache.hadoop.hive.ql.exec.FileSinkOperator.RecordWriter;
 import org.apache.hadoop.hive.ql.exec.Operator;
-import org.apache.hadoop.hive.ql.exec.Utilities;
+import org.apache.hadoop.hive.ql.io.orc.OrcInputFormat;
 import org.apache.hadoop.hive.ql.metadata.HiveException;
 import org.apache.hadoop.hive.ql.plan.FileSinkDesc;
 import org.apache.hadoop.hive.ql.plan.OperatorDesc;
@@ -69,6 +68,10 @@ import org.apache.hadoop.mapred.TextInputFormat;
 import org.apache.hadoop.util.Shell;
 import org.apache.hive.common.util.ReflectionUtil;
 
+import com.google.common.cache.Cache;
+import com.google.common.cache.CacheBuilder;
+import com.google.common.collect.ImmutableMap;
+
 /**
  * An util class for various Hive file format tasks.
  * registerOutputFormatSubstitute(Class, Class) getOutputFormatSubstitute(Class)
@@ -79,30 +82,68 @@ import org.apache.hive.common.util.ReflectionUtil;
 public final class HiveFileFormatUtils {
 
   private static final Log LOG = LogFactory.getLog(HiveFileFormatUtils.class);
 
-  static {
-    outputFormatSubstituteMap =
-        new ConcurrentHashMap<Class<? extends OutputFormat>, Class<? extends OutputFormat>>();
-    HiveFileFormatUtils.registerOutputFormatSubstitute(
-        IgnoreKeyTextOutputFormat.class, HiveIgnoreKeyTextOutputFormat.class);
-    HiveFileFormatUtils.registerOutputFormatSubstitute(
-        SequenceFileOutputFormat.class, HiveSequenceFileOutputFormat.class);
-  }
+  public static class FileChecker {
+    // we don't have many file formats that implement InputFormatChecker. We won't be holding
+    // multiple instances of such classes
+    private static int MAX_CACHE_SIZE = 16;
 
-  @SuppressWarnings("unchecked")
-  private static Map<Class<? extends OutputFormat>, Class<? extends OutputFormat>>
-      outputFormatSubstituteMap;
+    // immutable maps
+    Map<Class<? extends InputFormat>, Class<? extends InputFormatChecker>> inputFormatCheckerMap;
+    Map<Class<? extends OutputFormat>, Class<? extends OutputFormat>> outputFormatSubstituteMap;
 
-  /**
-   * register a substitute.
-   *
-   * @param origin
-   *          the class that need to be substituted
-   * @param substitute
-   */
-  @SuppressWarnings("unchecked")
-  public static void registerOutputFormatSubstitute(Class origin,
-      Class substitute) {
-    outputFormatSubstituteMap.put(origin, substitute);
+    // mutable thread-safe map to store instances
+    Cache<Class<? extends InputFormatChecker>, InputFormatChecker> inputFormatCheckerInstanceCache;
+
+    // the classloader initializes the holder class only when it is first used
+    // (lazy initialization). Class loading is thread safe.
+    private static class Factory {
+      static final FileChecker INSTANCE = new FileChecker();
+    }
+
+    public static FileChecker getInstance() {
+      return Factory.INSTANCE;
+    }
+
+    private FileChecker() {
+      // read-only maps (initialized once)
+      inputFormatCheckerMap = ImmutableMap
+          .<Class<? extends InputFormat>, Class<? extends InputFormatChecker>>builder()
+          .put(SequenceFileInputFormat.class, SequenceFileInputFormatChecker.class)
+          .put(RCFileInputFormat.class, RCFileInputFormat.class)
+          .put(OrcInputFormat.class, OrcInputFormat.class)
+          .build();
+      outputFormatSubstituteMap = ImmutableMap
+          .<Class<? extends OutputFormat>, Class<? extends OutputFormat>>builder()
+          .put(IgnoreKeyTextOutputFormat.class, HiveIgnoreKeyTextOutputFormat.class)
+          .put(SequenceFileOutputFormat.class, HiveSequenceFileOutputFormat.class)
+          .build();
+
+      // updatable map that holds instances of the class
+      inputFormatCheckerInstanceCache = CacheBuilder.newBuilder().maximumSize(MAX_CACHE_SIZE)
+          .build();
+    }
+
+    public Set<Class<? extends InputFormat>> registeredClasses() {
+      return inputFormatCheckerMap.keySet();
+    }
+
+    public Class getOutputFormatSubstiture(Class origin) {
+      return outputFormatSubstituteMap.get(origin);
+    }
+
+    public Class getInputFormatCheckerClass(Class inputFormat) {
+      return inputFormatCheckerMap.get(inputFormat);
+    }
+
+    public void putInputFormatCheckerInstance(
+        Class checkerCls, InputFormatChecker instanceCls) {
+      inputFormatCheckerInstanceCache.put(checkerCls, instanceCls);
+    }
+
+    public InputFormatChecker getInputFormatCheckerInstance(
+        Class checkerCls) {
+      return inputFormatCheckerInstanceCache.getIfPresent(checkerCls);
+    }
   }
 
   /**
@@ -114,7 +155,8 @@ public final class HiveFileFormatUtils {
     if (origin == null || HiveOutputFormat.class.isAssignableFrom(origin)) {
       return (Class) origin; // hive native
     }
-    Class substitute = outputFormatSubstituteMap.get(origin);
+    Class substitute = FileChecker.getInstance()
+        .getOutputFormatSubstiture(origin);
     if (substitute != null) {
      return substitute; // substituted
     }
@@ -122,66 +164,6 @@
   }
 
   /**
-   * get the final output path of a given FileOutputFormat.
-   *
-   * @param parent
-   *          parent dir of the expected final output path
-   * @param jc
-   *          job configuration
-   * @deprecated
-   */
-  @Deprecated
-  public static Path getOutputFormatFinalPath(Path parent, String taskId, JobConf jc,
-      HiveOutputFormat hiveOutputFormat, boolean isCompressed,
-      Path defaultFinalPath) throws IOException {
-    if (hiveOutputFormat instanceof HiveIgnoreKeyTextOutputFormat) {
-      return new Path(parent, taskId
-          + Utilities.getFileExtension(jc, isCompressed));
-    }
-    return defaultFinalPath;
-  }
-
-  static {
-    inputFormatCheckerMap =
-        new HashMap<Class<? extends InputFormat>, Class<? extends InputFormatChecker>>();
-    HiveFileFormatUtils.registerInputFormatChecker(
-        SequenceFileInputFormat.class, SequenceFileInputFormatChecker.class);
-    HiveFileFormatUtils.registerInputFormatChecker(RCFileInputFormat.class,
-        RCFileInputFormat.class);
-    inputFormatCheckerInstanceCache =
-        new HashMap<Class<? extends InputFormatChecker>, InputFormatChecker>();
-  }
-
-  @SuppressWarnings("unchecked")
-  private static Map<Class<? extends InputFormat>, Class<? extends InputFormatChecker>> inputFormatCheckerMap;
-
-  private static Map<Class<? extends InputFormatChecker>, InputFormatChecker> inputFormatCheckerInstanceCache;
-
-  /**
-   * register an InputFormatChecker for a given InputFormat.
-   *
-   * @param format
-   *          the class that need to be substituted
-   * @param checker
-   */
-  @SuppressWarnings("unchecked")
-  public static synchronized void registerInputFormatChecker(
-      Class format,
-      Class checker) {
-    inputFormatCheckerMap.put(format, checker);
-  }
-
-  /**
-   * get an InputFormatChecker for a file format.
-   */
-  public static synchronized Class getInputFormatChecker(
-      Class inputFormat) {
-    Class result = inputFormatCheckerMap
-        .get(inputFormat);
-    return result;
-  }
-
-  /**
    * checks if files are in same format as the given input format.
    */
   @SuppressWarnings("unchecked")
@@ -189,7 +171,8 @@
       Class inputFormatCls, List files)
       throws HiveException {
     if (files.isEmpty()) return false;
-    Class checkerCls = getInputFormatChecker(inputFormatCls);
+    Class checkerCls = FileChecker.getInstance()
+        .getInputFormatCheckerClass(inputFormatCls);
     if (checkerCls == null
         && inputFormatCls.isAssignableFrom(TextInputFormat.class)) {
       // we get a text input format here, we can not determine a file is text
       // according to file format
@@ -200,11 +183,12 @@
     }
 
     if (checkerCls != null) {
-      InputFormatChecker checkerInstance = inputFormatCheckerInstanceCache.get(checkerCls);
+      InputFormatChecker checkerInstance = FileChecker.getInstance()
+          .getInputFormatCheckerInstance(checkerCls);
       try {
         if (checkerInstance == null) {
           checkerInstance = checkerCls.newInstance();
-          inputFormatCheckerInstanceCache.put(checkerCls, checkerInstance);
+          FileChecker.getInstance().putInputFormatCheckerInstance(checkerCls, checkerInstance);
         }
         return checkerInstance.validateInput(fs, conf, files);
       } catch (Exception e) {
@@ -228,7 +212,7 @@
       }
     }
     if (files2.isEmpty()) return true;
-    Set<Class<? extends InputFormat>> inputFormatter = inputFormatCheckerMap.keySet();
+    Set<Class<? extends InputFormat>> inputFormatter = FileChecker.getInstance().registeredClasses();
     for (Class reg : inputFormatter) {
       boolean result = checkInputFormat(fs, conf, reg, files2);
       if (result) {

http://git-wip-us.apache.org/repos/asf/hive/blob/9fc7442e/ql/src/java/org/apache/hadoop/hive/ql/io/SequenceFileInputFormatChecker.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/SequenceFileInputFormatChecker.java b/ql/src/java/org/apache/hadoop/hive/ql/io/SequenceFileInputFormatChecker.java
index 6cb46c9..f59b838 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/io/SequenceFileInputFormatChecker.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/io/SequenceFileInputFormatChecker.java
@@ -19,7 +19,6 @@
 package org.apache.hadoop.hive.ql.io;
 
 import java.io.IOException;
-import java.util.ArrayList;
 import java.util.List;
 
 import org.apache.hadoop.fs.FileStatus;
@@ -49,7 +48,7 @@ public class SequenceFileInputFormatChecker implements InputFormatChecker {
         reader = null;
       } catch (IOException e) {
         return false;
-      }finally{
+      } finally{
         IOUtils.closeStream(reader);
       }
     }

http://git-wip-us.apache.org/repos/asf/hive/blob/9fc7442e/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java
index 9a61ca0..e3e6893 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java
@@ -334,6 +334,10 @@ public class OrcInputFormat implements InputFormat,
       return false;
     }
     for (FileStatus file : files) {
+      // 0 length files cannot be ORC files
+      if (file.getLen() == 0) {
+        return false;
+      }
       try {
         OrcFile.createReader(file.getPath(),
             OrcFile.readerOptions(conf).filesystem(fs));
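
The FileChecker class above is the core of this change: it replaces two mutable static
registries with a single lazily constructed singleton that owns immutable lookup maps and
a bounded, thread-safe instance cache. The following standalone sketch (illustrative only,
not part of this commit; all class and method names here are hypothetical) distills the
same combination of the initialization-on-demand holder idiom and a Guava Cache:

    import java.util.Map;

    import com.google.common.cache.Cache;
    import com.google.common.cache.CacheBuilder;
    import com.google.common.collect.ImmutableMap;

    public final class RegistrySketch {
      private static final int MAX_CACHE_SIZE = 16;

      // read-only lookup table, built exactly once in the private constructor
      private final Map<String, Class<? extends Runnable>> registry;

      // bounded, thread-safe cache of instances created on demand
      private final Cache<Class<? extends Runnable>, Runnable> instances;

      // the JVM loads the holder class only when getInstance() first touches it,
      // and class initialization is guaranteed to be thread safe
      private static class Holder {
        static final RegistrySketch INSTANCE = new RegistrySketch();
      }

      public static RegistrySketch getInstance() {
        return Holder.INSTANCE;
      }

      private RegistrySketch() {
        registry = ImmutableMap.<String, Class<? extends Runnable>>builder()
            .put("noop", NoOp.class)
            .build();
        instances = CacheBuilder.newBuilder().maximumSize(MAX_CACHE_SIZE).build();
      }

      // look up the registered class, instantiating and caching it on first use
      public Runnable instanceFor(String key) throws Exception {
        Class<? extends Runnable> cls = registry.get(key);
        if (cls == null) {
          return null;
        }
        // get-or-create in one atomic step
        return instances.get(cls, cls::newInstance);
      }

      public static class NoOp implements Runnable {
        @Override
        public void run() { }
      }

      public static void main(String[] args) throws Exception {
        RegistrySketch.getInstance().instanceFor("noop").run();
      }
    }

Compared with the old HashMap-plus-synchronized approach this removes registration races
entirely (the maps can never change after construction) and makes the create-if-absent
step atomic via Cache.get(key, loader).
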
http://git-wip-us.apache.org/repos/asf/hive/blob/9fc7442e/ql/src/java/org/apache/hadoop/hive/ql/parse/LoadSemanticAnalyzer.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/parse/LoadSemanticAnalyzer.java b/ql/src/java/org/apache/hadoop/hive/ql/parse/LoadSemanticAnalyzer.java
index aacfa92..1e88484 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/parse/LoadSemanticAnalyzer.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/parse/LoadSemanticAnalyzer.java
@@ -41,9 +41,7 @@ import org.apache.hadoop.hive.ql.exec.Task;
 import org.apache.hadoop.hive.ql.exec.TaskFactory;
 import org.apache.hadoop.hive.ql.exec.Utilities;
 import org.apache.hadoop.hive.ql.hooks.WriteEntity;
-import org.apache.hadoop.hive.ql.io.FileFormatException;
-import org.apache.hadoop.hive.ql.io.orc.OrcFile;
-import org.apache.hadoop.hive.ql.io.orc.OrcInputFormat;
+import org.apache.hadoop.hive.ql.io.HiveFileFormatUtils;
 import org.apache.hadoop.hive.ql.metadata.Hive;
 import org.apache.hadoop.hive.ql.metadata.HiveException;
 import org.apache.hadoop.hive.ql.metadata.Partition;
@@ -52,6 +50,8 @@ import org.apache.hadoop.hive.ql.plan.MoveWork;
 import org.apache.hadoop.hive.ql.plan.StatsWork;
 import org.apache.hadoop.mapred.InputFormat;
 
+import com.google.common.collect.Lists;
+
 /**
  * LoadSemanticAnalyzer.
 *
@@ -128,7 +128,7 @@ public class LoadSemanticAnalyzer extends BaseSemanticAnalyzer {
     return new URI(fromScheme, fromAuthority, path, null, null);
   }
 
-  private FileStatus[] applyConstraintsAndGetFiles(URI fromURI, URI toURI, Tree ast,
+  private List applyConstraintsAndGetFiles(URI fromURI, Tree ast,
       boolean isLocal) throws SemanticException {
 
     FileStatus[] srcs = null;
@@ -159,7 +159,7 @@
       throw new SemanticException(ErrorMsg.INVALID_PATH.getMsg(ast), e);
     }
 
-    return srcs;
+    return Lists.newArrayList(srcs);
   }
 
   @Override
@@ -214,9 +214,6 @@
       throw new SemanticException(ErrorMsg.LOAD_INTO_STORED_AS_DIR.getMsg());
     }
 
-    URI toURI = ((ts.partHandle != null) ? ts.partHandle.getDataLocation()
-        : ts.tableHandle.getDataLocation()).toUri();
-
     List parts = ts.tableHandle.getPartitionKeys();
     if ((parts != null && parts.size() > 0)
         && (ts.partSpec == null || ts.partSpec.size() == 0)) {
@@ -224,11 +221,12 @@
     }
 
     // make sure the arguments make sense
-    FileStatus[] files = applyConstraintsAndGetFiles(fromURI, toURI, fromTree, isLocal);
+    List files = applyConstraintsAndGetFiles(fromURI, fromTree, isLocal);
 
     // for managed tables, make sure the file formats match
-    if (TableType.MANAGED_TABLE.equals(ts.tableHandle.getTableType())) {
-      ensureFileFormatsMatch(ts, files);
+    if (TableType.MANAGED_TABLE.equals(ts.tableHandle.getTableType())
+        && conf.getBoolVar(HiveConf.ConfVars.HIVECHECKFILEFORMAT)) {
+      ensureFileFormatsMatch(ts, files, fromURI);
     }
     inputs.add(toReadEntity(new Path(fromURI)));
     Task rTask = null;
@@ -326,7 +324,9 @@
     }
   }
 
-  private void ensureFileFormatsMatch(TableSpec ts, FileStatus[] fileStatuses) throws SemanticException {
+  private void ensureFileFormatsMatch(TableSpec ts, List fileStatuses,
+      final URI fromURI)
+      throws SemanticException {
     final Class destInputFormat;
     try {
       if (ts.getPartSpec() == null || ts.getPartSpec().isEmpty()) {
@@ -338,23 +338,16 @@
       throw new SemanticException(e);
     }
 
-    // Other file formats should do similar check to make sure file formats match
-    // when doing LOAD DATA .. INTO TABLE
-    if (OrcInputFormat.class.equals(destInputFormat)) {
-      for (FileStatus fileStatus : fileStatuses) {
-        try {
-          Path filePath = fileStatus.getPath();
-          FileSystem fs = FileSystem.get(filePath.toUri(), conf);
-          // just creating orc reader is going to do sanity checks to make sure its valid ORC file
-          OrcFile.createReader(fs, filePath);
-        } catch (FileFormatException e) {
-          throw new SemanticException(ErrorMsg.INVALID_FILE_FORMAT_IN_LOAD.getMsg("Destination" +
-              " table is stored as ORC but the file being loaded is not a valid ORC file."));
-        } catch (IOException e) {
-          throw new SemanticException("Unable to load data to destination table." +
-              " Error: " + e.getMessage());
-        }
+    try {
+      FileSystem fs = FileSystem.get(fromURI, conf);
+      boolean validFormat = HiveFileFormatUtils.checkInputFormat(fs, conf, destInputFormat,
+          fileStatuses);
+      if (!validFormat) {
+        throw new SemanticException(ErrorMsg.INVALID_FILE_FORMAT_IN_LOAD.getMsg());
       }
+    } catch (Exception e) {
+      throw new SemanticException("Unable to load data to destination table."
+          + " Error: " + e.getMessage());
     }
   }
 }

http://git-wip-us.apache.org/repos/asf/hive/blob/9fc7442e/ql/src/java/org/apache/hadoop/hive/ql/parse/LoadSemanticAnalyzer.java.orig
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/parse/LoadSemanticAnalyzer.java.orig b/ql/src/java/org/apache/hadoop/hive/ql/parse/LoadSemanticAnalyzer.java.orig
index 944cee4..aacfa92 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/parse/LoadSemanticAnalyzer.java.orig
+++ b/ql/src/java/org/apache/hadoop/hive/ql/parse/LoadSemanticAnalyzer.java.orig
@@ -128,9 +128,11 @@ public class LoadSemanticAnalyzer extends BaseSemanticAnalyzer {
     return new URI(fromScheme, fromAuthority, path, null, null);
   }
 
-  private void applyConstraints(URI fromURI, URI toURI, Tree ast,
+  private FileStatus[] applyConstraintsAndGetFiles(URI fromURI, URI toURI, Tree ast,
       boolean isLocal) throws SemanticException {
 
+    FileStatus[] srcs = null;
+
     // local mode implies that scheme should be "file"
     // we can change this going forward
     if (isLocal && !fromURI.getScheme().equals("file")) {
@@ -139,7 +141,7 @@
     }
 
     try {
-      FileStatus[] srcs = matchFilesOrDir(FileSystem.get(fromURI, conf), new Path(fromURI));
+      srcs = matchFilesOrDir(FileSystem.get(fromURI, conf), new Path(fromURI));
       if (srcs == null || srcs.length == 0) {
         throw new SemanticException(ErrorMsg.INVALID_PATH.getMsg(ast,
             "No files matching path " + fromURI));
@@ -157,17 +159,7 @@
       throw new SemanticException(ErrorMsg.INVALID_PATH.getMsg(ast), e);
     }
 
-    // only in 'local' mode do we copy stuff from one place to another.
-    // reject different scheme/authority in other cases.
-    if (!isLocal
-        && (!StringUtils.equals(fromURI.getScheme(), toURI.getScheme()) || !StringUtils
-        .equals(fromURI.getAuthority(), toURI.getAuthority()))) {
-      String reason = "Move from: " + fromURI.toString() + " to: "
-          + toURI.toString() + " is not valid. "
-          + "Please check that values for params \"default.fs.name\" and "
-          + "\"hive.metastore.warehouse.dir\" do not conflict.";
-      throw new SemanticException(ErrorMsg.ILLEGAL_PATH.getMsg(ast, reason));
-    }
+    return srcs;
   }
 
   @Override
@@ -232,11 +224,11 @@
     }
 
     // make sure the arguments make sense
-    applyConstraints(fromURI, toURI, fromTree, isLocal);
+    FileStatus[] files = applyConstraintsAndGetFiles(fromURI, toURI, fromTree, isLocal);
 
     // for managed tables, make sure the file formats match
     if (TableType.MANAGED_TABLE.equals(ts.tableHandle.getTableType())) {
-      ensureFileFormatsMatch(ts, fromURI);
+      ensureFileFormatsMatch(ts, files);
     }
     inputs.add(toReadEntity(new Path(fromURI)));
     Task rTask = null;
@@ -334,7 +326,7 @@
     }
   }
 
-  private void ensureFileFormatsMatch(TableSpec ts, URI fromURI) throws SemanticException {
+  private void ensureFileFormatsMatch(TableSpec ts, FileStatus[] fileStatuses) throws SemanticException {
     final Class destInputFormat;
     try {
       if (ts.getPartSpec() == null || ts.getPartSpec().isEmpty()) {
@@ -349,17 +341,19 @@
     // Other file formats should do similar check to make sure file formats match
     // when doing LOAD DATA .. INTO TABLE
     if (OrcInputFormat.class.equals(destInputFormat)) {
-      Path inputFilePath = new Path(fromURI);
-      try {
-        FileSystem fs = FileSystem.get(fromURI, conf);
-        // just creating orc reader is going to do sanity checks to make sure its valid ORC file
-        OrcFile.createReader(fs, inputFilePath);
-      } catch (FileFormatException e) {
-        throw new SemanticException(ErrorMsg.INVALID_FILE_FORMAT_IN_LOAD.getMsg("Destination" +
-            " table is stored as ORC but the file being loaded is not a valid ORC file."));
-      } catch (IOException e) {
-        throw new SemanticException("Unable to load data to destination table." +
-            " Error: " + e.getMessage());
+      for (FileStatus fileStatus : fileStatuses) {
+        try {
+          Path filePath = fileStatus.getPath();
+          FileSystem fs = FileSystem.get(filePath.toUri(), conf);
+          // just creating orc reader is going to do sanity checks to make sure its valid ORC file
+          OrcFile.createReader(fs, filePath);
+        } catch (FileFormatException e) {
+          throw new SemanticException(ErrorMsg.INVALID_FILE_FORMAT_IN_LOAD.getMsg("Destination" +
+              " table is stored as ORC but the file being loaded is not a valid ORC file."));
+        } catch (IOException e) {
+          throw new SemanticException("Unable to load data to destination table." +
+              " Error: " + e.getMessage());
+        }
       }
     }
   }

http://git-wip-us.apache.org/repos/asf/hive/blob/9fc7442e/ql/src/test/results/clientnegative/archive_corrupt.q.out
----------------------------------------------------------------------
diff --git a/ql/src/test/results/clientnegative/archive_corrupt.q.out b/ql/src/test/results/clientnegative/archive_corrupt.q.out
index 56e8ec4..892fbac 100644
--- a/ql/src/test/results/clientnegative/archive_corrupt.q.out
+++ b/ql/src/test/results/clientnegative/archive_corrupt.q.out
@@ -16,16 +16,4 @@ POSTHOOK: query: create table tstsrcpart like srcpart
 POSTHOOK: type: CREATETABLE
 POSTHOOK: Output: database:default
 POSTHOOK: Output: default@tstsrcpart
-PREHOOK: query: -- EXCLUDE_HADOOP_MAJOR_VERSIONS(0.20)
--- The version of GzipCodec that is provided in Hadoop 0.20 silently ignores
--- file format errors. However, versions of Hadoop that include
--- HADOOP-6835 (e.g. 0.23 and 1.x) cause a Wrong File Format exception
--- to be thrown during the LOAD step. This former behavior is tested
--- in clientpositive/archive_corrupt.q
-
-load data local inpath '../../data/files/archive_corrupt.rc' overwrite into table tstsrcpart partition (ds='2008-04-08', hr='11')
-PREHOOK: type: LOAD
-#### A masked pattern was here ####
-PREHOOK: Output: default@tstsrcpart
-Failed with exception Wrong file format. Please check the file's format.
-FAILED: Execution Error, return code 1 from org.apache.hadoop.hive.ql.exec.MoveTask
+FAILED: SemanticException Unable to load data to destination table. Error: The file that you are trying to load does not match the file format of the destination table.
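
As the golden-file updates beginning above show, a wrong-format LOAD now fails with a
SemanticException at compile time rather than a MoveTask error at execution time. Both
MoveTask and LoadSemanticAnalyzer now funnel through the same generic entry point. A
minimal sketch of a caller (illustrative only; it assumes a resolved HiveConf, the
destination's input format class — ORC here — and the already-matched file list):

    import java.io.IOException;
    import java.net.URI;
    import java.util.List;

    import org.apache.hadoop.fs.FileStatus;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.hive.conf.HiveConf;
    import org.apache.hadoop.hive.ql.io.HiveFileFormatUtils;
    import org.apache.hadoop.hive.ql.io.orc.OrcInputFormat;
    import org.apache.hadoop.hive.ql.metadata.HiveException;

    class LoadFormatCheckSketch {
      // returns true when every matched file passes the InputFormatChecker registered
      // for the destination's input format (ORC's checker also rejects 0-length files)
      static boolean filesMatchDestinationFormat(URI fromURI, HiveConf conf,
          List<FileStatus> files) throws IOException, HiveException {
        FileSystem fs = FileSystem.get(fromURI, conf);
        return HiveFileFormatUtils.checkInputFormat(fs, conf, OrcInputFormat.class, files);
      }
    }

Because the registry in FileChecker maps each input format to its checker, the same call
covers SequenceFile, RCFile, and ORC without any format-specific branching at the call site.
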
http://git-wip-us.apache.org/repos/asf/hive/blob/9fc7442e/ql/src/test/results/clientnegative/load_orc_negative1.q.out
----------------------------------------------------------------------
diff --git a/ql/src/test/results/clientnegative/load_orc_negative1.q.out b/ql/src/test/results/clientnegative/load_orc_negative1.q.out
index ca15a30..d103546 100644
--- a/ql/src/test/results/clientnegative/load_orc_negative1.q.out
+++ b/ql/src/test/results/clientnegative/load_orc_negative1.q.out
@@ -6,4 +6,4 @@ POSTHOOK: query: create table orc_test (userid bigint, string1 string, subtype d
 POSTHOOK: type: CREATETABLE
 POSTHOOK: Output: database:default
 POSTHOOK: Output: default@orc_test
-FAILED: SemanticException [Error 30019]: The file that you are trying to load does not match the file format of the destination table. Destination table is stored as ORC but the file being loaded is not a valid ORC file.
+FAILED: SemanticException Unable to load data to destination table. Error: The file that you are trying to load does not match the file format of the destination table.

http://git-wip-us.apache.org/repos/asf/hive/blob/9fc7442e/ql/src/test/results/clientnegative/load_orc_negative2.q.out
----------------------------------------------------------------------
diff --git a/ql/src/test/results/clientnegative/load_orc_negative2.q.out b/ql/src/test/results/clientnegative/load_orc_negative2.q.out
index 77fb50e..9b0cb69 100644
--- a/ql/src/test/results/clientnegative/load_orc_negative2.q.out
+++ b/ql/src/test/results/clientnegative/load_orc_negative2.q.out
@@ -22,4 +22,4 @@ POSTHOOK: query: create table orc_test (userid bigint, string1 string, subtype d
 POSTHOOK: type: CREATETABLE
 POSTHOOK: Output: database:default
 POSTHOOK: Output: default@orc_test
-FAILED: SemanticException [Error 30019]: The file that you are trying to load does not match the file format of the destination table. Destination table is stored as ORC but the file being loaded is not a valid ORC file.
+FAILED: SemanticException Unable to load data to destination table. Error: The file that you are trying to load does not match the file format of the destination table.

http://git-wip-us.apache.org/repos/asf/hive/blob/9fc7442e/ql/src/test/results/clientnegative/load_orc_negative3.q.out
----------------------------------------------------------------------
diff --git a/ql/src/test/results/clientnegative/load_orc_negative3.q.out b/ql/src/test/results/clientnegative/load_orc_negative3.q.out
index 77fb50e..9b0cb69 100644
--- a/ql/src/test/results/clientnegative/load_orc_negative3.q.out
+++ b/ql/src/test/results/clientnegative/load_orc_negative3.q.out
@@ -22,4 +22,4 @@ POSTHOOK: query: create table orc_test (userid bigint, string1 string, subtype d
 POSTHOOK: type: CREATETABLE
 POSTHOOK: Output: database:default
 POSTHOOK: Output: default@orc_test
-FAILED: SemanticException [Error 30019]: The file that you are trying to load does not match the file format of the destination table. Destination table is stored as ORC but the file being loaded is not a valid ORC file.
+FAILED: SemanticException Unable to load data to destination table. Error: The file that you are trying to load does not match the file format of the destination table.
http://git-wip-us.apache.org/repos/asf/hive/blob/9fc7442e/ql/src/test/results/clientnegative/load_orc_negative_part.q.out
----------------------------------------------------------------------
diff --git a/ql/src/test/results/clientnegative/load_orc_negative_part.q.out b/ql/src/test/results/clientnegative/load_orc_negative_part.q.out
index 32dd627..2e8068d 100644
--- a/ql/src/test/results/clientnegative/load_orc_negative_part.q.out
+++ b/ql/src/test/results/clientnegative/load_orc_negative_part.q.out
@@ -49,4 +49,4 @@ POSTHOOK: query: alter table orc_test add partition(ds='11')
 POSTHOOK: type: ALTERTABLE_ADDPARTS
 POSTHOOK: Output: default@orc_test
 POSTHOOK: Output: default@orc_test@ds=11
-FAILED: SemanticException [Error 30019]: The file that you are trying to load does not match the file format of the destination table. Destination table is stored as ORC but the file being loaded is not a valid ORC file.
+FAILED: SemanticException Unable to load data to destination table. Error: The file that you are trying to load does not match the file format of the destination table.

http://git-wip-us.apache.org/repos/asf/hive/blob/9fc7442e/ql/src/test/results/clientnegative/load_wrong_fileformat.q.out
----------------------------------------------------------------------
diff --git a/ql/src/test/results/clientnegative/load_wrong_fileformat.q.out b/ql/src/test/results/clientnegative/load_wrong_fileformat.q.out
index 732eb22..8ec0058 100644
--- a/ql/src/test/results/clientnegative/load_wrong_fileformat.q.out
+++ b/ql/src/test/results/clientnegative/load_wrong_fileformat.q.out
@@ -14,9 +14,4 @@ CREATE TABLE load_wrong_fileformat_T1(name STRING) STORED AS SEQUENCEFILE
 POSTHOOK: type: CREATETABLE
 POSTHOOK: Output: database:default
 POSTHOOK: Output: default@load_wrong_fileformat_T1
-PREHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/kv1.txt' INTO TABLE load_wrong_fileformat_T1
-PREHOOK: type: LOAD
-#### A masked pattern was here ####
-PREHOOK: Output: default@load_wrong_fileformat_t1
-Failed with exception Wrong file format. Please check the file's format.
-FAILED: Execution Error, return code 1 from org.apache.hadoop.hive.ql.exec.MoveTask
+FAILED: SemanticException Unable to load data to destination table. Error: The file that you are trying to load does not match the file format of the destination table.

http://git-wip-us.apache.org/repos/asf/hive/blob/9fc7442e/ql/src/test/results/clientnegative/load_wrong_fileformat_rc_seq.q.out
----------------------------------------------------------------------
diff --git a/ql/src/test/results/clientnegative/load_wrong_fileformat_rc_seq.q.out b/ql/src/test/results/clientnegative/load_wrong_fileformat_rc_seq.q.out
index b68b8e6..916eca4 100644
--- a/ql/src/test/results/clientnegative/load_wrong_fileformat_rc_seq.q.out
+++ b/ql/src/test/results/clientnegative/load_wrong_fileformat_rc_seq.q.out
@@ -14,9 +14,4 @@ CREATE TABLE T1(name STRING) STORED AS RCFILE
 POSTHOOK: type: CREATETABLE
 POSTHOOK: Output: database:default
 POSTHOOK: Output: default@T1
-PREHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/kv1.seq' INTO TABLE T1
-PREHOOK: type: LOAD
-#### A masked pattern was here ####
-PREHOOK: Output: default@t1
-Failed with exception Wrong file format. Please check the file's format.
-FAILED: Execution Error, return code 1 from org.apache.hadoop.hive.ql.exec.MoveTask
+FAILED: SemanticException Unable to load data to destination table. Error: The file that you are trying to load does not match the file format of the destination table.
http://git-wip-us.apache.org/repos/asf/hive/blob/9fc7442e/ql/src/test/results/clientnegative/load_wrong_fileformat_txt_seq.q.out
----------------------------------------------------------------------
diff --git a/ql/src/test/results/clientnegative/load_wrong_fileformat_txt_seq.q.out b/ql/src/test/results/clientnegative/load_wrong_fileformat_txt_seq.q.out
index 179a654..645ece6 100644
--- a/ql/src/test/results/clientnegative/load_wrong_fileformat_txt_seq.q.out
+++ b/ql/src/test/results/clientnegative/load_wrong_fileformat_txt_seq.q.out
@@ -14,9 +14,4 @@ CREATE TABLE T1(name STRING) STORED AS TEXTFILE
 POSTHOOK: type: CREATETABLE
 POSTHOOK: Output: database:default
 POSTHOOK: Output: default@T1
-PREHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/kv1.seq' INTO TABLE T1
-PREHOOK: type: LOAD
-#### A masked pattern was here ####
-PREHOOK: Output: default@t1
-Failed with exception Wrong file format. Please check the file's format.
-FAILED: Execution Error, return code 1 from org.apache.hadoop.hive.ql.exec.MoveTask
+FAILED: SemanticException Unable to load data to destination table. Error: The file that you are trying to load does not match the file format of the destination table.