hive-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From gunt...@apache.org
Subject hive git commit: HIVE-11118: Load data query should validate file formats with destination tables (Prasanth Jayachandran via Gunther Hagleitner)
Date Sat, 27 Jun 2015 01:50:10 GMT
Repository: hive
Updated Branches:
  refs/heads/branch-1 9b52ab52e -> 49da35903


HIVE-11118: Load data query should validate file formats with destination tables (Prasanth
Jayachandran via Gunther Hagleitner)


Project: http://git-wip-us.apache.org/repos/asf/hive/repo
Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/49da3590
Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/49da3590
Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/49da3590

Branch: refs/heads/branch-1
Commit: 49da35903f8334d6dd0c597563c34388772914cc
Parents: 9b52ab5
Author: Gunther Hagleitner <gunther@apache.org>
Authored: Fri Jun 26 18:48:40 2015 -0700
Committer: Gunther Hagleitner <gunther@apache.org>
Committed: Fri Jun 26 18:49:41 2015 -0700

----------------------------------------------------------------------
 .../org/apache/hadoop/hive/ql/ErrorMsg.java     | 15 ++---
 .../hadoop/hive/ql/io/FileFormatException.java  | 30 ++++++++++
 .../hadoop/hive/ql/io/orc/ReaderImpl.java       | 12 ++--
 .../hive/ql/parse/LoadSemanticAnalyzer.java     | 58 +++++++++++++++-----
 .../queries/clientnegative/load_orc_negative1.q |  4 ++
 .../queries/clientnegative/load_orc_negative2.q |  6 ++
 ql/src/test/queries/clientpositive/load_orc.q   | 10 ++++
 .../clientnegative/load_orc_negative1.q.out     |  9 +++
 .../clientnegative/load_orc_negative2.q.out     | 25 +++++++++
 .../test/results/clientpositive/load_orc.q.out  | 43 +++++++++++++++
 10 files changed, 184 insertions(+), 28 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/hive/blob/49da3590/ql/src/java/org/apache/hadoop/hive/ql/ErrorMsg.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/ErrorMsg.java b/ql/src/java/org/apache/hadoop/hive/ql/ErrorMsg.java
index e1cbaa6..20509ce 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/ErrorMsg.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/ErrorMsg.java
@@ -18,18 +18,18 @@
 
 package org.apache.hadoop.hive.ql;
 
-import org.antlr.runtime.tree.Tree;
-import org.apache.hadoop.hive.conf.HiveConf;
-import org.apache.hadoop.hive.ql.metadata.HiveUtils;
-import org.apache.hadoop.hive.ql.parse.ASTNode;
-import org.apache.hadoop.hive.ql.parse.ASTNodeOrigin;
-
 import java.text.MessageFormat;
 import java.util.HashMap;
 import java.util.Map;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 
+import org.antlr.runtime.tree.Tree;
+import org.apache.hadoop.hive.conf.HiveConf;
+import org.apache.hadoop.hive.ql.metadata.HiveUtils;
+import org.apache.hadoop.hive.ql.parse.ASTNode;
+import org.apache.hadoop.hive.ql.parse.ASTNodeOrigin;
+
 /**
  * List of all error messages.
  * This list contains both compile time and run-time errors.
@@ -493,7 +493,8 @@ public enum ErrorMsg {
  ORC_CORRUPTED_READ(30018, "Corruption in ORC data encountered. To skip reading corrupted "
       + "data, set " + HiveConf.ConfVars.HIVE_ORC_SKIP_CORRUPT_DATA + " to true"),
 
-
+  INVALID_FILE_FORMAT_IN_LOAD(30019, "The file that you are trying to load does not match the" +
+      " file format of the destination table.")
 
   ;
 

http://git-wip-us.apache.org/repos/asf/hive/blob/49da3590/ql/src/java/org/apache/hadoop/hive/ql/io/FileFormatException.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/FileFormatException.java b/ql/src/java/org/apache/hadoop/hive/ql/io/FileFormatException.java
new file mode 100644
index 0000000..12417aa
--- /dev/null
+++ b/ql/src/java/org/apache/hadoop/hive/ql/io/FileFormatException.java
@@ -0,0 +1,30 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * <p/>
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * <p/>
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hive.ql.io;
+
+import java.io.IOException;
+
+/**
+ * Thrown when an invalid file format is encountered.
+ */
+public class FileFormatException extends IOException {
+
+  public FileFormatException(String errMsg) {
+    super(errMsg);
+  }
+}

http://git-wip-us.apache.org/repos/asf/hive/blob/49da3590/ql/src/java/org/apache/hadoop/hive/ql/io/orc/ReaderImpl.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/ReaderImpl.java b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/ReaderImpl.java
index 50f417b..bbc4654 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/ReaderImpl.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/ReaderImpl.java
@@ -18,8 +18,6 @@
 
 package org.apache.hadoop.hive.ql.io.orc;
 
-import static org.apache.hadoop.hive.conf.HiveConf.ConfVars.HIVE_ORC_ZEROCOPY;
-
 import java.io.IOException;
 import java.io.InputStream;
 import java.nio.ByteBuffer;
@@ -36,14 +34,14 @@ import org.apache.hadoop.fs.FSDataInputStream;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.hive.common.DiskRange;
-import org.apache.hadoop.hive.conf.HiveConf;
+import org.apache.hadoop.hive.ql.io.FileFormatException;
 import org.apache.hadoop.hive.ql.io.orc.OrcProto.Type;
-import org.apache.hadoop.hive.ql.io.sarg.SearchArgument;
 import org.apache.hadoop.hive.ql.io.orc.OrcProto.UserMetadataItem;
+import org.apache.hadoop.hive.ql.io.orc.RecordReaderImpl.BufferChunk;
+import org.apache.hadoop.hive.ql.io.sarg.SearchArgument;
 import org.apache.hadoop.hive.ql.util.JavaDataModel;
 import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
 import org.apache.hadoop.io.Text;
-import org.apache.hadoop.hive.ql.io.orc.RecordReaderImpl.BufferChunk;
 
 import com.google.common.collect.Lists;
 import com.google.common.collect.Sets;
@@ -232,7 +230,7 @@ public class ReaderImpl implements Reader {
                                       ByteBuffer buffer) throws IOException {
     int len = OrcFile.MAGIC.length();
     if (psLen < len + 1) {
-      throw new IOException("Malformed ORC file " + path +
+      throw new FileFormatException("Malformed ORC file " + path +
           ". Invalid postscript length " + psLen);
     }
     int offset = buffer.arrayOffset() + buffer.position() + buffer.limit() - 1
@@ -247,7 +245,7 @@ public class ReaderImpl implements Reader {
       in.readFully(header, 0, len);
       // if it isn't there, this isn't an ORC file
       if (!Text.decode(header, 0 , len).equals(OrcFile.MAGIC)) {
-        throw new IOException("Malformed ORC file " + path +
+        throw new FileFormatException("Malformed ORC file " + path +
             ". Invalid postscript.");
       }
     }

http://git-wip-us.apache.org/repos/asf/hive/blob/49da3590/ql/src/java/org/apache/hadoop/hive/ql/parse/LoadSemanticAnalyzer.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/parse/LoadSemanticAnalyzer.java b/ql/src/java/org/apache/hadoop/hive/ql/parse/LoadSemanticAnalyzer.java
index 1a9b42b..187dc20 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/parse/LoadSemanticAnalyzer.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/parse/LoadSemanticAnalyzer.java
@@ -18,6 +18,14 @@
 
 package org.apache.hadoop.hive.ql.parse;
 
+import java.io.IOException;
+import java.io.Serializable;
+import java.net.URI;
+import java.net.URISyntaxException;
+import java.util.LinkedHashMap;
+import java.util.List;
+import java.util.Map;
+
 import org.antlr.runtime.tree.Tree;
 import org.apache.commons.httpclient.util.URIUtil;
 import org.apache.commons.lang.StringUtils;
@@ -26,26 +34,23 @@ import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.fs.PathFilter;
 import org.apache.hadoop.hive.conf.HiveConf;
+import org.apache.hadoop.hive.metastore.TableType;
 import org.apache.hadoop.hive.metastore.api.FieldSchema;
 import org.apache.hadoop.hive.ql.ErrorMsg;
 import org.apache.hadoop.hive.ql.exec.Task;
 import org.apache.hadoop.hive.ql.exec.TaskFactory;
 import org.apache.hadoop.hive.ql.exec.Utilities;
 import org.apache.hadoop.hive.ql.hooks.WriteEntity;
+import org.apache.hadoop.hive.ql.io.FileFormatException;
+import org.apache.hadoop.hive.ql.io.orc.OrcFile;
+import org.apache.hadoop.hive.ql.io.orc.OrcInputFormat;
 import org.apache.hadoop.hive.ql.metadata.Hive;
 import org.apache.hadoop.hive.ql.metadata.HiveException;
 import org.apache.hadoop.hive.ql.metadata.Partition;
 import org.apache.hadoop.hive.ql.plan.LoadTableDesc;
 import org.apache.hadoop.hive.ql.plan.MoveWork;
 import org.apache.hadoop.hive.ql.plan.StatsWork;
-
-import java.io.IOException;
-import java.io.Serializable;
-import java.net.URI;
-import java.net.URISyntaxException;
-import java.util.LinkedHashMap;
-import java.util.List;
-import java.util.Map;
+import org.apache.hadoop.mapred.InputFormat;
 
 /**
  * LoadSemanticAnalyzer.
@@ -60,12 +65,12 @@ public class LoadSemanticAnalyzer extends BaseSemanticAnalyzer {
   public static FileStatus[] matchFilesOrDir(FileSystem fs, Path path)
       throws IOException {
     FileStatus[] srcs = fs.globStatus(path, new PathFilter() {
-              @Override
-              public boolean accept(Path p) {
-                String name = p.getName();
                return name.equals("_metadata") ? true : !name.startsWith("_") && !name.startsWith(".");
-              }
-            });
+      @Override
+      public boolean accept(Path p) {
+        String name = p.getName();
+        return name.equals("_metadata") ? true : !name.startsWith("_") && !name.startsWith(".");
+      }
+    });
     if ((srcs != null) && srcs.length == 1) {
       if (srcs[0].isDir()) {
         srcs = fs.listStatus(srcs[0].getPath(), new PathFilter() {
@@ -228,6 +233,11 @@ public class LoadSemanticAnalyzer extends BaseSemanticAnalyzer {
 
     // make sure the arguments make sense
     applyConstraints(fromURI, toURI, fromTree, isLocal);
+
+    // for managed tables, make sure the file formats match
+    if (TableType.MANAGED_TABLE.equals(ts.tableHandle.getTableType())) {
+      ensureFileFormatsMatch(ts, fromURI);
+    }
     inputs.add(toReadEntity(new Path(fromURI)));
     Task<? extends Serializable> rTask = null;
 
@@ -323,4 +333,24 @@ public class LoadSemanticAnalyzer extends BaseSemanticAnalyzer {
       childTask.addDependentTask(statTask);
     }
   }
+
+  private void ensureFileFormatsMatch(TableSpec ts, URI fromURI) throws SemanticException {
+    Class<? extends InputFormat> destInputFormat = ts.tableHandle.getInputFormatClass();
+    // Other file formats should do similar check to make sure file formats match
+    // when doing LOAD DATA .. INTO TABLE
+    if (OrcInputFormat.class.equals(destInputFormat)) {
+      Path inputFilePath = new Path(fromURI);
+      try {
+        FileSystem fs = FileSystem.get(fromURI, conf);
+        // just creating orc reader is going to do sanity checks to make sure its valid ORC file
+        OrcFile.createReader(fs, inputFilePath);
+      } catch (FileFormatException e) {
+        throw new SemanticException(ErrorMsg.INVALID_FILE_FORMAT_IN_LOAD.getMsg("Destination" +
+            " table is stored as ORC but the file being loaded is not a valid ORC file."));
+      } catch (IOException e) {
+        throw new SemanticException("Unable to load data to destination table." +
+            " Error: " + e.getMessage());
+      }
+    }
+  }
 }

http://git-wip-us.apache.org/repos/asf/hive/blob/49da3590/ql/src/test/queries/clientnegative/load_orc_negative1.q
----------------------------------------------------------------------
diff --git a/ql/src/test/queries/clientnegative/load_orc_negative1.q b/ql/src/test/queries/clientnegative/load_orc_negative1.q
new file mode 100644
index 0000000..9edb2f9
--- /dev/null
+++ b/ql/src/test/queries/clientnegative/load_orc_negative1.q
@@ -0,0 +1,4 @@
+set hive.default.fileformat=ORC;
+create table orc_test (userid bigint, string1 string, subtype double, decimal1 decimal, ts timestamp);
+
+load data local inpath '../../data/files/kv1.txt' into table orc_test;

http://git-wip-us.apache.org/repos/asf/hive/blob/49da3590/ql/src/test/queries/clientnegative/load_orc_negative2.q
----------------------------------------------------------------------
diff --git a/ql/src/test/queries/clientnegative/load_orc_negative2.q b/ql/src/test/queries/clientnegative/load_orc_negative2.q
new file mode 100644
index 0000000..b044c9d
--- /dev/null
+++ b/ql/src/test/queries/clientnegative/load_orc_negative2.q
@@ -0,0 +1,6 @@
+create table text_test (userid bigint, string1 string, subtype double, decimal1 decimal, ts timestamp);
+load data local inpath '../../data/files/kv1.txt' into table text_test;
+
+set hive.default.fileformat=ORC;
+create table orc_test (userid bigint, string1 string, subtype double, decimal1 decimal, ts timestamp);
+load data inpath '${hiveconf:hive.metastore.warehouse.dir}/text_test/kv1.txt' into table orc_test;

http://git-wip-us.apache.org/repos/asf/hive/blob/49da3590/ql/src/test/queries/clientpositive/load_orc.q
----------------------------------------------------------------------
diff --git a/ql/src/test/queries/clientpositive/load_orc.q b/ql/src/test/queries/clientpositive/load_orc.q
new file mode 100644
index 0000000..2eaf098
--- /dev/null
+++ b/ql/src/test/queries/clientpositive/load_orc.q
@@ -0,0 +1,10 @@
+set hive.default.fileformat=ORC;
+create table orc_staging (userid bigint, string1 string, subtype double, decimal1 decimal, ts timestamp);
+create table orc_test (userid bigint, string1 string, subtype double, decimal1 decimal, ts timestamp);
+
+load data local inpath '../../data/files/orc_split_elim.orc' into table orc_staging;
+dfs -ls ${hiveconf:hive.metastore.warehouse.dir}/orc_staging/;
+
+load data inpath '${hiveconf:hive.metastore.warehouse.dir}/orc_staging/orc_split_elim.orc' into table orc_test;
+load data local inpath '../../data/files/orc_split_elim.orc' into table orc_test;
+dfs -ls ${hiveconf:hive.metastore.warehouse.dir}/orc_test/;

http://git-wip-us.apache.org/repos/asf/hive/blob/49da3590/ql/src/test/results/clientnegative/load_orc_negative1.q.out
----------------------------------------------------------------------
diff --git a/ql/src/test/results/clientnegative/load_orc_negative1.q.out b/ql/src/test/results/clientnegative/load_orc_negative1.q.out
new file mode 100644
index 0000000..ca15a30
--- /dev/null
+++ b/ql/src/test/results/clientnegative/load_orc_negative1.q.out
@@ -0,0 +1,9 @@
+PREHOOK: query: create table orc_test (userid bigint, string1 string, subtype double, decimal1
decimal, ts timestamp)
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@orc_test
+POSTHOOK: query: create table orc_test (userid bigint, string1 string, subtype double, decimal1
decimal, ts timestamp)
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@orc_test
+FAILED: SemanticException [Error 30019]: The file that you are trying to load does not match
the file format of the destination table. Destination table is stored as ORC but the file
being loaded is not a valid ORC file.

http://git-wip-us.apache.org/repos/asf/hive/blob/49da3590/ql/src/test/results/clientnegative/load_orc_negative2.q.out
----------------------------------------------------------------------
diff --git a/ql/src/test/results/clientnegative/load_orc_negative2.q.out b/ql/src/test/results/clientnegative/load_orc_negative2.q.out
new file mode 100644
index 0000000..77fb50e
--- /dev/null
+++ b/ql/src/test/results/clientnegative/load_orc_negative2.q.out
@@ -0,0 +1,25 @@
+PREHOOK: query: create table text_test (userid bigint, string1 string, subtype double, decimal1
decimal, ts timestamp)
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@text_test
+POSTHOOK: query: create table text_test (userid bigint, string1 string, subtype double, decimal1
decimal, ts timestamp)
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@text_test
+PREHOOK: query: load data local inpath '../../data/files/kv1.txt' into table text_test
+PREHOOK: type: LOAD
+#### A masked pattern was here ####
+PREHOOK: Output: default@text_test
+POSTHOOK: query: load data local inpath '../../data/files/kv1.txt' into table text_test
+POSTHOOK: type: LOAD
+#### A masked pattern was here ####
+POSTHOOK: Output: default@text_test
+PREHOOK: query: create table orc_test (userid bigint, string1 string, subtype double, decimal1
decimal, ts timestamp)
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@orc_test
+POSTHOOK: query: create table orc_test (userid bigint, string1 string, subtype double, decimal1
decimal, ts timestamp)
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@orc_test
+FAILED: SemanticException [Error 30019]: The file that you are trying to load does not match
the file format of the destination table. Destination table is stored as ORC but the file
being loaded is not a valid ORC file.

http://git-wip-us.apache.org/repos/asf/hive/blob/49da3590/ql/src/test/results/clientpositive/load_orc.q.out
----------------------------------------------------------------------
diff --git a/ql/src/test/results/clientpositive/load_orc.q.out b/ql/src/test/results/clientpositive/load_orc.q.out
new file mode 100644
index 0000000..b0835de
--- /dev/null
+++ b/ql/src/test/results/clientpositive/load_orc.q.out
@@ -0,0 +1,43 @@
+PREHOOK: query: create table orc_staging (userid bigint, string1 string, subtype double,
decimal1 decimal, ts timestamp)
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@orc_staging
+POSTHOOK: query: create table orc_staging (userid bigint, string1 string, subtype double,
decimal1 decimal, ts timestamp)
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@orc_staging
+PREHOOK: query: create table orc_test (userid bigint, string1 string, subtype double, decimal1
decimal, ts timestamp)
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@orc_test
+POSTHOOK: query: create table orc_test (userid bigint, string1 string, subtype double, decimal1
decimal, ts timestamp)
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@orc_test
+PREHOOK: query: load data local inpath '../../data/files/orc_split_elim.orc' into table orc_staging
+PREHOOK: type: LOAD
+#### A masked pattern was here ####
+PREHOOK: Output: default@orc_staging
+POSTHOOK: query: load data local inpath '../../data/files/orc_split_elim.orc' into table
orc_staging
+POSTHOOK: type: LOAD
+#### A masked pattern was here ####
+POSTHOOK: Output: default@orc_staging
+Found 1 items
+#### A masked pattern was here ####
+PREHOOK: type: LOAD
+#### A masked pattern was here ####
+PREHOOK: Output: default@orc_test
+#### A masked pattern was here ####
+POSTHOOK: type: LOAD
+#### A masked pattern was here ####
+POSTHOOK: Output: default@orc_test
+PREHOOK: query: load data local inpath '../../data/files/orc_split_elim.orc' into table orc_test
+PREHOOK: type: LOAD
+#### A masked pattern was here ####
+PREHOOK: Output: default@orc_test
+POSTHOOK: query: load data local inpath '../../data/files/orc_split_elim.orc' into table
orc_test
+POSTHOOK: type: LOAD
+#### A masked pattern was here ####
+POSTHOOK: Output: default@orc_test
+Found 2 items
+#### A masked pattern was here ####


Mime
View raw message