hive-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From c..@apache.org
Subject svn commit: r1340256 - in /hive/trunk: contrib/src/test/queries/clientnegative/ contrib/src/test/queries/clientpositive/ contrib/src/test/results/clientnegative/ contrib/src/test/results/clientpositive/ ql/src/test/queries/clientnegative/ ql/src/test/q...
Date Fri, 18 May 2012 21:19:51 GMT
Author: cws
Date: Fri May 18 21:19:50 2012
New Revision: 1340256

URL: http://svn.apache.org/viewvc?rev=1340256&view=rev
Log:
HIVE-1719 [jira] Move RegexSerDe out of hive-contrib and over to hive-serde
(Shreepadma Venugopalan via Carl Steinbach)

Summary:
Regex Serde Changes

RegexSerDe is as much a part of the standard Hive distribution as the other SerDes
currently in hive-serde. I think we should move it over to the hive-serde module so that
users don't have to go to the added effort of manually registering the contrib jar before
using it.

Test Plan: EMPTY

Reviewers: JIRA, cwsteinbach

Reviewed By: cwsteinbach

Differential Revision: https://reviews.facebook.net/D3249

Added:
    hive/trunk/ql/src/test/queries/clientnegative/serde_regex.q
    hive/trunk/ql/src/test/queries/clientnegative/serde_regex2.q
    hive/trunk/ql/src/test/queries/clientnegative/serde_regex3.q
    hive/trunk/ql/src/test/queries/clientpositive/serde_regex.q
      - copied, changed from r1340252, hive/trunk/contrib/src/test/queries/clientpositive/serde_regex.q
    hive/trunk/ql/src/test/results/clientnegative/serde_regex.q.out
    hive/trunk/ql/src/test/results/clientnegative/serde_regex2.q.out
    hive/trunk/ql/src/test/results/clientnegative/serde_regex3.q.out
    hive/trunk/ql/src/test/results/clientpositive/serde_regex.q.out
      - copied, changed from r1340252, hive/trunk/contrib/src/test/results/clientpositive/serde_regex.q.out
    hive/trunk/serde/src/java/org/apache/hadoop/hive/serde2/RegexSerDe.java   (with props)
Modified:
    hive/trunk/contrib/src/test/queries/clientnegative/serde_regex.q
    hive/trunk/contrib/src/test/queries/clientpositive/serde_regex.q
    hive/trunk/contrib/src/test/results/clientnegative/serde_regex.q.out
    hive/trunk/contrib/src/test/results/clientpositive/serde_regex.q.out

Modified: hive/trunk/contrib/src/test/queries/clientnegative/serde_regex.q
URL: http://svn.apache.org/viewvc/hive/trunk/contrib/src/test/queries/clientnegative/serde_regex.q?rev=1340256&r1=1340255&r2=1340256&view=diff
==============================================================================
--- hive/trunk/contrib/src/test/queries/clientnegative/serde_regex.q (original)
+++ hive/trunk/contrib/src/test/queries/clientnegative/serde_regex.q Fri May 18 21:19:50 2012
@@ -1,7 +1,8 @@
 add jar ${system:build.dir}/hive-contrib-${system:hive.version}.jar;
 
-DROP TABLE serde_regex;
+USE default;
 
+--  This should fail because Regex SerDe supports only columns of type string
 EXPLAIN
 CREATE TABLE serde_regex(
   host STRING,
@@ -35,11 +36,4 @@ WITH SERDEPROPERTIES (
   "input.regex" = "([^ ]*) ([^ ]*) ([^ ]*) (-|\\[[^\\]]*\\]) ([^ \"]*|\"[^\"]*\") (-|[0-9]*)
(-|[0-9]*)(?: ([^ \"]*|\"[^\"]*\") ([^ \"]*|\"[^\"]*\"))?",
   "output.format.string" = "%1$s %2$s %3$s %4$s %5$s %6$s %7$s %8$s %9$s"
 )
-STORED AS TEXTFILE;
-
-LOAD DATA LOCAL INPATH "../data/files/apache.access.log" INTO TABLE serde_regex;
-LOAD DATA LOCAL INPATH "../data/files/apache.access.2.log" INTO TABLE serde_regex;
-
-SELECT * FROM serde_regex ORDER BY time;
-
-DROP TABLE serde_regex;
+STORED AS TEXTFILE;
\ No newline at end of file

Modified: hive/trunk/contrib/src/test/queries/clientpositive/serde_regex.q
URL: http://svn.apache.org/viewvc/hive/trunk/contrib/src/test/queries/clientpositive/serde_regex.q?rev=1340256&r1=1340255&r2=1340256&view=diff
==============================================================================
--- hive/trunk/contrib/src/test/queries/clientpositive/serde_regex.q (original)
+++ hive/trunk/contrib/src/test/queries/clientpositive/serde_regex.q Fri May 18 21:19:50 2012
@@ -1,7 +1,5 @@
 add jar ${system:build.dir}/hive-contrib-${system:hive.version}.jar;
 
-DROP TABLE serde_regex;
-
 EXPLAIN
 CREATE TABLE serde_regex(
   host STRING,
@@ -15,7 +13,7 @@ CREATE TABLE serde_regex(
   agent STRING)
 ROW FORMAT SERDE 'org.apache.hadoop.hive.contrib.serde2.RegexSerDe'
 WITH SERDEPROPERTIES (
-  "input.regex" = "([^ ]*) ([^ ]*) ([^ ]*) (-|\\[[^\\]]*\\]) ([^ \"]*|\"[^\"]*\") (-|[0-9]*)
(-|[0-9]*)(?: ([^ \"]*|\"[^\"]*\") ([^ \"]*|\"[^\"]*\"))?",
+  "input.regex" = "([^ ]*) ([^ ]*) ([^ ]*) (-|\\[[^\\]]*\\]) ([^ \"]*|\"[^\"]*\") (-|[0-9]*)
(-|[0-9]*)(?: ([^ \"]*|\"[^\"]*\") ([^ \"]*|\"[^\"]*\"))?", 
   "output.format.string" = "%1$s %2$s %3$s %4$s %5$s %6$s %7$s %8$s %9$s"
 )
 STORED AS TEXTFILE;
@@ -40,6 +38,4 @@ STORED AS TEXTFILE;
 LOAD DATA LOCAL INPATH "../data/files/apache.access.log" INTO TABLE serde_regex;
 LOAD DATA LOCAL INPATH "../data/files/apache.access.2.log" INTO TABLE serde_regex;
 
-SELECT * FROM serde_regex ORDER BY time;
-
-DROP TABLE serde_regex;
+SELECT * FROM serde_regex ORDER BY time;
\ No newline at end of file

Modified: hive/trunk/contrib/src/test/results/clientnegative/serde_regex.q.out
URL: http://svn.apache.org/viewvc/hive/trunk/contrib/src/test/results/clientnegative/serde_regex.q.out?rev=1340256&r1=1340255&r2=1340256&view=diff
==============================================================================
--- hive/trunk/contrib/src/test/results/clientnegative/serde_regex.q.out (original)
+++ hive/trunk/contrib/src/test/results/clientnegative/serde_regex.q.out Fri May 18 21:19:50 2012
@@ -1,8 +1,9 @@
-PREHOOK: query: DROP TABLE serde_regex
-PREHOOK: type: DROPTABLE
-POSTHOOK: query: DROP TABLE serde_regex
-POSTHOOK: type: DROPTABLE
-PREHOOK: query: EXPLAIN
+PREHOOK: query: USE default
+PREHOOK: type: SWITCHDATABASE
+POSTHOOK: query: USE default
+POSTHOOK: type: SWITCHDATABASE
+PREHOOK: query: --  This should fail because Regex SerDe supports only columns of type string
+EXPLAIN
 CREATE TABLE serde_regex(
   host STRING,
   identity STRING,
@@ -20,7 +21,8 @@ WITH SERDEPROPERTIES (
 )
 STORED AS TEXTFILE
 PREHOOK: type: CREATETABLE
-POSTHOOK: query: EXPLAIN
+POSTHOOK: query: --  This should fail because Regex SerDe supports only columns of type string
+EXPLAIN
 CREATE TABLE serde_regex(
   host STRING,
   identity STRING,

Modified: hive/trunk/contrib/src/test/results/clientpositive/serde_regex.q.out
URL: http://svn.apache.org/viewvc/hive/trunk/contrib/src/test/results/clientpositive/serde_regex.q.out?rev=1340256&r1=1340255&r2=1340256&view=diff
==============================================================================
--- hive/trunk/contrib/src/test/results/clientpositive/serde_regex.q.out (original)
+++ hive/trunk/contrib/src/test/results/clientpositive/serde_regex.q.out Fri May 18 21:19:50 2012
@@ -1,7 +1,3 @@
-PREHOOK: query: DROP TABLE serde_regex
-PREHOOK: type: DROPTABLE
-POSTHOOK: query: DROP TABLE serde_regex
-POSTHOOK: type: DROPTABLE
 PREHOOK: query: EXPLAIN
 CREATE TABLE serde_regex(
   host STRING,
@@ -15,7 +11,7 @@ CREATE TABLE serde_regex(
   agent STRING)
 ROW FORMAT SERDE 'org.apache.hadoop.hive.contrib.serde2.RegexSerDe'
 WITH SERDEPROPERTIES (
-  "input.regex" = "([^ ]*) ([^ ]*) ([^ ]*) (-|\\[[^\\]]*\\]) ([^ \"]*|\"[^\"]*\") (-|[0-9]*)
(-|[0-9]*)(?: ([^ \"]*|\"[^\"]*\") ([^ \"]*|\"[^\"]*\"))?",
+  "input.regex" = "([^ ]*) ([^ ]*) ([^ ]*) (-|\\[[^\\]]*\\]) ([^ \"]*|\"[^\"]*\") (-|[0-9]*)
(-|[0-9]*)(?: ([^ \"]*|\"[^\"]*\") ([^ \"]*|\"[^\"]*\"))?", 
   "output.format.string" = "%1$s %2$s %3$s %4$s %5$s %6$s %7$s %8$s %9$s"
 )
 STORED AS TEXTFILE
@@ -33,7 +29,7 @@ CREATE TABLE serde_regex(
   agent STRING)
 ROW FORMAT SERDE 'org.apache.hadoop.hive.contrib.serde2.RegexSerDe'
 WITH SERDEPROPERTIES (
-  "input.regex" = "([^ ]*) ([^ ]*) ([^ ]*) (-|\\[[^\\]]*\\]) ([^ \"]*|\"[^\"]*\") (-|[0-9]*)
(-|[0-9]*)(?: ([^ \"]*|\"[^\"]*\") ([^ \"]*|\"[^\"]*\"))?",
+  "input.regex" = "([^ ]*) ([^ ]*) ([^ ]*) (-|\\[[^\\]]*\\]) ([^ \"]*|\"[^\"]*\") (-|[0-9]*)
(-|[0-9]*)(?: ([^ \"]*|\"[^\"]*\") ([^ \"]*|\"[^\"]*\"))?", 
   "output.format.string" = "%1$s %2$s %3$s %4$s %5$s %6$s %7$s %8$s %9$s"
 )
 STORED AS TEXTFILE
@@ -118,11 +114,3 @@ POSTHOOK: Input: default@serde_regex
 #### A masked pattern was here ####
 127.0.0.1	-	frank	[10/Oct/2000:13:55:36 -0700]	"GET /apache_pb.gif HTTP/1.0"	200	2326	NULL
NULL
 127.0.0.1	-	-	[26/May/2009:00:00:00 +0000]	"GET /someurl/?track=Blabla(Main) HTTP/1.1"	200
5864	-	"Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/525.19 (KHTML, like Gecko)
Chrome/1.0.154.65 Safari/525.19"
-PREHOOK: query: DROP TABLE serde_regex
-PREHOOK: type: DROPTABLE
-PREHOOK: Input: default@serde_regex
-PREHOOK: Output: default@serde_regex
-POSTHOOK: query: DROP TABLE serde_regex
-POSTHOOK: type: DROPTABLE
-POSTHOOK: Input: default@serde_regex
-POSTHOOK: Output: default@serde_regex

Added: hive/trunk/ql/src/test/queries/clientnegative/serde_regex.q
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/test/queries/clientnegative/serde_regex.q?rev=1340256&view=auto
==============================================================================
--- hive/trunk/ql/src/test/queries/clientnegative/serde_regex.q (added)
+++ hive/trunk/ql/src/test/queries/clientnegative/serde_regex.q Fri May 18 21:19:50 2012
@@ -0,0 +1,16 @@
+USE default;
+--  This should fail because Regex SerDe supports only columns of type string
+CREATE TABLE serde_regex(
+  host STRING,
+  identity STRING,
+  user STRING,
+  time STRING,
+  request STRING,
+  status INT,
+  size INT,
+  referer STRING,
+  agent STRING)
+ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.RegexSerDe'
+WITH SERDEPROPERTIES (
+  "input.regex" = "([^ ]*) ([^ ]*) ([^ ]*) (-|\\[[^\\]]*\\]) ([^ \"]*|\"[^\"]*\") (-|[0-9]*)
(-|[0-9]*)(?: ([^ \"]*|\"[^\"]*\") ([^ \"]*|\"[^\"]*\"))?")
+STORED AS TEXTFILE;

Added: hive/trunk/ql/src/test/queries/clientnegative/serde_regex2.q
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/test/queries/clientnegative/serde_regex2.q?rev=1340256&view=auto
==============================================================================
--- hive/trunk/ql/src/test/queries/clientnegative/serde_regex2.q (added)
+++ hive/trunk/ql/src/test/queries/clientnegative/serde_regex2.q Fri May 18 21:19:50 2012
@@ -0,0 +1,23 @@
+USE default;
+-- Mismatch between the number of matching groups and columns, throw run time exception.
Ideally this should throw a compile time exception. See JIRA-3023 for more details.
+ CREATE TABLE serde_regex(
+  host STRING,
+  identity STRING,
+  user STRING,
+  time STRING,
+  request STRING,
+  status STRING,
+  size STRING,
+  referer STRING,
+  agent STRING)
+ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.RegexSerDe'
+WITH SERDEPROPERTIES (
+  "input.regex" = "([^ ]*) ([^ ]*) ([^ ]*) (-|\\[[^\\]]*\\]) ([^ \"]*|\"[^\"]*\") (-|[0-9]*)
(-|[0-9]*)"  
+)
+STORED AS TEXTFILE;
+
+LOAD DATA LOCAL INPATH "../data/files/apache.access.log" INTO TABLE serde_regex;
+LOAD DATA LOCAL INPATH "../data/files/apache.access.2.log" INTO TABLE serde_regex;
+
+-- raise an exception 
+SELECT * FROM serde_regex ORDER BY time;
\ No newline at end of file

Added: hive/trunk/ql/src/test/queries/clientnegative/serde_regex3.q
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/test/queries/clientnegative/serde_regex3.q?rev=1340256&view=auto
==============================================================================
--- hive/trunk/ql/src/test/queries/clientnegative/serde_regex3.q (added)
+++ hive/trunk/ql/src/test/queries/clientnegative/serde_regex3.q Fri May 18 21:19:50 2012
@@ -0,0 +1,14 @@
+USE default;
+-- null input.regex, raise an exception
+ CREATE TABLE serde_regex(
+  host STRING,
+  identity STRING,
+  user STRING,
+  time STRING,
+  request STRING,
+  status STRING,
+  size STRING,
+  referer STRING,
+  agent STRING)
+ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.RegexSerDe'
+STORED AS TEXTFILE;

Copied: hive/trunk/ql/src/test/queries/clientpositive/serde_regex.q (from r1340252, hive/trunk/contrib/src/test/queries/clientpositive/serde_regex.q)
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/test/queries/clientpositive/serde_regex.q?p2=hive/trunk/ql/src/test/queries/clientpositive/serde_regex.q&p1=hive/trunk/contrib/src/test/queries/clientpositive/serde_regex.q&r1=1340252&r2=1340256&rev=1340256&view=diff
==============================================================================
--- hive/trunk/contrib/src/test/queries/clientpositive/serde_regex.q (original)
+++ hive/trunk/ql/src/test/queries/clientpositive/serde_regex.q Fri May 18 21:19:50 2012
@@ -1,7 +1,3 @@
-add jar ${system:build.dir}/hive-contrib-${system:hive.version}.jar;
-
-DROP TABLE serde_regex;
-
 EXPLAIN
 CREATE TABLE serde_regex(
   host STRING,
@@ -13,10 +9,9 @@ CREATE TABLE serde_regex(
   size STRING,
   referer STRING,
   agent STRING)
-ROW FORMAT SERDE 'org.apache.hadoop.hive.contrib.serde2.RegexSerDe'
+ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.RegexSerDe'
 WITH SERDEPROPERTIES (
-  "input.regex" = "([^ ]*) ([^ ]*) ([^ ]*) (-|\\[[^\\]]*\\]) ([^ \"]*|\"[^\"]*\") (-|[0-9]*)
(-|[0-9]*)(?: ([^ \"]*|\"[^\"]*\") ([^ \"]*|\"[^\"]*\"))?",
-  "output.format.string" = "%1$s %2$s %3$s %4$s %5$s %6$s %7$s %8$s %9$s"
+  "input.regex" = "([^ ]*) ([^ ]*) ([^ ]*) (-|\\[[^\\]]*\\]) ([^ \"]*|\"[^\"]*\") (-|[0-9]*)
(-|[0-9]*)(?: ([^ \"]*|\"[^\"]*\") ([^ \"]*|\"[^\"]*\"))?"
 )
 STORED AS TEXTFILE;
 
@@ -30,10 +25,9 @@ CREATE TABLE serde_regex(
   size STRING,
   referer STRING,
   agent STRING)
-ROW FORMAT SERDE 'org.apache.hadoop.hive.contrib.serde2.RegexSerDe'
+ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.RegexSerDe'
 WITH SERDEPROPERTIES (
-  "input.regex" = "([^ ]*) ([^ ]*) ([^ ]*) (-|\\[[^\\]]*\\]) ([^ \"]*|\"[^\"]*\") (-|[0-9]*)
(-|[0-9]*)(?: ([^ \"]*|\"[^\"]*\") ([^ \"]*|\"[^\"]*\"))?",
-  "output.format.string" = "%1$s %2$s %3$s %4$s %5$s %6$s %7$s %8$s %9$s"
+  "input.regex" = "([^ ]*) ([^ ]*) ([^ ]*) (-|\\[[^\\]]*\\]) ([^ \"]*|\"[^\"]*\") (-|[0-9]*)
(-|[0-9]*)(?: ([^ \"]*|\"[^\"]*\") ([^ \"]*|\"[^\"]*\"))?"
 )
 STORED AS TEXTFILE;
 
@@ -42,4 +36,6 @@ LOAD DATA LOCAL INPATH "../data/files/ap
 
 SELECT * FROM serde_regex ORDER BY time;
 
+SELECT host, size, status, time from serde_regex ORDER BY time;
+
 DROP TABLE serde_regex;

Added: hive/trunk/ql/src/test/results/clientnegative/serde_regex.q.out
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/test/results/clientnegative/serde_regex.q.out?rev=1340256&view=auto
==============================================================================
--- hive/trunk/ql/src/test/results/clientnegative/serde_regex.q.out (added)
+++ hive/trunk/ql/src/test/results/clientnegative/serde_regex.q.out Fri May 18 21:19:50 2012
@@ -0,0 +1,22 @@
+PREHOOK: query: USE default
+PREHOOK: type: SWITCHDATABASE
+POSTHOOK: query: USE default
+POSTHOOK: type: SWITCHDATABASE
+PREHOOK: query: --  This should fail because Regex SerDe supports only columns of type string
+CREATE TABLE serde_regex(
+  host STRING,
+  identity STRING,
+  user STRING,
+  time STRING,
+  request STRING,
+  status INT,
+  size INT,
+  referer STRING,
+  agent STRING)
+ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.RegexSerDe'
+WITH SERDEPROPERTIES (
+  "input.regex" = "([^ ]*) ([^ ]*) ([^ ]*) (-|\\[[^\\]]*\\]) ([^ \"]*|\"[^\"]*\") (-|[0-9]*)
(-|[0-9]*)(?: ([^ \"]*|\"[^\"]*\") ([^ \"]*|\"[^\"]*\"))?")
+STORED AS TEXTFILE
+PREHOOK: type: CREATETABLE
+#### A masked pattern was here ####
+FAILED: Execution Error, return code 1 from org.apache.hadoop.hive.ql.exec.DDLTask

Added: hive/trunk/ql/src/test/results/clientnegative/serde_regex2.q.out
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/test/results/clientnegative/serde_regex2.q.out?rev=1340256&view=auto
==============================================================================
--- hive/trunk/ql/src/test/results/clientnegative/serde_regex2.q.out (added)
+++ hive/trunk/ql/src/test/results/clientnegative/serde_regex2.q.out Fri May 18 21:19:50 2012
@@ -0,0 +1,67 @@
+PREHOOK: query: USE default
+PREHOOK: type: SWITCHDATABASE
+POSTHOOK: query: USE default
+POSTHOOK: type: SWITCHDATABASE
+PREHOOK: query: -- Mismatch between the number of matching groups and columns, throw run
time exception. Ideally this should throw a compile time exception. See JIRA-3023 for more
details.
+ CREATE TABLE serde_regex(
+  host STRING,
+  identity STRING,
+  user STRING,
+  time STRING,
+  request STRING,
+  status STRING,
+  size STRING,
+  referer STRING,
+  agent STRING)
+ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.RegexSerDe'
+WITH SERDEPROPERTIES (
+  "input.regex" = "([^ ]*) ([^ ]*) ([^ ]*) (-|\\[[^\\]]*\\]) ([^ \"]*|\"[^\"]*\") (-|[0-9]*)
(-|[0-9]*)"  
+)
+STORED AS TEXTFILE
+PREHOOK: type: CREATETABLE
+POSTHOOK: query: -- Mismatch between the number of matching groups and columns, throw run
time exception. Ideally this should throw a compile time exception. See JIRA-3023 for more
details.
+ CREATE TABLE serde_regex(
+  host STRING,
+  identity STRING,
+  user STRING,
+  time STRING,
+  request STRING,
+  status STRING,
+  size STRING,
+  referer STRING,
+  agent STRING)
+ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.RegexSerDe'
+WITH SERDEPROPERTIES (
+  "input.regex" = "([^ ]*) ([^ ]*) ([^ ]*) (-|\\[[^\\]]*\\]) ([^ \"]*|\"[^\"]*\") (-|[0-9]*)
(-|[0-9]*)"  
+)
+STORED AS TEXTFILE
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: default@serde_regex
+PREHOOK: query: LOAD DATA LOCAL INPATH "../data/files/apache.access.log" INTO TABLE serde_regex
+PREHOOK: type: LOAD
+PREHOOK: Output: default@serde_regex
+POSTHOOK: query: LOAD DATA LOCAL INPATH "../data/files/apache.access.log" INTO TABLE serde_regex
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@serde_regex
+PREHOOK: query: LOAD DATA LOCAL INPATH "../data/files/apache.access.2.log" INTO TABLE serde_regex
+PREHOOK: type: LOAD
+PREHOOK: Output: default@serde_regex
+POSTHOOK: query: LOAD DATA LOCAL INPATH "../data/files/apache.access.2.log" INTO TABLE serde_regex
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@serde_regex
+PREHOOK: query: -- raise an exception 
+SELECT * FROM serde_regex ORDER BY time
+PREHOOK: type: QUERY
+PREHOOK: Input: default@serde_regex
+#### A masked pattern was here ####
+Execution failed with exit status: 2
+Obtaining error information
+
+Task failed!
+Task ID:
+  Stage-1
+
+Logs:
+
+#### A masked pattern was here ####
+FAILED: Execution Error, return code 2 from org.apache.hadoop.hive.ql.exec.MapRedTask

Added: hive/trunk/ql/src/test/results/clientnegative/serde_regex3.q.out
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/test/results/clientnegative/serde_regex3.q.out?rev=1340256&view=auto
==============================================================================
--- hive/trunk/ql/src/test/results/clientnegative/serde_regex3.q.out (added)
+++ hive/trunk/ql/src/test/results/clientnegative/serde_regex3.q.out Fri May 18 21:19:50 2012
@@ -0,0 +1,20 @@
+PREHOOK: query: USE default
+PREHOOK: type: SWITCHDATABASE
+POSTHOOK: query: USE default
+POSTHOOK: type: SWITCHDATABASE
+PREHOOK: query: -- null input.regex, raise an exception
+ CREATE TABLE serde_regex(
+  host STRING,
+  identity STRING,
+  user STRING,
+  time STRING,
+  request STRING,
+  status STRING,
+  size STRING,
+  referer STRING,
+  agent STRING)
+ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.RegexSerDe'
+STORED AS TEXTFILE
+PREHOOK: type: CREATETABLE
+#### A masked pattern was here ####
+FAILED: Execution Error, return code 1 from org.apache.hadoop.hive.ql.exec.DDLTask

Copied: hive/trunk/ql/src/test/results/clientpositive/serde_regex.q.out (from r1340252, hive/trunk/contrib/src/test/results/clientpositive/serde_regex.q.out)
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/test/results/clientpositive/serde_regex.q.out?p2=hive/trunk/ql/src/test/results/clientpositive/serde_regex.q.out&p1=hive/trunk/contrib/src/test/results/clientpositive/serde_regex.q.out&r1=1340252&r2=1340256&rev=1340256&view=diff
==============================================================================
--- hive/trunk/contrib/src/test/results/clientpositive/serde_regex.q.out (original)
+++ hive/trunk/ql/src/test/results/clientpositive/serde_regex.q.out Fri May 18 21:19:50 2012
@@ -1,7 +1,3 @@
-PREHOOK: query: DROP TABLE serde_regex
-PREHOOK: type: DROPTABLE
-POSTHOOK: query: DROP TABLE serde_regex
-POSTHOOK: type: DROPTABLE
 PREHOOK: query: EXPLAIN
 CREATE TABLE serde_regex(
   host STRING,
@@ -13,10 +9,9 @@ CREATE TABLE serde_regex(
   size STRING,
   referer STRING,
   agent STRING)
-ROW FORMAT SERDE 'org.apache.hadoop.hive.contrib.serde2.RegexSerDe'
+ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.RegexSerDe'
 WITH SERDEPROPERTIES (
-  "input.regex" = "([^ ]*) ([^ ]*) ([^ ]*) (-|\\[[^\\]]*\\]) ([^ \"]*|\"[^\"]*\") (-|[0-9]*)
(-|[0-9]*)(?: ([^ \"]*|\"[^\"]*\") ([^ \"]*|\"[^\"]*\"))?",
-  "output.format.string" = "%1$s %2$s %3$s %4$s %5$s %6$s %7$s %8$s %9$s"
+  "input.regex" = "([^ ]*) ([^ ]*) ([^ ]*) (-|\\[[^\\]]*\\]) ([^ \"]*|\"[^\"]*\") (-|[0-9]*)
(-|[0-9]*)(?: ([^ \"]*|\"[^\"]*\") ([^ \"]*|\"[^\"]*\"))?"
 )
 STORED AS TEXTFILE
 PREHOOK: type: CREATETABLE
@@ -31,15 +26,14 @@ CREATE TABLE serde_regex(
   size STRING,
   referer STRING,
   agent STRING)
-ROW FORMAT SERDE 'org.apache.hadoop.hive.contrib.serde2.RegexSerDe'
+ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.RegexSerDe'
 WITH SERDEPROPERTIES (
-  "input.regex" = "([^ ]*) ([^ ]*) ([^ ]*) (-|\\[[^\\]]*\\]) ([^ \"]*|\"[^\"]*\") (-|[0-9]*)
(-|[0-9]*)(?: ([^ \"]*|\"[^\"]*\") ([^ \"]*|\"[^\"]*\"))?",
-  "output.format.string" = "%1$s %2$s %3$s %4$s %5$s %6$s %7$s %8$s %9$s"
+  "input.regex" = "([^ ]*) ([^ ]*) ([^ ]*) (-|\\[[^\\]]*\\]) ([^ \"]*|\"[^\"]*\") (-|[0-9]*)
(-|[0-9]*)(?: ([^ \"]*|\"[^\"]*\") ([^ \"]*|\"[^\"]*\"))?"
 )
 STORED AS TEXTFILE
 POSTHOOK: type: CREATETABLE
 ABSTRACT SYNTAX TREE:
-  (TOK_CREATETABLE (TOK_TABNAME serde_regex) TOK_LIKETABLE (TOK_TABCOLLIST (TOK_TABCOL host
TOK_STRING) (TOK_TABCOL identity TOK_STRING) (TOK_TABCOL user TOK_STRING) (TOK_TABCOL time
TOK_STRING) (TOK_TABCOL request TOK_STRING) (TOK_TABCOL status TOK_STRING) (TOK_TABCOL size
TOK_STRING) (TOK_TABCOL referer TOK_STRING) (TOK_TABCOL agent TOK_STRING)) (TOK_TABLESERIALIZER
(TOK_SERDENAME 'org.apache.hadoop.hive.contrib.serde2.RegexSerDe' (TOK_TABLEPROPERTIES (TOK_TABLEPROPLIST
(TOK_TABLEPROPERTY "input.regex" "([^ ]*) ([^ ]*) ([^ ]*) (-|\\[[^\\]]*\\]) ([^ \"]*|\"[^\"]*\")
(-|[0-9]*) (-|[0-9]*)(?: ([^ \"]*|\"[^\"]*\") ([^ \"]*|\"[^\"]*\"))?") (TOK_TABLEPROPERTY
"output.format.string" "%1$s %2$s %3$s %4$s %5$s %6$s %7$s %8$s %9$s"))))) TOK_TBLTEXTFILE)
+  (TOK_CREATETABLE (TOK_TABNAME serde_regex) TOK_LIKETABLE (TOK_TABCOLLIST (TOK_TABCOL host
TOK_STRING) (TOK_TABCOL identity TOK_STRING) (TOK_TABCOL user TOK_STRING) (TOK_TABCOL time
TOK_STRING) (TOK_TABCOL request TOK_STRING) (TOK_TABCOL status TOK_STRING) (TOK_TABCOL size
TOK_STRING) (TOK_TABCOL referer TOK_STRING) (TOK_TABCOL agent TOK_STRING)) (TOK_TABLESERIALIZER
(TOK_SERDENAME 'org.apache.hadoop.hive.serde2.RegexSerDe' (TOK_TABLEPROPERTIES (TOK_TABLEPROPLIST
(TOK_TABLEPROPERTY "input.regex" "([^ ]*) ([^ ]*) ([^ ]*) (-|\\[[^\\]]*\\]) ([^ \"]*|\"[^\"]*\")
(-|[0-9]*) (-|[0-9]*)(?: ([^ \"]*|\"[^\"]*\") ([^ \"]*|\"[^\"]*\"))?"))))) TOK_TBLTEXTFILE)
 
 STAGE DEPENDENCIES:
   Stage-0 is a root stage
@@ -53,10 +47,9 @@ STAGE PLANS:
           input format: org.apache.hadoop.mapred.TextInputFormat
           # buckets: -1
           output format: org.apache.hadoop.hive.ql.io.IgnoreKeyTextOutputFormat
-          serde name: org.apache.hadoop.hive.contrib.serde2.RegexSerDe
+          serde name: org.apache.hadoop.hive.serde2.RegexSerDe
           serde properties:
             input.regex ([^ ]*) ([^ ]*) ([^ ]*) (-|\[[^\]]*\]) ([^ "]*|"[^"]*") (-|[0-9]*)
(-|[0-9]*)(?: ([^ "]*|"[^"]*") ([^ "]*|"[^"]*"))?
-            output.format.string %1$s %2$s %3$s %4$s %5$s %6$s %7$s %8$s %9$s
           name: serde_regex
           isExternal: false
 
@@ -71,10 +64,9 @@ PREHOOK: query: CREATE TABLE serde_regex
   size STRING,
   referer STRING,
   agent STRING)
-ROW FORMAT SERDE 'org.apache.hadoop.hive.contrib.serde2.RegexSerDe'
+ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.RegexSerDe'
 WITH SERDEPROPERTIES (
-  "input.regex" = "([^ ]*) ([^ ]*) ([^ ]*) (-|\\[[^\\]]*\\]) ([^ \"]*|\"[^\"]*\") (-|[0-9]*)
(-|[0-9]*)(?: ([^ \"]*|\"[^\"]*\") ([^ \"]*|\"[^\"]*\"))?",
-  "output.format.string" = "%1$s %2$s %3$s %4$s %5$s %6$s %7$s %8$s %9$s"
+  "input.regex" = "([^ ]*) ([^ ]*) ([^ ]*) (-|\\[[^\\]]*\\]) ([^ \"]*|\"[^\"]*\") (-|[0-9]*)
(-|[0-9]*)(?: ([^ \"]*|\"[^\"]*\") ([^ \"]*|\"[^\"]*\"))?"
 )
 STORED AS TEXTFILE
 PREHOOK: type: CREATETABLE
@@ -88,10 +80,9 @@ POSTHOOK: query: CREATE TABLE serde_rege
   size STRING,
   referer STRING,
   agent STRING)
-ROW FORMAT SERDE 'org.apache.hadoop.hive.contrib.serde2.RegexSerDe'
+ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.RegexSerDe'
 WITH SERDEPROPERTIES (
-  "input.regex" = "([^ ]*) ([^ ]*) ([^ ]*) (-|\\[[^\\]]*\\]) ([^ \"]*|\"[^\"]*\") (-|[0-9]*)
(-|[0-9]*)(?: ([^ \"]*|\"[^\"]*\") ([^ \"]*|\"[^\"]*\"))?",
-  "output.format.string" = "%1$s %2$s %3$s %4$s %5$s %6$s %7$s %8$s %9$s"
+  "input.regex" = "([^ ]*) ([^ ]*) ([^ ]*) (-|\\[[^\\]]*\\]) ([^ \"]*|\"[^\"]*\") (-|[0-9]*)
(-|[0-9]*)(?: ([^ \"]*|\"[^\"]*\") ([^ \"]*|\"[^\"]*\"))?"
 )
 STORED AS TEXTFILE
 POSTHOOK: type: CREATETABLE
@@ -118,6 +109,16 @@ POSTHOOK: Input: default@serde_regex
 #### A masked pattern was here ####
 127.0.0.1	-	frank	[10/Oct/2000:13:55:36 -0700]	"GET /apache_pb.gif HTTP/1.0"	200	2326	NULL
NULL
 127.0.0.1	-	-	[26/May/2009:00:00:00 +0000]	"GET /someurl/?track=Blabla(Main) HTTP/1.1"	200
5864	-	"Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/525.19 (KHTML, like Gecko)
Chrome/1.0.154.65 Safari/525.19"
+PREHOOK: query: SELECT host, size, status, time from serde_regex ORDER BY time
+PREHOOK: type: QUERY
+PREHOOK: Input: default@serde_regex
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT host, size, status, time from serde_regex ORDER BY time
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@serde_regex
+#### A masked pattern was here ####
+127.0.0.1	2326	200	[10/Oct/2000:13:55:36 -0700]
+127.0.0.1	5864	200	[26/May/2009:00:00:00 +0000]
 PREHOOK: query: DROP TABLE serde_regex
 PREHOOK: type: DROPTABLE
 PREHOOK: Input: default@serde_regex

Added: hive/trunk/serde/src/java/org/apache/hadoop/hive/serde2/RegexSerDe.java
URL: http://svn.apache.org/viewvc/hive/trunk/serde/src/java/org/apache/hadoop/hive/serde2/RegexSerDe.java?rev=1340256&view=auto
==============================================================================
--- hive/trunk/serde/src/java/org/apache/hadoop/hive/serde2/RegexSerDe.java (added)
+++ hive/trunk/serde/src/java/org/apache/hadoop/hive/serde2/RegexSerDe.java Fri May 18 21:19:50 2012
@@ -0,0 +1,211 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hive.serde2;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+import java.util.Properties;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.hive.serde.Constants;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
+import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
+import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
+import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory;
+import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.Writable;
+
+/**
+ * RegexSerDe uses regular expression (regex) to deserialize data. It doesn't
+ * support data serialization.
+ *
+ * It can deserialize the data using regex and extracts groups as columns.
+ *
+ * In deserialization stage, if a row does not match the regex, deserialize()
+ * returns null for that row. The number of capturing groups in the regex must
+ * be equal to the number of columns of the table; if it is not, deserialize()
+ * throws a SerDeException. A group that did not take part in the match (for
+ * example a group inside an optional part of the regex) produces a NULL
+ * column.
+ *
+ * NOTE: Obviously, all columns have to be strings. Users can use
+ * "CAST(a AS INT)" to convert columns to other types.
+ *
+ * NOTE: This implementation is using String, and javaStringObjectInspector. A
+ * more efficient implementation should use UTF-8 encoded Text and
+ * writableStringObjectInspector. We should switch to that when we have a UTF-8
+ * based Regex library.
+ */
+public class RegexSerDe implements SerDe {
+
+  public static final Log LOG = LogFactory.getLog(RegexSerDe.class.getName());
+
+  // Number of table columns, taken from the table properties in initialize().
+  int numColumns;
+  // Raw value of the "input.regex" serde property.
+  String inputRegex;
+
+  // Compiled form of inputRegex.
+  Pattern inputPattern;
+
+  // Struct inspector describing one row: numColumns java String fields.
+  StructObjectInspector rowOI;
+  // Row object that is reused (and overwritten) on every deserialize() call.
+  ArrayList<String> row;
+  // Allocated in initialize(); not read anywhere else in this class.
+  Object[] outputFields;
+  Text outputRowText;
+
+  // Each kind of warning is logged only once per SerDe instance.
+  boolean alreadyLoggedNoMatch = false;
+  boolean alreadyLoggedPartialMatch = false;
+
+  // Number of rows not matching the regex
+  long unmatchedRowsCount = 0;
+  // Number of rows that match the regex but have missing groups.
+  long partialMatchedRowsCount = 0;
+
+  /**
+   * Reads the serde configuration from the table properties, compiles the
+   * input regex and builds the row ObjectInspector.
+   *
+   * @param conf Hadoop configuration (not used by this SerDe)
+   * @param tbl table properties; must contain "input.regex" as well as the
+   *          column name and column type lists
+   * @throws SerDeException if "input.regex" is missing, if the column
+   *           metadata is missing or inconsistent, or if any column is not
+   *           of type string
+   */
+  @Override
+  public void initialize(Configuration conf, Properties tbl)
+      throws SerDeException {
+
+    // We can get the table definition from tbl.
+
+    // Read the configuration parameters
+    inputRegex = tbl.getProperty("input.regex");
+    String columnNameProperty = tbl.getProperty(Constants.LIST_COLUMNS);
+    String columnTypeProperty = tbl.getProperty(Constants.LIST_COLUMN_TYPES);
+    boolean inputRegexIgnoreCase = "true".equalsIgnoreCase(tbl
+        .getProperty("input.regex.case.insensitive"));
+
+    // output format string is not supported anymore, warn user of deprecation
+    if (null != tbl.getProperty("output.format.string")) {
+      LOG.warn("output.format.string has been deprecated");
+    }
+
+    // Parse the configuration parameters
+    if (inputRegex == null) {
+      inputPattern = null;
+      throw new SerDeException(
+          "This table does not have serde property \"input.regex\"!");
+    }
+    // Pattern flags are bit masks, so they are combined with bitwise OR.
+    inputPattern = Pattern.compile(inputRegex, Pattern.DOTALL
+        | (inputRegexIgnoreCase ? Pattern.CASE_INSENSITIVE : 0));
+
+    // Fail with a SerDeException instead of a NullPointerException when the
+    // column metadata is absent.
+    if (columnNameProperty == null || columnTypeProperty == null) {
+      throw new SerDeException(getClass().getName()
+          + " requires the table properties " + Constants.LIST_COLUMNS
+          + " and " + Constants.LIST_COLUMN_TYPES);
+    }
+
+    List<String> columnNames = Arrays.asList(columnNameProperty.split(","));
+    List<TypeInfo> columnTypes = TypeInfoUtils
+        .getTypeInfosFromTypeString(columnTypeProperty);
+    // Checked explicitly because a Java assert is skipped unless the JVM
+    // runs with assertions enabled.
+    if (columnNames.size() != columnTypes.size()) {
+      throw new SerDeException(getClass().getName() + " found "
+          + columnNames.size() + " column names but " + columnTypes.size()
+          + " column types");
+    }
+    numColumns = columnNames.size();
+
+    // All columns have to be of type STRING.
+    for (int c = 0; c < numColumns; c++) {
+      if (!columnTypes.get(c).equals(TypeInfoFactory.stringTypeInfo)) {
+        throw new SerDeException(getClass().getName()
+            + " only accepts string columns, but column[" + c + "] named "
+            + columnNames.get(c) + " has type " + columnTypes.get(c));
+      }
+    }
+
+    // Constructing the row ObjectInspector:
+    // The row consists of some string columns, each column will be a java
+    // String object.
+    List<ObjectInspector> columnOIs = new ArrayList<ObjectInspector>(
+        columnNames.size());
+    for (int c = 0; c < numColumns; c++) {
+      columnOIs.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
+    }
+    // StandardStruct uses ArrayList to store the row.
+    rowOI = ObjectInspectorFactory.getStandardStructObjectInspector(
+        columnNames, columnOIs);
+
+    // Constructing the row object, etc, which will be reused for all rows.
+    row = new ArrayList<String>(numColumns);
+    for (int c = 0; c < numColumns; c++) {
+      row.add(null);
+    }
+    outputFields = new Object[numColumns];
+    outputRowText = new Text();
+  }
+
+  /**
+   * @return the struct ObjectInspector built in initialize()
+   */
+  @Override
+  public ObjectInspector getObjectInspector() throws SerDeException {
+    return rowOI;
+  }
+
+  /**
+   * @return Text.class
+   */
+  @Override
+  public Class<? extends Writable> getSerializedClass() {
+    return Text.class;
+  }
+
+  /**
+   * Matches one line against the input regex and returns its capturing groups
+   * as the column values of the row.
+   *
+   * @param blob the line to deserialize; it is cast to Text
+   * @return the shared row list, overwritten with the values of this line, or
+   *         null if the line does not match the regex
+   * @throws SerDeException if the number of capturing groups of the regex
+   *           differs from the number of columns
+   */
+  @Override
+  public Object deserialize(Writable blob) throws SerDeException {
+
+    Text rowText = (Text) blob;
+
+    Matcher m = inputPattern.matcher(rowText.toString());
+
+    // The mismatch is reported here, while rows are read, and not in
+    // initialize(); table creation therefore still succeeds.
+    if (m.groupCount() != numColumns) {
+      throw new SerDeException("Number of matching groups doesn't match the number of columns");
+    }
+
+    // If the line does not match, skip it: null is returned instead of a row.
+    if (!m.matches()) {
+      unmatchedRowsCount++;
+      if (!alreadyLoggedNoMatch) {
+        // Report the row if its the first time
+        LOG.warn("" + unmatchedRowsCount + " unmatched rows are found: " + rowText);
+        alreadyLoggedNoMatch = true;
+      }
+      return null;
+    }
+
+    // Otherwise, return the row. Group 0 is the whole match, so column c is
+    // filled from group c + 1.
+    for (int c = 0; c < numColumns; c++) {
+      try {
+        row.set(c, m.group(c + 1));
+      } catch (RuntimeException e) {
+        partialMatchedRowsCount++;
+        if (!alreadyLoggedPartialMatch) {
+          // Report the row if its the first time
+          LOG.warn("" + partialMatchedRowsCount
+              + " partially unmatched rows are found, " + " cannot find group "
+              + c + ": " + rowText);
+          alreadyLoggedPartialMatch = true;
+        }
+        row.set(c, null);
+      }
+    }
+    return row;
+  }
+
+  /**
+   * Serialization is not supported by this SerDe.
+   *
+   * @throws UnsupportedOperationException always
+   */
+  @Override
+  public Writable serialize(Object obj, ObjectInspector objInspector)
+      throws SerDeException {
+    throw new UnsupportedOperationException(
+        "Regex SerDe doesn't support the serialize() method");
+  }
+
+  /**
+   * @return always null, statistics are not supported
+   */
+  public SerDeStats getSerDeStats() {
+    // no support for statistics
+    return null;
+  }
+
+}

Propchange: hive/trunk/serde/src/java/org/apache/hadoop/hive/serde2/RegexSerDe.java
------------------------------------------------------------------------------
    svn:eol-style = native



Mime
View raw message