Return-Path: X-Original-To: apmail-hive-user-archive@www.apache.org Delivered-To: apmail-hive-user-archive@www.apache.org Received: from mail.apache.org (hermes.apache.org [140.211.11.3]) by minotaur.apache.org (Postfix) with SMTP id 783179FA4 for ; Wed, 20 Jun 2012 13:58:28 +0000 (UTC) Received: (qmail 7665 invoked by uid 500); 20 Jun 2012 13:58:27 -0000 Delivered-To: apmail-hive-user-archive@hive.apache.org Received: (qmail 7616 invoked by uid 500); 20 Jun 2012 13:58:27 -0000 Mailing-List: contact user-help@hive.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: user@hive.apache.org Delivered-To: mailing list user@hive.apache.org Received: (qmail 7606 invoked by uid 99); 20 Jun 2012 13:58:27 -0000 Received: from nike.apache.org (HELO nike.apache.org) (192.87.106.230) by apache.org (qpsmtpd/0.29) with ESMTP; Wed, 20 Jun 2012 13:58:27 +0000 X-ASF-Spam-Status: No, hits=-2.8 required=5.0 tests=FSL_RCVD_USER,HTML_MESSAGE,RCVD_IN_DNSWL_HI,SPF_PASS X-Spam-Check-By: apache.org Received-SPF: pass (nike.apache.org: domain of Matt.Tucker@disney.com designates 192.195.66.26 as permitted sender) Received: from [192.195.66.26] (HELO msg11.disney.com) (192.195.66.26) by apache.org (qpsmtpd/0.29) with ESMTP; Wed, 20 Jun 2012 13:58:16 +0000 Received: from int11.disney.pvt (int11.disney.pvt [153.6.62.222]) by msg11.disney.com (Switch-3.4.4/Switch-3.4.4) with ESMTP id q5KDvs2Y016148 for ; Wed, 20 Jun 2012 13:57:54 GMT Received: from sm-flor-xht01.wdw.disney.com (sm-flor-xht01.wdw.disney.com [153.6.66.104]) by int11.disney.pvt (Switch-3.4.4/Switch-3.4.4) with ESMTP id q5KDvo7t006985 (version=TLSv1/SSLv3 cipher=AES128-SHA bits=128 verify=FAIL) for ; Wed, 20 Jun 2012 13:57:54 GMT Received: from sm-flor-vxmb06b.wdw.disney.com ([fe80::1c05:28bc:5b5a:5883]) by sm-flor-xht01.wdw.disney.com ([2002:9906:4268::9906:4268]) with mapi; Wed, 20 Jun 2012 09:57:52 -0400 From: "Tucker, Matt" To: "user@hive.apache.org" Date: Wed, 20 Jun 2012 09:57:32 -0400 Subject: Reading XML Files Thread-Topic: Reading XML Files Thread-Index: Ac1O7KNAWR528k4TSryk6IXoZ9EjVQ== Message-ID: <4CED3A5A427FF34BAEE89C98D64D336D1FFC16380D@SM-FLOR-VXMB06B.wdw.disney.com> Accept-Language: en-US Content-Language: en-US X-MS-Has-Attach: X-MS-TNEF-Correlator: acceptlanguage: en-US Content-Type: multipart/alternative; boundary="_000_4CED3A5A427FF34BAEE89C98D64D336D1FFC16380DSMFLORVXMB06B_" MIME-Version: 1.0 X-Source-IP: sm-flor-xht01.wdw.disney.com [153.6.66.104] --_000_4CED3A5A427FF34BAEE89C98D64D336D1FFC16380DSMFLORVXMB06B_ Content-Type: text/plain; charset="us-ascii" Content-Transfer-Encoding: quoted-printable Has anyone had success reading XML files in Hive? I've been looking at the= cloud9 XMLInputFormat to read in the top-level XML node in each file, with= the goal to then use XPath (and/or LATERAL VIEW) to read individual record= s in the file. There isn't much in the way of documentation or examples th= at I can find. The structure of my XML is: Here are the Hive DDL statements that I've tried so far: add jar /data1/mtucker/cloud9-1.3.2.jar; add jar /usr/lib/mahout/mahout-examples-0.5-cdh3u3.jar; -- Mahout XMLInputFormat DROP TABLE IF EXISTS xmltable; CREATE TABLE xmltable ( xmldata STRING ) STORED AS INPUTFORMAT 'org.apache.mahout.classifier.bayes.XmlInputFormat' OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.IgnoreKeyTextOutputFormat' TBLPROPERTIES ( "xmlinput.start"=3D"" ); LOAD DATA LOCAL INPATH '/data1/mtucker/datafile.xml' INTO TABLE xmltable; select COUNT(*) FROM xmltable WHERE xmldata !=3D "" limit 1; --FAILED: Error in semantic analysis: Line 1:21 Input format must implement= InputFormat xmltable --Cloud9 XMLInputFormat DROP TABLE IF EXISTS xmltable; CREATE TABLE xmltable ( xmldata STRING ) STORED AS INPUTFORMAT 'edu.umd.cloud9.collection.XMLInputFormat' OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputForma= t' TBLPROPERTIES ( "xmlinput.start"=3D"" ); LOAD DATA LOCAL INPATH '/data1/mtucker/datafile.xml' INTO TABLE xmltable; select COUNT(*) FROM xmltable WHERE xmldata !=3D "" limit 1; --FAILED: Error in semantic analysis: Line 1:21 Input format must implement= InputFormat xmltable --RegEx SerDe DROP TABLE IF EXISTS xmltable; CREATE TABLE xmltable ( xmldata STRING ) ROW FORMAT SERDE 'org.apache.hadoop.hive.contrib.serde2.RegexSerDe' WITH SERDEPROPERTIES ( "input.regex" =3D "(?)(<[\\w\\W]*>?)()*", "output.format.string" =3D "%2$s" ); LOAD DATA LOCAL INPATH '/data1/mtucker/datafile.xml' INTO TABLE xmltable; select COUNT(*) FROM xmltable WHERE xmldata !=3D "" limit 1; --FAILED: Execution Error, return code 2 from org.apache.hadoop.hive.ql.exe= c.MapRedTask Thanks Matt Tucker --_000_4CED3A5A427FF34BAEE89C98D64D336D1FFC16380DSMFLORVXMB06B_ Content-Type: text/html; charset="us-ascii" Content-Transfer-Encoding: quoted-printable

Has anyone had s= uccess reading XML files in Hive?  I’ve been looking at the clou= d9 XMLInputFormat to read in the top-level XML node in each file, with the = goal to then use XPath (and/or LATERAL VIEW) to read individual records in = the file.  There isn’t much in the way of documentation or examp= les that I can find.

 

The structure of my XML is:

 

<= ?xml version=3D"1.0" encoding=3D"UTF-8"?>=

&= lt;Report user=3D"sample_user">

   &n= bsp; <Session id=3D"ID617693930">

  &nbs= p;  </Session>

     <Session id= =3D"ID617695571">

     </Sessi= on>

     <Variables>

 &= nbsp;         <var>

=            </var>

           <va= r>

          = </var>

     </Variables>

</Report>

 

Here are the Hive DDL statements that I’ve = tried so far:

 

add jar /data1/mt= ucker/cloud9-1.3.2.jar;

add jar /usr/lib/mahout/mahout-examples-0.= 5-cdh3u3.jar;

 

-- Mahout XMLInputFormat<= /span>

DR= OP TABLE IF EXISTS xmltable;

CREATE TABLE xmltable (

&nbs= p;   xmldata STRING

)

STORED AS

 =    INPUTFORMAT 'org.apache.mahout.classifier.bayes.XmlInputFormat= '

    OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.Ig= noreKeyTextOutputFormat'

TBLPROPERTIES (

  &nbs= p; "xmlinput.start"=3D"<Report",

 &nbs= p;  "xmlinput.end"=3D"</Report>"<= /span>

);=

LOAD DATA LOCAL INPATH '/data1/mtucker/datafile.xml' INTO TABLE x= mltable;

select COUNT(*) FROM xmltable WHERE xmldata !=3D "&q= uot; limit 1;

--FAILED: Error in semantic analysis: Line 1:21 Inpu= t format must implement InputFormat xmltable

 

 

--Cloud9 XMLInputFormat

DROP TABLE IF EXISTS xmltable;=

CREATE TABLE xmltable (

    xmldata STRIN= G

)

STORED AS

<= span style=3D'font-family:"Courier New"'>    INPUTFORMAT 'ed= u.umd.cloud9.collection.XMLInputFormat'

    OUTPUTF= ORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'

TBLPROPERTIES (

    "xmlinput.start"=3D&= quot;<Report",

    "xmlinput.end"= =3D"</Report>"

);

LOAD DATA LOCAL INPATH = '/data1/mtucker/datafile.xml' INTO TABLE xmltable;

select COUNT(*)= FROM xmltable WHERE xmldata !=3D "" limit 1;

--FAILED: = Error in semantic analysis: Line 1:21 Input format must implement InputForm= at xmltable

 

 

--RegEx SerDe

CREATE TABLE xmltable (

    xmldata STRING

)

ROW FORMAT SERDE 'or= g.apache.hadoop.hive.contrib.serde2.RegexSerDe'

WITH SERDEPROPERTI= ES (

    "input.regex" =3D "(<Rep= ort([\\w\\W]*)>?)(<[\\w\\W]*>?)(</Report>)*",

=     "output.format.string" =3D "%2$s"

);

LOAD DATA LOCAL INPATH '/data1/mtucker/datafile.xml' INT= O TABLE xmltable;

select COUNT(*) FROM xmltable WHERE xmldata !=3D= "" limit 1;

--FAILED: Execution Error, return code 2 fr= om org.apache.hadoop.hive.ql.exec.MapRedTask

 

 

Thanks

 =

Matt Tucker=

 

= --_000_4CED3A5A427FF34BAEE89C98D64D336D1FFC16380DSMFLORVXMB06B_--