Subject: Re: UDAF on AWS Hive
From: Matthew Bryan
To: hive-user@hadoop.apache.org
Date: Tue, 6 Apr 2010 13:34:50 -0400

Thanks Zheng, and thanks for your great support to this list. I took your idea and wrote the following code, which worked for me. I'm no Java whiz, so it's probably fairly inefficient. I do get to talk to the Amazon folks from time to time, so I'll definitely mention my interest in upgrading the Hive version. Thanks again.
Matt

package com.company.hadoop.hive.udaf;

import org.apache.hadoop.hive.ql.exec.UDAF;
import org.apache.hadoop.hive.ql.exec.UDAFEvaluator;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.IntWritable;
import java.util.Arrays;

public class UDAFGroupConcat extends UDAF {

    public static class GroupConcatStringEvaluator implements UDAFEvaluator {
        private Text mOutput;
        private boolean mEmpty;

        public GroupConcatStringEvaluator() {
            super();
            init();
        }

        public void init() {
            mOutput = null;
            mEmpty = true;
        }

        public boolean iterate(Text o, IntWritable N) {
            if (o != null) {
                if (mEmpty) {
                    mOutput = new Text(N + " " + o.toString());
                    mEmpty = false;
                } else {
                    String temp = mOutput.toString() + "\t" + N + " " + o.toString();
                    String[] split = temp.split("\t");
                    Arrays.sort(split);
                    String sorted = split[0];
                    for (int i = 1; i < split.length; i++) {
                        sorted = sorted + "\t" + split[i];
                    }
                    mOutput.set(sorted);
                }
            }
            return true;
        }

        public Text terminatePartial() { return mEmpty ? null : mOutput; }

        public boolean merge(Text o) {
            if (o != null) {
                if (mEmpty) {
                    mOutput = new Text(o.toString());
                    mEmpty = false;
                } else {
                    String temp = mOutput.toString() + "\t" + o.toString();
                    String[] split = temp.split("\t");
                    Arrays.sort(split);
                    String sorted = split[0];
                    for (int i = 1; i < split.length; i++) {
                        sorted = sorted + "\t" + split[i];
                    }
                    mOutput.set(sorted);
                }
            }
            return true;
        }

        public Text terminate() { return mEmpty ? null : mOutput; }
    }
}

On Fri, Apr 2, 2010 at 4:11 PM, Matthew Bryan wrote:
> I'm writing a basic group_concat UDAF for the Amazon version of
> Hive....and it's working fine for unordered groupings. But I can't
> seem to get an ordered version working (filling an array based on an
> IntWritable passed alongside).
> When I move from using a Text return type
> on terminatePartial() to either Text[] or a State class, I start
> getting errors:
>
> FAILED: Error in semantic analysis:
> org.apache.hadoop.hive.ql.metadata.HiveException: Cannot recognize
> return type class [Lorg.apache.hadoop.io.Text; from public
> org.apache.hadoop.io.Text[]
> com.company.hadoop.hive.udaf.UDAFGroupConcatN$GroupConcatNStringEvaluator.terminatePartial()
>
> or
>
> FAILED: Error in semantic analysis:
> org.apache.hadoop.hive.ql.metadata.HiveException: Cannot recognize
> return type class
> com.company.hadoop.hive.udaf.UDAFGroupConcatN$UDAFGroupConcatNState from public
> com.company.hadoop.hive.udaf.UDAFGroupConcatN$UDAFGroupConcatNState
> com.company.hadoop.hive.udaf.UDAFGroupConcatN$GroupConcatNStringEvaluator.terminatePartial()
>
> What limits are there on the return type of
> terminatePartial()? Shouldn't it just have to match the argument of
> merge() and nothing more? Keep in mind this is the Amazon version of
> Hive (0.4, I think).
>
> I put both versions of the UDAF below, ordered and unordered.
>
> Thanks for your time.
>
> Matt
>
>
> ######### Working Unordered ############
> /*QUERY: select user, event, group_concat(details) from datatable
> group by user,event;*/
>
> package com.company.hadoop.hive.udaf;
>
> import org.apache.hadoop.hive.ql.exec.UDAF;
> import org.apache.hadoop.hive.ql.exec.UDAFEvaluator;
> import org.apache.hadoop.io.Text;
>
> public class UDAFGroupConcat extends UDAF {
>
>     public static class GroupConcatStringEvaluator implements UDAFEvaluator {
>         private Text mOutput;
>         private boolean mEmpty;
>
>         public GroupConcatStringEvaluator() {
>             super();
>             init();
>         }
>
>         public void init() {
>             mOutput = null;
>             mEmpty = true;
>         }
>
>         public boolean iterate(Text o) {
>             if (o != null) {
>                 if (mEmpty) {
>                     mOutput = new Text(o);
>                     mEmpty = false;
>                 } else {
>                     mOutput.set(mOutput.toString() + " " + o.toString());
>                 }
>             }
>             return true;
>         }
>
>         public Text terminatePartial() { return mEmpty ? null : mOutput; }
>         public boolean merge(Text o) { return iterate(o); }
>         public Text terminate() { return mEmpty ? null : mOutput; }
>     }
> }
>
> ############ Not Working Ordered #############
> /*QUERY: select user, event, group_concatN(details, detail_id) from
> datatable group by user,event;*/
>
> package com.company.hadoop.hive.udaf;
>
> import org.apache.hadoop.hive.ql.exec.UDAF;
> import org.apache.hadoop.hive.ql.exec.UDAFEvaluator;
> import org.apache.hadoop.io.Text;
> import org.apache.hadoop.io.IntWritable;
>
> public class UDAFGroupConcatN extends UDAF {
>
>     public static class GroupConcatNStringEvaluator implements UDAFEvaluator {
>
>         private Text[] mArray;
>         private boolean mEmpty;
>
>         public GroupConcatNStringEvaluator() {
>             super();
>             init();
>         }
>
>         public void init() {
>             mArray = new Text[5];
>             mEmpty = true;
>         }
>
>         public boolean iterate(Text o, IntWritable N) {
>             if (o != null && N != null) {
>                 mArray[N.get()].set(o.toString());
>                 mEmpty = false;
>             }
>             return true;
>         }
>
>         public Text[] terminatePartial() { return mEmpty ? null : mArray; }
>
>         public boolean merge(Text[] o) {
>             if (o != null) {
>                 for (int i = 0; i <= 5; i++) {
>                     if (mArray[i].getLength() == 0) {
>                         mArray[i].set(o[i].toString());
>                     }
>                 }
>             }
>             return true;
>         }
>
>         public Text[] terminate() { return mEmpty ? null : mArray; }
>     }
> }
>
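[Editor's note] The working version at the top of this thread keeps entries ordered by prefixing each value with its index N, so Arrays.sort over the tab-split entries restores group order. A minimal standalone sketch of that trick (class name and sample data are invented for illustration; note the sort is lexicographic, so numeric prefixes only compare correctly while they have the same number of digits):

```java
import java.util.Arrays;

public class SortedConcatDemo {
    // Join tab-separated "N value" entries from two partial results and
    // restore index order by sorting the split entries lexicographically,
    // mirroring the merge() logic in the UDAF above.
    static String merge(String a, String b) {
        String[] split = (a + "\t" + b).split("\t");
        Arrays.sort(split);
        StringBuilder sorted = new StringBuilder(split[0]);
        for (int i = 1; i < split.length; i++) {
            sorted.append('\t').append(split[i]);
        }
        return sorted.toString();
    }

    public static void main(String[] args) {
        // Two partial results arriving out of order, as can happen
        // when Hive combines partial aggregates across mappers.
        String merged = merge("2 gamma\t0 alpha", "1 beta");
        System.out.println(merged.replace('\t', '|')); // 0 alpha|1 beta|2 gamma
    }
}
```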