hive-user mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From Peter Chu <pete....@outlook.com>
Subject RE: A GenericUDF Function to Extract a Field From an Array of Structs
Date Fri, 05 Apr 2013 18:43:57 GMT
Hi Navis류승우,
Thank you very much.  Your code works; now I can run the function against an external table.
 Thank you so much.
However, could you or someone else point me toward a way of testing this function?  I am completely stuck
on the testing part.
Thanks, Peter
The code for this function is below.
=======================================
import org.apache.hadoop.hive.ql.exec.Description;import org.apache.hadoop.hive.ql.exec.UDFArgumentException;import
org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException;import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException;import
org.apache.hadoop.hive.ql.metadata.HiveException;import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;import
org.apache.hadoop.hive.serde2.objectinspector.ListObjectInspector;import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;import
org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category;import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;import
org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector;import org.apache.hadoop.hive.serde2.objectinspector.StructField;import
org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import java.util.ArrayList;
@Description(name = "extract_product_category",        value = "_FUNC_( array< struct<productCategory:string>
> ) - Collect all productCategory field values inside an array of struct(s), and return
the results in an array<string>",        extended = "Example:\n SELECT _FUNC_(array_of_product_category_structs)")public
class GenericUDFExtractProductCategory        extends GenericUDF{    private ArrayList ret;
    private ListObjectInspector listOI;    private StructObjectInspector structOI;    private
ObjectInspector prodCatOI;
    @Override    public ObjectInspector initialize(ObjectInspector[] args)            throws
UDFArgumentException    {        if (args.length != 1) {            throw new UDFArgumentLengthException("The
function extract_product_category() requires exactly one argument.");        }
        if (args[0].getCategory() != Category.LIST) {            throw new UDFArgumentTypeException(0,
"Type array<struct> is expected to be the argument for extract_product_category but
" + args[0].getTypeName() + " is found instead");        }
        listOI = ((ListObjectInspector) args[0]);        structOI = ((StructObjectInspector)
listOI.getListElementObjectInspector());
        if (structOI.getAllStructFieldRefs().size() != 1) {            throw new UDFArgumentTypeException(0,
"Incorrect number of fields in the struct, should be one");        }
        StructField productCategoryField = structOI.getStructFieldRef("productCategory");
       if (productCategoryField == null) {            throw new UDFArgumentTypeException(0,
"NO \"productCategory\" field in input structure");        }
        prodCatOI = productCategoryField.getFieldObjectInspector();        if (prodCatOI.getCategory()
!= Category.PRIMITIVE) {            throw new UDFArgumentTypeException(0, "productCategory
field must be of string type");        }
        if (((PrimitiveObjectInspector)prodCatOI).getPrimitiveCategory() != PrimitiveObjectInspector.PrimitiveCategory.STRING)
{            throw new UDFArgumentTypeException(0, "productCategory field must be of string
type");        }
        ret = new ArrayList();
        return ObjectInspectorFactory.getStandardListObjectInspector(PrimitiveObjectInspectorFactory.writableStringObjectInspector);
   }
    @Override    public ArrayList evaluate(DeferredObject[] arguments)            throws HiveException
   {        ret.clear();
        if (arguments.length != 1) {            ArrayList<String> emptyList = new ArrayList<String>();
           return emptyList;        }
        if (arguments[0].get() == null) {            ArrayList<String> emptyList = new
ArrayList<String>();            return emptyList;        }
        int numElements = listOI.getListLength(arguments[0].get());
        for (int i = 0; i< numElements; i++){            Object element = listOI.getListElement(arguments[0].get(),
i);            Object prodCatValue = structOI.getStructFieldData(element, structOI.getStructFieldRef("productCategory"));
           ret.add(((PrimitiveObjectInspector)prodCatOI).getPrimitiveWritableObject(prodCatValue));
       }        return ret;    }
    @Override    public String getDisplayString(String[] strings)    {        assert (strings.length
> 0);        StringBuilder sb = new StringBuilder();        sb.append("extract_product_category(");
       sb.append(strings[0]);        sb.append(")");        return sb.toString();    }}
=====================================================
From: pc3375@hotmail.com
To: user@hive.apache.org
Subject: RE: A GenericUDF Function to Extract a Field From an Array of Structs
Date: Fri, 5 Apr 2013 11:39:55 -0700




Hi Navis류승우,
Thank you very much.  Your code works; now I can run the function against an external table.
 Thank you so much.
However, could you or someone else point me toward a way of testing this function?  I am completely stuck
on the testing part.
Peter
The code for this function is below.
=======================================
import org.apache.hadoop.hive.ql.exec.Description;import org.apache.hadoop.hive.ql.exec.UDFArgumentException;import
org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException;import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException;import
org.apache.hadoop.hive.ql.metadata.HiveException;import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;import
org.apache.hadoop.hive.serde2.objectinspector.ListObjectInspector;import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;import
org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category;import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;import
org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector;import org.apache.hadoop.hive.serde2.objectinspector.StructField;import
org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import java.util.ArrayList;
@Description(name = "extract_product_category",        value = "_FUNC_( array< struct<productCategory:string>
> ) - Collect all productCategory field values inside an array of struct(s), and return
the results in an array<string>",        extended = "Example:\n SELECT _FUNC_(array_of_product_category_structs)")public
class GenericUDFExtractProductCategory        extends GenericUDF{    private ArrayList ret;
    private ListObjectInspector listOI;    private StructObjectInspector structOI;    private
ObjectInspector prodCatOI;
    @Override    public ObjectInspector initialize(ObjectInspector[] args)            throws
UDFArgumentException    {        if (args.length != 1) {            throw new UDFArgumentLengthException("The
function extract_product_category() requires exactly one argument.");        }
        if (args[0].getCategory() != Category.LIST) {            throw new UDFArgumentTypeException(0,
"Type array<struct> is expected to be the argument for extract_product_category but
" + args[0].getTypeName() + " is found instead");        }
        listOI = ((ListObjectInspector) args[0]);        structOI = ((StructObjectInspector)
listOI.getListElementObjectInspector());
        if (structOI.getAllStructFieldRefs().size() != 1) {            throw new UDFArgumentTypeException(0,
"Incorrect number of fields in the struct, should be one");        }
        StructField productCategoryField = structOI.getStructFieldRef("productCategory");
       if (productCategoryField == null) {            throw new UDFArgumentTypeException(0,
"NO \"productCategory\" field in input structure");        }
        prodCatOI = productCategoryField.getFieldObjectInspector();        if (prodCatOI.getCategory()
!= Category.PRIMITIVE) {            throw new UDFArgumentTypeException(0, "productCategory
field must be of string type");        }
        if (((PrimitiveObjectInspector)prodCatOI).getPrimitiveCategory() != PrimitiveObjectInspector.PrimitiveCategory.STRING)
{            throw new UDFArgumentTypeException(0, "productCategory field must be of string
type");        }
        ret = new ArrayList();
        return ObjectInspectorFactory.getStandardListObjectInspector(PrimitiveObjectInspectorFactory.writableStringObjectInspector);
   }
    @Override    public ArrayList evaluate(DeferredObject[] arguments)            throws HiveException
   {        ret.clear();
        if (arguments.length != 1) {            ArrayList<String> emptyList = new ArrayList<String>();
           return emptyList;        }
        if (arguments[0].get() == null) {            ArrayList<String> emptyList = new
ArrayList<String>();            return emptyList;        }
        int numElements = listOI.getListLength(arguments[0].get());
        for (int i = 0; i< numElements; i++){            Object element = listOI.getListElement(arguments[0].get(),
i);            Object prodCatValue = structOI.getStructFieldData(element, structOI.getStructFieldRef("productCategory"));
           ret.add(((PrimitiveObjectInspector)prodCatOI).getPrimitiveWritableObject(prodCatValue));
       }        return ret;    }
    @Override    public String getDisplayString(String[] strings)    {        assert (strings.length
> 0);        StringBuilder sb = new StringBuilder();        sb.append("extract_product_category(");
       sb.append(strings[0]);        sb.append(")");        return sb.toString();    }}
=====================================================
> Date: Wed, 3 Apr 2013 10:07:49 +0900
> Subject: Re: A GenericUDF Function to Extract a Field From an Array of Structs
> From: navis.ryu@nexr.com
> To: user@hive.apache.org
> 
> Try changing the code in the evaluate method like this:
> 
> for (int i = 0; i < numElements; i++) {
>       Object element = listOI.getListElement(arguments[0].get(), i);
>       Object product = structOI.getStructFieldData(element,
> structOI.getStructFieldRef("productCategory"));
>       ret.add(((PrimitiveObjectInspector)prodCatOI).getPrimitiveWritableObject(product));
> }
> 
> 2013/3/29 Peter Chu <pete.chu@outlook.com>:
> > Sorry, the test should be following (changed extract_shas to
> > extract_product_category):
> >
> > import org.apache.hadoop.hive.ql.metadata.HiveException;
> > import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
> > import org.apache.hadoop.hive.ql.udf.generic.GenericUDF.DeferredObject;
> > import org.apache.hadoop.hive.serde2.objectinspector.ListObjectInspector;
> > import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
> > import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
> > import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
> > import
> > org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
> > import org.testng.annotations.Test;
> >
> > import java.util.ArrayList;
> > import java.util.List;
> >
> > public class TestGenericUDFExtractProductCategory
> > {
> >     ArrayList<String> fieldNames = new ArrayList<String>();
> >     ArrayList<ObjectInspector> fieldObjectInspectors = new
> > ArrayList<ObjectInspector>();
> >
> >     @Test
> >     public void simpleTest()
> >         throws Exception
> >     {
> >         ListObjectInspector firstInspector = new MyListObjectInspector();
> >
> >         ArrayList test = new ArrayList();
> >         test.add("test");
> >
> >         ArrayList test2 = new ArrayList();
> >         test2.add(test);
> >
> >         StructObjectInspector soi =
> > ObjectInspectorFactory.getStandardStructObjectInspector(test, test2);
> >
> >         fieldNames.add("productCategory");
> >
> > fieldObjectInspectors.add(PrimitiveObjectInspectorFactory.writableStringObjectInspector);
> >
> >         GenericUDF.DeferredObject firstDeferredObject = new
> > MyDeferredObject(test2);
> >
> >         GenericUDF extract_product_category = new
> > GenericUDFExtractProductCategory();
> >
> >         extract_product_category.initialize(new
> > ObjectInspector[]{firstInspector});
> >
> >         extract_product_category.evaluate(new
> > DeferredObject[]{firstDeferredObject});
> >     }
> >
> >     public class MyDeferredObject implements DeferredObject
> >     {
> >         private Object value;
> >
> >         public MyDeferredObject(Object value) {
> >             this.value = value;
> >         }
> >
> >         @Override
> >         public Object get() throws HiveException
> >         {
> >             return value;
> >         }
> >     }
> >
> >     private class MyListObjectInspector implements ListObjectInspector
> >     {
> >         @Override
> >         public ObjectInspector getListElementObjectInspector()
> >         {
> >             return
> > ObjectInspectorFactory.getStandardStructObjectInspector(fieldNames,
> > fieldObjectInspectors);
> >         }
> >
> >         @Override
> >         public Object getListElement(Object data, int index)
> >         {
> >             List myList = (List) data;
> >             if (myList == null || index > myList.size()) {
> >                 return null;
> >             }
> >             return myList.get(index);
> >         }
> >
> >         @Override
> >         public int getListLength(Object data)
> >         {
> >             if (data == null) {
> >                 return -1;
> >             }
> >             return ((List) data).size();
> >         }
> >
> >         @Override
> >         public List<?> getList(Object data)
> >         {
> >             return (List) data;
> >         }
> >
> >         @Override
> >         public String getTypeName()
> >         {
> >             return null;  //To change body of implemented methods use File |
> > Settings | File Templates.
> >         }
> >
> >         @Override
> >         public Category getCategory()
> >         {
> >             return Category.LIST;
> >         }
> >     }
> > }
> >
> > ________________________________
> > From: pete.chu@outlook.com
> > To: user@hive.apache.org
> > Subject: A GenericUDF Function to Extract a Field From an Array of Structs
> > Date: Thu, 28 Mar 2013 14:16:33 -0700
> >
> > I am trying to write a GenericUDF function to collect all of a specific
> > struct field(s) within an array for each record, and return them in an array
> > as well.
> >
> > I wrote the UDF (as below), and it seems to work but:
> >
> > 1) It does not work when I run it on an external table, although it works
> > fine on a managed table. Any idea why?
> >
> > 2) I am having a tough time writing a test for this.  I have attached the
> > test I have so far, and it does not work;
> > I always get 'java.util.ArrayList cannot be cast to
> > org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector' or 'cannot
> > cast String to LazyString'.
> > My question is: how do I supply a list of structs to the evaluate method?
> >
> > Any help will be greatly appreciated.
> >
> > Thanks,
> > Peter
> >
> > The table:
> >
> > CREATE EXTERNAL TABLE FOO (
> >   TS string,
> >   customerId string,
> >   products array< struct<productCategory:string> >
> > )
> > PARTITIONED BY (ds string)
> > ROW FORMAT SERDE 'some.serde'
> > WITH SERDEPROPERTIES ('error.ignore'='true')
> > LOCATION 'some_locations'
> > ;
> >
> > A row of record holds:
> > 1340321132000, 'some_company',
> > [{"productCategory":"footwear"},{"productCategory":"eyewear"}]
> >
> > This is my code:
> >
> > import org.apache.hadoop.hive.ql.exec.Description;
> > import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
> > import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException;
> > import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException;
> > import org.apache.hadoop.hive.ql.metadata.HiveException;
> > import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
> > import org.apache.hadoop.hive.serde2.lazy.LazyString;
> > import org.apache.hadoop.hive.serde2.objectinspector.ListObjectInspector;
> > import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
> > import
> > org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category;
> > import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
> > import
> > org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector;
> > import org.apache.hadoop.hive.serde2.objectinspector.StructField;
> > import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
> > import
> > org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
> > import
> > org.apache.hadoop.hive.serde2.objectinspector.primitive.StringObjectInspector;
> > import org.apache.hadoop.io.Text;
> >
> > import java.util.ArrayList;
> >
> > @Description(name = "extract_product_category",
> >         value = "_FUNC_( array< struct<productCategory:string> > ) -
Collect
> > all product category field values inside an array of struct(s), and return
> > the results in an array<string>",
> >         extended = "Example:\n SELECT
> > _FUNC_(array_of_structs_with_product_category_field)")
> > public class GenericUDFExtractProductCategory
> >         extends GenericUDF
> > {
> >     private ArrayList ret;
> >
> >     private ListObjectInspector listOI;
> >     private StructObjectInspector structOI;
> >     private ObjectInspector prodCatOI;
> >
> >     @Override
> >     public ObjectInspector initialize(ObjectInspector[] args)
> >             throws UDFArgumentException
> >     {
> >         if (args.length != 1) {
> >             throw new UDFArgumentLengthException("The function
> > extract_product_category() requires exactly one argument.");
> >         }
> >
> >         if (args[0].getCategory() != Category.LIST) {
> >             throw new UDFArgumentTypeException(0, "Type array<struct> is
> > expected to be the argument for extract_product_category but " +
> > args[0].getTypeName() + " is found instead");
> >         }
> >
> >         listOI = ((ListObjectInspector) args[0]);
> >         structOI = ((StructObjectInspector)
> > listOI.getListElementObjectInspector());
> >
> >         if (structOI.getAllStructFieldRefs().size() != 1) {
> >             throw new UDFArgumentTypeException(0, "Incorrect number of
> > fields in the struct, should be one");
> >         }
> >
> >         StructField productCategoryField =
> > structOI.getStructFieldRef("productCategory");
> >         //If not, throw exception
> >         if (productCategoryField == null) {
> >             throw new UDFArgumentTypeException(0, "NO \"productCategory\"
> > field in input structure");
> >         }
> >
> >         //Are they of the correct types?
> >         //We store these object inspectors for use in the evaluate() method
> >         prodCatOI = productCategoryField.getFieldObjectInspector();
> >
> >         //First are they primitives
> >         if (prodCatOI.getCategory() != Category.PRIMITIVE) {
> >             throw new UDFArgumentTypeException(0, "productCategory field
> > must be of string type");
> >         }
> >
> >         //Are they of the correct primitives?
> >         if (((PrimitiveObjectInspector)prodCatOI).getPrimitiveCategory() !=
> > PrimitiveObjectInspector.PrimitiveCategory.STRING) {
> >             throw new UDFArgumentTypeException(0, "productCategory field
> > must be of string type");
> >         }
> >
> >         ret = new ArrayList();
> >
> >         return
> > ObjectInspectorFactory.getStandardListObjectInspector(PrimitiveObjectInspectorFactory.writableStringObjectInspector);
> >     }
> >
> >     @Override
> >     public ArrayList evaluate(DeferredObject[] arguments)
> >             throws HiveException
> >     {
> >         ret.clear();
> >
> >         if (arguments.length != 1) {
> >             return null;
> >         }
> >
> >         if (arguments[0].get() == null) {
> >         return null;
> >         }
> >
> >         int numElements = listOI.getListLength(arguments[0].get());
> >
> >         for (int i = 0; i < numElements; i++) {
> >             LazyString prodCatDataObject = (LazyString)
> > (structOI.getStructFieldData(listOI.getListElement(arguments[0].get(), i),
> > structOI.getStructFieldRef("productCategory")));
> >             Text productCategoryValue = ((StringObjectInspector)
> > prodCatOI).getPrimitiveWritableObject(prodCatDataObject);
> >             ret.add(productCategoryValue);
> >         }
> >         return ret;
> >     }
> >
> >     @Override
> >     public String getDisplayString(String[] strings)
> >     {
> >         assert (strings.length > 0);
> >         StringBuilder sb = new StringBuilder();
> >         sb.append("extract_product_category(");
> >         sb.append(strings[0]);
> >         sb.append(")");
> >         return sb.toString();
> >     }
> > }
> >
> >
> > My Test:
> >
> > import org.apache.hadoop.hive.ql.metadata.HiveException;
> > import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
> > import org.apache.hadoop.hive.ql.udf.generic.GenericUDF.DeferredObject;
> > import org.apache.hadoop.hive.serde2.objectinspector.ListObjectInspector;
> > import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
> > import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
> > import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
> > import
> > org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
> > import org.testng.annotations.Test;
> >
> > import java.util.ArrayList;
> > import java.util.List;
> >
> > public class TestGenericUDFExtractShas
> > {
> >     ArrayList<String> fieldNames = new ArrayList<String>();
> >     ArrayList<ObjectInspector> fieldObjectInspectors = new
> > ArrayList<ObjectInspector>();
> >
> >     @Test
> >     public void simpleTest()
> >         throws Exception
> >     {
> >         ListObjectInspector firstInspector = new MyListObjectInspector();
> >
> >         ArrayList test = new ArrayList();
> >         test.add("test");
> >
> >         ArrayList test2 = new ArrayList();
> >         test2.add(test);
> >
> >         StructObjectInspector soi =
> > ObjectInspectorFactory.getStandardStructObjectInspector(test, test2);
> >
> >         fieldNames.add("productCategory");
> >
> > fieldObjectInspectors.add(PrimitiveObjectInspectorFactory.writableStringObjectInspector);
> >
> >         GenericUDF.DeferredObject firstDeferredObject = new
> > MyDeferredObject(test2);
> >
> >         GenericUDF extract_shas = new GenericUDFExtractShas();
> >
> >         extract_shas.initialize(new ObjectInspector[]{firstInspector});
> >
> >         extract_shas.evaluate(new DeferredObject[]{firstDeferredObject});
> >     }
> >
> >     public class MyDeferredObject implements DeferredObject
> >     {
> >         private Object value;
> >
> >         public MyDeferredObject(Object value) {
> >             this.value = value;
> >         }
> >
> >         @Override
> >         public Object get() throws HiveException
> >         {
> >             return value;
> >         }
> >     }
> >
> >     private class MyListObjectInspector implements ListObjectInspector
> >     {
> >         @Override
> >         public ObjectInspector getListElementObjectInspector()
> >         {
> >             return
> > ObjectInspectorFactory.getStandardStructObjectInspector(fieldNames,
> > fieldObjectInspectors);
> >         }
> >
> >         @Override
> >         public Object getListElement(Object data, int index)
> >         {
> >             List myList = (List) data;
> >             if (myList == null || index > myList.size()) {
> >                 return null;
> >             }
> >             return myList.get(index);
> >         }
> >
> >         @Override
> >         public int getListLength(Object data)
> >         {
> >             if (data == null) {
> >                 return -1;
> >             }
> >             return ((List) data).size();
> >         }
> >
> >         @Override
> >         public List<?> getList(Object data)
> >         {
> >             return (List) data;
> >         }
> >
> >         @Override
> >         public String getTypeName()
> >         {
> >             return null;  //To change body of implemented methods use File |
> > Settings | File Templates.
> >         }
> >
> >         @Override
> >         public Category getCategory()
> >         {
> >             return Category.LIST;
> >         }
> >     }
> > }
 		 	   		   		 	   		  
Mime
View raw message