Return-Path: X-Original-To: apmail-hive-commits-archive@www.apache.org Delivered-To: apmail-hive-commits-archive@www.apache.org Received: from mail.apache.org (hermes.apache.org [140.211.11.3]) by minotaur.apache.org (Postfix) with SMTP id 50CC711BE4 for ; Thu, 14 Aug 2014 03:45:05 +0000 (UTC) Received: (qmail 90762 invoked by uid 500); 14 Aug 2014 03:45:05 -0000 Delivered-To: apmail-hive-commits-archive@hive.apache.org Received: (qmail 90705 invoked by uid 500); 14 Aug 2014 03:45:05 -0000 Mailing-List: contact commits-help@hive.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: hive-dev@hive.apache.org Delivered-To: mailing list commits@hive.apache.org Received: (qmail 90692 invoked by uid 99); 14 Aug 2014 03:45:05 -0000 Received: from nike.apache.org (HELO nike.apache.org) (192.87.106.230) by apache.org (qpsmtpd/0.29) with ESMTP; Thu, 14 Aug 2014 03:45:05 +0000 X-ASF-Spam-Status: No, hits=-2000.0 required=5.0 tests=ALL_TRUSTED X-Spam-Check-By: apache.org Received: from [140.211.11.4] (HELO eris.apache.org) (140.211.11.4) by apache.org (qpsmtpd/0.29) with ESMTP; Thu, 14 Aug 2014 03:44:41 +0000 Received: from eris.apache.org (localhost [127.0.0.1]) by eris.apache.org (Postfix) with ESMTP id 269F823889D5; Thu, 14 Aug 2014 03:44:39 +0000 (UTC) Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit Subject: svn commit: r1617869 - in /hive/trunk/serde: if/ src/gen/thrift/gen-cpp/ src/gen/thrift/gen-javabean/org/apache/hadoop/hive/serde/ src/gen/thrift/gen-php/org/apache/hadoop/hive/serde/ src/gen/thrift/gen-py/org_apache_hadoop_hive_serde/ src/gen/thrift/g... 
Date: Thu, 14 Aug 2014 03:44:38 -0000 To: commits@hive.apache.org From: brock@apache.org X-Mailer: svnmailer-1.0.9 Message-Id: <20140814034439.269F823889D5@eris.apache.org> X-Virus-Checked: Checked by ClamAV on apache.org Author: brock Date: Thu Aug 14 03:44:38 2014 New Revision: 1617869 URL: http://svn.apache.org/r1617869 Log: HIVE-7142 - Hive multi serialization encoding support (Chengxiang Li via Brock) Added: hive/trunk/serde/src/java/org/apache/hadoop/hive/serde2/AbstractEncodingAwareSerDe.java Modified: hive/trunk/serde/if/serde.thrift hive/trunk/serde/src/gen/thrift/gen-cpp/serde_constants.cpp hive/trunk/serde/src/gen/thrift/gen-cpp/serde_constants.h hive/trunk/serde/src/gen/thrift/gen-javabean/org/apache/hadoop/hive/serde/serdeConstants.java hive/trunk/serde/src/gen/thrift/gen-php/org/apache/hadoop/hive/serde/Types.php hive/trunk/serde/src/gen/thrift/gen-py/org_apache_hadoop_hive_serde/constants.py hive/trunk/serde/src/gen/thrift/gen-rb/serde_constants.rb hive/trunk/serde/src/java/org/apache/hadoop/hive/serde2/DelimitedJSONSerDe.java hive/trunk/serde/src/java/org/apache/hadoop/hive/serde2/SerDeUtils.java hive/trunk/serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazySimpleSerDe.java Modified: hive/trunk/serde/if/serde.thrift URL: http://svn.apache.org/viewvc/hive/trunk/serde/if/serde.thrift?rev=1617869&r1=1617868&r2=1617869&view=diff ============================================================================== --- hive/trunk/serde/if/serde.thrift (original) +++ hive/trunk/serde/if/serde.thrift Thu Aug 14 03:44:38 2014 @@ -30,6 +30,7 @@ const string SERIALIZATION_NULL_FORMAT = const string SERIALIZATION_LAST_COLUMN_TAKES_REST = "serialization.last.column.takes.rest" const string SERIALIZATION_SORT_ORDER = "serialization.sort.order" const string SERIALIZATION_USE_JSON_OBJECTS = "serialization.use.json.object" +const string SERIALIZATION_ENCODING = "serialization.encoding" const string FIELD_DELIM = "field.delim" const string COLLECTION_DELIM = 
"colelction.delim" Modified: hive/trunk/serde/src/gen/thrift/gen-cpp/serde_constants.cpp URL: http://svn.apache.org/viewvc/hive/trunk/serde/src/gen/thrift/gen-cpp/serde_constants.cpp?rev=1617869&r1=1617868&r2=1617869&view=diff ============================================================================== --- hive/trunk/serde/src/gen/thrift/gen-cpp/serde_constants.cpp (original) +++ hive/trunk/serde/src/gen/thrift/gen-cpp/serde_constants.cpp Thu Aug 14 03:44:38 2014 @@ -27,6 +27,8 @@ serdeConstants::serdeConstants() { SERIALIZATION_USE_JSON_OBJECTS = "serialization.use.json.object"; + SERIALIZATION_ENCODING = "serialization.encoding"; + FIELD_DELIM = "field.delim"; COLLECTION_DELIM = "colelction.delim"; Modified: hive/trunk/serde/src/gen/thrift/gen-cpp/serde_constants.h URL: http://svn.apache.org/viewvc/hive/trunk/serde/src/gen/thrift/gen-cpp/serde_constants.h?rev=1617869&r1=1617868&r2=1617869&view=diff ============================================================================== --- hive/trunk/serde/src/gen/thrift/gen-cpp/serde_constants.h (original) +++ hive/trunk/serde/src/gen/thrift/gen-cpp/serde_constants.h Thu Aug 14 03:44:38 2014 @@ -23,6 +23,7 @@ class serdeConstants { std::string SERIALIZATION_LAST_COLUMN_TAKES_REST; std::string SERIALIZATION_SORT_ORDER; std::string SERIALIZATION_USE_JSON_OBJECTS; + std::string SERIALIZATION_ENCODING; std::string FIELD_DELIM; std::string COLLECTION_DELIM; std::string LINE_DELIM; Modified: hive/trunk/serde/src/gen/thrift/gen-javabean/org/apache/hadoop/hive/serde/serdeConstants.java URL: http://svn.apache.org/viewvc/hive/trunk/serde/src/gen/thrift/gen-javabean/org/apache/hadoop/hive/serde/serdeConstants.java?rev=1617869&r1=1617868&r2=1617869&view=diff ============================================================================== --- hive/trunk/serde/src/gen/thrift/gen-javabean/org/apache/hadoop/hive/serde/serdeConstants.java (original) +++ 
hive/trunk/serde/src/gen/thrift/gen-javabean/org/apache/hadoop/hive/serde/serdeConstants.java Thu Aug 14 03:44:38 2014 @@ -49,6 +49,8 @@ public class serdeConstants { public static final String SERIALIZATION_USE_JSON_OBJECTS = "serialization.use.json.object"; + public static final String SERIALIZATION_ENCODING = "serialization.encoding"; + public static final String FIELD_DELIM = "field.delim"; public static final String COLLECTION_DELIM = "colelction.delim"; Modified: hive/trunk/serde/src/gen/thrift/gen-php/org/apache/hadoop/hive/serde/Types.php URL: http://svn.apache.org/viewvc/hive/trunk/serde/src/gen/thrift/gen-php/org/apache/hadoop/hive/serde/Types.php?rev=1617869&r1=1617868&r2=1617869&view=diff ============================================================================== --- hive/trunk/serde/src/gen/thrift/gen-php/org/apache/hadoop/hive/serde/Types.php (original) +++ hive/trunk/serde/src/gen/thrift/gen-php/org/apache/hadoop/hive/serde/Types.php Thu Aug 14 03:44:38 2014 @@ -32,6 +32,8 @@ $GLOBALS['serde_CONSTANTS']['SERIALIZATI $GLOBALS['serde_CONSTANTS']['SERIALIZATION_USE_JSON_OBJECTS'] = "serialization.use.json.object"; +$GLOBALS['serde_CONSTANTS']['SERIALIZATION_ENCODING'] = "serialization.encoding"; + $GLOBALS['serde_CONSTANTS']['FIELD_DELIM'] = "field.delim"; $GLOBALS['serde_CONSTANTS']['COLLECTION_DELIM'] = "colelction.delim"; Modified: hive/trunk/serde/src/gen/thrift/gen-py/org_apache_hadoop_hive_serde/constants.py URL: http://svn.apache.org/viewvc/hive/trunk/serde/src/gen/thrift/gen-py/org_apache_hadoop_hive_serde/constants.py?rev=1617869&r1=1617868&r2=1617869&view=diff ============================================================================== --- hive/trunk/serde/src/gen/thrift/gen-py/org_apache_hadoop_hive_serde/constants.py (original) +++ hive/trunk/serde/src/gen/thrift/gen-py/org_apache_hadoop_hive_serde/constants.py Thu Aug 14 03:44:38 2014 @@ -17,6 +17,7 @@ SERIALIZATION_NULL_FORMAT = "serializati SERIALIZATION_LAST_COLUMN_TAKES_REST = 
"serialization.last.column.takes.rest" SERIALIZATION_SORT_ORDER = "serialization.sort.order" SERIALIZATION_USE_JSON_OBJECTS = "serialization.use.json.object" +SERIALIZATION_ENCODING = "serialization.encoding" FIELD_DELIM = "field.delim" COLLECTION_DELIM = "colelction.delim" LINE_DELIM = "line.delim" Modified: hive/trunk/serde/src/gen/thrift/gen-rb/serde_constants.rb URL: http://svn.apache.org/viewvc/hive/trunk/serde/src/gen/thrift/gen-rb/serde_constants.rb?rev=1617869&r1=1617868&r2=1617869&view=diff ============================================================================== --- hive/trunk/serde/src/gen/thrift/gen-rb/serde_constants.rb (original) +++ hive/trunk/serde/src/gen/thrift/gen-rb/serde_constants.rb Thu Aug 14 03:44:38 2014 @@ -23,6 +23,8 @@ SERIALIZATION_SORT_ORDER = %q"serializat SERIALIZATION_USE_JSON_OBJECTS = %q"serialization.use.json.object" +SERIALIZATION_ENCODING = %q"serialization.encoding" + FIELD_DELIM = %q"field.delim" COLLECTION_DELIM = %q"colelction.delim" Added: hive/trunk/serde/src/java/org/apache/hadoop/hive/serde2/AbstractEncodingAwareSerDe.java URL: http://svn.apache.org/viewvc/hive/trunk/serde/src/java/org/apache/hadoop/hive/serde2/AbstractEncodingAwareSerDe.java?rev=1617869&view=auto ============================================================================== --- hive/trunk/serde/src/java/org/apache/hadoop/hive/serde2/AbstractEncodingAwareSerDe.java (added) +++ hive/trunk/serde/src/java/org/apache/hadoop/hive/serde2/AbstractEncodingAwareSerDe.java Thu Aug 14 03:44:38 2014 @@ -0,0 +1,82 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.serde2; + +import java.nio.charset.Charset; +import java.util.Properties; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.hive.serde.serdeConstants; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; +import org.apache.hadoop.io.Writable; + +import com.google.common.base.Charsets; + +/** + * AbstractEncodingAwareSerDe is aware of the encoding from table properties, + * transforms data from UTF-8 to the specified charset during serialize, and + * transforms data from the specified charset to UTF-8 during deserialize. + */ +public abstract class AbstractEncodingAwareSerDe extends AbstractSerDe { + + protected Charset charset; + + @Override + @Deprecated + public void initialize(Configuration conf, Properties tbl) + throws SerDeException { + charset = Charset.forName(tbl.getProperty(serdeConstants.SERIALIZATION_ENCODING, "UTF-8")); + } + + @Override + public final Writable serialize(Object obj, ObjectInspector objInspector) + throws SerDeException { + Writable result = doSerialize(obj, objInspector); + if (!this.charset.equals(Charsets.UTF_8)) { + result = transformFromUTF8(result); + } + return result; + } + + /** + * transform Writable data from UTF-8 to charset after doSerialize. 
+ * @param blob + * @return + */ + protected abstract Writable transformFromUTF8(Writable blob); + + protected abstract Writable doSerialize(Object obj, ObjectInspector objInspector) throws SerDeException; + + @Override + public final Object deserialize(Writable blob) throws SerDeException { + if (!this.charset.equals(Charsets.UTF_8)) { + blob = transformToUTF8(blob); + } + return doDeserialize(blob); + } + + /** + * transform Writable data from charset to UTF-8 before doDeserialize. + * @param blob + * @return + */ + protected abstract Writable transformToUTF8(Writable blob); + + protected abstract Object doDeserialize(Writable blob) throws SerDeException; +} Modified: hive/trunk/serde/src/java/org/apache/hadoop/hive/serde2/DelimitedJSONSerDe.java URL: http://svn.apache.org/viewvc/hive/trunk/serde/src/java/org/apache/hadoop/hive/serde2/DelimitedJSONSerDe.java?rev=1617869&r1=1617868&r2=1617869&view=diff ============================================================================== --- hive/trunk/serde/src/java/org/apache/hadoop/hive/serde2/DelimitedJSONSerDe.java (original) +++ hive/trunk/serde/src/java/org/apache/hadoop/hive/serde2/DelimitedJSONSerDe.java Thu Aug 14 03:44:38 2014 @@ -46,7 +46,7 @@ public class DelimitedJSONSerDe extends * Not implemented. 
*/ @Override - public Object deserialize(Writable field) throws SerDeException { + public Object doDeserialize(Writable field) throws SerDeException { LOG.error("DelimitedJSONSerDe cannot deserialize."); throw new SerDeException("DelimitedJSONSerDe cannot deserialize."); } Modified: hive/trunk/serde/src/java/org/apache/hadoop/hive/serde2/SerDeUtils.java URL: http://svn.apache.org/viewvc/hive/trunk/serde/src/java/org/apache/hadoop/hive/serde2/SerDeUtils.java?rev=1617869&r1=1617868&r2=1617869&view=diff ============================================================================== --- hive/trunk/serde/src/java/org/apache/hadoop/hive/serde2/SerDeUtils.java (original) +++ hive/trunk/serde/src/java/org/apache/hadoop/hive/serde2/SerDeUtils.java Thu Aug 14 03:44:38 2014 @@ -18,6 +18,7 @@ package org.apache.hadoop.hive.serde2; +import java.nio.charset.Charset; import java.util.ArrayList; import java.util.List; import java.util.Map; @@ -523,4 +524,12 @@ public final class SerDeUtils { private SerDeUtils() { // prevent instantiation } + + public static Text transformTextToUTF8(Text text, Charset previousCharset) { + return new Text(new String(text.getBytes(), previousCharset)); + } + + public static Text transformTextFromUTF8(Text text, Charset targetCharset) { + return new Text(new String(text.getBytes()).getBytes(targetCharset)); + } } Modified: hive/trunk/serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazySimpleSerDe.java URL: http://svn.apache.org/viewvc/hive/trunk/serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazySimpleSerDe.java?rev=1617869&r1=1617868&r2=1617869&view=diff ============================================================================== --- hive/trunk/serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazySimpleSerDe.java (original) +++ hive/trunk/serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazySimpleSerDe.java Thu Aug 14 03:44:38 2014 @@ -19,6 +19,7 @@ package org.apache.hadoop.hive.serde2.lazy; import java.io.IOException; +import 
java.nio.charset.Charset; import java.util.Arrays; import java.util.List; import java.util.Map; @@ -29,11 +30,13 @@ import org.apache.commons.logging.LogFac import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.hive.conf.HiveConf.ConfVars; import org.apache.hadoop.hive.serde.serdeConstants; +import org.apache.hadoop.hive.serde2.AbstractEncodingAwareSerDe; import org.apache.hadoop.hive.serde2.AbstractSerDe; import org.apache.hadoop.hive.serde2.ByteStream; import org.apache.hadoop.hive.serde2.SerDe; import org.apache.hadoop.hive.serde2.SerDeException; import org.apache.hadoop.hive.serde2.SerDeStats; +import org.apache.hadoop.hive.serde2.SerDeUtils; import org.apache.hadoop.hive.serde2.objectinspector.ListObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.MapObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; @@ -61,7 +64,7 @@ import org.apache.hadoop.io.Writable; * Also LazySimpleSerDe outputs typed columns instead of treating all columns as * String like MetadataTypedColumnsetSerDe. 
*/ -public class LazySimpleSerDe extends AbstractSerDe { +public class LazySimpleSerDe extends AbstractEncodingAwareSerDe { public static final Log LOG = LogFactory.getLog(LazySimpleSerDe.class .getName()); @@ -187,6 +190,8 @@ public class LazySimpleSerDe extends Abs public void initialize(Configuration job, Properties tbl) throws SerDeException { + super.initialize(job, tbl); + serdeParams = LazySimpleSerDe.initSerdeParams(job, tbl, getClass() .getName()); @@ -330,7 +335,7 @@ public class LazySimpleSerDe extends Abs * @see SerDe#deserialize(Writable) */ @Override - public Object deserialize(Writable field) throws SerDeException { + public Object doDeserialize(Writable field) throws SerDeException { if (byteArrayRef == null) { byteArrayRef = new ByteArrayRef(); } @@ -375,7 +380,7 @@ public class LazySimpleSerDe extends Abs * @see SerDe#serialize(Object, ObjectInspector) */ @Override - public Writable serialize(Object obj, ObjectInspector objInspector) + public Writable doSerialize(Object obj, ObjectInspector objInspector) throws SerDeException { if (objInspector.getCategory() != Category.STRUCT) { @@ -584,4 +589,16 @@ public class LazySimpleSerDe extends Abs return stats; } + + @Override + protected Writable transformFromUTF8(Writable blob) { + Text text = (Text)blob; + return SerDeUtils.transformTextFromUTF8(text, this.charset); + } + + @Override + protected Writable transformToUTF8(Writable blob) { + Text text = (Text)blob; + return SerDeUtils.transformTextToUTF8(text, this.charset); + } }