hadoop-common-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From Apache Wiki <wikidi...@apache.org>
Subject [Hadoop Wiki] Update of "Hive/HBaseIntegration" by JohnSichi
Date Fri, 04 Jun 2010 21:17:01 GMT
Dear Wiki user,

You have subscribed to a wiki page or wiki category on "Hadoop Wiki" for change notification.

The "Hive/HBaseIntegration" page has been changed by JohnSichi.
http://wiki.apache.org/hadoop/Hive/HBaseIntegration?action=diff&rev1=27&rev2=28

--------------------------------------------------

  The column mapping support currently available is somewhat
  cumbersome and restrictive:
  
-  * the first column in the Hive table always maps to the key in the HBase table
-  * for each subsequent Hive column, the table creator must specify a corresponding entry
in the comma-delimited {{{hbase.columns.mapping}}} string (so for a Hive table with n columns,
the string should have n-1 entries); whitespace should '''not''' be used in between entries
since these will be interpreted as part of the column name, which is almost certainly not
what you want
+  * for each Hive column, the table creator must specify a corresponding entry in the comma-delimited
{{{hbase.columns.mapping}}} string (so for a Hive table with n columns, the string should
have n entries); whitespace should '''not''' be used in between entries since these will be
interpreted as part of the column name, which is almost certainly not what you want
-  * a mapping entry is of the form {{{column-family-name:[column-name]}}}
+  * a mapping entry must be either {{{:key}}} or of the form {{{column-family-name:[column-name]}}}
+  * there must be exactly one {{{:key}}} mapping (we don't support compound keys yet)
+  ** note that before HIVE-1228, {{{:key}}} was not supported, and the first Hive column
implicitly mapped to the key; as of HIVE-1228, it is now strongly recommended that you always
specify the key explicitly; we will drop support for implicit key mapping in the future
   * if no column-name is given, then the Hive column will map to all columns in the corresponding
HBase column family, and the Hive MAP datatype must be used to allow access to these (possibly
sparse) columns
   * there is currently no way to access the HBase timestamp attribute, and queries always
access data with the latest timestamp.
   * since HBase does not associate datatype information with columns, the serde converts
everything to string representation before storing it in HBase; there is currently no way
to plug in a custom serde per column
@@ -160, +161 @@

  CREATE TABLE hbase_table_1(key int, value1 string, value2 int, value3 int) 
  STORED BY 'org.apache.hadoop.hive.hbase.HBaseStorageHandler'
  WITH SERDEPROPERTIES (
- "hbase.columns.mapping" = "a:b,a:c,d:e"
+ "hbase.columns.mapping" = ":key,a:b,a:c,d:e"
  );
  INSERT OVERWRITE TABLE hbase_table_1 SELECT foo, bar, foo+1, foo+2 
  FROM pokes WHERE foo=98 OR foo=100;
@@ -212, +213 @@

  CREATE TABLE hbase_table_1(key int, value map<string,int>) 
  STORED BY 'org.apache.hadoop.hive.hbase.HBaseStorageHandler'
  WITH SERDEPROPERTIES (
- "hbase.columns.mapping" = "cf:"
+ "hbase.columns.mapping" = ":key,cf:"
  );
  INSERT OVERWRITE TABLE hbase_table_1 SELECT foo, map(bar, foo) FROM pokes 
  WHERE foo=98 OR foo=100;
@@ -248, +249 @@

  CREATE TABLE hbase_table_1(key int, value map<int,int>) 
  STORED BY 'org.apache.hadoop.hive.hbase.HBaseStorageHandler'
  WITH SERDEPROPERTIES (
- "hbase.columns.mapping" = "cf:"
+ "hbase.columns.mapping" = ":key,cf:"
  );
  FAILED: Error in metadata: java.lang.RuntimeException: MetaException(message:org.apache.hadoop.hive.serde2.SerDeException
org.apache.hadoop.hive.hbase.HBaseSerDe: hbase column family 'cf:' should be mapped to map<string,?>
but is mapped to map<int,int>)
  }}}
@@ -262, +263 @@

  CREATE TABLE hbase_table_1(key int, value string) 
  STORED BY 'org.apache.hadoop.hive.hbase.HBaseStorageHandler'
  WITH SERDEPROPERTIES (
- "hbase.columns.mapping" = "cf:"
+ "hbase.columns.mapping" = ":key,cf:"
  );
  FAILED: Error in metadata: java.lang.RuntimeException: MetaException(message:org.apache.hadoop.hive.serde2.SerDeException
org.apache.hadoop.hive.hbase.HBaseSerDe: hbase column family 'cf:' should be mapped to map<string,?>
but is mapped to string)
  }}}
@@ -289, +290 @@

  CREATE TABLE pokes3(foo INT, bar STRING)
  STORED BY 'org.apache.hadoop.hive.hbase.HBaseStorageHandler'
  WITH SERDEPROPERTIES (
- "hbase.columns.mapping" = "cf:bar"
+ "hbase.columns.mapping" = ":key,cf:bar"
  );
  INSERT OVERWRITE TABLE pokes3 SELECT * FROM pokes;
  -- this will return 1 instead of 3
@@ -308, +309 @@

   * run profiler and minimize any per-row overhead in column mapping
   * user defined routines for lookups and data loads via HBase client API (HIVE-758 and HIVE-791)
   * logging is very noisy, with a lot of spurious exceptions; investigate these and either
fix their cause or squelch them
-  * replace dependencies on deprecated HBase APIs such as RowResult
+  * replace dependencies on deprecated HBase APIs such as RowResult (HIVE-1229)
   * allow HBase WAL to be disabled (HIVE-1383)
  
  = Build =

Mime
View raw message