impala-user mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From Dejan Prokić <dej...@nordeus.com>
Subject Bug with string in maps
Date Mon, 15 Aug 2016 10:54:11 GMT
Hello,

I have a problem to extract data from a column of type MAP<STRING, STRING>,
impala returns me bad data. Please, tell me if there is a workaround to
extract specific values from a map. This is version of impala I use:
Server version: impalad version 2.3.0-cdh5.5.2 RELEASE (build
cc1125f10419a7269366f7f950f57b24b07acd64)

Here is an example which doesn't work well:

-- this query returns correct result only for smallint_required_param_int,
CASE without CAST and GROUP_CONCAT return bad data
SELECT
e.user_id,
m.smallint_required_param_int,
m.smallint_required_param_str,
m.all_values_from_map,
e.date_id
FROM event e,
(SELECT
MAX(CASE WHEN key='smallint_required_param' THEN CAST(value AS SMALLINT)
END) AS smallint_required_param_int,
MAX(CASE WHEN key='smallint_required_param' THEN value END) AS
smallint_required_param_str,
GROUP_CONCAT(concat(key, ':', value), ',')  as all_values_from_map
FROM e.event_map) m
WHERE e.event_id = 3;


Here is how to prepare data from attachment:

-- hive
CREATE TABLE event_unpartitioned
ROW FORMAT
SERDE 'org.apache.hadoop.hive.serde2.avro.AvroSerDe'
STORED AS
INPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat'
OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat'
TBLPROPERTIES ('avro.schema.literal'='{
  "namespace": "testing.hive.avro.serde",
  "name": "events",
  "type": "record",
  "fields": [
   {"name": "user_id", "type": ["null", "long"], "default": null},
   {"name": "event_id",  "type": ["null", "int"] , "default": null},
   {"name": "event_type_id",  "type": ["null", "int"] , "default": null},
   {"name": "ts_bigint", "type": ["null", "long"], "default": null},
   {"name": "event_map", "type": ["null", {"type": "map", "values":
"string"}], "default": null},
   {"name": "server_id", "type": ["null", "int"], "default": null},
   {"name": "date_id", "type": ["null", "int"], "default": null}
  ]
}');

-- impala
CREATE TABLE event (
  user_id BIGINT,
  event_id INT,
  event_type_id INT,
  ts_bigint BIGINT,
  event_map MAP<STRING, STRING>,
  server_id INT
)
PARTITIONED BY (date_id INT)
STORED AS PARQUET;

-- bash
hdfs dfs -put events.avro /user/hive/warehouse/event_unpartitioned/

-- hive
set hive.exec.dynamic.partition.mode=nonstrict;
insert into event partition (date_id) select * from event_unpartitioned;

-- impala
refresh event;

select * from event e, e.event_map; -- this query returns correct results

Thanks

*Dejan Prokić* | Data Engineer | Nordeus

Mime
View raw message