hive-user mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From Sanjay Subramanian <Sanjay.Subraman...@wizecommerce.com>
Subject Errors in one Hive script using LZO compression
Date Tue, 18 Jun 2013 06:59:53 GMT
Hi

I am using LZO compression in our Hive scripts, but one script still fails with the error below.

Diagnostic Messages for this Task:
Error: java.io.IOException: java.io.EOFException: Premature EOF from inputStream
        at org.apache.hadoop.hive.io.HiveIOExceptionHandlerChain.handleRecordReaderCreationException(HiveIOExceptionHandlerChain.java:97)
        at org.apache.hadoop.hive.io.HiveIOExceptionHandlerUtil.handleRecordReaderCreationException(HiveIOExceptionHandlerUtil.java:57)
        at org.apache.hadoop.hive.ql.io.HiveInputFormat.getRecordReader(HiveInputFormat.java:243)
        at org.apache.hadoop.hive.ql.io.CombineHiveInputFormat.getRecordReader(CombineHiveInputFormat.java:522)
        at org.apache.hadoop.mapred.MapTask$TrackedRecordReader.<init>(MapTask.java:160)
        at org.apache.hadoop.mapred.MapTask.runOldMapper(MapTask.java:381)
        at org.apache.hadoop.mapred.MapTask.run(MapTask.java:334)
        at org.apache.hadoop.mapred.YarnChild$2.run(YarnChild.java:152)
        at java.security.AccessController.doPrivileged(Native Method)
        at javax.security.auth.Subject.doAs(Subject.java:396)
        at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1332)
        at org.apache.hadoop.mapred.YarnChild.main(YarnChild.java:147)
Caused by: java.io.EOFException: Premature EOF from inputStream
        at com.hadoop.compression.lzo.LzopInputStream.readFully(LzopInputStream.java:75)
        at com.hadoop.compression.lzo.LzopInputStream.readHeader(LzopInputStream.java:114)
        at com.hadoop.compression.lzo.LzopInputStream.<init>(LzopInputStream.java:54)
        at com.hadoop.compression.lzo.LzopCodec.createInputStream(LzopCodec.java:83)
        at org.apache.hadoop.io.SequenceFile$Reader.init(SequenceFile.java:1871)
        at org.apache.hadoop.io.SequenceFile$Reader.initialize(SequenceFile.java:1765)
        at org.apache.hadoop.io.SequenceFile$Reader.<init>(SequenceFile.java:1714)
        at org.apache.hadoop.io.SequenceFile$Reader.<init>(SequenceFile.java:1728)
        at org.apache.hadoop.mapred.SequenceFileRecordReader.<init>(SequenceFileRecordReader.java:49)
        at org.apache.hadoop.mapred.SequenceFileInputFormat.getRecordReader(SequenceFileInputFormat.java:64)
        at org.apache.hadoop.hive.ql.io.HiveInputFormat.getRecordReader(HiveInputFormat.java:240)
        ... 9 more


SCRIPT
=======
-- Session settings: compression of map output, of files passed between stages
-- and of the final job output, followed by task-count hints.
--
-- Block-level compression for SequenceFile output. The previous form of this line
-- put the word hiveconf and a space in front of the property name. That is not the
-- hiveconf namespace syntax (which needs a colon) and the prefix is not needed for a
-- plain set, so the intended property was never set.
set mapred.output.compression.type=BLOCK;
-- Map output (shuffle) is compressed with Snappy.
set mapred.map.output.compression.codec=org.apache.hadoop.io.compress.SnappyCodec;
set mapreduce.map.output.compress=true;
-- Final job output is compressed with the lzop-format codec.
set hive.exec.compress.output=true;
set mapreduce.output.fileoutputformat.compress.codec=com.hadoop.compression.lzo.LzopCodec;
set mapreduce.output.fileoutputformat.compress=true;
-- Files handed from one stage to the next are compressed as well.
set hive.exec.compress.intermediate=true;
-- NOTE(review): with no explicit intermediate codec the job output codec above is
-- presumably reused for intermediate files. The reported failure is LzopInputStream
-- failing to read an lzop header from inside a SequenceFile reader, so an explicit
-- non-lzop codec is pinned here. If the failing files belong to the source tables
-- rather than to an intermediate stage, those tables need rewriting instead - confirm.
set hive.intermediate.compression.codec=org.apache.hadoop.io.compress.SnappyCodec;
set hive.intermediate.compression.type=BLOCK;
-- Task-count hints for this job.
set mapreduce.job.maps=500;
set mapreduce.job.reduces=8;
-- NOTE(review): the two tasktracker maxima look like daemon-side settings - confirm
-- they have any effect when set per session.
set mapreduce.tasktracker.map.tasks.maximum=12;
set mapreduce.tasktracker.reduce.tasks.maximum=8;
-- Load the UDF jar and register the session-scoped functions.
-- The query in this script calls isnextagip, isfrombot and getValidHiddenSellers.
-- collect and processblankkeyword are registered but not referenced by that query.
ADD JAR /home/nextag/sasubramanian/mycode/impressions/jar/impressions-hiveudfs-1.0-20130615-155038.jar;
CREATE TEMPORARY FUNCTION collect AS 'com.wizecommerce.utils.hive.udf.GenericUDAFCollect';
CREATE TEMPORARY FUNCTION isnextagip AS 'com.wizecommerce.utils.hive.udf.IsNextagIP';
CREATE TEMPORARY FUNCTION isfrombot AS 'com.wizecommerce.utils.hive.udf.IsFromBot';
CREATE TEMPORARY FUNCTION processblankkeyword AS 'com.wizecommerce.utils.hive.udf.ProcessBlankKeyword';
CREATE TEMPORARY FUNCTION getValidHiddenSellers AS 'com.wizecommerce.utils.hive.udf.GetValidHiddenSellers';
-- Writes one row per impression header for partition 2013-03-19 to the target directory.
-- Each row carries the header columns, the first dot-separated part of the server name,
-- a comma-joined set of seller_id|pricetier|price|rank strings from the ptitle rows and
-- a comma-joined set of seller_id|ptitle_id|tag_id|price_tier strings from the hidden
-- seller rows after they pass through getValidHiddenSellers.
-- The date 2013-03-19 is hard-coded in the output path and in all four partition filters
-- and they must be changed together.
INSERT OVERWRITE DIRECTORY '/user/beeswax/warehouse/keyword_impressions_ptitles_log/2013-03-19'
SELECT
     hp.header_date,
     hp.impression_id,
     hp.header_searchsessionid,
     hp.cached_visit_id,
     split(hp.header_servername,'[\.]')[0],
     hp.cached_ip,
     hp.header_adnode,
     -- Seller list. A result equal to the -1|-1 marker is replaced by NULL
     -- (presumably the value produced when the outer join found no ptitle row - confirm).
     IF(
          concat_ws(',',
               collect_set(
                    concat_ws('|',
                         cast(hp.seller_id as STRING),
                         cast(IF(hp.seller_pricetier IS NULL, -1L, hp.seller_pricetier) as STRING),
                         cast(hp.seller_price as STRING),
                         cast(IF(hp.ptitle_rank IS NULL, -1L, hp.ptitle_rank) as STRING)
                    )
               )
          ) = '-1|-1',
          NULL,
          concat_ws(',',
               collect_set(
                    concat_ws('|',
                         cast(hp.seller_id as STRING),
                         cast(IF(hp.seller_pricetier IS NULL, -1L, hp.seller_pricetier) as STRING),
                         cast(hp.seller_price as STRING),
                         cast(IF(hp.ptitle_rank IS NULL, -1L, hp.ptitle_rank) as STRING)
                    )
               )
          )
     ),
     -- Hidden seller list. An empty result is replaced by NULL.
     IF(
          concat_ws(',',
               getValidHiddenSellers(
                    collect_set(
                         concat_ws('|',
                              cast(sh.seller_id as STRING),
                              cast(sh.ptitle_id as STRING),
                              cast(sh.tag_id as STRING),
                              cast(IF(sh.price_tier IS NULL, -1L, sh.price_tier) as STRING)
                         )
                    )
               )
          ) = '',
          NULL,
          concat_ws(',',
               getValidHiddenSellers(
                    collect_set(
                         concat_ws('|',
                              cast(sh.seller_id as STRING),
                              cast(sh.ptitle_id as STRING),
                              cast(sh.tag_id as STRING),
                              cast(IF(sh.price_tier IS NULL, -1L, sh.price_tier) as STRING)
                         )
                    )
               )
          )
     )
FROM
     (SELECT
          h.header_date,
          h.header_servername,
          h.impression_id,
          h.header_searchsessionid,
          h.cached_visit_id,
          h.cached_ip,
          h.header_adnode,
          p.ptitle_ptitleid,
          p.seller_id,
          p.seller_pricetier,
          p.seller_price,
          p.ptitle_rank
     FROM
          -- Header rows of the partition, filtered on the absent-sellers flag,
          -- a non-null record id and the two UDF checks on IP and visit id.
          (SELECT
               header_date,
               header_servername,
               impression_id,
               header_searchsessionid,
               cached_ip,
               header_adnode,
               cached_recordid,
               cached_visit_id
           FROM
                outpdir_impressions_header
           WHERE
              header_date_partition='2013-03-19'
           AND
              header_rbabsentsellers = 1L
           AND
              cached_recordid IS NOT NULL
           AND
              isnextagip(cached_ip) = FALSE
           AND
              isfrombot(cached_visit_id) = FALSE
          ) h
     LEFT OUTER JOIN
          -- Ptitle rows restricted to (impression, ptitle, rank) groups that have
          -- exactly one distinct seller/pricetier/price combination.
          (SELECT
               po.impression_id,
               po.ptitle_ptitleid,
               po.header_date,
               po.seller_id,
               po.seller_pricetier,
               po.seller_price,
               po.ptitle_rank
           FROM
               (SELECT
                    impression_id,
                    ptitle_ptitleid,
                    header_date,
                    seller_id,
                    seller_pricetier,
                    seller_price,
                    ptitle_rank
                FROM
                    outpdir_impressions_ptitle
                WHERE
                    header_date_partition = '2013-03-19'
                AND
                    seller_id IS NOT NULL
                )  po
           JOIN
               (SELECT
                    impression_id,
                    ptitle_ptitleid,
                    ptitle_rank,
                    COUNT(DISTINCT seller_id, seller_pricetier, seller_price, ptitle_rank)
                FROM
                    outpdir_impressions_ptitle pi
                WHERE
                    header_date_partition = '2013-03-19'
                AND
                    seller_id IS NOT NULL
                GROUP BY
                    impression_id,
                    ptitle_ptitleid,
                    ptitle_rank
                HAVING
                    COUNT(DISTINCT seller_id, seller_pricetier, seller_price, ptitle_rank) = 1
                ) pi
           ON
               po.impression_id = pi.impression_id
           AND
               po.ptitle_ptitleid = pi.ptitle_ptitleid
           AND
               po.ptitle_rank = pi.ptitle_rank
          ) p
     ON
          h.impression_id = p.impression_id
     AND
          h.header_date=p.header_date
     ) hp
LEFT OUTER JOIN
     -- Hidden seller rows of the partition. Only the columns the outer query reads
     -- are projected (this replaces a star select).
     (SELECT
           impression_id,
           header_date,
           seller_id,
           ptitle_id,
           tag_id,
           price_tier
      FROM
           outpdir_seller_hidden
      WHERE
          header_date_partition='2013-03-19'
     ) sh
ON
     hp.impression_id = sh.impression_id
AND
     hp.header_date = sh.header_date
GROUP BY
     hp.header_date,
     hp.impression_id,
     hp.header_searchsessionid,
     hp.cached_visit_id,
     hp.header_servername,
     hp.cached_ip,
     hp.header_adnode;


CONFIDENTIALITY NOTICE
======================
This email message and any attachments are for the exclusive use of the intended recipient(s)
and may contain confidential and privileged information. Any unauthorized review, use, disclosure
or distribution is prohibited. If you are not the intended recipient, please contact the sender
by reply email and destroy all copies of the original message along with any attachments,
from your computer system. If you are the intended recipient, please be advised that the content
of this message is subject to access, review and disclosure by the sender's Email System Administrator.

Mime
View raw message