incubator-chukwa-user mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From Preetam Patil <pvpatil.i...@gmail.com>
Subject Chunks for events getting jumbles up in Collector
Date Mon, 04 Jul 2011 04:40:47 GMT
Hi,
I'm noticing a strange behavior in the chukwa collector that is causing the
parsers to fail. When a large number of events are coming to the collector,
the event entries get kind of jumbled up (date missing, two entries getting
interleaved, two events getting passed as a single chunk, etc). As a result,
the HadoopMetricsProcessor throws a NullPointerException because the parser
returns a null object for a chunk. I have noticed that in all cases of this
error, chukwa.metrics.jvm is involved. But it does not occur on every
instance of chukwa.metrics.jvm event being received.

Here's one example of the error (in collector.log):

2011-07-04 09:42:18,617 WARN btpool0-18 HadoopMetricsProcessor - Wrong
format in HadoopMetricsProcessor [2011-07-01 16:29:14,860 2011-07-01
16:29:14,862 INFOINFO  chukwa.metrics.jvm: chukwa.metrics.dfs:
{"memHeapCommittedM":24.6875,"recordName":"metrics","chukwa_timestamp":1309517954860,"logFatal":0,"processName":"DataNode","threadsWaiting":8,"threadsBlocked":0,"gcCount":482,"logError":8,"logWarn":9,"period":60,"memNonHeapCommittedM":23.1875,"memNonHeapUsedM":18.441505,"gcTimeMillis":9625,"sessionId":"","logInfo":332,"threadsNew":0,"memHeapUsedM":10.359726,"threadsTerminated":0,"hostName":"clus1-datanode6","threadsTimedWaiting":9,"maxMemoryM":888.9375,"threadsRunnable":8,"contextName":"jvm"}{"recordName":"datanode","chukwa_timestamp":1309517954862,"readBlockOp_num_ops":10,"writeBlockOp_avg_time":0,"block_verification_failures":0,"blockChecksumOp_num_ops":0,"blocks_read":10,"blocks_written":13,"copyBlockOp_avg_time":0,"heartBeats_num_ops":197707,"writes_from_remote_client":12,"replaceBlockOp_num_ops":0,"blockReports_avg_time":0,"blocks_replicated":0,"writeBlockOp_num_ops":13,"heartBeats_avg_time":7,"hostName":"clus1-datanode6","bytes_read":60797,"copyBlockOp_num_ops":0,"contextName":"dfs","blockReports_num_ops":167,"bytes_written":389862107,"replaceBlockOp_avg_time":0,"reads_from_remote_client":8,"period":60,"readBlockOp_avg_time":0,"reads_from_local_client":2,"blocks_verified":1,"writes_from_local_client":1,"sessionId":"","blockChecksumOp_avg_time":0,"blocks_removed":14}

]
java.lang.NullPointerException
at
org.apache.hadoop.chukwa.extraction.demux.processor.mapper.HadoopMetricsProcessor.parse(HadoopMetricsProcessor.java:103)
at
org.apache.hadoop.chukwa.extraction.demux.processor.mapper.AbstractProcessor.process(AbstractProcessor.java:81)
at
org.apache.hadoop.chukwa.datacollection.writer.hbase.HBaseWriter.add(HBaseWriter.java:202)
at
org.apache.hadoop.chukwa.datacollection.writer.SocketTeeWriter.add(SocketTeeWriter.java:252)
at
org.apache.hadoop.chukwa.datacollection.writer.PipelineStageWriter.add(PipelineStageWriter.java:41)
at
org.apache.hadoop.chukwa.datacollection.collector.servlet.ServletCollector.accept(ServletCollector.java:154)
at
org.apache.hadoop.chukwa.datacollection.collector.servlet.ServletCollector.doPost(ServletCollector.java:203)
at javax.servlet.http.HttpServlet.service(HttpServlet.java:727)
at javax.servlet.http.HttpServlet.service(HttpServlet.java:820)
at org.mortbay.jetty.servlet.ServletHolder.handle(ServletHolder.java:487)
at org.mortbay.jetty.servlet.ServletHandler.handle(ServletHandler.java:362)
at org.mortbay.jetty.servlet.SessionHandler.handle(SessionHandler.java:181)
at org.mortbay.jetty.handler.ContextHandler.handle(ContextHandler.java:729)
at org.mortbay.jetty.handler.HandlerWrapper.handle(HandlerWrapper.java:152)
at org.mortbay.jetty.Server.handle(Server.java:324)
at org.mortbay.jetty.HttpConnection.handleRequest(HttpConnection.java:505)
at
org.mortbay.jetty.HttpConnection$RequestHandler.content(HttpConnection.java:843)
at org.mortbay.jetty.HttpParser.parseNext(HttpParser.java:647)
at org.mortbay.jetty.HttpParser.parseAvailable(HttpParser.java:211)
at org.mortbay.jetty.HttpConnection.handle(HttpConnection.java:380)
at
org.mortbay.io.nio.SelectChannelEndPoint.run(SelectChannelEndPoint.java:395)
at
org.mortbay.thread.BoundedThreadPool$PoolThread.run(BoundedThreadPool.java:450)
2011-07-04 09:42:18,618 WARN btpool0-18 HBaseWriter -
org.apache.hadoop.hbase.client.RetriesExhaustedWithDetailsException: Failed
1 action: NoSuchColumnFamilyException: 1 time, servers with issues:
chukwanode:49137,
at
org.apache.hadoop.hbase.client.HConnectionManager$HConnectionImplementation.processBatch(HConnectionManager.java:1223)
at
org.apache.hadoop.hbase.client.HConnectionManager$HConnectionImplementation.processBatchOfPuts(HConnectionManager.java:1237)
at org.apache.hadoop.hbase.client.HTable.flushCommits(HTable.java:819)
at org.apache.hadoop.hbase.client.HTable.doPut(HTable.java:675)
at org.apache.hadoop.hbase.client.HTable.put(HTable.java:665)
at
org.apache.hadoop.chukwa.datacollection.writer.hbase.HBaseWriter.add(HBaseWriter.java:203)
at
org.apache.hadoop.chukwa.datacollection.writer.SocketTeeWriter.add(SocketTeeWriter.java:252)
at
org.apache.hadoop.chukwa.datacollection.writer.PipelineStageWriter.add(PipelineStageWriter.java:41)
at
org.apache.hadoop.chukwa.datacollection.collector.servlet.ServletCollector.accept(ServletCollector.java:154)
at
org.apache.hadoop.chukwa.datacollection.collector.servlet.ServletCollector.doPost(ServletCollector.java:203)
at javax.servlet.http.HttpServlet.service(HttpServlet.java:727)
at javax.servlet.http.HttpServlet.service(HttpServlet.java:820)
at org.mortbay.jetty.servlet.ServletHolder.handle(ServletHolder.java:487)
at org.mortbay.jetty.servlet.ServletHandler.handle(ServletHandler.java:362)
at org.mortbay.jetty.servlet.SessionHandler.handle(SessionHandler.java:181)
at org.mortbay.jetty.handler.ContextHandler.handle(ContextHandler.java:729)
at org.mortbay.jetty.handler.HandlerWrapper.handle(HandlerWrapper.java:152)
at org.mortbay.jetty.Server.handle(Server.java:324)
at org.mortbay.jetty.HttpConnection.handleRequest(HttpConnection.java:505)
at
org.mortbay.jetty.HttpConnection$RequestHandler.content(HttpConnection.java:843)
at org.mortbay.jetty.HttpParser.parseNext(HttpParser.java:647)
at org.mortbay.jetty.HttpParser.parseAvailable(HttpParser.java:211)
at org.mortbay.jetty.HttpConnection.handle(HttpConnection.java:380)
at
org.mortbay.io.nio.SelectChannelEndPoint.run(SelectChannelEndPoint.java:395)
at
org.mortbay.thread.BoundedThreadPool$PoolThread.run(BoundedThreadPool.java:450)

Here's the corresponding entry in the sequenceFile written by the
SequenceFileWriter in HDFS:

^@^@^F^R^@^@^@I<8a>^A0ó2^FÀ^MHadoopMetrics0
cluster="Clus1"/clus1-datanode6/clus1-datanode6<8e>^Ep^@^@^@^A^@^@^@^@^@^@^Ep^@^Oclus1-datanode6^@^P
cluster="Clus1"^@^Oclus1-datanode6^@^MHadoopMetrics^@^@^@^@^@^A^@^@^Eo2011-07-01
16:29:14,860 2011-07-01 16:29:14,862 INFOINFO  chukwa.metrics.jvm:
chukwa.metrics.dfs:
{"memHeapCommittedM":24.6875,"recordName":"metrics","chukwa_timestamp":1309517954860,"logFatal":0,"processName":"DataNode","threadsWaiting":8,"threadsBlocked":0,"gcCount":482,"logError":8,"logWarn":9,"period":60,"memNonHeapCommittedM":23.1875,"memNonHeapUsedM":18.441505,"gcTimeMillis":9625,"sessionId":"","logInfo":332,"threadsNew":0,"memHeapUsedM":10.359726,"threadsTerminated":0,"hostName":"clus1-datanode6","threadsTimedWaiting":9,"maxMemoryM":888.9375,"threadsRunnable":8,"contextName":"jvm"}{"recordName":"datanode","chukwa_timestamp":1309517954862,"readBlockOp_num_ops":10,"writeBlockOp_avg_time":0,"block_verification_failures":0,"blockChecksumOp_num_ops":0,"blocks_read":10,"blocks_written":13,"copyBlockOp_avg_time":0,"heartBeats_num_ops":197707,"writes_from_remote_client":12,"replaceBlockOp_num_ops":0,"blockReports_avg_time":0,"blocks_replicated":0,"writeBlockOp_num_ops":13,"heartBeats_avg_time":7,"hostName":"clus1-datanode6","bytes_read":60797,"copyBlockOp_num_ops":0,"contextName":"dfs","blockReports_num_ops":167,"bytes_written":389862107,"replaceBlockOp_avg_time":0,"reads_from_remote_client":8,"period":60,"readBlockOp_avg_time":0,"reads_from_local_client":2,"blocks_verified":1,"writes_from_local_client":1,"sessionId":"","blockChecksumOp_avg_time":0,"blocks_removed":14}

Anyone else observed this kind of error? To me, it seems to be something to
do with 'events' LIST creation. Any fix or a pointer to fixing it is welcome
:-)
Thanks,

-preetam

Mime
View raw message