hbase-dev mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From 曾伟展 <zengweiz...@jd.com>
Subject Re: What cause "Compaction is trying to add a bad range"
Date Wed, 27 Apr 2016 03:25:18 GMT
Here is my full log below:

2016-04-26 15:48:57,396 INFO  [regionserver/
regionserver.SplitLogWorker: Sending interrupt to stop the worker thread
2016-04-26 15:48:57,398 FATAL [MemStoreFlusher.1]
regionserver.HRegionServer: RegionServer abort: loaded coprocessors are: []
2016-04-26 15:48:57,399 INFO 
[regionserver/MJQ-HBASE-DIANA-11050.hadoop.jd.local/172.28.110.50:16020]
regionserver.HRegionServer: Stopping infoServer
2016-04-26 15:48:57,401 INFO 
[SplitLogWorker-MJQ-HBASE-DIANA-11050:16020]
regionserver.SplitLogWorker: SplitLogWorker interrupted. Exiting.
2016-04-26 15:48:57,401 INFO 
[SplitLogWorker-MJQ-HBASE-DIANA-11050:16020]
regionserver.SplitLogWorker: SplitLogWorker Myip  exiting
2016-04-26 15:48:57,407 INFO  [MemStoreFlusher.1]
regionserver.HRegionServer: Dump of metrics as JSON on abort: {
  "beans" : [ {
    "name" : "java.lang:type=Memory",
    "modelerType" : "sun.management.MemoryImpl",
    "HeapMemoryUsage" : {
      "committed" : 51413778432,
      "init" : 51539607552,
      "max" : 51413778432,
      "used" : 27834774512
    },
    "NonHeapMemoryUsage" : {
      "committed" : 2154954752,
      "init" : 2150039552,
      "max" : 2197815296,
      "used" : 52442256
    },
    "ObjectPendingFinalizationCount" : 0,
    "Verbose" : true,
    "ObjectName" : "java.lang:type=Memory"
  } ],
  "beans" : [ {
    "queueSize" : 0,
    "numCallsInGeneralQueue" : 0,
    "numCallsInReplicationQueue" : 0,
    "numCallsInPriorityQueue" : 0,
    "numOpenConnections" : 8,
    "numActiveHandler" : 0,
    "TotalCallTime_num_ops" : 335699,
    "TotalCallTime_min" : 0,
    "TotalCallTime_max" : 337,
    "TotalCallTime_mean" : 38.55036803803407,
    "TotalCallTime_median" : 12.0,
    "TotalCallTime_75th_percentile" : 13.0,
    "TotalCallTime_95th_percentile" : 36.649999999999864,
    "TotalCallTime_99th_percentile" : 71.0,
    "exceptions.FailedSanityCheckException" : 0,
    "exceptions.RegionMovedException" : 0,
    "QueueCallTime_num_ops" : 335699,
    "QueueCallTime_min" : 0,
    "QueueCallTime_max" : 205,
    "QueueCallTime_mean" : 0.36370975189083077,
    "QueueCallTime_median" : 0.0,
    "QueueCallTime_75th_percentile" : 0.0,
    "QueueCallTime_95th_percentile" : 1.0,
    "QueueCallTime_99th_percentile" : 1.0,
    "authenticationFailures" : 0,
    "authorizationFailures" : 0,
    "exceptions" : 39,
    "authenticationSuccesses" : 0,
    "authorizationSuccesses" : 33,
    "ProcessCallTime_num_ops" : 335699,
    "ProcessCallTime_min" : 0,
    "ProcessCallTime_max" : 337,
    "ProcessCallTime_mean" : 38.18665828614324,
    "ProcessCallTime_median" : 12.0,
    "ProcessCallTime_75th_percentile" : 13.0,
    "ProcessCallTime_95th_percentile" : 36.0,
    "ProcessCallTime_99th_percentile" : 75.05999999999995,
    "exceptions.NotServingRegionException" : 38,
    "sentBytes" : 110000013746,
    "exceptions.RegionTooBusyException" : 0,
    "receivedBytes" : 507636548377,
    "exceptions.OutOfOrderScannerNextException" : 0,
    "exceptions.UnknownScannerException" : 0
  } ],
  "beans" : [ {
    "name" : "Hadoop:service=HBase,name=RegionServer,sub=Replication",
    "modelerType" : "RegionServer,sub=Replication",
    "tag.Context" : "regionserver",
    "tag.Hostname" : "MJQ-HBASE-DIANA-11050.hadoop.jd.local",
    "sink.appliedOps" : 0,
    "sink.appliedBatches" : 0,
    "sink.ageOfLastAppliedOp" : 0
  } ],
  "beans" : [ {
    "name" : "Hadoop:service=HBase,name=RegionServer,sub=Server",
    "modelerType" : "RegionServer,sub=Server",
    "tag.clusterId" : "b370cdfd-17b1-467e-a73c-52754ac476b0",
    "tag.Context" : "regionserver",
    "regionCount" : 6,
    "storeCount" : 6,
    "hlogFileCount" : 16,
    "hlogFileSize" : 1969726204,
    "storeFileCount" : 733,
    "memStoreSize" : 1261464560,
    "storeFileSize" : 440572021085,
    "regionServerStartTime" : 1461577323055,
    "totalRequestCount" : 470472276,
    "readRequestCount" : 0,
    "writeRequestCount" : 388870304,
    "checkMutateFailedCount" : 0,
    "checkMutatePassedCount" : 0,
    "storeFileIndexSize" : 1503704,
    "staticIndexSize" : 355799929,
    "staticBloomSize" : 512946192,
    "mutationsWithoutWALCount" : 0,
    "mutationsWithoutWALSize" : 0,
    "percentFilesLocal" : 95,
    "percentFilesLocalSecondaryRegions" : 0,
    "splitQueueLength" : 0,
    "compactionQueueLength" : 0,
    "flushQueueLength" : 1,
    "blockCacheFreeSize" : 20553617448,
    "blockCacheCount" : 0,Exiting
    "blockCacheSize" : 10545112,
    "blockCacheHitCount" : 196832,
    "blockCacheMissCount" : 706112,
    "blockCacheEvictionCount" : 91,
    "blockCacheCountHitPercent" : 21.0,
    "blockCacheExpressHitPercent" : 99,
    "updatesBlockedTime" : 0,
    "flushedCellsCount" : 384497176,
    "compactedCellsCount" : 16591416,
    "majorCompactedCellsCount" : 0,
    "flushedCellsSize" : 455244656384,
    "compactedCellsSize" : 17437578216,
    "majorCompactedCellsSize" : 0,
    "blockedRequestCount" : 58001,
    "splitSuccessCount" : 1,
    "splitRequestCount" : 1,
    "Append_num_ops" : 0,
    "Append_min" : 0,
    "Append_max" : 0,
    "Append_mean" : 0.0,
    "Append_median" : 0.0,
    "Append_75th_percentile" : 0.0,
    "Append_95th_percentile" : 0.0,
    "Append_99th_percentile" : 0.0,
    "Delete_num_ops" : 0,
    "Delete_min" : 0,
    "Delete_max" : 0,
    "Delete_mean" : 0.0,
    "Delete_median" : 0.0,
    "Delete_75th_percentile" : 0.0,
    "Delete_95th_percentile" : 0.0,
    "Delete_99th_percentile" : 0.0,
    "Mutate_num_ops" : 335559,
    "Mutate_min" : 0,
    "Mutate_max" : 336,
    "Mutate_mean" : 37.965365256184455,
    "Mutate_median" : 11.0,
    "Mutate_75th_percentile" : 13.0,
    "Mutate_95th_percentile" : 36.649999999999864,
    "Mutate_99th_percentile" : 85.0,
    "ScanNext_num_ops" : 0,
    "ScanNext_min" : 0,
    "ScanNext_max" : 0,
    "ScanNext_mean" : 0.0,
    "ScanNext_median" : 0.0,
    "ScanNext_75th_percentile" : 0.0,
    "ScanNext_95th_percentile" : 0.0,
    "ScanNext_99th_percentile" : 0.0,
    "slowDeleteCount" : 0,
    "slowIncrementCount" : 0,
    "FlushTime_num_ops" : 708,
    "FlushTime_min" : 981,
    "FlushTime_max" : 93640,
    "FlushTime_mean" : 64740.536723163845,
    "FlushTime_median" : 93073.0,
    "FlushTime_75th_percentile" : 93277.0,
    "FlushTime_95th_percentile" : 93391.4,
    "FlushTime_99th_percentile" : 93499.8,
    "Get_num_ops" : 0,
    "Get_min" : 0,
    "Get_max" : 0,
    "Get_mean" : 0.0,
    "Get_median" : 0.0,
    "Get_75th_percentile" : 0.0,
    "Get_95th_percentile" : 0.0,
    "Get_99th_percentile" : 0.0,
    "Replay_num_ops" : 0,
    "Replay_min" : 0,
    "Replay_max" : 0,
    "Replay_mean" : 0.0,
    "Replay_median" : 0.0,
    "Replay_75th_percentile" : 0.0,
    "Replay_95th_percentile" : 0.0,
    "Replay_99th_percentile" : 0.0,
    "slowGetCount" : 0,
    "slowAppendCount" : 0,
    "slowPutCount" : 0,
    "SplitTime_num_ops" : 1,
    "SplitTime_min" : 2114,
    "SplitTime_max" : 2114,
    "SplitTime_mean" : 2114.0,
    "SplitTime_median" : 2114.0,
    "SplitTime_75th_percentile" : 2114.0,
    "SplitTime_95th_percentile" : 2114.0,
    "SplitTime_99th_percentile" : 2114.0,
    "Increment_num_ops" : 0,


在 2016年04月26日 21:19, Ted Yu 写道:
> Can you pastebin region server log leading up to the exception below ?
>
> Thanks
>
>> On Apr 26, 2016, at 5:19 AM, 曾伟展 <zengweizhan@jd.com> wrote:
>>
>> Hi, ALL
>>
>>    I used PE to test HBase-1.1.2, and it caused many region servers to die!
>>
>> here is my log:
>>
>> FATAL [MemStoreFlusher.1] regionserver.HRegionServer: ABORTING region server  MyIP
>>  Replay of WAL required. Forcing server shutdown
>> org.apache.hadoop.hbase.DroppedSnapshotException: region: TestTable,00000000000000001394759025,1461590081858.a215be2647106a2a0fe8c4fad1430107.
>>    at org.apache.hadoop.hbase.regionserver.HRegion.internalFlushCacheAndCommit(HRegion.java:2354)
>>    at org.apache.hadoop.hbase.regionserver.HRegion.internalFlushcache(HRegion.java:2057)
>>    at org.apache.hadoop.hbase.regionserver.HRegion.internalFlushcache(HRegion.java:2019)
>>    at org.apache.hadoop.hbase.regionserver.HRegion.flushcache(HRegion.java:1911)
>>    at org.apache.hadoop.hbase.regionserver.HRegion.flush(HRegion.java:1837)
>>    at org.apache.hadoop.hbase.regionserver.MemStoreFlusher.flushRegion(MemStoreFlusher.java:510)
>>    at org.apache.hadoop.hbase.regionserver.MemStoreFlusher.flushRegion(MemStoreFlusher.java:471)
>>    at org.apache.hadoop.hbase.regionserver.MemStoreFlusher.access$800(MemStoreFlusher.java:75)
>>    at org.apache.hadoop.hbase.regionserver.MemStoreFlusher$FlushHandler.run(MemStoreFlusher.java:259)
>>    at java.lang.Thread.run(Thread.java:745)
>> Caused by: java.io.IOException: Compaction is trying to add a bad range.
>>    at org.apache.hadoop.hbase.regionserver.StripeStoreFileManager$CompactionOrFlushMergeCopy.processNewCandidateStripes(StripeStoreFileManager.java:837)
>>    at org.apache.hadoop.hbase.regionserver.StripeStoreFileManager$CompactionOrFlushMergeCopy.mergeResults(StripeStoreFileManager.java:672)
>>    at org.apache.hadoop.hbase.regionserver.StripeStoreFileManager.insertNewFiles(StripeStoreFileManager.java:144)
>>    at org.apache.hadoop.hbase.regionserver.HStore.updateStorefiles(HStore.java:1052)
>>    at org.apache.hadoop.hbase.regionserver.HStore.access$500(HStore.java:128)
>>    at org.apache.hadoop.hbase.regionserver.HStore$StoreFlusherImpl.commit(HStore.java:2231)
>>    at org.apache.hadoop.hbase.regionserver.HRegion.internalFlushCacheAndCommit(HRegion.java:2315)
>>    ... 9 more
>>
>>
>> and I don't know why binarySearch returns < 1
>>
>>
>> private final int findStripeIndexByEndRow(byte[] endRow) {
>>  assert !isInvalid(endRow);
>>  if (isOpen(endRow)) return state.stripeEndRows.length;
>>  return Arrays.binarySearch(state.stripeEndRows, endRow, Bytes.BYTES_COMPARATOR);
>> }
>>
>>

Mime
View raw message