ignite-user mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From Ray <ray...@cisco.com>
Subject Ignite data can't be recovered after node fail
Date Tue, 24 Jul 2018 08:10:00 GMT
Following the node failure described in this thread
http://apache-ignite-users.70518.x6.nabble.com/Ignite-node-failed-for-no-obvious-reason-td22866.html,
I tried to reboot this node and recover the data to make the Ignite cluster
available again.

First, I tried to reboot node2 directly, but the restart failed.
The node log is as follows.

[2018-07-24T02:57:38,956][INFO ][main][IgniteKernal] 

>>>    __________  ________________  
>>>   /  _/ ___/ |/ /  _/_  __/ __/  
>>>  _/ // (7 7    // /  / / / _/    
>>> /___/\___/_/|_/___/ /_/ /___/   
>>> 
>>> ver. 2.6.0#20180710-sha1:669feacc
>>> 2018 Copyright(C) Apache Software Foundation
>>> 
>>> Ignite documentation: http://ignite.apache.org

[2018-07-24T02:57:38,976][INFO ][main][IgniteKernal] Config URL:
file:/opt/apache-ignite-fabric-2.6.0-bin/config/persistent-config.xml
[2018-07-24T02:57:38,984][INFO ][main][IgniteKernal] IgniteConfiguration
[igniteInstanceName=null, pubPoolSize=56, svcPoolSize=56,
callbackPoolSize=56, stripedPoolSize=56, sysPoolSize=56, mgmtPoolSize=4,
igfsPoolSize=56, dataStreamerPoolSize=56, utilityCachePoolSize=56,
utilityCacheKeepAliveTime=60000, p2pPoolSize=2, qryPoolSize=56,
igniteHome=/opt/apache-ignite-fabric-2.6.0-bin,
igniteWorkDir=/opt/apache-ignite-fabric-2.6.0-bin/work,
mbeanSrv=com.sun.jmx.mbeanserver.JmxMBeanServer@6f94fa3e,
nodeId=7e3c0623-a6a5-4a7b-966e-6882b86ff922,
marsh=org.apache.ignite.internal.binary.BinaryMarshaller@1890516e,
marshLocJobs=false, daemon=false, p2pEnabled=true, netTimeout=5000,
sndRetryDelay=1000, sndRetryCnt=3, metricsHistSize=10000,
metricsUpdateFreq=2000, metricsExpTime=9223372036854775807,
discoSpi=TcpDiscoverySpi [addrRslvr=null, sockTimeout=0, ackTimeout=0,
marsh=null, reconCnt=10, reconDelay=2000, maxAckTimeout=600000,
forceSrvMode=false, clientReconnectDisabled=false, internalLsnr=null],
segPlc=RESTART_JVM, segResolveAttempts=2, waitForSegOnStart=true,
allResolversPassReq=true, segChkFreq=10000, commSpi=TcpCommunicationSpi
[connectGate=null, connPlc=null, enableForcibleNodeKill=false,
enableTroubleshootingLog=false,
srvLsnr=org.apache.ignite.spi.communication.tcp.TcpCommunicationSpi$2@42e25b0b,
locAddr=null, locHost=null, locPort=47100, locPortRange=100, shmemPort=-1,
directBuf=true, directSndBuf=false, idleConnTimeout=600000,
connTimeout=5000, maxConnTimeout=600000, reconCnt=10, sockSndBuf=32768,
sockRcvBuf=32768, msgQueueLimit=0, slowClientQueueLimit=0, nioSrvr=null,
shmemSrv=null, usePairedConnections=false, connectionsPerNode=1,
tcpNoDelay=true, filterReachableAddresses=false, ackSndThreshold=32,
unackedMsgsBufSize=0, sockWriteTimeout=2000, lsnr=null, boundTcpPort=-1,
boundTcpShmemPort=-1, selectorsCnt=28, selectorSpins=0, addrRslvr=null,
ctxInitLatch=java.util.concurrent.CountDownLatch@39b43d60[Count = 1],
stopping=false,
metricsLsnr=org.apache.ignite.spi.communication.tcp.TcpCommunicationMetricsListener@44be0077],
evtSpi=org.apache.ignite.spi.eventstorage.NoopEventStorageSpi@2205a05d,
colSpi=NoopCollisionSpi [], deploySpi=LocalDeploymentSpi [lsnr=null],
indexingSpi=org.apache.ignite.spi.indexing.noop.NoopIndexingSpi@5f20155b,
addrRslvr=null, clientMode=false, rebalanceThreadPoolSize=1,
txCfg=org.apache.ignite.configuration.TransactionConfiguration@72ade7e3,
cacheSanityCheckEnabled=true, discoStartupDelay=60000, deployMode=SHARED,
p2pMissedCacheSize=100, locHost=null, timeSrvPortBase=31100,
timeSrvPortRange=100, failureDetectionTimeout=60000,
clientFailureDetectionTimeout=30000, metricsLogFreq=60000, hadoopCfg=null,
connectorCfg=org.apache.ignite.configuration.ConnectorConfiguration@239105a8,
odbcCfg=null, warmupClos=null, atomicCfg=AtomicConfiguration
[seqReserveSize=1000, cacheMode=PARTITIONED, backups=1, aff=null,
grpName=null], classLdr=null, sslCtxFactory=null, platformCfg=null,
binaryCfg=null, memCfg=null, pstCfg=null, dsCfg=DataStorageConfiguration
[sysRegionInitSize=41943040, sysCacheMaxSize=104857600, pageSize=0,
concLvl=0, dfltDataRegConf=DataRegionConfiguration [name=default_Region,
maxSize=493921239040, initSize=107374182400, swapPath=null,
pageEvictionMode=DISABLED, evictionThreshold=0.9, emptyPagesPoolSize=100,
metricsEnabled=false, metricsSubIntervalCount=5,
metricsRateTimeInterval=60000, persistenceEnabled=true,
checkpointPageBufSize=8589934592], storagePath=/data/ignite/persistence,
checkpointFreq=600000, lockWaitTime=10000, checkpointThreads=4,
checkpointWriteOrder=SEQUENTIAL, walHistSize=20, walSegments=10,
walSegmentSize=67108864, walPath=/wal, walArchivePath=/wal/archive,
metricsEnabled=false, walMode=BACKGROUND, walTlbSize=131072, walBuffSize=0,
walFlushFreq=5000, walFsyncDelay=1000, walRecordIterBuffSize=67108864,
alwaysWriteFullPages=false,
fileIOFactory=org.apache.ignite.internal.processors.cache.persistence.file.AsyncFileIOFactory@609bcfb6,
metricsSubIntervalCnt=5, metricsRateTimeInterval=60000,
walAutoArchiveAfterInactivity=-1, writeThrottlingEnabled=false,
walCompactionEnabled=false], activeOnStart=true, autoActivation=true,
longQryWarnTimeout=3000, sqlConnCfg=null,
cliConnCfg=ClientConnectorConfiguration [host=null, port=10800,
portRange=100, sockSndBufSize=0, sockRcvBufSize=0, tcpNoDelay=true,
maxOpenCursorsPerConn=128, threadPoolSize=56, idleTimeout=0,
jdbcEnabled=true, odbcEnabled=true, thinCliEnabled=true, sslEnabled=false,
useIgniteSslCtxFactory=true, sslClientAuth=false, sslCtxFactory=null],
authEnabled=false, failureHnd=null, commFailureRslvr=null]
[2018-07-24T02:57:38,984][INFO ][main][IgniteKernal] Daemon mode: off
[2018-07-24T02:57:38,985][INFO ][main][IgniteKernal] OS: Linux
2.6.32-696.16.1.el6.x86_64 amd64
[2018-07-24T02:57:38,985][INFO ][main][IgniteKernal] OS user: root
[2018-07-24T02:57:38,985][INFO ][main][IgniteKernal] PID: 49525
[2018-07-24T02:57:38,985][INFO ][main][IgniteKernal] Language runtime: Java
Platform API Specification ver. 1.8
[2018-07-24T02:57:38,985][INFO ][main][IgniteKernal] VM information: OpenJDK
Runtime Environment 1.8.0_161-b14 Oracle Corporation OpenJDK 64-Bit Server
VM 25.161-b14
[2018-07-24T02:57:38,986][INFO ][main][IgniteKernal] VM total memory: 31.0GB
[2018-07-24T02:57:38,986][INFO ][main][IgniteKernal] Remote Management
[restart: on, REST: on, JMX (remote: on, port: 49112, auth: off, ssl: off)]
[2018-07-24T02:57:38,987][INFO ][main][IgniteKernal] Logger: Log4J2Logger
[quiet=true, config=config/ignite-log4j2.xml]
[2018-07-24T02:57:38,987][INFO ][main][IgniteKernal]
IGNITE_HOME=/opt/apache-ignite-fabric-2.6.0-bin
[2018-07-24T02:57:38,987][INFO ][main][IgniteKernal] VM arguments: [-Xms1g,
-Xmx1g, -XX:+AggressiveOpts, -XX:MaxMetaspaceSize=256m, -DIGNITE_QUIET=true,
-DIGNITE_SUCCESS_FILE=/opt/apache-ignite-fabric-2.6.0-bin/work/ignite_success_2fd90195-cb52-4762-9732-3f1366e2e9cb,
-Dcom.sun.management.jmxremote, -Dcom.sun.management.jmxremote.port=49112,
-Dcom.sun.management.jmxremote.authenticate=false,
-Dcom.sun.management.jmxremote.ssl=false,
-DIGNITE_HOME=/opt/apache-ignite-fabric-2.6.0-bin,
-DIGNITE_PROG_NAME=./ignite.sh, -Xmx32000m, -Xms32000m, -XX:+UseG1GC,
-XX:+ScavengeBeforeFullGC, -XX:+DisableExplicitGC, -XX:+AlwaysPreTouch,
-XX:+PrintGCDetails, -XX:+PrintGCTimeStamps, -XX:+PrintGCDateStamps,
-XX:+PrintAdaptiveSizePolicy,
-Xloggc:/spare/ignite/log/ignitegc-2018_07_24-02_57.log]
[2018-07-24T02:57:38,987][INFO ][main][IgniteKernal] System cache's
DataRegion size is configured to 40 MB. Use
DataStorageConfiguration.systemCacheMemorySize property to change the
setting.
[2018-07-24T02:57:38,993][INFO ][main][IgniteKernal] Configured caches [in
'sysMemPlc' dataRegion: ['ignite-sys-cache']]
[2018-07-24T02:57:38,993][WARN ][main][IgniteKernal] Peer class loading is
enabled (disable it in production for performance and deployment consistency
reasons)
[2018-07-24T02:57:38,996][INFO ][main][IgniteKernal] 3-rd party licenses can
be found at: /opt/apache-ignite-fabric-2.6.0-bin/libs/licenses
[2018-07-24T02:57:39,070][INFO ][main][IgnitePluginProcessor] Configured
plugins:
[2018-07-24T02:57:39,071][INFO ][main][IgnitePluginProcessor]   ^-- None
[2018-07-24T02:57:39,071][INFO ][main][IgnitePluginProcessor] 
[2018-07-24T02:57:39,072][INFO ][main][FailureProcessor] Configured failure
handler: [hnd=StopNodeOrHaltFailureHandler [tryStop=false, timeout=0]]
[2018-07-24T02:57:39,109][INFO ][main][TcpCommunicationSpi] Successfully
bound communication NIO server to TCP port [port=47100,
locHost=0.0.0.0/0.0.0.0, selectorsCnt=28, selectorSpins=0, pairedConn=false]
[2018-07-24T02:57:39,110][WARN ][main][TcpCommunicationSpi] Message queue
limit is set to 0 which may lead to potential OOMEs when running cache
operations in FULL_ASYNC or PRIMARY_SYNC modes due to message queues growth
on sender and receiver sides.
[2018-07-24T02:57:39,126][WARN ][main][NoopCheckpointSpi] Checkpoints are
disabled (to enable configure any GridCheckpointSpi implementation)
[2018-07-24T02:57:39,145][WARN ][main][GridCollisionManager] Collision
resolution is disabled (all jobs will be activated upon arrival).
[2018-07-24T02:57:39,146][INFO ][main][IgniteKernal] Security status
[authentication=off, tls/ssl=off]
[2018-07-24T02:57:39,171][INFO ][main][TcpDiscoverySpi] Successfully bound
to TCP port [port=49500, localHost=0.0.0.0/0.0.0.0,
locNodeId=7e3c0623-a6a5-4a7b-966e-6882b86ff922]
[2018-07-24T02:57:39,178][INFO ][main][PdsFoldersResolver] Successfully
locked persistence storage folder
[/data/ignite/persistence/node00-33503bf4-323b-4965-8bb1-31597d3bedf4]
[2018-07-24T02:57:39,178][INFO ][main][PdsFoldersResolver] Consistent ID
used for local node is [33503bf4-323b-4965-8bb1-31597d3bedf4] according to
persistence data storage folders
[2018-07-24T02:57:39,178][INFO ][main][CacheObjectBinaryProcessorImpl]
Resolved directory for serialized binary metadata:
/opt/apache-ignite-fabric-2.6.0-bin/work/binary_meta/node00-33503bf4-323b-4965-8bb1-31597d3bedf4
[2018-07-24T02:57:39,361][INFO ][main][FilePageStoreManager] Resolved page
store work directory:
/data/ignite/persistence/node00-33503bf4-323b-4965-8bb1-31597d3bedf4
[2018-07-24T02:57:39,361][INFO ][main][FileWriteAheadLogManager] Resolved
write ahead log work directory:
/wal/node00-33503bf4-323b-4965-8bb1-31597d3bedf4
[2018-07-24T02:57:39,361][INFO ][main][FileWriteAheadLogManager] Resolved
write ahead log archive directory:
/wal/archive/node00-33503bf4-323b-4965-8bb1-31597d3bedf4
[2018-07-24T02:57:39,422][INFO ][main][FileWriteAheadLogManager] Started
write-ahead log manager [mode=BACKGROUND]
[2018-07-24T02:57:39,454][INFO ][main][GridCacheDatabaseSharedManager] Read
checkpoint status
[startMarker=/data/ignite/persistence/node00-33503bf4-323b-4965-8bb1-31597d3bedf4/cp/1532374156904-fb69dc16-1947-411c-bc5e-ee6540ba8e53-START.bin,
endMarker=/data/ignite/persistence/node00-33503bf4-323b-4965-8bb1-31597d3bedf4/cp/1532374156904-fb69dc16-1947-411c-bc5e-ee6540ba8e53-END.bin]
[2018-07-24T02:57:39,465][INFO ][main][PageMemoryImpl] Started page memory
[memoryAllocated=100.0 MiB, pages=24808, tableSize=1.9 MiB,
checkpointBuffer=100.0 MiB]
[2018-07-24T02:57:39,465][INFO ][main][GridCacheDatabaseSharedManager]
Checking memory state [lastValidPos=FileWALPointer [idx=21672,
fileOff=13729571, len=27723], lastMarked=FileWALPointer [idx=21672,
fileOff=13729571, len=27723],
lastCheckpointId=fb69dc16-1947-411c-bc5e-ee6540ba8e53]
[2018-07-24T02:57:39,544][WARN ][main][FileWriteAheadLogManager] WAL segment
tail is reached. [ Expected next state: {Index=21672,Offset=13757294},
Actual state : {Index=3690196541643296009,Offset=603992117} ]
[2018-07-24T02:57:39,544][INFO ][main][GridCacheDatabaseSharedManager] Found
last checkpoint marker [cpId=fb69dc16-1947-411c-bc5e-ee6540ba8e53,
pos=FileWALPointer [idx=21672, fileOff=13729571, len=27723]]
[2018-07-24T02:57:39,569][INFO ][main][GridCacheDatabaseSharedManager]
Applying lost cache updates since last checkpoint record
[lastMarked=FileWALPointer [idx=21672, fileOff=13729571, len=27723],
lastCheckpointId=fb69dc16-1947-411c-bc5e-ee6540ba8e53]
[2018-07-24T02:57:39,628][WARN ][main][FileWriteAheadLogManager] WAL segment
tail is reached. [ Expected next state: {Index=21672,Offset=13757294},
Actual state : {Index=3690196541643296009,Offset=603992117} ]
[2018-07-24T02:57:39,629][INFO ][main][GridCacheDatabaseSharedManager]
Finished applying WAL changes [updatesApplied=0, time=61ms]
[2018-07-24T02:57:39,666][INFO ][main][GridClusterStateProcessor] Restoring
history for BaselineTopology[id=0]
[2018-07-24T02:57:39,752][INFO ][main][ClientListenerProcessor] Client
connector processor has started on TCP port 10800
[2018-07-24T02:57:39,790][INFO ][main][GridTcpRestProtocol] Command protocol
successfully started [name=TCP binary, host=0.0.0.0/0.0.0.0, port=11211]
[2018-07-24T02:57:39,939][INFO ][main][GridJettyRestProtocol] Command
protocol successfully started [name=Jetty REST, host=/0.0.0.0, port=8080]
[2018-07-24T02:57:39,977][INFO ][main][IgniteKernal] Non-loopback local IPs:
10.252.10.4
[2018-07-24T02:57:39,977][INFO ][main][IgniteKernal] Enabled local MACs:
7079B364407D
[2018-07-24T02:57:40,098][INFO ][tcp-disco-srvr-#2][TcpDiscoverySpi] TCP
discovery accepted incoming connection [rmtAddr=/10.252.10.20,
rmtPort=51032]
[2018-07-24T02:57:40,107][INFO ][tcp-disco-srvr-#2][TcpDiscoverySpi] TCP
discovery spawning a new thread for connection [rmtAddr=/10.252.10.20,
rmtPort=51032]
[2018-07-24T02:57:40,107][INFO ][tcp-disco-sock-reader-#4][TcpDiscoverySpi]
Started serving remote node connection [rmtAddr=/10.252.10.20:51032,
rmtPort=51032]
[2018-07-24T02:57:40,329][ERROR][tcp-disco-msg-worker-#3][TcpDiscoverySpi]
TcpDiscoverSpi's message worker thread failed abnormally. Stopping the node
in order to prevent cluster wide instability.
org.apache.ignite.IgniteException: Node with BaselineTopology cannot join
mixed cluster running in compatibility mode
	at
org.apache.ignite.internal.processors.cluster.GridClusterStateProcessor.onGridDataReceived(GridClusterStateProcessor.java:714)
~[ignite-core-2.6.0.jar:2.6.0]
	at
org.apache.ignite.internal.managers.discovery.GridDiscoveryManager$5.onExchange(GridDiscoveryManager.java:883)
~[ignite-core-2.6.0.jar:2.6.0]
	at
org.apache.ignite.spi.discovery.tcp.TcpDiscoverySpi.onExchange(TcpDiscoverySpi.java:1939)
~[ignite-core-2.6.0.jar:2.6.0]
	at
org.apache.ignite.spi.discovery.tcp.ServerImpl$RingMessageWorker.processNodeAddedMessage(ServerImpl.java:4354)
~[ignite-core-2.6.0.jar:2.6.0]
	at
org.apache.ignite.spi.discovery.tcp.ServerImpl$RingMessageWorker.processMessage(ServerImpl.java:2744)
~[ignite-core-2.6.0.jar:2.6.0]
	at
org.apache.ignite.spi.discovery.tcp.ServerImpl$RingMessageWorker.processMessage(ServerImpl.java:2536)
~[ignite-core-2.6.0.jar:2.6.0]
	at
org.apache.ignite.spi.discovery.tcp.ServerImpl$MessageWorkerAdapter.body(ServerImpl.java:6775)
~[ignite-core-2.6.0.jar:2.6.0]
	at
org.apache.ignite.spi.discovery.tcp.ServerImpl$RingMessageWorker.body(ServerImpl.java:2621)
[ignite-core-2.6.0.jar:2.6.0]
	at org.apache.ignite.spi.IgniteSpiThread.run(IgniteSpiThread.java:62)
[ignite-core-2.6.0.jar:2.6.0]
[2018-07-24T02:57:40,333][ERROR][tcp-disco-msg-worker-#3][] Critical system
error detected. Will be handled accordingly to configured handler [hnd=class
o.a.i.failure.StopNodeOrHaltFailureHandler, failureCtx=FailureContext
[type=SYSTEM_WORKER_TERMINATION, err=class o.a.i.IgniteException: Node with
BaselineTopology cannot join mixed cluster running in compatibility mode]]
org.apache.ignite.IgniteException: Node with BaselineTopology cannot join
mixed cluster running in compatibility mode
	at
org.apache.ignite.internal.processors.cluster.GridClusterStateProcessor.onGridDataReceived(GridClusterStateProcessor.java:714)
~[ignite-core-2.6.0.jar:2.6.0]
	at
org.apache.ignite.internal.managers.discovery.GridDiscoveryManager$5.onExchange(GridDiscoveryManager.java:883)
~[ignite-core-2.6.0.jar:2.6.0]
	at
org.apache.ignite.spi.discovery.tcp.TcpDiscoverySpi.onExchange(TcpDiscoverySpi.java:1939)
~[ignite-core-2.6.0.jar:2.6.0]
	at
org.apache.ignite.spi.discovery.tcp.ServerImpl$RingMessageWorker.processNodeAddedMessage(ServerImpl.java:4354)
~[ignite-core-2.6.0.jar:2.6.0]
	at
org.apache.ignite.spi.discovery.tcp.ServerImpl$RingMessageWorker.processMessage(ServerImpl.java:2744)
~[ignite-core-2.6.0.jar:2.6.0]
	at
org.apache.ignite.spi.discovery.tcp.ServerImpl$RingMessageWorker.processMessage(ServerImpl.java:2536)
~[ignite-core-2.6.0.jar:2.6.0]
	at
org.apache.ignite.spi.discovery.tcp.ServerImpl$MessageWorkerAdapter.body(ServerImpl.java:6775)
~[ignite-core-2.6.0.jar:2.6.0]
	at
org.apache.ignite.spi.discovery.tcp.ServerImpl$RingMessageWorker.body(ServerImpl.java:2621)
[ignite-core-2.6.0.jar:2.6.0]
	at org.apache.ignite.spi.IgniteSpiThread.run(IgniteSpiThread.java:62)
[ignite-core-2.6.0.jar:2.6.0]
[2018-07-24T02:57:40,334][ERROR][main][IgniteKernal] Failed to start
manager: GridManagerAdapter [enabled=true,
name=o.a.i.i.managers.discovery.GridDiscoveryManager]
org.apache.ignite.IgniteCheckedException: Failed to start SPI:
TcpDiscoverySpi [addrRslvr=null, sockTimeout=5000, ackTimeout=5000,
marsh=JdkMarshaller
[clsFilter=org.apache.ignite.marshaller.MarshallerUtils$1@64a9d48c],
reconCnt=10, reconDelay=2000, maxAckTimeout=600000, forceSrvMode=false,
clientReconnectDisabled=false, internalLsnr=null]
	at
org.apache.ignite.internal.managers.GridManagerAdapter.startSpi(GridManagerAdapter.java:300)
~[ignite-core-2.6.0.jar:2.6.0]
	at
org.apache.ignite.internal.managers.discovery.GridDiscoveryManager.start(GridDiscoveryManager.java:915)
~[ignite-core-2.6.0.jar:2.6.0]
	at
org.apache.ignite.internal.IgniteKernal.startManager(IgniteKernal.java:1721)
[ignite-core-2.6.0.jar:2.6.0]
	at org.apache.ignite.internal.IgniteKernal.start(IgniteKernal.java:1028)
[ignite-core-2.6.0.jar:2.6.0]
	at
org.apache.ignite.internal.IgnitionEx$IgniteNamedInstance.start0(IgnitionEx.java:2014)
[ignite-core-2.6.0.jar:2.6.0]
	at
org.apache.ignite.internal.IgnitionEx$IgniteNamedInstance.start(IgnitionEx.java:1723)
[ignite-core-2.6.0.jar:2.6.0]
	at org.apache.ignite.internal.IgnitionEx.start0(IgnitionEx.java:1151)
[ignite-core-2.6.0.jar:2.6.0]
	at
org.apache.ignite.internal.IgnitionEx.startConfigurations(IgnitionEx.java:1069)
[ignite-core-2.6.0.jar:2.6.0]
	at org.apache.ignite.internal.IgnitionEx.start(IgnitionEx.java:955)
[ignite-core-2.6.0.jar:2.6.0]
	at org.apache.ignite.internal.IgnitionEx.start(IgnitionEx.java:854)
[ignite-core-2.6.0.jar:2.6.0]
	at org.apache.ignite.internal.IgnitionEx.start(IgnitionEx.java:724)
[ignite-core-2.6.0.jar:2.6.0]
	at org.apache.ignite.internal.IgnitionEx.start(IgnitionEx.java:693)
[ignite-core-2.6.0.jar:2.6.0]
	at org.apache.ignite.Ignition.start(Ignition.java:352)
[ignite-core-2.6.0.jar:2.6.0]
	at
org.apache.ignite.startup.cmdline.CommandLineStartup.main(CommandLineStartup.java:301)
[ignite-core-2.6.0.jar:2.6.0]
Caused by: org.apache.ignite.spi.IgniteSpiException: Thread has been
interrupted.
	at
org.apache.ignite.spi.discovery.tcp.ServerImpl.joinTopology(ServerImpl.java:938)
~[ignite-core-2.6.0.jar:2.6.0]
	at
org.apache.ignite.spi.discovery.tcp.ServerImpl.spiStart(ServerImpl.java:373)
~[ignite-core-2.6.0.jar:2.6.0]
	at
org.apache.ignite.spi.discovery.tcp.TcpDiscoverySpi.spiStart(TcpDiscoverySpi.java:1948)
~[ignite-core-2.6.0.jar:2.6.0]
	at
org.apache.ignite.internal.managers.GridManagerAdapter.startSpi(GridManagerAdapter.java:297)
~[ignite-core-2.6.0.jar:2.6.0]
	... 13 more
[2018-07-24T02:57:40,336][ERROR][tcp-disco-msg-worker-#3][] JVM will be
halted immediately due to the failure: [failureCtx=FailureContext
[type=SYSTEM_WORKER_TERMINATION, err=class o.a.i.IgniteException: Node with
BaselineTopology cannot join mixed cluster running in compatibility mode]]
[2018-07-24T02:57:40,335][ERROR][main][IgniteKernal] Got exception while
starting (will rollback startup routine).
org.apache.ignite.IgniteCheckedException: Failed to start manager:
GridManagerAdapter [enabled=true,
name=org.apache.ignite.internal.managers.discovery.GridDiscoveryManager]
	at
org.apache.ignite.internal.IgniteKernal.startManager(IgniteKernal.java:1726)
~[ignite-core-2.6.0.jar:2.6.0]
	at org.apache.ignite.internal.IgniteKernal.start(IgniteKernal.java:1028)
[ignite-core-2.6.0.jar:2.6.0]
	at
org.apache.ignite.internal.IgnitionEx$IgniteNamedInstance.start0(IgnitionEx.java:2014)
[ignite-core-2.6.0.jar:2.6.0]
	at
org.apache.ignite.internal.IgnitionEx$IgniteNamedInstance.start(IgnitionEx.java:1723)
[ignite-core-2.6.0.jar:2.6.0]
	at org.apache.ignite.internal.IgnitionEx.start0(IgnitionEx.java:1151)
[ignite-core-2.6.0.jar:2.6.0]
	at
org.apache.ignite.internal.IgnitionEx.startConfigurations(IgnitionEx.java:1069)
[ignite-core-2.6.0.jar:2.6.0]
	at org.apache.ignite.internal.IgnitionEx.start(IgnitionEx.java:955)
[ignite-core-2.6.0.jar:2.6.0]
	at org.apache.ignite.internal.IgnitionEx.start(IgnitionEx.java:854)
[ignite-core-2.6.0.jar:2.6.0]
	at org.apache.ignite.internal.IgnitionEx.start(IgnitionEx.java:724)
[ignite-core-2.6.0.jar:2.6.0]
	at org.apache.ignite.internal.IgnitionEx.start(IgnitionEx.java:693)
[ignite-core-2.6.0.jar:2.6.0]
	at org.apache.ignite.Ignition.start(Ignition.java:352)
[ignite-core-2.6.0.jar:2.6.0]
	at
org.apache.ignite.startup.cmdline.CommandLineStartup.main(CommandLineStartup.java:301)
[ignite-core-2.6.0.jar:2.6.0]
Caused by: org.apache.ignite.IgniteCheckedException: Failed to start SPI:
TcpDiscoverySpi [addrRslvr=null, sockTimeout=5000, ackTimeout=5000,
marsh=JdkMarshaller
[clsFilter=org.apache.ignite.marshaller.MarshallerUtils$1@64a9d48c],
reconCnt=10, reconDelay=2000, maxAckTimeout=600000, forceSrvMode=false,
clientReconnectDisabled=false, internalLsnr=null]
	at
org.apache.ignite.internal.managers.GridManagerAdapter.startSpi(GridManagerAdapter.java:300)
~[ignite-core-2.6.0.jar:2.6.0]
	at
org.apache.ignite.internal.managers.discovery.GridDiscoveryManager.start(GridDiscoveryManager.java:915)
~[ignite-core-2.6.0.jar:2.6.0]
	at
org.apache.ignite.internal.IgniteKernal.startManager(IgniteKernal.java:1721)
~[ignite-core-2.6.0.jar:2.6.0]
	... 11 more
Caused by: org.apache.ignite.spi.IgniteSpiException: Thread has been
interrupted.
	at
org.apache.ignite.spi.discovery.tcp.ServerImpl.joinTopology(ServerImpl.java:938)
~[ignite-core-2.6.0.jar:2.6.0]
	at
org.apache.ignite.spi.discovery.tcp.ServerImpl.spiStart(ServerImpl.java:373)
~[ignite-core-2.6.0.jar:2.6.0]
	at
org.apache.ignite.spi.discovery.tcp.TcpDiscoverySpi.spiStart(TcpDiscoverySpi.java:1948)
~[ignite-core-2.6.0.jar:2.6.0]
	at
org.apache.ignite.internal.managers.GridManagerAdapter.startSpi(GridManagerAdapter.java:297)
~[ignite-core-2.6.0.jar:2.6.0]
	at
org.apache.ignite.internal.managers.discovery.GridDiscoveryManager.start(GridDiscoveryManager.java:915)
~[ignite-core-2.6.0.jar:2.6.0]
	at
org.apache.ignite.internal.IgniteKernal.startManager(IgniteKernal.java:1721)
~[ignite-core-2.6.0.jar:2.6.0]
	... 11 more


The next step I tried was to stop node1 and node3, then start node2, node1 and node3.
The cluster came up, but after a while the message "Partition
states validation has failed" appeared in the log and the cluster started to rebalance.

I think the data and partition information on node2 was outdated, because
new data was ingested into node1 and node3 while node2 was down.

But after a while this message appeared in the log: "[WARN
][exchange-worker-#162][GridDhtPartitionsExchangeFuture] Unable to await
partitions release latch within timeout: ServerLatch [permits=2,
pendingAcks=[0f687998-bb44-4d6c-8cac-c3dd05b28b9b,
f1eaed2f-d2b3-429f-8e97-ac150e106d3e], super=CompletableLatch [id=exchange,
topVer=AffinityTopologyVersion [topVer=4, minorTopVer=0]]]" and the cluster
froze.
Here's the detailed log.

second.log
<http://apache-ignite-users.70518.x6.nabble.com/file/t1346/second.log>  

I had to stop the cluster and reboot it again.
The cluster came up, and I ingested some data into Ignite using the Spark DataFrame
API and triggered a checkpoint.
The error message was the same as on my second try, and the cluster
froze again.
Here's the detailed log.
third.log
<http://apache-ignite-users.70518.x6.nabble.com/file/t1346/third.log>  

Then I tried to reboot again.
Now the cluster is in an unrecoverable state.
Here's the detailed log.
fourth.log
<http://apache-ignite-users.70518.x6.nabble.com/file/t1346/fourth.log>  

And these are the files in the checkpoint folder.
ll -t
/data/ignite/persistence/node00-33503bf4-323b-4965-8bb1-31597d3bedf4/cp
total 232
-rw------- 1 root root 16 Jul 24 05:15
1532409300819-4c35d510-f4ef-4a9d-8311-91afbe08cdb2-START.bin
-rw------- 1 root root 16 Jul 24 05:14 1532409286225-node-started.bin
-rw------- 1 root root 16 Jul 24 05:13 1532409199626-node-started.bin
-rw------- 1 root root 16 Jul 24 05:13
1532409179923-e8332756-9efc-4baa-b854-722d858fe879-END.bin
-rw------- 1 root root 16 Jul 24 05:12
1532409179923-e8332756-9efc-4baa-b854-722d858fe879-START.bin
-rw------- 1 root root 16 Jul 24 05:12 1532409166705-node-started.bin
-rw------- 1 root root 16 Jul 24 04:30 1532406650280-node-started.bin
-rw------- 1 root root 16 Jul 24 04:30
1532406379589-0f14ace1-476c-4751-9237-c32b44409348-END.bin
-rw------- 1 root root 16 Jul 24 04:26
1532406379589-0f14ace1-476c-4751-9237-c32b44409348-START.bin
-rw------- 1 root root 16 Jul 24 04:20
1532405902580-4f0d9446-df20-4327-89f5-66db3c3d231d-END.bin
-rw------- 1 root root 16 Jul 24 04:18
1532405902580-4f0d9446-df20-4327-89f5-66db3c3d231d-START.bin
-rw------- 1 root root 16 Jul 24 04:11
1532405301597-7ed8ec28-4761-431b-a702-3401db38e43e-END.bin
-rw------- 1 root root 16 Jul 24 04:08
1532405301597-7ed8ec28-4761-431b-a702-3401db38e43e-START.bin
-rw------- 1 root root 16 Jul 24 04:05
1532404700308-38bfb4f8-ffe2-4fca-a4ac-488146fb8bc6-END.bin
-rw------- 1 root root 16 Jul 24 03:58
1532404700308-38bfb4f8-ffe2-4fca-a4ac-488146fb8bc6-START.bin
-rw------- 1 root root 16 Jul 24 03:48
1532404100330-20d1e18a-6c2b-468b-8058-913140a73778-END.bin
-rw------- 1 root root 16 Jul 24 03:48
1532404100330-20d1e18a-6c2b-468b-8058-913140a73778-START.bin
-rw------- 1 root root 16 Jul 24 03:38
1532403500142-500c576d-f6b6-4677-aa20-b2ea27275241-END.bin
-rw------- 1 root root 16 Jul 24 03:38
1532403500142-500c576d-f6b6-4677-aa20-b2ea27275241-START.bin
-rw------- 1 root root 16 Jul 24 03:28
1532402900158-21ff15b0-f404-402a-8ae3-7daf68b0bd7c-END.bin
-rw------- 1 root root 16 Jul 24 03:28
1532402900158-21ff15b0-f404-402a-8ae3-7daf68b0bd7c-START.bin
-rw------- 1 root root 16 Jul 24 03:18
1532402300056-a6109747-fe76-4b6b-869c-ea7529ee42fb-END.bin
-rw------- 1 root root 16 Jul 24 03:18
1532402300056-a6109747-fe76-4b6b-869c-ea7529ee42fb-START.bin
-rw------- 1 root root 16 Jul 24 03:17 1532402278723-node-started.bin
-rw------- 1 root root 16 Jul 24 03:17
1532401832758-43fc7608-a11e-4176-a879-37a1ee0f9f37-END.bin
-rw------- 1 root root 16 Jul 24 03:10
1532401832758-43fc7608-a11e-4176-a879-37a1ee0f9f37-START.bin
-rw------- 1 root root 16 Jul 24 03:00
1532401232746-5f6483fe-4d1b-4426-8e25-28d3dc1b156b-END.bin
-rw------- 1 root root 16 Jul 24 03:00
1532401232746-5f6483fe-4d1b-4426-8e25-28d3dc1b156b-START.bin
-rw------- 1 root root 16 Jul 24 03:00 1532401217918-node-started.bin
-rw------- 1 root root 16 Jul 23 19:29
1532374156904-fb69dc16-1947-411c-bc5e-ee6540ba8e53-END.bin
-rw------- 1 root root 16 Jul 23 19:29
1532374156904-fb69dc16-1947-411c-bc5e-ee6540ba8e53-START.bin
-rw------- 1 root root 16 Jul 23 17:19
1532366356784-bf07f598-0aa4-4503-a594-336b8eccbdbe-END.bin
-rw------- 1 root root 16 Jul 23 17:19
1532366356784-bf07f598-0aa4-4503-a594-336b8eccbdbe-START.bin
-rw------- 1 root root 16 Jul 23 17:09
1532365756812-ff66bf97-c4fd-4ae3-9aa0-872fb09eb610-END.bin
-rw------- 1 root root 16 Jul 23 17:09
1532365756812-ff66bf97-c4fd-4ae3-9aa0-872fb09eb610-START.bin
-rw------- 1 root root 16 Jul 23 16:59
1532365156819-9774fab8-da28-4188-80ee-a5753668b52a-END.bin
-rw------- 1 root root 16 Jul 23 16:59
1532365156819-9774fab8-da28-4188-80ee-a5753668b52a-START.bin
-rw------- 1 root root 16 Jul 23 16:49
1532364556960-64ee6f39-dc21-4fa2-b472-c00d3d738a87-END.bin
-rw------- 1 root root 16 Jul 23 16:49
1532364556960-64ee6f39-dc21-4fa2-b472-c00d3d738a87-START.bin
-rw------- 1 root root 16 Jul 23 16:39
1532363956827-d9a3189d-586f-437f-9d86-463bdb23ae1f-END.bin
-rw------- 1 root root 16 Jul 23 16:39
1532363956827-d9a3189d-586f-437f-9d86-463bdb23ae1f-START.bin



--
Sent from: http://apache-ignite-users.70518.x6.nabble.com/

Mime
View raw message