hadoop-hdfs-issues mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From "Nie Gus (JIRA)" <j...@apache.org>
Subject [jira] [Updated] (HDFS-13220) Change lastCheckpointTime to use fsimage mostRecentCheckpointTime
Date Fri, 02 Mar 2018 09:57:00 GMT

     [ https://issues.apache.org/jira/browse/HDFS-13220?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
]

Nie Gus updated HDFS-13220:
---------------------------
    Description: 
We found that our standby NN was not doing checkpoints and the checkpoint alert kept firing.
We use the JMX last checkpoint time and dfs.namenode.checkpoint.period for the monitoring
check.

 

We then checked the code and logs and found that the standby NN uses monotonicNow, not the fsimage checkpoint
time, so when the standby NN restarts or switches to active, the

lastCheckpointTime in doWork is reset. So there is a risk that a standby NN restart or a standby/active
switch will delay the checkpoint.

 StandbyCheckpointer.java
{code:java}
/**
 * Main loop of the checkpointing thread. While shouldRun is set, each pass
 * sleeps for the check period (the sleep is skipped when a rollback fsimage is
 * pending), decides whether a checkpoint is needed, and if so calls
 * doCheckpoint().
 *
 * A checkpoint is triggered when a rollback fsimage is needed, when the number
 * of uncheckpointed transactions reaches checkpointConf.getTxnCount(), or when
 * the seconds elapsed since lastCheckpointTime reach checkpointConf.getPeriod().
 *
 * NOTE(review): lastCheckpointTime is seeded from monotonicNow() on entry, so
 * the elapsed-time trigger is measured from when this method started, not from
 * the time of the most recent checkpoint image.
 */
private void doWork() {
// Interval handed to Thread.sleep (milliseconds); getCheckPeriod() is
// presumably expressed in seconds -- TODO confirm.
final long checkPeriod = 1000 * checkpointConf.getCheckPeriod();
// Reset checkpoint time so that we don't always checkpoint
// on startup.
lastCheckpointTime = monotonicNow();
while (shouldRun) {
// Re-read on every pass; a pending rollback image skips the sleep below.
boolean needRollbackCheckpoint = namesystem.isNeedRollbackFsImage();
if (!needRollbackCheckpoint) {
try {
Thread.sleep(checkPeriod);
} catch (InterruptedException ie) {
// The interruption is swallowed; the shouldRun check right below decides
// whether the loop exits.
}
if (!shouldRun) {
break;
}
}
try {
// We may have lost our ticket since last checkpoint, log in again, just in case
if (UserGroupInformation.isSecurityEnabled()) {
UserGroupInformation.getCurrentUser().checkTGTAndReloginFromKeytab();
}
final long now = monotonicNow();
final long uncheckpointed = countUncheckpointedTxns();
// Divided by 1000 and compared against getPeriod() below; assumes
// monotonicNow() returns milliseconds -- TODO confirm.
final long secsSinceLast = (now - lastCheckpointTime) / 1000;
// A pending rollback image forces a checkpoint regardless of the thresholds.
boolean needCheckpoint = needRollbackCheckpoint;
if (needCheckpoint) {
LOG.info("Triggering a rollback fsimage for rolling upgrade.");
} else if (uncheckpointed >= checkpointConf.getTxnCount()) {
LOG.info("Triggering checkpoint because there have been " +
uncheckpointed + " txns since the last checkpoint, which " +
"exceeds the configured threshold " +
checkpointConf.getTxnCount());
needCheckpoint = true;
} else if (secsSinceLast >= checkpointConf.getPeriod()) {
LOG.info("Triggering checkpoint because it has been " +
secsSinceLast + " seconds since the last checkpoint, which " +
"exceeds the configured interval " + checkpointConf.getPeriod());
needCheckpoint = true;
}
// This guard runs whether or not needCheckpoint is set: while now is still
// before preventCheckpointsUntil the pass is abandoned and canceledCount is
// incremented. The finally block below still runs on that continue.
synchronized (cancelLock) {
if (now < preventCheckpointsUntil) {
LOG.info("But skipping this checkpoint since we are about to failover!");
canceledCount++;
continue;
}
assert canceler == null;
canceler = new Canceler();
}
if (needCheckpoint) {
doCheckpoint();
// reset needRollbackCheckpoint to false only when we finish a ckpt
// for rollback image
if (needRollbackCheckpoint
&& namesystem.getFSImage().hasRollbackFSImage()) {
namesystem.setCreatedRollbackImages(true);
namesystem.setNeedRollbackFsImage(false);
}
// Records the time sampled before doCheckpoint() ran, not the time it
// finished; it is only updated when doCheckpoint() returns normally.
lastCheckpointTime = now;
}
} catch (SaveNamespaceCancelledException ce) {
// A cancelled checkpoint is logged and counted; the loop keeps running.
LOG.info("Checkpoint was cancelled: " + ce.getMessage());
canceledCount++;
} catch (InterruptedException ie) {
LOG.info("Interrupted during checkpointing", ie);
// Probably requested shutdown.
continue;
} catch (Throwable t) {
// Any other failure is logged and the loop proceeds to the next pass.
LOG.error("Exception in doCheckpoint", t);
} finally {
// Always clear the canceler so the assert at the start of the next pass holds.
synchronized (cancelLock) {
canceler = null;
}
}
}
}
{code}
 

Can we use the fsimage's mostRecentCheckpointTime for this check instead?

 

thanks,

Gus

  was:
We found that our standby NN was not doing checkpoints and the checkpoint alert kept firing.
We use the JMX last checkpoint time and dfs.namenode.checkpoint.period for the monitoring
check.

 

We then checked the code and logs and found that the standby NN uses monotonicNow, not the fsimage checkpoint
time, so when the standby NN restarts or switches to active, the

lastCheckpointTime in doWork is reset. So there is a risk that a standby NN restart or a standby/active
switch will delay the checkpoint.

 
{code:java}
/**
 * Main loop of the checkpointing thread. While shouldRun is set, each pass
 * sleeps for the check period (the sleep is skipped when a rollback fsimage is
 * pending), decides whether a checkpoint is needed, and if so calls
 * doCheckpoint().
 *
 * A checkpoint is triggered when a rollback fsimage is needed, when the number
 * of uncheckpointed transactions reaches checkpointConf.getTxnCount(), or when
 * the seconds elapsed since lastCheckpointTime reach checkpointConf.getPeriod().
 *
 * NOTE(review): lastCheckpointTime is seeded from monotonicNow() on entry, so
 * the elapsed-time trigger is measured from when this method started, not from
 * the time of the most recent checkpoint image.
 */
private void doWork() {
// Interval handed to Thread.sleep (milliseconds); getCheckPeriod() is
// presumably expressed in seconds -- TODO confirm.
final long checkPeriod = 1000 * checkpointConf.getCheckPeriod();
// Reset checkpoint time so that we don't always checkpoint
// on startup.
lastCheckpointTime = monotonicNow();
while (shouldRun) {
// Re-read on every pass; a pending rollback image skips the sleep below.
boolean needRollbackCheckpoint = namesystem.isNeedRollbackFsImage();
if (!needRollbackCheckpoint) {
try {
Thread.sleep(checkPeriod);
} catch (InterruptedException ie) {
// The interruption is swallowed; the shouldRun check right below decides
// whether the loop exits.
}
if (!shouldRun) {
break;
}
}
try {
// We may have lost our ticket since last checkpoint, log in again, just in case
if (UserGroupInformation.isSecurityEnabled()) {
UserGroupInformation.getCurrentUser().checkTGTAndReloginFromKeytab();
}
final long now = monotonicNow();
final long uncheckpointed = countUncheckpointedTxns();
// Divided by 1000 and compared against getPeriod() below; assumes
// monotonicNow() returns milliseconds -- TODO confirm.
final long secsSinceLast = (now - lastCheckpointTime) / 1000;
// A pending rollback image forces a checkpoint regardless of the thresholds.
boolean needCheckpoint = needRollbackCheckpoint;
if (needCheckpoint) {
LOG.info("Triggering a rollback fsimage for rolling upgrade.");
} else if (uncheckpointed >= checkpointConf.getTxnCount()) {
LOG.info("Triggering checkpoint because there have been " +
uncheckpointed + " txns since the last checkpoint, which " +
"exceeds the configured threshold " +
checkpointConf.getTxnCount());
needCheckpoint = true;
} else if (secsSinceLast >= checkpointConf.getPeriod()) {
LOG.info("Triggering checkpoint because it has been " +
secsSinceLast + " seconds since the last checkpoint, which " +
"exceeds the configured interval " + checkpointConf.getPeriod());
needCheckpoint = true;
}
// This guard runs whether or not needCheckpoint is set: while now is still
// before preventCheckpointsUntil the pass is abandoned and canceledCount is
// incremented. The finally block below still runs on that continue.
synchronized (cancelLock) {
if (now < preventCheckpointsUntil) {
LOG.info("But skipping this checkpoint since we are about to failover!");
canceledCount++;
continue;
}
assert canceler == null;
canceler = new Canceler();
}
if (needCheckpoint) {
doCheckpoint();
// reset needRollbackCheckpoint to false only when we finish a ckpt
// for rollback image
if (needRollbackCheckpoint
&& namesystem.getFSImage().hasRollbackFSImage()) {
namesystem.setCreatedRollbackImages(true);
namesystem.setNeedRollbackFsImage(false);
}
// Records the time sampled before doCheckpoint() ran, not the time it
// finished; it is only updated when doCheckpoint() returns normally.
lastCheckpointTime = now;
}
} catch (SaveNamespaceCancelledException ce) {
// A cancelled checkpoint is logged and counted; the loop keeps running.
LOG.info("Checkpoint was cancelled: " + ce.getMessage());
canceledCount++;
} catch (InterruptedException ie) {
LOG.info("Interrupted during checkpointing", ie);
// Probably requested shutdown.
continue;
} catch (Throwable t) {
// Any other failure is logged and the loop proceeds to the next pass.
LOG.error("Exception in doCheckpoint", t);
} finally {
// Always clear the canceler so the assert at the start of the next pass holds.
synchronized (cancelLock) {
canceler = null;
}
}
}
}
{code}
 

Can we use the fsimage's mostRecentCheckpointTime for this check instead?

 

thanks,

Gus


> Change lastCheckpointTime to use fsimage mostRecentCheckpointTime
> -----------------------------------------------------------------
>
>                 Key: HDFS-13220
>                 URL: https://issues.apache.org/jira/browse/HDFS-13220
>             Project: Hadoop HDFS
>          Issue Type: Bug
>          Components: namenode
>            Reporter: Nie Gus
>            Priority: Minor
>
> We found that our standby NN was not doing checkpoints and the checkpoint alert kept
firing. We use the JMX last checkpoint time and dfs.namenode.checkpoint.period for the monitoring
check.
>  
> We then checked the code and logs and found that the standby NN uses monotonicNow, not the fsimage
checkpoint time, so when the standby NN restarts or switches to active, the
> lastCheckpointTime in doWork is reset. So there is a risk that a standby NN restart or a standby/active
switch will delay the checkpoint.
>  StandbyCheckpointer.java
> {code:java}
> private void doWork() {
> final long checkPeriod = 1000 * checkpointConf.getCheckPeriod();
> // Reset checkpoint time so that we don't always checkpoint
> // on startup.
> lastCheckpointTime = monotonicNow();
> while (shouldRun) {
> boolean needRollbackCheckpoint = namesystem.isNeedRollbackFsImage();
> if (!needRollbackCheckpoint) {
> try {
> Thread.sleep(checkPeriod);
> } catch (InterruptedException ie) {
> }
> if (!shouldRun) {
> break;
> }
> }
> try {
> // We may have lost our ticket since last checkpoint, log in again, just in case
> if (UserGroupInformation.isSecurityEnabled()) {
> UserGroupInformation.getCurrentUser().checkTGTAndReloginFromKeytab();
> }
> final long now = monotonicNow();
> final long uncheckpointed = countUncheckpointedTxns();
> final long secsSinceLast = (now - lastCheckpointTime) / 1000;
> boolean needCheckpoint = needRollbackCheckpoint;
> if (needCheckpoint) {
> LOG.info("Triggering a rollback fsimage for rolling upgrade.");
> } else if (uncheckpointed >= checkpointConf.getTxnCount()) {
> LOG.info("Triggering checkpoint because there have been " +
> uncheckpointed + " txns since the last checkpoint, which " +
> "exceeds the configured threshold " +
> checkpointConf.getTxnCount());
> needCheckpoint = true;
> } else if (secsSinceLast >= checkpointConf.getPeriod()) {
> LOG.info("Triggering checkpoint because it has been " +
> secsSinceLast + " seconds since the last checkpoint, which " +
> "exceeds the configured interval " + checkpointConf.getPeriod());
> needCheckpoint = true;
> }
> synchronized (cancelLock) {
> if (now < preventCheckpointsUntil) {
> LOG.info("But skipping this checkpoint since we are about to failover!");
> canceledCount++;
> continue;
> }
> assert canceler == null;
> canceler = new Canceler();
> }
> if (needCheckpoint) {
> doCheckpoint();
> // reset needRollbackCheckpoint to false only when we finish a ckpt
> // for rollback image
> if (needRollbackCheckpoint
> && namesystem.getFSImage().hasRollbackFSImage()) {
> namesystem.setCreatedRollbackImages(true);
> namesystem.setNeedRollbackFsImage(false);
> }
> lastCheckpointTime = now;
> }
> } catch (SaveNamespaceCancelledException ce) {
> LOG.info("Checkpoint was cancelled: " + ce.getMessage());
> canceledCount++;
> } catch (InterruptedException ie) {
> LOG.info("Interrupted during checkpointing", ie);
> // Probably requested shutdown.
> continue;
> } catch (Throwable t) {
> LOG.error("Exception in doCheckpoint", t);
> } finally {
> synchronized (cancelLock) {
> canceler = null;
> }
> }
> }
> }
> }
> {code}
>  
> Can we use the fsimage's mostRecentCheckpointTime for this check instead?
>  
> thanks,
> Gus



--
This message was sent by Atlassian JIRA
(v7.6.3#76005)

---------------------------------------------------------------------
To unsubscribe, e-mail: hdfs-issues-unsubscribe@hadoop.apache.org
For additional commands, e-mail: hdfs-issues-help@hadoop.apache.org


Mime
View raw message