From common-commits-return-98879-archive-asf-public=cust-asf.ponee.io@hadoop.apache.org Mon May 18 19:06:10 2020 Return-Path: X-Original-To: archive-asf-public@cust-asf.ponee.io Delivered-To: archive-asf-public@cust-asf.ponee.io Received: from mail.apache.org (hermes.apache.org [207.244.88.153]) by mx-eu-01.ponee.io (Postfix) with SMTP id B969818062B for ; Mon, 18 May 2020 21:06:09 +0200 (CEST) Received: (qmail 92929 invoked by uid 500); 18 May 2020 19:06:09 -0000 Mailing-List: contact common-commits-help@hadoop.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Delivered-To: mailing list common-commits@hadoop.apache.org Received: (qmail 92920 invoked by uid 99); 18 May 2020 19:06:08 -0000 Received: from ec2-52-202-80-70.compute-1.amazonaws.com (HELO gitbox.apache.org) (52.202.80.70) by apache.org (qpsmtpd/0.29) with ESMTP; Mon, 18 May 2020 19:06:08 +0000 Received: by gitbox.apache.org (ASF Mail Server at gitbox.apache.org, from userid 33) id 6EB1B85E29; Mon, 18 May 2020 19:06:08 +0000 (UTC) Date: Mon, 18 May 2020 19:06:08 +0000 To: "common-commits@hadoop.apache.org" Subject: [hadoop] branch branch-3.2 updated: HDFS-15293. Relax the condition for accepting a fsimage when receiving a checkpoint. Contributed by Chen Liang MIME-Version: 1.0 Content-Type: text/plain; charset=utf-8 Content-Transfer-Encoding: 8bit Message-ID: <158982876802.23415.3750544153145275429@gitbox.apache.org> From: cliang@apache.org X-Git-Host: gitbox.apache.org X-Git-Repo: hadoop X-Git-Refname: refs/heads/branch-3.2 X-Git-Reftype: branch X-Git-Oldrev: 777e36448cef41c347a1ee57d88cf96cd086e869 X-Git-Newrev: 1813d25bf23b2ae3fe87cf5bbf2d5dcb7987cf65 X-Git-Rev: 1813d25bf23b2ae3fe87cf5bbf2d5dcb7987cf65 X-Git-NotificationType: ref_changed_plus_diff X-Git-Multimail-Version: 1.5.dev Auto-Submitted: auto-generated This is an automated email from the ASF dual-hosted git repository. cliang pushed a commit to branch branch-3.2 in repository https://gitbox.apache.org/repos/asf/hadoop.git The following commit(s) were added to refs/heads/branch-3.2 by this push: new 1813d25 HDFS-15293. Relax the condition for accepting a fsimage when receiving a checkpoint. Contributed by Chen Liang 1813d25 is described below commit 1813d25bf23b2ae3fe87cf5bbf2d5dcb7987cf65 Author: Chen Liang AuthorDate: Mon May 18 10:58:52 2020 -0700 HDFS-15293. Relax the condition for accepting a fsimage when receiving a checkpoint. Contributed by Chen Liang --- .../hadoop/hdfs/server/namenode/ImageServlet.java | 39 ++++++++++++---- .../hdfs/server/namenode/TestCheckpoint.java | 53 +++++++++++++++++++++- 2 files changed, 81 insertions(+), 11 deletions(-) diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/ImageServlet.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/ImageServlet.java index c36b46d..5ed2a16 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/ImageServlet.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/ImageServlet.java @@ -98,6 +98,19 @@ public class ImageServlet extends HttpServlet { "recent.image.check.enabled"; public static final boolean RECENT_IMAGE_CHECK_ENABLED_DEFAULT = true; + /* + * Specify a relaxation for the time delta check, the relaxation is to account + * for the scenario that there are chances that minor time difference (e.g. + * due to image upload delay, or minor machine clock skew) can cause ANN to + * reject a fsImage too aggressively. + */ + private static double recentImageCheckTimePrecision = 0.75; + + @VisibleForTesting + static void setRecentImageCheckTimePrecision(double ratio) { + recentImageCheckTimePrecision = ratio; + } + @Override public void doGet(final HttpServletRequest request, final HttpServletResponse response) throws ServletException, IOException { @@ -566,6 +579,9 @@ public class ImageServlet extends HttpServlet { long checkpointPeriod = conf.getTimeDuration(DFS_NAMENODE_CHECKPOINT_PERIOD_KEY, DFS_NAMENODE_CHECKPOINT_PERIOD_DEFAULT, TimeUnit.SECONDS); + checkpointPeriod = Math.round( + checkpointPeriod * recentImageCheckTimePrecision); + long checkpointTxnCount = conf.getLong(DFS_NAMENODE_CHECKPOINT_TXNS_KEY, DFS_NAMENODE_CHECKPOINT_TXNS_DEFAULT); @@ -586,21 +602,24 @@ public class ImageServlet extends HttpServlet { // a new fsImage // 1. most recent image's txid is too far behind // 2. last checkpoint time was too old - response.sendError(HttpServletResponse.SC_CONFLICT, - "Most recent checkpoint is neither too far behind in " - + "txid, nor too old. New txnid cnt is " - + (txid - lastCheckpointTxid) - + ", expecting at least " + checkpointTxnCount - + " unless too long since last upload."); + String message = "Rejecting a fsimage due to small time delta " + + "and txnid delta. Time since previous checkpoint is " + + timeDelta + " expecting at least " + checkpointPeriod + + " txnid delta since previous checkpoint is " + + (txid - lastCheckpointTxid) + " expecting at least " + + checkpointTxnCount; + LOG.info(message); + response.sendError(HttpServletResponse.SC_CONFLICT, message); return null; } try { if (nnImage.getStorage().findImageFile(nnf, txid) != null) { - response.sendError(HttpServletResponse.SC_CONFLICT, - "Either current namenode has checkpointed or " - + "another checkpointer already uploaded an " - + "checkpoint for txid " + txid); + String message = "Either current namenode has checkpointed or " + + "another checkpointer already uploaded an " + + "checkpoint for txid " + txid; + LOG.info(message); + response.sendError(HttpServletResponse.SC_CONFLICT, message); return null; } diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestCheckpoint.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestCheckpoint.java index 46f6694..7bd2f9c 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestCheckpoint.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestCheckpoint.java @@ -2474,7 +2474,7 @@ public class TestCheckpoint { } @Test(timeout = 300000) - public void testActiveRejectSmallerDeltaImage() throws Exception { + public void testActiveRejectSmallerTxidDeltaImage() throws Exception { MiniDFSCluster cluster = null; Configuration conf = new HdfsConfiguration(); // Set the delta txid threshold to 10 @@ -2527,6 +2527,57 @@ public class TestCheckpoint { } } + /** + * Test that even with txid and time delta threshold, by having time + * relaxation, SBN can still upload images to ANN. + * + * @throws Exception + */ + @Test + public void testActiveImageWithTimeDeltaRelaxation() throws Exception { + Configuration conf = new HdfsConfiguration(); + // Set the delta txid threshold to some arbitrarily large value, so + // it does not trigger a checkpoint during this test. + conf.setInt(DFSConfigKeys.DFS_NAMENODE_CHECKPOINT_TXNS_KEY, 1000000); + // Set the delta time threshold to some arbitrarily large value, so + // it does not trigger a checkpoint during this test. + conf.setInt(DFSConfigKeys.DFS_NAMENODE_CHECKPOINT_PERIOD_KEY, 900000); + // Set relaxation to 0, means time delta = 0 from previous image is fine, + // this will effectively disable reject small delta image + ImageServlet.setRecentImageCheckTimePrecision(0); + + SecondaryNameNode secondary = null; + + try (MiniDFSCluster cluster = new MiniDFSCluster.Builder(conf) + .numDataNodes(0).format(true).build()) { + // enable small delta rejection + NameNode active = cluster.getNameNode(); + active.httpServer.getHttpServer() + .setAttribute(RECENT_IMAGE_CHECK_ENABLED, true); + + secondary = startSecondaryNameNode(conf); + + FileSystem fs = cluster.getFileSystem(); + assertEquals(0, active.getNamesystem().getFSImage() + .getMostRecentCheckpointTxId()); + + // create 5 dir. + for (int i = 0; i < 5; i++) { + fs.mkdirs(new Path("dir-" + i)); + } + + // Checkpoint 1st + secondary.doCheckpoint(); + // at this point, despite this is a small delta change, w.r.t both + // txid and time delta, due to we set relaxation to 0, this image + // still gets accepted + assertEquals(9, active.getNamesystem().getFSImage() + .getMostRecentCheckpointTxId()); + } finally { + cleanup(secondary); + } + } + private static void cleanup(SecondaryNameNode snn) { if (snn != null) { try { --------------------------------------------------------------------- To unsubscribe, e-mail: common-commits-unsubscribe@hadoop.apache.org For additional commands, e-mail: common-commits-help@hadoop.apache.org