From common-commits-return-98882-archive-asf-public=cust-asf.ponee.io@hadoop.apache.org Tue May 19 00:53:00 2020 Return-Path: X-Original-To: archive-asf-public@cust-asf.ponee.io Delivered-To: archive-asf-public@cust-asf.ponee.io Received: from mail.apache.org (hermes.apache.org [207.244.88.153]) by mx-eu-01.ponee.io (Postfix) with SMTP id 88A0818062B for ; Tue, 19 May 2020 02:53:00 +0200 (CEST) Received: (qmail 9920 invoked by uid 500); 19 May 2020 00:52:56 -0000 Mailing-List: contact common-commits-help@hadoop.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Delivered-To: mailing list common-commits@hadoop.apache.org Received: (qmail 9788 invoked by uid 99); 19 May 2020 00:52:56 -0000 Received: from ec2-52-202-80-70.compute-1.amazonaws.com (HELO gitbox.apache.org) (52.202.80.70) by apache.org (qpsmtpd/0.29) with ESMTP; Tue, 19 May 2020 00:52:56 +0000 Received: by gitbox.apache.org (ASF Mail Server at gitbox.apache.org, from userid 33) id DD15E85E29; Tue, 19 May 2020 00:52:55 +0000 (UTC) Date: Tue, 19 May 2020 00:52:55 +0000 To: "common-commits@hadoop.apache.org" Subject: [hadoop] branch branch-3.3 updated: HDFS-15293. Relax the condition for accepting a fsimage when receiving a checkpoint. Contributed by Chen Liang MIME-Version: 1.0 Content-Type: text/plain; charset=utf-8 Content-Transfer-Encoding: 8bit Message-ID: <158984957547.3462.1781266477428403415@gitbox.apache.org> From: aajisaka@apache.org X-Git-Host: gitbox.apache.org X-Git-Repo: hadoop X-Git-Refname: refs/heads/branch-3.3 X-Git-Reftype: branch X-Git-Oldrev: eb045ea056c6b70efd9f0ba514f37ec707dda209 X-Git-Newrev: e452163a06daa6bbebc571127754962d8776a925 X-Git-Rev: e452163a06daa6bbebc571127754962d8776a925 X-Git-NotificationType: ref_changed_plus_diff X-Git-Multimail-Version: 1.5.dev Auto-Submitted: auto-generated This is an automated email from the ASF dual-hosted git repository. aajisaka pushed a commit to branch branch-3.3 in repository https://gitbox.apache.org/repos/asf/hadoop.git The following commit(s) were added to refs/heads/branch-3.3 by this push: new e452163 HDFS-15293. Relax the condition for accepting a fsimage when receiving a checkpoint. Contributed by Chen Liang e452163 is described below commit e452163a06daa6bbebc571127754962d8776a925 Author: Chen Liang AuthorDate: Mon May 18 10:58:52 2020 -0700 HDFS-15293. Relax the condition for accepting a fsimage when receiving a checkpoint. Contributed by Chen Liang (cherry picked from commit 7bb902bc0d0c62d63a8960db444de3abb0a6ad22) --- .../hadoop/hdfs/server/namenode/ImageServlet.java | 39 ++++++++++++---- .../hdfs/server/namenode/TestCheckpoint.java | 53 +++++++++++++++++++++- 2 files changed, 81 insertions(+), 11 deletions(-) diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/ImageServlet.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/ImageServlet.java index 91f24dd..a9c2a09 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/ImageServlet.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/ImageServlet.java @@ -99,6 +99,19 @@ public class ImageServlet extends HttpServlet { "recent.image.check.enabled"; public static final boolean RECENT_IMAGE_CHECK_ENABLED_DEFAULT = true; + /* + * Specify a relaxation for the time delta check, the relaxation is to account + * for the scenario that there are chances that minor time difference (e.g. + * due to image upload delay, or minor machine clock skew) can cause ANN to + * reject a fsImage too aggressively. + */ + private static double recentImageCheckTimePrecision = 0.75; + + @VisibleForTesting + static void setRecentImageCheckTimePrecision(double ratio) { + recentImageCheckTimePrecision = ratio; + } + @Override public void doGet(final HttpServletRequest request, final HttpServletResponse response) throws ServletException, IOException { @@ -592,6 +605,9 @@ public class ImageServlet extends HttpServlet { long checkpointPeriod = conf.getTimeDuration(DFS_NAMENODE_CHECKPOINT_PERIOD_KEY, DFS_NAMENODE_CHECKPOINT_PERIOD_DEFAULT, TimeUnit.SECONDS); + checkpointPeriod = Math.round( + checkpointPeriod * recentImageCheckTimePrecision); + long checkpointTxnCount = conf.getLong(DFS_NAMENODE_CHECKPOINT_TXNS_KEY, DFS_NAMENODE_CHECKPOINT_TXNS_DEFAULT); @@ -612,21 +628,24 @@ public class ImageServlet extends HttpServlet { // a new fsImage // 1. most recent image's txid is too far behind // 2. last checkpoint time was too old - response.sendError(HttpServletResponse.SC_CONFLICT, - "Most recent checkpoint is neither too far behind in " - + "txid, nor too old. New txnid cnt is " - + (txid - lastCheckpointTxid) - + ", expecting at least " + checkpointTxnCount - + " unless too long since last upload."); + String message = "Rejecting a fsimage due to small time delta " + + "and txnid delta. Time since previous checkpoint is " + + timeDelta + " expecting at least " + checkpointPeriod + + " txnid delta since previous checkpoint is " + + (txid - lastCheckpointTxid) + " expecting at least " + + checkpointTxnCount; + LOG.info(message); + response.sendError(HttpServletResponse.SC_CONFLICT, message); return null; } try { if (nnImage.getStorage().findImageFile(nnf, txid) != null) { - response.sendError(HttpServletResponse.SC_CONFLICT, - "Either current namenode has checkpointed or " - + "another checkpointer already uploaded an " - + "checkpoint for txid " + txid); + String message = "Either current namenode has checkpointed or " + + "another checkpointer already uploaded an " + + "checkpoint for txid " + txid; + LOG.info(message); + response.sendError(HttpServletResponse.SC_CONFLICT, message); return null; } diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestCheckpoint.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestCheckpoint.java index 572ad8b..fabe8c5 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestCheckpoint.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestCheckpoint.java @@ -2464,7 +2464,7 @@ public class TestCheckpoint { } @Test(timeout = 300000) - public void testActiveRejectSmallerDeltaImage() throws Exception { + public void testActiveRejectSmallerTxidDeltaImage() throws Exception { MiniDFSCluster cluster = null; Configuration conf = new HdfsConfiguration(); // Set the delta txid threshold to 10 @@ -2517,6 +2517,57 @@ public class TestCheckpoint { } } + /** + * Test that even with txid and time delta threshold, by having time + * relaxation, SBN can still upload images to ANN. + * + * @throws Exception + */ + @Test + public void testActiveImageWithTimeDeltaRelaxation() throws Exception { + Configuration conf = new HdfsConfiguration(); + // Set the delta txid threshold to some arbitrarily large value, so + // it does not trigger a checkpoint during this test. + conf.setInt(DFSConfigKeys.DFS_NAMENODE_CHECKPOINT_TXNS_KEY, 1000000); + // Set the delta time threshold to some arbitrarily large value, so + // it does not trigger a checkpoint during this test. + conf.setInt(DFSConfigKeys.DFS_NAMENODE_CHECKPOINT_PERIOD_KEY, 900000); + // Set relaxation to 0, means time delta = 0 from previous image is fine, + // this will effectively disable reject small delta image + ImageServlet.setRecentImageCheckTimePrecision(0); + + SecondaryNameNode secondary = null; + + try (MiniDFSCluster cluster = new MiniDFSCluster.Builder(conf) + .numDataNodes(0).format(true).build()) { + // enable small delta rejection + NameNode active = cluster.getNameNode(); + active.httpServer.getHttpServer() + .setAttribute(RECENT_IMAGE_CHECK_ENABLED, true); + + secondary = startSecondaryNameNode(conf); + + FileSystem fs = cluster.getFileSystem(); + assertEquals(0, active.getNamesystem().getFSImage() + .getMostRecentCheckpointTxId()); + + // create 5 dir. + for (int i = 0; i < 5; i++) { + fs.mkdirs(new Path("dir-" + i)); + } + + // Checkpoint 1st + secondary.doCheckpoint(); + // at this point, despite this is a small delta change, w.r.t both + // txid and time delta, due to we set relaxation to 0, this image + // still gets accepted + assertEquals(9, active.getNamesystem().getFSImage() + .getMostRecentCheckpointTxId()); + } finally { + cleanup(secondary); + } + } + private static void cleanup(SecondaryNameNode snn) { if (snn != null) { try { --------------------------------------------------------------------- To unsubscribe, e-mail: common-commits-unsubscribe@hadoop.apache.org For additional commands, e-mail: common-commits-help@hadoop.apache.org