From ozone-commits-return-881-archive-asf-public=cust-asf.ponee.io@hadoop.apache.org Fri Mar 13 15:29:54 2020 Return-Path: X-Original-To: archive-asf-public@cust-asf.ponee.io Delivered-To: archive-asf-public@cust-asf.ponee.io Received: from mail.apache.org (hermes.apache.org [207.244.88.153]) by mx-eu-01.ponee.io (Postfix) with SMTP id 4A2B518062C for ; Fri, 13 Mar 2020 16:29:54 +0100 (CET) Received: (qmail 83396 invoked by uid 500); 13 Mar 2020 15:29:53 -0000 Mailing-List: contact ozone-commits-help@hadoop.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: ozone-commits@hadoop.apache.org Delivered-To: mailing list ozone-commits@hadoop.apache.org Received: (qmail 83384 invoked by uid 99); 13 Mar 2020 15:29:53 -0000 Received: from ec2-52-202-80-70.compute-1.amazonaws.com (HELO gitbox.apache.org) (52.202.80.70) by apache.org (qpsmtpd/0.29) with ESMTP; Fri, 13 Mar 2020 15:29:53 +0000 Received: by gitbox.apache.org (ASF Mail Server at gitbox.apache.org, from userid 33) id 4B6E48DACA; Fri, 13 Mar 2020 15:29:53 +0000 (UTC) Date: Fri, 13 Mar 2020 15:29:53 +0000 To: "ozone-commits@hadoop.apache.org" Subject: [hadoop-ozone] branch ozone-0.5.0 updated: HDDS-3116. Datanode sometimes fails to start with NPE when starting Ratis xceiver server (#630) MIME-Version: 1.0 Content-Type: text/plain; charset=utf-8 Content-Transfer-Encoding: 8bit Message-ID: <158411339311.27474.16824017849617194116@gitbox.apache.org> From: arp@apache.org X-Git-Host: gitbox.apache.org X-Git-Repo: hadoop-ozone X-Git-Refname: refs/heads/ozone-0.5.0 X-Git-Reftype: branch X-Git-Oldrev: 637add05d410c8722741c0764578a8eb10d92197 X-Git-Newrev: cf3f6ed0da13ba7d71c2ae181e369eaf5e51aa6b X-Git-Rev: cf3f6ed0da13ba7d71c2ae181e369eaf5e51aa6b X-Git-NotificationType: ref_changed_plus_diff X-Git-Multimail-Version: 1.5.dev Auto-Submitted: auto-generated This is an automated email from the ASF dual-hosted git repository. 
arp pushed a commit to branch ozone-0.5.0 in repository https://gitbox.apache.org/repos/asf/hadoop-ozone.git The following commit(s) were added to refs/heads/ozone-0.5.0 by this push: new cf3f6ed HDDS-3116. Datanode sometimes fails to start with NPE when starting Ratis xceiver server (#630) cf3f6ed is described below commit cf3f6ed0da13ba7d71c2ae181e369eaf5e51aa6b Author: Stephen O'Donnell AuthorDate: Thu Mar 12 08:40:31 2020 +0000 HDDS-3116. Datanode sometimes fails to start with NPE when starting Ratis xceiver server (#630) (cherry picked from commit c1997218a4e1a6695a275c73cf85360cd046329c) --- .../common/statemachine/DatanodeStateMachine.java | 27 +++++++++++++++++++--- 1 file changed, 24 insertions(+), 3 deletions(-) diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/statemachine/DatanodeStateMachine.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/statemachine/DatanodeStateMachine.java index 5229ae8..dc39025 100644 --- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/statemachine/DatanodeStateMachine.java +++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/statemachine/DatanodeStateMachine.java @@ -21,6 +21,8 @@ import java.io.IOException; import java.util.concurrent.ExecutorService; import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicLong; +import java.util.concurrent.locks.ReadWriteLock; +import java.util.concurrent.locks.ReentrantReadWriteLock; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.hdds.conf.OzoneConfiguration; @@ -90,6 +92,11 @@ public class DatanodeStateMachine implements Closeable { private JvmPauseMonitor jvmPauseMonitor; private CertificateClient dnCertClient; private final HddsDatanodeStopService hddsDatanodeStopService; + /** + * Used to synchronize to the OzoneContainer object created in the + * constructor in a 
non-thread-safe way - see HDDS-3116. + */ + private final ReadWriteLock constructionLock = new ReentrantReadWriteLock(); /** * Constructs a datanode state machine. @@ -113,8 +120,16 @@ public class DatanodeStateMachine implements Closeable { .setNameFormat("Datanode State Machine Thread - %d").build()); connectionManager = new SCMConnectionManager(conf); context = new StateContext(this.conf, DatanodeStates.getInitState(), this); - container = new OzoneContainer(this.datanodeDetails, - ozoneConf, context, certClient); + // OzoneContainer instance is used in a non-thread safe way by the context + // passed to its constructor, so we must synchronize its access. See + // HDDS-3116 for more details. + constructionLock.writeLock().lock(); + try { + container = new OzoneContainer(this.datanodeDetails, + ozoneConf, context, certClient); + } finally { + constructionLock.writeLock().unlock(); + } dnCertClient = certClient; nextHB = new AtomicLong(Time.monotonicNow()); @@ -173,7 +188,13 @@ public class DatanodeStateMachine implements Closeable { } public OzoneContainer getContainer() { - return this.container; + // See HDDS-3116 to explain the need for this lock + constructionLock.readLock().lock(); + try { + return this.container; + } finally { + constructionLock.readLock().unlock(); + } } /** --------------------------------------------------------------------- To unsubscribe, e-mail: ozone-commits-unsubscribe@hadoop.apache.org For additional commands, e-mail: ozone-commits-help@hadoop.apache.org