spark-reviews mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From vanzin <...@git.apache.org>
Subject [GitHub] spark pull request: [SPARK-6602][Core] Update Master, Worker, Clie...
Date Tue, 07 Apr 2015 22:56:38 GMT
Github user vanzin commented on a diff in the pull request:

    https://github.com/apache/spark/pull/5392#discussion_r27929385
  
    --- Diff: core/src/main/scala/org/apache/spark/deploy/client/AppClient.scala ---
    @@ -40,98 +37,127 @@ import org.apache.spark.util.{ActorLogReceive, Utils, AkkaUtils}
      * @param masterUrls Each url should look like spark://host:port.
      */
     private[spark] class AppClient(
    -    actorSystem: ActorSystem,
    +    rpcEnv: RpcEnv,
         masterUrls: Array[String],
         appDescription: ApplicationDescription,
         listener: AppClientListener,
         conf: SparkConf)
       extends Logging {
     
    -  private val masterAkkaUrls = masterUrls.map(Master.toAkkaUrl(_, AkkaUtils.protocol(actorSystem)))
    +  private val masterRpcAddresses = masterUrls.map(RpcAddress.fromSparkURL(_))
     
    -  private val REGISTRATION_TIMEOUT = 20.seconds
    +  private val REGISTRATION_TIMEOUT_SECONDS = 20
       private val REGISTRATION_RETRIES = 3
     
    -  private var masterAddress: Address = null
    -  private var actor: ActorRef = null
    +  private var endpoint: RpcEndpointRef = null
       private var appId: String = null
    -  private var registered = false
    -  private var activeMasterUrl: String = null
    +  @volatile private var registered = false
    +
    +  private class ClientEndpoint(override val rpcEnv: RpcEnv) extends ThreadSafeRpcEndpoint
    +    with Logging {
    +
    +    var master: Option[RpcEndpointRef] = None
    +    var alreadyDisconnected = false // To avoid calling listener.disconnected() multiple times
    +    @volatile private var alreadyDead = false // To avoid calling listener.dead() multiple times
    +    @volatile private var registerMasterFutures: Array[Future[_]] = null
    +    @volatile private var registrationRetryTimer: ScheduledFuture[_] = null
    +
    +    private val registerMasterThreadPool = new ThreadPoolExecutor(
    +      0,
    +      masterRpcAddresses.size, // Make sure we can register with all masters at the same time
    +      60L, TimeUnit.SECONDS,
    +      new SynchronousQueue[Runnable](),
    +      Utils.namedThreadFactory("appclient-register-master-threadpool"))
     
    -  private class ClientActor extends Actor with ActorLogReceive with Logging {
    -    var master: ActorSelection = null
    -    var alreadyDisconnected = false  // To avoid calling listener.disconnected() multiple times
    -    var alreadyDead = false  // To avoid calling listener.dead() multiple times
    -    var registrationRetryTimer: Option[Cancellable] = None
    +    private val registrationRetryThread = Executors.newScheduledThreadPool(1,
    +      Utils.namedThreadFactory("appclient-registration-retry-thread"))
     
    -    override def preStart() {
    -      context.system.eventStream.subscribe(self, classOf[RemotingLifecycleEvent])
    +    override def onStart(): Unit = {
           try {
    -        registerWithMaster()
    +        registerWithMaster(1)
           } catch {
             case e: Exception =>
               logWarning("Failed to connect to master", e)
               markDisconnected()
    -          context.stop(self)
    +          stop()
           }
         }
     
    -    def tryRegisterAllMasters() {
    -      for (masterAkkaUrl <- masterAkkaUrls) {
    -        logInfo("Connecting to master " + masterAkkaUrl + "...")
    -        val actor = context.actorSelection(masterAkkaUrl)
    -        actor ! RegisterApplication(appDescription)
    +    private def tryRegisterAllMasters(): Array[Future[_]] = {
    +      for (masterAddress <- masterRpcAddresses) yield {
    +        registerMasterThreadPool.submit(new Runnable {
    +          override def run(): Unit = try {
    +            if (registered) {
    +              return
    +            }
    +            logInfo("Connecting to master " + masterAddress.toSparkURL + "...")
    +            val masterRef =
    +              rpcEnv.setupEndpointRef(Master.SYSTEM_NAME, masterAddress, Master.ENDPOINT_NAME)
    +            masterRef.send(RegisterApplication(appDescription, self))
    +          } catch {
    +            case ie: InterruptedException => // Cancelled
    +            case NonFatal(e) => logError(e.getMessage, e)
    +          }
    +        })
           }
         }
     
    -    def registerWithMaster() {
    -      tryRegisterAllMasters()
    -      import context.dispatcher
    -      var retries = 0
    -      registrationRetryTimer = Some {
    -        context.system.scheduler.schedule(REGISTRATION_TIMEOUT, REGISTRATION_TIMEOUT) {
    +    /**
    +     * nthRetry means this is the nth attempt to register with master
    +     */
    +    private def registerWithMaster(nthRetry: Int) {
    +      registerMasterFutures = tryRegisterAllMasters()
    +      registrationRetryTimer = registrationRetryThread.scheduleAtFixedRate(new Runnable {
    +        override def run(): Unit = {
               Utils.tryOrExit {
    -            retries += 1
                 if (registered) {
    -              registrationRetryTimer.foreach(_.cancel())
    -            } else if (retries >= REGISTRATION_RETRIES) {
    +              registerMasterFutures.foreach(_.cancel(true))
    +              registerMasterThreadPool.shutdownNow()
    +            } else if (nthRetry >= REGISTRATION_RETRIES) {
                   markDead("All masters are unresponsive! Giving up.")
                 } else {
    -              tryRegisterAllMasters()
    +              registerMasterFutures.foreach(_.cancel(true))
    +              registerWithMaster(nthRetry + 1)
                 }
               }
             }
    -      }
    +      }, REGISTRATION_TIMEOUT_SECONDS, REGISTRATION_TIMEOUT_SECONDS, TimeUnit.SECONDS)
         }
     
    -    def changeMaster(url: String) {
    -      // activeMasterUrl is a valid Spark url since we receive it from master.
    -      activeMasterUrl = url
    -      master = context.actorSelection(
    -        Master.toAkkaUrl(activeMasterUrl, AkkaUtils.protocol(actorSystem)))
    -      masterAddress = Master.toAkkaAddress(activeMasterUrl, AkkaUtils.protocol(actorSystem))
    +    private def sendToMaster(message: Any): Unit = {
    +      master match {
    +        case Some(masterRef) => masterRef.send(message)
    +        case None => logWarning(s"Drop $message because has not yet connected to master")
    +      }
         }
     
    -    private def isPossibleMaster(remoteUrl: Address) = {
    -      masterAkkaUrls.map(AddressFromURIString(_).hostPort).contains(remoteUrl.hostPort)
    +    private def isPossibleMaster(remoteAddress: RpcAddress): Boolean = {
    +      masterRpcAddresses.map(_.hostPort).contains(remoteAddress.hostPort)
         }
     
    -    override def receiveWithLogging: PartialFunction[Any, Unit] = {
    -      case RegisteredApplication(appId_, masterUrl) =>
    +    override def receive: PartialFunction[Any, Unit] = {
    +      case RegisteredApplication(appId_, masterRef) =>
    +        // FIXME How to handle the following cases?
    --- End diff --
    
    So, this is the kind of thing that `sendWithReply` was meant to do. Can that be used here instead?
    
    That way, if the call fails, you know (with good probability) that the app was not registered. At that point I'm not sure what the protocol is: send the request to the next master? Give up?
    
    But in any case, this is one case where the "ask" pattern is really weird.


---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at infrastructure@apache.org or file a JIRA ticket
with INFRA.
---

---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org


Mime
View raw message