Mailing-List: contact issues-help@carbondata.incubator.apache.org; run by ezmlm
Precedence: bulk
Reply-To: dev@carbondata.incubator.apache.org
From: jackylk <git@git.apache.org>
To: issues@carbondata.incubator.apache.org
Reply-To: issues@carbondata.incubator.apache.org
References: <git-pr-362-incubator-carbondata@git.apache.org>
In-Reply-To: <git-pr-362-incubator-carbondata@git.apache.org>
Subject: [GitHub] incubator-carbondata pull request #362: [CARBONDATA-459] Block distribution ...
Content-Type: text/plain
Message-Id: <20161201011319.C0936E04BB@git1-us-west.apache.org>
Date: Thu,  1 Dec 2016 01:13:19 +0000 (UTC)
archived-at: Thu, 01 Dec 2016 01:13:26 -0000

Github user jackylk commented on a diff in the pull request:

    https://github.com/apache/incubator-carbondata/pull/362#discussion_r90365594
  
    --- Diff: integration/spark/src/main/scala/org/apache/spark/sql/hive/DistributionUtil.scala ---
    @@ -101,47 +101,107 @@ object DistributionUtil {
        * Checking if the existing executors is greater than configured executors, if yes
        * returning configured executors.
        *
    -   * @param blockList
    +   * @param blockList total number of blocks in the identified segments
        * @param sparkContext
        * @return
        */
       def ensureExecutorsAndGetNodeList(blockList: Seq[Distributable],
         sparkContext: SparkContext): Seq[String] = {
         val nodeMapping = CarbonLoaderUtil.getRequiredExecutors(blockList.asJava)
    -    var confExecutorsTemp: String = null
    -    if (sparkContext.getConf.contains("spark.executor.instances")) {
    -      confExecutorsTemp = sparkContext.getConf.get("spark.executor.instances")
    -    } else if (sparkContext.getConf.contains("spark.dynamicAllocation.enabled")
    -               && sparkContext.getConf.get("spark.dynamicAllocation.enabled").trim
    -                 .equalsIgnoreCase("true")) {
    -      if (sparkContext.getConf.contains("spark.dynamicAllocation.maxExecutors")) {
    -        confExecutorsTemp = sparkContext.getConf.get("spark.dynamicAllocation.maxExecutors")
    +    ensureExecutorsByNumberAndGetNodeList(nodeMapping, blockList, sparkContext)
    +  }
    +
    +  /**
    +   * This method will ensure that the required/configured number of executors are requested
    +   * for processing the identified blocks
    +   *
    +   * @param nodeMapping
    +   * @param blockList
    +   * @param sparkContext
    +   * @return
    +   */
    +  private def ensureExecutorsByNumberAndGetNodeList(nodeMapping: java.util.Map[String, java.util
    +  .List[Distributable]], blockList: Seq[Distributable],
    +      sparkContext: SparkContext): Seq[String] = {
    +    val nodesOfData = nodeMapping.size()
    +    val confExecutors: Int = getConfiguredExecutors(sparkContext)
    +    LOGGER.info("Executors configured : " + confExecutors)
    +    val requiredExecutors = if (nodesOfData < 1 || nodesOfData > confExecutors) {
    +      confExecutors
    +    } else if (confExecutors > nodesOfData) {
    +      // this case will come only if dynamic allocation is true
    +      var totalExecutorsToBeRequested = nodesOfData
    +      // If total number of blocks are greater than the nodes identified then ensure
    +      // that the configured number of max executors can be opened based on the difference of
    +      // block list size and nodes identified
    +      if (blockList.size > nodesOfData) {
    +        // e.g 1. blockList size = 40, nodesOfData = 3, confExecutors = 6, then all executors
    +        // need to be opened
    +        // 2. blockList size = 4, nodesOfData = 3, confExecutors = 6, then
    +        // total 4 executors need to be opened
    +        val extraExecutorsToBeRequested = blockList.size - nodesOfData
    +        if (extraExecutorsToBeRequested > confExecutors) {
    +          totalExecutorsToBeRequested = confExecutors
    +        } else {
    +          totalExecutorsToBeRequested = nodesOfData + extraExecutorsToBeRequested
    +        }
           }
    +      LOGGER.info("Total executors requested: " + totalExecutorsToBeRequested)
    +      totalExecutorsToBeRequested
    +    } else {
    +      nodesOfData
         }
     
    -    val confExecutors = if (null != confExecutorsTemp) {
    -      confExecutorsTemp.toInt
    +    val startTime = System.currentTimeMillis();
    +    if (sparkContext.getConf.getBoolean("spark.dynamicAllocation.enabled", false)) {
    +      // this case will come only if dynamic allocation is true
    +      CarbonContext
    +        .ensureExecutors(sparkContext, requiredExecutors, blockList.size, Map.empty)
         } else {
    -      1
    +      CarbonContext.ensureExecutors(sparkContext, requiredExecutors)
         }
    -    val requiredExecutors = if (nodeMapping.size > confExecutors) {
    -      confExecutors
    +    getDistinctNodesList(sparkContext, requiredExecutors, startTime)
    +  }
    +
    +  /**
    +   * This method will return the configured executors
    +   *
    +   * @param sparkContext
    +   * @return
    +   */
    +  private def getConfiguredExecutors(sparkContext: SparkContext): Int = {
    +    var confExecutors: Int = 0
    +    if (sparkContext.getConf.getBoolean("spark.dynamicAllocation.enabled", false)) {
    +      // default value for spark.dynamicAllocation.maxExecutors is infinity
    +      confExecutors = sparkContext.getConf.getInt("spark.dynamicAllocation.maxExecutors", 1)
         } else {
    -      nodeMapping.size()
    +      // default value for spark.executor.instances is 2
    +      confExecutors = sparkContext.getConf.getInt("spark.executor.instances", 1)
         }
    +    confExecutors
    +  }
     
    -    val startTime = System.currentTimeMillis()
    -    CarbonContext.ensureExecutors(sparkContext, requiredExecutors)
    +  /**
    +   * This method will return the distinct nodes list
    +   *
    +   * @param sparkContext
    +   * @param requiredExecutors
    +   * @param startTime
    +   * @return
    +   */
    +  private def getDistinctNodesList(sparkContext: SparkContext,
    +      requiredExecutors: Int,
    +      startTime: Long): Seq[String] = {
         var nodes = DistributionUtil.getNodeList(sparkContext)
    -    var maxTimes = 30
    +    var maxTimes = 10;
    --- End diff --
    
    Can we calculate this value (`maxTimes`) from configuration instead of hard-coding it?


---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and would like it, or if the feature is enabled but not working, please
contact infrastructure at infrastructure@apache.org or file a JIRA ticket
with INFRA.
---