From reviews-return-710836-archive-asf-public=cust-asf.ponee.io@spark.apache.org  Wed Oct  3 09:59:16 2018
Return-Path: <reviews-return-710836-archive-asf-public=cust-asf.ponee.io@spark.apache.org>
X-Original-To: archive-asf-public@cust-asf.ponee.io
Delivered-To: archive-asf-public@cust-asf.ponee.io
Received: from mail.apache.org (hermes.apache.org [140.211.11.3])
	by mx-eu-01.ponee.io (Postfix) with SMTP id 2323F18065B
	for <archive-asf-public@cust-asf.ponee.io>; Wed,  3 Oct 2018 09:59:15 +0200 (CEST)
Received: (qmail 3200 invoked by uid 500); 3 Oct 2018 07:59:14 -0000
Mailing-List: contact reviews-help@spark.apache.org; run by ezmlm
Precedence: bulk
List-Help: <mailto:reviews-help@spark.apache.org>
List-Unsubscribe: <mailto:reviews-unsubscribe@spark.apache.org>
List-Post: <mailto:reviews@spark.apache.org>
List-Id: <reviews.spark.apache.org>
Delivered-To: mailing list reviews@spark.apache.org
Received: (qmail 3188 invoked by uid 99); 3 Oct 2018 07:59:13 -0000
Received: from git1-us-west.apache.org (HELO git1-us-west.apache.org) (140.211.11.23)
    by apache.org (qpsmtpd/0.29) with ESMTP; Wed, 03 Oct 2018 07:59:13 +0000
Received: by git1-us-west.apache.org (ASF Mail Server at git1-us-west.apache.org, from userid 33)
	id 5D138DFEAD; Wed,  3 Oct 2018 07:59:13 +0000 (UTC)
From: mgaido91 <git@git.apache.org>
To: reviews@spark.apache.org
Reply-To: reviews@spark.apache.org
References: <git-pr-10989-spark@git.apache.org>
In-Reply-To: <git-pr-10989-spark@git.apache.org>
Subject: [GitHub] spark pull request #10989: [SPARK-12798] [SQL] generated BroadcastHashJoin
Content-Type: text/plain
Message-Id: <20181003075913.5D138DFEAD@git1-us-west.apache.org>
Date: Wed,  3 Oct 2018 07:59:13 +0000 (UTC)

Github user mgaido91 commented on a diff in the pull request:

    https://github.com/apache/spark/pull/10989#discussion_r222216289
  
    --- Diff: sql/core/src/main/scala/org/apache/spark/sql/execution/joins/BroadcastHashJoin.scala ---
    @@ -117,6 +120,87 @@ case class BroadcastHashJoin(
           hashJoin(streamedIter, numStreamedRows, hashedRelation, numOutputRows)
         }
       }
    +
    +  // the term for hash relation
    +  private var relationTerm: String = _
    +
    +  override def upstream(): RDD[InternalRow] = {
    +    streamedPlan.asInstanceOf[CodegenSupport].upstream()
    +  }
    +
    +  override def doProduce(ctx: CodegenContext): String = {
    +    // create a name for HashRelation
    +    val broadcastRelation = Await.result(broadcastFuture, timeout)
    +    val broadcast = ctx.addReferenceObj("broadcast", broadcastRelation)
    +    relationTerm = ctx.freshName("relation")
    +    // TODO: create specialized HashRelation for single join key
    +    val clsName = classOf[UnsafeHashedRelation].getName
    +    ctx.addMutableState(clsName, relationTerm,
    +      s"""
    +         | $relationTerm = ($clsName) $broadcast.value();
    +         | incPeakExecutionMemory($relationTerm.getUnsafeSize());
    +       """.stripMargin)
    +
    +    s"""
    +       | ${streamedPlan.asInstanceOf[CodegenSupport].produce(ctx, this)}
    +     """.stripMargin
    +  }
    +
    +  override def doConsume(ctx: CodegenContext, input: Seq[ExprCode]): String = {
    +    // generate the key as UnsafeRow
    +    ctx.currentVars = input
    +    val keyExpr = streamedKeys.map(BindReferences.bindReference(_, streamedPlan.output))
    +    val keyVal = GenerateUnsafeProjection.createCode(ctx, keyExpr)
    +    val keyTerm = keyVal.value
    +    val anyNull = if (keyExpr.exists(_.nullable)) s"$keyTerm.anyNull()" else "false"
    +
    +    // find the matches from HashedRelation
    +    val matches = ctx.freshName("matches")
    +    val bufferType = classOf[CompactBuffer[UnsafeRow]].getName
    +    val i = ctx.freshName("i")
    +    val size = ctx.freshName("size")
    +    val row = ctx.freshName("row")
    +
    +    // create variables for output
    +    ctx.currentVars = null
    +    ctx.INPUT_ROW = row
    +    val buildColumns = buildPlan.output.zipWithIndex.map { case (a, i) =>
    +      BoundReference(i, a.dataType, a.nullable).gen(ctx)
    +    }
    +    val resultVars = buildSide match {
    +      case BuildLeft => buildColumns ++ input
    +      case BuildRight => input ++ buildColumns
    +    }
    +
    +    val ouputCode = if (condition.isDefined) {
    +      // filter the output via condition
    +      ctx.currentVars = resultVars
    +      val ev = BindReferences.bindReference(condition.get, this.output).gen(ctx)
    +      s"""
    +         | ${ev.code}
    +         | if (!${ev.isNull} && ${ev.value}) {
    +         |   ${consume(ctx, resultVars)}
    +         | }
    +       """.stripMargin
    +    } else {
    +      consume(ctx, resultVars)
    +    }
    +
    +    s"""
    +       | // generate join key
    +       | ${keyVal.code}
    +       | // find matches from HashRelation
    +       | $bufferType $matches = $anyNull ? null : ($bufferType) $relationTerm.get($keyTerm);
    +       | if ($matches != null) {
    +       |   int $size = $matches.size();
    +       |   for (int $i = 0; $i < $size; $i++) {
    --- End diff --
    
    mmmh, maybe I see your point now. I think it may be feasible but a bit complex. We might keep a global variable for `$matches` and read from it in the produce method. This is what you are saying right? Just changing here wouldn't work IMHO because in the next iteration the keys are changed...


---

---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org