spark-reviews mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From GitBox <...@apache.org>
Subject [GitHub] [spark] HyukjinKwon commented on a change in pull request #26279: [SPARK-28382][SQL] Add array function - unnest
Date Fri, 15 Nov 2019 07:42:22 GMT
HyukjinKwon commented on a change in pull request #26279: [SPARK-28382][SQL] Add array function - unnest
URL: https://github.com/apache/spark/pull/26279#discussion_r346694150
 
 

 ##########
 File path: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/generators.scala
 ##########
 @@ -343,6 +344,87 @@ abstract class ExplodeBase extends UnaryExpression with CollectionGenerator with
   }
 }
 
+@ExpressionDescription(
+  usage = "_FUNC_(expr) - Separates the elements of array `expr` into multiple rows recursively.",
+  examples = """
+    Examples:
+      > SELECT _FUNC_(array(10, 20));
+       10
+       20
+      > SELECT _FUNC_(a) FROM VALUES (array(1,2)), (array(3,4)) AS v1(a);
+       1
+       2
+       3
+       4
+      > SELECT _FUNC_(a) FROM VALUES (array(array(1,2), array(3,4))) AS v1(a);
+       1
+       2
+       3
+       4
+  """,
+  since = "3.0.0")
+case class UnNest(child: Expression) extends UnaryExpression with Generator with CodegenFallback {
+
+  override def prettyName: String = "unnest"
+
+  /**
+   * Output schema of the generator: a single nullable field named after this
+   * expression, typed as the innermost (non-array) element type of the child.
+   */
+  override def elementSchema: StructType = {
+    val leafType = getLeafDataType(child.dataType)
+    new StructType().add(prettyName, leafType, nullable = true)
+  }
+
+  // TODO: multidimensional arrays must have array expressions with matching dimensions
+  /**
+   * Validates the child's type. Only arrays may be unnested; for nested arrays,
+   * every array level whose `containsNull` is set must hold atomic elements,
+   * because a null inner array cannot be flattened recursively.
+   */
+  override def checkInputDataTypes(): TypeCheckResult = child.dataType match {
+    case ArrayType(_: NullType, _) => TypeCheckResult.TypeCheckSuccess
+    case _: ArrayType =>
+      // Walk each nesting level with tail recursion instead of a var/while scan
+      // with casts and an early `return`.
+      @scala.annotation.tailrec
+      def checkLevel(dt: DataType): TypeCheckResult = dt match {
+        case ArrayType(elementType, containsNull) =>
+          if (containsNull && !elementType.isInstanceOf[AtomicType]) {
+            TypeCheckResult.TypeCheckFailure("multidimensional arrays must not contain nulls")
+          } else {
+            checkLevel(elementType)
+          }
+        case _ =>
+          // All levels are valid; require the input expressions to share one type.
+          TypeUtils.checkForSameTypeInputExpr(child.children.map(_.dataType), s"function $prettyName")
+      }
+      checkLevel(child.dataType)
+
+    case _ =>
+      TypeCheckResult.TypeCheckFailure(
+        s"input to function unnest should be array type not ${child.dataType.catalogString}")
+  }
+
+  /** Strips every array layer off `typ`, yielding the innermost non-array type. */
+  @scala.annotation.tailrec
+  private def getLeafDataType(typ: DataType): DataType = typ match {
+    case arrayType: ArrayType => getLeafDataType(arrayType.elementType)
+    case leafType => leafType
+  }
+
+  /** Unwraps exactly one array level; any non-array type is returned unchanged. */
+  private def getElementTypeOrItself(typ: DataType): DataType = typ match {
+    case arrayType: ArrayType => arrayType.elementType
+    case nonArray => nonArray
+  }
+
+  override def eval(input: InternalRow): TraversableOnce[InternalRow] = {
+    var inputArray = child.eval(input).asInstanceOf[ArrayData]
+    if (inputArray == null) {
+      Nil
+    } else {
+      val rows = ArrayBuffer[InternalRow]()
+      var currentType = child.dataType
+      while (currentType.isInstanceOf[ArrayType]) {
+        val currentElementType = getElementTypeOrItself(currentType)
+        if (!currentElementType.isInstanceOf[ArrayType]) {
+          inputArray.foreach(currentElementType, (_, e) => rows += InternalRow(e))
 
 Review comment:
   Can we avoid Scala's foreach in such a critical path (see https://github.com/databricks/scala-style-guide#traversal-and-zipwithindex)?
 We can at least switch to a regular for loop instead.

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
users@infra.apache.org


With regards,
Apache Git Services

---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org


Mime
View raw message