spark-reviews mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From GitBox <...@apache.org>
Subject [GitHub] [spark] jiangxb1987 commented on a change in pull request #27968: [SPARK-31202][CORE]Improve SizeEstimator for AppendOnlyMap
Date Tue, 24 Mar 2020 06:00:41 GMT
jiangxb1987 commented on a change in pull request #27968: [SPARK-31202][CORE]Improve SizeEstimator for AppendOnlyMap
URL: https://github.com/apache/spark/pull/27968#discussion_r396917895
 
 

 ##########
 File path: core/src/main/scala/org/apache/spark/util/SizeEstimator.scala
 ##########
 @@ -290,6 +306,88 @@ object SizeEstimator extends Logging {
     size
   }
 
+
+  /** Visit AppendOnlyMap data field which stored all the KVs, we handle this field separately
+   *  because the underlying type of the elems of this array is different, and their size may vary
+   *  significantly, for example, the value may be an array-like buffer to store merged or grouped
+   *  values for aggregation.
+   * */
+  private def visitKVDataArray(
+      data: Array[AnyRef],
+      keyPositions: java.util.BitSet,
+      totalValueElements: Int,
+      state: SearchState): Unit = {
+    val length = data.length
+    var arrSize: Long = alignSize(objectSize + INT_SIZE)
+    state.size += arrSize
+    state.size += alignSize((length - keyPositions.size) * pointerSize)
+
+    if (length <= ARRAY_SIZE_FOR_SAMPLING) {
+      for (e <- data) {
+        state.enqueue(e)
+      }
+    } else {
+      val rand = new Random(42)
+      val drawn = new OpenHashSet[Int](2 * ARRAY_SAMPLE_SIZE)
+      val (numKeys1, keySize1, numValueElements1, valueSize1) =
+        sampleKVDataArray(data, keyPositions, state, rand, drawn, length)
+      val (numKeys2, keySize2, numValueElements2, valueSize2) =
+        sampleKVDataArray(data, keyPositions, state, rand, drawn, length)
+      val (_, keySizeForMax, numKeysForMin, keySizeForMin) = if (keySize1 > keySize2)
{
+        (numKeys1, keySize1, numKeys2, keySize2)
+      } else (numKeys2, keySize2, numKeys1, keySize1)
+      val keySize = keySizeForMax + (keySizeForMin *
 
 Review comment:
   What does this try to do?

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
users@infra.apache.org


With regards,
Apache Git Services

---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org


Mime
View raw message