spark-reviews mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From kiszk <...@git.apache.org>
Subject [GitHub] spark issue #13758: [SPARK-16043][SQL] Prepare GenericArrayData implementati...
Date Mon, 15 Aug 2016 15:59:59 GMT
Github user kiszk commented on the issue:

    https://github.com/apache/spark/pull/13758
  
    I am sorry for missing something in my description. [The PR](https://github.com/apache/spark/pull/13680)
for `UnsafeArrayData` and this PR address different issues. [The PR](https://github.com/apache/spark/pull/13680)
for `UnsafeArrayData` can improve the `InternalRow` representation of an array. This PR can improve
the projection of an array in generated code.
    
    Here is a use case. When we execute the following one-line program, we currently get
the following code. There is an issue at line 86. While the type of ```mapelements_value```
is ```int[]```, the current constructor of ```GenericArrayData``` stores it into an ```Object[]```
with boxing. This is a very high-cost operation. This PR avoids this boxing. As a result, this
PR can improve the performance of a Dataset program with an array.
    
    I think that this is not an edge case. This will happen in mllib, as described [here](https://issues.apache.org/jira/browse/SPARK-16070).
What do you think?
    
    ```scala
    sparkContext.parallelize(Seq(Array(1)), 1).toDS.map(e => e).show
    ```
    
    ```java
    /* 038 */   protected void processNext() throws java.io.IOException {
    /* 039 */     while (inputadapter_input.hasNext()) {
    /* 040 */       InternalRow inputadapter_row = (InternalRow) inputadapter_input.next();
    /* 041 */       boolean inputadapter_isNull = inputadapter_row.isNullAt(0);
    /* 042 */       ArrayData inputadapter_value = inputadapter_isNull ? null : (inputadapter_row.getArray(0));
    /* 043 */
    /* 044 */       boolean deserializetoobject_isNull1 = inputadapter_isNull;
    /* 045 */       ArrayData deserializetoobject_value1 = null;
    /* 046 */       if (!inputadapter_isNull) {
    /* 047 */         final int deserializetoobject_n = inputadapter_value.numElements();
    /* 048 */         final Object[] deserializetoobject_values = new Object[deserializetoobject_n];
    /* 049 */         for (int deserializetoobject_j = 0; deserializetoobject_j < deserializetoobject_n;
deserializetoobject_j ++) {
    /* 050 */           if (inputadapter_value.isNullAt(deserializetoobject_j)) {
    /* 051 */             deserializetoobject_values[deserializetoobject_j] = null;
    /* 052 */           } else {
    /* 053 */             boolean deserializetoobject_feNull = false;
    /* 054 */             int deserializetoobject_fePrim =
    /* 055 */             inputadapter_value.getInt(deserializetoobject_j);
    /* 056 */
    /* 057 */             boolean deserializetoobject_teNull = deserializetoobject_feNull;
    /* 058 */             int deserializetoobject_tePrim = -1;
    /* 059 */             if (!deserializetoobject_feNull) {
    /* 060 */               deserializetoobject_tePrim = deserializetoobject_fePrim;
    /* 061 */             }
    /* 062 */
    /* 063 */             if (deserializetoobject_teNull) {
    /* 064 */               deserializetoobject_values[deserializetoobject_j] = null;
    /* 065 */             } else {
    /* 066 */               deserializetoobject_values[deserializetoobject_j] = deserializetoobject_tePrim;
    /* 067 */             }
    /* 068 */           }
    /* 069 */         }
    /* 070 */         deserializetoobject_value1 = new org.apache.spark.sql.catalyst.util.GenericArrayData(deserializetoobject_values);
    /* 071 */
    /* 072 */       }
    /* 073 */
    /* 074 */       boolean deserializetoobject_isNull = deserializetoobject_isNull1;
    /* 075 */       final int[] deserializetoobject_value = deserializetoobject_isNull ? null
: (int[]) deserializetoobject_value1.toIntArray();
    /* 076 */       deserializetoobject_isNull = deserializetoobject_value == null;
    /* 077 */
    /* 078 */       Object mapelements_obj = ((Expression) references[0]).eval(null);
    /* 079 */       scala.Function1 mapelements_value1 = (scala.Function1) mapelements_obj;
    /* 080 */
    /* 081 */       boolean mapelements_isNull = false || deserializetoobject_isNull;
    /* 082 */       final int[] mapelements_value = mapelements_isNull ? null : (int[]) mapelements_value1.apply(deserializetoobject_value);
    /* 083 */       mapelements_isNull = mapelements_value == null;
    /* 084 */
    /* 085 */       final boolean serializefromobject_isNull = mapelements_isNull;
    /* 086 */       final ArrayData serializefromobject_value = serializefromobject_isNull
? null : new org.apache.spark.sql.catalyst.util.GenericArrayData(mapelements_value);
    /* 087 */       serializefromobject_holder.reset();
    /* 088 */
    /* 089 */       serializefromobject_rowWriter.zeroOutNullBytes();
    /* 090 */
    /* 091 */       if (serializefromobject_isNull) {
    /* 092 */         serializefromobject_rowWriter.setNullAt(0);
    /* 093 */       } else {
    /* 094 */         // Remember the current cursor so that we can calculate how many bytes
are
    /* 095 */         // written later.
    /* 096 */         final int serializefromobject_tmpCursor = serializefromobject_holder.cursor;
    /* 097 */
    /* 098 */         if (serializefromobject_value instanceof UnsafeArrayData) {
    /* 099 */           final int serializefromobject_sizeInBytes = ((UnsafeArrayData) serializefromobject_value).getSizeInBytes();
    /* 100 */           // grow the global buffer before writing data.
    /* 101 */           serializefromobject_holder.grow(serializefromobject_sizeInBytes);
    /* 102 */           ((UnsafeArrayData) serializefromobject_value).writeToMemory(serializefromobject_holder.buffer,
serializefromobject_holder.cursor);
    /* 103 */           serializefromobject_holder.cursor += serializefromobject_sizeInBytes;
    /* 104 */
    /* 105 */         } else {
    /* 106 */           final int serializefromobject_numElements = serializefromobject_value.numElements();
    /* 107 */           serializefromobject_arrayWriter.initialize(serializefromobject_holder,
serializefromobject_numElements, 4);
    /* 108 */
    /* 109 */           for (int serializefromobject_index = 0; serializefromobject_index
< serializefromobject_numElements; serializefromobject_index++) {
    /* 110 */             if (serializefromobject_value.isNullAt(serializefromobject_index))
{
    /* 111 */               serializefromobject_arrayWriter.setNullInt(serializefromobject_index);
    /* 112 */             } else {
    /* 113 */               final int serializefromobject_element = serializefromobject_value.getInt(serializefromobject_index);
    /* 114 */               serializefromobject_arrayWriter.write(serializefromobject_index,
serializefromobject_element);
    /* 115 */             }
    /* 116 */           }
    /* 117 */         }
    /* 118 */
    /* 119 */         serializefromobject_rowWriter.setOffsetAndSize(0, serializefromobject_tmpCursor,
serializefromobject_holder.cursor - serializefromobject_tmpCursor);
    /* 120 */         serializefromobject_rowWriter.alignToWords(serializefromobject_holder.cursor
- serializefromobject_tmpCursor);
    /* 121 */       }
    /* 122 */       serializefromobject_result.setTotalSize(serializefromobject_holder.totalSize());
    /* 123 */       append(serializefromobject_result);
    /* 124 */       if (shouldStop()) return;
    /* 125 */     }
    /* 126 */   }
    ```


---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at infrastructure@apache.org or file a JIRA ticket
with INFRA.
---

---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org


Mime
View raw message