arrow-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From w...@apache.org
Subject arrow git commit: ARROW-380: [Java] optimize null count when serializing vectors
Date Sat, 17 Dec 2016 23:05:13 GMT
Repository: arrow
Updated Branches:
  refs/heads/master cfb544de2 -> a2ead2f64


ARROW-380: [Java] optimize null count when serializing vectors

I added `getNullCount()` to the `Accessor` interface. I don't know if this is the best way
to achieve this. Hence, we'll have both ValueCount and NullCount immediately accessible from
the accessor.

Author: Mohamed Zenadi <mohamed@zenadi.com>

Closes #207 from zeapo/ARROW-380 and squashes the following commits:

27c0342 [Mohamed Zenadi] implement missing getNullCount implementation for NullableMapVector
9ff3355 [Mohamed Zenadi] implement the base case of getNullCount()
ad3f24a [Mohamed Zenadi] the used size is not the same as the allocated size
e858432 [Mohamed Zenadi] use the valueCount as basis for counting nulls rather than allocated
bytes
0530c85 [Mohamed Zenadi] test the null count byte by byte and the odd length case
95667d3 [Mohamed Zenadi] fix the comment
b12a2a5 [Mohamed Zenadi] fix wrong value returned by the method
f264250 [Mohamed Zenadi] use getNullCount() rather than isNull
baca69c [Mohamed Zenadi] Add methods to count the number null values in the vector


Project: http://git-wip-us.apache.org/repos/asf/arrow/repo
Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/a2ead2f6
Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/a2ead2f6
Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/a2ead2f6

Branch: refs/heads/master
Commit: a2ead2f646baad78de01fcb1b90f710fa1eae70b
Parents: cfb544d
Author: Mohamed Zenadi <mohamed@zenadi.com>
Authored: Sat Dec 17 18:05:04 2016 -0500
Committer: Wes McKinney <wes.mckinney@twosigma.com>
Committed: Sat Dec 17 18:05:04 2016 -0500

----------------------------------------------------------------------
 .../apache/arrow/vector/BaseValueVector.java    | 12 ++++++
 .../java/org/apache/arrow/vector/BitVector.java | 22 ++++++++++
 .../org/apache/arrow/vector/ValueVector.java    |  5 +++
 .../org/apache/arrow/vector/VectorUnloader.java | 10 +----
 .../org/apache/arrow/vector/ZeroVector.java     |  5 +++
 .../apache/arrow/vector/complex/ListVector.java |  5 +++
 .../arrow/vector/complex/NullableMapVector.java |  5 +++
 .../apache/arrow/vector/TestValueVector.java    | 44 ++++++++++++++++++++
 8 files changed, 99 insertions(+), 9 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/arrow/blob/a2ead2f6/java/vector/src/main/java/org/apache/arrow/vector/BaseValueVector.java
----------------------------------------------------------------------
diff --git a/java/vector/src/main/java/org/apache/arrow/vector/BaseValueVector.java b/java/vector/src/main/java/org/apache/arrow/vector/BaseValueVector.java
index 884cdf0..2a61403 100644
--- a/java/vector/src/main/java/org/apache/arrow/vector/BaseValueVector.java
+++ b/java/vector/src/main/java/org/apache/arrow/vector/BaseValueVector.java
@@ -72,6 +72,18 @@ public abstract class BaseValueVector implements ValueVector {
     public boolean isNull(int index) {
       return false;
     }
+
+    @Override
+    // override this in case your implementation is faster, see BitVector
+    public int getNullCount() {
+      int nullCount = 0;
+      for (int i = 0; i < getValueCount(); i++) {
+        if (isNull(i)) {
+          nullCount ++;
+        }
+      }
+      return nullCount;
+    }
   }
 
   public abstract static class BaseMutator implements ValueVector.Mutator {

http://git-wip-us.apache.org/repos/asf/arrow/blob/a2ead2f6/java/vector/src/main/java/org/apache/arrow/vector/BitVector.java
----------------------------------------------------------------------
diff --git a/java/vector/src/main/java/org/apache/arrow/vector/BitVector.java b/java/vector/src/main/java/org/apache/arrow/vector/BitVector.java
index 26eeafd..9beabcb 100644
--- a/java/vector/src/main/java/org/apache/arrow/vector/BitVector.java
+++ b/java/vector/src/main/java/org/apache/arrow/vector/BitVector.java
@@ -379,6 +379,28 @@ public final class BitVector extends BaseDataValueVector implements FixedWidthVe
       holder.isSet = 1;
       holder.value = get(index);
     }
+
+    /**
+     * Get the number nulls, this correspond to the number of bits set to 0 in the vector
+     * @return the number of bits set to 0
+     */
+    @Override
+    public final int getNullCount() {
+      int count = 0;
+      int sizeInBytes = getSizeFromCount(valueCount);
+
+      for (int i = 0; i < sizeInBytes; ++i) {
+        byte byteValue = data.getByte(i);
+        // Java uses two's complement binary representation, hence 11111111_b which is -1
when converted to Int
+        // will have 32bits set to 1. Masking the MSB and then adding it back solves the
issue.
+        count += Integer.bitCount(byteValue & 0x7F) - (byteValue >> 7);
+      }
+      int nullCount = (sizeInBytes * 8) - count;
+      // if the valueCount is not a multiple of 8, the bits on the right were counted as
null bits
+      int remainder = valueCount % 8;
+      nullCount -= remainder == 0 ? 0 : 8 - remainder;
+      return nullCount;
+    }
   }
 
   /**

http://git-wip-us.apache.org/repos/asf/arrow/blob/a2ead2f6/java/vector/src/main/java/org/apache/arrow/vector/ValueVector.java
----------------------------------------------------------------------
diff --git a/java/vector/src/main/java/org/apache/arrow/vector/ValueVector.java b/java/vector/src/main/java/org/apache/arrow/vector/ValueVector.java
index 5b24a41..ff7b94c 100644
--- a/java/vector/src/main/java/org/apache/arrow/vector/ValueVector.java
+++ b/java/vector/src/main/java/org/apache/arrow/vector/ValueVector.java
@@ -180,6 +180,11 @@ public interface ValueVector extends Closeable, Iterable<ValueVector>
{
      * Returns true if the value at the given index is null, false otherwise.
      */
     boolean isNull(int index);
+
+    /**
+     * Returns the number of null values
+     */
+    int getNullCount();
   }
 
   /**

http://git-wip-us.apache.org/repos/asf/arrow/blob/a2ead2f6/java/vector/src/main/java/org/apache/arrow/vector/VectorUnloader.java
----------------------------------------------------------------------
diff --git a/java/vector/src/main/java/org/apache/arrow/vector/VectorUnloader.java b/java/vector/src/main/java/org/apache/arrow/vector/VectorUnloader.java
index e246218..92d8cb0 100644
--- a/java/vector/src/main/java/org/apache/arrow/vector/VectorUnloader.java
+++ b/java/vector/src/main/java/org/apache/arrow/vector/VectorUnloader.java
@@ -60,15 +60,7 @@ public class VectorUnloader {
 
   private void appendNodes(FieldVector vector, List<ArrowFieldNode> nodes, List<ArrowBuf>
buffers) {
     Accessor accessor = vector.getAccessor();
-    int nullCount = 0;
-    // TODO: should not have to do that
-    // we can do that a lot more efficiently (for example with Long.bitCount(i))
-    for (int i = 0; i < accessor.getValueCount(); i++) {
-      if (accessor.isNull(i)) {
-        nullCount ++;
-      }
-    }
-    nodes.add(new ArrowFieldNode(accessor.getValueCount(), nullCount));
+    nodes.add(new ArrowFieldNode(accessor.getValueCount(), accessor.getNullCount()));
     List<ArrowBuf> fieldBuffers = vector.getFieldBuffers();
     List<ArrowVectorType> expectedBuffers = vector.getField().getTypeLayout().getVectorTypes();
     if (fieldBuffers.size() != expectedBuffers.size()) {

http://git-wip-us.apache.org/repos/asf/arrow/blob/a2ead2f6/java/vector/src/main/java/org/apache/arrow/vector/ZeroVector.java
----------------------------------------------------------------------
diff --git a/java/vector/src/main/java/org/apache/arrow/vector/ZeroVector.java b/java/vector/src/main/java/org/apache/arrow/vector/ZeroVector.java
index c2482ad..e163b4f 100644
--- a/java/vector/src/main/java/org/apache/arrow/vector/ZeroVector.java
+++ b/java/vector/src/main/java/org/apache/arrow/vector/ZeroVector.java
@@ -69,6 +69,11 @@ public class ZeroVector implements FieldVector {
     public boolean isNull(int index) {
       return true;
     }
+
+    @Override
+    public int getNullCount() {
+      return 0;
+    }
   };
 
   private final Mutator defaultMutator = new Mutator() {

http://git-wip-us.apache.org/repos/asf/arrow/blob/a2ead2f6/java/vector/src/main/java/org/apache/arrow/vector/complex/ListVector.java
----------------------------------------------------------------------
diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/ListVector.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/ListVector.java
index 461bdbc..074b0aa 100644
--- a/java/vector/src/main/java/org/apache/arrow/vector/complex/ListVector.java
+++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/ListVector.java
@@ -310,6 +310,11 @@ public class ListVector extends BaseRepeatedValueVector implements FieldVector
{
     public boolean isNull(int index) {
       return bits.getAccessor().get(index) == 0;
     }
+
+    @Override
+    public int getNullCount() {
+      return bits.getAccessor().getNullCount();
+    }
   }
 
   public class Mutator extends BaseRepeatedMutator {

http://git-wip-us.apache.org/repos/asf/arrow/blob/a2ead2f6/java/vector/src/main/java/org/apache/arrow/vector/complex/NullableMapVector.java
----------------------------------------------------------------------
diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/NullableMapVector.java
b/java/vector/src/main/java/org/apache/arrow/vector/complex/NullableMapVector.java
index f0ddf27..5fa3530 100644
--- a/java/vector/src/main/java/org/apache/arrow/vector/complex/NullableMapVector.java
+++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/NullableMapVector.java
@@ -204,6 +204,11 @@ public class NullableMapVector extends MapVector implements FieldVector
{
     }
 
     @Override
+    public int getNullCount() {
+      return bits.getAccessor().getNullCount();
+    }
+
+    @Override
     public boolean isNull(int index) {
       return isSet(index) == 0;
     }

http://git-wip-us.apache.org/repos/asf/arrow/blob/a2ead2f6/java/vector/src/test/java/org/apache/arrow/vector/TestValueVector.java
----------------------------------------------------------------------
diff --git a/java/vector/src/test/java/org/apache/arrow/vector/TestValueVector.java b/java/vector/src/test/java/org/apache/arrow/vector/TestValueVector.java
index 124452e..b33919b 100644
--- a/java/vector/src/test/java/org/apache/arrow/vector/TestValueVector.java
+++ b/java/vector/src/test/java/org/apache/arrow/vector/TestValueVector.java
@@ -288,6 +288,7 @@ public class TestValueVector {
     try (final BitVector vector = new BitVector(EMPTY_SCHEMA_PATH, allocator)) {
       final BitVector.Mutator m = vector.getMutator();
       vector.allocateNew(1024);
+      m.setValueCount(1024);
 
       // Put and set a few values
       m.set(0, 1);
@@ -295,12 +296,16 @@ public class TestValueVector {
       m.set(100, 0);
       m.set(1022, 1);
 
+      m.setValueCount(1024);
+
       final BitVector.Accessor accessor = vector.getAccessor();
       assertEquals(1, accessor.get(0));
       assertEquals(0, accessor.get(1));
       assertEquals(0, accessor.get(100));
       assertEquals(1, accessor.get(1022));
 
+      assertEquals(1022, accessor.getNullCount());
+
       // test setting the same value twice
       m.set(0, 1);
       m.set(0, 1);
@@ -315,8 +320,47 @@ public class TestValueVector {
       assertEquals(0, accessor.get(0));
       assertEquals(1, accessor.get(1));
 
+      // should not change
+      assertEquals(1022, accessor.getNullCount());
+
       // Ensure unallocated space returns 0
       assertEquals(0, accessor.get(3));
+
+      // unset the previously set bits
+      m.set(1, 0);
+      m.set(1022, 0);
+      // this should set all the array to 0
+      assertEquals(1024, accessor.getNullCount());
+
+      // set all the array to 1
+      for (int i = 0; i < 1024; ++i) {
+        assertEquals(1024 - i, accessor.getNullCount());
+        m.set(i, 1);
+      }
+
+      assertEquals(0, accessor.getNullCount());
+
+      vector.allocateNew(1015);
+      m.setValueCount(1015);
+
+      // ensure it has been zeroed
+      assertEquals(1015, accessor.getNullCount());
+
+      m.set(0, 1);
+      m.set(1014, 1); // ensure that the last item of the last byte is allocated
+
+      assertEquals(1013, accessor.getNullCount());
+
+      vector.zeroVector();
+      assertEquals(1015, accessor.getNullCount());
+
+      // set all the array to 1
+      for (int i = 0; i < 1015; ++i) {
+        assertEquals(1015 - i, accessor.getNullCount());
+        m.set(i, 1);
+      }
+
+      assertEquals(0, accessor.getNullCount());
     }
   }
 


Mime
View raw message