parquet-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From w...@apache.org
Subject [1/2] parquet-cpp git commit: PARQUET-671: performance improvements for rle/bit-packed decoding
Date Tue, 02 Aug 2016 23:38:31 GMT
Repository: parquet-cpp
Updated Branches:
  refs/heads/master 616305cb9 -> 38f0ffd5a


http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/38f0ffd5/src/parquet/util/bpacking.h
----------------------------------------------------------------------
diff --git a/src/parquet/util/bpacking.h b/src/parquet/util/bpacking.h
new file mode 100644
index 0000000..d9ae531
--- /dev/null
+++ b/src/parquet/util/bpacking.h
@@ -0,0 +1,3323 @@
+// This file was modified from its original version for inclusion in parquet-cpp.
+// Original source:
+// https://github.com/lemire/FrameOfReference/blob/6ccaf9e97160f9a3b299e23a8ef739e711ef0c71/src/bpacking.cpp
+// The original copyright notice follows.
+
+/**
+*
+* This code is released under the
+* Apache License Version 2.0 http://www.apache.org/licenses/.
+* (c) Daniel Lemire 2013
+*/
+
+#ifndef PARQUET_UTIL_BPACKING_H
+#define PARQUET_UTIL_BPACKING_H
+
+namespace parquet {
+
+inline const uint32_t* unpack1_32(const uint32_t* in, uint32_t* out) {  // Unpack 32 one-bit values (lowest bit first) from the word *in into out[0..31].
+  *out = ((*in) >> 0) & 1;  // value i is bit i of the input word
+  out++;
+  *out = ((*in) >> 1) & 1;
+  out++;
+  *out = ((*in) >> 2) & 1;
+  out++;
+  *out = ((*in) >> 3) & 1;
+  out++;
+  *out = ((*in) >> 4) & 1;
+  out++;
+  *out = ((*in) >> 5) & 1;
+  out++;
+  *out = ((*in) >> 6) & 1;
+  out++;
+  *out = ((*in) >> 7) & 1;
+  out++;
+  *out = ((*in) >> 8) & 1;
+  out++;
+  *out = ((*in) >> 9) & 1;
+  out++;
+  *out = ((*in) >> 10) & 1;
+  out++;
+  *out = ((*in) >> 11) & 1;
+  out++;
+  *out = ((*in) >> 12) & 1;
+  out++;
+  *out = ((*in) >> 13) & 1;
+  out++;
+  *out = ((*in) >> 14) & 1;
+  out++;
+  *out = ((*in) >> 15) & 1;
+  out++;
+  *out = ((*in) >> 16) & 1;
+  out++;
+  *out = ((*in) >> 17) & 1;
+  out++;
+  *out = ((*in) >> 18) & 1;
+  out++;
+  *out = ((*in) >> 19) & 1;
+  out++;
+  *out = ((*in) >> 20) & 1;
+  out++;
+  *out = ((*in) >> 21) & 1;
+  out++;
+  *out = ((*in) >> 22) & 1;
+  out++;
+  *out = ((*in) >> 23) & 1;
+  out++;
+  *out = ((*in) >> 24) & 1;
+  out++;
+  *out = ((*in) >> 25) & 1;
+  out++;
+  *out = ((*in) >> 26) & 1;
+  out++;
+  *out = ((*in) >> 27) & 1;
+  out++;
+  *out = ((*in) >> 28) & 1;
+  out++;
+  *out = ((*in) >> 29) & 1;
+  out++;
+  *out = ((*in) >> 30) & 1;
+  out++;
+  *out = ((*in) >> 31);  // top bit: nothing lies above it, so no mask is applied
+  ++in;  // the single input word has been fully consumed
+  out++;
+
+  return in;  // the original `in` advanced by one word
+}
+
+inline const uint32_t* unpack2_32(const uint32_t* in, uint32_t* out) {  // Unpack 32 two-bit values (lowest bits first) from in[0..1] into out[0..31].
+  *out = ((*in) >> 0) % (1U << 2);  // `% (1U << 2)` keeps the low 2 bits of the shifted word
+  out++;
+  *out = ((*in) >> 2) % (1U << 2);
+  out++;
+  *out = ((*in) >> 4) % (1U << 2);
+  out++;
+  *out = ((*in) >> 6) % (1U << 2);
+  out++;
+  *out = ((*in) >> 8) % (1U << 2);
+  out++;
+  *out = ((*in) >> 10) % (1U << 2);
+  out++;
+  *out = ((*in) >> 12) % (1U << 2);
+  out++;
+  *out = ((*in) >> 14) % (1U << 2);
+  out++;
+  *out = ((*in) >> 16) % (1U << 2);
+  out++;
+  *out = ((*in) >> 18) % (1U << 2);
+  out++;
+  *out = ((*in) >> 20) % (1U << 2);
+  out++;
+  *out = ((*in) >> 22) % (1U << 2);
+  out++;
+  *out = ((*in) >> 24) % (1U << 2);
+  out++;
+  *out = ((*in) >> 26) % (1U << 2);
+  out++;
+  *out = ((*in) >> 28) % (1U << 2);
+  out++;
+  *out = ((*in) >> 30);  // top 2 bits of the word: no mask needed
+  ++in;  // first word consumed (16 values); continue with the second word
+  out++;
+  *out = ((*in) >> 0) % (1U << 2);
+  out++;
+  *out = ((*in) >> 2) % (1U << 2);
+  out++;
+  *out = ((*in) >> 4) % (1U << 2);
+  out++;
+  *out = ((*in) >> 6) % (1U << 2);
+  out++;
+  *out = ((*in) >> 8) % (1U << 2);
+  out++;
+  *out = ((*in) >> 10) % (1U << 2);
+  out++;
+  *out = ((*in) >> 12) % (1U << 2);
+  out++;
+  *out = ((*in) >> 14) % (1U << 2);
+  out++;
+  *out = ((*in) >> 16) % (1U << 2);
+  out++;
+  *out = ((*in) >> 18) % (1U << 2);
+  out++;
+  *out = ((*in) >> 20) % (1U << 2);
+  out++;
+  *out = ((*in) >> 22) % (1U << 2);
+  out++;
+  *out = ((*in) >> 24) % (1U << 2);
+  out++;
+  *out = ((*in) >> 26) % (1U << 2);
+  out++;
+  *out = ((*in) >> 28) % (1U << 2);
+  out++;
+  *out = ((*in) >> 30);
+  ++in;
+  out++;
+
+  return in;  // the original `in` advanced by two words
+}
+
+inline const uint32_t* unpack3_32(const uint32_t* in, uint32_t* out) {  // Unpack 32 three-bit values (lowest bits first) from in[0..2] into out[0..31].
+  *out = ((*in) >> 0) % (1U << 3);  // `% (1U << 3)` keeps the low 3 bits of the shifted word
+  out++;
+  *out = ((*in) >> 3) % (1U << 3);
+  out++;
+  *out = ((*in) >> 6) % (1U << 3);
+  out++;
+  *out = ((*in) >> 9) % (1U << 3);
+  out++;
+  *out = ((*in) >> 12) % (1U << 3);
+  out++;
+  *out = ((*in) >> 15) % (1U << 3);
+  out++;
+  *out = ((*in) >> 18) % (1U << 3);
+  out++;
+  *out = ((*in) >> 21) % (1U << 3);
+  out++;
+  *out = ((*in) >> 24) % (1U << 3);
+  out++;
+  *out = ((*in) >> 27) % (1U << 3);
+  out++;
+  *out = ((*in) >> 30);  // only 2 bits remain in this word: they are the low bits of the value
+  ++in;  // move to the next input word while `out` still points at the same value
+  *out |= ((*in) % (1U << 1)) << (3 - 1);  // the value's remaining high bit is the low bit of the next word
+  out++;
+  *out = ((*in) >> 1) % (1U << 3);
+  out++;
+  *out = ((*in) >> 4) % (1U << 3);
+  out++;
+  *out = ((*in) >> 7) % (1U << 3);
+  out++;
+  *out = ((*in) >> 10) % (1U << 3);
+  out++;
+  *out = ((*in) >> 13) % (1U << 3);
+  out++;
+  *out = ((*in) >> 16) % (1U << 3);
+  out++;
+  *out = ((*in) >> 19) % (1U << 3);
+  out++;
+  *out = ((*in) >> 22) % (1U << 3);
+  out++;
+  *out = ((*in) >> 25) % (1U << 3);
+  out++;
+  *out = ((*in) >> 28) % (1U << 3);
+  out++;
+  *out = ((*in) >> 31);
+  ++in;
+  *out |= ((*in) % (1U << 2)) << (3 - 2);
+  out++;
+  *out = ((*in) >> 2) % (1U << 3);
+  out++;
+  *out = ((*in) >> 5) % (1U << 3);
+  out++;
+  *out = ((*in) >> 8) % (1U << 3);
+  out++;
+  *out = ((*in) >> 11) % (1U << 3);
+  out++;
+  *out = ((*in) >> 14) % (1U << 3);
+  out++;
+  *out = ((*in) >> 17) % (1U << 3);
+  out++;
+  *out = ((*in) >> 20) % (1U << 3);
+  out++;
+  *out = ((*in) >> 23) % (1U << 3);
+  out++;
+  *out = ((*in) >> 26) % (1U << 3);
+  out++;
+  *out = ((*in) >> 29);  // last value ends exactly at the top of the word: no mask needed
+  ++in;
+  out++;
+
+  return in;  // the original `in` advanced by three words
+}
+
+inline const uint32_t* unpack4_32(const uint32_t* in, uint32_t* out) {  // Unpack 32 four-bit values (lowest bits first) from in[0..3] into out[0..31].
+  *out = ((*in) >> 0) % (1U << 4);  // `% (1U << 4)` keeps the low 4 bits of the shifted word
+  out++;
+  *out = ((*in) >> 4) % (1U << 4);
+  out++;
+  *out = ((*in) >> 8) % (1U << 4);
+  out++;
+  *out = ((*in) >> 12) % (1U << 4);
+  out++;
+  *out = ((*in) >> 16) % (1U << 4);
+  out++;
+  *out = ((*in) >> 20) % (1U << 4);
+  out++;
+  *out = ((*in) >> 24) % (1U << 4);
+  out++;
+  *out = ((*in) >> 28);  // top 4 bits of the word: no mask needed
+  ++in;  // each input word holds exactly 8 values; continue with the next word
+  out++;
+  *out = ((*in) >> 0) % (1U << 4);
+  out++;
+  *out = ((*in) >> 4) % (1U << 4);
+  out++;
+  *out = ((*in) >> 8) % (1U << 4);
+  out++;
+  *out = ((*in) >> 12) % (1U << 4);
+  out++;
+  *out = ((*in) >> 16) % (1U << 4);
+  out++;
+  *out = ((*in) >> 20) % (1U << 4);
+  out++;
+  *out = ((*in) >> 24) % (1U << 4);
+  out++;
+  *out = ((*in) >> 28);
+  ++in;
+  out++;
+  *out = ((*in) >> 0) % (1U << 4);
+  out++;
+  *out = ((*in) >> 4) % (1U << 4);
+  out++;
+  *out = ((*in) >> 8) % (1U << 4);
+  out++;
+  *out = ((*in) >> 12) % (1U << 4);
+  out++;
+  *out = ((*in) >> 16) % (1U << 4);
+  out++;
+  *out = ((*in) >> 20) % (1U << 4);
+  out++;
+  *out = ((*in) >> 24) % (1U << 4);
+  out++;
+  *out = ((*in) >> 28);
+  ++in;
+  out++;
+  *out = ((*in) >> 0) % (1U << 4);
+  out++;
+  *out = ((*in) >> 4) % (1U << 4);
+  out++;
+  *out = ((*in) >> 8) % (1U << 4);
+  out++;
+  *out = ((*in) >> 12) % (1U << 4);
+  out++;
+  *out = ((*in) >> 16) % (1U << 4);
+  out++;
+  *out = ((*in) >> 20) % (1U << 4);
+  out++;
+  *out = ((*in) >> 24) % (1U << 4);
+  out++;
+  *out = ((*in) >> 28);
+  ++in;
+  out++;
+
+  return in;  // the original `in` advanced by four words
+}
+
+inline const uint32_t* unpack5_32(const uint32_t* in, uint32_t* out) {  // Unpack 32 five-bit values (lowest bits first) from in[0..4] into out[0..31].
+  *out = ((*in) >> 0) % (1U << 5);  // `% (1U << 5)` keeps the low 5 bits of the shifted word
+  out++;
+  *out = ((*in) >> 5) % (1U << 5);
+  out++;
+  *out = ((*in) >> 10) % (1U << 5);
+  out++;
+  *out = ((*in) >> 15) % (1U << 5);
+  out++;
+  *out = ((*in) >> 20) % (1U << 5);
+  out++;
+  *out = ((*in) >> 25) % (1U << 5);
+  out++;
+  *out = ((*in) >> 30);  // only 2 bits remain in this word: they are the low bits of the value
+  ++in;  // move to the next input word while `out` still points at the same value
+  *out |= ((*in) % (1U << 3)) << (5 - 3);  // the value's remaining 3 high bits are the low 3 bits of the next word
+  out++;
+  *out = ((*in) >> 3) % (1U << 5);
+  out++;
+  *out = ((*in) >> 8) % (1U << 5);
+  out++;
+  *out = ((*in) >> 13) % (1U << 5);
+  out++;
+  *out = ((*in) >> 18) % (1U << 5);
+  out++;
+  *out = ((*in) >> 23) % (1U << 5);
+  out++;
+  *out = ((*in) >> 28);
+  ++in;
+  *out |= ((*in) % (1U << 1)) << (5 - 1);
+  out++;
+  *out = ((*in) >> 1) % (1U << 5);
+  out++;
+  *out = ((*in) >> 6) % (1U << 5);
+  out++;
+  *out = ((*in) >> 11) % (1U << 5);
+  out++;
+  *out = ((*in) >> 16) % (1U << 5);
+  out++;
+  *out = ((*in) >> 21) % (1U << 5);
+  out++;
+  *out = ((*in) >> 26) % (1U << 5);
+  out++;
+  *out = ((*in) >> 31);
+  ++in;
+  *out |= ((*in) % (1U << 4)) << (5 - 4);
+  out++;
+  *out = ((*in) >> 4) % (1U << 5);
+  out++;
+  *out = ((*in) >> 9) % (1U << 5);
+  out++;
+  *out = ((*in) >> 14) % (1U << 5);
+  out++;
+  *out = ((*in) >> 19) % (1U << 5);
+  out++;
+  *out = ((*in) >> 24) % (1U << 5);
+  out++;
+  *out = ((*in) >> 29);
+  ++in;
+  *out |= ((*in) % (1U << 2)) << (5 - 2);
+  out++;
+  *out = ((*in) >> 2) % (1U << 5);
+  out++;
+  *out = ((*in) >> 7) % (1U << 5);
+  out++;
+  *out = ((*in) >> 12) % (1U << 5);
+  out++;
+  *out = ((*in) >> 17) % (1U << 5);
+  out++;
+  *out = ((*in) >> 22) % (1U << 5);
+  out++;
+  *out = ((*in) >> 27);  // last value ends exactly at the top of the word: no mask needed
+  ++in;
+  out++;
+
+  return in;  // the original `in` advanced by five words
+}
+
+inline const uint32_t* unpack6_32(const uint32_t* in, uint32_t* out) {  // Unpack 32 six-bit values (lowest bits first) from in[0..5] into out[0..31].
+  *out = ((*in) >> 0) % (1U << 6);  // `% (1U << 6)` keeps the low 6 bits of the shifted word
+  out++;
+  *out = ((*in) >> 6) % (1U << 6);
+  out++;
+  *out = ((*in) >> 12) % (1U << 6);
+  out++;
+  *out = ((*in) >> 18) % (1U << 6);
+  out++;
+  *out = ((*in) >> 24) % (1U << 6);
+  out++;
+  *out = ((*in) >> 30);  // only 2 bits remain in this word: they are the low bits of the value
+  ++in;  // move to the next input word while `out` still points at the same value
+  *out |= ((*in) % (1U << 4)) << (6 - 4);  // the value's remaining 4 high bits are the low 4 bits of the next word
+  out++;
+  *out = ((*in) >> 4) % (1U << 6);
+  out++;
+  *out = ((*in) >> 10) % (1U << 6);
+  out++;
+  *out = ((*in) >> 16) % (1U << 6);
+  out++;
+  *out = ((*in) >> 22) % (1U << 6);
+  out++;
+  *out = ((*in) >> 28);
+  ++in;
+  *out |= ((*in) % (1U << 2)) << (6 - 2);
+  out++;
+  *out = ((*in) >> 2) % (1U << 6);
+  out++;
+  *out = ((*in) >> 8) % (1U << 6);
+  out++;
+  *out = ((*in) >> 14) % (1U << 6);
+  out++;
+  *out = ((*in) >> 20) % (1U << 6);
+  out++;
+  *out = ((*in) >> 26);  // value ends exactly at the top of the word: no mask needed
+  ++in;  // 16 values taken from 3 words; the same shift pattern repeats for the next 3 words
+  out++;
+  *out = ((*in) >> 0) % (1U << 6);
+  out++;
+  *out = ((*in) >> 6) % (1U << 6);
+  out++;
+  *out = ((*in) >> 12) % (1U << 6);
+  out++;
+  *out = ((*in) >> 18) % (1U << 6);
+  out++;
+  *out = ((*in) >> 24) % (1U << 6);
+  out++;
+  *out = ((*in) >> 30);
+  ++in;
+  *out |= ((*in) % (1U << 4)) << (6 - 4);
+  out++;
+  *out = ((*in) >> 4) % (1U << 6);
+  out++;
+  *out = ((*in) >> 10) % (1U << 6);
+  out++;
+  *out = ((*in) >> 16) % (1U << 6);
+  out++;
+  *out = ((*in) >> 22) % (1U << 6);
+  out++;
+  *out = ((*in) >> 28);
+  ++in;
+  *out |= ((*in) % (1U << 2)) << (6 - 2);
+  out++;
+  *out = ((*in) >> 2) % (1U << 6);
+  out++;
+  *out = ((*in) >> 8) % (1U << 6);
+  out++;
+  *out = ((*in) >> 14) % (1U << 6);
+  out++;
+  *out = ((*in) >> 20) % (1U << 6);
+  out++;
+  *out = ((*in) >> 26);
+  ++in;
+  out++;
+
+  return in;  // the original `in` advanced by six words
+}
+
+inline const uint32_t* unpack7_32(const uint32_t* in, uint32_t* out) {  // Unpack 32 seven-bit values (lowest bits first) from in[0..6] into out[0..31].
+  *out = ((*in) >> 0) % (1U << 7);  // `% (1U << 7)` keeps the low 7 bits of the shifted word
+  out++;
+  *out = ((*in) >> 7) % (1U << 7);
+  out++;
+  *out = ((*in) >> 14) % (1U << 7);
+  out++;
+  *out = ((*in) >> 21) % (1U << 7);
+  out++;
+  *out = ((*in) >> 28);  // only 4 bits remain in this word: they are the low bits of the value
+  ++in;  // move to the next input word while `out` still points at the same value
+  *out |= ((*in) % (1U << 3)) << (7 - 3);  // the value's remaining 3 high bits are the low 3 bits of the next word
+  out++;
+  *out = ((*in) >> 3) % (1U << 7);
+  out++;
+  *out = ((*in) >> 10) % (1U << 7);
+  out++;
+  *out = ((*in) >> 17) % (1U << 7);
+  out++;
+  *out = ((*in) >> 24) % (1U << 7);
+  out++;
+  *out = ((*in) >> 31);
+  ++in;
+  *out |= ((*in) % (1U << 6)) << (7 - 6);
+  out++;
+  *out = ((*in) >> 6) % (1U << 7);
+  out++;
+  *out = ((*in) >> 13) % (1U << 7);
+  out++;
+  *out = ((*in) >> 20) % (1U << 7);
+  out++;
+  *out = ((*in) >> 27);
+  ++in;
+  *out |= ((*in) % (1U << 2)) << (7 - 2);
+  out++;
+  *out = ((*in) >> 2) % (1U << 7);
+  out++;
+  *out = ((*in) >> 9) % (1U << 7);
+  out++;
+  *out = ((*in) >> 16) % (1U << 7);
+  out++;
+  *out = ((*in) >> 23) % (1U << 7);
+  out++;
+  *out = ((*in) >> 30);
+  ++in;
+  *out |= ((*in) % (1U << 5)) << (7 - 5);
+  out++;
+  *out = ((*in) >> 5) % (1U << 7);
+  out++;
+  *out = ((*in) >> 12) % (1U << 7);
+  out++;
+  *out = ((*in) >> 19) % (1U << 7);
+  out++;
+  *out = ((*in) >> 26);
+  ++in;
+  *out |= ((*in) % (1U << 1)) << (7 - 1);
+  out++;
+  *out = ((*in) >> 1) % (1U << 7);
+  out++;
+  *out = ((*in) >> 8) % (1U << 7);
+  out++;
+  *out = ((*in) >> 15) % (1U << 7);
+  out++;
+  *out = ((*in) >> 22) % (1U << 7);
+  out++;
+  *out = ((*in) >> 29);
+  ++in;
+  *out |= ((*in) % (1U << 4)) << (7 - 4);
+  out++;
+  *out = ((*in) >> 4) % (1U << 7);
+  out++;
+  *out = ((*in) >> 11) % (1U << 7);
+  out++;
+  *out = ((*in) >> 18) % (1U << 7);
+  out++;
+  *out = ((*in) >> 25);  // last value ends exactly at the top of the word: no mask needed
+  ++in;
+  out++;
+
+  return in;  // the original `in` advanced by seven words
+}
+
+inline const uint32_t* unpack8_32(const uint32_t* in, uint32_t* out) {  // Unpack 32 eight-bit values (lowest bits first) from in[0..7] into out[0..31].
+  *out = ((*in) >> 0) % (1U << 8);  // `% (1U << 8)` keeps the low 8 bits of the shifted word
+  out++;
+  *out = ((*in) >> 8) % (1U << 8);
+  out++;
+  *out = ((*in) >> 16) % (1U << 8);
+  out++;
+  *out = ((*in) >> 24);  // top 8 bits of the word: no mask needed
+  ++in;  // each input word holds exactly 4 values; continue with the next word
+  out++;
+  *out = ((*in) >> 0) % (1U << 8);
+  out++;
+  *out = ((*in) >> 8) % (1U << 8);
+  out++;
+  *out = ((*in) >> 16) % (1U << 8);
+  out++;
+  *out = ((*in) >> 24);
+  ++in;
+  out++;
+  *out = ((*in) >> 0) % (1U << 8);
+  out++;
+  *out = ((*in) >> 8) % (1U << 8);
+  out++;
+  *out = ((*in) >> 16) % (1U << 8);
+  out++;
+  *out = ((*in) >> 24);
+  ++in;
+  out++;
+  *out = ((*in) >> 0) % (1U << 8);
+  out++;
+  *out = ((*in) >> 8) % (1U << 8);
+  out++;
+  *out = ((*in) >> 16) % (1U << 8);
+  out++;
+  *out = ((*in) >> 24);
+  ++in;
+  out++;
+  *out = ((*in) >> 0) % (1U << 8);
+  out++;
+  *out = ((*in) >> 8) % (1U << 8);
+  out++;
+  *out = ((*in) >> 16) % (1U << 8);
+  out++;
+  *out = ((*in) >> 24);
+  ++in;
+  out++;
+  *out = ((*in) >> 0) % (1U << 8);
+  out++;
+  *out = ((*in) >> 8) % (1U << 8);
+  out++;
+  *out = ((*in) >> 16) % (1U << 8);
+  out++;
+  *out = ((*in) >> 24);
+  ++in;
+  out++;
+  *out = ((*in) >> 0) % (1U << 8);
+  out++;
+  *out = ((*in) >> 8) % (1U << 8);
+  out++;
+  *out = ((*in) >> 16) % (1U << 8);
+  out++;
+  *out = ((*in) >> 24);
+  ++in;
+  out++;
+  *out = ((*in) >> 0) % (1U << 8);
+  out++;
+  *out = ((*in) >> 8) % (1U << 8);
+  out++;
+  *out = ((*in) >> 16) % (1U << 8);
+  out++;
+  *out = ((*in) >> 24);
+  ++in;
+  out++;
+
+  return in;  // the original `in` advanced by eight words
+}
+
+inline const uint32_t* unpack9_32(const uint32_t* in, uint32_t* out) {  // Unpack 32 nine-bit values (lowest bits first) from in[0..8] into out[0..31].
+  *out = ((*in) >> 0) % (1U << 9);  // `% (1U << 9)` keeps the low 9 bits of the shifted word
+  out++;
+  *out = ((*in) >> 9) % (1U << 9);
+  out++;
+  *out = ((*in) >> 18) % (1U << 9);
+  out++;
+  *out = ((*in) >> 27);  // only 5 bits remain in this word: they are the low bits of the value
+  ++in;  // move to the next input word while `out` still points at the same value
+  *out |= ((*in) % (1U << 4)) << (9 - 4);  // the value's remaining 4 high bits are the low 4 bits of the next word
+  out++;
+  *out = ((*in) >> 4) % (1U << 9);
+  out++;
+  *out = ((*in) >> 13) % (1U << 9);
+  out++;
+  *out = ((*in) >> 22) % (1U << 9);
+  out++;
+  *out = ((*in) >> 31);
+  ++in;
+  *out |= ((*in) % (1U << 8)) << (9 - 8);
+  out++;
+  *out = ((*in) >> 8) % (1U << 9);
+  out++;
+  *out = ((*in) >> 17) % (1U << 9);
+  out++;
+  *out = ((*in) >> 26);
+  ++in;
+  *out |= ((*in) % (1U << 3)) << (9 - 3);
+  out++;
+  *out = ((*in) >> 3) % (1U << 9);
+  out++;
+  *out = ((*in) >> 12) % (1U << 9);
+  out++;
+  *out = ((*in) >> 21) % (1U << 9);
+  out++;
+  *out = ((*in) >> 30);
+  ++in;
+  *out |= ((*in) % (1U << 7)) << (9 - 7);
+  out++;
+  *out = ((*in) >> 7) % (1U << 9);
+  out++;
+  *out = ((*in) >> 16) % (1U << 9);
+  out++;
+  *out = ((*in) >> 25);
+  ++in;
+  *out |= ((*in) % (1U << 2)) << (9 - 2);
+  out++;
+  *out = ((*in) >> 2) % (1U << 9);
+  out++;
+  *out = ((*in) >> 11) % (1U << 9);
+  out++;
+  *out = ((*in) >> 20) % (1U << 9);
+  out++;
+  *out = ((*in) >> 29);
+  ++in;
+  *out |= ((*in) % (1U << 6)) << (9 - 6);
+  out++;
+  *out = ((*in) >> 6) % (1U << 9);
+  out++;
+  *out = ((*in) >> 15) % (1U << 9);
+  out++;
+  *out = ((*in) >> 24);
+  ++in;
+  *out |= ((*in) % (1U << 1)) << (9 - 1);
+  out++;
+  *out = ((*in) >> 1) % (1U << 9);
+  out++;
+  *out = ((*in) >> 10) % (1U << 9);
+  out++;
+  *out = ((*in) >> 19) % (1U << 9);
+  out++;
+  *out = ((*in) >> 28);
+  ++in;
+  *out |= ((*in) % (1U << 5)) << (9 - 5);
+  out++;
+  *out = ((*in) >> 5) % (1U << 9);
+  out++;
+  *out = ((*in) >> 14) % (1U << 9);
+  out++;
+  *out = ((*in) >> 23);  // last value ends exactly at the top of the word: no mask needed
+  ++in;
+  out++;
+
+  return in;  // the original `in` advanced by nine words
+}
+
+inline const uint32_t* unpack10_32(const uint32_t* in, uint32_t* out) {  // Unpack 32 ten-bit values (lowest bits first) from in[0..9] into out[0..31].
+  *out = ((*in) >> 0) % (1U << 10);  // `% (1U << 10)` keeps the low 10 bits of the shifted word
+  out++;
+  *out = ((*in) >> 10) % (1U << 10);
+  out++;
+  *out = ((*in) >> 20) % (1U << 10);
+  out++;
+  *out = ((*in) >> 30);  // only 2 bits remain in this word: they are the low bits of the value
+  ++in;  // move to the next input word while `out` still points at the same value
+  *out |= ((*in) % (1U << 8)) << (10 - 8);  // the value's remaining 8 high bits are the low 8 bits of the next word
+  out++;
+  *out = ((*in) >> 8) % (1U << 10);
+  out++;
+  *out = ((*in) >> 18) % (1U << 10);
+  out++;
+  *out = ((*in) >> 28);
+  ++in;
+  *out |= ((*in) % (1U << 6)) << (10 - 6);
+  out++;
+  *out = ((*in) >> 6) % (1U << 10);
+  out++;
+  *out = ((*in) >> 16) % (1U << 10);
+  out++;
+  *out = ((*in) >> 26);
+  ++in;
+  *out |= ((*in) % (1U << 4)) << (10 - 4);
+  out++;
+  *out = ((*in) >> 4) % (1U << 10);
+  out++;
+  *out = ((*in) >> 14) % (1U << 10);
+  out++;
+  *out = ((*in) >> 24);
+  ++in;
+  *out |= ((*in) % (1U << 2)) << (10 - 2);
+  out++;
+  *out = ((*in) >> 2) % (1U << 10);
+  out++;
+  *out = ((*in) >> 12) % (1U << 10);
+  out++;
+  *out = ((*in) >> 22);  // value ends exactly at the top of the word: no mask needed
+  ++in;  // 16 values taken from 5 words; the same shift pattern repeats for the next 5 words
+  out++;
+  *out = ((*in) >> 0) % (1U << 10);
+  out++;
+  *out = ((*in) >> 10) % (1U << 10);
+  out++;
+  *out = ((*in) >> 20) % (1U << 10);
+  out++;
+  *out = ((*in) >> 30);
+  ++in;
+  *out |= ((*in) % (1U << 8)) << (10 - 8);
+  out++;
+  *out = ((*in) >> 8) % (1U << 10);
+  out++;
+  *out = ((*in) >> 18) % (1U << 10);
+  out++;
+  *out = ((*in) >> 28);
+  ++in;
+  *out |= ((*in) % (1U << 6)) << (10 - 6);
+  out++;
+  *out = ((*in) >> 6) % (1U << 10);
+  out++;
+  *out = ((*in) >> 16) % (1U << 10);
+  out++;
+  *out = ((*in) >> 26);
+  ++in;
+  *out |= ((*in) % (1U << 4)) << (10 - 4);
+  out++;
+  *out = ((*in) >> 4) % (1U << 10);
+  out++;
+  *out = ((*in) >> 14) % (1U << 10);
+  out++;
+  *out = ((*in) >> 24);
+  ++in;
+  *out |= ((*in) % (1U << 2)) << (10 - 2);
+  out++;
+  *out = ((*in) >> 2) % (1U << 10);
+  out++;
+  *out = ((*in) >> 12) % (1U << 10);
+  out++;
+  *out = ((*in) >> 22);
+  ++in;
+  out++;
+
+  return in;  // the original `in` advanced by ten words
+}
+
+inline const uint32_t* unpack11_32(const uint32_t* in, uint32_t* out) {  // Unpack 32 eleven-bit values (lowest bits first) from in[0..10] into out[0..31].
+  *out = ((*in) >> 0) % (1U << 11);  // `% (1U << 11)` keeps the low 11 bits of the shifted word
+  out++;
+  *out = ((*in) >> 11) % (1U << 11);
+  out++;
+  *out = ((*in) >> 22);  // only 10 bits remain in this word: they are the low bits of the value
+  ++in;  // move to the next input word while `out` still points at the same value
+  *out |= ((*in) % (1U << 1)) << (11 - 1);  // the value's remaining high bit is the low bit of the next word
+  out++;
+  *out = ((*in) >> 1) % (1U << 11);
+  out++;
+  *out = ((*in) >> 12) % (1U << 11);
+  out++;
+  *out = ((*in) >> 23);
+  ++in;
+  *out |= ((*in) % (1U << 2)) << (11 - 2);
+  out++;
+  *out = ((*in) >> 2) % (1U << 11);
+  out++;
+  *out = ((*in) >> 13) % (1U << 11);
+  out++;
+  *out = ((*in) >> 24);
+  ++in;
+  *out |= ((*in) % (1U << 3)) << (11 - 3);
+  out++;
+  *out = ((*in) >> 3) % (1U << 11);
+  out++;
+  *out = ((*in) >> 14) % (1U << 11);
+  out++;
+  *out = ((*in) >> 25);
+  ++in;
+  *out |= ((*in) % (1U << 4)) << (11 - 4);
+  out++;
+  *out = ((*in) >> 4) % (1U << 11);
+  out++;
+  *out = ((*in) >> 15) % (1U << 11);
+  out++;
+  *out = ((*in) >> 26);
+  ++in;
+  *out |= ((*in) % (1U << 5)) << (11 - 5);
+  out++;
+  *out = ((*in) >> 5) % (1U << 11);
+  out++;
+  *out = ((*in) >> 16) % (1U << 11);
+  out++;
+  *out = ((*in) >> 27);
+  ++in;
+  *out |= ((*in) % (1U << 6)) << (11 - 6);
+  out++;
+  *out = ((*in) >> 6) % (1U << 11);
+  out++;
+  *out = ((*in) >> 17) % (1U << 11);
+  out++;
+  *out = ((*in) >> 28);
+  ++in;
+  *out |= ((*in) % (1U << 7)) << (11 - 7);
+  out++;
+  *out = ((*in) >> 7) % (1U << 11);
+  out++;
+  *out = ((*in) >> 18) % (1U << 11);
+  out++;
+  *out = ((*in) >> 29);
+  ++in;
+  *out |= ((*in) % (1U << 8)) << (11 - 8);
+  out++;
+  *out = ((*in) >> 8) % (1U << 11);
+  out++;
+  *out = ((*in) >> 19) % (1U << 11);
+  out++;
+  *out = ((*in) >> 30);
+  ++in;
+  *out |= ((*in) % (1U << 9)) << (11 - 9);
+  out++;
+  *out = ((*in) >> 9) % (1U << 11);
+  out++;
+  *out = ((*in) >> 20) % (1U << 11);
+  out++;
+  *out = ((*in) >> 31);
+  ++in;
+  *out |= ((*in) % (1U << 10)) << (11 - 10);
+  out++;
+  *out = ((*in) >> 10) % (1U << 11);
+  out++;
+  *out = ((*in) >> 21);  // last value ends exactly at the top of the word: no mask needed
+  ++in;
+  out++;
+
+  return in;  // the original `in` advanced by eleven words
+}
+
+inline const uint32_t* unpack12_32(const uint32_t* in, uint32_t* out) {  // Unpack 32 twelve-bit values (lowest bits first) from in[0..11] into out[0..31].
+  *out = ((*in) >> 0) % (1U << 12);  // `% (1U << 12)` keeps the low 12 bits of the shifted word
+  out++;
+  *out = ((*in) >> 12) % (1U << 12);
+  out++;
+  *out = ((*in) >> 24);  // only 8 bits remain in this word: they are the low bits of the value
+  ++in;  // move to the next input word while `out` still points at the same value
+  *out |= ((*in) % (1U << 4)) << (12 - 4);  // the value's remaining 4 high bits are the low 4 bits of the next word
+  out++;
+  *out = ((*in) >> 4) % (1U << 12);
+  out++;
+  *out = ((*in) >> 16) % (1U << 12);
+  out++;
+  *out = ((*in) >> 28);
+  ++in;
+  *out |= ((*in) % (1U << 8)) << (12 - 8);
+  out++;
+  *out = ((*in) >> 8) % (1U << 12);
+  out++;
+  *out = ((*in) >> 20);  // value ends exactly at the top of the word: no mask needed
+  ++in;  // 8 values taken from 3 words; the same shift pattern repeats for each following group of 3 words
+  out++;
+  *out = ((*in) >> 0) % (1U << 12);
+  out++;
+  *out = ((*in) >> 12) % (1U << 12);
+  out++;
+  *out = ((*in) >> 24);
+  ++in;
+  *out |= ((*in) % (1U << 4)) << (12 - 4);
+  out++;
+  *out = ((*in) >> 4) % (1U << 12);
+  out++;
+  *out = ((*in) >> 16) % (1U << 12);
+  out++;
+  *out = ((*in) >> 28);
+  ++in;
+  *out |= ((*in) % (1U << 8)) << (12 - 8);
+  out++;
+  *out = ((*in) >> 8) % (1U << 12);
+  out++;
+  *out = ((*in) >> 20);
+  ++in;
+  out++;
+  *out = ((*in) >> 0) % (1U << 12);
+  out++;
+  *out = ((*in) >> 12) % (1U << 12);
+  out++;
+  *out = ((*in) >> 24);
+  ++in;
+  *out |= ((*in) % (1U << 4)) << (12 - 4);
+  out++;
+  *out = ((*in) >> 4) % (1U << 12);
+  out++;
+  *out = ((*in) >> 16) % (1U << 12);
+  out++;
+  *out = ((*in) >> 28);
+  ++in;
+  *out |= ((*in) % (1U << 8)) << (12 - 8);
+  out++;
+  *out = ((*in) >> 8) % (1U << 12);
+  out++;
+  *out = ((*in) >> 20);
+  ++in;
+  out++;
+  *out = ((*in) >> 0) % (1U << 12);
+  out++;
+  *out = ((*in) >> 12) % (1U << 12);
+  out++;
+  *out = ((*in) >> 24);
+  ++in;
+  *out |= ((*in) % (1U << 4)) << (12 - 4);
+  out++;
+  *out = ((*in) >> 4) % (1U << 12);
+  out++;
+  *out = ((*in) >> 16) % (1U << 12);
+  out++;
+  *out = ((*in) >> 28);
+  ++in;
+  *out |= ((*in) % (1U << 8)) << (12 - 8);
+  out++;
+  *out = ((*in) >> 8) % (1U << 12);
+  out++;
+  *out = ((*in) >> 20);
+  ++in;
+  out++;
+
+  return in;  // the original `in` advanced by twelve words
+}
+
+inline const uint32_t* unpack13_32(const uint32_t* in, uint32_t* out) {  // Unpack 32 thirteen-bit values (lowest bits first) from in[0..12] into out[0..31].
+  *out = ((*in) >> 0) % (1U << 13);  // `% (1U << 13)` keeps the low 13 bits of the shifted word
+  out++;
+  *out = ((*in) >> 13) % (1U << 13);
+  out++;
+  *out = ((*in) >> 26);  // only 6 bits remain in this word: they are the low bits of the value
+  ++in;  // move to the next input word while `out` still points at the same value
+  *out |= ((*in) % (1U << 7)) << (13 - 7);  // the value's remaining 7 high bits are the low 7 bits of the next word
+  out++;
+  *out = ((*in) >> 7) % (1U << 13);
+  out++;
+  *out = ((*in) >> 20);
+  ++in;
+  *out |= ((*in) % (1U << 1)) << (13 - 1);
+  out++;
+  *out = ((*in) >> 1) % (1U << 13);
+  out++;
+  *out = ((*in) >> 14) % (1U << 13);
+  out++;
+  *out = ((*in) >> 27);
+  ++in;
+  *out |= ((*in) % (1U << 8)) << (13 - 8);
+  out++;
+  *out = ((*in) >> 8) % (1U << 13);
+  out++;
+  *out = ((*in) >> 21);
+  ++in;
+  *out |= ((*in) % (1U << 2)) << (13 - 2);
+  out++;
+  *out = ((*in) >> 2) % (1U << 13);
+  out++;
+  *out = ((*in) >> 15) % (1U << 13);
+  out++;
+  *out = ((*in) >> 28);
+  ++in;
+  *out |= ((*in) % (1U << 9)) << (13 - 9);
+  out++;
+  *out = ((*in) >> 9) % (1U << 13);
+  out++;
+  *out = ((*in) >> 22);
+  ++in;
+  *out |= ((*in) % (1U << 3)) << (13 - 3);
+  out++;
+  *out = ((*in) >> 3) % (1U << 13);
+  out++;
+  *out = ((*in) >> 16) % (1U << 13);
+  out++;
+  *out = ((*in) >> 29);
+  ++in;
+  *out |= ((*in) % (1U << 10)) << (13 - 10);
+  out++;
+  *out = ((*in) >> 10) % (1U << 13);
+  out++;
+  *out = ((*in) >> 23);
+  ++in;
+  *out |= ((*in) % (1U << 4)) << (13 - 4);
+  out++;
+  *out = ((*in) >> 4) % (1U << 13);
+  out++;
+  *out = ((*in) >> 17) % (1U << 13);
+  out++;
+  *out = ((*in) >> 30);
+  ++in;
+  *out |= ((*in) % (1U << 11)) << (13 - 11);
+  out++;
+  *out = ((*in) >> 11) % (1U << 13);
+  out++;
+  *out = ((*in) >> 24);
+  ++in;
+  *out |= ((*in) % (1U << 5)) << (13 - 5);
+  out++;
+  *out = ((*in) >> 5) % (1U << 13);
+  out++;
+  *out = ((*in) >> 18) % (1U << 13);
+  out++;
+  *out = ((*in) >> 31);
+  ++in;
+  *out |= ((*in) % (1U << 12)) << (13 - 12);
+  out++;
+  *out = ((*in) >> 12) % (1U << 13);
+  out++;
+  *out = ((*in) >> 25);
+  ++in;
+  *out |= ((*in) % (1U << 6)) << (13 - 6);
+  out++;
+  *out = ((*in) >> 6) % (1U << 13);
+  out++;
+  *out = ((*in) >> 19);  // last value ends exactly at the top of the word: no mask needed
+  ++in;
+  out++;
+
+  return in;  // the original `in` advanced by thirteen words
+}
+
+inline const uint32_t* unpack14_32(const uint32_t* in, uint32_t* out) {  // Unpack 32 fourteen-bit values (lowest bits first) from in[0..13] into out[0..31].
+  *out = ((*in) >> 0) % (1U << 14);  // `% (1U << 14)` keeps the low 14 bits of the shifted word
+  out++;
+  *out = ((*in) >> 14) % (1U << 14);
+  out++;
+  *out = ((*in) >> 28);  // only 4 bits remain in this word: they are the low bits of the value
+  ++in;  // move to the next input word while `out` still points at the same value
+  *out |= ((*in) % (1U << 10)) << (14 - 10);  // the value's remaining 10 high bits are the low 10 bits of the next word
+  out++;
+  *out = ((*in) >> 10) % (1U << 14);
+  out++;
+  *out = ((*in) >> 24);
+  ++in;
+  *out |= ((*in) % (1U << 6)) << (14 - 6);
+  out++;
+  *out = ((*in) >> 6) % (1U << 14);
+  out++;
+  *out = ((*in) >> 20);
+  ++in;
+  *out |= ((*in) % (1U << 2)) << (14 - 2);
+  out++;
+  *out = ((*in) >> 2) % (1U << 14);
+  out++;
+  *out = ((*in) >> 16) % (1U << 14);
+  out++;
+  *out = ((*in) >> 30);
+  ++in;
+  *out |= ((*in) % (1U << 12)) << (14 - 12);
+  out++;
+  *out = ((*in) >> 12) % (1U << 14);
+  out++;
+  *out = ((*in) >> 26);
+  ++in;
+  *out |= ((*in) % (1U << 8)) << (14 - 8);
+  out++;
+  *out = ((*in) >> 8) % (1U << 14);
+  out++;
+  *out = ((*in) >> 22);
+  ++in;
+  *out |= ((*in) % (1U << 4)) << (14 - 4);
+  out++;
+  *out = ((*in) >> 4) % (1U << 14);
+  out++;
+  *out = ((*in) >> 18);  // value ends exactly at the top of the word: no mask needed
+  ++in;  // 16 values taken from 7 words; the same shift pattern repeats for the next 7 words
+  out++;
+  *out = ((*in) >> 0) % (1U << 14);
+  out++;
+  *out = ((*in) >> 14) % (1U << 14);
+  out++;
+  *out = ((*in) >> 28);
+  ++in;
+  *out |= ((*in) % (1U << 10)) << (14 - 10);
+  out++;
+  *out = ((*in) >> 10) % (1U << 14);
+  out++;
+  *out = ((*in) >> 24);
+  ++in;
+  *out |= ((*in) % (1U << 6)) << (14 - 6);
+  out++;
+  *out = ((*in) >> 6) % (1U << 14);
+  out++;
+  *out = ((*in) >> 20);
+  ++in;
+  *out |= ((*in) % (1U << 2)) << (14 - 2);
+  out++;
+  *out = ((*in) >> 2) % (1U << 14);
+  out++;
+  *out = ((*in) >> 16) % (1U << 14);
+  out++;
+  *out = ((*in) >> 30);
+  ++in;
+  *out |= ((*in) % (1U << 12)) << (14 - 12);
+  out++;
+  *out = ((*in) >> 12) % (1U << 14);
+  out++;
+  *out = ((*in) >> 26);
+  ++in;
+  *out |= ((*in) % (1U << 8)) << (14 - 8);
+  out++;
+  *out = ((*in) >> 8) % (1U << 14);
+  out++;
+  *out = ((*in) >> 22);
+  ++in;
+  *out |= ((*in) % (1U << 4)) << (14 - 4);
+  out++;
+  *out = ((*in) >> 4) % (1U << 14);
+  out++;
+  *out = ((*in) >> 18);
+  ++in;
+  out++;
+
+  return in;  // the original `in` advanced by fourteen words
+}
+
+inline const uint32_t* unpack15_32(const uint32_t* in, uint32_t* out) {  // Unpack 32 fifteen-bit values (lowest bits first) from in[0..14] into out[0..31].
+  *out = ((*in) >> 0) % (1U << 15);  // `% (1U << 15)` keeps the low 15 bits of the shifted word
+  out++;
+  *out = ((*in) >> 15) % (1U << 15);
+  out++;
+  *out = ((*in) >> 30);  // only 2 bits remain in this word: they are the low bits of the value
+  ++in;  // move to the next input word while `out` still points at the same value
+  *out |= ((*in) % (1U << 13)) << (15 - 13);  // the value's remaining 13 high bits are the low 13 bits of the next word
+  out++;
+  *out = ((*in) >> 13) % (1U << 15);
+  out++;
+  *out = ((*in) >> 28);
+  ++in;
+  *out |= ((*in) % (1U << 11)) << (15 - 11);
+  out++;
+  *out = ((*in) >> 11) % (1U << 15);
+  out++;
+  *out = ((*in) >> 26);
+  ++in;
+  *out |= ((*in) % (1U << 9)) << (15 - 9);
+  out++;
+  *out = ((*in) >> 9) % (1U << 15);
+  out++;
+  *out = ((*in) >> 24);
+  ++in;
+  *out |= ((*in) % (1U << 7)) << (15 - 7);
+  out++;
+  *out = ((*in) >> 7) % (1U << 15);
+  out++;
+  *out = ((*in) >> 22);
+  ++in;
+  *out |= ((*in) % (1U << 5)) << (15 - 5);
+  out++;
+  *out = ((*in) >> 5) % (1U << 15);
+  out++;
+  *out = ((*in) >> 20);
+  ++in;
+  *out |= ((*in) % (1U << 3)) << (15 - 3);
+  out++;
+  *out = ((*in) >> 3) % (1U << 15);
+  out++;
+  *out = ((*in) >> 18);
+  ++in;
+  *out |= ((*in) % (1U << 1)) << (15 - 1);
+  out++;
+  *out = ((*in) >> 1) % (1U << 15);
+  out++;
+  *out = ((*in) >> 16) % (1U << 15);
+  out++;
+  *out = ((*in) >> 31);
+  ++in;
+  *out |= ((*in) % (1U << 14)) << (15 - 14);
+  out++;
+  *out = ((*in) >> 14) % (1U << 15);
+  out++;
+  *out = ((*in) >> 29);
+  ++in;
+  *out |= ((*in) % (1U << 12)) << (15 - 12);
+  out++;
+  *out = ((*in) >> 12) % (1U << 15);
+  out++;
+  *out = ((*in) >> 27);
+  ++in;
+  *out |= ((*in) % (1U << 10)) << (15 - 10);
+  out++;
+  *out = ((*in) >> 10) % (1U << 15);
+  out++;
+  *out = ((*in) >> 25);
+  ++in;
+  *out |= ((*in) % (1U << 8)) << (15 - 8);
+  out++;
+  *out = ((*in) >> 8) % (1U << 15);
+  out++;
+  *out = ((*in) >> 23);
+  ++in;
+  *out |= ((*in) % (1U << 6)) << (15 - 6);
+  out++;
+  *out = ((*in) >> 6) % (1U << 15);
+  out++;
+  *out = ((*in) >> 21);
+  ++in;
+  *out |= ((*in) % (1U << 4)) << (15 - 4);
+  out++;
+  *out = ((*in) >> 4) % (1U << 15);
+  out++;
+  *out = ((*in) >> 19);
+  ++in;
+  *out |= ((*in) % (1U << 2)) << (15 - 2);
+  out++;
+  *out = ((*in) >> 2) % (1U << 15);
+  out++;
+  *out = ((*in) >> 17);  // last value ends exactly at the top of the word: no mask needed
+  ++in;
+  out++;
+
+  return in;  // the original `in` advanced by fifteen words
+}
+
+inline const uint32_t* unpack16_32(const uint32_t* in, uint32_t* out) {  // Unpack 32 sixteen-bit values (low half first) from in[0..15] into out[0..31].
+  *out = ((*in) >> 0) % (1U << 16);  // low 16 bits of the word
+  out++;
+  *out = ((*in) >> 16);  // high 16 bits of the word: no mask needed
+  ++in;  // each input word holds exactly 2 values; continue with the next word
+  out++;
+  *out = ((*in) >> 0) % (1U << 16);
+  out++;
+  *out = ((*in) >> 16);
+  ++in;
+  out++;
+  *out = ((*in) >> 0) % (1U << 16);
+  out++;
+  *out = ((*in) >> 16);
+  ++in;
+  out++;
+  *out = ((*in) >> 0) % (1U << 16);
+  out++;
+  *out = ((*in) >> 16);
+  ++in;
+  out++;
+  *out = ((*in) >> 0) % (1U << 16);
+  out++;
+  *out = ((*in) >> 16);
+  ++in;
+  out++;
+  *out = ((*in) >> 0) % (1U << 16);
+  out++;
+  *out = ((*in) >> 16);
+  ++in;
+  out++;
+  *out = ((*in) >> 0) % (1U << 16);
+  out++;
+  *out = ((*in) >> 16);
+  ++in;
+  out++;
+  *out = ((*in) >> 0) % (1U << 16);
+  out++;
+  *out = ((*in) >> 16);
+  ++in;
+  out++;
+  *out = ((*in) >> 0) % (1U << 16);
+  out++;
+  *out = ((*in) >> 16);
+  ++in;
+  out++;
+  *out = ((*in) >> 0) % (1U << 16);
+  out++;
+  *out = ((*in) >> 16);
+  ++in;
+  out++;
+  *out = ((*in) >> 0) % (1U << 16);
+  out++;
+  *out = ((*in) >> 16);
+  ++in;
+  out++;
+  *out = ((*in) >> 0) % (1U << 16);
+  out++;
+  *out = ((*in) >> 16);
+  ++in;
+  out++;
+  *out = ((*in) >> 0) % (1U << 16);
+  out++;
+  *out = ((*in) >> 16);
+  ++in;
+  out++;
+  *out = ((*in) >> 0) % (1U << 16);
+  out++;
+  *out = ((*in) >> 16);
+  ++in;
+  out++;
+  *out = ((*in) >> 0) % (1U << 16);
+  out++;
+  *out = ((*in) >> 16);
+  ++in;
+  out++;
+  *out = ((*in) >> 0) % (1U << 16);
+  out++;
+  *out = ((*in) >> 16);
+  ++in;
+  out++;
+
+  return in;  // the original `in` advanced by sixteen words
+}
+
+inline const uint32_t* unpack17_32(const uint32_t* in, uint32_t* out) {  // Unpack 32 seventeen-bit values (lowest bits first) from in[0..16] into out[0..31].
+  *out = ((*in) >> 0) % (1U << 17);  // `% (1U << 17)` keeps the low 17 bits of the shifted word
+  out++;
+  *out = ((*in) >> 17);  // only 15 bits remain in this word: they are the low bits of the value
+  ++in;  // move to the next input word while `out` still points at the same value
+  *out |= ((*in) % (1U << 2)) << (17 - 2);  // the value's remaining 2 high bits are the low 2 bits of the next word
+  out++;
+  *out = ((*in) >> 2) % (1U << 17);
+  out++;
+  *out = ((*in) >> 19);
+  ++in;
+  *out |= ((*in) % (1U << 4)) << (17 - 4);
+  out++;
+  *out = ((*in) >> 4) % (1U << 17);
+  out++;
+  *out = ((*in) >> 21);
+  ++in;
+  *out |= ((*in) % (1U << 6)) << (17 - 6);
+  out++;
+  *out = ((*in) >> 6) % (1U << 17);
+  out++;
+  *out = ((*in) >> 23);
+  ++in;
+  *out |= ((*in) % (1U << 8)) << (17 - 8);
+  out++;
+  *out = ((*in) >> 8) % (1U << 17);
+  out++;
+  *out = ((*in) >> 25);
+  ++in;
+  *out |= ((*in) % (1U << 10)) << (17 - 10);
+  out++;
+  *out = ((*in) >> 10) % (1U << 17);
+  out++;
+  *out = ((*in) >> 27);
+  ++in;
+  *out |= ((*in) % (1U << 12)) << (17 - 12);
+  out++;
+  *out = ((*in) >> 12) % (1U << 17);
+  out++;
+  *out = ((*in) >> 29);
+  ++in;
+  *out |= ((*in) % (1U << 14)) << (17 - 14);
+  out++;
+  *out = ((*in) >> 14) % (1U << 17);
+  out++;
+  *out = ((*in) >> 31);
+  ++in;
+  *out |= ((*in) % (1U << 16)) << (17 - 16);
+  out++;
+  *out = ((*in) >> 16);  // the previous value used the low 16 bits, so this word holds only part of one value
+  ++in;
+  *out |= ((*in) % (1U << 1)) << (17 - 1);
+  out++;
+  *out = ((*in) >> 1) % (1U << 17);
+  out++;
+  *out = ((*in) >> 18);
+  ++in;
+  *out |= ((*in) % (1U << 3)) << (17 - 3);
+  out++;
+  *out = ((*in) >> 3) % (1U << 17);
+  out++;
+  *out = ((*in) >> 20);
+  ++in;
+  *out |= ((*in) % (1U << 5)) << (17 - 5);
+  out++;
+  *out = ((*in) >> 5) % (1U << 17);
+  out++;
+  *out = ((*in) >> 22);
+  ++in;
+  *out |= ((*in) % (1U << 7)) << (17 - 7);
+  out++;
+  *out = ((*in) >> 7) % (1U << 17);
+  out++;
+  *out = ((*in) >> 24);
+  ++in;
+  *out |= ((*in) % (1U << 9)) << (17 - 9);
+  out++;
+  *out = ((*in) >> 9) % (1U << 17);
+  out++;
+  *out = ((*in) >> 26);
+  ++in;
+  *out |= ((*in) % (1U << 11)) << (17 - 11);
+  out++;
+  *out = ((*in) >> 11) % (1U << 17);
+  out++;
+  *out = ((*in) >> 28);
+  ++in;
+  *out |= ((*in) % (1U << 13)) << (17 - 13);
+  out++;
+  *out = ((*in) >> 13) % (1U << 17);
+  out++;
+  *out = ((*in) >> 30);
+  ++in;
+  *out |= ((*in) % (1U << 15)) << (17 - 15);
+  out++;
+  *out = ((*in) >> 15);  // last value ends exactly at the top of the word: no mask needed
+  ++in;
+  out++;
+
+  return in;  // the original `in` advanced by seventeen words
+}
+
+// Decodes 32 values of 18 bits each from the 18 consecutive words at `in`,
+// writing them to out[0..31] in order. Value k occupies bits
+// [18 * k, 18 * k + 18) of the input, counting from the least significant bit
+// of in[0]; a value that crosses a word boundary is stitched from two words.
+// The layout repeats every 16 values (9 words). Returns in + 18.
+inline const uint32_t* unpack18_32(const uint32_t* in, uint32_t* out) {
+  const uint32_t kMask = (1U << 18) - 1U;
+
+  // Values 0..15 live in in[0..8].
+  out[0] = in[0] & kMask;
+  out[1] = in[0] >> 18;
+  out[1] |= (in[1] << 14) & kMask;  // top 4 bits come from the next word
+  out[2] = (in[1] >> 4) & kMask;
+  out[3] = in[1] >> 22;
+  out[3] |= (in[2] << 10) & kMask;
+  out[4] = (in[2] >> 8) & kMask;
+  out[5] = in[2] >> 26;
+  out[5] |= (in[3] << 6) & kMask;
+  out[6] = (in[3] >> 12) & kMask;
+  out[7] = in[3] >> 30;
+  out[7] |= (in[4] << 2) & kMask;
+  out[8] = in[4] >> 16;
+  out[8] |= (in[5] << 16) & kMask;
+  out[9] = (in[5] >> 2) & kMask;
+  out[10] = in[5] >> 20;
+  out[10] |= (in[6] << 12) & kMask;
+  out[11] = (in[6] >> 6) & kMask;
+  out[12] = in[6] >> 24;
+  out[12] |= (in[7] << 8) & kMask;
+  out[13] = (in[7] >> 10) & kMask;
+  out[14] = in[7] >> 28;
+  out[14] |= (in[8] << 4) & kMask;
+  out[15] = in[8] >> 14;
+
+  // Values 16..31 live in in[9..17] with the same layout.
+  out[16] = in[9] & kMask;
+  out[17] = in[9] >> 18;
+  out[17] |= (in[10] << 14) & kMask;
+  out[18] = (in[10] >> 4) & kMask;
+  out[19] = in[10] >> 22;
+  out[19] |= (in[11] << 10) & kMask;
+  out[20] = (in[11] >> 8) & kMask;
+  out[21] = in[11] >> 26;
+  out[21] |= (in[12] << 6) & kMask;
+  out[22] = (in[12] >> 12) & kMask;
+  out[23] = in[12] >> 30;
+  out[23] |= (in[13] << 2) & kMask;
+  out[24] = in[13] >> 16;
+  out[24] |= (in[14] << 16) & kMask;
+  out[25] = (in[14] >> 2) & kMask;
+  out[26] = in[14] >> 20;
+  out[26] |= (in[15] << 12) & kMask;
+  out[27] = (in[15] >> 6) & kMask;
+  out[28] = in[15] >> 24;
+  out[28] |= (in[16] << 8) & kMask;
+  out[29] = (in[16] >> 10) & kMask;
+  out[30] = in[16] >> 28;
+  out[30] |= (in[17] << 4) & kMask;
+  out[31] = in[17] >> 14;
+
+  return in + 18;
+}
+
+// Decodes 32 values of 19 bits each from the 19 consecutive words at `in`,
+// writing them to out[0..31] in order. Value k occupies bits
+// [19 * k, 19 * k + 19) of the input, counting from the least significant bit
+// of in[0]; a value that crosses a word boundary is stitched from two words.
+// Returns in + 19.
+inline const uint32_t* unpack19_32(const uint32_t* in, uint32_t* out) {
+  const uint32_t kMask = (1U << 19) - 1U;
+
+  out[0] = in[0] & kMask;
+  out[1] = in[0] >> 19;
+  out[1] |= (in[1] << 13) & kMask;  // top 6 bits come from the next word
+  out[2] = (in[1] >> 6) & kMask;
+  out[3] = in[1] >> 25;
+  out[3] |= (in[2] << 7) & kMask;
+  out[4] = (in[2] >> 12) & kMask;
+  out[5] = in[2] >> 31;
+  out[5] |= (in[3] << 1) & kMask;
+  out[6] = in[3] >> 18;
+  out[6] |= (in[4] << 14) & kMask;
+  out[7] = (in[4] >> 5) & kMask;
+  out[8] = in[4] >> 24;
+  out[8] |= (in[5] << 8) & kMask;
+  out[9] = (in[5] >> 11) & kMask;
+  out[10] = in[5] >> 30;
+  out[10] |= (in[6] << 2) & kMask;
+  out[11] = in[6] >> 17;
+  out[11] |= (in[7] << 15) & kMask;
+  out[12] = (in[7] >> 4) & kMask;
+  out[13] = in[7] >> 23;
+  out[13] |= (in[8] << 9) & kMask;
+  out[14] = (in[8] >> 10) & kMask;
+  out[15] = in[8] >> 29;
+  out[15] |= (in[9] << 3) & kMask;
+  out[16] = in[9] >> 16;
+  out[16] |= (in[10] << 16) & kMask;
+  out[17] = (in[10] >> 3) & kMask;
+  out[18] = in[10] >> 22;
+  out[18] |= (in[11] << 10) & kMask;
+  out[19] = (in[11] >> 9) & kMask;
+  out[20] = in[11] >> 28;
+  out[20] |= (in[12] << 4) & kMask;
+  out[21] = in[12] >> 15;
+  out[21] |= (in[13] << 17) & kMask;
+  out[22] = (in[13] >> 2) & kMask;
+  out[23] = in[13] >> 21;
+  out[23] |= (in[14] << 11) & kMask;
+  out[24] = (in[14] >> 8) & kMask;
+  out[25] = in[14] >> 27;
+  out[25] |= (in[15] << 5) & kMask;
+  out[26] = in[15] >> 14;
+  out[26] |= (in[16] << 18) & kMask;
+  out[27] = (in[16] >> 1) & kMask;
+  out[28] = in[16] >> 20;
+  out[28] |= (in[17] << 12) & kMask;
+  out[29] = (in[17] >> 7) & kMask;
+  out[30] = in[17] >> 26;
+  out[30] |= (in[18] << 6) & kMask;
+  out[31] = in[18] >> 13;
+
+  return in + 19;
+}
+
+// Decodes 32 values of 20 bits each from the 20 consecutive words at `in`,
+// writing them to out[0..31] in order. Value k occupies bits
+// [20 * k, 20 * k + 20) of the input, counting from the least significant bit
+// of in[0]; a value that crosses a word boundary is stitched from two words.
+// The layout repeats every 8 values (5 words). Returns in + 20.
+inline const uint32_t* unpack20_32(const uint32_t* in, uint32_t* out) {
+  const uint32_t kMask = (1U << 20) - 1U;
+
+  // Values 0..7 live in in[0..4].
+  out[0] = in[0] & kMask;
+  out[1] = in[0] >> 20;
+  out[1] |= (in[1] << 12) & kMask;  // top 8 bits come from the next word
+  out[2] = (in[1] >> 8) & kMask;
+  out[3] = in[1] >> 28;
+  out[3] |= (in[2] << 4) & kMask;
+  out[4] = in[2] >> 16;
+  out[4] |= (in[3] << 16) & kMask;
+  out[5] = (in[3] >> 4) & kMask;
+  out[6] = in[3] >> 24;
+  out[6] |= (in[4] << 8) & kMask;
+  out[7] = in[4] >> 12;
+
+  // Values 8..15 live in in[5..9].
+  out[8] = in[5] & kMask;
+  out[9] = in[5] >> 20;
+  out[9] |= (in[6] << 12) & kMask;
+  out[10] = (in[6] >> 8) & kMask;
+  out[11] = in[6] >> 28;
+  out[11] |= (in[7] << 4) & kMask;
+  out[12] = in[7] >> 16;
+  out[12] |= (in[8] << 16) & kMask;
+  out[13] = (in[8] >> 4) & kMask;
+  out[14] = in[8] >> 24;
+  out[14] |= (in[9] << 8) & kMask;
+  out[15] = in[9] >> 12;
+
+  // Values 16..23 live in in[10..14].
+  out[16] = in[10] & kMask;
+  out[17] = in[10] >> 20;
+  out[17] |= (in[11] << 12) & kMask;
+  out[18] = (in[11] >> 8) & kMask;
+  out[19] = in[11] >> 28;
+  out[19] |= (in[12] << 4) & kMask;
+  out[20] = in[12] >> 16;
+  out[20] |= (in[13] << 16) & kMask;
+  out[21] = (in[13] >> 4) & kMask;
+  out[22] = in[13] >> 24;
+  out[22] |= (in[14] << 8) & kMask;
+  out[23] = in[14] >> 12;
+
+  // Values 24..31 live in in[15..19].
+  out[24] = in[15] & kMask;
+  out[25] = in[15] >> 20;
+  out[25] |= (in[16] << 12) & kMask;
+  out[26] = (in[16] >> 8) & kMask;
+  out[27] = in[16] >> 28;
+  out[27] |= (in[17] << 4) & kMask;
+  out[28] = in[17] >> 16;
+  out[28] |= (in[18] << 16) & kMask;
+  out[29] = (in[18] >> 4) & kMask;
+  out[30] = in[18] >> 24;
+  out[30] |= (in[19] << 8) & kMask;
+  out[31] = in[19] >> 12;
+
+  return in + 20;
+}
+
+// Decodes 32 values of 21 bits each from the 21 consecutive words at `in`,
+// writing them to out[0..31] in order. Value k occupies bits
+// [21 * k, 21 * k + 21) of the input, counting from the least significant bit
+// of in[0]; a value that crosses a word boundary is stitched from two words.
+// Returns in + 21.
+inline const uint32_t* unpack21_32(const uint32_t* in, uint32_t* out) {
+  const uint32_t kMask = (1U << 21) - 1U;
+
+  out[0] = in[0] & kMask;
+  out[1] = in[0] >> 21;
+  out[1] |= (in[1] << 11) & kMask;  // top 10 bits come from the next word
+  out[2] = (in[1] >> 10) & kMask;
+  out[3] = in[1] >> 31;
+  out[3] |= (in[2] << 1) & kMask;
+  out[4] = in[2] >> 20;
+  out[4] |= (in[3] << 12) & kMask;
+  out[5] = (in[3] >> 9) & kMask;
+  out[6] = in[3] >> 30;
+  out[6] |= (in[4] << 2) & kMask;
+  out[7] = in[4] >> 19;
+  out[7] |= (in[5] << 13) & kMask;
+  out[8] = (in[5] >> 8) & kMask;
+  out[9] = in[5] >> 29;
+  out[9] |= (in[6] << 3) & kMask;
+  out[10] = in[6] >> 18;
+  out[10] |= (in[7] << 14) & kMask;
+  out[11] = (in[7] >> 7) & kMask;
+  out[12] = in[7] >> 28;
+  out[12] |= (in[8] << 4) & kMask;
+  out[13] = in[8] >> 17;
+  out[13] |= (in[9] << 15) & kMask;
+  out[14] = (in[9] >> 6) & kMask;
+  out[15] = in[9] >> 27;
+  out[15] |= (in[10] << 5) & kMask;
+  out[16] = in[10] >> 16;
+  out[16] |= (in[11] << 16) & kMask;
+  out[17] = (in[11] >> 5) & kMask;
+  out[18] = in[11] >> 26;
+  out[18] |= (in[12] << 6) & kMask;
+  out[19] = in[12] >> 15;
+  out[19] |= (in[13] << 17) & kMask;
+  out[20] = (in[13] >> 4) & kMask;
+  out[21] = in[13] >> 25;
+  out[21] |= (in[14] << 7) & kMask;
+  out[22] = in[14] >> 14;
+  out[22] |= (in[15] << 18) & kMask;
+  out[23] = (in[15] >> 3) & kMask;
+  out[24] = in[15] >> 24;
+  out[24] |= (in[16] << 8) & kMask;
+  out[25] = in[16] >> 13;
+  out[25] |= (in[17] << 19) & kMask;
+  out[26] = (in[17] >> 2) & kMask;
+  out[27] = in[17] >> 23;
+  out[27] |= (in[18] << 9) & kMask;
+  out[28] = in[18] >> 12;
+  out[28] |= (in[19] << 20) & kMask;
+  out[29] = (in[19] >> 1) & kMask;
+  out[30] = in[19] >> 22;
+  out[30] |= (in[20] << 10) & kMask;
+  out[31] = in[20] >> 11;
+
+  return in + 21;
+}
+
+// Decodes 32 values of 22 bits each from the 22 consecutive words at `in`,
+// writing them to out[0..31] in order. Value k occupies bits
+// [22 * k, 22 * k + 22) of the input, counting from the least significant bit
+// of in[0]; a value that crosses a word boundary is stitched from two words.
+// The layout repeats every 16 values (11 words). Returns in + 22.
+inline const uint32_t* unpack22_32(const uint32_t* in, uint32_t* out) {
+  const uint32_t kMask = (1U << 22) - 1U;
+
+  // Values 0..15 live in in[0..10].
+  out[0] = in[0] & kMask;
+  out[1] = in[0] >> 22;
+  out[1] |= (in[1] << 10) & kMask;  // top 12 bits come from the next word
+  out[2] = in[1] >> 12;
+  out[2] |= (in[2] << 20) & kMask;
+  out[3] = (in[2] >> 2) & kMask;
+  out[4] = in[2] >> 24;
+  out[4] |= (in[3] << 8) & kMask;
+  out[5] = in[3] >> 14;
+  out[5] |= (in[4] << 18) & kMask;
+  out[6] = (in[4] >> 4) & kMask;
+  out[7] = in[4] >> 26;
+  out[7] |= (in[5] << 6) & kMask;
+  out[8] = in[5] >> 16;
+  out[8] |= (in[6] << 16) & kMask;
+  out[9] = (in[6] >> 6) & kMask;
+  out[10] = in[6] >> 28;
+  out[10] |= (in[7] << 4) & kMask;
+  out[11] = in[7] >> 18;
+  out[11] |= (in[8] << 14) & kMask;
+  out[12] = (in[8] >> 8) & kMask;
+  out[13] = in[8] >> 30;
+  out[13] |= (in[9] << 2) & kMask;
+  out[14] = in[9] >> 20;
+  out[14] |= (in[10] << 12) & kMask;
+  out[15] = in[10] >> 10;
+
+  // Values 16..31 live in in[11..21] with the same layout.
+  out[16] = in[11] & kMask;
+  out[17] = in[11] >> 22;
+  out[17] |= (in[12] << 10) & kMask;
+  out[18] = in[12] >> 12;
+  out[18] |= (in[13] << 20) & kMask;
+  out[19] = (in[13] >> 2) & kMask;
+  out[20] = in[13] >> 24;
+  out[20] |= (in[14] << 8) & kMask;
+  out[21] = in[14] >> 14;
+  out[21] |= (in[15] << 18) & kMask;
+  out[22] = (in[15] >> 4) & kMask;
+  out[23] = in[15] >> 26;
+  out[23] |= (in[16] << 6) & kMask;
+  out[24] = in[16] >> 16;
+  out[24] |= (in[17] << 16) & kMask;
+  out[25] = (in[17] >> 6) & kMask;
+  out[26] = in[17] >> 28;
+  out[26] |= (in[18] << 4) & kMask;
+  out[27] = in[18] >> 18;
+  out[27] |= (in[19] << 14) & kMask;
+  out[28] = (in[19] >> 8) & kMask;
+  out[29] = in[19] >> 30;
+  out[29] |= (in[20] << 2) & kMask;
+  out[30] = in[20] >> 20;
+  out[30] |= (in[21] << 12) & kMask;
+  out[31] = in[21] >> 10;
+
+  return in + 22;
+}
+
+// Decodes 32 values of 23 bits each from the 23 consecutive words at `in`,
+// writing them to out[0..31] in order. Value k occupies bits
+// [23 * k, 23 * k + 23) of the input, counting from the least significant bit
+// of in[0]; a value that crosses a word boundary is stitched from two words.
+// Returns in + 23.
+inline const uint32_t* unpack23_32(const uint32_t* in, uint32_t* out) {
+  const uint32_t kMask = (1U << 23) - 1U;
+
+  out[0] = in[0] & kMask;
+  out[1] = in[0] >> 23;
+  out[1] |= (in[1] << 9) & kMask;  // top 14 bits come from the next word
+  out[2] = in[1] >> 14;
+  out[2] |= (in[2] << 18) & kMask;
+  out[3] = (in[2] >> 5) & kMask;
+  out[4] = in[2] >> 28;
+  out[4] |= (in[3] << 4) & kMask;
+  out[5] = in[3] >> 19;
+  out[5] |= (in[4] << 13) & kMask;
+  out[6] = in[4] >> 10;
+  out[6] |= (in[5] << 22) & kMask;
+  out[7] = (in[5] >> 1) & kMask;
+  out[8] = in[5] >> 24;
+  out[8] |= (in[6] << 8) & kMask;
+  out[9] = in[6] >> 15;
+  out[9] |= (in[7] << 17) & kMask;
+  out[10] = (in[7] >> 6) & kMask;
+  out[11] = in[7] >> 29;
+  out[11] |= (in[8] << 3) & kMask;
+  out[12] = in[8] >> 20;
+  out[12] |= (in[9] << 12) & kMask;
+  out[13] = in[9] >> 11;
+  out[13] |= (in[10] << 21) & kMask;
+  out[14] = (in[10] >> 2) & kMask;
+  out[15] = in[10] >> 25;
+  out[15] |= (in[11] << 7) & kMask;
+  out[16] = in[11] >> 16;
+  out[16] |= (in[12] << 16) & kMask;
+  out[17] = (in[12] >> 7) & kMask;
+  out[18] = in[12] >> 30;
+  out[18] |= (in[13] << 2) & kMask;
+  out[19] = in[13] >> 21;
+  out[19] |= (in[14] << 11) & kMask;
+  out[20] = in[14] >> 12;
+  out[20] |= (in[15] << 20) & kMask;
+  out[21] = (in[15] >> 3) & kMask;
+  out[22] = in[15] >> 26;
+  out[22] |= (in[16] << 6) & kMask;
+  out[23] = in[16] >> 17;
+  out[23] |= (in[17] << 15) & kMask;
+  out[24] = (in[17] >> 8) & kMask;
+  out[25] = in[17] >> 31;
+  out[25] |= (in[18] << 1) & kMask;
+  out[26] = in[18] >> 22;
+  out[26] |= (in[19] << 10) & kMask;
+  out[27] = in[19] >> 13;
+  out[27] |= (in[20] << 19) & kMask;
+  out[28] = (in[20] >> 4) & kMask;
+  out[29] = in[20] >> 27;
+  out[29] |= (in[21] << 5) & kMask;
+  out[30] = in[21] >> 18;
+  out[30] |= (in[22] << 14) & kMask;
+  out[31] = in[22] >> 9;
+
+  return in + 23;
+}
+
+// Decodes 32 values of 24 bits each from the 24 consecutive words at `in`,
+// writing them to out[0..31] in order. Value k occupies bits
+// [24 * k, 24 * k + 24) of the input, counting from the least significant bit
+// of in[0]; a value that crosses a word boundary is stitched from two words.
+// The layout repeats every 4 values (3 words). Returns in + 24.
+inline const uint32_t* unpack24_32(const uint32_t* in, uint32_t* out) {
+  const uint32_t kMask = (1U << 24) - 1U;
+
+  // Values 0..3 live in in[0..2].
+  out[0] = in[0] & kMask;
+  out[1] = in[0] >> 24;
+  out[1] |= (in[1] << 8) & kMask;  // top 16 bits come from the next word
+  out[2] = in[1] >> 16;
+  out[2] |= (in[2] << 16) & kMask;
+  out[3] = in[2] >> 8;
+
+  // Values 4..7 live in in[3..5].
+  out[4] = in[3] & kMask;
+  out[5] = in[3] >> 24;
+  out[5] |= (in[4] << 8) & kMask;
+  out[6] = in[4] >> 16;
+  out[6] |= (in[5] << 16) & kMask;
+  out[7] = in[5] >> 8;
+
+  // Values 8..11 live in in[6..8].
+  out[8] = in[6] & kMask;
+  out[9] = in[6] >> 24;
+  out[9] |= (in[7] << 8) & kMask;
+  out[10] = in[7] >> 16;
+  out[10] |= (in[8] << 16) & kMask;
+  out[11] = in[8] >> 8;
+
+  // Values 12..15 live in in[9..11].
+  out[12] = in[9] & kMask;
+  out[13] = in[9] >> 24;
+  out[13] |= (in[10] << 8) & kMask;
+  out[14] = in[10] >> 16;
+  out[14] |= (in[11] << 16) & kMask;
+  out[15] = in[11] >> 8;
+
+  // Values 16..19 live in in[12..14].
+  out[16] = in[12] & kMask;
+  out[17] = in[12] >> 24;
+  out[17] |= (in[13] << 8) & kMask;
+  out[18] = in[13] >> 16;
+  out[18] |= (in[14] << 16) & kMask;
+  out[19] = in[14] >> 8;
+
+  // Values 20..23 live in in[15..17].
+  out[20] = in[15] & kMask;
+  out[21] = in[15] >> 24;
+  out[21] |= (in[16] << 8) & kMask;
+  out[22] = in[16] >> 16;
+  out[22] |= (in[17] << 16) & kMask;
+  out[23] = in[17] >> 8;
+
+  // Values 24..27 live in in[18..20].
+  out[24] = in[18] & kMask;
+  out[25] = in[18] >> 24;
+  out[25] |= (in[19] << 8) & kMask;
+  out[26] = in[19] >> 16;
+  out[26] |= (in[20] << 16) & kMask;
+  out[27] = in[20] >> 8;
+
+  // Values 28..31 live in in[21..23].
+  out[28] = in[21] & kMask;
+  out[29] = in[21] >> 24;
+  out[29] |= (in[22] << 8) & kMask;
+  out[30] = in[22] >> 16;
+  out[30] |= (in[23] << 16) & kMask;
+  out[31] = in[23] >> 8;
+
+  return in + 24;
+}
+
+// Decodes 32 values of 25 bits each from the 25 consecutive words at `in`,
+// writing them to out[0..31] in order. Value k occupies bits
+// [25 * k, 25 * k + 25) of the input, counting from the least significant bit
+// of in[0]; a value that crosses a word boundary is stitched from two words.
+// Returns in + 25.
+inline const uint32_t* unpack25_32(const uint32_t* in, uint32_t* out) {
+  const uint32_t kMask = (1U << 25) - 1U;
+
+  out[0] = in[0] & kMask;
+  out[1] = in[0] >> 25;
+  out[1] |= (in[1] << 7) & kMask;  // top 18 bits come from the next word
+  out[2] = in[1] >> 18;
+  out[2] |= (in[2] << 14) & kMask;
+  out[3] = in[2] >> 11;
+  out[3] |= (in[3] << 21) & kMask;
+  out[4] = (in[3] >> 4) & kMask;
+  out[5] = in[3] >> 29;
+  out[5] |= (in[4] << 3) & kMask;
+  out[6] = in[4] >> 22;
+  out[6] |= (in[5] << 10) & kMask;
+  out[7] = in[5] >> 15;
+  out[7] |= (in[6] << 17) & kMask;
+  out[8] = in[6] >> 8;
+  out[8] |= (in[7] << 24) & kMask;
+  out[9] = (in[7] >> 1) & kMask;
+  out[10] = in[7] >> 26;
+  out[10] |= (in[8] << 6) & kMask;
+  out[11] = in[8] >> 19;
+  out[11] |= (in[9] << 13) & kMask;
+  out[12] = in[9] >> 12;
+  out[12] |= (in[10] << 20) & kMask;
+  out[13] = (in[10] >> 5) & kMask;
+  out[14] = in[10] >> 30;
+  out[14] |= (in[11] << 2) & kMask;
+  out[15] = in[11] >> 23;
+  out[15] |= (in[12] << 9) & kMask;
+  out[16] = in[12] >> 16;
+  out[16] |= (in[13] << 16) & kMask;
+  out[17] = in[13] >> 9;
+  out[17] |= (in[14] << 23) & kMask;
+  out[18] = (in[14] >> 2) & kMask;
+  out[19] = in[14] >> 27;
+  out[19] |= (in[15] << 5) & kMask;
+  out[20] = in[15] >> 20;
+  out[20] |= (in[16] << 12) & kMask;
+  out[21] = in[16] >> 13;
+  out[21] |= (in[17] << 19) & kMask;
+  out[22] = (in[17] >> 6) & kMask;
+  out[23] = in[17] >> 31;
+  out[23] |= (in[18] << 1) & kMask;
+  out[24] = in[18] >> 24;
+  out[24] |= (in[19] << 8) & kMask;
+  out[25] = in[19] >> 17;
+  out[25] |= (in[20] << 15) & kMask;
+  out[26] = in[20] >> 10;
+  out[26] |= (in[21] << 22) & kMask;
+  out[27] = (in[21] >> 3) & kMask;
+  out[28] = in[21] >> 28;
+  out[28] |= (in[22] << 4) & kMask;
+  out[29] = in[22] >> 21;
+  out[29] |= (in[23] << 11) & kMask;
+  out[30] = in[23] >> 14;
+  out[30] |= (in[24] << 18) & kMask;
+  out[31] = in[24] >> 7;
+
+  return in + 25;
+}
+
+// Decodes 32 values of 26 bits each from the 26 consecutive words at `in`,
+// writing them to out[0..31] in order. Value k occupies bits
+// [26 * k, 26 * k + 26) of the input, counting from the least significant bit
+// of in[0]; a value that crosses a word boundary is stitched from two words.
+// The layout repeats every 16 values (13 words). Returns in + 26.
+inline const uint32_t* unpack26_32(const uint32_t* in, uint32_t* out) {
+  const uint32_t kMask = (1U << 26) - 1U;
+
+  // Values 0..15 live in in[0..12].
+  out[0] = in[0] & kMask;
+  out[1] = in[0] >> 26;
+  out[1] |= (in[1] << 6) & kMask;  // top 20 bits come from the next word
+  out[2] = in[1] >> 20;
+  out[2] |= (in[2] << 12) & kMask;
+  out[3] = in[2] >> 14;
+  out[3] |= (in[3] << 18) & kMask;
+  out[4] = in[3] >> 8;
+  out[4] |= (in[4] << 24) & kMask;
+  out[5] = (in[4] >> 2) & kMask;
+  out[6] = in[4] >> 28;
+  out[6] |= (in[5] << 4) & kMask;
+  out[7] = in[5] >> 22;
+  out[7] |= (in[6] << 10) & kMask;
+  out[8] = in[6] >> 16;
+  out[8] |= (in[7] << 16) & kMask;
+  out[9] = in[7] >> 10;
+  out[9] |= (in[8] << 22) & kMask;
+  out[10] = (in[8] >> 4) & kMask;
+  out[11] = in[8] >> 30;
+  out[11] |= (in[9] << 2) & kMask;
+  out[12] = in[9] >> 24;
+  out[12] |= (in[10] << 8) & kMask;
+  out[13] = in[10] >> 18;
+  out[13] |= (in[11] << 14) & kMask;
+  out[14] = in[11] >> 12;
+  out[14] |= (in[12] << 20) & kMask;
+  out[15] = in[12] >> 6;
+
+  // Values 16..31 live in in[13..25] with the same layout.
+  out[16] = in[13] & kMask;
+  out[17] = in[13] >> 26;
+  out[17] |= (in[14] << 6) & kMask;
+  out[18] = in[14] >> 20;
+  out[18] |= (in[15] << 12) & kMask;
+  out[19] = in[15] >> 14;
+  out[19] |= (in[16] << 18) & kMask;
+  out[20] = in[16] >> 8;
+  out[20] |= (in[17] << 24) & kMask;
+  out[21] = (in[17] >> 2) & kMask;
+  out[22] = in[17] >> 28;
+  out[22] |= (in[18] << 4) & kMask;
+  out[23] = in[18] >> 22;
+  out[23] |= (in[19] << 10) & kMask;
+  out[24] = in[19] >> 16;
+  out[24] |= (in[20] << 16) & kMask;
+  out[25] = in[20] >> 10;
+  out[25] |= (in[21] << 22) & kMask;
+  out[26] = (in[21] >> 4) & kMask;
+  out[27] = in[21] >> 30;
+  out[27] |= (in[22] << 2) & kMask;
+  out[28] = in[22] >> 24;
+  out[28] |= (in[23] << 8) & kMask;
+  out[29] = in[23] >> 18;
+  out[29] |= (in[24] << 14) & kMask;
+  out[30] = in[24] >> 12;
+  out[30] |= (in[25] << 20) & kMask;
+  out[31] = in[25] >> 6;
+
+  return in + 26;
+}
+
+// Decodes 32 values of 27 bits each from the 27 consecutive words at `in`,
+// writing them to out[0..31] in order. Value k occupies bits
+// [27 * k, 27 * k + 27) of the input, counting from the least significant bit
+// of in[0]; a value that crosses a word boundary is stitched from two words.
+// Returns in + 27.
+inline const uint32_t* unpack27_32(const uint32_t* in, uint32_t* out) {
+  const uint32_t kMask = (1U << 27) - 1U;
+
+  out[0] = in[0] & kMask;
+  out[1] = in[0] >> 27;
+  out[1] |= (in[1] << 5) & kMask;  // top 22 bits come from the next word
+  out[2] = in[1] >> 22;
+  out[2] |= (in[2] << 10) & kMask;
+  out[3] = in[2] >> 17;
+  out[3] |= (in[3] << 15) & kMask;
+  out[4] = in[3] >> 12;
+  out[4] |= (in[4] << 20) & kMask;
+  out[5] = in[4] >> 7;
+  out[5] |= (in[5] << 25) & kMask;
+  out[6] = (in[5] >> 2) & kMask;
+  out[7] = in[5] >> 29;
+  out[7] |= (in[6] << 3) & kMask;
+  out[8] = in[6] >> 24;
+  out[8] |= (in[7] << 8) & kMask;
+  out[9] = in[7] >> 19;
+  out[9] |= (in[8] << 13) & kMask;
+  out[10] = in[8] >> 14;
+  out[10] |= (in[9] << 18) & kMask;
+  out[11] = in[9] >> 9;
+  out[11] |= (in[10] << 23) & kMask;
+  out[12] = (in[10] >> 4) & kMask;
+  out[13] = in[10] >> 31;
+  out[13] |= (in[11] << 1) & kMask;
+  out[14] = in[11] >> 26;
+  out[14] |= (in[12] << 6) & kMask;
+  out[15] = in[12] >> 21;
+  out[15] |= (in[13] << 11) & kMask;
+  out[16] = in[13] >> 16;
+  out[16] |= (in[14] << 16) & kMask;
+  out[17] = in[14] >> 11;
+  out[17] |= (in[15] << 21) & kMask;
+  out[18] = in[15] >> 6;
+  out[18] |= (in[16] << 26) & kMask;
+  out[19] = (in[16] >> 1) & kMask;
+  out[20] = in[16] >> 28;
+  out[20] |= (in[17] << 4) & kMask;
+  out[21] = in[17] >> 23;
+  out[21] |= (in[18] << 9) & kMask;
+  out[22] = in[18] >> 18;
+  out[22] |= (in[19] << 14) & kMask;
+  out[23] = in[19] >> 13;
+  out[23] |= (in[20] << 19) & kMask;
+  out[24] = in[20] >> 8;
+  out[24] |= (in[21] << 24) & kMask;
+  out[25] = (in[21] >> 3) & kMask;
+  out[26] = in[21] >> 30;
+  out[26] |= (in[22] << 2) & kMask;
+  out[27] = in[22] >> 25;
+  out[27] |= (in[23] << 7) & kMask;
+  out[28] = in[23] >> 20;
+  out[28] |= (in[24] << 12) & kMask;
+  out[29] = in[24] >> 15;
+  out[29] |= (in[25] << 17) & kMask;
+  out[30] = in[25] >> 10;
+  out[30] |= (in[26] << 22) & kMask;
+  out[31] = in[26] >> 5;
+
+  return in + 27;
+}
+
+// Decodes 32 values of 28 bits each from the 28 consecutive words at `in`,
+// writing them to out[0..31] in order. Value k occupies bits
+// [28 * k, 28 * k + 28) of the input, counting from the least significant bit
+// of in[0]; a value that crosses a word boundary is stitched from two words.
+// The layout repeats every 8 values (7 words). Returns in + 28.
+inline const uint32_t* unpack28_32(const uint32_t* in, uint32_t* out) {
+  const uint32_t kMask = (1U << 28) - 1U;
+
+  // Values 0..7 live in in[0..6].
+  out[0] = in[0] & kMask;
+  out[1] = in[0] >> 28;
+  out[1] |= (in[1] << 4) & kMask;  // top 24 bits come from the next word
+  out[2] = in[1] >> 24;
+  out[2] |= (in[2] << 8) & kMask;
+  out[3] = in[2] >> 20;
+  out[3] |= (in[3] << 12) & kMask;
+  out[4] = in[3] >> 16;
+  out[4] |= (in[4] << 16) & kMask;
+  out[5] = in[4] >> 12;
+  out[5] |= (in[5] << 20) & kMask;
+  out[6] = in[5] >> 8;
+  out[6] |= (in[6] << 24) & kMask;
+  out[7] = in[6] >> 4;
+
+  // Values 8..15 live in in[7..13].
+  out[8] = in[7] & kMask;
+  out[9] = in[7] >> 28;
+  out[9] |= (in[8] << 4) & kMask;
+  out[10] = in[8] >> 24;
+  out[10] |= (in[9] << 8) & kMask;
+  out[11] = in[9] >> 20;
+  out[11] |= (in[10] << 12) & kMask;
+  out[12] = in[10] >> 16;
+  out[12] |= (in[11] << 16) & kMask;
+  out[13] = in[11] >> 12;
+  out[13] |= (in[12] << 20) & kMask;
+  out[14] = in[12] >> 8;
+  out[14] |= (in[13] << 24) & kMask;
+  out[15] = in[13] >> 4;
+
+  // Values 16..23 live in in[14..20].
+  out[16] = in[14] & kMask;
+  out[17] = in[14] >> 28;
+  out[17] |= (in[15] << 4) & kMask;
+  out[18] = in[15] >> 24;
+  out[18] |= (in[16] << 8) & kMask;
+  out[19] = in[16] >> 20;
+  out[19] |= (in[17] << 12) & kMask;
+  out[20] = in[17] >> 16;
+  out[20] |= (in[18] << 16) & kMask;
+  out[21] = in[18] >> 12;
+  out[21] |= (in[19] << 20) & kMask;
+  out[22] = in[19] >> 8;
+  out[22] |= (in[20] << 24) & kMask;
+  out[23] = in[20] >> 4;
+
+  // Values 24..31 live in in[21..27].
+  out[24] = in[21] & kMask;
+  out[25] = in[21] >> 28;
+  out[25] |= (in[22] << 4) & kMask;
+  out[26] = in[22] >> 24;
+  out[26] |= (in[23] << 8) & kMask;
+  out[27] = in[23] >> 20;
+  out[27] |= (in[24] << 12) & kMask;
+  out[28] = in[24] >> 16;
+  out[28] |= (in[25] << 16) & kMask;
+  out[29] = in[25] >> 12;
+  out[29] |= (in[26] << 20) & kMask;
+  out[30] = in[26] >> 8;
+  out[30] |= (in[27] << 24) & kMask;
+  out[31] = in[27] >> 4;
+
+  return in + 28;
+}
+
+// Decodes 32 values of 29 bits each from the 29 consecutive words at `in`,
+// writing them to out[0..31] in order. Value k occupies bits
+// [29 * k, 29 * k + 29) of the input, counting from the least significant bit
+// of in[0]; a value that crosses a word boundary is stitched from two words.
+// Returns in + 29.
+inline const uint32_t* unpack29_32(const uint32_t* in, uint32_t* out) {
+  const uint32_t kMask = (1U << 29) - 1U;
+
+  out[0] = in[0] & kMask;
+  out[1] = in[0] >> 29;
+  out[1] |= (in[1] << 3) & kMask;  // top 26 bits come from the next word
+  out[2] = in[1] >> 26;
+  out[2] |= (in[2] << 6) & kMask;
+  out[3] = in[2] >> 23;
+  out[3] |= (in[3] << 9) & kMask;
+  out[4] = in[3] >> 20;
+  out[4] |= (in[4] << 12) & kMask;
+  out[5] = in[4] >> 17;
+  out[5] |= (in[5] << 15) & kMask;
+  out[6] = in[5] >> 14;
+  out[6] |= (in[6] << 18) & kMask;
+  out[7] = in[6] >> 11;
+  out[7] |= (in[7] << 21) & kMask;
+  out[8] = in[7] >> 8;
+  out[8] |= (in[8] << 24) & kMask;
+  out[9] = in[8] >> 5;
+  out[9] |= (in[9] << 27) & kMask;
+  out[10] = (in[9] >> 2) & kMask;
+  out[11] = in[9] >> 31;
+  out[11] |= (in[10] << 1) & kMask;
+  out[12] = in[10] >> 28;
+  out[12] |= (in[11] << 4) & kMask;
+  out[13] = in[11] >> 25;
+  out[13] |= (in[12] << 7) & kMask;
+  out[14] = in[12] >> 22;
+  out[14] |= (in[13] << 10) & kMask;
+  out[15] = in[13] >> 19;
+  out[15] |= (in[14] << 13) & kMask;
+  out[16] = in[14] >> 16;
+  out[16] |= (in[15] << 16) & kMask;
+  out[17] = in[15] >> 13;
+  out[17] |= (in[16] << 19) & kMask;
+  out[18] = in[16] >> 10;
+  out[18] |= (in[17] << 22) & kMask;
+  out[19] = in[17] >> 7;
+  out[19] |= (in[18] << 25) & kMask;
+  out[20] = in[18] >> 4;
+  out[20] |= (in[19] << 28) & kMask;
+  out[21] = (in[19] >> 1) & kMask;
+  out[22] = in[19] >> 30;
+  out[22] |= (in[20] << 2) & kMask;
+  out[23] = in[20] >> 27;
+  out[23] |= (in[21] << 5) & kMask;
+  out[24] = in[21] >> 24;
+  out[24] |= (in[22] << 8) & kMask;
+  out[25] = in[22] >> 21;
+  out[25] |= (in[23] << 11) & kMask;
+  out[26] = in[23] >> 18;
+  out[26] |= (in[24] << 14) & kMask;
+  out[27] = in[24] >> 15;
+  out[27] |= (in[25] << 17) & kMask;
+  out[28] = in[25] >> 12;
+  out[28] |= (in[26] << 20) & kMask;
+  out[29] = in[26] >> 9;
+  out[29] |= (in[27] << 23) & kMask;
+  out[30] = in[27] >> 6;
+  out[30] |= (in[28] << 26) & kMask;
+  out[31] = in[28] >> 3;
+
+  return in + 29;
+}
+
+// Decodes 32 values of 30 bits each from the 30 consecutive words at `in`,
+// writing them to out[0..31] in order. Value k occupies bits
+// [30 * k, 30 * k + 30) of the input, counting from the least significant bit
+// of in[0]; a value that crosses a word boundary is stitched from two words.
+// The layout repeats every 16 values (15 words). Returns in + 30.
+inline const uint32_t* unpack30_32(const uint32_t* in, uint32_t* out) {
+  const uint32_t kMask = (1U << 30) - 1U;
+
+  // Values 0..15 live in in[0..14].
+  out[0] = in[0] & kMask;
+  out[1] = in[0] >> 30;
+  out[1] |= (in[1] << 2) & kMask;  // top 28 bits come from the next word
+  out[2] = in[1] >> 28;
+  out[2] |= (in[2] << 4) & kMask;
+  out[3] = in[2] >> 26;
+  out[3] |= (in[3] << 6) & kMask;
+  out[4] = in[3] >> 24;
+  out[4] |= (in[4] << 8) & kMask;
+  out[5] = in[4] >> 22;
+  out[5] |= (in[5] << 10) & kMask;
+  out[6] = in[5] >> 20;
+  out[6] |= (in[6] << 12) & kMask;
+  out[7] = in[6] >> 18;
+  out[7] |= (in[7] << 14) & kMask;
+  out[8] = in[7] >> 16;
+  out[8] |= (in[8] << 16) & kMask;
+  out[9] = in[8] >> 14;
+  out[9] |= (in[9] << 18) & kMask;
+  out[10] = in[9] >> 12;
+  out[10] |= (in[10] << 20) & kMask;
+  out[11] = in[10] >> 10;
+  out[11] |= (in[11] << 22) & kMask;
+  out[12] = in[11] >> 8;
+  out[12] |= (in[12] << 24) & kMask;
+  out[13] = in[12] >> 6;
+  out[13] |= (in[13] << 26) & kMask;
+  out[14] = in[13] >> 4;
+  out[14] |= (in[14] << 28) & kMask;
+  out[15] = in[14] >> 2;
+
+  // Values 16..31 live in in[15..29] with the same layout.
+  out[16] = in[15] & kMask;
+  out[17] = in[15] >> 30;
+  out[17] |= (in[16] << 2) & kMask;
+  out[18] = in[16] >> 28;
+  out[18] |= (in[17] << 4) & kMask;
+  out[19] = in[17] >> 26;
+  out[19] |= (in[18] << 6) & kMask;
+  out[20] = in[18] >> 24;
+  out[20] |= (in[19] << 8) & kMask;
+  out[21] = in[19] >> 22;
+  out[21] |= (in[20] << 10) & kMask;
+  out[22] = in[20] >> 20;
+  out[22] |= (in[21] << 12) & kMask;
+  out[23] = in[21] >> 18;
+  out[23] |= (in[22] << 14) & kMask;
+  out[24] = in[22] >> 16;
+  out[24] |= (in[23] << 16) & kMask;
+  out[25] = in[23] >> 14;
+  out[25] |= (in[24] << 18) & kMask;
+  out[26] = in[24] >> 12;
+  out[26] |= (in[25] << 20) & kMask;
+  out[27] = in[25] >> 10;
+  out[27] |= (in[26] << 22) & kMask;
+  out[28] = in[26] >> 8;
+  out[28] |= (in[27] << 24) & kMask;
+  out[29] = in[27] >> 6;
+  out[29] |= (in[28] << 26) & kMask;
+  out[30] = in[28] >> 4;
+  out[30] |= (in[29] << 28) & kMask;
+  out[31] = in[29] >> 2;
+
+  return in + 30;
+}
+
+inline const uint32_t* unpack31_32(const uint32_t* in, uint32_t* out) {
+  *out = *in & ((1U << 31) - 1);  // value 0: low 31 bits of word 0
+  ++out;
+  *out = *in >> 31;  // value 1: top bit of word 0 ...
+  ++in;
+  *out |= (*in & ((1U << 30) - 1)) << (31 - 30);  // ... plus low 30 bits of word 1
+  ++out;
+  *out = *in >> 30;
+  ++in;
+  *out |= (*in & ((1U << 29) - 1)) << (31 - 29);
+  ++out;
+  *out = *in >> 29;
+  ++in;
+  *out |= (*in & ((1U << 28) - 1)) << (31 - 28);
+  ++out;
+  *out = *in >> 28;
+  ++in;
+  *out |= (*in & ((1U << 27) - 1)) << (31 - 27);
+  ++out;
+  *out = *in >> 27;
+  ++in;
+  *out |= (*in & ((1U << 26) - 1)) << (31 - 26);
+  ++out;
+  *out = *in >> 26;
+  ++in;
+  *out |= (*in & ((1U << 25) - 1)) << (31 - 25);
+  ++out;
+  *out = *in >> 25;
+  ++in;
+  *out |= (*in & ((1U << 24) - 1)) << (31 - 24);
+  ++out;
+  *out = *in >> 24;
+  ++in;
+  *out |= (*in & ((1U << 23) - 1)) << (31 - 23);
+  ++out;
+  *out = *in >> 23;
+  ++in;
+  *out |= (*in & ((1U << 22) - 1)) << (31 - 22);
+  ++out;
+  *out = *in >> 22;
+  ++in;
+  *out |= (*in & ((1U << 21) - 1)) << (31 - 21);
+  ++out;
+  *out = *in >> 21;
+  ++in;
+  *out |= (*in & ((1U << 20) - 1)) << (31 - 20);
+  ++out;
+  *out = *in >> 20;
+  ++in;
+  *out |= (*in & ((1U << 19) - 1)) << (31 - 19);
+  ++out;
+  *out = *in >> 19;
+  ++in;
+  *out |= (*in & ((1U << 18) - 1)) << (31 - 18);
+  ++out;
+  *out = *in >> 18;
+  ++in;
+  *out |= (*in & ((1U << 17) - 1)) << (31 - 17);
+  ++out;
+  *out = *in >> 17;
+  ++in;
+  *out |= (*in & ((1U << 16) - 1)) << (31 - 16);
+  ++out;
+  *out = *in >> 16;
+  ++in;
+  *out |= (*in & ((1U << 15) - 1)) << (31 - 15);
+  ++out;
+  *out = *in >> 15;
+  ++in;
+  *out |= (*in & ((1U << 14) - 1)) << (31 - 14);
+  ++out;
+  *out = *in >> 14;
+  ++in;
+  *out |= (*in & ((1U << 13) - 1)) << (31 - 13);
+  ++out;
+  *out = *in >> 13;
+  ++in;
+  *out |= (*in & ((1U << 12) - 1)) << (31 - 12);
+  ++out;
+  *out = *in >> 12;
+  ++in;
+  *out |= (*in & ((1U << 11) - 1)) << (31 - 11);
+  ++out;
+  *out = *in >> 11;
+  ++in;
+  *out |= (*in & ((1U << 10) - 1)) << (31 - 10);
+  ++out;
+  *out = *in >> 10;
+  ++in;
+  *out |= (*in & ((1U << 9) - 1)) << (31 - 9);
+  ++out;
+  *out = *in >> 9;
+  ++in;
+  *out |= (*in & ((1U << 8) - 1)) << (31 - 8);
+  ++out;
+  *out = *in >> 8;
+  ++in;
+  *out |= (*in & ((1U << 7) - 1)) << (31 - 7);
+  ++out;
+  *out = *in >> 7;
+  ++in;
+  *out |= (*in & ((1U << 6) - 1)) << (31 - 6);
+  ++out;
+  *out = *in >> 6;
+  ++in;
+  *out |= (*in & ((1U << 5) - 1)) << (31 - 5);
+  ++out;
+  *out = *in >> 5;
+  ++in;
+  *out |= (*in & ((1U << 4) - 1)) << (31 - 4);
+  ++out;
+  *out = *in >> 4;
+  ++in;
+  *out |= (*in & ((1U << 3) - 1)) << (31 - 3);
+  ++out;
+  *out = *in >> 3;
+  ++in;
+  *out |= (*in & ((1U << 2) - 1)) << (31 - 2);
+  ++out;
+  *out = *in >> 2;
+  ++in;
+  *out |= (*in & ((1U << 1) - 1)) << (31 - 1);
+  ++out;
+  *out = *in >> 1;  // value 31: the remaining top 31 bits of word 30
+  ++in;
+  ++out;
+  // 32 values x 31 bits = 31 input words consumed; return the first unread word.
+  return in;
+}
+
+inline const uint32_t* unpack32_32(const uint32_t* in, uint32_t* out) {
+  *out = *in;  // full-width values: each output is one whole input word
+  ++in;
+  ++out;
+  *out = *in;
+  ++in;
+  ++out;
+  *out = *in;
+  ++in;
+  ++out;
+  *out = *in;
+  ++in;
+  ++out;
+  *out = *in;
+  ++in;
+  ++out;
+  *out = *in;
+  ++in;
+  ++out;
+  *out = *in;
+  ++in;
+  ++out;
+  *out = *in;
+  ++in;
+  ++out;
+  *out = *in;
+  ++in;
+  ++out;
+  *out = *in;
+  ++in;
+  ++out;
+  *out = *in;
+  ++in;
+  ++out;
+  *out = *in;
+  ++in;
+  ++out;
+  *out = *in;
+  ++in;
+  ++out;
+  *out = *in;
+  ++in;
+  ++out;
+  *out = *in;
+  ++in;
+  ++out;
+  *out = *in;
+  ++in;
+  ++out;
+  *out = *in;
+  ++in;
+  ++out;
+  *out = *in;
+  ++in;
+  ++out;
+  *out = *in;
+  ++in;
+  ++out;
+  *out = *in;
+  ++in;
+  ++out;
+  *out = *in;
+  ++in;
+  ++out;
+  *out = *in;
+  ++in;
+  ++out;
+  *out = *in;
+  ++in;
+  ++out;
+  *out = *in;
+  ++in;
+  ++out;
+  *out = *in;
+  ++in;
+  ++out;
+  *out = *in;
+  ++in;
+  ++out;
+  *out = *in;
+  ++in;
+  ++out;
+  *out = *in;
+  ++in;
+  ++out;
+  *out = *in;
+  ++in;
+  ++out;
+  *out = *in;
+  ++in;
+  ++out;
+  *out = *in;
+  ++in;
+  ++out;
+  *out = *in;
+  ++in;
+  ++out;
+  // 32 words copied verbatim; return the first unread word.
+  return in;
+}
+
+inline const uint32_t* nullunpacker32(const uint32_t* in, uint32_t* out) {
+  // A bit width of zero stores no payload: emit 32 zeros and consume no input.
+  uint32_t* const stop = out + 32;
+  while (out != stop) *out++ = 0;
+  return in;
+}
+
+inline int unpack32(const uint32_t* in, uint32_t* out, int batch_size, int num_bits) {
+  // Round down to a multiple of 32: that many values are unpacked and returned.
+  batch_size -= batch_size % 32;
+
+  switch (num_bits) {
+    case 0:
+      for (int base = 0; base < batch_size; base += 32)
+        in = nullunpacker32(in, out + base);
+      break;
+    case 1:
+      for (int base = 0; base < batch_size; base += 32)
+        in = unpack1_32(in, out + base);
+      break;
+    case 2:
+      for (int base = 0; base < batch_size; base += 32)
+        in = unpack2_32(in, out + base);
+      break;
+    case 3:
+      for (int base = 0; base < batch_size; base += 32)
+        in = unpack3_32(in, out + base);
+      break;
+    case 4:
+      for (int base = 0; base < batch_size; base += 32)
+        in = unpack4_32(in, out + base);
+      break;
+    case 5:
+      for (int base = 0; base < batch_size; base += 32)
+        in = unpack5_32(in, out + base);
+      break;
+    case 6:
+      for (int base = 0; base < batch_size; base += 32)
+        in = unpack6_32(in, out + base);
+      break;
+    case 7:
+      for (int base = 0; base < batch_size; base += 32)
+        in = unpack7_32(in, out + base);
+      break;
+    case 8:
+      for (int base = 0; base < batch_size; base += 32)
+        in = unpack8_32(in, out + base);
+      break;
+    case 9:
+      for (int base = 0; base < batch_size; base += 32)
+        in = unpack9_32(in, out + base);
+      break;
+    case 10:
+      for (int base = 0; base < batch_size; base += 32)
+        in = unpack10_32(in, out + base);
+      break;
+    case 11:
+      for (int base = 0; base < batch_size; base += 32)
+        in = unpack11_32(in, out + base);
+      break;
+    case 12:
+      for (int base = 0; base < batch_size; base += 32)
+        in = unpack12_32(in, out + base);
+      break;
+    case 13:
+      for (int base = 0; base < batch_size; base += 32)
+        in = unpack13_32(in, out + base);
+      break;
+    case 14:
+      for (int base = 0; base < batch_size; base += 32)
+        in = unpack14_32(in, out + base);
+      break;
+    case 15:
+      for (int base = 0; base < batch_size; base += 32)
+        in = unpack15_32(in, out + base);
+      break;
+    case 16:
+      for (int base = 0; base < batch_size; base += 32)
+        in = unpack16_32(in, out + base);
+      break;
+    case 17:
+      for (int base = 0; base < batch_size; base += 32)
+        in = unpack17_32(in, out + base);
+      break;
+    case 18:
+      for (int base = 0; base < batch_size; base += 32)
+        in = unpack18_32(in, out + base);
+      break;
+    case 19:
+      for (int base = 0; base < batch_size; base += 32)
+        in = unpack19_32(in, out + base);
+      break;
+    case 20:
+      for (int base = 0; base < batch_size; base += 32)
+        in = unpack20_32(in, out + base);
+      break;
+    case 21:
+      for (int base = 0; base < batch_size; base += 32)
+        in = unpack21_32(in, out + base);
+      break;
+    case 22:
+      for (int base = 0; base < batch_size; base += 32)
+        in = unpack22_32(in, out + base);
+      break;
+    case 23:
+      for (int base = 0; base < batch_size; base += 32)
+        in = unpack23_32(in, out + base);
+      break;
+    case 24:
+      for (int base = 0; base < batch_size; base += 32)
+        in = unpack24_32(in, out + base);
+      break;
+    case 25:
+      for (int base = 0; base < batch_size; base += 32)
+        in = unpack25_32(in, out + base);
+      break;
+    case 26:
+      for (int base = 0; base < batch_size; base += 32)
+        in = unpack26_32(in, out + base);
+      break;
+    case 27:
+      for (int base = 0; base < batch_size; base += 32)
+        in = unpack27_32(in, out + base);
+      break;
+    case 28:
+      for (int base = 0; base < batch_size; base += 32)
+        in = unpack28_32(in, out + base);
+      break;
+    case 29:
+      for (int base = 0; base < batch_size; base += 32)
+        in = unpack29_32(in, out + base);
+      break;
+    case 30:
+      for (int base = 0; base < batch_size; base += 32)
+        in = unpack30_32(in, out + base);
+      break;
+    case 31:
+      for (int base = 0; base < batch_size; base += 32)
+        in = unpack31_32(in, out + base);
+      break;
+    case 32:
+      for (int base = 0; base < batch_size; base += 32)
+        in = unpack32_32(in, out + base);
+      break;
+    default:  // num_bits outside [0, 32]
+      throw std::runtime_error("Unsupported num_bits");
+  }
+
+  return batch_size;
+}
+
+};  // namespace parquet
+
+#endif  // PARQUET_UTIL_BPACKING_H

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/38f0ffd5/src/parquet/util/buffer.h
----------------------------------------------------------------------
diff --git a/src/parquet/util/buffer.h b/src/parquet/util/buffer.h
index de64265..58a5f5e 100644
--- a/src/parquet/util/buffer.h
+++ b/src/parquet/util/buffer.h
@@ -133,7 +133,7 @@ class Vector {
   void Reserve(int64_t new_capacity);
   void Assign(int64_t size, const T val);
   void Swap(Vector<T>& v);
-  inline T& operator[](int64_t i) { return data_[i]; }
+  inline T& operator[](int64_t i) const { return data_[i]; }
 
  private:
   std::unique_ptr<OwnedMutableBuffer> buffer_;

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/38f0ffd5/src/parquet/util/rle-encoding.h
----------------------------------------------------------------------
diff --git a/src/parquet/util/rle-encoding.h b/src/parquet/util/rle-encoding.h
index b7fa83a..b0fb8d1 100644
--- a/src/parquet/util/rle-encoding.h
+++ b/src/parquet/util/rle-encoding.h
@@ -26,6 +26,7 @@
 #include "parquet/util/compiler-util.h"
 #include "parquet/util/bit-stream-utils.inline.h"
 #include "parquet/util/bit-util.h"
+#include "parquet/util/buffer.h"
 
 namespace parquet {
 
@@ -114,6 +115,10 @@ class RleDecoder {
   template <typename T>
   int GetBatch(T* values, int batch_size);
 
+  /// Like GetBatch but the values are then decoded using the provided dictionary
+  template <typename T>
+  int GetBatchWithDict(const Vector<T>& dictionary, T* values, int batch_size);
+
  protected:
   BitReader bit_reader_;
   /// Number of bits needed to encode the value. Must be between 0 and 64.
@@ -253,22 +258,7 @@ class RleEncoder {
 
 template <typename T>
 inline bool RleDecoder::Get(T* val) {
-  DCHECK_GE(bit_width_, 0);
-  if (UNLIKELY(literal_count_ == 0 && repeat_count_ == 0)) {
-    if (!NextCounts<T>()) return false;
-  }
-
-  if (LIKELY(repeat_count_ > 0)) {
-    *val = current_value_;
-    --repeat_count_;
-  } else {
-    DCHECK_GT(literal_count_, 0);
-    bool result = bit_reader_.GetValue(bit_width_, val);
-    DCHECK(result);
-    --literal_count_;
-  }
-
-  return true;
+  return GetBatch(val, 1) == 1;
 }
 
 template <typename T>
@@ -277,27 +267,59 @@ inline int RleDecoder::GetBatch(T* values, int batch_size) {
   int values_read = 0;
 
   while (values_read < batch_size) {
-    if (UNLIKELY(literal_count_ == 0 && repeat_count_ == 0)) {
-      if (!NextCounts<T>()) return values_read;
-    }
-
-    if (LIKELY(repeat_count_ > 0)) {
+    if (repeat_count_ > 0) {
       int repeat_batch =
           std::min(batch_size - values_read, static_cast<int>(repeat_count_));
       std::fill(
           values + values_read, values + values_read + repeat_batch, current_value_);
       repeat_count_ -= repeat_batch;
       values_read += repeat_batch;
+    } else if (literal_count_ > 0) {
+      int literal_batch =
+          std::min(batch_size - values_read, static_cast<int>(literal_count_));
+      int actual_read =
+          bit_reader_.GetBatch(bit_width_, values + values_read, literal_batch);
+      DCHECK_EQ(actual_read, literal_batch);
+      literal_count_ -= literal_batch;
+      values_read += literal_batch;
     } else {
-      DCHECK_GT(literal_count_, 0);
+      if (!NextCounts<T>()) return values_read;
+    }
+  }
+
+  return values_read;
+}
+
+template <typename T>
+inline int RleDecoder::GetBatchWithDict(
+    const Vector<T>& dictionary, T* values, int batch_size) {
+  DCHECK_GE(bit_width_, 0);
+  int values_read = 0;
+
+  while (values_read < batch_size) {
+    if (repeat_count_ > 0) {
+      int repeat_batch =
+          std::min(batch_size - values_read, static_cast<int>(repeat_count_));
+      std::fill(values + values_read, values + values_read + repeat_batch,
+          dictionary[current_value_]);
+      repeat_count_ -= repeat_batch;
+      values_read += repeat_batch;
+    } else if (literal_count_ > 0) {
       int literal_batch =
           std::min(batch_size - values_read, static_cast<int>(literal_count_));
-      for (int i = 0; i < literal_batch; i++) {
-        bool result = bit_reader_.GetValue(bit_width_, values + values_read + i);
-        DCHECK(result);
+
+      const int buffer_size = 1024;
+      static int indices[buffer_size];
+      literal_batch = std::min(literal_batch, buffer_size);
+      int actual_read = bit_reader_.GetBatch(bit_width_, &indices[0], literal_batch);
+      DCHECK_EQ(actual_read, literal_batch);
+      for (int i = 0; i < literal_batch; ++i) {
+        values[values_read + i] = dictionary[indices[i]];
       }
       literal_count_ -= literal_batch;
       values_read += literal_batch;
+    } else {
+      if (!NextCounts<T>()) return values_read;
     }
   }
 


Mime
View raw message