hive-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From omal...@apache.org
Subject [5/5] hive git commit: HIVE-12055. Move WriterImpl over to orc module.
Date Mon, 14 Dec 2015 21:36:18 GMT
HIVE-12055. Move WriterImpl over to orc module.


Project: http://git-wip-us.apache.org/repos/asf/hive/repo
Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/06e39ebe
Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/06e39ebe
Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/06e39ebe

Branch: refs/heads/master
Commit: 06e39ebe07d7854df669529e73f1c461f3c7d9d4
Parents: 49dc645
Author: Owen O'Malley <omalley@apache.org>
Authored: Mon Dec 14 13:35:39 2015 -0800
Committer: Owen O'Malley <omalley@apache.org>
Committed: Mon Dec 14 13:35:39 2015 -0800

----------------------------------------------------------------------
 .../apache/hive/common/util/BloomFilter.java    |  309 --
 .../org/apache/hive/common/util/Murmur3.java    |  335 --
 .../apache/hive/common/util/TestMurmur3.java    |  224 --
 orc/src/java/org/apache/orc/BloomFilterIO.java  |   43 +
 orc/src/java/org/apache/orc/OrcFile.java        |   22 +
 .../java/org/apache/orc/TypeDescription.java    |   26 +-
 .../java/org/apache/orc/impl/WriterImpl.java    | 2912 +++++++++++++++
 .../hive/ql/io/filters/BloomFilterIO.java       |   44 -
 .../apache/hadoop/hive/ql/io/orc/FileDump.java  |    2 +-
 .../hadoop/hive/ql/io/orc/JsonFileDump.java     |    2 +-
 .../apache/hadoop/hive/ql/io/orc/OrcFile.java   |   30 +-
 .../hadoop/hive/ql/io/orc/ReaderImpl.java       |   15 +
 .../hadoop/hive/ql/io/orc/RecordReaderImpl.java |    2 +-
 .../apache/hadoop/hive/ql/io/orc/Writer.java    |    2 +-
 .../hadoop/hive/ql/io/orc/WriterImpl.java       | 3394 ++----------------
 .../hadoop/hive/ql/util/JavaDataModel.java      |  335 --
 .../hadoop/hive/ql/io/orc/TestFileDump.java     |   25 +-
 .../hive/ql/io/orc/TestNewIntegerEncoding.java  |    2 +-
 .../hadoop/hive/ql/io/orc/TestOrcFile.java      |    9 +-
 .../hive/ql/io/orc/TestOrcRawRecordMerger.java  |   12 +-
 .../hive/ql/io/orc/TestRecordReaderImpl.java    |    2 +-
 .../resources/orc-file-dump-bloomfilter.out     |    2 +-
 .../resources/orc-file-dump-bloomfilter2.out    |    2 +-
 .../orc-file-dump-dictionary-threshold.out      |    2 +-
 ql/src/test/resources/orc-file-dump.json        |    2 +-
 ql/src/test/resources/orc-file-dump.out         |    2 +-
 ql/src/test/resources/orc-file-has-null.out     |    2 +-
 .../results/clientpositive/orc_file_dump.q.out  |    6 +-
 .../results/clientpositive/orc_merge10.q.out    |    4 +-
 .../results/clientpositive/orc_merge11.q.out    |    6 +-
 .../clientpositive/tez/orc_merge10.q.out        |    4 +-
 .../clientpositive/tez/orc_merge11.q.out        |    6 +-
 .../hadoop/hive/ql/util/JavaDataModel.java      |  335 ++
 .../apache/hive/common/util/BloomFilter.java    |  309 ++
 .../org/apache/hive/common/util/Murmur3.java    |  335 ++
 .../apache/hive/common/util/TestMurmur3.java    |  224 ++
 36 files changed, 4489 insertions(+), 4499 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/hive/blob/06e39ebe/common/src/java/org/apache/hive/common/util/BloomFilter.java
----------------------------------------------------------------------
diff --git a/common/src/java/org/apache/hive/common/util/BloomFilter.java b/common/src/java/org/apache/hive/common/util/BloomFilter.java
deleted file mode 100644
index bb0b8f2..0000000
--- a/common/src/java/org/apache/hive/common/util/BloomFilter.java
+++ /dev/null
@@ -1,309 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.hive.common.util;
-
-import java.util.Arrays;
-import java.util.List;
-
-import static com.google.common.base.Preconditions.checkArgument;
-
-/**
- * BloomFilter is a probabilistic data structure for set membership check. BloomFilters are
- * highly space efficient when compared to using a HashSet. Because of the probabilistic nature of
- * bloom filter false positive (element not present in bloom filter but test() says true) are
- * possible but false negatives are not possible (if element is present then test() will never
- * say false). The false positive probability is configurable (default: 5%) depending on which
- * storage requirement may increase or decrease. Lower the false positive probability greater
- * is the space requirement.
- * Bloom filters are sensitive to number of elements that will be inserted in the bloom filter.
- * During the creation of bloom filter expected number of entries must be specified. If the number
- * of insertions exceed the specified initial number of entries then false positive probability will
- * increase accordingly.
- *
- * Internally, this implementation of bloom filter uses Murmur3 fast non-cryptographic hash
- * algorithm. Although Murmur2 is slightly faster than Murmur3 in Java, it suffers from hash
- * collisions for specific sequence of repeating bytes. Check the following link for more info
- * https://code.google.com/p/smhasher/wiki/MurmurHash2Flaw
- */
-public class BloomFilter {
-  public static final double DEFAULT_FPP = 0.05;
-  protected BitSet bitSet;
-  protected int numBits;
-  protected int numHashFunctions;
-
-  public BloomFilter() {
-  }
-
-  public BloomFilter(long expectedEntries) {
-    this(expectedEntries, DEFAULT_FPP);
-  }
-
-  public BloomFilter(long expectedEntries, double fpp) {
-    checkArgument(expectedEntries > 0, "expectedEntries should be > 0");
-    checkArgument(fpp > 0.0 && fpp < 1.0, "False positive probability should be > 0.0 & < 1.0");
-    int nb = optimalNumOfBits(expectedEntries, fpp);
-    // make 'm' multiple of 64
-    this.numBits = nb + (Long.SIZE - (nb % Long.SIZE));
-    this.numHashFunctions = optimalNumOfHashFunctions(expectedEntries, numBits);
-    this.bitSet = new BitSet(numBits);
-  }
-
-  /**
-   * A constructor to support rebuilding the BloomFilter from a serialized representation.
-   * @param bits
-   * @param numBits
-   * @param numFuncs
-   */
-  public BloomFilter(List<Long> bits, int numBits, int numFuncs) {
-    super();
-    long[] copied = new long[bits.size()];
-    for (int i = 0; i < bits.size(); i++) copied[i] = bits.get(i);
-    bitSet = new BitSet(copied);
-    this.numBits = numBits;
-    numHashFunctions = numFuncs;
-  }
-
-  static int optimalNumOfHashFunctions(long n, long m) {
-    return Math.max(1, (int) Math.round((double) m / n * Math.log(2)));
-  }
-
-  static int optimalNumOfBits(long n, double p) {
-    return (int) (-n * Math.log(p) / (Math.log(2) * Math.log(2)));
-  }
-
-  public void add(byte[] val) {
-    if (val == null) {
-      addBytes(val, -1, -1);
-    } else {
-      addBytes(val, 0, val.length);
-    }
-  }
-
-  public void addBytes(byte[] val, int offset, int length) {
-    // We use the trick mentioned in "Less Hashing, Same Performance: Building a Better Bloom Filter"
-    // by Kirsch et.al. From abstract 'only two hash functions are necessary to effectively
-    // implement a Bloom filter without any loss in the asymptotic false positive probability'
-
-    // Lets split up 64-bit hashcode into two 32-bit hash codes and employ the technique mentioned
-    // in the above paper
-    long hash64 = val == null ? Murmur3.NULL_HASHCODE :
-        Murmur3.hash64(val, offset, length);
-    addHash(hash64);
-  }
-
-  private void addHash(long hash64) {
-    int hash1 = (int) hash64;
-    int hash2 = (int) (hash64 >>> 32);
-
-    for (int i = 1; i <= numHashFunctions; i++) {
-      int combinedHash = hash1 + (i * hash2);
-      // hashcode should be positive, flip all the bits if it's negative
-      if (combinedHash < 0) {
-        combinedHash = ~combinedHash;
-      }
-      int pos = combinedHash % numBits;
-      bitSet.set(pos);
-    }
-  }
-
-  public void addString(String val) {
-    if (val == null) {
-      add(null);
-    } else {
-      add(val.getBytes());
-    }
-  }
-
-  public void addLong(long val) {
-    addHash(getLongHash(val));
-  }
-
-  public void addDouble(double val) {
-    addLong(Double.doubleToLongBits(val));
-  }
-
-  public boolean test(byte[] val) {
-    if (val == null) {
-      return testBytes(val, -1, -1);
-    }
-    return testBytes(val, 0, val.length);
-  }
-
-  public boolean testBytes(byte[] val, int offset, int length) {
-    long hash64 = val == null ? Murmur3.NULL_HASHCODE :
-        Murmur3.hash64(val, offset, length);
-    return testHash(hash64);
-  }
-
-  private boolean testHash(long hash64) {
-    int hash1 = (int) hash64;
-    int hash2 = (int) (hash64 >>> 32);
-
-    for (int i = 1; i <= numHashFunctions; i++) {
-      int combinedHash = hash1 + (i * hash2);
-      // hashcode should be positive, flip all the bits if it's negative
-      if (combinedHash < 0) {
-        combinedHash = ~combinedHash;
-      }
-      int pos = combinedHash % numBits;
-      if (!bitSet.get(pos)) {
-        return false;
-      }
-    }
-    return true;
-  }
-
-  public boolean testString(String val) {
-    if (val == null) {
-      return test(null);
-    } else {
-      return test(val.getBytes());
-    }
-  }
-
-  public boolean testLong(long val) {
-    return testHash(getLongHash(val));
-  }
-
-  // Thomas Wang's integer hash function
-  // http://web.archive.org/web/20071223173210/http://www.concentric.net/~Ttwang/tech/inthash.htm
-  private long getLongHash(long key) {
-    key = (~key) + (key << 21); // key = (key << 21) - key - 1;
-    key = key ^ (key >> 24);
-    key = (key + (key << 3)) + (key << 8); // key * 265
-    key = key ^ (key >> 14);
-    key = (key + (key << 2)) + (key << 4); // key * 21
-    key = key ^ (key >> 28);
-    key = key + (key << 31);
-    return key;
-  }
-
-  public boolean testDouble(double val) {
-    return testLong(Double.doubleToLongBits(val));
-  }
-
-  public long sizeInBytes() {
-    return getBitSize() / 8;
-  }
-
-  public int getBitSize() {
-    return bitSet.getData().length * Long.SIZE;
-  }
-
-  public int getNumHashFunctions() {
-    return numHashFunctions;
-  }
-
-  public long[] getBitSet() {
-    return bitSet.getData();
-  }
-
-  @Override
-  public String toString() {
-    return "m: " + numBits + " k: " + numHashFunctions;
-  }
-
-  /**
-   * Merge the specified bloom filter with current bloom filter.
-   *
-   * @param that - bloom filter to merge
-   */
-  public void merge(BloomFilter that) {
-    if (this != that && this.numBits == that.numBits && this.numHashFunctions == that.numHashFunctions) {
-      this.bitSet.putAll(that.bitSet);
-    } else {
-      throw new IllegalArgumentException("BloomFilters are not compatible for merging." +
-          " this - " + this.toString() + " that - " + that.toString());
-    }
-  }
-
-  public void reset() {
-    this.bitSet.clear();
-  }
-
-  /**
-   * Bare metal bit set implementation. For performance reasons, this implementation does not check
-   * for index bounds nor expand the bit set size if the specified index is greater than the size.
-   */
-  public class BitSet {
-    private final long[] data;
-
-    public BitSet(long bits) {
-      this(new long[(int) Math.ceil((double) bits / (double) Long.SIZE)]);
-    }
-
-    /**
-     * Deserialize long array as bit set.
-     *
-     * @param data - bit array
-     */
-    public BitSet(long[] data) {
-      assert data.length > 0 : "data length is zero!";
-      this.data = data;
-    }
-
-    /**
-     * Sets the bit at specified index.
-     *
-     * @param index - position
-     */
-    public void set(int index) {
-      data[index >>> 6] |= (1L << index);
-    }
-
-    /**
-     * Returns true if the bit is set in the specified index.
-     *
-     * @param index - position
-     * @return - value at the bit position
-     */
-    public boolean get(int index) {
-      return (data[index >>> 6] & (1L << index)) != 0;
-    }
-
-    /**
-     * Number of bits
-     */
-    public long bitSize() {
-      return (long) data.length * Long.SIZE;
-    }
-
-    public long[] getData() {
-      return data;
-    }
-
-    /**
-     * Combines the two BitArrays using bitwise OR.
-     */
-    public void putAll(BitSet array) {
-      assert data.length == array.data.length :
-          "BitArrays must be of equal length (" + data.length + "!= " + array.data.length + ")";
-      for (int i = 0; i < data.length; i++) {
-        data[i] |= array.data[i];
-      }
-    }
-
-    /**
-     * Clear the bit set.
-     */
-    public void clear() {
-      Arrays.fill(data, 0);
-    }
-  }
-}

http://git-wip-us.apache.org/repos/asf/hive/blob/06e39ebe/common/src/java/org/apache/hive/common/util/Murmur3.java
----------------------------------------------------------------------
diff --git a/common/src/java/org/apache/hive/common/util/Murmur3.java b/common/src/java/org/apache/hive/common/util/Murmur3.java
deleted file mode 100644
index 88c3514..0000000
--- a/common/src/java/org/apache/hive/common/util/Murmur3.java
+++ /dev/null
@@ -1,335 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.hive.common.util;
-
-/**
- * Murmur3 is successor to Murmur2 fast non-crytographic hash algorithms.
- *
- * Murmur3 32 and 128 bit variants.
- * 32-bit Java port of https://code.google.com/p/smhasher/source/browse/trunk/MurmurHash3.cpp#94
- * 128-bit Java port of https://code.google.com/p/smhasher/source/browse/trunk/MurmurHash3.cpp#255
- *
- * This is a public domain code with no copyrights.
- * From homepage of MurmurHash (https://code.google.com/p/smhasher/),
- * "All MurmurHash versions are public domain software, and the author disclaims all copyright
- * to their code."
- */
-public class Murmur3 {
-  // from 64-bit linear congruential generator
-  public static final long NULL_HASHCODE = 2862933555777941757L;
-
-  // Constants for 32 bit variant
-  private static final int C1_32 = 0xcc9e2d51;
-  private static final int C2_32 = 0x1b873593;
-  private static final int R1_32 = 15;
-  private static final int R2_32 = 13;
-  private static final int M_32 = 5;
-  private static final int N_32 = 0xe6546b64;
-
-  // Constants for 128 bit variant
-  private static final long C1 = 0x87c37b91114253d5L;
-  private static final long C2 = 0x4cf5ad432745937fL;
-  private static final int R1 = 31;
-  private static final int R2 = 27;
-  private static final int R3 = 33;
-  private static final int M = 5;
-  private static final int N1 = 0x52dce729;
-  private static final int N2 = 0x38495ab5;
-
-  private static final int DEFAULT_SEED = 104729;
-
-  /**
-   * Murmur3 32-bit variant.
-   *
-   * @param data - input byte array
-   * @return - hashcode
-   */
-  public static int hash32(byte[] data) {
-    return hash32(data, data.length, DEFAULT_SEED);
-  }
-
-  /**
-   * Murmur3 32-bit variant.
-   *
-   * @param data   - input byte array
-   * @param length - length of array
-   * @param seed   - seed. (default 0)
-   * @return - hashcode
-   */
-  public static int hash32(byte[] data, int length, int seed) {
-    int hash = seed;
-    final int nblocks = length >> 2;
-
-    // body
-    for (int i = 0; i < nblocks; i++) {
-      int i_4 = i << 2;
-      int k = (data[i_4] & 0xff)
-          | ((data[i_4 + 1] & 0xff) << 8)
-          | ((data[i_4 + 2] & 0xff) << 16)
-          | ((data[i_4 + 3] & 0xff) << 24);
-
-      // mix functions
-      k *= C1_32;
-      k = Integer.rotateLeft(k, R1_32);
-      k *= C2_32;
-      hash ^= k;
-      hash = Integer.rotateLeft(hash, R2_32) * M_32 + N_32;
-    }
-
-    // tail
-    int idx = nblocks << 2;
-    int k1 = 0;
-    switch (length - idx) {
-      case 3:
-        k1 ^= data[idx + 2] << 16;
-      case 2:
-        k1 ^= data[idx + 1] << 8;
-      case 1:
-        k1 ^= data[idx];
-
-        // mix functions
-        k1 *= C1_32;
-        k1 = Integer.rotateLeft(k1, R1_32);
-        k1 *= C2_32;
-        hash ^= k1;
-    }
-
-    // finalization
-    hash ^= length;
-    hash ^= (hash >>> 16);
-    hash *= 0x85ebca6b;
-    hash ^= (hash >>> 13);
-    hash *= 0xc2b2ae35;
-    hash ^= (hash >>> 16);
-
-    return hash;
-  }
-
-  /**
-   * Murmur3 64-bit variant. This is essentially MSB 8 bytes of Murmur3 128-bit variant.
-   *
-   * @param data - input byte array
-   * @return - hashcode
-   */
-  public static long hash64(byte[] data) {
-    return hash64(data, 0, data.length, DEFAULT_SEED);
-  }
-
-  public static long hash64(byte[] data, int offset, int length) {
-    return hash64(data, offset, length, DEFAULT_SEED);
-  }
-
-  /**
-   * Murmur3 64-bit variant. This is essentially MSB 8 bytes of Murmur3 128-bit variant.
-   *
-   * @param data   - input byte array
-   * @param length - length of array
-   * @param seed   - seed. (default is 0)
-   * @return - hashcode
-   */
-  public static long hash64(byte[] data, int offset, int length, int seed) {
-    long hash = seed;
-    final int nblocks = length >> 3;
-
-    // body
-    for (int i = 0; i < nblocks; i++) {
-      final int i8 = i << 3;
-      long k = ((long) data[offset + i8] & 0xff)
-          | (((long) data[offset + i8 + 1] & 0xff) << 8)
-          | (((long) data[offset + i8 + 2] & 0xff) << 16)
-          | (((long) data[offset + i8 + 3] & 0xff) << 24)
-          | (((long) data[offset + i8 + 4] & 0xff) << 32)
-          | (((long) data[offset + i8 + 5] & 0xff) << 40)
-          | (((long) data[offset + i8 + 6] & 0xff) << 48)
-          | (((long) data[offset + i8 + 7] & 0xff) << 56);
-
-      // mix functions
-      k *= C1;
-      k = Long.rotateLeft(k, R1);
-      k *= C2;
-      hash ^= k;
-      hash = Long.rotateLeft(hash, R2) * M + N1;
-    }
-
-    // tail
-    long k1 = 0;
-    int tailStart = nblocks << 3;
-    switch (length - tailStart) {
-      case 7:
-        k1 ^= ((long) data[offset + tailStart + 6] & 0xff) << 48;
-      case 6:
-        k1 ^= ((long) data[offset + tailStart + 5] & 0xff) << 40;
-      case 5:
-        k1 ^= ((long) data[offset + tailStart + 4] & 0xff) << 32;
-      case 4:
-        k1 ^= ((long) data[offset + tailStart + 3] & 0xff) << 24;
-      case 3:
-        k1 ^= ((long) data[offset + tailStart + 2] & 0xff) << 16;
-      case 2:
-        k1 ^= ((long) data[offset + tailStart + 1] & 0xff) << 8;
-      case 1:
-        k1 ^= ((long) data[offset + tailStart] & 0xff);
-        k1 *= C1;
-        k1 = Long.rotateLeft(k1, R1);
-        k1 *= C2;
-        hash ^= k1;
-    }
-
-    // finalization
-    hash ^= length;
-    hash = fmix64(hash);
-
-    return hash;
-  }
-
-  /**
-   * Murmur3 128-bit variant.
-   *
-   * @param data - input byte array
-   * @return - hashcode (2 longs)
-   */
-  public static long[] hash128(byte[] data) {
-    return hash128(data, 0, data.length, DEFAULT_SEED);
-  }
-
-  /**
-   * Murmur3 128-bit variant.
-   *
-   * @param data   - input byte array
-   * @param offset - the first element of array
-   * @param length - length of array
-   * @param seed   - seed. (default is 0)
-   * @return - hashcode (2 longs)
-   */
-  public static long[] hash128(byte[] data, int offset, int length, int seed) {
-    long h1 = seed;
-    long h2 = seed;
-    final int nblocks = length >> 4;
-
-    // body
-    for (int i = 0; i < nblocks; i++) {
-      final int i16 = i << 4;
-      long k1 = ((long) data[offset + i16] & 0xff)
-          | (((long) data[offset + i16 + 1] & 0xff) << 8)
-          | (((long) data[offset + i16 + 2] & 0xff) << 16)
-          | (((long) data[offset + i16 + 3] & 0xff) << 24)
-          | (((long) data[offset + i16 + 4] & 0xff) << 32)
-          | (((long) data[offset + i16 + 5] & 0xff) << 40)
-          | (((long) data[offset + i16 + 6] & 0xff) << 48)
-          | (((long) data[offset + i16 + 7] & 0xff) << 56);
-
-      long k2 = ((long) data[offset + i16 + 8] & 0xff)
-          | (((long) data[offset + i16 + 9] & 0xff) << 8)
-          | (((long) data[offset + i16 + 10] & 0xff) << 16)
-          | (((long) data[offset + i16 + 11] & 0xff) << 24)
-          | (((long) data[offset + i16 + 12] & 0xff) << 32)
-          | (((long) data[offset + i16 + 13] & 0xff) << 40)
-          | (((long) data[offset + i16 + 14] & 0xff) << 48)
-          | (((long) data[offset + i16 + 15] & 0xff) << 56);
-
-      // mix functions for k1
-      k1 *= C1;
-      k1 = Long.rotateLeft(k1, R1);
-      k1 *= C2;
-      h1 ^= k1;
-      h1 = Long.rotateLeft(h1, R2);
-      h1 += h2;
-      h1 = h1 * M + N1;
-
-      // mix functions for k2
-      k2 *= C2;
-      k2 = Long.rotateLeft(k2, R3);
-      k2 *= C1;
-      h2 ^= k2;
-      h2 = Long.rotateLeft(h2, R1);
-      h2 += h1;
-      h2 = h2 * M + N2;
-    }
-
-    // tail
-    long k1 = 0;
-    long k2 = 0;
-    int tailStart = nblocks << 4;
-    switch (length - tailStart) {
-      case 15:
-        k2 ^= (long) (data[offset + tailStart + 14] & 0xff) << 48;
-      case 14:
-        k2 ^= (long) (data[offset + tailStart + 13] & 0xff) << 40;
-      case 13:
-        k2 ^= (long) (data[offset + tailStart + 12] & 0xff) << 32;
-      case 12:
-        k2 ^= (long) (data[offset + tailStart + 11] & 0xff) << 24;
-      case 11:
-        k2 ^= (long) (data[offset + tailStart + 10] & 0xff) << 16;
-      case 10:
-        k2 ^= (long) (data[offset + tailStart + 9] & 0xff) << 8;
-      case 9:
-        k2 ^= (long) (data[offset + tailStart + 8] & 0xff);
-        k2 *= C2;
-        k2 = Long.rotateLeft(k2, R3);
-        k2 *= C1;
-        h2 ^= k2;
-
-      case 8:
-        k1 ^= (long) (data[offset + tailStart + 7] & 0xff) << 56;
-      case 7:
-        k1 ^= (long) (data[offset + tailStart + 6] & 0xff) << 48;
-      case 6:
-        k1 ^= (long) (data[offset + tailStart + 5] & 0xff) << 40;
-      case 5:
-        k1 ^= (long) (data[offset + tailStart + 4] & 0xff) << 32;
-      case 4:
-        k1 ^= (long) (data[offset + tailStart + 3] & 0xff) << 24;
-      case 3:
-        k1 ^= (long) (data[offset + tailStart + 2] & 0xff) << 16;
-      case 2:
-        k1 ^= (long) (data[offset + tailStart + 1] & 0xff) << 8;
-      case 1:
-        k1 ^= (long) (data[offset + tailStart] & 0xff);
-        k1 *= C1;
-        k1 = Long.rotateLeft(k1, R1);
-        k1 *= C2;
-        h1 ^= k1;
-    }
-
-    // finalization
-    h1 ^= length;
-    h2 ^= length;
-
-    h1 += h2;
-    h2 += h1;
-
-    h1 = fmix64(h1);
-    h2 = fmix64(h2);
-
-    h1 += h2;
-    h2 += h1;
-
-    return new long[]{h1, h2};
-  }
-
-  private static long fmix64(long h) {
-    h ^= (h >>> 33);
-    h *= 0xff51afd7ed558ccdL;
-    h ^= (h >>> 33);
-    h *= 0xc4ceb9fe1a85ec53L;
-    h ^= (h >>> 33);
-    return h;
-  }
-}

http://git-wip-us.apache.org/repos/asf/hive/blob/06e39ebe/common/src/test/org/apache/hive/common/util/TestMurmur3.java
----------------------------------------------------------------------
diff --git a/common/src/test/org/apache/hive/common/util/TestMurmur3.java b/common/src/test/org/apache/hive/common/util/TestMurmur3.java
deleted file mode 100644
index 5facc7c..0000000
--- a/common/src/test/org/apache/hive/common/util/TestMurmur3.java
+++ /dev/null
@@ -1,224 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.hive.common.util;
-
-import static org.junit.Assert.assertEquals;
-
-import com.google.common.hash.HashFunction;
-import com.google.common.hash.Hashing;
-
-import org.junit.Test;
-
-import java.nio.ByteBuffer;
-import java.nio.ByteOrder;
-import java.util.Arrays;
-import java.util.Random;
-
-/**
- * Tests for Murmur3 variants.
- */
-public class TestMurmur3 {
-
-  @Test
-  public void testHashCodesM3_32_string() {
-    String key = "test";
-    int seed = 123;
-    HashFunction hf = Hashing.murmur3_32(seed);
-    int hc1 = hf.hashBytes(key.getBytes()).asInt();
-    int hc2 = Murmur3.hash32(key.getBytes(), key.getBytes().length, seed);
-    assertEquals(hc1, hc2);
-
-    key = "testkey";
-    hc1 = hf.hashBytes(key.getBytes()).asInt();
-    hc2 = Murmur3.hash32(key.getBytes(), key.getBytes().length, seed);
-    assertEquals(hc1, hc2);
-  }
-
-  @Test
-  public void testHashCodesM3_32_ints() {
-    int seed = 123;
-    Random rand = new Random(seed);
-    HashFunction hf = Hashing.murmur3_32(seed);
-    for (int i = 0; i < 1000; i++) {
-      int val = rand.nextInt();
-      byte[] data = ByteBuffer.allocate(4).putInt(val).array();
-      int hc1 = hf.hashBytes(data).asInt();
-      int hc2 = Murmur3.hash32(data, data.length, seed);
-      assertEquals(hc1, hc2);
-    }
-  }
-
-  @Test
-  public void testHashCodesM3_32_longs() {
-    int seed = 123;
-    Random rand = new Random(seed);
-    HashFunction hf = Hashing.murmur3_32(seed);
-    for (int i = 0; i < 1000; i++) {
-      long val = rand.nextLong();
-      byte[] data = ByteBuffer.allocate(8).putLong(val).array();
-      int hc1 = hf.hashBytes(data).asInt();
-      int hc2 = Murmur3.hash32(data, data.length, seed);
-      assertEquals(hc1, hc2);
-    }
-  }
-
-  @Test
-  public void testHashCodesM3_32_double() {
-    int seed = 123;
-    Random rand = new Random(seed);
-    HashFunction hf = Hashing.murmur3_32(seed);
-    for (int i = 0; i < 1000; i++) {
-      double val = rand.nextDouble();
-      byte[] data = ByteBuffer.allocate(8).putDouble(val).array();
-      int hc1 = hf.hashBytes(data).asInt();
-      int hc2 = Murmur3.hash32(data, data.length, seed);
-      assertEquals(hc1, hc2);
-    }
-  }
-
-  @Test
-  public void testHashCodesM3_128_string() {
-    String key = "test";
-    int seed = 123;
-    HashFunction hf = Hashing.murmur3_128(seed);
-    // guava stores the hashcodes in little endian order
-    ByteBuffer buf = ByteBuffer.allocate(16).order(ByteOrder.LITTLE_ENDIAN);
-    buf.put(hf.hashBytes(key.getBytes()).asBytes());
-    buf.flip();
-    long gl1 = buf.getLong();
-    long gl2 = buf.getLong(8);
-    long[] hc = Murmur3.hash128(key.getBytes(), 0, key.getBytes().length, seed);
-    long m1 = hc[0];
-    long m2 = hc[1];
-    assertEquals(gl1, m1);
-    assertEquals(gl2, m2);
-
-    key = "testkey128_testkey128";
-    buf = ByteBuffer.allocate(16).order(ByteOrder.LITTLE_ENDIAN);
-    buf.put(hf.hashBytes(key.getBytes()).asBytes());
-    buf.flip();
-    gl1 = buf.getLong();
-    gl2 = buf.getLong(8);
-    byte[] keyBytes = key.getBytes();
-    hc = Murmur3.hash128(keyBytes, 0, keyBytes.length, seed);
-    m1 = hc[0];
-    m2 = hc[1];
-    assertEquals(gl1, m1);
-    assertEquals(gl2, m2);
-
-    byte[] offsetKeyBytes = new byte[keyBytes.length + 35];
-    Arrays.fill(offsetKeyBytes, (byte) -1);
-    System.arraycopy(keyBytes, 0, offsetKeyBytes, 35, keyBytes.length);
-    hc = Murmur3.hash128(offsetKeyBytes, 35, keyBytes.length, seed);
-    assertEquals(gl1, hc[0]);
-    assertEquals(gl2, hc[1]);
-  }
-
-  @Test
-  public void testHashCodeM3_64() {
-    byte[] origin = ("It was the best of times, it was the worst of times," +
-        " it was the age of wisdom, it was the age of foolishness," +
-        " it was the epoch of belief, it was the epoch of incredulity," +
-        " it was the season of Light, it was the season of Darkness," +
-        " it was the spring of hope, it was the winter of despair," +
-        " we had everything before us, we had nothing before us," +
-        " we were all going direct to Heaven," +
-        " we were all going direct the other way.").getBytes();
-    long hash = Murmur3.hash64(origin, 0, origin.length);
-    assertEquals(305830725663368540L, hash);
-
-    byte[] originOffset = new byte[origin.length + 150];
-    Arrays.fill(originOffset, (byte) 123);
-    System.arraycopy(origin, 0, originOffset, 150, origin.length);
-    hash = Murmur3.hash64(originOffset, 150, origin.length);
-    assertEquals(305830725663368540L, hash);
-  }
-
-  @Test
-  public void testHashCodesM3_128_ints() {
-    int seed = 123;
-    Random rand = new Random(seed);
-    HashFunction hf = Hashing.murmur3_128(seed);
-    for (int i = 0; i < 1000; i++) {
-      int val = rand.nextInt();
-      byte[] data = ByteBuffer.allocate(4).putInt(val).array();
-      // guava stores the hashcodes in little endian order
-      ByteBuffer buf = ByteBuffer.allocate(16).order(ByteOrder.LITTLE_ENDIAN);
-      buf.put(hf.hashBytes(data).asBytes());
-      buf.flip();
-      long gl1 = buf.getLong();
-      long gl2 = buf.getLong(8);
-      long[] hc = Murmur3.hash128(data, 0, data.length, seed);
-      long m1 = hc[0];
-      long m2 = hc[1];
-      assertEquals(gl1, m1);
-      assertEquals(gl2, m2);
-
-      byte[] offsetData = new byte[data.length + 50];
-      System.arraycopy(data, 0, offsetData, 50, data.length);
-      hc = Murmur3.hash128(offsetData, 50, data.length, seed);
-      assertEquals(gl1, hc[0]);
-      assertEquals(gl2, hc[1]);
-    }
-  }
-
-  @Test
-  public void testHashCodesM3_128_longs() {
-    int seed = 123;
-    Random rand = new Random(seed);
-    HashFunction hf = Hashing.murmur3_128(seed);
-    for (int i = 0; i < 1000; i++) {
-      long val = rand.nextLong();
-      byte[] data = ByteBuffer.allocate(8).putLong(val).array();
-      // guava stores the hashcodes in little endian order
-      ByteBuffer buf = ByteBuffer.allocate(16).order(ByteOrder.LITTLE_ENDIAN);
-      buf.put(hf.hashBytes(data).asBytes());
-      buf.flip();
-      long gl1 = buf.getLong();
-      long gl2 = buf.getLong(8);
-      long[] hc = Murmur3.hash128(data, 0, data.length, seed);
-      long m1 = hc[0];
-      long m2 = hc[1];
-      assertEquals(gl1, m1);
-      assertEquals(gl2, m2);
-    }
-  }
-
-  @Test
-  public void testHashCodesM3_128_double() {
-    int seed = 123;
-    Random rand = new Random(seed);
-    HashFunction hf = Hashing.murmur3_128(seed);
-    for (int i = 0; i < 1000; i++) {
-      double val = rand.nextDouble();
-      byte[] data = ByteBuffer.allocate(8).putDouble(val).array();
-      // guava stores the hashcodes in little endian order
-      ByteBuffer buf = ByteBuffer.allocate(16).order(ByteOrder.LITTLE_ENDIAN);
-      buf.put(hf.hashBytes(data).asBytes());
-      buf.flip();
-      long gl1 = buf.getLong();
-      long gl2 = buf.getLong(8);
-      long[] hc = Murmur3.hash128(data, 0, data.length, seed);
-      long m1 = hc[0];
-      long m2 = hc[1];
-      assertEquals(gl1, m1);
-      assertEquals(gl2, m2);
-    }
-  }
-}

http://git-wip-us.apache.org/repos/asf/hive/blob/06e39ebe/orc/src/java/org/apache/orc/BloomFilterIO.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/orc/BloomFilterIO.java b/orc/src/java/org/apache/orc/BloomFilterIO.java
new file mode 100644
index 0000000..1406266
--- /dev/null
+++ b/orc/src/java/org/apache/orc/BloomFilterIO.java
@@ -0,0 +1,43 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.orc;
+
+import org.apache.hive.common.util.BloomFilter;
+
+import com.google.common.primitives.Longs;
+
+public class BloomFilterIO extends BloomFilter {
+
+  public BloomFilterIO(long expectedEntries) {
+    super(expectedEntries, DEFAULT_FPP);
+  }
+
+  public BloomFilterIO(long expectedEntries, double fpp) {
+    super(expectedEntries, fpp);
+  }
+
+/**
+ * Initializes the BloomFilter from the given Orc BloomFilter
+ */
+  public BloomFilterIO(OrcProto.BloomFilter bloomFilter) {
+    this.bitSet = new BitSet(Longs.toArray(bloomFilter.getBitsetList()));
+    this.numHashFunctions = bloomFilter.getNumHashFunctions();
+    this.numBits = (int) this.bitSet.bitSize();
+  }
+}

http://git-wip-us.apache.org/repos/asf/hive/blob/06e39ebe/orc/src/java/org/apache/orc/OrcFile.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/orc/OrcFile.java b/orc/src/java/org/apache/orc/OrcFile.java
index 9ea0b52..98226f9 100644
--- a/orc/src/java/org/apache/orc/OrcFile.java
+++ b/orc/src/java/org/apache/orc/OrcFile.java
@@ -23,7 +23,9 @@ import java.util.Properties;
 
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
 import org.apache.orc.impl.MemoryManager;
+import org.apache.orc.impl.WriterImpl;
 
 /**
  * Contains factory methods to read or write ORC files.
@@ -102,6 +104,8 @@ public class OrcFile {
     ORIGINAL(0),
     HIVE_8732(1), // corrupted stripe/file maximum column statistics
     HIVE_4243(2), // use real column names from Hive tables
+    HIVE_12055(3), // vectorized writer
+
     // Don't use any magic numbers here except for the below:
     FUTURE(Integer.MAX_VALUE); // a version from a future writer
 
@@ -138,6 +142,7 @@ public class OrcFile {
       return values[val];
     }
   }
+  public static final WriterVersion CURRENT_WRITER = WriterVersion.HIVE_12055;
 
   public enum EncodingStrategy {
     SPEED, COMPRESSION
@@ -511,4 +516,21 @@ public class OrcFile {
     return memoryManager.get();
   }
 
+  /**
+   * Create an ORC file writer. This is the public interface for creating
+   * writers going forward and new options will only be added to this method.
+   * @param path filename to write to
+   * @param opts the options
+   * @return a new ORC file writer
+   * @throws IOException
+   */
+  public static Writer createWriter(Path path,
+                                    WriterOptions opts
+                                    ) throws IOException {
+    FileSystem fs = opts.getFileSystem() == null ?
+        path.getFileSystem(opts.getConfiguration()) : opts.getFileSystem();
+
+    return new WriterImpl(fs, path, opts);
+  }
+
 }

http://git-wip-us.apache.org/repos/asf/hive/blob/06e39ebe/orc/src/java/org/apache/orc/TypeDescription.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/orc/TypeDescription.java b/orc/src/java/org/apache/orc/TypeDescription.java
index fc945e4..f97a113 100644
--- a/orc/src/java/org/apache/orc/TypeDescription.java
+++ b/orc/src/java/org/apache/orc/TypeDescription.java
@@ -275,7 +275,7 @@ public class TypeDescription {
     return maxId;
   }
 
-  private ColumnVector createColumn() {
+  private ColumnVector createColumn(int maxSize) {
     switch (category) {
       case BOOLEAN:
       case BYTE:
@@ -298,7 +298,7 @@ public class TypeDescription {
       case STRUCT: {
         ColumnVector[] fieldVector = new ColumnVector[children.size()];
         for(int i=0; i < fieldVector.length; ++i) {
-          fieldVector[i] = children.get(i).createColumn();
+          fieldVector[i] = children.get(i).createColumn(maxSize);
         }
         return new StructColumnVector(VectorizedRowBatch.DEFAULT_SIZE,
                 fieldVector);
@@ -306,38 +306,42 @@ public class TypeDescription {
       case UNION: {
         ColumnVector[] fieldVector = new ColumnVector[children.size()];
         for(int i=0; i < fieldVector.length; ++i) {
-          fieldVector[i] = children.get(i).createColumn();
+          fieldVector[i] = children.get(i).createColumn(maxSize);
         }
         return new UnionColumnVector(VectorizedRowBatch.DEFAULT_SIZE,
             fieldVector);
       }
       case LIST:
         return new ListColumnVector(VectorizedRowBatch.DEFAULT_SIZE,
-            children.get(0).createColumn());
+            children.get(0).createColumn(maxSize));
       case MAP:
         return new MapColumnVector(VectorizedRowBatch.DEFAULT_SIZE,
-            children.get(0).createColumn(), children.get(1).createColumn());
+            children.get(0).createColumn(maxSize),
+            children.get(1).createColumn(maxSize));
       default:
         throw new IllegalArgumentException("Unknown type " + category);
     }
   }
 
-  public VectorizedRowBatch createRowBatch() {
+  public VectorizedRowBatch createRowBatch(int maxSize) {
     VectorizedRowBatch result;
     if (category == Category.STRUCT) {
-      result = new VectorizedRowBatch(children.size(),
-          VectorizedRowBatch.DEFAULT_SIZE);
+      result = new VectorizedRowBatch(children.size(), maxSize);
       for(int i=0; i < result.cols.length; ++i) {
-        result.cols[i] = children.get(i).createColumn();
+        result.cols[i] = children.get(i).createColumn(maxSize);
       }
     } else {
-      result = new VectorizedRowBatch(1, VectorizedRowBatch.DEFAULT_SIZE);
-      result.cols[0] = createColumn();
+      result = new VectorizedRowBatch(1, maxSize);
+      result.cols[0] = createColumn(maxSize);
     }
     result.reset();
     return result;
   }
 
+  public VectorizedRowBatch createRowBatch() {
+    return createRowBatch(VectorizedRowBatch.DEFAULT_SIZE);
+  }
+
   /**
    * Get the kind of this type.
    * @return get the category for this type.


Mime
View raw message