orc-dev mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From moresandeep <...@git.apache.org>
Subject [GitHub] orc pull request #208: ORC-250 - Create sha256 mask
Date Wed, 17 Jan 2018 21:07:36 GMT
Github user moresandeep commented on a diff in the pull request:

    https://github.com/apache/orc/pull/208#discussion_r162178962
  
    --- Diff: java/core/src/java/org/apache/orc/impl/mask/SHA256MaskFactory.java ---
    @@ -0,0 +1,290 @@
    +/*
    + * Licensed to the Apache Software Foundation (ASF) under one
    + * or more contributor license agreements.  See the NOTICE file
    + * distributed with this work for additional information
    + * regarding copyright ownership.  The ASF licenses this file
    + * to you under the Apache License, Version 2.0 (the
    + * "License"); you may not use this file except in compliance
    + * with the License.  You may obtain a copy of the License at
    + *
    + *     http://www.apache.org/licenses/LICENSE-2.0
    + *
    + * Unless required by applicable law or agreed to in writing, software
    + * distributed under the License is distributed on an "AS IS" BASIS,
    + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    + * See the License for the specific language governing permissions and
    + * limitations under the License.
    + */
    +package org.apache.orc.impl.mask;
    +
    +import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
    +import org.apache.hadoop.hive.ql.exec.vector.ColumnVector;
    +import org.apache.orc.DataMask;
    +import org.apache.orc.TypeDescription;
    +
    +import javax.xml.bind.DatatypeConverter;
    +import java.nio.ByteBuffer;
    +import java.nio.charset.StandardCharsets;
    +import java.security.MessageDigest;
    +import java.security.NoSuchAlgorithmException;
    +import java.util.Arrays;
    +
    +/**
    + * Masking strategy that masks String, Varchar, Char and Binary types
    + * as SHA 256 hash.
    + * <p>
    + * <b>For String type:</b><br/>
    + * Strings of any length will be converted to a 64-character SHA256 hex hash.<br/><br/>
    + * <p>
    + * <b>For Varchar type:</b><br/>
    + * For Varchar type, the max-length property will be honored, i.e.
    + * if max-length is less than the SHA256 hash length (64) then the hash will be
    + * truncated to max-length. If max-length is 64 or greater then the output is
    + * the SHA256 hash length, which is 64.<br/><br/>
    + * <p>
    + * <b>For Char type:</b><br/>
    + * For Char type, the length of mask will always be equal to specified max-length.
    + * If the given length (max-length) is less than SHA256 hash length (64)
    + * the mask will be truncated.
    + * If the given length (max-length) is greater than SHA256 hash length (64)
    + * then the mask will be padded by blank spaces.<br/><br/>
    + * <p>
    + * <b>For Binary type:</b><br/>
    + * All Binary type of any length will be converted to 64 length SHA256 hash.<br/>
    + */
    +public class SHA256MaskFactory extends MaskFactory {
    +
    +  final MessageDigest md;
    +
    +  public SHA256MaskFactory(final String... params) { // params is not read anywhere in this constructor body
    +    super();
    +    try {
    +      md = MessageDigest.getInstance("SHA-256"); // digest instance kept in the final field md
    +    } catch (NoSuchAlgorithmException e) {
    +      throw new RuntimeException(e); // rethrown unchecked, with the original exception as the cause
    +    }
    +  }
    +
    +  /**
    +   * Mask a string by replacing it with the hex-encoded SHA-256 hash
    +   * of the source value.
    +   *
    +   * @param source the source column vector
    +   * @param row    the value index
    +   * @param target the target column vector
    +   * @param schema schema
    +   */
    +  void maskString(final BytesColumnVector source, final int row,
    +      final BytesColumnVector target, final TypeDescription schema) {
    +    final ByteBuffer sourceBytes = ByteBuffer
    +        .wrap(source.vector[row], source.start[row], source.length[row]);
    +
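    +    // take the SHA-256 hash and convert it to HEX
    +    // NOTE(review): ByteBuffer.array() returns the whole backing array, so the
    +    // digest below covers all of source.vector[row], not only the
    +    // start[row]/length[row] slice that was wrapped -- confirm and fix.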
    +    byte[] hash = DatatypeConverter
    +        .printHexBinary(md.digest(sourceBytes.array()))
    +        .getBytes(StandardCharsets.UTF_8);
    +    int targetLength = hash.length;
    +
    +    /* For type varchar */
    +    if (schema.getCategory() == TypeDescription.Category.VARCHAR) {
    +
    +      /* truncate the hash if max length for varchar is less than hash length;
    +       * on the other hand, if the max length is more than hash length (64 bytes)
    +       * we use the hash length (64 bytes) always.
    +       */
    +      if (schema.getMaxLength() < hash.length) {
    +        targetLength = schema.getMaxLength();
    +      }
    +
    +    }
    +
    +    /* For type char */
    +    if (schema.getCategory() == TypeDescription.Category.CHAR) {
    --- End diff --
    
    Sure, I'll use a Switch statement here.


---

Mime
View raw message