crunch-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From jwi...@apache.org
Subject [1/2] CRUNCH-450: Adding crunch-hive module w/ORC file support. Contributed by Zhong Wang.
Date Tue, 12 Aug 2014 16:26:30 GMT
Repository: crunch
Updated Branches:
  refs/heads/master dee0fcf51 -> 363c8243b


http://git-wip-us.apache.org/repos/asf/crunch/blob/363c8243/crunch-hive/src/test/java/org/apache/crunch/test/orc/pojos/AddressBook.java
----------------------------------------------------------------------
diff --git a/crunch-hive/src/test/java/org/apache/crunch/test/orc/pojos/AddressBook.java b/crunch-hive/src/test/java/org/apache/crunch/test/orc/pojos/AddressBook.java
new file mode 100644
index 0000000..1a52273
--- /dev/null
+++ b/crunch-hive/src/test/java/org/apache/crunch/test/orc/pojos/AddressBook.java
@@ -0,0 +1,141 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.test.orc.pojos;
+
+import java.sql.Timestamp;
+import java.util.Arrays;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
+import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils;
+
+public class AddressBook {
+  
+  public static final String TYPE_STR = "struct<myname:string,mynumbers:array<string>,"
+      + "contacts:map<string," + Person.TYPE_STR + ">,updatetime:timestamp,signature:binary>";
+  public static final TypeInfo TYPE_INFO = TypeInfoUtils.getTypeInfoFromTypeString(TYPE_STR);
+  
+  private String myName;
+  
+  private List<String> myNumbers;
+  private Map<String, Person> contacts;
+  
+  private Timestamp updateTime;
+  private byte[] signature;
+  
+  public AddressBook() {}
+
+  public AddressBook(String myName, List<String> myNumbers,
+      Map<String, Person> contacts, Timestamp updateTime, byte[] signature) {
+    super();
+    this.myName = myName;
+    this.myNumbers = myNumbers;
+    this.contacts = contacts;
+    this.updateTime = updateTime;
+    this.signature = signature;
+  }
+
+  public String getMyName() {
+    return myName;
+  }
+
+  public void setMyName(String myName) {
+    this.myName = myName;
+  }
+
+  public List<String> getMyNumbers() {
+    return myNumbers;
+  }
+
+  public void setMyNumbers(List<String> myNumbers) {
+    this.myNumbers = myNumbers;
+  }
+
+  public Map<String, Person> getContacts() {
+    return contacts;
+  }
+
+  public void setContacts(Map<String, Person> contacts) {
+    this.contacts = contacts;
+  }
+
+  public Timestamp getUpdateTime() {
+    return updateTime;
+  }
+
+  public void setUpdateTime(Timestamp updateTime) {
+    this.updateTime = updateTime;
+  }
+
+  public byte[] getSignature() {
+    return signature;
+  }
+
+  public void setSignature(byte[] signature) {
+    this.signature = signature;
+  }
+
+  @Override
+  public int hashCode() {
+    final int prime = 31;
+    int result = 1;
+    result = prime * result + ((contacts == null) ? 0 : contacts.hashCode());
+    result = prime * result + ((myName == null) ? 0 : myName.hashCode());
+    result = prime * result + ((myNumbers == null) ? 0 : myNumbers.hashCode());
+    result = prime * result + Arrays.hashCode(signature);
+    result = prime * result
+        + ((updateTime == null) ? 0 : updateTime.hashCode());
+    return result;
+  }
+
+  @Override
+  public boolean equals(Object obj) {
+    if (this == obj)
+      return true;
+    if (obj == null)
+      return false;
+    if (getClass() != obj.getClass())
+      return false;
+    AddressBook other = (AddressBook) obj;
+    if (contacts == null) {
+      if (other.contacts != null)
+        return false;
+    } else if (!contacts.equals(other.contacts))
+      return false;
+    if (myName == null) {
+      if (other.myName != null)
+        return false;
+    } else if (!myName.equals(other.myName))
+      return false;
+    if (myNumbers == null) {
+      if (other.myNumbers != null)
+        return false;
+    } else if (!myNumbers.equals(other.myNumbers))
+      return false;
+    if (!Arrays.equals(signature, other.signature))
+      return false;
+    if (updateTime == null) {
+      if (other.updateTime != null)
+        return false;
+    } else if (!updateTime.equals(other.updateTime))
+      return false;
+    return true;
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/crunch/blob/363c8243/crunch-hive/src/test/java/org/apache/crunch/test/orc/pojos/Person.java
----------------------------------------------------------------------
diff --git a/crunch-hive/src/test/java/org/apache/crunch/test/orc/pojos/Person.java b/crunch-hive/src/test/java/org/apache/crunch/test/orc/pojos/Person.java
new file mode 100644
index 0000000..176b148
--- /dev/null
+++ b/crunch-hive/src/test/java/org/apache/crunch/test/orc/pojos/Person.java
@@ -0,0 +1,101 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.test.orc.pojos;
+
+import java.util.List;
+
+import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
+import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils;
+
+public class Person {
+  
+  public static final String TYPE_STR = "struct<name:string,age:int,number:array<string>>";
+  public static final TypeInfo TYPE_INFO = TypeInfoUtils.getTypeInfoFromTypeString(TYPE_STR);
+  
+  private String name;
+  private int age;
+  private List<String> numbers;
+
+  public Person() {}
+  
+  public Person(String name, int age, List<String> numbers) {
+    super();
+    this.name = name;
+    this.age = age;
+    this.numbers = numbers;
+  }
+
+  public String getName() {
+    return name;
+  }
+
+  public void setName(String name) {
+    this.name = name;
+  }
+
+  public int getAge() {
+    return age;
+  }
+
+  public void setAge(int age) {
+    this.age = age;
+  }
+
+  public List<String> getNumbers() {
+    return numbers;
+  }
+
+  public void setNumbers(List<String> numbers) {
+    this.numbers = numbers;
+  }
+
+  @Override
+  public int hashCode() {
+    final int prime = 31;
+    int result = 1;
+    result = prime * result + age;
+    result = prime * result + ((name == null) ? 0 : name.hashCode());
+    result = prime * result + ((numbers == null) ? 0 : numbers.hashCode());
+    return result;
+  }
+
+  @Override
+  public boolean equals(Object obj) {
+    if (this == obj)
+      return true;
+    if (obj == null)
+      return false;
+    if (getClass() != obj.getClass())
+      return false;
+    Person other = (Person) obj;
+    if (age != other.age)
+      return false;
+    if (name == null) {
+      if (other.name != null)
+        return false;
+    } else if (!name.equals(other.name))
+      return false;
+    if (numbers == null) {
+      if (other.numbers != null)
+        return false;
+    } else if (!numbers.equals(other.numbers))
+      return false;
+    return true;
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/crunch/blob/363c8243/crunch-hive/src/test/java/org/apache/crunch/types/orc/OrcsTest.java
----------------------------------------------------------------------
diff --git a/crunch-hive/src/test/java/org/apache/crunch/types/orc/OrcsTest.java b/crunch-hive/src/test/java/org/apache/crunch/types/orc/OrcsTest.java
new file mode 100644
index 0000000..5b92f29
--- /dev/null
+++ b/crunch-hive/src/test/java/org/apache/crunch/types/orc/OrcsTest.java
@@ -0,0 +1,141 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.types.orc;
+
+import static org.junit.Assert.*;
+
+import java.sql.Timestamp;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.crunch.Pair;
+import org.apache.crunch.TupleN;
+import org.apache.crunch.io.orc.OrcWritable;
+import org.apache.crunch.test.orc.pojos.AddressBook;
+import org.apache.crunch.test.orc.pojos.Person;
+import org.apache.crunch.types.PType;
+import org.apache.crunch.types.writable.Writables;
+import org.apache.hadoop.hive.common.type.HiveVarchar;
+import org.apache.hadoop.hive.ql.io.orc.OrcStruct;
+import org.apache.hadoop.hive.serde2.io.HiveVarcharWritable;
+import org.apache.hadoop.hive.serde2.io.TimestampWritable;
+import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
+import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils;
+import org.apache.hadoop.io.BytesWritable;
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.io.Text;
+import org.junit.Test;
+
+public class OrcsTest {
+  
+  @SuppressWarnings({ "unchecked", "rawtypes" })
+  protected static void testInputOutputFn(PType ptype, Object java, OrcWritable orc) {
+    initialize(ptype);
+    assertEquals(java, ptype.getInputMapFn().map(orc));
+    assertEquals(orc, ptype.getOutputMapFn().map(java));
+  }
+  
+  private static void initialize(PType ptype) {
+    ptype.getInputMapFn().initialize();
+    ptype.getOutputMapFn().initialize();
+  }
+  
+  @Test
+  public void testOrcs() {
+    String mapValueTypeStr = "struct<a:string,b:int>";
+    String typeStr = "struct<a:int,b:string,c:float,d:varchar(64)"
+        + ",e:map<string," + mapValueTypeStr + ">>";
+    TypeInfo mapValueTypeInfo = TypeInfoUtils.getTypeInfoFromTypeString(mapValueTypeStr);
+    TypeInfo typeInfo = TypeInfoUtils.getTypeInfoFromTypeString(typeStr);
+    PType<OrcStruct> ptype = Orcs.orcs(typeInfo);
+    
+    HiveVarchar varchar = new HiveVarchar("Hello World", 32);
+    Map<Text, OrcStruct> map = new HashMap<Text, OrcStruct>();
+    OrcStruct value = OrcUtils.createOrcStruct(mapValueTypeInfo, new Text("age"), new IntWritable(24));
+    map.put(new Text("Bob"), value);
+    OrcStruct s = OrcUtils.createOrcStruct(typeInfo, new IntWritable(1024), new Text("Alice"),
+        null, new HiveVarcharWritable(varchar), map);
+    OrcWritable w = new OrcWritable();
+    w.set(s);
+    
+    testInputOutputFn(ptype, s, w);
+  }
+
+  @Test
+  public void testReflects() {
+    PType<AddressBook> ptype = Orcs.reflects(AddressBook.class);
+    
+    AddressBook ab = new AddressBook();
+    ab.setMyName("John Smith");
+    ab.setMyNumbers(Arrays.asList("919-333-4452", "650-777-4329"));
+    Map<String, Person> contacts = new HashMap<String, Person>();
+    contacts.put("Alice", new Person("Alice", 23, Arrays.asList("666-677-9999")));
+    contacts.put("Bob", new Person("Bob", 26, Arrays.asList("999-888-1132", "000-222-9934")));
+    contacts.put("David", null);
+    ab.setContacts(contacts);
+    Timestamp now = new Timestamp(System.currentTimeMillis());
+    ab.setUpdateTime(now);
+    byte[] signature = {0, 0, 64, 68, 39, 0};
+    ab.setSignature(signature);
+    
+    Map<Text, OrcStruct> map = new HashMap<Text, OrcStruct>();
+    map.put(new Text("Alice"), OrcUtils.createOrcStruct(Person.TYPE_INFO, new Text("Alice"), new IntWritable(23),
+        Arrays.asList(new Text("666-677-9999"))));
+    map.put(new Text("Bob"), OrcUtils.createOrcStruct(Person.TYPE_INFO, new Text("Bob"), new IntWritable(26),
+        Arrays.asList(new Text("999-888-1132"), new Text("000-222-9934"))));
+    map.put(new Text("David"), null);
+    OrcStruct s = OrcUtils.createOrcStruct(AddressBook.TYPE_INFO, new Text("John Smith"),
+        Arrays.asList(new Text("919-333-4452"), new Text("650-777-4329")), map, new TimestampWritable(now),
+        new BytesWritable(signature));
+    OrcWritable w = new OrcWritable();
+    w.set(s);
+    
+    testInputOutputFn(ptype, ab, w);
+  }
+  
+  @Test
+  public void testTuples() {
+    PType<TupleN> ptype = Orcs.tuples(Writables.ints(), Writables.strings(), Orcs.reflects(Person.class),
+        Writables.tableOf(Writables.strings(), Orcs.reflects(Person.class)));
+    
+    TupleN t = new TupleN(1, "John Smith", new Person("Alice", 23, Arrays.asList("666-677-9999")),
+        new Pair<String, Person>("Bob", new Person("Bob", 26, Arrays.asList("999-888-1132", "000-222-9934"))));
+    
+    String typeStr = "struct<a:int,b:string,c:" + Person.TYPE_STR + ",d:struct<d1:string,d2:" + Person.TYPE_STR + ">>";
+    TypeInfo typeInfo = TypeInfoUtils.getTypeInfoFromTypeString(typeStr);
+    String tableTypeStr = "struct<a:string,b:" + Person.TYPE_STR + ">";
+    TypeInfo tableTypeInfo = TypeInfoUtils.getTypeInfoFromTypeString(tableTypeStr);
+    
+    OrcStruct s = OrcUtils.createOrcStruct(typeInfo, new IntWritable(1), new Text("John Smith"),
+        OrcUtils.createOrcStruct(Person.TYPE_INFO, new Text("Alice"), new IntWritable(23),
+            Arrays.asList(new Text("666-677-9999"))
+        ),
+        OrcUtils.createOrcStruct(tableTypeInfo, new Text("Bob"),
+            OrcUtils.createOrcStruct(Person.TYPE_INFO, new Text("Bob"), new IntWritable(26),
+                Arrays.asList(new Text("999-888-1132"), new Text("000-222-9934"))
+            )
+        )
+    );
+    OrcWritable w = new OrcWritable();
+    w.set(s);
+    
+    testInputOutputFn(ptype, t, w);
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/crunch/blob/363c8243/crunch-hive/src/test/java/org/apache/crunch/types/orc/TupleObjectInspectorTest.java
----------------------------------------------------------------------
diff --git a/crunch-hive/src/test/java/org/apache/crunch/types/orc/TupleObjectInspectorTest.java b/crunch-hive/src/test/java/org/apache/crunch/types/orc/TupleObjectInspectorTest.java
new file mode 100644
index 0000000..01666e2
--- /dev/null
+++ b/crunch-hive/src/test/java/org/apache/crunch/types/orc/TupleObjectInspectorTest.java
@@ -0,0 +1,73 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.types.orc;
+
+import static org.junit.Assert.*;
+
+import java.nio.ByteBuffer;
+import java.util.List;
+
+import org.apache.crunch.Pair;
+import org.apache.crunch.TupleN;
+import org.apache.crunch.types.TupleFactory;
+import org.apache.crunch.types.orc.TupleObjectInspector.ByteBufferObjectInspector;
+import org.apache.crunch.types.writable.Writables;
+import org.apache.hadoop.io.BytesWritable;
+import org.junit.Test;
+
+public class TupleObjectInspectorTest {
+  
+  @Test
+  public void testTupleObjectInspector() {
+    // test get
+    TupleObjectInspector<TupleN> toi = new TupleObjectInspector<TupleN>(TupleFactory.TUPLEN,
+        Writables.strings(), Writables.ints(), Writables.floats());
+    TupleN tuple = new TupleN("Alice", 28, 165.2f);
+    List<Object> values = toi.getStructFieldsDataAsList(tuple);
+    assertEquals("Alice", values.get(0));
+    assertEquals(28, values.get(1));
+    assertEquals(165.2f, values.get(2));
+    
+    // test create
+    TupleN newTuple = toi.create("Alice", 28, 165.2f);
+    assertEquals(tuple, newTuple);
+    TupleObjectInspector<Pair> poi = new TupleObjectInspector<Pair>(TupleFactory.PAIR,
+        Writables.strings(), Writables.ints());
+    Pair pair = poi.create("word", 29);
+    assertEquals("word", pair.first());
+    assertEquals(29, pair.second());
+  }
+  
+  @Test
+  public void testByteBufferObjectInspector() {
+    byte[] bytes = {0, 9, 4, 18, 64, 6, 1};
+    BytesWritable bw = new BytesWritable(bytes);
+    ByteBuffer buf = ByteBuffer.wrap(bytes);
+    ByteBufferObjectInspector bboi = new ByteBufferObjectInspector();
+    
+    assertArrayEquals(bytes, bboi.getPrimitiveJavaObject(buf));
+    assertEquals(bw, bboi.getPrimitiveWritableObject(buf));
+    assertEquals(buf, bboi.create(bytes));
+    assertEquals(buf, bboi.create(bw));
+    
+    ByteBuffer newBuf = bboi.copyObject(buf);
+    assertTrue(buf != newBuf);
+    assertEquals(buf, newBuf);
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/crunch/blob/363c8243/pom.xml
----------------------------------------------------------------------
diff --git a/pom.xml b/pom.xml
index 0503a80..f22d08d 100644
--- a/pom.xml
+++ b/pom.xml
@@ -52,6 +52,7 @@ under the License.
     <module>crunch-archetype</module>
     <module>crunch-scrunch</module>
     <module>crunch-spark</module>
+    <module>crunch-hive</module>
     <module>crunch-dist</module>
   </modules>
 
@@ -74,6 +75,7 @@ under the License.
     <commons-logging.version>1.1.1</commons-logging.version>
     <commons-cli.version>1.2</commons-cli.version>
     <avro.version>1.7.4</avro.version>
+    <hive.version>0.13.1</hive.version>
     <parquet.version>1.3.2</parquet.version>
     <javassist.version>3.16.1-GA</javassist.version>
     <jackson.version>1.8.8</jackson.version>
@@ -410,6 +412,12 @@ under the License.
       </dependency>
 
       <dependency>
+        <groupId>org.apache.hive</groupId>
+        <artifactId>hive-exec</artifactId>
+        <version>${hive.version}</version>
+      </dependency>
+
+      <dependency>
         <groupId>org.slf4j</groupId>
         <artifactId>slf4j-api</artifactId>
         <version>${slf4j.version}</version>


Mime
View raw message