hive-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From na...@apache.org
Subject svn commit: r794024 - in /hadoop/hive/trunk: ./ ql/src/java/org/apache/hadoop/hive/ql/udf/ ql/src/java/org/apache/hadoop/hive/ql/udf/generic/ ql/src/test/queries/clientpositive/ ql/src/test/results/clientpositive/
Date Tue, 14 Jul 2009 19:20:12 GMT
Author: namit
Date: Tue Jul 14 19:20:12 2009
New Revision: 794024

URL: http://svn.apache.org/viewvc?rev=794024&view=rev
Log:
HIVE-623. optimize UDF reverse and UDF length
(Emil Ibrishimov via namit)


Modified:
    hadoop/hive/trunk/CHANGES.txt
    hadoop/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFLength.java
    hadoop/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFReverse.java
    hadoop/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFUtils.java
    hadoop/hive/trunk/ql/src/test/queries/clientpositive/udf_length.q
    hadoop/hive/trunk/ql/src/test/queries/clientpositive/udf_reverse.q
    hadoop/hive/trunk/ql/src/test/results/clientpositive/udf_length.q.out
    hadoop/hive/trunk/ql/src/test/results/clientpositive/udf_reverse.q.out

Modified: hadoop/hive/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/hadoop/hive/trunk/CHANGES.txt?rev=794024&r1=794023&r2=794024&view=diff
==============================================================================
--- hadoop/hive/trunk/CHANGES.txt (original)
+++ hadoop/hive/trunk/CHANGES.txt Tue Jul 14 19:20:12 2009
@@ -347,6 +347,9 @@
 
     HIVE-626. Fix Column Pruner column order bug. (Yongqiang He via zshao)
 
+    HIVE-623. optimize UDF reverse and UDF length
+    (Emil Ibrishimov via namit)
+
     HIVE-632. Make conditional task serializable. (Namit Jain via zshao)
 
 Release 0.3.1 - Unreleased

Modified: hadoop/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFLength.java
URL: http://svn.apache.org/viewvc/hadoop/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFLength.java?rev=794024&r1=794023&r2=794024&view=diff
==============================================================================
--- hadoop/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFLength.java (original)
+++ hadoop/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFLength.java Tue Jul 14 19:20:12 2009
@@ -18,6 +18,7 @@
 package org.apache.hadoop.hive.ql.udf;
 
 import org.apache.hadoop.hive.ql.exec.UDF;
+import org.apache.hadoop.hive.ql.udf.generic.GenericUDFUtils;
 import org.apache.hadoop.io.IntWritable;
 import org.apache.hadoop.io.Text;
 
@@ -29,7 +30,15 @@
       return null;
     }
 
-    result.set(s.toString().length());
+    byte[] data = s.getBytes();
+    int len = 0;
+    for(int i = 0; i < s.getLength(); i++) {
+      if( GenericUDFUtils.isUtfStartByte(data[i]) ) {
+        len++;
+      }
+    }
+    
+    result.set(len);
     return result;
   }
 }

Modified: hadoop/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFReverse.java
URL: http://svn.apache.org/viewvc/hadoop/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFReverse.java?rev=794024&r1=794023&r2=794024&view=diff
==============================================================================
--- hadoop/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFReverse.java (original)
+++ hadoop/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFReverse.java Tue Jul 14 19:20:12 2009
@@ -19,28 +19,49 @@
 package org.apache.hadoop.hive.ql.udf;
 
 import org.apache.hadoop.hive.ql.exec.UDF;
+import org.apache.hadoop.hive.ql.udf.generic.GenericUDFUtils;
 import org.apache.hadoop.io.Text;
 
 public class UDFReverse extends UDF { 
   private Text result = new Text();
   
-  public Text evaluate(Text s) {      
+  /**
+   * Reverse a portion of an array in-place.
+   * 
+   * @param arr The array where the data will be reversed. 
+   * @param first The beginning of the portion (inclusive).
+   * @param last The end of the portion (inclusive).
+   */
+  private void reverse(byte[] arr, int first, int last) {
+    for(int i = 0; i < (last-first+1)/2; i++) {
+      byte temp = arr[last - i];
+      arr[last - i] = arr[first + i];
+      arr[first + i] = temp;
+    }
+  }
+  
+  public Text evaluate(Text s) {
     if (s == null) {
       return null;
     }
-
-    // Use a string because Text.getLength() returns the number of bytes.
-    // This can be optimized by walking over the utf8 characters and not
-    // creating a string at all.
-    String text = s.toString();
     
-    // Append the text to a StringBuffer in reverse order.
-    StringBuffer revBuff = new StringBuffer();
-    for (int i = text.length() - 1; i >= 0; i--) {
-      revBuff.append(text.charAt(i));
+    // set() will only allocate memory if the buffer of result is smaller than
+    // s.getLength() and will never resize the buffer down.
+    result.set(s);
+    
+    // Now do an in-place reversal in result.getBytes(). First, reverse every
+    // character, then reverse the whole string.
+    byte[] data = result.getBytes();
+    int prev = 0; // The index where the current char starts
+    for(int i = 1; i < result.getLength(); i++) {
+      if( GenericUDFUtils.isUtfStartByte(data[i]) ) {
+        reverse(data, prev, i-1);
+        prev = i;
+      }
     }
+    reverse(data, prev, result.getLength() - 1);
+    reverse(data, 0, result.getLength() - 1);
     
-    result.set(revBuff.toString());
     return result;
   }
 }

Modified: hadoop/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFUtils.java
URL: http://svn.apache.org/viewvc/hadoop/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFUtils.java?rev=794024&r1=794023&r2=794024&view=diff
==============================================================================
--- hadoop/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFUtils.java (original)
+++ hadoop/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFUtils.java Tue Jul 14 19:20:12 2009
@@ -40,7 +40,14 @@
 
   private static Log LOG = LogFactory.getLog(GenericUDFUtils.class.getName());
 
-
+  /**
+   * Checks if b is the first byte of a UTF-8 character.
+   * 
+   */
+  public static boolean isUtfStartByte(byte b) {
+    return (b & 0xC0) != 0x80;
+  }
+  
   /**
    * This class helps to find the return ObjectInspector for a GenericUDF.
    * 

Modified: hadoop/hive/trunk/ql/src/test/queries/clientpositive/udf_length.q
URL: http://svn.apache.org/viewvc/hadoop/hive/trunk/ql/src/test/queries/clientpositive/udf_length.q?rev=794024&r1=794023&r2=794024&view=diff
==============================================================================
--- hadoop/hive/trunk/ql/src/test/queries/clientpositive/udf_length.q (original)
+++ hadoop/hive/trunk/ql/src/test/queries/clientpositive/udf_length.q Tue Jul 14 19:20:12 2009
@@ -1,10 +1,12 @@
 CREATE TABLE dest1(len INT);
-
-EXPLAIN
-FROM src INSERT OVERWRITE TABLE dest1 SELECT length(src.value);
-
-FROM src INSERT OVERWRITE TABLE dest1 SELECT length(src.value);
-
+EXPLAIN FROM src1 INSERT OVERWRITE TABLE dest1 SELECT length(src1.value);
+FROM src1 INSERT OVERWRITE TABLE dest1 SELECT length(src1.value);
 SELECT dest1.* FROM dest1;
+DROP TABLE dest1;
 
+-- Test with non-ascii characters. 
+CREATE TABLE dest1(name STRING) STORED AS TEXTFILE;
+LOAD DATA LOCAL INPATH '../data/files/kv4.txt' INTO TABLE dest1;
+EXPLAIN SELECT length(dest1.name) FROM dest1;
+SELECT length(dest1.name) FROM dest1;
 DROP TABLE dest1;

Modified: hadoop/hive/trunk/ql/src/test/queries/clientpositive/udf_reverse.q
URL: http://svn.apache.org/viewvc/hadoop/hive/trunk/ql/src/test/queries/clientpositive/udf_reverse.q?rev=794024&r1=794023&r2=794024&view=diff
==============================================================================
--- hadoop/hive/trunk/ql/src/test/queries/clientpositive/udf_reverse.q (original)
+++ hadoop/hive/trunk/ql/src/test/queries/clientpositive/udf_reverse.q Tue Jul 14 19:20:12 2009
@@ -1,10 +1,15 @@
 CREATE TABLE dest1(len STRING);
-
-EXPLAIN
-FROM src1 INSERT OVERWRITE TABLE dest1 SELECT reverse(src1.value);
-
+EXPLAIN FROM src1 INSERT OVERWRITE TABLE dest1 SELECT reverse(src1.value);
 FROM src1 INSERT OVERWRITE TABLE dest1 SELECT reverse(src1.value);
-
 SELECT dest1.* FROM dest1;
+DROP TABLE dest1;
 
+-- Test with non-ascii characters
+-- kv4.txt contains the text 0xE982B5E993AE, which should be reversed to
+-- 0xE993AEE982B5
+CREATE TABLE dest1(name STRING) STORED AS TEXTFILE;
+LOAD DATA LOCAL INPATH '../data/files/kv4.txt' INTO TABLE dest1;
+EXPLAIN SELECT count(1) FROM dest1 
+	WHERE reverse(dest1.name) = _UTF-8 0xE993AEE982B5;
+SELECT count(1) FROM dest1 WHERE reverse(dest1.name) = _UTF-8 0xE993AEE982B5;
 DROP TABLE dest1;

Modified: hadoop/hive/trunk/ql/src/test/results/clientpositive/udf_length.q.out
URL: http://svn.apache.org/viewvc/hadoop/hive/trunk/ql/src/test/results/clientpositive/udf_length.q.out?rev=794024&r1=794023&r2=794024&view=diff
==============================================================================
--- hadoop/hive/trunk/ql/src/test/results/clientpositive/udf_length.q.out (original)
+++ hadoop/hive/trunk/ql/src/test/results/clientpositive/udf_length.q.out Tue Jul 14 19:20:12 2009
@@ -1,8 +1,7 @@
 query: CREATE TABLE dest1(len INT)
-query: EXPLAIN
-FROM src INSERT OVERWRITE TABLE dest1 SELECT length(src.value)
+query: EXPLAIN FROM src1 INSERT OVERWRITE TABLE dest1 SELECT length(src1.value)
 ABSTRACT SYNTAX TREE:
-  (TOK_QUERY (TOK_FROM (TOK_TABREF src)) (TOK_INSERT (TOK_DESTINATION (TOK_TAB dest1)) (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTION length (. (TOK_TABLE_OR_COL src) value))))))
+  (TOK_QUERY (TOK_FROM (TOK_TABREF src1)) (TOK_INSERT (TOK_DESTINATION (TOK_TAB dest1)) (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTION length (. (TOK_TABLE_OR_COL src1) value))))))
 
 STAGE DEPENDENCIES:
   Stage-1 is a root stage
@@ -13,7 +12,7 @@
   Stage: Stage-1
     Map Reduce
       Alias -> Map Operator Tree:
-        src 
+        src1 
             Select Operator
               expressions:
                     expr: length(value)
@@ -33,10 +32,10 @@
           Move Operator
             files:
                 hdfs directory: true
-                destination: file:/data/users/emil/hive1/hive1/build/ql/tmp/809193894/10000
+                destination: file:/data/users/emil/hive1/hive1/build/ql/tmp/891633093/10000
           Map Reduce
             Alias -> Map Operator Tree:
-              file:/data/users/emil/hive1/hive1/build/ql/tmp/353629462/10002 
+              file:/data/users/emil/hive1/hive1/build/ql/tmp/988560065/10002 
                   Reduce Output Operator
                     sort order: 
                     Map-reduce partition columns:
@@ -68,217 +67,14 @@
               name: dest1
 
 
-query: FROM src INSERT OVERWRITE TABLE dest1 SELECT length(src.value)
-Input: default/src
+query: FROM src1 INSERT OVERWRITE TABLE dest1 SELECT length(src1.value)
+Input: default/src1
 Output: default/dest1
 query: SELECT dest1.* FROM dest1
 Input: default/dest1
-Output: file:/data/users/emil/hive1/hive1/build/ql/tmp/1493924198/10000
-7
-6
-7
-6
-7
-7
-7
-7
-6
-7
-7
-7
-7
-7
-7
-7
-7
-6
-7
-7
-7
-7
-7
-7
-7
-7
-7
-7
-6
-7
-7
-7
-7
-6
-6
-7
-7
-7
-7
-7
-7
-7
-7
-7
-7
-7
-7
-7
-7
-7
-7
-7
-7
-7
-7
-7
-7
-7
-7
-7
-7
-7
-7
-7
-7
-7
-7
-7
-7
-7
-7
-7
-7
-7
-7
-7
-7
-6
-7
-7
-7
-7
-5
-7
-7
-7
-7
-6
-7
-7
-7
-7
-7
-7
-7
-6
-7
-7
-7
-7
-6
-7
-6
-6
-5
-7
-6
-7
-7
-7
-7
-7
-7
-7
-7
-7
-7
-7
-7
-7
-7
-7
-7
-6
-7
-6
-7
-7
-7
-7
-7
-7
-7
-7
-7
-7
-6
-7
-7
-7
-7
-7
-7
-6
-7
-7
-7
-6
-7
-7
-6
-7
-7
-7
-7
-7
-7
-7
-7
-7
-6
-7
-7
-6
-5
-7
-7
-7
-7
-6
-7
-7
-7
-7
-6
-7
-7
-7
-7
-5
-7
-7
-7
-7
-7
-7
-7
-7
-7
-7
-6
-6
-7
-7
-7
-7
-7
-7
-6
-7
-6
-7
-7
-7
+Output: file:/data/users/emil/hive1/hive1/build/ql/tmp/469108904/10000
 7
+0
 7
 6
 7
@@ -292,286 +88,51 @@
 7
 7
 7
-7
-7
-7
-7
-7
-7
-6
-6
-5
-7
-6
-7
-7
-7
-7
-7
-7
-7
-7
-7
-6
-7
-6
-6
-7
-6
-7
+0
+0
 6
+0
 7
 7
 7
-7
-7
-7
-7
-7
-7
-7
-7
-7
-7
-7
-7
-7
-7
-5
-7
-7
-7
-7
-7
-6
-6
-7
-6
-6
-7
-6
-7
-7
-7
-7
-7
-7
-7
-7
-6
-7
-7
-7
-7
-7
-6
-6
-6
-7
-7
-7
-7
-7
-7
-7
-7
-6
-7
-6
-7
-6
-6
-7
-6
-7
-6
-7
-7
-7
-7
-7
-7
-7
-7
-7
-7
-7
-7
-7
-7
-7
-7
-6
-7
-7
-7
-7
-7
-5
-7
-7
-7
-7
-7
-7
-7
-7
-7
-7
-7
-5
-6
-7
-7
-7
-6
-6
-7
-7
-7
-7
-7
-7
-7
-7
-7
-6
-7
-7
-7
-7
-7
-7
-7
-7
-7
-6
-7
-7
-7
-7
-6
-7
-7
-7
-7
-7
-7
-7
-7
-7
-7
-7
-7
-7
-7
-5
-6
-7
-7
-7
-6
-7
-7
-7
-6
-7
-7
-7
-7
-7
-7
-7
-7
-7
-7
-7
-7
-7
-7
-7
-7
-7
-7
-7
-7
-7
-7
-7
-7
-7
-7
-7
-7
-7
-7
-7
-7
-7
-7
-7
-6
-7
-7
-7
-7
-7
-7
-7
-7
-7
-7
-7
-7
-7
-7
-7
-7
-7
-6
-7
-6
-7
-7
-6
-7
-7
-7
-7
-5
-7
-7
-7
-7
-7
-7
-7
-7
-7
-6
-7
-7
-7
-7
-6
-7
-7
-6
-6
-6
-7
-7
-7
-7
-7
-7
-7
-7
-7
-6
-7
-7
-7
-7
-6
+0
+0
+0
+query: DROP TABLE dest1
+query: -- Test with non-ascii characters. 
+CREATE TABLE dest1(name STRING) STORED AS TEXTFILE
+query: LOAD DATA LOCAL INPATH '../data/files/kv4.txt' INTO TABLE dest1
+query: EXPLAIN SELECT length(dest1.name) FROM dest1
+ABSTRACT SYNTAX TREE:
+  (TOK_QUERY (TOK_FROM (TOK_TABREF dest1)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTION length (. (TOK_TABLE_OR_COL dest1) name))))))
+
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-0 is a root stage
+
+STAGE PLANS:
+  Stage: Stage-1
+    Map Reduce
+      Alias -> Map Operator Tree:
+        dest1 
+            Select Operator
+              expressions:
+                    expr: length(name)
+                    type: int
+              File Output Operator
+                compressed: false
+                GlobalTableId: 0
+                table:
+                    input format: org.apache.hadoop.mapred.TextInputFormat
+                    output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+
+  Stage: Stage-0
+    Fetch Operator
+      limit: -1
+
+
+query: SELECT length(dest1.name) FROM dest1
+Input: default/dest1
+Output: file:/data/users/emil/hive1/hive1/build/ql/tmp/1552330536/10000
+2
 query: DROP TABLE dest1

Modified: hadoop/hive/trunk/ql/src/test/results/clientpositive/udf_reverse.q.out
URL: http://svn.apache.org/viewvc/hadoop/hive/trunk/ql/src/test/results/clientpositive/udf_reverse.q.out?rev=794024&r1=794023&r2=794024&view=diff
==============================================================================
--- hadoop/hive/trunk/ql/src/test/results/clientpositive/udf_reverse.q.out (original)
+++ hadoop/hive/trunk/ql/src/test/results/clientpositive/udf_reverse.q.out Tue Jul 14 19:20:12 2009
@@ -1,6 +1,5 @@
 query: CREATE TABLE dest1(len STRING)
-query: EXPLAIN
-FROM src1 INSERT OVERWRITE TABLE dest1 SELECT reverse(src1.value)
+query: EXPLAIN FROM src1 INSERT OVERWRITE TABLE dest1 SELECT reverse(src1.value)
 ABSTRACT SYNTAX TREE:
   (TOK_QUERY (TOK_FROM (TOK_TABREF src1)) (TOK_INSERT (TOK_DESTINATION (TOK_TAB dest1)) (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTION reverse (. (TOK_TABLE_OR_COL src1) value))))))
 
@@ -33,10 +32,10 @@
           Move Operator
             files:
                 hdfs directory: true
-                destination: file:/data/users/emil/hive1/hive1/build/ql/tmp/533825523/10000
+                destination: file:/data/users/emil/hive1/hive1/build/ql/tmp/1153507358/10000
           Map Reduce
             Alias -> Map Operator Tree:
-              file:/data/users/emil/hive1/hive1/build/ql/tmp/620216694/10002 
+              file:/data/users/emil/hive1/hive1/build/ql/tmp/1576575969/10002 
                   Reduce Output Operator
                     sort order: 
                     Map-reduce partition columns:
@@ -73,7 +72,7 @@
 Output: default/dest1
 query: SELECT dest1.* FROM dest1
 Input: default/dest1
-Output: file:/data/users/emil/hive1/hive1/build/ql/tmp/1181513979/10000
+Output: file:/data/users/emil/hive1/hive1/build/ql/tmp/1185835654/10000
 832_lav
 
 113_lav
@@ -100,3 +99,67 @@
 
 
 query: DROP TABLE dest1
+query: -- Test with non-ascii characters
+-- kv4.txt contains the text 0xE982B5E993AE, which should be reversed to
+-- 0xE993AEE982B5
+CREATE TABLE dest1(name STRING) STORED AS TEXTFILE
+query: LOAD DATA LOCAL INPATH '../data/files/kv4.txt' INTO TABLE dest1
+query: EXPLAIN SELECT count(1) FROM dest1 
+	WHERE reverse(dest1.name) = _UTF-8 0xE993AEE982B5
+ABSTRACT SYNTAX TREE:
+  (TOK_QUERY (TOK_FROM (TOK_TABREF dest1)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTION count 1))) (TOK_WHERE (= (TOK_FUNCTION reverse (. (TOK_TABLE_OR_COL dest1) name)) (TOK_CHARSETLITERAL _UTF-8 0xE993AEE982B5)))))
+
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-0 is a root stage
+
+STAGE PLANS:
+  Stage: Stage-1
+    Map Reduce
+      Alias -> Map Operator Tree:
+        dest1 
+            Filter Operator
+              predicate:
+                  expr: (reverse(name) = '??')
+                  type: boolean
+              Filter Operator
+                predicate:
+                    expr: (reverse(name) = '??')
+                    type: boolean
+                Select Operator
+                  Group By Operator
+                    aggregations:
+                          expr: count(1)
+                    mode: hash
+                    Reduce Output Operator
+                      sort order: 
+                      tag: -1
+                      value expressions:
+                            expr: _col0
+                            type: bigint
+      Reduce Operator Tree:
+        Group By Operator
+          aggregations:
+                expr: count(VALUE._col0)
+          mode: mergepartial
+          Select Operator
+            expressions:
+                  expr: _col0
+                  type: bigint
+            File Output Operator
+              compressed: false
+              GlobalTableId: 0
+              table:
+                  input format: org.apache.hadoop.mapred.TextInputFormat
+                  output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+
+  Stage: Stage-0
+    Fetch Operator
+      limit: -1
+
+
+query: SELECT count(1) FROM dest1 WHERE reverse(dest1.name) = _UTF-8 0xE993AEE982B5
+Input: default/dest1
+Output: file:/data/users/emil/hive1/hive1/build/ql/tmp/491483710/10000
+1
+query: DROP TABLE dest1



Mime
View raw message