hadoop-hive-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From zs...@apache.org
Subject svn commit: r803769 - in /hadoop/hive/branches/branch-0.4: ./ data/files/ ql/src/test/org/apache/hadoop/hive/scripts/ ql/src/test/queries/clientpositive/ ql/src/test/results/clientpositive/
Date Thu, 13 Aug 2009 03:44:26 GMT
Author: zshao
Date: Thu Aug 13 03:44:25 2009
New Revision: 803769

URL: http://svn.apache.org/viewvc?rev=803769&view=rev
Log:
HIVE-748. Test for extracting urls. (Namit Jain via zshao)

Added:
    hadoop/hive/branches/branch-0.4/data/files/docurl.txt
    hadoop/hive/branches/branch-0.4/ql/src/test/org/apache/hadoop/hive/scripts/
    hadoop/hive/branches/branch-0.4/ql/src/test/org/apache/hadoop/hive/scripts/extracturl.java
    hadoop/hive/branches/branch-0.4/ql/src/test/queries/clientpositive/input37.q
    hadoop/hive/branches/branch-0.4/ql/src/test/results/clientpositive/input37.q.out
Modified:
    hadoop/hive/branches/branch-0.4/CHANGES.txt

Modified: hadoop/hive/branches/branch-0.4/CHANGES.txt
URL: http://svn.apache.org/viewvc/hadoop/hive/branches/branch-0.4/CHANGES.txt?rev=803769&r1=803768&r2=803769&view=diff
==============================================================================
--- hadoop/hive/branches/branch-0.4/CHANGES.txt (original)
+++ hadoop/hive/branches/branch-0.4/CHANGES.txt Thu Aug 13 03:44:25 2009
@@ -128,6 +128,8 @@
     HIVE-749. add hive.optimize.pruner
     (Zheng Shao via namit)
 
+    HIVE-748. Test for extracting urls. (Namit Jain via zshao)
+
   IMPROVEMENTS
     HIVE-389. Option to build without ivy (jssarma)
 

Added: hadoop/hive/branches/branch-0.4/data/files/docurl.txt
URL: http://svn.apache.org/viewvc/hadoop/hive/branches/branch-0.4/data/files/docurl.txt?rev=803769&view=auto
==============================================================================
--- hadoop/hive/branches/branch-0.4/data/files/docurl.txt (added)
+++ hadoop/hive/branches/branch-0.4/data/files/docurl.txt Thu Aug 13 03:44:25 2009
@@ -0,0 +1,8 @@
+bonework Chad bullfrog almighty blubberers cynodictis boilersmith cosmopolitical corrie autoincrements
+casings choked colpohysterotomy comedist cradleman annexa agronomical archmockery Cocles adhaka 
+daekon <a href="http://4uzsbtwvdypfitqfqdjosynqp.html">link</a> anticrochet auricular cheeked Arbon alder-leaved 
+darlingness breamed company carbureted comediette condensery <a href="http://1uauniajqtunlsvadmxhlxvngxpqjuzbpzvdiwmzphmbaicduzkgxgtdeiunduosu.html">link</a>
+daekon <a href="http://4uzsbtwvdypfitqfqdjosynqp.html">link</a> anticrochet auricular cheeked Arbon alder-leaved 
+darlingness breamed company carbureted comediette condensery <a href="http://1uauniajqtunlsvadmxhlxvngxpqjuzbpzvdiwmzphmbaicduzkgxgtdeiunduosu.html">link</a>
+daekon <a href="http://4uzsbtwvdypfitqfqdjosynqp.html">link</a> anticrochet auricular cheeked Arbon alder-leaved darlingness breamed company carbureted comediette condensery <a href="http://1uauniajqtunlsvadmxhlxvngxpqjuzbpzvdiwmzphmbaicduzkgxgtdeiunduosu.html">link</a>
+daekon <a href="http://4uzsbtwvdypfitqfqdjosynqp.html">link</a> anticrochet auricular cheeked Arbon alder-leaved darlingness breamed company carbureted comediette condensery <a href="http://1uauniajqtunlsvadmxhlxvngxpqjuzbpzvdiwmzphmbaicduzkgxgtdeiunduosu.html">link</a>

Added: hadoop/hive/branches/branch-0.4/ql/src/test/org/apache/hadoop/hive/scripts/extracturl.java
URL: http://svn.apache.org/viewvc/hadoop/hive/branches/branch-0.4/ql/src/test/org/apache/hadoop/hive/scripts/extracturl.java?rev=803769&view=auto
==============================================================================
--- hadoop/hive/branches/branch-0.4/ql/src/test/org/apache/hadoop/hive/scripts/extracturl.java (added)
+++ hadoop/hive/branches/branch-0.4/ql/src/test/org/apache/hadoop/hive/scripts/extracturl.java Thu Aug 13 03:44:25 2009
@@ -0,0 +1,50 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hive.scripts;
+
+import java.io.*;
+
+import java.util.HashMap;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+public class extracturl {
+
+  protected static final Pattern pattern = Pattern.compile("<a href=\"http://([\\w\\d]+\\.html)\">link</a>", Pattern.CASE_INSENSITIVE);
+  static InputStreamReader converter = new InputStreamReader (System.in);
+  static BufferedReader   in = new BufferedReader (converter);
+
+  public static void main(String[] args) {
+    String input;
+    try {
+      while ((input = in.readLine()) != null) {
+        Matcher m = pattern.matcher(input);
+
+        while(m.find()) {
+          String url = input.substring(m.start(1), m.end(1));
+          System.out.println(url + "\t" + "1");
+        } 
+      }
+    }
+    catch (Exception e) {
+      e.printStackTrace();
+      System.exit(1);
+    }
+  } 
+}

Added: hadoop/hive/branches/branch-0.4/ql/src/test/queries/clientpositive/input37.q
URL: http://svn.apache.org/viewvc/hadoop/hive/branches/branch-0.4/ql/src/test/queries/clientpositive/input37.q?rev=803769&view=auto
==============================================================================
--- hadoop/hive/branches/branch-0.4/ql/src/test/queries/clientpositive/input37.q (added)
+++ hadoop/hive/branches/branch-0.4/ql/src/test/queries/clientpositive/input37.q Thu Aug 13 03:44:25 2009
@@ -0,0 +1,16 @@
+create table documents(contents string) stored as textfile;
+
+LOAD DATA LOCAL INPATH '../data/files/docurl.txt' INTO TABLE documents;
+
+
+select url, count(1) 
+FROM
+(
+  FROM documents
+  MAP documents.contents
+  USING 'java -cp ../build/ql/test/classes org.apache.hadoop.hive.scripts.extracturl' AS (url, count)
+) subq
+group by url;
+
+
+DROP TABLE documents;

Added: hadoop/hive/branches/branch-0.4/ql/src/test/results/clientpositive/input37.q.out
URL: http://svn.apache.org/viewvc/hadoop/hive/branches/branch-0.4/ql/src/test/results/clientpositive/input37.q.out?rev=803769&view=auto
==============================================================================
--- hadoop/hive/branches/branch-0.4/ql/src/test/results/clientpositive/input37.q.out (added)
+++ hadoop/hive/branches/branch-0.4/ql/src/test/results/clientpositive/input37.q.out Thu Aug 13 03:44:25 2009
@@ -0,0 +1,15 @@
+query: create table documents(contents string) stored as textfile
+query: LOAD DATA LOCAL INPATH '../data/files/docurl.txt' INTO TABLE documents
+query: select url, count(1) 
+FROM
+(
+  FROM documents
+  MAP documents.contents
+  USING 'java -cp ../build/ql/test/classes org.apache.hadoop.hive.scripts.extracturl' AS (url, count)
+) subq
+group by url
+Input: default/documents
+Output: file:/data/users/njain/hive_commit2/hive_commit2/build/ql/tmp/781025216/10000
+1uauniajqtunlsvadmxhlxvngxpqjuzbpzvdiwmzphmbaicduzkgxgtdeiunduosu.html	4
+4uzsbtwvdypfitqfqdjosynqp.html	4
+query: DROP TABLE documents



Mime
View raw message