pig-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From Apache Wiki <wikidi...@apache.org>
Subject [Pig Wiki] Update of "PigMix" by daijy
Date Wed, 16 Jun 2010 18:41:17 GMT
Dear Wiki user,

You have subscribed to a wiki page or wiki category on "Pig Wiki" for change notification.

The "PigMix" page has been changed by daijy.
http://wiki.apache.org/pig/PigMix?action=diff&rev1=15&rev2=16

--------------------------------------------------

  {{{
  A = load 'page_views' using org.apache.pig.test.udf.storefunc.PigPerformanceLoader()
      as (user, action, timespent, query_term, ip_addr, timestamp, estimated_revenue, page_info,
page_links);
- B = order A by user parallel $mappers;
+ B = order A by user $parallelfactor;
  store B into 'page_views_sorted' using PigStorage('\u0001');
  
  alpha = load 'users' using PigStorage('\u0001') as (name, phone, address, city, state, zip);
- a1 = order alpha by name parallel $mappers;
+ a1 = order alpha by name $parallelfactor;
  store a1 into 'users_sorted' using PigStorage('\u0001');
  
  a = load 'power_users' using PigStorage('\u0001') as (name, phone, address, city, state,
zip);
@@ -287, +287 @@

  This script tests reading from a map, flattening a bag of maps, and use of bincond (features 2, 3, and 4).
  {{{
  register pigperf.jar;
- A = load '$page_views' using org.apache.pig.test.utils.datagen.PigPerformanceLoader()
+ A = load '$page_views' using org.apache.pig.test.udf.storefunc.PigPerformanceLoader()
      as (user, action, timespent, query_term, ip_addr, timestamp,
          estimated_revenue, page_info, page_links);
  B = foreach A generate user, (int)action as action, (map[])page_info as page_info,
@@ -304, +304 @@

  This script tests using a join small enough to do in fragment and replicate (feature 7).

  {{{
  register pigperf.jar;
- A = load '$page_views' using org.apache.pig.test.utils.datagen.PigPerformanceLoader()
+ A = load '$page_views' using org.apache.pig.test.udf.storefunc.PigPerformanceLoader()
      as (user, action, timespent, query_term, ip_addr, timestamp,
          estimated_revenue, page_info, page_links);
  B = foreach A generate user, estimated_revenue;
@@ -321, +321 @@

  something that pig could potentially optimize by not regrouping.
  {{{
  register pigperf.jar;
- A = load '$page_views' using org.apache.pig.test.utils.datagen.PigPerformanceLoader()
+ A = load '$page_views' using org.apache.pig.test.udf.storefunc.PigPerformanceLoader()
      as (user, action, timespent, query_term, ip_addr, timestamp,
          estimated_revenue, page_info, page_links);
  B = foreach A generate user, (double)estimated_revenue;
@@ -340, +340 @@

  This script covers foreach generate with a nested distinct (feature 10).
  {{{
  register pigperf.jar;
- A = load '$page_views' using org.apache.pig.test.utils.datagen.PigPerformanceLoader()
+ A = load '$page_views' using org.apache.pig.test.udf.storefunc.PigPerformanceLoader()
      as (user, action, timespent, query_term, ip_addr, timestamp,
          estimated_revenue, page_info, page_links);
  B = foreach A generate user, action;
@@ -359, +359 @@

  This script does an anti-join.  This is useful because it is a use of cogroup that is not a regular join (feature 9).
  {{{
  register pigperf.jar;
- A = load '$page_views' using org.apache.pig.test.utils.datagen.PigPerformanceLoader()
+ A = load '$page_views' using org.apache.pig.test.udf.storefunc.PigPerformanceLoader()
      as (user, action, timespent, query_term, ip_addr, timestamp,
          estimated_revenue, page_info, page_links);
  B = foreach A generate user;
@@ -377, +377 @@

  This script covers the case where the group by key is a significant percentage of the row (feature 12).
  {{{
  register pigperf.jar;
- A = load '$page_views' using org.apache.pig.test.utils.datagen.PigPerformanceLoader()
+ A = load '$page_views' using org.apache.pig.test.udf.storefunc.PigPerformanceLoader()
      as (user, action, timespent, query_term, ip_addr, timestamp,
          estimated_revenue, page_info, page_links);
  B = foreach A generate user, action, (int)timespent as timespent, query_term, ip_addr, timestamp;
@@ -392, +392 @@

  This script covers having a nested plan with splits (feature 11).
  {{{
  register pigperf.jar;
- A = load '$page_views' using org.apache.pig.test.utils.datagen.PigPerformanceLoader() as
(user, action, timespent, query_term,
+ A = load '$page_views' using org.apache.pig.test.udf.storefunc.PigPerformanceLoader() as
(user, action, timespent, query_term,
              ip_addr, timestamp, estimated_revenue, page_info, page_links);
  B = foreach A generate user, timestamp;
  C = group B by user $parallelfactor;
@@ -409, +409 @@

  This script covers group all (feature 13).
  {{{
  register pigperf.jar;
- A = load '$page_views' using org.apache.pig.test.utils.datagen.PigPerformanceLoader()
+ A = load '$page_views' using org.apache.pig.test.udf.storefunc.PigPerformanceLoader()
      as (user, action, timespent, query_term, ip_addr, timestamp,
          estimated_revenue, page_info, page_links);
  B = foreach A generate user, (int)timespent as timespent, (double)estimated_revenue as estimated_revenue;
@@ -423, +423 @@

  This script covers order by of a single value (feature 15).
  {{{
  register pigperf.jar;
- A = load '$page_views' using org.apache.pig.test.utils.datagen.PigPerformanceLoader()
+ A = load '$page_views' using org.apache.pig.test.udf.storefunc.PigPerformanceLoader()
      as (user, action, timespent, query_term, ip_addr, timestamp,
          estimated_revenue, page_info, page_links);
  B = order A by query_term $parallelfactor;
@@ -435, +435 @@

  This script covers order by of multiple values (feature 15).
  {{{
  register pigperf.jar;
- A = load '$page_views' using org.apache.pig.test.utils.datagen.PigPerformanceLoader()
+ A = load '$page_views' using org.apache.pig.test.udf.storefunc.PigPerformanceLoader()
      as (user, action, timespent:int, query_term, ip_addr, timestamp,
          estimated_revenue:double, page_info, page_links);
  B = order A by query_term, estimated_revenue desc, timespent $parallelfactor;
@@ -448, +448 @@

  This script covers distinct and union and reading from a wide row but using only one field (features: 1, 14).
  {{{
  register pigperf.jar;
- A = load '$page_views' using org.apache.pig.test.utils.datagen.PigPerformanceLoader()
+ A = load '$page_views' using org.apache.pig.test.udf.storefunc.PigPerformanceLoader()
      as (user, action, timespent, query_term, ip_addr, timestamp,
          estimated_revenue, page_info, page_links);
  B = foreach A generate user;
@@ -520, +520 @@

  A = load 'page_views' using org.apache.pig.test.udf.storefunc.PigPerformanceLoader()
      as (user, action, timespent, query_term, ip_addr, timestamp, estimated_revenue, page_info,
page_links);
  B = foreach A generate user, action, estimated_revenue, timespent;
- C = group B by user parallel 40;
+ C = group B by user $parallelfactor;
  D = foreach C {
      beth = distinct B.action;
      rev = distinct B.estimated_revenue;
@@ -538, +538 @@

  A = load 'page_views' using org.apache.pig.test.udf.storefunc.PigPerformanceLoader()
      as (user, action, timespent, query_term, ip_addr, timestamp, estimated_revenue, page_info,
page_links);
  B = foreach A generate user, estimated_revenue;
- C = group B by user parallel 40;
+ C = group B by user $parallelfactor;
  D = foreach C {
      E = order B by estimated_revenue;
      F = E.estimated_revenue;
@@ -560, +560 @@

  B = group A by (user, action, timespent, query_term, ip_addr, timestamp,
          estimated_revenue, user_1, action_1, timespent_1, query_term_1, ip_addr_1, timestamp_1,
          estimated_revenue_1, user_2, action_2, timespent_2, query_term_2, ip_addr_2, timestamp_2,
-         estimated_revenue_2) parallel 40;
+         estimated_revenue_2) $parallelfactor;
  C = foreach B generate SUM(A.timespent), SUM(A.timespent_1), SUM(A.timespent_2), AVG(A.estimated_revenue),
AVG(A.estimated_revenue_1), AVG(A.estimated_revenue_2);
  store C into '$out';
  }}}

Mime
View raw message