hawq-commits mailing list archives

From r...@apache.org
Subject [01/17] incubator-hawq git commit: HAWQ-100. Code Cleanup: gpmapreduce.
Date Thu, 05 Nov 2015 02:54:47 GMT
Repository: incubator-hawq
Updated Branches:
  refs/heads/master 04c0f28b9 -> 4e392375e


http://git-wip-us.apache.org/repos/asf/incubator-hawq/blob/4e392375/src/test/regress/output/mapred.source
----------------------------------------------------------------------
diff --git a/src/test/regress/output/mapred.source b/src/test/regress/output/mapred.source
deleted file mode 100644
index 1b6a2ba..0000000
--- a/src/test/regress/output/mapred.source
+++ /dev/null
@@ -1,507 +0,0 @@
---
--- map/reduce tests
---
--- Some notes: the PATH variable doesn't necessarily work correctly in
--- the regression context, so use the "gpwhich" token to get the full
--- path from your environment (see gpstringsubs for more details).  And
--- to make this work correctly with gpsourcify, you need to add your
--- gpwhich token to gptokencheck.
--- start_matchsubs
--- m|mapreduce_\d+_run|
--- s|mapreduce_\d+_run|mapreduce_PID_run|
--- end_matchsubs
---
--- This test makes use of plperlu
---
--- start_ignore
-create language plperlu;
-create language plpythonu;
--- Take a look at the installed languages; plperl and plpython should be in the list.
---  + pg_pltemplate contains the list of languages that can simply be installed
---  + pg_language contains the list of languages that are actually installed
-select * from pg_pltemplate;
- tmplname  | tmpltrusted |      tmplhandler      |   tmplvalidator   |   tmpllibrary    | tmplacl 
------------+-------------+-----------------------+-------------------+------------------+---------
- plpgsql   | t           | plpgsql_call_handler  | plpgsql_validator | $libdir/plpgsql  | 
- pltcl     | t           | pltcl_call_handler    |                   | $libdir/pltcl    | 
- pltclu    | f           | pltclu_call_handler   |                   | $libdir/pltcl    | 
- plperl    | t           | plperl_call_handler   | plperl_validator  | $libdir/plperl   | 
- plperlu   | f           | plperl_call_handler   | plperl_validator  | $libdir/plperl   | 
- plpythonu | f           | plpython_call_handler |                   | $libdir/plpython | 
- plr       | f           | plr_call_handler      |                   | $libdir/plr      | 
-(7 rows)
-
-select lanname, lanispl, lanpltrusted from pg_language;
-  lanname  | lanispl | lanpltrusted 
------------+---------+--------------
- internal  | f       | f
- c         | f       | f
- sql       | f       | t
- plpgsql   | t       | t
- plperlu   | t       | f
- plpythonu | t       | f
-(6 rows)
-
--- Check environment variables that should have been set by greenplum_path.sh
---
--- 1) We need to check these on all segments and on the master.
--- 2) We do this via an external table rather than perl/python in case a
---    mis-installed plperl/plpython is itself part of the problem.
--- 3) It is normal for the master to have a slightly different environment from
---    the segments (but perhaps not desirable?)
---
-CREATE EXTERNAL WEB TABLE env_segment(var text, value text) 
-EXECUTE 'env | grep "^[^=]*=[^=]*$"' format 'text' (delimiter '=');
-CREATE EXTERNAL WEB TABLE env_master(var text, value text) 
-EXECUTE 'env | grep "^[^=]*=[^=]*$"' on master format 'text' (delimiter '=');
-CREATE VIEW env AS
-  SELECT gp_execution_segment(), * FROM env_segment
-  UNION ALL
-  SELECT gp_execution_segment(), * FROM env_master;
-SELECT * FROM env WHERE var in (
-	   'GPHOME', 
-	   'DYLD_LIBRARY_PATH', 
-	   'LD_LIBRARY_PATH', 
-	   'PATH'
-) ORDER BY var, gp_execution_segment;
- gp_execution_segment |        var        |                                                                      value                                                                                         
-----------------------+-------------------+----------------------------------------------------------------------------------------------------------------------------------------
-                   -1 | DYLD_LIBRARY_PATH | @gphome@/lib:@gphome@/ext/python/lib:$DYLD_LIBRARY_PATH
-                    0 | DYLD_LIBRARY_PATH | @gphome@/lib:@gphome@/ext/python/lib:
-                    1 | DYLD_LIBRARY_PATH | @gphome@/lib:@gphome@/ext/python/lib:
-                   -1 | GPHOME            | @gphome@
-                    0 | GPHOME            | @gphome@
-                    1 | GPHOME            | @gphome@
-                   -1 | LD_LIBRARY_PATH   | @gphome@/lib:@gphome@/ext/python/lib:
-                   -1 | PATH              | @gphome@/bin:/usr/gnu/bin:/usr/local/bin:/bin:/usr/bin:/usr/sbin:/sbin:/usr/texbin:/usr/X11/bin:/sw/bin:/opt/local/bin:/opt/local/sbin
-                    0 | PATH              | @gphome@/ext/python/bin:/usr/bin:/bin:/usr/sbin:/sbin:/usr/local/bin
-                    1 | PATH              | @gphome@/ext/python/bin:/usr/bin:/bin:/usr/sbin:/sbin:/usr/local/bin
-(10 rows)
-
--- end_ignore
---
--- Some checks to verify what versions of perl/python we have.
--- If everything has been configured correctly, this should be constant
--- across all our installations.
---
--- All of these checks should return a single row because each should produce
--- a constant value across all segments and the master.
---
--- The current expected version is 2.6.2
---
-CREATE OR REPLACE FUNCTION python_version() returns text as $$
-import sys
-return sys.version_info
-$$ language plpythonu;
-SELECT python_version() FROM env GROUP BY python_version;
-    python_version     
------------------------
- (2, 6, 2, 'final', 0)
-(1 row)
-
---
--- Same check for perl version
---
--- Expected version is perl 5.008xxx
---
-CREATE OR REPLACE FUNCTION perl_version() returns text as $$
-return "Perl $]"
-$$ language plperlu;
--- ignore
-SELECT perl_version() FROM env GROUP BY perl_version;
- perl_version  
----------------
- Perl 5.008005
-(1 row)
-
---
--- The following two checks need to be put into big ignore blocks
--- because paths can be of differing lengths
---
--- start_ignore
-CREATE OR REPLACE FUNCTION python_path() returns text as $$
-import sys
-return sys.path[0]
-$$ language plpythonu;
-SELECT python_path() FROM env GROUP BY python_path;
-                 python_path                 
----------------------------------------------
- @gphome@/lib/python
-(1 row)
-
-CREATE OR REPLACE FUNCTION perl_path() returns text as $$
-return join(':', @INC)
-$$ language plperlu;
-SELECT perl_path() FROM env GROUP BY perl_path;
-                                                                                                                                                                                                      perl_path                                                                                                                                                                                                      
----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
- /System/Library/Perl/5.8.8/darwin-thread-multi-2level:/System/Library/Perl/5.8.8:/Library/Perl/5.8.8/darwin-thread-multi-2level:/Library/Perl/5.8.8:/Library/Perl:/Network/Library/Perl/5.8.8/darwin-thread-multi-2level:/Network/Library/Perl/5.8.8:/Network/Library/Perl:/System/Library/Perl/Extras/5.8.8/darwin-thread-multi-2level:/System/Library/Perl/Extras/5.8.8:/Library/Perl/5.8.6:/Library/Perl/5.8.1:.
-(1 row)
-
--- end_ignore
---
--- Create a harness to run shell commands and output stderr and stdout
---
-CREATE OR REPLACE FUNCTION execute(cmd text) returns text as $$
-import subprocess
-p = subprocess.Popen(cmd, shell=True, 
-                     stdout=subprocess.PIPE, stderr=subprocess.PIPE)
-r = p.communicate()
-header = "---------------------\n"
-return header + r[0]
-$$ LANGUAGE plpythonu;
---
--- Create a harness to run mapreduce jobs on the correct host/port
---
-CREATE OR REPLACE FUNCTION mapreduce(file text) returns setof text as $$
-import subprocess
-
-rv = plpy.execute("select hostname, port, user as user, " +
-   " current_database() as db from gp_segment_configuration where content=-1")
-
-cmd = ['@gpwhich_gpmapreduce@',
-       '--host=%s' % rv[0]['hostname'],
-       '--port=%s' % rv[0]['port'],
-       '--file=%s' % file,
-       rv[0]['db'], rv[0]['user'] ]
-p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
-r = p.communicate()
-header = "---------------------"
-err = '\nSTDERR> '.join(('STDERR> ' + r[1]).split('\n')).replace('\t', '   ')
-out = 'STDOUT>\n' + r[0].replace('\t', ', ')
-return [header, err, out]
-$$ LANGUAGE plpythonu;
-CREATE OR REPLACE FUNCTION mapreduce(file text, keys text) returns setof text as $$
-import subprocess
-
-rv = plpy.execute("select hostname, port, user as user, " +
-   " current_database() as db from gp_segment_configuration where content=-1")
-
-cmd = ['@gpwhich_gpmapreduce@',
-       '--host=%s' % rv[0]['hostname'],
-       '--port=%s' % rv[0]['port'],
-       '--file=%s' % file,
-       rv[0]['db'], rv[0]['user'] ]
-for key in keys.split(';'):
-  cmd.append('--key=%s' % key)
-p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
-r = p.communicate()
-header = "---------------------"
-err = '\nSTDERR> '.join(('STDERR> ' + r[1]).split('\n')).replace('\t', '   ')
-out = 'STDOUT>\n' + r[0].replace('\t', ', ')
-return [header, err, out]
-$$ LANGUAGE plpythonu;
---
--- CHECK 1) make sure plperlu is really installed:
---
--- Note: if not, you might need to 'export PG_LANG=true' and reconfigure/rebuild
---
-SELECT lanname, lanispl, lanpltrusted FROM pg_language WHERE lanname = 'plperlu';
- lanname | lanispl | lanpltrusted 
----------+---------+--------------
- plperlu | t       | f
-(1 row)
-
---
--- Since many of these tests will end up having variable-length output depending
--- on local paths, we want to disable the "----" lines from psql so that our diffs
--- look reasonable.
---
--- The default is "aligned"; if that ever changes, we need to change how we restore
--- it at the end of the test.
-\pset format
-\pset format unaligned
---
--- Check 2) Find gpmapreduce and print the help file
---
--- ignore
-select execute( '@gpwhich_gpmapreduce@ --help' );
-execute
----------------------
-@gpwhich_gpmapreduce@ - Greenplum Map/Reduce Driver 1.00b2
-
-Usage:
-  @gpwhich_gpmapreduce@ [options] -f file.yml [dbname [username]]
-
-General options:
-  -? | --help                   show this help, then exit
-  -V | --version                show version information, then exit
-  -v | --verbose                verbose output
-  -x | --explain                do not run jobs, but produce explain plans
-  -X | --explain-analyze        run jobs and produce explain-analyze plans
-  -k | --key <name>=<value>     sets a yaml variable
-
-Connection options:
-  -h | --host <hostname>        database server host or socket directory
-  -p | --port <port>            database server port
-  -U | --username <username>    database user name
-  -W | --password               prompt for password
-
-(1 row)
---
--- TEST 1) complain about missing file
---
-SELECT mapreduce('nosuchfile') ORDER BY 1;
-mapreduce
----------------------
-STDERR> Error: Could not open file 'nosuchfile'
-STDERR> 
-STDOUT>
-
-(3 rows)
---
--- TEST 2) Tests reading and sorting an input file;
---
-SELECT mapreduce('@abs_srcdir@/yml/sort.yml') ORDER BY 1;
-mapreduce
----------------------
-STDERR> mapreduce_73685_run_1
-STDERR> 
-STDOUT>
-value                    
--------------------------
-alex, 30, (1.352,8.2)      
-belinda, 38, (8.9,1.7)     
-bertha, 88, (2.75,9.4)     
-carina, 58, (4.27,8.8)     
-carmen, 78, (3.8,8.2)      
-chris, 78, (9.78,2)        
-denise, 24, (3.78,87.90)   
-diane, 18, (5.912,5.3)     
-edna, 18, (1.53,3.5)       
-esther, 98, (5.36,7.6)     
-fanny, 08, (1.2,0.9)       
-gina, 18, (9.82,7.5)       
-jane, 58, (1.34,0.44)      
-jean, 28, (8.561,7.3)      
-jenifer, 38, (6.6,23.3)    
-joan, 18, (9.4,47.04)      
-joe, 20, (5.5,2.5)         
-juanita, 58, (4.57,35.8)   
-julie, 68, (3.6,7.2)       
-karen, 48, (8.73,0.0)      
-koko, 88, (1.7,5.5)        
-leah, 68, (0.6,3.37)       
-lita, 25, (1.3,8.7)        
-liza, 38, (9.76,6.90)      
-louise, 98, (5.0,8.7)      
-martie, 88, (8.358,.93)    
-mary, 08, (3.7,39.20)      
-melissa, 28, (3.089,087.23)
-mike, 40, (3.1,6.2)        
-nan, 28, (6.35,0.43)       
-pamela, 48, (8.21,9.3)     
-pat, 18, (1.19,0.6)        
-paula, 68, (0.5,0.5)       
-rean, 48, (8.5,5.0)        
-sally, 34, (3.8,45.8)      
-sandra, 19, (9.345,09.6)   
-sandy, 38, (3.8,0.2)       
-sarah, 88, (8.4,2.3)       
-sharon, 78, (9.237,8.8)    
-sue, 50, (8.34,7.375)      
-sumi, 38, (1.15,0.6)       
-susan, 78, (6.579,3)       
-teresa, 38, (7.7,1.8)      
-trisha, 88, (1.29,2.2)     
-trudy, 88, (6.01,0.5)      
-velma, 68, (8.8,8.9)       
-vera, 78, (9.73,6.4)       
-wendy, 78, (2.62,03.3)     
-zena, 98, (0.35,0)         
-zola, 58, (2.56,4.3)       
-(50 rows)
-
-
-(3 rows)
---
--- TEST 3) Tests a basic map function and parameter passing
---
-SELECT mapreduce('@abs_srcdir@/yml/grep.yml', 'key=an') ORDER BY 1;
-mapreduce
----------------------
-STDERR> mapreduce_73691_run_1
-STDERR> 
-STDOUT>
-key|value                 
----+----------------------
-an |diane, 18, (5.912,5.3)  
-an |fanny, 08, (1.2,0.9)    
-an |jane, 58, (1.34,0.44)   
-an |jean, 28, (8.561,7.3)   
-an |joan, 18, (9.4,47.04)   
-an |juanita, 58, (4.57,35.8)
-an |nan, 28, (6.35,0.43)    
-an |rean, 48, (8.5,5.0)     
-an |sandra, 19, (9.345,09.6)
-an |sandy, 38, (3.8,0.2)    
-an |susan, 78, (6.579,3)    
-(11 rows)
-
-
-(3 rows)
---
--- Test 4) Tests producing multiple columns
---
-SELECT mapreduce('@abs_srcdir@/yml/grep2.yml', 'key=an') ORDER BY 1;
-mapreduce
----------------------
-STDERR> mapreduce_73697_run_1
-STDERR> 
-STDOUT>
-name   |age|location    
--------+---+------------
-diane  | 18|(5.912,5.3) 
-fanny  |  8|(1.2,0.9)   
-jane   | 58|(1.34,0.44) 
-jean   | 28|(8.561,7.3) 
-joan   | 18|(9.4,47.04) 
-juanita| 58|(4.57,35.8) 
-nan    | 28|(6.35,0.43) 
-rean   | 48|(8.5,5.0)   
-sandra | 19|(9.345,09.6)
-sandy  | 38|(3.8,0.2)   
-susan  | 78|(6.579,3)   
-(11 rows)
-
-
-(3 rows)
---
--- Test 5) Tests a basic reduce function
---
-SELECT mapreduce('@abs_srcdir@/yml/agebracket.yml') ORDER BY 1;
-mapreduce
----------------------
-STDERR> mapreduce_73703_run_1
-STDERR> 
-STDOUT>
-key            |value
----------------+-----
-0 => age < 10  |    2
-10 => age < 20 |    6
-20 => age < 30 |    6
-30 => age < 40 |    8
-40 => age < 50 |    4
-50 => age < 60 |    5
-60 => age < 70 |    4
-70 => age < 80 |    6
-80 => age < 90 |    6
-90 => age < 100|    3
-(10 rows)
-
-
-(3 rows)
---
--- Test 6) File Output tests
---
-SELECT execute('rm @abs_builddir@/results/fileout_*.out') ORDER BY 1;
-execute
----------------------
-
-(1 row)
-SELECT mapreduce('@abs_srcdir@/yml/fileout.yml') ORDER BY 1;
-mapreduce
----------------------
-STDERR> mapreduce_73712_run_1
-STDERR> mapreduce_73712_run_2
-STDERR> mapreduce_73712_run_3
-STDERR> 
-STDOUT>
-
-(3 rows)
-SELECT execute('cat @abs_builddir@/results/fileout_none.out') ORDER BY 1;
-execute
----------------------
-row 1:data 1
-row 2:data 2
-row 3:data 3
-
-(1 row)
-SELECT execute('cat @abs_builddir@/results/fileout_replace.out') ORDER BY 1;
-execute
----------------------
-row 1|data 1
-row 2|data 2
-row 3|data 3
-
-(1 row)
-SELECT execute('cat @abs_builddir@/results/fileout_append.out') ORDER BY 1;
-execute
----------------------
-row 1,data 1
-row 2,data 2
-row 3,data 3
-
-(1 row)
-SELECT mapreduce('@abs_srcdir@/yml/fileout.yml') ORDER BY 1;
-mapreduce
----------------------
-STDERR> mapreduce_73721_run_1
-STDERR> mapreduce_73721_run_2
-STDERR> mapreduce_73721_run_3
-STDERR> Error: OUTPUT 'out_3': file '@abs_builddir@/results/fileout_none.out' already exists, at line 27
-STDERR> Error: Object creation Failure
-STDERR> 
-STDOUT>
-
-(3 rows)
-SELECT execute('cat @abs_builddir@/results/fileout_none.out') ORDER BY 1;
-execute
----------------------
-row 1:data 1
-row 2:data 2
-row 3:data 3
-
-(1 row)
-SELECT execute('cat @abs_builddir@/results/fileout_replace.out') ORDER BY 1;
-execute
----------------------
-row 1|data 1
-row 2|data 2
-row 3|data 3
-
-(1 row)
-SELECT execute('cat @abs_builddir@/results/fileout_append.out') ORDER BY 1;
-execute
----------------------
-row 1,data 1
-row 2,data 2
-row 3,data 3
-row 1,data 1
-row 2,data 2
-row 3,data 3
-
-(1 row)
--- 
--- Test 7) Perl syntax error lineno reporting
---
-SELECT mapreduce('@abs_srcdir@/yml/perlerror.yml') ORDER BY 1;
-mapreduce
----------------------
-STDERR> ERROR:  creation of Perl function failed
-STDERR> DETAIL:  syntax error at line 18, near "[]"
-STDERR> syntax error at line 20, near ";
-STDERR>  }"
-STDERR> Error: Object creation Failure
-STDERR> ERROR:  creation of Perl function failed
-STDERR> DETAIL:  syntax error at line 28, near "[]"
-STDERR> syntax error at line 29, near ";
-STDERR>  }"
-STDERR> Error: Object creation Failure
-STDERR> ERROR:  creation of Perl function failed
-STDERR> DETAIL:  syntax error at line 37, near "[]"
-STDERR> Error: Object creation Failure
-STDERR> ERROR:  creation of Perl function failed
-STDERR> DETAIL:  syntax error at line 45, near "[]"
-STDERR> Error: Object creation Failure
-STDERR> ERROR:  creation of Perl function failed
-STDERR> DETAIL:  syntax error at line 53, near "[]"
-STDERR> Error: Object creation Failure
-STDERR>
-STDOUT>
-
-(3 rows)
---
--- Cleanup) Restore normal formatting options
---
-\pset format aligned
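
For reference, the plpythonu mapreduce() harness deleted above does nothing more than shell out to the gpmapreduce binary against the master host and port and capture its output. A minimal standalone sketch of the same invocation outside the regression harness might look like the following; the host, port, database, and user values are placeholder assumptions, not values from this test:

    # Standalone sketch of what the deleted plpythonu mapreduce() harness does:
    # build a gpmapreduce command line for the master and capture stdout/stderr.
    import subprocess

    def run_gpmapreduce(yml_file, host="localhost", port=5432,
                        dbname="regression", user="gpadmin", keys=()):
        cmd = ["gpmapreduce",
               "--host=%s" % host,
               "--port=%s" % port,
               "--file=%s" % yml_file,
               dbname, user]
        for key in keys:                      # e.g. "key=an" for the grep test
            cmd.append("--key=%s" % key)
        p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        out, err = p.communicate()
        return out, err

    # out, err = run_gpmapreduce("yml/sort.yml")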

http://git-wip-us.apache.org/repos/asf/incubator-hawq/blob/4e392375/src/test/regress/pg_regress.c
----------------------------------------------------------------------
diff --git a/src/test/regress/pg_regress.c b/src/test/regress/pg_regress.c
index bb2c94d..b222a0b 100644
--- a/src/test/regress/pg_regress.c
+++ b/src/test/regress/pg_regress.c
@@ -526,10 +526,6 @@ convert_sourcefiles(void)
 	ret = stat("output", &st);
 	if (ret == 0 && S_ISDIR(st.st_mode))
 		convert_sourcefiles_in("output", "expected", "out");
-
-	ret = stat("mapred", &st);
-	if (ret == 0 && S_ISDIR(st.st_mode))
-		convert_sourcefiles_in("mapred", "yml", "yml");
 }
 
 /*

http://git-wip-us.apache.org/repos/asf/incubator-hawq/blob/4e392375/src/test/regress/sql/.gitignore
----------------------------------------------------------------------
diff --git a/src/test/regress/sql/.gitignore b/src/test/regress/sql/.gitignore
index debb73d..28218be 100644
--- a/src/test/regress/sql/.gitignore
+++ b/src/test/regress/sql/.gitignore
@@ -34,7 +34,6 @@ create_function_2.sql
 aocs.sql
 appendonly.sql
 bkup_bkupdb.sql
-mapred.sql
 ereport.sql
 orafunk.sql
 tidycat.sql

http://git-wip-us.apache.org/repos/asf/incubator-hawq/blob/4e392375/src/tools/pgindent/typedefs.list
----------------------------------------------------------------------
diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list
index 3fed79b..d31b74e 100644
--- a/src/tools/pgindent/typedefs.list
+++ b/src/tools/pgindent/typedefs.list
@@ -1303,27 +1303,6 @@ LWLockId
 LWLockMode
 LWLockPadded
 macaddr
-mapred_adt_t
-mapred_clist_t
-mapred_document_t
-mapred_format_t
-mapred_function_t
-mapred_input_kind_t
-mapred_input_t
-mapred_kind_t
-mapred_mode_t
-mapred_object_t
-mapred_olist_t
-mapred_output_kind_t
-mapred_output_mode_t
-mapred_output_t
-mapred_parser_context_t
-mapred_parser_state_t
-mapred_parser_t
-mapred_plist_t
-mapred_reducer_t
-mapred_reference_t
-mapred_task_t
 marker_t
 MatchedItemPtr
 Material

http://git-wip-us.apache.org/repos/asf/incubator-hawq/blob/4e392375/tools/demo/gpmapreduce/1_grep.yml
----------------------------------------------------------------------
diff --git a/tools/demo/gpmapreduce/1_grep.yml b/tools/demo/gpmapreduce/1_grep.yml
deleted file mode 100644
index 29cf974..0000000
--- a/tools/demo/gpmapreduce/1_grep.yml
+++ /dev/null
@@ -1,34 +0,0 @@
-%YAML 1.1
----
-VERSION:         1.0.0.1
-
-DEFINE:
-  - INPUT:
-      NAME:      access_logs
-      FILE:
-         # change seghostname1, seghostname2 and file_path to reflect
-         # your runtime file locations
-         - seghostname1:/file_path/access_log
-         - seghostname2:/file_path/access_log2
-         
-  - MAP:
-      NAME:      grep_map
-      LANGUAGE:  perl
-      FUNCTION:  |
-        # 0: name the input parameters
-        my ($key, $value) = @_;
-        
-        # 1: extract the URL portion of the access log
-        $value =~ /"GET (.*) HTTP/;
-        my $url = $1;
-        
-        return [{"key" => $key, "value" => $value}] if ($value =~/$key/);
-        return [];
-      
-EXECUTE:
-  - RUN:
-      SOURCE:    access_logs
-      MAP:       grep_map
-      REDUCE:    IDENTITY
-
-

http://git-wip-us.apache.org/repos/asf/incubator-hawq/blob/4e392375/tools/demo/gpmapreduce/2_wordcount.yml
----------------------------------------------------------------------
diff --git a/tools/demo/gpmapreduce/2_wordcount.yml b/tools/demo/gpmapreduce/2_wordcount.yml
deleted file mode 100644
index 2d136ab..0000000
--- a/tools/demo/gpmapreduce/2_wordcount.yml
+++ /dev/null
@@ -1,30 +0,0 @@
-%YAML 1.1
----
-VERSION:         1.0.0.1
-
-DEFINE:
-  - INPUT:
-      NAME:      book
-      FILE:
-        # change seghostname and file_path to reflect your 
-        # runtime file location
-        - seghostname:/file_path/whitepaper.txt
-
-  - MAP:
-      NAME:      wordsplit_python
-      FUNCTION:  |
-        for word in value.split():
-          yield [word, 1]
-      LANGUAGE:   python
-      OPTIMIZE:   STRICT IMMUTABLE
-      PARAMETERS: value text
-      RETURNS:
-        - key text
-        - value integer
-        
-EXECUTE:
-  - RUN:
-      SOURCE:    book
-      MAP:       wordsplit_python
-      REDUCE:    SUM
-

http://git-wip-us.apache.org/repos/asf/incubator-hawq/blob/4e392375/tools/demo/gpmapreduce/README
----------------------------------------------------------------------
diff --git a/tools/demo/gpmapreduce/README b/tools/demo/gpmapreduce/README
deleted file mode 100644
index af41045..0000000
--- a/tools/demo/gpmapreduce/README
+++ /dev/null
@@ -1,131 +0,0 @@
-**************************************************************************
-ABOUT THE GREENPLUM MAPREDUCE DEMOS
-**************************************************************************
-
-This package contains two example programs of Greenplum MapReduce:
-
-1. Grep - This Greenplum MapReduce program searches through web access 
-          log files and extracts the URL portion of the log.
-       
-          Specification File: 1_grep.yml
-          Data Files: access_log, access_log2
-          Outputs To: STDOUT
-
-2. Word Count - This Greenplum MapReduce program goes through a text 
-                document and counts the distinct words found.
-
-                Specification File: 2_wordcount.yml
-                Data Files: whitepaper.txt
-                Outputs To: STDOUT
-
-
- 
-**************************************************************************
-BEFORE YOU BEGIN
-**************************************************************************
-
-1. You must be a Greenplum Database superuser (such as gpadmin) to run 
-   the MapReduce demos.
-
-
-2. Create a database to use for the Greenplum MapReduce demos:
-
-    $ createdb gpmrdemo
-
-3. In this database, create the procedural languages used by the demos:
-
-    $ psql gpmrdemo -c 'CREATE LANGUAGE plpythonu;'
-
-
-NOTE: If the CREATE LANGUAGE command does not succeed because a
-      library (.so file) could not be found, you will need to make
-      sure that all hosts in your Greenplum array have a shared 
-      Perl and Python installation. You must also make sure the
-      Perl and Python libraries are findable by your runtime linker
-      on all hosts in your Greenplum Database array. Greenplum
-      provides installer packages for Perl and Python on 
-      https://emc.subscribenet.com  if you need them.
-
-
-4. On your segment hosts, create a location to put the Greenplum
-   MapReduce demo data files. If you are running multiple segment 
-   instances per segment host, you can create this location on just 
-   one of your segment hosts. For example:
-
-      $ gpssh -h seghost1 -e 'mkdir /home/gpadmin/gpmrdata'
-
-    If you are running with only one segment instance per host, 
-    you will need to use two segment hosts:
-
-      $ gpssh -h seghost1 -h seghost2 -e 'mkdir /home/gpadmin/gpmrdata'
-
-
-
-
-**************************************************************************
-RUNNING DEMO 1: GREP
-**************************************************************************
-
-1. Copy the data files for this demo to the demo data location on your 
-   segment hosts. For example:
-
-    $ gpscp -h seghost1 -h seghost2 data/access_log data/access_log2 =:/home/gpadmin/gpmrdata
-
-
-2. Edit the 1_grep.yml file and change the <seghostname1>, <seghostname2>, 
-   and <file_path> place holders to reflect the actual location of the 
-   data files. For example:
-
-     FILE:
-        - myseghost1:/home/gpadmin/gpmrdata/access_log
-        - myseghost2:/home/gpadmin/gpmrdata/access_log2 
-
-3. Execute the 1_grep.yml Greenplum MapReduce job:
-
-    $ gpmapreduce -f 1_grep.yml gpmrdemo
-
-4. The program should return 47 output rows.
-
-
-**************************************************************************
-RUNNING DEMO 2: WORD COUNT
-**************************************************************************
-
-1. Copy the data file for this demo to the demo data location on a segment 
-   host. For example:
-
-    $ scp data/whitepaper.txt myseghost1:/home/gpadmin/gpmrdata
-
-
-2. Edit the 2_wordcount.yml file and change the <seghostname> and 
-   <file_path> place holders to reflect the actual location of the 
-   data file. For example:
-
-     FILE:
-        - myseghost1:/home/gpadmin/gpmrdata/whitepaper.txt 
-
-3. Execute the 2_wordcount.yml Greenplum MapReduce job:
-
-    $ gpmapreduce -f 2_wordcount.yml gpmrdemo | more
-
-4. The program should return 1488 output rows.
-
-
-**************************************************************************
-DEMO CLEANUP
-**************************************************************************
-
-After you have run the demos, run the following commands to clean up:
-
-1. Remove the demo data on the segment hosts:
-
-    $ gpssh -h seghost1 -h seghost2 -e 'rm -rf /home/gpadmin/gpmrdata'
-
-2. Drop the demo database:
-
-     $ dropdb gpmrdemo
-
-
-
-
-                                   

http://git-wip-us.apache.org/repos/asf/incubator-hawq/blob/4e392375/tools/demo/gpmapreduce/data/access_log
----------------------------------------------------------------------
diff --git a/tools/demo/gpmapreduce/data/access_log b/tools/demo/gpmapreduce/data/access_log
deleted file mode 100644
index 6e57876..0000000
--- a/tools/demo/gpmapreduce/data/access_log
+++ /dev/null
@@ -1,27 +0,0 @@
-10.254.0.52 - - [28/Aug/2008:16:52:13 -0700] "GET / HTTP/1.1" 200 1456
-10.254.0.52 - - [28/Aug/2008:16:52:13 -0700] "GET /apache_pb.gif HTTP/1.1" 200 2326
-10.254.0.52 - - [28/Aug/2008:16:52:13 -0700] "GET /favicon.ico HTTP/1.1" 404 209
-10.254.0.52 - - [28/Aug/2008:16:52:16 -0700] "GET /favicon.ico HTTP/1.1" 404 209
-10.254.0.52 - - [28/Aug/2008:16:52:21 -0700] "GET /~mapreduce HTTP/1.1" 301 236
-10.254.0.52 - - [28/Aug/2008:16:52:21 -0700] "GET /~mapreduce/ HTTP/1.1" 200 2657
-10.254.0.52 - - [28/Aug/2008:16:52:21 -0700] "GET /~mapreduce/images/gradient.jpg HTTP/1.1" 200 16624
-10.254.0.52 - - [28/Aug/2008:16:52:27 -0700] "GET /manual/ HTTP/1.1" 200 7559
-10.254.0.52 - - [28/Aug/2008:16:52:27 -0700] "GET /manual/style/css/manual.css HTTP/1.1" 200 18674
-10.254.0.52 - - [28/Aug/2008:16:52:27 -0700] "GET /manual/style/css/manual-print.css HTTP/1.1" 200 13200
-10.254.0.52 - - [28/Aug/2008:16:52:27 -0700] "GET /manual/style/css/manual-loose-100pc.css HTTP/1.1" 200 3065
-10.254.0.52 - - [28/Aug/2008:16:52:28 -0700] "GET /manual/images/favicon.ico HTTP/1.1" 200 1078
-10.254.0.52 - - [28/Aug/2008:16:52:29 -0700] "GET /manual/images/feather.gif HTTP/1.1" 200 6471
-10.254.0.52 - - [28/Aug/2008:16:52:29 -0700] "GET /manual/images/left.gif HTTP/1.1" 200 60
-10.254.0.52 - - [28/Aug/2008:16:52:31 -0700] "GET /manual/caching.html HTTP/1.1" 200 38651
-10.254.0.52 - - [28/Aug/2008:16:52:31 -0700] "GET /manual/images/down.gif HTTP/1.1" 200 56
-10.254.0.52 - - [28/Aug/2008:16:52:31 -0700] "GET /manual/images/up.gif HTTP/1.1" 200 57
-10.254.0.52 - - [28/Aug/2008:16:52:31 -0700] "GET /manual/images/caching_fig1.gif HTTP/1.1" 200 16515
-10.254.0.52 - - [28/Aug/2008:16:52:34 -0700] "GET /manual/platform/netware.html HTTP/1.1" 200 30591
-10.254.0.52 - - [28/Aug/2008:16:52:38 -0700] "GET /manual/mod/quickreference.html HTTP/1.1" 200 105453
-10.254.0.52 - - [28/Aug/2008:16:52:43 -0700] "GET /~mapreduce/demo HTTP/1.1" 403 215
-10.254.0.52 - - [28/Aug/2008:16:53:19 -0700] "GET /~mapreduce/demo/ HTTP/1.1" 200 2072
-10.254.0.52 - - [28/Aug/2008:16:53:19 -0700] "GET /icons/blank.gif HTTP/1.1" 200 148
-10.254.0.52 - - [28/Aug/2008:16:53:19 -0700] "GET /icons/folder.gif HTTP/1.1" 200 225
-10.254.0.52 - - [28/Aug/2008:16:53:19 -0700] "GET /icons/unknown.gif HTTP/1.1" 200 245
-10.254.0.52 - - [28/Aug/2008:16:53:19 -0700] "GET /icons/back.gif HTTP/1.1" 200 216
-

http://git-wip-us.apache.org/repos/asf/incubator-hawq/blob/4e392375/tools/demo/gpmapreduce/data/access_log2
----------------------------------------------------------------------
diff --git a/tools/demo/gpmapreduce/data/access_log2 b/tools/demo/gpmapreduce/data/access_log2
deleted file mode 100644
index 10bbacd..0000000
--- a/tools/demo/gpmapreduce/data/access_log2
+++ /dev/null
@@ -1,20 +0,0 @@
-10.254.0.52 - - [28/Aug/2008:16:53:22 -0700] "GET /~mapreduce/demo/1_grep.yml HTTP/1.1" 200 700
-10.254.0.52 - - [28/Aug/2008:16:53:25 -0700] "GET /~mapreduce/demo/2_wordcount.yml HTTP/1.1" 200 571
-10.254.0.52 - - [28/Aug/2008:16:53:28 -0700] "GET /~mapreduce/demo/3_wordcount_stem.yml HTTP/1.1" 200 1502
-10.254.0.52 - - [28/Aug/2008:16:53:30 -0700] "GET /~mapreduce/demo/4_join.yml HTTP/1.1" 200 2342
-10.254.0.52 - - [28/Aug/2008:16:53:32 -0700] "GET /~mapreduce/demo/5_keyword_match.yml HTTP/1.1" 200 4424
-10.254.0.52 - - [28/Aug/2008:16:53:35 -0700] "GET /~mapreduce/demo/wordcount_stem.out HTTP/1.1" 200 1205
-10.254.0.52 - - [28/Aug/2008:16:53:38 -0700] "GET /~mapreduce/demo/?C=M;O=A HTTP/1.1" 200 2072
-10.254.0.52 - - [28/Aug/2008:16:53:40 -0700] "GET /~mapreduce/demo/?C=D;O=A HTTP/1.1" 200 2072
-10.254.0.52 - - [28/Aug/2008:16:53:40 -0700] "GET /~mapreduce/demo/?C=S;O=A HTTP/1.1" 200 2072
-10.254.0.52 - - [28/Aug/2008:16:53:42 -0700] "GET /~mapreduce/demo/?C=N;O=A HTTP/1.1" 200 2072
-10.254.0.52 - - [28/Aug/2008:16:53:44 -0700] "GET /~mapreduce/demo/data/ HTTP/1.1" 200 1262
-10.254.0.52 - - [28/Aug/2008:16:53:45 -0700] "GET /~mapreduce/demo/data/?C=M;O=A HTTP/1.1" 200 1262
-10.254.0.52 - - [28/Aug/2008:16:53:46 -0700] "GET /~mapreduce/demo/data/?C=D;O=A HTTP/1.1" 200 1262
-10.254.0.52 - - [28/Aug/2008:16:53:48 -0700] "GET /~mapreduce/demo/data/email/ HTTP/1.1" 200 1668
-10.254.0.52 - - [28/Aug/2008:16:53:48 -0700] "GET /icons/text.gif HTTP/1.1" 200 229
-10.254.0.52 - - [28/Aug/2008:16:53:49 -0700] "GET /~mapreduce/demo/data/email/?C=N;O=D HTTP/1.1" 200 1668
-10.254.0.52 - - [28/Aug/2008:16:53:54 -0700] "GET /~mapreduce/demo/data/email/mailfiles HTTP/1.1" 200 330
-10.254.0.52 - - [28/Aug/2008:17:20:30 -0700] "GET /~mapreduce/ HTTP/1.1" 304 -
-10.254.0.52 - - [28/Aug/2008:17:20:33 -0700] "GET /~mapreduce/demo HTTP/1.1" 301 241
-10.254.0.52 - - [28/Aug/2008:17:20:33 -0700] "GET /~mapreduce/demo/ HTTP/1.1" 200 2072

http://git-wip-us.apache.org/repos/asf/incubator-hawq/blob/4e392375/tools/demo/gpmapreduce/data/whitepaper.txt
----------------------------------------------------------------------
diff --git a/tools/demo/gpmapreduce/data/whitepaper.txt b/tools/demo/gpmapreduce/data/whitepaper.txt
deleted file mode 100644
index 3deae71..0000000
--- a/tools/demo/gpmapreduce/data/whitepaper.txt
+++ /dev/null
@@ -1,448 +0,0 @@
-Introduction 
-
-In 2004, Google published a research paper on the MapReduce 
-framework they developed for their internal data processing 
-needs.  In simple, approachable terms, the paper describes how 
-Google developers harness massively parallel clusters of computers 
-to analyze some of the largest datasets ever collected. 
-Since that paper was published, there has been ongoing discussion 
-about the role of this technology outside the walls of Google. 
-Excitement about MapReduce has spread quickly in the computing 
-industry, particularly in young and forward-looking firms.  But 
-there is confusion and controversy about how the technology fits 
-into the larger ecosystem of information technology, especially 
-with respect to other "big data" solutions like massively parallel 
-SQL database engines.  
-
-In this whitepaper, we provide a technical context for that discussion.  
-In a nutshell, we present SQL and MapReduce as two different 
-programming paradigms that are implemented via a common 
-engine architecture: parallel dataflow.  Seen in these terms, 
-MapReduce can be viewed as a new programming interface to 
-traditional data-parallel computing. 
-
-After presenting this context, we introduce Greenplum MapReduce: 
-a seamless integration of MapReduce and relational database 
-functionality unified in one massively parallel dataflow engine.   
-We describe how Greenplum allows MapReduce programs and 
-SQL to interoperate, efficiently and flexibly processing data in  
-both standard files and database tables. 
-
-History: Three Revolutions 
-
-To understand Greenplum’s MapReduce implementation, it is 
-helpful to see it in the context of three historical shifts in the 
-technology behind large-scale data management: the Relational 
-Database revolution, the rise of shared-nothing parallelism, and 
-the popularization of the MapReduce parallel programming paradigm. 
-
-Relational Database Systems (RDBMSs) were a radical idea when they 
-were introduced, and they revolutionized the way that enterprises 
-manage their records. The birth of relational technology in research 
-is well documented.  In 1970, IBM researcher Ted Codd published his 
-first paper on the relational model of data [Codd70], which proposed 
-representing data in tables of rows and columns, and querying the 
-data using a high-level declarative language that formed the foundation 
-for what we now know as SQL.  Some 5 years later, Jim Gray and 
-colleagues at IBM research proposed ACID transactions as a model for 
-correctness of concurrent updates in a database [Gray78].  Codd and 
-Gray both received the Turing award (the "Nobel prize of computer 
-science") for this work.  By the mid-1970’s, researchers at IBM and UC 
-Berkeley were hard at work on the two key prototype systems – System 
-R and Ingres –  that gave birth to the modern relational database  
-industry.  Both of these systems developed query optimizer technology 
-that compiles declarative queries and passes the result to a dataflow 
-processing engine, which directs streams of data through operations 
-like filters, index lookups, joins, grouping and aggregation. 
-
-Relational databases remain the workhorses of modern record keeping 
-30 years later, and for good reason.  Modern implementations of ACID 
-transactions ensure dependable, consistent management of data 
-storage.   The declarative nature of SQL enables data analysis via ad 
-hoc queries, and ensures that data-centric applications continue to 
-work correctly even as data layouts and database hardware evolves.  
-Beneath all this, the simple elegance of the relational model helps 
-provide discipline needed for sound, long-term database design. 
-
-"Shared Nothing" Parallelism (1980’s-present) 
-
-As relational database systems were becoming a commercial reality 
-in the 1980’s, efforts were afoot to accelerate database performance 
-via custom hardware known then as "database machines".  However, 
-it quickly became clear that economies of scale favored commodity 
-hardware over custom solutions: the latest version of a commodity 
-computer invariably provides a better price/performance point than 
-last year’s custom-designed machine, negating the performance 
-benefits of customization.  As a result, the main early efforts toward 
-database machines were deemed a failure by the researchers and 
-entrepreneurs who pioneered the area [BoralDeWitt83]. 
-
-Out of the ashes of the work on database machines, a new idea rose: 
-database software could be parallelized to leverage multiple commodity 
-processors in a network to deliver increased scalability and performance.  
-The failed history of custom database machines led to a particular  
-interest in building parallel databases using commodity computers.  
-The term "shared-nothing" parallelism was coined for these computing 
-clusters, to distinguish them from the shared-memory multiprocessors 
-that were being designed at the time for scientific applications. 
-
-In order to harness the power of a cluster, query processing software 
-had to evolve to take advantage of multiple disks, processors, and 
-network links operating in parallel.  To achieve this, the rows of a table 
-were partitioned across multiple machines with separate disks,  
-enabling parallel I/O scans of big tables.  Basic relational query operators 
-like selection, join, grouping and aggregation were reinvented to run 
-in parallel via similar partitioning schemes: the operations undertaken 
-by each node in the cluster are the same, but the data being pumped 
-through the fabric is automatically partitioned to allow each node to 
-work on its piece of the operator independently.  Finally, these  
-architectures allowed multiple relational operators to operate at the  
-same time, allowing pipeline parallelism in which an operator producing 
-a data stream runs in parallel with the operator consuming it.  The 
-resulting shared-nothing parallel RDBMSs were explored in research 
-prototypes like Gamma and Bubba, and commercialized early on by 
-Teradata and Tandem.  
-
-Shared-nothing architectures enabled relational databases to scale 
-to unprecedented levels.  This changed the way that many businesses 
-approached the value of data: in addition to keeping the current books 
-correct, analytic applications could be built over historical records to 
-provide new business efficiencies.  In the 1990’s, WalMart famously 
-utilized parallel databases to gain radical efficiencies in supply chain 
-management via item-level inventory and historical sales information.  
-In recent years, virtually every sizable enterprise has realized the 
-importance of scalable solutions for data warehousing and analytics. 
-
-Parallel Programming with MapReduce (2000-present) 
-
-In the last decade, the importance of shared-nothing clusters was 
-rediscovered in the design of web services like search engine 
-infrastructure and messaging [Brewer01].  However, the implementation 
-of those early web services was done by small teams of expert  
-developers, much as the early parallel database systems were built. 
-In this context, Google was differentiating itself as a company by 
-developing a data-driven culture, in which employees are explicitly 
-encouraged to (a) develop innovative solutions by analyzing the 
-company’s data assets, and (b) gain project approval from colleagues 
-by using data analysis to overcome "conventional wisdom" and other 
-institutional arguments against innovation [Kaushik06]. The growth 
-of Google’s data-driven culture was facilitated by getting the right  
-analytic tools into the hands of employees: tools that could allow  
-software developers to conveniently explore and analyze some of  
-the largest data sets ever assembled. 
-
-The key tools that Google built for their developers were the MapReduce 
-programming paradigm, and a proprietary runtime engine for internal 
-use at Google [DeanGhemawat08]. At heart, MapReduce is a very 
-simple dataflow programming model that passes data items through 
-simple user-written code fragments.  Google’s MapReduce programs 
-start with a large datafile that is broken into contiguous pieces called 
-"splits".  Each split is converted via user-defined parsing code into 
-(key, value) pairs that are sent to a Map module, which invokes a user- 
-supplied Map function on each pair, producing a new key and list of 
-output values.  Each (key, output_list) pair is passed to a Reduce  
-module (possibly on another machine) that gathers them together,  
-assembles them into groups by key, and then calls a user-supplied  
-Reduce function to produce one reduced output list per group of 
-mapped (key, output_list) pairs.  Both the Map and Reduce modules 
-utilize partition parallelism to enable many Map tasks (and many  
-Reduce tasks) to run in parallel. 
-
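To make the dataflow described above concrete, the following is a minimal, self-contained Python sketch of the split / Map / group-by-key / Reduce pipeline. It is an illustration only, not Google's or Greenplum's implementation; the word-count functions mirror the 2_wordcount.yml demo removed elsewhere in this commit.

    # Minimal illustration of the MapReduce dataflow: Map emits (key, value)
    # pairs, the framework groups values by key, and Reduce folds each group.
    from collections import defaultdict

    def map_reduce(records, map_fn, reduce_fn):
        groups = defaultdict(list)
        for record in records:                 # each record stands in for a "split"
            for key, value in map_fn(record):  # Map: record -> (key, value) pairs
                groups[key].append(value)      # shuffle: group values by key
        return {key: reduce_fn(values) for key, values in groups.items()}

    lines = ["to be or not to be"]
    counts = map_reduce(lines,
                        map_fn=lambda line: [(w, 1) for w in line.split()],
                        reduce_fn=sum)
    print(counts)   # {'to': 2, 'be': 2, 'or': 1, 'not': 1}
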
-MapReduce has become very popular within Google for everything 
-from the construction of their core web index, to simple programs 
-written by a single developer in a half hour [DeanGhemawat08].  The 
-MapReduce programming model has become available to programmers 
-outside of Google as well, via the Hadoop open-source runtime. 
-MapReduce is particularly attractive to developers for two main reasons:
- 
-• Data accessibility: Data is accessed from standard files, with no 
-need for a priori definition of schemas or file formats, and no 
-need to configure and load a database before getting answers.  
-This allows developers to "wrangle" any file format that they 
-have at hand; at a company like Google this includes web crawls 
-(HTML), term occurrence data, clickstream logs, and advertising 
-revenue history.  The focus on standard files also means that  
-developers can typically get work done without requesting permission 
-from the "Keepers of the Data" that guard traditional IT shops. 
-
-• Language Familiarity: Most of the MapReduce programmer’s  
-work is done in familiar programming languages used by  
-developers: Google’s MapReduce uses C++, and Hadoop uses 
-Java.  This exposes massive data parallelism to developers within 
-the context of their familiar development environment: editors, 
-debuggers, and so on.  By contrast, relatively few developers 
-work with data-centric languages like SQL on a daily basis, and 
-SQL experts tend to inhabit a different ecosystem (training, job 
-title) than typical software developers. 
-
-Technical common ground: Parallel Dataflow 
-
-The MapReduce revolution is so recent that the dust has yet to  
-settle on the new regime – there is still plenty of debate about how 
-MapReduce and parallel RDBMSs fit together in a data-centric  
-organization. Some database leaders have argued publicly that the 
-MapReduce phenomenon is not a technical revolution at all – they 
-characterize it as a reinvention of well-known parallel database 
-techniques that is missing key database functionality (ACID storage, 
-indexes, query optimization, etc.) 
-
-
-[DeWittStonebraker08].  The MapReduce proponents argue that they 
-neither need nor want a heavyweight database for many tasks, and 
-they have no interest in coding in SQL.  From their standpoint, 
-MapReduce has revolutionized the developer ecosystem, providing 
-them with easy access to parallelism over their own data, in their own 
-language framework.  
-
-Both these arguments have merit.  But the disconnect between these 
-viewpoints can lead to inefficiencies and confusion in an organization 
-trying to instill a broad data-driven culture.  Consider what happens 
-if the traditionally cautious IT department requires the use of a full 
-RDBMS feature stack, and the maverick developers focus on the light- 
-weight and programmer-friendly MapReduce framework.  Data assets 
-get partitioned across teams, as do in-house programs for data analysis.  
-Worse, two separate data cultures evolve within the organization, 
-leading to destructive "data ownership" politics, and arguments over 
-tools rather than solutions. 
-
-Despite the differences in programming interfaces and software 
-philosophy, RDBMSs and MapReduce engines both are brought alive 
-by the same "beating heart": a massively parallel dataflow engine, 
-pumping data across a parallel network mesh, through high-performance 
-bulk operations (join, map, reduce, etc.). Is it possible to take that core 
-dataflow component, and provide interfaces for both ecosystems? 
-In principle, this should be entirely natural. The main barriers come 
-from software engineering realities.  The tried-and-true parallel RDBMS 
-engines were built in the 1980’s and 90’s with the dataflow engine 
-embedded deep into the relational codebase.  It is a tall order to 
-extract the "beating heart" from those systems for reuse elsewhere.  
-By contrast, MapReduce implementations like Hadoop provide none 
-of the key features required of a DBMS.  They have a lot of "heart", but 
-the body-building required to replicate a full-featured RDBMS would 
-take years. 
-
-Greenplum enters this arena from a unique direction.  Greenplum 
-began in the "heart transplant" business: its core technology effort 
-was to take PostgreSQL, the best-of-breed open-source RDBMS, and 
-insert a massively parallel dataflow engine into its core.  Based on 
-that success, Greenplum is now able to offer the first commercial  
-implementation of MapReduce, built on that same core parallel  
-technology.  Because Greenplum’s RDBMS and MapReduce share  
-the same core engine, they are uniquely interoperable. 
-
-Introducing Greenplum MapReduce 
-
-Greenplum MapReduce provides a convenient, easy-to-program  
-platform for massive data-parallelism. It implements a harness for  
-parallel Map and Reduce functions, along with flexible data access  
-to files, database records, and system services. 
-
-Greenplum allows developers to write Map and Reduce functions in 
-a variety of popular scripting languages: the list currently includes 
-Python and Perl.  Support for these popular languages includes access 
-to entire ecosystems of open-source packages via the Python Package 
-Index (PyPi) and the Comprehensive Perl Archive Network (CPAN).  This 
-includes a host of features not usually found in an RDBMS: free-text 
-analysis, statistical toolkits, graph algorithms, HTML and XML parsing, 
-web connectivity (SOAP, REST, HTTP), and many more. 
-
-In terms of data access, Greenplum MapReduce provides developers 
-with the familiar flexibility to access their data "where it lives": in files, 
-websites, or even via arbitrary operating system commands.  Greenplum 
-provides this data without any of the overheads that developers often 
-associate with traditional RDBMSs: no locking, logging or distributed 
-"commit" protocols.  On the other hand, for data that does need to 
-be protected by a full-featured RDBMS, Greenplum MapReduce offers 
-efficient native access to database records: it pushes MapReduce 
-programs down into Greenplum’s parallel database engine, without 
-the cost of going "out-of-box" to get data from a separate DBMS over 
-narrow client interfaces. 
-
-The Power of Synergy 
-
-Greenplum is unique in offering a commercial-grade implementation 
-of MapReduce, providing a robust implementation of the open interfaces 
-that enable and encourage developers to work with massive data sets. 
-But the biggest advantage of Greenplum’s implementation comes 
-from its shared technology core, which unifies MapReduce and RDBMS 
-functionality within a single parallel dataflow engine. 
-
- This unique architecture allows developers to mix and match data 
-sources and programming styles.  Greenplum’s solution is also able to 
-make MapReduce programs visible to SQL queries and vice-versa.  This 
-set of features enables a number of natural design patterns that are 
-unique to Greenplum: 
-
-• MapReduce programs over high-performance database tables. 
-Access to database data is trivial in Greenplum MapReduce: the 
-MapReduce program simply specifies the database table name 
-as its input.  Because Greenplum database tables are partitioned 
-across multiple machines, the initial Map phase is executed in 
-the database engine directly on the local partition, providing fully 
-parallel I/O with computation "pushed" to the data. By contrast, 
-a standalone MapReduce engine would require the programmer 
-to write data access routines into their MapReduce script.  That 
-extra programmer code would then access a remote database 
-server via a connectivity protocol like JDBC, and pull the database 
-records over to Map workers. 
-
-• SQL over external data sources.  Greenplum’s "External Table" 
-facility allows files and data-producing programs to be registered 
-as read-only tables in the database, and queried in SQL along- 
-side database tables.  External data is accessed and converted to 
-records on the fly during query processing.  Because these external  
-tables can be stored or generated on an arbitrary number of 
-nodes in the cluster, data access and conversion is a massively 
-parallel process.  
-
-• Durable storage of MapReduce outputs.  Many MapReduce  
-programs run for hours, and provide important analytic results.  
-Like standalone MapReduce implementations, Greenplum can 
-store these results in a filesystem.  But it is equally easy to store 
-the results of Greenplum MapReduce in a Greenplum database, 
-with full ACID durability guarantees, and the option to subsequently 
-analyze those outputs via Business Intelligence tools, SQL queries, 
-and other enterprise analytic software designed for databases.  
-Again, because the MapReduce code runs in the same engine 
-as the database, writing of output tables is fully parallelized and 
-requires no remote connectivity overheads. 
-
-• Rich integration of MapReduce and SQL code.  Greenplum’s 
-unique architecture removes barriers between code written in the 
-MapReduce framework, and code written in SQL. Because  
-Greenplum MapReduce scripts can be configured to flexibly  
-access the database, they can use arbitrary SQL queries as input.  
-In the other direction, Greenplum MapReduce scripts can be 
-registered as "views" in the database, and used as virtual tables 
-within SQL statements: the MapReduce job is run on the fly as 
-part of the SQL query processing, and its outputs are pipelined 
-directly into the relational query plan. Greenplum’s engine executes 
-all code – SQL, Map functions, Reduce functions – on the same 
-cluster of machines where the database is stored.  This integration 
- allows SQL and MapReduce developers to share code freely with- 
-out performance penalties or the need to work with "adapter"  
-software.  This flexibility removes the overhead of cultural and 
-political debates about the "right" programming framework 
-within an organization. 
-
-MapReduce in Use 
-
-MapReduce supports simple, easy-to-program dataflows: a single  
-data source piped into a chain of customizable Map and Reduce  
-operators.  As a result, MapReduce is especially well suited for  
-parallelizing custom tasks over a single dataset. 
-Data extraction and transformation tasks fit this model well.  Consider 
-the example of an e-commerce website with free-text descriptions of 
-products.  As a first phase in categorizing products, we would like to 
-automatically extract keywords from the HTML description text for each 
-product.  That is, we want to convert each free text description into a 
-set of pairs of the form (productID, keyword).   
-
- MapReduce makes this easy.  We configure it to route product pages 
-(which may be in files, a database, or even on the Web) to a cluster of 
-Greenplum servers, each running a Python Map operator.  The Python 
-Map code on each node repeatedly gets a product page, splits the 
-product description text into a list of potential keywords, and then 
-loops through the resulting list and outputs (productID, keyword) 
-pairs.   These can be routed to Python Reduce operators running on 
-the cluster, which can gather up and count these pairs to produce 
-outputs of the form (productID, keyword, occurrences), where the last 
-field captures the number of times each keyword occurs in each product 
-description. This output can be stored in a database table for use in 
-subsequent tasks.  For example, using this table, products can be 
-"auto-categorized" by a simple SQL query that joins the MapReduce 
-output with a table of keywords and product categories. 
-
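A hedged, plain-Python sketch of the Map and Reduce roles in this keyword example follows; in Greenplum MapReduce the functions would be embedded in a YAML specification, and the simple split() below stands in for real keyword extraction.

    # Map: emit one (productID, keyword) pair per candidate keyword.
    # Reduce: count occurrences per (productID, keyword) pair.
    from collections import Counter

    def keyword_map(product_id, description):
        for keyword in description.lower().split():
            yield (product_id, keyword)

    def keyword_reduce(pairs):
        counts = Counter(pairs)
        return [(pid, kw, n) for (pid, kw), n in counts.items()]

    pairs = [p for pid, text in [(1, "red red shoe"), (2, "blue shoe")]
               for p in keyword_map(pid, text)]
    print(keyword_reduce(pairs))
    # [(1, 'red', 2), (1, 'shoe', 1), (2, 'blue', 1), (2, 'shoe', 1)]
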
-As a very different example, the New York Times used a simple MapReduce 
-program to convert years of scanned newspaper articles into digital 
-text. The approach is to use parallel dataflow to "kick off" parallel 
-computation.  To do this in Greenplum, a list of image filenames can 
-be piped into a cluster of Greenplum Map operators written in Perl.  
-Each Map operator uses Perl’s system command to execute an Optical 
-Character Recognition program (e.g. the open source Tesseract tool) 
-to convert the image file into text.  No Reduce phase is required; the 
-results of the OCR program can be written to text files, or loaded as 
-text fields into a database table. 
-
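A sketch of such a Map step in Python might look like the following, assuming a tesseract binary is available on the PATH of every segment host (Perl's system call would be analogous):

    # OCR "Map" step: each input record is an image filename; emit
    # (filename, recognized_text).  Assumes tesseract 3.03+ on the PATH.
    import subprocess

    def ocr_map(image_path):
        # "tesseract <image> stdout" writes the recognized text to stdout
        result = subprocess.run(["tesseract", image_path, "stdout"],
                                capture_output=True, text=True)
        yield (image_path, result.stdout)
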
-Both of these examples do information extraction, transformation and 
-loading, often called ETL in the data warehousing business.  Some 
-database engines advertise the ability to do ELT: loading the data into 
-the database before transforming it, to allow subsequent transformations 
-to run in SQL. Greenplum’s flexibility makes the reordering of the "L" 
-phase completely fluid: data can be stored inside or outside the  
-database, and accessed in either case by massively parallel code  
-written in either MapReduce or SQL.   So Greenplum easily enables  
-either ETL or ELT, along with options like ET (in which the data is always 
-stored outside the database) and LET (in which the raw form of the  
-information is stored in the database.)  This is the kind of flexibility 
-that comes from decoupling the parallel dataflow engine, allowing it  
-to interoperate with various storage and language interfaces. 
-
- An important detail in handling free text is to canonicalize multiple forms of the same 
-word: e.g. "driver", "drivers", and "driving" should all be converted to "drive" so they 
-will match. Because Greenplum MapReduce provides access to Perl and Python’s 
-open-source libraries, we can use Python’s nltk toolkit for Natural Language Processing 
-to do this task – a two-line addition to the basic Map program sketched above. 
-
-The previous examples focused on data extraction and transformation,  
-but MapReduce is also useful for deeper data mining and analytics.   
-Many companies employ experts in statistics and finance, who 
-increasingly want to run complex mathematical models over large 
-volumes of data. Recently, there have been a number of tutorials and 
-papers on easily implementing popular data mining techniques in 
-parallel using MapReduce [KimballMichelsBisciglia07, ChuEtAl06].  A 
-variety of sophisticated data mining and machine learning algorithms 
-have been expressed in this framework, including popular techniques 
-for classification, clustering, regression, and dimensionality reduction.  
-And in the Greenplum context, these algorithms can be flexibly combined 
-with SQL and run over both database tables and files. 
-
-Conclusion 
-
-MapReduce and SQL are two useful interfaces that enable software 
-developers to take advantage of parallel processing over big data sets.  
-Until recently, SQL was targeted at enterprise application programmers 
-accessing transactional records, and MapReduce was targeted at more 
-general software developers manipulating files.  This distinction was 
-mostly an artifact of the limitations of systems in the marketplace,  
-but has led to significant confusion and slowed the adoption of 
-MapReduce as a programming model in traditional data-rich settings 
-in the business world. 
-Greenplum’s technical core competency is parallel data technology. 
-By applying that expertise to both MapReduce and SQL programs, 
-Greenplum has changed the landscape for parallel data processing, 
-removing arbitrary barriers between programming styles and usage 
-scenarios.  The resulting Greenplum engine is a uniquely flexible and 
-scalable data processing system, allowing flexible combinations of 
-SQL and MapReduce, database tables and files. 
-
-References 
-
-[Codd70] E. F. Codd: A Relational Model of Data for Large Shared Data 
-Banks. Commun. ACM 13(6): 377-387 (1970) 
-[Gray78] Jim Gray: Notes on Data Base Operating Systems. In Michael 
-J. Flynn, et al. (Eds.): Operating Systems, An Advanced Course. Lecture 
-Notes in Computer Science 60. Springer, 1978: 393-481. 
-[BoralDeWitt83] Haran Boral, David J. DeWitt: Database Machines:  
-An Idea Whose Time Passed? A Critique of the Future of Database  
-Machines. International Workshop on Database Machines (IWDM) 
-1983: 166-187 
-[Brewer01] Eric A. Brewer: Lessons from Giant-Scale Services. IEEE 
-Internet Computing 5(4): 46-55 (2001) 
-[Kaushik06]  Avinash Kaushik. Web Analytics: An Hour a Day.  Sybex 
-Publishers, 2007. 
-[DeanGhemawat08] Jeffrey Dean, Sanjay Ghemawat: MapReduce:  
-simplified data processing on large clusters. Commun.  
-ACM 51(1): 107-113 (2008) 
-[DeWittStonebraker08]  David J. DeWitt and Michael Stonebraker.   
-MapReduce: A Major Step Backwards.  The Database Column  
-(weblog).  January 17, 2008. http://www.databasecolumn. 
-com/2008/01/mapreduce-a-major-step-back.html 
-[KimballMichelsBisciglia07] Aaron Kimball, Sierra Michels-Slettvet, 
-and Christophe Bisciglia. Cluster Computing and MapReduce. Google 
-Code University (website).  Summer, 2007. 
-[ChuEtAl06] Cheng-Tao Chu, Sang Kyun Kim, Yi-An Lin, YuanYuan Yu, 
-Gary Bradski, Andrew Ng, and Kunle Olukotun.  MapReduce for  
-Machine Learning on Multicore. Advances in Neural Information  
-Processing Systems (NIPS), December, 2006. 
-Revision 1. August 2008. 

http://git-wip-us.apache.org/repos/asf/incubator-hawq/blob/4e392375/tools/doc/gpmapreduce_help
----------------------------------------------------------------------
diff --git a/tools/doc/gpmapreduce_help b/tools/doc/gpmapreduce_help
deleted file mode 100644
index be79f62..0000000
--- a/tools/doc/gpmapreduce_help
+++ /dev/null
@@ -1,144 +0,0 @@
-COMMAND NAME: gpmapreduce
-
-Runs Greenplum MapReduce jobs as defined in a YAML specification document.
-
-
-*****************************************************
-SYNOPSIS
-*****************************************************
-
-gpmapreduce -f <yaml_file> [<dbname> [<username>]]
-            [-k <name>=<value> | --key <name>=<value>] 
-            [-h <hostname> | --host <hostname>] 
-            [-p <port>| --port <port>] 
-            [-U <username> | --username <username>] [-W] [-v]
-
-gpmapreduce -V | --version
-
-gpmapreduce -? | --help
-
-gpmapreduce -x | --explain
-
-gpmapreduce -X | --explain-analyze
-
-
-*****************************************************
-PREREQUISITES
-*****************************************************
-
-The following are required prior to running this program:
-
-* You must have your MapReduce job defined in a YAML file.
-
-* You must be a Greenplum Database superuser to run MapReduce jobs 
-  written in untrusted Perl or Python.
-
-* You must be a Greenplum Database superuser to run MapReduce jobs 
-  with EXEC and FILE inputs.
-
-* Non-superuser roles must be granted external table permissions
-  using CREATE/ALTER ROLE in order to run MapReduce jobs. 
-  
-*****************************************************
-DESCRIPTION
-*****************************************************
-
-MapReduce is a programming model developed by Google for 
-processing and generating large data sets on an array of commodity 
-servers. Greenplum MapReduce allows programmers who are familiar 
-with the MapReduce paradigm to write map and reduce functions and 
-submit them to the Greenplum Database parallel engine for processing.
-
-In order for Greenplum to be able to process MapReduce functions, 
-the functions need to be defined in a YAML document, which is then 
-passed to the Greenplum MapReduce program, gpmapreduce, for execution 
-by the Greenplum Database parallel engine. The Greenplum system takes 
-care of the details of distributing the input data, executing the 
-program across a set of machines, handling machine failures, 
-and managing the required inter-machine communication.
-
-
-*****************************************************
-OPTIONS
-*****************************************************
-
-
--f <yaml_file>
-
-Required. The YAML file that contains the Greenplum MapReduce 
-job definitions. See the Greenplum Database Administrator Guide
-for more information about creating YAML documents.
-
-
--? | --help
-
-Show help, then exit.
-
-
--V | --version
-
-Show version information, then exit.
-
-
--v | --verbose
-
-Show verbose output.
-
-
--x | --explain
-
-Do not run MapReduce jobs, but produce explain plans.
-
-
--X | --explain-analyze
-
-Run MapReduce jobs and produce explain-analyze plans.
-
-
--k | --key <name>=<value>
-
-Sets a YAML variable.  A value is required. Defaults to 'key' 
-if no variable name is specified. 
-
-
--h <host> | --host <host>
-
-Specifies the host name of the machine on which the Greenplum 
-master database server is running. If not specified, reads 
-from the environment variable PGHOST or defaults to localhost.
-
-
--p <port> | --port <port>
-
-Specifies the TCP port on which the Greenplum master database 
-server is listening for connections. If not specified, reads 
-from the environment variable PGPORT or defaults to 5432.
-
-
--U <username> | --username <username>
-
-The database role name to connect as. If not specified, reads 
-from the environment variable PGUSER or defaults to the 
-current system user name.
-
-
--W | --password
-
-Force a password prompt.
-
-
-*****************************************************
-EXAMPLES
-*****************************************************
-
-Run a MapReduce job as defined in my_yaml.txt:
-
-gpmapreduce -f my_yaml.txt
-
-
-*****************************************************
-SEE ALSO
-*****************************************************
-
-"Greenplum MapReduce YAML Specification" in the 
-Greenplum Database Administrator Guide

