accumulo-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From e..@apache.org
Subject [16/38] git commit: ACCUMULO-600 removed wikisearch from trunk
Date Tue, 26 Nov 2013 15:47:20 GMT
ACCUMULO-600 removed wikisearch from trunk

git-svn-id: https://svn.apache.org/repos/asf/accumulo/trunk@1370489 13f79535-47bb-0310-9956-ffa450edef68
(cherry picked from commit d1e5c6ebe2796154b514ec8f147720d70b3800b5)

Reason: Maintainability
Author: Billie Rinaldi <billie@apache.org>
Ref: ACCUMULO-1792

Differs from upstream by leaving a tombstone marker pointing to the contrib project.

Author: Sean Busbey <busbey@cloudera.com>

Signed-off-by: Eric Newton <eric.newton@gmail.com>


Project: http://git-wip-us.apache.org/repos/asf/accumulo/repo
Commit: http://git-wip-us.apache.org/repos/asf/accumulo/commit/8db62992
Tree: http://git-wip-us.apache.org/repos/asf/accumulo/tree/8db62992
Diff: http://git-wip-us.apache.org/repos/asf/accumulo/diff/8db62992

Branch: refs/heads/1.5.1-SNAPSHOT
Commit: 8db629923cf4f89b5055f80aa1f39251fd63b25c
Parents: 7fa0085
Author: Sean Busbey <busbey@cloudera.com>
Authored: Tue Oct 22 13:21:55 2013 -0500
Committer: Eric Newton <eric.newton@gmail.com>
Committed: Mon Nov 25 16:06:42 2013 -0500

----------------------------------------------------------------------
 .../1GB/native-standalone/generic_logger.xml    |    4 -
 conf/examples/1GB/standalone/generic_logger.xml |    4 -
 .../2GB/native-standalone/generic_logger.xml    |    4 -
 conf/examples/2GB/standalone/generic_logger.xml |    4 -
 .../3GB/native-standalone/generic_logger.xml    |    4 -
 conf/examples/3GB/standalone/generic_logger.xml |    4 -
 .../512MB/native-standalone/generic_logger.xml  |    4 -
 .../512MB/standalone/generic_logger.xml         |    4 -
 src/examples/pom.xml                            |    1 -
 src/examples/wikisearch/README                  |   69 +-
 src/examples/wikisearch/README.parallel         |   65 -
 src/examples/wikisearch/ingest/bin/ingest.sh    |   46 -
 .../wikisearch/ingest/bin/ingest_parallel.sh    |   46 -
 .../ingest/conf/wikipedia.xml.example           |   43 -
 .../ingest/conf/wikipedia_parallel.xml.example  |   75 -
 src/examples/wikisearch/ingest/pom.xml          |  160 --
 .../wikisearch/ingest/src/assembly/dist.xml     |   38 -
 .../wikisearch/ingest/ArticleExtractor.java     |  207 --
 .../wikisearch/ingest/LRUOutputCombiner.java    |   75 -
 .../ingest/WikipediaConfiguration.java          |  198 --
 .../wikisearch/ingest/WikipediaIngester.java    |  206 --
 .../wikisearch/ingest/WikipediaInputFormat.java |  136 --
 .../wikisearch/ingest/WikipediaMapper.java      |  245 ---
 .../ingest/WikipediaPartitionedIngester.java    |  310 ---
 .../ingest/WikipediaPartitionedMapper.java      |  310 ---
 .../wikisearch/ingest/WikipediaPartitioner.java |   89 -
 .../iterator/GlobalIndexUidCombiner.java        |   94 -
 .../wikisearch/iterator/TextIndexCombiner.java  |  102 -
 .../normalizer/LcNoDiacriticsNormalizer.java    |   49 -
 .../wikisearch/normalizer/NoOpNormalizer.java   |   23 -
 .../wikisearch/normalizer/Normalizer.java       |   32 -
 .../wikisearch/normalizer/NumberNormalizer.java |   42 -
 .../output/BufferingRFileRecordWriter.java      |  140 --
 .../output/SortingRFileOutputFormat.java        |  121 --
 .../wikisearch/protobuf/TermWeight.java         |  424 ----
 .../examples/wikisearch/protobuf/Uid.java       |  470 -----
 .../reader/AggregatingRecordReader.java         |  171 --
 .../wikisearch/reader/LfLineReader.java         |  173 --
 .../wikisearch/reader/LongLineRecordReader.java |  136 --
 .../examples/wikisearch/util/TextUtil.java      |  109 -
 .../ingest/src/main/protobuf/TermWeight.proto   |   28 -
 .../ingest/src/main/protobuf/Uid.proto          |   29 -
 .../ingest/src/main/protobuf/compile_protos.sh  |   19 -
 .../ingest/StandaloneStatusReporter.java        |   70 -
 .../ingest/WikipediaInputSplitTest.java         |   69 -
 .../wikisearch/ingest/WikipediaMapperTest.java  |  163 --
 .../wikisearch/iterator/GlobalIndexUidTest.java |  192 --
 .../wikisearch/iterator/TextIndexTest.java      |  185 --
 .../normalizer/testNumberNormalizer.java        |   90 -
 .../reader/AggregatingRecordReaderTest.java     |  287 ---
 .../src/test/resources/enwiki-20110901-001.xml  |  153 --
 src/examples/wikisearch/pom.xml                 |  253 ---
 src/examples/wikisearch/query-war/pom.xml       |   66 -
 .../src/main/webapp/WEB-INF/jboss-web.xml       |   20 -
 .../query-war/src/main/webapp/WEB-INF/web.xml   |   57 -
 .../query-war/src/main/webapp/style.xsl         |   47 -
 .../wikisearch/query-war/src/main/webapp/ui.jsp |  131 --
 .../query-war/src/test/resources/test.xml       | 1651 ---------------
 src/examples/wikisearch/query/pom.xml           |  180 --
 .../wikisearch/query/src/assembly/dist.xml      |   40 -
 .../wikisearch/function/QueryFunctions.java     |   68 -
 .../iterator/AbstractEvaluatingIterator.java    |  323 ---
 .../wikisearch/iterator/AndIterator.java        |  921 ---------
 .../iterator/BooleanLogicIterator.java          | 1949 ------------------
 .../iterator/BooleanLogicTreeNode.java          |  523 -----
 .../iterator/DefaultIteratorEnvironment.java    |   58 -
 .../wikisearch/iterator/EvaluatingIterator.java |  115 --
 .../wikisearch/iterator/FieldIndexIterator.java |  736 -------
 .../iterator/OptimizedQueryIterator.java        |  205 --
 .../wikisearch/iterator/OrIterator.java         |  822 --------
 .../wikisearch/iterator/ReadAheadIterator.java  |  297 ---
 .../iterator/UniqFieldNameValueIterator.java    |  342 ---
 .../examples/wikisearch/jexl/Arithmetic.java    |  126 --
 .../wikisearch/logic/AbstractQueryLogic.java    |  883 --------
 .../examples/wikisearch/logic/ContentLogic.java |  109 -
 .../examples/wikisearch/logic/QueryLogic.java   |  195 --
 .../examples/wikisearch/parser/EventFields.java |  227 --
 .../parser/FieldIndexQueryReWriter.java         | 1139 ----------
 .../parser/JexlOperatorConstants.java           |  105 -
 .../wikisearch/parser/QueryEvaluator.java       |  291 ---
 .../examples/wikisearch/parser/QueryParser.java |  845 --------
 .../wikisearch/parser/RangeCalculator.java      | 1199 -----------
 .../examples/wikisearch/parser/TreeBuilder.java |  675 ------
 .../examples/wikisearch/parser/TreeNode.java    |  235 ---
 .../examples/wikisearch/query/IQuery.java       |   66 -
 .../examples/wikisearch/query/Query.java        |  239 ---
 .../examples/wikisearch/sample/Document.java    |   61 -
 .../examples/wikisearch/sample/Field.java       |   58 -
 .../examples/wikisearch/sample/Results.java     |   53 -
 .../examples/wikisearch/util/BaseKeyParser.java |   77 -
 .../wikisearch/util/FieldIndexKeyParser.java    |   71 -
 .../examples/wikisearch/util/KeyParser.java     |   70 -
 .../src/main/resources/META-INF/MANIFEST.MF     |    2 -
 .../main/resources/META-INF/ejb-jar.xml.example |   62 -
 .../logic/StandaloneStatusReporter.java         |   70 -
 .../wikisearch/logic/TestQueryLogic.java        |  186 --
 .../src/test/resources/enwiki-20110901-001.xml  |  153 --
 97 files changed, 1 insertion(+), 21722 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/accumulo/blob/8db62992/conf/examples/1GB/native-standalone/generic_logger.xml
----------------------------------------------------------------------
diff --git a/conf/examples/1GB/native-standalone/generic_logger.xml b/conf/examples/1GB/native-standalone/generic_logger.xml
index dc45710..5dc38ac 100644
--- a/conf/examples/1GB/native-standalone/generic_logger.xml
+++ b/conf/examples/1GB/native-standalone/generic_logger.xml
@@ -69,10 +69,6 @@
      <level value="INFO"/>
   </logger>
 
-  <logger name="org.apache.accumulo.examples.wikisearch">
-     <level value="INFO"/>
-  </logger>
-
   <logger name="org.mortbay.log">
      <level value="WARN"/>
   </logger>

http://git-wip-us.apache.org/repos/asf/accumulo/blob/8db62992/conf/examples/1GB/standalone/generic_logger.xml
----------------------------------------------------------------------
diff --git a/conf/examples/1GB/standalone/generic_logger.xml b/conf/examples/1GB/standalone/generic_logger.xml
index dc45710..5dc38ac 100644
--- a/conf/examples/1GB/standalone/generic_logger.xml
+++ b/conf/examples/1GB/standalone/generic_logger.xml
@@ -69,10 +69,6 @@
      <level value="INFO"/>
   </logger>
 
-  <logger name="org.apache.accumulo.examples.wikisearch">
-     <level value="INFO"/>
-  </logger>
-
   <logger name="org.mortbay.log">
      <level value="WARN"/>
   </logger>

http://git-wip-us.apache.org/repos/asf/accumulo/blob/8db62992/conf/examples/2GB/native-standalone/generic_logger.xml
----------------------------------------------------------------------
diff --git a/conf/examples/2GB/native-standalone/generic_logger.xml b/conf/examples/2GB/native-standalone/generic_logger.xml
index dc45710..5dc38ac 100644
--- a/conf/examples/2GB/native-standalone/generic_logger.xml
+++ b/conf/examples/2GB/native-standalone/generic_logger.xml
@@ -69,10 +69,6 @@
      <level value="INFO"/>
   </logger>
 
-  <logger name="org.apache.accumulo.examples.wikisearch">
-     <level value="INFO"/>
-  </logger>
-
   <logger name="org.mortbay.log">
      <level value="WARN"/>
   </logger>

http://git-wip-us.apache.org/repos/asf/accumulo/blob/8db62992/conf/examples/2GB/standalone/generic_logger.xml
----------------------------------------------------------------------
diff --git a/conf/examples/2GB/standalone/generic_logger.xml b/conf/examples/2GB/standalone/generic_logger.xml
index dc45710..5dc38ac 100644
--- a/conf/examples/2GB/standalone/generic_logger.xml
+++ b/conf/examples/2GB/standalone/generic_logger.xml
@@ -69,10 +69,6 @@
      <level value="INFO"/>
   </logger>
 
-  <logger name="org.apache.accumulo.examples.wikisearch">
-     <level value="INFO"/>
-  </logger>
-
   <logger name="org.mortbay.log">
      <level value="WARN"/>
   </logger>

http://git-wip-us.apache.org/repos/asf/accumulo/blob/8db62992/conf/examples/3GB/native-standalone/generic_logger.xml
----------------------------------------------------------------------
diff --git a/conf/examples/3GB/native-standalone/generic_logger.xml b/conf/examples/3GB/native-standalone/generic_logger.xml
index dc45710..5dc38ac 100644
--- a/conf/examples/3GB/native-standalone/generic_logger.xml
+++ b/conf/examples/3GB/native-standalone/generic_logger.xml
@@ -69,10 +69,6 @@
      <level value="INFO"/>
   </logger>
 
-  <logger name="org.apache.accumulo.examples.wikisearch">
-     <level value="INFO"/>
-  </logger>
-
   <logger name="org.mortbay.log">
      <level value="WARN"/>
   </logger>

http://git-wip-us.apache.org/repos/asf/accumulo/blob/8db62992/conf/examples/3GB/standalone/generic_logger.xml
----------------------------------------------------------------------
diff --git a/conf/examples/3GB/standalone/generic_logger.xml b/conf/examples/3GB/standalone/generic_logger.xml
index dc45710..5dc38ac 100644
--- a/conf/examples/3GB/standalone/generic_logger.xml
+++ b/conf/examples/3GB/standalone/generic_logger.xml
@@ -69,10 +69,6 @@
      <level value="INFO"/>
   </logger>
 
-  <logger name="org.apache.accumulo.examples.wikisearch">
-     <level value="INFO"/>
-  </logger>
-
   <logger name="org.mortbay.log">
      <level value="WARN"/>
   </logger>

http://git-wip-us.apache.org/repos/asf/accumulo/blob/8db62992/conf/examples/512MB/native-standalone/generic_logger.xml
----------------------------------------------------------------------
diff --git a/conf/examples/512MB/native-standalone/generic_logger.xml b/conf/examples/512MB/native-standalone/generic_logger.xml
index dc45710..5dc38ac 100644
--- a/conf/examples/512MB/native-standalone/generic_logger.xml
+++ b/conf/examples/512MB/native-standalone/generic_logger.xml
@@ -69,10 +69,6 @@
      <level value="INFO"/>
   </logger>
 
-  <logger name="org.apache.accumulo.examples.wikisearch">
-     <level value="INFO"/>
-  </logger>
-
   <logger name="org.mortbay.log">
      <level value="WARN"/>
   </logger>

http://git-wip-us.apache.org/repos/asf/accumulo/blob/8db62992/conf/examples/512MB/standalone/generic_logger.xml
----------------------------------------------------------------------
diff --git a/conf/examples/512MB/standalone/generic_logger.xml b/conf/examples/512MB/standalone/generic_logger.xml
index dc45710..5dc38ac 100644
--- a/conf/examples/512MB/standalone/generic_logger.xml
+++ b/conf/examples/512MB/standalone/generic_logger.xml
@@ -69,10 +69,6 @@
      <level value="INFO"/>
   </logger>
 
-  <logger name="org.apache.accumulo.examples.wikisearch">
-     <level value="INFO"/>
-  </logger>
-
   <logger name="org.mortbay.log">
      <level value="WARN"/>
   </logger>

http://git-wip-us.apache.org/repos/asf/accumulo/blob/8db62992/src/examples/pom.xml
----------------------------------------------------------------------
diff --git a/src/examples/pom.xml b/src/examples/pom.xml
index 2d56be6..0ec2184 100644
--- a/src/examples/pom.xml
+++ b/src/examples/pom.xml
@@ -29,7 +29,6 @@
 
   <modules>
     <module>simple</module>
-    <module>wikisearch</module>
   </modules>
 
   <repositories>

http://git-wip-us.apache.org/repos/asf/accumulo/blob/8db62992/src/examples/wikisearch/README
----------------------------------------------------------------------
diff --git a/src/examples/wikisearch/README b/src/examples/wikisearch/README
index 041490f..806de3c 100644
--- a/src/examples/wikisearch/README
+++ b/src/examples/wikisearch/README
@@ -1,68 +1 @@
- Apache Accumulo Wikipedia Search Example
-
- This project contains a sample application for ingesting and querying wikipedia data.
- 
-  
- Ingest
- ------
- 
- 	Prerequisites
- 	-------------
- 	1. Accumulo, Hadoop, and ZooKeeper must be installed and running
- 	2. One or more wikipedia dump files (http://dumps.wikimedia.org/backup-index.html) placed in an HDFS directory.
-	   You will want to grab the files with the link name of pages-articles.xml.bz2
-        3. Though not strictly required, the ingest will go more quickly if the files are decompressed:
-
-            $ bunzip2 < enwiki-*-pages-articles.xml.bz2 | hadoop fs -put - /wikipedia/enwiki-pages-articles.xml
-
- 
- 	INSTRUCTIONS
- 	------------
-	1. Copy the ingest/conf/wikipedia.xml.example to ingest/conf/wikipedia.xml and change it to specify Accumulo information. 
-	2. Copy the ingest/lib/wikisearch-*.jar and ingest/lib/protobuf*.jar to $ACCUMULO_HOME/lib/ext
-	3. Then run ingest/bin/ingest.sh with one argument (the name of the directory in HDFS where the wikipedia XML 
-           files reside) and this will kick off a MapReduce job to ingest the data into Accumulo.
-   
- Query
- -----
- 
- 	Prerequisites
- 	-------------
-	1. The query software was tested using JBoss AS 6. Install this unless you feel like messing with the installation.
- 	
-	NOTE: Ran into a bug (https://issues.jboss.org/browse/RESTEASY-531) that did not allow an EJB3.1 war file. The
-	workaround is to separate the RESTEasy servlet from the EJBs by creating an EJB jar and a WAR file.
-	
-	INSTRUCTIONS
-	-------------
-	1. Copy the query/src/main/resources/META-INF/ejb-jar.xml.example file to 
-	   query/src/main/resources/META-INF/ejb-jar.xml. Modify the file to contain the same 
-	   information that you put into the wikipedia.xml file from the Ingest step above. 
-	2. Re-build the query distribution by running 'mvn package assembly:single' in the top-level directory. 
-        3. Untar the resulting file in the $JBOSS_HOME/server/default directory.
-
-              $ cd $JBOSS_HOME/server/default
-              $ tar -xzf $ACCUMULO_HOME/src/examples/wikisearch/query/target/wikisearch-query*.tar.gz
- 
-           This will place the dependent jars in the lib directory and the EJB jar into the deploy directory.
-	4. Next, copy the wikisearch*.war file in the query-war/target directory to $JBOSS_HOME/server/default/deploy. 
-	5. Start JBoss ($JBOSS_HOME/bin/run.sh)
-	6. Use the Accumulo shell and give the user permissions for the wikis that you loaded, for example: 
-			setauths -u <user> -s all,enwiki,eswiki,frwiki,fawiki
-	7. Copy the following jars to the $ACCUMULO_HOME/lib/ext directory from the $JBOSS_HOME/server/default/lib directory:
-	
-		commons-lang*.jar
-		kryo*.jar
-		minlog*.jar
-		commons-jexl*.jar
-		google-collections*.jar
-		
-	8. Copy the $JBOSS_HOME/server/default/deploy/wikisearch-query*.jar to $ACCUMULO_HOME/lib/ext.
-
-
-	9. At this point you should be able to open a browser and view the page: http://localhost:8080/accumulo-wikisearch/ui/ui.jsp.
-	You can issue the queries using this user interface or via the following REST urls: <host>/accumulo-wikisearch/rest/Query/xml,
-	<host>/accumulo-wikisearch/rest/Query/html, <host>/accumulo-wikisearch/rest/Query/yaml, or <host>/accumulo-wikisearch/rest/Query/json.
-	There are two parameters to the REST service, query and auths. The query parameter is the same string that you would type
-	into the search box at ui.jsp, and the auths parameter is a comma-separated list of wikis that you want to search (i.e.
-	enwiki,frwiki,dewiki, etc. Or you can use all) 
+The Accumulo Wikipedia Search Example has moved to [a contrib project](http://accumulo.apache.org/contrib.html). For more information, see [ACCUMULO-600](https://issues.apache.org/jira/browse/ACCUMULO-600) and the [wikisearch contrib repository](https://git-wip-us.apache.org/repos/asf?p=accumulo-wikisearch.git;a=summary).

http://git-wip-us.apache.org/repos/asf/accumulo/blob/8db62992/src/examples/wikisearch/README.parallel
----------------------------------------------------------------------
diff --git a/src/examples/wikisearch/README.parallel b/src/examples/wikisearch/README.parallel
deleted file mode 100644
index 477556b..0000000
--- a/src/examples/wikisearch/README.parallel
+++ /dev/null
@@ -1,65 +0,0 @@
- Apache Accumulo Wikipedia Search Example (parallel version)
-
- This project contains a sample application for ingesting and querying wikipedia data.
- 
-  
- Ingest
- ------
- 
- 	Prerequisites
- 	-------------
- 	1. Accumulo, Hadoop, and ZooKeeper must be installed and running
- 	2. One or more wikipedia dump files (http://dumps.wikimedia.org/backup-index.html) placed in an HDFS directory.
-	     You will want to grab the files with the link name of pages-articles.xml.bz2
- 
- 
- 	INSTRUCTIONS
- 	------------
-	1. Copy the ingest/conf/wikipedia_parallel.xml.example to ingest/conf/wikipedia.xml and change it to specify Accumulo information. 
-	2. Copy the ingest/lib/wikisearch-*.jar and ingest/lib/protobuf*.jar to $ACCUMULO_HOME/lib/ext
-	3. Then run ingest/bin/ingest_parallel.sh with one argument (the name of the directory in HDFS where the wikipedia XML 
-             files reside) and this will kick off a MapReduce job to ingest the data into Accumulo.
-   
- Query
- -----
- 
- 	Prerequisites
- 	-------------
-	1. The query software was tested using JBoss AS 6. Install this unless you feel like messing with the installation.
- 	
-	NOTE: Ran into a bug (https://issues.jboss.org/browse/RESTEASY-531) that did not allow an EJB3.1 war file. The
-	workaround is to separate the RESTEasy servlet from the EJBs by creating an EJB jar and a WAR file.
-	
-	INSTRUCTIONS
-	-------------
-	1. Copy the query/src/main/resources/META-INF/ejb-jar.xml.example file to 
-	   query/src/main/resources/META-INF/ejb-jar.xml. Modify the file to contain the same 
-	   information that you put into the wikipedia.xml file from the Ingest step above. 
-	2. Re-build the query distribution by running 'mvn package assembly:single' in the top-level directory. 
-        3. Untar the resulting file in the $JBOSS_HOME/server/default directory.
-
-              $ cd $JBOSS_HOME/server/default
-              $ tar -xzf $ACCUMULO_HOME/src/examples/wikisearch/query/target/wikisearch-query*.tar.gz
- 
-           This will place the dependent jars in the lib directory and the EJB jar into the deploy directory.
-	4. Next, copy the wikisearch*.war file in the query-war/target directory to $JBOSS_HOME/server/default/deploy. 
-	5. Start JBoss ($JBOSS_HOME/bin/run.sh)
-	6. Use the Accumulo shell and give the user permissions for the wikis that you loaded, for example: 
-			setauths -u <user> -s all,enwiki,eswiki,frwiki,fawiki
-	7. Copy the following jars to the $ACCUMULO_HOME/lib/ext directory from the $JBOSS_HOME/server/default/lib directory:
-	
-		commons-lang*.jar
-		kryo*.jar
-		minlog*.jar
-		commons-jexl*.jar
-		google-collections*.jar
-		
-	8. Copy the $JBOSS_HOME/server/default/deploy/wikisearch-query*.jar to $ACCUMULO_HOME/lib/ext.
-
-
-	9. At this point you should be able to open a browser and view the page: http://localhost:8080/accumulo-wikisearch/ui/ui.jsp.
-	You can issue the queries using this user interface or via the following REST urls: <host>/accumulo-wikisearch/rest/Query/xml,
-	<host>/accumulo-wikisearch/rest/Query/html, <host>/accumulo-wikisearch/rest/Query/yaml, or <host>/accumulo-wikisearch/rest/Query/json.
-	There are two parameters to the REST service, query and auths. The query parameter is the same string that you would type
-	into the search box at ui.jsp, and the auths parameter is a comma-separated list of wikis that you want to search (i.e.
-	enwiki,frwiki,dewiki, etc. Or you can use all) 

http://git-wip-us.apache.org/repos/asf/accumulo/blob/8db62992/src/examples/wikisearch/ingest/bin/ingest.sh
----------------------------------------------------------------------
diff --git a/src/examples/wikisearch/ingest/bin/ingest.sh b/src/examples/wikisearch/ingest/bin/ingest.sh
deleted file mode 100755
index acdcbf8..0000000
--- a/src/examples/wikisearch/ingest/bin/ingest.sh
+++ /dev/null
@@ -1,46 +0,0 @@
-#!/bin/bash
-
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements.  See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License.  You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-
-THIS_SCRIPT="$0"
-SCRIPT_DIR="${THIS_SCRIPT%/*}"
-SCRIPT_DIR=`cd $SCRIPT_DIR ; pwd`
-echo $SCRIPT_DIR
-
-#
-# Add our jars
-#
-for f in $SCRIPT_DIR/../lib/*.jar; do
-	CLASSPATH=${CLASSPATH}:$f  
-done
-
-#
-# Transform the classpath into a comma-separated list also
-#
-LIBJARS=`echo $CLASSPATH | sed 's/^://' | sed 's/:/,/g'`
-
-
-#
-# Map/Reduce job
-#
-JAR=$SCRIPT_DIR/../lib/wikisearch-ingest-1.4.5-SNAPSHOT.jar
-CONF=$SCRIPT_DIR/../conf/wikipedia.xml
-HDFS_DATA_DIR=$1
-export HADOOP_CLASSPATH=$CLASSPATH
-echo "hadoop jar $JAR org.apache.accumulo.examples.wikisearch.ingest.WikipediaIngester -libjars $LIBJARS -conf $CONF -Dwikipedia.input=${HDFS_DATA_DIR}"
-hadoop jar $JAR org.apache.accumulo.examples.wikisearch.ingest.WikipediaIngester -libjars $LIBJARS -conf $CONF -Dwikipedia.input=${HDFS_DATA_DIR}

http://git-wip-us.apache.org/repos/asf/accumulo/blob/8db62992/src/examples/wikisearch/ingest/bin/ingest_parallel.sh
----------------------------------------------------------------------
diff --git a/src/examples/wikisearch/ingest/bin/ingest_parallel.sh b/src/examples/wikisearch/ingest/bin/ingest_parallel.sh
deleted file mode 100755
index 8c63ac0..0000000
--- a/src/examples/wikisearch/ingest/bin/ingest_parallel.sh
+++ /dev/null
@@ -1,46 +0,0 @@
-#!/bin/bash
-
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements.  See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License.  You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-
-THIS_SCRIPT="$0"
-SCRIPT_DIR="${THIS_SCRIPT%/*}"
-SCRIPT_DIR=`cd $SCRIPT_DIR ; pwd`
-echo $SCRIPT_DIR
-
-#
-# Add our jars
-#
-for f in $SCRIPT_DIR/../lib/*.jar; do
-	CLASSPATH=${CLASSPATH}:$f  
-done
-
-#
-# Transform the classpath into a comma-separated list also
-#
-LIBJARS=`echo $CLASSPATH | sed 's/^://' | sed 's/:/,/g'`
-
-
-#
-# Map/Reduce job
-#
-JAR=$SCRIPT_DIR/../lib/wikisearch-ingest-1.4.5-SNAPSHOT.jar
-CONF=$SCRIPT_DIR/../conf/wikipedia.xml
-HDFS_DATA_DIR=$1
-export HADOOP_CLASSPATH=$CLASSPATH
-echo "hadoop jar $JAR org.apache.accumulo.examples.wikisearch.ingest.WikipediaPartitionedIngester -libjars $LIBJARS -conf $CONF -Dwikipedia.input=${HDFS_DATA_DIR}"
-hadoop jar $JAR org.apache.accumulo.examples.wikisearch.ingest.WikipediaPartitionedIngester -libjars $LIBJARS -conf $CONF -Dwikipedia.input=${HDFS_DATA_DIR}

http://git-wip-us.apache.org/repos/asf/accumulo/blob/8db62992/src/examples/wikisearch/ingest/conf/wikipedia.xml.example
----------------------------------------------------------------------
diff --git a/src/examples/wikisearch/ingest/conf/wikipedia.xml.example b/src/examples/wikisearch/ingest/conf/wikipedia.xml.example
deleted file mode 100644
index b08742e..0000000
--- a/src/examples/wikisearch/ingest/conf/wikipedia.xml.example
+++ /dev/null
@@ -1,43 +0,0 @@
-<?xml version="1.0"?>
-<!--
-  Licensed to the Apache Software Foundation (ASF) under one or more
-  contributor license agreements.  See the NOTICE file distributed with
-  this work for additional information regarding copyright ownership.
-  The ASF licenses this file to You under the Apache License, Version 2.0
-  (the "License"); you may not use this file except in compliance with
-  the License.  You may obtain a copy of the License at
-
-      http://www.apache.org/licenses/LICENSE-2.0
-
-  Unless required by applicable law or agreed to in writing, software
-  distributed under the License is distributed on an "AS IS" BASIS,
-  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-  See the License for the specific language governing permissions and
-  limitations under the License.
--->
-<configuration>
-  <property>
-    <name>wikipedia.accumulo.zookeepers</name>
-    <value><!--zookeeper servers --></value>
-  </property>
-  <property>
-    <name>wikipedia.accumulo.instance_name</name>
-    <value><!--instance name --></value>
-  </property>
-  <property>
-    <name>wikipedia.accumulo.user</name>
-    <value><!--user name --></value>
-  </property>
-  <property>
-    <name>wikipedia.accumulo.password</name>
-    <value><!-- password --></value>
-  </property>
-  <property>
-    <name>wikipedia.accumulo.table</name>
-    <value><!--table name --></value>
-  </property>
-  <property>
-    <name>wikipedia.ingest.partitions</name>
-    <value><!--number of partitions --></value>
-  </property>
-</configuration>

http://git-wip-us.apache.org/repos/asf/accumulo/blob/8db62992/src/examples/wikisearch/ingest/conf/wikipedia_parallel.xml.example
----------------------------------------------------------------------
diff --git a/src/examples/wikisearch/ingest/conf/wikipedia_parallel.xml.example b/src/examples/wikisearch/ingest/conf/wikipedia_parallel.xml.example
deleted file mode 100644
index 53220f0..0000000
--- a/src/examples/wikisearch/ingest/conf/wikipedia_parallel.xml.example
+++ /dev/null
@@ -1,75 +0,0 @@
-<?xml version="1.0"?>
-<!--
-  Licensed to the Apache Software Foundation (ASF) under one or more
-  contributor license agreements.  See the NOTICE file distributed with
-  this work for additional information regarding copyright ownership.
-  The ASF licenses this file to You under the Apache License, Version 2.0
-  (the "License"); you may not use this file except in compliance with
-  the License.  You may obtain a copy of the License at
-
-      http://www.apache.org/licenses/LICENSE-2.0
-
-  Unless required by applicable law or agreed to in writing, software
-  distributed under the License is distributed on an "AS IS" BASIS,
-  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-  See the License for the specific language governing permissions and
-  limitations under the License.
--->
-<configuration>
-  <property>
-    <name>wikipedia.accumulo.zookeepers</name>
-    <value><!--zookeeper servers --></value>
-  </property>
-  <property>
-    <name>wikipedia.accumulo.instance_name</name>
-    <value><!--instance name --></value>
-  </property>
-  <property>
-    <name>wikipedia.accumulo.user</name>
-    <value><!--user name --></value>
-  </property>
-  <property>
-    <name>wikipedia.accumulo.password</name>
-    <value><!-- password --></value>
-  </property>
-  <property>
-    <name>wikipedia.accumulo.table</name>
-    <value><!--table name --></value>
-  </property>
-  <property>
-    <name>wikipedia.ingest.partitions</name>
-    <value><!--number of partitions --></value>
-  </property>
-  <property>
-    <name>wikipedia.partitioned.directory</name>
-    <value><!--hdfs directory for intermediate partitioned storage --></value>
-  </property>
-  <property>
-    <name>wikipedia.ingest.groups</name>
-    <value><!--the number of intermediate partition groups to generate --></value>
-  </property>
-  <property>
-    <name>wikipedia.run.partitioner</name>
-    <value><!--whether to run the partitioner step --></value>
-  </property>
-  <property>
-    <name>wikipedia.run.ingest</name>
-    <value><!--whether to run the ingest step --></value>
-  </property>
-  <property>
-    <name>wikipedia.bulk.ingest</name>
-    <value><!--whether to use bulk ingest vice streaming ingest --></value>
-  </property>
-  <property>
-    <name>wikipedia.bulk.ingest.dir</name>
-    <value><!--the directory to store rfiles for bulk ingest --></value>
-  </property>
-  <property>
-    <name>wikipedia.bulk.ingest.failure.dir</name>
-    <value><!--the directory to store failed rfiles after bulk ingest --></value>
-  </property>
-  <property>
-    <name>wikipedia.bulk.ingest.buffer.size</name>
-    <value><!--the amount of memory to use for buffering and sorting key/value pairs in each mapper before writing rfiles --></value>
-  </property>
-</configuration>

http://git-wip-us.apache.org/repos/asf/accumulo/blob/8db62992/src/examples/wikisearch/ingest/pom.xml
----------------------------------------------------------------------
diff --git a/src/examples/wikisearch/ingest/pom.xml b/src/examples/wikisearch/ingest/pom.xml
deleted file mode 100644
index 31d7110..0000000
--- a/src/examples/wikisearch/ingest/pom.xml
+++ /dev/null
@@ -1,160 +0,0 @@
-<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
-  <!--
-    Licensed to the Apache Software Foundation (ASF) under one or more
-    contributor license agreements. See the NOTICE file distributed with
-    this work for additional information regarding copyright ownership.
-    The ASF licenses this file to You under the Apache License, Version 2.0
-    (the "License"); you may not use this file except in compliance with
-    the License. You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-    Unless required by applicable law or agreed to in writing, software
-    distributed under the License is distributed on an "AS IS" BASIS,
-    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-    See the License for the specific language governing permissions and
-    limitations under the License.
-  -->
-
-  <modelVersion>4.0.0</modelVersion>
-  <parent>
-    <artifactId>accumulo-wikisearch</artifactId>
-    <groupId>org.apache.accumulo</groupId>
-    <version>1.4.5-SNAPSHOT</version>
-    <relativePath>../</relativePath>
-  </parent>
-
-  <artifactId>wikisearch-ingest</artifactId>
-  <name>wikisearch-ingest</name>
-
-  <dependencies>
-    <dependency>
-      <groupId>org.apache.zookeeper</groupId>
-      <artifactId>zookeeper</artifactId>
-    </dependency>
-    <dependency>
-      <groupId>org.apache.accumulo</groupId>
-      <artifactId>accumulo-core</artifactId>
-    </dependency>
-    <dependency>
-      <groupId>org.apache.accumulo</groupId>
-      <artifactId>accumulo-start</artifactId>
-    </dependency>
-    <dependency>
-      <groupId>log4j</groupId>
-      <artifactId>log4j</artifactId>
-    </dependency>
-    <dependency>
-      <groupId>commons-lang</groupId>
-      <artifactId>commons-lang</artifactId>
-    </dependency>
-    <dependency>
-      <groupId>com.google.collections</groupId>
-      <artifactId>google-collections</artifactId>
-    </dependency>
-    <dependency>
-      <groupId>org.apache.lucene</groupId>
-      <artifactId>lucene-core</artifactId>
-    </dependency>
-    <dependency>
-      <groupId>org.apache.lucene</groupId>
-      <artifactId>lucene-analyzers</artifactId>
-    </dependency>
-    <dependency>
-      <groupId>org.apache.lucene</groupId>
-      <artifactId>lucene-wikipedia</artifactId>
-    </dependency>
-    <dependency>
-      <groupId>com.google.protobuf</groupId>
-      <artifactId>protobuf-java</artifactId>
-    </dependency>
-    <dependency>
-    	<groupId>com.sun.jersey</groupId>
-    	<artifactId>jersey-server</artifactId>
-    </dependency>
-    <dependency>
-      <groupId>org.apache.accumulo</groupId>
-      <artifactId>cloudtrace</artifactId>
-      <scope>runtime</scope>
-    </dependency>
-    <dependency>
-      <groupId>org.apache.thrift</groupId>
-      <artifactId>libthrift</artifactId>
-      <scope>runtime</scope>
-    </dependency>
-    <dependency>
-      <groupId>commons-codec</groupId>
-      <artifactId>commons-codec</artifactId>
-    </dependency>
-  </dependencies>
-
-  <build>
-    <plugins>
-      <plugin>
-        <groupId>org.apache.maven.plugins</groupId>
-        <artifactId>maven-dependency-plugin</artifactId>
-        <executions>
-          <execution>
-            <id>copy-dependencies</id>
-            <phase>process-resources</phase>
-            <goals>
-              <goal>copy-dependencies</goal>
-            </goals>
-            <configuration>
-              <outputDirectory>lib</outputDirectory>
-              <!-- just grab the non-provided runtime dependencies -->
-              <includeArtifactIds>commons-lang,google-collections,lucene-core,lucene-analyzers,lucene-wikipedia,protobuf-java,accumulo-core,hadoop-core,libthrift,cloudtrace,zookeeper,commons-codec</includeArtifactIds>
-              <excludeTransitive>false</excludeTransitive>
-            </configuration>
-          </execution>
-        </executions>
-      </plugin>
-      <plugin>
-        <artifactId>maven-assembly-plugin</artifactId>
-        <configuration>
-          <descriptors>
-            <descriptor>src/assembly/dist.xml</descriptor>
-          </descriptors>
-          <tarLongFileMode>gnu</tarLongFileMode>
-        </configuration>
-      </plugin>
-    </plugins>
-  </build>
-
-  <profiles>
-    <!-- profile for building against Hadoop 1.0.x
-    Activate by not specifying hadoop.profile -->
-    <profile>
-      <id>hadoop-1.0</id>
-      <activation>
-        <property>
-          <name>!hadoop.profile</name>
-        </property>
-      </activation>
-      <dependencies>
-        <dependency>
-          <groupId>org.apache.hadoop</groupId>
-          <artifactId>hadoop-core</artifactId>
-        </dependency>
-      </dependencies>
-    </profile>
-    <!-- profile for building against Hadoop 2.0.x
-    Activate using: mvn -Dhadoop.profile=2.0 -->
-    <profile>
-      <id>hadoop-2.0</id>
-      <activation>
-        <property>
-          <name>hadoop.profile</name>
-          <value>2.0</value>
-        </property>
-      </activation>
-      <dependencies>
-        <dependency>
-          <groupId>org.apache.hadoop</groupId>
-          <artifactId>hadoop-client</artifactId>
-        </dependency>
-      </dependencies>
-    </profile>
-  </profiles>
-
-</project>

http://git-wip-us.apache.org/repos/asf/accumulo/blob/8db62992/src/examples/wikisearch/ingest/src/assembly/dist.xml
----------------------------------------------------------------------
diff --git a/src/examples/wikisearch/ingest/src/assembly/dist.xml b/src/examples/wikisearch/ingest/src/assembly/dist.xml
deleted file mode 100644
index e3e59c1..0000000
--- a/src/examples/wikisearch/ingest/src/assembly/dist.xml
+++ /dev/null
@@ -1,38 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!--
-  Licensed to the Apache Software Foundation (ASF) under one or more
-  contributor license agreements.  See the NOTICE file distributed with
-  this work for additional information regarding copyright ownership.
-  The ASF licenses this file to You under the Apache License, Version 2.0
-  (the "License"); you may not use this file except in compliance with
-  the License.  You may obtain a copy of the License at
-
-      http://www.apache.org/licenses/LICENSE-2.0
-
-  Unless required by applicable law or agreed to in writing, software
-  distributed under the License is distributed on an "AS IS" BASIS,
-  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-  See the License for the specific language governing permissions and
-  limitations under the License.
--->
-<assembly>
-  <id>dist</id>
-  <formats>
-    <format>tar.gz</format>
-  </formats>
-  <baseDirectory></baseDirectory>
-  <fileSets>
-    <fileSet>
-      <directory>lib</directory>
-      <fileMode>0644</fileMode>
-    </fileSet>
-    <fileSet>
-      <directory>bin</directory>
-      <fileMode>0744</fileMode>
-    </fileSet>
-    <fileSet>
-      <directory>conf</directory>
-      <fileMode>0644</fileMode>
-    </fileSet>
-  </fileSets>
-</assembly>

http://git-wip-us.apache.org/repos/asf/accumulo/blob/8db62992/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/ingest/ArticleExtractor.java
----------------------------------------------------------------------
diff --git a/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/ingest/ArticleExtractor.java b/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/ingest/ArticleExtractor.java
deleted file mode 100644
index 0699cfa..0000000
--- a/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/ingest/ArticleExtractor.java
+++ /dev/null
@@ -1,207 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.accumulo.examples.wikisearch.ingest;
-
-import java.io.DataInput;
-import java.io.DataOutput;
-import java.io.IOException;
-import java.io.Reader;
-import java.text.ParseException;
-import java.text.SimpleDateFormat;
-import java.util.HashMap;
-import java.util.Map;
-
-import javax.xml.namespace.QName;
-import javax.xml.stream.XMLInputFactory;
-import javax.xml.stream.XMLStreamException;
-import javax.xml.stream.XMLStreamReader;
-
-import org.apache.accumulo.examples.wikisearch.normalizer.LcNoDiacriticsNormalizer;
-import org.apache.accumulo.examples.wikisearch.normalizer.NumberNormalizer;
-import org.apache.hadoop.io.Text;
-import org.apache.hadoop.io.Writable;
-
-
-public class ArticleExtractor {
-  
-  public final static SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss'Z'Z");
-  private static NumberNormalizer nn = new NumberNormalizer();
-  private static LcNoDiacriticsNormalizer lcdn = new LcNoDiacriticsNormalizer();
-  
-  public static class Article implements Writable {
-    int id;
-    String title;
-    long timestamp;
-    String comments;
-    String text;
-    
-    public Article(){}
-    
-    private Article(int id, String title, long timestamp, String comments, String text) {
-      super();
-      this.id = id;
-      this.title = title;
-      this.timestamp = timestamp;
-      this.comments = comments;
-      this.text = text;
-    }
-    
-    public int getId() {
-      return id;
-    }
-    
-    public String getTitle() {
-      return title;
-    }
-    
-    public String getComments() {
-      return comments;
-    }
-    
-    public String getText() {
-      return text;
-    }
-    
-    public long getTimestamp() {
-      return timestamp;
-    }
-    
-    public Map<String,Object> getFieldValues() {
-      Map<String,Object> fields = new HashMap<String,Object>();
-      fields.put("ID", this.id);
-      fields.put("TITLE", this.title);
-      fields.put("TIMESTAMP", this.timestamp);
-      fields.put("COMMENTS", this.comments);
-      return fields;
-    }
-    
-    public Map<String,String> getNormalizedFieldValues() {
-      Map<String,String> fields = new HashMap<String,String>();
-      fields.put("ID", nn.normalizeFieldValue("ID", this.id));
-      fields.put("TITLE", lcdn.normalizeFieldValue("TITLE", this.title));
-      fields.put("TIMESTAMP", nn.normalizeFieldValue("TIMESTAMP", this.timestamp));
-      fields.put("COMMENTS", lcdn.normalizeFieldValue("COMMENTS", this.comments));
-      return fields;
-    }
-
-    @Override
-    public void readFields(DataInput in) throws IOException {
-      id = in.readInt();
-      Text foo = new Text();
-      foo.readFields(in);
-      title = foo.toString();
-      timestamp = in.readLong();
-      foo.readFields(in);
-      comments = foo.toString();
-      foo.readFields(in);
-      text = foo.toString();
-    }
-
-    @Override
-    public void write(DataOutput out) throws IOException {
-      out.writeInt(id);
-      (new Text(title)).write(out);
-      out.writeLong(timestamp);
-      (new Text(comments)).write(out);
-      (new Text(text)).write(out);
-    }
-    
-  }
-  
-  public ArticleExtractor() {}
-  
-  private static XMLInputFactory xmlif = XMLInputFactory.newInstance();
-
-  static
-  {
-    xmlif.setProperty(XMLInputFactory.IS_REPLACING_ENTITY_REFERENCES, Boolean.TRUE);
-  }
-  
-  public Article extract(Reader reader) {
-    
-    XMLStreamReader xmlr = null;
-    
-    try {
-      xmlr = xmlif.createXMLStreamReader(reader);
-    } catch (XMLStreamException e1) {
-      throw new RuntimeException(e1);
-    }
-    
-    QName titleName = QName.valueOf("title");
-    QName textName = QName.valueOf("text");
-    QName revisionName = QName.valueOf("revision");
-    QName timestampName = QName.valueOf("timestamp");
-    QName commentName = QName.valueOf("comment");
-    QName idName = QName.valueOf("id");
-    
-    Map<QName,StringBuilder> tags = new HashMap<QName,StringBuilder>();
-    for (QName tag : new QName[] {titleName, textName, timestampName, commentName, idName}) {
-      tags.put(tag, new StringBuilder());
-    }
-    
-    StringBuilder articleText = tags.get(textName);
-    StringBuilder titleText = tags.get(titleName);
-    StringBuilder timestampText = tags.get(timestampName);
-    StringBuilder commentText = tags.get(commentName);
-    StringBuilder idText = tags.get(idName);
-    
-    StringBuilder current = null;
-    boolean inRevision = false;
-    while (true) {
-      try {
-        if (!xmlr.hasNext())
-          break;
-        xmlr.next();
-      } catch (XMLStreamException e) {
-        throw new RuntimeException(e);
-      }
-      QName currentName = null;
-      if (xmlr.hasName()) {
-        currentName = xmlr.getName();
-      }
-      if (xmlr.isStartElement() && tags.containsKey(currentName)) {
-        if (!inRevision || (!currentName.equals(revisionName) && !currentName.equals(idName))) {
-          current = tags.get(currentName);
-          current.setLength(0);
-        }
-      } else if (xmlr.isStartElement() && currentName.equals(revisionName)) {
-        inRevision = true;
-      } else if (xmlr.isEndElement() && currentName.equals(revisionName)) {
-        inRevision = false;
-      } else if (xmlr.isEndElement() && current != null) {
-        if (textName.equals(currentName)) {
-          
-          String title = titleText.toString();
-          String text = articleText.toString();
-          String comment = commentText.toString();
-          int id = Integer.parseInt(idText.toString());
-          long timestamp;
-          try {
-            timestamp = dateFormat.parse(timestampText.append("+0000").toString()).getTime();
-            return new Article(id, title, timestamp, comment, text);
-          } catch (ParseException e) {
-            return null;
-          }
-        }
-        current = null;
-      } else if (current != null && xmlr.hasText()) {
-        current.append(xmlr.getText());
-      }
-    }
-    return null;
-  }
-}

http://git-wip-us.apache.org/repos/asf/accumulo/blob/8db62992/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/ingest/LRUOutputCombiner.java
----------------------------------------------------------------------
diff --git a/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/ingest/LRUOutputCombiner.java b/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/ingest/LRUOutputCombiner.java
deleted file mode 100644
index 7d7b6dc..0000000
--- a/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/ingest/LRUOutputCombiner.java
+++ /dev/null
@@ -1,75 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.accumulo.examples.wikisearch.ingest;
-
-import java.util.LinkedHashMap;
-import java.util.Map;
-
-public class LRUOutputCombiner<Key,Value> extends LinkedHashMap<Key,Value> {
-  
-  private static final long serialVersionUID = 1L;
-  
-  public static abstract class Fold<Value> {
-    public abstract Value fold(Value oldValue, Value newValue);
-  }
-  
-  public static abstract class Output<Key,Value> {
-    public abstract void output(Key key, Value value);
-  }
-  
-  private final int capacity;
-  private final Fold<Value> fold;
-  private final Output<Key,Value> output;
-  
-  private long cacheHits = 0;
-  private long cacheMisses = 0;
-  
-  public LRUOutputCombiner(int capacity, Fold<Value> fold, Output<Key,Value> output) {
-    super(capacity + 1, 1.1f, true);
-    this.capacity = capacity;
-    this.fold = fold;
-    this.output = output;
-  }
-  
-  protected boolean removeEldestEntry(Map.Entry<Key,Value> eldest) {
-    if (size() > capacity) {
-      output.output(eldest.getKey(), eldest.getValue());
-      return true;
-    }
-    return false;
-  }
-  
-  @Override
-  public Value put(Key key, Value value) {
-    Value val = get(key);
-    if (val != null) {
-      value = fold.fold(val, value);
-      cacheHits++;
-    } else {
-      cacheMisses++;
-    }
-    super.put(key, value);
-    return null;
-  }
-  
-  public void flush() {
-    for (Map.Entry<Key,Value> e : entrySet()) {
-      output.output(e.getKey(), e.getValue());
-    }
-    clear();
-  }
-}

http://git-wip-us.apache.org/repos/asf/accumulo/blob/8db62992/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/ingest/WikipediaConfiguration.java
----------------------------------------------------------------------
diff --git a/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/ingest/WikipediaConfiguration.java b/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/ingest/WikipediaConfiguration.java
deleted file mode 100644
index 27a28a1..0000000
--- a/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/ingest/WikipediaConfiguration.java
+++ /dev/null
@@ -1,198 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.accumulo.examples.wikisearch.ingest;
-
-import java.io.IOException;
-
-import org.apache.accumulo.core.client.AccumuloException;
-import org.apache.accumulo.core.client.AccumuloSecurityException;
-import org.apache.accumulo.core.client.Connector;
-import org.apache.accumulo.core.client.Instance;
-import org.apache.accumulo.core.client.ZooKeeperInstance;
-import org.apache.commons.lang.StringUtils;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.util.ReflectionUtils;
-import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.SimpleAnalyzer;
-
-public class WikipediaConfiguration {
-  public final static String INSTANCE_NAME = "wikipedia.accumulo.instance_name";
-  public final static String USER = "wikipedia.accumulo.user";
-  public final static String PASSWORD = "wikipedia.accumulo.password";
-  public final static String TABLE_NAME = "wikipedia.accumulo.table";
-  
-  public final static String ZOOKEEPERS = "wikipedia.accumulo.zookeepers";
-  
-  public final static String NAMESPACES_FILENAME = "wikipedia.namespaces.filename";
-  public final static String LANGUAGES_FILENAME = "wikipedia.languages.filename";
-  public final static String WORKING_DIRECTORY = "wikipedia.ingest.working";
-  
-  public final static String ANALYZER = "wikipedia.index.analyzer";
-  
-  public final static String NUM_PARTITIONS = "wikipedia.ingest.partitions";
-
-  public final static String NUM_GROUPS = "wikipedia.ingest.groups";
-
-  public final static String PARTITIONED_ARTICLES_DIRECTORY = "wikipedia.partitioned.directory";
-  
-  public final static String RUN_PARTITIONER = "wikipedia.run.partitioner";
-  public final static String RUN_INGEST = "wikipedia.run.ingest";
-  public final static String BULK_INGEST = "wikipedia.bulk.ingest";
-  public final static String BULK_INGEST_DIR = "wikipedia.bulk.ingest.dir";
-  public final static String BULK_INGEST_FAILURE_DIR = "wikipedia.bulk.ingest.failure.dir";
-  public final static String BULK_INGEST_BUFFER_SIZE = "wikipedia.bulk.ingest.buffer.size";
-  public final static String PARTITIONED_INPUT_MIN_SPLIT_SIZE = "wikipedia.min.input.split.size";
-  
-  
-  public static String getUser(Configuration conf) {
-    return conf.get(USER);
-  };
-  
-  public static byte[] getPassword(Configuration conf) {
-    String pass = conf.get(PASSWORD);
-    if (pass == null) {
-      return null;
-    }
-    return pass.getBytes();
-  }
-  
-  public static String getTableName(Configuration conf) {
-    String tablename = conf.get(TABLE_NAME);
-    if (tablename == null) {
-      throw new RuntimeException("No data table name specified in " + TABLE_NAME);
-    }
-    return tablename;
-  }
-  
-  public static String getInstanceName(Configuration conf) {
-    return conf.get(INSTANCE_NAME);
-  }
-  
-  public static String getZookeepers(Configuration conf) {
-    String zookeepers = conf.get(ZOOKEEPERS);
-    if (zookeepers == null) {
-      throw new RuntimeException("No zookeepers specified in " + ZOOKEEPERS);
-    }
-    return zookeepers;
-  }
-  
-  public static Path getNamespacesFile(Configuration conf) {
-    String filename = conf.get(NAMESPACES_FILENAME, new Path(getWorkingDirectory(conf), "namespaces.dat").toString());
-    return new Path(filename);
-  }
-  
-  public static Path getLanguagesFile(Configuration conf) {
-    String filename = conf.get(LANGUAGES_FILENAME, new Path(getWorkingDirectory(conf), "languages.txt").toString());
-    return new Path(filename);
-  }
-  
-  public static Path getWorkingDirectory(Configuration conf) {
-    String filename = conf.get(WORKING_DIRECTORY);
-    return new Path(filename);
-  }
-  
-  public static Analyzer getAnalyzer(Configuration conf) throws IOException {
-    Class<? extends Analyzer> analyzerClass = conf.getClass(ANALYZER, SimpleAnalyzer.class, Analyzer.class);
-    return ReflectionUtils.newInstance(analyzerClass, conf);
-  }
-  
-  public static Connector getConnector(Configuration conf) throws AccumuloException, AccumuloSecurityException {
-    return getInstance(conf).getConnector(getUser(conf), getPassword(conf));
-  }
-  
-  public static Instance getInstance(Configuration conf) {
-    return new ZooKeeperInstance(getInstanceName(conf), getZookeepers(conf));
-  }
-  
-  public static int getNumPartitions(Configuration conf) {
-    return conf.getInt(NUM_PARTITIONS, 25);
-  }
-  
-  public static int getNumGroups(Configuration conf) {
-    return conf.getInt(NUM_GROUPS, 1);
-  }
-  
-  public static Path getPartitionedArticlesPath(Configuration conf) {
-    return new Path(conf.get(PARTITIONED_ARTICLES_DIRECTORY));
-  }
-  
-  public static long getMinInputSplitSize(Configuration conf) {
-    return conf.getLong(PARTITIONED_INPUT_MIN_SPLIT_SIZE, 1l << 27);
-  }
-
-  public static boolean runPartitioner(Configuration conf) {
-    return conf.getBoolean(RUN_PARTITIONER, false);
-  }
-
-  public static boolean runIngest(Configuration conf) {
-    return conf.getBoolean(RUN_INGEST, true);
-  }
-
-  public static boolean bulkIngest(Configuration conf) {
-    return conf.getBoolean(BULK_INGEST, true);
-  }
-
-  public static String bulkIngestDir(Configuration conf) {
-    return conf.get(BULK_INGEST_DIR);
-  }
-
-  public static String bulkIngestFailureDir(Configuration conf) {
-    return conf.get(BULK_INGEST_FAILURE_DIR);
-  }
-  
-  public static long bulkIngestBufferSize(Configuration conf) {
-    return conf.getLong(BULK_INGEST_BUFFER_SIZE,1l<<28);
-  }
-
-  /**
-   * Helper method to get properties from Hadoop configuration
-   * 
-   * @param <T>
-   * @param conf
-   * @param propertyName
-   * @param resultClass
-   * @throws IllegalArgumentException
-   *           if property is not defined, null, or empty. Or if resultClass is not handled.
-   * @return value of property
-   */
-  @SuppressWarnings("unchecked")
-  public static <T> T isNull(Configuration conf, String propertyName, Class<T> resultClass) {
-    String p = conf.get(propertyName);
-    if (StringUtils.isEmpty(p))
-      throw new IllegalArgumentException(propertyName + " must be specified");
-    
-    if (resultClass.equals(String.class))
-      return (T) p;
-    else if (resultClass.equals(String[].class))
-      return (T) conf.getStrings(propertyName);
-    else if (resultClass.equals(Boolean.class))
-      return (T) Boolean.valueOf(p);
-    else if (resultClass.equals(Long.class))
-      return (T) Long.valueOf(p);
-    else if (resultClass.equals(Integer.class))
-      return (T) Integer.valueOf(p);
-    else if (resultClass.equals(Float.class))
-      return (T) Float.valueOf(p);
-    else if (resultClass.equals(Double.class))
-      return (T) Double.valueOf(p);
-    else
-      throw new IllegalArgumentException(resultClass.getSimpleName() + " is unhandled.");
-    
-  }
-
-}

http://git-wip-us.apache.org/repos/asf/accumulo/blob/8db62992/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/ingest/WikipediaIngester.java
----------------------------------------------------------------------
diff --git a/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/ingest/WikipediaIngester.java b/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/ingest/WikipediaIngester.java
deleted file mode 100644
index 50415a7..0000000
--- a/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/ingest/WikipediaIngester.java
+++ /dev/null
@@ -1,206 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.accumulo.examples.wikisearch.ingest;
-
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.Collections;
-import java.util.EnumSet;
-import java.util.List;
-import java.util.Set;
-import java.util.SortedSet;
-import java.util.TreeSet;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
-
-import org.apache.accumulo.core.client.AccumuloException;
-import org.apache.accumulo.core.client.AccumuloSecurityException;
-import org.apache.accumulo.core.client.Connector;
-import org.apache.accumulo.core.client.IteratorSetting;
-import org.apache.accumulo.core.client.IteratorSetting.Column;
-import org.apache.accumulo.core.client.TableExistsException;
-import org.apache.accumulo.core.client.TableNotFoundException;
-import org.apache.accumulo.core.client.admin.TableOperations;
-import org.apache.accumulo.core.client.mapreduce.AccumuloOutputFormat;
-import org.apache.accumulo.core.data.Mutation;
-import org.apache.accumulo.core.iterators.IteratorUtil.IteratorScope;
-import org.apache.accumulo.core.iterators.user.SummingCombiner;
-import org.apache.accumulo.examples.wikisearch.iterator.GlobalIndexUidCombiner;
-import org.apache.accumulo.examples.wikisearch.iterator.TextIndexCombiner;
-import org.apache.accumulo.examples.wikisearch.reader.AggregatingRecordReader;
-import org.apache.commons.lang.StringUtils;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.conf.Configured;
-import org.apache.hadoop.fs.FileStatus;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.fs.PathFilter;
-import org.apache.hadoop.io.Text;
-import org.apache.hadoop.mapreduce.Job;
-import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
-import org.apache.hadoop.util.Tool;
-import org.apache.hadoop.util.ToolRunner;
-
-public class WikipediaIngester extends Configured implements Tool {
-  
-  public final static String INGEST_LANGUAGE = "wikipedia.ingest_language";
-  public final static String SPLIT_FILE = "wikipedia.split_file";
-  public final static String TABLE_NAME = "wikipedia.table";
-  
-  public static void main(String[] args) throws Exception {
-    int res = ToolRunner.run(new Configuration(), new WikipediaIngester(), args);
-    System.exit(res);
-  }
-  
-  private void createTables(TableOperations tops, String tableName) throws AccumuloException, AccumuloSecurityException, TableNotFoundException,
-      TableExistsException {
-    // Create the shard table
-    String indexTableName = tableName + "Index";
-    String reverseIndexTableName = tableName + "ReverseIndex";
-    String metadataTableName = tableName + "Metadata";
-    
-    // create the shard table
-    if (!tops.exists(tableName)) {
-      // Set a text index combiner on the given field names. No combiner is set if the option is not supplied
-      String textIndexFamilies = WikipediaMapper.TOKENS_FIELD_NAME;
-      
-      tops.create(tableName);
-      if (textIndexFamilies.length() > 0) {
-        System.out.println("Adding content combiner on the fields: " + textIndexFamilies);
-        
-        IteratorSetting setting = new IteratorSetting(10, TextIndexCombiner.class);
-        List<Column> columns = new ArrayList<Column>();
-        for (String family : StringUtils.split(textIndexFamilies, ',')) {
-          columns.add(new Column("fi\0" + family));
-        }
-        TextIndexCombiner.setColumns(setting, columns);
-        TextIndexCombiner.setLossyness(setting, true);
-        
-        tops.attachIterator(tableName, setting, EnumSet.allOf(IteratorScope.class));
-      }
-      
-      // Set the locality group for the full content column family
-      tops.setLocalityGroups(tableName, Collections.singletonMap("WikipediaDocuments", Collections.singleton(new Text(WikipediaMapper.DOCUMENT_COLUMN_FAMILY))));
-      
-    }
-    
-    if (!tops.exists(indexTableName)) {
-      tops.create(indexTableName);
-      // Add the UID combiner
-      IteratorSetting setting = new IteratorSetting(19, "UIDAggregator", GlobalIndexUidCombiner.class);
-      GlobalIndexUidCombiner.setCombineAllColumns(setting, true);
-      GlobalIndexUidCombiner.setLossyness(setting, true);
-      tops.attachIterator(indexTableName, setting, EnumSet.allOf(IteratorScope.class));
-    }
-    
-    if (!tops.exists(reverseIndexTableName)) {
-      tops.create(reverseIndexTableName);
-      // Add the UID combiner
-      IteratorSetting setting = new IteratorSetting(19, "UIDAggregator", GlobalIndexUidCombiner.class);
-      GlobalIndexUidCombiner.setCombineAllColumns(setting, true);
-      GlobalIndexUidCombiner.setLossyness(setting, true);
-      tops.attachIterator(reverseIndexTableName, setting, EnumSet.allOf(IteratorScope.class));
-    }
-    
-    if (!tops.exists(metadataTableName)) {
-      // Add the SummingCombiner with VARLEN encoding for the frequency column
-      tops.create(metadataTableName);
-      IteratorSetting setting = new IteratorSetting(10, SummingCombiner.class);
-      SummingCombiner.setColumns(setting, Collections.singletonList(new Column("f")));
-      SummingCombiner.setEncodingType(setting, SummingCombiner.Type.VARLEN);
-      tops.attachIterator(metadataTableName, setting, EnumSet.allOf(IteratorScope.class));
-    }
-  }
-  
-  @Override
-  public int run(String[] args) throws Exception {
-    Job job = new Job(getConf(), "Ingest Wikipedia");
-    Configuration conf = job.getConfiguration();
-    conf.set("mapred.map.tasks.speculative.execution", "false");
-    
-    String tablename = WikipediaConfiguration.getTableName(conf);
-    
-    String zookeepers = WikipediaConfiguration.getZookeepers(conf);
-    String instanceName = WikipediaConfiguration.getInstanceName(conf);
-    
-    String user = WikipediaConfiguration.getUser(conf);
-    byte[] password = WikipediaConfiguration.getPassword(conf);
-    Connector connector = WikipediaConfiguration.getConnector(conf);
-    
-    TableOperations tops = connector.tableOperations();
-    
-    createTables(tops, tablename);
-    
-    configureJob(job);
-    
-    List<Path> inputPaths = new ArrayList<Path>();
-    SortedSet<String> languages = new TreeSet<String>();
-    FileSystem fs = FileSystem.get(conf);
-    Path parent = new Path(conf.get("wikipedia.input"));
-    listFiles(parent, fs, inputPaths, languages);
-    
-    System.out.println("Input files in " + parent + ":" + inputPaths.size());
-    Path[] inputPathsArray = new Path[inputPaths.size()];
-    inputPaths.toArray(inputPathsArray);
-    
-    System.out.println("Languages:" + languages.size());
-    
-    FileInputFormat.setInputPaths(job, inputPathsArray);
-    
-    job.setMapperClass(WikipediaMapper.class);
-    job.setNumReduceTasks(0);
-    job.setMapOutputKeyClass(Text.class);
-    job.setMapOutputValueClass(Mutation.class);
-    job.setOutputFormatClass(AccumuloOutputFormat.class);
-    AccumuloOutputFormat.setOutputInfo(job.getConfiguration(), user, password, true, tablename);
-    AccumuloOutputFormat.setZooKeeperInstance(job.getConfiguration(), instanceName, zookeepers);
-    
-    return job.waitForCompletion(true) ? 0 : 1;
-  }
-  
-  public final static PathFilter partFilter = new PathFilter() {
-    @Override
-    public boolean accept(Path path) {
-      return path.getName().startsWith("part");
-    };
-  };
-  
-  protected void configureJob(Job job) {
-    Configuration conf = job.getConfiguration();
-    job.setJarByClass(WikipediaIngester.class);
-    job.setInputFormatClass(WikipediaInputFormat.class);
-    conf.set(AggregatingRecordReader.START_TOKEN, "<page>");
-    conf.set(AggregatingRecordReader.END_TOKEN, "</page>");
-  }
-  
-  protected static final Pattern filePattern = Pattern.compile("([a-z_]+).*.xml(.bz2)?");
-  
-  protected void listFiles(Path path, FileSystem fs, List<Path> files, Set<String> languages) throws IOException {
-    for (FileStatus status : fs.listStatus(path)) {
-      if (status.isDir()) {
-        listFiles(status.getPath(), fs, files, languages);
-      } else {
-        Path p = status.getPath();
-        Matcher matcher = filePattern.matcher(p.getName());
-        if (matcher.matches()) {
-          languages.add(matcher.group(1));
-          files.add(p);
-        }
-      }
-    }
-  }
-}

http://git-wip-us.apache.org/repos/asf/accumulo/blob/8db62992/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/ingest/WikipediaInputFormat.java
----------------------------------------------------------------------
diff --git a/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/ingest/WikipediaInputFormat.java b/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/ingest/WikipediaInputFormat.java
deleted file mode 100644
index c582cbf..0000000
--- a/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/ingest/WikipediaInputFormat.java
+++ /dev/null
@@ -1,136 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.accumulo.examples.wikisearch.ingest;
-
-import java.io.DataInput;
-import java.io.DataOutput;
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.List;
-
-import org.apache.accumulo.examples.wikisearch.reader.AggregatingRecordReader;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.io.LongWritable;
-import org.apache.hadoop.io.Text;
-import org.apache.hadoop.io.Writable;
-import org.apache.hadoop.mapreduce.InputSplit;
-import org.apache.hadoop.mapreduce.JobContext;
-import org.apache.hadoop.mapreduce.RecordReader;
-import org.apache.hadoop.mapreduce.TaskAttemptContext;
-import org.apache.hadoop.mapreduce.lib.input.FileSplit;
-import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
-
-
-public class WikipediaInputFormat extends TextInputFormat {
-
-  public static class WikipediaInputSplit extends InputSplit implements Writable {
-
-    public WikipediaInputSplit(){}
-    
-    public WikipediaInputSplit(FileSplit fileSplit, int partition)
-    {
-      this.fileSplit = fileSplit;
-      this.partition = partition;
-    }
-    
-    private FileSplit fileSplit = null;
-    private int partition = -1;
-
-    public int getPartition()
-    {
-      return partition;
-    }
-    
-    public FileSplit getFileSplit()
-    {
-      return fileSplit;
-    }
-    
-    @Override
-    public long getLength() throws IOException, InterruptedException {
-      return fileSplit.getLength();
-    }
-
-    @Override
-    public String[] getLocations() throws IOException, InterruptedException {
-      // for highly replicated files, returning all of the locations can lead to bunching
-      // TODO replace this with a subset of the locations
-      return fileSplit.getLocations();
-    }
-
-    @Override
-    public void readFields(DataInput in) throws IOException {
-      Path file = new Path(in.readUTF());
-      long start = in.readLong();
-      long length = in.readLong();
-      String [] hosts = null;
-      if(in.readBoolean())
-      {
-        int numHosts = in.readInt();
-        hosts = new String[numHosts];
-        for(int i = 0; i < numHosts; i++)
-          hosts[i] = in.readUTF();
-      }
-      fileSplit = new FileSplit(file, start, length, hosts);
-      partition = in.readInt();
-    }
-
-    @Override
-    public void write(DataOutput out) throws IOException {
-      out.writeUTF(fileSplit.getPath().toString());
-      out.writeLong(fileSplit.getStart());
-      out.writeLong(fileSplit.getLength());
-      String [] hosts = fileSplit.getLocations();
-      if(hosts == null)
-      {
-        out.writeBoolean(false);
-      }
-      else
-      {
-        out.writeBoolean(true);
-        out.writeInt(hosts.length);
-        for(String host:hosts)
-        out.writeUTF(host);
-      }
-      out.writeInt(partition);
-    }
-    
-  }
-  
-  @Override
-  public List<InputSplit> getSplits(JobContext job) throws IOException {
-    List<InputSplit> superSplits = super.getSplits(job);
-    List<InputSplit> splits = new ArrayList<InputSplit>();
-    
-    int numGroups = WikipediaConfiguration.getNumGroups(job.getConfiguration());
-
-    for(int group = 0; group < numGroups; group++)
-    {
-      for(InputSplit split:superSplits)
-      {
-        FileSplit fileSplit = (FileSplit)split;
-        splits.add(new WikipediaInputSplit(fileSplit,group));
-      }
-    }
-    return splits;
-  }
-
-  @Override
-  public RecordReader<LongWritable,Text> createRecordReader(InputSplit split, TaskAttemptContext context) {
-    return new AggregatingRecordReader();
-  }
-}

http://git-wip-us.apache.org/repos/asf/accumulo/blob/8db62992/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/ingest/WikipediaMapper.java
----------------------------------------------------------------------
diff --git a/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/ingest/WikipediaMapper.java b/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/ingest/WikipediaMapper.java
deleted file mode 100644
index 8565b09..0000000
--- a/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/ingest/WikipediaMapper.java
+++ /dev/null
@@ -1,245 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-/**
- * 
- */
-package org.apache.accumulo.examples.wikisearch.ingest;
-
-
-import java.io.ByteArrayInputStream;
-import java.io.IOException;
-import java.io.InputStreamReader;
-import java.io.StringReader;
-import java.nio.charset.Charset;
-import java.util.HashSet;
-import java.util.IllegalFormatException;
-import java.util.Map.Entry;
-import java.util.Set;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
-
-import org.apache.accumulo.core.data.Mutation;
-import org.apache.accumulo.core.data.Value;
-import org.apache.accumulo.core.security.ColumnVisibility;
-import org.apache.accumulo.examples.wikisearch.ingest.ArticleExtractor.Article;
-import org.apache.accumulo.examples.wikisearch.ingest.WikipediaInputFormat.WikipediaInputSplit;
-import org.apache.accumulo.examples.wikisearch.normalizer.LcNoDiacriticsNormalizer;
-import org.apache.accumulo.examples.wikisearch.protobuf.Uid;
-import org.apache.accumulo.examples.wikisearch.protobuf.Uid.List.Builder;
-import org.apache.commons.codec.binary.Base64;
-import org.apache.commons.lang.StringUtils;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.io.LongWritable;
-import org.apache.hadoop.io.Text;
-import org.apache.hadoop.mapreduce.Mapper;
-import org.apache.hadoop.mapreduce.lib.input.FileSplit;
-import org.apache.log4j.Logger;
-import org.apache.lucene.analysis.tokenattributes.TermAttribute;
-import org.apache.lucene.wikipedia.analysis.WikipediaTokenizer;
-
-import com.google.common.collect.HashMultimap;
-import com.google.common.collect.Multimap;
-
-public class WikipediaMapper extends Mapper<LongWritable,Text,Text,Mutation> {
-  
-  private static final Logger log = Logger.getLogger(WikipediaMapper.class);
-  
-  public final static Charset UTF8 = Charset.forName("UTF-8");
-  public static final String DOCUMENT_COLUMN_FAMILY = "d";
-  public static final String METADATA_EVENT_COLUMN_FAMILY = "e";
-  public static final String METADATA_INDEX_COLUMN_FAMILY = "i";
-  public static final String TOKENS_FIELD_NAME = "TEXT";
-  
-  private final static Pattern languagePattern = Pattern.compile("([a-z_]+).*.xml(.bz2)?");
-  private static final Value NULL_VALUE = new Value(new byte[0]);
-  private static final String cvPrefix = "all|";
-  
-  private ArticleExtractor extractor;
-  private String language;
-  private int numPartitions = 0;
-  private ColumnVisibility cv = null;
-
-  private int myGroup = -1;
-  private int numGroups = -1;
-  
-  private Text tablename = null;
-  private Text indexTableName = null;
-  private Text reverseIndexTableName = null;
-  private Text metadataTableName = null;
-  
-  @Override
-  public void setup(Context context) {
-    Configuration conf = context.getConfiguration();
-    tablename = new Text(WikipediaConfiguration.getTableName(conf));
-    indexTableName = new Text(tablename + "Index");
-    reverseIndexTableName = new Text(tablename + "ReverseIndex");
-    metadataTableName = new Text(tablename + "Metadata");
-    
-    WikipediaInputSplit wiSplit = (WikipediaInputSplit)context.getInputSplit();
-    myGroup = wiSplit.getPartition();
-    numGroups = WikipediaConfiguration.getNumGroups(conf);
-    
-    FileSplit split = wiSplit.getFileSplit();
-    String fileName = split.getPath().getName();
-    Matcher matcher = languagePattern.matcher(fileName);
-    if (matcher.matches()) {
-      language = matcher.group(1).replace('_', '-').toLowerCase();
-    } else {
-      throw new RuntimeException("Unknown ingest language! " + fileName);
-    }
-    extractor = new ArticleExtractor();
-    numPartitions = WikipediaConfiguration.getNumPartitions(conf);
-    cv = new ColumnVisibility(cvPrefix + language);
-    
-  }
-  
-  /**
-   * We will partition the documents based on the document id
-   * 
-   * @param article
-   * @param numPartitions
-   * @return The number of the partition for a given article.
-   * @throws IllegalFormatException
-   */
-  public static int getPartitionId(Article article, int numPartitions) throws IllegalFormatException {
-    return article.getId() % numPartitions;
-  }
-  
-  static HashSet<String> metadataSent = new HashSet<String>();
-
-  @Override
-  protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
-    Article article = extractor.extract(new InputStreamReader(new ByteArrayInputStream(value.getBytes()), UTF8));
-    String NULL_BYTE = "\u0000";
-    String colfPrefix = language + NULL_BYTE;
-    String indexPrefix = "fi" + NULL_BYTE;
-    if (article != null) {
-      int groupId = WikipediaMapper.getPartitionId(article, numGroups);
-      if(groupId != myGroup)
-        return;
-      Text partitionId = new Text(Integer.toString(WikipediaMapper.getPartitionId(article, numPartitions)));
-      
-      // Create the mutations for the document.
-      // Row is partition id, colf is language0articleid, colq is fieldName\0fieldValue
-      Mutation m = new Mutation(partitionId);
-      for (Entry<String,Object> entry : article.getFieldValues().entrySet()) {
-        m.put(colfPrefix + article.getId(), entry.getKey() + NULL_BYTE + entry.getValue().toString(), cv, article.getTimestamp(), NULL_VALUE);
-        // Create mutations for the metadata table.
-        String metadataKey = entry.getKey() + METADATA_EVENT_COLUMN_FAMILY + language;
-        if (!metadataSent.contains(metadataKey)) {
-          Mutation mm = new Mutation(entry.getKey());
-          mm.put(METADATA_EVENT_COLUMN_FAMILY, language, cv, article.getTimestamp(), NULL_VALUE);
-          context.write(metadataTableName, mm);
-          metadataSent.add(metadataKey);
-        }
-      }
-      
-      // Tokenize the content
-      Set<String> tokens = getTokens(article);
-      
-      // We are going to put the fields to be indexed into a multimap. This allows us to iterate
-      // over the entire set once.
-      Multimap<String,String> indexFields = HashMultimap.create();
-      // Add the normalized field values
-      LcNoDiacriticsNormalizer normalizer = new LcNoDiacriticsNormalizer();
-      for (Entry<String,String> index : article.getNormalizedFieldValues().entrySet())
-        indexFields.put(index.getKey(), index.getValue());
-      // Add the tokens
-      for (String token : tokens)
-        indexFields.put(TOKENS_FIELD_NAME, normalizer.normalizeFieldValue("", token));
-      
-      for (Entry<String,String> index : indexFields.entries()) {
-        // Create mutations for the in partition index
-        // Row is partition id, colf is 'fi'\0fieldName, colq is fieldValue\0language\0article id
-        m.put(indexPrefix + index.getKey(), index.getValue() + NULL_BYTE + colfPrefix + article.getId(), cv, article.getTimestamp(), NULL_VALUE);
-        
-        // Create mutations for the global index
-        // Create a UID object for the Value
-        Builder uidBuilder = Uid.List.newBuilder();
-        uidBuilder.setIGNORE(false);
-        uidBuilder.setCOUNT(1);
-        uidBuilder.addUID(Integer.toString(article.getId()));
-        Uid.List uidList = uidBuilder.build();
-        Value val = new Value(uidList.toByteArray());
-        
-        // Create mutations for the global index
-        // Row is field value, colf is field name, colq is partitionid\0language, value is Uid.List object
-        Mutation gm = new Mutation(index.getValue());
-        gm.put(index.getKey(), partitionId + NULL_BYTE + language, cv, article.getTimestamp(), val);
-        context.write(indexTableName, gm);
-        
-        // Create mutations for the global reverse index
-        Mutation grm = new Mutation(StringUtils.reverse(index.getValue()));
-        grm.put(index.getKey(), partitionId + NULL_BYTE + language, cv, article.getTimestamp(), val);
-        context.write(reverseIndexTableName, grm);
-        
-        // Create mutations for the metadata table.
-        String metadataKey = index.getKey() + METADATA_INDEX_COLUMN_FAMILY + language;
-        if (!metadataSent.contains(metadataKey)) {
-          Mutation mm = new Mutation(index.getKey());
-          mm.put(METADATA_INDEX_COLUMN_FAMILY, language + NULL_BYTE + LcNoDiacriticsNormalizer.class.getName(), cv, article.getTimestamp(), NULL_VALUE);
-          context.write(metadataTableName, mm);
-          metadataSent.add(metadataKey);
-        }
-      }
-      // Add the entire text to the document section of the table.
-      // row is the partition, colf is 'd', colq is language\0articleid, value is Base64 encoded GZIP'd document
-      m.put(DOCUMENT_COLUMN_FAMILY, colfPrefix + article.getId(), cv, article.getTimestamp(), new Value(Base64.encodeBase64(article.getText().getBytes())));
-      context.write(tablename, m);
-      
-    } else {
-      context.getCounter("wikipedia", "invalid articles").increment(1);
-    }
-    context.progress();
-  }
-  
-  /**
-   * Tokenize the wikipedia content
-   * 
-   * @param article
-   * @return
-   * @throws IOException
-   */
-  static Set<String> getTokens(Article article) throws IOException {
-    Set<String> tokenList = new HashSet<String>();
-    WikipediaTokenizer tok = new WikipediaTokenizer(new StringReader(article.getText()));
-    TermAttribute term = tok.addAttribute(TermAttribute.class);
-    try {
-      while (tok.incrementToken()) {
-        String token = term.term();
-        if (!StringUtils.isEmpty(token))
-          tokenList.add(token);
-      }
-    } catch (IOException e) {
-      log.error("Error tokenizing text", e);
-    } finally {
-      try {
-        tok.end();
-      } catch (IOException e) {
-        log.error("Error calling end()", e);
-      } finally {
-        try {
-          tok.close();
-        } catch (IOException e) {
-          log.error("Error closing tokenizer", e);
-        }
-      }
-    }
-    return tokenList;
-  }
-  
-}


Mime
View raw message