Return-Path: X-Original-To: apmail-mahout-commits-archive@www.apache.org Delivered-To: apmail-mahout-commits-archive@www.apache.org Received: from mail.apache.org (hermes.apache.org [140.211.11.3]) by minotaur.apache.org (Postfix) with SMTP id 8A9A8179C8 for ; Sat, 4 Apr 2015 19:48:27 +0000 (UTC) Received: (qmail 28112 invoked by uid 500); 4 Apr 2015 19:48:27 -0000 Delivered-To: apmail-mahout-commits-archive@mahout.apache.org Received: (qmail 28057 invoked by uid 500); 4 Apr 2015 19:48:27 -0000 Mailing-List: contact commits-help@mahout.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: dev@mahout.apache.org Delivered-To: mailing list commits@mahout.apache.org Received: (qmail 28047 invoked by uid 99); 4 Apr 2015 19:48:27 -0000 Received: from git1-us-west.apache.org (HELO git1-us-west.apache.org) (140.211.11.23) by apache.org (qpsmtpd/0.29) with ESMTP; Sat, 04 Apr 2015 19:48:27 +0000 Received: by git1-us-west.apache.org (ASF Mail Server at git1-us-west.apache.org, from userid 33) id 156DCE1804; Sat, 4 Apr 2015 19:48:27 +0000 (UTC) Content-Type: text/plain; charset="us-ascii" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit From: apalumbo@apache.org To: commits@mahout.apache.org Message-Id: <94138df29a334f00a7ea0b11e2a0cc9e@git.apache.org> X-Mailer: ASF-Git Admin Mailer Subject: mahout git commit: MAHOUT-1493: Naive Bayes CLI cleanup closes apache/mahout#102 Date: Sat, 4 Apr 2015 19:48:27 +0000 (UTC) Repository: mahout Updated Branches: refs/heads/master eb77ce6e6 -> 1bcda3214 MAHOUT-1493: Naive Bayes CLI cleanup closes apache/mahout#102 Project: http://git-wip-us.apache.org/repos/asf/mahout/repo Commit: http://git-wip-us.apache.org/repos/asf/mahout/commit/1bcda321 Tree: http://git-wip-us.apache.org/repos/asf/mahout/tree/1bcda321 Diff: http://git-wip-us.apache.org/repos/asf/mahout/diff/1bcda321 Branch: refs/heads/master Commit: 1bcda32146713144eac4889c3065c448417c36f6 Parents: eb77ce6 Author: Andrew Palumbo Authored: Sat Apr 4 15:47:25 2015 -0400 Committer: Andrew Palumbo Committed: Sat Apr 4 15:47:25 2015 -0400 ---------------------------------------------------------------------- examples/bin/classify-20newsgroups.sh | 22 ++++++++------ .../apache/mahout/drivers/TestNBDriver.scala | 30 ++++++++------------ .../apache/mahout/drivers/TrainNBDriver.scala | 21 ++++---------- 3 files changed, 32 insertions(+), 41 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/mahout/blob/1bcda321/examples/bin/classify-20newsgroups.sh ---------------------------------------------------------------------- diff --git a/examples/bin/classify-20newsgroups.sh b/examples/bin/classify-20newsgroups.sh index e92dc7d..061487b 100755 --- a/examples/bin/classify-20newsgroups.sh +++ b/examples/bin/classify-20newsgroups.sh @@ -42,7 +42,7 @@ if [ "$HADOOP_HOME" != "" ] && [ "$MAHOUT_LOCAL" == "" ] ; then fi WORK_DIR=/tmp/mahout-work-${USER} -algorithm=( cnaivebayes-MapReduce naivebayes-MapReduce cnaivebayes-Spark naivebayes-Spark sgd-MapReduce clean) +algorithm=( cnaivebayes-MapReduce naivebayes-MapReduce cnaivebayes-Spark naivebayes-Spark sgd clean) if [ -n "$1" ]; then choice=$1 else @@ -59,6 +59,17 @@ fi echo "ok. You chose $choice and we'll use ${algorithm[$choice-1]}" alg=${algorithm[$choice-1]} +# Spark specific check and work +if [ "x$alg" == "xnaivebayes-Spark" -o "x$alg" == "xcnaivebayes-Spark" ]; then + if [ "$MASTER" == "" ] ; then + echo "Plese set your MASTER env variable to point to your Spark Master URL. exiting..." + exit 1 + fi + set +e + $HADOOP dfs -rmr ${WORK_DIR}/spark-model + set -e +fi + if [ "x$alg" != "xclean" ]; then echo "creating work directory at ${WORK_DIR}" @@ -98,7 +109,6 @@ if ( [ "x$alg" == "xnaivebayes-MapReduce" ] || [ "x$alg" == "xcnaivebayes-MapR echo "Copying 20newsgroups data to HDFS" set +e $HADOOP dfs -rmr ${WORK_DIR}/20news-all - $HADOOP dfs -rmr ${WORK_DIR}/spark-model $HADOOP dfs -mkdir ${WORK_DIR} set -e $HADOOP dfs -put ${WORK_DIR}/20news-all ${WORK_DIR}/20news-all @@ -147,9 +157,6 @@ if ( [ "x$alg" == "xnaivebayes-MapReduce" ] || [ "x$alg" == "xcnaivebayes-MapR -ow -o ${WORK_DIR}/20news-testing $c elif [ "x$alg" == "xnaivebayes-Spark" -o "x$alg" == "xcnaivebayes-Spark" ]; then - set +e - $HADOOP dfs -rmr ${WORK_DIR}/spark-model - set -e echo "Training Naive Bayes model" ./bin/mahout spark-trainnb \ @@ -159,16 +166,15 @@ if ( [ "x$alg" == "xnaivebayes-MapReduce" ] || [ "x$alg" == "xcnaivebayes-MapR echo "Self testing on training set" ./bin/mahout spark-testnb \ -i ${WORK_DIR}/20news-train-vectors\ - -o ${WORK_DIR}\ -m ${WORK_DIR}/spark-model $c -ma $MASTER echo "Testing on holdout set" ./bin/mahout spark-testnb \ -i ${WORK_DIR}/20news-test-vectors\ - -o ${WORK_DIR}\ -m ${WORK_DIR}/spark-model $c -ma $MASTER + fi -elif [ "x$alg" == "xsgd-MapReduce" ]; then +elif [ "x$alg" == "xsgd" ]; then if [ ! -e "/tmp/news-group.model" ]; then echo "Training on ${WORK_DIR}/20news-bydate/20news-bydate-train/" ./bin/mahout org.apache.mahout.classifier.sgd.TrainNewsGroups ${WORK_DIR}/20news-bydate/20news-bydate-train/ http://git-wip-us.apache.org/repos/asf/mahout/blob/1bcda321/spark/src/main/scala/org/apache/mahout/drivers/TestNBDriver.scala ---------------------------------------------------------------------- diff --git a/spark/src/main/scala/org/apache/mahout/drivers/TestNBDriver.scala b/spark/src/main/scala/org/apache/mahout/drivers/TestNBDriver.scala index 9e73094..1a9228b 100644 --- a/spark/src/main/scala/org/apache/mahout/drivers/TestNBDriver.scala +++ b/spark/src/main/scala/org/apache/mahout/drivers/TestNBDriver.scala @@ -18,7 +18,6 @@ package org.apache.mahout.drivers import org.apache.mahout.classifier.naivebayes.{NBModel, NaiveBayes} -import org.apache.mahout.classifier.stats.ConfusionMatrix import org.apache.mahout.math.drm import org.apache.mahout.math.drm.DrmLike import scala.collection.immutable.HashMap @@ -37,38 +36,34 @@ object TestNBDriver extends MahoutSparkDriver { parser = new MahoutSparkOptionParser(programName = "spark-testnb") { head("spark-testnb", "Mahout 0.10.0") - //Input output options, non-driver specific - parseIOOptions(numInputs = 1) + // Input options, non-driver specific + // we have no output except the confusion matrix to stdout so we don't need an + // output option - //Algorithm control options--driver specific + note("Input, option") + opt[String]('i', "input") required() action { (x, options) => + options + ("input" -> x) + } text ("Input: path to test data " + + " (required)") + + // Algorithm control options--driver specific opts = opts ++ testNBOptipns note("\nAlgorithm control options:") - //default testComplementary is false + // default testComplementary is false opts = opts + ("testComplementary" -> false) opt[Unit]("testComplementary") abbr ("c") action { (_, options) => options + ("testComplementary" -> true) } text ("Test a complementary model, Default: false.") - opt[String]("pathToModel") abbr ("m") action { (x, options) => options + ("pathToModel" -> x) } text ("Path to the Trained Model") - - //How to search for input - parseFileDiscoveryOptions() - - //IndexedDataset output schema--not driver specific, IndexedDataset specific - parseIndexedDatasetFormatOptions() - - //Spark config options--not driver specific + // Spark config options--not driver specific parseSparkOptions() - //Jar inclusion, this option can be set when executing the driver from compiled code, not when from CLI - parseGenericOptions() - help("help") abbr ("h") text ("prints this usage text\n") } @@ -96,7 +91,6 @@ object TestNBDriver extends MahoutSparkDriver { start() val testComplementary = parser.opts("testComplementary").asInstanceOf[Boolean] - val outputPath = parser.opts("output").asInstanceOf[String] // todo: get the -ow option in to check for a model in the path and overwrite if flagged. http://git-wip-us.apache.org/repos/asf/mahout/blob/1bcda321/spark/src/main/scala/org/apache/mahout/drivers/TrainNBDriver.scala ---------------------------------------------------------------------- diff --git a/spark/src/main/scala/org/apache/mahout/drivers/TrainNBDriver.scala b/spark/src/main/scala/org/apache/mahout/drivers/TrainNBDriver.scala index 2edebca..bbedf64 100644 --- a/spark/src/main/scala/org/apache/mahout/drivers/TrainNBDriver.scala +++ b/spark/src/main/scala/org/apache/mahout/drivers/TrainNBDriver.scala @@ -37,32 +37,22 @@ object TrainNBDriver extends MahoutSparkDriver { parser = new MahoutSparkOptionParser(programName = "spark-trainnb") { head("spark-trainnb", "Mahout 0.10.0") - //Input output options, non-driver specific + // Input output options, non-driver specific parseIOOptions(numInputs = 1) - //Algorithm control options--driver specific + // Algorithm control options--driver specific opts = opts ++ trainNBOptipns note("\nAlgorithm control options:") - //default trainComplementary is false + // default trainComplementary is false opts = opts + ("trainComplementary" -> false) opt[Unit]("trainComplementary") abbr ("c") action { (_, options) => options + ("trainComplementary" -> true) } text ("Train a complementary model, Default: false.") - - //How to search for input - parseFileDiscoveryOptions() - - //IndexedDataset output schema--not driver specific, IndexedDataset specific - parseIndexedDatasetFormatOptions() - - //Spark config options--not driver specific + // Spark config options--not driver specific parseSparkOptions() - //Jar inclusion, this option can be set when executing the driver from compiled code, not when from CLI - parseGenericOptions() - help("help") abbr ("h") text ("prints this usage text\n") } @@ -86,8 +76,9 @@ object TrainNBDriver extends MahoutSparkDriver { val outputPath = parser.opts("output").asInstanceOf[String] val trainingSet = readTrainingSet + // Use Spark-Optimized Naive Bayes here to extract labels and aggregate options val (labelIndex, aggregatedObservations) = SparkNaiveBayes.extractLabelsAndAggregateObservations(trainingSet) - val model = NaiveBayes.train(aggregatedObservations, labelIndex) + val model = NaiveBayes.train(aggregatedObservations, labelIndex, complementary) model.dfsWrite(outputPath)