mahout-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From sro...@apache.org
Subject svn commit: r1170702 - in /mahout/trunk/examples/bin: build-20news-bayes.sh build-cluster-syntheticcontrol.sh build-reuters.sh factorize-movielens-1M.sh
Date Wed, 14 Sep 2011 16:12:54 GMT
Author: srowen
Date: Wed Sep 14 16:12:53 2011
New Revision: 1170702

URL: http://svn.apache.org/viewvc?rev=1170702&view=rev
Log:
MAHOUT-811 move work dir to /tmp

Modified:
    mahout/trunk/examples/bin/build-20news-bayes.sh
    mahout/trunk/examples/bin/build-cluster-syntheticcontrol.sh
    mahout/trunk/examples/bin/build-reuters.sh
    mahout/trunk/examples/bin/factorize-movielens-1M.sh

Modified: mahout/trunk/examples/bin/build-20news-bayes.sh
URL: http://svn.apache.org/viewvc/mahout/trunk/examples/bin/build-20news-bayes.sh?rev=1170702&r1=1170701&r2=1170702&view=diff
==============================================================================
--- mahout/trunk/examples/bin/build-20news-bayes.sh (original)
+++ mahout/trunk/examples/bin/build-20news-bayes.sh Wed Sep 14 16:12:53 2011
@@ -27,16 +27,20 @@ if [ "$0" != "$SCRIPT_PATH" ] && [ "$SCR
   cd $SCRIPT_PATH
 fi
 
-mkdir -p work
-if [ ! -e work/20news-bayesinput ]; then
-  if [ ! -e work/20news-bydate ]; then
-    if [ ! -f work/20news-bydate.tar.gz ]; then
+WORK_DIR=/tmp/mahout-work-${USER}
+
+echo "creating work directory at ${WORK_DIR}"
+
+mkdir -p ${WORK_DIR}
+if [ ! -e ${WORK_DIR}/20news-bayesinput ]; then
+  if [ ! -e ${WORK_DIR}/20news-bydate ]; then
+    if [ ! -f ${WORK_DIR}/20news-bydate.tar.gz ]; then
       echo "Downloading 20news-bydate"
-      curl http://people.csail.mit.edu/jrennie/20Newsgroups/20news-bydate.tar.gz -o work/20news-bydate.tar.gz
+      curl http://people.csail.mit.edu/jrennie/20Newsgroups/20news-bydate.tar.gz -o ${WORK_DIR}/20news-bydate.tar.gz
     fi
-    mkdir -p work/20news-bydate
+    mkdir -p ${WORK_DIR}/20news-bydate
     echo "Extracting..."
-    cd work/20news-bydate && tar xzf ../20news-bydate.tar.gz && cd .. && cd ..
+    cd ${WORK_DIR}/20news-bydate && tar xzf ../20news-bydate.tar.gz && cd .. && cd ..
   fi
 fi
 
@@ -45,14 +49,14 @@ cd ../..
 set -e
 
 ./bin/mahout org.apache.mahout.classifier.bayes.PrepareTwentyNewsgroups \
-  -p examples/bin/work/20news-bydate/20news-bydate-train \
-  -o examples/bin/work/20news-bydate/bayes-train-input \
+  -p ${WORK_DIR}/20news-bydate/20news-bydate-train \
+  -o ${WORK_DIR}/20news-bydate/bayes-train-input \
   -a org.apache.mahout.vectorizer.DefaultAnalyzer \
   -c UTF-8
 
 ./bin/mahout org.apache.mahout.classifier.bayes.PrepareTwentyNewsgroups \
-  -p examples/bin/work/20news-bydate/20news-bydate-test \
-  -o examples/bin/work/20news-bydate/bayes-test-input \
+  -p ${WORK_DIR}/20news-bydate/20news-bydate-test \
+  -o ${WORK_DIR}/20news-bydate/bayes-test-input \
   -a org.apache.mahout.vectorizer.DefaultAnalyzer \
   -c UTF-8 
 
@@ -65,33 +69,36 @@ if [ "$HADOOP_HOME" != "" ]; then
 
     set +e 
     hadoop dfs -rmr \
-      examples/bin/work/20news-bydate/bayes-train-input 
+      ${WORK_DIR}/20news-bydate/bayes-train-input 
 
     hadoop dfs -rmr \
-      examples/bin/work/20news-bydate/bayes-test-input
+      ${WORK_DIR}/20news-bydate/bayes-test-input
 
     set -e
     hadoop dfs -put \
-      examples/bin/work/20news-bydate/bayes-train-input \
-      examples/bin/work/20news-bydate/bayes-train-input 
+      ${WORK_DIR}/20news-bydate/bayes-train-input \
+      ${WORK_DIR}/20news-bydate/bayes-train-input 
 
     hadoop dfs -put \
-      examples/bin/work/20news-bydate/bayes-test-input \
-      examples/bin/work/20news-bydate/bayes-test-input
+      ${WORK_DIR}/20news-bydate/bayes-test-input \
+      ${WORK_DIR}/20news-bydate/bayes-test-input
 fi
 
 
 ./bin/mahout trainclassifier \
-  -i examples/bin/work/20news-bydate/bayes-train-input \
-  -o examples/bin/work/20news-bydate/bayes-model \
+  -i ${WORK_DIR}/20news-bydate/bayes-train-input \
+  -o ${WORK_DIR}/20news-bydate/bayes-model \
   -type bayes \
   -ng 1 \
   -source hdfs
 
 ./bin/mahout testclassifier \
-  -m examples/bin/work/20news-bydate/bayes-model \
-  -d examples/bin/work/20news-bydate/bayes-test-input \
+  -m ${WORK_DIR}/20news-bydate/bayes-model \
+  -d ${WORK_DIR}/20news-bydate/bayes-test-input \
   -type bayes \
   -ng 1 \
   -source hdfs \
   -method ${TEST_METHOD}
+
+# Remove the work directory
+rm -rf ${WORK_DIR}

Modified: mahout/trunk/examples/bin/build-cluster-syntheticcontrol.sh
URL: http://svn.apache.org/viewvc/mahout/trunk/examples/bin/build-cluster-syntheticcontrol.sh?rev=1170702&r1=1170701&r2=1170702&view=diff
==============================================================================
--- mahout/trunk/examples/bin/build-cluster-syntheticcontrol.sh (original)
+++ mahout/trunk/examples/bin/build-cluster-syntheticcontrol.sh Wed Sep 14 16:12:53 2011
@@ -40,10 +40,14 @@ else
 fi
 
 cd examples/bin/
-mkdir -p work
-if [ ! -f work/synthetic_control.data ]; then
+
+WORK_DIR=/tmp/mahout-work-${USER}
+
+echo "creating work directory at ${WORK_DIR}"
+mkdir -p ${WORK_DIR}
+if [ ! -f ${WORK_DIR}/synthetic_control.data ]; then
   echo "Downloading Synthetic control data"
-  curl http://archive.ics.uci.edu/ml/databases/synthetic_control/synthetic_control.data -o work/synthetic_control.data
+  curl http://archive.ics.uci.edu/ml/databases/synthetic_control/synthetic_control.data -o ${WORK_DIR}/synthetic_control.data
 fi
 
 if [ "$HADOOP_HOME" != "" ]; then
@@ -54,7 +58,7 @@ if [ "$HADOOP_HOME" != "" ]; then
     echo "Uploading Synthetic control data to HDFS"
     $HADOOP_HOME/bin/hadoop fs -rmr testdata
     $HADOOP_HOME/bin/hadoop fs -mkdir testdata
-    $HADOOP_HOME/bin/hadoop fs -put work/synthetic_control.data testdata
+    $HADOOP_HOME/bin/hadoop fs -put ${WORK_DIR}/synthetic_control.data testdata
     echo "Successfully Uploaded Synthetic control data to HDFS "
 
     ../../bin/mahout org.apache.mahout.clustering.syntheticcontrol."${clustertype}".Job
@@ -63,4 +67,7 @@ if [ "$HADOOP_HOME" != "" ]; then
   fi
 else
   echo " HADOOP_HOME variable is not set. Please set this environment variable and rerun the script"
-fi
\ No newline at end of file
+fi
+
+# Remove the work directory
+rm -rf ${WORK_DIR}

Modified: mahout/trunk/examples/bin/build-reuters.sh
URL: http://svn.apache.org/viewvc/mahout/trunk/examples/bin/build-reuters.sh?rev=1170702&r1=1170701&r2=1170702&view=diff
==============================================================================
--- mahout/trunk/examples/bin/build-reuters.sh (original)
+++ mahout/trunk/examples/bin/build-reuters.sh Wed Sep 14 16:12:53 2011
@@ -48,29 +48,32 @@ else
   clustertype=${algorithm[$choice-1]} 
 fi
 
-mkdir -p mahout-work
+WORK_DIR=/tmp/mahout-work-${USER}
+echo "creating work directory at ${WORK_DIR}"
 
-if [ ! -e mahout-work/reuters-out-seqdir ]; then
-    if [ ! -e mahout-work/reuters-out ]; then
-	if [ ! -e mahout-work/reuters-sgm ]; then
-	    if [ ! -f mahout-work/reuters21578.tar.gz ]; then
+mkdir -p ${WORK_DIR}
+
+if [ ! -e ${WORK_DIR}/reuters-out-seqdir ]; then
+    if [ ! -e ${WORK_DIR}/reuters-out ]; then
+	if [ ! -e ${WORK_DIR}/reuters-sgm ]; then
+	    if [ ! -f ${WORK_DIR}/reuters21578.tar.gz ]; then
 		echo "Downloading Reuters-21578"
 		curl http://kdd.ics.uci.edu/databases/reuters21578/reuters21578.tar.gz \
-                     -o mahout-work/reuters21578.tar.gz
+                     -o ${WORK_DIR}/reuters21578.tar.gz
 	    fi
-	    mkdir -p mahout-work/reuters-sgm
+	    mkdir -p ${WORK_DIR}/reuters-sgm
 	    echo "Extracting..."
-	    cd mahout-work/reuters-sgm && tar xzf ../reuters21578.tar.gz && cd .. && cd ..
+	    cd ${WORK_DIR}/reuters-sgm && tar xzf ../reuters21578.tar.gz && cd .. && cd ..
 	fi
 	
 	$MAHOUT org.apache.lucene.benchmark.utils.ExtractReuters \
-	    mahout-work/reuters-sgm \
-	    mahout-work/reuters-out 
+	    ${WORK_DIR}/reuters-sgm \
+	    ${WORK_DIR}/reuters-out 
     fi
 
     MAHOUT_LOCAL=true $MAHOUT seqdirectory \
-        -i mahout-work/reuters-out \
-        -o mahout-work/reuters-out-seqdir \
+        -i ${WORK_DIR}/reuters-out \
+        -o ${WORK_DIR}/reuters-out-seqdir \
         -c UTF-8 -chunk 5
 fi
 
@@ -86,42 +89,45 @@ if [ "$HADOOP_HOME" != "" ] && [ "$MAHOU
 
     set +e
     $HADOOP dfs -rmr \
-        mahout-work/reuters-out-seqdir
+        ${WORK_DIR}/reuters-out-seqdir
     set -e
     $HADOOP dfs -put \
-        mahout-work/reuters-out-seqdir \
-        mahout-work/reuters-out-seqdir
+        ${WORK_DIR}/reuters-out-seqdir \
+        ${WORK_DIR}/reuters-out-seqdir
 fi
 
 if [ "x$clustertype" == "xkmeans" ]; then
   $MAHOUT seq2sparse \
-    -i mahout-work/reuters-out-seqdir/ \
-    -o mahout-work/reuters-out-seqdir-sparse-kmeans \
+    -i ${WORK_DIR}/reuters-out-seqdir/ \
+    -o ${WORK_DIR}/reuters-out-seqdir-sparse-kmeans \
   && \
   $MAHOUT kmeans \
-    -i mahout-work/reuters-out-seqdir-sparse-kmeans/tfidf-vectors/ \
-    -c mahout-work/reuters-kmeans-clusters \
-    -o mahout-work/reuters-kmeans \
+    -i ${WORK_DIR}/reuters-out-seqdir-sparse-kmeans/tfidf-vectors/ \
+    -c ${WORK_DIR}/reuters-kmeans-clusters \
+    -o ${WORK_DIR}/reuters-kmeans \
     -x 10 -k 20 -ow \
   && \
   $MAHOUT clusterdump \
-    -s mahout-work/reuters-kmeans/clusters-10 \
-    -d mahout-work/reuters-out-seqdir-sparse-kmeans/dictionary.file-0 \
+    -s ${WORK_DIR}/reuters-kmeans/clusters-10 \
+    -d ${WORK_DIR}/reuters-out-seqdir-sparse-kmeans/dictionary.file-0 \
     -dt sequencefile -b 100 -n 20
 elif [ "x$clustertype" == "xlda" ]; then
   $MAHOUT seq2sparse \
-    -i mahout-work/reuters-out-seqdir/ \
-    -o mahout-work/reuters-out-seqdir-sparse-lda \
+    -i ${WORK_DIR}/reuters-out-seqdir/ \
+    -o ${WORK_DIR}/reuters-out-seqdir-sparse-lda \
     -wt tf -seq -nr 3 \
   && \
   $MAHOUT lda \
-    -i mahout-work/reuters-out-seqdir-sparse-lda/tf-vectors \
-    -o mahout-work/reuters-lda -k 20 -v 50000 -ow -x 20 \
+    -i ${WORK_DIR}/reuters-out-seqdir-sparse-lda/tf-vectors \
+    -o ${WORK_DIR}/reuters-lda -k 20 -v 50000 -ow -x 20 \
   && \
   $MAHOUT ldatopics \
-    -i mahout-work/reuters-lda/state-20 \
-    -d mahout-work/reuters-out-seqdir-sparse-lda/dictionary.file-0 \
+    -i ${WORK_DIR}/reuters-lda/state-20 \
+    -d ${WORK_DIR}/reuters-out-seqdir-sparse-lda/dictionary.file-0 \
     -dt sequencefile
 else 
   echo "unknown cluster type: $clustertype";
 fi 
+
+# Remove the work directory
+rm -rf ${WORK_DIR}

Modified: mahout/trunk/examples/bin/factorize-movielens-1M.sh
URL: http://svn.apache.org/viewvc/mahout/trunk/examples/bin/factorize-movielens-1M.sh?rev=1170702&r1=1170701&r2=1170702&view=diff
==============================================================================
--- mahout/trunk/examples/bin/factorize-movielens-1M.sh (original)
+++ mahout/trunk/examples/bin/factorize-movielens-1M.sh Wed Sep 14 16:12:53 2011
@@ -32,28 +32,29 @@ then
   exit -1
 fi
 
-echo "creating work directory"
-mkdir -p work/movielens
+WORK_DIR=/tmp/mahout-work-${USER}
+echo "creating work directory at ${WORK_DIR}"
+mkdir -p ${WORK_DIR}/movielens
 
 echo "Converting ratings..."
-cat $1 |sed -e s/::/,/g| cut -d, -f1,2,3 > work/movielens/ratings.csv
+cat $1 |sed -e s/::/,/g| cut -d, -f1,2,3 > ${WORK_DIR}/movielens/ratings.csv
 
 #create a 90% percent training set and a 10% probe set
-bin/mahout splitDataset --input work/movielens/ratings.csv --output work/dataset \
-    --trainingPercentage 0.9 --probePercentage 0.1 --tempDir work/dataset/tmp
+bin/mahout splitDataset --input ${WORK_DIR}/movielens/ratings.csv --output ${WORK_DIR}/dataset \
+    --trainingPercentage 0.9 --probePercentage 0.1 --tempDir ${WORK_DIR}/dataset/tmp
 
 #run distributed ALS-WR to factorize the rating matrix based on the training set
-bin/mahout parallelALS --input work/dataset/trainingSet/ --output work/als/out \
-    --tempDir work/als/tmp --numFeatures 20 --numIterations 10 --lambda 0.065
+bin/mahout parallelALS --input ${WORK_DIR}/dataset/trainingSet/ --output ${WORK_DIR}/als/out \
+    --tempDir ${WORK_DIR}/als/tmp --numFeatures 20 --numIterations 10 --lambda 0.065
 
 # compute predictions against the probe set, measure the error
-bin/mahout evaluateFactorizationParallel --output work/als/rmse --pairs work/dataset/probeSet/ \
-    --userFeatures work/als/out/U/ --itemFeatures work/als/out/M/
+bin/mahout evaluateFactorizationParallel --output ${WORK_DIR}/als/rmse --pairs ${WORK_DIR}/dataset/probeSet/ \
+    --userFeatures ${WORK_DIR}/als/out/U/ --itemFeatures ${WORK_DIR}/als/out/M/
 
 # print the error
 echo -e "\nRMSE is:\n"
-cat work/als/rmse/rmse.txt
+cat ${WORK_DIR}/als/rmse/rmse.txt
 echo -e "\n\n"
 
 echo "removing work directory"
-rm -rf work
\ No newline at end of file
+rm -rf ${WORK_DIR}
\ No newline at end of file



Mime
View raw message