mahout-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From gsing...@apache.org
Subject svn commit: r1211812 - /mahout/trunk/examples/bin/asf-email-examples.sh
Date Thu, 08 Dec 2011 10:41:59 GMT
Author: gsingers
Date: Thu Dec  8 10:41:58 2011
New Revision: 1211812

URL: http://svn.apache.org/viewvc?rev=1211812&view=rev
Log:
MAHOUT-837: make ASF examples HDFS aware

Modified:
    mahout/trunk/examples/bin/asf-email-examples.sh

Modified: mahout/trunk/examples/bin/asf-email-examples.sh
URL: http://svn.apache.org/viewvc/mahout/trunk/examples/bin/asf-email-examples.sh?rev=1211812&r1=1211811&r2=1211812&view=diff
==============================================================================
--- mahout/trunk/examples/bin/asf-email-examples.sh (original)
+++ mahout/trunk/examples/bin/asf-email-examples.sh Thu Dec  8 10:41:58 2011
@@ -16,6 +16,24 @@
 # limitations under the License.
 #
 
+function fileExists() {
+  if ([ "$MAHOUT_LOCAL" != "" ] && [ ! -e "$1" ]) || ([ "$MAHOUT_LOCAL" == "" ] && ! hadoop fs -test -e /user/$USER/$1); then
+    return 1 # file doesn't exist
+  else
+    return 0 # file exists
+  fi
+}
+
+function removeFolder() {
+  if [ "$MAHOUT_LOCAL" != "" ]; then
+    rm -rf $1
+  else
+    if fileExists "$1"; then
+      hadoop fs -rmr /user/$USER/$1
+    fi
+  fi	
+}
+
 if [ "$1" = "--help" ] || [ "$1" = "--?" ]; then
   echo "This script runs recommendation, classification and clustering of the ASF Email Public Dataset, as hosted on Amazon (http://aws.amazon.com/datasets/7791434387204566).  Requires download."
   exit
@@ -53,7 +71,7 @@ alg=${algorithm[$choice-1]}
 if [ "x$alg" == "xrecommender" ]; then
   # convert the mail to seq files
   MAIL_OUT="$OUT/prefs/seq-files"
-  if [ ! -e "$MAIL_OUT/chunk-0" ]; then
+  if ! fileExists "$MAIL_OUT/chunk-0"; then
     echo "Converting Mail files to Sequence Files"
     $MAHOUT org.apache.mahout.text.SequenceFilesFromMailArchives --charset "UTF-8" --from --references --input $ASF_ARCHIVES --output $MAIL_OUT --separator " ::: "
   fi
@@ -62,10 +80,12 @@ if [ "x$alg" == "xrecommender" ]; then
   PREFS_REC_INPUT="$OUT/prefs/input/recInput"
   RECS_OUT=$"$OUT/prefs/recommendations"
   # prep for recs
-  if [ ! -e "$PREFS/fromIds-dictionary-0" ]; then
+  if ! fileExists "$PREFS/fromIds-dictionary-0"; then
     echo "Prepping Sequence files for Recommender"
     $MAHOUT org.apache.mahout.cf.taste.example.email.MailToPrefsDriver --input $MAIL_OUT --output $PREFS --overwrite --separator " ::: "
   fi
+  removeFolder "$PREFS_TMP"
+  removeFolder "$RECS_OUT"
   # run the recs
   echo "Run the recommender"
   $MAHOUT recommenditembased --input $PREFS_REC_INPUT --output $RECS_OUT --tempDir $PREFS_TMP --similarityClassname SIMILARITY_LOGLIKELIHOOD
@@ -96,14 +116,14 @@ elif [ "x$alg" == "xclustering" ]; then
       read -p "Enter your choice : " numClusters
     fi
   fi
-  if [ ! -e "$MAIL_OUT/chunk-0" ]; then
+  if ! fileExists "$MAIL_OUT/chunk-0"; then
     echo "Converting Mail files to Sequence Files"
     $MAHOUT org.apache.mahout.text.SequenceFilesFromMailArchives --charset "UTF-8" --subject --body --input $ASF_ARCHIVES --output $MAIL_OUT
   fi
 
   #convert to sparse vectors -- use the 2 norm (Euclidean distance) and lop of some of the common terms
 
-  if [ ! -e "$SEQ2SP/dictionary.file-0" ]; then
+  if ! fileExists "$SEQ2SP/dictionary.file-0"; then
     echo "Converting the files to sparse vectors"
     $MAHOUT seq2sparse --input $MAIL_OUT --output $SEQ2SP --norm 2 --weight TFIDF --namedVector --maxDFPercent 90 --minSupport 2 --analyzerName org.apache.mahout.text.MailArchivesClusteringAnalyzer
   fi
@@ -161,18 +181,18 @@ elif [ "x$alg" == "xclassification" ]; t
     TEST="$SPLIT/test"
     TEST_OUT="$CLASS/test-results"
     LABEL="$SPLIT/labels"
-    if [ ! -e "$MAIL_OUT/chunk-0" ]; then
+    if ! fileExists "$MAIL_OUT/chunk-0"; then
       echo "Converting Mail files to Sequence Files"
       $MAHOUT org.apache.mahout.text.SequenceFilesFromMailArchives --charset "UTF-8" --subject --body --input $ASF_ARCHIVES --output $MAIL_OUT
     fi
-    if [ ! -e "$SEQ2SP/dictionary.file-0" ]; then
+    if ! fileExists "$SEQ2SP/dictionary.file-0"; then
       echo "Converting the files to sparse vectors"
       $MAHOUT seq2sparse --input $MAIL_OUT --output $SEQ2SP --norm 2 --weight TFIDF --namedVector --maxDFPercent 90 --minSupport 2 --analyzerName org.apache.mahout.text.MailArchivesClusteringAnalyzer
       #We need to modify the vectors to have a better label
       echo "Converting vector labels"
       $MAHOUT org.apache.mahout.classifier.email.PrepEmailVectorsDriver --input "$SEQ2SP/tfidf-vectors" --output $SEQ2SPLABEL --overwrite --maxItemsPerLabel 1000
     fi
-    if [ ! -e "$TRAIN/part-m-00000" ]; then
+    if ! fileExists "$TRAIN/part-m-00000"; then
       #setup train/test files
       echo "Creating training and test inputs"
       $MAHOUT split --input $SEQ2SPLABEL --trainingOutput $TRAIN --testOutput $TEST --randomSelectionPct 20 --overwrite --sequenceFiles
@@ -201,18 +221,18 @@ elif [ "x$alg" == "xclassification" ]; t
     TEST_OUT="$CLASS/test-results"
     MODELS="$CLASS/models"
     LABEL="$SPLIT/labels"
-    if [ ! -e "$MAIL_OUT/chunk-0" ]; then
+    if ! fileExists "$MAIL_OUT/chunk-0"; then
       echo "Converting Mail files to Sequence Files"
       $MAHOUT org.apache.mahout.text.SequenceFilesFromMailArchives --charset "UTF-8" --subject --body --input $ASF_ARCHIVES --output $MAIL_OUT
     fi
     echo "Converting the files to sparse vectors in $SEQ2SP"
-    if [ ! -e "$SEQ2SP/part-m-00000" ]; then
+    if ! fileExists "$SEQ2SP/part-m-00000"; then
       $MAHOUT seq2encoded --input $MAIL_OUT --output $SEQ2SP --analyzerName org.apache.mahout.text.MailArchivesClusteringAnalyzer --cardinality 20000
     fi
     #We need to modify the vectors to have a better label
     echo "Converting vector labels"
     $MAHOUT org.apache.mahout.classifier.email.PrepEmailVectorsDriver --input "$SEQ2SP" --output $SEQ2SPLABEL --overwrite
-    if [ ! -e "$TRAIN/part-m-00000" ]; then
+    if ! fileExists "$TRAIN/part-m-00000"; then
       #setup train/test files
       echo "Creating training and test inputs from $SEQ2SPLABEL"
       $MAHOUT split --input $SEQ2SPLABEL --trainingOutput $TRAIN --testOutput $TEST --randomSelectionPct 20 --overwrite --sequenceFiles
@@ -231,7 +251,7 @@ elif [ "x$alg" == "xclean" ]; then
   read -p "Enter your choice (y/n): " answer
   if [ "x$answer" == "xy" ] || [ "x$answer" == "xY" ]; then
     echo "Cleaning out $OUT";
-    rm -rf "$OUT"
+	removeFolder "$OUT"
   fi
 fi
 



Mime
View raw message