mahout-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From gsing...@apache.org
Subject svn commit: r1183379 - in /mahout/trunk: examples/src/main/java/org/apache/mahout/cf/taste/example/email/ integration/src/main/java/org/apache/mahout/text/ integration/src/main/java/org/apache/mahout/utils/email/
Date Fri, 14 Oct 2011 14:58:05 GMT
Author: gsingers
Date: Fri Oct 14 14:58:05 2011
New Revision: 1183379

URL: http://svn.apache.org/viewvc?rev=1183379&view=rev
Log:
MAHOUT-798: fix recommender content extraction from email

Modified:
    mahout/trunk/examples/src/main/java/org/apache/mahout/cf/taste/example/email/EmailUtility.java
    mahout/trunk/examples/src/main/java/org/apache/mahout/cf/taste/example/email/MailToPrefsDriver.java
    mahout/trunk/examples/src/main/java/org/apache/mahout/cf/taste/example/email/MailToRecMapper.java
    mahout/trunk/integration/src/main/java/org/apache/mahout/text/SequenceFilesFromMailArchives.java
    mahout/trunk/integration/src/main/java/org/apache/mahout/utils/email/MailOptions.java

Modified: mahout/trunk/examples/src/main/java/org/apache/mahout/cf/taste/example/email/EmailUtility.java
URL: http://svn.apache.org/viewvc/mahout/trunk/examples/src/main/java/org/apache/mahout/cf/taste/example/email/EmailUtility.java?rev=1183379&r1=1183378&r2=1183379&view=diff
==============================================================================
--- mahout/trunk/examples/src/main/java/org/apache/mahout/cf/taste/example/email/EmailUtility.java (original)
+++ mahout/trunk/examples/src/main/java/org/apache/mahout/cf/taste/example/email/EmailUtility.java Fri Oct 14 14:58:05 2011
@@ -23,6 +23,8 @@ public final class EmailUtility {
   public static final String MSG_IDS_PREFIX = "msgIdsPrefix";
   public static final String FROM_PREFIX = "fromPrefix";
   public static final String MSG_ID_DIMENSION = "msgIdDim";
+  public static final String FROM_INDEX = "fromIdx";
+  public static final String REFS_INDEX = "refsIdx";
 
   private EmailUtility() {
 

Modified: mahout/trunk/examples/src/main/java/org/apache/mahout/cf/taste/example/email/MailToPrefsDriver.java
URL: http://svn.apache.org/viewvc/mahout/trunk/examples/src/main/java/org/apache/mahout/cf/taste/example/email/MailToPrefsDriver.java?rev=1183379&r1=1183378&r2=1183379&view=diff
==============================================================================
--- mahout/trunk/examples/src/main/java/org/apache/mahout/cf/taste/example/email/MailToPrefsDriver.java (original)
+++ mahout/trunk/examples/src/main/java/org/apache/mahout/cf/taste/example/email/MailToPrefsDriver.java Fri Oct 14 14:58:05 2011
@@ -57,7 +57,7 @@ import java.util.concurrent.atomic.Atomi
  * file that can be consumed by the {@link org.apache.mahout.cf.taste.hadoop.pseudo.RecommenderJob}.
  * <p/>
  * This assumes the input is a Sequence File, that the key is: filename/message id and the
value is a list (separated by the
- * user's choosing) of: from, to, subject
+ * user's choosing) containing the from email and any references
  * <p/>
  * The output is a matrix where either the from or to are the rows (represented as longs)
and the columns are the message ids
  * that the user has interacted with (as a VectorWritable).  This class currently does not
account for thread hijacking.
@@ -83,6 +83,8 @@ public class MailToPrefsDriver extends A
     addOption(DefaultOptionCreator.overwriteOption().create());
     addOption("chunkSize", "cs", "The size of chunks to write.  Default is 100 mb", "100");
     addOption("separator", "sep", "The separator used in the input file to separate to, from,
subject.  Default is \\n", "\n");
+    addOption("from", "f", "The position in the input text (value) where the from email is
located, starting from zero (0).", "0");
+    addOption("refs", "r", "The position in the input text (value) where the reference ids
are located, starting from zero (0).", "1");
     Map<String, String> parsedArgs = parseArguments(args);
 
     Path input = getInputPath();
@@ -159,6 +161,8 @@ public class MailToPrefsDriver extends A
       conf.set(EmailUtility.MSG_ID_DIMENSION, String.valueOf(msgDim[0]));
       conf.set(EmailUtility.FROM_PREFIX, "fromIds-dictionary-");
       conf.set(EmailUtility.MSG_IDS_PREFIX, "msgIds-dictionary-");
+      conf.set(EmailUtility.FROM_INDEX, parsedArgs.get("--from"));
+      conf.set(EmailUtility.REFS_INDEX, parsedArgs.get("--refs"));
       conf.set(EmailUtility.SEPARATOR, separator);
       for (Path fromChunk : fromChunks) {
         for (Path idChunk : msgIdChunks) {

Modified: mahout/trunk/examples/src/main/java/org/apache/mahout/cf/taste/example/email/MailToRecMapper.java
URL: http://svn.apache.org/viewvc/mahout/trunk/examples/src/main/java/org/apache/mahout/cf/taste/example/email/MailToRecMapper.java?rev=1183379&r1=1183378&r2=1183379&view=diff
==============================================================================
--- mahout/trunk/examples/src/main/java/org/apache/mahout/cf/taste/example/email/MailToRecMapper.java (original)
+++ mahout/trunk/examples/src/main/java/org/apache/mahout/cf/taste/example/email/MailToRecMapper.java Fri Oct 14 14:58:05 2011
@@ -39,6 +39,8 @@ public class MailToRecMapper extends
   private OpenObjectIntHashMap<String> fromDictionary = new OpenObjectIntHashMap<String>();
   private OpenObjectIntHashMap<String> msgIdDictionary = new OpenObjectIntHashMap<String>();
   private String separator = "\n";
+  protected int fromIdx;
+  protected int refsIdx;
 
   public enum Counters {
     REFERENCE, ORIGINAL
@@ -49,6 +51,8 @@ public class MailToRecMapper extends
     Configuration conf = context.getConfiguration();
     String fromPrefix = conf.get(EmailUtility.FROM_PREFIX);
     String msgPrefix = conf.get(EmailUtility.MSG_IDS_PREFIX);
+    fromIdx = conf.getInt(EmailUtility.FROM_INDEX, 0);
+    refsIdx = conf.getInt(EmailUtility.REFS_INDEX, 1);
     EmailUtility.loadDictionaries(conf, fromPrefix, fromDictionary, msgPrefix, msgIdDictionary);
     log.info("From Dictionary size: {} Msg Id Dictionary size: {}", fromDictionary.size(),
msgIdDictionary.size());
     separator = context.getConfiguration().get(EmailUtility.SEPARATOR);
@@ -64,14 +68,15 @@ public class MailToRecMapper extends
     int fromKey = Integer.MIN_VALUE;
     String valStr = value.toString();
     String[] splits = StringUtils.splitByWholeSeparatorPreserveAllTokens(valStr, separator);
-    //format is:  from, to, refs, subject, body
 
     if (splits != null && splits.length > 0) {
-      String from = EmailUtility.cleanUpEmailAddress(splits[0]);
-      fromKey = fromDictionary.get(from);
+      if (splits.length > refsIdx){
+        String from = EmailUtility.cleanUpEmailAddress(splits[fromIdx]);
+        fromKey = fromDictionary.get(from);
+      }
       //get the references
-      if (splits.length > 2) {
-        String[] theRefs = EmailUtility.parseReferences(splits[2]);
+      if (splits.length > refsIdx) {
+        String[] theRefs = EmailUtility.parseReferences(splits[refsIdx]);
         if (theRefs != null && theRefs.length > 0) {
           //we have a reference, the first one is the original message id, so map to that
one if it exists
           msgIdKey = msgIdDictionary.get(theRefs[0]);

Modified: mahout/trunk/integration/src/main/java/org/apache/mahout/text/SequenceFilesFromMailArchives.java
URL: http://svn.apache.org/viewvc/mahout/trunk/integration/src/main/java/org/apache/mahout/text/SequenceFilesFromMailArchives.java?rev=1183379&r1=1183378&r2=1183379&view=diff
==============================================================================
--- mahout/trunk/integration/src/main/java/org/apache/mahout/text/SequenceFilesFromMailArchives.java (original)
+++ mahout/trunk/integration/src/main/java/org/apache/mahout/text/SequenceFilesFromMailArchives.java Fri Oct 14 14:58:05 2011
@@ -41,7 +41,9 @@ import java.io.FileFilter;
 import java.io.IOException;
 import java.nio.charset.Charset;
 import java.util.ArrayList;
+import java.util.HashMap;
 import java.util.List;
+import java.util.Map;
 import java.util.regex.Pattern;
 
 /**
@@ -189,22 +191,30 @@ public final class SequenceFilesFromMail
       options.chunkSize = chunkSize;
       options.charset = charset;
 
-      //If this order changes, must change FromEmailToDictionaryMapper, potentially, as it
expects From to be first
-      List<Pattern> patterns = new ArrayList<Pattern>();
-      //new Pattern[]{MailProcessor.FROM_PREFIX, MailProcessor.TO_PREFIX, MailProcessor.REFS_PREFIX,
MailProcessor.SUBJECT_PREFIX, };
+
+      List<Pattern> patterns = new ArrayList<Pattern>(5);
+      //patternOrder is used downstream so that we can know what order the text is in instead
of encoding it in the string, which
+      //would require more processing later to remove it pre feature selection.
+      Map<String, Integer> patternOrder = new HashMap<String, Integer>();
+      int order = 0;
       if (cmdLine.hasOption(fromOpt)) {
         patterns.add(MailProcessor.FROM_PREFIX);
+        patternOrder.put(MailOptions.FROM, order++);
       }
       if (cmdLine.hasOption(toOpt)) {
         patterns.add(MailProcessor.TO_PREFIX);
+        patternOrder.put(MailOptions.TO, order++);
       }
       if (cmdLine.hasOption(refsOpt)) {
         patterns.add(MailProcessor.REFS_PREFIX);
+        patternOrder.put(MailOptions.REFS, order++);
       }
       if (cmdLine.hasOption(subjectOpt)) {
         patterns.add(MailProcessor.SUBJECT_PREFIX);
+        patternOrder.put(MailOptions.SUBJECT, order++);
       }
       options.patternsToMatch = patterns.toArray(new Pattern[patterns.size()]);
+      options.patternOrder = patternOrder;
       options.includeBody = cmdLine.hasOption(bodyOpt);
       options.separator = "\n";
       if (cmdLine.hasOption(separatorOpt)) {

Modified: mahout/trunk/integration/src/main/java/org/apache/mahout/utils/email/MailOptions.java
URL: http://svn.apache.org/viewvc/mahout/trunk/integration/src/main/java/org/apache/mahout/utils/email/MailOptions.java?rev=1183379&r1=1183378&r2=1183379&view=diff
==============================================================================
--- mahout/trunk/integration/src/main/java/org/apache/mahout/utils/email/MailOptions.java (original)
+++ mahout/trunk/integration/src/main/java/org/apache/mahout/utils/email/MailOptions.java Fri Oct 14 14:58:05 2011
@@ -21,6 +21,7 @@ package org.apache.mahout.utils.email;
 
 import java.io.File;
 import java.nio.charset.Charset;
+import java.util.Map;
 import java.util.regex.Pattern;
 
 /**
@@ -28,7 +29,10 @@ import java.util.regex.Pattern;
 *
 **/
 public class MailOptions {
-
+  public static final String FROM = "FROM";
+  public static final String TO = "TO";
+  public static final String REFS = "REFS";
+  public static final String SUBJECT = "SUBJECT";
   public File input;
   public String outputDir;
   public String prefix;
@@ -38,5 +42,6 @@ public class MailOptions {
   public String bodySeparator = "\n";
   public boolean includeBody;
   public Pattern[] patternsToMatch;
-
+  //maps FROM, TO, REFS, SUBJECT, etc. to the order they appear in patternsToMatch.  See
MailToRecMapper
+  public Map<String, Integer> patternOrder;
 }



Mime
View raw message