mahout-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From s..@apache.org
Subject svn commit: r1595643 - in /mahout/trunk: ./ mrlegacy/src/main/java/org/apache/mahout/common/ mrlegacy/src/main/java/org/apache/mahout/vectorizer/ mrlegacy/src/main/java/org/apache/mahout/vectorizer/term/ mrlegacy/src/main/java/org/apache/mahout/vectori...
Date Sun, 18 May 2014 17:35:16 GMT
Author: ssc
Date: Sun May 18 17:35:16 2014
New Revision: 1595643

URL: http://svn.apache.org/r1595643
Log:
MAHOUT-1498 DistributedCache.setCacheFiles in DictionaryVectorizer overwrites jars pushed using oozie

Added:
    mahout/trunk/mrlegacy/src/test/java/org/apache/mahout/common/DistributedCacheFileLocationTest.java
Modified:
    mahout/trunk/CHANGELOG
    mahout/trunk/mrlegacy/src/main/java/org/apache/mahout/common/HadoopUtil.java
    mahout/trunk/mrlegacy/src/main/java/org/apache/mahout/vectorizer/DictionaryVectorizer.java
    mahout/trunk/mrlegacy/src/main/java/org/apache/mahout/vectorizer/term/TFPartialVectorReducer.java
    mahout/trunk/mrlegacy/src/main/java/org/apache/mahout/vectorizer/tfidf/TFIDFConverter.java
    mahout/trunk/mrlegacy/src/main/java/org/apache/mahout/vectorizer/tfidf/TFIDFPartialVectorReducer.java

Modified: mahout/trunk/CHANGELOG
URL: http://svn.apache.org/viewvc/mahout/trunk/CHANGELOG?rev=1595643&r1=1595642&r2=1595643&view=diff
==============================================================================
--- mahout/trunk/CHANGELOG (original)
+++ mahout/trunk/CHANGELOG Sun May 18 17:35:16 2014
@@ -2,6 +2,8 @@ Mahout Change Log
 
 Release 1.0 - unreleased
 
+  MAHOUT-1498: DistributedCache.setCacheFiles in DictionaryVectorizer overwrites jars pushed using oozie (Sergey via ssc)
+
   MAHOUT-1385: Caching Encoders don't cache (Johannes Schulte, Manoj Awasthi via ssc)
 
   MAHOUT-1527: Fix wikipedia classifier example (Andrew Palumbo via ssc)

Modified: mahout/trunk/mrlegacy/src/main/java/org/apache/mahout/common/HadoopUtil.java
URL: http://svn.apache.org/viewvc/mahout/trunk/mrlegacy/src/main/java/org/apache/mahout/common/HadoopUtil.java?rev=1595643&r1=1595642&r2=1595643&view=diff
==============================================================================
--- mahout/trunk/mrlegacy/src/main/java/org/apache/mahout/common/HadoopUtil.java (original)
+++ mahout/trunk/mrlegacy/src/main/java/org/apache/mahout/common/HadoopUtil.java Sun May 18
17:35:16 2014
@@ -421,4 +421,22 @@ public final class HadoopUtil {
     }
     return currentPath;
   }
+
+  /**
+   * Finds a file in the DistributedCache
+   *
+   * @param partOfFilename a substring of the file name
+   * @param localFiles holds references to files stored in distributed cache
+   * @return Path to first matched file or null if nothing was found
+   **/
+  public static Path findInCacheByPartOfFilename(String partOfFilename, URI[] localFiles) {
+    for (URI distCacheFile : localFiles) {
+      log.info("trying find a file in distributed cache containing [{}] in its name", partOfFilename);
+      if (distCacheFile != null && distCacheFile.toString().contains(partOfFilename)) {
+        log.info("found file [{}] containing [{}]", distCacheFile.toString(), partOfFilename);
+        return new Path(distCacheFile.getPath());
+      }
+    }
+    return null;
+  }
 }

Modified: mahout/trunk/mrlegacy/src/main/java/org/apache/mahout/vectorizer/DictionaryVectorizer.java
URL: http://svn.apache.org/viewvc/mahout/trunk/mrlegacy/src/main/java/org/apache/mahout/vectorizer/DictionaryVectorizer.java?rev=1595643&r1=1595642&r2=1595643&view=diff
==============================================================================
--- mahout/trunk/mrlegacy/src/main/java/org/apache/mahout/vectorizer/DictionaryVectorizer.java
(original)
+++ mahout/trunk/mrlegacy/src/main/java/org/apache/mahout/vectorizer/DictionaryVectorizer.java
Sun May 18 17:35:16 2014
@@ -17,11 +17,6 @@
 
 package org.apache.mahout.vectorizer;
 
-import java.io.IOException;
-import java.net.URI;
-import java.util.Collection;
-import java.util.List;
-
 import com.google.common.base.Preconditions;
 import com.google.common.collect.Lists;
 import com.google.common.io.Closeables;
@@ -59,6 +54,10 @@ import org.apache.mahout.vectorizer.term
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
+import java.io.IOException;
+import java.util.Collection;
+import java.util.List;
+
 /**
  * This class converts a set of input documents in the sequence file format to vectors. The
Sequence file
  * input should have a {@link Text} key containing the unique document identifier and a {@link
StringTuple}
@@ -72,8 +71,8 @@ public final class DictionaryVectorizer 
   public static final String MIN_SUPPORT = "min.support";
   public static final String MAX_NGRAMS = "max.ngrams";
   public static final int DEFAULT_MIN_SUPPORT = 2;
-  
-  private static final String DICTIONARY_FILE = "dictionary.file-";
+  public static final String DICTIONARY_FILE = "dictionary.file-";
+
   private static final int MAX_CHUNKSIZE = 10000;
   private static final int MIN_CHUNKSIZE = 100;
   private static final String OUTPUT_FILES_PATTERN = "part-*";
@@ -301,8 +300,8 @@ public final class DictionaryVectorizer 
     conf.setInt(PartialVectorMerger.DIMENSION, dimension);
     conf.setBoolean(PartialVectorMerger.SEQUENTIAL_ACCESS, sequentialAccess);
     conf.setBoolean(PartialVectorMerger.NAMED_VECTOR, namedVectors);
-    conf.setInt(MAX_NGRAMS, maxNGramSize);   
-    DistributedCache.setCacheFiles(new URI[] {dictionaryFilePath.toUri()}, conf);
+    conf.setInt(MAX_NGRAMS, maxNGramSize);
+    DistributedCache.addCacheFile(dictionaryFilePath.toUri(), conf);
     
     Job job = new Job(conf);
     job.setJobName("DictionaryVectorizer::MakePartialVectors: input-folder: " + input

Modified: mahout/trunk/mrlegacy/src/main/java/org/apache/mahout/vectorizer/term/TFPartialVectorReducer.java
URL: http://svn.apache.org/viewvc/mahout/trunk/mrlegacy/src/main/java/org/apache/mahout/vectorizer/term/TFPartialVectorReducer.java?rev=1595643&r1=1595642&r2=1595643&view=diff
==============================================================================
--- mahout/trunk/mrlegacy/src/main/java/org/apache/mahout/vectorizer/term/TFPartialVectorReducer.java
(original)
+++ mahout/trunk/mrlegacy/src/main/java/org/apache/mahout/vectorizer/term/TFPartialVectorReducer.java
Sun May 18 17:35:16 2014
@@ -19,6 +19,7 @@ package org.apache.mahout.vectorizer.ter
 
 import com.google.common.io.Closeables;
 import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.filecache.DistributedCache;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.io.IntWritable;
 import org.apache.hadoop.io.Text;
@@ -41,6 +42,7 @@ import org.apache.mahout.vectorizer.Dict
 import org.apache.mahout.vectorizer.common.PartialVectorMerger;
 
 import java.io.IOException;
+import java.net.URI;
 import java.util.Iterator;
 
 /**
@@ -51,11 +53,8 @@ public class TFPartialVectorReducer exte
   private final OpenObjectIntHashMap<String> dictionary = new OpenObjectIntHashMap<String>();
 
   private int dimension;
-
   private boolean sequentialAccess;
-
   private boolean namedVector;
-
   private int maxNGramSize = 1;
 
   @Override
@@ -120,8 +119,8 @@ public class TFPartialVectorReducer exte
     namedVector = conf.getBoolean(PartialVectorMerger.NAMED_VECTOR, false);
     maxNGramSize = conf.getInt(DictionaryVectorizer.MAX_NGRAMS, maxNGramSize);
 
-    //MAHOUT-1247
-    Path dictionaryFile = HadoopUtil.getSingleCachedFile(conf);
+    URI[] localFiles = DistributedCache.getCacheFiles(conf);
+    Path dictionaryFile = HadoopUtil.findInCacheByPartOfFilename(DictionaryVectorizer.DICTIONARY_FILE, localFiles);
     // key is word value is id
     for (Pair<Writable, IntWritable> record
             : new SequenceFileIterable<Writable, IntWritable>(dictionaryFile, true,
conf)) {

Modified: mahout/trunk/mrlegacy/src/main/java/org/apache/mahout/vectorizer/tfidf/TFIDFConverter.java
URL: http://svn.apache.org/viewvc/mahout/trunk/mrlegacy/src/main/java/org/apache/mahout/vectorizer/tfidf/TFIDFConverter.java?rev=1595643&r1=1595642&r2=1595643&view=diff
==============================================================================
--- mahout/trunk/mrlegacy/src/main/java/org/apache/mahout/vectorizer/tfidf/TFIDFConverter.java
(original)
+++ mahout/trunk/mrlegacy/src/main/java/org/apache/mahout/vectorizer/tfidf/TFIDFConverter.java
Sun May 18 17:35:16 2014
@@ -17,10 +17,6 @@
 
 package org.apache.mahout.vectorizer.tfidf;
 
-import java.io.IOException;
-import java.net.URI;
-import java.util.List;
-
 import com.google.common.base.Preconditions;
 import com.google.common.collect.Lists;
 import com.google.common.io.Closeables;
@@ -47,6 +43,9 @@ import org.apache.mahout.vectorizer.comm
 import org.apache.mahout.vectorizer.term.TermDocumentCountMapper;
 import org.apache.mahout.vectorizer.term.TermDocumentCountReducer;
 
+import java.io.IOException;
+import java.util.List;
+
 /**
  * This class converts a set of input vectors with term frequencies to TfIdf vectors. The
Sequence file input
  * should have a {@link org.apache.hadoop.io.WritableComparable} key containing and a
@@ -64,7 +63,7 @@ public final class TFIDFConverter {
   //public static final String TFIDF_OUTPUT_FOLDER = "tfidf";
 
   private static final String DOCUMENT_VECTOR_OUTPUT_FOLDER = "tfidf-vectors";
-  private static final String FREQUENCY_FILE = "frequency.file-";
+  public static final String FREQUENCY_FILE = "frequency.file-";
   private static final int MAX_CHUNKSIZE = 10000;
   private static final int MIN_CHUNKSIZE = 100;
   private static final String OUTPUT_FILES_PATTERN = "part-*";
@@ -75,8 +74,7 @@ public final class TFIDFConverter {
   /**
    * Cannot be initialized. Use the static functions
    */
-  private TFIDFConverter() {
-  }
+  private TFIDFConverter() {}
 
   /**
    * Create Term Frequency-Inverse Document Frequency (Tf-Idf) Vectors from the input set
of vectors in
@@ -299,7 +297,7 @@ public final class TFIDFConverter {
     conf.setLong(MAX_DF, maxDF);
     conf.setBoolean(PartialVectorMerger.SEQUENTIAL_ACCESS, sequentialAccess);
     conf.setBoolean(PartialVectorMerger.NAMED_VECTOR, namedVector);
-    DistributedCache.setCacheFiles(new URI[] {dictionaryFilePath.toUri()}, conf);
+    DistributedCache.addCacheFile(dictionaryFilePath.toUri(), conf);
 
     Job job = new Job(conf);
     job.setJobName(": MakePartialVectors: input-folder: " + input + ", dictionary-file: "

Modified: mahout/trunk/mrlegacy/src/main/java/org/apache/mahout/vectorizer/tfidf/TFIDFPartialVectorReducer.java
URL: http://svn.apache.org/viewvc/mahout/trunk/mrlegacy/src/main/java/org/apache/mahout/vectorizer/tfidf/TFIDFPartialVectorReducer.java?rev=1595643&r1=1595642&r2=1595643&view=diff
==============================================================================
--- mahout/trunk/mrlegacy/src/main/java/org/apache/mahout/vectorizer/tfidf/TFIDFPartialVectorReducer.java
(original)
+++ mahout/trunk/mrlegacy/src/main/java/org/apache/mahout/vectorizer/tfidf/TFIDFPartialVectorReducer.java
Sun May 18 17:35:16 2014
@@ -18,9 +18,11 @@
 package org.apache.mahout.vectorizer.tfidf;
 
 import java.io.IOException;
+import java.net.URI;
 import java.util.Iterator;
 
 import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.filecache.DistributedCache;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.io.IntWritable;
 import org.apache.hadoop.io.LongWritable;
@@ -39,25 +41,19 @@ import org.apache.mahout.vectorizer.TFID
 import org.apache.mahout.vectorizer.common.PartialVectorMerger;
 
 /**
- * Converts a document in to a sparse vector
+ * Converts a document into a sparse vector
  */
 public class TFIDFPartialVectorReducer extends
     Reducer<WritableComparable<?>, VectorWritable, WritableComparable<?>,
VectorWritable> {
 
   private final OpenIntLongHashMap dictionary = new OpenIntLongHashMap();
-
   private final TFIDF tfidf = new TFIDF();
 
   private int minDf = 1;
-
   private long maxDf = -1;
-
   private long vectorCount = 1;
-
   private long featureCount;
-
   private boolean sequentialAccess;
-
   private boolean namedVector;
   
   @Override
@@ -106,7 +102,8 @@ public class TFIDFPartialVectorReducer e
     sequentialAccess = conf.getBoolean(PartialVectorMerger.SEQUENTIAL_ACCESS, false);
     namedVector = conf.getBoolean(PartialVectorMerger.NAMED_VECTOR, false);
 
-    Path dictionaryFile = HadoopUtil.getSingleCachedFile(conf);
+    URI[] localFiles = DistributedCache.getCacheFiles(conf);
+    Path dictionaryFile = HadoopUtil.findInCacheByPartOfFilename(TFIDFConverter.FREQUENCY_FILE, localFiles);
     // key is feature, value is the document frequency
     for (Pair<IntWritable,LongWritable> record 
          : new SequenceFileIterable<IntWritable,LongWritable>(dictionaryFile, true,
conf)) {

Added: mahout/trunk/mrlegacy/src/test/java/org/apache/mahout/common/DistributedCacheFileLocationTest.java
URL: http://svn.apache.org/viewvc/mahout/trunk/mrlegacy/src/test/java/org/apache/mahout/common/DistributedCacheFileLocationTest.java?rev=1595643&view=auto
==============================================================================
--- mahout/trunk/mrlegacy/src/test/java/org/apache/mahout/common/DistributedCacheFileLocationTest.java
(added)
+++ mahout/trunk/mrlegacy/src/test/java/org/apache/mahout/common/DistributedCacheFileLocationTest.java
Sun May 18 17:35:16 2014
@@ -0,0 +1,46 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.common;
+
+import org.apache.hadoop.fs.Path;
+import org.junit.Test;
+
+import java.io.File;
+import java.net.URI;
+
+
+public class DistributedCacheFileLocationTest extends MahoutTestCase {
+
+  static final File FILE_I_WANT_TO_FIND = new File("file/i_want_to_find.txt");
+  static final URI[] DISTRIBUTED_CACHE_FILES = new URI[] {
+      new File("/first/file").toURI(), new File("/second/file").toURI(), FILE_I_WANT_TO_FIND.toURI() };
+
+  @Test
+  public void nonExistingFile() {
+    Path path = HadoopUtil.findInCacheByPartOfFilename("no such file", DISTRIBUTED_CACHE_FILES);
+    assertNull(path);
+  }
+
+  @Test
+  public void existingFile() {
+    Path path = HadoopUtil.findInCacheByPartOfFilename("want_to_find", DISTRIBUTED_CACHE_FILES);
+    assertNotNull(path);
+    assertEquals(FILE_I_WANT_TO_FIND.getName(), path.getName());
+  }
+
+}



Mime
View raw message