mahout-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From smar...@apache.org
Subject mahout git commit: MAHOUT-1619: HighDFWordsPruner overwrites cache files, this fixes #57
Date Tue, 31 Mar 2015 02:59:23 GMT
Repository: mahout
Updated Branches:
  refs/heads/master edec611f0 -> 5624a96c6


MAHOUT-1619: HighDFWordsPruner overwrites cache files, this fixes #57


Project: http://git-wip-us.apache.org/repos/asf/mahout/repo
Commit: http://git-wip-us.apache.org/repos/asf/mahout/commit/5624a96c
Tree: http://git-wip-us.apache.org/repos/asf/mahout/tree/5624a96c
Diff: http://git-wip-us.apache.org/repos/asf/mahout/diff/5624a96c

Branch: refs/heads/master
Commit: 5624a96c6da94ad1c95077dbbdab71409d85770c
Parents: edec611
Author: Suneel Marthi <suneel.marthi@gmail.com>
Authored: Mon Mar 30 22:56:18 2015 -0400
Committer: Suneel Marthi <suneel.marthi@gmail.com>
Committed: Mon Mar 30 23:00:50 2015 -0400

----------------------------------------------------------------------
 CHANGELOG                                                    | 2 ++
 .../java/org/apache/mahout/vectorizer/HighDFWordsPruner.java | 8 +++-----
 .../mahout/vectorizer/collocations/llr/CollocMapper.java     | 8 ++------
 3 files changed, 7 insertions(+), 11 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mahout/blob/5624a96c/CHANGELOG
----------------------------------------------------------------------
diff --git a/CHANGELOG b/CHANGELOG
index 55f5ca1..cfcd5f7 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -2,6 +2,8 @@ Mahout Change Log
 
 Release 0.10.0 - unreleased
 
+  MAHOUT-1619: HighDFWordsPruner overwrites cache files (Burke Webster, smarthi)
+
   MAHOUT-1516: classify-20newsgroups.sh failed: /tmp/mahout-work-jpan/20news-all does not exists in hdfs. (Jian Pan via apalumbo)
 
   MAHOUT-1559: Add documentation for and clean up the wikipedia classifier example (apalumbo)

http://git-wip-us.apache.org/repos/asf/mahout/blob/5624a96c/mrlegacy/src/main/java/org/apache/mahout/vectorizer/HighDFWordsPruner.java
----------------------------------------------------------------------
diff --git a/mrlegacy/src/main/java/org/apache/mahout/vectorizer/HighDFWordsPruner.java b/mrlegacy/src/main/java/org/apache/mahout/vectorizer/HighDFWordsPruner.java
index a0f4866..c3813c3 100644
--- a/mrlegacy/src/main/java/org/apache/mahout/vectorizer/HighDFWordsPruner.java
+++ b/mrlegacy/src/main/java/org/apache/mahout/vectorizer/HighDFWordsPruner.java
@@ -17,7 +17,6 @@
 
 package org.apache.mahout.vectorizer;
 
-import com.google.common.collect.Lists;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.filecache.DistributedCache;
 import org.apache.hadoop.fs.Path;
@@ -36,7 +35,7 @@ import org.apache.mahout.vectorizer.pruner.PrunedPartialVectorMergeReducer;
 import org.apache.mahout.vectorizer.pruner.WordsPrunerReducer;
 
 import java.io.IOException;
-import java.net.URI;
+import java.util.ArrayList;
 import java.util.List;
 
 public final class HighDFWordsPruner {
@@ -56,7 +55,7 @@ public final class HighDFWordsPruner {
                                  int numReducers) throws IOException, InterruptedException, ClassNotFoundException {
 
     int partialVectorIndex = 0;
-    List<Path> partialVectorPaths = Lists.newArrayList();
+    List<Path> partialVectorPaths = new ArrayList<>();
     for (Path path : docFrequenciesFeatures.getSecond()) {
       Path partialVectorOutputPath = new Path(prunedPartialTFDir, "partial-" + partialVectorIndex++);
       partialVectorPaths.add(partialVectorOutputPath);
@@ -79,8 +78,7 @@ public final class HighDFWordsPruner {
                     + "org.apache.hadoop.io.serializer.WritableSerialization");
     conf.setLong(MAX_DF, maxDF);
     conf.setLong(MIN_DF, minDF);
-    DistributedCache.setCacheFiles(
-            new URI[]{dictionaryFilePath.toUri()}, conf);
+    DistributedCache.addCacheFile(dictionaryFilePath.toUri(), conf);
 
     Job job = HadoopUtil.prepareJob(input, output, SequenceFileInputFormat.class,
             Mapper.class, null, null, WordsPrunerReducer.class,

http://git-wip-us.apache.org/repos/asf/mahout/blob/5624a96c/mrlegacy/src/main/java/org/apache/mahout/vectorizer/collocations/llr/CollocMapper.java
----------------------------------------------------------------------
diff --git a/mrlegacy/src/main/java/org/apache/mahout/vectorizer/collocations/llr/CollocMapper.java b/mrlegacy/src/main/java/org/apache/mahout/vectorizer/collocations/llr/CollocMapper.java
index 81f7ee4..fd99293 100644
--- a/mrlegacy/src/main/java/org/apache/mahout/vectorizer/collocations/llr/CollocMapper.java
+++ b/mrlegacy/src/main/java/org/apache/mahout/vectorizer/collocations/llr/CollocMapper.java
@@ -17,7 +17,6 @@
 
 package org.apache.mahout.vectorizer.collocations.llr;
 
-import com.google.common.io.Closeables;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.io.Text;
 import org.apache.hadoop.mapreduce.Mapper;
@@ -90,9 +89,8 @@ public class CollocMapper extends Mapper<Text, StringTuple, GramKey, Gram> {
   @Override
   protected void map(Text key, StringTuple value, final Context context) throws IOException, InterruptedException {
 
-    ShingleFilter sf = new ShingleFilter(new IteratorTokenStream(value.getEntries().iterator()), maxShingleSize);
-    sf.reset();
-    try {
+    try (ShingleFilter sf = new ShingleFilter(new IteratorTokenStream(value.getEntries().iterator()), maxShingleSize)){
+      sf.reset();
       int count = 0; // ngram count
 
       OpenObjectIntHashMap<String> ngrams =
@@ -160,8 +158,6 @@ public class CollocMapper extends Mapper<Text, StringTuple, GramKey, Gram> {
 
       context.getCounter(Count.NGRAM_TOTAL).increment(count);
       sf.end();
-    } finally {
-      Closeables.close(sf, true);
     }
   }
 


Mime
View raw message