mahout-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From smar...@apache.org
Subject mahout git commit: MAHOUT-1876: Upgrade lucene to 5.5.2 and fix compilation failures, this closes apache/mahout#248
Date Thu, 11 Aug 2016 05:44:22 GMT
Repository: mahout
Updated Branches:
  refs/heads/master 33c1eab11 -> 4d0cd66a6


MAHOUT-1876: Upgrade lucene to 5.5.2 and fix compilation failures, this closes apache/mahout#248


Project: http://git-wip-us.apache.org/repos/asf/mahout/repo
Commit: http://git-wip-us.apache.org/repos/asf/mahout/commit/4d0cd66a
Tree: http://git-wip-us.apache.org/repos/asf/mahout/tree/4d0cd66a
Diff: http://git-wip-us.apache.org/repos/asf/mahout/diff/4d0cd66a

Branch: refs/heads/master
Commit: 4d0cd66a6269eb02fceaabdb11d70fd38d433474
Parents: 33c1eab
Author: smarthi <smarthi@apache.org>
Authored: Thu Aug 11 01:42:30 2016 -0400
Committer: smarthi <smarthi@apache.org>
Committed: Thu Aug 11 01:42:30 2016 -0400

----------------------------------------------------------------------
 .../mahout/classifier/NewsgroupHelper.java      |  3 +-
 .../text/MailArchivesClusteringAnalyzer.java    | 31 ++++++++------------
 .../text/wikipedia/WikipediaAnalyzer.java       | 17 +++++------
 .../mahout/utils/regex/AnalyzerTransformer.java |  3 +-
 .../vectors/lucene/AbstractLuceneIterator.java  |  2 +-
 .../utils/vectors/lucene/CachedTermInfo.java    |  2 +-
 .../utils/vectors/lucene/ClusterLabels.java     | 19 ++++++------
 .../mahout/utils/vectors/lucene/Driver.java     |  3 +-
 .../mahout/clustering/TestClusterDumper.java    |  6 ++--
 .../collocations/llr/BloomTokenFilterTest.java  |  9 +++---
 .../vectors/lucene/CachedTermInfoTest.java      |  6 ++--
 .../mahout/utils/vectors/lucene/DriverTest.java | 17 +++++------
 .../vectors/lucene/LuceneIterableTest.java      |  8 ++---
 .../mahout/common/lucene/AnalyzerUtils.java     |  4 +--
 .../org/apache/mahout/vectorizer/TFIDF.java     |  4 +--
 .../encoders/LuceneTextValueEncoder.java        | 10 ++-----
 .../encoders/TextValueEncoderTest.java          |  3 +-
 pom.xml                                         |  2 +-
 18 files changed, 67 insertions(+), 82 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mahout/blob/4d0cd66a/examples/src/main/java/org/apache/mahout/classifier/NewsgroupHelper.java
----------------------------------------------------------------------
diff --git a/examples/src/main/java/org/apache/mahout/classifier/NewsgroupHelper.java b/examples/src/main/java/org/apache/mahout/classifier/NewsgroupHelper.java
index 3674a57..5cec51c 100644
--- a/examples/src/main/java/org/apache/mahout/classifier/NewsgroupHelper.java
+++ b/examples/src/main/java/org/apache/mahout/classifier/NewsgroupHelper.java
@@ -26,7 +26,6 @@ import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.standard.StandardAnalyzer;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
-import org.apache.lucene.util.Version;
 import org.apache.mahout.common.RandomUtils;
 import org.apache.mahout.math.RandomAccessSparseVector;
 import org.apache.mahout.math.Vector;
@@ -60,7 +59,7 @@ public final class NewsgroupHelper {
   private static final long WEEK = 7 * 24 * 3600;
   
   private final Random rand = RandomUtils.getRandom();  
-  private final Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_46);
+  private final Analyzer analyzer = new StandardAnalyzer();
   private final FeatureVectorEncoder encoder = new StaticWordValueEncoder("body");
   private final FeatureVectorEncoder bias = new ConstantValueEncoder("Intercept");
   

http://git-wip-us.apache.org/repos/asf/mahout/blob/4d0cd66a/integration/src/main/java/org/apache/mahout/text/MailArchivesClusteringAnalyzer.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/text/MailArchivesClusteringAnalyzer.java b/integration/src/main/java/org/apache/mahout/text/MailArchivesClusteringAnalyzer.java
index 8776c5f..12ed471 100644
--- a/integration/src/main/java/org/apache/mahout/text/MailArchivesClusteringAnalyzer.java
+++ b/integration/src/main/java/org/apache/mahout/text/MailArchivesClusteringAnalyzer.java
@@ -16,12 +16,6 @@
  */
 package org.apache.mahout.text;
 
-import java.io.IOException;
-import java.io.Reader;
-import java.util.Arrays;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
-
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
@@ -34,7 +28,11 @@ import org.apache.lucene.analysis.standard.StandardTokenizer;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.util.CharArraySet;
 import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
-import org.apache.lucene.util.Version;
+
+import java.io.IOException;
+import java.util.Arrays;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
 
 /**
  * Custom Lucene Analyzer designed for aggressive feature reduction
@@ -42,13 +40,11 @@ import org.apache.lucene.util.Version;
  * stop words, excluding non-alpha-numeric tokens, and porter stemming.
  */
 public final class MailArchivesClusteringAnalyzer extends StopwordAnalyzerBase {
-  private static final Version LUCENE_VERSION = Version.LUCENE_46;
-  
   // extended set of stop words composed of common mail terms like "hi",
   // HTML tags, and Java keywords asmany of the messages in the archives
   // are subversion check-in notifications
     
-  private static final CharArraySet STOP_SET = new CharArraySet(LUCENE_VERSION, Arrays.asList(
+  private static final CharArraySet STOP_SET = new CharArraySet(Arrays.asList(
     "3d","7bit","a0","about","above","abstract","across","additional","after",
     "afterwards","again","against","align","all","almost","alone","along",
     "already","also","although","always","am","among","amongst","amoungst",
@@ -108,22 +104,21 @@ public final class MailArchivesClusteringAnalyzer extends StopwordAnalyzerBase {
   private static final Matcher MATCHER = ALPHA_NUMERIC.matcher("");
 
   public MailArchivesClusteringAnalyzer() {
-    super(LUCENE_VERSION, STOP_SET);
+    super(STOP_SET);
   }
 
   public MailArchivesClusteringAnalyzer(CharArraySet stopSet) {
-    super(LUCENE_VERSION, stopSet);
-
+    super(stopSet);
   }
   
   @Override
-  protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
-    Tokenizer tokenizer = new StandardTokenizer(LUCENE_VERSION, reader);
-    TokenStream result = new StandardFilter(LUCENE_VERSION, tokenizer);
-    result = new LowerCaseFilter(LUCENE_VERSION, result);
+  protected TokenStreamComponents createComponents(String fieldName) {
+    Tokenizer tokenizer = new StandardTokenizer();
+    TokenStream result = new StandardFilter(tokenizer);
+    result = new LowerCaseFilter(result);
     result = new ASCIIFoldingFilter(result);
     result = new AlphaNumericMaxLengthFilter(result);
-    result = new StopFilter(LUCENE_VERSION, result, STOP_SET);
+    result = new StopFilter(result, STOP_SET);
     result = new PorterStemFilter(result);
     return new TokenStreamComponents(tokenizer, result);
   }

http://git-wip-us.apache.org/repos/asf/mahout/blob/4d0cd66a/integration/src/main/java/org/apache/mahout/text/wikipedia/WikipediaAnalyzer.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/text/wikipedia/WikipediaAnalyzer.java b/integration/src/main/java/org/apache/mahout/text/wikipedia/WikipediaAnalyzer.java
index ad55ba7..d50323d 100644
--- a/integration/src/main/java/org/apache/mahout/text/wikipedia/WikipediaAnalyzer.java
+++ b/integration/src/main/java/org/apache/mahout/text/wikipedia/WikipediaAnalyzer.java
@@ -17,8 +17,6 @@
 
 package org.apache.mahout.text.wikipedia;
 
-import java.io.Reader;
-
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.core.LowerCaseFilter;
@@ -28,25 +26,24 @@ import org.apache.lucene.analysis.standard.StandardFilter;
 import org.apache.lucene.analysis.util.CharArraySet;
 import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
 import org.apache.lucene.analysis.wikipedia.WikipediaTokenizer;
-import org.apache.lucene.util.Version;
 
 
 public class WikipediaAnalyzer extends StopwordAnalyzerBase {
   
   public WikipediaAnalyzer() {
-    super(Version.LUCENE_46, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
+    super(StopAnalyzer.ENGLISH_STOP_WORDS_SET);
   }
   
   public WikipediaAnalyzer(CharArraySet stopSet) {
-    super(Version.LUCENE_46, stopSet);
+    super(stopSet);
   }
 
   @Override
-  protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
-    Tokenizer tokenizer = new WikipediaTokenizer(reader);
-    TokenStream result = new StandardFilter(Version.LUCENE_46, tokenizer);
-    result = new LowerCaseFilter(Version.LUCENE_46, result);
-    result = new StopFilter(Version.LUCENE_46, result, getStopwordSet());
+  protected TokenStreamComponents createComponents(String fieldName) {
+    Tokenizer tokenizer = new WikipediaTokenizer();
+    TokenStream result = new StandardFilter(tokenizer);
+    result = new LowerCaseFilter(result);
+    result = new StopFilter(result, getStopwordSet());
     return new TokenStreamComponents(tokenizer, result);
   }
 }

http://git-wip-us.apache.org/repos/asf/mahout/blob/4d0cd66a/integration/src/main/java/org/apache/mahout/utils/regex/AnalyzerTransformer.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/utils/regex/AnalyzerTransformer.java b/integration/src/main/java/org/apache/mahout/utils/regex/AnalyzerTransformer.java
index 36b166a..4585a0a 100644
--- a/integration/src/main/java/org/apache/mahout/utils/regex/AnalyzerTransformer.java
+++ b/integration/src/main/java/org/apache/mahout/utils/regex/AnalyzerTransformer.java
@@ -24,7 +24,6 @@ import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.standard.StandardAnalyzer;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
-import org.apache.lucene.util.Version;
 import org.apache.mahout.common.lucene.TokenStreamIterator;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -37,7 +36,7 @@ public class AnalyzerTransformer implements RegexTransformer {
   private static final Logger log = LoggerFactory.getLogger(AnalyzerTransformer.class);
 
   public AnalyzerTransformer() {
-    this(new StandardAnalyzer(Version.LUCENE_46), "text");
+    this(new StandardAnalyzer(), "text");
   }
 
   public AnalyzerTransformer(Analyzer analyzer) {

http://git-wip-us.apache.org/repos/asf/mahout/blob/4d0cd66a/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/AbstractLuceneIterator.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/AbstractLuceneIterator.java b/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/AbstractLuceneIterator.java
index 233c95c..ff61a70 100644
--- a/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/AbstractLuceneIterator.java
+++ b/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/AbstractLuceneIterator.java
@@ -113,7 +113,7 @@ public abstract class AbstractLuceneIterator extends AbstractIterator<Vector> {
 
       // The loop exits with termFreqVector and name set.
 
-      TermsEnum te = termFreqVector.iterator(null);
+      TermsEnum te = termFreqVector.iterator();
       BytesRef term;
       TFDFMapper mapper = new TFDFMapper(indexReader.numDocs(), weight, this.terminfo);
       mapper.setExpectations(field, termFreqVector.size());

http://git-wip-us.apache.org/repos/asf/mahout/blob/4d0cd66a/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/CachedTermInfo.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/CachedTermInfo.java b/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/CachedTermInfo.java
index 718704a..0b59ed6 100644
--- a/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/CachedTermInfo.java
+++ b/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/CachedTermInfo.java
@@ -42,7 +42,7 @@ public class CachedTermInfo implements TermInfo {
   public CachedTermInfo(IndexReader reader, String field, int minDf, int maxDfPercent) throws IOException {
     this.field = field;
     Terms t = MultiFields.getTerms(reader, field);
-    TermsEnum te = t.iterator(null);
+    TermsEnum te = t.iterator();
 
     int numDocs = reader.numDocs();
     double percent = numDocs * maxDfPercent / 100.0;

http://git-wip-us.apache.org/repos/asf/mahout/blob/4d0cd66a/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/ClusterLabels.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/ClusterLabels.java b/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/ClusterLabels.java
index 6ef7fba..b2568e7 100644
--- a/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/ClusterLabels.java
+++ b/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/ClusterLabels.java
@@ -21,6 +21,7 @@ import java.io.File;
 import java.io.IOException;
 import java.io.OutputStreamWriter;
 import java.io.Writer;
+import java.nio.file.Paths;
 import java.util.Collection;
 import java.util.Collections;
 import java.util.HashSet;
@@ -44,9 +45,9 @@ import org.apache.commons.cli2.commandline.Parser;
 import org.apache.commons.io.Charsets;
 import org.apache.hadoop.fs.Path;
 import org.apache.lucene.index.DirectoryReader;
-import org.apache.lucene.index.DocsEnum;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.MultiFields;
+import org.apache.lucene.index.PostingsEnum;
 import org.apache.lucene.index.Term;
 import org.apache.lucene.index.Terms;
 import org.apache.lucene.index.TermsEnum;
@@ -55,7 +56,7 @@ import org.apache.lucene.store.Directory;
 import org.apache.lucene.store.FSDirectory;
 import org.apache.lucene.util.Bits;
 import org.apache.lucene.util.BytesRef;
-import org.apache.lucene.util.OpenBitSet;
+import org.apache.lucene.util.FixedBitSet;
 import org.apache.mahout.clustering.classify.WeightedPropertyVectorWritable;
 import org.apache.mahout.common.CommandLineUtil;
 import org.apache.mahout.common.commandline.DefaultOptionCreator;
@@ -149,7 +150,7 @@ public class ClusterLabels {
     }
 
     log.info("Processing Cluster {} with {} documents", integer, wpvws.size());
-    Directory dir = FSDirectory.open(new File(this.indexDir));
+    Directory dir = FSDirectory.open(Paths.get(this.indexDir));
     IndexReader reader = DirectoryReader.open(dir);
     
     
@@ -165,7 +166,7 @@ public class ClusterLabels {
 
     int numDocs = reader.numDocs();
 
-    OpenBitSet clusterDocBitset = getClusterDocBitset(reader, idSet, this.idField);
+    FixedBitSet clusterDocBitset = getClusterDocBitset(reader, idSet, this.idField);
 
     log.info("Populating term infos from the index");
 
@@ -179,7 +180,7 @@ public class ClusterLabels {
      * frequency.
      */
     Terms t = MultiFields.getTerms(reader, contentField);
-    TermsEnum te = t.iterator(null);
+    TermsEnum te = t.iterator();
     Map<String, TermEntry> termEntryMap = new LinkedHashMap<>();
     Bits liveDocs = MultiFields.getLiveDocs(reader); //WARNING: returns null if there are no deletions
 
@@ -187,8 +188,8 @@ public class ClusterLabels {
     int count = 0;
     BytesRef term;
     while ((term = te.next()) != null) {
-      OpenBitSet termBitset = new OpenBitSet(reader.maxDoc());
-      DocsEnum docsEnum = MultiFields.getTermDocsEnum(reader, null, contentField, term);
+      FixedBitSet termBitset = new FixedBitSet(reader.maxDoc());
+      PostingsEnum docsEnum = MultiFields.getTermDocsEnum(reader, contentField, term);
       int docID;
       while ((docID = docsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
         //check to see if we don't have an deletions (null) or if document is live
@@ -230,12 +231,12 @@ public class ClusterLabels {
     return clusteredTermInfo.subList(0, Math.min(clusteredTermInfo.size(), maxLabels));
   }
 
-  private static OpenBitSet getClusterDocBitset(IndexReader reader,
+  private static FixedBitSet getClusterDocBitset(IndexReader reader,
                                                 Collection<String> idSet,
                                                 String idField) throws IOException {
     int numDocs = reader.numDocs();
 
-    OpenBitSet bitset = new OpenBitSet(numDocs);
+    FixedBitSet bitset = new FixedBitSet(numDocs);
     
     Set<String>  idFieldSelector = null;
     if (idField != null) {

http://git-wip-us.apache.org/repos/asf/mahout/blob/4d0cd66a/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/Driver.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/Driver.java b/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/Driver.java
index 2eeebd9..876816f 100644
--- a/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/Driver.java
+++ b/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/Driver.java
@@ -20,6 +20,7 @@ package org.apache.mahout.utils.vectors.lucene;
 import java.io.File;
 import java.io.IOException;
 import java.io.Writer;
+import java.nio.file.Paths;
 import java.util.Iterator;
 
 import com.google.common.base.Preconditions;
@@ -85,7 +86,7 @@ public final class Driver {
     Preconditions.checkArgument(minDf >= 1, "minDf must be >= 1");
     Preconditions.checkArgument(maxDFPercent <= 99, "maxDFPercent must be <= 99");
 
-    Directory dir = FSDirectory.open(file);
+    Directory dir = FSDirectory.open(Paths.get(file.getAbsolutePath()));
     IndexReader reader = DirectoryReader.open(dir);
 
 

http://git-wip-us.apache.org/repos/asf/mahout/blob/4d0cd66a/integration/src/test/java/org/apache/mahout/clustering/TestClusterDumper.java
----------------------------------------------------------------------
diff --git a/integration/src/test/java/org/apache/mahout/clustering/TestClusterDumper.java b/integration/src/test/java/org/apache/mahout/clustering/TestClusterDumper.java
index a1d2bbb..01d46fc 100644
--- a/integration/src/test/java/org/apache/mahout/clustering/TestClusterDumper.java
+++ b/integration/src/test/java/org/apache/mahout/clustering/TestClusterDumper.java
@@ -31,11 +31,11 @@ import org.apache.lucene.document.Field;
 import org.apache.lucene.document.FieldType;
 import org.apache.lucene.document.StringField;
 import org.apache.lucene.index.DirectoryReader;
+import org.apache.lucene.index.IndexOptions;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.IndexWriter;
 import org.apache.lucene.index.IndexWriterConfig;
 import org.apache.lucene.store.RAMDirectory;
-import org.apache.lucene.util.Version;
 import org.apache.mahout.clustering.fuzzykmeans.FuzzyKMeansDriver;
 import org.apache.mahout.clustering.kmeans.KMeansDriver;
 import org.apache.mahout.clustering.kmeans.RandomSeedGenerator;
@@ -94,7 +94,7 @@ public final class TestClusterDumper extends MahoutTestCase {
     sampleData = new ArrayList<>();
     RAMDirectory directory = new RAMDirectory();
     try (IndexWriter writer = new IndexWriter(directory,
-        new IndexWriterConfig(Version.LUCENE_46, new StandardAnalyzer(Version.LUCENE_46)))){
+        new IndexWriterConfig(new StandardAnalyzer()))){
       for (int i = 0; i < docs2.length; i++) {
         Document doc = new Document();
         Field id = new StringField("id", "doc_" + i, Field.Store.YES);
@@ -102,7 +102,7 @@ public final class TestClusterDumper extends MahoutTestCase {
         // Store both position and offset information
         FieldType fieldType = new FieldType();
         fieldType.setStored(false);
-        fieldType.setIndexed(true);
+        fieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
         fieldType.setTokenized(true);
         fieldType.setStoreTermVectors(true);
         fieldType.setStoreTermVectorPositions(true);

http://git-wip-us.apache.org/repos/asf/mahout/blob/4d0cd66a/integration/src/test/java/org/apache/mahout/utils/nlp/collocations/llr/BloomTokenFilterTest.java
----------------------------------------------------------------------
diff --git a/integration/src/test/java/org/apache/mahout/utils/nlp/collocations/llr/BloomTokenFilterTest.java b/integration/src/test/java/org/apache/mahout/utils/nlp/collocations/llr/BloomTokenFilterTest.java
index 37efc01..4fdbbbc 100644
--- a/integration/src/test/java/org/apache/mahout/utils/nlp/collocations/llr/BloomTokenFilterTest.java
+++ b/integration/src/test/java/org/apache/mahout/utils/nlp/collocations/llr/BloomTokenFilterTest.java
@@ -36,7 +36,6 @@ import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
 import org.apache.lucene.analysis.shingle.ShingleFilter;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
-import org.apache.lucene.util.Version;
 import org.apache.mahout.common.MahoutTestCase;
 import org.junit.Test;
 
@@ -79,7 +78,7 @@ public final class BloomTokenFilterTest extends MahoutTestCase {
   @Test
   public void testAnalyzer() throws IOException {
     Reader reader = new StringReader(input);
-    Analyzer analyzer = new WhitespaceAnalyzer(Version.LUCENE_46);
+    Analyzer analyzer = new WhitespaceAnalyzer();
     TokenStream ts = analyzer.tokenStream(null, reader);
     ts.reset();
     validateTokens(allTokens, ts);
@@ -91,7 +90,7 @@ public final class BloomTokenFilterTest extends MahoutTestCase {
   @Test
   public void testNonKeepdAnalyzer() throws IOException {
     Reader reader = new StringReader(input);
-    Analyzer analyzer = new WhitespaceAnalyzer(Version.LUCENE_46);
+    Analyzer analyzer = new WhitespaceAnalyzer();
     TokenStream ts = analyzer.tokenStream(null, reader);
     ts.reset();
     TokenStream f = new BloomTokenFilter(getFilter(filterTokens), false /* toss matching tokens */, ts);
@@ -104,7 +103,7 @@ public final class BloomTokenFilterTest extends MahoutTestCase {
   @Test
   public void testKeepAnalyzer() throws IOException {
     Reader reader = new StringReader(input);
-    Analyzer analyzer = new WhitespaceAnalyzer(Version.LUCENE_46);
+    Analyzer analyzer = new WhitespaceAnalyzer();
     TokenStream ts = analyzer.tokenStream(null, reader);
     ts.reset();
     TokenStream f = new BloomTokenFilter(getFilter(filterTokens), true /* keep matching tokens */, ts);
@@ -117,7 +116,7 @@ public final class BloomTokenFilterTest extends MahoutTestCase {
   @Test
   public void testShingleFilteredAnalyzer() throws IOException {
     Reader reader = new StringReader(input);
-    Analyzer analyzer = new WhitespaceAnalyzer(Version.LUCENE_46);
+    Analyzer analyzer = new WhitespaceAnalyzer();
     TokenStream ts = analyzer.tokenStream(null, reader);
     ts.reset();
     ShingleFilter sf = new ShingleFilter(ts, 3);

http://git-wip-us.apache.org/repos/asf/mahout/blob/4d0cd66a/integration/src/test/java/org/apache/mahout/utils/vectors/lucene/CachedTermInfoTest.java
----------------------------------------------------------------------
diff --git a/integration/src/test/java/org/apache/mahout/utils/vectors/lucene/CachedTermInfoTest.java b/integration/src/test/java/org/apache/mahout/utils/vectors/lucene/CachedTermInfoTest.java
index 44a91e9..890a14b 100644
--- a/integration/src/test/java/org/apache/mahout/utils/vectors/lucene/CachedTermInfoTest.java
+++ b/integration/src/test/java/org/apache/mahout/utils/vectors/lucene/CachedTermInfoTest.java
@@ -28,11 +28,11 @@ import org.apache.lucene.document.Field;
 import org.apache.lucene.document.FieldType;
 import org.apache.lucene.document.StringField;
 import org.apache.lucene.index.DirectoryReader;
+import org.apache.lucene.index.IndexOptions;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.IndexWriter;
 import org.apache.lucene.index.IndexWriterConfig;
 import org.apache.lucene.store.RAMDirectory;
-import org.apache.lucene.util.Version;
 import org.apache.mahout.common.MahoutTestCase;
 import org.junit.Before;
 import org.junit.Test;
@@ -65,7 +65,7 @@ public class CachedTermInfoTest extends MahoutTestCase {
 
     FieldType fieldType = new FieldType();
     fieldType.setStored(false);
-    fieldType.setIndexed(true);
+    fieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
     fieldType.setTokenized(true);
     fieldType.setStoreTermVectors(false);
     fieldType.setStoreTermVectorPositions(false);
@@ -100,7 +100,7 @@ public class CachedTermInfoTest extends MahoutTestCase {
   static RAMDirectory createTestIndex(FieldType fieldType,
                                       RAMDirectory directory,
                                       int startingId) throws IOException {
-    IndexWriter writer = new IndexWriter(directory, new IndexWriterConfig(Version.LUCENE_46, new WhitespaceAnalyzer(Version.LUCENE_46)));
+    IndexWriter writer = new IndexWriter(directory, new IndexWriterConfig(new WhitespaceAnalyzer()));
 
     try {
       for (int i = 0; i < DOCS.length; i++) {

http://git-wip-us.apache.org/repos/asf/mahout/blob/4d0cd66a/integration/src/test/java/org/apache/mahout/utils/vectors/lucene/DriverTest.java
----------------------------------------------------------------------
diff --git a/integration/src/test/java/org/apache/mahout/utils/vectors/lucene/DriverTest.java b/integration/src/test/java/org/apache/mahout/utils/vectors/lucene/DriverTest.java
index 6ac2df8..86c8305 100644
--- a/integration/src/test/java/org/apache/mahout/utils/vectors/lucene/DriverTest.java
+++ b/integration/src/test/java/org/apache/mahout/utils/vectors/lucene/DriverTest.java
@@ -30,18 +30,18 @@ import org.apache.lucene.analysis.standard.StandardAnalyzer;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
 import org.apache.lucene.document.FieldType;
-import org.apache.lucene.index.FieldInfo;
+import org.apache.lucene.index.IndexOptions;
 import org.apache.lucene.index.IndexWriter;
 import org.apache.lucene.index.IndexWriterConfig;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.store.SimpleFSDirectory;
-import org.apache.lucene.util.Version;
 import org.apache.mahout.common.MahoutTestCase;
 import org.junit.Before;
 import org.junit.Test;
 
 import java.io.File;
 import java.io.IOException;
+import java.nio.file.Paths;
 import java.util.Set;
 
 public class DriverTest extends MahoutTestCase {
@@ -73,9 +73,8 @@ public class DriverTest extends MahoutTestCase {
     public static final FieldType TYPE = new FieldType();
 
     static {
-      TYPE.setIndexed(true);
       TYPE.setOmitNorms(true);
-      TYPE.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS);
+      TYPE.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
       TYPE.setStored(true);
       TYPE.setTokenized(true);
       TYPE.setStoreTermVectors(true);
@@ -90,9 +89,10 @@ public class DriverTest extends MahoutTestCase {
   @Test
   public void sequenceFileDictionary() throws IOException {
 
-    Directory index = new SimpleFSDirectory(indexDir);
-    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_46);
-    IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_46, analyzer);
+    Directory index = new SimpleFSDirectory(Paths.get(indexDir.getAbsolutePath()));
+    Analyzer analyzer = new StandardAnalyzer();
+    IndexWriterConfig config = new IndexWriterConfig(analyzer);
+    config.setCommitOnClose(true);
     final IndexWriter writer = new IndexWriter(index, config);
 
     try {
@@ -100,9 +100,8 @@ public class DriverTest extends MahoutTestCase {
       writer.addDocument(asDocument("One Ring to find them,"));
       writer.addDocument(asDocument("One Ring to bring them all"));
       writer.addDocument(asDocument("and in the darkness bind them"));
-
     } finally {
-      writer.close(true);
+      writer.close();
     }
 
     File seqDict = new File(outputDir, "dict.seq");

http://git-wip-us.apache.org/repos/asf/mahout/blob/4d0cd66a/integration/src/test/java/org/apache/mahout/utils/vectors/lucene/LuceneIterableTest.java
----------------------------------------------------------------------
diff --git a/integration/src/test/java/org/apache/mahout/utils/vectors/lucene/LuceneIterableTest.java b/integration/src/test/java/org/apache/mahout/utils/vectors/lucene/LuceneIterableTest.java
index ba49a2d..8d92551 100644
--- a/integration/src/test/java/org/apache/mahout/utils/vectors/lucene/LuceneIterableTest.java
+++ b/integration/src/test/java/org/apache/mahout/utils/vectors/lucene/LuceneIterableTest.java
@@ -29,11 +29,11 @@ import org.apache.lucene.document.Field;
 import org.apache.lucene.document.FieldType;
 import org.apache.lucene.document.StringField;
 import org.apache.lucene.index.DirectoryReader;
+import org.apache.lucene.index.IndexOptions;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.IndexWriter;
 import org.apache.lucene.index.IndexWriterConfig;
 import org.apache.lucene.store.RAMDirectory;
-import org.apache.lucene.util.Version;
 import org.apache.mahout.common.MahoutTestCase;
 import org.apache.mahout.math.NamedVector;
 import org.apache.mahout.math.Vector;
@@ -62,14 +62,14 @@ public final class LuceneIterableTest extends MahoutTestCase {
   @Before
   public void before() throws IOException {
 
-    TYPE_NO_TERM_VECTORS.setIndexed(true);
+    TYPE_NO_TERM_VECTORS.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
     TYPE_NO_TERM_VECTORS.setTokenized(true);
     TYPE_NO_TERM_VECTORS.setStoreTermVectors(false);
     TYPE_NO_TERM_VECTORS.setStoreTermVectorPositions(false);
     TYPE_NO_TERM_VECTORS.setStoreTermVectorOffsets(false);
     TYPE_NO_TERM_VECTORS.freeze();
 
-    TYPE_TERM_VECTORS.setIndexed(true);
+    TYPE_TERM_VECTORS.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
     TYPE_TERM_VECTORS.setTokenized(true);
     TYPE_TERM_VECTORS.setStored(true);
     TYPE_TERM_VECTORS.setStoreTermVectors(true);
@@ -177,7 +177,7 @@ public final class LuceneIterableTest extends MahoutTestCase {
                                               RAMDirectory directory,
                                               int startingId) throws IOException {
 
-    try (IndexWriter writer = new IndexWriter(directory, new IndexWriterConfig(Version.LUCENE_46,new StandardAnalyzer(Version.LUCENE_46)))) {
+    try (IndexWriter writer = new IndexWriter(directory, new IndexWriterConfig(new StandardAnalyzer()))) {
       for (int i = 0; i < DOCS.length; i++) {
         Document doc = new Document();
         Field id = new StringField("id", "doc_" + (i + startingId), Field.Store.YES);

http://git-wip-us.apache.org/repos/asf/mahout/blob/4d0cd66a/mr/src/main/java/org/apache/mahout/common/lucene/AnalyzerUtils.java
----------------------------------------------------------------------
diff --git a/mr/src/main/java/org/apache/mahout/common/lucene/AnalyzerUtils.java b/mr/src/main/java/org/apache/mahout/common/lucene/AnalyzerUtils.java
index 37ca383..742d6cf 100644
--- a/mr/src/main/java/org/apache/mahout/common/lucene/AnalyzerUtils.java
+++ b/mr/src/main/java/org/apache/mahout/common/lucene/AnalyzerUtils.java
@@ -32,7 +32,7 @@ public final class AnalyzerUtils {
    * @throws ClassNotFoundException - {@link ClassNotFoundException}
    */
   public static Analyzer createAnalyzer(String analyzerClassName) throws ClassNotFoundException {
-    return createAnalyzer(analyzerClassName, Version.LUCENE_46);
+    return createAnalyzer(analyzerClassName, Version.LUCENE_5_5_2);
   }
 
   public static Analyzer createAnalyzer(String analyzerClassName, Version version) throws ClassNotFoundException {
@@ -47,7 +47,7 @@ public final class AnalyzerUtils {
    * @return {@link Analyzer}
    */
   public static Analyzer createAnalyzer(Class<? extends Analyzer> analyzerClass) {
-    return createAnalyzer(analyzerClass, Version.LUCENE_46);
+    return createAnalyzer(analyzerClass, Version.LUCENE_5_5_2);
   }
 
   public static Analyzer createAnalyzer(Class<? extends Analyzer> analyzerClass, Version version) {

http://git-wip-us.apache.org/repos/asf/mahout/blob/4d0cd66a/mr/src/main/java/org/apache/mahout/vectorizer/TFIDF.java
----------------------------------------------------------------------
diff --git a/mr/src/main/java/org/apache/mahout/vectorizer/TFIDF.java b/mr/src/main/java/org/apache/mahout/vectorizer/TFIDF.java
index 0a537eb..238fa03 100644
--- a/mr/src/main/java/org/apache/mahout/vectorizer/TFIDF.java
+++ b/mr/src/main/java/org/apache/mahout/vectorizer/TFIDF.java
@@ -17,11 +17,11 @@
 
 package org.apache.mahout.vectorizer;
 
-import org.apache.lucene.search.similarities.DefaultSimilarity;
+import org.apache.lucene.search.similarities.ClassicSimilarity;
 //TODO: add a new class that supports arbitrary Lucene similarity implementations
 public class TFIDF implements Weight {
 
-  private final DefaultSimilarity sim = new DefaultSimilarity();
+  private final ClassicSimilarity sim = new ClassicSimilarity();
 
   @Override
   public double calculate(int tf, int df, int length, int numDocs) {

http://git-wip-us.apache.org/repos/asf/mahout/blob/4d0cd66a/mr/src/main/java/org/apache/mahout/vectorizer/encoders/LuceneTextValueEncoder.java
----------------------------------------------------------------------
diff --git a/mr/src/main/java/org/apache/mahout/vectorizer/encoders/LuceneTextValueEncoder.java b/mr/src/main/java/org/apache/mahout/vectorizer/encoders/LuceneTextValueEncoder.java
index 3bae26e..e3e133c 100644
--- a/mr/src/main/java/org/apache/mahout/vectorizer/encoders/LuceneTextValueEncoder.java
+++ b/mr/src/main/java/org/apache/mahout/vectorizer/encoders/LuceneTextValueEncoder.java
@@ -49,13 +49,9 @@ public class LuceneTextValueEncoder extends TextValueEncoder {
    */
   @Override
   protected Iterable<String> tokenize(CharSequence originalForm) {
-    try {
-      TokenStream ts = analyzer.tokenStream(getName(), new CharSequenceReader(originalForm));
-      ts.addAttribute(CharTermAttribute.class);
-      return new LuceneTokenIterable(ts, false);
-    } catch (IOException ex) {
-      throw new IllegalStateException(ex);
-    }
+    TokenStream ts = analyzer.tokenStream(getName(), new CharSequenceReader(originalForm));
+    ts.addAttribute(CharTermAttribute.class);
+    return new LuceneTokenIterable(ts, false);
   }
 
   private static final class CharSequenceReader extends Reader {

http://git-wip-us.apache.org/repos/asf/mahout/blob/4d0cd66a/mr/src/test/java/org/apache/mahout/vectorizer/encoders/TextValueEncoderTest.java
----------------------------------------------------------------------
diff --git a/mr/src/test/java/org/apache/mahout/vectorizer/encoders/TextValueEncoderTest.java b/mr/src/test/java/org/apache/mahout/vectorizer/encoders/TextValueEncoderTest.java
index 4446fef..be3e03e 100644
--- a/mr/src/test/java/org/apache/mahout/vectorizer/encoders/TextValueEncoderTest.java
+++ b/mr/src/test/java/org/apache/mahout/vectorizer/encoders/TextValueEncoderTest.java
@@ -19,7 +19,6 @@ package org.apache.mahout.vectorizer.encoders;
 
 import com.google.common.collect.ImmutableMap;
 import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
-import org.apache.lucene.util.Version;
 import org.apache.mahout.common.MahoutTestCase;
 import org.apache.mahout.math.DenseVector;
 import org.apache.mahout.math.Vector;
@@ -70,7 +69,7 @@ public final class TextValueEncoderTest extends MahoutTestCase {
   @Test
   public void testLuceneEncoding() throws Exception {
     LuceneTextValueEncoder enc = new LuceneTextValueEncoder("text");
-    enc.setAnalyzer(new WhitespaceAnalyzer(Version.LUCENE_46));
+    enc.setAnalyzer(new WhitespaceAnalyzer());
     Vector v1 = new DenseVector(200);
     enc.addToVector("test1 and more", v1);
     enc.flush(1, v1);

http://git-wip-us.apache.org/repos/asf/mahout/blob/4d0cd66a/pom.xml
----------------------------------------------------------------------
diff --git a/pom.xml b/pom.xml
index ca0ea21..165e42e 100644
--- a/pom.xml
+++ b/pom.xml
@@ -117,7 +117,7 @@
     <mjavadoc.version>2.10.3</mjavadoc.version>
     <mscala.version>3.2.0</mscala.version>
     <hbase.version>1.0.0</hbase.version>
-    <lucene.version>4.6.1</lucene.version>
+    <lucene.version>5.5.2</lucene.version>
     <slf4j.version>1.7.19</slf4j.version>
     <scala.compat.version>2.10</scala.compat.version>
     <scala.version>2.10.4</scala.version>


Mime
View raw message