mahout-commits mailing list archives

From vans...@apache.org
Subject [21/52] [partial] mahout git commit: removed all files except for website directory
Date Tue, 27 Jun 2017 16:14:46 GMT
http://git-wip-us.apache.org/repos/asf/mahout/blob/99a5358f/integration/src/main/java/org/apache/mahout/utils/vectors/io/TextualVectorWriter.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/utils/vectors/io/TextualVectorWriter.java b/integration/src/main/java/org/apache/mahout/utils/vectors/io/TextualVectorWriter.java
deleted file mode 100644
index cc27d1d..0000000
--- a/integration/src/main/java/org/apache/mahout/utils/vectors/io/TextualVectorWriter.java
+++ /dev/null
@@ -1,70 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.utils.vectors.io;
-
-import java.io.IOException;
-import java.io.Writer;
-
-import com.google.common.io.Closeables;
-import org.apache.mahout.math.Vector;
-
-/**
- * Write out the vectors to any {@link Writer} using {@link Vector#asFormatString()},
- * one per line by default.
- */
-public class TextualVectorWriter implements VectorWriter {
-
-  private final Writer writer;
-  
-  public TextualVectorWriter(Writer writer) {
-    this.writer = writer;
-  }
-
-  protected Writer getWriter() {
-    return writer;
-  }
-  
-  @Override
-  public long write(Iterable<Vector> iterable) throws IOException {
-    return write(iterable, Long.MAX_VALUE);
-  }
-  
-  @Override
-  public long write(Iterable<Vector> iterable, long maxDocs) throws IOException {
-    long result = 0;
-    for (Vector vector : iterable) {
-      if (result >= maxDocs) {
-        break;
-      }
-      write(vector);
-      result++;
-    }
-    return result;
-  }
-
-  @Override
-  public void write(Vector vector) throws IOException {
-    writer.write(vector.asFormatString());
-    writer.write('\n');
-  }
-
-  @Override
-  public void close() throws IOException {
-    Closeables.close(writer, false);
-  }
-}
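
For context on how this removed class was used, here is a hypothetical usage sketch (not from the commit; the demo class name and sample vector are illustrative):

    import java.io.StringWriter;
    import org.apache.mahout.math.DenseVector;
    import org.apache.mahout.math.Vector;
    import org.apache.mahout.utils.vectors.io.TextualVectorWriter;
    import org.apache.mahout.utils.vectors.io.VectorWriter;

    public class TextualVectorWriterDemo {
      public static void main(String[] args) throws Exception {
        StringWriter buffer = new StringWriter();
        // close() comes from Closeable via the VectorWriter interface
        try (VectorWriter vw = new TextualVectorWriter(buffer)) {
          Vector v = new DenseVector(new double[] {1.0, 2.0, 3.0});
          vw.write(v);  // writes Vector#asFormatString() plus a newline
        }
        System.out.print(buffer);
      }
    }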

http://git-wip-us.apache.org/repos/asf/mahout/blob/99a5358f/integration/src/main/java/org/apache/mahout/utils/vectors/io/VectorWriter.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/utils/vectors/io/VectorWriter.java b/integration/src/main/java/org/apache/mahout/utils/vectors/io/VectorWriter.java
deleted file mode 100644
index 923e270..0000000
--- a/integration/src/main/java/org/apache/mahout/utils/vectors/io/VectorWriter.java
+++ /dev/null
@@ -1,52 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.utils.vectors.io;
-
-import java.io.Closeable;
-import java.io.IOException;
-
-import org.apache.mahout.math.Vector;
-
-public interface VectorWriter extends Closeable {
-  /**
-   * Write all values in the Iterable to the output
-   * @param iterable The {@link Iterable} to loop over
-   * @return the number of docs written
-   * @throws IOException if there was a problem writing
-   */
-  long write(Iterable<Vector> iterable) throws IOException;
-
-  /**
-   * Write out a vector
-   *
-   * @param vector The {@link org.apache.mahout.math.Vector} to write
-   * @throws IOException if there was a problem writing
-   */
-  void write(Vector vector) throws IOException;
-  
-  /**
-   * Write the first {@code maxDocs} to the output.
-   * @param iterable The {@link Iterable} to loop over
-   * @param maxDocs the maximum number of docs to write
-   * @return The number of docs written
-   * @throws IOException if there was a problem writing
-   */
-  long write(Iterable<Vector> iterable, long maxDocs) throws IOException;
-
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/99a5358f/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/AbstractLuceneIterator.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/AbstractLuceneIterator.java b/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/AbstractLuceneIterator.java
deleted file mode 100644
index ff61a70..0000000
--- a/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/AbstractLuceneIterator.java
+++ /dev/null
@@ -1,140 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.utils.vectors.lucene;
-
-import com.google.common.collect.AbstractIterator;
-import org.apache.lucene.index.IndexReader;
-import org.apache.lucene.index.Terms;
-import org.apache.lucene.index.TermsEnum;
-import org.apache.lucene.util.BytesRef;
-import org.apache.mahout.math.NamedVector;
-import org.apache.mahout.math.Vector;
-import org.apache.mahout.utils.Bump125;
-import org.apache.mahout.utils.vectors.TermInfo;
-import org.apache.mahout.vectorizer.Weight;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import java.io.IOException;
-
-/**
- * Iterate over a Lucene index, extracting term vectors.
- * Subclasses define how much information to retrieve from the Lucene index.
- */
-public abstract class AbstractLuceneIterator extends AbstractIterator<Vector> {
-  private static final Logger log = LoggerFactory.getLogger(AbstractLuceneIterator.class);
-  protected final IndexReader indexReader;
-  protected final String field;
-  protected final TermInfo terminfo;
-  protected final double normPower;
-  protected final Weight weight;
-  protected final Bump125 bump = new Bump125();
-  protected int nextDocId;
-  protected int maxErrorDocs;
-  protected int numErrorDocs;
-  protected long nextLogRecord = bump.increment();
-  protected int skippedErrorMessages;
-
-  public AbstractLuceneIterator(TermInfo terminfo, double normPower, IndexReader indexReader, Weight weight,
-      double maxPercentErrorDocs, String field) {
-    this.terminfo = terminfo;
-    this.normPower = normPower;
-    this.indexReader = indexReader;
-
-    this.weight = weight;
-    this.nextDocId = 0;
-    this.maxErrorDocs = (int) (maxPercentErrorDocs * indexReader.numDocs());
-    this.field = field;
-  }
-
-  /**
-   * Given the document name, derive a name for the vector. This may involve
-   * reading the document from Lucene and setting up any other state that the
-   * subclass wants. This will be called once for each document that the
-   * iterator processes.
-   * @param documentIndex the lucene document index.
-   * @return the name to store in the vector.
-   */
-  protected abstract String getVectorName(int documentIndex) throws IOException;
-
-  @Override
-  protected Vector computeNext() {
-    try {
-      int doc;
-      Terms termFreqVector;
-      String name;
-
-      do {
-        doc = this.nextDocId;
-        nextDocId++;
-
-        if (doc >= indexReader.maxDoc()) {
-          return endOfData();
-        }
-
-        termFreqVector = indexReader.getTermVector(doc, field);
-        name = getVectorName(doc);
-
-        if (termFreqVector == null) {
-          numErrorDocs++;
-          if (numErrorDocs >= maxErrorDocs) {
-            log.error("There are too many documents that do not have a term vector for {}", field);
-            throw new IllegalStateException("There are too many documents that do not have a term vector for "
-                + field);
-          }
-          if (numErrorDocs >= nextLogRecord) {
-            if (skippedErrorMessages == 0) {
-              log.warn("{} does not have a term vector for {}", name, field);
-            } else {
-              log.warn("{} documents do not have a term vector for {}", numErrorDocs, field);
-            }
-            nextLogRecord = bump.increment();
-            skippedErrorMessages = 0;
-          } else {
-            skippedErrorMessages++;
-          }
-        }
-      } while (termFreqVector == null);
-
-      // The loop exits with termFreqVector and name set.
-
-      TermsEnum te = termFreqVector.iterator();
-      BytesRef term;
-      TFDFMapper mapper = new TFDFMapper(indexReader.numDocs(), weight, this.terminfo);
-      mapper.setExpectations(field, termFreqVector.size());
-      while ((term = te.next()) != null) {
-        mapper.map(term, (int) te.totalTermFreq());
-      }
-      Vector result = mapper.getVector();
-      if (result == null) {
-        // TODO is this right? last version would produce null in the iteration in this case, though it
-        // seems like that may not be desirable
-        return null;
-      }
-
-      if (normPower == LuceneIterable.NO_NORMALIZING) {
-        result = new NamedVector(result, name);
-      } else {
-        result = new NamedVector(result.normalize(normPower), name);
-      }
-      return result;
-    } catch (IOException ioe) {
-      throw new IllegalStateException(ioe);
-    }
-  }
-}
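
To illustrate the extension point this abstract class provided: a subclass only had to supply getVectorName(int) and inherited the error-document accounting, weighting, and normalization. A hypothetical minimal subclass (not from the commit; assumed to live in the same package):

    package org.apache.mahout.utils.vectors.lucene;

    import java.io.IOException;
    import org.apache.lucene.index.IndexReader;
    import org.apache.mahout.utils.vectors.TermInfo;
    import org.apache.mahout.vectorizer.Weight;

    /** Names each vector after its internal Lucene doc id. */
    public class DocIdLuceneIterator extends AbstractLuceneIterator {
      public DocIdLuceneIterator(TermInfo termInfo, double normPower, IndexReader reader,
                                 Weight weight, double maxPercentErrorDocs, String field) {
        super(termInfo, normPower, reader, weight, maxPercentErrorDocs, field);
      }

      @Override
      protected String getVectorName(int documentIndex) throws IOException {
        return String.valueOf(documentIndex);  // fall back to the internal doc id
      }
    }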

http://git-wip-us.apache.org/repos/asf/mahout/blob/99a5358f/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/CachedTermInfo.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/CachedTermInfo.java b/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/CachedTermInfo.java
deleted file mode 100644
index 0b59ed6..0000000
--- a/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/CachedTermInfo.java
+++ /dev/null
@@ -1,79 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.utils.vectors.lucene;
-
-import java.io.IOException;
-import java.util.Iterator;
-import java.util.LinkedHashMap;
-import java.util.Map;
-
-import org.apache.lucene.index.IndexReader;
-import org.apache.lucene.index.MultiFields;
-import org.apache.lucene.index.Terms;
-import org.apache.lucene.index.TermsEnum;
-import org.apache.lucene.util.BytesRef;
-import org.apache.mahout.utils.vectors.TermEntry;
-import org.apache.mahout.utils.vectors.TermInfo;
-
-
-/**
- * Caches TermEntries from a single field. Materializes all values in the TermsEnum to memory (much like FieldCache).
- */
-public class CachedTermInfo implements TermInfo {
-
-  private final Map<String, TermEntry> termEntries;
-  private final String field;
-
-  public CachedTermInfo(IndexReader reader, String field, int minDf, int maxDfPercent) throws IOException {
-    this.field = field;
-    Terms t = MultiFields.getTerms(reader, field);
-    TermsEnum te = t.iterator();
-
-    int numDocs = reader.numDocs();
-    double percent = numDocs * maxDfPercent / 100.0;
-    // Use a LinkedHashMap so that terms are kept in the order they were read from the TermsEnum.
-    termEntries = new LinkedHashMap<>();
-    int count = 0;
-    BytesRef text;
-    while ((text = te.next()) != null) {
-      int df = te.docFreq();
-      if (df >= minDf && df <= percent) {
-        TermEntry entry = new TermEntry(text.utf8ToString(), count++, df);
-        termEntries.put(entry.getTerm(), entry);
-      }
-    }
-  }
-
-  @Override
-  public int totalTerms(String field) {
-    return termEntries.size();
-  }
-
-  @Override
-  public TermEntry getTermEntry(String field, String term) {
-    if (!this.field.equals(field)) {
-      return null;
-    }
-    return termEntries.get(term);
-  }
-
-  @Override
-  public Iterator<TermEntry> getAllEntries() {
-    return termEntries.values().iterator();
-  }
-}
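
A hypothetical usage sketch for the removed class (index path and field name are placeholders): the constructor scans the field's TermsEnum once, so the dictionary is fixed at construction time.

    import java.nio.file.Paths;
    import org.apache.lucene.index.DirectoryReader;
    import org.apache.lucene.index.IndexReader;
    import org.apache.lucene.store.FSDirectory;
    import org.apache.mahout.utils.vectors.TermInfo;
    import org.apache.mahout.utils.vectors.lucene.CachedTermInfo;

    public class CachedTermInfoDemo {
      public static void main(String[] args) throws Exception {
        try (IndexReader reader = DirectoryReader.open(FSDirectory.open(Paths.get("/path/to/index")))) {
          // keep terms with df >= 1 that occur in at most 99% of documents
          TermInfo termInfo = new CachedTermInfo(reader, "body", 1, 99);
          System.out.println("dictionary size: " + termInfo.totalTerms("body"));
        }
      }
    }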

http://git-wip-us.apache.org/repos/asf/mahout/blob/99a5358f/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/ClusterLabels.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/ClusterLabels.java b/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/ClusterLabels.java
deleted file mode 100644
index b2568e7..0000000
--- a/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/ClusterLabels.java
+++ /dev/null
@@ -1,381 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.utils.vectors.lucene;
-
-import java.io.File;
-import java.io.IOException;
-import java.io.OutputStreamWriter;
-import java.io.Writer;
-import java.nio.file.Paths;
-import java.util.Collection;
-import java.util.Collections;
-import java.util.HashSet;
-import java.util.LinkedHashMap;
-import java.util.LinkedList;
-import java.util.List;
-import java.util.Map;
-import java.util.Set;
-import java.util.TreeSet;
-
-import com.google.common.io.Closeables;
-import com.google.common.io.Files;
-import org.apache.commons.cli2.CommandLine;
-import org.apache.commons.cli2.Group;
-import org.apache.commons.cli2.Option;
-import org.apache.commons.cli2.OptionException;
-import org.apache.commons.cli2.builder.ArgumentBuilder;
-import org.apache.commons.cli2.builder.DefaultOptionBuilder;
-import org.apache.commons.cli2.builder.GroupBuilder;
-import org.apache.commons.cli2.commandline.Parser;
-import org.apache.commons.io.Charsets;
-import org.apache.hadoop.fs.Path;
-import org.apache.lucene.index.DirectoryReader;
-import org.apache.lucene.index.IndexReader;
-import org.apache.lucene.index.MultiFields;
-import org.apache.lucene.index.PostingsEnum;
-import org.apache.lucene.index.Term;
-import org.apache.lucene.index.Terms;
-import org.apache.lucene.index.TermsEnum;
-import org.apache.lucene.search.DocIdSetIterator;
-import org.apache.lucene.store.Directory;
-import org.apache.lucene.store.FSDirectory;
-import org.apache.lucene.util.Bits;
-import org.apache.lucene.util.BytesRef;
-import org.apache.lucene.util.FixedBitSet;
-import org.apache.mahout.clustering.classify.WeightedPropertyVectorWritable;
-import org.apache.mahout.common.CommandLineUtil;
-import org.apache.mahout.common.commandline.DefaultOptionCreator;
-import org.apache.mahout.math.NamedVector;
-import org.apache.mahout.math.Vector;
-import org.apache.mahout.math.stats.LogLikelihood;
-import org.apache.mahout.utils.clustering.ClusterDumper;
-import org.apache.mahout.utils.vectors.TermEntry;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-/**
- * Get labels for the cluster using Log Likelihood Ratio (LLR).
- * <p/>
- * "The most useful way to think of this (LLR) is as the percentage of in-cluster documents that have the
- * feature (term) versus the percentage out, keeping in mind that both percentages are uncertain since we have
- * only a sample of all possible documents." - Ted Dunning
- * <p/>
- * More about LLR can be found at: http://tdunning.blogspot.com/2008/03/surprise-and-coincidence.html
- */
-public class ClusterLabels {
-
-  private static final Logger log = LoggerFactory.getLogger(ClusterLabels.class);
-
-  public static final int DEFAULT_MIN_IDS = 50;
-  public static final int DEFAULT_MAX_LABELS = 25;
-
-  private final String indexDir;
-  private final String contentField;
-  private String idField;
-  private final Map<Integer, List<WeightedPropertyVectorWritable>> clusterIdToPoints;
-  private String output;
-  private final int minNumIds;
-  private final int maxLabels;
-
-  public ClusterLabels(Path seqFileDir,
-                       Path pointsDir,
-                       String indexDir,
-                       String contentField,
-                       int minNumIds,
-                       int maxLabels) {
-    this.indexDir = indexDir;
-    this.contentField = contentField;
-    this.minNumIds = minNumIds;
-    this.maxLabels = maxLabels;
-    ClusterDumper clusterDumper = new ClusterDumper(seqFileDir, pointsDir);
-    this.clusterIdToPoints = clusterDumper.getClusterIdToPoints();
-  }
-
-  public void getLabels() throws IOException {
-
-    try (Writer writer = (this.output == null) ?
-        new OutputStreamWriter(System.out, Charsets.UTF_8) : Files.newWriter(new File(this.output), Charsets.UTF_8)){
-      for (Map.Entry<Integer, List<WeightedPropertyVectorWritable>> integerListEntry : clusterIdToPoints.entrySet()) {
-        List<WeightedPropertyVectorWritable> wpvws = integerListEntry.getValue();
-        List<TermInfoClusterInOut> termInfos = getClusterLabels(integerListEntry.getKey(), wpvws);
-        if (termInfos != null) {
-          writer.write('\n');
-          writer.write("Top labels for Cluster ");
-          writer.write(String.valueOf(integerListEntry.getKey()));
-          writer.write(" containing ");
-          writer.write(String.valueOf(wpvws.size()));
-          writer.write(" vectors");
-          writer.write('\n');
-          writer.write("Term \t\t LLR \t\t In-ClusterDF \t\t Out-ClusterDF ");
-          writer.write('\n');
-          for (TermInfoClusterInOut termInfo : termInfos) {
-            writer.write(termInfo.getTerm());
-            writer.write("\t\t");
-            writer.write(String.valueOf(termInfo.getLogLikelihoodRatio()));
-            writer.write("\t\t");
-            writer.write(String.valueOf(termInfo.getInClusterDF()));
-            writer.write("\t\t");
-            writer.write(String.valueOf(termInfo.getOutClusterDF()));
-            writer.write('\n');
-          }
-        }
-      }
-    }
-  }
-
-  /**
-   * Get the list of labels, sorted by best score.
-   */
-  protected List<TermInfoClusterInOut> getClusterLabels(Integer integer,
-                                                        Collection<WeightedPropertyVectorWritable> wpvws) throws IOException {
-
-    if (wpvws.size() < minNumIds) {
-      log.info("Skipping small cluster {} with size: {}", integer, wpvws.size());
-      return null;
-    }
-
-    log.info("Processing Cluster {} with {} documents", integer, wpvws.size());
-    Directory dir = FSDirectory.open(Paths.get(this.indexDir));
-    IndexReader reader = DirectoryReader.open(dir);
-
-    log.info("# of documents in the index {}", reader.numDocs());
-
-    Collection<String> idSet = new HashSet<>();
-    for (WeightedPropertyVectorWritable wpvw : wpvws) {
-      Vector vector = wpvw.getVector();
-      if (vector instanceof NamedVector) {
-        idSet.add(((NamedVector) vector).getName());
-      }
-    }
-
-    int numDocs = reader.numDocs();
-
-    FixedBitSet clusterDocBitset = getClusterDocBitset(reader, idSet, this.idField);
-
-    log.info("Populating term infos from the index");
-
-    /*
-     * This code is similar to that of CachedTermInfo, with one major change: how the document frequency
-     * is obtained.
-     *
-     * Since documents outside the cluster must be excluded, the document frequency for a term should only
-     * count the in-cluster documents. The document frequency obtained from the TermsEnum reflects the
-     * frequency in the entire index. To get the in-cluster frequency, we enumerate each term's postings,
-     * collect the live documents into a bitset, and intersect it with the cluster's document bitset; the
-     * cardinality of the intersection is the in-cluster document frequency.
-     */
-    Terms t = MultiFields.getTerms(reader, contentField);
-    TermsEnum te = t.iterator();
-    Map<String, TermEntry> termEntryMap = new LinkedHashMap<>();
-    Bits liveDocs = MultiFields.getLiveDocs(reader); //WARNING: returns null if there are no deletions
-
-
-    int count = 0;
-    BytesRef term;
-    while ((term = te.next()) != null) {
-      FixedBitSet termBitset = new FixedBitSet(reader.maxDoc());
-      PostingsEnum docsEnum = MultiFields.getTermDocsEnum(reader, contentField, term);
-      int docID;
-      while ((docID = docsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
-        // Count the document only if there are no deletions (liveDocs == null) or it is still live.
-        if (liveDocs == null || liveDocs.get(docID)) {
-          termBitset.set(docID);
-        }
-      }
-      // AND the term's bitset with cluster doc bitset to get the term's in-cluster frequency.
-      // This modifies the termBitset, but that's fine as we are not using it anywhere else.
-      termBitset.and(clusterDocBitset);
-      int inclusterDF = (int) termBitset.cardinality();
-
-      TermEntry entry = new TermEntry(term.utf8ToString(), count++, inclusterDF);
-      termEntryMap.put(entry.getTerm(), entry);
-
-    }
-
-    List<TermInfoClusterInOut> clusteredTermInfo = new LinkedList<>();
-
-    int clusterSize = wpvws.size();
-
-    for (TermEntry termEntry : termEntryMap.values()) {
-      int corpusDF = reader.docFreq(new Term(this.contentField, termEntry.getTerm()));
-      int outDF = corpusDF - termEntry.getDocFreq();
-      int inDF = termEntry.getDocFreq();
-      double logLikelihoodRatio = scoreDocumentFrequencies(inDF, outDF, clusterSize, numDocs);
-      TermInfoClusterInOut termInfoCluster =
-          new TermInfoClusterInOut(termEntry.getTerm(), inDF, outDF, logLikelihoodRatio);
-      clusteredTermInfo.add(termInfoCluster);
-    }
-
-    Collections.sort(clusteredTermInfo);
-    // Cleanup
-    Closeables.close(reader, true);
-    termEntryMap.clear();
-
-    return clusteredTermInfo.subList(0, Math.min(clusteredTermInfo.size(), maxLabels));
-  }
-
-  private static FixedBitSet getClusterDocBitset(IndexReader reader,
-                                                Collection<String> idSet,
-                                                String idField) throws IOException {
-    int numDocs = reader.numDocs();
-
-    FixedBitSet bitset = new FixedBitSet(numDocs);
-    
-    Set<String> idFieldSelector = null;
-    if (idField != null) {
-      idFieldSelector = new TreeSet<>();
-      idFieldSelector.add(idField);
-    }
-
-    for (int i = 0; i < numDocs; i++) {
-      String id;
-      // Use Lucene's internal ID if idField is not specified. Else, get it from the document.
-      if (idField == null) {
-        id = Integer.toString(i);
-      } else {
-        id = reader.document(i, idFieldSelector).get(idField);
-      }
-      if (idSet.contains(id)) {
-        bitset.set(i);
-      }
-    }
-    log.info("Created bitset for in-cluster documents : {}", bitset.cardinality());
-    return bitset;
-  }
-
-  private static double scoreDocumentFrequencies(long inDF, long outDF, long clusterSize, long corpusSize) {
-    long k12 = clusterSize - inDF;
-    long k22 = corpusSize - clusterSize - outDF;
-
-    return LogLikelihood.logLikelihoodRatio(inDF, k12, outDF, k22);
-  }
-
-  public String getIdField() {
-    return idField;
-  }
-
-  public void setIdField(String idField) {
-    this.idField = idField;
-  }
-
-  public String getOutput() {
-    return output;
-  }
-
-  public void setOutput(String output) {
-    this.output = output;
-  }
-
-  public static void main(String[] args) {
-
-    DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
-    ArgumentBuilder abuilder = new ArgumentBuilder();
-    GroupBuilder gbuilder = new GroupBuilder();
-
-    Option indexOpt = obuilder.withLongName("dir").withRequired(true).withArgument(
-        abuilder.withName("dir").withMinimum(1).withMaximum(1).create())
-        .withDescription("The Lucene index directory").withShortName("d").create();
-
-    Option outputOpt = obuilder.withLongName("output").withRequired(false).withArgument(
-        abuilder.withName("output").withMinimum(1).withMaximum(1).create()).withDescription(
-        "The output file. If not specified, the result is printed on console.").withShortName("o").create();
-
-    Option fieldOpt = obuilder.withLongName("field").withRequired(true).withArgument(
-        abuilder.withName("field").withMinimum(1).withMaximum(1).create())
-        .withDescription("The content field in the index").withShortName("f").create();
-
-    Option idFieldOpt = obuilder.withLongName("idField").withRequired(false).withArgument(
-        abuilder.withName("idField").withMinimum(1).withMaximum(1).create()).withDescription(
-        "The field for the document ID in the index.  If null, then the Lucene internal doc "
-            + "id is used which is prone to error if the underlying index changes").withShortName("i").create();
-
-    Option seqOpt = obuilder.withLongName("seqFileDir").withRequired(true).withArgument(
-        abuilder.withName("seqFileDir").withMinimum(1).withMaximum(1).create()).withDescription(
-        "The directory containing Sequence Files for the Clusters").withShortName("s").create();
-
-    Option pointsOpt = obuilder.withLongName("pointsDir").withRequired(true).withArgument(
-        abuilder.withName("pointsDir").withMinimum(1).withMaximum(1).create()).withDescription(
-        "The directory containing points sequence files mapping input vectors to their cluster.  ")
-        .withShortName("p").create();
-    Option minClusterSizeOpt = obuilder.withLongName("minClusterSize").withRequired(false).withArgument(
-        abuilder.withName("minClusterSize").withMinimum(1).withMaximum(1).create()).withDescription(
-        "The minimum number of points required in a cluster to print the labels for").withShortName("m").create();
-    Option maxLabelsOpt = obuilder.withLongName("maxLabels").withRequired(false).withArgument(
-        abuilder.withName("maxLabels").withMinimum(1).withMaximum(1).create()).withDescription(
-        "The maximum number of labels to print per cluster").withShortName("x").create();
-    Option helpOpt = DefaultOptionCreator.helpOption();
-
-    Group group = gbuilder.withName("Options").withOption(indexOpt).withOption(idFieldOpt).withOption(outputOpt)
-        .withOption(fieldOpt).withOption(seqOpt).withOption(pointsOpt).withOption(helpOpt)
-        .withOption(maxLabelsOpt).withOption(minClusterSizeOpt).create();
-
-    try {
-      Parser parser = new Parser();
-      parser.setGroup(group);
-      CommandLine cmdLine = parser.parse(args);
-
-      if (cmdLine.hasOption(helpOpt)) {
-        CommandLineUtil.printHelp(group);
-        return;
-      }
-
-      Path seqFileDir = new Path(cmdLine.getValue(seqOpt).toString());
-      Path pointsDir = new Path(cmdLine.getValue(pointsOpt).toString());
-      String indexDir = cmdLine.getValue(indexOpt).toString();
-      String contentField = cmdLine.getValue(fieldOpt).toString();
-
-      String idField = null;
-
-      if (cmdLine.hasOption(idFieldOpt)) {
-        idField = cmdLine.getValue(idFieldOpt).toString();
-      }
-      String output = null;
-      if (cmdLine.hasOption(outputOpt)) {
-        output = cmdLine.getValue(outputOpt).toString();
-      }
-      int maxLabels = DEFAULT_MAX_LABELS;
-      if (cmdLine.hasOption(maxLabelsOpt)) {
-        maxLabels = Integer.parseInt(cmdLine.getValue(maxLabelsOpt).toString());
-      }
-      int minSize = DEFAULT_MIN_IDS;
-      if (cmdLine.hasOption(minClusterSizeOpt)) {
-        minSize = Integer.parseInt(cmdLine.getValue(minClusterSizeOpt).toString());
-      }
-      ClusterLabels clusterLabel = new ClusterLabels(seqFileDir, pointsDir, indexDir, contentField, minSize, maxLabels);
-
-      if (idField != null) {
-        clusterLabel.setIdField(idField);
-      }
-      if (output != null) {
-        clusterLabel.setOutput(output);
-      }
-
-      clusterLabel.getLabels();
-
-    } catch (OptionException e) {
-      log.error("Exception", e);
-      CommandLineUtil.printHelp(group);
-    } catch (IOException e) {
-      log.error("Exception", e);
-    }
-  }
-
-}
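
The scoreDocumentFrequencies helper maps its four counts onto the 2x2 contingency table that LogLikelihood.logLikelihoodRatio expects: k11 = inDF (in cluster, term present), k12 = clusterSize - inDF, k21 = outDF, k22 = corpusSize - clusterSize - outDF. A small self-contained sketch with made-up counts:

    import org.apache.mahout.math.stats.LogLikelihood;

    public class LlrDemo {
      public static void main(String[] args) {
        long inDF = 40, outDF = 10, clusterSize = 100, corpusSize = 10000;
        long k11 = inDF;                              // in cluster, term present
        long k12 = clusterSize - inDF;                // in cluster, term absent
        long k21 = outDF;                             // out of cluster, term present
        long k22 = corpusSize - clusterSize - outDF;  // out of cluster, term absent
        System.out.println(LogLikelihood.logLikelihoodRatio(k11, k12, k21, k22));
      }
    }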

http://git-wip-us.apache.org/repos/asf/mahout/blob/99a5358f/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/Driver.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/Driver.java b/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/Driver.java
deleted file mode 100644
index 876816f..0000000
--- a/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/Driver.java
+++ /dev/null
@@ -1,349 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- * <p/>
- * http://www.apache.org/licenses/LICENSE-2.0
- * <p/>
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.utils.vectors.lucene;
-
-import java.io.File;
-import java.io.IOException;
-import java.io.Writer;
-import java.nio.file.Paths;
-import java.util.Iterator;
-
-import com.google.common.base.Preconditions;
-import com.google.common.io.Files;
-import org.apache.commons.cli2.CommandLine;
-import org.apache.commons.cli2.Group;
-import org.apache.commons.cli2.Option;
-import org.apache.commons.cli2.OptionException;
-import org.apache.commons.cli2.builder.ArgumentBuilder;
-import org.apache.commons.cli2.builder.DefaultOptionBuilder;
-import org.apache.commons.cli2.builder.GroupBuilder;
-import org.apache.commons.cli2.commandline.Parser;
-import org.apache.commons.io.Charsets;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.io.IntWritable;
-import org.apache.hadoop.io.LongWritable;
-import org.apache.hadoop.io.SequenceFile;
-import org.apache.hadoop.io.Text;
-import org.apache.lucene.index.DirectoryReader;
-import org.apache.lucene.index.IndexReader;
-import org.apache.lucene.store.Directory;
-import org.apache.lucene.store.FSDirectory;
-import org.apache.mahout.common.CommandLineUtil;
-import org.apache.mahout.math.VectorWritable;
-import org.apache.mahout.utils.vectors.TermEntry;
-import org.apache.mahout.utils.vectors.TermInfo;
-import org.apache.mahout.utils.vectors.io.DelimitedTermInfoWriter;
-import org.apache.mahout.utils.vectors.io.SequenceFileVectorWriter;
-import org.apache.mahout.utils.vectors.io.VectorWriter;
-import org.apache.mahout.vectorizer.TF;
-import org.apache.mahout.vectorizer.TFIDF;
-import org.apache.mahout.vectorizer.Weight;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-public final class Driver {
-
-  private static final Logger log = LoggerFactory.getLogger(Driver.class);
-
-  private String luceneDir;
-  private String outFile;
-  private String field;
-  private String idField;
-  private String dictOut;
-  private String seqDictOut = "";
-  private String weightType = "tfidf";
-  private String delimiter = "\t";
-  private double norm = LuceneIterable.NO_NORMALIZING;
-  private long maxDocs = Long.MAX_VALUE;
-  private int minDf = 1;
-  private int maxDFPercent = 99;
-  private double maxPercentErrorDocs = 0.0;
-
-  public void dumpVectors() throws IOException {
-
-    File file = new File(luceneDir);
-    Preconditions.checkArgument(file.isDirectory(),
-        "Lucene directory: " + file.getAbsolutePath()
-            + " does not exist or is not a directory");
-    Preconditions.checkArgument(maxDocs >= 0, "maxDocs must be >= 0");
-    Preconditions.checkArgument(minDf >= 1, "minDf must be >= 1");
-    Preconditions.checkArgument(maxDFPercent <= 99, "maxDFPercent must be <= 99");
-
-    Directory dir = FSDirectory.open(Paths.get(file.getAbsolutePath()));
-    IndexReader reader = DirectoryReader.open(dir);
-
-    Weight weight;
-    if ("tf".equalsIgnoreCase(weightType)) {
-      weight = new TF();
-    } else if ("tfidf".equalsIgnoreCase(weightType)) {
-      weight = new TFIDF();
-    } else {
-      throw new IllegalArgumentException("Weight type " + weightType + " is not supported");
-    }
-
-    TermInfo termInfo = new CachedTermInfo(reader, field, minDf, maxDFPercent);
-
-    LuceneIterable iterable;
-    if (norm == LuceneIterable.NO_NORMALIZING) {
-      iterable = new LuceneIterable(reader, idField, field, termInfo, weight, LuceneIterable.NO_NORMALIZING,
-          maxPercentErrorDocs);
-    } else {
-      iterable = new LuceneIterable(reader, idField, field, termInfo, weight, norm, maxPercentErrorDocs);
-    }
-
-    log.info("Output File: {}", outFile);
-
-    try (VectorWriter vectorWriter = getSeqFileWriter(outFile)) {
-      long numDocs = vectorWriter.write(iterable, maxDocs);
-      log.info("Wrote: {} vectors", numDocs);
-    }
-
-    File dictOutFile = new File(dictOut);
-    log.info("Dictionary Output file: {}", dictOutFile);
-    Writer writer = Files.newWriter(dictOutFile, Charsets.UTF_8);
-    try (DelimitedTermInfoWriter tiWriter = new DelimitedTermInfoWriter(writer, delimiter, field)) {
-      tiWriter.write(termInfo);
-    }
-
-    if (!"".equals(seqDictOut)) {
-      log.info("SequenceFile Dictionary Output file: {}", seqDictOut);
-
-      Path path = new Path(seqDictOut);
-      Configuration conf = new Configuration();
-      FileSystem fs = FileSystem.get(conf);
-      try (SequenceFile.Writer seqWriter = SequenceFile.createWriter(fs, conf, path, Text.class, IntWritable.class)) {
-        Text term = new Text();
-        IntWritable termIndex = new IntWritable();
-        Iterator<TermEntry> termEntries = termInfo.getAllEntries();
-        while (termEntries.hasNext()) {
-          TermEntry termEntry = termEntries.next();
-          term.set(termEntry.getTerm());
-          termIndex.set(termEntry.getTermIdx());
-          seqWriter.append(term, termIndex);
-        }
-      }
-    }
-  }
-
-  public static void main(String[] args) throws IOException {
-
-    DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
-    ArgumentBuilder abuilder = new ArgumentBuilder();
-    GroupBuilder gbuilder = new GroupBuilder();
-
-    Option inputOpt = obuilder.withLongName("dir").withRequired(true).withArgument(
-        abuilder.withName("dir").withMinimum(1).withMaximum(1).create())
-        .withDescription("The Lucene directory").withShortName("d").create();
-
-    Option outputOpt = obuilder.withLongName("output").withRequired(true).withArgument(
-        abuilder.withName("output").withMinimum(1).withMaximum(1).create()).withDescription("The output file")
-        .withShortName("o").create();
-
-    Option fieldOpt = obuilder.withLongName("field").withRequired(true).withArgument(
-        abuilder.withName("field").withMinimum(1).withMaximum(1).create()).withDescription(
-        "The field in the index").withShortName("f").create();
-
-    Option idFieldOpt = obuilder.withLongName("idField").withRequired(false).withArgument(
-        abuilder.withName("idField").withMinimum(1).withMaximum(1).create()).withDescription(
-        "The field in the index containing the index.  If null, then the Lucene internal doc "
-            + "id is used which is prone to error if the underlying index changes").create();
-
-    Option dictOutOpt = obuilder.withLongName("dictOut").withRequired(true).withArgument(
-        abuilder.withName("dictOut").withMinimum(1).withMaximum(1).create()).withDescription(
-        "The output of the dictionary").withShortName("t").create();
-
-    Option seqDictOutOpt = obuilder.withLongName("seqDictOut").withRequired(false).withArgument(
-        abuilder.withName("seqDictOut").withMinimum(1).withMaximum(1).create()).withDescription(
-        "The output of the dictionary as sequence file").withShortName("st").create();
-
-    Option weightOpt = obuilder.withLongName("weight").withRequired(false).withArgument(
-        abuilder.withName("weight").withMinimum(1).withMaximum(1).create()).withDescription(
-        "The kind of weight to use. Currently TF or TFIDF").withShortName("w").create();
-
-    Option delimiterOpt = obuilder.withLongName("delimiter").withRequired(false).withArgument(
-        abuilder.withName("delimiter").withMinimum(1).withMaximum(1).create()).withDescription(
-        "The delimiter for outputting the dictionary").withShortName("l").create();
-
-    Option powerOpt = obuilder.withLongName("norm").withRequired(false).withArgument(
-        abuilder.withName("norm").withMinimum(1).withMaximum(1).create()).withDescription(
-        "The norm to use, expressed as either a double or \"INF\" if you want to use the Infinite norm.  "
-            + "Must be greater or equal to 0.  The default is not to normalize").withShortName("n").create();
-
-    Option maxOpt = obuilder.withLongName("max").withRequired(false).withArgument(
-        abuilder.withName("max").withMinimum(1).withMaximum(1).create()).withDescription(
-        "The maximum number of vectors to output.  If not specified, then it will loop over all docs")
-        .withShortName("m").create();
-
-    Option minDFOpt = obuilder.withLongName("minDF").withRequired(false).withArgument(
-        abuilder.withName("minDF").withMinimum(1).withMaximum(1).create()).withDescription(
-        "The minimum document frequency.  Default is 1").withShortName("md").create();
-
-    Option maxDFPercentOpt = obuilder.withLongName("maxDFPercent").withRequired(false).withArgument(
-        abuilder.withName("maxDFPercent").withMinimum(1).withMaximum(1).create()).withDescription(
-        "The max percentage of docs for the DF.  Can be used to remove really high frequency terms."
-            + "  Expressed as an integer between 0 and 100. Default is 99.").withShortName("x").create();
-
-    Option maxPercentErrorDocsOpt = obuilder.withLongName("maxPercentErrorDocs").withRequired(false).withArgument(
-        abuilder.withName("maxPercentErrorDocs").withMinimum(1).withMaximum(1).create()).withDescription(
-        "The max percentage of docs that can have a null term vector. These are noise document and can occur if the "
-            + "analyzer used strips out all terms in the target field. This percentage is expressed as a value "
-            + "between 0 and 1. The default is 0.").withShortName("err").create();
-
-    Option helpOpt = obuilder.withLongName("help").withDescription("Print out help").withShortName("h")
-        .create();
-
-    Group group = gbuilder.withName("Options").withOption(inputOpt).withOption(idFieldOpt).withOption(
-        outputOpt).withOption(delimiterOpt).withOption(helpOpt).withOption(fieldOpt).withOption(maxOpt)
-        .withOption(dictOutOpt).withOption(seqDictOutOpt).withOption(powerOpt).withOption(maxDFPercentOpt)
-        .withOption(weightOpt).withOption(minDFOpt).withOption(maxPercentErrorDocsOpt).create();
-
-    try {
-      Parser parser = new Parser();
-      parser.setGroup(group);
-      CommandLine cmdLine = parser.parse(args);
-
-      if (cmdLine.hasOption(helpOpt)) {
-        CommandLineUtil.printHelp(group);
-        return;
-      }
-
-      if (cmdLine.hasOption(inputOpt)) { // Lucene case
-        Driver luceneDriver = new Driver();
-        luceneDriver.setLuceneDir(cmdLine.getValue(inputOpt).toString());
-
-        if (cmdLine.hasOption(maxOpt)) {
-          luceneDriver.setMaxDocs(Long.parseLong(cmdLine.getValue(maxOpt).toString()));
-        }
-
-        if (cmdLine.hasOption(weightOpt)) {
-          luceneDriver.setWeightType(cmdLine.getValue(weightOpt).toString());
-        }
-
-        luceneDriver.setField(cmdLine.getValue(fieldOpt).toString());
-
-        if (cmdLine.hasOption(minDFOpt)) {
-          luceneDriver.setMinDf(Integer.parseInt(cmdLine.getValue(minDFOpt).toString()));
-        }
-
-        if (cmdLine.hasOption(maxDFPercentOpt)) {
-          luceneDriver.setMaxDFPercent(Integer.parseInt(cmdLine.getValue(maxDFPercentOpt).toString()));
-        }
-
-        if (cmdLine.hasOption(powerOpt)) {
-          String power = cmdLine.getValue(powerOpt).toString();
-          if ("INF".equals(power)) {
-            luceneDriver.setNorm(Double.POSITIVE_INFINITY);
-          } else {
-            luceneDriver.setNorm(Double.parseDouble(power));
-          }
-        }
-
-        if (cmdLine.hasOption(idFieldOpt)) {
-          luceneDriver.setIdField(cmdLine.getValue(idFieldOpt).toString());
-        }
-
-        if (cmdLine.hasOption(maxPercentErrorDocsOpt)) {
-          luceneDriver.setMaxPercentErrorDocs(Double.parseDouble(cmdLine.getValue(maxPercentErrorDocsOpt).toString()));
-        }
-
-        luceneDriver.setOutFile(cmdLine.getValue(outputOpt).toString());
-
-        luceneDriver.setDelimiter(cmdLine.hasOption(delimiterOpt) ? cmdLine.getValue(delimiterOpt).toString() : "\t");
-
-        luceneDriver.setDictOut(cmdLine.getValue(dictOutOpt).toString());
-
-        if (cmdLine.hasOption(seqDictOutOpt)) {
-          luceneDriver.setSeqDictOut(cmdLine.getValue(seqDictOutOpt).toString());
-        }
-
-        luceneDriver.dumpVectors();
-      }
-    } catch (OptionException e) {
-      log.error("Exception", e);
-      CommandLineUtil.printHelp(group);
-    }
-  }
-
-  private static VectorWriter getSeqFileWriter(String outFile) throws IOException {
-    Path path = new Path(outFile);
-    Configuration conf = new Configuration();
-    FileSystem fs = FileSystem.get(conf);
-    // TODO: Make this parameter driven
-
-    SequenceFile.Writer seqWriter = SequenceFile.createWriter(fs, conf, path, LongWritable.class,
-        VectorWritable.class);
-
-    return new SequenceFileVectorWriter(seqWriter);
-  }
-
-  public void setLuceneDir(String luceneDir) {
-    this.luceneDir = luceneDir;
-  }
-
-  public void setMaxDocs(long maxDocs) {
-    this.maxDocs = maxDocs;
-  }
-
-  public void setWeightType(String weightType) {
-    this.weightType = weightType;
-  }
-
-  public void setField(String field) {
-    this.field = field;
-  }
-
-  public void setMinDf(int minDf) {
-    this.minDf = minDf;
-  }
-
-  public void setMaxDFPercent(int maxDFPercent) {
-    this.maxDFPercent = maxDFPercent;
-  }
-
-  public void setNorm(double norm) {
-    this.norm = norm;
-  }
-
-  public void setIdField(String idField) {
-    this.idField = idField;
-  }
-
-  public void setOutFile(String outFile) {
-    this.outFile = outFile;
-  }
-
-  public void setDelimiter(String delimiter) {
-    this.delimiter = delimiter;
-  }
-
-  public void setDictOut(String dictOut) {
-    this.dictOut = dictOut;
-  }
-
-  public void setSeqDictOut(String seqDictOut) {
-    this.seqDictOut = seqDictOut;
-  }
-
-  public void setMaxPercentErrorDocs(double maxPercentErrorDocs) {
-    this.maxPercentErrorDocs = maxPercentErrorDocs;
-  }
-}
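
Besides the command line, the Driver could be driven programmatically through its setters. A hypothetical sketch (paths, field names, and the demo class are illustrative):

    import org.apache.mahout.utils.vectors.lucene.Driver;

    public class DumpVectorsDemo {
      public static void main(String[] args) throws Exception {
        Driver driver = new Driver();
        driver.setLuceneDir("/path/to/index");
        driver.setField("body");                 // field must have term vectors stored
        driver.setIdField("id");
        driver.setOutFile("/tmp/vectors.seq");   // SequenceFile of LongWritable/VectorWritable
        driver.setDictOut("/tmp/dict.txt");      // delimited term dictionary
        driver.setWeightType("tfidf");           // "tf" is the other supported weight
        driver.setNorm(2.0);                     // L2 norm; the default is no normalization
        driver.dumpVectors();
      }
    }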

http://git-wip-us.apache.org/repos/asf/mahout/blob/99a5358f/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterable.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterable.java b/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterable.java
deleted file mode 100644
index 1af0ed0..0000000
--- a/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterable.java
+++ /dev/null
@@ -1,80 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.utils.vectors.lucene;
-
-import org.apache.lucene.index.IndexReader;
-import org.apache.mahout.math.Vector;
-import org.apache.mahout.utils.vectors.TermInfo;
-import org.apache.mahout.vectorizer.Weight;
-
-import java.util.Iterator;
-
-/**
- * {@link Iterable} counterpart to {@link LuceneIterator}.
- */
-public final class LuceneIterable implements Iterable<Vector> {
-
-  public static final double NO_NORMALIZING = -1.0;
-
-  private final IndexReader indexReader;
-  private final String field;
-  private final String idField;
-  private final TermInfo terminfo;
-  private final double normPower;
-  private final double maxPercentErrorDocs;
-  private final Weight weight;
-
-  public LuceneIterable(IndexReader reader, String idField, String field, TermInfo terminfo, Weight weight) {
-    this(reader, idField, field, terminfo, weight, NO_NORMALIZING);
-  }
-
-  public LuceneIterable(IndexReader indexReader, String idField, String field, TermInfo terminfo, Weight weight,
-      double normPower) {
-    this(indexReader, idField, field, terminfo, weight, normPower, 0);
-  }
-
-  /**
-   * Produce a LuceneIterable that can create the Vector plus normalize it.
-   *
-   * @param indexReader         {@link org.apache.lucene.index.IndexReader} to read the documents from.
-   * @param idField             field containing the id. May be null.
-   * @param field               field to use for the Vector
-   * @param normPower           the normalization value. Must be nonnegative, or {@link #NO_NORMALIZING}
-   * @param maxPercentErrorDocs the percentage of documents in the lucene index that can have a null term vector
-   */
-  public LuceneIterable(IndexReader indexReader,
-                        String idField,
-                        String field,
-                        TermInfo terminfo,
-                        Weight weight,
-                        double normPower,
-                        double maxPercentErrorDocs) {
-    this.indexReader = indexReader;
-    this.idField = idField;
-    this.field = field;
-    this.terminfo = terminfo;
-    this.normPower = normPower;
-    this.maxPercentErrorDocs = maxPercentErrorDocs;
-    this.weight = weight;
-  }
-
-  @Override
-  public Iterator<Vector> iterator() {
-    return new LuceneIterator(indexReader, idField, field, terminfo, weight, normPower, maxPercentErrorDocs);
-  }
-}
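
A hypothetical end-to-end sketch combining the removed classes (index path, field names, and the demo class are illustrative; per LuceneIterator's javadoc, the content field must have term vectors stored):

    import java.nio.file.Paths;
    import org.apache.lucene.index.DirectoryReader;
    import org.apache.lucene.index.IndexReader;
    import org.apache.lucene.store.FSDirectory;
    import org.apache.mahout.math.Vector;
    import org.apache.mahout.utils.vectors.TermInfo;
    import org.apache.mahout.utils.vectors.lucene.CachedTermInfo;
    import org.apache.mahout.utils.vectors.lucene.LuceneIterable;
    import org.apache.mahout.vectorizer.TFIDF;

    public class LuceneIterableDemo {
      public static void main(String[] args) throws Exception {
        try (IndexReader reader = DirectoryReader.open(FSDirectory.open(Paths.get("/path/to/index")))) {
          TermInfo termInfo = new CachedTermInfo(reader, "body", 1, 99);
          // the five-argument constructor defaults to NO_NORMALIZING
          for (Vector v : new LuceneIterable(reader, "id", "body", termInfo, new TFIDF())) {
            System.out.println(v.asFormatString());
          }
        }
      }
    }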

http://git-wip-us.apache.org/repos/asf/mahout/blob/99a5358f/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterator.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterator.java b/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterator.java
deleted file mode 100644
index 6a8c659..0000000
--- a/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterator.java
+++ /dev/null
@@ -1,99 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.utils.vectors.lucene;
-
-import java.io.IOException;
-import java.util.Set;
-import java.util.TreeSet;
-
-import com.google.common.base.Preconditions;
-import org.apache.lucene.index.IndexReader;
-import org.apache.mahout.utils.vectors.TermInfo;
-import org.apache.mahout.vectorizer.Weight;
-
-/**
- * An {@link java.util.Iterator} over {@link org.apache.mahout.math.Vector}s that uses a Lucene index as the source
- * for creating the {@link org.apache.mahout.math.Vector}s. The field used to create the vectors currently must have
- * term vectors stored for it.
- */
-public class LuceneIterator extends AbstractLuceneIterator {
-
-  protected final Set<String> idFieldSelector;
-  protected final String idField;
-
-  /**
-   * Produce a LuceneIterator that can create the Vector plus normalize it.
-   *
-   * @param indexReader {@link IndexReader} to read the documents from.
-   * @param idField     field containing the id. May be null.
-   * @param field       field to use for the Vector
-   * @param termInfo    termInfo
-   * @param weight      weight
-   * @param normPower   the normalization value. Must be non-negative, or {@link LuceneIterable#NO_NORMALIZING}
-   */
-  public LuceneIterator(IndexReader indexReader, String idField, String field, TermInfo termInfo, Weight weight,
-                        double normPower) {
-    this(indexReader, idField, field, termInfo, weight, normPower, 0.0);
-  }
-
-  /**
-   * @param indexReader {@link IndexReader} to read the documents from.
-   * @param idField    field containing the id. May be null.
-   * @param field      field to use for the Vector
-   * @param termInfo   termInfo
-   * @param weight     weight
-   * @param normPower  the normalization value. Must be non-negative, or {@link LuceneIterable#NO_NORMALIZING}
-   * @param maxPercentErrorDocs the maximum fraction of documents tolerated without a term frequency vector. In [0,1].
-   * @see #LuceneIterator(org.apache.lucene.index.IndexReader, String, String, org.apache.mahout.utils.vectors.TermInfo,
-   * org.apache.mahout.vectorizer.Weight, double)
-   */
-  public LuceneIterator(IndexReader indexReader,
-                        String idField,
-                        String field,
-                        TermInfo termInfo,
-                        Weight weight,
-                        double normPower,
-                        double maxPercentErrorDocs) {
-    super(termInfo, normPower, indexReader, weight, maxPercentErrorDocs, field);
-    Preconditions.checkArgument(normPower == LuceneIterable.NO_NORMALIZING || normPower >= 0,
-        "normPower must be non-negative or -1, but normPower = " + normPower);
-    Preconditions.checkArgument(maxPercentErrorDocs >= 0.0 && maxPercentErrorDocs <= 1.0,
-        "Must be: 0.0 <= maxPercentErrorDocs <= 1.0");
-    this.idField = idField;
-    if (idField != null) {
-      idFieldSelector = new TreeSet<>();
-      idFieldSelector.add(idField);
-    } else {
-      /* The field in the index containing the document id. If null, then the Lucene internal doc id
-         is used, which is prone to error if the underlying index changes. */
-      idFieldSelector = null;
-    }
-  }
-
-  @Override
-  protected String getVectorName(int documentIndex) throws IOException {
-    String name;
-    if (idField != null) {
-      name = indexReader.document(documentIndex, idFieldSelector).get(idField);
-    } else {
-      name = String.valueOf(documentIndex);
-    }
-    return name;
-  }
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/99a5358f/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/TFDFMapper.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/TFDFMapper.java b/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/TFDFMapper.java
deleted file mode 100644
index 5830ccc..0000000
--- a/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/TFDFMapper.java
+++ /dev/null
@@ -1,64 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.utils.vectors.lucene;
-
-import org.apache.lucene.util.BytesRef;
-import org.apache.mahout.math.RandomAccessSparseVector;
-import org.apache.mahout.math.Vector;
-import org.apache.mahout.utils.vectors.TermEntry;
-import org.apache.mahout.utils.vectors.TermInfo;
-import org.apache.mahout.vectorizer.Weight;
-
-
-/**
- * Not thread-safe
- */
-public class TFDFMapper {
-
-  private Vector vector;
-  
-  private final Weight weight;
-  private long numTerms;
-  private final TermInfo termInfo;
-  private String field;
-  private final int numDocs;
-  
-  public TFDFMapper(int numDocs, Weight weight, TermInfo termInfo) {
-    this.weight = weight;
-    this.termInfo = termInfo;
-    this.numDocs = numDocs;
-  }
-
-  public void setExpectations(String field, long numTerms) {
-    this.field = field;
-    vector = new RandomAccessSparseVector(termInfo.totalTerms(field));
-    this.numTerms = numTerms;
-  }
-  
-  public void map(BytesRef term, int frequency) {
-    TermEntry entry = termInfo.getTermEntry(field, term.utf8ToString());
-    if (entry != null) {
-      vector.setQuick(entry.getTermIdx(), weight.calculate(frequency, entry.getDocFreq(), (int)numTerms, numDocs));
-    }
-  }
-  
-  public Vector getVector() {
-    return this.vector;
-  }
-  
-}
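
A minimal sketch of the call sequence the removed mapper expected: setExpectations() once per document, map() once per (term, frequency) pair, then getVector(). The "content" field, the "fleece" term and the counts below are hypothetical:

import org.apache.lucene.index.IndexReader;
import org.apache.lucene.util.BytesRef;
import org.apache.mahout.math.Vector;
import org.apache.mahout.utils.vectors.TermInfo;
import org.apache.mahout.utils.vectors.lucene.CachedTermInfo;
import org.apache.mahout.utils.vectors.lucene.TFDFMapper;
import org.apache.mahout.vectorizer.TFIDF;

public class TFDFMapperSketch {

  /** Builds one TFIDF-weighted vector from a single document's term frequencies. */
  static Vector vectorize(IndexReader reader, long termsInDoc) throws Exception {
    TermInfo termInfo = new CachedTermInfo(reader, "content", 1, 100);
    TFDFMapper mapper = new TFDFMapper(reader.numDocs(), new TFIDF(), termInfo);
    mapper.setExpectations("content", termsInDoc); // must precede any map() call
    mapper.map(new BytesRef("fleece"), 3);         // one call per (term, frequency) pair
    return mapper.getVector();                     // sparse vector, weighted by TFIDF
  }
}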

http://git-wip-us.apache.org/repos/asf/mahout/blob/99a5358f/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/TermInfoClusterInOut.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/TermInfoClusterInOut.java b/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/TermInfoClusterInOut.java
deleted file mode 100644
index b0311c7..0000000
--- a/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/TermInfoClusterInOut.java
+++ /dev/null
@@ -1,81 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.utils.vectors.lucene;
-
-import org.apache.mahout.common.RandomUtils;
-
-class TermInfoClusterInOut implements Comparable<TermInfoClusterInOut> {
-
-  private final String term;
-  private final int inClusterDF;
-  private final int outClusterDF;
-  private final double logLikelihoodRatio;
-
-  TermInfoClusterInOut(String term, int inClusterDF, int outClusterDF, double logLikelihoodRatio) {
-    this.term = term;
-    this.inClusterDF = inClusterDF;
-    this.outClusterDF = outClusterDF;
-    this.logLikelihoodRatio = logLikelihoodRatio;
-  }
-
-  @Override
-  public int hashCode() {
-    return term.hashCode() ^ inClusterDF ^ outClusterDF ^ RandomUtils.hashDouble(logLikelihoodRatio);
-  }
-
-  @Override
-  public boolean equals(Object o) {
-    if (!(o instanceof TermInfoClusterInOut)) {
-      return false;
-    }
-    TermInfoClusterInOut other = (TermInfoClusterInOut) o;
-    return term.equals(other.getTerm())
-        && inClusterDF == other.getInClusterDF()
-        && outClusterDF == other.getOutClusterDF()
-        && logLikelihoodRatio == other.getLogLikelihoodRatio();
-  }
-
-  @Override
-  public int compareTo(TermInfoClusterInOut that) {
-    int res = Double.compare(that.logLikelihoodRatio, logLikelihoodRatio);
-    if (res == 0) {
-      res = term.compareTo(that.term);
-    }
-    return res;
-  }
-
-  public int getInClusterDiff() {
-    return this.inClusterDF - this.outClusterDF;
-  }
-
-  String getTerm() {
-    return term;
-  }
-
-  int getInClusterDF() {
-    return inClusterDF;
-  }
-
-  int getOutClusterDF() {
-    return outClusterDF;
-  }
-
-  double getLogLikelihoodRatio() {
-    return logLikelihoodRatio;
-  }
-}
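
The compareTo() above ranks entries by descending log-likelihood ratio and breaks ties by the term's natural ordering. A sketch with hypothetical values; since the class is package-private, this assumes the same package:

package org.apache.mahout.utils.vectors.lucene;

import java.util.Arrays;

public class TermRankingSketch {
  public static void main(String[] args) {
    TermInfoClusterInOut[] ranked = {
        new TermInfoClusterInOut("whale", 5, 1, 42.0),
        new TermInfoClusterInOut("robber", 3, 9, 3.5),
        new TermInfoClusterInOut("fleece", 7, 2, 42.0),
    };
    Arrays.sort(ranked);
    // Resulting order: "fleece" (LLR 42.0; the tie with "whale" is broken
    // alphabetically), then "whale" (42.0), then "robber" (3.5).
    for (TermInfoClusterInOut entry : ranked) {
      System.out.println(entry.getTerm() + ' ' + entry.getLogLikelihoodRatio());
    }
  }
}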

http://git-wip-us.apache.org/repos/asf/mahout/blob/99a5358f/integration/src/test/java/org/apache/mahout/cf/taste/impl/similarity/jdbc/MySQLJDBCInMemoryItemSimilarityTest.java
----------------------------------------------------------------------
diff --git a/integration/src/test/java/org/apache/mahout/cf/taste/impl/similarity/jdbc/MySQLJDBCInMemoryItemSimilarityTest.java b/integration/src/test/java/org/apache/mahout/cf/taste/impl/similarity/jdbc/MySQLJDBCInMemoryItemSimilarityTest.java
deleted file mode 100644
index 463a45f..0000000
--- a/integration/src/test/java/org/apache/mahout/cf/taste/impl/similarity/jdbc/MySQLJDBCInMemoryItemSimilarityTest.java
+++ /dev/null
@@ -1,79 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.cf.taste.impl.similarity.jdbc;
-
-import org.apache.mahout.cf.taste.impl.TasteTestCase;
-import org.apache.mahout.cf.taste.similarity.ItemSimilarity;
-import org.easymock.EasyMock;
-import org.junit.Test;
-
-import javax.sql.DataSource;
-import java.sql.Connection;
-import java.sql.PreparedStatement;
-import java.sql.ResultSet;
-
-public class MySQLJDBCInMemoryItemSimilarityTest extends TasteTestCase {
-
-  @Test
-  public void testMemoryLoad() throws Exception {
-
-    DataSource dataSource = EasyMock.createMock(DataSource.class);
-    Connection connection = EasyMock.createMock(Connection.class);
-    PreparedStatement statement = EasyMock.createMock(PreparedStatement.class);
-    ResultSet resultSet = EasyMock.createMock(ResultSet.class);
-
-    EasyMock.expect(dataSource.getConnection()).andReturn(connection);
-    EasyMock.expect(connection.prepareStatement(MySQLJDBCInMemoryItemSimilarity.DEFAULT_GET_ALL_ITEMSIMILARITIES_SQL,
-        ResultSet.TYPE_FORWARD_ONLY, ResultSet.CONCUR_READ_ONLY)).andReturn(statement);
-    statement.setFetchDirection(ResultSet.FETCH_FORWARD);
-    EasyMock.expect(statement.executeQuery()).andReturn(resultSet);
-
-    EasyMock.expect(resultSet.next()).andReturn(true);
-
-    EasyMock.expect(resultSet.getLong(1)).andReturn(1L);
-    EasyMock.expect(resultSet.getLong(2)).andReturn(2L);
-    EasyMock.expect(resultSet.getDouble(3)).andReturn(0.5);
-    EasyMock.expect(resultSet.next()).andReturn(true);
-
-    EasyMock.expect(resultSet.getLong(1)).andReturn(1L);
-    EasyMock.expect(resultSet.getLong(2)).andReturn(3L);
-    EasyMock.expect(resultSet.getDouble(3)).andReturn(0.4);
-    EasyMock.expect(resultSet.next()).andReturn(true);
-
-    EasyMock.expect(resultSet.getLong(1)).andReturn(3L);
-    EasyMock.expect(resultSet.getLong(2)).andReturn(4L);
-    EasyMock.expect(resultSet.getDouble(3)).andReturn(0.1);
-
-    EasyMock.expect(resultSet.next()).andReturn(false);
-
-    resultSet.close();
-    statement.close();
-    connection.close();
-
-    EasyMock.replay(dataSource, connection, statement, resultSet);
-
-    ItemSimilarity similarity = new MySQLJDBCInMemoryItemSimilarity(dataSource);
-
-    assertEquals(0.5, similarity.itemSimilarity(1L, 2L), EPSILON);
-    assertEquals(0.4, similarity.itemSimilarity(1L, 3L), EPSILON);
-    assertEquals(0.1, similarity.itemSimilarity(3L, 4L), EPSILON);
-    assertTrue(Double.isNaN(similarity.itemSimilarity(1L, 4L)));
-
-    EasyMock.verify(dataSource, connection, statement, resultSet);
-  }
-}
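
The removed test leans on EasyMock's record-replay-verify lifecycle. A stripped-down sketch of that pattern, with the exercised call inlined where the class under test would normally sit:

import java.sql.Connection;

import javax.sql.DataSource;

import org.easymock.EasyMock;

public class EasyMockLifecycleSketch {
  public static void main(String[] args) throws Exception {
    DataSource dataSource = EasyMock.createMock(DataSource.class);
    Connection connection = EasyMock.createMock(Connection.class);

    // Record: declare every call the code under test is expected to make.
    EasyMock.expect(dataSource.getConnection()).andReturn(connection);
    connection.close(); // void methods are recorded by simply invoking them

    // Replay: switch both mocks from recording mode to checking mode.
    EasyMock.replay(dataSource, connection);

    // Exercise (inlined here for brevity).
    dataSource.getConnection().close();

    // Verify: fails if any recorded expectation was never satisfied.
    EasyMock.verify(dataSource, connection);
  }
}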

http://git-wip-us.apache.org/repos/asf/mahout/blob/99a5358f/integration/src/test/java/org/apache/mahout/clustering/TestClusterDumper.java
----------------------------------------------------------------------
diff --git a/integration/src/test/java/org/apache/mahout/clustering/TestClusterDumper.java b/integration/src/test/java/org/apache/mahout/clustering/TestClusterDumper.java
deleted file mode 100644
index 01d46fc..0000000
--- a/integration/src/test/java/org/apache/mahout/clustering/TestClusterDumper.java
+++ /dev/null
@@ -1,236 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.clustering;
-
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.Iterator;
-import java.util.List;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.apache.lucene.analysis.standard.StandardAnalyzer;
-import org.apache.lucene.document.Document;
-import org.apache.lucene.document.Field;
-import org.apache.lucene.document.FieldType;
-import org.apache.lucene.document.StringField;
-import org.apache.lucene.index.DirectoryReader;
-import org.apache.lucene.index.IndexOptions;
-import org.apache.lucene.index.IndexReader;
-import org.apache.lucene.index.IndexWriter;
-import org.apache.lucene.index.IndexWriterConfig;
-import org.apache.lucene.store.RAMDirectory;
-import org.apache.mahout.clustering.fuzzykmeans.FuzzyKMeansDriver;
-import org.apache.mahout.clustering.kmeans.KMeansDriver;
-import org.apache.mahout.clustering.kmeans.RandomSeedGenerator;
-import org.apache.mahout.common.MahoutTestCase;
-import org.apache.mahout.common.distance.DistanceMeasure;
-import org.apache.mahout.common.distance.EuclideanDistanceMeasure;
-import org.apache.mahout.math.NamedVector;
-import org.apache.mahout.math.Vector;
-import org.apache.mahout.math.VectorWritable;
-import org.apache.mahout.utils.clustering.ClusterDumper;
-import org.apache.mahout.utils.vectors.TermEntry;
-import org.apache.mahout.utils.vectors.TermInfo;
-import org.apache.mahout.utils.vectors.lucene.CachedTermInfo;
-import org.apache.mahout.utils.vectors.lucene.LuceneIterable;
-import org.apache.mahout.vectorizer.TFIDF;
-import org.apache.mahout.vectorizer.Weight;
-import org.junit.Before;
-import org.junit.Test;
-
-public final class TestClusterDumper extends MahoutTestCase {
-
-  private static final String[] DOCS = {
-      "The quick red fox jumped over the lazy brown dogs.",
-      "The quick brown fox jumped over the lazy red dogs.",
-      "The quick red cat jumped over the lazy brown dogs.",
-      "The quick brown cat jumped over the lazy red dogs.",
-      "Mary had a little lamb whose fleece was white as snow.",
-      "Mary had a little goat whose fleece was white as snow.",
-      "Mary had a little lamb whose fleece was black as tar.",
-      "Dick had a little goat whose fleece was white as snow.",
-      "Moby Dick is a story of a whale and a man obsessed.",
-      "Moby Bob is a story of a walrus and a man obsessed.",
-      "Moby Dick is a story of a whale and a crazy man.",
-      "The robber wore a black fleece jacket and a baseball cap.",
-      "The robber wore a red fleece jacket and a baseball cap.",
-      "The robber wore a white fleece jacket and a baseball cap.",
-      "The English Springer Spaniel is the best of all dogs."};
-
-  private List<VectorWritable> sampleData;
-
-  private String[] termDictionary;
-
-  @Override
-  @Before
-  public void setUp() throws Exception {
-    super.setUp();
-    Configuration conf = getConfiguration();
-    FileSystem fs = FileSystem.get(conf);
-    // Create test data
-    getSampleData(DOCS);
-    ClusteringTestUtils.writePointsToFile(sampleData, true,
-        getTestTempFilePath("testdata/file1"), fs, conf);
-  }
-
-  private void getSampleData(String[] docs) throws IOException {
-    sampleData = new ArrayList<>();
-    RAMDirectory directory = new RAMDirectory();
-    try (IndexWriter writer = new IndexWriter(directory,
-        new IndexWriterConfig(new StandardAnalyzer()))) {
-      for (int i = 0; i < docs.length; i++) {
-        Document doc = new Document();
-        Field id = new StringField("id", "doc_" + i, Field.Store.YES);
-        doc.add(id);
-        // Store both position and offset information
-        FieldType fieldType = new FieldType();
-        fieldType.setStored(false);
-        fieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
-        fieldType.setTokenized(true);
-        fieldType.setStoreTermVectors(true);
-        fieldType.setStoreTermVectorPositions(true);
-        fieldType.setStoreTermVectorOffsets(true);
-        fieldType.freeze();
-        Field text = new Field("content", docs[i], fieldType);
-        doc.add(text);
-        writer.addDocument(doc);
-      }
-    }
-
-    IndexReader reader = DirectoryReader.open(directory);
-
-    Weight weight = new TFIDF();
-    TermInfo termInfo = new CachedTermInfo(reader, "content", 1, 100);
-
-    int numTerms = 0;
-    for (Iterator<TermEntry> it = termInfo.getAllEntries(); it.hasNext();) {
-      it.next();
-      numTerms++;
-    }
-    termDictionary = new String[numTerms];
-    int i = 0;
-    for (Iterator<TermEntry> it = termInfo.getAllEntries(); it.hasNext();) {
-      String term = it.next().getTerm();
-      termDictionary[i] = term;
-      System.out.println(i + " " + term);
-      i++;
-    }
-    Iterable<Vector> iterable = new LuceneIterable(reader, "id", "content",
-        termInfo, weight);
-
-    i = 0;
-    for (Vector vector : iterable) {
-      assertNotNull(vector);
-      NamedVector namedVector;
-      if (vector instanceof NamedVector) {
-        // rename it for testing purposes
-        namedVector = new NamedVector(((NamedVector) vector).getDelegate(),
-            "P(" + i + ')');
-
-      } else {
-        namedVector = new NamedVector(vector, "P(" + i + ')');
-      }
-      System.out.println(AbstractCluster.formatVector(namedVector,
-          termDictionary));
-      sampleData.add(new VectorWritable(namedVector));
-      i++;
-    }
-  }
-
-  /**
-   * Return the path to the final iteration's clusters
-   */
-  private static Path finalClusterPath(Configuration conf, Path output,
-      int maxIterations) throws IOException {
-    FileSystem fs = FileSystem.get(conf);
-    for (int i = maxIterations; i >= 0; i--) {
-      Path clusters = new Path(output, "clusters-" + i + "-final");
-      if (fs.exists(clusters)) {
-        return clusters;
-      }
-    }
-    return null;
-  }
-
-  @Test
-  public void testKmeans() throws Exception {
-    DistanceMeasure measure = new EuclideanDistanceMeasure();
-    Path input = getTestTempFilePath("input");
-    Path output = getTestTempDirPath("output");
-    Path initialPoints = new Path(output, Cluster.CLUSTERS_DIR + '0' + Cluster.FINAL_ITERATION_SUFFIX);
-    Configuration conf = getConfiguration();
-    FileSystem fs = FileSystem.get(conf);
-    // Write test data to file
-    ClusteringTestUtils.writePointsToFile(sampleData, input, fs, conf);
-    // Select initial centroids
-    RandomSeedGenerator.buildRandom(conf, input, initialPoints, 8, measure, 1L);
-    // Run k-means
-    Path kMeansOutput = new Path(output, "kmeans");
-    KMeansDriver.run(conf, getTestTempDirPath("testdata"), initialPoints, kMeansOutput, 0.001, 10, true, 0.0, false);
-    // Print out clusters
-    ClusterDumper clusterDumper = new ClusterDumper(finalClusterPath(conf,
-            output, 10), new Path(kMeansOutput, "clusteredPoints"));
-    clusterDumper.printClusters(termDictionary);
-  }
-
-  @Test
-  public void testJsonClusterDumper() throws Exception {
-    DistanceMeasure measure = new EuclideanDistanceMeasure();
-    Path input = getTestTempFilePath("input");
-    Path output = getTestTempDirPath("output");
-    Path initialPoints = new Path(output, Cluster.CLUSTERS_DIR + '0' + Cluster.FINAL_ITERATION_SUFFIX);
-    Configuration conf = getConfiguration();
-    FileSystem fs = FileSystem.get(conf);
-    // Write test data to file
-    ClusteringTestUtils.writePointsToFile(sampleData, input, fs, conf);
-    // Select initial centroids
-    RandomSeedGenerator.buildRandom(conf, input, initialPoints, 8, measure, 1L);
-    // Run k-means
-    Path kmeansOutput = new Path(output, "kmeans");
-    KMeansDriver.run(conf, getTestTempDirPath("testdata"), initialPoints, kmeansOutput, 0.001, 10, true, 0.0, false);
-    // Print out clusters
-    ClusterDumper clusterDumper = new ClusterDumper(finalClusterPath(conf,
-        output, 10), new Path(kmeansOutput, "clusteredPoints"));
-    clusterDumper.setOutputFormat(ClusterDumper.OUTPUT_FORMAT.JSON);
-    clusterDumper.printClusters(termDictionary);
-  }
-
-  @Test
-  public void testFuzzyKmeans() throws Exception {
-    DistanceMeasure measure = new EuclideanDistanceMeasure();
-    Path input = getTestTempFilePath("input");
-    Path output = getTestTempDirPath("output");
-    Path initialPoints = new Path(output, Cluster.CLUSTERS_DIR + '0' + Cluster.FINAL_ITERATION_SUFFIX);
-    Configuration conf = getConfiguration();
-    FileSystem fs = FileSystem.get(conf);
-    // Write test data to file
-    ClusteringTestUtils.writePointsToFile(sampleData, input, fs, conf);
-    // Select initial centroids
-    RandomSeedGenerator.buildRandom(conf, input, initialPoints, 8, measure, 1L);
-    // Run fuzzy k-means
-    Path kMeansOutput = new Path(output, "kmeans");
-    FuzzyKMeansDriver.run(conf, getTestTempDirPath("testdata"), initialPoints, kMeansOutput, 0.001, 10, 1.1f, true,
-        true, 0, true);
-    // Print out clusters
-    ClusterDumper clusterDumper = new ClusterDumper(finalClusterPath(conf,
-        output, 10), new Path(kMeansOutput, "clusteredPoints"));
-    clusterDumper.printClusters(termDictionary);
-  }
-}
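
The finalClusterPath() helper in the removed test encodes the clustering drivers' output convention: each iteration i writes a clusters-i directory, the converged iteration carries an extra "-final" suffix, and scanning downward from maxIterations finds the most recent one. A rough sketch of the layout these tests produce (iteration counts vary per run):

    output/
      clusters-0-final/                     seed clusters written by RandomSeedGenerator
      kmeans/
        clusters-0/ ... clusters-N-final/   per-iteration clusters from the driver
        clusteredPoints/                    point-to-cluster assignments read by ClusterDumper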

