lucene-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From tomm...@apache.org
Subject svn commit: r1540675 - in /lucene/dev/branches/branch_4x: ./ dev-tools/ lucene/ lucene/analysis/ lucene/analysis/icu/src/java/org/apache/lucene/collation/ lucene/backwards/ lucene/benchmark/ lucene/classification/ lucene/classification/src/ lucene/clas...
Date Mon, 11 Nov 2013 11:47:16 GMT
Author: tommaso
Date: Mon Nov 11 11:47:14 2013
New Revision: 1540675

URL: http://svn.apache.org/r1540675
Log:
LUCENE-5311 - backport to branch_4x

Modified:
    lucene/dev/branches/branch_4x/   (props changed)
    lucene/dev/branches/branch_4x/dev-tools/   (props changed)
    lucene/dev/branches/branch_4x/lucene/   (props changed)
    lucene/dev/branches/branch_4x/lucene/BUILD.txt   (props changed)
    lucene/dev/branches/branch_4x/lucene/CHANGES.txt   (props changed)
    lucene/dev/branches/branch_4x/lucene/JRE_VERSION_MIGRATION.txt   (props changed)
    lucene/dev/branches/branch_4x/lucene/LICENSE.txt   (props changed)
    lucene/dev/branches/branch_4x/lucene/MIGRATE.txt   (props changed)
    lucene/dev/branches/branch_4x/lucene/NOTICE.txt   (props changed)
    lucene/dev/branches/branch_4x/lucene/README.txt   (props changed)
    lucene/dev/branches/branch_4x/lucene/SYSTEM_REQUIREMENTS.txt   (props changed)
    lucene/dev/branches/branch_4x/lucene/analysis/   (props changed)
    lucene/dev/branches/branch_4x/lucene/analysis/icu/src/java/org/apache/lucene/collation/ICUCollationKeyFilterFactory.java   (props changed)
    lucene/dev/branches/branch_4x/lucene/backwards/   (props changed)
    lucene/dev/branches/branch_4x/lucene/benchmark/   (props changed)
    lucene/dev/branches/branch_4x/lucene/build.xml   (props changed)
    lucene/dev/branches/branch_4x/lucene/classification/   (props changed)
    lucene/dev/branches/branch_4x/lucene/classification/build.xml   (props changed)
    lucene/dev/branches/branch_4x/lucene/classification/ivy.xml   (props changed)
    lucene/dev/branches/branch_4x/lucene/classification/src/   (props changed)
    lucene/dev/branches/branch_4x/lucene/classification/src/java/org/apache/lucene/classification/ClassificationResult.java
    lucene/dev/branches/branch_4x/lucene/classification/src/java/org/apache/lucene/classification/Classifier.java
    lucene/dev/branches/branch_4x/lucene/classification/src/java/org/apache/lucene/classification/KNearestNeighborClassifier.java
    lucene/dev/branches/branch_4x/lucene/classification/src/java/org/apache/lucene/classification/SimpleNaiveBayesClassifier.java
    lucene/dev/branches/branch_4x/lucene/classification/src/java/org/apache/lucene/classification/utils/DatasetSplitter.java
    lucene/dev/branches/branch_4x/lucene/codecs/   (props changed)
    lucene/dev/branches/branch_4x/lucene/common-build.xml   (props changed)
    lucene/dev/branches/branch_4x/lucene/core/   (props changed)
    lucene/dev/branches/branch_4x/lucene/core/src/test/org/apache/lucene/index/TestBackwardsCompatibility.java   (props changed)
    lucene/dev/branches/branch_4x/lucene/core/src/test/org/apache/lucene/index/index.40.cfs.zip   (props changed)
    lucene/dev/branches/branch_4x/lucene/core/src/test/org/apache/lucene/index/index.40.nocfs.zip   (props changed)
    lucene/dev/branches/branch_4x/lucene/core/src/test/org/apache/lucene/index/index.40.optimized.cfs.zip   (props changed)
    lucene/dev/branches/branch_4x/lucene/core/src/test/org/apache/lucene/index/index.40.optimized.nocfs.zip   (props changed)
    lucene/dev/branches/branch_4x/lucene/core/src/test/org/apache/lucene/search/TestSort.java   (props changed)
    lucene/dev/branches/branch_4x/lucene/core/src/test/org/apache/lucene/search/TestSortDocValues.java   (props changed)
    lucene/dev/branches/branch_4x/lucene/core/src/test/org/apache/lucene/search/TestSortRandom.java   (props changed)
    lucene/dev/branches/branch_4x/lucene/core/src/test/org/apache/lucene/search/TestTopFieldCollector.java   (props changed)
    lucene/dev/branches/branch_4x/lucene/core/src/test/org/apache/lucene/search/TestTotalHitCountCollector.java   (props changed)
    lucene/dev/branches/branch_4x/lucene/demo/   (props changed)
    lucene/dev/branches/branch_4x/lucene/expressions/   (props changed)
    lucene/dev/branches/branch_4x/lucene/facet/   (props changed)
    lucene/dev/branches/branch_4x/lucene/grouping/   (props changed)
    lucene/dev/branches/branch_4x/lucene/highlighter/   (props changed)
    lucene/dev/branches/branch_4x/lucene/ivy-settings.xml   (props changed)
    lucene/dev/branches/branch_4x/lucene/ivy-versions.properties   (props changed)
    lucene/dev/branches/branch_4x/lucene/join/   (props changed)
    lucene/dev/branches/branch_4x/lucene/licenses/   (props changed)
    lucene/dev/branches/branch_4x/lucene/memory/   (props changed)
    lucene/dev/branches/branch_4x/lucene/misc/   (props changed)
    lucene/dev/branches/branch_4x/lucene/module-build.xml   (props changed)
    lucene/dev/branches/branch_4x/lucene/queries/   (props changed)
    lucene/dev/branches/branch_4x/lucene/queries/src/test/org/apache/lucene/queries/function/TestFunctionQuerySort.java   (props changed)
    lucene/dev/branches/branch_4x/lucene/queryparser/   (props changed)
    lucene/dev/branches/branch_4x/lucene/replicator/   (props changed)
    lucene/dev/branches/branch_4x/lucene/sandbox/   (props changed)
    lucene/dev/branches/branch_4x/lucene/site/   (props changed)
    lucene/dev/branches/branch_4x/lucene/spatial/   (props changed)
    lucene/dev/branches/branch_4x/lucene/suggest/   (props changed)
    lucene/dev/branches/branch_4x/lucene/test-framework/   (props changed)
    lucene/dev/branches/branch_4x/lucene/tools/   (props changed)
    lucene/dev/branches/branch_4x/solr/   (props changed)
    lucene/dev/branches/branch_4x/solr/CHANGES.txt   (props changed)
    lucene/dev/branches/branch_4x/solr/LICENSE.txt   (props changed)
    lucene/dev/branches/branch_4x/solr/NOTICE.txt   (props changed)
    lucene/dev/branches/branch_4x/solr/README.txt   (props changed)
    lucene/dev/branches/branch_4x/solr/SYSTEM_REQUIREMENTS.txt   (props changed)
    lucene/dev/branches/branch_4x/solr/build.xml   (props changed)
    lucene/dev/branches/branch_4x/solr/cloud-dev/   (props changed)
    lucene/dev/branches/branch_4x/solr/common-build.xml   (props changed)
    lucene/dev/branches/branch_4x/solr/contrib/   (props changed)
    lucene/dev/branches/branch_4x/solr/core/   (props changed)
    lucene/dev/branches/branch_4x/solr/core/src/test/org/apache/solr/core/TestConfig.java   (props changed)
    lucene/dev/branches/branch_4x/solr/example/   (props changed)
    lucene/dev/branches/branch_4x/solr/licenses/   (props changed)
    lucene/dev/branches/branch_4x/solr/licenses/httpclient-LICENSE-ASL.txt   (props changed)
    lucene/dev/branches/branch_4x/solr/licenses/httpclient-NOTICE.txt   (props changed)
    lucene/dev/branches/branch_4x/solr/licenses/httpcore-LICENSE-ASL.txt   (props changed)
    lucene/dev/branches/branch_4x/solr/licenses/httpcore-NOTICE.txt   (props changed)
    lucene/dev/branches/branch_4x/solr/licenses/httpmime-LICENSE-ASL.txt   (props changed)
    lucene/dev/branches/branch_4x/solr/licenses/httpmime-NOTICE.txt   (props changed)
    lucene/dev/branches/branch_4x/solr/scripts/   (props changed)
    lucene/dev/branches/branch_4x/solr/site/   (props changed)
    lucene/dev/branches/branch_4x/solr/solrj/   (props changed)
    lucene/dev/branches/branch_4x/solr/test-framework/   (props changed)
    lucene/dev/branches/branch_4x/solr/webapp/   (props changed)

Modified: lucene/dev/branches/branch_4x/lucene/classification/src/java/org/apache/lucene/classification/ClassificationResult.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/classification/src/java/org/apache/lucene/classification/ClassificationResult.java?rev=1540675&r1=1540674&r2=1540675&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/classification/src/java/org/apache/lucene/classification/ClassificationResult.java (original)
+++ lucene/dev/branches/branch_4x/lucene/classification/src/java/org/apache/lucene/classification/ClassificationResult.java Mon Nov 11 11:47:14 2013
@@ -22,8 +22,8 @@ package org.apache.lucene.classification
  */
 public class ClassificationResult<T> {
 
-  private T assignedClass;
-  private double score;
+  private final T assignedClass;
+  private final double score;
 
   /**
    * Constructor

Modified: lucene/dev/branches/branch_4x/lucene/classification/src/java/org/apache/lucene/classification/Classifier.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/classification/src/java/org/apache/lucene/classification/Classifier.java?rev=1540675&r1=1540674&r2=1540675&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/classification/src/java/org/apache/lucene/classification/Classifier.java (original)
+++ lucene/dev/branches/branch_4x/lucene/classification/src/java/org/apache/lucene/classification/Classifier.java Mon Nov 11 11:47:14 2013
@@ -60,4 +60,16 @@ public interface Classifier<T> {
   public void train(AtomicReader atomicReader, String textFieldName, String classFieldName, Analyzer analyzer, Query query)
       throws IOException;
 
+  /**
+   * Train the classifier using the underlying Lucene index
+   * @param atomicReader the reader to use to access the Lucene index
+   * @param textFieldNames the names of the fields to be used to compare documents
+   * @param classFieldName the name of the field containing the class assigned to documents
+   * @param analyzer the analyzer used to tokenize / filter the unseen text
+   * @param query the query to filter which documents use for training
+   * @throws IOException If there is a low-level I/O error.
+   */
+  public void train(AtomicReader atomicReader, String[] textFieldNames, String classFieldName, Analyzer analyzer, Query query)
+      throws IOException;
+
 }

Modified: lucene/dev/branches/branch_4x/lucene/classification/src/java/org/apache/lucene/classification/KNearestNeighborClassifier.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/classification/src/java/org/apache/lucene/classification/KNearestNeighborClassifier.java?rev=1540675&r1=1540674&r2=1540675&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/classification/src/java/org/apache/lucene/classification/KNearestNeighborClassifier.java (original)
+++ lucene/dev/branches/branch_4x/lucene/classification/src/java/org/apache/lucene/classification/KNearestNeighborClassifier.java Mon Nov 11 11:47:14 2013
@@ -41,10 +41,10 @@ import java.util.Map;
 public class KNearestNeighborClassifier implements Classifier<BytesRef> {
 
   private MoreLikeThis mlt;
-  private String textFieldName;
+  private String[] textFieldNames;
   private String classFieldName;
   private IndexSearcher indexSearcher;
-  private int k;
+  private final int k;
   private Query query;
 
   /**
@@ -65,14 +65,17 @@ public class KNearestNeighborClassifier 
       throw new IOException("You must first call Classifier#train");
     }
     Query q;
+    BooleanQuery mltQuery = new BooleanQuery();
+    for (String textFieldName : textFieldNames) {
+      mltQuery.add(new BooleanClause(mlt.like(new StringReader(text), textFieldName), BooleanClause.Occur.SHOULD));
+    }
     if (query != null) {
-      Query mltQuery = mlt.like(new StringReader(text), textFieldName);
       BooleanQuery bq = new BooleanQuery();
       bq.add(query, BooleanClause.Occur.MUST);
       bq.add(mltQuery, BooleanClause.Occur.MUST);
       q = bq;
     } else {
-      q = mlt.like(new StringReader(text), textFieldName);
+      q = mltQuery;
     }
     TopDocs topDocs = indexSearcher.search(q, k);
     return selectClassFromNeighbors(topDocs);
@@ -116,7 +119,7 @@ public class KNearestNeighborClassifier 
    */
   @Override
   public void train(AtomicReader atomicReader, String textFieldName, String classFieldName, Analyzer analyzer, Query query) throws IOException {
-    this.textFieldName = textFieldName;
+    this.textFieldNames = new String[]{textFieldName};
     this.classFieldName = classFieldName;
     mlt = new MoreLikeThis(atomicReader);
     mlt.setAnalyzer(analyzer);
@@ -124,4 +127,18 @@ public class KNearestNeighborClassifier 
     indexSearcher = new IndexSearcher(atomicReader);
     this.query = query;
   }
-}
\ No newline at end of file
+
+  /**
+   * {@inheritDoc}
+   */
+  @Override
+  public void train(AtomicReader atomicReader, String[] textFieldNames, String classFieldName, Analyzer analyzer, Query query) throws IOException {
+    this.textFieldNames = textFieldNames;
+    this.classFieldName = classFieldName;
+    mlt = new MoreLikeThis(atomicReader);
+    mlt.setAnalyzer(analyzer);
+    mlt.setFieldNames(textFieldNames);
+    indexSearcher = new IndexSearcher(atomicReader);
+    this.query = query;
+  }
+}

Modified: lucene/dev/branches/branch_4x/lucene/classification/src/java/org/apache/lucene/classification/SimpleNaiveBayesClassifier.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/classification/src/java/org/apache/lucene/classification/SimpleNaiveBayesClassifier.java?rev=1540675&r1=1540674&r2=1540675&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/classification/src/java/org/apache/lucene/classification/SimpleNaiveBayesClassifier.java (original)
+++ lucene/dev/branches/branch_4x/lucene/classification/src/java/org/apache/lucene/classification/SimpleNaiveBayesClassifier.java Mon Nov 11 11:47:14 2013
@@ -46,7 +46,7 @@ import java.util.LinkedList;
 public class SimpleNaiveBayesClassifier implements Classifier<BytesRef> {
 
   private AtomicReader atomicReader;
-  private String textFieldName;
+  private String[] textFieldNames;
   private String classFieldName;
   private int docsWithClassSize;
   private Analyzer analyzer;
@@ -69,18 +69,36 @@ public class SimpleNaiveBayesClassifier 
       throws IOException {
     this.atomicReader = atomicReader;
     this.indexSearcher = new IndexSearcher(this.atomicReader);
-    this.textFieldName = textFieldName;
+    this.textFieldNames = new String[]{textFieldName};
     this.classFieldName = classFieldName;
     this.analyzer = analyzer;
     this.docsWithClassSize = countDocsWithClass();
     this.query = query;
   }
 
+  /**
+   * {@inheritDoc}
+   */
   @Override
   public void train(AtomicReader atomicReader, String textFieldName, String classFieldName, Analyzer analyzer) throws IOException {
     train(atomicReader, textFieldName, classFieldName, analyzer, null);
   }
 
+  /**
+   * {@inheritDoc}
+   */
+  @Override
+  public void train(AtomicReader atomicReader, String[] textFieldNames, String classFieldName, Analyzer analyzer, Query query)
+      throws IOException {
+    this.atomicReader = atomicReader;
+    this.indexSearcher = new IndexSearcher(this.atomicReader);
+    this.textFieldNames = textFieldNames;
+    this.classFieldName = classFieldName;
+    this.analyzer = analyzer;
+    this.docsWithClassSize = countDocsWithClass();
+    this.query = query;
+  }
+
   private int countDocsWithClass() throws IOException {
     int docCount = MultiFields.getTerms(this.atomicReader, this.classFieldName).getDocCount();
     if (docCount == -1) { // in case codec doesn't support getDocCount
@@ -104,16 +122,18 @@ public class SimpleNaiveBayesClassifier 
 
   private String[] tokenizeDoc(String doc) throws IOException {
     Collection<String> result = new LinkedList<String>();
-    TokenStream tokenStream = analyzer.tokenStream(textFieldName, doc);
-    try {
-      CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
-      tokenStream.reset();
-      while (tokenStream.incrementToken()) {
-        result.add(charTermAttribute.toString());
+    for (String textFieldName : textFieldNames) {
+      TokenStream tokenStream = analyzer.tokenStream(textFieldName, doc);
+      try {
+        CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
+        tokenStream.reset();
+        while (tokenStream.incrementToken()) {
+          result.add(charTermAttribute.toString());
+        }
+        tokenStream.end();
+      } finally {
+        IOUtils.closeWhileHandlingException(tokenStream);
       }
-      tokenStream.end();
-    } finally {
-      IOUtils.closeWhileHandlingException(tokenStream);
     }
     return result.toArray(new String[result.size()]);
   }
@@ -168,16 +188,23 @@ public class SimpleNaiveBayesClassifier 
   }
 
   private double getTextTermFreqForClass(BytesRef c) throws IOException {
-    Terms terms = MultiFields.getTerms(atomicReader, textFieldName);
-    long numPostings = terms.getSumDocFreq(); // number of term/doc pairs
-    double avgNumberOfUniqueTerms = numPostings / (double) terms.getDocCount(); // avg # of unique terms per doc
+    double avgNumberOfUniqueTerms = 0;
+    for (String textFieldName : textFieldNames) {
+      Terms terms = MultiFields.getTerms(atomicReader, textFieldName);
+      long numPostings = terms.getSumDocFreq(); // number of term/doc pairs
+      avgNumberOfUniqueTerms += numPostings / (double) terms.getDocCount(); // avg # of unique terms per doc
+    }
     int docsWithC = atomicReader.docFreq(new Term(classFieldName, c));
     return avgNumberOfUniqueTerms * docsWithC; // avg # of unique terms in text field per doc * # docs with c
   }
 
   private int getWordFreqForClass(String word, BytesRef c) throws IOException {
     BooleanQuery booleanQuery = new BooleanQuery();
-    booleanQuery.add(new BooleanClause(new TermQuery(new Term(textFieldName, word)), BooleanClause.Occur.MUST));
+    BooleanQuery subQuery = new BooleanQuery();
+    for (String textFieldName : textFieldNames) {
+     subQuery.add(new BooleanClause(new TermQuery(new Term(textFieldName, word)), BooleanClause.Occur.SHOULD));
+    }
+    booleanQuery.add(new BooleanClause(subQuery, BooleanClause.Occur.MUST));
     booleanQuery.add(new BooleanClause(new TermQuery(new Term(classFieldName, c)), BooleanClause.Occur.MUST));
     if (query != null) {
       booleanQuery.add(query, BooleanClause.Occur.MUST);

Modified: lucene/dev/branches/branch_4x/lucene/classification/src/java/org/apache/lucene/classification/utils/DatasetSplitter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/classification/src/java/org/apache/lucene/classification/utils/DatasetSplitter.java?rev=1540675&r1=1540674&r2=1540675&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/classification/src/java/org/apache/lucene/classification/utils/DatasetSplitter.java (original)
+++ lucene/dev/branches/branch_4x/lucene/classification/src/java/org/apache/lucene/classification/utils/DatasetSplitter.java Mon Nov 11 11:47:14 2013
@@ -40,8 +40,8 @@ import java.io.IOException;
  */
 public class DatasetSplitter {
 
-  private double crossValidationRatio;
-  private double testRatio;
+  private final double crossValidationRatio;
+  private final double testRatio;
 
   /**
    * Create a {@link DatasetSplitter} by giving test and cross validation IDXs sizes
@@ -68,8 +68,6 @@ public class DatasetSplitter {
   public void split(AtomicReader originalIndex, Directory trainingIndex, Directory testIndex, Directory crossValidationIndex,
                     Analyzer analyzer, String... fieldNames) throws IOException {
 
-    // TODO : check that the passed fields are stored in the original index
-
     // create IWs for train / test / cv IDXs
     IndexWriter testWriter = new IndexWriter(testIndex, new IndexWriterConfig(Version.LUCENE_43, analyzer));
     IndexWriter cvWriter = new IndexWriter(crossValidationIndex, new IndexWriterConfig(Version.LUCENE_43, analyzer));



Mime
View raw message