lucene-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From rm...@apache.org
Subject lucene-solr:master: LUCENE-8020: don't force sim to score bogus terms (e.g. docfreq=0)
Date Tue, 31 Oct 2017 00:32:27 GMT
Repository: lucene-solr
Updated Branches:
  refs/heads/master 3edb23471 -> e0bde5798


LUCENE-8020: don't force sim to score bogus terms (e.g. docfreq=0)


Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/e0bde579
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/e0bde579
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/e0bde579

Branch: refs/heads/master
Commit: e0bde579815ae5ce2525bb659d04e908812f1605
Parents: 3edb234
Author: Robert Muir <rmuir@apache.org>
Authored: Mon Oct 30 20:31:38 2017 -0400
Committer: Robert Muir <rmuir@apache.org>
Committed: Mon Oct 30 20:32:12 2017 -0400

----------------------------------------------------------------------
 lucene/CHANGES.txt                              |  5 ++
 .../lucene/search/CollectionStatistics.java     | 51 +++++++++++++++--
 .../org/apache/lucene/search/IndexSearcher.java | 24 ++++----
 .../apache/lucene/search/MultiPhraseQuery.java  | 11 +++-
 .../org/apache/lucene/search/PhraseQuery.java   | 12 +++-
 .../org/apache/lucene/search/SynonymQuery.java  | 27 ++++++---
 .../org/apache/lucene/search/TermQuery.java     | 13 +++--
 .../apache/lucene/search/TermStatistics.java    | 22 ++++++-
 .../apache/lucene/search/spans/SpanWeight.java  | 15 +++--
 .../similarities/TestClassicSimilarity.java     |  3 +-
 .../search/similarities/TestSimilarity2.java    |  5 +-
 .../lucene/search/TermAutomatonQuery.java       | 13 ++++-
 .../lucene/search/TestTermAutomatonQuery.java   | 60 ++++++++++++++++++++
 .../lucene/search/ShardSearchingTestBase.java   | 42 ++++++++------
 .../solr/search/stats/CollectionStats.java      |  3 +
 .../solr/search/stats/ExactStatsCache.java      |  7 ++-
 .../org/apache/solr/search/stats/TermStats.java |  3 +
 17 files changed, 252 insertions(+), 64 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/e0bde579/lucene/CHANGES.txt
----------------------------------------------------------------------
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index ea4b8be..f214620 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -11,6 +11,11 @@ Changes in Runtime Behavior
   will now fail to open even if they have been merged with the previous major
   version. (Adrien Grand)
 
+* LUCENE-8020: Similarities are no longer passed terms that don't exist by
+  queries such as SpanOrQuery, so scoring formulas no longer require
+  divide-by-zero hacks.  IndexSearcher.termStatistics/collectionStatistics return null
+  instead of returning bogus values for a non-existent term or field. (Robert Muir)
+
 Improvements
 
 * LUCENE-7997: Add BaseSimilarityTestCase to sanity check similarities.

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/e0bde579/lucene/core/src/java/org/apache/lucene/search/CollectionStatistics.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/search/CollectionStatistics.java b/lucene/core/src/java/org/apache/lucene/search/CollectionStatistics.java
index ef19abd..a6a1e19 100644
--- a/lucene/core/src/java/org/apache/lucene/search/CollectionStatistics.java
+++ b/lucene/core/src/java/org/apache/lucene/search/CollectionStatistics.java
@@ -16,6 +16,8 @@
  */
 package org.apache.lucene.search;
 
+import java.util.Objects;
+
 import org.apache.lucene.index.IndexReader; // javadocs
 import org.apache.lucene.index.Terms;       // javadocs
 
@@ -31,11 +33,52 @@ public class CollectionStatistics {
   private final long sumTotalTermFreq;
   private final long sumDocFreq;
   
+  /**
+   * Creates statistics instance for a collection (field).
+   * @param field Field's name
+   * @param maxDoc total number of documents.
+   * @param docCount number of documents containing the field.
+   * @param sumTotalTermFreq number of tokens in the field.
+   * @param sumDocFreq number of postings list entries for the field.
+   * @throws IllegalArgumentException if {@code maxDoc} is negative or zero.
+   * @throws IllegalArgumentException if {@code docCount} is negative or zero.
+   * @throws IllegalArgumentException if {@code docCount} is more than {@code maxDoc}.
+   * @throws IllegalArgumentException if {@code sumDocFreq} is less than {@code docCount}.
+   * @throws IllegalArgumentException if {@code sumTotalTermFreq} is less than {@code sumDocFreq}.
+   */
   public CollectionStatistics(String field, long maxDoc, long docCount, long sumTotalTermFreq,
long sumDocFreq) {
-    assert maxDoc >= 0;
-    assert docCount >= -1 && docCount <= maxDoc; // #docs with field must be <= #docs
-    assert sumDocFreq == -1 || sumDocFreq >= docCount; // #postings must be >= #docs with field
-    assert sumTotalTermFreq == -1 || sumTotalTermFreq >= sumDocFreq; // #positions must be >= #postings
+    Objects.requireNonNull(field);
+    if (maxDoc <= 0) {
+      throw new IllegalArgumentException("maxDoc must be positive, maxDoc: " + maxDoc);
+    }
+    if (docCount != -1) {
+      if (docCount <= 0) {
+        throw new IllegalArgumentException("docCount must be positive, docCount: " + docCount);
+      }
+      if (docCount > maxDoc) {
+        throw new IllegalArgumentException("docCount must not exceed maxDoc, docCount: " + docCount + ", maxDoc: " + maxDoc);
+      }
+    }
+    if (sumDocFreq != -1) {
+      if (sumDocFreq <= 0) {
+        throw new IllegalArgumentException("sumDocFreq must be positive, sumDocFreq: " + sumDocFreq);
+      }
+      if (docCount != -1) {
+        if (sumDocFreq < docCount) {
+          throw new IllegalArgumentException("sumDocFreq must be at least docCount, sumDocFreq: " + sumDocFreq + ", docCount: " + docCount);
+        }
+      }
+    }
+    if (sumTotalTermFreq != -1) {
+      if (sumTotalTermFreq <= 0) {
+        throw new IllegalArgumentException("sumTotalTermFreq must be positive, sumTotalTermFreq: " + sumTotalTermFreq);
+      }
+      if (sumDocFreq != -1) {
+        if (sumTotalTermFreq < sumDocFreq) {
+          throw new IllegalArgumentException("sumTotalTermFreq must be at least sumDocFreq, sumTotalTermFreq: " + sumTotalTermFreq + ", sumDocFreq: " + sumDocFreq);
+        }
+      }
+    }
     this.field = field;
     this.maxDoc = maxDoc;
     this.docCount = docCount;

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/e0bde579/lucene/core/src/java/org/apache/lucene/search/IndexSearcher.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/search/IndexSearcher.java b/lucene/core/src/java/org/apache/lucene/search/IndexSearcher.java
index c5d9ddc..c8f45bf 100644
--- a/lucene/core/src/java/org/apache/lucene/search/IndexSearcher.java
+++ b/lucene/core/src/java/org/apache/lucene/search/IndexSearcher.java
@@ -767,18 +767,24 @@ public class IndexSearcher {
   }
   
   /**
-   * Returns {@link TermStatistics} for a term.
+   * Returns {@link TermStatistics} for a term, or {@code null} if
+   * the term does not exist.
    * 
    * This can be overridden for example, to return a term's statistics
    * across a distributed collection.
    * @lucene.experimental
    */
   public TermStatistics termStatistics(Term term, TermContext context) throws IOException
{
-    return new TermStatistics(term.bytes(), context.docFreq(), context.totalTermFreq());
+    if (context.docFreq() == 0) {
+      return null;
+    } else {
+      return new TermStatistics(term.bytes(), context.docFreq(), context.totalTermFreq());
+    }
   }
   
   /**
-   * Returns {@link CollectionStatistics} for a field.
+   * Returns {@link CollectionStatistics} for a field, or {@code null} if
+   * the field does not exist (has no indexed terms)
    * 
    * This can be overridden for example, to return a field's statistics
    * across a distributed collection.
@@ -793,15 +799,13 @@ public class IndexSearcher {
     
     Terms terms = MultiFields.getTerms(reader, field);
     if (terms == null) {
-      docCount = 0;
-      sumTotalTermFreq = 0;
-      sumDocFreq = 0;
-    } else {
-      docCount = terms.getDocCount();
-      sumTotalTermFreq = terms.getSumTotalTermFreq();
-      sumDocFreq = terms.getSumDocFreq();
+      return null;
     }
 
+    docCount = terms.getDocCount();
+    sumTotalTermFreq = terms.getSumTotalTermFreq();
+    sumDocFreq = terms.getSumDocFreq();
+
     return new CollectionStatistics(field, reader.maxDoc(), docCount, sumTotalTermFreq, sumDocFreq);
   }
 }

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/e0bde579/lucene/core/src/java/org/apache/lucene/search/MultiPhraseQuery.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/search/MultiPhraseQuery.java b/lucene/core/src/java/org/apache/lucene/search/MultiPhraseQuery.java
index afb6fc7..d39fdc6 100644
--- a/lucene/core/src/java/org/apache/lucene/search/MultiPhraseQuery.java
+++ b/lucene/core/src/java/org/apache/lucene/search/MultiPhraseQuery.java
@@ -203,13 +203,20 @@ public class MultiPhraseQuery extends Query {
             termContext = TermContext.build(context, term);
             termContexts.put(term, termContext);
           }
-          allTermStats.add(searcher.termStatistics(term, termContext));
+          TermStatistics termStatistics = searcher.termStatistics(term, termContext);
+          if (termStatistics != null) {
+            allTermStats.add(termStatistics);
+          }
         }
       }
-      stats = similarity.computeWeight(
+      if (allTermStats.isEmpty()) {
+        stats = null; // none of the terms were found, we won't use sim at all
+      } else {
+        stats = similarity.computeWeight(
           boost,
           searcher.collectionStatistics(field),
           allTermStats.toArray(new TermStatistics[allTermStats.size()]));
+      }
     }
 
     @Override

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/e0bde579/lucene/core/src/java/org/apache/lucene/search/PhraseQuery.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/search/PhraseQuery.java b/lucene/core/src/java/org/apache/lucene/search/PhraseQuery.java
index d0bf828..79703f5 100644
--- a/lucene/core/src/java/org/apache/lucene/search/PhraseQuery.java
+++ b/lucene/core/src/java/org/apache/lucene/search/PhraseQuery.java
@@ -370,12 +370,20 @@ public class PhraseQuery extends Query {
       final IndexReaderContext context = searcher.getTopReaderContext();
       states = new TermContext[terms.length];
       TermStatistics termStats[] = new TermStatistics[terms.length];
+      int termUpTo = 0;
       for (int i = 0; i < terms.length; i++) {
         final Term term = terms[i];
         states[i] = TermContext.build(context, term);
-        termStats[i] = searcher.termStatistics(term, states[i]);
+        TermStatistics termStatistics = searcher.termStatistics(term, states[i]);
+        if (termStatistics != null) {
+          termStats[termUpTo++] = termStatistics;
+        }
+      }
+      if (termUpTo > 0) {
+        stats = similarity.computeWeight(boost, searcher.collectionStatistics(field), Arrays.copyOf(termStats, termUpTo));
+      } else {
+        stats = null; // no terms at all, we won't use similarity
       }
-      stats = similarity.computeWeight(boost, searcher.collectionStatistics(field), termStats);
     }
 
     @Override

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/e0bde579/lucene/core/src/java/org/apache/lucene/search/SynonymQuery.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/search/SynonymQuery.java b/lucene/core/src/java/org/apache/lucene/search/SynonymQuery.java
index c718dc9..e9e6636 100644
--- a/lucene/core/src/java/org/apache/lucene/search/SynonymQuery.java
+++ b/lucene/core/src/java/org/apache/lucene/search/SynonymQuery.java
@@ -34,6 +34,7 @@ import org.apache.lucene.index.TermState;
 import org.apache.lucene.index.TermsEnum;
 import org.apache.lucene.search.similarities.Similarity;
 import org.apache.lucene.search.similarities.Similarity.SimScorer;
+import org.apache.lucene.util.BytesRef;
 
 /**
  * A query that treats multiple terms as synonyms.
@@ -137,16 +138,22 @@ public final class SynonymQuery extends Query {
       for (int i = 0; i < termContexts.length; i++) {
         termContexts[i] = TermContext.build(searcher.getTopReaderContext(), terms[i]);
         TermStatistics termStats = searcher.termStatistics(terms[i], termContexts[i]);
-        docFreq = Math.max(termStats.docFreq(), docFreq);
-        if (termStats.totalTermFreq() == -1) {
-          totalTermFreq = -1;
-        } else if (totalTermFreq != -1) {
-          totalTermFreq += termStats.totalTermFreq();
+        if (termStats != null) {
+          docFreq = Math.max(termStats.docFreq(), docFreq);
+          if (termStats.totalTermFreq() == -1) {
+            totalTermFreq = -1;
+          } else if (totalTermFreq != -1) {
+            totalTermFreq += termStats.totalTermFreq();
+          }
         }
       }
-      TermStatistics pseudoStats = new TermStatistics(null, docFreq, totalTermFreq);
       this.similarity = searcher.getSimilarity(true);
-      this.simWeight = similarity.computeWeight(boost, collectionStats, pseudoStats);
+      if (docFreq > 0) {
+        TermStatistics pseudoStats = new TermStatistics(new BytesRef("synonym pseudo-term"), docFreq, totalTermFreq);
+        this.simWeight = similarity.computeWeight(boost, collectionStats, pseudoStats);
+      } else {
+        this.simWeight = null; // no terms exist at all, we won't use similarity
+      }
     }
 
     @Override
@@ -185,7 +192,7 @@ public final class SynonymQuery extends Query {
 
     @Override
     public Scorer scorer(LeafReaderContext context) throws IOException {
-      Similarity.SimScorer simScorer = similarity.simScorer(simWeight, context);
+      Similarity.SimScorer simScorer = null;
       // we use termscorers + disjunction as an impl detail
       List<Scorer> subScorers = new ArrayList<>();
       for (int i = 0; i < terms.length; i++) {
@@ -194,6 +201,10 @@ public final class SynonymQuery extends Query {
           TermsEnum termsEnum = context.reader().terms(terms[i].field()).iterator();
           termsEnum.seekExact(terms[i].bytes(), state);
           PostingsEnum postings = termsEnum.postings(null, PostingsEnum.FREQS);
+          // lazy init sim, in case no terms exist
+          if (simScorer == null) {
+            simScorer = similarity.simScorer(simWeight, context);
+          }
           subScorers.add(new TermScorer(this, postings, simScorer));
         }
       }

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/e0bde579/lucene/core/src/java/org/apache/lucene/search/TermQuery.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/search/TermQuery.java b/lucene/core/src/java/org/apache/lucene/search/TermQuery.java
index 48c61fa..587c513 100644
--- a/lucene/core/src/java/org/apache/lucene/search/TermQuery.java
+++ b/lucene/core/src/java/org/apache/lucene/search/TermQuery.java
@@ -65,13 +65,16 @@ public class TermQuery extends Query {
         collectionStats = searcher.collectionStatistics(term.field());
         termStats = searcher.termStatistics(term, termStates);
       } else {
-        // we do not need the actual stats, use fake stats with docFreq=maxDoc and ttf=-1
-        final int maxDoc = searcher.getIndexReader().maxDoc();
-        collectionStats = new CollectionStatistics(term.field(), maxDoc, -1, -1, -1);
-        termStats = new TermStatistics(term.bytes(), maxDoc, -1);
+        // we do not need the actual stats, use fake stats with docFreq=maxDoc=1 and ttf=-1
+        collectionStats = new CollectionStatistics(term.field(), 1, -1, -1, -1);
+        termStats = new TermStatistics(term.bytes(), 1, -1);
       }
      
-      this.stats = similarity.computeWeight(boost, collectionStats, termStats);
+      if (termStats == null) {
+        this.stats = null; // term doesn't exist in any segment, we won't use similarity at all
+      } else {
+        this.stats = similarity.computeWeight(boost, collectionStats, termStats);
+      }
     }
 
     @Override

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/e0bde579/lucene/core/src/java/org/apache/lucene/search/TermStatistics.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/search/TermStatistics.java b/lucene/core/src/java/org/apache/lucene/search/TermStatistics.java
index a8e2e06..7d4f03a 100644
--- a/lucene/core/src/java/org/apache/lucene/search/TermStatistics.java
+++ b/lucene/core/src/java/org/apache/lucene/search/TermStatistics.java
@@ -17,6 +17,8 @@
 package org.apache.lucene.search;
 
 
+import java.util.Objects;
+
 import org.apache.lucene.index.Term;
 import org.apache.lucene.index.TermsEnum; // javadocs
 import org.apache.lucene.util.BytesRef;
@@ -29,9 +31,25 @@ public class TermStatistics {
   private final long docFreq;
   private final long totalTermFreq;
   
+  /**
+   * Creates statistics instance for a term.
+   * @param term Term bytes
+   * @param docFreq number of documents containing the term in the collection.
+   * @param totalTermFreq number of occurrences of the term in the collection.
+   * @throws NullPointerException if {@code term} is {@code null}.
+   * @throws IllegalArgumentException if {@code docFreq} is negative or zero.
+   * @throws IllegalArgumentException if {@code totalTermFreq} is less than {@code docFreq}.
+   */
   public TermStatistics(BytesRef term, long docFreq, long totalTermFreq) {
-    assert docFreq >= 0;
-    assert totalTermFreq == -1 || totalTermFreq >= docFreq; // #positions must be >= #postings
+    Objects.requireNonNull(term);
+    if (docFreq <= 0) {
+      throw new IllegalArgumentException("docFreq must be positive, docFreq: " + docFreq);
+    }
+    if (totalTermFreq != -1) {
+      if (totalTermFreq < docFreq) {
+        throw new IllegalArgumentException("totalTermFreq must be at least docFreq, totalTermFreq: " + totalTermFreq + ", docFreq: " + docFreq);
+      }
+    }
     this.term = term;
     this.docFreq = docFreq;
     this.totalTermFreq = totalTermFreq;

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/e0bde579/lucene/core/src/java/org/apache/lucene/search/spans/SpanWeight.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/search/spans/SpanWeight.java b/lucene/core/src/java/org/apache/lucene/search/spans/SpanWeight.java
index 4d08172..0dad614 100644
--- a/lucene/core/src/java/org/apache/lucene/search/spans/SpanWeight.java
+++ b/lucene/core/src/java/org/apache/lucene/search/spans/SpanWeight.java
@@ -18,6 +18,7 @@ package org.apache.lucene.search.spans;
 
 
 import java.io.IOException;
+import java.util.Arrays;
 import java.util.Map;
 
 import org.apache.lucene.index.LeafReaderContext;
@@ -93,13 +94,19 @@ public abstract class SpanWeight extends Weight {
     if (termContexts == null || termContexts.size() == 0 || query.getField() == null)
       return null;
     TermStatistics[] termStats = new TermStatistics[termContexts.size()];
-    int i = 0;
+    int termUpTo = 0;
     for (Term term : termContexts.keySet()) {
-      termStats[i] = searcher.termStatistics(term, termContexts.get(term));
-      i++;
+      TermStatistics termStatistics = searcher.termStatistics(term, termContexts.get(term));
+      if (termStatistics != null) {
+        termStats[termUpTo++] = termStatistics;
+      }
     }
     CollectionStatistics collectionStats = searcher.collectionStatistics(query.getField());
-    return similarity.computeWeight(boost, collectionStats, termStats);
+    if (termUpTo > 0) {
+      return similarity.computeWeight(boost, collectionStats, Arrays.copyOf(termStats, termUpTo));
+    } else {
+      return null; // no terms at all exist, we won't use similarity
+    }
   }
 
   /**

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/e0bde579/lucene/core/src/test/org/apache/lucene/search/similarities/TestClassicSimilarity.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/test/org/apache/lucene/search/similarities/TestClassicSimilarity.java b/lucene/core/src/test/org/apache/lucene/search/similarities/TestClassicSimilarity.java
index 4a5a10f..a0fa0f3 100644
--- a/lucene/core/src/test/org/apache/lucene/search/similarities/TestClassicSimilarity.java
+++ b/lucene/core/src/test/org/apache/lucene/search/similarities/TestClassicSimilarity.java
@@ -28,7 +28,6 @@ import org.apache.lucene.index.DirectoryReader;
 import org.apache.lucene.index.FieldInvertState;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.IndexWriter;
-import org.apache.lucene.index.MultiReader;
 import org.apache.lucene.index.Term;
 import org.apache.lucene.search.BooleanClause.Occur;
 import org.apache.lucene.search.BooleanQuery;
@@ -158,7 +157,7 @@ public class TestClassicSimilarity extends BaseSimilarityTestCase {
   
   public void testSaneNormValues() throws IOException {
     ClassicSimilarity sim = new ClassicSimilarity();
-    TFIDFSimilarity.IDFStats stats = (IDFStats) sim.computeWeight(1f, new IndexSearcher(new MultiReader()).collectionStatistics("foo"));
+    TFIDFSimilarity.IDFStats stats = (IDFStats) sim.computeWeight(1f, indexSearcher.collectionStatistics("test"));
     for (int i = 0; i < 256; i++) {
       float boost = stats.normTable[i];
       assertFalse("negative boost: " + boost + ", byte=" + i, boost < 0.0f);

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/e0bde579/lucene/core/src/test/org/apache/lucene/search/similarities/TestSimilarity2.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/test/org/apache/lucene/search/similarities/TestSimilarity2.java b/lucene/core/src/test/org/apache/lucene/search/similarities/TestSimilarity2.java
index 09683b2..6fd38bd 100644
--- a/lucene/core/src/test/org/apache/lucene/search/similarities/TestSimilarity2.java
+++ b/lucene/core/src/test/org/apache/lucene/search/similarities/TestSimilarity2.java
@@ -262,10 +262,7 @@ public class TestSimilarity2 extends LuceneTestCase {
   
   /** make sure all sims work with spanOR(termX, termY) where termY does not exist */
   public void testCrazySpans() throws Exception {
-    // The problem: "normal" lucene queries create scorers, returning null if terms dont exist
-    // This means they never score a term that does not exist.
-    // however with spans, there is only one scorer for the whole hierarchy:
-    // inner queries are not real queries, their boosts are ignored, etc.
+    // historically this was a problem, but sim's no longer have to score terms that dont exist
     Directory dir = newDirectory();
     RandomIndexWriter iw = new RandomIndexWriter(random(), dir);
     Document doc = new Document();

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/e0bde579/lucene/sandbox/src/java/org/apache/lucene/search/TermAutomatonQuery.java
----------------------------------------------------------------------
diff --git a/lucene/sandbox/src/java/org/apache/lucene/search/TermAutomatonQuery.java b/lucene/sandbox/src/java/org/apache/lucene/search/TermAutomatonQuery.java
index 04c8736..67d8027 100644
--- a/lucene/sandbox/src/java/org/apache/lucene/search/TermAutomatonQuery.java
+++ b/lucene/sandbox/src/java/org/apache/lucene/search/TermAutomatonQuery.java
@@ -347,12 +347,19 @@ public class TermAutomatonQuery extends Query {
       for(Map.Entry<Integer,BytesRef> ent : idToTerm.entrySet()) {
         Integer termID = ent.getKey();
         if (ent.getValue() != null) {
-          allTermStats.add(searcher.termStatistics(new Term(field, ent.getValue()), termStates.get(termID)));
+          TermStatistics stats = searcher.termStatistics(new Term(field, ent.getValue()), termStates.get(termID));
+          if (stats != null) {
+            allTermStats.add(stats);
+          }
         }
       }
 
-      stats = similarity.computeWeight(boost, searcher.collectionStatistics(field),
-                                       allTermStats.toArray(new TermStatistics[allTermStats.size()]));
+      if (allTermStats.isEmpty()) {
+        stats = null; // no terms matched at all, will not use sim
+      } else {
+        stats = similarity.computeWeight(boost, searcher.collectionStatistics(field),
+                                         allTermStats.toArray(new TermStatistics[allTermStats.size()]));
+      }
     }
 
     @Override

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/e0bde579/lucene/sandbox/src/test/org/apache/lucene/search/TestTermAutomatonQuery.java
----------------------------------------------------------------------
diff --git a/lucene/sandbox/src/test/org/apache/lucene/search/TestTermAutomatonQuery.java b/lucene/sandbox/src/test/org/apache/lucene/search/TestTermAutomatonQuery.java
index 6ef9baf..5a1d506 100644
--- a/lucene/sandbox/src/test/org/apache/lucene/search/TestTermAutomatonQuery.java
+++ b/lucene/sandbox/src/test/org/apache/lucene/search/TestTermAutomatonQuery.java
@@ -965,4 +965,64 @@ public class TestTermAutomatonQuery extends LuceneTestCase {
     
     IOUtils.close(w, r, dir);
   }
+  
+  // we query with sun|moon but moon doesn't exist
+  public void testOneTermMissing() throws Exception {
+    Directory dir = newDirectory();
+    RandomIndexWriter w = new RandomIndexWriter(random(), dir);
+    Document doc = new Document();
+    doc.add(newTextField("field", "here comes the sun", Field.Store.NO));
+    w.addDocument(doc);
+
+    IndexReader r = w.getReader();
+    IndexSearcher s = newSearcher(r);
+
+    TermAutomatonQuery q = new TermAutomatonQuery("field");
+    int init = q.createState();
+    int s1 = q.createState();
+    q.addTransition(init, s1, "comes");
+    int s2 = q.createState();
+    q.addAnyTransition(s1, s2);
+    int s3 = q.createState();
+    q.setAccept(s3, true);
+    q.addTransition(s2, s3, "sun");
+    q.addTransition(s2, s3, "moon");
+    q.finish();
+
+    assertEquals(1, s.search(q, 1).totalHits);
+
+    w.close();
+    r.close();
+    dir.close();
+  }
+  
+  // we query with sun|moon but no terms exist for the field
+  public void testFieldMissing() throws Exception {
+    Directory dir = newDirectory();
+    RandomIndexWriter w = new RandomIndexWriter(random(), dir);
+    Document doc = new Document();
+    doc.add(newTextField("field", "here comes the sun", Field.Store.NO));
+    w.addDocument(doc);
+
+    IndexReader r = w.getReader();
+    IndexSearcher s = newSearcher(r);
+
+    TermAutomatonQuery q = new TermAutomatonQuery("bogusfield");
+    int init = q.createState();
+    int s1 = q.createState();
+    q.addTransition(init, s1, "comes");
+    int s2 = q.createState();
+    q.addAnyTransition(s1, s2);
+    int s3 = q.createState();
+    q.setAccept(s3, true);
+    q.addTransition(s2, s3, "sun");
+    q.addTransition(s2, s3, "moon");
+    q.finish();
+
+    assertEquals(0, s.search(q, 1).totalHits);
+
+    w.close();
+    r.close();
+    dir.close();
+  }
 }

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/e0bde579/lucene/test-framework/src/java/org/apache/lucene/search/ShardSearchingTestBase.java
----------------------------------------------------------------------
diff --git a/lucene/test-framework/src/java/org/apache/lucene/search/ShardSearchingTestBase.java b/lucene/test-framework/src/java/org/apache/lucene/search/ShardSearchingTestBase.java
index 9449a72..a6b44b0 100644
--- a/lucene/test-framework/src/java/org/apache/lucene/search/ShardSearchingTestBase.java
+++ b/lucene/test-framework/src/java/org/apache/lucene/search/ShardSearchingTestBase.java
@@ -136,11 +136,13 @@ public abstract class ShardSearchingTestBase extends LuceneTestCase
{
     // other nodes:
     for(String field : fieldsToShare) {
       final CollectionStatistics stats = newSearcher.collectionStatistics(field);
-      for (NodeState node : nodes) {
-        // Don't put my own collection stats into the cache;
-        // we pull locally:
-        if (node.myNodeID != nodeID) {
-          node.collectionStatsCache.put(new FieldAndShardVersion(nodeID, version, field), stats);
+      if (stats != null) {
+        for (NodeState node : nodes) {
+          // Don't put my own collection stats into the cache;
+          // we pull locally:
+          if (node.myNodeID != nodeID) {
+            node.collectionStatsCache.put(new FieldAndShardVersion(nodeID, version, field), stats);
+          }
         }
       }
     }
@@ -248,8 +250,10 @@ public abstract class ShardSearchingTestBase extends LuceneTestCase {
           }
           if (missing.size() != 0) {
             for(Map.Entry<Term,TermStatistics> ent : getNodeTermStats(missing, nodeID,
nodeVersions[nodeID]).entrySet()) {
-              final TermAndShardVersion key = new TermAndShardVersion(nodeID, nodeVersions[nodeID], ent.getKey());
-              termStatsCache.put(key, ent.getValue());
+              if (ent.getValue() != null) {
+                final TermAndShardVersion key = new TermAndShardVersion(nodeID, nodeVersions[nodeID], ent.getKey());
+                termStatsCache.put(key, ent.getValue());
+              }
             }
           }
         }
@@ -270,9 +274,10 @@ public abstract class ShardSearchingTestBase extends LuceneTestCase {
           } else {
             final TermAndShardVersion key = new TermAndShardVersion(nodeID, nodeVersions[nodeID],
term);
             subStats = termStatsCache.get(key);
-            // We pre-cached during rewrite so all terms
-            // better be here...
-            assert subStats != null;
+          }
+          
+          if (subStats == null) {
+            continue; // term not found
           }
         
           long nodeDocFreq = subStats.docFreq();
@@ -290,7 +295,11 @@ public abstract class ShardSearchingTestBase extends LuceneTestCase {
           }
         }
 
-        return new TermStatistics(term.bytes(), docFreq, totalTermFreq);
+        if (docFreq == 0) {
+          return null; // term not found in any node whatsoever
+        } else {
+          return new TermStatistics(term.bytes(), docFreq, totalTermFreq);
+        }
       }
 
       @Override
@@ -312,11 +321,8 @@ public abstract class ShardSearchingTestBase extends LuceneTestCase {
             nodeStats = collectionStatsCache.get(key);
           }
           if (nodeStats == null) {
-            System.out.println("coll stats myNodeID=" + myNodeID + ": " + collectionStatsCache.keySet());
+            continue; // field not in sub at all
           }
-          // Collection stats are pre-shared on reopen, so,
-          // we better not have a cache miss:
-          assert nodeStats != null: "myNodeID=" + myNodeID + " nodeID=" + nodeID + " version=" + nodeVersions[nodeID] + " field=" + field;
           
           long nodeDocCount = nodeStats.docCount();
           if (docCount >= 0 && nodeDocCount >= 0) {
@@ -343,7 +349,11 @@ public abstract class ShardSearchingTestBase extends LuceneTestCase {
           maxDoc += nodeStats.maxDoc();
         }
 
-        return new CollectionStatistics(field, maxDoc, docCount, sumTotalTermFreq, sumDocFreq);
+        if (maxDoc == 0) {
+          return null; // field not found across any node whatsoever
+        } else {
+          return new CollectionStatistics(field, maxDoc, docCount, sumTotalTermFreq, sumDocFreq);
+        }
       }
 
       @Override

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/e0bde579/solr/core/src/java/org/apache/solr/search/stats/CollectionStats.java
----------------------------------------------------------------------
diff --git a/solr/core/src/java/org/apache/solr/search/stats/CollectionStats.java b/solr/core/src/java/org/apache/solr/search/stats/CollectionStats.java
index d151a0f..e57f768 100644
--- a/solr/core/src/java/org/apache/solr/search/stats/CollectionStats.java
+++ b/solr/core/src/java/org/apache/solr/search/stats/CollectionStats.java
@@ -74,6 +74,9 @@ public class CollectionStats {
   }
   
   public CollectionStatistics toCollectionStatistics() {
+    if (maxDoc == 0 || docCount == 0) {
+      return null;
+    }
     return new CollectionStatistics(field, maxDoc, docCount, sumTotalTermFreq, sumDocFreq);
   }
   

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/e0bde579/solr/core/src/java/org/apache/solr/search/stats/ExactStatsCache.java
----------------------------------------------------------------------
diff --git a/solr/core/src/java/org/apache/solr/search/stats/ExactStatsCache.java b/solr/core/src/java/org/apache/solr/search/stats/ExactStatsCache.java
index b4e76d4..e1f3984 100644
--- a/solr/core/src/java/org/apache/solr/search/stats/ExactStatsCache.java
+++ b/solr/core/src/java/org/apache/solr/search/stats/ExactStatsCache.java
@@ -164,11 +164,14 @@ public class ExactStatsCache extends StatsCache {
         TermContext termContext = TermContext.build(context, t);
 
         if (!colMap.containsKey(t.field())) { // collection stats for this field
-          colMap.put(t.field(), new CollectionStats(searcher.localCollectionStatistics(t.field())));
+          CollectionStatistics collectionStatistics = searcher.localCollectionStatistics(t.field());
+          if (collectionStatistics != null) {
+            colMap.put(t.field(), new CollectionStats(collectionStatistics));
+          }
         }
 
         TermStatistics tst = searcher.localTermStatistics(t, termContext);
-        if (tst.docFreq() == 0) { // skip terms that are not present here
+        if (tst == null) { // skip terms that are not present here
           continue;
         }
 

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/e0bde579/solr/core/src/java/org/apache/solr/search/stats/TermStats.java
----------------------------------------------------------------------
diff --git a/solr/core/src/java/org/apache/solr/search/stats/TermStats.java b/solr/core/src/java/org/apache/solr/search/stats/TermStats.java
index 8f45a9b..62b311e 100644
--- a/solr/core/src/java/org/apache/solr/search/stats/TermStats.java
+++ b/solr/core/src/java/org/apache/solr/search/stats/TermStats.java
@@ -72,6 +72,9 @@ public class TermStats {
   }
   
   public TermStatistics toTermStatistics() {
+    if (docFreq == 0) {
+      return null;
+    }
     return new TermStatistics(t.bytes(), docFreq, totalTermFreq);
   }
   


Mime
View raw message