lucene-dev mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From o...@apache.org
Subject cvs commit: jakarta-lucene/src/java/org/apache/lucene/search FieldSortedHitQueue.java IntegerSortedSearcher.java
Date Fri, 30 Jan 2004 16:22:34 GMT
otis        2004/01/30 08:22:34

  Added:       src/java/org/apache/lucene/search FieldSortedHitQueue.java
                        IntegerSortedSearcher.java
  Log:
  - Initial commit
  Submitted by: Tim Jones
  Reviewed by: Otis, Doug
  
  Revision  Changes    Path
  1.1                  jakarta-lucene/src/java/org/apache/lucene/search/FieldSortedHitQueue.java
  
  Index: FieldSortedHitQueue.java
  ===================================================================
  package org.apache.lucene.search;
  
  /**
   * Copyright 2004 The Apache Software Foundation
   *
   * Licensed under the Apache License, Version 2.0 (the "License");
   * you may not use this file except in compliance with the License.
   * You may obtain a copy of the License at
   *
   *     http://www.apache.org/licenses/LICENSE-2.0
   *
   * Unless required by applicable law or agreed to in writing, software
   * distributed under the License is distributed on an "AS IS" BASIS,
   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   * See the License for the specific language governing permissions and
   * limitations under the License.
   */
  
  import org.apache.lucene.util.PriorityQueue;
  import org.apache.lucene.search.ScoreDoc;
  import org.apache.lucene.index.IndexReader;
  import org.apache.lucene.index.TermEnum;
  import org.apache.lucene.index.Term;
  import org.apache.lucene.index.TermDocs;
  
  import java.util.HashMap;
  import java.io.IOException;
  
  /**
   * Expert: collects results from a search and sorts them by terms in a
   * given field in each document.
   *
   * <p>In this version (0.1) the field to sort by must contain strictly
   * String representations of Integers.
   * See {@link SortedIndexSearcher SortedIndexSearcher} for more
   * information.  Each document is assumed to have a single term in the
   * given field, and the value of the term is the document's relative
   * position in the given sort order.
   *
   * <p>When one of these objects is created, a TermEnumerator is
   * created to fetch all the terms in the index for the given field.
   * The value of each term is assumed to be an integer representing a
   * sort position.  Each document is assumed to contain one of the
   * terms, indicating where in the sort it belongs.
   *
   * <p><h3>Memory Usage</h3>
   *
   * <p>A static cache is maintained.  This cache contains an integer
   * array of length <code>IndexReader.maxDoc()</code> for each field
   * name for which a sort is performed.  In other words, the size of
   * the cache in bytes is:
   *
   * <p><code>4 * IndexReader.maxDoc() * (# of different fields actually used
to sort)</code>
   *
   * <p>Note that the size of the cache is not affected by how many
   * fields are in the index and <i>might</i> be used to sort - only by
   * the ones actually used to sort a result set.
   *
   * <p>The cache is cleared each time a new <code>IndexReader</code> is
   * passed in, or if the value returned by <code>maxDoc()</code>
   * changes for the current IndexReader.  This class is not set up to
   * be able to efficiently sort hits from more than one index
   * simultaneously.
   *
   * <p>Created: Dec 8, 2003 12:56:03 PM
   *
   * @author  "Tim Jones" &lt;tjluc@nacimiento.com&gt;
   * @since   lucene 1.3
   * @version 0.1
   */
  public class FieldSortedHitQueue
  extends PriorityQueue {
  
      /**
       * Keeps track of the IndexReader which the cache
       * applies to.  If it changes, the cache is cleared.
       * We only store the hashcode so as not to mess up
       * garbage collection by having a reference to an
       * IndexReader.
       */
      protected static int lastReaderHash;
  
      /**
       * Contains the cache of sort information.  The
       * key is field name, the value an array of int.
       * A HashMap is used, and we are careful how we
       * handle synchronization.  This is because best
       * performance is obtained when the same IndexReader
       * is used over and over, and we therefore perform
       * many reads and few writes.
       */
      protected static HashMap fieldCache;
  
      /** The sort information being used by this instance */
      protected int[] fieldOrder;
  
      /**
       * Creates a hit queue sorted by the given field.
       * @param reader  IndexReader to use.
       * @param integer_field  Field to sort by.
       * @param size    Number of hits to return - see {@link PriorityQueue#initialize(int)
initialize}
       * @throws IOException  If the internal term enumerator fails.
       */
      public FieldSortedHitQueue (IndexReader reader, String integer_field, int size)
      throws IOException {
  
          int hash = reader.hashCode();
          if (hash != lastReaderHash) {
              lastReaderHash = hash;
              if (fieldCache != null) {
                  fieldCache.clear();
              }
              fieldCache = new HashMap();
          }
  
          initialize (size);
          initializeSort (reader, integer_field);
      }
  
      /**
       * Compares documents based on the value of the term in the field
       * being sorted by.  Documents which should appear at the top of the
       * list should have low values in the term; documents which should
       * appear at the end should have high values.
       *
       * <p>In the context of this method, "less than" means "less relevant",
       * so documents at the top of the list are "greatest" and documents at
       * the bottom are "least".
       *
       * <p>Document A is considered less than Document B
       * if A.field.term > B.field.term or A.doc > B.doc.
       *
       * @param a  ScoreDoc object for document a.
       * @param b  ScoreDoc object for document b.
       * @return true if document a is less than document b.
       * @see ScoreDoc
       */
      protected final boolean lessThan (Object a, Object b) {
          ScoreDoc hitA = (ScoreDoc) a;
          ScoreDoc hitB = (ScoreDoc) b;
          int scoreA = fieldOrder[hitA.doc];
          int scoreB = fieldOrder[hitB.doc];
          if (scoreA == scoreB)
              return hitA.doc > hitB.doc;
          else
              return scoreA > scoreB;   // bigger is really less - the ones at the top
should be the lowest
      }
  
      /**
       * Initializes the cache of sort information.  <code>fieldCache</code> is
queried
       * to see if it has the term information for the given field.
       * If so, and if the reader still has the same value for maxDoc()
       * (note that we assume new IndexReaders are caught during the
       * constructor), the existing data is used.  If not, all the term values
       * for the given field are fetched.  The value of the term is assumed
       * to be the sort index for any documents containing the term.  Documents
       * should only have one term in the given field. Multiple documents
       * can share the same term if desired (documents with the same term will
       * be sorted relative to each other by the order they were placed in
       * the index).
       * @param reader  The document index.
       * @param field   The field to sort by.
       * @throws IOException  If the term enumerator fails.
       */
      protected final void initializeSort (IndexReader reader, String field)
      throws IOException {
  
          fieldOrder = (int[]) fieldCache.get (field);
          if (fieldOrder == null || fieldOrder.length != reader.maxDoc()) {
              fieldOrder = new int [reader.maxDoc()];
  
              TermEnum enumerator = reader.terms (new Term (field, ""));
              TermDocs termDocs = reader.termDocs();
              if (enumerator.term() == null) {
                  throw new RuntimeException ("no terms in field "+field);
              }
  
              try {
                  Term term = enumerator.term();
                  while (term.field() == field) {
                      termDocs.seek (term);
                      if (termDocs.next()) {
                          fieldOrder[termDocs.doc()] = Integer.parseInt (term.text());
                      } else {
                          throw new RuntimeException ("termDocs.next() failed!");
                      }
                      if (!enumerator.next()) {
                          break;
                      }
                      term = enumerator.term();
                  }
              } finally {
                  enumerator.close();
                  termDocs.close();
              }
  
              // be careful how the cache is updated so we
              // don't have synchronization problems.  we do
              // it this way because we assume updates will be
              // few compared to the number of reads.
              HashMap newCache = (HashMap) fieldCache.clone();
              newCache.put (field, fieldOrder);
              fieldCache = newCache;
          }
      }
  }
  
  
  
  1.1                  jakarta-lucene/src/java/org/apache/lucene/search/IntegerSortedSearcher.java
  
  Index: IntegerSortedSearcher.java
  ===================================================================
  package org.apache.lucene.search;
  
  /**
   * Copyright 2004 The Apache Software Foundation
   *
   * Licensed under the Apache License, Version 2.0 (the "License");
   * you may not use this file except in compliance with the License.
   * You may obtain a copy of the License at
   *
   *     http://www.apache.org/licenses/LICENSE-2.0
   *
   * Unless required by applicable law or agreed to in writing, software
   * distributed under the License is distributed on an "AS IS" BASIS,
   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   * See the License for the specific language governing permissions and
   * limitations under the License.
   */
  
  import org.apache.lucene.index.*;
  import org.apache.lucene.store.Directory;
  import org.apache.lucene.search.*;
  import org.apache.lucene.search.TopDocs;
  
  import java.io.IOException;
  import java.util.BitSet;
  
  /**
   * Implements search over an IndexReader using the values of terms in
   * a field as the primary sort order.  Secondary sort is by the order
   * of documents in the index.
   *
   * <p>In this version (0.1) the field to sort by must contain strictly
   * String representations of Integers (i.e. {@link Integer#toString Integer.toString()}).
   *
   * Each document is assumed to have a single term in the given field,
   * and the value of the term is the document's relative position in
   * the given sort order.  The field must be indexed, but should not be
   * stored or tokenized:
   *
   * <p><code>document.add(new Field("byAlpha", Integer.toString(x), false, true,
false));</code>
   *
   * <p>In other words, the desired order of documents must be encoded
   * at the time they are entered into the index.  The first document
   * should have a low value integer, the last document a high value
   * (i.e. the documents should be numbered <code>1..n</code> where
   * <code>1</code> is the first and <code>n</code> the last).  Values
   * must be between <code>Integer.MIN_VALUE</code> and
   * <code>Integer.MAX_VALUE</code> inclusive.
   *
   * <p>Then, at search time, the field is designated to be used to sort
   * the returned hits:
   *
   * <p><code>IndexSearcher searcher = new IntegerSortedSearcher(indexReader,
"byAlpha");</code>
   *
   * <p>or:
   *
   * <p><code>IntegerSortedSearcher searcher = new IntegerSortedSearcher(indexReader,
"bySomething");
   * <br>Hits hits = searcher.search(query, filter);
   * <br>...
   * <br>searcher.setOrderByField("bySomethingElse");
   * <br>hits = searcher.search(query, filter);
   * <br>...
   * </code>
   *
   * <p>Note the above example shows that one of these objects can be
   * used multiple times, and the sort order changed between usages.
   *
   * <p><h3>Memory Usage</h3>
   *
   * <p>This object is almost identical to the regular IndexSearcher and
   * makes no additional memory requirements on its own.  Every time the
   * <code>search()</code> method is called, however, a new
   * {@link FieldSortedHitQueue FieldSortedHitQueue} object is created.
   * That object is responsible for putting the hits in the correct order,
   * and it maintains a cache of information based on the IndexReader
   * given to it.  See its documentation for more information on its
   * memory usage.
   *
   * <p><h3>Concurrency</h3>
   *
   * <p>This object has the same behavior during concurrent updates to
   * the index as does IndexSearcher.  Namely, in the default
   * implementation using
   * {@link org.apache.lucene.store.FSDirectory FSDirectory}, the index
   * can be updated (deletes, adds) without harm while this object
   * exists, but this object will not see the changes.  Ultimately this
   * behavior is a result of the
   * {@link org.apache.lucene.index.SegmentReader SegmentReader} class
   * internal to FSDirectory, which caches information about documents
   * in memory.
   *
   * <p>So, in order for IntegerSortedSearcher to be kept up to date with
   * changes to the index, new instances must be created instead of the
   * same one used over and over again.  This will result in lower
   * performance than if instances are reused.
   *
   * <p><h3>Updates</h3>
   *
   * <p>In order to be able to update the index without having to
   * recalculate all the sort numbers, the numbers should be stored with
   * "space" between them.  That is, sort the documents and number them
   * <code>1..n</code>.  Then, as <code>i</code> goes between
   * <code>1</code> and <code>n</code>:
   *
   * <p><code>document.add(new Field("byAlpha", Integer.toString(i*1000), false,
true, false));</code>
   *
   * <p>Add a new document sorted between position 1 and 2 by:
   *
   * <p><code>document.add(new Field("byAlpha", Integer.toString(1500), false,
true, false));</code>
   *
   * <p>Be careful not to overun <code>Integer.MAX_VALUE</code>
   * (<code>2147483647</code>).  Periodically a complete reindex should
   * be run so the sort orders can be "normalized".
   *
   * <p>Created: Dec 8, 2003 12:47:26 PM
   *
   * @author  "Tim Jones" &lt;tjluc@nacimiento.com&gt;
   * @since   lucene 1.3
   * @version 0.1
   * @see IndexSearcher
   */
  public class IntegerSortedSearcher
  extends IndexSearcher {
  
      /** stores the field being used to sort by **/
      protected String field;
  
      /**
       * Searches the index in the named directory using the given
       * field as the primary sort.
       * The terms in the field must contain strictly integers in
       * the range <code>Integer.MIN_VALUE</code> and <code>Integer.MAX_VALUE</code>
inclusive.
       * @see IndexSearcher(java.lang.String,java.lang.String)
       */
      public IntegerSortedSearcher(String path, String integer_field)
      throws IOException {
          this(IndexReader.open(path), integer_field);
      }
  
      /**
       * Searches the index in the provided directory using the
       * given field as the primary sort.
       * The terms in the field must contain strictly integers in
       * the range <code>Integer.MIN_VALUE</code> and <code>Integer.MAX_VALUE</code>
inclusive.
       * @see IndexSearcher(Directory,java.lang.String)
       */
      public IntegerSortedSearcher(Directory directory, String integer_field)
      throws IOException {
          this(IndexReader.open(directory), integer_field);
      }
  
      /**
       * Searches the provided index using the given field as the
       * primary sort.
       * The terms in the field must contain strictly integers in
       * the range <code>Integer.MIN_VALUE</code> and <code>Integer.MAX_VALUE</code>
inclusive.
       * @see IndexSearcher(IndexReader)
       */
      public IntegerSortedSearcher(IndexReader r, String integer_field) {
          super(r);
          this.field = integer_field.intern();
      }
  
      /**
       * Sets the field to order results by.  This can be called
       * multiple times per instance of IntegerSortedSearcher.
       * @param integer_field  The field to sort results by.
       */
      public void setOrderByField(String integer_field) {
          this.field = integer_field.intern();
      }
  
      /**
       * Returns the name of the field currently being used
       * to sort results by.
       * @return  Field name.
       */
      public String getOrderByField() {
          return field;
      }
  
  
      /**
       * Finds the top <code>nDocs</code>
       * hits for <code>query</code>, applying <code>filter</code>
if non-null.
       *
       * Overrides IndexSearcher.search to use a FieldSortedHitQueue instead of the
       * default HitQueue.
       *
       * @see IndexSearcher#search
       */
      public TopDocs search(Query query, Filter filter, final int nDocs)
      throws IOException {
  
          Scorer scorer = query.weight(this).scorer(reader);
          if (scorer == null) {
              return new TopDocs(0, new ScoreDoc[0]);
          }
  
          final BitSet bits = filter != null ? filter.bits(reader) : null;
          final FieldSortedHitQueue hq = new FieldSortedHitQueue(reader, field, nDocs);
          final int[] totalHits = new int[1];
          scorer.score(
              new HitCollector() {
                  public final void collect(int doc, float score) {
                      if (score > 0.0f &&                         // ignore zeroed
buckets
                          (bits == null || bits.get(doc))) {      // skip docs not in bits
                          totalHits[0]++;
                          hq.insert(new ScoreDoc(doc, score));
                      }
                  }
              });
  
          ScoreDoc[] scoreDocs = new ScoreDoc[hq.size()];
          for (int i = hq.size() - 1; i >= 0; i--) {              // put docs in array
              scoreDocs[i] = (ScoreDoc) hq.pop();
          }
  
          return new TopDocs(totalHits[0], scoreDocs);
      }
  }
  
  
  

---------------------------------------------------------------------
To unsubscribe, e-mail: lucene-dev-unsubscribe@jakarta.apache.org
For additional commands, e-mail: lucene-dev-help@jakarta.apache.org


Mime
View raw message