lucene-dev mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From o...@apache.org
Subject cvs commit: jakarta-lucene-sandbox/contributions/miscellaneous/src/java/org/apache/lucene/misc HighFreqTerms.java
Date Sun, 17 Nov 2002 03:56:37 GMT
otis        2002/11/16 19:56:37

  Added:       contributions/miscellaneous/src/java/org/apache/lucene/misc
                        HighFreqTerms.java
  Log:
   - Initial checkin.
  
  Revision  Changes    Path
  1.1                  jakarta-lucene-sandbox/contributions/miscellaneous/src/java/org/apache/lucene/misc/HighFreqTerms.java
  
  Index: HighFreqTerms.java
  ===================================================================
  package org.apache.lucene.misc;
  
  /* ====================================================================
   * The Apache Software License, Version 1.1
   *
   * Copyright (c) 2001 The Apache Software Foundation.  All rights
   * reserved.
   *
   * Redistribution and use in source and binary forms, with or without
   * modification, are permitted provided that the following conditions
   * are met:
   *
   * 1. Redistributions of source code must retain the above copyright
   *    notice, this list of conditions and the following disclaimer.
   *
   * 2. Redistributions in binary form must reproduce the above copyright
   *    notice, this list of conditions and the following disclaimer in
   *    the documentation and/or other materials provided with the
   *    distribution.
   *
   * 3. The end-user documentation included with the redistribution,
   *    if any, must include the following acknowledgment:
   *       "This product includes software developed by the
   *        Apache Software Foundation (http://www.apache.org/)."
   *    Alternately, this acknowledgment may appear in the software itself,
   *    if and wherever such third-party acknowledgments normally appear.
   *
   * 4. The names "Apache" and "Apache Software Foundation" and
   *    "Apache Lucene" must not be used to endorse or promote products
   *    derived from this software without prior written permission. For
   *    written permission, please contact apache@apache.org.
   *
   * 5. Products derived from this software may not be called "Apache",
   *    "Apache Lucene", nor may "Apache" appear in their name, without
   *    prior written permission of the Apache Software Foundation.
   *
   * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
   * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
   * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
   * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
   * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
   * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
   * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
   * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
   * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   * SUCH DAMAGE.
   * ====================================================================
   *
   * This software consists of voluntary contributions made by many
   * individuals on behalf of the Apache Software Foundation.  For more
   * information on the Apache Software Foundation, please see
   * <http://www.apache.org/>.
   */
  
  import org.apache.lucene.util.PriorityQueue;
  import org.apache.lucene.store.Directory;
  import org.apache.lucene.store.FSDirectory;
  import org.apache.lucene.index.IndexReader;
  import org.apache.lucene.index.Term;
  import org.apache.lucene.index.TermEnum;
  
  /**
   * <code>HighFreqTerms</code> class scans an existing Lucene index, keeps
   * the terms with the highest document frequencies (at most
   * <code>numTerms</code> of them) and prints each one with its frequency.
   *
   * @version $Id: HighFreqTerms.java,v 1.1 2002/11/17 03:56:37 otis Exp $
   */
  public class HighFreqTerms
  {
      /** Maximum number of terms to report; read once per run of main. */
      public static int numTerms = 100;
  
      /**
       * Opens the index named by the single command-line argument, collects
       * the terms with the highest document frequency (at most
       * <code>numTerms</code>) and prints them, one "term frequency" pair per
       * line, in the order they are popped from the queue.
       *
       * Prints a usage message and exits with status 1 unless exactly one
       * argument is supplied.
       *
       * @param args a single element: the index directory
       * @throws Exception if the index cannot be opened or read
       */
      public static void main(String[] args) throws Exception
      {
          if (args.length != 1)
          {
              usage();
              System.exit(1);
              return;
          }
  
          IndexReader reader = IndexReader.open(args[0]);
          try
          {
              // An entry is put() before the overflow check below, so the
              // queue briefly holds numTerms + 1 entries.  Size it with one
              // slot of headroom so that insert cannot overrun its capacity.
              TermInfoQueue tiq = new TermInfoQueue(numTerms + 1);
  
              TermEnum terms = reader.terms();
              try
              {
                  // Lowest frequency still worth queueing; raised once the
                  // queue is full so less frequent terms are skipped early.
                  int minFreq = 0;
                  while (terms.next())
                  {
                      int docFreq = terms.docFreq();
                      if (docFreq > minFreq)
                      {
                          tiq.put(new TermInfo(terms.term(), docFreq));
                          if (tiq.size() > numTerms)     // queue overfull
                          {
                              tiq.pop();                 // drop the lowest entry
                              // With numTerms == 0 the queue is empty after
                              // the pop and there is no top entry to read.
                              if (tiq.size() > 0)
                              {
                                  minFreq = ((TermInfo)tiq.top()).docFreq;
                              }
                          }
                      }
                  }
              }
              finally
              {
                  // Release the enumeration even if the scan fails.
                  terms.close();
              }
  
              while (tiq.size() != 0)
              {
                  TermInfo termInfo = (TermInfo)tiq.pop();
                  System.out.println(termInfo.term + " " + termInfo.docFreq);
              }
          }
          finally
          {
              // Always release the index, including on an exception above.
              reader.close();
          }
      }
  
      /** Prints the command-line synopsis to standard output. */
      private static void usage()
      {
          System.out.println("\n\n" +
              "java org.apache.lucene.misc.HighFreqTerms <index dir>\n\n");
      }
  }
  
  /**
   * Simple holder pairing a term with the document frequency value that was
   * supplied for it.
   */
  final class TermInfo
  {
      /** Frequency value passed to the constructor for this term. */
      int docFreq;
  
      /** The term this entry describes. */
      Term term;
  
      /**
       * @param t  the term to record
       * @param df the frequency value to associate with <code>t</code>
       */
      TermInfo(Term t, int df)
      {
          this.docFreq = df;
          this.term = t;
      }
  }
  
  /**
   * Priority queue of {@link TermInfo} entries ordered by their
   * <code>docFreq</code> field.
   */
  final class TermInfoQueue extends PriorityQueue
  {
      /**
       * @param size capacity handed to <code>initialize</code>
       */
      TermInfoQueue(int size)
      {
          initialize(size);
      }
  
      /**
       * Compares two queue entries by document frequency.
       *
       * @param a first entry; must be a <code>TermInfo</code>
       * @param b second entry; must be a <code>TermInfo</code>
       * @return true when <code>a</code> has a strictly smaller
       *         <code>docFreq</code> than <code>b</code>
       */
      protected final boolean lessThan(Object a, Object b)
      {
          return ((TermInfo)a).docFreq < ((TermInfo)b).docFreq;
      }
  }
  
  
  

--
To unsubscribe, e-mail:   <mailto:lucene-dev-unsubscribe@jakarta.apache.org>
For additional commands, e-mail: <mailto:lucene-dev-help@jakarta.apache.org>


Mime
View raw message