lucene-java-user mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From David Spencer <David.Spen...@micromuse.com>
Subject Re: I need a list of the indexed words
Date Wed, 02 Apr 2003 00:15:30 GMT
jcrowell wrote:

>Thanks for responding.  Are you referring to the solution under the title:
>
>"How do I retrieve all the values of a particular field that exists within
>an index, across all documents" ?
>
Here's some code that might do what you want.
It also shows the frequency of each term.
Args are "-i INDEX" and optional "-f FIELD".
I haven't tested it outside my tree (to see if it's clean of
dependencies), but it looks reasonable at a glance.





package com.tropo.lucene;

import org.apache.lucene.analysis.*;
import org.apache.lucene.index.*;
import org.apache.lucene.document.*;

import java.io.*;
import java.util.*;

/**
 * Call IndexReader.terms(), termPositions().
 */
/**
 * Command-line tool that dumps the terms of a Lucene index together with
 * their document frequencies.
 *
 * <p>Usage: <code>DumpTerms [-i INDEX] [-f FIELD]</code>
 * <ul>
 *   <li><code>-i INDEX</code> - name of the index to open (default <code>"index"</code>).</li>
 *   <li><code>-f FIELD</code> - only report terms that belong to this field
 *       (default: all fields).</li>
 * </ul>
 *
 * <p>Output, in order: the number of documents, the number of selected terms,
 * one line per term sorted by descending document frequency (text, frequency,
 * frequency as a percentage of the document count, and the field name unless
 * it is <code>"contents"</code>), and finally a histogram of term lengths.
 *
 * <p>Only {@link IndexReader#terms()} is used; per-document positions are not
 * listed (they would be available through <code>IndexReader.termPositions</code>).
 */
public class DumpTerms
{
    /**
     * Number of buckets in the term-length histogram. Terms whose length is
     * this value minus one, or longer, all share the last bucket.
     */
    private static final int LENGTH_BUCKETS = 30;

    /** Width the term text is padded to in the per-term listing. */
    private static final int TEXT_WIDTH = 16;

    /** Width the frequency is padded to in the per-term listing. */
    private static final int FREQ_WIDTH = 6;

    /** Stream all output is written to. */
    static PrintStream o = System.out;

    private DumpTerms()
    {
    }

    /**
     * Entry point; see the class comment for the accepted arguments and the
     * produced output.
     *
     * @param args command-line arguments
     * @throws IllegalArgumentException if <code>-i</code> or <code>-f</code>
     *         is given without a value
     * @throws Throwable anything raised while opening or reading the index
     */
    public static void main(String[] args)
        throws Throwable
    {
        String name = "index";
        String field = null;

        for (int i = 0; i < args.length; i++)
        {
            if (args[i].equals("-i"))
            {
                name = optionValue(args, ++i);
            }
            else if (args[i].equals("-f"))
            {
                field = optionValue(args, ++i);
            }
        }

        o.println("Opening " + name);
        // NOTE(review): Rammer.convertMaybe is defined elsewhere in the
        // project; its result is passed to IndexReader.open unchanged.
        final IndexReader r = IndexReader.open(Rammer.convertMaybe(name));
        try
        {
            final int docs = r.numDocs();
            o.println("Docs: " + docs);

            final List entries = new ArrayList();
            final TermEnum te = r.terms();
            try
            {
                while (te.next())
                {
                    Term t = te.term();
                    if (field == null || field.equals(t.field()))
                    {
                        entries.add(new TermFreq(t, te.docFreq()));
                    }
                }
            }
            finally
            {
                // The enumeration holds index resources of its own.
                te.close();
            }

            o.println("Terms: " + entries.size());
            // Collections.sort is stable, so terms with equal frequency keep
            // the order in which the index enumerated them.
            Collections.sort(entries, new ICmp());

            int[] lens = new int[LENGTH_BUCKETS];
            for (Iterator it = entries.iterator(); it.hasNext();)
            {
                TermFreq e = (TermFreq) it.next();
                String text = e.term.text();

                // Over-long terms are all counted in the last bucket.
                lens[Math.min(text.length(), lens.length - 1)]++;

                o.print(fill(text, TEXT_WIDTH) + " " + fill("" + e.docFreq, FREQ_WIDTH));
                o.print(" " + percent(e.docFreq, docs) + "%");
                if (!e.term.field().equals("contents"))
                {
                    o.println(" " + e.term.field());
                }
                else
                {
                    o.println();
                }
            }
            o.println();

            int totTerms = entries.size();
            for (int i = 0; i < lens.length; i++)
            {
                o.println("" + i + ". " + lens[i] + " " + percent(lens[i], totTerms) + "%");
            }
        }
        finally
        {
            // Release the reader even when dumping fails part-way through.
            r.close();
        }
    }

    /**
     * Returns the value of a command-line option, failing with a readable
     * message instead of an index error when the value is missing.
     *
     * @param args  the full argument array
     * @param index position at which the option's value is expected
     * @return <code>args[index]</code>
     * @throws IllegalArgumentException if there is no argument at <code>index</code>
     */
    private static String optionValue(String[] args, int index)
    {
        if (index >= args.length)
        {
            throw new IllegalArgumentException(
                "Missing value for option " + args[index - 1]
                + " (usage: DumpTerms [-i INDEX] [-f FIELD])");
        }
        return args[index];
    }

    /**
     * Computes <code>part</code> as a percentage of <code>total</code>,
     * rounded to the nearest integer.
     *
     * <p>The arithmetic is done in <code>long</code> so that
     * <code>part * 100</code> cannot overflow, and a non-positive total yields
     * 0 rather than a division by zero (this happens when no term matches the
     * requested field, or the index reports no documents).
     *
     * @param part  the counted portion
     * @param total the whole the portion is measured against
     * @return the rounded percentage, or 0 if <code>total</code> is not positive
     */
    private static int percent(long part, long total)
    {
        if (total <= 0)
        {
            return 0;
        }
        return (int) ((part * 100 + total / 2) / total);
    }

    /**
     * Right-pads a string with spaces up to a minimum width.
     *
     * @param s the text to pad
     * @param w the minimum width of the result
     * @return <code>s</code> itself if it is already at least <code>w</code>
     *         characters long, otherwise <code>s</code> followed by spaces up
     *         to a total length of <code>w</code>
     */
    private static String fill(String s, int w)
    {
        if (s.length() >= w)
        {
            return s;
        }
        StringBuffer padded = new StringBuffer(w);
        padded.append(s);
        while (padded.length() < w)
        {
            padded.append(' ');
        }
        return padded.toString();
    }

    /** A term paired with the number of documents that contain it. */
    private static final class TermFreq
    {
        final Term term;
        final int docFreq;

        TermFreq(Term term, int docFreq)
        {
            this.term = term;
            this.docFreq = docFreq;
        }
    }

    /**
     * Orders {@link TermFreq} entries by descending document frequency;
     * entries with the same frequency compare as equal.
     */
    private static class ICmp
        implements Comparator
    {
        public int compare(Object a, Object b)
        {
            int d1 = ((TermFreq) a).docFreq;
            int d2 = ((TermFreq) b).docFreq;
            if (d1 < d2)
            {
                return 1;
            }
            if (d1 > d2)
            {
                return -1;
            }
            return 0;
        }
    }
}


>
>Thanks,
>
>Jon
>
>
>
>-----Original Message-----
>From: Otis Gospodnetic [mailto:otis_gospodnetic@yahoo.com] 
>Sent: Monday, March 31, 2003 3:02 PM
>To: Lucene Users List
>Subject: Re: I need a list of the indexed words
>
>
>Please see the FAQ at jGuru, the answer is there.
>
>Otis
>
>--- jcrowell <jcrowell@dsg.harvard.edu> wrote:
>  
>
>>I have a very simple problem: I need to get a list of the words that 
>>will result in a hit if searched on.  Should be simple, but I'm not 
>>quite sure
>>where to start.
>>
>>Thanks,
>>
>>Jon
>>
>>
>>---------------------------------------------------------------------
>>To unsubscribe, e-mail: lucene-user-unsubscribe@jakarta.apache.org
>>For additional commands, e-mail: lucene-user-help@jakarta.apache.org
>>
>>    
>>
>
>
>__________________________________________________
>Do you Yahoo!?
>Yahoo! Platinum - Watch CBS' NCAA March Madness, live on your desktop!
>http://platinum.yahoo.com
>
>---------------------------------------------------------------------
>To unsubscribe, e-mail: lucene-user-unsubscribe@jakarta.apache.org
>For additional commands, e-mail: lucene-user-help@jakarta.apache.org
>
>
>---------------------------------------------------------------------
>To unsubscribe, e-mail: lucene-user-unsubscribe@jakarta.apache.org
>For additional commands, e-mail: lucene-user-help@jakarta.apache.org
>
>  
>


Mime
  • Unnamed multipart/alternative (inline, None, 0 bytes)
View raw message