lucene-dev mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From dspen...@apache.org
Subject cvs commit: jakarta-lucene-sandbox/contributions/WordNet/src/java/org/apache/lucene/wordnet SynExpand.java package.html SynLookup.java Syns2Index.java
Date Tue, 11 Jan 2005 20:58:11 GMT
dspencer    2005/01/11 12:58:11

  Modified:    contributions/WordNet build.xml
               contributions/WordNet/src/java/org/apache/lucene/wordnet
                        SynLookup.java Syns2Index.java
  Added:       contributions/WordNet/src/java/org/apache/lucene/wordnet
                        SynExpand.java package.html
  Log:
  make sure code works with WordNet2.0 (no problem) and add Query expansion, and comments
  
  Revision  Changes    Path
  1.4       +21 -0     jakarta-lucene-sandbox/contributions/WordNet/build.xml
  
  Index: build.xml
  ===================================================================
  RCS file: /home/cvs/jakarta-lucene-sandbox/contributions/WordNet/build.xml,v
  retrieving revision 1.3
  retrieving revision 1.4
  diff -u -r1.3 -r1.4
  --- build.xml	23 Feb 2004 15:23:25 -0000	1.3
  +++ build.xml	11 Jan 2005 20:58:11 -0000	1.4
  @@ -29,6 +29,7 @@
       </java>
     </target>
   
  +
     <target name="synonym" description="Find synonyms for word">
       <fail unless="synindex.exists">
         Index does not exist.
  @@ -46,6 +47,26 @@
   
         <arg file="${synindex.dir}"/>
         <arg value="${word}"/>
  +    </java>
  +  </target>
  +
  +  <target name="expand" description="Perform synonym expansion on a query">
  +    <fail unless="synindex.exists">
  +      Index does not exist.
  +    </fail>
  +
  +    <fail unless="query">
  +      Must specify 'query' property.
  +    </fail>
  +    
  +    <java classname="org.apache.lucene.wordnet.SynExpand">
  +      <classpath>
  +        <path refid="compile.classpath"/>
  +        <pathelement location="${build.classes.dir}"/>
  +      </classpath>
  +
  +      <arg file="${synindex.dir}"/>
  +      <arg value="${query}"/>
       </java>
     </target>
   
  
  
  
  1.2       +108 -39   jakarta-lucene-sandbox/contributions/WordNet/src/java/org/apache/lucene/wordnet/SynLookup.java
  
  Index: SynLookup.java
  ===================================================================
  RCS file: /home/cvs/jakarta-lucene-sandbox/contributions/WordNet/src/java/org/apache/lucene/wordnet/SynLookup.java,v
  retrieving revision 1.1
  retrieving revision 1.2
  diff -u -r1.1 -r1.2
  --- SynLookup.java	26 Jan 2004 17:29:35 -0000	1.1
  +++ SynLookup.java	11 Jan 2005 20:58:11 -0000	1.2
  @@ -1,45 +1,114 @@
   package org.apache.lucene.wordnet;
   
  -import org.apache.lucene.store.FSDirectory;
  -import org.apache.lucene.search.IndexSearcher;
  -import org.apache.lucene.search.TermQuery;
  -import org.apache.lucene.search.Hits;
  -import org.apache.lucene.index.Term;
  -import org.apache.lucene.document.Document;
  -import java.io.IOException;
  +import org.apache.lucene.store.*;
  +import org.apache.lucene.search.*;
  +import org.apache.lucene.index.*;
  +import org.apache.lucene.document.*;
  +import org.apache.lucene.analysis.*;
  +import java.io.*;
  +import java.util.*;
   
  +
  +/**
  + * Test program to look up synonyms.
  + */
   public class SynLookup {
   
  -  public static void main(String[] args) throws IOException {
  -    if (args.length != 2) {
  -      System.out.println(
  -    "java org.apache.lucene.wordnet.SynLookup <index path> <word>");
  -    }
  -
  -    FSDirectory directory = FSDirectory.getDirectory(args[0], false);
  -    IndexSearcher searcher = new IndexSearcher(directory);
  -
  -    String word = args[1];
  -    Hits hits = searcher.search(
  -      new TermQuery(new Term("word", word)));
  -
  -    if (hits.length() == 0) {
  -      System.out.println("No synonyms found for " + word);
  -    } else {
  -      System.out.println("Synonyms found for \"" + word + "\":");
  -    }
  -
  -    for (int i = 0; i < hits.length(); i++) {
  -      Document doc = hits.doc(i);
  -
  -      String[] values = doc.getValues("syn");
  -
  -      for (int j = 0; j < values.length; j++) {
  -        System.out.println(values[j]);
  -      }
  -    }
  -
  -    searcher.close();
  -    directory.close();
  -  }
  +	public static void main(String[] args) throws IOException {
  +		if (args.length != 2) {
  +			System.out.println(
  +							   "java org.apache.lucene.wordnet.SynLookup <index path> <word>");
  +		}
  +
  +		FSDirectory directory = FSDirectory.getDirectory(args[0], false);
  +		IndexSearcher searcher = new IndexSearcher(directory);
  +
  +		String word = args[1];
  +		Hits hits = searcher.search(
  +									new TermQuery(new Term(Syns2Index.F_WORD, word)));
  +
  +		if (hits.length() == 0) {
  +			System.out.println("No synonyms found for " + word);
  +		} else {
  +			System.out.println("Synonyms found for \"" + word + "\":");
  +		}
  +
  +		for (int i = 0; i < hits.length(); i++) {
  +			Document doc = hits.doc(i);
  +
  +			String[] values = doc.getValues(Syns2Index.F_SYN);
  +
  +			for (int j = 0; j < values.length; j++) {
  +				System.out.println(values[j]);
  +			}
  +		}
  +
  +		searcher.close();
  +		directory.close();
  +	}
  +
  +
  +	/**
  +	 * Perform synonym expansion on a query.
  +	 *
  +	 * @param query
  +	 * @param syns
  +	 * @param a
  +	 * @param field
  +	 * @param boost
  +	 */ 
  +	public static Query expand( String query,
  +								Searcher syns,
  +								Analyzer a,
  +								String field,
  +								float boost)
  +		throws IOException
  +	{
  +		Set already = new HashSet(); // avoid dups		
  +		List top = new LinkedList(); // needs to be separately listed..
  +
  +		// [1] Parse query into separate words so that when we expand we can avoid dups
  +		TokenStream ts = a.tokenStream( field, new StringReader( query));
  +		org.apache.lucene.analysis.Token t;
  +		while ( (t = ts.next()) != null)
  +		{
  +			String word = t.termText();
  +			if ( already.add( word))
  +				top.add( word);
  +		}
  +		BooleanQuery tmp = new BooleanQuery();
  +		
  +		// [2] form query
  +		Iterator it = top.iterator();
  +		while ( it.hasNext())
  +		{
  +			// [2a] add top-level words in
  +			String word = (String) it.next();
  +			TermQuery tq = new TermQuery( new Term( field, word));
  +			tmp.add( tq, BooleanClause.Occur.SHOULD);
  +
  +			// [2b] add in unique synonyms
  +			Hits hits = syns.search( new TermQuery( new Term(Syns2Index.F_WORD, word)));
  +			for (int i = 0; i < hits.length(); i++)
  +			{
  +				Document doc = hits.doc(i);
  +				String[] values = doc.getValues( Syns2Index.F_SYN);
  +				for ( int j = 0; j < values.length; j++)
  +				{
  +					String syn = values[ j];
  +					if ( already.add( syn))
  +					{
  +						tq = new TermQuery( new Term( field, syn));
  +						if ( boost > 0) // else keep normal 1.0
  +							tq.setBoost( boost);
  +						tmp.add( tq, BooleanClause.Occur.SHOULD); 
  +					}
  +				}
  +			}
  +		}
  +
  +
  +		return tmp;
  +	}
  +								
   }
  
  
  
  1.6       +10 -8     jakarta-lucene-sandbox/contributions/WordNet/src/java/org/apache/lucene/wordnet/Syns2Index.java
  
  Index: Syns2Index.java
  ===================================================================
  RCS file: /home/cvs/jakarta-lucene-sandbox/contributions/WordNet/src/java/org/apache/lucene/wordnet/Syns2Index.java,v
  retrieving revision 1.5
  retrieving revision 1.6
  diff -u -r1.5 -r1.6
  --- Syns2Index.java	11 Jan 2005 20:13:39 -0000	1.5
  +++ Syns2Index.java	11 Jan 2005 20:58:11 -0000	1.6
  @@ -23,8 +23,10 @@
   import java.util.TreeMap;
   
   /**
  - * Convert the prolog file wn_s.pl from the <a href="http://www.cogsci.princeton.edu/~wn/obtain.shtml">WordNet
prolog download</a>
  - * into a Lucene index suitable for looking up synonyms and performing query expansion.
  + * Convert the prolog file wn_s.pl from the <a href="http://www.cogsci.princeton.edu/2.0/WNprolog-2.0.tar.gz">WordNet
prolog download</a>
  + * into a Lucene index suitable for looking up synonyms and performing query expansion
({@link SynExpand#expand SynExpand.expand(...)}).
  + *
  + * This has been tested with WordNet 2.0.
    *
    * The index has fields named "word" ({@see #F_WORD})
    * and "syn" ({@see #F_SYN}).
  @@ -40,8 +42,7 @@
    * related meanings we don't do that here.
    * </p>
    *
  - * This can take 8 minutes to execute and build an index on a "fast" system and the index
takes up almost 3 MB.
  - * If you boost the minMergeDocuments and mergeFactor of the index writer than you can
get this down to under 4 minutes.
  + * This can take 4 minutes to execute and build an index on a "fast" system and the index
takes up almost 3 MB.
    *
    * @author Dave Spencer, dave&#064;searchmorph.com
    * @see <a href="http://www.cogsci.princeton.edu/~wn/">WordNet home page</a>
  @@ -76,7 +77,7 @@
       private static final Analyzer ana = new StandardAnalyzer();
   
       /**
  -     * Takes optional arg of prolog file name.
  +     * Takes arg of prolog file name and index directory.
        */
       public static void main(String[] args)
           throws Throwable
  @@ -228,9 +229,10 @@
   
           // override the specific index if it already exists
           IndexWriter writer = new IndexWriter(indexDir, ana, true);
  -        writer.setUseCompoundFile(true);
  -		writer.mergeFactor *= 2;
  -		writer.minMergeDocs *= 2;
  +        writer.setUseCompoundFile(true); // why?
  +		// blindly up these parameters for speed
  +		writer.setMergeFactor( writer.getMergeFactor() * 2);
  +		writer.setMaxBufferedDocs( writer.getMaxBufferedDocs() * 2);
           Iterator i1 = word2Nums.keySet().iterator();
           while (i1.hasNext()) // for each word
           {
  
  
  
  1.1                  jakarta-lucene-sandbox/contributions/WordNet/src/java/org/apache/lucene/wordnet/SynExpand.java
  
  Index: SynExpand.java
  ===================================================================
  package org.apache.lucene.wordnet;
  
  import org.apache.lucene.store.*;
  import org.apache.lucene.search.*;
  import org.apache.lucene.index.*;
  import org.apache.lucene.document.*;
  import org.apache.lucene.analysis.*;
  import org.apache.lucene.analysis.standard.*;
  import java.io.*;
  import java.util.*;
  
  
  /**
   * Expand a query by looking up synonyms for every term.
   * You need to invoke {@link Syns2Index} first to build the synonym index.
   *
   * @see Syns2Index
   */
  public final class SynExpand {
  
  	/**
  	 * Test driver for synonym expansion.
  	 * Uses boost factor of 0.9 for illustrative purposes.
  	 *
  	 * If you pass in the query "big dog" then it prints out:
  	 *
  	 * <code><pre>
  	 * Query: big adult^0.9 bad^0.9 bighearted^0.9 boastful^0.9 boastfully^0.9 bounteous^0.9
  	 * bountiful^0.9 braggy^0.9 crowing^0.9 freehanded^0.9 giving^0.9 grown^0.9 grownup^0.9 handsome^0.9
  	 * large^0.9 liberal^0.9 magnanimous^0.9 momentous^0.9 openhanded^0.9 prominent^0.9 swelled^0.9
  	 * vainglorious^0.9 vauntingly^0.9
  	 * dog andiron^0.9 blackguard^0.9 bounder^0.9 cad^0.9 chase^0.9 click^0.9 detent^0.9 dogtooth^0.9
  	 * firedog^0.9 frank^0.9 frankfurter^0.9 frump^0.9 heel^0.9 hotdog^0.9 hound^0.9 pawl^0.9 tag^0.9
  	 * tail^0.9 track^0.9 trail^0.9 weenie^0.9 wiener^0.9 wienerwurst^0.9
  	 * </pre></code>
  	 *
  	 * @param args exactly two values: the path of the synonym index and the query to expand;
  	 * with any other count only the usage line is printed
  	 *
  	 * @throws IOException if the index cannot be opened or searched
  	 */
  	public static void main(String[] args) throws IOException
  	{
  		if (args.length != 2)
  		{
  			System.out.println(
  							   "java org.apache.lucene.wordnet.SynExpand <index path> <query>");
  			// Stop here: falling through would index past the end of args.
  			return;
  		}
  
  		FSDirectory directory = FSDirectory.getDirectory(args[0], false);
  		try
  		{
  			IndexSearcher searcher = new IndexSearcher(directory);
  			try
  			{
  				String query = args[1];
  				String field = "contents";
  
  				Query q = expand( query, searcher, new StandardAnalyzer(), field, 0.9f);
  				System.out.println( "Query: " + q.toString( field));
  			}
  			finally
  			{
  				// release the searcher even when expansion fails
  				searcher.close();
  			}
  		}
  		finally
  		{
  			directory.close();
  		}
  	}
  
  
  	/**
  	 * Perform synonym expansion on a query.
  	 *
  	 * Every distinct word the analyzer produces from the query becomes an optional
  	 * (SHOULD) term clause on <code>field</code>. Each word is then looked up in the
  	 * synonym index, and every synonym not already present is added as a further
  	 * optional clause.
  	 *
  	 * @param query users query that is assumed to not have any "special" query syntax, thus
  	 * it should be just normal words, so "big dog" makes sense, but a query like "title:foo^1.2"
  	 * doesn't as this should presumably be passed directly to the default query parser.
  	 *
  	 * @param syns a searcher opened on the Lucene index you previously created with
  	 * {@link Syns2Index}. The searcher is not closed or otherwise altered.
  	 *
  	 * @param a optional analyzer used to parse the users query; if null a
  	 * {@link StandardAnalyzer} is used
  	 *
  	 * @param field optional field name to search in or null if you want the default of "contents"
  	 *
  	 * @param boost optional boost applied to synonyms; a value that is not greater than 0
  	 * means no boost is applied
  	 *
  	 * @return the expanded Query
  	 *
  	 * @throws IOException if tokenizing the query or searching the synonym index fails
  	 */ 
  	public static Query expand( String query,
  								Searcher syns,
  								Analyzer a,
  								String field,
  								float boost)
  		throws IOException
  	{
  		Set already = new HashSet(); // avoid dups 
  		List top = new ArrayList(); // query words, kept in the order they were typed
  		if ( field == null) field = "contents";
  		if ( a == null) a = new StandardAnalyzer();
  
  		// [1] Parse query into separate words so that when we expand we can avoid dups
  		TokenStream ts = a.tokenStream( field, new StringReader( query));
  		try
  		{
  			org.apache.lucene.analysis.Token t;
  			while ( (t = ts.next()) != null)
  			{
  				String word = t.termText();
  				if ( already.add( word))
  				{
  					top.add( word);
  				}
  			}
  		}
  		finally
  		{
  			// the stream is fully consumed (or failed); either way release it
  			ts.close();
  		}
  		BooleanQuery tmp = new BooleanQuery();
  		
  		// [2] form query
  		Iterator it = top.iterator();
  		while ( it.hasNext())
  		{
  			// [2a] add top-level words in
  			String word = (String) it.next();
  			TermQuery tq = new TermQuery( new Term( field, word));
  			tmp.add( tq, BooleanClause.Occur.SHOULD);
  
  			// [2b] add in unique synonyms
  			Hits hits = syns.search( new TermQuery( new Term(Syns2Index.F_WORD, word)));
  			for (int i = 0; i < hits.length(); i++)
  			{
  				Document doc = hits.doc(i);
  				String[] values = doc.getValues( Syns2Index.F_SYN);
  				for ( int j = 0; j < values.length; j++)
  				{
  					String syn = values[ j];
  					if ( already.add( syn)) // avoid dups of top level words and synonyms
  					{
  						tq = new TermQuery( new Term( field, syn));
  						if ( boost > 0) // else keep normal 1.0
  						{
  							tq.setBoost( boost);
  						}
  						tmp.add( tq, BooleanClause.Occur.SHOULD); 
  					}
  				}
  			}
  		}
  
  
  		return tmp;
  	}
  								
  }
  
  
  
  1.1                  jakarta-lucene-sandbox/contributions/WordNet/src/java/org/apache/lucene/wordnet/package.html
  
  Index: package.html
  ===================================================================
  <html>
      <head>
  <title>WordNet Lucene Synonyms Integration</title>
  </head>
  <body>
  
      This package uses synonyms defined by <a href="http://www.cogsci.princeton.edu/~wn/">WordNet</a>
to build a
      Lucene index storing them, which in turn can be used for query expansion.
  
      You normally run {@link org.apache.lucene.wordnet.Syns2Index} once to build the synonym
index/"database", and then call
      {@link org.apache.lucene.wordnet.SynExpand#expand SynExpand.expand(...)} to expand a
query.
  
      <p>
  
  	<h3> Instructions </h3>
  	<ol>
  	    <li> Download the <a href="http://www.cogsci.princeton.edu/2.0/WNprolog-2.0.tar.gz">WordNet
prolog database</a> , gunzip, untar etc.
  	<li> Invoke Syns2Index as appropriate to build a synonym index.
  	    It'll take 2 arguments, the path to wn_s.pl from that WordNet download, and the index
name.
     
  	 <li> Update your UI so that as appropriate you call SynExpand.expand(...) to expand
user queries with synonyms.
         </ol>
  
  </body>
      </html>
  
  

---------------------------------------------------------------------
To unsubscribe, e-mail: lucene-dev-unsubscribe@jakarta.apache.org
For additional commands, e-mail: lucene-dev-help@jakarta.apache.org


Mime
View raw message