lucene-java-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From yo...@apache.org
Subject svn commit: r596484 - in /lucene/java/trunk: CHANGES.txt src/java/org/apache/lucene/analysis/CharArraySet.java src/java/org/apache/lucene/analysis/StopFilter.java src/test/org/apache/lucene/analysis/TestStopFilter.java
Date Mon, 19 Nov 2007 23:23:08 GMT
Author: yonik
Date: Mon Nov 19 15:23:04 2007
New Revision: 596484

URL: http://svn.apache.org/viewvc?rev=596484&view=rev
Log:
LUCENE-1040: new CharArraySet, make StopFilter directly use it

Modified:
    lucene/java/trunk/CHANGES.txt
    lucene/java/trunk/src/java/org/apache/lucene/analysis/CharArraySet.java
    lucene/java/trunk/src/java/org/apache/lucene/analysis/StopFilter.java
    lucene/java/trunk/src/test/org/apache/lucene/analysis/TestStopFilter.java

Modified: lucene/java/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/java/trunk/CHANGES.txt?rev=596484&r1=596483&r2=596484&view=diff
==============================================================================
--- lucene/java/trunk/CHANGES.txt (original)
+++ lucene/java/trunk/CHANGES.txt Mon Nov 19 15:23:04 2007
@@ -1,5 +1,4 @@
-Lucene Change Log
-
+Lucene Change Log
 $Id$
 
 ======================= Trunk (not yet released) =======================
@@ -203,6 +202,10 @@
     be significantly faster than open(), depending on the amount of
     index changes. SegmentReader, MultiSegmentReader, MultiReader,
     and ParallelReader implement reopen(). (Michael Busch) 
+
+10. LUCENE-1040: CharArraySet useful for efficiently checking
+    set membership of text specified by char[]. (yonik)
+ 
 
 Optimizations
 

Modified: lucene/java/trunk/src/java/org/apache/lucene/analysis/CharArraySet.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/analysis/CharArraySet.java?rev=596484&r1=596483&r2=596484&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/analysis/CharArraySet.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/analysis/CharArraySet.java Mon Nov 19 15:23:04
2007
@@ -1,5 +1,9 @@
 package org.apache.lucene.analysis;
 
+import java.util.AbstractSet;
+import java.util.Collection;
+import java.util.Iterator;
+
 /**
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
@@ -19,90 +23,138 @@
 
 
 /**
- * A simple class that can store & retrieve char[]'s in a
+ * A simple class that stores Strings as char[]'s in a
  * hash table.  Note that this is not a general purpose
- * class.  For example, it cannot remove char[]'s from the
+ * class.  For example, it cannot remove items from the
  * set, nor does it resize its hash table to be smaller,
- * etc.  It is designed for use with StopFilter to enable
- * quick filtering based on the char[] termBuffer in a
- * Token.
+ * etc.  It is designed to be quick to test if a char[]
+ * is in the set without the necessity of converting it
+ * to a String first.
  */
 
-final class CharArraySet {
-
+public class CharArraySet extends AbstractSet {
   private final static int INIT_SIZE = 8;
-  private final static double MAX_LOAD_FACTOR = 0.75;
-  private int mask;
   private char[][] entries;
   private int count;
-  private boolean ignoreCase;
+  private final boolean ignoreCase;
 
   /** Create set with enough capacity to hold startSize
    *  terms */
   public CharArraySet(int startSize, boolean ignoreCase) {
     this.ignoreCase = ignoreCase;
     int size = INIT_SIZE;
-    while(((double) startSize)/size >= MAX_LOAD_FACTOR)
-      size *= 2;
-    mask = size-1;
+    while(startSize + (startSize>>2) > size)
+      size <<= 1;
     entries = new char[size][];
   }
 
-  /** Returns true if the characters in text up to length
-   *  len is present in the set. */
-  public boolean contains(char[] text, int len) {
+ /** Create set from a Collection of char[] or String */
+  public CharArraySet(Collection c, boolean ignoreCase) {
+    this(c.size(), ignoreCase);
+    addAll(c);
+  }
+
+  /** true if the <code>len</code> chars of <code>text</code> starting
at <code>off</code>
+   * are in the set */
+  public boolean contains(char[] text, int off, int len) {
+    return entries[getSlot(text, off, len)] != null;
+  }
+
+  /** true if the <code>CharSequence</code> is in the set */
+  public boolean contains(CharSequence cs) {
+    return entries[getSlot(cs)] != null;
+  }
+
+  private int getSlot(char[] text, int off, int len) {
     int code = getHashCode(text, len);
-    int pos = code & mask;
+    int pos = code & (entries.length-1);
     char[] text2 = entries[pos];
-    if (text2 != null && !equals(text, len, text2)) {
+    if (text2 != null && !equals(text, off, len, text2)) {
       final int inc = ((code>>8)+code)|1;
       do {
         code += inc;
-        pos = code & mask;
+        pos = code & (entries.length-1);
         text2 = entries[pos];
-      } while (text2 != null && !equals(text, len, text2));
+      } while (text2 != null && !equals(text, off, len, text2));
     }
-    return text2 != null;
-  }
-
-  /** Add this String into the set */
-  public void add(String text) {
-    add(text.toCharArray());
+    return pos;
   }
 
-  /** Add this text into the set */
-  public void add(char[] text) {
-    if (ignoreCase)
-      for(int i=0;i<text.length;i++)
-        text[i] = Character.toLowerCase(text[i]);
-    int code = getHashCode(text, text.length);
-    int pos = code & mask;
+  /** Returns true if the String is in the set */  
+  private int getSlot(CharSequence text) {
+    int code = getHashCode(text);
+    int pos = code & (entries.length-1);
     char[] text2 = entries[pos];
-    if (text2 != null) {
+    if (text2 != null && !equals(text, text2)) {
       final int inc = ((code>>8)+code)|1;
       do {
         code += inc;
-        pos = code & mask;
+        pos = code & (entries.length-1);
         text2 = entries[pos];
-      } while (text2 != null);
+      } while (text2 != null && !equals(text, text2));
     }
-    entries[pos] = text;
+    return pos;
+  }
+
+  /** Add this CharSequence into the set */
+  public boolean add(CharSequence text) {
+    return add(text.toString()); // could be more efficient
+  }
+
+  /** Add this String into the set */
+  public boolean add(String text) {
+    return add(text.toCharArray());
+  }
+
+  /** Add this char[] directly to the set.
+   * If ignoreCase is true for this Set, the text array will be directly modified.
+   * The user should never modify this text array after calling this method.
+   */
+  public boolean add(char[] text) {
+    if (ignoreCase)
+      for(int i=0;i<text.length;i++)
+        text[i] = Character.toLowerCase(text[i]);
+    int slot = getSlot(text, 0, text.length);
+    if (entries[slot] != null) return false;
+    entries[slot] = text;
     count++;
 
-    if (((double) count)/entries.length > MAX_LOAD_FACTOR) {
+    if (count > entries.length + (entries.length>>2) ) {
       rehash();
     }
+
+    return true;
   }
 
-  private boolean equals(char[] text1, int len, char[] text2) {
+  private boolean equals(char[] text1, int off, int len, char[] text2) {
     if (len != text2.length)
       return false;
-    for(int i=0;i<len;i++) {
-      if (ignoreCase) {
-        if (Character.toLowerCase(text1[i]) != text2[i])
+    if (ignoreCase) {
+      for(int i=0;i<len;i++) {
+        if (Character.toLowerCase(text1[off+i]) != text2[i])
           return false;
-      } else {
-        if (text1[i] != text2[i])
+      }
+    } else {
+      for(int i=0;i<len;i++) {
+        if (text1[off+i] != text2[i])
+          return false;
+      }
+    }
+    return true;
+  }
+
+  private boolean equals(CharSequence text1, char[] text2) {
+    int len = text1.length();
+    if (len != text2.length)
+      return false;
+    if (ignoreCase) {
+      for(int i=0;i<len;i++) {
+        if (Character.toLowerCase(text1.charAt(i)) != text2[i])
+          return false;
+      }
+    } else {
+      for(int i=0;i<len;i++) {
+        if (text1.charAt(i) != text2[i])
           return false;
       }
     }
@@ -111,39 +163,125 @@
 
   private void rehash() {
     final int newSize = 2*count;
-    mask = newSize-1;
+    char[][] oldEntries = entries;
+    char[][] entries = new char[newSize][];
 
-    char[][] newEntries = new char[newSize][];
-    for(int i=0;i<entries.length;i++) {
-      char[] text = entries[i];
+    for(int i=0;i<oldEntries.length;i++) {
+      char[] text = oldEntries[i];
       if (text != null) {
-        int code = getHashCode(text, text.length);
-        int pos = code & mask;
-        if (newEntries[pos] != null) {
-          final int inc = ((code>>8)+code)|1;
-          do {
-            code += inc;
-            pos = code & mask;
-          } while (newEntries[pos] != null);
-        }
-        newEntries[pos] = text;
+        // todo: could be faster... no need to compare strings on collision
+        entries[ getSlot(text,0,text.length) ] = text;
       }
     }
-
-    entries = newEntries;
   }
   
   private int getHashCode(char[] text, int len) {
-    int downto = len;
     int code = 0;
-    while (downto > 0) {
-      final char c;
-      if (ignoreCase)
-        c = Character.toLowerCase(text[--downto]);
-      else
-        c = text[--downto];
-      code = (code*31) + c;
+    if (ignoreCase) {
+      for (int i=0; i<len; i++) {
+        code = code*31 + Character.toLowerCase(text[i]);
+      }
+    } else {
+      for (int i=0; i<len; i++) {
+        code = code*31 + text[i];
+      }
+    }
+    return code;
+  }
+
+  private int getHashCode(CharSequence text) {
+    int code;
+    if (ignoreCase) {
+      code = 0;
+      int len = text.length();
+      for (int i=0; i<len; i++) {
+        code = code*31 + Character.toLowerCase(text.charAt(i));
+      }
+    } else {
+      if (false && text instanceof String) {
+        code = text.hashCode();
+      } else {
+        code = 0;
+        int len = text.length();
+        for (int i=0; i<len; i++) {
+          code = code*31 + text.charAt(i);
+        }
+      }
     }
     return code;
   }
+
+
+  public int size() {
+    return count;
+  }
+
+  public boolean isEmpty() {
+    return count==0;
+  }
+
+  public boolean contains(Object o) {
+    if (o instanceof char[]) {
+      char[] text = (char[])o;
+      return contains(text, 0, text.length);
+    } else if (o instanceof CharSequence) {
+      return contains((CharSequence)o);
+    }
+    return false;
+  }
+
+  public boolean add(Object o) {
+    if (o instanceof char[]) {
+      return add((char[])o);
+    } else if (o instanceof String) {
+      return add((String)o);
+    } else if (o instanceof CharSequence) {
+      return add((CharSequence)o);
+    } else {
+      return add(o.toString());
+    }
+  }
+
+  /** The Iterator<String> for this set.  Strings are constructed on the fly, so
+   * use <code>nextCharArray</code> for more efficient access. */
+  public class CharArraySetIterator implements Iterator {
+    int pos=-1;
+    char[] next;
+    CharArraySetIterator() {
+      goNext();
+    }
+
+    private void goNext() {
+      next = null;
+      pos++;
+      while (pos < entries.length && (next=entries[pos]) == null) pos++;
+    }
+
+    public boolean hasNext() {
+      return next != null;
+    }
+
+    /** do not modify the returned char[] */
+    public char[] nextCharArray() {
+      char[] ret = next;
+      goNext();
+      return ret;
+    }
+
+    /** Returns the next String, as a Set<String> would...
+     * use nextCharArray() for better efficiency. */
+    public Object next() {
+      return new String(nextCharArray());
+    }
+
+    public void remove() {
+      throw new UnsupportedOperationException();
+    }
+  }
+
+
+  public Iterator iterator() {
+    return new CharArraySetIterator();
+  }
+
 }

Modified: lucene/java/trunk/src/java/org/apache/lucene/analysis/StopFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/analysis/StopFilter.java?rev=596484&r1=596483&r2=596484&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/analysis/StopFilter.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/analysis/StopFilter.java Mon Nov 19 15:23:04
2007
@@ -18,7 +18,7 @@
  */
 
 import java.io.IOException;
-import java.util.HashSet;
+import java.util.Arrays;
 import java.util.Iterator;
 import java.util.Set;
 
@@ -29,7 +29,6 @@
 public final class StopFilter extends TokenFilter {
 
   private final CharArraySet stopWords;
-  private final boolean ignoreCase;
 
   /**
    * Construct a token stream filtering the given input.
@@ -45,32 +44,39 @@
    */
   public StopFilter(TokenStream in, String[] stopWords, boolean ignoreCase) {
     super(in);
-    this.ignoreCase = ignoreCase;
-    this.stopWords = makeStopCharArraySet(stopWords, ignoreCase);
+    this.stopWords = (CharArraySet)makeStopSet(stopWords, ignoreCase);
   }
 
 
   /**
    * Construct a token stream filtering the given input.
+   * If <code>stopWords</code> is an instance of {@link CharArraySet} (true if
+   * <code>makeStopSet()</code> was used to construct the set) it will be directly
used
+   * and <code>ignoreCase</code> will be ignored since <code>CharArraySet</code>
+   * directly controls case sensitivity.
+   * <p/>
+   * If <code>stopWords</code> is not an instance of {@link CharArraySet},
+   * a new CharArraySet will be constructed and <code>ignoreCase</code> will
be
+   * used to specify the case sensitivity of that set.
+   *
    * @param input
-   * @param stopWords The set of Stop Words, as Strings.  If ignoreCase is true, all strings
should be lower cased
-   * @param ignoreCase -Ignore case when stopping.  The stopWords set must be setup to contain
only lower case words 
+   * @param stopWords The set of Stop Words.
+   * @param ignoreCase -Ignore case when stopping.
    */
   public StopFilter(TokenStream input, Set stopWords, boolean ignoreCase)
   {
     super(input);
-    this.ignoreCase = ignoreCase;
-    this.stopWords = new CharArraySet(stopWords.size(), ignoreCase);
-    Iterator it = stopWords.iterator();
-    while(it.hasNext())
-      this.stopWords.add((String) it.next());
+    if (stopWords instanceof CharArraySet) {
+      this.stopWords = (CharArraySet)stopWords;
+    } else {
+      this.stopWords = new CharArraySet(stopWords.size(), ignoreCase);
+      this.stopWords.addAll(stopWords);
+    }
   }
 
   /**
    * Constructs a filter which removes words from the input
    * TokenStream that are named in the Set.
-   * It is crucial that an efficient Set implementation is used
-   * for maximum performance.
    *
    * @see #makeStopSet(java.lang.String[])
    */
@@ -97,18 +103,9 @@
    * @return a Set containing the words
    */    
   public static final Set makeStopSet(String[] stopWords, boolean ignoreCase) {
-    HashSet stopTable = new HashSet(stopWords.length);
-    for (int i = 0; i < stopWords.length; i++)
-      stopTable.add(ignoreCase ? stopWords[i].toLowerCase() : stopWords[i]);
-    return stopTable;
-  }
-
-  private static final CharArraySet makeStopCharArraySet(String[] stopWords, boolean ignoreCase)
{
     CharArraySet stopSet = new CharArraySet(stopWords.length, ignoreCase);
-    for (int i = 0; i < stopWords.length; i++)
-      stopSet.add(ignoreCase ? stopWords[i].toLowerCase() : stopWords[i]);
-    return stopSet;
-  }
+    stopSet.addAll(Arrays.asList(stopWords));
+    return stopSet;  }
 
   /**
    * Returns the next input Token whose termText() is not a stop word.
@@ -116,7 +113,7 @@
   public final Token next(Token result) throws IOException {
     // return the first non-stop word found
     while((result = input.next(result)) != null) {
-      if (!stopWords.contains(result.termBuffer(), result.termLength))
+      if (!stopWords.contains(result.termBuffer(), 0, result.termLength))
         return result;
     }
     // reached EOS -- return null

Modified: lucene/java/trunk/src/test/org/apache/lucene/analysis/TestStopFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/test/org/apache/lucene/analysis/TestStopFilter.java?rev=596484&r1=596483&r2=596484&view=diff
==============================================================================
--- lucene/java/trunk/src/test/org/apache/lucene/analysis/TestStopFilter.java (original)
+++ lucene/java/trunk/src/test/org/apache/lucene/analysis/TestStopFilter.java Mon Nov 19 15:23:04
2007
@@ -16,10 +16,11 @@
  * limitations under the License.
  */
 
+import org.apache.lucene.util.LuceneTestCase;
+
 import java.io.IOException;
 import java.io.StringReader;
-
-import org.apache.lucene.util.LuceneTestCase;
+import java.util.Set;
 
 /**
  * @author yonik
@@ -43,6 +44,16 @@
     TokenStream stream = new StopFilter(new WhitespaceTokenizer(reader), stopWords, true);
     assertEquals("Now", stream.next().termText());
     assertEquals(null,stream.next());
+  }
+
+  public void testStopFilt() throws IOException {
+    StringReader reader = new StringReader("Now is The Time");
+    String[] stopWords = new String[] { "is", "the", "Time" };
+    Set stopSet = StopFilter.makeStopSet(stopWords);
+    TokenStream stream = new StopFilter(new WhitespaceTokenizer(reader), stopSet);
+    assertEquals("Now", stream.next().termText());
+    assertEquals("The", stream.next().termText());
+    assertEquals(null, stream.next());
   }
 
 }



Mime
View raw message