lucene-java-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From markrmil...@apache.org
Subject svn commit: r824934 - in /lucene/java/branches/flex_1458: contrib/regex/src/java/org/apache/lucene/search/regex/ src/java/org/apache/lucene/search/
Date Tue, 13 Oct 2009 21:29:24 GMT
Author: markrmiller
Date: Tue Oct 13 21:29:23 2009
New Revision: 824934

URL: http://svn.apache.org/viewvc?rev=824934&view=rev
Log:
add missing enums classes

Added:
    lucene/java/branches/flex_1458/contrib/regex/src/java/org/apache/lucene/search/regex/RegexTermsEnum.java
    lucene/java/branches/flex_1458/src/java/org/apache/lucene/search/FilteredTermsEnum.java
    lucene/java/branches/flex_1458/src/java/org/apache/lucene/search/FuzzyTermsEnum.java
    lucene/java/branches/flex_1458/src/java/org/apache/lucene/search/PrefixTermsEnum.java
    lucene/java/branches/flex_1458/src/java/org/apache/lucene/search/TermRangeTermsEnum.java
    lucene/java/branches/flex_1458/src/java/org/apache/lucene/search/WildcardTermsEnum.java

Added: lucene/java/branches/flex_1458/contrib/regex/src/java/org/apache/lucene/search/regex/RegexTermsEnum.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/contrib/regex/src/java/org/apache/lucene/search/regex/RegexTermsEnum.java?rev=824934&view=auto
==============================================================================
--- lucene/java/branches/flex_1458/contrib/regex/src/java/org/apache/lucene/search/regex/RegexTermsEnum.java
(added)
+++ lucene/java/branches/flex_1458/contrib/regex/src/java/org/apache/lucene/search/regex/RegexTermsEnum.java
Tue Oct 13 21:29:23 2009
@@ -0,0 +1,86 @@
+package org.apache.lucene.search.regex;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.search.FilteredTermsEnum;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.index.Terms;
+import org.apache.lucene.index.TermRef;
+
+import java.io.IOException;
+
+/**
+ * Subclass of FilteredTermsEnum for enumerating all terms that match the
+ * specified regular expression term using the specified regular expression
+ * implementation.
+ * <p>
+ * Term enumerations are always ordered by Term.compareTo().  Each term in
+ * the enumeration is greater than all that precede it.
+ *
+ * @deprecated Use {@link RegexTermsEnum} instead.
+ */
+
+public class RegexTermsEnum extends FilteredTermsEnum {
+  private String field = "";
+  private String pre = "";
+  private final boolean empty;
+  private RegexCapabilities regexImpl;
+  private final TermRef prefixRef;
+
+  public RegexTermsEnum(IndexReader reader, Term term, RegexCapabilities regexImpl) throws
IOException {
+    super();
+    field = term.field();
+    String text = term.text();
+    this.regexImpl = regexImpl;
+
+    regexImpl.compile(text);
+
+    pre = regexImpl.prefix();
+    if (pre == null) pre = "";
+
+    Terms terms = reader.fields().terms(term.field());
+    prefixRef = new TermRef(pre);
+    if (terms != null) {
+      empty = setEnum(terms.iterator(), prefixRef) == null;
+    } else {
+      empty = true;
+    }
+  }
+
+  public String field() {
+    return field;
+  }
+
+  protected final boolean accept(TermRef term) {
+    if (term.startsWith(prefixRef)) {
+      return regexImpl.match(term.toString());
+    } else {
+      return false;
+    }
+  }
+
+  public final float difference() {
+// TODO: adjust difference based on distance of searchTerm.text() and term().text()
+    return 1.0f;
+  }
+
+  public final boolean empty() {
+    return empty;
+  }
+}

Added: lucene/java/branches/flex_1458/src/java/org/apache/lucene/search/FilteredTermsEnum.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/src/java/org/apache/lucene/search/FilteredTermsEnum.java?rev=824934&view=auto
==============================================================================
--- lucene/java/branches/flex_1458/src/java/org/apache/lucene/search/FilteredTermsEnum.java
(added)
+++ lucene/java/branches/flex_1458/src/java/org/apache/lucene/search/FilteredTermsEnum.java
Tue Oct 13 21:29:23 2009
@@ -0,0 +1,142 @@
+package org.apache.lucene.search;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import org.apache.lucene.index.TermRef;
+import org.apache.lucene.index.TermsEnum;
+import org.apache.lucene.index.DocsEnum;
+import org.apache.lucene.util.Bits;
+
+/**
+ * Abstract class for enumerating a subset of all terms. 
+ *
+ * <p>On creation, the enumerator must already be positioned
+ * to the first term.</p>
+ * 
+ * <p>Term enumerations are always ordered by
+ * Term.compareTo().  Each term in the enumeration is
+ * greater than all that precede it.</p>
+*/
+public abstract class FilteredTermsEnum extends TermsEnum {
+    
+  /** the delegate enum - to set this member use {@link #setEnum} */
+  protected TermsEnum actualEnum;
+    
+  /** Return true if term is acceptd */
+  protected abstract boolean accept(TermRef term);
+    
+  /** Equality measure on the term */
+  public abstract float difference();
+
+  public abstract String field();
+
+  /** Only called once, right after construction, to check
+   *  whether there are no matching terms */
+  public abstract boolean empty();
+
+  /**
+   * use this method to set the actual TermsEnum (e.g. in ctor),
+   * it will be automatically positioned on the first
+   * accepted term, and returns the term found or null if
+   * there is no matching term.
+   */
+  protected TermRef setEnum(TermsEnum actualEnum, TermRef term) throws IOException {
+    this.actualEnum = actualEnum;
+
+    // Find the first term that matches
+    if (term != null) {
+      SeekStatus status = actualEnum.seek(term);
+      if (status == SeekStatus.END) {
+        return null;
+      } else {
+        if (!accept(actualEnum.term())) {
+          return next();
+        } else {
+          return actualEnum.term();
+        }
+      }
+    } else {
+      return next();
+    }
+  }
+
+  public TermRef term() throws IOException {
+    assert actualEnum != null;
+    return actualEnum.term();
+  }
+    
+  /** 
+   * Returns the docFreq of the current Term in the enumeration.
+   * Returns -1 if no Term matches or all terms have been enumerated.
+   */
+  public int docFreq() {
+    assert actualEnum != null;
+    return actualEnum.docFreq();
+  }
+    
+  /** Increments the enumeration to the next element.  True if one exists. */
+  public TermRef next() throws IOException {
+    assert actualEnum != null;
+    while (true) {
+      TermRef term = actualEnum.next();
+      if (term != null) {
+        if (accept(term)) {
+          return term;
+        }
+      } else {
+        // end
+        return null;
+      }
+    }
+  }
+
+  public SeekStatus seek(TermRef term) throws IOException {
+    return finishSeek(actualEnum.seek(term));
+  }
+
+  public SeekStatus seek(long ord) throws IOException {
+    return finishSeek(actualEnum.seek(ord));
+  }
+
+  private SeekStatus finishSeek(SeekStatus status) throws IOException {
+    if (status != SeekStatus.END) {
+      TermRef term = actualEnum.term();
+      if (!accept(term)) {
+        term = next();
+        if (term == null) {
+          return SeekStatus.END;
+        } else {
+          return SeekStatus.NOT_FOUND;
+        }
+      } else {
+        return status;
+      }
+    } else {
+      return status;
+    }
+  }
+
+  public long ord() throws IOException {
+    return actualEnum.ord();
+  }
+
+  public DocsEnum docs(Bits bits) throws IOException {
+    return actualEnum.docs(bits);
+  }
+}

Added: lucene/java/branches/flex_1458/src/java/org/apache/lucene/search/FuzzyTermsEnum.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/src/java/org/apache/lucene/search/FuzzyTermsEnum.java?rev=824934&view=auto
==============================================================================
--- lucene/java/branches/flex_1458/src/java/org/apache/lucene/search/FuzzyTermsEnum.java (added)
+++ lucene/java/branches/flex_1458/src/java/org/apache/lucene/search/FuzzyTermsEnum.java Tue
Oct 13 21:29:23 2009
@@ -0,0 +1,317 @@
+package org.apache.lucene.search;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.index.Terms;
+import org.apache.lucene.index.TermRef;
+
+import java.io.IOException;
+
+/** Subclass of FilteredTermEnum for enumerating all terms that are similar
+ * to the specified filter term.
+ *
+ * <p>Term enumerations are always ordered by Term.compareTo().  Each term in
+ * the enumeration is greater than all that precede it.
+ */
+public final class FuzzyTermsEnum extends FilteredTermsEnum {
+
+  /* This should be somewhere around the average long word.
+   * If it is longer, we waste time and space. If it is shorter, we waste a
+   * little bit of time growing the array as we encounter longer words.
+   */
+  private static final int TYPICAL_LONGEST_WORD_IN_INDEX = 19;
+
+  /* Allows us save time required to create a new array
+   * every time similarity is called.
+   */
+  private int[][] d;
+
+  private float similarity;
+  private final boolean empty;
+
+  private Term searchTerm;
+  private final String field;
+  private final String text;
+  private final String prefix;
+
+  private final float minimumSimilarity;
+  private final float scale_factor;
+  private final int[] maxDistances = new int[TYPICAL_LONGEST_WORD_IN_INDEX];
+
+  // nocommit -- remove some of these ctors:
+  /**
+   * Creates a FuzzyTermEnum with an empty prefix and a minSimilarity of 0.5f.
+   * <p>
+   * After calling the constructor the enumeration is already pointing to the first 
+   * valid term if such a term exists. 
+   * 
+   * @param reader
+   * @param term
+   * @throws IOException
+   * @see #FuzzyTermEnum(IndexReader, Term, float, int)
+   */
+  public FuzzyTermsEnum(IndexReader reader, Term term) throws IOException {
+    this(reader, term, FuzzyQuery.defaultMinSimilarity, FuzzyQuery.defaultPrefixLength);
+  }
+    
+  /**
+   * Creates a FuzzyTermEnum with an empty prefix.
+   * <p>
+   * After calling the constructor the enumeration is already pointing to the first 
+   * valid term if such a term exists. 
+   * 
+   * @param reader
+   * @param term
+   * @param minSimilarity
+   * @throws IOException
+   * @see #FuzzyTermEnum(IndexReader, Term, float, int)
+   */
+  public FuzzyTermsEnum(IndexReader reader, Term term, float minSimilarity) throws IOException
{
+    this(reader, term, minSimilarity, FuzzyQuery.defaultPrefixLength);
+  }
+    
+  /**
+   * Constructor for enumeration of all terms from specified <code>reader</code>
which share a prefix of
+   * length <code>prefixLength</code> with <code>term</code> and
which have a fuzzy similarity &gt;
+   * <code>minSimilarity</code>.
+   * <p>
+   * After calling the constructor the enumeration is already pointing to the first 
+   * valid term if such a term exists. 
+   * 
+   * @param reader Delivers terms.
+   * @param term Pattern term.
+   * @param minSimilarity Minimum required similarity for terms from the reader. Default
value is 0.5f.
+   * @param prefixLength Length of required common prefix. Default value is 0.
+   * @throws IOException
+   */
+  public FuzzyTermsEnum(IndexReader reader, Term term, final float minSimilarity, final int
prefixLength) throws IOException {
+    super();
+    
+    if (minSimilarity >= 1.0f)
+      throw new IllegalArgumentException("minimumSimilarity cannot be greater than or equal
to 1");
+    else if (minSimilarity < 0.0f)
+      throw new IllegalArgumentException("minimumSimilarity cannot be less than 0");
+    if(prefixLength < 0)
+      throw new IllegalArgumentException("prefixLength cannot be less than 0");
+
+    this.minimumSimilarity = minSimilarity;
+    this.scale_factor = 1.0f / (1.0f - minimumSimilarity);
+    this.searchTerm = term;
+    this.field = searchTerm.field();
+
+    //The prefix could be longer than the word.
+    //It's kind of silly though.  It means we must match the entire word.
+    final int fullSearchTermLength = searchTerm.text().length();
+    final int realPrefixLength = prefixLength > fullSearchTermLength ? fullSearchTermLength
: prefixLength;
+
+    this.text = searchTerm.text().substring(realPrefixLength);
+    this.prefix = searchTerm.text().substring(0, realPrefixLength);
+    prefixTermRef = new TermRef(prefix);
+    initializeMaxDistances();
+    this.d = initDistanceArray();
+
+    Terms terms = reader.fields().terms(field);
+    if (terms != null) {
+      empty = setEnum(terms.iterator(), prefixTermRef) == null;
+    } else {
+      empty = false;
+    }
+  }
+
+  private final TermRef prefixTermRef;
+
+  public String field() {
+    return field;
+  }
+
+  /**
+   * The termCompare method in FuzzyTermEnum uses Levenshtein distance to 
+   * calculate the distance between the given term and the comparing term. 
+   */
+  protected final boolean accept(TermRef term) {
+    if (term.startsWith(prefixTermRef)) {
+      // TODO: costly that we create intermediate String:
+      final String target = term.toString().substring(prefix.length());
+      this.similarity = similarity(target);
+      return (similarity > minimumSimilarity);
+    } else {
+      return false;
+    }
+  }
+  
+  public final float difference() {
+    return (float)((similarity - minimumSimilarity) * scale_factor);
+  }
+  
+  public final boolean empty() {
+    return empty;
+  }
+  
+  /******************************
+   * Compute Levenshtein distance
+   ******************************/
+  
+  /**
+   * Finds and returns the smallest of three integers 
+   */
+  private static final int min(int a, int b, int c) {
+    final int t = (a < b) ? a : b;
+    return (t < c) ? t : c;
+  }
+
+  private final int[][] initDistanceArray(){
+    return new int[this.text.length() + 1][TYPICAL_LONGEST_WORD_IN_INDEX];
+  }
+
+  /**
+   * <p>Similarity returns a number that is 1.0f or less (including negative numbers)
+   * based on how similar the Term is compared to a target term.  It returns
+   * exactly 0.0f when
+   * <pre>
+   *    editDistance &lt; maximumEditDistance</pre>
+   * Otherwise it returns:
+   * <pre>
+   *    1 - (editDistance / length)</pre>
+   * where length is the length of the shortest term (text or target) including a
+   * prefix that are identical and editDistance is the Levenshtein distance for
+   * the two words.</p>
+   *
+   * <p>Embedded within this algorithm is a fail-fast Levenshtein distance
+   * algorithm.  The fail-fast algorithm differs from the standard Levenshtein
+   * distance algorithm in that it is aborted if it is discovered that the
+   * minimum distance between the words is greater than some threshold.
+   *
+   * <p>To calculate the maximum distance threshold we use the following formula:
+   * <pre>
+   *     (1 - minimumSimilarity) * length</pre>
+   * where length is the shortest term including any prefix that is not part of the
+   * similarity comparison.  This formula was derived by solving for what maximum value
+   * of distance returns false for the following statements:
+   * <pre>
+   *   similarity = 1 - ((float)distance / (float) (prefixLength + Math.min(textlen, targetlen)));
+   *   return (similarity > minimumSimilarity);</pre>
+   * where distance is the Levenshtein distance for the two words.
+   * </p>
+   * <p>Levenshtein distance (also known as edit distance) is a measure of similarity
+   * between two strings where the distance is measured as the number of character
+   * deletions, insertions or substitutions required to transform one string to
+   * the other string.
+   * @param target the target word or phrase
+   * @return the similarity,  0.0 or less indicates that it matches less than the required
+   * threshold and 1.0 indicates that the text and target are identical
+   */
+  private synchronized final float similarity(final String target) {
+    final int m = target.length();
+    final int n = text.length();
+    if (n == 0)  {
+      //we don't have anything to compare.  That means if we just add
+      //the letters for m we get the new word
+      return prefix.length() == 0 ? 0.0f : 1.0f - ((float) m / prefix.length());
+    }
+    if (m == 0) {
+      return prefix.length() == 0 ? 0.0f : 1.0f - ((float) n / prefix.length());
+    }
+
+    final int maxDistance = getMaxDistance(m);
+
+    if (maxDistance < Math.abs(m-n)) {
+      //just adding the characters of m to n or vice-versa results in
+      //too many edits
+      //for example "pre" length is 3 and "prefixes" length is 8.  We can see that
+      //given this optimal circumstance, the edit distance cannot be less than 5.
+      //which is 8-3 or more precisely Math.abs(3-8).
+      //if our maximum edit distance is 4, then we can discard this word
+      //without looking at it.
+      return 0.0f;
+    }
+
+    //let's make sure we have enough room in our array to do the distance calculations.
+    if (d[0].length <= m) {
+      growDistanceArray(m);
+    }
+
+    // init matrix d
+    for (int i = 0; i <= n; i++) d[i][0] = i;
+    for (int j = 0; j <= m; j++) d[0][j] = j;
+    
+    // start computing edit distance
+    for (int i = 1; i <= n; i++) {
+      int bestPossibleEditDistance = m;
+      final char s_i = text.charAt(i - 1);
+      for (int j = 1; j <= m; j++) {
+        if (s_i != target.charAt(j-1)) {
+            d[i][j] = min(d[i-1][j], d[i][j-1], d[i-1][j-1])+1;
+        }
+        else {
+          d[i][j] = min(d[i-1][j]+1, d[i][j-1]+1, d[i-1][j-1]);
+        }
+        bestPossibleEditDistance = Math.min(bestPossibleEditDistance, d[i][j]);
+      }
+
+      //After calculating row i, the best possible edit distance
+      //can be found by found by finding the smallest value in a given column.
+      //If the bestPossibleEditDistance is greater than the max distance, abort.
+
+      if (i > maxDistance && bestPossibleEditDistance > maxDistance) {  //equal
is okay, but not greater
+        //the closest the target can be to the text is just too far away.
+        //this target is leaving the party early.
+        return 0.0f;
+      }
+    }
+
+    // this will return less than 0.0 when the edit distance is
+    // greater than the number of characters in the shorter word.
+    // but this was the formula that was previously used in FuzzyTermEnum,
+    // so it has not been changed (even though minimumSimilarity must be
+    // greater than 0.0)
+    return 1.0f - ((float)d[n][m] / (float) (prefix.length() + Math.min(n, m)));
+  }
+
+  /**
+   * Grow the second dimension of the array, so that we can calculate the
+   * Levenshtein difference.
+   */
+  private void growDistanceArray(int m) {
+    for (int i = 0; i < d.length; i++) {
+      d[i] = new int[m+1];
+    }
+  }
+
+  /**
+   * The max Distance is the maximum Levenshtein distance for the text
+   * compared to some other value that results in score that is
+   * better than the minimum similarity.
+   * @param m the length of the "other value"
+   * @return the maximum levenshtein distance that we care about
+   */
+  private final int getMaxDistance(int m) {
+    return (m < maxDistances.length) ? maxDistances[m] : calculateMaxDistance(m);
+  }
+
+  private void initializeMaxDistances() {
+    for (int i = 0; i < maxDistances.length; i++) {
+      maxDistances[i] = calculateMaxDistance(i);
+    }
+  }
+  
+  private int calculateMaxDistance(int m) {
+    return (int) ((1-minimumSimilarity) * (Math.min(text.length(), m) + prefix.length()));
+  }
+}

Added: lucene/java/branches/flex_1458/src/java/org/apache/lucene/search/PrefixTermsEnum.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/src/java/org/apache/lucene/search/PrefixTermsEnum.java?rev=824934&view=auto
==============================================================================
--- lucene/java/branches/flex_1458/src/java/org/apache/lucene/search/PrefixTermsEnum.java
(added)
+++ lucene/java/branches/flex_1458/src/java/org/apache/lucene/search/PrefixTermsEnum.java
Tue Oct 13 21:29:23 2009
@@ -0,0 +1,72 @@
+package org.apache.lucene.search;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.index.Terms;
+import org.apache.lucene.index.TermRef;
+
+/**
+ * Subclass of FilteredTermEnum for enumerating all terms that match the
+ * specified prefix filter term.
+ * <p>
+ * Term enumerations are always ordered by Term.compareTo().  Each term in
+ * the enumeration is greater than all that precede it.
+ *
+ */
+public class PrefixTermsEnum extends FilteredTermsEnum {
+
+  private final Term prefix;
+  private final TermRef prefixRef;
+  private final boolean empty;
+
+  public PrefixTermsEnum(IndexReader reader, Term prefix) throws IOException {
+    this.prefix = prefix;
+    Terms terms = reader.fields().terms(prefix.field());
+    if (terms != null) {
+      prefixRef = new TermRef(prefix.text());
+      empty = setEnum(terms.iterator(), prefixRef) == null;
+    } else {
+      empty = true;
+      prefixRef = null;
+    }
+  }
+
+  public String field() {
+    return prefix.field();
+  }
+
+  public float difference() {
+    return 1.0f;
+  }
+
+  public boolean empty() {
+    return empty;
+  }
+  
+  protected Term getPrefixTerm() {
+    return prefix;
+  }
+
+  protected boolean accept(TermRef term) {
+    return term.startsWith(prefixRef);
+  }
+}

Added: lucene/java/branches/flex_1458/src/java/org/apache/lucene/search/TermRangeTermsEnum.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/src/java/org/apache/lucene/search/TermRangeTermsEnum.java?rev=824934&view=auto
==============================================================================
--- lucene/java/branches/flex_1458/src/java/org/apache/lucene/search/TermRangeTermsEnum.java
(added)
+++ lucene/java/branches/flex_1458/src/java/org/apache/lucene/search/TermRangeTermsEnum.java
Tue Oct 13 21:29:23 2009
@@ -0,0 +1,155 @@
+package org.apache.lucene.search;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.text.Collator;
+
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.TermRef;
+import org.apache.lucene.index.Terms;
+//import org.apache.lucene.index.Term;
+import org.apache.lucene.util.StringHelper;
+
+/**
+ * Subclass of FilteredTermEnum for enumerating all terms that match the
+ * specified range parameters.
+ * <p>
+ * Term enumerations are always ordered by Term.compareTo().  Each term in
+ * the enumeration is greater than all that precede it.
+ */
+public class TermRangeTermsEnum extends FilteredTermsEnum {
+
+  private Collator collator;
+  private boolean end;
+  private String field;
+  private String upperTermText;
+  private String lowerTermText;
+  private boolean includeLower;
+  private boolean includeUpper;
+  final private TermRef lowerTermRef;
+  final private TermRef upperTermRef;
+  private final boolean empty;
+
+  /**
+   * Enumerates all terms greater/equal than <code>lowerTerm</code>
+   * but less/equal than <code>upperTerm</code>. 
+   * 
+   * If an endpoint is null, it is said to be "open". Either or both 
+   * endpoints may be open.  Open endpoints may not be exclusive 
+   * (you can't select all but the first or last term without 
+   * explicitly specifying the term to exclude.)
+   * 
+   * @param reader
+   * @param field
+   *          An interned field that holds both lower and upper terms.
+   * @param lowerTermText
+   *          The term text at the lower end of the range
+   * @param upperTermText
+   *          The term text at the upper end of the range
+   * @param includeLower
+   *          If true, the <code>lowerTerm</code> is included in the range.
+   * @param includeUpper
+   *          If true, the <code>upperTerm</code> is included in the range.
+   * @param collator
+   *          The collator to use to collate index Terms, to determine their
+   *          membership in the range bounded by <code>lowerTerm</code> and
+   *          <code>upperTerm</code>.
+   * 
+   * @throws IOException
+   */
+  public TermRangeTermsEnum(IndexReader reader, String field, String lowerTermText, String
upperTermText, 
+    boolean includeLower, boolean includeUpper, Collator collator) throws IOException {
+    this.collator = collator;
+    this.upperTermText = upperTermText;
+    this.lowerTermText = lowerTermText;
+    this.includeLower = includeLower;
+    this.includeUpper = includeUpper;
+    this.field = StringHelper.intern(field);
+    // do a little bit of normalization...
+    // open ended range queries should always be inclusive.
+    if (this.lowerTermText == null) {
+      this.lowerTermText = "";
+      this.includeLower = true;
+    }
+    lowerTermRef = new TermRef(this.lowerTermText);
+    
+    if (this.upperTermText == null) {
+      this.includeUpper = true;
+      upperTermRef = null;
+    } else {
+      upperTermRef = new TermRef(upperTermText);
+    }
+
+    String startTermText = collator == null ? this.lowerTermText : "";
+    Terms terms = reader.fields().terms(field);
+
+    if (terms != null) {
+      final boolean foundFirstTerm = setEnum(terms.iterator(), new TermRef(startTermText))
!= null;
+      if (foundFirstTerm && collator == null && !this.includeLower &&
term().termEquals(lowerTermRef)) {
+        empty = next() == null;
+      } else {
+        empty = !foundFirstTerm;
+      }
+    } else {
+      empty = true;
+    }
+  }
+
+  public float difference() {
+    return 1.0f;
+  }
+
+  public boolean empty() {
+    return empty;
+  }
+
+  public String field() {
+    return field;
+  }
+
+  protected boolean accept(TermRef term) {
+    if (collator == null) {
+      // Use Unicode code point ordering
+      if (upperTermRef != null) {
+        final int cmp = upperTermRef.compareTerm(term);
+        /*
+         * if beyond the upper term, or is exclusive and this is equal to
+         * the upper term, break out
+         */
+        if ((cmp < 0) ||
+            (!includeUpper && cmp==0)) {
+          return false;
+        }
+      }
+      return true;
+    } else {
+      if ((includeLower
+           ? collator.compare(term.toString(), lowerTermText) >= 0
+           : collator.compare(term.toString(), lowerTermText) > 0)
+          && (upperTermText == null
+              || (includeUpper
+                  ? collator.compare(term.toString(), upperTermText) <= 0
+                  : collator.compare(term.toString(), upperTermText) < 0))) {
+        return true;
+      }
+      end = true;
+    }
+    return false;
+  }
+}

Added: lucene/java/branches/flex_1458/src/java/org/apache/lucene/search/WildcardTermsEnum.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/src/java/org/apache/lucene/search/WildcardTermsEnum.java?rev=824934&view=auto
==============================================================================
--- lucene/java/branches/flex_1458/src/java/org/apache/lucene/search/WildcardTermsEnum.java
(added)
+++ lucene/java/branches/flex_1458/src/java/org/apache/lucene/search/WildcardTermsEnum.java
Tue Oct 13 21:29:23 2009
@@ -0,0 +1,203 @@
+package org.apache.lucene.search;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.index.Terms;
+import org.apache.lucene.index.TermRef;
+
+/**
+ * Subclass of FilteredTermEnum for enumerating all terms that match the
+ * specified wildcard filter term.
+ * <p>
+ * Term enumerations are always ordered by Term.compareTo().  Each term in
+ * the enumeration is greater than all that precede it.
+ *
+ * @version $Id: WildcardTermEnum.java 783371 2009-06-10 14:39:56Z mikemccand $
+ */
+public class WildcardTermsEnum extends FilteredTermsEnum {
+  final Term searchTerm;
+  final String field;
+  final String text;
+  final String pre;
+  final int preLen;
+  private final boolean empty;
+  private final TermRef preTermRef;
+
+  /**
+   * Creates a new <code>WildcardTermEnum</code>.
+   * <p>
+   * After calling the constructor the enumeration is already pointing to the first 
+   * valid term if such a term exists.
+   */
+  public WildcardTermsEnum(IndexReader reader, Term term) throws IOException {
+    super();
+    searchTerm = term;
+    field = searchTerm.field();
+    final String searchTermText = searchTerm.text();
+
+    final int sidx = searchTermText.indexOf(WILDCARD_STRING);
+    final int cidx = searchTermText.indexOf(WILDCARD_CHAR);
+    int idx = sidx;
+    if (idx == -1) {
+      idx = cidx;
+    }
+    else if (cidx >= 0) {
+      idx = Math.min(idx, cidx);
+    }
+    pre = idx != -1?searchTerm.text().substring(0,idx): "";
+
+    preLen = pre.length();
+    text = searchTermText.substring(preLen);
+    preTermRef = new TermRef(pre);
+
+    Terms terms = reader.fields().terms(searchTerm.field());
+    if (terms != null) {
+      empty = setEnum(terms.iterator(), preTermRef) == null;
+    } else {
+      empty = true;
+    }
+  }
+
+  public String field() {
+    return searchTerm.field();
+  }
+
+  protected final boolean accept(TermRef term) {
+    if (term.startsWith(preTermRef)) {
+      // TODO: would be better, but trickier, to not have to
+      // build intermediate String (ie check wildcard matching
+      // directly on UTF8)
+      final String searchText = term.toString();
+      return wildcardEquals(text, 0, searchText, preLen);
+    }
+    return false;
+  }
+
+  public float difference() {
+    return 1.0f;
+  }
+
+  public final boolean empty() {
+    return empty;
+  }
+
+  /********************************************
+   * String equality with support for wildcards
+   ********************************************/
+
+  public static final char WILDCARD_STRING = '*';
+  public static final char WILDCARD_CHAR = '?';
+
+  /**
+   * Determines if a word matches a wildcard pattern.
+   * <small>Work released by Granta Design Ltd after originally being done on
+   * company time.</small>
+   */
+  public static final boolean wildcardEquals(String pattern, int patternIdx,
+    String string, int stringIdx)
+  {
+    int p = patternIdx;
+    
+    for (int s = stringIdx; ; ++p, ++s)
+      {
+        // End of string yet?
+        boolean sEnd = (s >= string.length());
+        // End of pattern yet?
+        boolean pEnd = (p >= pattern.length());
+
+        // If we're looking at the end of the string...
+        if (sEnd)
+        {
+          // Assume the only thing left on the pattern is/are wildcards
+          boolean justWildcardsLeft = true;
+
+          // Current wildcard position
+          int wildcardSearchPos = p;
+          // While we haven't found the end of the pattern,
+          // and haven't encountered any non-wildcard characters
+          while (wildcardSearchPos < pattern.length() && justWildcardsLeft)
+          {
+            // Check the character at the current position
+            char wildchar = pattern.charAt(wildcardSearchPos);
+            
+            // If it's not a wildcard character, then there is more
+            // pattern information after this/these wildcards.
+            if (wildchar != WILDCARD_CHAR && wildchar != WILDCARD_STRING)
+            {
+              justWildcardsLeft = false;
+            }
+            else
+            {
+              // to prevent "cat" matches "ca??"
+              if (wildchar == WILDCARD_CHAR) {
+                return false;
+              }
+              
+              // Look at the next character
+              wildcardSearchPos++;
+            }
+          }
+
+          // This was a prefix wildcard search, and we've matched, so
+          // return true.
+          if (justWildcardsLeft)
+          {
+            return true;
+          }
+        }
+
+        // If we've gone past the end of the string, or the pattern,
+        // return false.
+        if (sEnd || pEnd)
+        {
+          break;
+        }
+
+        // Match a single character, so continue.
+        if (pattern.charAt(p) == WILDCARD_CHAR)
+        {
+          continue;
+        }
+
+        //
+        if (pattern.charAt(p) == WILDCARD_STRING)
+        {
+          // Look at the character beyond the '*'.
+          ++p;
+          // Examine the string, starting at the last character.
+          for (int i = string.length(); i >= s; --i)
+          {
+            if (wildcardEquals(pattern, p, string, i))
+            {
+              return true;
+            }
+          }
+          break;
+        }
+        if (pattern.charAt(p) != string.charAt(s))
+        {
+          break;
+        }
+      }
+      return false;
+  }
+}



Mime
View raw message