jackrabbit-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From mreut...@apache.org
Subject svn commit: r239404 - in /incubator/jackrabbit/trunk/core/src/java/org/apache/jackrabbit/core/query/lucene: LuceneQueryBuilder.java WildcardQuery.java WildcardTermEnum.java
Date Tue, 23 Aug 2005 13:31:56 GMT
Author: mreutegg
Date: Tue Aug 23 06:31:51 2005
New Revision: 239404

URL: http://svn.apache.org/viewcvs?rev=239404&view=rev
Log:
JCR-196: jcr:like() does not scale well on large value ranges

Modified:
    incubator/jackrabbit/trunk/core/src/java/org/apache/jackrabbit/core/query/lucene/LuceneQueryBuilder.java
    incubator/jackrabbit/trunk/core/src/java/org/apache/jackrabbit/core/query/lucene/WildcardQuery.java
    incubator/jackrabbit/trunk/core/src/java/org/apache/jackrabbit/core/query/lucene/WildcardTermEnum.java

Modified: incubator/jackrabbit/trunk/core/src/java/org/apache/jackrabbit/core/query/lucene/LuceneQueryBuilder.java
URL: http://svn.apache.org/viewcvs/incubator/jackrabbit/trunk/core/src/java/org/apache/jackrabbit/core/query/lucene/LuceneQueryBuilder.java?rev=239404&r1=239403&r2=239404&view=diff
==============================================================================
--- incubator/jackrabbit/trunk/core/src/java/org/apache/jackrabbit/core/query/lucene/LuceneQueryBuilder.java (original)
+++ incubator/jackrabbit/trunk/core/src/java/org/apache/jackrabbit/core/query/lucene/LuceneQueryBuilder.java Tue Aug 23 06:31:51 2005
@@ -614,8 +614,7 @@
                 if (stringValues[0].equals("%")) {
                     query = new MatchAllQuery(field);
                 } else {
-                    Term t = new Term(FieldNames.PROPERTIES, FieldNames.createNamedValue(field, stringValues[0]));
-                    query = new WildcardQuery(t);
+                    query = new WildcardQuery(FieldNames.PROPERTIES, field, stringValues[0]);
                 }
                 break;
             case QueryConstants.OPERATION_LT_VALUE:      // <

Modified: incubator/jackrabbit/trunk/core/src/java/org/apache/jackrabbit/core/query/lucene/WildcardQuery.java
URL: http://svn.apache.org/viewcvs/incubator/jackrabbit/trunk/core/src/java/org/apache/jackrabbit/core/query/lucene/WildcardQuery.java?rev=239404&r1=239403&r2=239404&view=diff
==============================================================================
--- incubator/jackrabbit/trunk/core/src/java/org/apache/jackrabbit/core/query/lucene/WildcardQuery.java (original)
+++ incubator/jackrabbit/trunk/core/src/java/org/apache/jackrabbit/core/query/lucene/WildcardQuery.java Tue Aug 23 06:31:51 2005
@@ -18,20 +18,326 @@
 
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.Term;
+import org.apache.lucene.index.TermEnum;
+import org.apache.lucene.index.TermDocs;
 import org.apache.lucene.search.FilteredTermEnum;
 import org.apache.lucene.search.MultiTermQuery;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.BooleanQuery;
+import org.apache.lucene.search.Weight;
+import org.apache.lucene.search.Searcher;
+import org.apache.lucene.search.Scorer;
+import org.apache.lucene.search.Explanation;
+import org.apache.lucene.search.Similarity;
+import org.apache.log4j.Logger;
+import org.apache.commons.collections.map.LRUMap;
 
 import java.io.IOException;
+import java.util.BitSet;
+import java.util.WeakHashMap;
+import java.util.Map;
 
 /**
+ * Implements a wildcard query on a lucene field with an embedded property name
+ * and a pattern.
+ * <p/>
+ * Wildcards are:
+ * <ul>
+ * <li><code>%</code> : matches zero or more characters</li>
+ * <li><code>_</code> : matches exactly one character</li>
+ * </ul>
  */
-class WildcardQuery extends MultiTermQuery {
+class WildcardQuery extends Query {
 
-    public WildcardQuery(Term term) {
-        super(term);
+    /**
+     * Logger instance for this class.
+     */
+    private static final Logger log = Logger.getLogger(WildcardQuery.class);
+
+    /**
+     * Name of the field to search.
+     */
+    private final String field;
+
+    /**
+     * Name of the property to search.
+     */
+    private final String propName;
+
+    /**
+     * The wildcard pattern.
+     */
+    private final String pattern;
+
+    /**
+     * Simple result cache for previously calculated hits.
+     * key=IndexReader value=Map{key=String:pattern,value=BitSet:hits}
+     */
+    private static final Map cache = new WeakHashMap();
+
+    /**
+     * Creates a new <code>WildcardQuery</code>.
+     *
+     * @param field the name of the field to search.
+     * @param propName name of the property to search.
+     * @param pattern the wildcard pattern.
+     */
+    public WildcardQuery(String field, String propName, String pattern) {
+        this.field = field;
+        this.propName = propName;
+        this.pattern = pattern;
+    }
+
+    /**
+     * Either rewrites this query to a lucene MultiTermQuery or in case of
+     * a TooManyClauses exception to a custom jackrabbit query implementation
+     * that uses a BitSet to collect all hits.
+     *
+     * @param reader the index reader to use for the search.
+     * @return the rewritten query.
+     * @throws IOException if an error occurs while reading from the index.
+     */
+    public Query rewrite(IndexReader reader) throws IOException {
+        Query stdWildcardQuery = new MultiTermQuery(new Term(FieldNames.PROPERTIES, pattern)) {
+            protected FilteredTermEnum getEnum(IndexReader reader) throws IOException {
+                return new WildcardTermEnum(reader, field, propName, pattern);
+            }
+        };
+        try {
+            return stdWildcardQuery.rewrite(reader);
+        } catch (BooleanQuery.TooManyClauses e) {
+            // MultiTermQuery not possible
+            log.debug("Too many terms to enumerate, using custom WildcardQuery.");
+            return this;
+        }
+    }
+
+    /**
+     * Creates the <code>Weight</code> for this query.
+     *
+     * @param searcher the searcher to use for the <code>Weight</code>.
+     * @return the <code>Weight</code> for this query.
+     */
+    protected Weight createWeight(Searcher searcher) {
+        return new WildcardQueryWeight(searcher);
     }
 
-    protected FilteredTermEnum getEnum(IndexReader reader) throws IOException {
-        return new WildcardTermEnum(reader, getTerm());
+    /**
+     * Returns a string representation of this query.
+     *
+     * @param field the field name for which to create a string representation.
+     * @return a string representation of this query.
+     */
+    public String toString(String field) {
+        return propName + ":" + pattern;
+    }
+
+    /**
+     * The <code>Weight</code> implementation for this <code>WildcardQuery</code>.
+     */
+    private class WildcardQueryWeight implements Weight {
+
+        /**
+         * The searcher in use
+         */
+        private final Searcher searcher;
+
+        /**
+         * Creates a new <code>WildcardQueryWeight</code> instance using
+         * <code>searcher</code>.
+         *
+         * @param searcher a <code>Searcher</code> instance.
+         */
+        public WildcardQueryWeight(Searcher searcher) {
+            this.searcher = searcher;
+        }
+
+        /**
+         * Returns this <code>WildcardQuery</code>.
+         *
+         * @return this <code>WildcardQuery</code>.
+         */
+        public Query getQuery() {
+            return WildcardQuery.this;
+        }
+
+        /**
+         * {@inheritDoc}
+         */
+        public float getValue() {
+            return 1.0f;
+        }
+
+        /**
+         * {@inheritDoc}
+         */
+        public float sumOfSquaredWeights() throws IOException {
+            return 1.0f;
+        }
+
+        /**
+         * {@inheritDoc}
+         */
+        public void normalize(float norm) {
+        }
+
+        /**
+         * Creates a scorer for this <code>WildcardQuery</code>.
+         *
+         * @param reader a reader for accessing the index.
+         * @return a <code>WildcardQueryScorer</code>.
+         * @throws IOException if an error occurs while reading from the index.
+         */
+        public Scorer scorer(IndexReader reader) throws IOException {
+            return new WildcardQueryScorer(searcher.getSimilarity(), reader);
+        }
+
+        /**
+         * {@inheritDoc}
+         */
+        public Explanation explain(IndexReader reader, int doc) throws IOException {
+            return new Explanation();
+        }
+    }
+
+    /**
+     * Implements a <code>Scorer</code> for this <code>WildcardQuery</code>.
+     */
+    private final class WildcardQueryScorer extends Scorer {
+
+        /**
+         * The index reader to use for calculating the matching documents.
+         */
+        private final IndexReader reader;
+
+        /**
+         * The document ids that match this wildcard query.
+         */
+        private final BitSet hits;
+
+        /**
+         * Set to <code>true</code> when the hits have been calculated.
+         */
+        private boolean hitsCalculated = false;
+
+        /**
+         * The next document id to return
+         */
+        private int nextDoc = -1;
+
+        /**
+         * The cache key to use to store the results.
+         */
+        private final String cacheKey;
+
+        /**
+         * The map to store the results.
+         */
+        private final Map resultMap;
+
+        /**
+         * Creates a new WildcardQueryScorer.
+         *
+         * @param similarity the similarity implementation.
+         * @param reader     the index reader to use.
+         */
+        WildcardQueryScorer(Similarity similarity, IndexReader reader) {
+            super(similarity);
+            this.reader = reader;
+            this.cacheKey = field + '\uFFFF' + propName + '\uFFFF' + pattern;
+            // check cache
+            synchronized (cache) {
+                Map m = (Map) cache.get(reader);
+                if (m == null) {
+                    m = new LRUMap(10);
+                    cache.put(reader, m);
+                }
+                resultMap = m;
+            }
+            synchronized (resultMap) {
+                BitSet result = (BitSet) resultMap.get(cacheKey);
+                if (result == null) {
+                    result = new BitSet(reader.maxDoc());
+                } else {
+                    hitsCalculated = true;
+                }
+                hits = result;
+            }
+        }
+
+        /**
+         * {@inheritDoc}
+         */
+        public boolean next() throws IOException {
+            calculateHits();
+            nextDoc = hits.nextSetBit(nextDoc + 1);
+            return nextDoc > -1;
+        }
+
+        /**
+         * {@inheritDoc}
+         */
+        public int doc() {
+            return nextDoc;
+        }
+
+        /**
+         * {@inheritDoc}
+         */
+        public float score() {
+            return 1.0f;
+        }
+
+        /**
+         * {@inheritDoc}
+         */
+        public boolean skipTo(int target) {
+            nextDoc = hits.nextSetBit(target);
+            return nextDoc > -1;
+        }
+
+        /**
+         * Returns an empty Explanation object.
+         * @return an empty Explanation object.
+         */
+        public Explanation explain(int doc) {
+            return new Explanation();
+        }
+
+        /**
+         * Calculates the ids of the documents matching this wildcard query.
+         * @throws IOException if an error occurs while reading from the index.
+         */
+        private void calculateHits() throws IOException {
+            if (hitsCalculated) {
+                return;
+            }
+            TermEnum enum = new WildcardTermEnum(reader, field, propName, pattern);
+            try {
+                // use unpositioned TermDocs
+                TermDocs docs = reader.termDocs();
+                try {
+                    while (enum.term() != null) {
+                        docs.seek(enum);
+                        while (docs.next()) {
+                            hits.set(docs.doc());
+                        }
+                        if (!enum.next()) {
+                            break;
+                        }
+                    }
+                } finally {
+                    docs.close();
+                }
+            } finally {
+                enum.close();
+            }
+            hitsCalculated = true;
+            // put to cache
+            synchronized (resultMap) {
+                resultMap.put(cacheKey, hits);
+            }
+        }
+
     }
 }

Modified: incubator/jackrabbit/trunk/core/src/java/org/apache/jackrabbit/core/query/lucene/WildcardTermEnum.java
URL: http://svn.apache.org/viewcvs/incubator/jackrabbit/trunk/core/src/java/org/apache/jackrabbit/core/query/lucene/WildcardTermEnum.java?rev=239404&r1=239403&r2=239404&view=diff
==============================================================================
--- incubator/jackrabbit/trunk/core/src/java/org/apache/jackrabbit/core/query/lucene/WildcardTermEnum.java (original)
+++ incubator/jackrabbit/trunk/core/src/java/org/apache/jackrabbit/core/query/lucene/WildcardTermEnum.java Tue Aug 23 06:31:51 2005
@@ -22,52 +22,103 @@
 
 import java.io.IOException;
 import java.util.regex.Pattern;
+import java.util.regex.Matcher;
 
 /**
+ * Implements a wildcard term enum that supports embedded property names in
+ * lucene term texts.
  */
 class WildcardTermEnum extends FilteredTermEnum {
 
-    private final Pattern pattern;
-
+    /**
+     * The pattern matcher.
+     */
+    private final Matcher pattern;
+
+    /**
+     * The lucene field to search.
+     */
     private final String field;
 
+    /**
+     * The term prefix without wildcards
+     */
+    private final String prefix;
+
+    /**
+     * Flag that indicates the end of the term enum.
+     */
     private boolean endEnum = false;
 
-    public WildcardTermEnum(IndexReader reader, Term term) throws IOException {
-        pattern = createRegexp(term.text());
-        field = term.field();
+    /**
+     * The input for the pattern matcher.
+     */
+    private final OffsetCharSequence input;
+
+    /**
+     * Creates a new <code>WildcardTermEnum</code>.
+     *
+     * @param reader the index reader.
+     * @param field the lucene field to search.
+     * @param propName the embedded jcr property name.
+     * @param pattern the pattern to match the values.
+     * @throws IOException if an error occurs while reading from the index.
+     */
+    public WildcardTermEnum(IndexReader reader,
+                            String field,
+                            String propName,
+                            String pattern) throws IOException {
+        this.field = field;
 
         int idx = 0;
-        while (idx < term.text().length()
-                && Character.isLetterOrDigit(term.text().charAt(idx))) {
+        while (idx < pattern.length()
+                && Character.isLetterOrDigit(pattern.charAt(idx))) {
             idx++;
         }
-        // because IndexReader.terms() starts with the term after the given
-        // one start with idx - 1
-        if (idx > 0) {
-            idx--;
-        }
-        setEnum(reader.terms(new Term(term.field(), term.text().substring(0, idx))));
+
+        prefix = FieldNames.createNamedValue(propName, pattern.substring(0, idx));
+
+        // initialize with prefix as dummy value
+        input = new OffsetCharSequence(prefix.length(), prefix);
+        this.pattern = createRegexp(pattern.substring(idx)).matcher(input);
+
+        setEnum(reader.terms(new Term(field, prefix)));
     }
 
+    /**
+     * @inheritDoc
+     */
     protected boolean termCompare(Term term) {
-        if (term.field() == field) {
-            return pattern.matcher(term.text()).matches();
+        if (term.field() == field && term.text().startsWith(prefix)) {
+            input.setBase(term.text());
+            return pattern.reset().matches();
         }
         endEnum = true;
         return false;
     }
 
+    /**
+     * @inheritDoc
+     */
     protected float difference() {
         return 1.0f;
     }
 
+    /**
+     * @inheritDoc
+     */
     protected boolean endEnum() {
         return endEnum;
     }
 
     //--------------------------< internal >------------------------------------
 
+    /**
+     * Creates a regexp from <code>likePattern</code>.
+     *
+     * @param likePattern the pattern.
+     * @return the regular expression <code>Pattern</code>.
+     */
     private Pattern createRegexp(String likePattern) {
         // - escape all non alphabetic characters
         // - escape constructs like \<alphabetic char> into \\<alphabetic char>
@@ -110,5 +161,70 @@
             }
         }
         return Pattern.compile(regexp.toString());
+    }
+
+    /**
+     * CharSequence that applies an offset to a base CharSequence. The base
+     * CharSequence can be replaced without creating a new CharSequence.
+     */
+    private static final class OffsetCharSequence implements CharSequence {
+
+        /**
+         * The offset to apply to the base CharSequence
+         */
+        private final int offset;
+
+        /**
+         * The base character sequence
+         */
+        private CharSequence base;
+
+        /**
+         * Creates a new OffsetCharSequence with an <code>offset</code>.
+         *
+         * @param offset the offset
+         * @param base the base CharSequence
+         */
+        OffsetCharSequence(int offset, CharSequence base) {
+            this.offset = offset;
+            this.base = base;
+        }
+
+        /**
+         * Sets a new base sequence.
+         *
+         * @param base the base character sequence
+         */
+        public void setBase(CharSequence base) {
+            this.base = base;
+        }
+
+        /**
+         * @inheritDoc
+         */
+        public int length() {
+            return base.length() - offset;
+        }
+
+        /**
+         * @inheritDoc
+         */
+        public char charAt(int index) {
+            return base.charAt(index + offset);
+        }
+
+        /**
+         * @inheritDoc
+         */
+        public CharSequence subSequence(int start, int end) {
+            return base.subSequence(start + offset, end + offset);
+        }
+
+        /**
+         * @inheritDoc
+         */
+        public String toString() {
+            return base.subSequence(offset, base.length()).toString();
+        }
     }
 }



Mime
View raw message