lucene-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From er...@apache.org
Subject svn commit: r1206229 - in /lucene/dev/trunk: modules/queryparser/src/java/org/apache/lucene/queryparser/classic/ solr/ solr/core/src/java/org/apache/solr/schema/ solr/core/src/java/org/apache/solr/search/ solr/core/src/test-files/solr/conf/ solr/core/s...
Date Fri, 25 Nov 2011 15:46:27 GMT
Author: erick
Date: Fri Nov 25 15:46:26 2011
New Revision: 1206229

URL: http://svn.apache.org/viewvc?rev=1206229&view=rev
Log:
SOLR-2438, allow an analysis chain to be created for multiterm query terms or synthesize one
if not defined explicitly

Added:
    lucene/dev/trunk/solr/core/src/test-files/solr/conf/schema-folding.xml
    lucene/dev/trunk/solr/core/src/test/org/apache/solr/schema/MultiTermTest.java
    lucene/dev/trunk/solr/core/src/test/org/apache/solr/search/TestFoldingMultitermQuery.java
Modified:
    lucene/dev/trunk/modules/queryparser/src/java/org/apache/lucene/queryparser/classic/QueryParserBase.java
    lucene/dev/trunk/solr/CHANGES.txt
    lucene/dev/trunk/solr/core/src/java/org/apache/solr/schema/FieldProperties.java
    lucene/dev/trunk/solr/core/src/java/org/apache/solr/schema/FieldType.java
    lucene/dev/trunk/solr/core/src/java/org/apache/solr/schema/FieldTypePluginLoader.java
    lucene/dev/trunk/solr/core/src/java/org/apache/solr/schema/SchemaField.java
    lucene/dev/trunk/solr/core/src/java/org/apache/solr/schema/TextField.java
    lucene/dev/trunk/solr/core/src/java/org/apache/solr/search/SolrQueryParser.java
    lucene/dev/trunk/solr/example/solr/conf/schema.xml

Modified: lucene/dev/trunk/modules/queryparser/src/java/org/apache/lucene/queryparser/classic/QueryParserBase.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/queryparser/src/java/org/apache/lucene/queryparser/classic/QueryParserBase.java?rev=1206229&r1=1206228&r2=1206229&view=diff
==============================================================================
--- lucene/dev/trunk/modules/queryparser/src/java/org/apache/lucene/queryparser/classic/QueryParserBase.java
(original)
+++ lucene/dev/trunk/modules/queryparser/src/java/org/apache/lucene/queryparser/classic/QueryParserBase.java
Fri Nov 25 15:46:26 2011
@@ -290,7 +290,6 @@ public abstract class QueryParserBase {
     this.lowercaseExpandedTerms = lowercaseExpandedTerms;
   }
 
-
   /**
    * @see #setLowercaseExpandedTerms(boolean)
    */
@@ -778,14 +777,21 @@ public abstract class QueryParserBase {
     return new FuzzyQuery(term,minimumSimilarity,prefixLength);
   }
 
-  private BytesRef analyzeRangePart(String field, String part) {
+  // TODO: Should this be protected instead?
+  private BytesRef analyzeMultitermTerm(String field, String part) {
+    return analyzeMultitermTerm(field, part, analyzer);
+  }
+
+  protected BytesRef analyzeMultitermTerm(String field, String part, Analyzer analyzerIn)
{
     TokenStream source;
-      
+
+    if (analyzerIn == null) analyzerIn = analyzer;
+
     try {
-      source = analyzer.tokenStream(field, new StringReader(part));
+      source = analyzerIn.tokenStream(field, new StringReader(part));
       source.reset();
     } catch (IOException e) {
-      throw new RuntimeException("Unable to initialize TokenStream to analyze range part:
" + part, e);
+      throw new RuntimeException("Unable to initialize TokenStream to analyze multiTerm term:
" + part, e);
     }
       
     TermToBytesRefAttribute termAtt = source.getAttribute(TermToBytesRefAttribute.class);
@@ -793,10 +799,10 @@ public abstract class QueryParserBase {
 
     try {
       if (!source.incrementToken())
-        throw new IllegalArgumentException("analyzer returned no terms for range part: "
+ part);
+        throw new IllegalArgumentException("analyzer returned no terms for multiTerm term:
" + part);
       termAtt.fillBytesRef();
       if (source.incrementToken())
-        throw new IllegalArgumentException("analyzer returned too many terms for range part:
" + part);
+        throw new IllegalArgumentException("analyzer returned too many terms for multiTerm
term: " + part);
     } catch (IOException e) {
       throw new RuntimeException("error analyzing range part: " + part, e);
     }
@@ -805,7 +811,7 @@ public abstract class QueryParserBase {
       source.end();
       source.close();
     } catch (IOException e) {
-      throw new RuntimeException("Unable to end & close TokenStream after analyzing range
part: " + part, e);
+      throw new RuntimeException("Unable to end & close TokenStream after analyzing multiTerm
term: " + part, e);
     }
     
     return BytesRef.deepCopyOf(bytes);
@@ -827,13 +833,13 @@ public abstract class QueryParserBase {
     if (part1 == null) {
       start = null;
     } else {
-      start = analyzeRangeTerms ? analyzeRangePart(field, part1) : new BytesRef(part1);
+      start = analyzeRangeTerms ? analyzeMultitermTerm(field, part1) : new BytesRef(part1);
     }
      
     if (part2 == null) {
       end = null;
     } else {
-      end = analyzeRangeTerms ? analyzeRangePart(field, part2) : new BytesRef(part2);
+      end = analyzeRangeTerms ? analyzeMultitermTerm(field, part2) : new BytesRef(part2);
     }
       
     final TermRangeQuery query = new TermRangeQuery(field, start, end, startInclusive, endInclusive);

Modified: lucene/dev/trunk/solr/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/CHANGES.txt?rev=1206229&r1=1206228&r2=1206229&view=diff
==============================================================================
--- lucene/dev/trunk/solr/CHANGES.txt (original)
+++ lucene/dev/trunk/solr/CHANGES.txt Fri Nov 25 15:46:26 2011
@@ -188,6 +188,11 @@ New Features
  
 * SOLR-2134 Trie* fields should support sortMissingLast=true, and deprecate Sortable* Field
Types
   (Ryan McKinley, Mike McCandless, Uwe Schindler, Erick Erickson)
+    
+* SOLR-2438: Case insensitive search for wildcard queries. Actually, the ability to specify
+  a complete analysis chain for multiterm queries.   
+  (Pete Sturge, Erick Erickson, Mentoring from Seeley and Muir)
+
 
 Optimizations
 ----------------------
@@ -383,6 +388,11 @@ New Features
 * SOLR-1565: StreamingUpdateSolrServer supports RequestWriter API and therefore, javabin
update
   format (shalin)
 
+* SOLR-2438: Case insensitive search for wildcard queries. Actually, the ability to specify
+  a complete analysis chain for multiterm queries.   
+  (Pete Sturge, Erick Erickson, Mentoring from Seeley and Muir)
+
+
 Bug Fixes
 ----------------------
 * SOLR-2912: Fixed File descriptor leak in ShowFileRequestHandler (Michael Ryan, shalin)

Modified: lucene/dev/trunk/solr/core/src/java/org/apache/solr/schema/FieldProperties.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/core/src/java/org/apache/solr/schema/FieldProperties.java?rev=1206229&r1=1206228&r2=1206229&view=diff
==============================================================================
--- lucene/dev/trunk/solr/core/src/java/org/apache/solr/schema/FieldProperties.java (original)
+++ lucene/dev/trunk/solr/core/src/java/org/apache/solr/schema/FieldProperties.java Fri Nov
25 15:46:26 2011
@@ -48,13 +48,15 @@ public abstract class FieldProperties {
   
   protected final static int REQUIRED            = 0x00001000;
   protected final static int OMIT_POSITIONS      = 0x00002000;
+  protected final static int LEGACY_MULTITERM    = 0x00004000;
   
   static final String[] propertyNames = {
           "indexed", "tokenized", "stored",
           "binary", "omitNorms", "omitTermFreqAndPositions",
           "termVectors", "termPositions", "termOffsets",
           "multiValued",
-          "sortMissingFirst","sortMissingLast","required", "omitPositions"
+          "sortMissingFirst","sortMissingLast","required", "omitPositions" ,
+          "legacyMultiTerm"
   };
 
   static final Map<String,Integer> propertyMap = new HashMap<String,Integer>();

Modified: lucene/dev/trunk/solr/core/src/java/org/apache/solr/schema/FieldType.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/core/src/java/org/apache/solr/schema/FieldType.java?rev=1206229&r1=1206228&r2=1206229&view=diff
==============================================================================
--- lucene/dev/trunk/solr/core/src/java/org/apache/solr/schema/FieldType.java (original)
+++ lucene/dev/trunk/solr/core/src/java/org/apache/solr/schema/FieldType.java Fri Nov 25 15:46:26
2011
@@ -429,6 +429,21 @@ public abstract class FieldType extends 
   protected Analyzer queryAnalyzer=analyzer;
 
   /**
+   * Analyzer set by schema for text types to use when searching fields
+   * of this type, subclasses can set analyzer themselves or override
+   * getAnalyzer()
+   * This analyzer is used to process wildcard, prefix, regex and other multiterm queries.
It
+   * assembles a list of tokenizer +filters that "make sense" for this, primarily accent
folding and
+   * lowercasing filters, and charfilters.
+   *
+   * If users require old-style behavior, they can specify 'legacyMultiTerm="true"' in the
schema file
+   * @see #getMultiTermAnalyzer
+   * @see #setMultiTermAnalyzer
+   */
+  protected Analyzer multiTermAnalyzer=null;
+
+
+  /**
    * Returns the Analyzer to be used when indexing fields of this type.
    * <p>
    * This method may be called many times, at any time.
@@ -450,6 +465,17 @@ public abstract class FieldType extends 
     return queryAnalyzer;
   }
 
+  /**
+   * Returns the Analyzer to be used when searching fields of this type when multi-term queries
are specified.
+   * <p>
+   * This method may be called many times, at any time.
+   * </p>
+   * @see #getAnalyzer
+   */
+  public Analyzer getMultiTermAnalyzer() {
+    return multiTermAnalyzer;
+  }
+
   private final String analyzerError = 
     "FieldType: " + this.getClass().getSimpleName() + 
     " (" + typeName + ") does not support specifying an analyzer";
@@ -498,6 +524,28 @@ public abstract class FieldType extends 
     throw e;
   }
 
+  /**
+   * Sets the Analyzer to be used for multi-term queries (wildcard, prefix, regex, etc.) on fields of this type.
+   *
+   * <p>
+   *
+   * Subclasses that override this method need to ensure the behavior
+   * of the analyzer is consistent with the implementation of toInternal.
+   * </p>
+   *
+   * @see #toInternal
+   * @see #setAnalyzer
+   * @see #getQueryAnalyzer
+   */
+  public void setMultiTermAnalyzer(Analyzer analyzer) {
+    SolrException e = new SolrException
+      (ErrorCode.SERVER_ERROR,
+       "FieldType: " + this.getClass().getSimpleName() +
+       " (" + typeName + ") does not support specifying an analyzer");
+    SolrException.logOnce(log,null,e);
+    throw e;
+  }
+
   /** @lucene.internal */
   protected Similarity similarity;
   

Modified: lucene/dev/trunk/solr/core/src/java/org/apache/solr/schema/FieldTypePluginLoader.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/core/src/java/org/apache/solr/schema/FieldTypePluginLoader.java?rev=1206229&r1=1206228&r2=1206229&view=diff
==============================================================================
--- lucene/dev/trunk/solr/core/src/java/org/apache/solr/schema/FieldTypePluginLoader.java
(original)
+++ lucene/dev/trunk/solr/core/src/java/org/apache/solr/schema/FieldTypePluginLoader.java
Fri Nov 25 15:46:26 2011
@@ -18,19 +18,15 @@
 package org.apache.solr.schema;
 
 import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.core.KeywordAnalyzer;
 import org.apache.lucene.search.similarities.Similarity;
 import org.apache.lucene.util.Version;
+import org.apache.solr.analysis.*;
 import org.apache.solr.common.ResourceLoader;
 import org.apache.solr.common.SolrException;
-import org.apache.solr.common.params.SolrParams;
 import org.apache.solr.common.util.DOMUtil;
-import org.apache.solr.common.util.NamedList;
 import org.apache.solr.core.Config;
 import org.apache.solr.core.SolrResourceLoader;
-import org.apache.solr.analysis.CharFilterFactory;
-import org.apache.solr.analysis.TokenFilterFactory;
-import org.apache.solr.analysis.TokenizerChain;
-import org.apache.solr.analysis.TokenizerFactory;
 import org.apache.solr.util.plugin.AbstractPluginLoader;
 import org.w3c.dom.*;
 
@@ -88,12 +84,16 @@ public final class FieldTypePluginLoader
     String expression = "./analyzer[@type='query']";
     Node anode = (Node)xpath.evaluate(expression, node, XPathConstants.NODE);
     Analyzer queryAnalyzer = readAnalyzer(anode);
-    
+
+    expression = "./analyzer[@type='multiterm']";
+    anode = (Node)xpath.evaluate(expression, node, XPathConstants.NODE);
+    Analyzer multiAnalyzer = readAnalyzer(anode);
+
     // An analyzer without a type specified, or with type="index"
     expression = "./analyzer[not(@type)] | ./analyzer[@type='index']";
     anode = (Node)xpath.evaluate(expression, node, XPathConstants.NODE);
     Analyzer analyzer = readAnalyzer(anode);
-    
+
     // a custom similarity[Factory]
     expression = "./similarity";
     anode = (Node)xpath.evaluate(expression, node, XPathConstants.NODE);
@@ -101,9 +101,16 @@ public final class FieldTypePluginLoader
     
     if (queryAnalyzer==null) queryAnalyzer=analyzer;
     if (analyzer==null) analyzer=queryAnalyzer;
+    if (multiAnalyzer == null) {
+      Boolean legacyMatch = ! schema.getDefaultLuceneMatchVersion().onOrAfter(Version.LUCENE_36);
+      legacyMatch = (DOMUtil.getAttr(node, "legacyMultiTerm", null) == null) ? legacyMatch
:
+          Boolean.parseBoolean(DOMUtil.getAttr(node, "legacyMultiTerm", null));
+      multiAnalyzer = constructMultiTermAnalyzer(queryAnalyzer, legacyMatch);
+    }
     if (analyzer!=null) {
       ft.setAnalyzer(analyzer);
       ft.setQueryAnalyzer(queryAnalyzer);
+      ft.setMultiTermAnalyzer(multiAnalyzer);
     }
     if (similarity!=null) {
       ft.setSimilarity(similarity);
@@ -130,6 +137,42 @@ public final class FieldTypePluginLoader
     return fieldTypes.put( name, plugin );
   }
 
+  // The point here is that, if no multitermanalyzer was specified in the schema file, do
one of several things:
+  // 1> If legacyMultiTerm == false, assemble a new analyzer composed of all of the charfilters,
+  //    lowercase filters and asciifoldingfilter.
+  // 2> If legacyMultiTerm == true just construct the analyzer from a KeywordTokenizer.
That should mimic current behavior.
+  //    Do the same if they've specified that the old behavior is required (legacyMultiTerm="true")
+
+  private Analyzer constructMultiTermAnalyzer(Analyzer queryAnalyzer, Boolean legacyMultiTerm)
{
+    if (queryAnalyzer == null) return null;
+
+    if (legacyMultiTerm || (!(queryAnalyzer instanceof TokenizerChain))) {
+      return new KeywordAnalyzer();
+    }
+
+    TokenizerChain tc = (TokenizerChain) queryAnalyzer;
+
+    // we know it'll never be longer than this unless the code below is explicitly changed
+    TokenFilterFactory[] filters = new TokenFilterFactory[2];
+    int idx = 0;
+    for (TokenFilterFactory factory : tc.getTokenFilterFactories()) {
+      if (factory instanceof LowerCaseFilterFactory) {
+        filters[idx] = new LowerCaseFilterFactory();
+        filters[idx++].init(factory.getArgs());
+      }
+      if (factory instanceof ASCIIFoldingFilterFactory) {
+        filters[idx] = new ASCIIFoldingFilterFactory();
+        filters[idx++].init(factory.getArgs());
+      }
+    }
+    WhitespaceTokenizerFactory white = new WhitespaceTokenizerFactory();
+    white.init(tc.getTokenizerFactory().getArgs());
+
+    return new TokenizerChain(tc.getCharFilterFactories(),
+        white,
+        Arrays.copyOfRange(filters, 0, idx));
+  }
+
   //
   // <analyzer><tokenizer class="...."/><tokenizer class="...." arg="....">
   //

Modified: lucene/dev/trunk/solr/core/src/java/org/apache/solr/schema/SchemaField.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/core/src/java/org/apache/solr/schema/SchemaField.java?rev=1206229&r1=1206228&r2=1206229&view=diff
==============================================================================
--- lucene/dev/trunk/solr/core/src/java/org/apache/solr/schema/SchemaField.java (original)
+++ lucene/dev/trunk/solr/core/src/java/org/apache/solr/schema/SchemaField.java Fri Nov 25
15:46:26 2011
@@ -97,6 +97,9 @@ public final class SchemaField extends F
   boolean isTokenized() { return (properties & TOKENIZED)!=0; }
   boolean isBinary() { return (properties & BINARY)!=0; }
 
+  boolean legacyMultiTerm() {
+    return (properties & LEGACY_MULTITERM) != 0;
+  }
 
   public IndexableField createField(Object val, float boost) {
     return type.createField(this,val,boost);

Modified: lucene/dev/trunk/solr/core/src/java/org/apache/solr/schema/TextField.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/core/src/java/org/apache/solr/schema/TextField.java?rev=1206229&r1=1206228&r2=1206229&view=diff
==============================================================================
--- lucene/dev/trunk/solr/core/src/java/org/apache/solr/schema/TextField.java (original)
+++ lucene/dev/trunk/solr/core/src/java/org/apache/solr/schema/TextField.java Fri Nov 25 15:46:26
2011
@@ -98,6 +98,11 @@ public class TextField extends FieldType
     this.queryAnalyzer = analyzer;
   }
 
+  @Override
+  public void setMultiTermAnalyzer(Analyzer analyzer) {
+    this.multiTermAnalyzer = analyzer;
+  }
+
   static Query parseFieldQuery(QParser parser, Analyzer analyzer, String field, String queryText)
{
     int phraseSlop = 0;
     boolean enablePositionIncrements = true;

Modified: lucene/dev/trunk/solr/core/src/java/org/apache/solr/search/SolrQueryParser.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/core/src/java/org/apache/solr/search/SolrQueryParser.java?rev=1206229&r1=1206228&r2=1206229&view=diff
==============================================================================
--- lucene/dev/trunk/solr/core/src/java/org/apache/solr/search/SolrQueryParser.java (original)
+++ lucene/dev/trunk/solr/core/src/java/org/apache/solr/search/SolrQueryParser.java Fri Nov
25 15:46:26 2011
@@ -26,7 +26,6 @@ import org.apache.lucene.queryparser.cla
 import org.apache.lucene.queryparser.classic.QueryParser;
 import org.apache.lucene.search.*;
 import org.apache.lucene.util.ToStringUtils;
-import org.apache.lucene.util.Version;
 import org.apache.lucene.util.automaton.Automaton;
 import org.apache.lucene.util.automaton.BasicAutomata;
 import org.apache.lucene.util.automaton.BasicOperations;
@@ -71,7 +70,6 @@ public class SolrQueryParser extends Que
     this.schema = parser.getReq().getSchema();
     this.parser = parser;
     this.defaultField = defaultField;
-    setLowercaseExpandedTerms(false);
     setEnablePositionIncrements(true);
     checkAllowLeadingWildcards();
   }
@@ -106,6 +104,14 @@ public class SolrQueryParser extends Que
     }
   }
 
+  protected String analyzeIfMultitermTermText(String field, String part, Analyzer analyzer)
{
+    if (part == null) return part;
+
+    SchemaField sf = schema.getFieldOrNull((field));
+    if (sf == null || ! (sf.getType() instanceof TextField)) return part;
+    return analyzeMultitermTerm(field, part, analyzer).utf8ToString();
+  }
+
   @Override
   protected Query getFieldQuery(String field, String queryText, boolean quoted) throws ParseException
{
     checkNullField(field);
@@ -137,6 +143,8 @@ public class SolrQueryParser extends Que
   @Override
   protected Query getRangeQuery(String field, String part1, String part2, boolean startInclusive,
boolean endInclusive) throws ParseException {
     checkNullField(field);
+    part1 = analyzeIfMultitermTermText(field, part1, schema.getFieldType(field).getMultiTermAnalyzer());
+    part2 = analyzeIfMultitermTermText(field, part2, schema.getFieldType(field).getMultiTermAnalyzer());
     SchemaField sf = schema.getField(field);
     return sf.getType().getRangeQuery(parser, sf, part1, part2, startInclusive, endInclusive);
   }
@@ -144,9 +152,8 @@ public class SolrQueryParser extends Que
   @Override
   protected Query getPrefixQuery(String field, String termStr) throws ParseException {
     checkNullField(field);
-    if (getLowercaseExpandedTerms()) {
-      termStr = termStr.toLowerCase();
-    }
+
+    termStr = analyzeIfMultitermTermText(field, termStr, schema.getFieldType(field).getMultiTermAnalyzer());
 
     // TODO: toInternal() won't necessarily work on partial
     // values, so it looks like we need a getPrefix() function
@@ -162,14 +169,13 @@ public class SolrQueryParser extends Que
     PrefixQuery prefixQuery = new PrefixQuery(t);
     return prefixQuery;
   }
-
   @Override
   protected Query getWildcardQuery(String field, String termStr) throws ParseException {
     // *:* -> MatchAllDocsQuery
     if ("*".equals(field) && "*".equals(termStr)) {
       return newMatchAllDocsQuery();
     }
-    
+    termStr = analyzeIfMultitermTermText(field, termStr, schema.getFieldType(field).getMultiTermAnalyzer());
     // can we use reversed wildcards in this field?
     String type = schema.getFieldType(field).getTypeName();
     ReversedWildcardFilterFactory factory = leadingWildcards.get(type);
@@ -213,4 +219,11 @@ public class SolrQueryParser extends Que
     }
     return q;
   }
+
+
+  protected Query getRegexpQuery(String field, String termStr) throws ParseException
+  {
+    termStr = analyzeIfMultitermTermText(field, termStr, schema.getFieldType(field).getMultiTermAnalyzer());
+    return super.getRegexpQuery(field, termStr);
+  }
 }

Added: lucene/dev/trunk/solr/core/src/test-files/solr/conf/schema-folding.xml
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/core/src/test-files/solr/conf/schema-folding.xml?rev=1206229&view=auto
==============================================================================
--- lucene/dev/trunk/solr/core/src/test-files/solr/conf/schema-folding.xml (added)
+++ lucene/dev/trunk/solr/core/src/test-files/solr/conf/schema-folding.xml Fri Nov 25 15:46:26
2011
@@ -0,0 +1,145 @@
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+  -->
+
+<schema name="test" version="1.0">
+  <types>
+    <fieldtype name="string" class="solr.StrField" sortMissingLast="true" multiValued="false"/>
+
+    <fieldType name="text" class="solr.TextField" multiValued="false">
+      <analyzer>
+        <tokenizer class="solr.PatternTokenizerFactory" pattern="\s+"/>
+        <filter class="solr.ASCIIFoldingFilterFactory"/>
+        <filter class="solr.LowerCaseFilterFactory"/>
+      </analyzer>
+    </fieldType>
+
+    <fieldType name="text_multi" class="solr.TextField" multiValued="true">
+      <analyzer>
+        <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+        <filter class="solr.ASCIIFoldingFilterFactory"/>
+        <filter class="solr.TrimFilterFactory"/>
+      </analyzer>
+      <analyzer type="multiterm">        <!-- Intentionally different to test that
these are kept  distinct -->
+        <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+        <filter class="solr.LowerCaseFilterFactory"/>
+        <filter class="solr.ASCIIFoldingFilterFactory"/>
+      </analyzer>
+    </fieldType>
+
+    <fieldType name="text_multi_bad" class="solr.TextField" multiValued="false">
+      <analyzer>
+        <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+        <filter class="solr.ASCIIFoldingFilterFactory"/>
+        <filter class="solr.TrimFilterFactory"/>
+      </analyzer>
+      <analyzer type="multiterm">        <!-- Intentionally different to test that
these are kept  distinct -->
+        <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+        <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1"
catenateWords="0"
+                catenateNumbers="0" catenateAll="0" splitOnCaseChange="1"/>
+        <filter class="solr.LowerCaseFilterFactory"/>
+        <filter class="solr.ASCIIFoldingFilterFactory"/>
+      </analyzer>
+    </fieldType>
+
+
+    <fieldType name="text_ws" class="solr.TextField" multiValued="true">
+      <analyzer>
+        <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+        <filter class="solr.ASCIIFoldingFilterFactory"/>
+        <filter class="solr.LowerCaseFilterFactory"/>
+      </analyzer>
+    </fieldType>
+
+    <fieldType name="text_rev" class="solr.TextField" legacyMultiTerm="false">
+      <analyzer type="index">
+        <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+        <filter class="solr.LowerCaseFilterFactory"/>
+        <filter class="solr.ASCIIFoldingFilterFactory"/>
+        <filter class="solr.ReversedWildcardFilterFactory" withOriginal="false"
+                maxPosAsterisk="1" maxPosQuestion="2" maxFractionAsterisk="0.99"
+                minTrailing="1"/>
+      </analyzer>
+      <analyzer type="query">
+        <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+        <filter class="solr.LowerCaseFilterFactory"/>
+        <filter class="solr.ASCIIFoldingFilterFactory"/>
+      </analyzer>
+    </fieldType>
+
+    <fieldType name="text_lower_tokenizer" class="solr.TextField">
+      <analyzer>
+        <tokenizer class="solr.LowerCaseTokenizerFactory"/>
+      </analyzer>
+    </fieldType>
+
+    <fieldType name="text_charfilter" class="solr.TextField" multiValued="false">
+      <analyzer type="index">
+        <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+        <filter class="solr.ASCIIFoldingFilterFactory"/>
+        <filter class="solr.LowerCaseFilterFactory"/>
+      </analyzer>
+      <analyzer type="query">
+        <charFilter class="solr.MappingCharFilterFactory" mapping="mapping-ISOLatin1Accent.txt"/>
+        <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+        <filter class="solr.LowerCaseFilterFactory"/>
+      </analyzer>
+    </fieldType>
+
+    <fieldType name="text_oldstyle" class="solr.TextField" multiValued="false" legacyMultiTerm="true">
+      <analyzer>
+        <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+        <filter class="solr.ASCIIFoldingFilterFactory"/>
+        <filter class="solr.LowerCaseFilterFactory"/>
+        <filter class="solr.TrimFilterFactory"/>
+      </analyzer>
+    </fieldType>
+
+    <fieldType name="int" class="solr.TrieIntField" precisionStep="0" omitNorms="true"
positionIncrementGap="0"/>
+    <fieldType name="float" class="solr.TrieFloatField" precisionStep="0" omitNorms="true"
positionIncrementGap="0"/>
+    <fieldType name="long" class="solr.TrieLongField" precisionStep="0" omitNorms="true"
positionIncrementGap="0"/>
+    <fieldType name="double" class="solr.TrieDoubleField" precisionStep="0" omitNorms="true"
positionIncrementGap="0"/>
+    <fieldType name="byte" class="solr.ByteField" omitNorms="true" positionIncrementGap="0"/>
+    <fieldType name="short" class="solr.ShortField" omitNorms="true" positionIncrementGap="0"/>
+    <fieldtype name="boolean" class="solr.BoolField" sortMissingLast="true"/>
+    <fieldtype name="date" class="solr.TrieDateField" precisionStep="0"/>
+  </types>
+
+  <fields>
+    <field name="id" type="string" indexed="true" stored="true" required="true"/>
+    <field name="int_f" type="int"/>
+    <field name="float_f" type="float"/>
+    <field name="long_f" type="long"/>
+    <field name="double_f" type="double"/>
+    <field name="byte_f" type="byte"/>
+    <field name="short_f" type="short"/>
+    <field name="bool_f" type="boolean"/>
+    <field name="date_f" type="date"/>
+
+    <field name="content" type="text" indexed="true" stored="true"/>
+    <field name="content_ws" type="text_ws" indexed="true" stored="true"/>
+    <field name="content_rev" type="text_rev" indexed="true" stored="true"/>
+    <field name="content_multi" type="text_multi" indexed="true" stored="true"/>
+    <field name="content_lower_token" type="text_multi" indexed="true" stored="true"/>
+    <field name="content_oldstyle" type="text_oldstyle" indexed="true" stored="true"/>
+    <field name="content_charfilter" type="text_charfilter" indexed="true" stored="true"/>
+    <field name="content_multi_bad" type="text_multi_bad" indexed="true" stored="true"/>
+  </fields>
+
+  <defaultSearchField>content</defaultSearchField>
+  <uniqueKey>id</uniqueKey>
+
+</schema>

Added: lucene/dev/trunk/solr/core/src/test/org/apache/solr/schema/MultiTermTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/core/src/test/org/apache/solr/schema/MultiTermTest.java?rev=1206229&view=auto
==============================================================================
--- lucene/dev/trunk/solr/core/src/test/org/apache/solr/schema/MultiTermTest.java (added)
+++ lucene/dev/trunk/solr/core/src/test/org/apache/solr/schema/MultiTermTest.java Fri Nov
25 15:46:26 2011
@@ -0,0 +1,87 @@
+package org.apache.solr.schema;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.solr.SolrTestCaseJ4;
+import org.apache.solr.analysis.*;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+public class MultiTermTest extends SolrTestCaseJ4 {
+  public String getCoreName() {
+    return "basic";
+  }
+
+  @BeforeClass
+  public static void beforeTests() throws Exception {
+    initCore("solrconfig-basic.xml", "schema-folding.xml");
+  }
+
+  @Test
+  public void testMultiFound() {
+    SchemaField field = h.getCore().getSchema().getField("content_multi");
+    Analyzer analyzer = field.getType().getMultiTermAnalyzer();
+    assertTrue(analyzer instanceof TokenizerChain);
+    assertTrue(((TokenizerChain) analyzer).getTokenizerFactory() instanceof WhitespaceTokenizerFactory);
+    TokenizerChain tc = (TokenizerChain) analyzer;
+    for (TokenFilterFactory factory : tc.getTokenFilterFactories()) {
+      assertTrue((factory instanceof ASCIIFoldingFilterFactory) || (factory instanceof LowerCaseFilterFactory));
+    }
+
+    analyzer = field.getType().getAnalyzer();
+    assertTrue(analyzer instanceof TokenizerChain);
+    assertTrue(((TokenizerChain) analyzer).getTokenizerFactory() instanceof WhitespaceTokenizerFactory);
+    tc = (TokenizerChain) analyzer;
+    for (TokenFilterFactory factory : tc.getTokenFilterFactories()) {
+      assertTrue((factory instanceof ASCIIFoldingFilterFactory) || (factory instanceof TrimFilterFactory));
+    }
+
+    assertTrue(tc.getCharFilterFactories().length == 0);
+  }
+
+  @Test
+  public void testQueryCopiedToMulti() {
+    SchemaField field = h.getCore().getSchema().getField("content_charfilter");
+    Analyzer analyzer = field.getType().getMultiTermAnalyzer();
+    assertTrue(analyzer instanceof TokenizerChain);
+    assertTrue(((TokenizerChain) analyzer).getTokenizerFactory() instanceof WhitespaceTokenizerFactory);
+    TokenizerChain tc = (TokenizerChain) analyzer;
+    for (TokenFilterFactory factory : tc.getTokenFilterFactories()) {
+      assertTrue(factory instanceof LowerCaseFilterFactory);
+    }
+
+    assertTrue(tc.getCharFilterFactories().length == 1);
+    assertTrue(tc.getCharFilterFactories()[0] instanceof MappingCharFilterFactory);
+  }
+
+  @Test
+  public void testDefaultCopiedToMulti() {
+    SchemaField field = h.getCore().getSchema().getField("content_ws");
+    Analyzer analyzer = field.getType().getMultiTermAnalyzer();
+    assertTrue(analyzer instanceof TokenizerChain);
+    assertTrue(((TokenizerChain) analyzer).getTokenizerFactory() instanceof WhitespaceTokenizerFactory);
+    TokenizerChain tc = (TokenizerChain) analyzer;
+    for (TokenFilterFactory factory : tc.getTokenFilterFactories()) {
+      assertTrue((factory instanceof ASCIIFoldingFilterFactory) || (factory instanceof LowerCaseFilterFactory));
+    }
+
+    assertTrue(tc.getCharFilterFactories().length == 0);
+
+  }
+}

Added: lucene/dev/trunk/solr/core/src/test/org/apache/solr/search/TestFoldingMultitermQuery.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/core/src/test/org/apache/solr/search/TestFoldingMultitermQuery.java?rev=1206229&view=auto
==============================================================================
--- lucene/dev/trunk/solr/core/src/test/org/apache/solr/search/TestFoldingMultitermQuery.java
(added)
+++ lucene/dev/trunk/solr/core/src/test/org/apache/solr/search/TestFoldingMultitermQuery.java
Fri Nov 25 15:46:26 2011
@@ -0,0 +1,231 @@
+package org.apache.solr.search;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.index.IndexWriter;
+import org.apache.solr.SolrTestCaseJ4;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+public class TestFoldingMultitermQuery extends SolrTestCaseJ4 {
+
+  public String getCoreName() {
+    return "basic";
+  }
+
+  @BeforeClass
+  public static void beforeTests() throws Exception {
+    initCore("solrconfig-basic.xml", "schema-folding.xml");
+    IndexWriter iw;
+
+    String docs[] = {
+        "abcdefg1 finger",
+        "gangs hijklmn1",
+        "opqrstu1 zilly",
+    };
+
+    // prepare the index
+    for (int i = 0; i < docs.length; i++) {
+      String num = Integer.toString(i);
+      String boolVal = ((i % 2) == 0) ? "true" : "false";
+      assertU(adoc("id", num,
+          "int_f", num,
+          "float_f", num,
+          "long_f", num,
+          "double_f", num,
+          "byte_f", num,
+          "short_f", num,
+          "bool_f", boolVal,
+          "date_f", "200" + Integer.toString(i % 10) + "-01-01T00:00:00Z",
+          "content", docs[i],
+          "content_ws", docs[i],
+          "content_rev", docs[i],
+          "content_multi", docs[i],
+          "content_lower_token", docs[i],
+          "content_oldstyle", docs[i],
+          "content_charfilter", docs[i],
+          "content_multi_bad", docs[i]
+      ));
+    }
+    assertU(optimize());
+  }
+
+  @Test
+  public void testPrefixCaseAccentFolding() throws Exception {
+    String matchOneDocPrefixUpper[][] = {
+        {"A*", "ÁB*", "ABÇ*"},   // these should find only doc 0
+        {"H*", "HÏ*", "HìJ*"},   // these should find only doc 1
+        {"O*", "ÖP*", "OPQ*"},   // these should find only doc 2
+    };
+
+    String matchRevPrefixUpper[][] = {
+        {"*Ğ1", "*DEfG1", "*EfG1"},
+        {"*N1", "*LmŊ1", "*MÑ1"},
+        {"*Ǖ1", "*sTu1", "*RŠTU1"}
+    };
+
+    // test the prefix queries find only one doc where the query is uppercased. Must go through query parser here!
+    for (int idx = 0; idx < matchOneDocPrefixUpper.length; idx++) {
+      for (int jdx = 0; jdx < matchOneDocPrefixUpper[idx].length; jdx++) {
+        String me = matchOneDocPrefixUpper[idx][jdx];
+        assertQ(req("q", "content:" + me),
+            "//*[@numFound='1']",
+            "//*[@name='id'][.='" + Integer.toString(idx) + "']");
+        assertQ(req("q", "content_ws:" + me),
+            "//*[@numFound='1']",
+            "//*[@name='id'][.='" + Integer.toString(idx) + "']");
+        assertQ(req("q", "content_multi:" + me),
+            "//*[@numFound='1']",
+            "//*[@name='id'][.='" + Integer.toString(idx) + "']");
+        assertQ(req("q", "content_lower_token:" + me),
+            "//result[@numFound='1']",
+            "//*[@name='id'][.='" + Integer.toString(idx) + "']");
+      }
+    }
+    for (int idx = 0; idx < matchRevPrefixUpper.length; idx++) {
+      for (int jdx = 0; jdx < matchRevPrefixUpper[idx].length; jdx++) {
+        String me = matchRevPrefixUpper[idx][jdx];
+        assertQ(req("q", "content_rev:" + me),
+            "//*[@numFound='1']",
+            "//*[@name='id'][.='" + Integer.toString(idx) + "']");
+      }
+    }
+  }
+
+  // test the wildcard queries find only one doc  where the query is uppercased and/or accented.
+  @Test
+  public void testWildcardCaseAccentFolding() throws Exception {
+    String matchOneDocWildUpper[][] = {
+        {"Á*C*", "ÁB*1", "ABÇ*g1", "Á*FG1"},      // these should find only doc 0
+        {"H*k*", "HÏ*l?*", "HìJ*n*", "HìJ*m*"},   // these should find only doc 1
+        {"O*ř*", "ÖP*ş???", "OPQ*S?Ů*", "ÖP*1"},  // these should find only doc 2
+    };
+
+    for (int idx = 0; idx < matchOneDocWildUpper.length; idx++) {
+      for (int jdx = 0; jdx < matchOneDocWildUpper[idx].length; jdx++) {
+        String me = matchOneDocWildUpper[idx][jdx];
+        assertQ("Error with " + me, req("q", "content:" + me),
+            "//result[@numFound='1']",
+            "//*[@name='id'][.='" + Integer.toString(idx) + "']");
+        assertQ(req("q", "content_ws:" + me),
+            "//result[@numFound='1']",
+            "//*[@name='id'][.='" + Integer.toString(idx) + "']");
+        assertQ(req("q", "content_multi:" + me),
+            "//result[@numFound='1']",
+            "//*[@name='id'][.='" + Integer.toString(idx) + "']");
+        assertQ(req("q", "content_lower_token:" + me),
+            "//result[@numFound='1']",
+            "//*[@name='id'][.='" + Integer.toString(idx) + "']");
+      }
+    }
+  }
+
+  // Phrases should fail. This test is mainly a marker so if phrases ever do start working with wildcards we go
+  // and update the documentation
+  @Test
+  public void testPhrase() {
+    assertQ(req("q", "content:\"silly ABCD*\""),
+        "//result[@numFound='0']");
+  }
+
+  // Make sure the legacy behavior flag is honored
+  @Test
+  public void testLegacyBehavior() {
+    assertQ(req("q", "content_oldstyle:ABCD*"),
+        "//result[@numFound='0']");
+  }
+
+  @Test
+  public void testWildcardRange() {
+    assertQ(req("q", "content:[* TO *]"),
+        "//result[@numFound='3']");
+  }
+
+
+  // Does the char filter get correctly handled?
+  @Test
+  public void testCharFilter() {
+    assertQ(req("q", "content_charfilter:" + "Á*C*"),
+        "//result[@numFound='1']",
+        "//*[@name='id'][.='0']");
+    assertQ(req("q", "content_charfilter:" + "ABÇ*g1"),
+        "//result[@numFound='1']",
+        "//*[@name='id'][.='0']");
+    assertQ(req("q", "content_charfilter:" + "HÏ*l?*"),
+        "//result[@numFound='1']",
+        "//*[@name='id'][.='1']");
+  }
+
+  @Test
+  public void testRangeQuery() {
+    assertQ(req("q", "content:" + "{Ȫp*1 TO QŮ*}"),
+        "//result[@numFound='1']",
+        "//*[@name='id'][.='2']");
+
+    assertQ(req("q", "content:" + "[Áb* TO f?Ñg?r]"),
+        "//result[@numFound='1']",
+        "//*[@name='id'][.='0']");
+
+  }
+
+  @Test
+  public void testNonTextTypes() {
+    String[] intTypes = {"int_f", "float_f", "long_f", "double_f", "byte_f", "short_f"};
+
+    for (String str : intTypes) {
+      assertQ(req("q", str + ":" + "0"),
+          "//result[@numFound='1']",
+          "//*[@name='id'][.='0']");
+
+      assertQ(req("q", str + ":" + "[0 TO 2]"),
+          "//result[@numFound='3']",
+          "//*[@name='id'][.='0']",
+          "//*[@name='id'][.='1']",
+          "//*[@name='id'][.='2']");
+    }
+    assertQ(req("q", "bool_f:true"),
+        "//result[@numFound='2']",
+        "//*[@name='id'][.='0']",
+        "//*[@name='id'][.='2']");
+
+    assertQ(req("q", "bool_f:[false TO true]"),
+        "//result[@numFound='3']",
+        "//*[@name='id'][.='0']",
+        "//*[@name='id'][.='1']",
+        "//*[@name='id'][.='2']");
+
+    assertQ(req("q", "date_f:2000-01-01T00\\:00\\:00Z"),
+        "//result[@numFound='1']",
+        "//*[@name='id'][.='0']");
+
+    assertQ(req("q", "date_f:[2000-12-31T23:59:59.999Z TO 2002-01-02T00:00:01Z]"),
+        "//result[@numFound='2']",
+        "//*[@name='id'][.='1']",
+        "//*[@name='id'][.='2']");
+  }
+
+  @Test
+  public void testMultiBad() {
+    try {
+      assertQ(req("q", "content_multi_bad:" + "abCD*"));
+      fail("Should throw exception when token evaluates to more than one term");
+    } catch (Exception expected) {
+      assertTrue(expected.getCause() instanceof IllegalArgumentException);
+    }
+  }
+}
\ No newline at end of file

Modified: lucene/dev/trunk/solr/example/solr/conf/schema.xml
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/example/solr/conf/schema.xml?rev=1206229&r1=1206228&r2=1206229&view=diff
==============================================================================
--- lucene/dev/trunk/solr/example/solr/conf/schema.xml (original)
+++ lucene/dev/trunk/solr/example/solr/conf/schema.xml Fri Nov 25 15:46:26 2011
@@ -427,6 +427,42 @@
       </analyzer>
     </fieldType>
 
+    <!-- Illustrates the new "multiterm" analyzer definition. The <fieldType> can take a new
+         parameter legacyMultiTerm="true" if the old behavior is desired. The new default
+         behavior as of 3.6+ is to automatically define a multiterm analyzer
+    -->
+    <fieldType name="text_multiterm" class="solr.TextField" positionIncrementGap="100">
+      <analyzer type="index">
+        <tokenizer class="solr.StandardTokenizerFactory"/>
+        <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0" splitOnCaseChange="1"/>
+        <filter class="solr.LowerCaseFilterFactory"/>
+      </analyzer>
+      <analyzer type="query">
+        <tokenizer class="solr.StandardTokenizerFactory"/>
+        <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0" splitOnCaseChange="1"/>
+        <filter class="solr.LowerCaseFilterFactory"/>
+      </analyzer>
+      <!-- Illustrates the use of a new analyzer type "multiterm". See the Wiki page "Multiterm
+           Query Analysis" and SOLR-2438 for full details. The short form is that this analyzer is
+           applied to wildcard terms (prefix, wildcard, range) if specified. This allows, among other
+           things, not having to lowercase wildcard terms on the client.
+           
+           In the absence of this section, the new default behavior (3.6, 4.0) is to construct
+           one of these from the query analyzer that incorporates any defined charfilters, a
+           WhitespaceTokenizer, a LowerCaseFilter (if defined), and an ASCIIFoldingFilter
+           (if defined).
+           
+           Arguably, this is an expert-level analyzer; most cases will be handled by an instance
+           of this being automatically constructed from the query analyzer.
+           
+      -->
+      <analyzer type="multiterm"> 
+        <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+        <filter class="solr.LowerCaseFilterFactory"/>
+        <filter class="solr.ASCIIFoldingFilterFactory"/>
+      </analyzer>
+    </fieldType>
+
     <!-- since fields of this type are by default not stored or indexed,
          any data added to them will be ignored outright.  --> 
     <fieldtype name="ignored" stored="false" indexed="false" multiValued="true" class="solr.StrField" />



Mime
View raw message