lucene-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From er...@apache.org
Subject lucene-solr:master: LUCENE-7705: Allow CharTokenizer-derived tokenizers and KeywordTokenizer to configure the max token length
Date Sun, 28 May 2017 22:38:07 GMT
Repository: lucene-solr
Updated Branches:
  refs/heads/master bc973ecdc -> 906679adc


LUCENE-7705: Allow CharTokenizer-derived tokenizers and KeywordTokenizer to configure the max token length


Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/906679ad
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/906679ad
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/906679ad

Branch: refs/heads/master
Commit: 906679adc80f0fad1e5c311b03023c7bd95633d7
Parents: bc973ec
Author: Erick Erickson <erick@apache.org>
Authored: Sun May 28 15:18:48 2017 -0700
Committer: Erick Erickson <erick@apache.org>
Committed: Sun May 28 15:18:48 2017 -0700

----------------------------------------------------------------------
 .../lucene/analysis/core/KeywordTokenizer.java  |  10 +-
 .../analysis/core/KeywordTokenizerFactory.java  |  19 ++-
 .../lucene/analysis/core/LetterTokenizer.java   |  14 ++
 .../analysis/core/LetterTokenizerFactory.java   |  19 ++-
 .../analysis/core/LowerCaseTokenizer.java       |  13 ++
 .../core/LowerCaseTokenizerFactory.java         |  37 +++--
 .../core/UnicodeWhitespaceTokenizer.java        |  13 ++
 .../analysis/core/WhitespaceTokenizer.java      |  13 ++
 .../core/WhitespaceTokenizerFactory.java        |  18 ++-
 .../lucene/analysis/util/CharTokenizer.java     |  27 +++-
 .../analysis/core/TestKeywordTokenizer.java     |  88 +++++++++++
 .../core/TestUnicodeWhitespaceTokenizer.java    |  51 +++++++
 .../analysis/util/TestCharTokenizers.java       |  95 ++++++++++++
 solr/CHANGES.txt                                |   3 +
 .../collection1/conf/schema-tokenizer-test.xml  | 150 +++++++++++++++++++
 .../solr/util/TestMaxTokenLenTokenizer.java     | 135 +++++++++++++++++
 16 files changed, 680 insertions(+), 25 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/906679ad/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/KeywordTokenizer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/KeywordTokenizer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/KeywordTokenizer.java
index 209ecee..eb08eea 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/KeywordTokenizer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/KeywordTokenizer.java
@@ -24,6 +24,8 @@ import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.util.AttributeFactory;
 
+import static org.apache.lucene.analysis.standard.StandardTokenizer.MAX_TOKEN_LENGTH_LIMIT;
+
 /**
  * Emits the entire input as a single token.
  */
@@ -41,16 +43,16 @@ public final class KeywordTokenizer extends Tokenizer {
   }
 
   public KeywordTokenizer(int bufferSize) {
-    if (bufferSize <= 0) {
-      throw new IllegalArgumentException("bufferSize must be > 0");
+    if (bufferSize > MAX_TOKEN_LENGTH_LIMIT || bufferSize <= 0) {
+      throw new IllegalArgumentException("maxTokenLen must be greater than 0 and less than " + MAX_TOKEN_LENGTH_LIMIT + " passed: " + bufferSize);
     }
     termAtt.resizeBuffer(bufferSize);
   }
 
   public KeywordTokenizer(AttributeFactory factory, int bufferSize) {
     super(factory);
-    if (bufferSize <= 0) {
-      throw new IllegalArgumentException("bufferSize must be > 0");
+    if (bufferSize > MAX_TOKEN_LENGTH_LIMIT || bufferSize <= 0) {
+      throw new IllegalArgumentException("maxTokenLen must be greater than 0 and less than " + MAX_TOKEN_LENGTH_LIMIT + " passed: " + bufferSize);
     }
     termAtt.resizeBuffer(bufferSize);
   }

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/906679ad/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/KeywordTokenizerFactory.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/KeywordTokenizerFactory.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/KeywordTokenizerFactory.java
index 3654f67..86f65d6 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/KeywordTokenizerFactory.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/KeywordTokenizerFactory.java
@@ -16,26 +16,39 @@
  */
 package org.apache.lucene.analysis.core;
 
-
 import org.apache.lucene.analysis.util.TokenizerFactory;
 import org.apache.lucene.util.AttributeFactory;
 
 import java.util.Map;
 
+import static org.apache.lucene.analysis.standard.StandardTokenizer.MAX_TOKEN_LENGTH_LIMIT;
+
 /**
  * Factory for {@link KeywordTokenizer}. 
  * <pre class="prettyprint">
  * &lt;fieldType name="text_keyword" class="solr.TextField" positionIncrementGap="100"&gt;
  *   &lt;analyzer&gt;
- *     &lt;tokenizer class="solr.KeywordTokenizerFactory"/&gt;
+ *     &lt;tokenizer class="solr.KeywordTokenizerFactory" maxTokenLen="256"/&gt;
  *   &lt;/analyzer&gt;
  * &lt;/fieldType&gt;</pre> 
+ *
+ * Options:
+ * <ul>
+ *   <li>maxTokenLen: max token length, should be greater than 0 and less than 
+ *        MAX_TOKEN_LENGTH_LIMIT (1024*1024). It is rare to need to change this;
+ *      if unset, defaults to {@link KeywordTokenizer}::DEFAULT_BUFFER_SIZE</li>
+ * </ul>
  */
 public class KeywordTokenizerFactory extends TokenizerFactory {
+  private final int maxTokenLen;
   
   /** Creates a new KeywordTokenizerFactory */
   public KeywordTokenizerFactory(Map<String,String> args) {
     super(args);
+    maxTokenLen = getInt(args, "maxTokenLen", KeywordTokenizer.DEFAULT_BUFFER_SIZE);
+    if (maxTokenLen > MAX_TOKEN_LENGTH_LIMIT || maxTokenLen <= 0) {
+      throw new IllegalArgumentException("maxTokenLen must be greater than 0 and less than " + MAX_TOKEN_LENGTH_LIMIT + " passed: " + maxTokenLen);
+    }
     if (!args.isEmpty()) {
       throw new IllegalArgumentException("Unknown parameters: " + args);
     }
@@ -43,6 +56,6 @@ public class KeywordTokenizerFactory extends TokenizerFactory {
   
   @Override
   public KeywordTokenizer create(AttributeFactory factory) {
-    return new KeywordTokenizer(factory, KeywordTokenizer.DEFAULT_BUFFER_SIZE);
+    return new KeywordTokenizer(factory, maxTokenLen);
   }
 }

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/906679ad/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/LetterTokenizer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/LetterTokenizer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/LetterTokenizer.java
index df41b37..8fb7d0e 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/LetterTokenizer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/LetterTokenizer.java
@@ -50,6 +50,20 @@ public class LetterTokenizer extends CharTokenizer {
     super(factory);
   }
   
+  /**
+   * Construct a new LetterTokenizer using a given
+   * {@link org.apache.lucene.util.AttributeFactory}.
+   *
+   * @param factory the attribute factory to use for this {@link Tokenizer}
+   * @param maxTokenLen maximum token length the tokenizer will emit. 
+   *        Must be greater than 0 and less than MAX_TOKEN_LENGTH_LIMIT (1024*1024)
+   * @throws IllegalArgumentException if maxTokenLen is invalid.
+
+   */
+  public LetterTokenizer(AttributeFactory factory, int maxTokenLen) {
+    super(factory, maxTokenLen);
+  }
+
   /** Collects only characters which satisfy
    * {@link Character#isLetter(int)}.*/
   @Override

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/906679ad/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/LetterTokenizerFactory.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/LetterTokenizerFactory.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/LetterTokenizerFactory.java
index 828d6cf..41ada68 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/LetterTokenizerFactory.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/LetterTokenizerFactory.java
@@ -17,25 +17,40 @@
 package org.apache.lucene.analysis.core;
 
 
+import org.apache.lucene.analysis.util.CharTokenizer;
 import org.apache.lucene.analysis.util.TokenizerFactory;
 import org.apache.lucene.util.AttributeFactory;
 
 import java.util.Map;
 
+import static org.apache.lucene.analysis.standard.StandardTokenizer.MAX_TOKEN_LENGTH_LIMIT;
+
 /**
  * Factory for {@link LetterTokenizer}. 
  * <pre class="prettyprint">
  * &lt;fieldType name="text_letter" class="solr.TextField" positionIncrementGap="100"&gt;
  *   &lt;analyzer&gt;
- *     &lt;tokenizer class="solr.LetterTokenizerFactory"/&gt;
+ *     &lt;tokenizer class="solr.LetterTokenizerFactory" maxTokenLen="256"/&gt;
  *   &lt;/analyzer&gt;
  * &lt;/fieldType&gt;</pre>
+ *
+ * Options:
+ * <ul>
+ *   <li>maxTokenLen: max token length, must be greater than 0 and less than MAX_TOKEN_LENGTH_LIMIT (1024*1024).
+ *       It is rare to need to change this;
+ *      if unset, defaults to {@link CharTokenizer}::DEFAULT_MAX_WORD_LEN</li>
+ * </ul>
  */
 public class LetterTokenizerFactory extends TokenizerFactory {
+  private final int maxTokenLen;
 
   /** Creates a new LetterTokenizerFactory */
   public LetterTokenizerFactory(Map<String,String> args) {
     super(args);
+    maxTokenLen = getInt(args, "maxTokenLen", CharTokenizer.DEFAULT_MAX_WORD_LEN);
+    if (maxTokenLen > MAX_TOKEN_LENGTH_LIMIT || maxTokenLen <= 0) {
+      throw new IllegalArgumentException("maxTokenLen must be greater than 0 and less than " + MAX_TOKEN_LENGTH_LIMIT + " passed: " + maxTokenLen);
+    }
     if (!args.isEmpty()) {
       throw new IllegalArgumentException("Unknown parameters: " + args);
     }
@@ -43,6 +58,6 @@ public class LetterTokenizerFactory extends TokenizerFactory {
 
   @Override
   public LetterTokenizer create(AttributeFactory factory) {
-    return new LetterTokenizer(factory);
+    return new LetterTokenizer(factory, maxTokenLen);
   }
 }

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/906679ad/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/LowerCaseTokenizer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/LowerCaseTokenizer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/LowerCaseTokenizer.java
index 982d356..26b8747 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/LowerCaseTokenizer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/LowerCaseTokenizer.java
@@ -50,6 +50,19 @@ public final class LowerCaseTokenizer extends LetterTokenizer {
     super(factory);
   }
   
+  /**
+   * Construct a new LowerCaseTokenizer using a given
+   * {@link org.apache.lucene.util.AttributeFactory}.
+   *
+   * @param factory the attribute factory to use for this {@link Tokenizer}
+   * @param maxTokenLen maximum token length the tokenizer will emit. 
+   *        Must be greater than 0 and less than MAX_TOKEN_LENGTH_LIMIT (1024*1024)
+   * @throws IllegalArgumentException if maxTokenLen is invalid.
+   */
+  public LowerCaseTokenizer(AttributeFactory factory, int maxTokenLen) {
+    super(factory, maxTokenLen);
+  }
+  
   /** Converts char to lower case
    * {@link Character#toLowerCase(int)}.*/
   @Override

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/906679ad/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/LowerCaseTokenizerFactory.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/LowerCaseTokenizerFactory.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/LowerCaseTokenizerFactory.java
index 3e29161..a3e06c7 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/LowerCaseTokenizerFactory.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/LowerCaseTokenizerFactory.java
@@ -18,6 +18,7 @@ package org.apache.lucene.analysis.core;
 
 
 import org.apache.lucene.analysis.util.AbstractAnalysisFactory;
+import org.apache.lucene.analysis.util.CharTokenizer;
 import org.apache.lucene.analysis.util.MultiTermAwareComponent;
 import org.apache.lucene.analysis.util.TokenizerFactory;
 import org.apache.lucene.util.AttributeFactory;
@@ -25,20 +26,36 @@ import org.apache.lucene.util.AttributeFactory;
 import java.util.HashMap;
 import java.util.Map;
 
+import static org.apache.lucene.analysis.standard.StandardTokenizer.MAX_TOKEN_LENGTH_LIMIT;
+
 /**
- * Factory for {@link LowerCaseTokenizer}. 
+ * Factory for {@link LowerCaseTokenizer}.
  * <pre class="prettyprint">
  * &lt;fieldType name="text_lwrcase" class="solr.TextField" positionIncrementGap="100"&gt;
- *   &lt;analyzer&gt;
- *     &lt;tokenizer class="solr.LowerCaseTokenizerFactory"/&gt;
- *   &lt;/analyzer&gt;
+ * &lt;analyzer&gt;
+ * &lt;tokenizer class="solr.LowerCaseTokenizerFactory" maxTokenLen="256"/&gt;
+ * &lt;/analyzer&gt;
  * &lt;/fieldType&gt;</pre>
+ * <p>
+ * Options:
+ * <ul>
+ * <li>maxTokenLen: max token length, should be greater than 0 and less than MAX_TOKEN_LENGTH_LIMIT (1024*1024).
+ *     It is rare to need to change this;
+ * if unset, defaults to {@link CharTokenizer}::DEFAULT_MAX_WORD_LEN</li>
+ * </ul>
  */
 public class LowerCaseTokenizerFactory extends TokenizerFactory implements MultiTermAwareComponent {
-  
-  /** Creates a new LowerCaseTokenizerFactory */
-  public LowerCaseTokenizerFactory(Map<String,String> args) {
+  private final int maxTokenLen;
+
+  /**
+   * Creates a new LowerCaseTokenizerFactory
+   */
+  public LowerCaseTokenizerFactory(Map<String, String> args) {
     super(args);
+    maxTokenLen = getInt(args, "maxTokenLen", CharTokenizer.DEFAULT_MAX_WORD_LEN);
+    if (maxTokenLen > MAX_TOKEN_LENGTH_LIMIT || maxTokenLen <= 0) {
+      throw new IllegalArgumentException("maxTokenLen must be greater than 0 and less than " + MAX_TOKEN_LENGTH_LIMIT + " passed: " + maxTokenLen);
+    }
     if (!args.isEmpty()) {
       throw new IllegalArgumentException("Unknown parameters: " + args);
     }
@@ -46,11 +63,13 @@ public class LowerCaseTokenizerFactory extends TokenizerFactory implements Multi
 
   @Override
   public LowerCaseTokenizer create(AttributeFactory factory) {
-    return new LowerCaseTokenizer(factory);
+    return new LowerCaseTokenizer(factory, maxTokenLen);
   }
 
   @Override
   public AbstractAnalysisFactory getMultiTermComponent() {
-    return new LowerCaseFilterFactory(new HashMap<>(getOriginalArgs()));
+    Map map = new HashMap<>(getOriginalArgs());
+    map.remove("maxTokenLen"); //removing "maxTokenLen" argument for LowerCaseFilterFactory init
+    return new LowerCaseFilterFactory(map);
   }
 }

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/906679ad/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/UnicodeWhitespaceTokenizer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/UnicodeWhitespaceTokenizer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/UnicodeWhitespaceTokenizer.java
index 5e4313f..00c181f 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/UnicodeWhitespaceTokenizer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/UnicodeWhitespaceTokenizer.java
@@ -47,6 +47,19 @@ public final class UnicodeWhitespaceTokenizer extends CharTokenizer {
   public UnicodeWhitespaceTokenizer(AttributeFactory factory) {
     super(factory);
   }
+
+  /**
+   * Construct a new UnicodeWhitespaceTokenizer using a given
+   * {@link org.apache.lucene.util.AttributeFactory}.
+   *
+   * @param factory the attribute factory to use for this {@link Tokenizer}
+   * @param maxTokenLen maximum token length the tokenizer will emit. 
+   *        Must be greater than 0 and less than MAX_TOKEN_LENGTH_LIMIT (1024*1024)
+   * @throws IllegalArgumentException if maxTokenLen is invalid.
+   */
+  public UnicodeWhitespaceTokenizer(AttributeFactory factory, int maxTokenLen) {
+    super(factory, maxTokenLen);
+  }
   
   /** Collects only characters which do not satisfy Unicode's WHITESPACE property. */
   @Override

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/906679ad/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/WhitespaceTokenizer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/WhitespaceTokenizer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/WhitespaceTokenizer.java
index 70f2d62..0655227 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/WhitespaceTokenizer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/WhitespaceTokenizer.java
@@ -46,6 +46,19 @@ public final class WhitespaceTokenizer extends CharTokenizer {
   public WhitespaceTokenizer(AttributeFactory factory) {
     super(factory);
   }
+
+  /**
+   * Construct a new WhitespaceTokenizer using a given
+   * {@link org.apache.lucene.util.AttributeFactory}.
+   *
+   * @param factory the attribute factory to use for this {@link Tokenizer}
+   * @param maxTokenLen maximum token length the tokenizer will emit. 
+   *        Must be greater than 0 and less than MAX_TOKEN_LENGTH_LIMIT (1024*1024)
+   * @throws IllegalArgumentException if maxTokenLen is invalid.
+   */
+  public WhitespaceTokenizer(AttributeFactory factory, int maxTokenLen) {
+    super(factory, maxTokenLen);
+  }
   
   /** Collects only characters which do not satisfy
    * {@link Character#isWhitespace(int)}.*/

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/906679ad/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/WhitespaceTokenizerFactory.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/WhitespaceTokenizerFactory.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/WhitespaceTokenizerFactory.java
index fd38b63..29e9ed5 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/WhitespaceTokenizerFactory.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/WhitespaceTokenizerFactory.java
@@ -22,15 +22,18 @@ import java.util.Collection;
 import java.util.Map;
 
 import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.util.CharTokenizer;
 import org.apache.lucene.analysis.util.TokenizerFactory;
 import org.apache.lucene.util.AttributeFactory;
 
+import static org.apache.lucene.analysis.standard.StandardTokenizer.MAX_TOKEN_LENGTH_LIMIT;
+
 /**
  * Factory for {@link WhitespaceTokenizer}. 
  * <pre class="prettyprint">
  * &lt;fieldType name="text_ws" class="solr.TextField" positionIncrementGap="100"&gt;
  *   &lt;analyzer&gt;
- *     &lt;tokenizer class="solr.WhitespaceTokenizerFactory" rule="unicode"/&gt;
+ *     &lt;tokenizer class="solr.WhitespaceTokenizerFactory" rule="unicode"  maxTokenLen="256"/&gt;
  *   &lt;/analyzer&gt;
  * &lt;/fieldType&gt;</pre>
  *
@@ -38,6 +41,9 @@ import org.apache.lucene.util.AttributeFactory;
  * <ul>
  *   <li>rule: either "java" for {@link WhitespaceTokenizer}
  *      or "unicode" for {@link UnicodeWhitespaceTokenizer}</li>
+ *   <li>maxTokenLen: max token length, should be greater than 0 and less than MAX_TOKEN_LENGTH_LIMIT (1024*1024).
+ *       It is rare to need to change this;
+ *      if unset, defaults to {@link CharTokenizer}::DEFAULT_MAX_WORD_LEN</li>
  * </ul>
  */
 public class WhitespaceTokenizerFactory extends TokenizerFactory {
@@ -46,13 +52,17 @@ public class WhitespaceTokenizerFactory extends TokenizerFactory {
   private static final Collection<String> RULE_NAMES = Arrays.asList(RULE_JAVA, RULE_UNICODE);
 
   private final String rule;
+  private final int maxTokenLen;
 
   /** Creates a new WhitespaceTokenizerFactory */
   public WhitespaceTokenizerFactory(Map<String,String> args) {
     super(args);
 
     rule = get(args, "rule", RULE_NAMES, RULE_JAVA);
-
+    maxTokenLen = getInt(args, "maxTokenLen", CharTokenizer.DEFAULT_MAX_WORD_LEN);
+    if (maxTokenLen > MAX_TOKEN_LENGTH_LIMIT || maxTokenLen <= 0) {
+      throw new IllegalArgumentException("maxTokenLen must be greater than 0 and less than " + MAX_TOKEN_LENGTH_LIMIT + " passed: " + maxTokenLen);
+    }
     if (!args.isEmpty()) {
       throw new IllegalArgumentException("Unknown parameters: " + args);
     }
@@ -62,9 +72,9 @@ public class WhitespaceTokenizerFactory extends TokenizerFactory {
   public Tokenizer create(AttributeFactory factory) {
     switch (rule) {
       case RULE_JAVA:
-        return new WhitespaceTokenizer(factory);
+        return new WhitespaceTokenizer(factory, maxTokenLen);
       case RULE_UNICODE:
-        return new UnicodeWhitespaceTokenizer(factory);
+        return new UnicodeWhitespaceTokenizer(factory, maxTokenLen);
       default:
         throw new AssertionError();
     }

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/906679ad/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/CharTokenizer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/CharTokenizer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/CharTokenizer.java
index 13289be..ff9d6ff 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/CharTokenizer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/CharTokenizer.java
@@ -33,6 +33,8 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
 import org.apache.lucene.util.AttributeFactory;
 
+import static org.apache.lucene.analysis.standard.StandardTokenizer.MAX_TOKEN_LENGTH_LIMIT;
+
 /**
  * An abstract base class for simple, character-oriented tokenizers.
  * <p>
@@ -50,6 +52,7 @@ public abstract class CharTokenizer extends Tokenizer {
    * Creates a new {@link CharTokenizer} instance
    */
   public CharTokenizer() {
+    this.maxTokenLen = DEFAULT_MAX_WORD_LEN;
   }
   
   /**
@@ -60,6 +63,23 @@ public abstract class CharTokenizer extends Tokenizer {
    */
   public CharTokenizer(AttributeFactory factory) {
     super(factory);
+    this.maxTokenLen = DEFAULT_MAX_WORD_LEN;
+  }
+  
+  /**
+   * Creates a new {@link CharTokenizer} instance
+   *
+   * @param factory the attribute factory to use for this {@link Tokenizer}
+   * @param maxTokenLen maximum token length the tokenizer will emit. 
+   *        Must be greater than 0 and less than MAX_TOKEN_LENGTH_LIMIT (1024*1024)
+   * @throws IllegalArgumentException if maxTokenLen is invalid.
+   */
+  public CharTokenizer(AttributeFactory factory, int maxTokenLen) {
+    super(factory);
+    if (maxTokenLen > MAX_TOKEN_LENGTH_LIMIT || maxTokenLen <= 0) {
+      throw new IllegalArgumentException("maxTokenLen must be greater than 0 and less than " + MAX_TOKEN_LENGTH_LIMIT + " passed: " + maxTokenLen);
+    }
+    this.maxTokenLen = maxTokenLen;
   }
   
   /**
@@ -193,9 +213,10 @@ public abstract class CharTokenizer extends Tokenizer {
   }
   
   private int offset = 0, bufferIndex = 0, dataLen = 0, finalOffset = 0;
-  private static final int MAX_WORD_LEN = 255;
+  public static final int DEFAULT_MAX_WORD_LEN = 255;
   private static final int IO_BUFFER_SIZE = 4096;
-  
+  private final int maxTokenLen;
+
   private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
   private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
   
@@ -256,7 +277,7 @@ public abstract class CharTokenizer extends Tokenizer {
         }
         end += charCount;
         length += Character.toChars(normalize(c), buffer, length); // buffer it, normalized
-        if (length >= MAX_WORD_LEN) { // buffer overflow! make sure to check for >= surrogate pair could break == test
+        if (length >= maxTokenLen) { // buffer overflow! make sure to check for >= surrogate pair could break == test
           break;
         }
       } else if (length > 0) {           // at non-Letter w/ chars

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/906679ad/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestKeywordTokenizer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestKeywordTokenizer.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestKeywordTokenizer.java
new file mode 100644
index 0000000..3f03a00
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestKeywordTokenizer.java
@@ -0,0 +1,88 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.core;
+
+import java.io.IOException;
+import java.io.StringReader;
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.util.AttributeFactory;
+
+public class TestKeywordTokenizer extends BaseTokenStreamTestCase {
+
+  public void testSimple() throws IOException {
+    StringReader reader = new StringReader("Tokenizer \ud801\udc1ctest");
+    KeywordTokenizer tokenizer = new KeywordTokenizer();
+    tokenizer.setReader(reader);
+    assertTokenStreamContents(tokenizer, new String[]{"Tokenizer \ud801\udc1ctest"});
+  }
+
+  public void testFactory() {
+    Map<String, String> args = new HashMap<>();
+    KeywordTokenizerFactory factory = new KeywordTokenizerFactory(args);
+    AttributeFactory attributeFactory = newAttributeFactory();
+    Tokenizer tokenizer = factory.create(attributeFactory);
+    assertEquals(KeywordTokenizer.class, tokenizer.getClass());
+  }
+
+  private Map<String, String> makeArgs(String... args) {
+    Map<String, String> ret = new HashMap<>();
+    for (int idx = 0; idx < args.length; idx += 2) {
+      ret.put(args[idx], args[idx + 1]);
+    }
+    return ret;
+  }
+
+  public void testParamsFactory() throws IOException {
+    // negative maxTokenLen
+    IllegalArgumentException iae = expectThrows(IllegalArgumentException.class, () ->
+        new KeywordTokenizerFactory(makeArgs("maxTokenLen", "-1")));
+    assertEquals("maxTokenLen must be greater than 0 and less than 1048576 passed: -1", iae.getMessage());
+
+    // zero maxTokenLen
+    iae = expectThrows(IllegalArgumentException.class, () ->
+        new KeywordTokenizerFactory(makeArgs("maxTokenLen", "0")));
+    assertEquals("maxTokenLen must be greater than 0 and less than 1048576 passed: 0", iae.getMessage());
+
+    // Added random param, should throw illegal error
+    iae = expectThrows(IllegalArgumentException.class, () ->
+        new KeywordTokenizerFactory(makeArgs("maxTokenLen", "255", "randomParam", "rValue")));
+    assertEquals("Unknown parameters: {randomParam=rValue}", iae.getMessage());
+
+    // tokeniser will never split, no matter what is passed, 
+    // but the buffer will not be more than length of the token
+
+    KeywordTokenizerFactory factory = new KeywordTokenizerFactory(makeArgs("maxTokenLen", "5"));
+    AttributeFactory attributeFactory = newAttributeFactory();
+    Tokenizer tokenizer = factory.create(attributeFactory);
+    StringReader reader = new StringReader("Tokenizertest");
+    tokenizer.setReader(reader);
+    assertTokenStreamContents(tokenizer, new String[]{"Tokenizertest"});
+
+    // tokeniser will never split, no matter what is passed, 
+    // but the buffer will not be more than length of the token
+    factory = new KeywordTokenizerFactory(makeArgs("maxTokenLen", "2"));
+    attributeFactory = newAttributeFactory();
+    tokenizer = factory.create(attributeFactory);
+    reader = new StringReader("Tokenizer\u00A0test");
+    tokenizer.setReader(reader);
+    assertTokenStreamContents(tokenizer, new String[]{"Tokenizer\u00A0test"});
+  }
+}

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/906679ad/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestUnicodeWhitespaceTokenizer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestUnicodeWhitespaceTokenizer.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestUnicodeWhitespaceTokenizer.java
index acdb670..16089e9 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestUnicodeWhitespaceTokenizer.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestUnicodeWhitespaceTokenizer.java
@@ -54,4 +54,55 @@ public class TestUnicodeWhitespaceTokenizer extends BaseTokenStreamTestCase {
     assertEquals(UnicodeWhitespaceTokenizer.class, tokenizer.getClass());
   }
 
+  private Map<String, String> makeArgs(String... args) {
+    Map<String, String> ret = new HashMap<>();
+    for (int idx = 0; idx < args.length; idx += 2) {
+      ret.put(args[idx], args[idx + 1]);
+    }
+    return ret;
+  }
+
+  public void testParamsFactory() throws IOException {
+    
+
+    // negative maxTokenLen
+    IllegalArgumentException iae = expectThrows(IllegalArgumentException.class, () ->
+        new WhitespaceTokenizerFactory(makeArgs("rule", "unicode", "maxTokenLen", "-1")));
+    assertEquals("maxTokenLen must be greater than 0 and less than 1048576 passed: -1", iae.getMessage());
+
+    // zero maxTokenLen
+    iae = expectThrows(IllegalArgumentException.class, () ->
+        new WhitespaceTokenizerFactory(makeArgs("rule", "unicode", "maxTokenLen", "0")));
+    assertEquals("maxTokenLen must be greater than 0 and less than 1048576 passed: 0", iae.getMessage());
+
+    // Added random param, should throw illegal error
+    iae = expectThrows(IllegalArgumentException.class, () ->
+        new WhitespaceTokenizerFactory(makeArgs("rule", "unicode", "maxTokenLen", "255", "randomParam", "rValue")));
+    assertEquals("Unknown parameters: {randomParam=rValue}", iae.getMessage());
+
+    // tokeniser will split at 5, Token | izer, no matter what happens 
+    WhitespaceTokenizerFactory factory = new WhitespaceTokenizerFactory(makeArgs("rule", "unicode", "maxTokenLen", "5"));
+    AttributeFactory attributeFactory = newAttributeFactory();
+    Tokenizer tokenizer = factory.create(attributeFactory);
+    StringReader reader = new StringReader("Tokenizer \ud801\udc1ctest");
+    tokenizer.setReader(reader);
+    assertTokenStreamContents(tokenizer, new String[]{"Token", "izer", "\ud801\udc1ctes", "t"});
+
+    // tokeniser will split at 2, To | ke | ni | ze | r, no matter what happens 
+    factory = new WhitespaceTokenizerFactory(makeArgs("rule", "unicode", "maxTokenLen", "2"));
+    attributeFactory = newAttributeFactory();
+    tokenizer = factory.create(attributeFactory);
+    reader = new StringReader("Tokenizer\u00A0test");
+    tokenizer.setReader(reader);
+    assertTokenStreamContents(tokenizer, new String[]{"To", "ke", "ni", "ze", "r", "te", "st"});
+
+    // tokenizer would split at 10, but every token here is shorter than that,
+    // so no token is split
+    factory = new WhitespaceTokenizerFactory(makeArgs("rule", "unicode", "maxTokenLen", "10"));
+    attributeFactory = newAttributeFactory();
+    tokenizer = factory.create(attributeFactory);
+    reader = new StringReader("Tokenizer\u00A0test");
+    tokenizer.setReader(reader);
+    assertTokenStreamContents(tokenizer, new String[]{"Tokenizer", "test"});
+  }
 }

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/906679ad/lucene/analysis/common/src/test/org/apache/lucene/analysis/util/TestCharTokenizers.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/util/TestCharTokenizers.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/util/TestCharTokenizers.java
index 783fc3e..4596608 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/util/TestCharTokenizers.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/util/TestCharTokenizers.java
@@ -25,8 +25,10 @@ import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.core.KeywordTokenizer;
 import org.apache.lucene.analysis.core.LetterTokenizer;
 import org.apache.lucene.analysis.core.LowerCaseTokenizer;
+import org.apache.lucene.analysis.core.WhitespaceTokenizer;
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
 import org.apache.lucene.util.TestUtil;
 
@@ -89,6 +91,99 @@ public class TestCharTokenizers extends BaseTokenStreamTestCase {
     tokenizer.setReader(new StringReader(builder.toString() + builder.toString()));
     assertTokenStreamContents(tokenizer, new String[] {builder.toString().toLowerCase(Locale.ROOT), builder.toString().toLowerCase(Locale.ROOT)});
   }
+
+  /*
+   * Tests the max token length passed as a constructor parameter: the tokenizer splits at that length no matter what, and out-of-range lengths are rejected.
+   */
+  public void testCustomMaxTokenLength() throws IOException {
+
+    StringBuilder builder = new StringBuilder();
+    for (int i = 0; i < 100; i++) {
+      builder.append("A");
+    }
+    Tokenizer tokenizer = new LowerCaseTokenizer(newAttributeFactory(), 100);
+    // Feed two back-to-back copies (200 chars, no separator): expect two tokens of exactly 100 chars each.
+    tokenizer.setReader(new StringReader(builder.toString() + builder.toString()));
+    assertTokenStreamContents(tokenizer, new String[]{builder.toString().toLowerCase(Locale.ROOT), 
+        builder.toString().toLowerCase(Locale.ROOT) });
+
+    Exception e = expectThrows(IllegalArgumentException.class, () ->
+        new LowerCaseTokenizer(newAttributeFactory(), -1));
+    assertEquals("maxTokenLen must be greater than 0 and less than 1048576 passed: -1", e.getMessage());
+
+    tokenizer = new LetterTokenizer(newAttributeFactory(), 100);
+    tokenizer.setReader(new StringReader(builder.toString() + builder.toString()));
+    assertTokenStreamContents(tokenizer, new String[]{builder.toString(), builder.toString()});
+
+
+    // A token longer than 255 chars must come through intact when maxTokenLen allows it.
+    builder.setLength(0);
+    for (int i = 0; i < 500; i++) {
+      builder.append("Z");
+    }
+    tokenizer = new LetterTokenizer(newAttributeFactory(), 500);
+    tokenizer.setReader(new StringReader(builder.toString()));
+    assertTokenStreamContents(tokenizer, new String[]{builder.toString()});
+
+    
+    // Edge case: a single token larger than the I/O buffer (4096).
+    // Token lengths of zero (or beyond the upper bound) make no sense and must be rejected.
+    builder.setLength(0);
+    for (int i = 0; i < 600; i++) {
+      builder.append("aUrOkIjq"); // 600 * 8 = 4800 chars.
+    }
+
+    e = expectThrows(IllegalArgumentException.class, () ->
+        new LowerCaseTokenizer(newAttributeFactory(), 0));
+    assertEquals("maxTokenLen must be greater than 0 and less than 1048576 passed: 0", e.getMessage());
+
+    e = expectThrows(IllegalArgumentException.class, () ->
+        new LowerCaseTokenizer(newAttributeFactory(), 10_000_000));
+    assertEquals("maxTokenLen must be greater than 0 and less than 1048576 passed: 10000000", e.getMessage());
+
+    tokenizer = new LowerCaseTokenizer(newAttributeFactory(), 4800);
+    tokenizer.setReader(new StringReader(builder.toString()));
+    assertTokenStreamContents(tokenizer, new String[]{builder.toString().toLowerCase(Locale.ROOT)});
+
+
+    e = expectThrows(IllegalArgumentException.class, () ->
+        new KeywordTokenizer(newAttributeFactory(), 0));
+    assertEquals("maxTokenLen must be greater than 0 and less than 1048576 passed: 0", e.getMessage());
+
+    e = expectThrows(IllegalArgumentException.class, () ->
+        new KeywordTokenizer(newAttributeFactory(), 10_000_000));
+    assertEquals("maxTokenLen must be greater than 0 and less than 1048576 passed: 10000000", e.getMessage());
+
+
+    tokenizer = new KeywordTokenizer(newAttributeFactory(), 4800);
+    tokenizer.setReader(new StringReader(builder.toString()));
+    assertTokenStreamContents(tokenizer, new String[]{builder.toString()});
+
+    e = expectThrows(IllegalArgumentException.class, () ->
+        new LetterTokenizer(newAttributeFactory(), 0));
+    assertEquals("maxTokenLen must be greater than 0 and less than 1048576 passed: 0", e.getMessage());
+
+    e = expectThrows(IllegalArgumentException.class, () ->
+        new LetterTokenizer(newAttributeFactory(), 2_000_000));
+    assertEquals("maxTokenLen must be greater than 0 and less than 1048576 passed: 2000000", e.getMessage());
+
+    tokenizer = new LetterTokenizer(newAttributeFactory(), 4800);
+    tokenizer.setReader(new StringReader(builder.toString()));
+    assertTokenStreamContents(tokenizer, new String[]{builder.toString()});
+
+    e = expectThrows(IllegalArgumentException.class, () ->
+        new WhitespaceTokenizer(newAttributeFactory(), 0));
+    assertEquals("maxTokenLen must be greater than 0 and less than 1048576 passed: 0", e.getMessage());
+
+    e = expectThrows(IllegalArgumentException.class, () ->
+        new WhitespaceTokenizer(newAttributeFactory(), 3_000_000));
+    assertEquals("maxTokenLen must be greater than 0 and less than 1048576 passed: 3000000", e.getMessage());
+
+    tokenizer = new WhitespaceTokenizer(newAttributeFactory(), 4800);
+    tokenizer.setReader(new StringReader(builder.toString()));
+    assertTokenStreamContents(tokenizer, new String[]{builder.toString()});
+
+  }
   
   /*
    * tests the max word length of 255 with a surrogate pair at position 255

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/906679ad/solr/CHANGES.txt
----------------------------------------------------------------------
diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt
index d4e6eac..c413cf8 100644
--- a/solr/CHANGES.txt
+++ b/solr/CHANGES.txt
@@ -260,6 +260,9 @@ Other Changes
   
 * SOLR-10438: Assign explicit useDocValuesAsStored values to all points field types in 
   schema-point.xml/TestPointFields. (hossman, Steve Rowe)
+  
+* LUCENE-7705: Allow CharTokenizer-derived tokenizers and KeywordTokenizer to configure the max token length.
+  (Amrit Sarkar via Erick Erickson)
 
 * SOLR-10659: Remove ResponseBuilder.getSortSpec use in SearchGroupShardResponseProcessor.
   (Judith Silverman via Christine Poerschke)

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/906679ad/solr/core/src/test-files/solr/collection1/conf/schema-tokenizer-test.xml
----------------------------------------------------------------------
diff --git a/solr/core/src/test-files/solr/collection1/conf/schema-tokenizer-test.xml b/solr/core/src/test-files/solr/collection1/conf/schema-tokenizer-test.xml
new file mode 100644
index 0000000..f3d3196
--- /dev/null
+++ b/solr/core/src/test-files/solr/collection1/conf/schema-tokenizer-test.xml
@@ -0,0 +1,150 @@
+<?xml version="1.0" ?>
+<!--
+Licensed to the Apache Software Foundation (ASF) under one or more
+contributor license agreements.  See the NOTICE file distributed with
+this work for additional information regarding copyright ownership.
+The ASF licenses this file to You under the Apache License, Version 2.0
+(the "License"); you may not use this file except in compliance with
+the License.  You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
+<!-- The Solr schema file. This file should be named "schema.xml" and
+should be located where the classloader for the Solr webapp can find it.
+
+This schema is used for testing, and as such has everything and the
+kitchen sink thrown in. See example/solr/conf/schema.xml for a
+more concise example.
+
+-->
+
+<schema name="test" version="1.0">
+
+  <!-- field type definitions... note that the "name" attribute is
+  just a label to be used by field definitions.  The "class"
+  attribute and any other attributes determine the real type and
+  behavior of the fieldType.
+  -->
+
+  <!--
+  Default numeric field types. For faster range queries, consider the tint/tfloat/tlong/tdouble types.
+  -->
+  <fieldType name="int" class="solr.TrieIntField" precisionStep="0" positionIncrementGap="0"/>
+  <fieldType name="float" class="solr.TrieFloatField" precisionStep="0" positionIncrementGap="0"/>
+  <fieldType name="long" class="solr.TrieLongField" precisionStep="0" positionIncrementGap="0"/>
+  <fieldType name="double" class="solr.TrieDoubleField" precisionStep="0" positionIncrementGap="0"/>
+
+  <!--
+  Numeric field types that index each value at various levels of precision
+  to accelerate range queries when the number of values between the range
+  endpoints is large. See the javadoc for LegacyNumericRangeQuery for internal
+  implementation details.
+  -->
+
+  <!-- Separate analyzers for index and query time -->
+
+  <fieldType name="letterfieldType" class="solr.TextField" positionIncrementGap="100">
+      <analyzer type="index">
+        <tokenizer class="solr.LetterTokenizerFactory" maxTokenLen="3" />
+      </analyzer>
+      <analyzer type="query">
+        <tokenizer class="solr.StandardTokenizerFactory"/>
+      </analyzer>
+  </fieldType>
+
+  <fieldType name="lowerCasefieldType" class="solr.TextField" positionIncrementGap="100">
+    <analyzer type="index">
+      <tokenizer class="solr.LowerCaseTokenizerFactory" maxTokenLen="3" />
+    </analyzer>
+    <analyzer type="query">
+      <tokenizer class="solr.StandardTokenizerFactory"/>
+    </analyzer>
+  </fieldType>
+
+  <fieldType name="whiteSpfieldType" class="solr.TextField" positionIncrementGap="100">
+    <analyzer type="index">
+      <tokenizer class="solr.WhitespaceTokenizerFactory" maxTokenLen="3" />
+    </analyzer>
+    <analyzer type="query">
+      <tokenizer class="solr.StandardTokenizerFactory"/>
+    </analyzer>
+  </fieldType>
+
+  <fieldType name="uniWhiteSpfieldType" class="solr.TextField" positionIncrementGap="100">
+    <analyzer type="index">
+      <tokenizer class="solr.WhitespaceTokenizerFactory" rule="unicode" maxTokenLen="3" />
+    </analyzer>
+    <analyzer type="query">
+      <tokenizer class="solr.StandardTokenizerFactory"/>
+    </analyzer>
+  </fieldType>
+
+  <fieldType name="keywordfieldType" class="solr.TextField" positionIncrementGap="100">
+    <analyzer type="index">
+      <tokenizer class="solr.KeywordTokenizerFactory" maxTokenLen="3" />
+    </analyzer>
+    <analyzer type="query">
+      <tokenizer class="solr.StandardTokenizerFactory"/>
+    </analyzer>
+  </fieldType>
+
+  <!-- Same analyzers for both index and query time -->
+
+  <fieldType name="letter0fieldType" class="solr.TextField" positionIncrementGap="100">
+    <analyzer>
+      <tokenizer class="solr.LetterTokenizerFactory" maxTokenLen="3" />
+    </analyzer>
+  </fieldType>
+
+  <fieldType name="lowerCase0fieldType" class="solr.TextField" positionIncrementGap="100">
+    <analyzer>
+      <tokenizer class="solr.LowerCaseTokenizerFactory" maxTokenLen="3" />
+    </analyzer>
+  </fieldType>
+
+  <fieldType name="whiteSp0fieldType" class="solr.TextField" positionIncrementGap="100">
+    <analyzer>
+      <tokenizer class="solr.WhitespaceTokenizerFactory" maxTokenLen="3" />
+    </analyzer>
+  </fieldType>
+
+  <fieldType name="uniWhiteSp0fieldType" class="solr.TextField" positionIncrementGap="100">
+    <analyzer>
+      <tokenizer class="solr.WhitespaceTokenizerFactory" rule="unicode" maxTokenLen="3" />
+    </analyzer>
+  </fieldType>
+
+  <fieldType name="keyword0fieldType" class="solr.TextField" positionIncrementGap="100">
+    <analyzer>
+      <tokenizer class="solr.KeywordTokenizerFactory"  maxTokenLen="3" />
+    </analyzer>
+  </fieldType>
+
+  <field name="id" type="int" indexed="true" stored="true" multiValued="false" required="true"/>
+
+  <field name="letter" type="letterfieldType" indexed="true" stored="true"/>
+  <field name="lowerCase" type="lowerCasefieldType" indexed="true" stored="true"/>
+  <field name="whiteSpace" type="whiteSpfieldType" indexed="true" stored="true"/>
+  <field name="unicodeWhiteSpace" type="uniWhiteSpfieldType" indexed="true" stored="true"/>
+  <field name="keyword" type="keywordfieldType" indexed="true" stored="true"/>
+
+  <field name="letter0" type="letter0fieldType" indexed="true" stored="true"/>
+  <field name="lowerCase0" type="lowerCase0fieldType" indexed="true" stored="true"/>
+  <field name="whiteSpace0" type="whiteSp0fieldType" indexed="true" stored="true"/>
+  <field name="unicodeWhiteSpace0" type="uniWhiteSp0fieldType" indexed="true" stored="true"/>
+  <field name="keyword0" type="keyword0fieldType" indexed="true" stored="true"/>
+
+  <field name="_version_" type="long" indexed="true" stored="true" multiValued="false"/>
+
+
+  <uniqueKey>id</uniqueKey>
+
+
+</schema>

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/906679ad/solr/core/src/test/org/apache/solr/util/TestMaxTokenLenTokenizer.java
----------------------------------------------------------------------
diff --git a/solr/core/src/test/org/apache/solr/util/TestMaxTokenLenTokenizer.java b/solr/core/src/test/org/apache/solr/util/TestMaxTokenLenTokenizer.java
new file mode 100644
index 0000000..c7e0dc3
--- /dev/null
+++ b/solr/core/src/test/org/apache/solr/util/TestMaxTokenLenTokenizer.java
@@ -0,0 +1,135 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.util;
+
+import org.apache.solr.SolrTestCaseJ4;
+import org.junit.BeforeClass;
+
+/**
+ * Tests for:
+ * {@link org.apache.lucene.analysis.core.LowerCaseTokenizerFactory}
+ * {@link org.apache.lucene.analysis.core.LetterTokenizerFactory}
+ * {@link org.apache.lucene.analysis.core.KeywordTokenizerFactory}
+ * {@link org.apache.lucene.analysis.core.WhitespaceTokenizerFactory}
+ */
+
+public class TestMaxTokenLenTokenizer extends SolrTestCaseJ4 {
+  /* field names are used in accordance with the solrconfig and schema supplied */
+  private static final String ID = "id";
+
+  @BeforeClass
+  public static void beforeClass() throws Exception {
+    initCore("solrconfig-update-processor-chains.xml", "schema-tokenizer-test.xml");
+  }
+
+  public void testSingleFieldDiffAnalyzers() throws Exception {
+
+    clearIndex();
+
+    // Fields whose types use a maxTokenLen=3 tokenizer at index time and the standard tokenizer at query time.
+
+    updateJ("{\"add\":{\"doc\": {\"id\":1,\"letter\":\"letter\"}},\"commit\":{}}",null);
+    updateJ("{\"add\":{\"doc\": {\"id\":2,\"lowerCase\":\"lowerCase\"}},\"commit\":{}}",null);
+    updateJ("{\"add\":{\"doc\": {\"id\":3,\"whiteSpace\":\"whiteSpace in\"}},\"commit\":{}}",null);
+    updateJ("{\"add\":{\"doc\": {\"id\":4,\"unicodeWhiteSpace\":\"unicode in\"}},\"commit\":{}}",null);
+    updateJ("{\"add\":{\"doc\": {\"id\":5,\"keyword\":\"keyword\"}},\"commit\":{}}",null);
+
+    assertU(commit());
+
+    assertQ("Check the total number of docs", req("q","*:*"), "//result[@numFound=5]");
+
+    // Indexed tokens for "letter" with maxTokenLen=3: "let" "ter" (the unsplit word is not indexed)
+    assertQ("Check the total number of docs", req("q","letter:let"), "//result[@numFound=1]");
+    assertQ("Check the total number of docs", req("q","letter:lett"), "//result[@numFound=0]");
+
+    // Indexed tokens for "lowerCase" with maxTokenLen=3: "low" "erc" "ase" (lower-cased; the unsplit word is not indexed)
+    assertQ("Check the total number of docs", req("q","lowerCase:low"), "//result[@numFound=1]");
+    assertQ("Check the total number of docs", req("q","lowerCase:l"), "//result[@numFound=0]");
+    assertQ("Check the total number of docs", req("q","lowerCase:lo"), "//result[@numFound=0]");
+    assertQ("Check the total number of docs", req("q","lowerCase:lower"), "//result[@numFound=0]");
+
+    // Indexed tokens for "whiteSpace in" with maxTokenLen=3: "whi" "teS" "pac" "e" "in"
+    assertQ("Check the total number of docs", req("q","whiteSpace:whi"), "//result[@numFound=1]");
+    assertQ("Check the total number of docs", req("q","whiteSpace:teS"), "//result[@numFound=1]");
+    assertQ("Check the total number of docs", req("q","whiteSpace:in"), "//result[@numFound=1]");
+    assertQ("Check the total number of docs", req("q","whiteSpace:white"), "//result[@numFound=0]");
+
+    // Indexed tokens for "unicode in" with maxTokenLen=3: "uni" "cod" "e" "in"
+    assertQ("Check the total number of docs", req("q","unicodeWhiteSpace:uni"), "//result[@numFound=1]");
+    assertQ("Check the total number of docs", req("q","unicodeWhiteSpace:cod"), "//result[@numFound=1]");
+    assertQ("Check the total number of docs", req("q","unicodeWhiteSpace:e"), "//result[@numFound=1]");
+    assertQ("Check the total number of docs", req("q","unicodeWhiteSpace:unico"), "//result[@numFound=0]");
+
+    // Expected: "keyword" is indexed as one whole token even though maxTokenLen=3, so only the full value matches
+    assertQ("Check the total number of docs", req("q","keyword:keyword"), "//result[@numFound=1]");
+    assertQ("Check the total number of docs", req("q","keyword:key"), "//result[@numFound=0]");
+
+  }
+
+  public void testSingleFieldSameAnalyzers() throws Exception {
+
+    clearIndex();
+
+    // Fields whose types use the same maxTokenLen=3 tokenizer at both index and query time.
+
+    updateJ("{\"add\":{\"doc\": {\"id\":1,\"letter0\":\"letter\"}},\"commit\":{}}",null);
+    updateJ("{\"add\":{\"doc\": {\"id\":2,\"lowerCase0\":\"lowerCase\"}},\"commit\":{}}",null);
+    updateJ("{\"add\":{\"doc\": {\"id\":3,\"whiteSpace0\":\"whiteSpace in\"}},\"commit\":{}}",null);
+    updateJ("{\"add\":{\"doc\": {\"id\":4,\"unicodeWhiteSpace0\":\"unicode in\"}},\"commit\":{}}",null);
+    updateJ("{\"add\":{\"doc\": {\"id\":5,\"keyword0\":\"keyword\"}},\"commit\":{}}",null);
+
+    assertU(commit());
+
+    assertQ("Check the total number of docs", req("q","*:*"), "//result[@numFound=5]");
+
+    // Indexed tokens for "letter" with maxTokenLen=3: "let" "ter"
+    // The query text is split the same way, so a query whose first three letters form an indexed token should match
+    assertQ("Check the total number of docs", req("q","letter0:l"), "//result[@numFound=0]");
+    assertQ("Check the total number of docs", req("q","letter0:let"), "//result[@numFound=1]");
+    assertQ("Check the total number of docs", req("q","letter0:lett"), "//result[@numFound=1]");
+    assertQ("Check the total number of docs", req("q","letter0:letXYZ"), "//result[@numFound=1]");
+
+    // Indexed tokens for "lowerCase" with maxTokenLen=3: "low" "erc" "ase" (lower-cased)
+    // The query text is split the same way, so a query whose first three letters form an indexed token should match
+    assertQ("Check the total number of docs", req("q","lowerCase0:low"), "//result[@numFound=1]");
+    assertQ("Check the total number of docs", req("q","lowerCase0:l"), "//result[@numFound=0]");
+    assertQ("Check the total number of docs", req("q","lowerCase0:lo"), "//result[@numFound=0]");
+    assertQ("Check the total number of docs", req("q","lowerCase0:lowerXYZ"), "//result[@numFound=1]");
+
+    // Indexed tokens for "whiteSpace in" with maxTokenLen=3: "whi" "teS" "pac" "e" "in"
+    // The query text is split the same way, so a query whose first three letters form an indexed token should match
+    assertQ("Check the total number of docs", req("q","whiteSpace0:h"), "//result[@numFound=0]");
+    assertQ("Check the total number of docs", req("q","whiteSpace0:whi"), "//result[@numFound=1]");
+    assertQ("Check the total number of docs", req("q","whiteSpace0:teS"), "//result[@numFound=1]");
+    assertQ("Check the total number of docs", req("q","whiteSpace0:in"), "//result[@numFound=1]");
+    assertQ("Check the total number of docs", req("q","whiteSpace0:whiteZKY"), "//result[@numFound=1]");
+
+    // Indexed tokens for "unicode in" with maxTokenLen=3: "uni" "cod" "e" "in"
+    // The query text is split the same way, so a query whose first three letters form an indexed token should match
+    assertQ("Check the total number of docs", req("q","unicodeWhiteSpace0:u"), "//result[@numFound=0]");
+    assertQ("Check the total number of docs", req("q","unicodeWhiteSpace0:uni"), "//result[@numFound=1]");
+    assertQ("Check the total number of docs", req("q","unicodeWhiteSpace0:cod"), "//result[@numFound=1]");
+    assertQ("Check the total number of docs", req("q","unicodeWhiteSpace0:e"), "//result[@numFound=1]");
+    assertQ("Check the total number of docs", req("q","unicodeWhiteSpace0:unicoVBRT"), "//result[@numFound=1]");
+
+    // Expected: "keyword" is indexed as one whole token even though maxTokenLen=3, so only the full value matches
+    assertQ("Check the total number of docs", req("q","keyword0:keyword"), "//result[@numFound=1]");
+    assertQ("Check the total number of docs", req("q","keyword0:key"), "//result[@numFound=0]");
+
+  }
+}


Mime
View raw message