lucene-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From da...@apache.org
Subject [6/7] lucene-solr:jira/gradle: Adding solr:analysis-extras module
Date Fri, 02 Nov 2018 10:30:04 GMT
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/6c070b4a/solr/contrib/analysis-extras/src/main/java/org/apache/solr/update/processor/OpenNLPExtractNamedEntitiesUpdateProcessorFactory.java
----------------------------------------------------------------------
diff --git a/solr/contrib/analysis-extras/src/main/java/org/apache/solr/update/processor/OpenNLPExtractNamedEntitiesUpdateProcessorFactory.java b/solr/contrib/analysis-extras/src/main/java/org/apache/solr/update/processor/OpenNLPExtractNamedEntitiesUpdateProcessorFactory.java
new file mode 100644
index 0000000..d69c367
--- /dev/null
+++ b/solr/contrib/analysis-extras/src/main/java/org/apache/solr/update/processor/OpenNLPExtractNamedEntitiesUpdateProcessorFactory.java
@@ -0,0 +1,577 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.update.processor;
+
+import java.io.IOException;
+import java.lang.invoke.MethodHandles;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+import java.util.regex.PatternSyntaxException;
+
+import opennlp.tools.util.Span;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.opennlp.OpenNLPTokenizer;
+import org.apache.lucene.analysis.opennlp.tools.NLPNERTaggerOp;
+import org.apache.lucene.analysis.opennlp.tools.OpenNLPOpsFactory;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.solr.common.SolrException;
+import org.apache.solr.common.SolrInputDocument;
+import org.apache.solr.common.SolrInputField;
+import org.apache.solr.common.util.NamedList;
+import org.apache.solr.common.util.Pair;
+import org.apache.solr.core.SolrCore;
+import org.apache.solr.request.SolrQueryRequest;
+import org.apache.solr.response.SolrQueryResponse;
+import org.apache.solr.schema.FieldType;
+import org.apache.solr.update.AddUpdateCommand;
+import org.apache.solr.update.processor.FieldMutatingUpdateProcessor.FieldNameSelector;
+import org.apache.solr.update.processor.FieldMutatingUpdateProcessorFactory.SelectorParams;
+import org.apache.solr.util.plugin.SolrCoreAware;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import static org.apache.solr.common.SolrException.ErrorCode.SERVER_ERROR;
+
+/**
+ * Extracts named entities using an OpenNLP NER <code>modelFile</code> from the values found in
+ * any matching <code>source</code> field into a configured <code>dest</code> field, after
+ * first tokenizing the source text using the index analyzer on the configured
+ * <code>analyzerFieldType</code>, which must include <code>solr.OpenNLPTokenizerFactory</code>
+ * as the tokenizer. E.g.:
+ *
+ * <pre class="prettyprint">
+ *   &lt;fieldType name="opennlp-en-tokenization" class="solr.TextField"&gt;
+ *     &lt;analyzer&gt;
+ *       &lt;tokenizer class="solr.OpenNLPTokenizerFactory"
+ *                  sentenceModel="en-sent.bin"
+ *                  tokenizerModel="en-tokenizer.bin"/&gt;
+ *     &lt;/analyzer&gt;
+ *   &lt;/fieldType&gt;
+ * </pre>
+ * 
+ * <p>See the <a href="http://opennlp.apache.org/models.html">OpenNLP website</a>
+ * for information on downloading pre-trained models.</p>
+ *
+ * Note that in order to use model files larger than 1MB on SolrCloud, 
+ * <a href="https://lucene.apache.org/solr/guide/setting-up-an-external-zookeeper-ensemble#increasing-zookeeper-s-1mb-file-size-limit"
+ * >ZooKeeper server and client configuration is required</a>.
+ * 
+ * <p>
+ * The <code>source</code> field(s) can be configured as either:
+ * </p>
+ * <ul>
+ *  <li>One or more <code>&lt;str&gt;</code></li>
+ *  <li>An <code>&lt;arr&gt;</code> of <code>&lt;str&gt;</code></li>
+ *  <li>A <code>&lt;lst&gt;</code> containing
+ *   {@link FieldMutatingUpdateProcessor FieldMutatingUpdateProcessorFactory style selector arguments}</li>
+ * </ul>
+ *
+ * <p>The <code>dest</code> field can be a single <code>&lt;str&gt;</code>
+ * containing the literal name of a destination field, or it may be a <code>&lt;lst&gt;</code> specifying a
+ * regex <code>pattern</code> and a <code>replacement</code> string. If the pattern + replacement option
+ * is used the pattern will be matched against all fields matched by the source selector, and the replacement
+ * string (including any capture groups specified from the pattern) will be evaluated using
+ * {@link Matcher#replaceAll(String)} to generate the literal name of the destination field.  Additionally,
+ * an occurrence of the string "{EntityType}" in the <code>dest</code> field specification, or in the
+ * <code>replacement</code> string, will be replaced with the entity type(s) returned for each entity by
+ * the OpenNLP NER model; as a result, if the model extracts more than one entity type, then more than one
+ * <code>dest</code> field will be populated.
+ * </p>
+ *
+ * <p>If the resolved <code>dest</code> field already exists in the document, then the
+ * named entities extracted from the <code>source</code> fields will be added to it.
+ * </p>
+ * <p>
+ * In the example below:
+ * </p>
+ * <ul>
+ *   <li>Named entities will be extracted from the <code>text</code> field and added
+ *       to the <code>people_s</code> field</li>
+ *   <li>Named entities will be extracted from both the <code>title</code> and
+ *       <code>subtitle</code> fields and added into the <code>titular_people</code> field</li>
+ *   <li>Named entities will be extracted from any field with a name ending in <code>_txt</code>
+ *       -- except for <code>notes_txt</code> -- and added into the <code>people_s</code> field</li>
+ *   <li>Named entities will be extracted from any field with a name beginning with "desc" and
+ *       ending in "s" (e.g. "descs" and "descriptions") and added to a field prefixed with "key_",
+ *       not ending in "s", and suffixed with "_people". (e.g. "key_desc_people" or
+ *       "key_description_people")</li>
+ *   <li>Named entities will be extracted from the <code>summary</code> field and added
+ *       to the <code>summary_person_s</code> field, assuming that the modelFile only extracts
+ *       entities of type "person".</li>
+ * </ul>
+ *
+ * <pre class="prettyprint">
+ * &lt;updateRequestProcessorChain name="multiple-extract"&gt;
+ *   &lt;processor class="solr.OpenNLPExtractNamedEntitiesUpdateProcessorFactory"&gt;
+ *     &lt;str name="modelFile"&gt;en-test-ner-person.bin&lt;/str&gt;
+ *     &lt;str name="analyzerFieldType"&gt;opennlp-en-tokenization&lt;/str&gt;
+ *     &lt;str name="source"&gt;text&lt;/str&gt;
+ *     &lt;str name="dest"&gt;people_s&lt;/str&gt;
+ *   &lt;/processor&gt;
+ *   &lt;processor class="solr.OpenNLPExtractNamedEntitiesUpdateProcessorFactory"&gt;
+ *     &lt;str name="modelFile"&gt;en-test-ner-person.bin&lt;/str&gt;
+ *     &lt;str name="analyzerFieldType"&gt;opennlp-en-tokenization&lt;/str&gt;
+ *     &lt;arr name="source"&gt;
+ *       &lt;str&gt;title&lt;/str&gt;
+ *       &lt;str&gt;subtitle&lt;/str&gt;
+ *     &lt;/arr&gt;
+ *     &lt;str name="dest"&gt;titular_people&lt;/str&gt;
+ *   &lt;/processor&gt;
+ *   &lt;processor class="solr.OpenNLPExtractNamedEntitiesUpdateProcessorFactory"&gt;
+ *     &lt;str name="modelFile"&gt;en-test-ner-person.bin&lt;/str&gt;
+ *     &lt;str name="analyzerFieldType"&gt;opennlp-en-tokenization&lt;/str&gt;
+ *     &lt;lst name="source"&gt;
+ *       &lt;str name="fieldRegex"&gt;.*_txt$&lt;/str&gt;
+ *       &lt;lst name="exclude"&gt;
+ *         &lt;str name="fieldName"&gt;notes_txt&lt;/str&gt;
+ *       &lt;/lst&gt;
+ *     &lt;/lst&gt;
+ *     &lt;str name="dest"&gt;people_s&lt;/str&gt;
+ *   &lt;/processor&gt;
+ *   &lt;processor class="solr.processor.OpenNLPExtractNamedEntitiesUpdateProcessorFactory"&gt;
+ *     &lt;str name="modelFile"&gt;en-test-ner-person.bin&lt;/str&gt;
+ *     &lt;str name="analyzerFieldType"&gt;opennlp-en-tokenization&lt;/str&gt;
+ *     &lt;lst name="source"&gt;
+ *       &lt;str name="fieldRegex"&gt;^desc(.*)s$&lt;/str&gt;
+ *     &lt;/lst&gt;
+ *     &lt;lst name="dest"&gt;
+ *       &lt;str name="pattern"&gt;^desc(.*)s$&lt;/str&gt;
+ *       &lt;str name="replacement"&gt;key_desc$1_people&lt;/str&gt;
+ *     &lt;/lst&gt;
+ *   &lt;/processor&gt;
+ *   &lt;processor class="solr.OpenNLPExtractNamedEntitiesUpdateProcessorFactory"&gt;
+ *     &lt;str name="modelFile"&gt;en-test-ner-person.bin&lt;/str&gt;
+ *     &lt;str name="analyzerFieldType"&gt;opennlp-en-tokenization&lt;/str&gt;
+ *     &lt;str name="source"&gt;summary&lt;/str&gt;
+ *     &lt;str name="dest"&gt;summary_{EntityType}_s&lt;/str&gt;
+ *   &lt;/processor&gt;
+ *   &lt;processor class="solr.LogUpdateProcessorFactory" /&gt;
+ *   &lt;processor class="solr.RunUpdateProcessorFactory" /&gt;
+ * &lt;/updateRequestProcessorChain&gt;
+ * </pre>
+ *
+ * @since 7.3.0
+ */
+public class OpenNLPExtractNamedEntitiesUpdateProcessorFactory
+    extends UpdateRequestProcessorFactory implements SolrCoreAware {
+
+  private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
+
+  // Names of the init params read by init(NamedList) and its helpers.
+  public static final String SOURCE_PARAM = "source";
+  public static final String DEST_PARAM = "dest";
+  public static final String PATTERN_PARAM = "pattern";
+  public static final String REPLACEMENT_PARAM = "replacement";
+  public static final String MODEL_PARAM = "modelFile";
+  public static final String ANALYZER_FIELD_TYPE_PARAM = "analyzerFieldType";
+  /** Placeholder in the resolved destination field name that is replaced by each entity's type. */
+  public static final String ENTITY_TYPE = "{EntityType}";
+
+  // Source field selection criteria collected during init; turned into srcSelector by inform(SolrCore).
+  private SelectorParams srcInclusions = new SelectorParams();
+  private Collection<SelectorParams> srcExclusions = new ArrayList<>();
+
+  /** Assigned by inform(SolrCore); null until then (see getSourceSelector()). */
+  private FieldNameSelector srcSelector = null;
+
+  /** Value of the "modelFile" init param; handed to OpenNLPOpsFactory to obtain the NER model/tagger. */
+  private String modelFile = null;
+  /** Value of the "analyzerFieldType" init param; names the schema field type whose index analyzer is used. */
+  private String analyzerFieldType = null;
+
+  /**
+   * If pattern is null, then this is a literal field name.  If pattern is non-null then this
+   * is a replacement string that may contain meta-characters (ie: capture group identifiers)
+   * @see #pattern
+   */
+  private String dest = null;
+  /** @see #dest */
+  private Pattern pattern = null;
+
+  /**
+   * Returns the source field selector assigned by {@link #inform(SolrCore)}.
+   *
+   * @throws SolrException if the selector has not been assigned yet
+   */
+  protected final FieldNameSelector getSourceSelector() {
+    if (srcSelector == null) {
+      throw new SolrException(SERVER_ERROR, "selector was never initialized, inform(SolrCore) never called???");
+    }
+    return srcSelector;
+  }
+
+  /**
+   * Reads this factory's configuration from the given init params.
+   *
+   * <p>The mandatory <code>modelFile</code> and <code>analyzerFieldType</code> params are consumed
+   * first, then the remaining params are handed to the helper for whichever syntax is in use:
+   * the full syntax (<code>source</code> + <code>dest</code>) or the short hand syntax
+   * (<code>pattern</code> + <code>replacement</code>).  Any param left over afterwards is rejected.
+   *
+   * @param args the init params; recognized entries are removed from this list
+   * @throws SolrException if a mandatory param is missing or malformed, or an unexpected param is present
+   */
+  @SuppressWarnings("unchecked")
+  @Override
+  public void init(NamedList args) {
+
+    // Consume the params common to both syntaxes before dispatching.  The short hand helper
+    // rejects every param other than pattern/replacement, so leaving these two mandatory params
+    // in the list would make the short hand syntax impossible to use.
+    modelFile = removeRequiredStringParam(args, MODEL_PARAM);
+    analyzerFieldType = removeRequiredStringParam(args, ANALYZER_FIELD_TYPE_PARAM);
+
+    // high level (loose) check for which type of config we have.
+    //
+    // individual init methods do more strict syntax checking
+    if (0 <= args.indexOf(SOURCE_PARAM, 0) && 0 <= args.indexOf(DEST_PARAM, 0) ) {
+      initSourceSelectorSyntax(args);
+    } else if (0 <= args.indexOf(PATTERN_PARAM, 0) && 0 <= args.indexOf(REPLACEMENT_PARAM, 0)) {
+      initSimpleRegexReplacement(args);
+    } else {
+      throw new SolrException(SERVER_ERROR, "A combination of either '" + SOURCE_PARAM + "' + '"+
+          DEST_PARAM + "', or '" + REPLACEMENT_PARAM + "' + '" +
+          PATTERN_PARAM + "' init params are mandatory");
+    }
+
+    if (0 < args.size()) {
+      throw new SolrException(SERVER_ERROR, "Unexpected init param(s): '" + args.getName(0) + "'");
+    }
+
+    super.init(args);
+  }
+
+  /**
+   * Removes the named param from <code>args</code> and returns it as a string.
+   *
+   * @throws SolrException if the param is absent or is not a character sequence (i.e. not a &lt;str&gt;)
+   */
+  private static String removeRequiredStringParam(NamedList args, String paramName) {
+    Object value = args.remove(paramName);
+    if (null == value) {
+      throw new SolrException(SERVER_ERROR, "Missing required init param '" + paramName + "'");
+    }
+    if ( ! (value instanceof CharSequence)) {
+      throw new SolrException(SERVER_ERROR, "Init param '" + paramName + "' must be a <str>");
+    }
+    return value.toString();
+  }
+
+  /**
+   * init helper method that should only be called when we know for certain that both the
+   * "source" and "dest" init params do <em>not</em> exist.
+   */
+  @SuppressWarnings("unchecked")
+  private void initSimpleRegexReplacement(NamedList args) {
+    // The syntactic sugar for the case where there is only one regex pattern for source and the same pattern
+    // is used for the destination pattern...
+    //
+    //  pattern != null && replacement != null
+    //
+    // ...as top level elements, with no other config options specified
+
+    // if we got here we know we had pattern and replacement, now check for the other two  so that we can give a better
+    // message than "unexpected"
+    if (0 <= args.indexOf(SOURCE_PARAM, 0) || 0 <= args.indexOf(DEST_PARAM, 0) ) {
+      throw new SolrException(SERVER_ERROR,"Short hand syntax must not be mixed with full syntax. Found " +
+          PATTERN_PARAM + " and " + REPLACEMENT_PARAM + " but also found " + SOURCE_PARAM + " or " + DEST_PARAM);
+    }
+
+    assert args.indexOf(SOURCE_PARAM, 0) < 0;
+
+    Object patt = args.remove(PATTERN_PARAM);
+    Object replacement = args.remove(REPLACEMENT_PARAM);
+
+    if (null == patt || null == replacement) {
+      throw new SolrException(SERVER_ERROR, "Init params '" + PATTERN_PARAM + "' and '" +
+          REPLACEMENT_PARAM + "' are both mandatory if '" + SOURCE_PARAM + "' and '"+
+          DEST_PARAM + "' are not both specified");
+    }
+
+    if (0 != args.size()) {
+      throw new SolrException(SERVER_ERROR, "Init params '" + REPLACEMENT_PARAM + "' and '" +
+          PATTERN_PARAM + "' must be children of '" + DEST_PARAM +
+          "' to be combined with other options.");
+    }
+
+    if (!(replacement instanceof String)) {
+      throw new SolrException(SERVER_ERROR, "Init param '" + REPLACEMENT_PARAM + "' must be a string (i.e. <str>)");
+    }
+    if (!(patt instanceof String)) {
+      throw new SolrException(SERVER_ERROR, "Init param '" + PATTERN_PARAM + "' must be a string (i.e. <str>)");
+    }
+
+    dest = replacement.toString();
+    try {
+      this.pattern = Pattern.compile(patt.toString());
+    } catch (PatternSyntaxException pe) {
+      throw new SolrException(SERVER_ERROR, "Init param " + PATTERN_PARAM +
+          " is not a valid regex pattern: " + patt, pe);
+
+    }
+    srcInclusions = new SelectorParams();
+    srcInclusions.fieldRegex = Collections.singletonList(this.pattern);
+  }
+
+  /**
+   * Init helper for the full syntax, to be called only when both the "source" and "dest" init
+   * params <em>do</em> exist.
+   *
+   * <p>"source" may be one or more strings, or a single &lt;lst&gt; of selector options (optionally
+   * with nested "exclude" lists).  "dest" may be a literal field name, or a &lt;lst&gt; holding a
+   * regex "pattern" and a "replacement" string.  Consumed params are removed from <code>args</code>.
+   *
+   * @throws SolrException if the short hand params are also present, or if "source"/"dest" are malformed
+   */
+  @SuppressWarnings("unchecked")
+  private void initSourceSelectorSyntax(NamedList args) {
+    // Full and complete syntax where source and dest are mandatory.
+    //
+    // source may be a single string or a selector.
+    // dest may be a single string or list containing pattern and replacement
+    //
+    //   source != null && dest != null
+
+    // if we got here we know we had source and dest, now check for the other two so that we can give a better
+    // message than "unexpected"
+    if (0 <= args.indexOf(PATTERN_PARAM, 0) || 0 <= args.indexOf(REPLACEMENT_PARAM, 0) ) {
+      throw new SolrException(SERVER_ERROR,"Short hand syntax must not be mixed with full syntax. Found " +
+          SOURCE_PARAM + " and " + DEST_PARAM + " but also found " + PATTERN_PARAM + " or " + REPLACEMENT_PARAM);
+    }
+
+    Object d = args.remove(DEST_PARAM);
+    assert null != d;
+
+    List<Object> sources = args.getAll(SOURCE_PARAM);
+    assert null != sources;
+
+    if (1 == sources.size()) {
+      if (sources.get(0) instanceof NamedList) {
+        // nested set of selector options
+        NamedList selectorConfig = (NamedList) args.remove(SOURCE_PARAM);
+
+        srcInclusions = parseSelectorParams(selectorConfig);
+
+        List<Object> excList = selectorConfig.getAll("exclude");
+
+        for (Object excObj : excList) {
+          if (null == excObj) {
+            throw new SolrException(SERVER_ERROR, "Init param '" + SOURCE_PARAM +
+                "' child 'exclude' can not be null");
+          }
+          if (!(excObj instanceof NamedList)) {
+            throw new SolrException(SERVER_ERROR, "Init param '" + SOURCE_PARAM +
+                "' child 'exclude' must be <lst/>");
+          }
+          NamedList exc = (NamedList) excObj;
+          srcExclusions.add(parseSelectorParams(exc));
+          if (0 < exc.size()) {
+            // Report the leftover entry of the exclude list itself, not of its parent selector list.
+            throw new SolrException(SERVER_ERROR, "Init param '" + SOURCE_PARAM +
+                "' has unexpected 'exclude' sub-param(s): '"
+                + exc.getName(0) + "'");
+          }
+          // call once per instance
+          selectorConfig.remove("exclude");
+        }
+
+        if (0 < selectorConfig.size()) {
+          throw new SolrException(SERVER_ERROR, "Init param '" + SOURCE_PARAM +
+              "' contains unexpected child param(s): '" +
+              selectorConfig.getName(0) + "'");
+        }
+        // consume from the local copy so it doesn't interfere with subsequent processing
+        sources.remove(0);
+      }
+    }
+    if (1 <= sources.size()) {
+      // source better be one or more strings
+      srcInclusions.fieldName = new HashSet<>(args.removeConfigArgs("source"));
+    }
+    if (srcInclusions == null) {
+      throw new SolrException(SERVER_ERROR,
+          "Init params do not specify any field from which to extract entities, please supply either "
+          + SOURCE_PARAM + " and " + DEST_PARAM + " or " + PATTERN_PARAM + " and " + REPLACEMENT_PARAM + ". See javadocs " +
+          "for OpenNLPExtractNamedEntitiesUpdateProcessor for further details.");
+    }
+
+    if (d instanceof NamedList) {
+      NamedList destList = (NamedList) d;
+
+      Object patt = destList.remove(PATTERN_PARAM);
+      Object replacement = destList.remove(REPLACEMENT_PARAM);
+
+      if (null == patt || null == replacement) {
+        throw new SolrException(SERVER_ERROR, "Init param '" + DEST_PARAM + "' children '" +
+            PATTERN_PARAM + "' and '" + REPLACEMENT_PARAM +
+            "' are both mandatory and can not be null");
+      }
+      if (! (patt instanceof String && replacement instanceof String)) {
+        throw new SolrException(SERVER_ERROR, "Init param '" + DEST_PARAM + "' children '" +
+            PATTERN_PARAM + "' and '" + REPLACEMENT_PARAM +
+            "' must both be strings (i.e. <str>)");
+      }
+      if (0 != destList.size()) {
+        throw new SolrException(SERVER_ERROR, "Init param '" + DEST_PARAM + "' has unexpected children: '"
+            + destList.getName(0) + "'");
+      }
+
+      try {
+        this.pattern = Pattern.compile(patt.toString());
+      } catch (PatternSyntaxException pe) {
+        throw new SolrException(SERVER_ERROR, "Init param '" + DEST_PARAM + "' child '" + PATTERN_PARAM +
+            "' is not a valid regex pattern: " + patt, pe);
+      }
+      dest = replacement.toString();
+
+    } else if (d instanceof String) {
+      dest = d.toString();
+    } else {
+      throw new SolrException(SERVER_ERROR, "Init param '" + DEST_PARAM + "' must either be a string " +
+          "(i.e. <str>) or a list (i.e. <lst>) containing '" +
+          PATTERN_PARAM + "' and '" + REPLACEMENT_PARAM + "'");
+    }
+
+  }
+
+  /**
+   * Builds the source field selector from the configured inclusions, wraps it once per configured
+   * exclusion, and then requests the NER model for <code>modelFile</code> from
+   * {@link OpenNLPOpsFactory} using the core's resource loader (the returned model is not used
+   * here -- presumably this pre-loads it and surfaces a bad model file early; TODO confirm).
+   *
+   * @throws IllegalArgumentException wrapping any IOException raised while requesting the model
+   */
+  @Override
+  public void inform(final SolrCore core) {
+
+    srcSelector = FieldMutatingUpdateProcessor.createFieldNameSelector(
+        core.getResourceLoader(), core, srcInclusions, FieldMutatingUpdateProcessor.SELECT_NO_FIELDS);
+
+    for (SelectorParams exclusion : srcExclusions) {
+      final FieldNameSelector excluded = FieldMutatingUpdateProcessor.createFieldNameSelector(
+          core.getResourceLoader(), core, exclusion, FieldMutatingUpdateProcessor.SELECT_NO_FIELDS);
+      srcSelector = FieldMutatingUpdateProcessor.wrap(srcSelector, excluded);
+    }
+
+    try {
+      OpenNLPOpsFactory.getNERTaggerModel(modelFile, core.getResourceLoader());
+    } catch (IOException ioe) {
+      throw new IllegalArgumentException(ioe);
+    }
+  }
+
+  /**
+   * Creates a processor that, for every added document, runs the configured NER model over the
+   * values of each selected source field and appends the extracted entity names to the resolved
+   * destination field(s) before passing the command to <code>next</code>.
+   *
+   * @throws SolrException if the selector was never initialized, or if the configured
+   *         <code>analyzerFieldType</code> is not in the request's schema
+   * @throws IllegalArgumentException wrapping any IOException raised while obtaining the NER tagger
+   */
+  @Override
+  public final UpdateRequestProcessor getInstance
+      (SolrQueryRequest req, SolrQueryResponse rsp, UpdateRequestProcessor next) {
+    final FieldNameSelector srcSelector = getSourceSelector();
+    return new UpdateRequestProcessor(next) {
+      private final NLPNERTaggerOp nerTaggerOp;
+      private Analyzer analyzer = null;
+      {
+        try {
+          nerTaggerOp = OpenNLPOpsFactory.getNERTagger(modelFile);
+          FieldType fieldType = req.getSchema().getFieldTypeByName(analyzerFieldType);
+          if (fieldType == null) {
+            throw new SolrException
+                (SERVER_ERROR, ANALYZER_FIELD_TYPE_PARAM + " '" + analyzerFieldType + "' not found in the schema.");
+          }
+          analyzer = fieldType.getIndexAnalyzer();
+        } catch (IOException e) {
+          throw new IllegalArgumentException(e);
+        }
+      }
+
+      @Override
+      public void processAdd(AddUpdateCommand cmd) throws IOException {
+
+        final SolrInputDocument doc = cmd.getSolrInputDocument();
+
+        // Destination may be regex replace string, or "{EntityType}" replaced by
+        // each entity's type, both of which can cause multiple output fields.
+        Map<String,SolrInputField> destMap = new HashMap<>();
+
+        for (final String fname : doc.getFieldNames()) {
+          if ( ! srcSelector.shouldMutate(fname)) continue;
+
+          Collection<Object> srcFieldValues = doc.getFieldValues(fname);
+          if (srcFieldValues == null || srcFieldValues.isEmpty()) continue;
+
+          String resolvedDest = dest;
+
+          if (pattern != null) {
+            Matcher matcher = pattern.matcher(fname);
+            if (matcher.find()) {
+              resolvedDest = matcher.replaceAll(dest);
+            } else {
+              log.debug("srcSelector.shouldMutate(\"{}\") returned true, " +
+                  "but replacement pattern did not match, field skipped.", fname);
+              continue;
+            }
+          }
+
+          for (Object val : srcFieldValues) {
+            if (val == null) continue;   // nothing to analyze; avoids an NPE on toString()
+            for (Pair<String,String> entity : extractTypedNamedEntities(val)) {
+              SolrInputField destField = null;
+              String entityName = entity.first();
+              String entityType = entity.second();
+              final String resolved = resolvedDest.replace(ENTITY_TYPE, entityType);
+              if (doc.containsKey(resolved)) {
+                destField = doc.getField(resolved);
+              } else {
+                SolrInputField targetField = destMap.get(resolved);
+                if (targetField == null) {
+                  destField = new SolrInputField(resolved);
+                } else {
+                  destField = targetField;
+                }
+              }
+              destField.addValue(entityName);
+
+              // put it in map to avoid concurrent modification...
+              destMap.put(resolved, destField);
+            }
+          }
+        }
+
+        // Only now, with iteration over the document's field names finished, add the new fields.
+        for (Map.Entry<String,SolrInputField> entry : destMap.entrySet()) {
+          doc.put(entry.getKey(), entry.getValue());
+        }
+        super.processAdd(cmd);
+      }
+
+      /** Using configured NER model, extracts (name, type) pairs from the given source field value */
+      private List<Pair<String,String>> extractTypedNamedEntities(Object srcFieldValue) throws IOException {
+        List<Pair<String,String>> entitiesWithType = new ArrayList<>();
+        List<String> terms = new ArrayList<>();
+        List<Integer> startOffsets = new ArrayList<>();
+        List<Integer> endOffsets = new ArrayList<>();
+        String fullText = srcFieldValue.toString();
+        synchronized (nerTaggerOp) {
+          // try-with-resources: the stream is closed even when tokenization or tagging throws
+          try (TokenStream tokenStream = analyzer.tokenStream("", fullText)) {
+            CharTermAttribute termAtt = tokenStream.addAttribute(CharTermAttribute.class);
+            OffsetAttribute offsetAtt = tokenStream.addAttribute(OffsetAttribute.class);
+            FlagsAttribute flagsAtt = tokenStream.addAttribute(FlagsAttribute.class);
+            tokenStream.reset();
+            while (tokenStream.incrementToken()) {
+              terms.add(termAtt.toString());
+              startOffsets.add(offsetAtt.startOffset());
+              endOffsets.add(offsetAtt.endOffset());
+              boolean endOfSentence = 0 != (flagsAtt.getFlags() & OpenNLPTokenizer.EOS_FLAG_BIT);
+              if (endOfSentence) {    // extract named entities one sentence at a time
+                extractEntitiesFromSentence(fullText, terms, startOffsets, endOffsets, entitiesWithType);
+              }
+            }
+            tokenStream.end();
+            if (!terms.isEmpty()) { // In case last token of last sentence isn't properly flagged with EOS_FLAG_BIT
+              extractEntitiesFromSentence(fullText, terms, startOffsets, endOffsets, entitiesWithType);
+            }
+          } finally {
+            nerTaggerOp.reset();      // Forget all adaptive data collected during this call, even on failure
+          }
+        }
+        return entitiesWithType;
+      }
+
+      /**
+       * Runs the NER tagger over the buffered sentence terms, appends one (text, type) pair per
+       * returned span -- the text being cut from fullText using the first and last token offsets
+       * of the span -- and then clears the three per-sentence buffers.
+       */
+      private void extractEntitiesFromSentence(String fullText, List<String> terms, List<Integer> startOffsets,
+                                               List<Integer> endOffsets, List<Pair<String,String>> entitiesWithType) {
+        for (Span span : nerTaggerOp.getNames(terms.toArray(new String[terms.size()]))) {
+          String text = fullText.substring(startOffsets.get(span.getStart()), endOffsets.get(span.getEnd() - 1));
+          entitiesWithType.add(new Pair<>(text, span.getType()));
+        }
+        terms.clear();
+        startOffsets.clear();
+        endOffsets.clear();
+      }
+    };
+  }
+
+  /** macro */
+  private static SelectorParams parseSelectorParams(NamedList args) {
+    return FieldMutatingUpdateProcessorFactory.parseSelectorParams(args);
+  }
+}

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/6c070b4a/solr/contrib/analysis-extras/src/main/java/org/apache/solr/update/processor/package.html
----------------------------------------------------------------------
diff --git a/solr/contrib/analysis-extras/src/main/java/org/apache/solr/update/processor/package.html b/solr/contrib/analysis-extras/src/main/java/org/apache/solr/update/processor/package.html
new file mode 100644
index 0000000..1388c29
--- /dev/null
+++ b/solr/contrib/analysis-extras/src/main/java/org/apache/solr/update/processor/package.html
@@ -0,0 +1,24 @@
+<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<!-- not a package-info.java, because we already defined this package in core/ -->
+<html>
+  <body>
+    Update request processor invoking OpenNLP Named Entity Recognition over configured
+    source field(s), populating configured target field(s) with the results.
+  </body>
+</html>

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/6c070b4a/solr/contrib/analysis-extras/src/main/java/overview.html
----------------------------------------------------------------------
diff --git a/solr/contrib/analysis-extras/src/main/java/overview.html b/solr/contrib/analysis-extras/src/main/java/overview.html
new file mode 100644
index 0000000..f3d70ca
--- /dev/null
+++ b/solr/contrib/analysis-extras/src/main/java/overview.html
@@ -0,0 +1,21 @@
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<html>
+<body>
+Apache Solr Search Server: Analysis Extras contrib
+</body>
+</html>

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/6c070b4a/solr/contrib/analysis-extras/src/test-files/analysis-extras/solr/collection1/conf/en-test-ner.bin
----------------------------------------------------------------------
diff --git a/solr/contrib/analysis-extras/src/test-files/analysis-extras/solr/collection1/conf/en-test-ner.bin b/solr/contrib/analysis-extras/src/test-files/analysis-extras/solr/collection1/conf/en-test-ner.bin
deleted file mode 100644
index b4d8cdc..0000000
Binary files a/solr/contrib/analysis-extras/src/test-files/analysis-extras/solr/collection1/conf/en-test-ner.bin and /dev/null differ

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/6c070b4a/solr/contrib/analysis-extras/src/test-files/analysis-extras/solr/collection1/conf/en-test-sent.bin
----------------------------------------------------------------------
diff --git a/solr/contrib/analysis-extras/src/test-files/analysis-extras/solr/collection1/conf/en-test-sent.bin b/solr/contrib/analysis-extras/src/test-files/analysis-extras/solr/collection1/conf/en-test-sent.bin
deleted file mode 100644
index 6e19e6b..0000000
Binary files a/solr/contrib/analysis-extras/src/test-files/analysis-extras/solr/collection1/conf/en-test-sent.bin and /dev/null differ

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/6c070b4a/solr/contrib/analysis-extras/src/test-files/analysis-extras/solr/collection1/conf/en-test-tokenizer.bin
----------------------------------------------------------------------
diff --git a/solr/contrib/analysis-extras/src/test-files/analysis-extras/solr/collection1/conf/en-test-tokenizer.bin b/solr/contrib/analysis-extras/src/test-files/analysis-extras/solr/collection1/conf/en-test-tokenizer.bin
deleted file mode 100644
index 796a744..0000000
Binary files a/solr/contrib/analysis-extras/src/test-files/analysis-extras/solr/collection1/conf/en-test-tokenizer.bin and /dev/null differ

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/6c070b4a/solr/contrib/analysis-extras/src/test-files/analysis-extras/solr/collection1/conf/schema-folding-extra.xml
----------------------------------------------------------------------
diff --git a/solr/contrib/analysis-extras/src/test-files/analysis-extras/solr/collection1/conf/schema-folding-extra.xml b/solr/contrib/analysis-extras/src/test-files/analysis-extras/solr/collection1/conf/schema-folding-extra.xml
deleted file mode 100644
index 573ca53..0000000
--- a/solr/contrib/analysis-extras/src/test-files/analysis-extras/solr/collection1/conf/schema-folding-extra.xml
+++ /dev/null
@@ -1,52 +0,0 @@
-<!--
-  Licensed to the Apache Software Foundation (ASF) under one or more
-  contributor license agreements.  See the NOTICE file distributed with
-  this work for additional information regarding copyright ownership.
-  The ASF licenses this file to You under the Apache License, Version 2.0
-  (the "License"); you may not use this file except in compliance with
-  the License.  You may obtain a copy of the License at
-
-      http://www.apache.org/licenses/LICENSE-2.0
-
-  Unless required by applicable law or agreed to in writing, software
-  distributed under the License is distributed on an "AS IS" BASIS,
-  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-  See the License for the specific language governing permissions and
-  limitations under the License.
-  -->
-
-<schema name="test" version="1.0">
-  <fieldType name="string" class="solr.StrField" sortMissingLast="true" multiValued="false"/>
-
-
-  <fieldType name="text_icufolding" class="solr.TextField">
-    <analyzer>
-      <tokenizer class="solr.WhitespaceTokenizerFactory"/>
-      <filter class="solr.ICUFoldingFilterFactory"/>
-    </analyzer>
-  </fieldType>
-
-  <fieldType name="text_icunormalizer2" class="solr.TextField">
-    <analyzer>
-      <tokenizer class="solr.WhitespaceTokenizerFactory"/>
-      <filter class="solr.ICUNormalizer2FilterFactory" name="nfkc_cf" mode="compose"/>
-    </analyzer>
-  </fieldType>
-
-  <fieldType name="text_icutransform" class="solr.TextField">
-    <analyzer>
-      <tokenizer class="solr.WhitespaceTokenizerFactory"/>
-      <filter class="solr.ICUTransformFilterFactory" id="Cyrillic-Latin"/>
-    </analyzer>
-  </fieldType>
-
-
-  <field name="id" type="string" indexed="true" stored="true" required="true"/>
-  <field name="content_icufolding" type="text_icufolding" indexed="true" stored="true"/>
-  <field name="content_icunormalizer2" type="text_icunormalizer2" indexed="true" stored="true"/>
-  <field name="content_icutransform" type="text_icutransform" indexed="true" stored="true"/>
-
-
-  <uniqueKey>id</uniqueKey>
-
-</schema>

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/6c070b4a/solr/contrib/analysis-extras/src/test-files/analysis-extras/solr/collection1/conf/schema-icucollate-dv.xml
----------------------------------------------------------------------
diff --git a/solr/contrib/analysis-extras/src/test-files/analysis-extras/solr/collection1/conf/schema-icucollate-dv.xml b/solr/contrib/analysis-extras/src/test-files/analysis-extras/solr/collection1/conf/schema-icucollate-dv.xml
deleted file mode 100644
index 63f7330..0000000
--- a/solr/contrib/analysis-extras/src/test-files/analysis-extras/solr/collection1/conf/schema-icucollate-dv.xml
+++ /dev/null
@@ -1,57 +0,0 @@
-<?xml version="1.0" ?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements.  See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License.  You may obtain a copy of the License at
-
-     http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-
-<!-- Test schema file for CollationField (docvalues) -->
-
-<schema name="test" version="1.0">
-
-  <fieldType name="string" class="solr.StrField" omitNorms="true" positionIncrementGap="0"/>
-
-  <!-- basic text field -->
-  <fieldType name="text" class="solr.TextField">
-    <analyzer>
-      <tokenizer class="solr.StandardTokenizerFactory"/>
-      <filter class="solr.LowerCaseFilterFactory"/>
-    </analyzer>
-  </fieldType>
-
-  <fieldType name="sort_ar_t" class="solr.ICUCollationField" locale="ar"/>
-  <fieldType name="sort_de_t" class="solr.ICUCollationField" locale="de" strength="primary"/>
-  <fieldType name="sort_tr_canon_t" class="solr.ICUCollationField" locale="tr" strength="primary"
-             decomposition="canonical"/>
-  <fieldType name="sort_da_t" class="solr.ICUCollationField" locale="da" strength="primary"/>
-  <fieldType name="sort_custom_t" class="solr.ICUCollationField" custom="customrules.dat" strength="primary"/>
-
-  <field name="id" type="string" indexed="true" stored="true" multiValued="false" required="false"/>
-  <field name="text" type="text" indexed="true" stored="false"/>
-  <field name="sort_ar" type="sort_ar_t" indexed="false" stored="false" multiValued="false" docValues="true"/>
-  <field name="sort_de" type="sort_de_t" indexed="false" stored="false" multiValued="false" docValues="true"/>
-  <field name="sort_tr_canon" type="sort_tr_canon_t" indexed="false" stored="false" multiValued="true"
-         docValues="true"/>
-  <field name="sort_da" type="sort_da_t" indexed="false" stored="false" multiValued="false" docValues="true"/>
-  <field name="sort_custom" type="sort_custom_t" indexed="false" stored="false" multiValued="true" docValues="true"/>
-
-  <uniqueKey>id</uniqueKey>
-
-  <!-- copy our text to some sort fields with different orders -->
-  <copyField source="text" dest="sort_ar"/>
-  <copyField source="text" dest="sort_de"/>
-  <copyField source="text" dest="sort_tr_canon"/>
-  <copyField source="text" dest="sort_da"/>
-  <copyField source="text" dest="sort_custom"/>
-</schema>

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/6c070b4a/solr/contrib/analysis-extras/src/test-files/analysis-extras/solr/collection1/conf/schema-icucollate.xml
----------------------------------------------------------------------
diff --git a/solr/contrib/analysis-extras/src/test-files/analysis-extras/solr/collection1/conf/schema-icucollate.xml b/solr/contrib/analysis-extras/src/test-files/analysis-extras/solr/collection1/conf/schema-icucollate.xml
deleted file mode 100644
index 9698013..0000000
--- a/solr/contrib/analysis-extras/src/test-files/analysis-extras/solr/collection1/conf/schema-icucollate.xml
+++ /dev/null
@@ -1,57 +0,0 @@
-<?xml version="1.0" ?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements.  See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License.  You may obtain a copy of the License at
-
-     http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-
-<!-- Test schema file for CollationField -->
-
-<schema name="test" version="1.0">
-
-  <fieldType name="string" class="solr.StrField" omitNorms="true" positionIncrementGap="0"/>
-
-  <!-- basic text field -->
-  <fieldType name="text" class="solr.TextField">
-    <analyzer>
-      <tokenizer class="solr.StandardTokenizerFactory"/>
-      <filter class="solr.LowerCaseFilterFactory"/>
-    </analyzer>
-  </fieldType>
-
-  <fieldType name="sort_ar_t" class="solr.ICUCollationField" locale="ar"/>
-  <fieldType name="sort_de_t" class="solr.ICUCollationField" locale="de" strength="primary"/>
-  <fieldType name="sort_tr_canon_t" class="solr.ICUCollationField" locale="tr" strength="primary"
-             decomposition="canonical"/>
-  <fieldType name="sort_da_t" class="solr.ICUCollationField" locale="da" strength="primary"/>
-  <fieldType name="sort_custom_t" class="solr.ICUCollationField" custom="customrules.dat" strength="primary"/>
-
-  <field name="id" type="string" indexed="true" stored="true" multiValued="false" required="false"/>
-  <field name="text" type="text" indexed="true" stored="false"/>
-  <field name="sort_ar" type="sort_ar_t" indexed="true" stored="false" multiValued="false"/>
-  <field name="sort_de" type="sort_de_t" indexed="true" stored="false" multiValued="false"/>
-  <field name="sort_tr_canon" type="sort_tr_canon_t" indexed="true" stored="false" multiValued="false"/>
-  <field name="sort_da" type="sort_da_t" indexed="true" stored="false" multiValued="false"/>
-  <field name="sort_custom" type="sort_custom_t" indexed="true" stored="false" multiValued="false"/>
-
-
-  <uniqueKey>id</uniqueKey>
-
-  <!-- copy our text to some sort fields with different orders -->
-  <copyField source="text" dest="sort_ar"/>
-  <copyField source="text" dest="sort_de"/>
-  <copyField source="text" dest="sort_tr_canon"/>
-  <copyField source="text" dest="sort_da"/>
-  <copyField source="text" dest="sort_custom"/>
-</schema>

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/6c070b4a/solr/contrib/analysis-extras/src/test-files/analysis-extras/solr/collection1/conf/schema-icucollateoptions.xml
----------------------------------------------------------------------
diff --git a/solr/contrib/analysis-extras/src/test-files/analysis-extras/solr/collection1/conf/schema-icucollateoptions.xml b/solr/contrib/analysis-extras/src/test-files/analysis-extras/solr/collection1/conf/schema-icucollateoptions.xml
deleted file mode 100644
index 59b8d25..0000000
--- a/solr/contrib/analysis-extras/src/test-files/analysis-extras/solr/collection1/conf/schema-icucollateoptions.xml
+++ /dev/null
@@ -1,68 +0,0 @@
-<?xml version="1.0" ?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements.  See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License.  You may obtain a copy of the License at
-
-     http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-
-<!-- Test schema file for CollationField options -->
-
-<schema name="test" version="1.0">
-
-  <fieldType name="string" class="solr.StrField" omitNorms="true" positionIncrementGap="0"/>
-
-  <!-- basic text field -->
-  <fieldType name="text" class="solr.TextField">
-    <analyzer>
-      <tokenizer class="solr.StandardTokenizerFactory"/>
-      <filter class="solr.LowerCaseFilterFactory"/>
-    </analyzer>
-  </fieldType>
-
-  <!-- ignores punctuation and whitespace -->
-  <fieldType name="sort_ignore_punctuation_t" class="solr.ICUCollationField"
-             locale="en" strength="primary" alternate="shifted"/>
-  <!-- ignores only whitespace -->
-  <fieldType name="sort_ignore_space_t" class="solr.ICUCollationField"
-             locale="en" strength="primary" alternate="shifted" variableTop=" "/>
-  <!-- ignores only accents, but not case -->
-  <fieldType name="sort_ignore_accents_t" class="solr.ICUCollationField"
-             locale="en" strength="primary" caseLevel="true"/>
-  <!-- sorts numerics in numeric order -->
-  <fieldType name="sort_numerics_t" class="solr.ICUCollationField"
-             locale="en" numeric="true"/>
-  <!-- sorts uppercase before lowercase -->
-  <fieldType name="sort_uppercase_first_t" class="solr.ICUCollationField"
-             locale="en" strength="tertiary" caseFirst="upper"/>
-
-
-  <field name="id" type="string" indexed="true" stored="true" multiValued="false" required="false"/>
-  <field name="text" type="text" indexed="true" stored="false"/>
-  <field name="sort_ignore_punctuation" type="sort_ignore_punctuation_t" indexed="true" stored="false"
-         multiValued="false"/>
-  <field name="sort_ignore_space" type="sort_ignore_space_t" indexed="true" stored="false" multiValued="false"/>
-  <field name="sort_ignore_accents" type="sort_ignore_accents_t" indexed="true" stored="false" multiValued="false"/>
-  <field name="sort_numerics" type="sort_numerics_t" indexed="true" stored="false" multiValued="false"/>
-  <field name="sort_uppercase_first" type="sort_uppercase_first_t" indexed="true" stored="false" multiValued="false"/>
-
-
-  <uniqueKey>id</uniqueKey>
-
-  <!-- copy our text to some sort fields with different orders -->
-  <copyField source="text" dest="sort_ignore_punctuation"/>
-  <copyField source="text" dest="sort_ignore_space"/>
-  <copyField source="text" dest="sort_ignore_accents"/>
-  <copyField source="text" dest="sort_numerics"/>
-  <copyField source="text" dest="sort_uppercase_first"/>
-</schema>

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/6c070b4a/solr/contrib/analysis-extras/src/test-files/analysis-extras/solr/collection1/conf/schema-opennlp-extract.xml
----------------------------------------------------------------------
diff --git a/solr/contrib/analysis-extras/src/test-files/analysis-extras/solr/collection1/conf/schema-opennlp-extract.xml b/solr/contrib/analysis-extras/src/test-files/analysis-extras/solr/collection1/conf/schema-opennlp-extract.xml
deleted file mode 100644
index fc13431..0000000
--- a/solr/contrib/analysis-extras/src/test-files/analysis-extras/solr/collection1/conf/schema-opennlp-extract.xml
+++ /dev/null
@@ -1,49 +0,0 @@
-<?xml version="1.0" ?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements.  See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License.  You may obtain a copy of the License at
-
-     http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-
-<schema name="test-opennlp-extract" version="1.6">
-  <fieldType name="opennlp-en-tokenization" class="solr.TextField">
-    <analyzer>
-      <tokenizer class="solr.OpenNLPTokenizerFactory"
-                 sentenceModel="en-test-sent.bin"
-                 tokenizerModel="en-test-tokenizer.bin"/>
-    </analyzer>
-  </fieldType>
-
-  <fieldType name="string" class="solr.StrField" sortMissingLast="true"/>
-
-  <fieldType name="text" class="solr.TextField" positionIncrementGap="100" autoGeneratePhraseQueries="true">
-    <analyzer>
-      <tokenizer class="solr.MockTokenizerFactory"/>
-      <filter class="solr.LowerCaseFilterFactory"/>
-      <filter class="solr.PorterStemFilterFactory"/>
-    </analyzer>
-  </fieldType>
-
-  <field name="id" type="string" indexed="true" stored="true" multiValued="false" required="true"/>
-  <field name="text" type="text" indexed="true" stored="false"/>
-  <field name="subject" type="text" indexed="true" stored="true"/>
-  <field name="title" type="text" indexed="true" stored="true"/>
-  <field name="subtitle" type="text" indexed="true" stored="true"/>
-  <field name="descs" type="text" indexed="true" stored="true"/>
-  <field name="descriptions" type="text" indexed="true" stored="true"/>
-
-  <dynamicField name="*_txt" type="text" indexed="true" stored="true"/>
-  <dynamicField name="*_s" type="string" indexed="true" stored="true" multiValued="true"/>
-  <dynamicField name="*_people" type="string" indexed="true" stored="true" multiValued="true"/>
-</schema>

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/6c070b4a/solr/contrib/analysis-extras/src/test-files/analysis-extras/solr/collection1/conf/solrconfig-icucollate.xml
----------------------------------------------------------------------
diff --git a/solr/contrib/analysis-extras/src/test-files/analysis-extras/solr/collection1/conf/solrconfig-icucollate.xml b/solr/contrib/analysis-extras/src/test-files/analysis-extras/solr/collection1/conf/solrconfig-icucollate.xml
deleted file mode 100644
index 90c52d7..0000000
--- a/solr/contrib/analysis-extras/src/test-files/analysis-extras/solr/collection1/conf/solrconfig-icucollate.xml
+++ /dev/null
@@ -1,27 +0,0 @@
-<?xml version="1.0" ?>
-
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements.  See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License.  You may obtain a copy of the License at
-
-     http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-
-<config>
-  <luceneMatchVersion>${tests.luceneMatchVersion:LATEST}</luceneMatchVersion>
-  <indexConfig>
-    <useCompoundFile>${useCompoundFile:false}</useCompoundFile>
-  </indexConfig>
-  <requestHandler name="/select" class="solr.SearchHandler"></requestHandler>
-  <directoryFactory name="DirectoryFactory" class="${solr.directoryFactory:solr.RAMDirectoryFactory}"/>
-</config>

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/6c070b4a/solr/contrib/analysis-extras/src/test-files/analysis-extras/solr/collection1/conf/solrconfig-opennlp-extract.xml
----------------------------------------------------------------------
diff --git a/solr/contrib/analysis-extras/src/test-files/analysis-extras/solr/collection1/conf/solrconfig-opennlp-extract.xml b/solr/contrib/analysis-extras/src/test-files/analysis-extras/solr/collection1/conf/solrconfig-opennlp-extract.xml
deleted file mode 100644
index 7fd793e..0000000
--- a/solr/contrib/analysis-extras/src/test-files/analysis-extras/solr/collection1/conf/solrconfig-opennlp-extract.xml
+++ /dev/null
@@ -1,206 +0,0 @@
-<?xml version="1.0" ?>
-
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements.  See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License.  You may obtain a copy of the License at
-
-     http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-
-<config>
-  <luceneMatchVersion>${tests.luceneMatchVersion:LATEST}</luceneMatchVersion>
-  <xi:include href="solrconfig.snippet.randomindexconfig.xml" xmlns:xi="http://www.w3.org/2001/XInclude"/>
-  <requestHandler name="/select" class="solr.SearchHandler"></requestHandler>
-  <requestHandler name="/update" class="solr.UpdateRequestHandler"  />
-  <directoryFactory name="DirectoryFactory" class="${solr.directoryFactory:solr.RAMDirectoryFactory}"/>
-  <schemaFactory class="ClassicIndexSchemaFactory"/>
-
-  <updateRequestProcessorChain name="extract-single">
-    <processor class="solr.OpenNLPExtractNamedEntitiesUpdateProcessorFactory">
-      <str name="modelFile">en-test-ner.bin</str>
-      <str name="analyzerFieldType">opennlp-en-tokenization</str>
-      <str name="source">source1_s</str>
-      <str name="dest">dest_s</str>
-    </processor>
-  </updateRequestProcessorChain>
-
-  <updateRequestProcessorChain name="extract-single-regex">
-    <processor class="solr.OpenNLPExtractNamedEntitiesUpdateProcessorFactory">
-      <str name="modelFile">en-test-ner.bin</str>
-      <str name="analyzerFieldType">opennlp-en-tokenization</str>
-      <str name="source">source1_s</str>
-      <lst name="dest">
-        <str name="pattern">source\d(_s)</str>
-        <str name="replacement">dest$1</str>
-      </lst>
-    </processor>
-  </updateRequestProcessorChain>
-
-  <updateRequestProcessorChain name="extract-multi">
-    <processor class="solr.OpenNLPExtractNamedEntitiesUpdateProcessorFactory">
-      <str name="modelFile">en-test-ner.bin</str>
-      <str name="analyzerFieldType">opennlp-en-tokenization</str>
-      <str name="source">source1_s</str>
-      <str name="source">source2_s</str>
-      <str name="dest">dest_s</str>
-    </processor>
-  </updateRequestProcessorChain>
-
-  <updateRequestProcessorChain name="extract-multi-regex">
-    <processor class="solr.OpenNLPExtractNamedEntitiesUpdateProcessorFactory">
-      <str name="modelFile">en-test-ner.bin</str>
-      <str name="analyzerFieldType">opennlp-en-tokenization</str>
-      <str name="source">source1_s</str>
-      <str name="source">source2_s</str>
-      <lst name="dest">
-        <str name="pattern">source\d(_s)</str>
-        <str name="replacement">dest$1</str>
-      </lst>
-    </processor>
-  </updateRequestProcessorChain>
-
-  <updateRequestProcessorChain name="extract-array">
-    <processor class="solr.OpenNLPExtractNamedEntitiesUpdateProcessorFactory">
-      <str name="modelFile">en-test-ner.bin</str>
-      <str name="analyzerFieldType">opennlp-en-tokenization</str>
-      <arr name="source">
-        <str>source1_s</str>
-        <str>source2_s</str>
-      </arr>
-      <str name="dest">dest_s</str>
-    </processor>
-  </updateRequestProcessorChain>
-
-  <updateRequestProcessorChain name="extract-array-regex">
-    <processor class="solr.OpenNLPExtractNamedEntitiesUpdateProcessorFactory">
-      <str name="modelFile">en-test-ner.bin</str>
-      <str name="analyzerFieldType">opennlp-en-tokenization</str>
-      <arr name="source">
-        <str>source1_s</str>
-        <str>source2_s</str>
-      </arr>
-      <lst name="dest">
-        <str name="pattern">source\d(_s)</str>
-        <str name="replacement">dest$1</str>
-      </lst>
-    </processor>
-  </updateRequestProcessorChain>
-
-  <updateRequestProcessorChain name="extract-selector">
-    <processor class="solr.OpenNLPExtractNamedEntitiesUpdateProcessorFactory">
-      <str name="modelFile">en-test-ner.bin</str>
-      <str name="analyzerFieldType">opennlp-en-tokenization</str>
-      <lst name="source">
-        <str name="fieldRegex">source\d_.*</str>
-        <lst name="exclude">
-          <str name="fieldRegex">source0_.*</str>
-        </lst>
-      </lst>
-      <str name="dest">dest_s</str>
-    </processor>
-  </updateRequestProcessorChain>
-
-  <updateRequestProcessorChain name="extract-selector-regex">
-    <processor class="solr.OpenNLPExtractNamedEntitiesUpdateProcessorFactory">
-      <str name="modelFile">en-test-ner.bin</str>
-      <str name="analyzerFieldType">opennlp-en-tokenization</str>
-      <lst name="source">
-        <str name="fieldRegex">source\d_.*</str>
-        <lst name="exclude">
-          <str name="fieldRegex">source0_.*</str>
-        </lst>
-      </lst>
-      <lst name="dest">
-        <str name="pattern">source\d(_s)</str>
-        <str name="replacement">dest$1</str>
-      </lst>
-    </processor>
-  </updateRequestProcessorChain>
-
-  <updateRequestProcessorChain name="extract-regex-replaceall">
-    <processor class="solr.OpenNLPExtractNamedEntitiesUpdateProcessorFactory">
-      <str name="modelFile">en-test-ner.bin</str>
-      <str name="analyzerFieldType">opennlp-en-tokenization</str>
-      <lst name="source">
-        <str name="fieldRegex">foo.*</str>
-      </lst>
-      <lst name="dest">
-        <!-- unbounded pattern that can be replaced multiple times in field name -->
-        <str name="pattern">x(\d)</str>
-        <str name="replacement">y$1</str>
-      </lst>
-    </processor>
-  </updateRequestProcessorChain>
-
-  <updateRequestProcessorChain name="extract-regex-replaceall-with-entity-type">
-    <processor class="solr.OpenNLPExtractNamedEntitiesUpdateProcessorFactory">
-      <str name="modelFile">en-test-ner.bin</str>
-      <str name="analyzerFieldType">opennlp-en-tokenization</str>
-      <lst name="source">
-        <str name="fieldRegex">foo.*</str>
-      </lst>
-      <lst name="dest">
-        <!-- unbounded pattern that can be replaced multiple times in field name -->
-        <str name="pattern">x(\d)</str>
-        <str name="replacement">{EntityType}_y$1</str>
-      </lst>
-    </processor>
-  </updateRequestProcessorChain>
-
-  <!-- example used in OpenNLPExtractNamedEntitiesUpdateProcessorFactory javadocs -->
-  <updateRequestProcessorChain name="multiple-extract">
-    <processor class="solr.OpenNLPExtractNamedEntitiesUpdateProcessorFactory">
-      <str name="modelFile">en-test-ner.bin</str>
-      <str name="analyzerFieldType">opennlp-en-tokenization</str>
-      <str name="source">text</str>
-      <str name="dest">people_s</str>
-    </processor>
-    <processor class="solr.OpenNLPExtractNamedEntitiesUpdateProcessorFactory">
-      <str name="modelFile">en-test-ner.bin</str>
-      <str name="analyzerFieldType">opennlp-en-tokenization</str>
-      <arr name="source">
-        <str>title</str>
-        <str>subtitle</str>
-      </arr>
-      <str name="dest">titular_people</str>
-    </processor>
-    <processor class="solr.OpenNLPExtractNamedEntitiesUpdateProcessorFactory">
-      <str name="modelFile">en-test-ner.bin</str>
-      <str name="analyzerFieldType">opennlp-en-tokenization</str>
-      <lst name="source">
-        <str name="fieldRegex">.*_txt$</str>
-        <lst name="exclude">
-          <str name="fieldName">notes_txt</str>
-        </lst>
-      </lst>
-      <str name="dest">people_s</str>
-    </processor>
-    <processor class="solr.processor.OpenNLPExtractNamedEntitiesUpdateProcessorFactory">
-      <str name="modelFile">en-test-ner.bin</str>
-      <str name="analyzerFieldType">opennlp-en-tokenization</str>
-      <lst name="source">
-        <str name="fieldRegex">^desc(.*)s$</str>
-      </lst>
-      <lst name="dest">
-        <str name="pattern">^desc(.*)s$</str>
-        <str name="replacement">key_desc$1_people</str>
-      </lst>
-    </processor>
-    <processor class="solr.OpenNLPExtractNamedEntitiesUpdateProcessorFactory">
-      <str name="modelFile">en-test-ner.bin</str>
-      <str name="analyzerFieldType">opennlp-en-tokenization</str>
-      <str name="source">summary</str>
-      <str name="dest">summary_{EntityType}_s</str>
-    </processor>
-  </updateRequestProcessorChain>
-</config>

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/6c070b4a/solr/contrib/analysis-extras/src/test-files/analysis-extras/solr/collection1/conf/solrconfig.snippet.randomindexconfig.xml
----------------------------------------------------------------------
diff --git a/solr/contrib/analysis-extras/src/test-files/analysis-extras/solr/collection1/conf/solrconfig.snippet.randomindexconfig.xml b/solr/contrib/analysis-extras/src/test-files/analysis-extras/solr/collection1/conf/solrconfig.snippet.randomindexconfig.xml
deleted file mode 100644
index 23516b0..0000000
--- a/solr/contrib/analysis-extras/src/test-files/analysis-extras/solr/collection1/conf/solrconfig.snippet.randomindexconfig.xml
+++ /dev/null
@@ -1,48 +0,0 @@
-<?xml version="1.0" ?>
-
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements.  See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License.  You may obtain a copy of the License at
-
-     http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-
-<!--
-A solrconfig.xml snippet containing indexConfig settings for randomized testing.
--->
-<indexConfig>
-  <!-- this sys property is not set by SolrTestCaseJ4 because we ideally want to use
-       the RandomMergePolicy in all tests - but some tests expect very specific
-       Merge behavior, so those tests can set it as needed.
-  -->
-  <mergePolicyFactory class="${solr.tests.mergePolicyFactory:org.apache.solr.util.RandomMergePolicyFactory}" />
-
-  <useCompoundFile>${useCompoundFile:false}</useCompoundFile>
-
-  <maxBufferedDocs>${solr.tests.maxBufferedDocs}</maxBufferedDocs>
-  <ramBufferSizeMB>${solr.tests.ramBufferSizeMB}</ramBufferSizeMB>
-
-  <mergeScheduler class="${solr.tests.mergeScheduler}" />
-
-  <writeLockTimeout>1000</writeLockTimeout>
-  <commitLockTimeout>10000</commitLockTimeout>
-
-  <!-- this sys property is not set by SolrTestCaseJ4 because almost all tests should
-       use the single process lockType for speed - but tests that explicitly need
-       to vary the lockType can set it as needed.
-  -->
-  <lockType>${solr.tests.lockType:single}</lockType>
-
-  <infoStream>${solr.tests.infostream:false}</infoStream>
-
-</indexConfig>

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/6c070b4a/solr/contrib/analysis-extras/src/test/java/org/apache/solr/analysis/TestFoldingMultitermExtrasQuery.java
----------------------------------------------------------------------
diff --git a/solr/contrib/analysis-extras/src/test/java/org/apache/solr/analysis/TestFoldingMultitermExtrasQuery.java b/solr/contrib/analysis-extras/src/test/java/org/apache/solr/analysis/TestFoldingMultitermExtrasQuery.java
new file mode 100644
index 0000000..b2cdbc2
--- /dev/null
+++ b/solr/contrib/analysis-extras/src/test/java/org/apache/solr/analysis/TestFoldingMultitermExtrasQuery.java
@@ -0,0 +1,87 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.solr.analysis;
+
+import java.io.File;
+
+import org.apache.commons.io.FileUtils;
+import org.apache.solr.SolrTestCaseJ4;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+// See: https://issues.apache.org/jira/browse/SOLR-12028 Tests cannot remove files on Windows machines occasionally
+/**
+ * Indexes a handful of documents into ICU-analyzed fields and checks that wildcard/prefix
+ * queries against those fields still match even though the query text differs from the
+ * indexed text in case, accents, or composed/decomposed form.
+ */
+public class TestFoldingMultitermExtrasQuery extends SolrTestCaseJ4 {
+
+  /** Returns the fixed core name {@code "basic"}. */
+  public String getCoreName() {
+    return "basic";
+  }
+
+  /**
+   * Copies the {@code analysis-extras/solr} test resources into a fresh temp directory,
+   * initializes a core from it, and indexes the fixture documents with sequential ids.
+   */
+  @BeforeClass
+  public static void beforeTests() throws Exception {
+    File testHome = createTempDir().toFile();
+    FileUtils.copyDirectory(getFile("analysis-extras/solr"), testHome);
+    initCore("solrconfig-icucollate.xml","schema-folding-extra.xml", testHome.getAbsolutePath());
+
+    int idx = 1;
+    // ICUFoldingFilterFactory
+    assertU(adoc("id", Integer.toString(idx++), "content_icufolding", "BadMagicICUFolding"));
+    assertU(adoc("id", Integer.toString(idx++), "content_icufolding", "Ruß"));
+    assertU(adoc("id", Integer.toString(idx++), "content_icufolding", "ΜΆΪΟΣ"));
+    assertU(adoc("id", Integer.toString(idx++), "content_icufolding", "Μάϊος"));
+    assertU(adoc("id", Integer.toString(idx++), "content_icufolding", "résumé"));
+    assertU(adoc("id", Integer.toString(idx++), "content_icufolding", "re\u0301sume\u0301"));
+    assertU(adoc("id", Integer.toString(idx++), "content_icufolding", "ELİF"));
+    assertU(adoc("id", Integer.toString(idx++), "content_icufolding", "eli\u0307f"));
+
+    // ICUNormalizer2FilterFactory
+
+    assertU(adoc("id", Integer.toString(idx++), "content_icunormalizer2", "BadMagicICUFolding"));
+    assertU(adoc("id", Integer.toString(idx++), "content_icunormalizer2", "Ruß"));
+    assertU(adoc("id", Integer.toString(idx++), "content_icunormalizer2", "ΜΆΪΟΣ"));
+    assertU(adoc("id", Integer.toString(idx++), "content_icunormalizer2", "Μάϊος"));
+    assertU(adoc("id", Integer.toString(idx++), "content_icunormalizer2", "résumé"));
+    assertU(adoc("id", Integer.toString(idx++), "content_icunormalizer2", "re\u0301sume\u0301"));
+    assertU(adoc("id", Integer.toString(idx++), "content_icunormalizer2", "ELİF"));
+    assertU(adoc("id", Integer.toString(idx++), "content_icunormalizer2", "eli\u0307f"));
+
+    // ICUTransformFilterFactory
+    assertU(adoc("id", Integer.toString(idx++), "content_icutransform", "Российская"));
+
+    assertU(commit());
+  }
+
+  /**
+   * Wildcard queries on the {@code content_icufolding} field: the query prefixes differ from
+   * the indexed values in case and in composed vs. decomposed accents, yet must still match.
+   */
+  @Test
+  public void testICUFolding() {
+    assertQ(req("q", "content_icufolding:BadMagicicuFold*"), "//result[@numFound='1']");
+    assertQ(req("q", "content_icufolding:rU*"), "//result[@numFound='1']");
+    assertQ(req("q", "content_icufolding:Re*Me"), "//result[@numFound='2']");
+    assertQ(req("q", "content_icufolding:RE\u0301su*"), "//result[@numFound='2']");
+    assertQ(req("q", "content_icufolding:El*"), "//result[@numFound='2']");
+  }
+
+  /**
+   * Wildcard queries on the {@code content_icunormalizer2} field, using the same kind of
+   * case/composition variations as {@link #testICUFolding()}.
+   */
+  @Test
+  public void testICUNormalizer2() {
+    assertQ(req("q", "content_icunormalizer2:BadMagicicuFold*"), "//result[@numFound='1']");
+    assertQ(req("q", "content_icunormalizer2:RU*"), "//result[@numFound='1']");
+    assertQ(req("q", "content_icunormalizer2:Μάϊ*"), "//result[@numFound='2']");
+    assertQ(req("q", "content_icunormalizer2:re\u0301Su*"), "//result[@numFound='2']");
+    assertQ(req("q", "content_icunormalizer2:eL*"), "//result[@numFound='2']");
+  }
+
+  /**
+   * Prefix query in Cyrillic on the {@code content_icutransform} field must find the single
+   * indexed document.
+   */
+  @Test
+  public void testICUTransform() {
+    assertQ(req("q", "content_icutransform:Росс*"), "//result[@numFound='1']");
+  }
+}

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/6c070b4a/solr/contrib/analysis-extras/src/test/java/org/apache/solr/schema/TestICUCollationField.java
----------------------------------------------------------------------
diff --git a/solr/contrib/analysis-extras/src/test/java/org/apache/solr/schema/TestICUCollationField.java b/solr/contrib/analysis-extras/src/test/java/org/apache/solr/schema/TestICUCollationField.java
new file mode 100644
index 0000000..f164080
--- /dev/null
+++ b/solr/contrib/analysis-extras/src/test/java/org/apache/solr/schema/TestICUCollationField.java
@@ -0,0 +1,192 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.solr.schema;
+
+import java.io.File;
+import java.io.FileOutputStream;
+
+import org.apache.commons.io.FileUtils;
+import org.apache.commons.io.IOUtils;
+import org.apache.lucene.analysis.util.FilesystemResourceLoader;
+import org.apache.lucene.analysis.util.ResourceLoader;
+import org.apache.lucene.analysis.util.StringMockResourceLoader;
+import org.apache.solr.SolrTestCaseJ4;
+import org.junit.BeforeClass;
+
+import com.ibm.icu.text.Collator;
+import com.ibm.icu.text.RuleBasedCollator;
+import com.ibm.icu.util.ULocale;
+
+/**
+ * Tests {@link ICUCollationField} with TermQueries, RangeQueries, and sort order.
+ */
+public class TestICUCollationField extends SolrTestCaseJ4 {
+
+  /**
+   * Builds the Solr home via {@link #setupSolrHome()}, initializes the core from it,
+   * and indexes the twelve fixture documents used by every test in this class.
+   */
+  @BeforeClass
+  public static void beforeClass() throws Exception {
+    String home = setupSolrHome();
+    initCore("solrconfig.xml","schema.xml", home);
+    // add some docs
+    assertU(adoc("id", "1", "text", "\u0633\u0627\u0628"));
+    assertU(adoc("id", "2", "text", "I WİLL USE TURKİSH CASING"));
+    assertU(adoc("id", "3", "text", "ı will use turkish casıng"));
+    assertU(adoc("id", "4", "text", "Töne"));
+    assertU(adoc("id", "5", "text", "I W\u0049\u0307LL USE TURKİSH CASING"));
+    assertU(adoc("id", "6", "text", "Testing"));
+    assertU(adoc("id", "7", "text", "Tone"));
+    assertU(adoc("id", "8", "text", "Testing"));
+    assertU(adoc("id", "9", "text", "testing"));
+    assertU(adoc("id", "10", "text", "toene"));
+    assertU(adoc("id", "11", "text", "Tzne"));
+    assertU(adoc("id", "12", "text", "\u0698\u0698"));
+    assertU(commit());
+  }
+
+  /**
+   * Ugly: but what to do? We want to test custom sort, which reads rules in as a resource.
+   * These are largish files, and jvm-specific (as our documentation says, you should always
+   * look out for jvm differences with collation).
+   * So it's preferable to create this file on-the-fly.
+   *
+   * <p>Creates {@code collection1/data} and {@code collection1/conf} under a fresh temp
+   * directory, copies the ICU collation solrconfig/schema into the conf dir, writes the
+   * tailored rules to {@code customrules.dat}, and verifies the rules can be read back
+   * into an equal collator.
+   *
+   * @return absolute path of the generated Solr home directory
+   * @throws Exception if the directory cannot be populated or the collator cannot be built
+   */
+  public static String setupSolrHome() throws Exception {
+    String tmpFile = createTempDir().toFile().getAbsolutePath();
+    // make data and conf dirs
+    new File(tmpFile  + "/collection1", "data").mkdirs();
+    File confDir = new File(tmpFile + "/collection1", "conf");
+    confDir.mkdirs();
+
+    // copy over configuration files
+    FileUtils.copyFile(getFile("analysis-extras/solr/collection1/conf/solrconfig-icucollate.xml"), new File(confDir, "solrconfig.xml"));
+    FileUtils.copyFile(getFile("analysis-extras/solr/collection1/conf/schema-icucollate.xml"), new File(confDir, "schema.xml"));
+
+    // generate custom collation rules (DIN 5007-2), saving to customrules.dat
+    RuleBasedCollator baseCollator = (RuleBasedCollator) Collator.getInstance(new ULocale("de", "DE"));
+
+    // Each umlaut is tailored next to its two-letter expansion, in lower and upper case.
+    // The UE rule previously repeated the lowercase u + U+0308 instead of the uppercase form.
+    String DIN5007_2_tailorings =
+      "& ae , a\u0308 & AE , A\u0308"+
+      "& oe , o\u0308 & OE , O\u0308"+
+      "& ue , u\u0308 & UE , U\u0308";
+
+    RuleBasedCollator tailoredCollator = new RuleBasedCollator(baseCollator.getRules() + DIN5007_2_tailorings);
+    String tailoredRules = tailoredCollator.getRules();
+    final String osFileName = "customrules.dat";
+    // try-with-resources: the stream is closed even when the write throws
+    try (FileOutputStream os = new FileOutputStream(new File(confDir, osFileName))) {
+      IOUtils.write(tailoredRules, os, "UTF-8");
+    }
+
+    // randomly read the rules back either from memory or from the file just written
+    final ResourceLoader loader;
+    if (random().nextBoolean()) {
+      loader = new StringMockResourceLoader(tailoredRules);
+    } else {
+      loader = new FilesystemResourceLoader(confDir.toPath());
+    }
+    final Collator readCollator = ICUCollationField.createFromRules(osFileName, loader);
+    assertEquals(tailoredCollator, readCollator);
+
+    return tmpFile;
+  }
+
+  /**
+   * Test termquery with German DIN 5007-1 primary strength.
+   * In this case, ö is equivalent to o (but not oe)
+   */
+  public void testBasicTermQuery() {
+    assertQ("Collated TQ: ",
+       req("fl", "id", "q", "sort_de:tone", "sort", "id asc" ),
+              "//*[@numFound='2']",
+              "//result/doc[1]/str[@name='id'][.=4]",
+              "//result/doc[2]/str[@name='id'][.=7]"
+    );
+  }
+
+  /**
+   * Test rangequery again with the DIN 5007-1 collator.
+   * We do a range query of tone .. tp, in binary order this
+   * would retrieve nothing due to case and accent differences.
+   */
+  public void testBasicRangeQuery() {
+    assertQ("Collated RangeQ: ",
+        req("fl", "id", "q", "sort_de:[tone TO tp]", "sort", "id asc" ),
+               "//*[@numFound='2']",
+               "//result/doc[1]/str[@name='id'][.=4]",
+               "//result/doc[2]/str[@name='id'][.=7]"
+     );
+  }
+
+  /**
+   * Test sort with a Danish collator. ö is ordered after z
+   */
+  public void testBasicSort() {
+    assertQ("Collated Sort: ",
+        req("fl", "id", "q", "sort_da:[tz TO töz]", "sort", "sort_da asc" ),
+               "//*[@numFound='2']",
+               "//result/doc[1]/str[@name='id'][.=11]",
+               "//result/doc[2]/str[@name='id'][.=4]"
+     );
+  }
+
+  /**
+   * Test sort with an Arabic collator. U+0633 is ordered after U+0698.
+   * With a binary collator, the range would also return nothing.
+   */
+  public void testArabicSort() {
+    assertQ("Collated Sort: ",
+        req("fl", "id", "q", "sort_ar:[\u0698 TO \u0633\u0633]", "sort", "sort_ar asc" ),
+               "//*[@numFound='2']",
+               "//result/doc[1]/str[@name='id'][.=12]",
+               "//result/doc[2]/str[@name='id'][.=1]"
+     );
+  }
+
+  /**
+   * Test rangequery again with an Arabic collator.
+   * Binary order would normally order U+0633 in this range.
+   */
+  public void testNegativeRangeQuery() {
+    assertQ("Collated RangeQ: ",
+        req("fl", "id", "q", "sort_ar:[\u062F TO \u0698]", "sort", "id asc" ),
+               "//*[@numFound='0']"
+     );
+  }
+
+  /**
+   * Test canonical decomposition with Turkish primary strength.
+   * With this sort order, İ is the uppercase form of i, and I is the uppercase form of ı.
+   * We index a decomposed form of İ.
+   */
+  public void testCanonicalDecomposition() {
+    assertQ("Collated TQ: ",
+        req("fl", "id", "q", "sort_tr_canon:\"I Will Use Turkish Casıng\"", "sort", "id asc" ),
+               "//*[@numFound='3']",
+               "//result/doc[1]/str[@name='id'][.=2]",
+               "//result/doc[2]/str[@name='id'][.=3]",
+               "//result/doc[3]/str[@name='id'][.=5]"
+     );
+  }
+
+  /**
+   * Test termquery with custom collator (DIN 5007-2).
+   * In this case, ö is equivalent to oe (but not o)
+   */
+  public void testCustomCollation() {
+    assertQ("Collated TQ: ",
+        req("fl", "id", "q", "sort_custom:toene"),
+               "//*[@numFound='2']",
+               "//result/doc/str[@name='id'][.=4]",
+               "//result/doc/str[@name='id'][.=10]"
+     );
+  }
+}

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/6c070b4a/solr/contrib/analysis-extras/src/test/java/org/apache/solr/schema/TestICUCollationFieldDocValues.java
----------------------------------------------------------------------
diff --git a/solr/contrib/analysis-extras/src/test/java/org/apache/solr/schema/TestICUCollationFieldDocValues.java b/solr/contrib/analysis-extras/src/test/java/org/apache/solr/schema/TestICUCollationFieldDocValues.java
new file mode 100644
index 0000000..57b403a
--- /dev/null
+++ b/solr/contrib/analysis-extras/src/test/java/org/apache/solr/schema/TestICUCollationFieldDocValues.java
@@ -0,0 +1,180 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.solr.schema;
+
+import java.io.File;
+import java.io.FileOutputStream;
+
+import org.apache.commons.io.FileUtils;
+import org.apache.commons.io.IOUtils;
+import org.apache.solr.SolrTestCaseJ4;
+import org.junit.BeforeClass;
+
+import com.ibm.icu.text.Collator;
+import com.ibm.icu.text.RuleBasedCollator;
+import com.ibm.icu.util.ULocale;
+
+/**
+ * Tests {@link ICUCollationField} with docValues.
+ */
+public class TestICUCollationFieldDocValues extends SolrTestCaseJ4 {
+
+  /**
+   * Builds the Solr home via {@link #setupSolrHome()}, initializes the core from it,
+   * and indexes the twelve fixture documents used by every test in this class.
+   */
+  @BeforeClass
+  public static void beforeClass() throws Exception {
+    String home = setupSolrHome();
+    initCore("solrconfig.xml","schema.xml", home);
+    // add some docs
+    assertU(adoc("id", "1", "text", "\u0633\u0627\u0628"));
+    assertU(adoc("id", "2", "text", "I WİLL USE TURKİSH CASING"));
+    assertU(adoc("id", "3", "text", "ı will use turkish casıng"));
+    assertU(adoc("id", "4", "text", "Töne"));
+    assertU(adoc("id", "5", "text", "I W\u0049\u0307LL USE TURKİSH CASING"));
+    assertU(adoc("id", "6", "text", "Testing"));
+    assertU(adoc("id", "7", "text", "Tone"));
+    assertU(adoc("id", "8", "text", "Testing"));
+    assertU(adoc("id", "9", "text", "testing"));
+    assertU(adoc("id", "10", "text", "toene"));
+    assertU(adoc("id", "11", "text", "Tzne"));
+    assertU(adoc("id", "12", "text", "\u0698\u0698"));
+    assertU(commit());
+  }
+
+  /**
+   * Ugly: but what to do? We want to test custom sort, which reads rules in as a resource.
+   * These are largish files, and jvm-specific (as our documentation says, you should always
+   * look out for jvm differences with collation).
+   * So it's preferable to create this file on-the-fly.
+   *
+   * <p>Creates {@code collection1/data} and {@code collection1/conf} under a fresh temp
+   * directory, copies the ICU collation solrconfig and the docValues schema into the conf
+   * dir, and writes the tailored rules to {@code customrules.dat}.
+   *
+   * @return absolute path of the generated Solr home directory
+   * @throws Exception if the directory cannot be populated or the collator cannot be built
+   */
+  public static String setupSolrHome() throws Exception {
+    File tmpFile = createTempDir().toFile();
+
+    // make data and conf dirs
+    new File(tmpFile + "/collection1", "data").mkdirs();
+    File confDir = new File(tmpFile + "/collection1", "conf");
+    confDir.mkdirs();
+
+    // copy over configuration files
+    FileUtils.copyFile(getFile("analysis-extras/solr/collection1/conf/solrconfig-icucollate.xml"), new File(confDir, "solrconfig.xml"));
+    FileUtils.copyFile(getFile("analysis-extras/solr/collection1/conf/schema-icucollate-dv.xml"), new File(confDir, "schema.xml"));
+
+    // generate custom collation rules (DIN 5007-2), saving to customrules.dat
+    RuleBasedCollator baseCollator = (RuleBasedCollator) Collator.getInstance(new ULocale("de", "DE"));
+
+    // Each umlaut is tailored next to its two-letter expansion, in lower and upper case.
+    // The UE rule previously repeated the lowercase u + U+0308 instead of the uppercase form.
+    String DIN5007_2_tailorings =
+      "& ae , a\u0308 & AE , A\u0308"+
+      "& oe , o\u0308 & OE , O\u0308"+
+      "& ue , u\u0308 & UE , U\u0308";
+
+    RuleBasedCollator tailoredCollator = new RuleBasedCollator(baseCollator.getRules() + DIN5007_2_tailorings);
+    String tailoredRules = tailoredCollator.getRules();
+    // try-with-resources: the stream is closed even when the write throws
+    try (FileOutputStream os = new FileOutputStream(new File(confDir, "customrules.dat"))) {
+      IOUtils.write(tailoredRules, os, "UTF-8");
+    }
+
+    return tmpFile.getAbsolutePath();
+  }
+
+  /**
+   * Test termquery with German DIN 5007-1 primary strength.
+   * In this case, ö is equivalent to o (but not oe)
+   */
+  public void testBasicTermQuery() {
+    assertQ("Collated TQ: ",
+       req("fl", "id", "q", "sort_de:tone", "sort", "id asc" ),
+              "//*[@numFound='2']",
+              "//result/doc[1]/str[@name='id'][.=4]",
+              "//result/doc[2]/str[@name='id'][.=7]"
+    );
+  }
+
+  /**
+   * Test rangequery again with the DIN 5007-1 collator.
+   * We do a range query of tone .. tp, in binary order this
+   * would retrieve nothing due to case and accent differences.
+   */
+  public void testBasicRangeQuery() {
+    assertQ("Collated RangeQ: ",
+        req("fl", "id", "q", "sort_de:[tone TO tp]", "sort", "id asc" ),
+               "//*[@numFound='2']",
+               "//result/doc[1]/str[@name='id'][.=4]",
+               "//result/doc[2]/str[@name='id'][.=7]"
+     );
+  }
+
+  /**
+   * Test sort with a Danish collator. ö is ordered after z
+   */
+  public void testBasicSort() {
+    assertQ("Collated Sort: ",
+        req("fl", "id", "q", "sort_da:[tz TO töz]", "sort", "sort_da asc" ),
+               "//*[@numFound='2']",
+               "//result/doc[1]/str[@name='id'][.=11]",
+               "//result/doc[2]/str[@name='id'][.=4]"
+     );
+  }
+
+  /**
+   * Test sort with an Arabic collator. U+0633 is ordered after U+0698.
+   * With a binary collator, the range would also return nothing.
+   */
+  public void testArabicSort() {
+    assertQ("Collated Sort: ",
+        req("fl", "id", "q", "sort_ar:[\u0698 TO \u0633\u0633]", "sort", "sort_ar asc" ),
+               "//*[@numFound='2']",
+               "//result/doc[1]/str[@name='id'][.=12]",
+               "//result/doc[2]/str[@name='id'][.=1]"
+     );
+  }
+
+  /**
+   * Test rangequery again with an Arabic collator.
+   * Binary order would normally order U+0633 in this range.
+   */
+  public void testNegativeRangeQuery() {
+    assertQ("Collated RangeQ: ",
+        req("fl", "id", "q", "sort_ar:[\u062F TO \u0698]", "sort", "id asc" ),
+               "//*[@numFound='0']"
+     );
+  }
+
+  /**
+   * Test canonical decomposition with Turkish primary strength.
+   * With this sort order, İ is the uppercase form of i, and I is the uppercase form of ı.
+   * We index a decomposed form of İ.
+   */
+  public void testCanonicalDecomposition() {
+    assertQ("Collated TQ: ",
+        req("fl", "id", "q", "sort_tr_canon:\"I Will Use Turkish Casıng\"", "sort", "id asc" ),
+               "//*[@numFound='3']",
+               "//result/doc[1]/str[@name='id'][.=2]",
+               "//result/doc[2]/str[@name='id'][.=3]",
+               "//result/doc[3]/str[@name='id'][.=5]"
+     );
+  }
+
+  /**
+   * Test termquery with custom collator (DIN 5007-2).
+   * In this case, ö is equivalent to oe (but not o)
+   */
+  public void testCustomCollation() {
+    assertQ("Collated TQ: ",
+        req("fl", "id", "q", "sort_custom:toene"),
+               "//*[@numFound='2']",
+               "//result/doc/str[@name='id'][.=4]",
+               "//result/doc/str[@name='id'][.=10]"
+     );
+  }
+}

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/6c070b4a/solr/contrib/analysis-extras/src/test/java/org/apache/solr/schema/TestICUCollationFieldOptions.java
----------------------------------------------------------------------
diff --git a/solr/contrib/analysis-extras/src/test/java/org/apache/solr/schema/TestICUCollationFieldOptions.java b/solr/contrib/analysis-extras/src/test/java/org/apache/solr/schema/TestICUCollationFieldOptions.java
new file mode 100644
index 0000000..0b198b7
--- /dev/null
+++ b/solr/contrib/analysis-extras/src/test/java/org/apache/solr/schema/TestICUCollationFieldOptions.java
@@ -0,0 +1,119 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.solr.schema;
+
+import org.apache.commons.io.FileUtils;
+import org.apache.solr.SolrTestCaseJ4;
+import org.junit.BeforeClass;
+
+import java.io.File;
+
+/**
+ * Exercises the expert configuration options of {@link ICUCollationField}.
+ */
+public class TestICUCollationFieldOptions extends SolrTestCaseJ4 {
+
+  /**
+   * Copies the {@code analysis-extras/solr} test resources into a temp directory,
+   * initializes the core from that copy, and indexes nine documents with ids 1 to 9.
+   */
+  @BeforeClass
+  public static void beforeClass() throws Exception {
+    final File solrHome = createTempDir().toFile();
+    FileUtils.copyDirectory(getFile("analysis-extras/solr"), solrHome);
+    final String solrHomePath = solrHome.getAbsolutePath();
+    initCore("solrconfig-icucollate.xml", "schema-icucollateoptions.xml", solrHomePath);
+
+    // fixture texts; the document id is the 1-based position in this array
+    final String[] texts = {
+      "foo-bar",
+      "foo bar",
+      "foobar",
+      "foobar-10",
+      "foobar-9",
+      "resume",
+      "Résumé",
+      "Resume",
+      "résumé"
+    };
+    for (int pos = 0; pos < texts.length; pos++) {
+      assertU(adoc("id", Integer.toString(pos + 1), "text", texts[pos]));
+    }
+    assertU(commit());
+  }
+
+  /**
+   * alternate=shifted moves whitespace, punctuation and symbols down
+   * to the quaternary level.
+   */
+  public void testIgnorePunctuation() {
+    assertQ(
+        "Collated TQ: ",
+        req("fl", "id", "q", "sort_ignore_punctuation:foobar", "sort", "id asc"),
+        "//*[@numFound='3']",
+        "//result/doc[1]/str[@name='id'][.=1]",
+        "//result/doc[2]/str[@name='id'][.=2]",
+        "//result/doc[3]/str[@name='id'][.=3]");
+  }
+
+  /**
+   * alternate=shifted combined with variableTop moves whitespace, but not
+   * punctuation or symbols, down to the quaternary level.
+   */
+  public void testIgnoreWhitespace() {
+    assertQ(
+        "Collated TQ: ",
+        req("fl", "id", "q", "sort_ignore_space:\"foo bar\"", "sort", "id asc"),
+        "//*[@numFound='2']",
+        "//result/doc[1]/str[@name='id'][.=2]",
+        "//result/doc[2]/str[@name='id'][.=3]");
+  }
+
+  /**
+   * numeric encodes digit runs by their numeric value, so foobar-9
+   * sorts ahead of foobar-10.
+   */
+  public void testNumerics() {
+    assertQ(
+        "Collated sort: ",
+        req("fl", "id", "q", "id:[4 TO 5]", "sort", "sort_numerics asc"),
+        "//*[@numFound='2']",
+        "//result/doc[1]/str[@name='id'][.=5]",
+        "//result/doc[2]/str[@name='id'][.=4]");
+  }
+
+  /**
+   * caseLevel=true inserts an extra case level between the secondary
+   * and tertiary levels.
+   */
+  public void testIgnoreAccentsButNotCase() {
+    // lowercase query: matches the lowercase docs with and without accents
+    assertQ(
+        "Collated TQ: ",
+        req("fl", "id", "q", "sort_ignore_accents:resume", "sort", "id asc"),
+        "//*[@numFound='2']",
+        "//result/doc[1]/str[@name='id'][.=6]",
+        "//result/doc[2]/str[@name='id'][.=9]");
+
+    // capitalized query: matches the capitalized docs with and without accents
+    assertQ(
+        "Collated TQ: ",
+        req("fl", "id", "q", "sort_ignore_accents:Resume", "sort", "id asc"),
+        "//*[@numFound='2']",
+        "//result/doc[1]/str[@name='id'][.=7]",
+        "//result/doc[2]/str[@name='id'][.=8]");
+  }
+
+  /**
+   * caseFirst=upper makes uppercase strings sort ahead of
+   * lowercase ones.
+   */
+  public void testUpperCaseFirst() {
+    assertQ(
+        "Collated sort: ",
+        req("fl", "id", "q", "id:6 OR id:8", "sort", "sort_uppercase_first asc"),
+        "//*[@numFound='2']",
+        "//result/doc[1]/str[@name='id'][.=8]",
+        "//result/doc[2]/str[@name='id'][.=6]");
+  }
+}


Mime
View raw message