lucene-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From da...@apache.org
Subject [50/51] [partial] lucene-solr:jira/gradle: Add more contrib modules
Date Fri, 02 Nov 2018 15:43:14 GMT
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/4dd96a0e/solr/contrib/langid/src/main/java/org/apache/solr/update/processor/DetectedLanguage.java
----------------------------------------------------------------------
diff --git a/solr/contrib/langid/src/main/java/org/apache/solr/update/processor/DetectedLanguage.java b/solr/contrib/langid/src/main/java/org/apache/solr/update/processor/DetectedLanguage.java
new file mode 100644
index 0000000..e8e6fbe
--- /dev/null
+++ b/solr/contrib/langid/src/main/java/org/apache/solr/update/processor/DetectedLanguage.java
@@ -0,0 +1,46 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.solr.update.processor;
+
+/**
+ * Bean holding a language and a detection certainty 
+ */
+public class DetectedLanguage {
+  /** Language code as handed to the constructor. */
+  private final String langCode;
+  /** Certainty score as handed to the constructor. */
+  private final Double certainty;
+
+  /**
+   * Creates an immutable language/certainty pair.
+   * @param lang the language code
+   * @param certainty the certainty score for that language
+   */
+  DetectedLanguage(String lang, Double certainty) {
+    this.certainty = certainty;
+    this.langCode = lang;
+  }
+
+  /**
+   * Returns the detected certainty for this language
+   * @return certainty as a value between 0.0 and 1.0 where 1.0 is 100% certain
+   */
+  public Double getCertainty() {
+    return this.certainty;
+  }
+
+  /**
+   * Returns the detected language code
+   * @return language code as a string
+   */
+  public String getLangCode() {
+    return this.langCode;
+  }
+}

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/4dd96a0e/solr/contrib/langid/src/main/java/org/apache/solr/update/processor/LangDetectLanguageIdentifierUpdateProcessor.java
----------------------------------------------------------------------
diff --git a/solr/contrib/langid/src/main/java/org/apache/solr/update/processor/LangDetectLanguageIdentifierUpdateProcessor.java b/solr/contrib/langid/src/main/java/org/apache/solr/update/processor/LangDetectLanguageIdentifierUpdateProcessor.java
new file mode 100644
index 0000000..8af05b3
--- /dev/null
+++ b/solr/contrib/langid/src/main/java/org/apache/solr/update/processor/LangDetectLanguageIdentifierUpdateProcessor.java
@@ -0,0 +1,89 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.solr.update.processor;
+
+import java.lang.invoke.MethodHandles;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.List;
+
+import org.apache.solr.request.SolrQueryRequest;
+import org.apache.solr.response.SolrQueryResponse;
+
+import com.cybozu.labs.langdetect.Detector;
+import com.cybozu.labs.langdetect.DetectorFactory;
+import com.cybozu.labs.langdetect.LangDetectException;
+import com.cybozu.labs.langdetect.Language;
+import org.apache.solr.common.SolrInputDocument;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Identifies the language of a set of input fields using http://code.google.com/p/language-detection
+ * <p>
+ * See <a href="http://wiki.apache.org/solr/LanguageDetection">http://wiki.apache.org/solr/LanguageDetection</a>
+ * @since 3.5
+ */
+public class LangDetectLanguageIdentifierUpdateProcessor extends LanguageIdentifierUpdateProcessor {
+
+  private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
+
+  /**
+   * Creates the processor; all arguments are forwarded to the superclass constructor.
+   * @param req the update request
+   * @param rsp the response for the request
+   * @param next the next processor in the chain
+   */
+  public LangDetectLanguageIdentifierUpdateProcessor(SolrQueryRequest req,
+      SolrQueryResponse rsp, UpdateRequestProcessor next) {
+    super(req, rsp, next);
+  }
+
+  /**
+   * Appends the String values of every configured input field to a fresh
+   * langdetect {@link Detector} and converts its probabilities to
+   * {@link DetectedLanguage} instances.
+   * <p>
+   * Each value is truncated to {@code maxFieldValueChars} before being appended,
+   * and {@code maxTotalChars} is passed to the detector as its maximum text length.
+   * Non-String values are skipped with a warning.
+   *
+   * @param doc the document whose input fields are examined
+   * @return the detector's languages in the order the detector returned them,
+   *         or an empty list if the detector threw a {@link LangDetectException}
+   */
+  @Override
+  protected List<DetectedLanguage> detectLanguage(SolrInputDocument doc) {
+    try {
+      Detector detector = DetectorFactory.create();
+      detector.setMaxTextLength(maxTotalChars);
+
+      for (String fieldName : inputFields) {
+        log.debug("Appending field {}", fieldName);
+        if (!doc.containsKey(fieldName)) {
+          continue;
+        }
+        Collection<Object> fieldValues = doc.getFieldValues(fieldName);
+        if (fieldValues == null) {
+          continue;
+        }
+        for (Object content : fieldValues) {
+          if (content instanceof String) {
+            String stringContent = (String) content;
+            if (stringContent.length() > maxFieldValueChars) {
+              detector.append(stringContent.substring(0, maxFieldValueChars));
+            } else {
+              detector.append(stringContent);
+            }
+            // Separator so consecutive values do not run together
+            detector.append(" ");
+          } else {
+            log.warn("Field {} not a String value, not including in detection", fieldName);
+          }
+        }
+      }
+      List<Language> langlist = detector.getProbabilities();
+      List<DetectedLanguage> solrLangList = new ArrayList<>(langlist.size());
+      for (Language l : langlist) {
+        solrLangList.add(new DetectedLanguage(l.lang, l.prob));
+      }
+      return solrLangList;
+    } catch (LangDetectException e) {
+      log.debug("Could not determine language, returning empty list: ", e);
+      return Collections.emptyList();
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/4dd96a0e/solr/contrib/langid/src/main/java/org/apache/solr/update/processor/LangDetectLanguageIdentifierUpdateProcessorFactory.java
----------------------------------------------------------------------
diff --git a/solr/contrib/langid/src/main/java/org/apache/solr/update/processor/LangDetectLanguageIdentifierUpdateProcessorFactory.java b/solr/contrib/langid/src/main/java/org/apache/solr/update/processor/LangDetectLanguageIdentifierUpdateProcessorFactory.java
new file mode 100644
index 0000000..a140807
--- /dev/null
+++ b/solr/contrib/langid/src/main/java/org/apache/solr/update/processor/LangDetectLanguageIdentifierUpdateProcessorFactory.java
@@ -0,0 +1,137 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.solr.update.processor;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.nio.charset.StandardCharsets;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.solr.common.params.SolrParams;
+import org.apache.solr.common.util.NamedList;
+import org.apache.solr.core.SolrCore;
+import org.apache.solr.request.SolrQueryRequest;
+import org.apache.solr.response.SolrQueryResponse;
+import org.apache.solr.util.SolrPluginUtils;
+import org.apache.solr.util.plugin.SolrCoreAware;
+
+import com.cybozu.labs.langdetect.DetectorFactory;
+import com.cybozu.labs.langdetect.LangDetectException;
+
+/**
+ * Identifies the language of a set of input fields using 
+ * http://code.google.com/p/language-detection
+ * <p>
+ * The UpdateProcessorChain config entry can take a number of parameters
+ * which may also be passed as HTTP parameters on the update request
+ * and override the defaults. Here is the simplest processor config possible:
+ * 
+ * <pre class="prettyprint" >
+ * &lt;processor class=&quot;org.apache.solr.update.processor.LangDetectLanguageIdentifierUpdateProcessorFactory&quot;&gt;
+ *   &lt;str name=&quot;langid.fl&quot;&gt;title,text&lt;/str&gt;
+ *   &lt;str name=&quot;langid.langField&quot;&gt;language_s&lt;/str&gt;
+ * &lt;/processor&gt;
+ * </pre>
+ * See <a href="http://wiki.apache.org/solr/LanguageDetection">http://wiki.apache.org/solr/LanguageDetection</a>
+ * @since 3.5
+ */
+public class LangDetectLanguageIdentifierUpdateProcessorFactory extends
+        UpdateRequestProcessorFactory implements SolrCoreAware, LangIdParams {
+
+  protected SolrParams defaults;
+  protected SolrParams appends;
+  protected SolrParams invariants;
+
+  @Override
+  public void inform(SolrCore core) {
+  }
+
+  /**
+   * The UpdateRequestProcessor may be initialized in solrconfig.xml similarly
+   * to a RequestHandler, with defaults, appends and invariants.
+   * @param args a NamedList with the configuration parameters 
+   */
+  @Override
+  @SuppressWarnings("rawtypes")
+  public void init( NamedList args )
+  {
+    try {
+      loadData();
+    } catch (Exception e) {
+      throw new RuntimeException("Couldn't load profile data, will return empty languages always!", e);
+    }
+    if (args != null) {
+      Object o;
+      o = args.get("defaults");
+      if (o != null && o instanceof NamedList) {
+        defaults = ((NamedList) o).toSolrParams();
+      } else {
+        defaults = args.toSolrParams();
+      }
+      o = args.get("appends");
+      if (o != null && o instanceof NamedList) {
+        appends = ((NamedList) o).toSolrParams();
+      }
+      o = args.get("invariants");
+      if (o != null && o instanceof NamedList) {
+        invariants = ((NamedList) o).toSolrParams();
+      }
+    }
+  }
+
+  @Override
+  public UpdateRequestProcessor getInstance(SolrQueryRequest req,
+                                            SolrQueryResponse rsp, UpdateRequestProcessor next) {
+    // Process defaults, appends and invariants if we got a request
+    if(req != null) {
+      SolrPluginUtils.setDefaults(req, defaults, appends, invariants);
+    }
+    return new LangDetectLanguageIdentifierUpdateProcessor(req, rsp, next);
+  }
+  
+  
+  // DetectorFactory is totally global, so we only want to do this once... ever!!!
+  static boolean loaded;
+  
+  // profiles we will load from classpath
+  static final String languages[] = {
+    "af", "ar", "bg", "bn", "cs", "da", "de", "el", "en", "es", "et", "fa", "fi", "fr", "gu",
+    "he", "hi", "hr", "hu", "id", "it", "ja", "kn", "ko", "lt", "lv", "mk", "ml", "mr", "ne",
+    "nl", "no", "pa", "pl", "pt", "ro", "ru", "sk", "sl", "so", "sq", "sv", "sw", "ta", "te",
+    "th", "tl", "tr", "uk", "ur", "vi", "zh-cn", "zh-tw"
+  };
+
+  public static synchronized void loadData() throws IOException, LangDetectException {
+    if (loaded) {
+      return;
+    }
+    loaded = true;
+    List<String> profileData = new ArrayList<>();
+    for (String language : languages) {
+      InputStream stream = LangDetectLanguageIdentifierUpdateProcessor.class.getResourceAsStream("langdetect-profiles/" + language);
+      BufferedReader reader = new BufferedReader(new InputStreamReader(stream, StandardCharsets.UTF_8));
+      profileData.add(new String(IOUtils.toCharArray(reader)));
+      reader.close();
+    }
+    DetectorFactory.loadProfile(profileData);
+    DetectorFactory.setSeed(0);
+  }
+}

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/4dd96a0e/solr/contrib/langid/src/main/java/org/apache/solr/update/processor/LangIdParams.java
----------------------------------------------------------------------
diff --git a/solr/contrib/langid/src/main/java/org/apache/solr/update/processor/LangIdParams.java b/solr/contrib/langid/src/main/java/org/apache/solr/update/processor/LangIdParams.java
new file mode 100644
index 0000000..4e19eab
--- /dev/null
+++ b/solr/contrib/langid/src/main/java/org/apache/solr/update/processor/LangIdParams.java
@@ -0,0 +1,57 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.solr.update.processor;
+
+public interface LangIdParams {
+
+  /** Prefix shared by every parameter name declared here. */
+  String LANGUAGE_ID = "langid";
+  /** Name of the document id field. */
+  String DOCID_PARAM = LANGUAGE_ID + ".idField";
+
+  /** Field list to detect from */
+  String FIELDS_PARAM = LANGUAGE_ID + ".fl";
+  /** Main language detected */
+  String LANG_FIELD = LANGUAGE_ID + ".langField";
+  /** All languages detected (multiValued) */
+  String LANGS_FIELD = LANGUAGE_ID + ".langsField";
+  /** Fallback lang code */
+  String FALLBACK = LANGUAGE_ID + ".fallback";
+  /** Comma-sep list of fallback fields */
+  String FALLBACK_FIELDS = LANGUAGE_ID + ".fallbackFields";
+  /** Overwrite if existing language value in LANG_FIELD */
+  String OVERWRITE = LANGUAGE_ID + ".overwrite";
+  /** Detection threshold */
+  String THRESHOLD = LANGUAGE_ID + ".threshold";
+  /** Enforces that output fields exist in schema */
+  String ENFORCE_SCHEMA = LANGUAGE_ID + ".enforceSchema";
+  /** Allowed languages */
+  String LANG_WHITELIST = LANGUAGE_ID + ".whitelist";
+  /** Maps detected langcode to other value */
+  String LCMAP = LANGUAGE_ID + ".lcmap";
+  /** Turns on or off the field mapping */
+  String MAP_ENABLE = LANGUAGE_ID + ".map";
+  /** Field list for mapping */
+  String MAP_FL = LANGUAGE_ID + ".map.fl";
+  /** Whether to overwrite existing fields */
+  String MAP_OVERWRITE = LANGUAGE_ID + ".map.overwrite";
+  /** Keep original field after mapping */
+  String MAP_KEEP_ORIG = LANGUAGE_ID + ".map.keepOrig";
+  /** Detect language per individual field */
+  String MAP_INDIVIDUAL = LANGUAGE_ID + ".map.individual";
+  /** Field list of fields to redetect language for */
+  String MAP_INDIVIDUAL_FL = LANGUAGE_ID + ".map.individual.fl";
+  /** Enables mapping multiple langs to same output field */
+  String MAP_LCMAP = LANGUAGE_ID + ".map.lcmap";
+  /** RegEx pattern to match field name */
+  String MAP_PATTERN = LANGUAGE_ID + ".map.pattern";
+  /** Replace pattern */
+  String MAP_REPLACE = LANGUAGE_ID + ".map.replace";
+  /** Maximum number of characters to use per field for language detection */
+  String MAX_FIELD_VALUE_CHARS = LANGUAGE_ID + ".maxFieldValueChars";
+  /** Maximum number of characters to use per all concatenated fields for language detection */
+  String MAX_TOTAL_CHARS = LANGUAGE_ID + ".maxTotalChars";
+
+  // Default values
+  String DOCID_FIELD_DEFAULT = "id";
+  String DOCID_LANGFIELD_DEFAULT = null;
+  String DOCID_LANGSFIELD_DEFAULT = null;
+  String MAP_PATTERN_DEFAULT = "(.*)";
+  String MAP_REPLACE_DEFAULT = "$1_{lang}";
+  int MAX_FIELD_VALUE_CHARS_DEFAULT = 10000;
+  int MAX_TOTAL_CHARS_DEFAULT = 20000;
+
+  /**
+   * TODO: This default threshold accepts even "uncertain" detections.
+   * Increase &amp;langid.threshold above 0.5 to return only certain detections
+   */
+  Double DOCID_THRESHOLD_DEFAULT = 0.5;
+}

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/4dd96a0e/solr/contrib/langid/src/main/java/org/apache/solr/update/processor/LanguageIdentifierUpdateProcessor.java
----------------------------------------------------------------------
diff --git a/solr/contrib/langid/src/main/java/org/apache/solr/update/processor/LanguageIdentifierUpdateProcessor.java b/solr/contrib/langid/src/main/java/org/apache/solr/update/processor/LanguageIdentifierUpdateProcessor.java
new file mode 100644
index 0000000..3679905
--- /dev/null
+++ b/solr/contrib/langid/src/main/java/org/apache/solr/update/processor/LanguageIdentifierUpdateProcessor.java
@@ -0,0 +1,466 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.solr.update.processor;
+
+import org.apache.solr.common.SolrException;
+import org.apache.solr.common.SolrException.ErrorCode;
+import org.apache.solr.common.SolrInputDocument;
+import org.apache.solr.common.SolrInputField;
+import org.apache.solr.common.params.SolrParams;
+import org.apache.solr.request.SolrQueryRequest;
+import org.apache.solr.response.SolrQueryResponse;
+import org.apache.solr.schema.IndexSchema;
+import org.apache.solr.schema.SchemaField;
+import org.apache.solr.update.AddUpdateCommand;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.lang.invoke.MethodHandles;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.regex.Pattern;
+
+
+/**
+ * Identifies the language of a set of input fields.
+ * Also supports mapping of field names based
+ * on detected language.
+ * <p>
+ * See <a href="http://wiki.apache.org/solr/LanguageDetection">http://wiki.apache.org/solr/LanguageDetection</a>
+ * @since 3.5
+ * @lucene.experimental
+ */
+public abstract class LanguageIdentifierUpdateProcessor extends UpdateRequestProcessor implements LangIdParams {
+
+  private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
+
+  protected boolean enabled;
+
+  protected String[] inputFields = {};
+  protected String[] mapFields = {};
+  protected Pattern mapPattern;
+  protected String mapReplaceStr;
+  protected String langField;
+  protected String langsField; // MultiValued, contains all languages detected
+  protected String docIdField;
+  protected String fallbackValue;
+  protected String[] fallbackFields = {};
+  protected boolean enableMapping;
+  protected boolean mapKeepOrig;
+  protected boolean overwrite;
+  protected boolean mapOverwrite;
+  protected boolean mapIndividual;
+  protected boolean enforceSchema;
+  protected double threshold;
+  protected HashSet<String> langWhitelist;
+  protected HashSet<String> mapIndividualFieldsSet;
+  protected HashSet<String> allMapFieldsSet;
+  protected HashMap<String,String> lcMap;
+  protected HashMap<String,String> mapLcMap;
+  protected IndexSchema schema;
+  protected int maxFieldValueChars;
+  protected int maxTotalChars;
+
+  // Regex patterns
+  protected final Pattern tikaSimilarityPattern = Pattern.compile(".*\\((.*?)\\)");
+  protected final Pattern langPattern = Pattern.compile("\\{lang\\}");
+
+  public LanguageIdentifierUpdateProcessor(SolrQueryRequest req,
+                                           SolrQueryResponse rsp, UpdateRequestProcessor next) {
+    super(next);
+    schema = req.getSchema();
+
+    initParams(req.getParams());
+  }
+
+  /**
+   * Reads the langid.* settings from the given params into this processor's fields.
+   * When {@code params} is null nothing is read.
+   *
+   * @param params the request parameters, may be null
+   * @throws SolrException with BAD_REQUEST if no input fields end up configured
+   */
+  private void initParams(SolrParams params) {
+    if (params != null) {
+      // Document-centric langId params
+      setEnabled(params.getBool(LANGUAGE_ID, true));
+      String fieldList = params.get(FIELDS_PARAM, "");
+      if (fieldList.length() > 0) {
+        inputFields = fieldList.split(",");
+      }
+      langField = params.get(LANG_FIELD, DOCID_LANGFIELD_DEFAULT);
+      langsField = params.get(LANGS_FIELD, DOCID_LANGSFIELD_DEFAULT);
+      SchemaField uniqueKeyField = schema.getUniqueKeyField();
+      docIdField = params.get(DOCID_PARAM, uniqueKeyField == null ? DOCID_FIELD_DEFAULT : uniqueKeyField.getName());
+      fallbackValue = params.get(FALLBACK);
+      String fallbackFieldList = params.get(FALLBACK_FIELDS, "");
+      if (fallbackFieldList.length() > 0) {
+        fallbackFields = fallbackFieldList.split(",");
+      }
+      overwrite = params.getBool(OVERWRITE, false);
+      threshold = params.getDouble(THRESHOLD, DOCID_THRESHOLD_DEFAULT);
+      langWhitelist = new HashSet<>();
+      String whitelist = params.get(LANG_WHITELIST, "");
+      if (whitelist.length() > 0) {
+        langWhitelist.addAll(Arrays.asList(whitelist.split(",")));
+      }
+
+      // Mapping params (field centric)
+      enableMapping = params.getBool(MAP_ENABLE, false);
+      String mapFieldList = params.get(MAP_FL, "");
+      if (mapFieldList.length() > 0) {
+        mapFields = mapFieldList.split(",");
+      } else {
+        mapFields = inputFields;
+      }
+      mapKeepOrig = params.getBool(MAP_KEEP_ORIG, false);
+      mapOverwrite = params.getBool(MAP_OVERWRITE, false);
+      mapIndividual = params.getBool(MAP_INDIVIDUAL, false);
+
+      // Process individual fields; they default to the mapped fields
+      String individualFieldList = params.get(MAP_INDIVIDUAL_FL, "");
+      String[] mapIndividualFields = individualFieldList.length() > 0 ? individualFieldList.split(",") : mapFields;
+      mapIndividualFieldsSet = new HashSet<>(Arrays.asList(mapIndividualFields));
+      // Compile a union of the lists of fields to map. The individual fields are
+      // always added: guarding this with Arrays.equals(mapFields, mapIndividualFields)
+      // only ever added elements that were already present, so fields listed solely
+      // in langid.map.individual.fl were never mapped.
+      allMapFieldsSet = new HashSet<>(Arrays.asList(mapFields));
+      allMapFieldsSet.addAll(mapIndividualFieldsSet);
+
+      // Normalize detected langcode onto normalized langcode
+      lcMap = parseLangCodeMap(LCMAP, params.get(LCMAP));
+
+      // Language Code mapping
+      mapLcMap = parseLangCodeMap(MAP_LCMAP, params.get(MAP_LCMAP));
+      enforceSchema = params.getBool(ENFORCE_SCHEMA, true);
+
+      mapPattern = Pattern.compile(params.get(MAP_PATTERN, MAP_PATTERN_DEFAULT));
+      mapReplaceStr = params.get(MAP_REPLACE, MAP_REPLACE_DEFAULT);
+      maxFieldValueChars = params.getInt(MAX_FIELD_VALUE_CHARS, MAX_FIELD_VALUE_CHARS_DEFAULT);
+      maxTotalChars = params.getInt(MAX_TOTAL_CHARS, MAX_TOTAL_CHARS_DEFAULT);
+      if (maxFieldValueChars > maxTotalChars) {
+        if (maxTotalChars == MAX_TOTAL_CHARS_DEFAULT) {
+          // If the user specified only maxFieldValueChars, make maxTotalChars the same as it
+          log.warn("{} ({}) is greater than {} ({}).  Setting {} to {}.",
+              MAX_FIELD_VALUE_CHARS, maxFieldValueChars, MAX_TOTAL_CHARS, maxTotalChars,
+              MAX_TOTAL_CHARS, maxFieldValueChars);
+          maxTotalChars = maxFieldValueChars;
+        } else {
+          // If the user specified maxTotalChars, make maxFieldValueChars the same as it
+          log.warn("{} ({}) is greater than {} ({}).  Setting {} to {}.",
+              MAX_FIELD_VALUE_CHARS, maxFieldValueChars, MAX_TOTAL_CHARS, maxTotalChars,
+              MAX_FIELD_VALUE_CHARS, maxTotalChars);
+          maxFieldValueChars = maxTotalChars;
+        }
+      }
+    }
+    log.debug("LangId configured");
+
+
+    if (inputFields.length == 0) {
+      throw new SolrException(ErrorCode.BAD_REQUEST,
+              "Missing or faulty configuration of LanguageIdentifierUpdateProcessor. Input fields must be specified as a comma separated list");
+    }
+
+  }
+
+  /**
+   * Parses a list of {@code from:to} pairs separated by commas and/or spaces.
+   * Empty tokens (e.g. from a ", " separator) are ignored; any other token that
+   * is not exactly one {@code from:to} pair is logged and skipped.
+   *
+   * @param paramName the parameter name, used only in the error message
+   * @param spec the raw parameter value, may be null
+   * @return a new map holding the parsed pairs; empty when {@code spec} is null
+   */
+  private HashMap<String,String> parseLangCodeMap(String paramName, String spec) {
+    HashMap<String,String> parsed = new HashMap<>();
+    if (spec == null) {
+      return parsed;
+    }
+    for (String mapping : spec.split("[, ]")) {
+      if (mapping.isEmpty()) {
+        continue;
+      }
+      String[] keyVal = mapping.split(":");
+      if (keyVal.length == 2) {
+        parsed.put(keyVal[0], keyVal[1]);
+      } else {
+        log.error("Unsupported format for {}: {}. Skipping this mapping.", paramName, mapping);
+      }
+    }
+    return parsed;
+  }
+
+  /**
+   * Runs language identification on the command's document when the processor
+   * is enabled, then always hands the command to the superclass.
+   */
+  @Override
+  public void processAdd(AddUpdateCommand cmd) throws IOException {
+    if (!isEnabled()) {
+      log.debug("Processor not enabled, not running");
+    } else {
+      process(cmd.getSolrInputDocument());
+    }
+    super.processAdd(cmd);
+  }
+
+  /**
+   * This is the main, testable process method called from processAdd()
+   * <p>
+   * Resolves the document language (detecting it, or validating an existing
+   * value in the language field), optionally renames the configured fields
+   * according to the resolved language, and writes the language field(s).
+   *
+   * @param doc the SolrInputDocument to work on
+   * @return the modified SolrInputDocument
+   * @throws SolrException with BAD_REQUEST if mapping is enabled and no output
+   *         field name can be produced for a field
+   */
+  protected SolrInputDocument process(SolrInputDocument doc) {
+    String docLang = null;
+    HashSet<String> docLangs = new HashSet<>();
+    String fallbackLang = getFallbackLang(doc, fallbackFields, fallbackValue);
+
+    boolean hasLangValue = langField != null && doc.containsKey(langField);
+    if(!hasLangValue || overwrite) {
+      List<DetectedLanguage> languagelist = detectLanguage(doc);
+      docLang = resolveLanguage(languagelist, fallbackLang);
+      docLangs.add(docLang);
+      log.debug("Detected main document language from fields {}: {}", Arrays.toString(inputFields), docLang);
+
+      // Inside this branch an existing value implies overwrite is set
+      if(hasLangValue) {
+        log.debug("Overwritten old value {}", doc.getFieldValue(langField));
+      }
+      if(langField != null && langField.length() != 0) {
+        doc.setField(langField, docLang);
+      }
+    } else {
+      // langField is set, we sanity check it against whitelist and fallback
+      docLang = resolveLanguage((String) doc.getFieldValue(langField), fallbackLang);
+      docLangs.add(docLang);
+      log.debug("Field {} already contained value {}, not overwriting.", langField, docLang);
+    }
+
+    if(enableMapping) {
+      for (String fieldName : allMapFieldsSet) {
+        if(doc.containsKey(fieldName)) {
+          String fieldLang;
+          if(mapIndividual && mapIndividualFieldsSet.contains(fieldName)) {
+            // NOTE(review): detectLanguage receives the whole document here, not just
+            // fieldName; whether the result is really per-field depends on the
+            // detectLanguage implementation - confirm.
+            List<DetectedLanguage> languagelist = detectLanguage(doc);
+            fieldLang = resolveLanguage(languagelist, docLang);
+            docLangs.add(fieldLang);
+            log.debug("Mapping field {} using individually detected language {}", fieldName, fieldLang);
+          } else {
+            fieldLang = docLang;
+            log.debug("Mapping field {} using document global language {}", fieldName, fieldLang);
+          }
+          String mappedOutputField = getMappedField(fieldName, fieldLang);
+
+          if (mappedOutputField != null) {
+            log.debug("Mapping field {} to {} for document {}", fieldName, mappedOutputField, doc.getFieldValue(docIdField));
+            SolrInputField inField = doc.getField(fieldName);
+            doc.setField(mappedOutputField, inField.getValue());
+            if(!mapKeepOrig) {
+              log.debug("Removing old field {}", fieldName);
+              doc.removeField(fieldName);
+            }
+          } else {
+            throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "Invalid output field mapping for "
+                    + fieldName + " field and language: " + fieldLang);
+          }
+        }
+      }
+    }
+
+    // Set the languages field to an array of all detected languages
+    if(langsField != null && langsField.length() != 0) {
+      doc.setField(langsField, docLangs.toArray());
+    }
+
+    return doc;
+  }
+
+  /**
+   * Decides the fallback language, either from content of fallback field or fallback value
+   * @param doc the Solr document
+   * @param fallbackFields an array of strings with field names containing fallback language codes
+   * @param fallbackValue a language code to use in case no fallbackFields are found
+   */
+  private String getFallbackLang(SolrInputDocument doc, String[] fallbackFields, String fallbackValue) {
+    String lang = null;
+    for(String field : fallbackFields) {
+      if(doc.containsKey(field)) {
+        lang = (String) doc.getFieldValue(field);
+        log.debug("Language fallback to field "+field);
+        break;
+      }
+    }
+    if(lang == null) {
+      log.debug("Language fallback to value "+fallbackValue);
+      lang = fallbackValue;
+    }
+    return lang;
+  }
+
+  /**
+   * Detects language(s) from the given document.
+   * Classes wishing to implement their own language detection module should override this method.
+   * @param content The document whose content should be identified
+   * @return List of detected language(s) according to RFC-3066
+   */
+  protected abstract List<DetectedLanguage> detectLanguage(SolrInputDocument content);
+
+  /**
+   * Chooses a language based on the list of candidates detected
+   * @param language language code as a string
+   * @param fallbackLang the language code to use as a fallback
+   * @return a string of the chosen language
+   */
+  protected String resolveLanguage(String language, String fallbackLang) {
+    List<DetectedLanguage> l = new ArrayList<>();
+    l.add(new DetectedLanguage(language, 1.0));
+    return resolveLanguage(l, fallbackLang);
+  }
+
+  /**
+   * Chooses a language based on the list of candidates detected
+   * @param languages a List of DetectedLanguages with certainty score
+   * @param fallbackLang the language code to use as a fallback
+   * @return a string of the chosen language
+   */
+  protected String resolveLanguage(List<DetectedLanguage> languages, String fallbackLang) {
+    String langStr;
+    if(languages.size() == 0) {
+      log.debug("No language detected, using fallback {}", fallbackLang);
+      langStr = fallbackLang;
+    } else {
+      DetectedLanguage lang = languages.get(0);
+      String normalizedLang = normalizeLangCode(lang.getLangCode());
+      if(langWhitelist.isEmpty() || langWhitelist.contains(normalizedLang)) {
+        log.debug("Language detected {} with certainty {}", normalizedLang, lang.getCertainty());
+        if(lang.getCertainty() >= threshold) {
+          langStr = normalizedLang;
+        } else {
+          log.debug("Detected language below threshold {}, using fallback {}", threshold, fallbackLang);
+          langStr = fallbackLang;
+        }
+      } else {
+        log.debug("Detected a language not in whitelist ({}), using fallback {}", lang.getLangCode(), fallbackLang);
+        langStr = fallbackLang;
+      }
+    }
+
+    if(langStr == null || langStr.length() == 0) {
+      log.warn("Language resolved to null or empty string. Fallback not configured?");
+      langStr = "";
+    }
+
+    return langStr;
+  }
+
+  /**
+   * Looks up language code in map (langid.lcmap) and returns mapped value
+   * @param langCode the language code string returned from detector
+   * @return the normalized/mapped language code, or langCode itself when the map has no entry for it
+   */
+  protected String normalizeLangCode(String langCode) {
+    if (!lcMap.containsKey(langCode)) {
+      return langCode;
+    }
+    String lc = lcMap.get(langCode);
+    log.debug("Doing langcode normalization mapping from {} to {}", langCode, lc);
+    return lc;
+  }
+
+  /**
+   * Returns the name of the field to map the current contents into, so that they are properly analyzed.  For instance
+   * if the currentField is "text" and the code is "en", the new field would by default be "text_en".
+   * This method also performs custom regex pattern replace if configured. If enforceSchema=true
+   * and the resulting field name doesn't exist, then null is returned.
+   *
+   * @param currentField The current field name
+   * @param language the language code
+   * @return The new schema field name, based on pattern and replace, or null if illegal
+   */
+  protected String getMappedField(String currentField, String language) {
+    String lc = mapLcMap.containsKey(language) ? mapLcMap.get(language) : language;
+    // Apply the map pattern/replacement to the field name first, then swap the langPattern match for the language code
+    String newFieldName = langPattern.matcher(mapPattern.matcher(currentField).replaceFirst(mapReplaceStr)).replaceFirst(lc);
+    if (enforceSchema && schema.getFieldOrNull(newFieldName) == null) {
+      log.warn("Unsuccessful field name mapping from {} to {}, field does not exist and enforceSchema=true; skipping mapping.", currentField, newFieldName);
+      return null;
+    }
+    log.debug("Doing mapping from {} with language {} to field {}", currentField, language, newFieldName);
+    return newFieldName;
+  }
+
+  /**
+   * Reports whether this processor is currently switched on
+   * @return true if enabled, else false
+   */
+  public boolean isEnabled() {
+    return this.enabled;
+  }
+
+  /** Switches this processor on or off; the value is what {@link #isEnabled()} reports afterwards. */
+  public void setEnabled(boolean enabled) {
+    this.enabled = enabled;
+  }
+
+
+  /**
+   * Concatenates the String values of the configured input fields, each followed by a space.
+   * A value contributes at most maxFieldValueChars and the result is cut at maxTotalChars.
+   * @param doc the document to read the input fields from
+   * @return the concatenated text, possibly empty
+   */
+  protected String concatFields(SolrInputDocument doc) {
+    StringBuilder sb = new StringBuilder(getExpectedSize(doc, inputFields));
+    fields:
+    for (String fieldName : inputFields) {
+      log.debug("Appending field {}", fieldName);
+      Collection<Object> fieldValues = doc.containsKey(fieldName) ? doc.getFieldValues(fieldName) : null;
+      if (fieldValues == null) {
+        continue;
+      }
+      for (Object content : fieldValues) {
+        if (!(content instanceof String)) {
+          log.warn("Field {} not a String value, not including in detection", fieldName);
+          continue;
+        }
+        String stringContent = (String) content;
+        sb.append(stringContent, 0, Math.min(stringContent.length(), maxFieldValueChars));
+        sb.append(" ");
+        if (sb.length() > maxTotalChars) {
+          sb.setLength(maxTotalChars);
+          // The total budget is used up: stop scanning all remaining values and fields
+          break fields;
+        }
+      }
+    }
+    return sb.toString();
+  }
+
+  /**
+   * Estimates how many characters {@link #concatFields(SolrInputDocument)} will produce, to presize its buffer.
+   * Counts every String value (capped at maxFieldValueChars) plus the one-character separator
+   * appended after it, and never returns more than maxTotalChars.
+   *
+   * @param doc           solr input document
+   * @param fields        fields to select
+   * @return expected size of string value
+   */
+  private int getExpectedSize(SolrInputDocument doc, String[] fields) {
+    int docSize = 0;
+    for (String field : fields) {
+      Collection<Object> contents = doc.containsKey(field) ? doc.getFieldValues(field) : null;
+      if (contents == null) {
+        continue;
+      }
+      for (Object content : contents) {
+        if (content instanceof String) {
+          // +1 for the separator; checking the cap after every value keeps the running sum small
+          docSize += Math.min(((String) content).length(), maxFieldValueChars) + 1;
+          if (docSize >= maxTotalChars) {
+            return maxTotalChars;
+          }
+        }
+      }
+    }
+    return docSize;
+  }
+}

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/4dd96a0e/solr/contrib/langid/src/main/java/org/apache/solr/update/processor/OpenNLPLangDetectUpdateProcessor.java
----------------------------------------------------------------------
diff --git a/solr/contrib/langid/src/main/java/org/apache/solr/update/processor/OpenNLPLangDetectUpdateProcessor.java b/solr/contrib/langid/src/main/java/org/apache/solr/update/processor/OpenNLPLangDetectUpdateProcessor.java
new file mode 100644
index 0000000..83f4fe4
--- /dev/null
+++ b/solr/contrib/langid/src/main/java/org/apache/solr/update/processor/OpenNLPLangDetectUpdateProcessor.java
@@ -0,0 +1,80 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.solr.update.processor;
+
+import java.lang.invoke.MethodHandles;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Locale;
+import java.util.Map;
+
+import org.apache.solr.common.SolrInputDocument;
+import org.apache.solr.request.SolrQueryRequest;
+import org.apache.solr.response.SolrQueryResponse;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import opennlp.tools.langdetect.Language;
+import opennlp.tools.langdetect.LanguageDetectorME;
+import opennlp.tools.langdetect.LanguageDetectorModel;
+
+/**
+ * Identifies the language of a set of input fields using <a href="https://opennlp.apache.org/">Apache OpenNLP</a>.
+ * <p>
+ * See "Language Detector" section of
+ * <a href="https://opennlp.apache.org/docs/1.8.3/manual/opennlp.html">https://opennlp.apache.org/docs/1.8.3/manual/opennlp.html</a>
+ */
+public class OpenNLPLangDetectUpdateProcessor extends LanguageIdentifierUpdateProcessor {
+
+  private final LanguageDetectorModel model;
+  private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
+  /** Maps ISO 639-3 (3-letter language code) to ISO 639-1 (2-letter language code); built from the JVM's ISO language list */
+  private static final Map<String,String> ISO639_MAP = make_ISO639_map();
+  
+  public OpenNLPLangDetectUpdateProcessor(SolrQueryRequest req, SolrQueryResponse rsp,
+      UpdateRequestProcessor next, LanguageDetectorModel model) {
+    super(req, rsp, next);
+    this.model = model;
+  }
+
+  @Override
+  protected List<DetectedLanguage> detectLanguage(SolrInputDocument doc) {
+    List<DetectedLanguage> languages = new ArrayList<>();
+    String content = concatFields(doc);
+    if (content.length() != 0) {
+      LanguageDetectorME ldme = new LanguageDetectorME(model);
+      for (Language language : ldme.predictLanguages(content)) {
+        // Codes missing from the map have no 2-letter form here; keep the detector's code instead of passing null
+        String iso2 = ISO639_MAP.get(language.getLang());
+        languages.add(new DetectedLanguage(iso2 != null ? iso2 : language.getLang(), language.getConfidence()));
+      }
+    } else {
+      log.debug("No input text to detect language from, returning empty list");
+    }
+    return languages;
+  }
+
+  private static Map<String,String> make_ISO639_map() {
+    Map<String,String> map = new HashMap<>();
+    for (String lang : Locale.getISOLanguages()) {
+      Locale locale = new Locale(lang);
+      map.put(locale.getISO3Language(), locale.getLanguage());
+    }
+    return map;
+  }
+}

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/4dd96a0e/solr/contrib/langid/src/main/java/org/apache/solr/update/processor/OpenNLPLangDetectUpdateProcessorFactory.java
----------------------------------------------------------------------
diff --git a/solr/contrib/langid/src/main/java/org/apache/solr/update/processor/OpenNLPLangDetectUpdateProcessorFactory.java b/solr/contrib/langid/src/main/java/org/apache/solr/update/processor/OpenNLPLangDetectUpdateProcessorFactory.java
new file mode 100644
index 0000000..ffe11aa
--- /dev/null
+++ b/solr/contrib/langid/src/main/java/org/apache/solr/update/processor/OpenNLPLangDetectUpdateProcessorFactory.java
@@ -0,0 +1,130 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.solr.update.processor;
+
+import java.io.IOException;
+import java.io.InputStream;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.solr.common.params.SolrParams;
+import org.apache.solr.common.util.NamedList;
+import org.apache.solr.core.SolrCore;
+import org.apache.solr.core.SolrResourceLoader;
+import org.apache.solr.request.SolrQueryRequest;
+import org.apache.solr.response.SolrQueryResponse;
+import org.apache.solr.util.SolrPluginUtils;
+import org.apache.solr.util.plugin.SolrCoreAware;
+
+import opennlp.tools.langdetect.LanguageDetectorModel;
+
+/**
+ * Identifies the language of a set of input fields using <a href="https://opennlp.apache.org/">Apache OpenNLP</a>.
+ * <p>
+ * The UpdateProcessorChain config entry can take a number of parameters
+ * which may also be passed as HTTP parameters on the update request
+ * and override the defaults. Here is the simplest processor config possible:
+ * 
+ * <pre class="prettyprint" >
+ * &lt;processor class=&quot;org.apache.solr.update.processor.OpenNLPLangDetectUpdateProcessorFactory&quot;&gt;
+ *   &lt;str name=&quot;langid.fl&quot;&gt;title,text&lt;/str&gt;
+ *   &lt;str name=&quot;langid.langField&quot;&gt;language_s&lt;/str&gt;
+ *   &lt;str name="langid.model"&gt;langdetect-183.bin&lt;/str&gt;
+ * &lt;/processor&gt;
+ * </pre>
+ * See <a href="http://wiki.apache.org/solr/LanguageDetection">http://wiki.apache.org/solr/LanguageDetection</a>
+ */
+public class OpenNLPLangDetectUpdateProcessorFactory extends UpdateRequestProcessorFactory
+  implements SolrCoreAware {
+
+  private static final String MODEL_PARAM = "langid.model";
+  private String modelFile;
+  private LanguageDetectorModel model;
+  protected SolrParams defaults;
+  protected SolrParams appends;
+  protected SolrParams invariants;
+  private SolrResourceLoader solrResourceLoader;
+
+  @Override
+  public void init( NamedList args )
+  {
+    if (args != null) {
+      Object o;
+      o = args.get("defaults");
+      if (o != null && o instanceof NamedList) {
+        defaults = ((NamedList) o).toSolrParams();
+      } else {
+        defaults = args.toSolrParams();
+      }
+      o = args.get("appends");
+      if (o != null && o instanceof NamedList) {
+        appends = ((NamedList) o).toSolrParams();
+      }
+      o = args.get("invariants");
+      if (o != null && o instanceof NamedList) {
+        invariants = ((NamedList) o).toSolrParams();
+      }
+
+      // Look for model filename in invariants, then in args, then defaults
+      if (invariants != null) {
+        modelFile = invariants.get(MODEL_PARAM);
+      }
+      if (modelFile == null) {
+        o = args.get(MODEL_PARAM);
+        if (o != null && o instanceof String) {
+          modelFile = (String)o;
+        } else {
+          modelFile = defaults.get(MODEL_PARAM);
+          if (modelFile == null) {
+            throw new RuntimeException("Couldn't load language model, will return empty languages always!");
+          }
+        }
+      }
+    }
+  }
+
+  @Override
+  public UpdateRequestProcessor getInstance(SolrQueryRequest req, SolrQueryResponse rsp, UpdateRequestProcessor next) {
+    // Process defaults, appends and invariants if we got a request
+    if (req != null) {
+      SolrPluginUtils.setDefaults(req, defaults, appends, invariants);
+    }
+    return new OpenNLPLangDetectUpdateProcessor(req, rsp, next, model);
+  }
+
+  private void loadModel() throws IOException {
+    InputStream is = null;
+    try{
+      if (modelFile != null) {
+        is = solrResourceLoader.openResource(modelFile);
+        model = new LanguageDetectorModel(is);
+      }
+    }
+    finally{
+      IOUtils.closeQuietly(is);
+    }
+  }
+
+  @Override
+  public void inform(SolrCore core){
+    solrResourceLoader = core.getResourceLoader();
+    try {
+      loadModel();
+    } catch (IOException e) {
+      throw new RuntimeException(e);
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/4dd96a0e/solr/contrib/langid/src/main/java/org/apache/solr/update/processor/TikaLanguageIdentifierUpdateProcessor.java
----------------------------------------------------------------------
diff --git a/solr/contrib/langid/src/main/java/org/apache/solr/update/processor/TikaLanguageIdentifierUpdateProcessor.java b/solr/contrib/langid/src/main/java/org/apache/solr/update/processor/TikaLanguageIdentifierUpdateProcessor.java
new file mode 100644
index 0000000..5c8146d
--- /dev/null
+++ b/solr/contrib/langid/src/main/java/org/apache/solr/update/processor/TikaLanguageIdentifierUpdateProcessor.java
@@ -0,0 +1,68 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.solr.update.processor;
+
+import java.lang.invoke.MethodHandles;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.solr.request.SolrQueryRequest;
+import org.apache.solr.response.SolrQueryResponse;
+import org.apache.tika.language.LanguageIdentifier;
+
+import org.apache.solr.common.SolrInputDocument;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Identifies the language of a set of input fields using Tika's
+ * LanguageIdentifier.
+ * The tika-core-x.y.jar must be on the classpath
+ * <p>
+ * See <a href="http://wiki.apache.org/solr/LanguageDetection">http://wiki.apache.org/solr/LanguageDetection</a>
+ * @since 3.5
+ */
+public class TikaLanguageIdentifierUpdateProcessor extends LanguageIdentifierUpdateProcessor {
+
+  private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
+
+  public TikaLanguageIdentifierUpdateProcessor(SolrQueryRequest req,
+      SolrQueryResponse rsp, UpdateRequestProcessor next) {
+    super(req, rsp, next);
+  }
+  
+  @Override
+  protected List<DetectedLanguage> detectLanguage(SolrInputDocument doc) {
+    List<DetectedLanguage> languages = new ArrayList<>();
+    String content = concatFields(doc);
+    if (content.length() != 0) {
+      LanguageIdentifier identifier = new LanguageIdentifier(content);
+      // FIXME: Hack - we get the distance from toString and calculate our own certainty score
+      Double distance = Double.parseDouble(tikaSimilarityPattern.matcher(identifier.toString()).replaceFirst("$1"));
+      // This formula gives: 0.02 => 0.8, 0.1 => 0.5 which is a better sweetspot than isReasonablyCertain()
+      Double certainty = 1 - (5 * distance);
+      if (certainty < 0)
+        certainty = 0d;
+      DetectedLanguage language = new DetectedLanguage(identifier.getLanguage(), certainty);
+      languages.add(language);
+      log.debug("Language detected as "+language+" with a certainty of "+language.getCertainty()+" (Tika distance="+identifier.toString()+")");
+    } else {
+      log.debug("No input text to detect language from, returning empty list");
+    }
+    return languages;
+  }
+}

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/4dd96a0e/solr/contrib/langid/src/main/java/org/apache/solr/update/processor/TikaLanguageIdentifierUpdateProcessorFactory.java
----------------------------------------------------------------------
diff --git a/solr/contrib/langid/src/main/java/org/apache/solr/update/processor/TikaLanguageIdentifierUpdateProcessorFactory.java b/solr/contrib/langid/src/main/java/org/apache/solr/update/processor/TikaLanguageIdentifierUpdateProcessorFactory.java
new file mode 100644
index 0000000..838311b
--- /dev/null
+++ b/solr/contrib/langid/src/main/java/org/apache/solr/update/processor/TikaLanguageIdentifierUpdateProcessorFactory.java
@@ -0,0 +1,94 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.solr.update.processor;
+
+import org.apache.solr.common.params.SolrParams;
+import org.apache.solr.common.util.NamedList;
+import org.apache.solr.core.SolrCore;
+import org.apache.solr.request.SolrQueryRequest;
+import org.apache.solr.response.SolrQueryResponse;
+import org.apache.solr.util.SolrPluginUtils;
+import org.apache.solr.util.plugin.SolrCoreAware;
+
+/**
+ * Factory for {@link TikaLanguageIdentifierUpdateProcessor}, which identifies the language of a set
+ * of input fields using Tika's LanguageIdentifier. The tika-core-x.y.jar must be on the classpath
+ * <p>
+ * The UpdateProcessorChain config entry can take a number of parameters
+ * which may also be passed as HTTP parameters on the update request
+ * and override the defaults. Here is the simplest processor config possible:
+ *
+ * <pre class="prettyprint" >
+ * &lt;processor class=&quot;org.apache.solr.update.processor.TikaLanguageIdentifierUpdateProcessorFactory&quot;&gt;
+ *   &lt;str name=&quot;langid.fl&quot;&gt;title,text&lt;/str&gt;
+ *   &lt;str name=&quot;langid.langField&quot;&gt;language_s&lt;/str&gt;
+ * &lt;/processor&gt;
+ * </pre>
+ * See <a href="http://wiki.apache.org/solr/LanguageDetection">http://wiki.apache.org/solr/LanguageDetection</a>
+ * @since 3.5
+ */
+public class TikaLanguageIdentifierUpdateProcessorFactory extends
+        UpdateRequestProcessorFactory implements SolrCoreAware, LangIdParams {
+
+  protected SolrParams defaults;
+  protected SolrParams appends;
+  protected SolrParams invariants;
+
+  /** Intentionally empty: this factory takes nothing from the core. */
+  @Override
+  public void inform(SolrCore core) {
+  }
+
+  /**
+   * Initializes the factory from solrconfig.xml in the same way as a RequestHandler: "defaults", "appends"
+   * and "invariants" sections are read if present; without a "defaults" section the args themselves are the defaults.
+   * @param args a NamedList with the configuration parameters
+   */
+  @Override
+  @SuppressWarnings("rawtypes")
+  public void init(NamedList args) {
+    if (args == null) {
+      return;
+    }
+    final Object defaultsSection = args.get("defaults");
+    defaults = defaultsSection instanceof NamedList
+        ? ((NamedList) defaultsSection).toSolrParams()
+        : args.toSolrParams();
+
+    final Object appendsSection = args.get("appends");
+    if (appendsSection instanceof NamedList) {
+      appends = ((NamedList) appendsSection).toSolrParams();
+    }
+
+    final Object invariantsSection = args.get("invariants");
+    if (invariantsSection instanceof NamedList) {
+      invariants = ((NamedList) invariantsSection).toSolrParams();
+    }
+  }
+
+  @Override
+  public UpdateRequestProcessor getInstance(SolrQueryRequest req,
+                                            SolrQueryResponse rsp, UpdateRequestProcessor next) {
+    // With a request at hand, merge the configured defaults, appends and invariants into it first
+    if (req != null) {
+      SolrPluginUtils.setDefaults(req, defaults, appends, invariants);
+    }
+    return new TikaLanguageIdentifierUpdateProcessor(req, rsp, next);
+  }
+
+
+}

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/4dd96a0e/solr/contrib/langid/src/main/java/org/apache/solr/update/processor/package.html
----------------------------------------------------------------------
diff --git a/solr/contrib/langid/src/main/java/org/apache/solr/update/processor/package.html b/solr/contrib/langid/src/main/java/org/apache/solr/update/processor/package.html
new file mode 100644
index 0000000..9bf453d
--- /dev/null
+++ b/solr/contrib/langid/src/main/java/org/apache/solr/update/processor/package.html
@@ -0,0 +1,23 @@
+<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<!-- not a package-info.java, because we already defined this package in core/ -->
+<html>
+<body>
+Various implementations of {@link org.apache.solr.update.processor.LanguageIdentifierUpdateProcessor} and their factories.
+</body>
+</html>

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/4dd96a0e/solr/contrib/langid/src/main/java/overview.html
----------------------------------------------------------------------
diff --git a/solr/contrib/langid/src/main/java/overview.html b/solr/contrib/langid/src/main/java/overview.html
new file mode 100644
index 0000000..9bc5c9a
--- /dev/null
+++ b/solr/contrib/langid/src/main/java/overview.html
@@ -0,0 +1,21 @@
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<html>
+<body>
+Apache Solr Search Server: Solr Language Identifier contrib
+</body>
+</html>

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/4dd96a0e/solr/contrib/langid/src/main/resources/org/apache/solr/update/processor/langdetect-profiles/af
----------------------------------------------------------------------
diff --git a/solr/contrib/langid/src/main/resources/org/apache/solr/update/processor/langdetect-profiles/af b/solr/contrib/langid/src/main/resources/org/apache/solr/update/processor/langdetect-profiles/af
new file mode 100644
index 0000000..be8b172
--- /dev/null
+++ b/solr/contrib/langid/src/main/resources/org/apache/solr/update/processor/langdetect-profiles/af
@@ -0,0 +1 @@
+{"freq":{"D":9246,"E":2445,"F":2510,"G":3299,"A":6930,"B":3706,"C":2451,"L":2519,"M":3951,"N":3334,"O":2514,"H":3034,"I":2837,"J":2196,"K":3663,"U":687,"T":2336,"W":2258,"V":2714,"Q":182,"P":3097,"S":8234,"R":3039,"Y":252,"X":214,"Z":422,"f":13583,"g":42805,"d":77385,"Feb":207,"e":240974,"b":21626,"c":4896,"a":128566,"n":127153,"o":86673,"l":57433,"m":31352,"j":4048,"k":45378,"h":17527,"i":140621,"w":24930,"v":32618,"u":35166,"t":82606,"s":102389,"r":98861,"q":199,"p":23331,"z":1187,"y":11757,"x":1123,"ï":264,"ë":2903,"ê":1053,"é":765,"á":212,"ü":233,"ö":184,"ó":216,"Eur":318,"Eng":637," l":3565," m":7731," n":16000," o":12065," h":7358," i":23795," j":1325," k":6363," d":33601," e":13358," f":1200," g":11018,"р":242,"с":306," a":8747,"т":161," b":8379," c":434," u":1931," t":8537," w":13128," v":24617," p":4859," s":15482," r":3617," J":2155," K":3559," H":2961," I":2185," N":3120," O":2318," L":2396," M":3803," B":3554," C":2109," A":6365," F":2371," G":3138," D":8986,"
  E":2271,"л":219,"к":266," Z":368," Y":241,"и":371,"о":333,"н":199," S":7708,"Ger":200," R":2881,"в":199," Q":162," P":2912,"а":481," W":2205," V":2322," U":571,"е":266," T":2130,"Fra":1006,"A ":345,"Da":804,"Co":478,"Ch":621,"Du":1025,"Do":201,"De":763,"Di":5828,"Fe":367,"Eu":354,"En":721,"El":212,"Ge":659,"Ga":319,"I ":452,"Fr":1217,"Fo":165,"Fi":216,"II ":246,"C ":278,"Au":486,"Ar":425,"At":187,"As":201,"D ":158,"Ba":648,"Af":2087,"Am":566,"An":491,"Ap":353,"Al":628,"Bu":243,"Br":778,"Ca":399,"Bi":180,"Be":880,"Bo":481,"Bl":161,"Kr":224,"Ko":657,"Le":490,"Li":504,"La":658,"Lu":245,"Lo":347,"Me":800,"Mi":548,"Ma":1360,"Mu":186,"Mo":627,"Ni":257,"Ne":763,"Na":666,"No":1092,"Ok":339,"Ol":206,"Her":157,"Gr":1326,"Go":356,"Ha":534,"He":680,"II":369,"Hi":301,"Ho":503,"Hu":294,"Hy":550,"In":919,"Is":158,"It":218,"Ja":713,"Je":157,"Jo":565,"Ju":623,"Ka":1489,"Ki":194,"Ke":447,"Un":253,"Tu":248,"Tr":236,"To":272,"Th":313,"Te":262,"Ta":276,"V ":280,"Sw":402,"Sy":292,"St":964,"Su":1
 701,"Wo":181,"Wi":534,"Wa":412,"We":720,"Vo":315,"Vr":251,"Vi":374,"Va":314,"Ve":689,"Pr":551,"S ":157,"Pe":310,"Pa":727,"Po":681,"Pi":230,"Os":236,"Oo":423,"Or":191,"Se":814,"Sc":197,"Si":387,"Sl":222,"Sk":201,"Sp":443,"So":680,"Ru":645,"Ry":194,"Sa":728,"Re":621,"Ri":222,"Ro":746,"SA":233,"Ra":223,"Gre":501,"Gri":383,"Gra":158,"b ":1179,"Gro":254,"a ":7054,"i ":2513,"gd":570,"ge":16432,"ga":1621,"gb":319,"fk":224,"fl":183,"fg":323,"ff":351,"fi":1111,"fh":169,"fs":1224,"fr":2334,"fu":174,"ft":300,"fo":725,"Int":180,"he":6229,"ha":2610,"gn":360,"gl":334,"gi":2135,"gh":921,"gg":418,"gu":592,"gt":1512,"gs":1974,"gr":3459,"go":1385,"dt":211,"du":998,"dw":506,"g ":10256,"ea":936,"eb":3497,"ec":406,"ed":5721,"de":18394,"dd":606,"dg":161,"di":29432,"dh":249,"dj":173,"dm":299,"do":2521,"ds":2062,"dr":1453,"ew":3034,"eu":3603,"ev":2016,"ey":309,"fa":570,"h ":864,"Ind":251,"fd":469,"fe":948,"eh":993,"eg":3187,"ef":995,"ee":12296,"el":15653,"ek":7920,"ei":5726,"ep":2393,"eo":692,"en":27638,"e
 m":4686,"et":10282,"es":15156,"er":33393,"ca":479,"e ":78745,"by":1025,"br":1953,"bu":1057,"bo":2123,"bl":1117,"bi":1966,"bb":156,"be":8513,"db":222,"In ":319,"da":3617,"f ":4067,"ct":207,"co":446,"ck":502,"ci":340,"ch":1526,"ce":547,"c ":311,"az":190,"ay":279,"ba":2057,"d ":15502,"at":11369,"as":9342,"ar":11432,"aw":597,"av":407,"au":883,"ak":2797,"al":9554,"ai":1291,"aj":155,"ap":2087,"am":3989,"an":36357,"ac":615,"ad":4564,"aa":18307,"ab":1064,"ag":2729,"ah":292,"ae":907,"af":1901,"nu":917,"nt":6760,"ns":9243,"nr":212,"no":2885,"nn":1621,"ny":191,"nw":666,"nv":455,"oe":6026,"of":3797,"oc":387,"od":1636,"oa":178,"ob":729,"om":5480,"on":10533,"ok":2525,"ol":5346,"oi":587,"og":2271,"oh":382,"ot":3827,"os":3306,"ov":1152,"ou":2993,"op":4558,"oo":12667,"or":14221,"r ":19504,"ow":1144,"pe":3683,"pg":229,"pa":2371,"pl":1195,"lê":351,"po":1932,"ph":223,"pi":1008,"lo":3369,"lm":315,"ll":2990,"ls":2634,"lp":392,"lw":311,"lv":239,"lu":1548,"lt":993,"ly":716,"o ":2083,"md":261,"ma":3853,"mb
 ":2182,"mg":224,"me":9151,"mi":2940,"mm":802,"mp":1223,"mo":1485,"ië":1437,"mt":249,"ms":966,"mu":1085,"p ":4720,"na":6444,"nb":510,"nc":507,"nd":12581,"ne":5737,"nf":203,"ng":9804,"nh":460,"ni":6127,"nj":300,"nk":2057,"nl":616,"nm":203,"jo":532,"ki":2683,"kh":210,"kg":239,"ke":8584,"ka":6722,"m ":5913,"kw":457,"ky":282,"ks":2318,"kt":2084,"ku":1443,"ko":3908,"kr":2375,"kk":1579,"kl":2200,"km":469,"li":9515,"lh":279,"lk":1158,"lj":705,"le":10290,"ld":1944,"lg":1526,"lf":717,"la":8341,"lb":446,"n ":58065,"hr":313,"ht":702,"hu":1684,"hi":1067,"ho":3048,"dé":160,"id":5034,"ic":1058,"ib":451,"ia":2568,"ig":5540,"if":581,"ie":47836,"hy":348,"k ":9212,"ir":2359,"is":17403,"it":9361,"iu":405,"iv":1008,"iw":219,"ik":8953,"il":3774,"im":1386,"in":25004,"io":1984,"eë":1032,"ip":899,"je":609,"ji":572,"iz":156,"l ":8172,"ja":1960,"wy":994,"z ":242,"wi":1800,"wo":4179,"vy":166,"y ":4684,"wa":9856,"we":6959,"vl":1196,"vi":4040,"vu":178,"vr":662,"vo":4078,"uw":282,"uu":992,"ve":5906,"va":16173,
 "x ":845,"ui":7822,"uk":678,"ul":2052,"ue":905,"ug":1045,"ur":5410,"us":5098,"ut":907,"um":1711,"un":2596,"up":170,"ty":1434,"tu":2643,"tt":1277,"tw":1177,"tv":217,"ub":1182,"ua":728,"ud":950,"uc":160,"w ":232,"to":5433,"tm":201,"tl":667,"ts":3814,"tr":4026,"tg":532,"te":20430,"tk":279,"tj":177,"ti":5658,"th":1701,"tb":213,"ta":9118,"su":1177,"sv":424,"ss":2799,"st":17122,"sy":1309,"sw":531,"sl":1811,"sk":5006,"sn":242,"sm":693,"sp":2566,"oë":412,"so":3731,"sr":312,"sd":385,"sc":448,"sf":208,"se":15556,"sh":473,"sg":396,"sj":338,"si":8436,"u ":1834,"sa":2367,"sb":577,"rr":652,"rs":6262,"rt":4139,"ru":2543,"rv":1198,"rw":1199,"ry":2450,"rp":1265,"ro":8165,"rn":1586,"rm":2087,"rl":1734,"rk":2996,"ri":11752,"rh":614,"rg":2653,"rf":378,"re":10923,"rd":7372,"rc":234,"rb":955,"ra":7710,"t ":22731,"qu":168,"s ":35284,"px":614,"Hy ":529,"py":231,"pt":765,"pu":844,"pp":1058,"pr":3258,"ps":659,"wê":320,"zi":170,"ze":169,"za":209,"yg":162,"ye":406,"yf":643,"yd":927,"yw":439,"ys":1141,"yn":10
 41,"yl":288,"yk":1145,"Apr":247,"Aug":272,"Afr":2048,"Ame":464,"Ber":218,"Bel":171,"Bre":163,"Bra":191,"Bri":282,"Des":273,"Daa":460,"Chr":224,"Cha":171,"ër":307,"ël":325,"êr":697,"ë ":1979,"ê ":310,"é ":228,"Dit":1028,"Die":4537,"Dui":918,"Ned":417,"Nas":187,"Nov":238,"Noo":595,"Okt":256,"Oli":158,"Oos":361,"Par":313,"Pro":177,"Pre":186,"SA ":161,"Ita":207,"Jan":348,"Joh":290,"Jul":297,"Jun":245,"Kaa":543,"Kan":220,"Kat":191,"Kar":171,"Ker":270,"Kon":276,"Lat":181,"Lit":162,"Mei":281,"Mar":370,"Maa":286,"Mon":210,"Mid":157,"Wil":165,"Wes":439,"Vry":192,"Vol":161,"êre":674,"Swe":193,"Sy ":252,"Sui":1515,"Sta":443,"Ste":208,"Sep":228,"Spa":253,"Rus":560,"Sch":162,"Rep":214,"Rom":176,"Ver":555,"Uni":236,"The":196,"Tur":159,"bin":400,"blo":205,"bli":525,"bla":215,"boe":246,"boo":276,"bor":587,"bou":330,"ban":283,"bal":289,"bai":191,"baa":372,"bas":270,"bar":272,"beh":366,"beg":372,"bee":325,"bed":285,"ber":1916,"bel":540,"bek":1148,"bew":349,"bev":630,"bes":1308,"bet":510,"bie":
 1052,"ce ":276,"bri":159,"bro":237,"bra":211,"bre":258,"bru":1062,"bur":584,"by ":693,"am ":1182,"ake":292,"al ":2759,"ain":204,"ak ":856,"aie":241,"agt":446,"anu":467,"ann":632,"ant":1705,"ans":3841,"ane":404,"ang":1856,"ani":742,"anj":191,"ank":961,"ap ":635,"ana":788,"anc":195,"and":5528,"amm":186,"amp":480,"ami":512,"ame":657,"amb":236,"ama":204,"alt":231,"als":160,"all":667,"alk":171,"alg":320,"ali":1276,"ald":217,"ale":2352,"alf":209,"ala":367,"an ":18298,"aks":261,"akt":740,"akl":166,"abe":229,"abi":201,"aby":216,"ae ":624,"aag":175,"aad":172,"aak":679,"aai":350,"aan":6190,"aal":1515,"aam":1083,"aas":579,"aar":5293,"aap":567,"aat":1563,"ad ":2565,"afg":266,"ai ":311,"age":184,"afd":268,"adm":206,"adi":436,"ade":539,"ag ":1304,"ads":176,"ach":166,"ada":249,"af ":494,"at ":6755,"arg":256,"are":965,"ard":1124,"ara":390,"aro":332,"arn":185,"arm":157,"arl":301,"ark":397,"ari":1177,"arv":249,"ars":463,"art":1494,"ary":171,"asi":1669,"ase":210,"aso":169,"ar ":3216,"apa":189,"app":41
 8,"aps":269,"as ":5230,"awe":308,"awi":169,"ata":346,"ast":673,"ass":518,"ato":426,"ate":1382,"ati":871,"ats":404,"atu":409,"aty":167,"aus":156,"jaa":1087,"jar":470,"je ":175,"joe":306,"jin":161,"jie":306,"ito":170,"itt":191,"its":1623,"isk":182,"ism":266,"iss":374,"ist":1582,"ita":608,"ite":1331,"itg":386,"iti":469,"ius":176,"ium":203,"ivi":590,"ive":294,"is ":12546,"ion":1252,"eër":158,"ipa":265,"ir ":1648,"isi":1018,"ise":601,"isa":220,"ire":181,"it ":3772,"kil":644,"kie":536,"kin":914,"km ":266,"kgr":173,"kee":210,"kei":339,"kel":962,"ken":2090,"kep":166,"ker":1342,"ke ":3014,"kra":345,"kse":472,"kry":1085,"kri":662,"kou":249,"kor":369,"kop":214,"koo":391,"kon":866,"kom":903,"kol":246,"koe":157,"ks ":710,"kke":1272,"kki":178,"klu":430,"kle":511,"kla":387,"kli":749,"kat":157,"kar":183,"kas":204,"kap":818,"kan":1256,"kal":611,"kaa":1596,"ka ":1388," Ga":319," Ge":658," Fo":161," Fr":1217," Fi":213," Ha":534," He":680," Go":354," Gr":1318," Hy":549," Hu":294," Ho":502," II":202," 
 Hi":301," Ja":710," Is":157," It":218," In":916,"han":779," Ka":1486,"hal":311," Ke":447,"haw":164," Ki":192,"har":356," Jo":563," Ju":622,"haa":238,"had":164," La":657," Le":488," Li":502," Ko":657," Kr":224," Ma":1348," Mi":547," Me":799,"he ":399," Lo":346," Lu":244," Ne":762," Na":662," Ni":257," Mo":624," Mu":186,"hel":273,"hei":994,"hee":465,"hed":169,"het":2911,"her":350,"hem":255," Ap":349," Am":563," An":491," Al":626," Af":2082," Ba":645," Au":486," At":187," As":200," Ar":422," Be":877,"hie":290," Bi":179," Bl":161," Bo":479," Br":777," Bu":243,"his":173," Ca":384," Ch":612," Co":473," Da":803," Di":5802," De":761," Do":196," Du":1024," El":212," En":720," Eu":354," Fe":367," Wo":179," Wi":530," We":720," Wa":412,"god":193,"gs ":887,"gor":522,"gro":2150,"gra":537,"gri":320,"gre":401," Os":236," Or":191," Oo":422," Po":674," Pi":229," Pe":309," Pa":725,"gst":406," No":1092," Ol":205," Ok":339,"gte":962,"gti":391," Ra":221," Ro":743," Re":620," Ri":222," Pr":547,"gus":284,"
  Sy":292," Sw":400," Su":1700," St":953," Ta":273," Th":307," Te":261," Tr":236," To":270," Ry":194," Ru":645," Sa":724," Si":385," Sc":196," Se":811," So":678," Sp":441," Sk":201," Sl":222," Va":313," Ve":669," Vi":371," Vo":314," Vr":251," Tu":243," Un":253," ja":1102,"ial":357,"ian":256," in":12303,"iaa":736," is":11238," ka":1533," ki":531," ke":481,"id ":2425," ha":612," he":3438," gr":2075," go":365,"ia ":794," hy":292," hi":477," ho":1750," hu":727,"iet":320,"ieu":180,"iew":413," ni":722,"iel":277," ne":437,"ien":998," na":2339,"ier":2228,"ies":4471,"ied":1248,"ief":177,"iek":2103," mu":691,"ig ":1346," mo":667," om":1497," on":2106," of":1952,"ifi":218," no":1205," le":910," li":598," n ":10980," la":1290," ku":387,"ich":258,"ie ":34696," km":407," kl":879,"ica":209," kr":319," ko":1672," me":4100," mi":830,"ids":257," ma":1329," lu":186,"idi":291,"ide":993,"idd":457,"ida":156," lo":197," af":820," aa":2320," ad":269," am":322," an":759," ak":286," al":829," ar":263," at":22
 9," as":2284," ba":599,"il ":459," bi":320," be":5430," bo":565," bl":263," by":612," bu":213," br":340,"ika":2950,"igd":381,"ige":1604,"igh":698,"igi":270,"igg":185,"igt":498,"igs":156,"ik ":2305," en":9738,"imp":231," ei":517," el":502,"ime":187," ek":223," ee":1730,"ind":1030,"ina":506," fa":191,"inn":302," fo":227,"int":638,"ins":1349,"ine":545,"ing":6095," fi":368,"ini":615,"ink":417," ge":8191," ga":169,"inw":455,"ikk":629," ch":185,"ike":1814,"ila":498," da":1923,"in ":12178,"iku":209,"iks":287," do":1111,"ilo":514,"ill":662," dr":523," de":3947,"ilj":228,"ili":684,"ild":294," di":25510,"imb":245,"eë ":693,"io ":196," du":309," wê":298,"hom":166,"hou":360,"hoo":1325,"hoe":410," wy":201,"hul":552,"hui":260,"hri":224,"ht ":578," ru":233," sa":888," se":2315," si":590," sl":329," sk":1250," sp":887," so":2211," ra":237," re":1576," ri":825," ro":614," pr":1589," s ":207," px":614,"hy ":302," ou":447,"hum":674," oo":2639," op":2809," or":325," pe":402," pa":556," pl":641," po":
 737," lê":242," wa":7840," we":1395," wo":2888," wi":454," va":14670," ve":4043," vo":2359," vr":575," vi":2068," vl":594," ty":439," tw":582," tu":692," ui":1746," ta":895," sw":227," sy":1183," st":4293," su":859," tr":387," to":1857," th":729," ti":190," te":2715,"ffe":165,"fer":157,"fel":155,"fha":158,"fge":290,"fam":176,"fde":429,"eta":359,"ete":1299,"eti":372,"esp":358,"eso":210,"est":2951,"ess":405,"eun":234,"eto":320,"etr":438,"ets":217,"ett":493,"eve":456,"eva":262,"evo":907,"evi":274,"eur":2292,"eus":242,"ewi":337,"ewe":1704,"ewo":449,"ey ":181,"ewa":222,"epe":254,"er ":10617,"epa":228,"eor":221,"es ":4626,"ept":277,"epu":400,"epr":184,"erk":2067,"erl":875,"eri":1765,"erg":1022,"erh":416,"ere":1861,"erf":286,"erd":1514,"era":1470,"erb":529,"et ":6083,"esk":1018,"esl":228,"esi":976,"ese":3607,"eu ":338,"erv":860,"erw":949,"err":349,"ert":1101,"ers":4583,"ern":1142,"erm":861,"erp":342,"ero":382,"ekg":155,"ekk":206,"eko":474,"eks":950,"ekt":701,"en ":13492,"ela":904,"eld":11
 99,"elf":322,"ele":2593,"eli":1906,"elj":427,"elg":226,"elk":209,"ell":778,"elo":234,"els":1983,"elt":333,"ely":255,"emb":839,"ema":484,"eme":1266,"emo":181,"emi":456,"ep ":699,"ene":1142,"enh":254,"eng":314,"enb":269,"ena":610,"end":3112,"eno":500,"enn":400,"enk":275,"eni":1151,"ens":2864,"ent":2318,"ego":497,"ege":690,"egi":516,"eha":370,"egr":238,"egs":217,"egt":193,"eho":266,"ehe":259,"ek ":1799,"eis":330,"eil":544,"ein":1010,"eie":633,"eid":1307,"el ":3516,"eit":680,"eke":2739,"eka":220,"em ":967,"gin":784,"gie":714,"ght":548,"gep":249,"gen":1564,"get":297,"ger":1248,"ges":2014,"gev":788,"gew":944,"gee":448,"ged":475,"geb":2499,"geh":356,"geg":181,"gem":756,"gel":1995,"gek":350,"gde":427,"ge ":1916,"gaa":266,"gan":539,"ga ":157,"fst":852,"fri":2089,"for":371,"fie":369,"fil":208,"fin":174,"fis":177,"da ":327,"de ":6409,"daa":645,"dag":700,"dae":480,"dat":659,"dan":233,"dam":165,"dde":490,"ch ":316,"cha":160,"ck ":233,"che":490,"ed ":1090,"eba":159,"ebe":354,"ebi":752,"ebo":768,"
 ebr":1168,"ei ":821,"ega":168,"eek":631,"een":2520,"eel":2072,"eem":410,"eed":587,"ees":884,"eer":3295,"eeu":449,"eet":195,"edi":638,"ede":2561,"eda":161,"eg ":316,"eds":321,"edr":340,"ee ":892,"ef ":280,"dwe":310,"dus":171,"dor":875,"doo":416,"don":160,"dom":227,"ds ":353,"dmi":211,"doe":283,"dst":428,"dui":309,"dri":421,"dra":423,"dry":204,"dsk":181,"dse":527,"dia":294,"der":4829,"des":476,"deu":1676,"dee":1279,"del":1695,"dek":186,"den":1206,"do ":172,"din":875,"dio":177,"dis":425,"dit":656,"die":24964,"dig":1168,"dik":198,"rhe":301,"rga":496,"rgi":335,"rge":595,"ret":312,"res":944,"rg ":777,"rea":245,"ree":1091,"ref":257,"red":294,"rei":545,"reg":1039,"ren":1300,"rek":765,"rel":674,"rep":191,"rf ":180,"rdo":215,"rdi":841,"rde":1873,"re ":2607,"rd ":3667,"ras":532,"rat":587,"rbi":190,"rba":160,"rbe":287,"rag":291,"ran":2011,"ram":317,"ral":832,"rak":247,"raa":1046,"raf":284,"rad":331,"rs ":1922,"ros":273,"rot":330,"rom":305,"ron":1072,"roo":1778,"rop":575,"rou":212,"rov":708,"rod
 ":199,"rol":315,"roe":1277,"rog":195,"rno":196,"rp ":728,"rna":508,"rne":469,"rmo":164,"rma":539,"rme":324,"rmi":175,"rlo":320,"rli":409,"rle":270,"rla":508,"rks":184,"rko":248,"rki":199,"rkl":203,"rke":440,"rka":271,"rm ":692,"rio":174,"rit":493,"ris":571,"riv":501,"rig":863,"ril":278,"rik":3384,"rin":1384,"ria":924,"ric":236,"rie":2029,"rk ":1040,"rwe":410,"rwy":498,"ryf":393,"rui":1143,"rug":256,"rum":244,"ruk":231,"rus":225,"rva":502,"rvl":353,"rvo":192,"rwa":171,"ry ":383,"rsk":872,"rsi":432,"rso":249,"rsp":591,"rsa":225,"rse":478,"rta":186,"rst":1083,"rtk":160,"rto":274,"rte":620,"rti":334,"rua":209,"rty":351,"rt ":1413,"rre":272,"saa":540,"sal":170,"sam":303,"san":408,"sas":204,"sa ":155,"ryw":338,"rys":282,"ryk":576,"sge":305,"sie":4039,"sid":185,"sia":299,"sit":436,"sis":296,"sip":279,"sin":541,"sio":799,"sil":194,"sim":173,"sik":231,"sif":160,"sig":289,"sbu":231,"se ":9840,"sch":268,"ser":501,"ses":400,"set":250,"seu":239,"sea":162,"see":618,"sed":264,"sen":1323,"sem":298,
 "sel":1093,"sek":186,"spo":405,"spr":756,"spe":934,"spa":260,"sow":508,"som":247,"son":545,"soo":954,"soe":195,"sok":377,"st ":267,"sli":202,"slu":297,"sky":183,"sla":1006,"sle":205,"ski":804,"sko":594,"skr":1152,"sku":244,"ska":1212,"ske":665,"sië":283,"sma":173,"sme":382,"sse":1275,"ssa":198,"ssi":922,"ste":6829,"sta":5065,"sto":805,"sti":1396,"stu":693,"str":1673,"sty":226,"sui":596,"sve":167,"sy ":1199,"swa":313,"tal":1301,"taa":2499,"tad":2323,"tau":165,"tat":456,"tas":164,"tan":1021,"te ":8469,"ta ":339,"pa ":202,"pe ":459,"par":608,"pas":176,"paa":333,"pal":324,"pan":428,"pge":207,"pen":295,"per":1379,"pes":438,"pee":201,"pel":568,"pla":660,"lê ":268,"pli":169,"ple":241,"pie":480,"por":394,"poo":160,"pos":197,"pol":518,"ppy":184,"ppe":636,"pst":229,"pub":435,"pte":575,"pra":251,"pri":484,"pre":726,"pro":1677,"pun":246,"px ":614,"py ":166,"ra ":424,"ngo":161,"ngr":289,"ngs":1292,"nge":2327,"nhe":276,"nel":314,"nen":189,"nem":225,"ner":1014,"net":468,"nes":533,"ng ":4906,"nee
 ":762,"nce":206,"ne ":1530,"ndr":216,"nds":657,"ndo":326,"ndi":878,"nde":5081,"nda":453,"nal":790,"nam":291,"nad":316,"naf":372,"nab":229,"naa":1198,"nd ":4245,"nat":282,"nas":677,"na ":1572,"nwo":542,"nus":209,"nua":266,"ntw":393,"nto":201,"nts":300,"ntr":543,"nti":571,"ntl":164,"nta":457,"nte":1815,"nst":787,"nse":3345,"nsi":1079,"nsl":207,"nsk":498,"nt ":1757,"ns ":2476,"nog":456,"noe":477,"noo":659,"nom":368,"nne":904,"nni":442,"nië":246,"nli":373,"nke":345,"nkl":391,"nks":179,"nkr":453,"nje":156,"nig":640,"nie":1831,"nk ":274,"niv":210,"nis":1512,"nin":804,"ogr":272,"ogi":423,"oi ":216,"oha":228,"oeë":178,"ok ":1432,"ol ":554,"ock":164,"oe ":303,"ode":551,"odi":176,"ods":177,"of ":2323,"oek":499,"oel":276,"oem":563,"oeg":231,"oei":336,"oer":752,"oes":295,"oet":302,"oen":602,"oep":714,"odu":188,"oed":477,"og ":895,"ofs":803,"oew":261,"od ":254,"obe":382,"oud":510,"oue":197,"ote":350,"ott":175,"ots":913,"oto":266,"ost":637,"osi":266,"ose":346,"oss":176,"oso":190,"owa":484,"owe"
 :208,"ovi":678,"ove":370,"ous":302,"our":167,"out":306,"opo":205,"opp":449,"ope":438,"opg":213,"opa":195,"os ":1171,"oon":731,"ool":561,"oom":393,"ook":1376,"ooi":288,"oof":1146,"oog":389,"ood":288,"or ":1152,"oot":1351,"oos":958,"oor":4776,"oop":341,"ork":260,"orl":386,"orm":964,"orp":858,"ord":4583,"ore":773,"org":587,"ori":1212,"ou ":999,"ort":1219,"ors":871,"orw":195,"ot ":1528,"orb":186,"ora":235,"ola":171,"on ":1522,"oli":772,"oll":288,"olk":702,"ole":263,"olg":904,"ols":270,"olo":636,"om ":1870,"okk":553,"ona":980,"ond":1915,"one":1178,"ong":620,"oni":1012,"onl":220,"onk":232,"onn":184,"ono":391,"ons":511,"ont":1339,"oma":425,"ome":845,"omi":324,"omm":454,"omp":297,"oms":595,"op ":2264,"la ":334,"le ":3834,"lf ":175,"lde":601,"laa":982,"lad":180,"lag":434,"lak":490,"lan":4154,"lar":155,"lat":361,"las":433,"ld ":695,"kus":410,"kun":548,"kul":242,"kwe":204,"kwa":191,"kte":822,"kst":257,"ksi":463,"ktr":342,"ktu":210,"kti":247,"kto":369,"ls ":1008,"lon":293,"lom":430,"loo":382,"l
 oe":423,"log":655,"los":274,"lië":349,"lti":157,"lub":411,"lug":221,"lst":643,"lte":252,"lse":623,"lge":754,"lew":250,"leu":193,"les":329,"let":347,"ler":415,"lem":358,"len":1056,"lek":605,"lei":1010,"leg":257,"lee":477,"led":218,"lg ":483,"lo ":169,"lla":325,"lle":1578,"lli":615,"lke":200,"lki":447,"ljo":223,"ll ":176,"lja":430,"lit":831,"lis":504,"leë":449,"lin":1208,"lim":201,"lid":165,"lia":364,"lik":2917,"lig":818,"lie":1618,"ma ":226,"mb ":655,"maa":1244,"mag":221,"mar":331,"mas":207,"mal":270,"man":726,"mat":394,"mba":172,"mbi":179,"mbe":814,"mbo":161,"me ":936,"mde":163,"med":223,"mee":1533,"met":2981,"mes":247,"mer":991,"mel":330,"men":1550,"lui":390,"lus":194,"lwe":213,"lyk":221,"lyn":187,"mpi":220,"mpe":208,"mpo":176,"mpt":267,"ms ":488,"moe":196,"mod":233,"mon":329,"mst":248,"mus":488,"mun":417,"ër ":180,"mge":191,"min":806,"mil":465,"mit":231,"mig":184,"mie":523,"mid":310,"ië ":1136,"mme":353,"wêr":319,"yst":183,"ys ":680,"ywe":370,"ye ":306,"yf ":380,"yde":281,"yd
 s":165,"yd ":230,"yn ":461,"yns":175,"yk ":810,"wys":531,"wor":2620,"woo":760,"won":526,"we ":1260,"wes":799,"wer":1583,"wet":305,"wen":427,"wel":545,"weg":270,"wee":1257,"wis":166,"wit":342,"wie":194,"win":417,"wil":177,"wik":231,"wan":300,"wat":5174,"war":532,"was":2236,"waa":1031,"vry":194,"vro":313,"vir":1570,"vin":921,"vie":880,"vis":289,"vla":709,"vlo":280,"voe":444,"vol":1592,"voo":1083,"vor":625,"ver":4566,"ven":170,"vem":236,"vel":250,"vee":302,"val":319,"van":14723,"vat":155,"vaa":414,"uwe":229,"uur":863,"usl":180,"usi":606,"use":380,"ust":585,"uss":1129,"ute":176,"uto":171,"us ":1998,"ure":395,"urg":669,"uri":191,"urk":167,"uro":352,"urs":211,"urt":189,"ur ":2547,"umb":689,"ume":172,"unt":325,"uns":289,"uni":820,"und":530,"um ":614,"ult":270,"ull":459,"uli":358,"un ":219,"uid":2285,"uik":850,"uim":162,"uis":508,"uk ":200,"uit":3378,"ul ":272,"ugb":161,"ugu":278,"ude":184,"udi":240,"ue ":322,"ug ":159,"ub ":406,"uar":522,"ubl":464,"ud ":181,"tyn":228,"ty ":384,"tur":232,"t
 us":988,"tuu":617,"tui":232,"tud":171,"tyd":628,"twi":269,"twe":751,"ts ":533,"tre":1022,"tra":1128,"tri":607,"tru":366,"tro":780,"tse":746,"tsk":298,"tsl":425,"tst":993,"tte":641,"tti":226,"to ":272,"tof":244,"toe":713,"tob":268,"tot":1108,"tom":182,"ton":586,"tol":317,"tor":808,"too":280,"til":187,"tik":334,"tie":1846,"tig":1053,"tis":241,"tin":826,"tio":267,"thu":695,"tkl":165,"tli":191,"tla":301,"tem":732,"ten":1059,"tei":844,"tek":528,"tel":2135,"tee":779,"teg":166,"ted":237,"th ":270,"teu":212,"tes":357,"ter":4231,"tge":442,"the":380},"n_words":[1541130,1808182,1328687],"name":"af"}
\ No newline at end of file


Mime
View raw message