lucene-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From mikemcc...@apache.org
Subject svn commit: r1575397 [1/3] - in /lucene/dev/branches/lucene5487: ./ dev-tools/ dev-tools/idea/solr/contrib/dataimporthandler/ dev-tools/idea/solr/contrib/map-reduce/ dev-tools/scripts/ lucene/ lucene/analysis/ lucene/analysis/common/ lucene/analysis/co...
Date Fri, 07 Mar 2014 20:50:47 GMT
Author: mikemccand
Date: Fri Mar  7 20:50:45 2014
New Revision: 1575397

URL: http://svn.apache.org/r1575397
Log:
LUCENE-5487: merge trunk

Added:
    lucene/dev/branches/lucene5487/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestAllDictionaries2.java
      - copied unchanged from r1575347, lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestAllDictionaries2.java
    lucene/dev/branches/lucene5487/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestConv.java
      - copied unchanged from r1575347, lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestConv.java
    lucene/dev/branches/lucene5487/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestIgnore.java
      - copied unchanged from r1575347, lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestIgnore.java
    lucene/dev/branches/lucene5487/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/conv.aff
      - copied unchanged from r1575347, lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/conv.aff
    lucene/dev/branches/lucene5487/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/conv.dic
      - copied unchanged from r1575347, lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/conv.dic
    lucene/dev/branches/lucene5487/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/ignore.aff
      - copied unchanged from r1575347, lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/ignore.aff
    lucene/dev/branches/lucene5487/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/ignore.dic
      - copied unchanged from r1575347, lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/ignore.dic
    lucene/dev/branches/lucene5487/lucene/core/src/test/org/apache/lucene/index/TestExceedMaxTermLength.java
      - copied unchanged from r1575347, lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/index/TestExceedMaxTermLength.java
    lucene/dev/branches/lucene5487/lucene/misc/src/java/org/apache/lucene/index/sorter/BlockJoinComparatorSource.java
      - copied unchanged from r1575347, lucene/dev/trunk/lucene/misc/src/java/org/apache/lucene/index/sorter/BlockJoinComparatorSource.java
    lucene/dev/branches/lucene5487/solr/core/src/test-files/solr/collection1/conf/schema-field-sort-values.xml
      - copied unchanged from r1575347, lucene/dev/trunk/solr/core/src/test-files/solr/collection1/conf/schema-field-sort-values.xml
    lucene/dev/branches/lucene5487/solr/core/src/test/org/apache/solr/cloud/hdfs/HdfsWriteToMultipleCollectionsTest.java
      - copied unchanged from r1575347, lucene/dev/trunk/solr/core/src/test/org/apache/solr/cloud/hdfs/HdfsWriteToMultipleCollectionsTest.java
    lucene/dev/branches/lucene5487/solr/core/src/test/org/apache/solr/schema/WrappedIntField.java
      - copied unchanged from r1575347, lucene/dev/trunk/solr/core/src/test/org/apache/solr/schema/WrappedIntField.java
    lucene/dev/branches/lucene5487/solr/core/src/test/org/apache/solr/search/TestFieldSortValues.java
      - copied unchanged from r1575347, lucene/dev/trunk/solr/core/src/test/org/apache/solr/search/TestFieldSortValues.java
    lucene/dev/branches/lucene5487/solr/core/src/test/org/apache/solr/update/TestExceedMaxTermLength.java
      - copied unchanged from r1575347, lucene/dev/trunk/solr/core/src/test/org/apache/solr/update/TestExceedMaxTermLength.java
    lucene/dev/branches/lucene5487/solr/test-framework/src/java/org/apache/solr/cloud/StopableIndexingThread.java
      - copied unchanged from r1575347, lucene/dev/trunk/solr/test-framework/src/java/org/apache/solr/cloud/StopableIndexingThread.java
Removed:
    lucene/dev/branches/lucene5487/lucene/misc/src/java/org/apache/lucene/index/sorter/BlockJoinSorter.java
    lucene/dev/branches/lucene5487/lucene/misc/src/java/org/apache/lucene/index/sorter/NumericDocValuesSorter.java
Modified:
    lucene/dev/branches/lucene5487/   (props changed)
    lucene/dev/branches/lucene5487/dev-tools/   (props changed)
    lucene/dev/branches/lucene5487/dev-tools/idea/solr/contrib/dataimporthandler/dataimporthandler.iml
    lucene/dev/branches/lucene5487/dev-tools/idea/solr/contrib/map-reduce/map-reduce.iml
    lucene/dev/branches/lucene5487/dev-tools/scripts/checkJavaDocs.py
    lucene/dev/branches/lucene5487/dev-tools/scripts/smokeTestRelease.py
    lucene/dev/branches/lucene5487/lucene/   (props changed)
    lucene/dev/branches/lucene5487/lucene/CHANGES.txt   (contents, props changed)
    lucene/dev/branches/lucene5487/lucene/analysis/   (props changed)
    lucene/dev/branches/lucene5487/lucene/analysis/common/   (props changed)
    lucene/dev/branches/lucene5487/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
    lucene/dev/branches/lucene5487/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java
    lucene/dev/branches/lucene5487/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestDictionary.java
    lucene/dev/branches/lucene5487/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestHunspellStemFilter.java
    lucene/dev/branches/lucene5487/lucene/core/   (props changed)
    lucene/dev/branches/lucene5487/lucene/core/src/java/org/apache/lucene/index/DocFieldProcessor.java
    lucene/dev/branches/lucene5487/lucene/core/src/java/org/apache/lucene/index/DocInverterPerField.java
    lucene/dev/branches/lucene5487/lucene/core/src/java/org/apache/lucene/index/IndexWriter.java
    lucene/dev/branches/lucene5487/lucene/core/src/java/org/apache/lucene/index/TermsHashPerField.java
    lucene/dev/branches/lucene5487/lucene/core/src/java/org/apache/lucene/search/Sort.java
    lucene/dev/branches/lucene5487/lucene/core/src/test/org/apache/lucene/index/TestIndexWriter.java
    lucene/dev/branches/lucene5487/lucene/misc/   (props changed)
    lucene/dev/branches/lucene5487/lucene/misc/src/java/org/apache/lucene/index/sorter/EarlyTerminatingSortingCollector.java
    lucene/dev/branches/lucene5487/lucene/misc/src/java/org/apache/lucene/index/sorter/Sorter.java
    lucene/dev/branches/lucene5487/lucene/misc/src/java/org/apache/lucene/index/sorter/SortingAtomicReader.java
    lucene/dev/branches/lucene5487/lucene/misc/src/java/org/apache/lucene/index/sorter/SortingMergePolicy.java
    lucene/dev/branches/lucene5487/lucene/misc/src/java/org/apache/lucene/index/sorter/package.html
    lucene/dev/branches/lucene5487/lucene/misc/src/test/org/apache/lucene/index/sorter/IndexSortingTest.java
    lucene/dev/branches/lucene5487/lucene/misc/src/test/org/apache/lucene/index/sorter/SortingAtomicReaderTest.java
    lucene/dev/branches/lucene5487/lucene/misc/src/test/org/apache/lucene/index/sorter/TestBlockJoinSorter.java
    lucene/dev/branches/lucene5487/lucene/misc/src/test/org/apache/lucene/index/sorter/TestEarlyTermination.java
    lucene/dev/branches/lucene5487/lucene/misc/src/test/org/apache/lucene/index/sorter/TestSortingMergePolicy.java
    lucene/dev/branches/lucene5487/lucene/suggest/   (props changed)
    lucene/dev/branches/lucene5487/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingInfixSuggester.java
    lucene/dev/branches/lucene5487/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/BlendedInfixSuggester.java
    lucene/dev/branches/lucene5487/lucene/suggest/src/test/org/apache/lucene/search/suggest/LookupBenchmarkTest.java
    lucene/dev/branches/lucene5487/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/AnalyzingInfixSuggesterTest.java
    lucene/dev/branches/lucene5487/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/BlendedInfixSuggesterTest.java
    lucene/dev/branches/lucene5487/lucene/test-framework/   (props changed)
    lucene/dev/branches/lucene5487/lucene/test-framework/src/java/org/apache/lucene/index/ThreadedIndexingAndSearchingTestCase.java
    lucene/dev/branches/lucene5487/solr/   (props changed)
    lucene/dev/branches/lucene5487/solr/CHANGES.txt   (contents, props changed)
    lucene/dev/branches/lucene5487/solr/core/   (props changed)
    lucene/dev/branches/lucene5487/solr/core/src/java/org/apache/solr/cloud/Overseer.java
    lucene/dev/branches/lucene5487/solr/core/src/java/org/apache/solr/cloud/OverseerCollectionProcessor.java
    lucene/dev/branches/lucene5487/solr/core/src/java/org/apache/solr/cloud/ZkController.java
    lucene/dev/branches/lucene5487/solr/core/src/java/org/apache/solr/core/ConfigSolr.java
    lucene/dev/branches/lucene5487/solr/core/src/java/org/apache/solr/core/ConfigSolrXml.java
    lucene/dev/branches/lucene5487/solr/core/src/java/org/apache/solr/core/HdfsDirectoryFactory.java
    lucene/dev/branches/lucene5487/solr/core/src/java/org/apache/solr/core/ZkContainer.java
    lucene/dev/branches/lucene5487/solr/core/src/java/org/apache/solr/handler/component/QueryComponent.java
    lucene/dev/branches/lucene5487/solr/core/src/java/org/apache/solr/spelling/suggest/fst/AnalyzingInfixLookupFactory.java
    lucene/dev/branches/lucene5487/solr/core/src/java/org/apache/solr/spelling/suggest/fst/BlendedInfixLookupFactory.java
    lucene/dev/branches/lucene5487/solr/core/src/java/org/apache/solr/store/blockcache/BlockCache.java
    lucene/dev/branches/lucene5487/solr/core/src/java/org/apache/solr/store/blockcache/BlockCacheKey.java
    lucene/dev/branches/lucene5487/solr/core/src/java/org/apache/solr/store/blockcache/BlockCacheLocation.java
    lucene/dev/branches/lucene5487/solr/core/src/java/org/apache/solr/store/blockcache/BlockDirectory.java
    lucene/dev/branches/lucene5487/solr/core/src/java/org/apache/solr/store/blockcache/BlockDirectoryCache.java
    lucene/dev/branches/lucene5487/solr/core/src/java/org/apache/solr/store/blockcache/BlockLocks.java
    lucene/dev/branches/lucene5487/solr/core/src/java/org/apache/solr/store/blockcache/BufferStore.java
    lucene/dev/branches/lucene5487/solr/core/src/java/org/apache/solr/store/blockcache/Cache.java
    lucene/dev/branches/lucene5487/solr/core/src/java/org/apache/solr/store/blockcache/CachedIndexOutput.java
    lucene/dev/branches/lucene5487/solr/core/src/java/org/apache/solr/store/blockcache/CustomBufferedIndexInput.java
    lucene/dev/branches/lucene5487/solr/core/src/java/org/apache/solr/store/blockcache/Metrics.java
    lucene/dev/branches/lucene5487/solr/core/src/java/org/apache/solr/store/blockcache/ReusedBufferedIndexOutput.java
    lucene/dev/branches/lucene5487/solr/core/src/java/org/apache/solr/store/blockcache/Store.java
    lucene/dev/branches/lucene5487/solr/core/src/java/org/apache/solr/store/hdfs/HdfsFileReader.java
    lucene/dev/branches/lucene5487/solr/core/src/java/org/apache/solr/store/hdfs/HdfsFileWriter.java
    lucene/dev/branches/lucene5487/solr/core/src/java/org/apache/solr/store/hdfs/NullIndexOutput.java
    lucene/dev/branches/lucene5487/solr/core/src/test-files/solr/collection1/conf/schema11.xml
    lucene/dev/branches/lucene5487/solr/core/src/test-files/solr/collection1/conf/solrconfig-tlog.xml
    lucene/dev/branches/lucene5487/solr/core/src/test-files/solr/collection1/conf/solrconfig.xml
    lucene/dev/branches/lucene5487/solr/core/src/test/org/apache/solr/cloud/ChaosMonkeyNothingIsSafeTest.java
    lucene/dev/branches/lucene5487/solr/core/src/test/org/apache/solr/cloud/ChaosMonkeySafeLeaderTest.java
    lucene/dev/branches/lucene5487/solr/core/src/test/org/apache/solr/cloud/CollectionsAPIDistributedZkTest.java
    lucene/dev/branches/lucene5487/solr/core/src/test/org/apache/solr/cloud/OverseerTest.java
    lucene/dev/branches/lucene5487/solr/core/src/test/org/apache/solr/cloud/RecoveryZkTest.java
    lucene/dev/branches/lucene5487/solr/core/src/test/org/apache/solr/cloud/TestDistribDocBasedVersion.java
    lucene/dev/branches/lucene5487/solr/core/src/test/org/apache/solr/cloud/ZkControllerTest.java
    lucene/dev/branches/lucene5487/solr/core/src/test/org/apache/solr/cloud/hdfs/HdfsTestUtil.java
    lucene/dev/branches/lucene5487/solr/core/src/test/org/apache/solr/core/TestNonNRTOpen.java
    lucene/dev/branches/lucene5487/solr/core/src/test/org/apache/solr/store/blockcache/BlockCacheTest.java
    lucene/dev/branches/lucene5487/solr/example/   (props changed)
    lucene/dev/branches/lucene5487/solr/example/solr/collection1/conf/solrconfig.xml
    lucene/dev/branches/lucene5487/solr/solrj/   (props changed)
    lucene/dev/branches/lucene5487/solr/solrj/src/java/org/apache/solr/common/cloud/ZkStateReader.java
    lucene/dev/branches/lucene5487/solr/test-framework/   (props changed)
    lucene/dev/branches/lucene5487/solr/test-framework/src/java/org/apache/solr/cloud/AbstractFullDistribZkTestBase.java

Modified: lucene/dev/branches/lucene5487/dev-tools/idea/solr/contrib/dataimporthandler/dataimporthandler.iml
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene5487/dev-tools/idea/solr/contrib/dataimporthandler/dataimporthandler.iml?rev=1575397&r1=1575396&r2=1575397&view=diff
==============================================================================
--- lucene/dev/branches/lucene5487/dev-tools/idea/solr/contrib/dataimporthandler/dataimporthandler.iml (original)
+++ lucene/dev/branches/lucene5487/dev-tools/idea/solr/contrib/dataimporthandler/dataimporthandler.iml Fri Mar  7 20:50:45 2014
@@ -16,6 +16,7 @@
     <orderEntry type="library" scope="TEST" name="HSQLDB" level="project" />
     <orderEntry type="library" scope="TEST" name="Derby" level="project" />
     <orderEntry type="library" scope="TEST" name="Solr DIH test library" level="project" />
+    <orderEntry type="library" scope="TEST" name="Solr example library" level="project" />
     <orderEntry type="library" name="Solr core library" level="project" />
     <orderEntry type="library" name="Solrj library" level="project" />
     <orderEntry type="library" name="Solr DIH library" level="project" />

Modified: lucene/dev/branches/lucene5487/dev-tools/idea/solr/contrib/map-reduce/map-reduce.iml
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene5487/dev-tools/idea/solr/contrib/map-reduce/map-reduce.iml?rev=1575397&r1=1575396&r2=1575397&view=diff
==============================================================================
--- lucene/dev/branches/lucene5487/dev-tools/idea/solr/contrib/map-reduce/map-reduce.iml (original)
+++ lucene/dev/branches/lucene5487/dev-tools/idea/solr/contrib/map-reduce/map-reduce.iml Fri Mar  7 20:50:45 2014
@@ -18,6 +18,7 @@
     <orderEntry type="library" name="Solr morphlines core library" level="project" />
     <orderEntry type="library" name="Solr morphlines cell library" level="project" />
     <orderEntry type="library" scope="TEST" name="Solr morphlines core test library" level="project" />
+    <orderEntry type="library" scope="TEST" name="Solr example library" level="project" />
     <orderEntry type="module" scope="TEST" module-name="lucene-test-framework" />
     <orderEntry type="module" scope="TEST" module-name="solr-test-framework" />
     <orderEntry type="module" module-name="solr-core" />

Modified: lucene/dev/branches/lucene5487/dev-tools/scripts/checkJavaDocs.py
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene5487/dev-tools/scripts/checkJavaDocs.py?rev=1575397&r1=1575396&r2=1575397&view=diff
==============================================================================
--- lucene/dev/branches/lucene5487/dev-tools/scripts/checkJavaDocs.py (original)
+++ lucene/dev/branches/lucene5487/dev-tools/scripts/checkJavaDocs.py Fri Mar  7 20:50:45 2014
@@ -212,7 +212,7 @@ def checkClassSummaries(fullPath):
     if inThing:
       if lineLower.find('</tr>') != -1:
         if not hasDesc:
-          missing.append((lastCaption, lastItem))
+          missing.append((lastCaption, unEscapeURL(lastItem)))
         inThing = False
         continue
       else:
@@ -298,6 +298,11 @@ def checkSummary(fullPath):
   f.close()
   return anyMissing
 
+def unEscapeURL(s):
+  # Not exhaustive!!
+  s = s.replace('%20', ' ')
+  return s
+
 def unescapeHTML(s):
   s = s.replace('&lt;', '<')
   s = s.replace('&gt;', '>')

Modified: lucene/dev/branches/lucene5487/dev-tools/scripts/smokeTestRelease.py
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene5487/dev-tools/scripts/smokeTestRelease.py?rev=1575397&r1=1575396&r2=1575397&view=diff
==============================================================================
--- lucene/dev/branches/lucene5487/dev-tools/scripts/smokeTestRelease.py (original)
+++ lucene/dev/branches/lucene5487/dev-tools/scripts/smokeTestRelease.py Fri Mar  7 20:50:45 2014
@@ -731,7 +731,7 @@ def verifyUnpacked(project, artifact, un
       os.chdir('solr')
 
       print("    run tests w/ Java 7 and testArgs='%s'..." % testArgs)
-      run('%s; ant clean test %s' % (javaExe('1.7'), testArgs), '%s/test.log' % unpackPath)
+      run('%s; ant clean test -Dtests.slow=false %s' % (javaExe('1.7'), testArgs), '%s/test.log' % unpackPath)
  
       # test javadocs
       print('    generate javadocs w/ Java 7...')

Modified: lucene/dev/branches/lucene5487/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene5487/lucene/CHANGES.txt?rev=1575397&r1=1575396&r2=1575397&view=diff
==============================================================================
--- lucene/dev/branches/lucene5487/lucene/CHANGES.txt (original)
+++ lucene/dev/branches/lucene5487/lucene/CHANGES.txt Fri Mar  7 20:50:45 2014
@@ -68,6 +68,13 @@ Optimizations
 
 ======================= Lucene 4.8.0 =======================
 
+Changes in Runtime Behavior
+
+* LUCENE-5472: IndexWriter.addDocument will now throw an IllegalArgumentException 
+  if a Term to be indexed exceeds IndexWriter.MAX_TERM_LENGTH.  To recreate previous
+  behavior of silently ignoring these terms, use LengthFilter in your Analyzer.
+  (hossman, Mike McCandless, Varun Thacker)
+
 New Features
 
 * LUCENE-5454: Add SortedSetSortField to lucene/sandbox, to allow sorting
@@ -89,6 +96,13 @@ New Features
 
 * LUCENE-5485: Add circumfix support to HunspellStemFilter. (Robert Muir)
 
+* LUCENE-5224: Add iconv, oconv, and ignore support to HunspellStemFilter.
+  (Robert Muir)
+
+* LUCENE-5493: SortingMergePolicy, and EarlyTerminatingSortingCollector
+  support arbitrary Sort specifications.  
+  (Robert Muir, Mike McCandless, Adrien Grand)
+
 API Changes
 
 * LUCENE-5454: Add RandomAccessOrds, an optional extension of SortedSetDocValues
@@ -96,6 +110,12 @@ API Changes
 
 * LUCENE-5468: Move offline Sort (from suggest module) to OfflineSort. (Robert Muir)
 
+* LUCENE-5493: SortingMergePolicy and EarlyTerminatingSortingCollector take
+  Sort instead of Sorter. BlockJoinSorter is removed, replaced with 
+  BlockJoinComparatorSource, which can take a Sort for ordering of parents
+  and a separate Sort for ordering of children within a block. 
+  (Robert Muir, Mike McCandless, Adrien Grand)
+
 Optimizations
 
 * LUCENE-5468: HunspellStemFilter uses 10 to 100x less RAM. It also loads

Modified: lucene/dev/branches/lucene5487/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene5487/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java?rev=1575397&r1=1575396&r2=1575397&view=diff
==============================================================================
--- lucene/dev/branches/lucene5487/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java (original)
+++ lucene/dev/branches/lucene5487/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java Fri Mar  7 20:50:45 2014
@@ -21,14 +21,17 @@ import org.apache.lucene.store.ByteArray
 import org.apache.lucene.util.ArrayUtil;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.BytesRefHash;
+import org.apache.lucene.util.CharsRef;
 import org.apache.lucene.util.IOUtils;
 import org.apache.lucene.util.IntsRef;
 import org.apache.lucene.util.OfflineSorter;
 import org.apache.lucene.util.OfflineSorter.ByteSequencesReader;
 import org.apache.lucene.util.OfflineSorter.ByteSequencesWriter;
 import org.apache.lucene.util.fst.Builder;
+import org.apache.lucene.util.fst.CharSequenceOutputs;
 import org.apache.lucene.util.fst.FST;
 import org.apache.lucene.util.fst.IntSequenceOutputs;
+import org.apache.lucene.util.fst.Outputs;
 import org.apache.lucene.util.fst.Util;
 
 import java.io.BufferedInputStream;
@@ -67,6 +70,9 @@ public class Dictionary {
   private static final String FLAG_KEY = "FLAG";
   private static final String COMPLEXPREFIXES_KEY = "COMPLEXPREFIXES";
   private static final String CIRCUMFIX_KEY = "CIRCUMFIX";
+  private static final String IGNORE_KEY = "IGNORE";
+  private static final String ICONV_KEY = "ICONV";
+  private static final String OCONV_KEY = "OCONV";
 
   private static final String NUM_FLAG_TYPE = "num";
   private static final String UTF8_FLAG_TYPE = "UTF-8";
@@ -110,6 +116,16 @@ public class Dictionary {
   
   int circumfix = -1; // circumfix flag, or -1 if one is not defined
   
+  // ignored characters (dictionary, affix, inputs)
+  private char[] ignore;
+  
+  // FSTs used for ICONV/OCONV, output ord pointing to replacement text
+  FST<CharsRef> iconv;
+  FST<CharsRef> oconv;
+  
+  boolean needsInputCleaning;
+  boolean needsOutputCleaning;
+  
   /**
    * Creates a new Dictionary containing the information read from the provided InputStreams to hunspell affix
    * and dictionary files.
@@ -136,9 +152,13 @@ public class Dictionary {
    */
   public Dictionary(InputStream affix, List<InputStream> dictionaries, boolean ignoreCase) throws IOException, ParseException {
     this.ignoreCase = ignoreCase;
-    // hungarian has thousands of AF before the SET, so a 32k buffer is needed 
-    BufferedInputStream buffered = new BufferedInputStream(affix, 32768);
-    buffered.mark(32768);
+    this.needsInputCleaning = ignoreCase;
+    this.needsOutputCleaning = false; // set if we have an OCONV
+    // TODO: we really need to probably buffer this on disk since so many newer dictionaries
+    // (en_GB, hu_HU, etc) now have tons of AM lines (morph metadata) etc before they finally declare 
+    // their encoding... but for now this large buffer is a workaround
+    BufferedInputStream buffered = new BufferedInputStream(affix, 65536);
+    buffered.mark(65536);
     String encoding = getDictionaryEncoding(buffered);
     buffered.reset();
     CharsetDecoder decoder = getJavaEncoding(encoding);
@@ -249,6 +269,29 @@ public class Dictionary {
           throw new ParseException("Illegal CIRCUMFIX declaration", reader.getLineNumber());
         }
         circumfix = flagParsingStrategy.parseFlag(parts[1]);
+      } else if (line.startsWith(IGNORE_KEY)) {
+        String parts[] = line.split("\\s+");
+        if (parts.length != 2) {
+          throw new ParseException("Illegal IGNORE declaration", reader.getLineNumber());
+        }
+        ignore = parts[1].toCharArray();
+        Arrays.sort(ignore);
+        needsInputCleaning = true;
+      } else if (line.startsWith(ICONV_KEY) || line.startsWith(OCONV_KEY)) {
+        String parts[] = line.split("\\s+");
+        String type = parts[0];
+        if (parts.length != 2) {
+          throw new ParseException("Illegal " + type + " declaration", reader.getLineNumber());
+        }
+        int num = Integer.parseInt(parts[1]);
+        FST<CharsRef> res = parseConversions(reader, num);
+        if (type.equals("ICONV")) {
+          iconv = res;
+          needsInputCleaning |= iconv != null;
+        } else {
+          oconv = res;
+          needsOutputCleaning |= oconv != null;
+        }
       }
     }
     
@@ -291,6 +334,7 @@ public class Dictionary {
                           Map<String,Integer> seenPatterns) throws IOException, ParseException {
     
     BytesRef scratch = new BytesRef();
+    StringBuilder sb = new StringBuilder();
     String args[] = header.split("\\s+");
 
     boolean crossProduct = args[2].equals("Y");
@@ -300,9 +344,6 @@ public class Dictionary {
     ByteArrayDataOutput affixWriter = new ByteArrayDataOutput(affixData, currentAffix << 3, numLines << 3);
     
     for (int i = 0; i < numLines; i++) {
-      if (currentAffix > Short.MAX_VALUE) {
-        throw new UnsupportedOperationException("Too many affixes, please report this to dev@lucene.apache.org");
-      }
       assert affixWriter.getPosition() == currentAffix << 3;
       String line = reader.readLine();
       String ruleArgs[] = line.split("\\s+");
@@ -345,6 +386,9 @@ public class Dictionary {
       Integer patternIndex = seenPatterns.get(regex);
       if (patternIndex == null) {
         patternIndex = patterns.size();
+        if (patternIndex > Short.MAX_VALUE) {
+          throw new UnsupportedOperationException("Too many patterns, please report this to dev@lucene.apache.org");          
+        }
         seenPatterns.put(regex, patternIndex);
         Pattern pattern = Pattern.compile(regex);
         patterns.add(pattern);
@@ -355,6 +399,8 @@ public class Dictionary {
       if (stripOrd < 0) {
         // already exists in our hash
         stripOrd = (-stripOrd)-1;
+      } else if (stripOrd > Character.MAX_VALUE) {
+        throw new UnsupportedOperationException("Too many unique strips, please report this to dev@lucene.apache.org");
       }
 
       if (appendFlags == null) {
@@ -368,7 +414,7 @@ public class Dictionary {
         appendFlagsOrd = (-appendFlagsOrd)-1;
       } else if (appendFlagsOrd > Short.MAX_VALUE) {
         // this limit is probably flexible, but its a good sanity check too
-        throw new UnsupportedOperationException("Too many unique flags, please report this to dev@lucene.apache.org");
+        throw new UnsupportedOperationException("Too many unique append flags, please report this to dev@lucene.apache.org");
       }
       
       affixWriter.writeShort((short)flag);
@@ -378,6 +424,11 @@ public class Dictionary {
       affixWriter.writeShort((short)patternOrd);
       affixWriter.writeShort((short)appendFlagsOrd);
       
+      if (needsInputCleaning) {
+        CharSequence cleaned = cleanInput(affixArg, sb);
+        affixArg = cleaned.toString();
+      }
+      
       List<Character> list = affixes.get(affixArg);
       if (list == null) {
         list = new ArrayList<Character>();
@@ -388,6 +439,31 @@ public class Dictionary {
       currentAffix++;
     }
   }
+  
+  private FST<CharsRef> parseConversions(LineNumberReader reader, int num) throws IOException, ParseException {
+    Map<String,String> mappings = new TreeMap<>();
+    
+    for (int i = 0; i < num; i++) {
+      String line = reader.readLine();
+      String parts[] = line.split("\\s+");
+      if (parts.length != 3) {
+        throw new ParseException("invalid syntax: " + line, reader.getLineNumber());
+      }
+      if (mappings.put(parts[1], parts[2]) != null) {
+        throw new IllegalStateException("duplicate mapping specified for: " + parts[1]);
+      }
+    }
+    
+    Outputs<CharsRef> outputs = CharSequenceOutputs.getSingleton();
+    Builder<CharsRef> builder = new Builder<>(FST.INPUT_TYPE.BYTE2, outputs);
+    IntsRef scratchInts = new IntsRef();
+    for (Map.Entry<String,String> entry : mappings.entrySet()) {
+      Util.toUTF16(entry.getKey(), scratchInts);
+      builder.add(scratchInts, new CharsRef(entry.getValue()));
+    }
+    
+    return builder.finish();
+  }
 
   /**
    * Parses the encoding specified in the affix file readable through the provided InputStream
@@ -485,6 +561,8 @@ public class Dictionary {
     BytesRef flagsScratch = new BytesRef();
     IntsRef scratchInts = new IntsRef();
     
+    StringBuilder sb = new StringBuilder();
+    
     File unsorted = File.createTempFile("unsorted", "dat", tempDir);
     try (ByteSequencesWriter writer = new ByteSequencesWriter(unsorted)) {
       for (InputStream dictionary : dictionaries) {
@@ -492,16 +570,19 @@ public class Dictionary {
         String line = lines.readLine(); // first line is number of entries (approximately, sometimes)
         
         while ((line = lines.readLine()) != null) {
-          if (ignoreCase) {
+          if (needsInputCleaning) {
             int flagSep = line.lastIndexOf('/');
             if (flagSep == -1) {
-              writer.write(line.toLowerCase(Locale.ROOT).getBytes(IOUtils.CHARSET_UTF_8));
+              CharSequence cleansed = cleanInput(line, sb);
+              writer.write(cleansed.toString().getBytes(IOUtils.CHARSET_UTF_8));
             } else {
-              StringBuilder sb = new StringBuilder();
-              sb.append(line.substring(0, flagSep).toLowerCase(Locale.ROOT));
-              if (flagSep < line.length()) {
-                sb.append(line.substring(flagSep, line.length()));
+              String text = line.substring(0, flagSep);
+              CharSequence cleansed = cleanInput(text, sb);
+              if (cleansed != sb) {
+                sb.setLength(0);
+                sb.append(cleansed);
               }
+              sb.append(line.substring(flagSep));
               writer.write(sb.toString().getBytes(IOUtils.CHARSET_UTF_8));
             }
           } else {
@@ -761,4 +842,76 @@ public class Dictionary {
   static boolean hasFlag(char flags[], char flag) {
     return Arrays.binarySearch(flags, flag) >= 0;
   }
+  
+  CharSequence cleanInput(CharSequence input, StringBuilder reuse) {
+    reuse.setLength(0);
+    
+    for (int i = 0; i < input.length(); i++) {
+      char ch = input.charAt(i);
+      
+      if (ignore != null && Arrays.binarySearch(ignore, ch) >= 0) {
+        continue;
+      }
+      
+      if (ignoreCase && iconv == null) {
+        // if we have no input conversion mappings, do this on-the-fly
+        ch = Character.toLowerCase(ch);
+      }
+      
+      reuse.append(ch);
+    }
+    
+    if (iconv != null) {
+      try {
+        applyMappings(iconv, reuse);
+      } catch (IOException bogus) {
+        throw new RuntimeException(bogus);
+      }
+      if (ignoreCase) {
+        for (int i = 0; i < reuse.length(); i++) {
+          reuse.setCharAt(i, Character.toLowerCase(reuse.charAt(i)));
+        }
+      }
+    }
+    
+    return reuse;
+  }
+  
+  // TODO: this could be more efficient!
+  static void applyMappings(FST<CharsRef> fst, StringBuilder sb) throws IOException {
+    final FST.BytesReader bytesReader = fst.getBytesReader();
+    final FST.Arc<CharsRef> firstArc = fst.getFirstArc(new FST.Arc<CharsRef>());
+    final CharsRef NO_OUTPUT = fst.outputs.getNoOutput();
+    
+    // temporary stuff
+    final FST.Arc<CharsRef> arc = new FST.Arc<>();
+    int longestMatch;
+    CharsRef longestOutput;
+    
+    for (int i = 0; i < sb.length(); i++) {
+      arc.copyFrom(firstArc);
+      CharsRef output = NO_OUTPUT;
+      longestMatch = -1;
+      longestOutput = null;
+      
+      for (int j = i; j < sb.length(); j++) {
+        char ch = sb.charAt(j);
+        if (fst.findTargetArc(ch, arc, arc, bytesReader) == null) {
+          break;
+        } else {
+          output = fst.outputs.add(output, arc.output);
+        }
+        if (arc.isFinal()) {
+          longestOutput = fst.outputs.add(output, arc.nextFinalOutput);
+          longestMatch = j;
+        }
+      }
+      
+      if (longestMatch >= 0) {
+        sb.delete(i, longestMatch+1);
+        sb.insert(i, longestOutput);
+        i += (longestOutput.length - 1);
+      }
+    }
+  }
 }

Modified: lucene/dev/branches/lucene5487/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene5487/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java?rev=1575397&r1=1575396&r2=1575397&view=diff
==============================================================================
--- lucene/dev/branches/lucene5487/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java (original)
+++ lucene/dev/branches/lucene5487/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java Fri Mar  7 20:50:45 2014
@@ -17,6 +17,7 @@ package org.apache.lucene.analysis.hunsp
  * limitations under the License.
  */
 
+import java.io.IOException;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Collections;
@@ -24,8 +25,8 @@ import java.util.List;
 import java.util.regex.Pattern;
 
 import org.apache.lucene.analysis.util.CharArraySet;
-import org.apache.lucene.analysis.util.CharacterUtils;
 import org.apache.lucene.store.ByteArrayDataInput;
+import org.apache.lucene.util.ArrayUtil;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.CharsRef;
 import org.apache.lucene.util.IntsRef;
@@ -40,8 +41,11 @@ final class Stemmer {
   private final BytesRef scratch = new BytesRef();
   private final StringBuilder segment = new StringBuilder();
   private final ByteArrayDataInput affixReader;
-  private final CharacterUtils charUtils = CharacterUtils.getInstance(Version.LUCENE_CURRENT);
-
+  
+  // used for normalization
+  private final StringBuilder scratchSegment = new StringBuilder();
+  private char scratchBuffer[] = new char[32];
+  
   /**
    * Constructs a new Stemmer which will use the provided Dictionary to create its stems.
    *
@@ -68,17 +72,25 @@ final class Stemmer {
    * @param word Word to find the stems for
    * @return List of stems for the word
    */
-  public List<CharsRef> stem(char word[], int length) {
-    if (dictionary.ignoreCase) {
-      charUtils.toLowerCase(word, 0, length);
+  public List<CharsRef> stem(char word[], int length) {    
+
+    if (dictionary.needsInputCleaning) {
+      scratchSegment.setLength(0);
+      scratchSegment.append(word, 0, length);
+      CharSequence cleaned = dictionary.cleanInput(scratchSegment, segment);
+      scratchBuffer = ArrayUtil.grow(scratchBuffer, cleaned.length());
+      length = segment.length();
+      segment.getChars(0, length, scratchBuffer, 0);
+      word = scratchBuffer;
     }
+    
     List<CharsRef> stems = new ArrayList<CharsRef>();
     IntsRef forms = dictionary.lookupWord(word, 0, length);
     if (forms != null) {
       // TODO: some forms should not be added, e.g. ONLYINCOMPOUND
       // just because it exists, does not make it valid...
       for (int i = 0; i < forms.length; i++) {
-        stems.add(new CharsRef(word, 0, length));
+        stems.add(newStem(word, length));
       }
     }
     stems.addAll(stem(word, length, -1, -1, -1, 0, true, true, false, false));
@@ -106,6 +118,23 @@ final class Stemmer {
     }
     return deduped;
   }
+  
+  private CharsRef newStem(char buffer[], int length) {
+    if (dictionary.needsOutputCleaning) {
+      scratchSegment.setLength(0);
+      scratchSegment.append(buffer, 0, length);
+      try {
+        Dictionary.applyMappings(dictionary.oconv, scratchSegment);
+      } catch (IOException bogus) {
+        throw new RuntimeException(bogus);
+      }
+      char cleaned[] = new char[scratchSegment.length()];
+      scratchSegment.getChars(0, cleaned.length, cleaned, 0);
+      return new CharsRef(cleaned, 0, cleaned.length);
+    } else {
+      return new CharsRef(buffer, 0, length);
+    }
+  }
 
   // ================================================= Helper Methods ================================================
 
@@ -292,7 +321,7 @@ final class Stemmer {
               continue;
             }
           }
-          stems.add(new CharsRef(strippedWord, 0, length));
+          stems.add(newStem(strippedWord, length));
         }
       }
     }

Modified: lucene/dev/branches/lucene5487/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestDictionary.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene5487/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestDictionary.java?rev=1575397&r1=1575396&r2=1575397&view=diff
==============================================================================
--- lucene/dev/branches/lucene5487/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestDictionary.java (original)
+++ lucene/dev/branches/lucene5487/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestDictionary.java Fri Mar  7 20:50:45 2014
@@ -22,10 +22,15 @@ import java.io.IOException;
 import java.io.InputStream;
 import java.text.ParseException;
 
-import org.apache.lucene.analysis.hunspell.Dictionary;
 import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.CharsRef;
 import org.apache.lucene.util.IntsRef;
 import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.util.fst.Builder;
+import org.apache.lucene.util.fst.CharSequenceOutputs;
+import org.apache.lucene.util.fst.FST;
+import org.apache.lucene.util.fst.Outputs;
+import org.apache.lucene.util.fst.Util;
 
 public class TestDictionary extends LuceneTestCase {
 
@@ -123,4 +128,54 @@ public class TestDictionary extends Luce
     assertTrue(affixStream.isClosed());
     assertTrue(dictStream.isClosed());
   }
+  
+  
+  
+  public void testReplacements() throws Exception {
+    Outputs<CharsRef> outputs = CharSequenceOutputs.getSingleton();
+    Builder<CharsRef> builder = new Builder<>(FST.INPUT_TYPE.BYTE2, outputs);
+    IntsRef scratchInts = new IntsRef();
+    
+    // a -> b
+    Util.toUTF16("a", scratchInts);
+    builder.add(scratchInts, new CharsRef("b"));
+    
+    // ab -> c
+    Util.toUTF16("ab", scratchInts);
+    builder.add(scratchInts, new CharsRef("c"));
+    
+    // c -> de
+    Util.toUTF16("c", scratchInts);
+    builder.add(scratchInts, new CharsRef("de"));
+    
+    // def -> gh
+    Util.toUTF16("def", scratchInts);
+    builder.add(scratchInts, new CharsRef("gh"));
+    
+    FST<CharsRef> fst = builder.finish();
+    
+    StringBuilder sb = new StringBuilder("atestanother");
+    Dictionary.applyMappings(fst, sb);
+    assertEquals("btestbnother", sb.toString());
+    
+    sb = new StringBuilder("abtestanother");
+    Dictionary.applyMappings(fst, sb);
+    assertEquals("ctestbnother", sb.toString());
+    
+    sb = new StringBuilder("atestabnother");
+    Dictionary.applyMappings(fst, sb);
+    assertEquals("btestcnother", sb.toString());
+    
+    sb = new StringBuilder("abtestabnother");
+    Dictionary.applyMappings(fst, sb);
+    assertEquals("ctestcnother", sb.toString());
+    
+    sb = new StringBuilder("abtestabcnother");
+    Dictionary.applyMappings(fst, sb);
+    assertEquals("ctestcdenother", sb.toString());
+    
+    sb = new StringBuilder("defdefdefc");
+    Dictionary.applyMappings(fst, sb);
+    assertEquals("ghghghde", sb.toString());
+  }
 }

Modified: lucene/dev/branches/lucene5487/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestHunspellStemFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene5487/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestHunspellStemFilter.java?rev=1575397&r1=1575396&r2=1575397&view=diff
==============================================================================
--- lucene/dev/branches/lucene5487/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestHunspellStemFilter.java (original)
+++ lucene/dev/branches/lucene5487/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestHunspellStemFilter.java Fri Mar  7 20:50:45 2014
@@ -20,6 +20,7 @@ package org.apache.lucene.analysis.hunsp
 import java.io.IOException;
 import java.io.InputStream;
 import java.util.Arrays;
+import java.util.Collections;
 
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
@@ -30,7 +31,6 @@ import org.apache.lucene.analysis.hunspe
 import org.apache.lucene.analysis.hunspell.HunspellStemFilter;
 import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
 import org.apache.lucene.analysis.util.CharArraySet;
-import org.apache.lucene.util.TestUtil;
 import org.junit.AfterClass;
 import org.junit.BeforeClass;
 
@@ -94,4 +94,20 @@ public class TestHunspellStemFilter exte
     };
     checkOneTerm(a, "", "");
   }
+  
+  public void testIgnoreCaseNoSideEffects() throws Exception {
+    final Dictionary d;
+    try (InputStream affixStream = TestStemmer.class.getResourceAsStream("simple.aff");
+        InputStream dictStream = TestStemmer.class.getResourceAsStream("simple.dic")) {
+      d = new Dictionary(affixStream, Collections.singletonList(dictStream), true);
+    }
+    Analyzer a = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName) {
+        Tokenizer tokenizer = new KeywordTokenizer();
+        return new TokenStreamComponents(tokenizer, new HunspellStemFilter(tokenizer, d));
+      }
+    };
+    checkOneTerm(a, "NoChAnGy", "NoChAnGy");
+  }
 }

Modified: lucene/dev/branches/lucene5487/lucene/core/src/java/org/apache/lucene/index/DocFieldProcessor.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene5487/lucene/core/src/java/org/apache/lucene/index/DocFieldProcessor.java?rev=1575397&r1=1575396&r2=1575397&view=diff
==============================================================================
--- lucene/dev/branches/lucene5487/lucene/core/src/java/org/apache/lucene/index/DocFieldProcessor.java (original)
+++ lucene/dev/branches/lucene5487/lucene/core/src/java/org/apache/lucene/index/DocFieldProcessor.java Fri Mar  7 20:50:45 2014
@@ -209,11 +209,6 @@ final class DocFieldProcessor extends Do
       final DocFieldProcessorPerField perField = fields[i];
       perField.consumer.processFields(perField.fields, perField.fieldCount);
     }
-
-    if (docState.maxTermPrefix != null && docState.infoStream.isEnabled("IW")) {
-      docState.infoStream.message("IW", "WARNING: document contains at least one immense term (whose UTF8 encoding is longer than the max length " + DocumentsWriterPerThread.MAX_TERM_LENGTH_UTF8 + "), all of which were skipped.  Please correct the analyzer to not produce such terms.  The prefix of the first immense term is: '" + docState.maxTermPrefix + "...'");
-      docState.maxTermPrefix = null;
-    }
   }
 
   private DocFieldProcessorPerField processField(FieldInfos.Builder fieldInfos,

Modified: lucene/dev/branches/lucene5487/lucene/core/src/java/org/apache/lucene/index/DocInverterPerField.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene5487/lucene/core/src/java/org/apache/lucene/index/DocInverterPerField.java?rev=1575397&r1=1575396&r2=1575397&view=diff
==============================================================================
--- lucene/dev/branches/lucene5487/lucene/core/src/java/org/apache/lucene/index/DocInverterPerField.java (original)
+++ lucene/dev/branches/lucene5487/lucene/core/src/java/org/apache/lucene/index/DocInverterPerField.java Fri Mar  7 20:50:45 2014
@@ -23,7 +23,6 @@ import org.apache.lucene.analysis.TokenS
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
 import org.apache.lucene.index.FieldInfo.IndexOptions;
-import org.apache.lucene.util.IOUtils;
 
 /**
  * Holds state for inverting all occurrences of a single
@@ -182,6 +181,17 @@ final class DocInverterPerField extends 
           // when we come back around to the field...
           fieldState.position += posIncrAttribute.getPositionIncrement();
           fieldState.offset += offsetAttribute.endOffset();
+
+
+          if (docState.maxTermPrefix != null) {
+            final String msg = "Document contains at least one immense term in field=\"" + fieldInfo.name + "\" (whose UTF8 encoding is longer than the max length " + DocumentsWriterPerThread.MAX_TERM_LENGTH_UTF8 + "), all of which were skipped.  Please correct the analyzer to not produce such terms.  The prefix of the first immense term is: '" + docState.maxTermPrefix + "...'";
+            if (docState.infoStream.isEnabled("IW")) {
+              docState.infoStream.message("IW", "ERROR: " + msg);
+            }
+            docState.maxTermPrefix = null;
+            throw new IllegalArgumentException(msg);
+          }
+
           /* if success was false above there is an exception coming through and we won't get here.*/
           succeededInProcessingField = true;
         } finally {

Modified: lucene/dev/branches/lucene5487/lucene/core/src/java/org/apache/lucene/index/IndexWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene5487/lucene/core/src/java/org/apache/lucene/index/IndexWriter.java?rev=1575397&r1=1575396&r2=1575397&view=diff
==============================================================================
--- lucene/dev/branches/lucene5487/lucene/core/src/java/org/apache/lucene/index/IndexWriter.java (original)
+++ lucene/dev/branches/lucene5487/lucene/core/src/java/org/apache/lucene/index/IndexWriter.java Fri Mar  7 20:50:45 2014
@@ -207,8 +207,9 @@ public class IndexWriter implements Clos
   /**
    * Absolute hard maximum length for a term, in bytes once
    * encoded as UTF8.  If a term arrives from the analyzer
-   * longer than this length, it is skipped and a message is
-   * printed to infoStream, if set (see {@link
+   * longer than this length, an
+   * <code>IllegalArgumentException</code>  is thrown
+   * and a message is printed to infoStream, if set (see {@link
    * IndexWriterConfig#setInfoStream(InfoStream)}).
    */
   public final static int MAX_TERM_LENGTH = DocumentsWriterPerThread.MAX_TERM_LENGTH_UTF8;
@@ -1159,7 +1160,7 @@ public class IndexWriter implements Clos
    * merge policy.
    *
    * <p>Note that each term in the document can be no longer
-   * than 16383 characters, otherwise an
+   * than {@link #MAX_TERM_LENGTH} in bytes, otherwise an
    * IllegalArgumentException will be thrown.</p>
    *
    * <p>Note that it's possible to create an invalid Unicode

Modified: lucene/dev/branches/lucene5487/lucene/core/src/java/org/apache/lucene/index/TermsHashPerField.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene5487/lucene/core/src/java/org/apache/lucene/index/TermsHashPerField.java?rev=1575397&r1=1575396&r2=1575397&view=diff
==============================================================================
--- lucene/dev/branches/lucene5487/lucene/core/src/java/org/apache/lucene/index/TermsHashPerField.java (original)
+++ lucene/dev/branches/lucene5487/lucene/core/src/java/org/apache/lucene/index/TermsHashPerField.java Fri Mar  7 20:50:45 2014
@@ -179,12 +179,11 @@ final class TermsHashPerField extends In
     try {
       termID = bytesHash.add(termBytesRef, termAtt.fillBytesRef());
     } catch (MaxBytesLengthExceededException e) {
-      // Not enough room in current block
-      // Just skip this term, to remain as robust as
-      // possible during indexing.  A TokenFilter
-      // can be inserted into the analyzer chain if
-      // other behavior is wanted (pruning the term
-      // to a prefix, throwing an exception, etc).
+      // Term is too large; record this here (can't throw an
+      // exc because DocInverterPerField will then abort the
+      // entire segment) and then throw an exc later in
+      // DocInverterPerField.java.  LengthFilter can always be
+      // used to prune the term before indexing:
       if (docState.maxTermPrefix == null) {
         final int saved = termBytesRef.length;
         try {

Modified: lucene/dev/branches/lucene5487/lucene/core/src/java/org/apache/lucene/search/Sort.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene5487/lucene/core/src/java/org/apache/lucene/search/Sort.java?rev=1575397&r1=1575396&r2=1575397&view=diff
==============================================================================
--- lucene/dev/branches/lucene5487/lucene/core/src/java/org/apache/lucene/search/Sort.java (original)
+++ lucene/dev/branches/lucene5487/lucene/core/src/java/org/apache/lucene/search/Sort.java Fri Mar  7 20:50:45 2014
@@ -202,8 +202,8 @@ public class Sort {
     return 0x45aaf665 + Arrays.hashCode(fields);
   }
 
-  /** Whether the relevance score is needed to sort documents. */
-  boolean needsScores() {
+  /** Returns true if the relevance score is needed to sort documents. */
+  public boolean needsScores() {
     for (SortField sortField : fields) {
       if (sortField.needsScores()) {
         return true;

Modified: lucene/dev/branches/lucene5487/lucene/core/src/test/org/apache/lucene/index/TestIndexWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene5487/lucene/core/src/test/org/apache/lucene/index/TestIndexWriter.java?rev=1575397&r1=1575396&r2=1575397&view=diff
==============================================================================
--- lucene/dev/branches/lucene5487/lucene/core/src/test/org/apache/lucene/index/TestIndexWriter.java (original)
+++ lucene/dev/branches/lucene5487/lucene/core/src/test/org/apache/lucene/index/TestIndexWriter.java Fri Mar  7 20:50:45 2014
@@ -1660,32 +1660,32 @@ public class TestIndexWriter extends Luc
     // This contents produces a too-long term:
     String contents = "abc xyz x" + bigTerm + " another term";
     doc.add(new TextField("content", contents, Field.Store.NO));
-    w.addDocument(doc);
+    try {
+      w.addDocument(doc);
+      fail("should have hit exception");
+    } catch (IllegalArgumentException iae) {
+      // expected
+    }
 
     // Make sure we can add another normal document
     doc = new Document();
     doc.add(new TextField("content", "abc bbb ccc", Field.Store.NO));
     w.addDocument(doc);
 
+    // So we remove the deleted doc:
+    w.forceMerge(1);
+
     IndexReader reader = w.getReader();
     w.close();
 
     // Make sure all terms < max size were indexed
-    assertEquals(2, reader.docFreq(new Term("content", "abc")));
+    assertEquals(1, reader.docFreq(new Term("content", "abc")));
     assertEquals(1, reader.docFreq(new Term("content", "bbb")));
-    assertEquals(1, reader.docFreq(new Term("content", "term")));
-    assertEquals(1, reader.docFreq(new Term("content", "another")));
-
-    // Make sure position is still incremented when
-    // massive term is skipped:
-    DocsAndPositionsEnum tps = MultiFields.getTermPositionsEnum(reader, null, "content", new BytesRef("another"));
-    assertEquals(0, tps.nextDoc());
-    assertEquals(1, tps.freq());
-    assertEquals(3, tps.nextPosition());
+    assertEquals(0, reader.docFreq(new Term("content", "term")));
 
-    // Make sure the doc that has the massive term is in
+    // Make sure the doc that has the massive term is NOT in
     // the index:
-    assertEquals("document with wicked long term should is not in the index!", 2, reader.numDocs());
+    assertEquals("document with wicked long term is in the index!", 1, reader.numDocs());
 
     reader.close();
     dir.close();

Modified: lucene/dev/branches/lucene5487/lucene/misc/src/java/org/apache/lucene/index/sorter/EarlyTerminatingSortingCollector.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene5487/lucene/misc/src/java/org/apache/lucene/index/sorter/EarlyTerminatingSortingCollector.java?rev=1575397&r1=1575396&r2=1575397&view=diff
==============================================================================
--- lucene/dev/branches/lucene5487/lucene/misc/src/java/org/apache/lucene/index/sorter/EarlyTerminatingSortingCollector.java (original)
+++ lucene/dev/branches/lucene5487/lucene/misc/src/java/org/apache/lucene/index/sorter/EarlyTerminatingSortingCollector.java Fri Mar  7 20:50:45 2014
@@ -24,50 +24,53 @@ import org.apache.lucene.index.IndexWrit
 import org.apache.lucene.search.CollectionTerminatedException;
 import org.apache.lucene.search.Collector;
 import org.apache.lucene.search.Scorer;
+import org.apache.lucene.search.Sort;
 import org.apache.lucene.search.TopDocsCollector;
 import org.apache.lucene.search.TotalHitCountCollector;
 
 /**
  * A {@link Collector} that early terminates collection of documents on a
  * per-segment basis, if the segment was sorted according to the given
- * {@link Sorter}.
+ * {@link Sort}.
  * 
  * <p>
- * <b>NOTE:</b> the {@link Collector} detects sorted segments according to
+ * <b>NOTE:</b> the {@code Collector} detects sorted segments according to
  * {@link SortingMergePolicy}, so it's best used in conjunction with it. Also,
- * it collects up to a specified num docs from each segment, and therefore is
- * mostly suitable for use in conjunction with collectors such as
+ * it collects up to a specified {@code numDocsToCollect} from each segment, 
+ * and therefore is mostly suitable for use in conjunction with collectors such as
  * {@link TopDocsCollector}, and not e.g. {@link TotalHitCountCollector}.
  * <p>
- * <b>NOTE</b>: If you wrap a {@link TopDocsCollector} that sorts in the same
- * order as the index order, the returned {@link TopDocsCollector#topDocs()}
+ * <b>NOTE</b>: If you wrap a {@code TopDocsCollector} that sorts in the same
+ * order as the index order, the returned {@link TopDocsCollector#topDocs() TopDocs}
  * will be correct. However the total of {@link TopDocsCollector#getTotalHits()
  * hit count} will be underestimated since not all matching documents will have
  * been collected.
  * <p>
- * <b>NOTE</b>: This {@link Collector} uses {@link Sorter#getID()} to detect
- * whether a segment was sorted with the same {@link Sorter} as the one given in
- * {@link #EarlyTerminatingSortingCollector(Collector, Sorter, int)}. This has
+ * <b>NOTE</b>: This {@code Collector} uses {@link Sort#toString()} to detect
+ * whether a segment was sorted with the same {@code Sort}. This has
  * two implications:
  * <ul>
- * <li>if {@link Sorter#getID()} is not implemented correctly and returns
- * different identifiers for equivalent {@link Sorter}s, this collector will not
+ * <li>if a custom comparator is not implemented correctly and returns
+ * different identifiers for equivalent instances, this collector will not
  * detect sorted segments,</li>
  * <li>if you suddenly change the {@link IndexWriter}'s
- * {@link SortingMergePolicy} to sort according to another criterion and if both
- * the old and the new {@link Sorter}s have the same identifier, this
- * {@link Collector} will incorrectly detect sorted segments.</li>
+ * {@code SortingMergePolicy} to sort according to another criterion and if both
+ * the old and the new {@code Sort}s have the same identifier, this
+ * {@code Collector} will incorrectly detect sorted segments.</li>
  * </ul>
  * 
  * @lucene.experimental
  */
 public class EarlyTerminatingSortingCollector extends Collector {
-
+  /** The wrapped Collector */
   protected final Collector in;
-  protected final Sorter sorter;
+  /** Sort used to sort the search results */
+  protected final Sort sort;
+  /** Number of documents to collect in each segment */
   protected final int numDocsToCollect;
-  
+  /** Number of documents to collect in the current segment being processed */
   protected int segmentTotalCollect;
+  /** True if the current segment being processed is sorted by {@link #sort} */
   protected boolean segmentSorted;
 
   private int numCollected;
@@ -77,20 +80,19 @@ public class EarlyTerminatingSortingColl
    * 
    * @param in
    *          the collector to wrap
-   * @param sorter
-   *          the same sorter as the one which is used by {@link IndexWriter}'s
-   *          {@link SortingMergePolicy}
+   * @param sort
+   *          the sort you are sorting the search results on
    * @param numDocsToCollect
    *          the number of documents to collect on each segment. When wrapping
    *          a {@link TopDocsCollector}, this number should be the number of
    *          hits.
    */
-  public EarlyTerminatingSortingCollector(Collector in, Sorter sorter, int numDocsToCollect) {
+  public EarlyTerminatingSortingCollector(Collector in, Sort sort, int numDocsToCollect) {
     if (numDocsToCollect <= 0) {
       throw new IllegalStateException("numDocsToCollect must always be > 0, got " + segmentTotalCollect);
     }
     this.in = in;
-    this.sorter = sorter;
+    this.sort = sort;
     this.numDocsToCollect = numDocsToCollect;
   }
 
@@ -110,7 +112,7 @@ public class EarlyTerminatingSortingColl
   @Override
   public void setNextReader(AtomicReaderContext context) throws IOException {
     in.setNextReader(context);
-    segmentSorted = SortingMergePolicy.isSorted(context.reader(), sorter);
+    segmentSorted = SortingMergePolicy.isSorted(context.reader(), sort);
     segmentTotalCollect = segmentSorted ? numDocsToCollect : Integer.MAX_VALUE;
     numCollected = 0;
   }

Modified: lucene/dev/branches/lucene5487/lucene/misc/src/java/org/apache/lucene/index/sorter/Sorter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene5487/lucene/misc/src/java/org/apache/lucene/index/sorter/Sorter.java?rev=1575397&r1=1575396&r2=1575397&view=diff
==============================================================================
--- lucene/dev/branches/lucene5487/lucene/misc/src/java/org/apache/lucene/index/sorter/Sorter.java (original)
+++ lucene/dev/branches/lucene5487/lucene/misc/src/java/org/apache/lucene/index/sorter/Sorter.java Fri Mar  7 20:50:45 2014
@@ -22,47 +22,47 @@ import java.util.Comparator;
 
 import org.apache.lucene.index.AtomicReader;
 import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.search.FieldComparator;
+import org.apache.lucene.search.Scorer;
+import org.apache.lucene.search.Sort;
+import org.apache.lucene.search.SortField;
 import org.apache.lucene.util.TimSorter;
 import org.apache.lucene.util.packed.MonotonicAppendingLongBuffer;
 
 /**
  * Sorts documents of a given index by returning a permutation on the document
  * IDs.
- * <p><b>NOTE</b>: A {@link Sorter} implementation can be easily written from
- * a {@link DocComparator document comparator} by using the
- * {@link #sort(int, DocComparator)} helper method. This is especially useful
- * when documents are directly comparable by their field values.
  * @lucene.experimental
  */
-public abstract class Sorter {
-
-  /** A comparator that keeps documents in index order. */
-  public static final DocComparator INDEX_ORDER_COMPARATOR = new DocComparator() {
-    @Override
-    public int compare(int docID1, int docID2) {
-      return docID1 - docID2;
+final class Sorter {
+  final Sort sort;
+  
+  /** Creates a new Sorter to sort the index with {@code sort} */
+  Sorter(Sort sort) {
+    if (sort.needsScores()) {
+      throw new IllegalArgumentException("Cannot sort an index with a Sort that refers to the relevance score");
     }
-  };
+    this.sort = sort;
+  }
 
   /**
    * A permutation of doc IDs. For every document ID between <tt>0</tt> and
    * {@link IndexReader#maxDoc()}, <code>oldToNew(newToOld(docID))</code> must
    * return <code>docID</code>.
    */
-  public static abstract class DocMap {
+  static abstract class DocMap {
 
     /** Given a doc ID from the original index, return its ordinal in the
      *  sorted index. */
-    public abstract int oldToNew(int docID);
+    abstract int oldToNew(int docID);
 
     /** Given the ordinal of a doc ID, return its doc ID in the original index. */
-    public abstract int newToOld(int docID);
+    abstract int newToOld(int docID);
 
     /** Return the number of documents in this map. This must be equal to the
      *  {@link AtomicReader#maxDoc() number of documents} of the
      *  {@link AtomicReader} which is sorted. */
-    public abstract int size();
-
+    abstract int size();
   }
 
   /** Check consistency of a {@link DocMap}, useful for assertions. */
@@ -81,7 +81,7 @@ public abstract class Sorter {
   }
 
   /** A comparator of doc IDs. */
-  public static abstract class DocComparator {
+  static abstract class DocComparator {
 
     /** Compare docID1 against docID2. The contract for the return value is the
      *  same as {@link Comparator#compare(Object, Object)}. */
@@ -89,45 +89,13 @@ public abstract class Sorter {
 
   }
 
-  /**
-   * Sorts documents in reverse order. <b>NOTE</b>: This {@link Sorter} is not
-   * idempotent. Sorting an {@link AtomicReader} once or twice will return two
-   * different {@link AtomicReader} views. This {@link Sorter} should not be
-   * used with {@link SortingMergePolicy}.
-   */
-  public static final Sorter REVERSE_DOCS = new Sorter() {
-    @Override
-    public DocMap sort(final AtomicReader reader) throws IOException {
-      final int maxDoc = reader.maxDoc();
-      return new DocMap() {
-        @Override
-        public int oldToNew(int docID) {
-          return maxDoc - docID - 1;
-        }
-        @Override
-        public int newToOld(int docID) {
-          return maxDoc - docID - 1;
-        }
-        @Override
-        public int size() {
-          return maxDoc;
-        }
-      };
-    }
-    
-    @Override
-    public String getID() {
-      return "ReverseDocs";
-    }
-  };
-  
   private static final class DocValueSorter extends TimSorter {
     
     private final int[] docs;
     private final Sorter.DocComparator comparator;
     private final int[] tmp;
     
-    public DocValueSorter(int[] docs, Sorter.DocComparator comparator) {
+    DocValueSorter(int[] docs, Sorter.DocComparator comparator) {
       super(docs.length / 64);
       this.docs = docs;
       this.comparator = comparator;
@@ -168,7 +136,7 @@ public abstract class Sorter {
   }
 
   /** Computes the old-to-new permutation over the given comparator. */
-  protected static Sorter.DocMap sort(final int maxDoc, DocComparator comparator) {
+  private static Sorter.DocMap sort(final int maxDoc, DocComparator comparator) {
     // check if the index is sorted
     boolean sorted = true;
     for (int i = 1; i < maxDoc; ++i) {
@@ -242,20 +210,75 @@ public abstract class Sorter {
    * <b>NOTE:</b> deleted documents are expected to appear in the mapping as
    * well, they will however be marked as deleted in the sorted view.
    */
-  public abstract DocMap sort(AtomicReader reader) throws IOException;
+  DocMap sort(AtomicReader reader) throws IOException {
+    SortField fields[] = sort.getSort();
+    final int reverseMul[] = new int[fields.length];
+    final FieldComparator<?> comparators[] = new FieldComparator[fields.length];
+    
+    for (int i = 0; i < fields.length; i++) {
+      reverseMul[i] = fields[i].getReverse() ? -1 : 1;
+      comparators[i] = fields[i].getComparator(1, i);
+      comparators[i].setNextReader(reader.getContext());
+      comparators[i].setScorer(FAKESCORER);
+    }
+    final DocComparator comparator = new DocComparator() {
+      @Override
+      public int compare(int docID1, int docID2) {
+        try {
+          for (int i = 0; i < comparators.length; i++) {
+            // TODO: would be better if copy() didnt cause a term lookup in TermOrdVal & co,
+            // the segments are always the same here...
+            comparators[i].copy(0, docID1);
+            comparators[i].setBottom(0);
+            int comp = reverseMul[i] * comparators[i].compareBottom(docID2);
+            if (comp != 0) {
+              return comp;
+            }
+          }
+          return Integer.compare(docID1, docID2); // docid order tiebreak
+        } catch (IOException e) {
+          throw new RuntimeException(e);
+        }
+      }
+    };
+    return sort(reader.maxDoc(), comparator);
+  }
 
   /**
    * Returns the identifier of this {@link Sorter}.
    * <p>This identifier is similar to {@link Object#hashCode()} and should be
    * chosen so that two instances of this class that sort documents likewise
    * will have the same identifier. On the contrary, this identifier should be
-   * different on different {@link Sorter sorters}.
+   * different on different {@link Sort sorts}.
    */
-  public abstract String getID();
+  public String getID() {
+    return sort.toString();
+  }
 
   @Override
   public String toString() {
     return getID();
   }
   
+  static final Scorer FAKESCORER = new Scorer(null) {
+    
+    @Override
+    public float score() throws IOException { throw new UnsupportedOperationException(); }
+    
+    @Override
+    public int freq() throws IOException { throw new UnsupportedOperationException(); }
+
+    @Override
+    public int docID() { throw new UnsupportedOperationException(); }
+
+    @Override
+    public int nextDoc() throws IOException { throw new UnsupportedOperationException(); }
+
+    @Override
+    public int advance(int target) throws IOException { throw new UnsupportedOperationException(); }
+
+    @Override
+    public long cost() { throw new UnsupportedOperationException(); }
+  };
+  
 }

Modified: lucene/dev/branches/lucene5487/lucene/misc/src/java/org/apache/lucene/index/sorter/SortingAtomicReader.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene5487/lucene/misc/src/java/org/apache/lucene/index/sorter/SortingAtomicReader.java?rev=1575397&r1=1575396&r2=1575397&view=diff
==============================================================================
--- lucene/dev/branches/lucene5487/lucene/misc/src/java/org/apache/lucene/index/sorter/SortingAtomicReader.java (original)
+++ lucene/dev/branches/lucene5487/lucene/misc/src/java/org/apache/lucene/index/sorter/SortingAtomicReader.java Fri Mar  7 20:50:45 2014
@@ -35,6 +35,7 @@ import org.apache.lucene.index.StoredFie
 import org.apache.lucene.index.Terms;
 import org.apache.lucene.index.TermsEnum;
 import org.apache.lucene.search.DocIdSetIterator;
+import org.apache.lucene.search.Sort;
 import org.apache.lucene.store.IndexInput;
 import org.apache.lucene.store.IndexOutput;
 import org.apache.lucene.store.RAMFile;
@@ -48,13 +49,13 @@ import org.apache.lucene.util.automaton.
 
 /**
  * An {@link AtomicReader} which supports sorting documents by a given
- * {@link Sorter}. You can use this class to sort an index as follows:
+ * {@link Sort}. You can use this class to sort an index as follows:
  * 
  * <pre class="prettyprint">
  * IndexWriter writer; // writer to which the sorted index will be added
  * DirectoryReader reader; // reader on the input index
- * Sorter sorter; // determines how the documents are sorted
- * AtomicReader sortingReader = SortingAtomicReader.wrap(SlowCompositeReaderWrapper.wrap(reader), sorter);
+ * Sort sort; // determines how the documents are sorted
+ * AtomicReader sortingReader = SortingAtomicReader.wrap(SlowCompositeReaderWrapper.wrap(reader), sort);
  * writer.addIndexes(reader);
  * writer.close();
  * reader.close();
@@ -480,7 +481,7 @@ public class SortingAtomicReader extends
   static class SortingDocsAndPositionsEnum extends FilterDocsAndPositionsEnum {
     
     /**
-     * A {@link Sorter} which sorts two parallel arrays of doc IDs and
+     * A {@link TimSorter} which sorts two parallel arrays of doc IDs and
      * offsets in one go. Everytime a doc ID is 'swapped', its correponding offset
      * is swapped too.
      */
@@ -708,14 +709,14 @@ public class SortingAtomicReader extends
   }
 
   /** Return a sorted view of <code>reader</code> according to the order
-   *  defined by <code>sorter</code>. If the reader is already sorted, this
+   *  defined by <code>sort</code>. If the reader is already sorted, this
    *  method might return the reader as-is. */
-  public static AtomicReader wrap(AtomicReader reader, Sorter sorter) throws IOException {
-    return wrap(reader, sorter.sort(reader));
+  public static AtomicReader wrap(AtomicReader reader, Sort sort) throws IOException {
+    return wrap(reader, new Sorter(sort).sort(reader));
   }
 
-  /** Expert: same as {@link #wrap(AtomicReader, Sorter)} but operates directly on a {@link Sorter.DocMap}. */
-  public static AtomicReader wrap(AtomicReader reader, Sorter.DocMap docMap) {
+  /** Expert: same as {@link #wrap(AtomicReader, Sort)} but operates directly on a {@link Sorter.DocMap}. */
+  static AtomicReader wrap(AtomicReader reader, Sorter.DocMap docMap) {
     if (docMap == null) {
       // the reader is already sorter
       return reader;

Modified: lucene/dev/branches/lucene5487/lucene/misc/src/java/org/apache/lucene/index/sorter/SortingMergePolicy.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene5487/lucene/misc/src/java/org/apache/lucene/index/sorter/SortingMergePolicy.java?rev=1575397&r1=1575396&r2=1575397&view=diff
==============================================================================
--- lucene/dev/branches/lucene5487/lucene/misc/src/java/org/apache/lucene/index/sorter/SortingMergePolicy.java (original)
+++ lucene/dev/branches/lucene5487/lucene/misc/src/java/org/apache/lucene/index/sorter/SortingMergePolicy.java Fri Mar  7 20:50:45 2014
@@ -22,6 +22,7 @@ import java.util.Collections;
 import java.util.List;
 import java.util.Map;
 
+import org.apache.lucene.analysis.Analyzer; // javadocs
 import org.apache.lucene.index.AtomicReader;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.IndexWriter;
@@ -33,22 +34,23 @@ import org.apache.lucene.index.SegmentCo
 import org.apache.lucene.index.SegmentInfos;
 import org.apache.lucene.index.SegmentReader;
 import org.apache.lucene.index.SlowCompositeReaderWrapper;
+import org.apache.lucene.search.Sort;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.util.Bits;
 import org.apache.lucene.util.packed.MonotonicAppendingLongBuffer;
 
-/** A {@link MergePolicy} that reorders documents according to a {@link Sorter}
+/** A {@link MergePolicy} that reorders documents according to a {@link Sort}
  *  before merging them. As a consequence, all segments resulting from a merge
  *  will be sorted while segments resulting from a flush will be in the order
  *  in which documents have been added.
- *  <p><b>NOTE</b>: Never use this {@link MergePolicy} if you rely on
- *  {@link IndexWriter#addDocuments(Iterable, org.apache.lucene.analysis.Analyzer)}
+ *  <p><b>NOTE</b>: Never use this policy if you rely on
+ *  {@link IndexWriter#addDocuments(Iterable, Analyzer) IndexWriter.addDocuments}
  *  to have sequentially-assigned doc IDs, this policy will scatter doc IDs.
- *  <p><b>NOTE</b>: This {@link MergePolicy} should only be used with idempotent
- *  {@link Sorter}s so that the order of segments is predictable. For example,
- *  using {@link SortingMergePolicy} with {@link Sorter#REVERSE_DOCS} (which is
- *  not idempotent) will make the order of documents in a segment depend on the
- *  number of times the segment has been merged.
+ *  <p><b>NOTE</b>: This policy should only be used with idempotent {@code Sort}s 
+ *  so that the order of segments is predictable. For example, using 
+ *  {@link Sort#INDEXORDER} in reverse (which is not idempotent) will make 
+ *  the order of documents in a segment depend on the number of times the segment 
+ *  has been merged.
  *  @lucene.experimental */
 public final class SortingMergePolicy extends MergePolicy {
 
@@ -147,12 +149,12 @@ public final class SortingMergePolicy ex
 
   }
 
-  /** Returns true if the given reader is sorted by the given sorter. */
-  public static boolean isSorted(AtomicReader reader, Sorter sorter) {
+  /** Returns {@code true} if the given {@code reader} is sorted by the specified {@code sort}. */
+  public static boolean isSorted(AtomicReader reader, Sort sort) {
     if (reader instanceof SegmentReader) {
       final SegmentReader segReader = (SegmentReader) reader;
       final Map<String, String> diagnostics = segReader.getSegmentInfo().info.getDiagnostics();
-      if (diagnostics != null && sorter.getID().equals(diagnostics.get(SORTER_ID_PROP))) {
+      if (diagnostics != null && sort.toString().equals(diagnostics.get(SORTER_ID_PROP))) {
         return true;
       }
     }
@@ -172,11 +174,13 @@ public final class SortingMergePolicy ex
 
   final MergePolicy in;
   final Sorter sorter;
+  final Sort sort;
 
-  /** Create a new {@link MergePolicy} that sorts documents with <code>sorter</code>. */
-  public SortingMergePolicy(MergePolicy in, Sorter sorter) {
+  /** Create a new {@code MergePolicy} that sorts documents with the given {@code sort}. */
+  public SortingMergePolicy(MergePolicy in, Sort sort) {
     this.in = in;
-    this.sorter = sorter;
+    this.sorter = new Sorter(sort);
+    this.sort = sort;
   }
 
   @Override
@@ -200,7 +204,7 @@ public final class SortingMergePolicy ex
 
   @Override
   public MergePolicy clone() {
-    return new SortingMergePolicy(in.clone(), sorter);
+    return new SortingMergePolicy(in.clone(), sort);
   }
 
   @Override

Modified: lucene/dev/branches/lucene5487/lucene/misc/src/java/org/apache/lucene/index/sorter/package.html
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene5487/lucene/misc/src/java/org/apache/lucene/index/sorter/package.html?rev=1575397&r1=1575396&r2=1575397&view=diff
==============================================================================
--- lucene/dev/branches/lucene5487/lucene/misc/src/java/org/apache/lucene/index/sorter/package.html (original)
+++ lucene/dev/branches/lucene5487/lucene/misc/src/java/org/apache/lucene/index/sorter/package.html Fri Mar  7 20:50:45 2014
@@ -17,19 +17,16 @@
 -->
 <html>
 <body>
-<p>Provides index sorting capablities. The application can use one of the
-pre-existing Sorter implementations, e.g. to sort by a
-{@link org.apache.lucene.index.sorter.NumericDocValuesSorter}
-or {@link org.apache.lucene.index.sorter.Sorter#REVERSE_DOCS reverse} the order
-of the documents. Additionally, the application can implement a custom
-{@link org.apache.lucene.index.sorter.Sorter} which returns a permutation on 
-a source {@link org.apache.lucene.index.AtomicReader}'s document IDs, to sort
-the input documents by additional criteria.
+<p>Provides index sorting capablities. The application can use any
+Sort specification, e.g. to sort by fields using DocValues or FieldCache, or to
+reverse the order of the documents (by using SortField.Type.DOC in reverse).
+Multi-level sorts can be specified the same way you would when searching, by
+building Sort from multiple SortFields.
 
 <p>{@link org.apache.lucene.index.sorter.SortingMergePolicy} can be used to
 make Lucene sort segments before merging them. This will ensure that every
 segment resulting from a merge will be sorted according to the provided
-{@link org.apache.lucene.index.sorter.Sorter}. This however makes merging and
+{@link org.apache.lucene.search.Sort}. This however makes merging and
 thus indexing slower.
 
 <p>Sorted segments allow for early query termination when the sort order

Modified: lucene/dev/branches/lucene5487/lucene/misc/src/test/org/apache/lucene/index/sorter/IndexSortingTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene5487/lucene/misc/src/test/org/apache/lucene/index/sorter/IndexSortingTest.java?rev=1575397&r1=1575396&r2=1575397&view=diff
==============================================================================
--- lucene/dev/branches/lucene5487/lucene/misc/src/test/org/apache/lucene/index/sorter/IndexSortingTest.java (original)
+++ lucene/dev/branches/lucene5487/lucene/misc/src/test/org/apache/lucene/index/sorter/IndexSortingTest.java Fri Mar  7 20:50:45 2014
@@ -24,6 +24,8 @@ import java.util.List;
 import org.apache.lucene.index.DirectoryReader;
 import org.apache.lucene.index.IndexWriter;
 import org.apache.lucene.index.SlowCompositeReaderWrapper;
+import org.apache.lucene.search.Sort;
+import org.apache.lucene.search.SortField;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.util.Bits;
 import org.apache.lucene.util.TestUtil;
@@ -31,9 +33,9 @@ import org.junit.BeforeClass;
 
 public class IndexSortingTest extends SorterTestBase {
   
-  private static final Sorter[] SORTERS = new Sorter[] {
-    new NumericDocValuesSorter(NUMERIC_DV_FIELD, true),
-    Sorter.REVERSE_DOCS,
+  private static final Sort[] SORT = new Sort[] {
+    new Sort(new SortField(NUMERIC_DV_FIELD, SortField.Type.LONG)),
+    new Sort(new SortField(null, SortField.Type.DOC, true))
   };
   
   @BeforeClass
@@ -47,13 +49,14 @@ public class IndexSortingTest extends So
         values.add(Integer.valueOf(reader.document(i).get(ID_FIELD)));
       }
     }
-    Sorter sorter = SORTERS[random().nextInt(SORTERS.length)];
-    if (sorter == Sorter.REVERSE_DOCS) {
+    int idx = random().nextInt(SORT.length);
+    Sort sorter = SORT[idx];
+    if (idx == 1) { // reverse doc sort
       Collections.reverse(values);
     } else {
       Collections.sort(values);
-      if (sorter instanceof NumericDocValuesSorter && random().nextBoolean()) {
-        sorter = new NumericDocValuesSorter(NUMERIC_DV_FIELD, false); // descending
+      if (random().nextBoolean()) {
+        sorter = new Sort(new SortField(NUMERIC_DV_FIELD, SortField.Type.LONG, true)); // descending
         Collections.reverse(values);
       }
     }

Modified: lucene/dev/branches/lucene5487/lucene/misc/src/test/org/apache/lucene/index/sorter/SortingAtomicReaderTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene5487/lucene/misc/src/test/org/apache/lucene/index/sorter/SortingAtomicReaderTest.java?rev=1575397&r1=1575396&r2=1575397&view=diff
==============================================================================
--- lucene/dev/branches/lucene5487/lucene/misc/src/test/org/apache/lucene/index/sorter/SortingAtomicReaderTest.java (original)
+++ lucene/dev/branches/lucene5487/lucene/misc/src/test/org/apache/lucene/index/sorter/SortingAtomicReaderTest.java Fri Mar  7 20:50:45 2014
@@ -17,56 +17,37 @@ package org.apache.lucene.index.sorter;
  * limitations under the License.
  */
 
-import java.io.IOException;
 import java.util.Arrays;
 
-import org.apache.lucene.index.AtomicReader;
+import org.apache.lucene.index.NumericDocValues;
+import org.apache.lucene.search.Sort;
+import org.apache.lucene.search.SortField;
 import org.apache.lucene.util.Bits;
 import org.apache.lucene.util.TestUtil;
-import org.apache.lucene.util.TestUtil;
 import org.junit.BeforeClass;
 
 public class SortingAtomicReaderTest extends SorterTestBase {
   
   @BeforeClass
   public static void beforeClassSortingAtomicReaderTest() throws Exception {
-    // build the mapping from the reader, since we deleted documents, some of
-    // them might have disappeared from the index (e.g. if an entire segment is
-    // dropped b/c all its docs are deleted)
-    final int[] values = new int[reader.maxDoc()];
-    for (int i = 0; i < reader.maxDoc(); i++) {
-      values[i] = Integer.valueOf(reader.document(i).get(ID_FIELD));
-    }
-    final Sorter.DocComparator comparator = new Sorter.DocComparator() {
-      @Override
-      public int compare(int docID1, int docID2) {
-        final int v1 = values[docID1];
-        final int v2 = values[docID2];
-        return v1 < v2 ? -1 : v1 == v2 ? 0 : 1;
-      }
-    };
-
-    final Sorter.DocMap docMap = Sorter.sort(reader.maxDoc(), comparator);
+    
+    // sort the index by id (as integer, in NUMERIC_DV_FIELD)
+    Sort sort = new Sort(new SortField(NUMERIC_DV_FIELD, SortField.Type.INT));
+    final Sorter.DocMap docMap = new Sorter(sort).sort(reader);
+ 
     // Sorter.compute also sorts the values
+    NumericDocValues dv = reader.getNumericDocValues(NUMERIC_DV_FIELD);
     sortedValues = new Integer[reader.maxDoc()];
     for (int i = 0; i < reader.maxDoc(); ++i) {
-      sortedValues[docMap.oldToNew(i)] = values[i];
+      sortedValues[docMap.oldToNew(i)] = (int)dv.get(i);
     }
     if (VERBOSE) {
       System.out.println("docMap: " + docMap);
       System.out.println("sortedValues: " + Arrays.toString(sortedValues));
     }
     
-    reader = SortingAtomicReader.wrap(reader, new Sorter() {
-      @Override
-      public Sorter.DocMap sort(AtomicReader reader) throws IOException {
-        return docMap;
-      }
-      @Override
-      public String getID() {
-        return ID_FIELD;
-      }
-    });
+    // sort the index by id (as integer, in NUMERIC_DV_FIELD)
+    reader = SortingAtomicReader.wrap(reader, sort);
     
     if (VERBOSE) {
       System.out.print("mapped-deleted-docs: ");
@@ -81,5 +62,14 @@ public class SortingAtomicReaderTest ext
     
     TestUtil.checkReader(reader);
   }
+  
+  public void testBadSort() throws Exception {
+    try {
+      SortingAtomicReader.wrap(reader, Sort.RELEVANCE);
+      fail("Didn't get expected exception");
+    } catch (IllegalArgumentException e) {
+      assertEquals("Cannot sort an index with a Sort that refers to the relevance score", e.getMessage());
+    }
+  }
 
 }

Modified: lucene/dev/branches/lucene5487/lucene/misc/src/test/org/apache/lucene/index/sorter/TestBlockJoinSorter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene5487/lucene/misc/src/test/org/apache/lucene/index/sorter/TestBlockJoinSorter.java?rev=1575397&r1=1575396&r2=1575397&view=diff
==============================================================================
--- lucene/dev/branches/lucene5487/lucene/misc/src/test/org/apache/lucene/index/sorter/TestBlockJoinSorter.java (original)
+++ lucene/dev/branches/lucene5487/lucene/misc/src/test/org/apache/lucene/index/sorter/TestBlockJoinSorter.java Fri Mar  7 20:50:45 2014
@@ -37,6 +37,8 @@ import org.apache.lucene.search.DocIdSet
 import org.apache.lucene.search.DocIdSetIterator;
 import org.apache.lucene.search.Filter;
 import org.apache.lucene.search.QueryWrapperFilter;
+import org.apache.lucene.search.Sort;
+import org.apache.lucene.search.SortField;
 import org.apache.lucene.search.TermQuery;
 import org.apache.lucene.util.ArrayUtil;
 import org.apache.lucene.util.FixedBitSet;
@@ -89,47 +91,14 @@ public class TestBlockJoinSorter extends
     final AtomicReader reader = getOnlySegmentReader(indexReader);
     final Filter parentsFilter = new FixedBitSetCachingWrapperFilter(new QueryWrapperFilter(new TermQuery(new Term("parent", "true"))));
     final FixedBitSet parentBits = (FixedBitSet) parentsFilter.getDocIdSet(reader.getContext(), null);
-
     final NumericDocValues parentValues = reader.getNumericDocValues("parent_val");
-    final Sorter.DocComparator parentComparator = new Sorter.DocComparator() {
-      @Override
-      public int compare(int docID1, int docID2) {
-        assertTrue(parentBits.get(docID1));
-        assertTrue(parentBits.get(docID2));
-        return Long.compare(parentValues.get(docID1), parentValues.get(docID2));
-      }
-    };
-
     final NumericDocValues childValues = reader.getNumericDocValues("child_val");
-    final Sorter.DocComparator childComparator = new Sorter.DocComparator() {
-      @Override
-      public int compare(int docID1, int docID2) {
-        assertFalse(parentBits.get(docID1));
-        assertFalse(parentBits.get(docID2));
-        return Long.compare(childValues.get(docID1), childValues.get(docID2));
-      }
-    };
 
-    final Sorter sorter = new BlockJoinSorter(parentsFilter) {
-      
-      @Override
-      public String getID() {
-        return "Dummy";
-      }
-      
-      @Override
-      protected DocComparator getParentComparator(AtomicReader r) {
-        assertEquals(reader, r);
-        return parentComparator;
-      }
-
-      @Override
-      protected DocComparator getChildComparator(AtomicReader r) {
-        assertEquals(reader, r);
-        return childComparator;
-      }
+    final Sort parentSort = new Sort(new SortField("parent_val", SortField.Type.LONG));
+    final Sort childSort = new Sort(new SortField("child_val", SortField.Type.LONG));
 
-    };
+    final Sort sort = new Sort(new SortField("custom", new BlockJoinComparatorSource(parentsFilter, parentSort, childSort)));
+    final Sorter sorter = new Sorter(sort);
     final Sorter.DocMap docMap = sorter.sort(reader);
     assertEquals(reader.maxDoc(), docMap.size());
 

Modified: lucene/dev/branches/lucene5487/lucene/misc/src/test/org/apache/lucene/index/sorter/TestEarlyTermination.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene5487/lucene/misc/src/test/org/apache/lucene/index/sorter/TestEarlyTermination.java?rev=1575397&r1=1575396&r2=1575397&view=diff
==============================================================================
--- lucene/dev/branches/lucene5487/lucene/misc/src/test/org/apache/lucene/index/sorter/TestEarlyTermination.java (original)
+++ lucene/dev/branches/lucene5487/lucene/misc/src/test/org/apache/lucene/index/sorter/TestEarlyTermination.java Fri Mar  7 20:50:45 2014
@@ -51,14 +51,14 @@ public class TestEarlyTermination extend
   private int numDocs;
   private List<String> terms;
   private Directory dir;
-  private Sorter sorter;
+  private Sort sort;
   private RandomIndexWriter iw;
   private IndexReader reader;
 
   @Override
   public void setUp() throws Exception {
     super.setUp();
-    sorter = new NumericDocValuesSorter("ndv1");
+    sort = new Sort(new SortField("ndv1", SortField.Type.LONG));
   }
 
   private Document randomDocument() {
@@ -80,7 +80,7 @@ public class TestEarlyTermination extend
     terms = new ArrayList<String>(randomTerms);
     final long seed = random().nextLong();
     final IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(new Random(seed)));
-    iwc.setMergePolicy(TestSortingMergePolicy.newSortingMergePolicy(sorter));
+    iwc.setMergePolicy(TestSortingMergePolicy.newSortingMergePolicy(sort));
     iw = new RandomIndexWriter(new Random(seed), dir, iwc);
     for (int i = 0; i < numDocs; ++i) {
       final Document doc = randomDocument();
@@ -120,7 +120,7 @@ public class TestEarlyTermination extend
     for (int i = 0; i < iters; ++i) {
       final TermQuery query = new TermQuery(new Term("s", RandomPicks.randomFrom(random(), terms)));
       searcher.search(query, collector1);
-      searcher.search(query, new EarlyTerminatingSortingCollector(collector2, sorter, numHits));
+      searcher.search(query, new EarlyTerminatingSortingCollector(collector2, sort, numHits));
     }
     assertTrue(collector1.getTotalHits() >= collector2.getTotalHits());
     assertTopDocsEquals(collector1.topDocs().scoreDocs, collector2.topDocs().scoreDocs);
@@ -144,7 +144,8 @@ public class TestEarlyTermination extend
     for (int i = 0; i < iters; ++i) {
       final TermQuery query = new TermQuery(new Term("s", RandomPicks.randomFrom(random(), terms)));
       searcher.search(query, collector1);
-      searcher.search(query, new EarlyTerminatingSortingCollector(collector2, new NumericDocValuesSorter("ndv2"), numHits) {
+      Sort different = new Sort(new SortField("ndv2", SortField.Type.LONG));
+      searcher.search(query, new EarlyTerminatingSortingCollector(collector2, different, numHits) {
         @Override
         public void setNextReader(AtomicReaderContext context) throws IOException {
           super.setNextReader(context);



Mime
View raw message