lucene-java-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From mikemcc...@apache.org
Subject svn commit: r885265 [1/2] - in /lucene/java/branches/flex_1458/src: java/org/apache/lucene/index/ java/org/apache/lucene/index/codecs/ java/org/apache/lucene/index/codecs/intblock/ java/org/apache/lucene/index/codecs/preflex/ java/org/apache/lucene/ind...
Date Sun, 29 Nov 2009 20:28:52 GMT
Author: mikemccand
Date: Sun Nov 29 20:28:51 2009
New Revision: 885265

URL: http://svn.apache.org/viewvc?rev=885265&view=rev
Log:
LUCENE-1458 (on flex branch): allow codec to specify sort order for terms within the field

Modified:
    lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/ByteBlockPool.java
    lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/DirectoryReader.java
    lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/DocumentsWriter.java
    lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/FreqProxFieldMergeState.java
    lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/FreqProxTermsWriter.java
    lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/LegacyFieldsEnum.java
    lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/LegacyTerms.java
    lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/TermRef.java
    lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/TermVectorsTermsWriterPerField.java
    lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/Terms.java
    lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/TermsEnum.java
    lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/TermsHashPerField.java
    lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/FieldsConsumer.java
    lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/TermsConsumer.java
    lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/intblock/IntBlockCodec.java
    lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/preflex/PreFlexFields.java
    lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/pulsing/PulsingCodec.java
    lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/pulsing/PulsingDocsReader.java
    lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/pulsing/PulsingDocsWriter.java
    lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/sep/SepCodec.java
    lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/standard/DeltaBytesWriter.java
    lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/standard/SimpleStandardTermsIndexReader.java
    lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/standard/StandardCodec.java
    lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/standard/StandardTermsDictReader.java
    lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/standard/StandardTermsDictWriter.java
    lucene/java/branches/flex_1458/src/java/org/apache/lucene/search/FilteredTermsEnum.java
    lucene/java/branches/flex_1458/src/java/org/apache/lucene/search/NumericRangeQuery.java
    lucene/java/branches/flex_1458/src/java/org/apache/lucene/search/TermRangeTermsEnum.java
    lucene/java/branches/flex_1458/src/test/org/apache/lucene/TestDemo.java
    lucene/java/branches/flex_1458/src/test/org/apache/lucene/TestExternalCodecs.java
    lucene/java/branches/flex_1458/src/test/org/apache/lucene/index/TestIndexWriter.java

Modified: lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/ByteBlockPool.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/ByteBlockPool.java?rev=885265&r1=885264&r2=885265&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/ByteBlockPool.java (original)
+++ lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/ByteBlockPool.java Sun Nov 29 20:28:51 2009
@@ -143,5 +143,23 @@
 
     return newUpto+3;
   }
+
+  // Fill in a TermRef from terms length & bytes encoded in
+  // byte block
+  final TermRef setTermRef(TermRef term, int textStart) {
+    final byte[] bytes = term.bytes = buffers[textStart >> DocumentsWriter.BYTE_BLOCK_SHIFT];
+    int pos = textStart & DocumentsWriter.BYTE_BLOCK_MASK;
+    if ((bytes[pos] & 0x80) == 0) {
+      // length is 1 byte
+      term.length = bytes[pos];
+      term.offset = pos+1;
+    } else {
+      // length is 2 bytes
+      term.length = (bytes[pos]&0x7f) + ((bytes[pos+1]&0xff)<<7);
+      term.offset = pos+2;
+    }
+    assert term.length >= 0;
+    return term;
+  }
 }
 

Modified: lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/DirectoryReader.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/DirectoryReader.java?rev=885265&r1=885264&r2=885265&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/DirectoryReader.java (original)
+++ lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/DirectoryReader.java Sun Nov 29 20:28:51 2009
@@ -1175,6 +1175,7 @@
   }
 
   private final static class TermMergeQueue extends PriorityQueue {
+    TermRef.Comparator termComp;
     TermMergeQueue(int size) {
       initialize(size);
     }
@@ -1183,7 +1184,7 @@
     protected final boolean lessThan(Object a, Object b) {
       TermsEnumWithBase termsA = (TermsEnumWithBase) a;
       TermsEnumWithBase termsB = (TermsEnumWithBase) b;
-      final int cmp = termsA.current.compareTerm(termsB.current);
+      final int cmp = termComp.compare(termsA.current, termsB.current);
       if (cmp != 0) {
         return cmp < 0;
       } else {
@@ -1237,15 +1238,31 @@
     
   private final static class MultiTerms extends Terms {
     private final TermsWithBase[] subs;
-    
-    public MultiTerms(TermsWithBase[] subs) {
+    private final TermRef.Comparator termComp;
+
+    public MultiTerms(TermsWithBase[] subs) throws IOException {
       this.subs = subs;
+      
+      TermRef.Comparator _termComp = null;
+      for(int i=0;i<subs.length;i++) {
+        if (_termComp == null) {
+          _termComp = subs[i].terms.getTermComparator();
+        } else {
+          assert subs[i].terms.getTermComparator() == null || _termComp.equals(subs[i].terms.getTermComparator());
+        }
+      }
+      termComp = _termComp;
     }
 
     @Override
     public TermsEnum iterator() throws IOException {
       return new MultiTermsEnum(subs.length).reset(subs);
     }
+
+    @Override
+    public TermRef.Comparator getTermComparator() {
+      return termComp;
+    }
   }
 
   private final static class MultiFieldsEnum extends FieldsEnum {
@@ -1321,6 +1338,7 @@
     int numSubs;
     private TermRef current;
     private final MultiDocsEnum docs;
+    private TermRef.Comparator termComp;
 
     MultiTermsEnum(int size) {
       queue = new TermMergeQueue(size);
@@ -1334,13 +1352,25 @@
       return current;
     }
 
+    @Override
+    public TermRef.Comparator getTermComparator() {
+      return termComp;
+    }
+
     MultiTermsEnum reset(TermsWithBase[] terms) throws IOException {
       assert terms.length <= top.length;
       numSubs = 0;
       numTop = 0;
+      termComp = null;
+      queue.clear();
       for(int i=0;i<terms.length;i++) {
         final TermsEnum termsEnum = terms[i].terms.iterator();
         if (termsEnum != null) {
+          if (termComp == null) {
+            queue.termComp = termComp = termsEnum.getTermComparator();
+          } else {
+            assert termsEnum.getTermComparator() == null || termComp.equals(termsEnum.getTermComparator());
+          }
           final TermRef term = termsEnum.next();
           if (term != null) {
             subs[numSubs] = new TermsEnumWithBase(terms[i], termsEnum, term);
@@ -1359,11 +1389,18 @@
       assert numFields <= top.length;
       numSubs = 0;
       numTop = 0;
+      termComp = null;
+      queue.clear();
       for(int i=0;i<numFields;i++) {
         final TermsEnum terms = fields[i].fields.terms();
         if (terms != null) {
           final TermRef term = terms.next();
           if (term != null) {
+            if (termComp == null) {
+              queue.termComp = termComp = terms.getTermComparator();
+            } else {
+              assert termComp.equals(terms.getTermComparator());
+            }
             subs[numSubs] = new TermsEnumWithBase(fields[i], terms, term);
             queue.add(subs[numSubs]);
             numSubs++;
@@ -1386,8 +1423,9 @@
           top[numTop++] = subs[i];
           subs[i].current = term;
         } else if (status == SeekStatus.NOT_FOUND) {
-          queue.add(subs[i]);
           subs[i].current = subs[i].terms.term();
+          assert subs[i].current != null;
+          queue.add(subs[i]);
         } else {
           // enum exhausted
         }

Modified: lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/DocumentsWriter.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/DocumentsWriter.java?rev=885265&r1=885264&r2=885265&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/DocumentsWriter.java (original)
+++ lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/DocumentsWriter.java Sun Nov 29 20:28:51 2009
@@ -1209,7 +1209,7 @@
   final static int BYTE_BLOCK_MASK = BYTE_BLOCK_SIZE - 1;
   final static int BYTE_BLOCK_NOT_MASK = ~BYTE_BLOCK_MASK;
 
-  final static int MAX_TERM_LENGTH_UTF8 = BYTE_BLOCK_SIZE-1;
+  final static int MAX_TERM_LENGTH_UTF8 = BYTE_BLOCK_SIZE-2;
 
   private class ByteBlockAllocator extends ByteBlockPool.Allocator {
 

Modified: lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/FreqProxFieldMergeState.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/FreqProxFieldMergeState.java?rev=885265&r1=885264&r2=885265&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/FreqProxFieldMergeState.java (original)
+++ lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/FreqProxFieldMergeState.java Sun Nov 29 20:28:51 2009
@@ -43,10 +43,10 @@
   int docID;
   int termFreq;
 
-  public FreqProxFieldMergeState(FreqProxTermsWriterPerField field) {
+  public FreqProxFieldMergeState(FreqProxTermsWriterPerField field, TermRef.Comparator termComp) {
     this.field = field;
     this.numPostings = field.termsHashPerField.numPostings;
-    this.postings = field.termsHashPerField.sortPostings();
+    this.postings = field.termsHashPerField.sortPostings(termComp);
     this.bytePool = field.perThread.termsHashPerThread.bytePool;
   }
 
@@ -59,14 +59,8 @@
     p = (FreqProxTermsWriter.PostingList) postings[postingUpto];
     docID = 0;
 
-    text.bytes = bytePool.buffers[p.textStart >> DocumentsWriter.BYTE_BLOCK_SHIFT];
-    text.offset = p.textStart & DocumentsWriter.BYTE_BLOCK_MASK;
-    // nocommit -- how to avoid this added cost?
-    int pos = text.offset;
-    while(text.bytes[pos] != TermsHashPerField.END_OF_TERM) {
-      pos++;
-    }
-    text.length = pos - text.offset;
+    // Get TermRef
+    bytePool.setTermRef(text, p.textStart);
 
     field.termsHashPerField.initReader(freq, p, 0);
     if (!field.fieldInfo.omitTermFreqAndPositions) {

Modified: lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/FreqProxTermsWriter.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/FreqProxTermsWriter.java?rev=885265&r1=885264&r2=885265&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/FreqProxTermsWriter.java (original)
+++ lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/FreqProxTermsWriter.java Sun Nov 29 20:28:51 2009
@@ -45,35 +45,9 @@
       postings[i] = new PostingList();
   }
 
-  private static int compareText(final TermRef text1, final TermRef text2) {
-
-    int pos1 = text1.offset;
-    int pos2 = text2.offset;
-    final byte[] bytes1 = text1.bytes;
-    final byte[] bytes2 = text2.bytes;
-    while(true) {
-      final byte b1 = bytes1[pos1++];
-      final byte b2 = bytes2[pos2++];
-      if (b1 != b2) {
-        if (TermsHashPerField.END_OF_TERM == b2) {
-          //text2.length = pos2 - text2.offset;
-          return 1;
-        } else if (TermsHashPerField.END_OF_TERM == b1) {
-          //text1.length = pos1 - text1.offset;
-          return -1;
-        } else {
-          return (b1&0xff)-(b2&0xff);
-        }
-      } else if (TermsHashPerField.END_OF_TERM == b1) {
-        //text1.length = pos1 - text1.offset;
-        //text2.length = pos2 - text2.offset;
-        return 0;
-      }
-    }
-  }
-
   @Override
   void closeDocStore(SegmentWriteState state) {}
+
   @Override
   void abort() {}
 
@@ -184,8 +158,11 @@
 
     final FreqProxFieldMergeState[] mergeStates = new FreqProxFieldMergeState[numFields];
 
+    final TermsConsumer termsConsumer = consumer.addField(fields[0].fieldInfo);
+    final TermRef.Comparator termComp = termsConsumer.getTermComparator();
+
     for(int i=0;i<numFields;i++) {
-      FreqProxFieldMergeState fms = mergeStates[i] = new FreqProxFieldMergeState(fields[i]);
+      FreqProxFieldMergeState fms = mergeStates[i] = new FreqProxFieldMergeState(fields[i], termComp);
 
       assert fms.field.fieldInfo == fields[0].fieldInfo;
 
@@ -194,11 +171,10 @@
       assert result;
     }
 
-    final TermsConsumer termsConsumer = consumer.addField(fields[0].fieldInfo);
-
     FreqProxFieldMergeState[] termStates = new FreqProxFieldMergeState[numFields];
 
     final boolean currentFieldOmitTermFreqAndPositions = fields[0].fieldInfo.omitTermFreqAndPositions;
+    //System.out.println("flush terms field=" + fields[0].fieldInfo.name);
 
     // TODO: really TermsHashPerField should take over most
     // of this loop, including merge sort of terms from
@@ -212,7 +188,7 @@
       int numToMerge = 1;
 
       for(int i=1;i<numFields;i++) {
-        final int cmp = compareText(mergeStates[i].text, termStates[0].text);
+        final int cmp = termComp.compare(mergeStates[i].text, termStates[0].text);
         if (cmp < 0) {
           termStates[0] = mergeStates[i];
           numToMerge = 1;
@@ -221,10 +197,15 @@
         }
       }
 
+      // Need shallow copy here because termStates[0].text
+      // changes by the time we call finishTerm
       text.bytes = termStates[0].text.bytes;
       text.offset = termStates[0].text.offset;
       text.length = termStates[0].text.length;  
 
+      //System.out.println("  term=" + text.toUnicodeString());
+      //System.out.println("  term=" + text.toString());
+
       final DocsConsumer docConsumer = termsConsumer.startTerm(text);
 
       // Now termStates has numToMerge FieldMergeStates

Modified: lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/LegacyFieldsEnum.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/LegacyFieldsEnum.java?rev=885265&r1=885264&r2=885265&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/LegacyFieldsEnum.java (original)
+++ lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/LegacyFieldsEnum.java Sun Nov 29 20:28:51 2009
@@ -86,6 +86,12 @@
     }
 
     @Override
+    public TermRef.Comparator getTermComparator() {
+      // Pre-flex indexes always sorted in UTF16 order
+      return TermRef.getUTF8SortedAsUTF16Comparator();
+    }
+
+    @Override
     public SeekStatus seek(TermRef text) throws IOException {
 
       // nocommit: too slow?

Modified: lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/LegacyTerms.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/LegacyTerms.java?rev=885265&r1=885264&r2=885265&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/LegacyTerms.java (original)
+++ lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/LegacyTerms.java Sun Nov 29 20:28:51 2009
@@ -40,6 +40,12 @@
 
   public void close() {
   }
+
+  @Override
+  public TermRef.Comparator getTermComparator() {
+    // Pre-flex indexes always sorted in UTF16 order
+    return TermRef.getUTF8SortedAsUTF16Comparator();
+  }
 }
 
   

Modified: lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/TermRef.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/TermRef.java?rev=885265&r1=885264&r2=885265&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/TermRef.java (original)
+++ lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/TermRef.java Sun Nov 29 20:28:51 2009
@@ -39,12 +39,17 @@
     copy(text);
   }
 
+  public TermRef(TermRef other) {
+    copy(other);
+  }
+
   // nocommit: we could do this w/ UnicodeUtil w/o requiring
   // allocation of new bytes[]?
   /**
    * @param text Well-formed unicode text, with no unpaired surrogates or U+FFFF.
    */
   public void copy(String text) {
+    // nocommit -- assert text has no unpaired surrogates??
     try {
       bytes = text.getBytes("UTF-8");
     } catch (UnsupportedEncodingException uee) {
@@ -55,28 +60,6 @@
     length = bytes.length;
   }
 
-  public int compareTerm(TermRef other) {
-    final int minLength;
-    if (length < other.length) {
-      minLength = length;
-    } else {
-      minLength = other.length;
-    }
-    int upto = offset;
-    int otherUpto = other.offset;
-    final byte[] otherBytes = other.bytes;
-    for(int i=0;i<minLength;i++) {
-      // compare bytes as unsigned
-      final int b1 = bytes[upto++]&0xff;
-      final int b2 = otherBytes[otherUpto++]&0xff;
-      final int diff =  b1-b2;
-      if (diff != 0) {
-        return diff;
-      }
-    }
-    return length - other.length;
-  }
-
   public boolean termEquals(TermRef other) {
     if (length == other.length) {
       int upto = offset;
@@ -169,7 +152,26 @@
       if (i > offset) {
         sb.append(' ');
       }
-      sb.append(""+bytes[i]);
+      sb.append(Integer.toHexString(bytes[i]&0xff));
+    }
+    sb.append(']');
+    return sb.toString();
+  }
+
+  private final String asUnicodeChar(char c) {
+    return "U+" + Integer.toHexString(c);
+  }
+
+  // for debugging only -- this is slow
+  public String toUnicodeString() {
+    StringBuilder sb = new StringBuilder();
+    sb.append('[');
+    final String s = toString();
+    for(int i=0;i<s.length();i++) {
+      if (i > 0) {
+        sb.append(' ');
+      }
+      sb.append(asUnicodeChar(s.charAt(i)));
     }
     sb.append(']');
     return sb.toString();
@@ -189,4 +191,57 @@
   public void grow(int newLength) {
     bytes = ArrayUtil.grow(bytes, newLength);
   }
-}
\ No newline at end of file
+
+  public abstract static class Comparator {
+    abstract public int compare(TermRef a, TermRef b);
+  }
+
+  private final static Comparator utf8SortedAsUTF16SortOrder = new UTF8SortedAsUTF16Comparator();
+
+  public static Comparator getUTF8SortedAsUTF16Comparator() {
+    return utf8SortedAsUTF16SortOrder;
+  }
+
+  public static class UTF8SortedAsUTF16Comparator extends Comparator {
+    public int compare(TermRef a, TermRef b) {
+
+      final byte[] aBytes = a.bytes;
+      int aUpto = a.offset;
+      final byte[] bBytes = b.bytes;
+      int bUpto = b.offset;
+      
+      final int aStop;
+      if (a.length < b.length) {
+        aStop = aUpto + a.length;
+      } else {
+        aStop = aUpto + b.length;
+      }
+
+      while(aUpto < aStop) {
+        int aByte = aBytes[aUpto++] & 0xff;
+        int bByte = bBytes[bUpto++] & 0xff;
+
+        if (aByte != bByte) {
+
+          // See http://icu-project.org/docs/papers/utf16_code_point_order.html#utf-8-in-utf-16-order
+
+          // We know the terms are not equal, but, we may
+          // have to carefully fixup the bytes at the
+          // difference to match UTF16's sort order:
+          if (aByte >= 0xee && bByte >= 0xee) {
+            if ((aByte & 0xfe) == 0xee) {
+              aByte += 0x10;
+            }
+            if ((bByte&0xfe) == 0xee) {
+              bByte += 0x10;
+            }
+          }
+          return aByte - bByte;
+        }
+      }
+
+      // One is a prefix of the other, or, they are equal:
+      return a.length - b.length;
+    }
+  }
+}

Modified: lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/TermVectorsTermsWriterPerField.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/TermVectorsTermsWriterPerField.java?rev=885265&r1=885264&r2=885265&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/TermVectorsTermsWriterPerField.java (original)
+++ lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/TermVectorsTermsWriterPerField.java Sun Nov 29 20:28:51 2009
@@ -94,6 +94,9 @@
 
   public void abort() {}
 
+  // nocommit -- should be @ thread level not field
+  private final TermRef flushTerm = new TermRef();
+
   /** Called once per field per document if term vectors
    *  are enabled, to write the vectors to
    *  RAMOutputStream, which is then quickly flushed to
@@ -124,7 +127,9 @@
 
     perThread.doc.addField(termsHashPerField.fieldInfo.number);
 
-    final RawPostingList[] postings = termsHashPerField.sortPostings();
+    // nocommit -- should I sort by whatever terms dict is
+    // sorting by?
+    final RawPostingList[] postings = termsHashPerField.sortPostings(TermRef.getUTF8SortedAsUTF16Comparator());
 
     tvf.writeVInt(numPostings);
     byte bits = 0x0;
@@ -139,43 +144,35 @@
     int lastStart = 0;
       
     final ByteSliceReader reader = perThread.vectorSliceReader;
-    final byte[][] byteBuffers = perThread.termsHashPerThread.termBytePool.buffers;
+    final ByteBlockPool termBytePool = perThread.termsHashPerThread.termBytePool;
 
     for(int j=0;j<numPostings;j++) {
       final TermVectorsTermsWriter.PostingList posting = (TermVectorsTermsWriter.PostingList) postings[j];
       final int freq = posting.freq;
-          
-      final byte[] bytes = byteBuffers[posting.textStart >> DocumentsWriter.BYTE_BLOCK_SHIFT];
-      final int start = posting.textStart & DocumentsWriter.BYTE_BLOCK_MASK;
-
-      // nocommit: we can do this as completion of
-      // prefix-finding loop, below:
-      int upto = start;
-      while(bytes[upto] != TermsHashPerField.END_OF_TERM) {
-        upto++;
-      }
-      final int len = upto - start;
+
+      // Get TermRef
+      termBytePool.setTermRef(flushTerm, posting.textStart);
 
       // Compute common byte prefix between last term and
       // this term
       int prefix = 0;
       if (j > 0) {
-        while(prefix < lastLen && prefix < len) {
-          if (lastBytes[lastStart+prefix] != bytes[start+prefix]) {
+        while(prefix < lastLen && prefix < flushTerm.length) {
+          if (lastBytes[lastStart+prefix] != flushTerm.bytes[flushTerm.offset+prefix]) {
             break;
           }
           prefix++;
         }
       }
 
-      lastLen = len;
-      lastBytes = bytes;
-      lastStart = start;
+      lastLen = flushTerm.length;
+      lastBytes = flushTerm.bytes;
+      lastStart = flushTerm.offset;
 
-      final int suffix = len - prefix;
+      final int suffix = flushTerm.length - prefix;
       tvf.writeVInt(prefix);
       tvf.writeVInt(suffix);
-      tvf.writeBytes(bytes, lastStart+prefix, suffix);
+      tvf.writeBytes(flushTerm.bytes, lastStart+prefix, suffix);
       tvf.writeVInt(freq);
 
       if (doVectorPositions) {

Modified: lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/Terms.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/Terms.java?rev=885265&r1=885264&r2=885265&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/Terms.java (original)
+++ lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/Terms.java Sun Nov 29 20:28:51 2009
@@ -30,6 +30,13 @@
   /** Returns an iterator that will step through all terms */
   public abstract TermsEnum iterator() throws IOException;
   
+  /** Return the TermRef Comparator used to sort terms
+   *  provided by the iterator.  NOTE: this may return null
+   *  if there are no terms.  This method may be invoked
+   *  many times; it's best to cache a single instance &
+   *  reuse it. */
+  public abstract TermRef.Comparator getTermComparator() throws IOException;
+
   /** Returns the docFreq of the specified term text. */
   public int docFreq(TermRef text) throws IOException {
     // nocommit -- make thread private cache so we share

Modified: lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/TermsEnum.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/TermsEnum.java?rev=885265&r1=885264&r2=885265&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/TermsEnum.java (original)
+++ lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/TermsEnum.java Sun Nov 29 20:28:51 2009
@@ -81,5 +81,11 @@
    *  TermsEnum's {@link #seek} or {@link #next} until you
    *  are done using the DocsEnum. */
   public abstract DocsEnum docs(Bits skipDocs) throws IOException;
-}
 
+  /** Return the TermRef Comparator used to sort terms
+   *  provided by the iterator.  NOTE: this may return null
+   *  if there are no terms.  This method may be invoked
+   *  many times; it's best to cache a single instance &
+   *  reuse it. */
+  public abstract TermRef.Comparator getTermComparator() throws IOException;
+}

Modified: lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/TermsHashPerField.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/TermsHashPerField.java?rev=885265&r1=885264&r2=885265&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/TermsHashPerField.java (original)
+++ lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/TermsHashPerField.java Sun Nov 29 20:28:51 2009
@@ -33,8 +33,6 @@
   final FieldInvertState fieldState;
   TermAttribute termAtt;
 
-  static final byte END_OF_TERM = (byte) 0xff;
-  
   // Copied from our perThread
   final IntBlockPool intPool;
   final ByteBlockPool bytePool;
@@ -53,7 +51,13 @@
   private RawPostingList[] postingsHash = new RawPostingList[postingsHashSize];
   private RawPostingList p;
   private final UnicodeUtil.UTF8Result utf8;
-  
+  private TermRef.Comparator termComp;
+
+  // nocommit -- move to thread level
+  // Used when comparing postings via termRefComp
+  private final TermRef tr1 = new TermRef();
+  private final TermRef tr2 = new TermRef();
+
   public TermsHashPerField(DocInverterPerField docInverterPerField, final TermsHashPerThread perThread, final TermsHashPerThread nextPerThread, final FieldInfo fieldInfo) {
     this.perThread = perThread;
     intPool = perThread.intPool;
@@ -62,6 +66,10 @@
     docState = perThread.docState;
     fieldState = docInverterPerField.fieldState;
     this.consumer = perThread.consumer.addField(this, fieldInfo);
+
+    tr1.length = 3*((int) (Short.MAX_VALUE));
+    tr2.length = 3*((int) (Short.MAX_VALUE));
+
     streamCount = consumer.getStreamCount();
     numPostingInt = 2*streamCount;
     utf8 = perThread.utf8;
@@ -137,7 +145,8 @@
   }
 
   /** Collapse the hash table & sort in-place. */
-  public RawPostingList[] sortPostings() {
+  public RawPostingList[] sortPostings(TermRef.Comparator termComp) {
+    this.termComp = termComp;
     compactPostings();
     quickSort(postingsHash, 0, numPostings-1);
     return postingsHash;
@@ -209,31 +218,14 @@
   int comparePostings(RawPostingList p1, RawPostingList p2) {
 
     if (p1 == p2) {
+      // nocommit -- why does this happen again?
       return 0;
     }
 
-    final byte[] text1 = termBytePool.buffers[p1.textStart >> DocumentsWriter.BYTE_BLOCK_SHIFT];
-    int pos1 = p1.textStart & DocumentsWriter.BYTE_BLOCK_MASK;
-    final byte[] text2 = termBytePool.buffers[p2.textStart >> DocumentsWriter.BYTE_BLOCK_SHIFT];
-    int pos2 = p2.textStart & DocumentsWriter.BYTE_BLOCK_MASK;
-
-    assert text1 != text2 || pos1 != pos2;
-
-    while(true) {
-      final byte b1 = text1[pos1++];
-      final byte b2 = text2[pos2++];
-      if (b1 != b2) {
-        if (END_OF_TERM == b2)
-          return 1;
-        else if (END_OF_TERM == b1)
-          return -1;
-        else
-          return (b1&0xff)-(b2&0xff);
-      } else
-        // This method should never compare equal postings
-        // unless p1==p2
-        assert b1 != END_OF_TERM;
-    }
+    termBytePool.setTermRef(tr1, p1.textStart);
+    termBytePool.setTermRef(tr2, p2.textStart);
+
+    return termComp.compare(tr1, tr2);
   }
 
   /** Test whether the text for current RawPostingList p equals
@@ -243,14 +235,29 @@
     final byte[] text = termBytePool.buffers[p.textStart >> DocumentsWriter.BYTE_BLOCK_SHIFT];
     assert text != null;
     int pos = p.textStart & DocumentsWriter.BYTE_BLOCK_MASK;
+    
+    final int len;
+    if ((text[pos] & 0x80) == 0) {
+      // length is 1 byte
+      len = text[pos];
+      pos += 1;
+    } else {
+      // length is 2 bytes
+      len = (text[pos]&0x7f) + ((text[pos+1]&0xff)<<7);
+      pos += 2;
+    }
 
-    final byte[] utf8Bytes = utf8.result;
-    for(int tokenPos=0;tokenPos<utf8.length;pos++,tokenPos++) {
-      if (utf8Bytes[tokenPos] != text[pos]) {
-        return false;
+    if (len == utf8.length) {
+      final byte[] utf8Bytes = utf8.result;
+      for(int tokenPos=0;tokenPos<utf8.length;pos++,tokenPos++) {
+        if (utf8Bytes[tokenPos] != text[pos]) {
+          return false;
+        }
       }
+      return true;
+    } else {
+      return false;
     }
-    return END_OF_TERM == text[pos];
   }
   
   private boolean doCall;
@@ -360,7 +367,9 @@
     // Get the text of this term.
     final char[] tokenText = termAtt.termBuffer();;
     final int tokenTextLen = termAtt.termLength();
-
+    
+    //System.out.println("\nfield=" + fieldInfo.name + " add text=" + new String(tokenText, 0, tokenTextLen) + " len=" + tokenTextLen);
+    
     UnicodeUtil.UTF16toUTF8(tokenText, 0, tokenTextLen, utf8);
 
     // nocommit -- modify UnicodeUtil to compute hash for us
@@ -388,11 +397,15 @@
 
     if (p == null) {
 
+      //System.out.println("  not seen yet");
+
       // First time we are seeing this token since we last
       // flushed the hash.
-      final int textLen1 = 1+utf8.length;
-      if (textLen1 + bytePool.byteUpto > DocumentsWriter.BYTE_BLOCK_SIZE) {
-        if (textLen1 > DocumentsWriter.BYTE_BLOCK_SIZE) {
+      final int textLen2 = 2+utf8.length;
+      if (textLen2 + bytePool.byteUpto > DocumentsWriter.BYTE_BLOCK_SIZE) {
+        // Not enough room in current block
+
+        if (utf8.length > DocumentsWriter.MAX_TERM_LENGTH_UTF8) {
           // Just skip this term, to remain as robust as
           // possible during indexing.  A TokenFilter
           // can be inserted into the analyzer chain if
@@ -422,10 +435,25 @@
       final int textUpto = bytePool.byteUpto;
       p.textStart = textUpto + bytePool.byteOffset;
 
-      bytePool.byteUpto += textLen1;
-      System.arraycopy(utf8.result, 0, text, textUpto, utf8.length);
-      text[textUpto+utf8.length] = END_OF_TERM;
-          
+      // We first encode the length, followed by the UTF8
+      // bytes.  Length is encoded as vInt, but will consume
+      // 1 or 2 bytes at most (we reject too-long terms,
+      // above).
+
+      // encode length @ start of bytes
+      if (utf8.length < 128) {
+        // 1 byte to store length
+        text[textUpto] = (byte) utf8.length;
+        bytePool.byteUpto += utf8.length + 1;
+        System.arraycopy(utf8.result, 0, text, textUpto+1, utf8.length);
+      } else {
+        // 2 bytes to store length
+        text[textUpto] = (byte) (0x80 | (utf8.length & 0x7f));
+        text[textUpto+1] = (byte) ((utf8.length>>7) & 0xff);
+        bytePool.byteUpto += utf8.length + 2;
+        System.arraycopy(utf8.result, 0, text, textUpto+2, utf8.length);
+      }
+
       assert postingsHash[hashPos] == null;
       postingsHash[hashPos] = p;
       numPostings++;
@@ -458,6 +486,7 @@
       consumer.newTerm(p);
 
     } else {
+      // System.out.println("  already seen");
       intUptos = intPool.buffers[p.intStart >> DocumentsWriter.INT_BLOCK_SHIFT];
       intUptoStart = p.intStart & DocumentsWriter.INT_BLOCK_MASK;
       consumer.addTerm(p);
@@ -514,6 +543,7 @@
 
     final int newMask = newSize-1;
 
+    //System.out.println("  rehash");
     RawPostingList[] newHash = new RawPostingList[newSize];
     for(int i=0;i<postingsHashSize;i++) {
       RawPostingList p0 = postingsHash[i];
@@ -523,8 +553,21 @@
           final int start = p0.textStart & DocumentsWriter.BYTE_BLOCK_MASK;
           final byte[] text = bytePool.buffers[p0.textStart >> DocumentsWriter.BYTE_BLOCK_SHIFT];
           code = 0;
-          int pos = start;
-          while(text[pos] != END_OF_TERM) {
+
+          final int len;
+          int pos;
+          if ((text[start] & 0x80) == 0) {
+            // length is 1 byte
+            len = text[start];
+            pos = start+1;
+          } else {
+            len = (text[start]&0x7f) + ((text[start+1]&0xff)<<7);
+            pos = start+2;
+          }
+          //System.out.println("    term=" + bytePool.setTermRef(new TermRef(), p0.textStart).toBytesString());
+
+          final int endPos = pos+len;
+          while(pos < endPos) {
             code = (code*31) + text[pos++];
           }
         } else {

Modified: lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/FieldsConsumer.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/FieldsConsumer.java?rev=885265&r1=885264&r2=885265&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/FieldsConsumer.java (original)
+++ lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/FieldsConsumer.java Sun Nov 29 20:28:51 2009
@@ -21,6 +21,7 @@
 import org.apache.lucene.index.Fields;
 import org.apache.lucene.index.FieldsEnum;
 import org.apache.lucene.index.TermsEnum;
+import org.apache.lucene.index.TermRef;
 import org.apache.lucene.util.PriorityQueue;
 
 import java.io.IOException;

Modified: lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/TermsConsumer.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/TermsConsumer.java?rev=885265&r1=885264&r2=885265&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/TermsConsumer.java (original)
+++ lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/TermsConsumer.java Sun Nov 29 20:28:51 2009
@@ -39,6 +39,10 @@
   /** Called when we are done adding terms to this field */
   public abstract void finish() throws IOException;
 
+  /** Return the TermRef Comparator used to sort terms
+   *  before feeding to this API. */
+  public abstract TermRef.Comparator getTermComparator() throws IOException;
+
   // For default merge impl
   public static class TermMergeState {
     TermRef current;
@@ -47,13 +51,17 @@
   }
 
   private final static class MergeQueue extends PriorityQueue<TermMergeState> {
-    public MergeQueue(int size) {
+
+    final TermRef.Comparator termComp;
+
+    public MergeQueue(int size, TermRef.Comparator termComp) {
       initialize(size);
+      this.termComp = termComp;
     }
 
     @Override
     protected final boolean lessThan(TermMergeState a, TermMergeState b) {
-      final int cmp = a.current.compareTerm(b.current);
+      final int cmp = termComp.compare(a.current, b.current);
       if (cmp != 0) {
         return cmp < 0;
       } else {
@@ -68,13 +76,20 @@
 
   /** Default merge impl */
   public void merge(MergeState mergeState, TermMergeState[] termsStates, int count) throws IOException {
+
+    final TermRef.Comparator termComp = getTermComparator();
+
+    //System.out.println("merge terms field=" + mergeState.fieldInfo.name + " comp=" + termComp);
+
     if (queue == null) {
-      queue = new MergeQueue(mergeState.readerCount);
+      queue = new MergeQueue(mergeState.readerCount, termComp);
       match = new DocsConsumer.DocsMergeState[mergeState.readerCount];
       for(int i=0;i<mergeState.readerCount;i++) {
         match[i] = new DocsConsumer.DocsMergeState();
       }
       pending = new TermMergeState[mergeState.readerCount];
+    } else if (!queue.termComp.equals(termComp)) {
+      queue = new MergeQueue(mergeState.readerCount, termComp);
     }
 
     // Init queue
@@ -111,6 +126,7 @@
       if (matchCount > 0) {
         // Merge one term
         final TermRef term = pending[0].current;
+        //System.out.println("  merge term=" + term);
         final DocsConsumer docsConsumer = startTerm(term);
         final int numDocs = docsConsumer.merge(mergeState, match, matchCount);
         finishTerm(term, numDocs);

Modified: lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/intblock/IntBlockCodec.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/intblock/IntBlockCodec.java?rev=885265&r1=885264&r2=885265&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/intblock/IntBlockCodec.java (original)
+++ lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/intblock/IntBlockCodec.java Sun Nov 29 20:28:51 2009
@@ -23,6 +23,7 @@
 import org.apache.lucene.index.FieldInfos;
 import org.apache.lucene.index.SegmentInfo;
 import org.apache.lucene.index.SegmentWriteState;
+import org.apache.lucene.index.TermRef;
 import org.apache.lucene.index.codecs.Codec;
 import org.apache.lucene.index.codecs.FieldsConsumer;
 import org.apache.lucene.index.codecs.FieldsProducer;
@@ -62,7 +63,7 @@
 
     success = false;
     try {
-      FieldsConsumer ret = new StandardTermsDictWriter(indexWriter, state, docsWriter);
+      FieldsConsumer ret = new StandardTermsDictWriter(indexWriter, state, docsWriter, TermRef.getUTF8SortedAsUTF16Comparator());
       success = true;
       return ret;
     } finally {
@@ -94,7 +95,8 @@
       indexReader = new SimpleStandardTermsIndexReader(dir,
                                                        fieldInfos,
                                                        si.name,
-                                                       indexDivisor);
+                                                       indexDivisor,
+                                                       TermRef.getUTF8SortedAsUTF16Comparator());
       success = true;
     } finally {
       if (!success) {
@@ -107,7 +109,8 @@
       FieldsProducer ret = new StandardTermsDictReader(indexReader,
                                                        dir, fieldInfos, si.name,
                                                        docsReader,
-                                                       readBufferSize);
+                                                       readBufferSize,
+                                                       TermRef.getUTF8SortedAsUTF16Comparator());
       success = true;
       return ret;
     } finally {

Modified: lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/preflex/PreFlexFields.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/preflex/PreFlexFields.java?rev=885265&r1=885264&r2=885265&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/preflex/PreFlexFields.java (original)
+++ lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/preflex/PreFlexFields.java Sun Nov 29 20:28:51 2009
@@ -159,11 +159,18 @@
     PreTerms(FieldInfo fieldInfo) {
       this.fieldInfo = fieldInfo;
     }
+
     @Override
     public TermsEnum iterator() {
       //System.out.println("pff.init create no context");
       return new PreTermsEnum(fieldInfo);
     }
+
+    @Override
+    public TermRef.Comparator getTermComparator() {
+      // Pre-flex indexes always sorted in UTF16 order
+      return TermRef.getUTF8SortedAsUTF16Comparator();
+    }
   }
 
   private class PreTermsEnum extends TermsEnum {
@@ -188,6 +195,12 @@
     }
 
     @Override
+    public TermRef.Comparator getTermComparator() {
+      // Pre-flex indexes always sorted in UTF16 order
+      return TermRef.getUTF8SortedAsUTF16Comparator();
+    }
+
+    @Override
     public SeekStatus seek(long ord) throws IOException {
       throw new UnsupportedOperationException();
     }

Modified: lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/pulsing/PulsingCodec.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/pulsing/PulsingCodec.java?rev=885265&r1=885264&r2=885265&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/pulsing/PulsingCodec.java (original)
+++ lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/pulsing/PulsingCodec.java Sun Nov 29 20:28:51 2009
@@ -23,6 +23,7 @@
 import org.apache.lucene.index.FieldInfos;
 import org.apache.lucene.index.SegmentInfo;
 import org.apache.lucene.index.SegmentWriteState;
+import org.apache.lucene.index.TermRef;
 import org.apache.lucene.index.codecs.Codec;
 import org.apache.lucene.index.codecs.standard.StandardDocsConsumer;
 import org.apache.lucene.index.codecs.standard.StandardDocsProducer;
@@ -78,7 +79,7 @@
     // Terms dict
     success = false;
     try {
-      FieldsConsumer ret = new StandardTermsDictWriter(indexWriter, state, pulsingWriter);
+      FieldsConsumer ret = new StandardTermsDictWriter(indexWriter, state, pulsingWriter, TermRef.getUTF8SortedAsUTF16Comparator());
       success = true;
       return ret;
     } finally {
@@ -108,7 +109,8 @@
       indexReader = new SimpleStandardTermsIndexReader(dir,
                                                        fieldInfos,
                                                        si.name,
-                                                       indexDivisor);
+                                                       indexDivisor,
+                                                       TermRef.getUTF8SortedAsUTF16Comparator());
       success = true;
     } finally {
       if (!success) {
@@ -122,7 +124,8 @@
       FieldsProducer ret = new StandardTermsDictReader(indexReader,
                                                        dir, fieldInfos, si.name,
                                                        docsReader,
-                                                       readBufferSize);
+                                                       readBufferSize,
+                                                       TermRef.getUTF8SortedAsUTF16Comparator());
       success = true;
       return ret;
     } finally {

Modified: lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/pulsing/PulsingDocsReader.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/pulsing/PulsingDocsReader.java?rev=885265&r1=885264&r2=885265&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/pulsing/PulsingDocsReader.java (original)
+++ lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/pulsing/PulsingDocsReader.java Sun Nov 29 20:28:51 2009
@@ -39,14 +39,14 @@
 // create two separate docs readers, one that also reads
 // prox and one that doesn't?
 
-class PulsingDocsReader extends StandardDocsProducer {
+public class PulsingDocsReader extends StandardDocsProducer {
 
   // Fallback reader for non-pulsed terms:
   final StandardDocsProducer wrappedDocsReader;
   IndexInput termsIn;
   int maxPulsingDocFreq;
 
-  PulsingDocsReader(Directory dir, SegmentInfo segmentInfo, int readBufferSize, StandardDocsProducer wrappedDocsReader) throws IOException {
+  public PulsingDocsReader(Directory dir, SegmentInfo segmentInfo, int readBufferSize, StandardDocsProducer wrappedDocsReader) throws IOException {
     this.wrappedDocsReader = wrappedDocsReader;
   }
 

Modified: lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/pulsing/PulsingDocsWriter.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/pulsing/PulsingDocsWriter.java?rev=885265&r1=885264&r2=885265&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/pulsing/PulsingDocsWriter.java (original)
+++ lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/pulsing/PulsingDocsWriter.java Sun Nov 29 20:28:51 2009
@@ -120,7 +120,7 @@
 
   /** If docFreq <= maxPulsingDocFreq, its postings are
    *  inlined into terms dict */
-  PulsingDocsWriter(SegmentWriteState state, int maxPulsingDocFreq, StandardDocsConsumer wrappedDocsWriter) throws IOException {
+  public PulsingDocsWriter(SegmentWriteState state, int maxPulsingDocFreq, StandardDocsConsumer wrappedDocsWriter) throws IOException {
     super();
 
     pendingDocs = new Document[maxPulsingDocFreq];

Modified: lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/sep/SepCodec.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/sep/SepCodec.java?rev=885265&r1=885264&r2=885265&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/sep/SepCodec.java (original)
+++ lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/sep/SepCodec.java Sun Nov 29 20:28:51 2009
@@ -23,6 +23,7 @@
 import org.apache.lucene.index.FieldInfos;
 import org.apache.lucene.index.SegmentInfo;
 import org.apache.lucene.index.SegmentWriteState;
+import org.apache.lucene.index.TermRef;
 import org.apache.lucene.index.codecs.Codec;
 import org.apache.lucene.index.codecs.FieldsConsumer;
 import org.apache.lucene.index.codecs.FieldsProducer;
@@ -60,7 +61,7 @@
 
     success = false;
     try {
-      FieldsConsumer ret = new StandardTermsDictWriter(indexWriter, state, docsWriter);
+      FieldsConsumer ret = new StandardTermsDictWriter(indexWriter, state, docsWriter, TermRef.getUTF8SortedAsUTF16Comparator());
       success = true;
       return ret;
     } finally {
@@ -91,7 +92,8 @@
       indexReader = new SimpleStandardTermsIndexReader(dir,
                                                        fieldInfos,
                                                        si.name,
-                                                       indexDivisor);
+                                                       indexDivisor,
+                                                       TermRef.getUTF8SortedAsUTF16Comparator());
       success = true;
     } finally {
       if (!success) {
@@ -104,7 +106,8 @@
       FieldsProducer ret = new StandardTermsDictReader(indexReader,
                                                        dir, fieldInfos, si.name,
                                                        docsReader,
-                                                       readBufferSize);
+                                                       readBufferSize,
+                                                       TermRef.getUTF8SortedAsUTF16Comparator());
       success = true;
       return ret;
     } finally {

Modified: lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/standard/DeltaBytesWriter.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/standard/DeltaBytesWriter.java?rev=885265&r1=885264&r2=885265&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/standard/DeltaBytesWriter.java (original)
+++ lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/standard/DeltaBytesWriter.java Sun Nov 29 20:28:51 2009
@@ -61,6 +61,9 @@
     if (lastBytes.length < length) {
       lastBytes = ArrayUtil.grow(lastBytes, length);
     }
+    // TODO: is this copy really necessary?  I don't think
+    // caller actually modifies these bytes, so we can save
+    // by reference?
     System.arraycopy(bytes, upto, lastBytes, start, suffix);
     lastLength = length;
   }

Modified: lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/standard/SimpleStandardTermsIndexReader.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/standard/SimpleStandardTermsIndexReader.java?rev=885265&r1=885264&r2=885265&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/standard/SimpleStandardTermsIndexReader.java (original)
+++ lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/standard/SimpleStandardTermsIndexReader.java Sun Nov 29 20:28:51 2009
@@ -64,19 +64,22 @@
 
   final private IndexInput in;
   private volatile boolean indexLoaded;
+  private final TermRef.Comparator termComp;
 
   final HashMap<FieldInfo,FieldIndexReader> fields = new HashMap<FieldInfo,FieldIndexReader>();
 
-  public SimpleStandardTermsIndexReader(Directory dir, FieldInfos fieldInfos, String segment, int indexDivisor)
+  public SimpleStandardTermsIndexReader(Directory dir, FieldInfos fieldInfos, String segment, int indexDivisor, TermRef.Comparator termComp)
     throws IOException {
+
+    this.termComp = termComp;
+
+    // nocommit -- why was this needed?
     String file = IndexFileNames.segmentFileName(segment, StandardCodec.TERMS_INDEX_EXTENSION);
-    //nocommit
-    if(!dir.fileExists(file)) {
+    if (!dir.fileExists(file)) {
       indexInterval = 0;
       totalIndexInterval = 0;
       this.indexDivisor = indexDivisor;
       in = null;
- 
       return;
     }
     IndexInput in = dir.openInput(file);
@@ -426,7 +429,7 @@
           result.term.length = termLength[mid];
           //System.out.println("    term=" + result.term);
 
-          int delta = term.compareTerm(result.term);
+          int delta = termComp.compare(term, result.term);
           if (delta < 0) {
             hi = mid - 1;
           } else if (delta > 0) {

Modified: lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/standard/StandardCodec.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/standard/StandardCodec.java?rev=885265&r1=885264&r2=885265&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/standard/StandardCodec.java (original)
+++ lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/standard/StandardCodec.java Sun Nov 29 20:28:51 2009
@@ -23,6 +23,7 @@
 import org.apache.lucene.index.FieldInfos;
 import org.apache.lucene.index.SegmentInfo;
 import org.apache.lucene.index.SegmentWriteState;
+import org.apache.lucene.index.TermRef;
 import org.apache.lucene.index.codecs.Codec;
 import org.apache.lucene.index.codecs.FieldsConsumer;
 import org.apache.lucene.index.codecs.FieldsProducer;
@@ -52,7 +53,7 @@
 
     success = false;
     try {
-      FieldsConsumer ret = new StandardTermsDictWriter(indexWriter, state, docs);
+      FieldsConsumer ret = new StandardTermsDictWriter(indexWriter, state, docs, TermRef.getUTF8SortedAsUTF16Comparator());
       success = true;
       return ret;
     } finally {
@@ -78,7 +79,8 @@
       indexReader = new SimpleStandardTermsIndexReader(dir,
                                                        fieldInfos,
                                                        si.name,
-                                                       indexDivisor);
+                                                       indexDivisor,
+                                                       TermRef.getUTF8SortedAsUTF16Comparator());
       success = true;
     } finally {
       if (!success) {
@@ -91,7 +93,8 @@
       FieldsProducer ret = new StandardTermsDictReader(indexReader,
                                                        dir, fieldInfos, si.name,
                                                        docs,
-                                                       readBufferSize);
+                                                       readBufferSize,
+                                                       TermRef.getUTF8SortedAsUTF16Comparator());
       success = true;
       return ret;
     } finally {

Modified: lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/standard/StandardTermsDictReader.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/standard/StandardTermsDictReader.java?rev=885265&r1=885264&r2=885265&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/standard/StandardTermsDictReader.java (original)
+++ lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/standard/StandardTermsDictReader.java Sun Nov 29 20:28:51 2009
@@ -57,12 +57,16 @@
   private final String segment;
   private StandardTermsIndexReader indexReader;
 
+  private final TermRef.Comparator termComp;
 
-  public StandardTermsDictReader(StandardTermsIndexReader indexReader, Directory dir, FieldInfos fieldInfos, String segment, StandardDocsProducer docs, int readBufferSize)
+  public StandardTermsDictReader(StandardTermsIndexReader indexReader, Directory dir, FieldInfos fieldInfos, String segment, StandardDocsProducer docs, int readBufferSize,
+                                 TermRef.Comparator termComp)
     throws IOException {
     
     this.segment = segment;
     this.docs = docs;
+
+    this.termComp = termComp;
     
     String file = IndexFileNames.segmentFileName(segment, StandardCodec.TERMS_EXTENSION);
     //nocommit
@@ -231,6 +235,11 @@
       }
     }
 
+    @Override
+    public TermRef.Comparator getTermComparator() {
+      return termComp;
+    }
+
     // nocommit -- figure out how to do this one: we want to
     // reuse the thread private TermsEnum, but, get a
     // clone'd docs, somehow.  This way if code is using the
@@ -296,6 +305,11 @@
         docs = StandardTermsDictReader.this.docs.reader(fieldInfo, in);
       }
 
+      @Override
+      public TermRef.Comparator getTermComparator() {
+        return termComp;
+      }
+
       /** Seeks until the first term that's >= the provided
        *  text; returns SeekStatus.FOUND if the exact term
        *  is found, SeekStatus.NOT_FOUND if a different term
@@ -335,7 +349,7 @@
         // so) -- I'd prefer such silly apps take the hit,
         // not well behaved apps?
 
-        if (bytesReader.started && termUpto < numTerms && bytesReader.term.compareTerm(term) == 0) {
+        if (bytesReader.started && termUpto < numTerms && bytesReader.term.termEquals(term)) {
           // nocommit -- not right if text is ""?
           // mxx
           if (Codec.DEBUG) {
@@ -384,7 +398,7 @@
         //int scanCnt = 0;
         while(next() != null) {
           //scanCnt++;
-          final int cmp = bytesReader.term.compareTerm(term);
+          final int cmp = termComp.compare(bytesReader.term, term);
           if (cmp == 0) {
             // mxx
             if (Codec.DEBUG) {

Modified: lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/standard/StandardTermsDictWriter.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/standard/StandardTermsDictWriter.java?rev=885265&r1=885264&r2=885265&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/standard/StandardTermsDictWriter.java (original)
+++ lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/standard/StandardTermsDictWriter.java Sun Nov 29 20:28:51 2009
@@ -59,13 +59,15 @@
   FieldInfo currentField;
   private final StandardTermsIndexWriter indexWriter;
   private final List<TermsConsumer> fields = new ArrayList<TermsConsumer>();
+  private final TermRef.Comparator termComp;
 
   // nocommit
   private String segment;
 
-  public StandardTermsDictWriter(StandardTermsIndexWriter indexWriter, SegmentWriteState state, StandardDocsConsumer consumer) throws IOException {
+  public StandardTermsDictWriter(StandardTermsIndexWriter indexWriter, SegmentWriteState state, StandardDocsConsumer consumer, TermRef.Comparator termComp) throws IOException {
     final String termsFileName = IndexFileNames.segmentFileName(state.segmentName, StandardCodec.TERMS_EXTENSION);
     this.indexWriter = indexWriter;
+    this.termComp = termComp;
     out = state.directory.createOutput(termsFileName);
     indexWriter.setTermsOutput(out);
     state.flushedFiles.add(termsFileName);
@@ -165,6 +167,11 @@
     }
     
     @Override
+    public TermRef.Comparator getTermComparator() {
+      return termComp;
+    }
+
+    @Override
     public DocsConsumer startTerm(TermRef text) throws IOException {
       consumer.startTerm();
       if (Codec.DEBUG) {

Modified: lucene/java/branches/flex_1458/src/java/org/apache/lucene/search/FilteredTermsEnum.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/src/java/org/apache/lucene/search/FilteredTermsEnum.java?rev=885265&r1=885264&r2=885265&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/src/java/org/apache/lucene/search/FilteredTermsEnum.java (original)
+++ lucene/java/branches/flex_1458/src/java/org/apache/lucene/search/FilteredTermsEnum.java Sun Nov 29 20:28:51 2009
@@ -88,6 +88,12 @@
     }
     return actualEnum.term();
   }
+
+  @Override
+  /** Don't call this until after setEnum, else you'll hit NPE */
+  public TermRef.Comparator getTermComparator() throws IOException {
+    return actualEnum.getTermComparator();
+  }
     
   /** 
    * Returns the docFreq of the current Term in the enumeration.

Modified: lucene/java/branches/flex_1458/src/java/org/apache/lucene/search/NumericRangeQuery.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/src/java/org/apache/lucene/search/NumericRangeQuery.java?rev=885265&r1=885264&r2=885265&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/src/java/org/apache/lucene/search/NumericRangeQuery.java (original)
+++ lucene/java/branches/flex_1458/src/java/org/apache/lucene/search/NumericRangeQuery.java Sun Nov 29 20:28:51 2009
@@ -567,10 +567,11 @@
     private final LinkedList<String> rangeBounds = new LinkedList<String>();
     private TermRef currentUpperBound = null;
     private final boolean empty;
+    private final TermRef.Comparator termComp;
 
     NumericRangeTermsEnum(final IndexReader reader) throws IOException {
       this.reader = reader;
-      
+
       switch (valSize) {
         case 64: {
           // lower
@@ -649,6 +650,18 @@
           throw new IllegalArgumentException("valSize must be 32 or 64");
       }
       
+      // TODO: NRQ by design relies on a specific sort
+      // order; I think UTF-8 or UTF-16 would work (NRQ encodes
+      // to only ASCII).
+      
+      Terms terms = reader.fields().terms(field);
+      if (terms != null) {
+        // cache locally
+        termComp = terms.getTermComparator();
+      } else {
+        termComp = null;
+      }
+
       // seek to first term
       empty = next() == null;
     }
@@ -678,7 +691,7 @@
      */
     @Override
     protected AcceptStatus accept(TermRef term) {
-      if (term.compareTerm(currentUpperBound) <= 0) {
+      if (termComp.compare(term, currentUpperBound) <= 0) {
         return AcceptStatus.YES;
       } else {
         return AcceptStatus.NO;

Modified: lucene/java/branches/flex_1458/src/java/org/apache/lucene/search/TermRangeTermsEnum.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/src/java/org/apache/lucene/search/TermRangeTermsEnum.java?rev=885265&r1=885264&r2=885265&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/src/java/org/apache/lucene/search/TermRangeTermsEnum.java (original)
+++ lucene/java/branches/flex_1458/src/java/org/apache/lucene/search/TermRangeTermsEnum.java Sun Nov 29 20:28:51 2009
@@ -44,6 +44,7 @@
   final private TermRef lowerTermRef;
   final private TermRef upperTermRef;
   private final boolean empty;
+  private final TermRef.Comparator termComp;
 
   /**
    * Enumerates all terms greater/equal than <code>lowerTerm</code>
@@ -80,6 +81,7 @@
     this.includeLower = includeLower;
     this.includeUpper = includeUpper;
     this.field = StringHelper.intern(field);
+
     // do a little bit of normalization...
     // open ended range queries should always be inclusive.
     if (this.lowerTermText == null) {
@@ -99,7 +101,9 @@
     Terms terms = reader.fields().terms(field);
 
     if (terms != null) {
+      termComp = terms.getTermComparator();
       final boolean foundFirstTerm = setEnum(terms.iterator(), new TermRef(startTermText)) != null;
+
       if (foundFirstTerm && collator == null && !this.includeLower && term().termEquals(lowerTermRef)) {
         empty = next() == null;
       } else {
@@ -107,6 +111,7 @@
       }
     } else {
       empty = true;
+      termComp = null;
     }
   }
 
@@ -128,9 +133,9 @@
   @Override
   protected AcceptStatus accept(TermRef term) {
     if (collator == null) {
-      // Use Unicode code point ordering
+      // Use this field's default sort ordering
       if (upperTermRef != null) {
-        final int cmp = upperTermRef.compareTerm(term);
+        final int cmp = termComp.compare(upperTermRef, term);
         /*
          * if beyond the upper term, or is exclusive and this is equal to
          * the upper term, break out

Modified: lucene/java/branches/flex_1458/src/test/org/apache/lucene/TestDemo.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/src/test/org/apache/lucene/TestDemo.java?rev=885265&r1=885264&r2=885265&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/src/test/org/apache/lucene/TestDemo.java (original)
+++ lucene/java/branches/flex_1458/src/test/org/apache/lucene/TestDemo.java Sun Nov 29 20:28:51 2009
@@ -24,10 +24,12 @@
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
 import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.Term;
 import org.apache.lucene.queryParser.ParseException;
 import org.apache.lucene.queryParser.QueryParser;
 import org.apache.lucene.search.IndexSearcher;
 import org.apache.lucene.search.Query;
+import org.apache.lucene.search.TermQuery;
 import org.apache.lucene.search.ScoreDoc;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.store.RAMDirectory;
@@ -54,7 +56,8 @@
     IndexWriter iwriter = new IndexWriter(directory, analyzer, true,
                                           new IndexWriter.MaxFieldLength(25000));
     Document doc = new Document();
-    String text = "This is the text to be indexed.";
+    String longTerm = "longtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongterm";
+    String text = "This is the text to be indexed. " + longTerm;
     doc.add(new Field("fieldname", text, Field.Store.YES,
         Field.Index.ANALYZED));
     iwriter.addDocument(doc);
@@ -62,15 +65,17 @@
     
     // Now search the index:
     IndexSearcher isearcher = new IndexSearcher(directory, true); // read-only=true
+
+    assertEquals(1, isearcher.search(new TermQuery(new Term("fieldname", longTerm)), 1).totalHits);
     // Parse a simple query that searches for "text":
     QueryParser parser = new QueryParser(Version.LUCENE_CURRENT, "fieldname", analyzer);
     Query query = parser.parse("text");
-    ScoreDoc[] hits = isearcher.search(query, null, 1000).scoreDocs;
+    ScoreDoc[] hits = isearcher.search(query, null, 1).scoreDocs;
     assertEquals(1, hits.length);
     // Iterate through the results:
     for (int i = 0; i < hits.length; i++) {
       Document hitDoc = isearcher.doc(hits[i].doc);
-      assertEquals("This is the text to be indexed.", hitDoc.get("fieldname"));
+      assertEquals(text, hitDoc.get("fieldname"));
     }
     isearcher.close();
     directory.close();

Modified: lucene/java/branches/flex_1458/src/test/org/apache/lucene/TestExternalCodecs.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/src/test/org/apache/lucene/TestExternalCodecs.java?rev=885265&r1=885264&r2=885265&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/src/test/org/apache/lucene/TestExternalCodecs.java (original)
+++ lucene/java/branches/flex_1458/src/test/org/apache/lucene/TestExternalCodecs.java Sun Nov 29 20:28:51 2009
@@ -34,6 +34,36 @@
 
 public class TestExternalCodecs extends LuceneTestCase {
 
+  // For fun, test that we can override how terms are
+  // sorted, and basic things still work -- this comparator
+  // sorts in reversed unicode code point order:
+  private static final TermRef.Comparator reverseUnicodeComparator = new TermRef.Comparator() {
+      @Override
+      public int compare(TermRef t1, TermRef t2) {
+        byte[] b1 = t1.bytes;
+        byte[] b2 = t2.bytes;
+        int b1Stop;
+        int b1Upto = t1.offset;
+        int b2Upto = t2.offset;
+        if (t1.length < t2.length) {
+          b1Stop = t1.offset + t1.length;
+        } else {
+          b1Stop = t1.offset + t2.length;
+        }
+        while(b1Upto < b1Stop) {
+          final int bb1 = b1[b1Upto++] & 0xff;
+          final int bb2 = b2[b2Upto++] & 0xff;
+          if (bb1 != bb2) {
+            //System.out.println("cmp 1=" + t1 + " 2=" + t2 + " return " + (bb2-bb1));
+            return bb2 - bb1;
+          }
+        }
+
+        // One is prefix of another, or they are equal
+        return t2.length-t1.length;
+      }
+    };
+
   // TODO
   //   - good improvement would be to write through to disk,
   //     and then load into ram from disk
@@ -43,17 +73,21 @@
     static class RAMPostings extends FieldsProducer {
       final Map<String,RAMField> fieldToTerms = new TreeMap<String,RAMField>();
 
+      @Override
       public Terms terms(String field) {
         return fieldToTerms.get(field);
       }
 
+      @Override
       public FieldsEnum iterator() {
         return new RAMFieldsEnum(this);
       }
 
+      @Override
       public void close() {
       }
 
+      @Override
       public void loadTermsIndex() {
       }
     } 
@@ -65,13 +99,20 @@
         this.field = field;
       }
 
+      @Override
       public long getUniqueTermCount() {
         return termToDocs.size();
       }
 
+      @Override
       public TermsEnum iterator() {
         return new RAMTermsEnum(RAMOnlyCodec.RAMField.this);
       }
+
+      @Override
+      public TermRef.Comparator getTermComparator() {
+        return reverseUnicodeComparator;
+      }
     }
 
     static class RAMTerm {
@@ -101,6 +142,7 @@
         this.postings = postings;
       }
 
+      @Override
       public TermsConsumer addField(FieldInfo field) {
         RAMField ramField = new RAMField(field.name);
         postings.fieldToTerms.put(field.name, ramField);
@@ -108,6 +150,7 @@
         return termsConsumer;
       }
 
+      @Override
       public void close() {
         // TODO: finalize stuff
       }
@@ -121,7 +164,8 @@
       void reset(RAMField field) {
         this.field = field;
       }
-        
+      
+      @Override
       public DocsConsumer startTerm(TermRef text) {
         final String term = text.toString();
         current = new RAMTerm(term);
@@ -129,6 +173,13 @@
         return docsConsumer;
       }
 
+      
+      @Override
+      public TermRef.Comparator getTermComparator() {
+        return TermRef.getUTF8SortedAsUTF16Comparator();
+      }
+
+      @Override
       public void finishTerm(TermRef text, int numDocs) {
         // nocommit -- are we even called when numDocs == 0?
         if (numDocs > 0) {
@@ -137,6 +188,7 @@
         }
       }
 
+      @Override
       public void finish() {
       }
     }
@@ -149,6 +201,7 @@
       public void reset(RAMTerm term) {
         this.term = term;
       }
+      @Override
       public PositionsConsumer addDoc(int docID, int freq) {
         current = new RAMDoc(docID, freq);
         term.docs.add(current);
@@ -165,6 +218,7 @@
         upto = 0;
       }
 
+      @Override
       public void addPosition(int position, byte[] payload, int payloadOffset, int payloadLength) {
         if (payload != null) {
           throw new UnsupportedOperationException("can't handle payloads");
@@ -172,6 +226,7 @@
         current.positions[upto++] = position;
       }
 
+      @Override
       public void finishDoc() {
         assert upto == current.positions.length;
       }
@@ -189,6 +244,7 @@
         this.it = postings.fieldToTerms.keySet().iterator();
       }
 
+      @Override
       public String next() {
         if (it.hasNext()) {
           current = it.next();
@@ -198,12 +254,10 @@
         return current;
       }
 
+      @Override
       public TermsEnum terms() {
         return new RAMTermsEnum(postings.fieldToTerms.get(current));
       }
-
-      void close() {
-      }
     }
 
     static class RAMTermsEnum extends TermsEnum {
@@ -214,7 +268,13 @@
       public RAMTermsEnum(RAMField field) {
         this.ramField = field;
       }
+      
+      @Override
+      public TermRef.Comparator getTermComparator() {
+        return TermRef.getUTF8SortedAsUTF16Comparator();
+      }
 
+      @Override
       public TermRef next() {
         if (it == null) {
           if (current == null) {
@@ -231,6 +291,7 @@
         }
       }
 
+      @Override
       public SeekStatus seek(TermRef term) {
         current = term.toString();
         if (ramField.termToDocs.containsKey(current)) {
@@ -245,23 +306,28 @@
         }
       }
 
+      @Override
       public SeekStatus seek(long ord) {
         throw new UnsupportedOperationException();
       }
 
+      @Override
       public long ord() {
         throw new UnsupportedOperationException();
       }
 
+      @Override
       public TermRef term() {
         // TODO: reuse TermRef
         return new TermRef(current);
       }
 
+      @Override
       public int docFreq() {
         return ramField.termToDocs.get(current).docs.size();
       }
 
+      @Override
       public DocsEnum docs(Bits skipDocs) {
         return new RAMDocsEnum(ramField.termToDocs.get(current), skipDocs);
       }
@@ -279,6 +345,7 @@
         this.skipDocs = skipDocs;
       }
 
+      @Override
       public int advance(int targetDocID) {
         do {
           next();
@@ -288,6 +355,7 @@
 
       // TODO: override bulk read, for better perf
 
+      @Override
       public int next() {
         while(true) {
           upto++;
@@ -302,10 +370,12 @@
         }
       }
 
+      @Override
       public int freq() {
         return current.positions.length;
       }
 
+      @Override
       public PositionsEnum positions() {
         positions.reset(current);
         return positions;
@@ -321,18 +391,22 @@
         upto = 0;
       }
 
+      @Override
       public int next() {
         return ramDoc.positions[upto++];
       }
 
+      @Override
       public boolean hasPayload() {
         return false;
       }
 
+      @Override
       public int getPayloadLength() {
         return 0;
       }
 
+      @Override
       public byte[] getPayload(byte[] data, int offset) {
         return null;
       }
@@ -341,6 +415,7 @@
     // Holds all indexes created
     private final Map<String,RAMPostings> state = new HashMap<String,RAMPostings>();
 
+    @Override
     public FieldsConsumer fieldsConsumer(SegmentWriteState writeState) {
       RAMPostings postings = new RAMPostings();
       RAMFieldsConsumer consumer = new RAMFieldsConsumer(postings);
@@ -350,14 +425,17 @@
       return consumer;
     }
 
+    @Override
     public FieldsProducer fieldsProducer(Directory dir, FieldInfos fieldInfos, SegmentInfo si, int readBufferSize, int indexDivisor)
       throws IOException {
       return state.get(si.name);
     }
 
+    @Override
     public void getExtensions(Collection extensions) {
     }
 
+    @Override
     public void files(Directory dir, SegmentInfo segmentInfo, Collection files) {
     }
   }
@@ -388,6 +466,7 @@
       }
     }
       
+    @Override
     public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
       return new FieldsWriter(state);
     }
@@ -401,6 +480,7 @@
         this.state = state;
       }
 
+      @Override
       public TermsConsumer addField(FieldInfo field) throws IOException {
         fieldsSeen.add(field.name);
         Codec codec = getCodec(field.name);
@@ -414,6 +494,7 @@
         return fields.addField(field);
       }
 
+      @Override
       public void close() throws IOException {
         Iterator<FieldsConsumer> it = codecs.values().iterator();
         while(it.hasNext()) {
@@ -453,6 +534,7 @@
           it = fields.iterator();
         }
 
+        @Override
         public String next() {
           if (it.hasNext()) {
             current = it.next();
@@ -463,6 +545,7 @@
           return current;
         }
 
+        @Override
         public TermsEnum terms() throws IOException {
           Terms terms = codecs.get(getCodec(current)).terms(current);
           if (terms != null) {
@@ -473,10 +556,12 @@
         }
       }
       
+      @Override
       public FieldsEnum iterator() throws IOException {
         return new FieldsIterator();
       }
 
+      @Override
       public Terms terms(String field) throws IOException {
         Codec codec = getCodec(field);
 
@@ -485,6 +570,7 @@
         return fields.terms(field);
       }
 
+      @Override
       public void close() throws IOException {
         Iterator<FieldsProducer> it = codecs.values().iterator();
         while(it.hasNext()) {
@@ -493,6 +579,7 @@
         }
       }
 
+      @Override
       public void loadTermsIndex() throws IOException {
         Iterator<FieldsProducer> it = codecs.values().iterator();
         while(it.hasNext()) {
@@ -509,14 +596,20 @@
       return new FieldsReader(dir, fieldInfos, si, readBufferSize, indexDivisor);
     }
 
+    @Override
     public void files(Directory dir, SegmentInfo info, Collection files) throws IOException {
       Iterator<Codec> it = fields.values().iterator();
+      Set<Codec> seen = new HashSet<Codec>();
       while(it.hasNext()) {
         final Codec codec = it.next();
-        codec.files(dir, info, files);
+        if (!seen.contains(codec)) {
+          seen.add(codec);
+          codec.files(dir, info, files);
+        }
       }
     }
 
+    @Override
     public void getExtensions(Collection extensions) {
       Iterator<Codec> it = fields.values().iterator();
       while(it.hasNext()) {
@@ -531,34 +624,152 @@
 
     MyCodecs() {
       Codec ram = new RAMOnlyCodec();
-      Codec pulsing = new PulsingCodec();
+      Codec pulsing = new PulsingReverseTermsCodec();
       perField = new PerFieldCodecWrapper(ram);
       perField.add("field2", pulsing);
+      perField.add("id", pulsing);
       register(perField);
     }
     
+    @Override
     public Codec getWriter(SegmentWriteState state) {
       return perField;
     }
   }
 
+  // copied from PulsingCodec, just changing the terms
+  // comparator
+  private static class PulsingReverseTermsCodec extends Codec {
+
+    public PulsingReverseTermsCodec() {
+      name = "PulsingReverseTerms";
+    }
+
+    @Override
+    public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
+      // We wrap StandardDocsWriter, but any DocsConsumer
+      // will work:
+      StandardDocsConsumer docsWriter = new StandardDocsWriter(state);
+
+      // Terms that have <= freqCutoff number of docs are
+      // "pulsed" (inlined):
+      final int freqCutoff = 1;
+      StandardDocsConsumer pulsingWriter = new PulsingDocsWriter(state, freqCutoff, docsWriter);
+
+      // Terms dict index
+      StandardTermsIndexWriter indexWriter;
+      boolean success = false;
+      try {
+        indexWriter = new SimpleStandardTermsIndexWriter(state);
+        success = true;
+      } finally {
+        if (!success) {
+          pulsingWriter.close();
+        }
+      }
+
+      // Terms dict
+      success = false;
+      try {
+        FieldsConsumer ret = new StandardTermsDictWriter(indexWriter, state, pulsingWriter, reverseUnicodeComparator);
+        success = true;
+        return ret;
+      } finally {
+        if (!success) {
+          try {
+            pulsingWriter.close();
+          } finally {
+            indexWriter.close();
+          }
+        }
+      }
+    }
+
+    @Override
+    public FieldsProducer fieldsProducer(Directory dir, FieldInfos fieldInfos, SegmentInfo si, int readBufferSize, int indexDivisor) throws IOException {
+
+      // We wrap StandardDocsReader, but any DocsProducer
+      // will work:
+      StandardDocsProducer docs = new StandardDocsReader(dir, si, readBufferSize);
+      StandardDocsProducer docsReader = new PulsingDocsReader(dir, si, readBufferSize, docs);
+
+      // Terms dict index reader
+      StandardTermsIndexReader indexReader;
+
+      boolean success = false;
+      try {
+        indexReader = new SimpleStandardTermsIndexReader(dir,
+                                                         fieldInfos,
+                                                         si.name,
+                                                         indexDivisor,
+                                                         reverseUnicodeComparator);
+        success = true;
+      } finally {
+        if (!success) {
+          docs.close();
+        }
+      }
+
+      // Terms dict reader
+      success = false;
+      try {
+        FieldsProducer ret = new StandardTermsDictReader(indexReader,
+                                                         dir, fieldInfos, si.name,
+                                                         docsReader,
+                                                         readBufferSize,
+                                                         reverseUnicodeComparator);
+        success = true;
+        return ret;
+      } finally {
+        if (!success) {
+          try {
+            docs.close();
+          } finally {
+            indexReader.close();
+          }
+        }
+      }
+    }
+
+    @Override
+    public void files(Directory dir, SegmentInfo segmentInfo, Collection<String> files) throws IOException {
+      StandardDocsReader.files(dir, segmentInfo, files);
+      StandardTermsDictReader.files(dir, segmentInfo, files);
+      SimpleStandardTermsIndexReader.files(dir, segmentInfo, files);
+    }
+
+    @Override
+    public void getExtensions(Collection<String> extensions) {
+      StandardCodec.getStandardExtensions(extensions);
+    }
+  }
+
+
+  /*
+    tests storing "id" and "field2" fields as pulsing codec,
+    whose term sort is backwards unicode code point, and
+    storing "field1" as a custom entirely-in-RAM codec
+   */
   public void testPerFieldCodec() throws Exception {
     
+    final int NUM_DOCS = 173;
+
     Directory dir = new MockRAMDirectory();
     IndexWriter w = new IndexWriter(dir, new WhitespaceAnalyzer(), true, null, IndexWriter.MaxFieldLength.UNLIMITED,
                                     null, null, new MyCodecs());
+
     w.setMergeFactor(3);
     Document doc = new Document();
     // uses default codec:
-    doc.add(new Field("field1", "this field uses the standard codec", Field.Store.NO, Field.Index.ANALYZED));
+    doc.add(new Field("field1", "this field uses the standard codec as the test", Field.Store.NO, Field.Index.ANALYZED));
     // uses pulsing codec:
-    doc.add(new Field("field2", "this field uses the pulsing codec", Field.Store.NO, Field.Index.ANALYZED));
+    doc.add(new Field("field2", "this field uses the pulsing codec as the test", Field.Store.NO, Field.Index.ANALYZED));
     
     Field idField = new Field("id", "", Field.Store.NO, Field.Index.NOT_ANALYZED);
     doc.add(idField);
-    for(int i=0;i<100;i++) {
-      w.addDocument(doc);
+    for(int i=0;i<NUM_DOCS;i++) {
       idField.setValue(""+i);
+      w.addDocument(doc);
       if ((i+1)%10 == 0) {
         w.commit();
       }
@@ -566,21 +777,36 @@
     w.deleteDocuments(new Term("id", "77"));
 
     IndexReader r = w.getReader();
-    assertEquals(99, r.numDocs());
+    IndexReader[] subs = r.getSequentialSubReaders();
+    assertTrue(subs.length > 1);
+    // test each segment
+    for(int i=0;i<subs.length;i++) {
+      //System.out.println("test i=" + i);
+      testTermsOrder(subs[i]);
+    }
+    // test each multi-reader
+    testTermsOrder(r);
+    
+    assertEquals(NUM_DOCS-1, r.numDocs());
     IndexSearcher s = new IndexSearcher(r);
-    assertEquals(99, s.search(new TermQuery(new Term("field1", "standard")), 1).totalHits);
-    assertEquals(99, s.search(new TermQuery(new Term("field2", "pulsing")), 1).totalHits);
+    assertEquals(NUM_DOCS-1, s.search(new TermQuery(new Term("field1", "standard")), 1).totalHits);
+    assertEquals(NUM_DOCS-1, s.search(new TermQuery(new Term("field2", "pulsing")), 1).totalHits);
     r.close();
     s.close();
 
     w.deleteDocuments(new Term("id", "44"));
     w.optimize();
     r = w.getReader();
-    assertEquals(98, r.maxDoc());
-    assertEquals(98, r.numDocs());
+    assertEquals(NUM_DOCS-2, r.maxDoc());
+    assertEquals(NUM_DOCS-2, r.numDocs());
     s = new IndexSearcher(r);
-    assertEquals(98, s.search(new TermQuery(new Term("field1", "standard")), 1).totalHits);
-    assertEquals(98, s.search(new TermQuery(new Term("field2", "pulsing")), 1).totalHits);
+    assertEquals(NUM_DOCS-2, s.search(new TermQuery(new Term("field1", "standard")), 1).totalHits);
+    assertEquals(NUM_DOCS-2, s.search(new TermQuery(new Term("field2", "pulsing")), 1).totalHits);
+    assertEquals(1, s.search(new TermQuery(new Term("id", "76")), 1).totalHits);
+    assertEquals(0, s.search(new TermQuery(new Term("id", "77")), 1).totalHits);
+    assertEquals(0, s.search(new TermQuery(new Term("id", "44")), 1).totalHits);
+
+    testTermsOrder(r);
     r.close();
     s.close();
 
@@ -588,4 +814,25 @@
 
     dir.close();
   }
+
+  private void testTermsOrder(IndexReader r) throws Exception {
+
+    // Verify sort order matches what my comparator said:
+    TermRef lastTermRef = null;
+    TermsEnum terms = r.fields().terms("id").iterator();
+    //System.out.println("id terms:");
+    while(true) {
+      TermRef t = terms.next();
+      if (t == null) {
+        break;
+      }
+      //System.out.println("  " + t);
+      if (lastTermRef == null) {
+        lastTermRef = new TermRef(t);
+      } else {
+        assertTrue("terms in wrong order last=" + lastTermRef + " current=" + t, reverseUnicodeComparator.compare(lastTermRef, t) < 0);
+        lastTermRef.copy(t);
+      }
+    }
+  }
 }



Mime
View raw message