From: rmuir@apache.org
To: commits@lucene.apache.org
Subject: svn commit: r1448085 [2/3] - in /lucene/dev/branches/branch_4x: ./ dev-tools/ lucene/ lucene/codecs/ lucene/codecs/src/java/org/apache/lucene/codecs/diskdv/ lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/ lucene/core/ lucene/core/src/java/o...
Date: Wed, 20 Feb 2013 11:19:24 -0000
Message-Id: <20130220111927.EF30123889DE@eris.apache.org>

Modified: lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java?rev=1448085&r1=1448084&r2=1448085&view=diff ============================================================================== --- lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java (original) +++ lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java Wed Feb 20 11:19:22 2013 @@ -44,6 +44,7 @@ import org.apache.lucene.util.Bits; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.CommandLineUtil; import org.apache.lucene.util.FixedBitSet; +import org.apache.lucene.util.OpenBitSet; import org.apache.lucene.util.StringHelper; /** @@ -1291,7 +1292,8 @@ public class CheckIndex { } else { if (reader.getBinaryDocValues(fieldInfo.name) != null || reader.getNumericDocValues(fieldInfo.name) != null || - reader.getSortedDocValues(fieldInfo.name) != null) { + reader.getSortedDocValues(fieldInfo.name) != null || + reader.getSortedSetDocValues(fieldInfo.name) != null) { throw new RuntimeException("field: " + fieldInfo.name + " has docvalues but should omit them!"); } } @@ -1349,6 +1351,47 @@ public class CheckIndex { } } + private static void checkSortedSetDocValues(String fieldName, AtomicReader reader, SortedSetDocValues dv) { + final long maxOrd = dv.getValueCount()-1; + OpenBitSet seenOrds = new OpenBitSet(dv.getValueCount()); + long maxOrd2 = -1; + for (int i = 0; i < reader.maxDoc(); i++) { + dv.setDocument(i); + long lastOrd = -1; + long ord; + while ((ord = dv.nextOrd()) != SortedSetDocValues.NO_MORE_ORDS) { + if (ord <= lastOrd) { + throw new
RuntimeException("ords out of order: " + ord + " <= " + lastOrd + " for doc: " + i); + } + if (ord < 0 || ord > maxOrd) { + throw new RuntimeException("ord out of bounds: " + ord); + } + lastOrd = ord; + maxOrd2 = Math.max(maxOrd2, ord); + seenOrds.set(ord); + } + } + if (maxOrd != maxOrd2) { + throw new RuntimeException("dv for field: " + fieldName + " reports wrong maxOrd=" + maxOrd + " but this is not the case: " + maxOrd2); + } + if (seenOrds.cardinality() != dv.getValueCount()) { + throw new RuntimeException("dv for field: " + fieldName + " has holes in its ords, valueCount=" + dv.getValueCount() + " but only used: " + seenOrds.cardinality()); + } + + BytesRef lastValue = null; + BytesRef scratch = new BytesRef(); + for (long i = 0; i <= maxOrd; i++) { + dv.lookupOrd(i, scratch); + assert scratch.isValid(); + if (lastValue != null) { + if (scratch.compareTo(lastValue) <= 0) { + throw new RuntimeException("dv for field: " + fieldName + " has ords out of order: " + lastValue + " >=" + scratch); + } + } + lastValue = BytesRef.deepCopyOf(scratch); + } + } + private static void checkNumericDocValues(String fieldName, AtomicReader reader, NumericDocValues ndv) { for (int i = 0; i < reader.maxDoc(); i++) { ndv.get(i); @@ -1359,12 +1402,35 @@ public class CheckIndex { switch(fi.getDocValuesType()) { case SORTED: checkSortedDocValues(fi.name, reader, reader.getSortedDocValues(fi.name)); + if (reader.getBinaryDocValues(fi.name) != null || + reader.getNumericDocValues(fi.name) != null || + reader.getSortedSetDocValues(fi.name) != null) { + throw new RuntimeException(fi.name + " returns multiple docvalues types!"); + } + break; + case SORTED_SET: + checkSortedSetDocValues(fi.name, reader, reader.getSortedSetDocValues(fi.name)); + if (reader.getBinaryDocValues(fi.name) != null || + reader.getNumericDocValues(fi.name) != null || + reader.getSortedDocValues(fi.name) != null) { + throw new RuntimeException(fi.name + " returns multiple docvalues types!"); + } break; case BINARY: checkBinaryDocValues(fi.name, reader, reader.getBinaryDocValues(fi.name)); + if (reader.getNumericDocValues(fi.name) != null || + reader.getSortedDocValues(fi.name) != null || + reader.getSortedSetDocValues(fi.name) != null) { + throw new RuntimeException(fi.name + " returns multiple docvalues types!"); + } break; case NUMERIC: checkNumericDocValues(fi.name, reader, reader.getNumericDocValues(fi.name)); + if (reader.getBinaryDocValues(fi.name) != null || + reader.getSortedDocValues(fi.name) != null || + reader.getSortedSetDocValues(fi.name) != null) { + throw new RuntimeException(fi.name + " returns multiple docvalues types!"); + } break; default: throw new AssertionError(); Modified: lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/index/DocTermOrds.java URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/index/DocTermOrds.java?rev=1448085&r1=1448084&r2=1448085&view=diff ============================================================================== --- lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/index/DocTermOrds.java (original) +++ lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/index/DocTermOrds.java Wed Feb 20 11:19:22 2013 @@ -265,6 +265,10 @@ public class DocTermOrds { /** Call this only once (if you subclass!) 
*/ protected void uninvert(final AtomicReader reader, final BytesRef termPrefix) throws IOException { + final FieldInfo info = reader.getFieldInfos().fieldInfo(field); + if (info != null && info.hasDocValues()) { + throw new IllegalStateException("Type mismatch: " + field + " was indexed as " + info.getDocValuesType()); + } //System.out.println("DTO uninvert field=" + field + " prefix=" + termPrefix); final long startTime = System.currentTimeMillis(); prefix = termPrefix == null ? null : BytesRef.deepCopyOf(termPrefix); @@ -596,93 +600,6 @@ public class DocTermOrds { return pos; } - /** Iterates over the ords for a single document. */ - public class TermOrdsIterator { - private int tnum; - private int upto; - private byte[] arr; - - TermOrdsIterator() { - } - - /** Buffer must be at least 5 ints long. Returns number - * of term ords placed into buffer; if this count is - * less than buffer.length then that is the end. */ - public int read(int[] buffer) { - int bufferUpto = 0; - if (arr == null) { - // code is inlined into upto - //System.out.println("inlined"); - int code = upto; - int delta = 0; - for (;;) { - delta = (delta << 7) | (code & 0x7f); - if ((code & 0x80)==0) { - if (delta==0) break; - tnum += delta - TNUM_OFFSET; - buffer[bufferUpto++] = ordBase+tnum; - //System.out.println(" tnum=" + tnum); - delta = 0; - } - code >>>= 8; - } - } else { - // code is a pointer - for(;;) { - int delta = 0; - for(;;) { - byte b = arr[upto++]; - delta = (delta << 7) | (b & 0x7f); - //System.out.println(" cycle: upto=" + upto + " delta=" + delta + " b=" + b); - if ((b & 0x80) == 0) break; - } - //System.out.println(" delta=" + delta); - if (delta == 0) break; - tnum += delta - TNUM_OFFSET; - //System.out.println(" tnum=" + tnum); - buffer[bufferUpto++] = ordBase+tnum; - if (bufferUpto == buffer.length) { - break; - } - } - } - - return bufferUpto; - } - - /** Reset the iterator on a new document. */ - public TermOrdsIterator reset(int docID) { - //System.out.println(" reset docID=" + docID); - tnum = 0; - final int code = index[docID]; - if ((code & 0xff)==1) { - // a pointer - upto = code>>>8; - //System.out.println(" pointer! upto=" + upto); - int whichArray = (docID >>> 16) & 0xff; - arr = tnums[whichArray]; - } else { - //System.out.println(" inline!"); - arr = null; - upto = code; - } - return this; - } - } - - /** Returns an iterator to step through the term ords for - * this document. It's also possible to subclass this - * class and directly access members. */ - public TermOrdsIterator lookup(int doc, TermOrdsIterator reuse) { - final TermOrdsIterator ret; - if (reuse != null) { - ret = reuse; - } else { - ret = new TermOrdsIterator(); - } - return ret.reset(doc); - } - /* Only used if original IndexReader doesn't implement * ord; in this case we "wrap" our own terms index * around it. 
*/ @@ -847,4 +764,124 @@ public class DocTermOrds { termsEnum.seekExact(ord); return termsEnum.term(); } + + /** Returns a SortedSetDocValues view of this instance */ + public SortedSetDocValues iterator(TermsEnum termsEnum) throws IOException { + if (isEmpty()) { + return SortedSetDocValues.EMPTY; + } else { + return new Iterator(termsEnum); + } + } + + private class Iterator extends SortedSetDocValues { + final TermsEnum te; + // currently we read 5 at a time (using the logic of the old iterator) + final int buffer[] = new int[5]; + int bufferUpto; + int bufferLength; + + private int tnum; + private int upto; + private byte[] arr; + + Iterator(TermsEnum te) { + this.te = te; + } + + @Override + public long nextOrd() { + while (bufferUpto == bufferLength) { + if (bufferLength < buffer.length) { + return NO_MORE_ORDS; + } else { + bufferLength = read(buffer); + bufferUpto = 0; + } + } + return buffer[bufferUpto++]; + } + + /** Buffer must be at least 5 ints long. Returns number + * of term ords placed into buffer; if this count is + * less than buffer.length then that is the end. */ + int read(int[] buffer) { + int bufferUpto = 0; + if (arr == null) { + // code is inlined into upto + //System.out.println("inlined"); + int code = upto; + int delta = 0; + for (;;) { + delta = (delta << 7) | (code & 0x7f); + if ((code & 0x80)==0) { + if (delta==0) break; + tnum += delta - TNUM_OFFSET; + buffer[bufferUpto++] = ordBase+tnum; + //System.out.println(" tnum=" + tnum); + delta = 0; + } + code >>>= 8; + } + } else { + // code is a pointer + for(;;) { + int delta = 0; + for(;;) { + byte b = arr[upto++]; + delta = (delta << 7) | (b & 0x7f); + //System.out.println(" cycle: upto=" + upto + " delta=" + delta + " b=" + b); + if ((b & 0x80) == 0) break; + } + //System.out.println(" delta=" + delta); + if (delta == 0) break; + tnum += delta - TNUM_OFFSET; + //System.out.println(" tnum=" + tnum); + buffer[bufferUpto++] = ordBase+tnum; + if (bufferUpto == buffer.length) { + break; + } + } + } + + return bufferUpto; + } + + @Override + public void setDocument(int docID) { + tnum = 0; + final int code = index[docID]; + if ((code & 0xff)==1) { + // a pointer + upto = code>>>8; + //System.out.println(" pointer! 
upto=" + upto); + int whichArray = (docID >>> 16) & 0xff; + arr = tnums[whichArray]; + } else { + //System.out.println(" inline!"); + arr = null; + upto = code; + } + bufferUpto = 0; + bufferLength = read(buffer); + } + + @Override + public void lookupOrd(long ord, BytesRef result) { + BytesRef ref = null; + try { + ref = DocTermOrds.this.lookupTerm(te, (int) ord); + } catch (IOException e) { + throw new RuntimeException(e); + } + result.bytes = ref.bytes; + result.offset = ref.offset; + result.length = ref.length; + } + + @Override + public long getValueCount() { + return numTerms(); + } + } } Modified: lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/index/DocValuesProcessor.java URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/index/DocValuesProcessor.java?rev=1448085&r1=1448084&r2=1448085&view=diff ============================================================================== --- lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/index/DocValuesProcessor.java (original) +++ lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/index/DocValuesProcessor.java Wed Feb 20 11:19:22 2013 @@ -57,6 +57,8 @@ final class DocValuesProcessor extends S addBinaryField(fieldInfo, docID, field.binaryValue()); } else if (dvType == DocValuesType.SORTED) { addSortedField(fieldInfo, docID, field.binaryValue()); + } else if (dvType == DocValuesType.SORTED_SET) { + addSortedSetField(fieldInfo, docID, field.binaryValue()); } else if (dvType == DocValuesType.NUMERIC) { if (!(field.numericValue() instanceof Long)) { throw new IllegalArgumentException("illegal type " + field.numericValue().getClass() + ": DocValues types must be Long"); @@ -122,6 +124,20 @@ final class DocValuesProcessor extends S } sortedWriter.addValue(docID, value); } + + void addSortedSetField(FieldInfo fieldInfo, int docID, BytesRef value) { + DocValuesWriter writer = writers.get(fieldInfo.name); + SortedSetDocValuesWriter sortedSetWriter; + if (writer == null) { + sortedSetWriter = new SortedSetDocValuesWriter(fieldInfo, bytesUsed); + writers.put(fieldInfo.name, sortedSetWriter); + } else if (!(writer instanceof SortedSetDocValuesWriter)) { + throw new IllegalArgumentException("Incompatible DocValues type: field \"" + fieldInfo.name + "\" changed from " + getTypeDesc(writer) + " to sorted"); + } else { + sortedSetWriter = (SortedSetDocValuesWriter) writer; + } + sortedSetWriter.addValue(docID, value); + } void addNumericField(FieldInfo fieldInfo, int docID, long value) { DocValuesWriter writer = writers.get(fieldInfo.name); Modified: lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/index/FieldInfo.java URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/index/FieldInfo.java?rev=1448085&r1=1448084&r2=1448085&view=diff ============================================================================== --- lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/index/FieldInfo.java (original) +++ lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/index/FieldInfo.java Wed Feb 20 11:19:22 2013 @@ -101,7 +101,14 @@ public final class FieldInfo { * byte[]. The stored byte[] is presorted and allows access via document id, * ordinal and by-value. */ - SORTED + SORTED, + /** + * A pre-sorted Set<byte[]>. Fields with this type only store distinct byte values + * and store additional offset pointers per document to dereference the shared + * byte[]s. 
The stored byte[] is presorted and allows access via document id, + * ordinal and by-value. + */ + SORTED_SET }; /** Modified: lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/index/FilterAtomicReader.java URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/index/FilterAtomicReader.java?rev=1448085&r1=1448084&r2=1448085&view=diff ============================================================================== --- lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/index/FilterAtomicReader.java (original) +++ lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/index/FilterAtomicReader.java Wed Feb 20 11:19:22 2013 @@ -429,6 +429,12 @@ public class FilterAtomicReader extends } @Override + public SortedSetDocValues getSortedSetDocValues(String field) throws IOException { + ensureOpen(); + return in.getSortedSetDocValues(field); + } + + @Override public NumericDocValues getNormValues(String field) throws IOException { ensureOpen(); return in.getNormValues(field); Modified: lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/index/MultiDocValues.java URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/index/MultiDocValues.java?rev=1448085&r1=1448084&r2=1448085&view=diff ============================================================================== --- lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/index/MultiDocValues.java (original) +++ lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/index/MultiDocValues.java Wed Feb 20 11:19:22 2013 @@ -24,6 +24,7 @@ import org.apache.lucene.index.MultiTerm import org.apache.lucene.index.MultiTermsEnum.TermsEnumWithSlice; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.packed.AppendingLongBuffer; +import org.apache.lucene.util.packed.MonotonicAppendingLongBuffer; /** * A wrapper for CompositeIndexReader providing access to DocValues. @@ -214,61 +215,151 @@ public class MultiDocValues { if (!anyReal) { return null; } else { - OrdinalMap mapping = new OrdinalMap(r.getCoreCacheKey(), values); + TermsEnum enums[] = new TermsEnum[values.length]; + for (int i = 0; i < values.length; i++) { + enums[i] = new SortedDocValuesTermsEnum(values[i]); + } + OrdinalMap mapping = new OrdinalMap(r.getCoreCacheKey(), enums); return new MultiSortedDocValues(values, starts, mapping); } } + /** Returns a SortedSetDocValues for a reader's docvalues (potentially doing extremely slow things). + *
<p>
+ * This is an extremely slow way to access sorted values. Instead, access them per-segment + * with {@link AtomicReader#getSortedSetDocValues(String)} + *
</p>
+ */ + public static SortedSetDocValues getSortedSetValues(final IndexReader r, final String field) throws IOException { + final List leaves = r.leaves(); + final int size = leaves.size(); + + if (size == 0) { + return null; + } else if (size == 1) { + return leaves.get(0).reader().getSortedSetDocValues(field); + } + + boolean anyReal = false; + final SortedSetDocValues[] values = new SortedSetDocValues[size]; + final int[] starts = new int[size+1]; + for (int i = 0; i < size; i++) { + AtomicReaderContext context = leaves.get(i); + SortedSetDocValues v = context.reader().getSortedSetDocValues(field); + if (v == null) { + v = SortedSetDocValues.EMPTY; + } else { + anyReal = true; + } + values[i] = v; + starts[i] = context.docBase; + } + starts[size] = r.maxDoc(); + + if (!anyReal) { + return null; + } else { + TermsEnum enums[] = new TermsEnum[values.length]; + for (int i = 0; i < values.length; i++) { + enums[i] = new SortedSetDocValuesTermsEnum(values[i]); + } + OrdinalMap mapping = new OrdinalMap(r.getCoreCacheKey(), enums); + return new MultiSortedSetDocValues(values, starts, mapping); + } + } + /** maps per-segment ordinals to/from global ordinal space */ - // TODO: use more efficient packed ints structures (these are all positive values!) - static class OrdinalMap { + // TODO: use more efficient packed ints structures? + // TODO: pull this out? its pretty generic (maps between N ord()-enabled TermsEnums) + public static class OrdinalMap { // cache key of whoever asked for this aweful thing final Object owner; // globalOrd -> (globalOrd - segmentOrd) - final AppendingLongBuffer globalOrdDeltas; + final MonotonicAppendingLongBuffer globalOrdDeltas; // globalOrd -> sub index final AppendingLongBuffer subIndexes; // segmentOrd -> (globalOrd - segmentOrd) - final AppendingLongBuffer ordDeltas[]; + final MonotonicAppendingLongBuffer ordDeltas[]; - OrdinalMap(Object owner, SortedDocValues subs[]) throws IOException { + /** + * Creates an ordinal map that allows mapping ords to/from a merged + * space from subs. + * @param owner a cache key + * @param subs TermsEnums that support {@link TermsEnum#ord()}. They need + * not be dense (e.g. can be FilteredTermsEnums}. + * @throws IOException if an I/O error occurred. 
+ */ + public OrdinalMap(Object owner, TermsEnum subs[]) throws IOException { // create the ordinal mappings by pulling a termsenum over each sub's // unique terms, and walking a multitermsenum over those this.owner = owner; - globalOrdDeltas = new AppendingLongBuffer(); + globalOrdDeltas = new MonotonicAppendingLongBuffer(); subIndexes = new AppendingLongBuffer(); - ordDeltas = new AppendingLongBuffer[subs.length]; + ordDeltas = new MonotonicAppendingLongBuffer[subs.length]; for (int i = 0; i < ordDeltas.length; i++) { - ordDeltas[i] = new AppendingLongBuffer(); + ordDeltas[i] = new MonotonicAppendingLongBuffer(); } - int segmentOrds[] = new int[subs.length]; + long segmentOrds[] = new long[subs.length]; ReaderSlice slices[] = new ReaderSlice[subs.length]; TermsEnumIndex indexes[] = new TermsEnumIndex[slices.length]; for (int i = 0; i < slices.length; i++) { slices[i] = new ReaderSlice(0, 0, i); - indexes[i] = new TermsEnumIndex(new SortedDocValuesTermsEnum(subs[i]), i); + indexes[i] = new TermsEnumIndex(subs[i], i); } MultiTermsEnum mte = new MultiTermsEnum(slices); mte.reset(indexes); - int globalOrd = 0; + long globalOrd = 0; while (mte.next() != null) { TermsEnumWithSlice matches[] = mte.getMatchArray(); for (int i = 0; i < mte.getMatchCount(); i++) { int subIndex = matches[i].index; - int delta = globalOrd - segmentOrds[subIndex]; - assert delta >= 0; + long segmentOrd = matches[i].terms.ord(); + long delta = globalOrd - segmentOrd; // for each unique term, just mark the first subindex/delta where it occurs if (i == 0) { subIndexes.add(subIndex); globalOrdDeltas.add(delta); } // for each per-segment ord, map it back to the global term. - ordDeltas[subIndex].add(delta); - segmentOrds[subIndex]++; + while (segmentOrds[subIndex] <= segmentOrd) { + ordDeltas[subIndex].add(delta); + segmentOrds[subIndex]++; + } } globalOrd++; } } + + /** + * Given a segment number and segment ordinal, returns + * the corresponding global ordinal. + */ + public long getGlobalOrd(int subIndex, long segmentOrd) { + return segmentOrd + ordDeltas[subIndex].get(segmentOrd); + } + + /** + * Given a segment number and global ordinal, returns + * the corresponding segment ordinal. + */ + public long getSegmentOrd(int subIndex, long globalOrd) { + return globalOrd - globalOrdDeltas.get(globalOrd); + } + + /** + * Given a global ordinal, returns the index of the first + * sub that contains this term. + */ + public int getSegmentNumber(long globalOrd) { + return (int) subIndexes.get(globalOrd); + } + + /** + * Returns the total number of unique terms in global ord space. 
+ */ + public long getValueCount() { + return globalOrdDeltas.size(); + } } /** implements SortedDocValues over n subs, using an OrdinalMap */ @@ -289,19 +380,63 @@ public class MultiDocValues { public int getOrd(int docID) { int subIndex = ReaderUtil.subIndex(docID, docStarts); int segmentOrd = values[subIndex].getOrd(docID - docStarts[subIndex]); - return (int) (segmentOrd + mapping.ordDeltas[subIndex].get(segmentOrd)); + return (int) mapping.getGlobalOrd(subIndex, segmentOrd); } @Override public void lookupOrd(int ord, BytesRef result) { - int subIndex = (int) mapping.subIndexes.get(ord); - int segmentOrd = (int) (ord - mapping.globalOrdDeltas.get(ord)); + int subIndex = mapping.getSegmentNumber(ord); + int segmentOrd = (int) mapping.getSegmentOrd(subIndex, ord); values[subIndex].lookupOrd(segmentOrd, result); } @Override public int getValueCount() { - return mapping.globalOrdDeltas.size(); + return (int) mapping.getValueCount(); + } + } + + /** implements MultiSortedDocValues over n subs, using an OrdinalMap */ + static class MultiSortedSetDocValues extends SortedSetDocValues { + final int docStarts[]; + final SortedSetDocValues values[]; + final OrdinalMap mapping; + int currentSubIndex; + + MultiSortedSetDocValues(SortedSetDocValues values[], int docStarts[], OrdinalMap mapping) throws IOException { + assert values.length == mapping.ordDeltas.length; + assert docStarts.length == values.length + 1; + this.values = values; + this.docStarts = docStarts; + this.mapping = mapping; + } + + @Override + public long nextOrd() { + long segmentOrd = values[currentSubIndex].nextOrd(); + if (segmentOrd == NO_MORE_ORDS) { + return segmentOrd; + } else { + return mapping.getGlobalOrd(currentSubIndex, segmentOrd); + } + } + + @Override + public void setDocument(int docID) { + currentSubIndex = ReaderUtil.subIndex(docID, docStarts); + values[currentSubIndex].setDocument(docID - docStarts[currentSubIndex]); + } + + @Override + public void lookupOrd(long ord, BytesRef result) { + int subIndex = mapping.getSegmentNumber(ord); + long segmentOrd = mapping.getSegmentOrd(subIndex, ord); + values[subIndex].lookupOrd(segmentOrd, result); + } + + @Override + public long getValueCount() { + return mapping.getValueCount(); } } } Modified: lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/index/MultiTermsEnum.java URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/index/MultiTermsEnum.java?rev=1448085&r1=1448084&r2=1448085&view=diff ============================================================================== --- lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/index/MultiTermsEnum.java (original) +++ lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/index/MultiTermsEnum.java Wed Feb 20 11:19:22 2013 @@ -497,7 +497,7 @@ public final class MultiTermsEnum extend final static class TermsEnumWithSlice { private final ReaderSlice subSlice; - private TermsEnum terms; + TermsEnum terms; public BytesRef current; final int index; Modified: lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/index/NumericDocValuesWriter.java URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/index/NumericDocValuesWriter.java?rev=1448085&r1=1448084&r2=1448085&view=diff ============================================================================== --- lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/index/NumericDocValuesWriter.java (original) 
+++ lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/index/NumericDocValuesWriter.java Wed Feb 20 11:19:22 2013 @@ -50,7 +50,7 @@ class NumericDocValuesWriter extends Doc } // Fill in any holes: - for (int i = pending.size(); i < docID; ++i) { + for (int i = (int)pending.size(); i < docID; ++i) { pending.add(MISSING); } @@ -90,7 +90,7 @@ class NumericDocValuesWriter extends Doc // iterates over the values we have in ram private class NumericIterator implements Iterator { final AppendingLongBuffer.Iterator iter = pending.iterator(); - final int size = pending.size(); + final int size = (int)pending.size(); final int maxDoc; int upto; Modified: lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/index/ParallelAtomicReader.java URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/index/ParallelAtomicReader.java?rev=1448085&r1=1448084&r2=1448085&view=diff ============================================================================== --- lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/index/ParallelAtomicReader.java (original) +++ lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/index/ParallelAtomicReader.java Wed Feb 20 11:19:22 2013 @@ -285,6 +285,13 @@ public final class ParallelAtomicReader } @Override + public SortedSetDocValues getSortedSetDocValues(String field) throws IOException { + ensureOpen(); + AtomicReader reader = fieldToReader.get(field); + return reader == null ? null : reader.getSortedSetDocValues(field); + } + + @Override public NumericDocValues getNormValues(String field) throws IOException { ensureOpen(); AtomicReader reader = fieldToReader.get(field); Modified: lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/index/SegmentCoreReaders.java URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/index/SegmentCoreReaders.java?rev=1448085&r1=1448084&r2=1448085&view=diff ============================================================================== --- lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/index/SegmentCoreReaders.java (original) +++ lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/index/SegmentCoreReaders.java Wed Feb 20 11:19:22 2013 @@ -253,6 +253,34 @@ final class SegmentCoreReaders { return dvs; } + + SortedSetDocValues getSortedSetDocValues(String field) throws IOException { + FieldInfo fi = fieldInfos.fieldInfo(field); + if (fi == null) { + // Field does not exist + return null; + } + if (fi.getDocValuesType() == null) { + // Field was not indexed with doc values + return null; + } + if (fi.getDocValuesType() != DocValuesType.SORTED_SET) { + // DocValues were not sorted + return null; + } + + assert dvProducer != null; + + Map dvFields = docValuesLocal.get(); + + SortedSetDocValues dvs = (SortedSetDocValues) dvFields.get(field); + if (dvs == null) { + dvs = dvProducer.getSortedSet(fi); + dvFields.put(field, dvs); + } + + return dvs; + } NumericDocValues getNormValues(String field) throws IOException { FieldInfo fi = fieldInfos.fieldInfo(field); Modified: lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/index/SegmentMerger.java URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/index/SegmentMerger.java?rev=1448085&r1=1448084&r2=1448085&view=diff ============================================================================== --- 
lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/index/SegmentMerger.java (original) +++ lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/index/SegmentMerger.java Wed Feb 20 11:19:22 2013 @@ -197,6 +197,16 @@ final class SegmentMerger { toMerge.add(values); } consumer.mergeSortedField(field, mergeState, toMerge); + } else if (type == DocValuesType.SORTED_SET) { + List toMerge = new ArrayList(); + for (AtomicReader reader : mergeState.readers) { + SortedSetDocValues values = reader.getSortedSetDocValues(field.name); + if (values == null) { + values = SortedSetDocValues.EMPTY; + } + toMerge.add(values); + } + consumer.mergeSortedSetField(field, mergeState, toMerge); } else { throw new AssertionError("type=" + type); } Modified: lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/index/SegmentReader.java URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/index/SegmentReader.java?rev=1448085&r1=1448084&r2=1448085&view=diff ============================================================================== --- lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/index/SegmentReader.java (original) +++ lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/index/SegmentReader.java Wed Feb 20 11:19:22 2013 @@ -248,6 +248,12 @@ public final class SegmentReader extends } @Override + public SortedSetDocValues getSortedSetDocValues(String field) throws IOException { + ensureOpen(); + return core.getSortedSetDocValues(field); + } + + @Override public NumericDocValues getNormValues(String field) throws IOException { ensureOpen(); return core.getNormValues(field); Modified: lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/index/SlowCompositeReaderWrapper.java URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/index/SlowCompositeReaderWrapper.java?rev=1448085&r1=1448084&r2=1448085&view=diff ============================================================================== --- lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/index/SlowCompositeReaderWrapper.java (original) +++ lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/index/SlowCompositeReaderWrapper.java Wed Feb 20 11:19:22 2013 @@ -24,7 +24,9 @@ import java.util.Map; import org.apache.lucene.util.Bits; import org.apache.lucene.index.DirectoryReader; // javadoc +import org.apache.lucene.index.FieldInfo.DocValuesType; import org.apache.lucene.index.MultiDocValues.MultiSortedDocValues; +import org.apache.lucene.index.MultiDocValues.MultiSortedSetDocValues; import org.apache.lucene.index.MultiDocValues.OrdinalMap; import org.apache.lucene.index.MultiReader; // javadoc @@ -113,8 +115,10 @@ public final class SlowCompositeReaderWr return dv; } } - // cached multi dv - assert map != null; + // cached ordinal map + if (getFieldInfos().fieldInfo(field).getDocValuesType() != DocValuesType.SORTED) { + return null; + } int size = in.leaves().size(); final SortedDocValues[] values = new SortedDocValues[size]; final int[] starts = new int[size+1]; @@ -131,6 +135,45 @@ public final class SlowCompositeReaderWr return new MultiSortedDocValues(values, starts, map); } + @Override + public SortedSetDocValues getSortedSetDocValues(String field) throws IOException { + ensureOpen(); + OrdinalMap map = null; + synchronized (cachedOrdMaps) { + map = cachedOrdMaps.get(field); + if (map == null) { + // uncached, or not a multi dv + 
SortedSetDocValues dv = MultiDocValues.getSortedSetValues(in, field); + if (dv instanceof MultiSortedSetDocValues) { + map = ((MultiSortedSetDocValues)dv).mapping; + if (map.owner == getCoreCacheKey()) { + cachedOrdMaps.put(field, map); + } + } + return dv; + } + } + // cached ordinal map + if (getFieldInfos().fieldInfo(field).getDocValuesType() != DocValuesType.SORTED_SET) { + return null; + } + assert map != null; + int size = in.leaves().size(); + final SortedSetDocValues[] values = new SortedSetDocValues[size]; + final int[] starts = new int[size+1]; + for (int i = 0; i < size; i++) { + AtomicReaderContext context = in.leaves().get(i); + SortedSetDocValues v = context.reader().getSortedSetDocValues(field); + if (v == null) { + v = SortedSetDocValues.EMPTY; + } + values[i] = v; + starts[i] = context.docBase; + } + starts[size] = maxDoc(); + return new MultiSortedSetDocValues(values, starts, map); + } + // TODO: this could really be a weak map somewhere else on the coreCacheKey, // but do we really need to optimize slow-wrapper any more? private final Map cachedOrdMaps = new HashMap(); Modified: lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/search/FieldCache.java URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/search/FieldCache.java?rev=1448085&r1=1448084&r2=1448085&view=diff ============================================================================== --- lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/search/FieldCache.java (original) +++ lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/search/FieldCache.java Wed Feb 20 11:19:22 2013 @@ -29,6 +29,7 @@ import org.apache.lucene.index.AtomicRea import org.apache.lucene.index.BinaryDocValues; import org.apache.lucene.index.DocTermOrds; import org.apache.lucene.index.SortedDocValues; +import org.apache.lucene.index.SortedSetDocValues; import org.apache.lucene.index.Terms; import org.apache.lucene.index.TermsEnum; import org.apache.lucene.util.Bits; @@ -670,7 +671,7 @@ public interface FieldCache { * @return a {@link DocTermOrds} instance * @throws IOException If any error occurs. */ - public DocTermOrds getDocTermOrds(AtomicReader reader, String field) throws IOException; + public SortedSetDocValues getDocTermOrds(AtomicReader reader, String field) throws IOException; /** * EXPERT: A unique Identifier/Description for each item in the FieldCache. 
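The FieldCache change above swaps the old DocTermOrds return type for the new SortedSetDocValues abstraction, so a caller sees the same per-document ord iterator whether the field has real SORTED_SET doc values, a single-valued SORTED field, or has to be uninverted from the terms index (the FieldCacheImpl hunk that follows shows those three paths). As a rough caller-side sketch only — the reader variable and the "category" field name here are placeholders, not taken from this commit:

import java.io.IOException;
import org.apache.lucene.index.AtomicReader;
import org.apache.lucene.index.SortedSetDocValues;
import org.apache.lucene.search.FieldCache;
import org.apache.lucene.util.BytesRef;

public class DocTermOrdsUsageSketch {
  // Prints every value of a multi-valued field for one document,
  // using only the FieldCache/SortedSetDocValues calls introduced in this patch.
  static void printValues(AtomicReader reader, int docID) throws IOException {
    SortedSetDocValues ords = FieldCache.DEFAULT.getDocTermOrds(reader, "category");
    ords.setDocument(docID);                                  // position on one document
    BytesRef scratch = new BytesRef();
    long ord;
    while ((ord = ords.nextOrd()) != SortedSetDocValues.NO_MORE_ORDS) {
      ords.lookupOrd(ord, scratch);                           // resolve ord -> term bytes
      System.out.println(scratch.utf8ToString());
    }
  }
}
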
Modified: lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/search/FieldCacheImpl.java URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/search/FieldCacheImpl.java?rev=1448085&r1=1448084&r2=1448085&view=diff ============================================================================== --- lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/search/FieldCacheImpl.java (original) +++ lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/search/FieldCacheImpl.java Wed Feb 20 11:19:22 2013 @@ -33,7 +33,9 @@ import org.apache.lucene.index.FieldInfo import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.NumericDocValues; import org.apache.lucene.index.SegmentReader; +import org.apache.lucene.index.SingletonSortedSetDocValues; import org.apache.lucene.index.SortedDocValues; +import org.apache.lucene.index.SortedSetDocValues; import org.apache.lucene.index.Terms; import org.apache.lucene.index.TermsEnum; import org.apache.lucene.util.ArrayUtil; @@ -1363,8 +1365,30 @@ class FieldCacheImpl implements FieldCac } } - public DocTermOrds getDocTermOrds(AtomicReader reader, String field) throws IOException { - return (DocTermOrds) caches.get(DocTermOrds.class).get(reader, new CacheKey(field, null), false); + // TODO: this if DocTermsIndex was already created, we + // should share it... + public SortedSetDocValues getDocTermOrds(AtomicReader reader, String field) throws IOException { + SortedSetDocValues dv = reader.getSortedSetDocValues(field); + if (dv != null) { + return dv; + } + + SortedDocValues sdv = reader.getSortedDocValues(field); + if (sdv != null) { + return new SingletonSortedSetDocValues(sdv); + } + + final FieldInfo info = reader.getFieldInfos().fieldInfo(field); + if (info == null) { + return SortedSetDocValues.EMPTY; + } else if (info.hasDocValues()) { + throw new IllegalStateException("Type mismatch: " + field + " was indexed as " + info.getDocValuesType()); + } else if (!info.isIndexed()) { + return SortedSetDocValues.EMPTY; + } + + DocTermOrds dto = (DocTermOrds) caches.get(DocTermOrds.class).get(reader, new CacheKey(field, null), false); + return dto.iterator(dto.getOrdTermsEnum(reader)); } static final class DocTermOrdsCache extends Cache { @@ -1375,7 +1399,6 @@ class FieldCacheImpl implements FieldCac @Override protected Object createValue(AtomicReader reader, CacheKey key, boolean setDocsWithField /* ignored */) throws IOException { - // No DocValues impl yet (DocValues are single valued...): return new DocTermOrds(reader, key.field); } } Modified: lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/util/packed/AppendingLongBuffer.java URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/util/packed/AppendingLongBuffer.java?rev=1448085&r1=1448084&r2=1448085&view=diff ============================================================================== --- lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/util/packed/AppendingLongBuffer.java (original) +++ lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/util/packed/AppendingLongBuffer.java Wed Feb 20 11:19:22 2013 @@ -19,72 +19,33 @@ package org.apache.lucene.util.packed; import java.util.Arrays; -import org.apache.lucene.util.ArrayUtil; -import org.apache.lucene.util.RamUsageEstimator; - /** * Utility class to buffer a list of signed longs in memory. This class only - * supports appending. 
+ * supports appending and is optimized for the case where values are close to + * each other. * @lucene.internal */ -public class AppendingLongBuffer { - - private static final int BLOCK_BITS = 10; - private static final int MAX_PENDING_COUNT = 1 << BLOCK_BITS; - private static final int BLOCK_MASK = MAX_PENDING_COUNT - 1; - - private long[] minValues; - private PackedInts.Reader[] values; - private long valuesBytes; - private int valuesOff; - private long[] pending; - private int pendingOff; +public final class AppendingLongBuffer extends AbstractAppendingLongBuffer { /** Sole constructor. */ public AppendingLongBuffer() { - minValues = new long[16]; - values = new PackedInts.Reader[16]; - pending = new long[MAX_PENDING_COUNT]; - valuesOff = 0; - pendingOff = 0; + super(16); } - /** Append a value to this buffer. */ - public void add(long l) { - if (pendingOff == MAX_PENDING_COUNT) { - packPendingValues(); - } - pending[pendingOff++] = l; - } - - /** Get a value from this buffer. - *
<p>
- * NOTE: This class is not really designed for random access! - * You will likely get better performance by using packed ints in another way! */ - public long get(int index) { - assert index < size(); // TODO: do a better check, and throw IndexOutOfBoundsException? - // This class is currently only used by the indexer. - int block = index >> BLOCK_BITS; - int element = index & BLOCK_MASK; + @Override + long get(int block, int element) { if (block == valuesOff) { return pending[element]; - } else if (values[block] == null) { + } else if (deltas[block] == null) { return minValues[block]; } else { - return minValues[block] + values[block].get(element); + return minValues[block] + deltas[block].get(element); } } - private void packPendingValues() { + void packPendingValues() { assert pendingOff == MAX_PENDING_COUNT; - // check size - if (values.length == valuesOff) { - final int newLength = ArrayUtil.oversize(valuesOff + 1, 8); - minValues = Arrays.copyOf(minValues, newLength); - values = Arrays.copyOf(values, newLength); - } - // compute max delta long minValue = pending[0]; long maxValue = pending[0]; @@ -105,18 +66,8 @@ public class AppendingLongBuffer { for (int i = 0; i < pendingOff; ) { i += mutable.set(i, pending, i, pendingOff - i); } - values[valuesOff] = mutable; - valuesBytes += mutable.ramBytesUsed(); + deltas[valuesOff] = mutable; } - ++valuesOff; - - // reset pending buffer - pendingOff = 0; - } - - /** Get the number of values that have been added to the buffer. */ - public int size() { - return valuesOff * MAX_PENDING_COUNT + pendingOff; } /** Return an iterator over the values of this buffer. */ @@ -125,29 +76,20 @@ public class AppendingLongBuffer { } /** A long iterator. */ - public class Iterator { - - long[] currentValues; - int vOff, pOff; + public final class Iterator extends AbstractAppendingLongBuffer.Iterator { private Iterator() { - vOff = pOff = 0; - if (valuesOff == 0) { - currentValues = pending; - } else { - currentValues = new long[MAX_PENDING_COUNT]; - fillValues(); - } + super(); } - private void fillValues() { + void fillValues() { if (vOff == valuesOff) { currentValues = pending; - } else if (values[vOff] == null) { + } else if (deltas[vOff] == null) { Arrays.fill(currentValues, minValues[vOff]); } else { for (int k = 0; k < MAX_PENDING_COUNT; ) { - k += values[vOff].get(k, currentValues, k, MAX_PENDING_COUNT - k); + k += deltas[vOff].get(k, currentValues, k, MAX_PENDING_COUNT - k); } for (int k = 0; k < MAX_PENDING_COUNT; ++k) { currentValues[k] += minValues[vOff]; @@ -155,42 +97,6 @@ public class AppendingLongBuffer { } } - /** Whether or not there are remaining values. */ - public boolean hasNext() { - return vOff < valuesOff || (vOff == valuesOff && pOff < pendingOff); - } - - /** Return the next long in the buffer. */ - public long next() { - assert hasNext(); - long result = currentValues[pOff++]; - if (pOff == MAX_PENDING_COUNT) { - vOff += 1; - pOff = 0; - if (vOff <= valuesOff) { - fillValues(); - } - } - return result; - } - - } - - /** - * Return the number of bytes used by this instance. - */ - public long ramBytesUsed() { - // TODO: this is called per-doc-per-norms/dv-field, can we optimize this? 
- long bytesUsed = RamUsageEstimator.alignObjectSize( - RamUsageEstimator.NUM_BYTES_OBJECT_HEADER - + 3 * RamUsageEstimator.NUM_BYTES_OBJECT_REF // the 3 arrays - + 2 * RamUsageEstimator.NUM_BYTES_INT) // the 2 offsets - + RamUsageEstimator.NUM_BYTES_LONG // valuesBytes - + RamUsageEstimator.sizeOf(pending) - + RamUsageEstimator.sizeOf(minValues) - + RamUsageEstimator.alignObjectSize(RamUsageEstimator.NUM_BYTES_ARRAY_HEADER + (long) RamUsageEstimator.NUM_BYTES_OBJECT_REF * values.length); // values - - return bytesUsed + valuesBytes; } } Modified: lucene/dev/branches/branch_4x/lucene/core/src/test/org/apache/lucene/index/TestDocTermOrds.java URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/core/src/test/org/apache/lucene/index/TestDocTermOrds.java?rev=1448085&r1=1448084&r2=1448085&view=diff ============================================================================== --- lucene/dev/branches/branch_4x/lucene/core/src/test/org/apache/lucene/index/TestDocTermOrds.java (original) +++ lucene/dev/branches/branch_4x/lucene/core/src/test/org/apache/lucene/index/TestDocTermOrds.java Wed Feb 20 11:19:22 2013 @@ -29,7 +29,6 @@ import org.apache.lucene.codecs.Postings import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.IntField; -import org.apache.lucene.index.DocTermOrds.TermOrdsIterator; import org.apache.lucene.search.FieldCache; import org.apache.lucene.store.Directory; import org.apache.lucene.util.BytesRef; @@ -63,25 +62,26 @@ public class TestDocTermOrds extends Luc final IndexReader r = w.getReader(); w.close(); - final DocTermOrds dto = new DocTermOrds(SlowCompositeReaderWrapper.wrap(r), "field"); - - TermOrdsIterator iter = dto.lookup(0, null); - final int[] buffer = new int[5]; - assertEquals(3, iter.read(buffer)); - assertEquals(0, buffer[0]); - assertEquals(1, buffer[1]); - assertEquals(2, buffer[2]); - - iter = dto.lookup(1, iter); - assertEquals(3, iter.read(buffer)); - assertEquals(3, buffer[0]); - assertEquals(4, buffer[1]); - assertEquals(5, buffer[2]); - - iter = dto.lookup(2, iter); - assertEquals(2, iter.read(buffer)); - assertEquals(0, buffer[0]); - assertEquals(5, buffer[1]); + final AtomicReader ar = SlowCompositeReaderWrapper.wrap(r); + final DocTermOrds dto = new DocTermOrds(ar, "field"); + SortedSetDocValues iter = dto.iterator(ar.terms("field").iterator(null)); + + iter.setDocument(0); + assertEquals(0, iter.nextOrd()); + assertEquals(1, iter.nextOrd()); + assertEquals(2, iter.nextOrd()); + assertEquals(SortedSetDocValues.NO_MORE_ORDS, iter.nextOrd()); + + iter.setDocument(1); + assertEquals(3, iter.nextOrd()); + assertEquals(4, iter.nextOrd()); + assertEquals(5, iter.nextOrd()); + assertEquals(SortedSetDocValues.NO_MORE_ORDS, iter.nextOrd()); + + iter.setDocument(2); + assertEquals(0, iter.nextOrd()); + assertEquals(5, iter.nextOrd()); + assertEquals(SortedSetDocValues.NO_MORE_ORDS, iter.nextOrd()); r.close(); dir.close(); @@ -352,31 +352,24 @@ public class TestDocTermOrds extends Luc } } - TermOrdsIterator iter = null; - final int[] buffer = new int[5]; + SortedSetDocValues iter = dto.iterator(te); for(int docID=0;docID expectedList = new ArrayList(); + long ord; + while ((ord = single.nextOrd()) != SortedSetDocValues.NO_MORE_ORDS) { + expectedList.add(ord); + } + + multi.setDocument(i); + int upto = 0; + while ((ord = multi.nextOrd()) != SortedSetDocValues.NO_MORE_ORDS) { + assertEquals(expectedList.get(upto).longValue(), ord); + upto++; + } + assertEquals(expectedList.size(), 
upto); + } + } + + ir.close(); + ir2.close(); + dir.close(); + } + + // tries to make more dups than testSortedSet + public void testSortedSetWithDups() throws Exception { + assumeTrue("codec does not support SORTED_SET", defaultCodecSupportsSortedSet()); + Directory dir = newDirectory(); + + IndexWriterConfig iwc = newIndexWriterConfig(random(), TEST_VERSION_CURRENT, null); + iwc.setMergePolicy(newLogMergePolicy()); + RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc); + + int numDocs = atLeast(500); + for (int i = 0; i < numDocs; i++) { + Document doc = new Document(); + int numValues = random().nextInt(5); + for (int j = 0; j < numValues; j++) { + doc.add(new SortedSetDocValuesField("bytes", new BytesRef(_TestUtil.randomSimpleString(random(), 2)))); + } + iw.addDocument(doc); + if (random().nextInt(17) == 0) { + iw.commit(); + } + } + DirectoryReader ir = iw.getReader(); + iw.forceMerge(1); + DirectoryReader ir2 = iw.getReader(); + AtomicReader merged = getOnlySegmentReader(ir2); + iw.close(); + + SortedSetDocValues multi = MultiDocValues.getSortedSetValues(ir, "bytes"); + SortedSetDocValues single = merged.getSortedSetDocValues("bytes"); + if (multi == null) { + assertNull(single); + } else { + assertEquals(single.getValueCount(), multi.getValueCount()); + BytesRef actual = new BytesRef(); + BytesRef expected = new BytesRef(); + // check values + for (long i = 0; i < single.getValueCount(); i++) { + single.lookupOrd(i, expected); + multi.lookupOrd(i, actual); + assertEquals(expected, actual); + } + // check ord list + for (int i = 0; i < numDocs; i++) { + single.setDocument(i); + ArrayList expectedList = new ArrayList(); + long ord; + while ((ord = single.nextOrd()) != SortedSetDocValues.NO_MORE_ORDS) { + expectedList.add(ord); + } + + multi.setDocument(i); + int upto = 0; + while ((ord = multi.nextOrd()) != SortedSetDocValues.NO_MORE_ORDS) { + assertEquals(expectedList.get(upto).longValue(), ord); + upto++; + } + assertEquals(expectedList.size(), upto); + } + } + + ir.close(); + ir2.close(); + dir.close(); + } } Modified: lucene/dev/branches/branch_4x/lucene/core/src/test/org/apache/lucene/search/TestFieldCache.java URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/core/src/test/org/apache/lucene/search/TestFieldCache.java?rev=1448085&r1=1448084&r2=1448085&view=diff ============================================================================== --- lucene/dev/branches/branch_4x/lucene/core/src/test/org/apache/lucene/search/TestFieldCache.java (original) +++ lucene/dev/branches/branch_4x/lucene/core/src/test/org/apache/lucene/search/TestFieldCache.java Wed Feb 20 11:19:22 2013 @@ -29,12 +29,14 @@ import java.util.concurrent.atomic.Atomi import java.util.concurrent.atomic.AtomicInteger; import org.apache.lucene.analysis.MockAnalyzer; +import org.apache.lucene.codecs.DocValuesFormat; import org.apache.lucene.document.BinaryDocValuesField; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.IntField; import org.apache.lucene.document.NumericDocValuesField; import org.apache.lucene.document.SortedDocValuesField; +import org.apache.lucene.document.SortedSetDocValuesField; import org.apache.lucene.document.StoredField; import org.apache.lucene.document.StringField; import org.apache.lucene.index.*; @@ -263,45 +265,33 @@ public class TestFieldCache extends Luce terms = cache.getTerms(reader, "bogusfield"); // getDocTermOrds - DocTermOrds termOrds = cache.getDocTermOrds(reader, 
"theRandomUnicodeMultiValuedField"); - TermsEnum termsEnum = termOrds.getOrdTermsEnum(reader); - assertSame("Second request to cache return same DocTermOrds", termOrds, cache.getDocTermOrds(reader, "theRandomUnicodeMultiValuedField")); - DocTermOrds.TermOrdsIterator reuse = null; + SortedSetDocValues termOrds = cache.getDocTermOrds(reader, "theRandomUnicodeMultiValuedField"); + int numEntries = cache.getCacheEntries().length; + // ask for it again, and check that we didnt create any additional entries: + termOrds = cache.getDocTermOrds(reader, "theRandomUnicodeMultiValuedField"); + assertEquals(numEntries, cache.getCacheEntries().length); + for (int i = 0; i < NUM_DOCS; i++) { - reuse = termOrds.lookup(i, reuse); - final int[] buffer = new int[5]; + termOrds.setDocument(i); // This will remove identical terms. A DocTermOrds doesn't return duplicate ords for a docId List values = new ArrayList(new LinkedHashSet(Arrays.asList(multiValued[i]))); - for (;;) { - int chunk = reuse.read(buffer); - if (chunk == 0) { - for (int ord = 0; ord < values.size(); ord++) { - BytesRef term = values.get(ord); - assertNull(String.format(Locale.ROOT, "Document[%d] misses field must be null. Has value %s for ord %d", i, term, ord), term); - } - break; - } - - for(int idx=0; idx < chunk; idx++) { - int key = buffer[idx]; - termsEnum.seekExact((long) key); - String actual = termsEnum.term().utf8ToString(); - String expected = values.get(idx).utf8ToString(); - if (!expected.equals(actual)) { - reuse = termOrds.lookup(i, reuse); - reuse.read(buffer); - } - assertTrue(String.format(Locale.ROOT, "Expected value %s for doc %d and ord %d, but was %s", expected, i, idx, actual), expected.equals(actual)); - } - - if (chunk <= buffer.length) { + for (BytesRef v : values) { + if (v == null) { + // why does this test use null values... 
instead of an empty list: confusing break; } + long ord = termOrds.nextOrd(); + assert ord != SortedSetDocValues.NO_MORE_ORDS; + BytesRef scratch = new BytesRef(); + termOrds.lookupOrd(ord, scratch); + assertEquals(v, scratch); } + assertEquals(SortedSetDocValues.NO_MORE_ORDS, termOrds.nextOrd()); } // test bad field termOrds = cache.getDocTermOrds(reader, "bogusfield"); + assertTrue(termOrds.getValueCount() == 0); FieldCache.DEFAULT.purge(reader); } @@ -445,11 +435,16 @@ public class TestFieldCache extends Luce public void testDocValuesIntegration() throws Exception { assumeTrue("3.x does not support docvalues", defaultCodecSupportsDocValues()); Directory dir = newDirectory(); - RandomIndexWriter iw = new RandomIndexWriter(random(), dir); + IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, null); + RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc); Document doc = new Document(); doc.add(new BinaryDocValuesField("binary", new BytesRef("binary value"))); doc.add(new SortedDocValuesField("sorted", new BytesRef("sorted value"))); doc.add(new NumericDocValuesField("numeric", 42)); + if (defaultCodecSupportsSortedSet()) { + doc.add(new SortedSetDocValuesField("sortedset", new BytesRef("sortedset value1"))); + doc.add(new SortedSetDocValuesField("sortedset", new BytesRef("sortedset value2"))); + } iw.addDocument(doc); DirectoryReader ir = iw.getReader(); iw.close(); @@ -472,15 +467,30 @@ public class TestFieldCache extends Luce fail(); } catch (IllegalStateException expected) {} + try { + FieldCache.DEFAULT.getDocTermOrds(ar, "binary"); + fail(); + } catch (IllegalStateException expected) {} + + try { + new DocTermOrds(ar, "binary"); + fail(); + } catch (IllegalStateException expected) {} + Bits bits = FieldCache.DEFAULT.getDocsWithField(ar, "binary"); assertTrue(bits instanceof Bits.MatchAllBits); - // Sorted type: can be retrieved via getTerms() or getTermsIndex() + // Sorted type: can be retrieved via getTerms(), getTermsIndex(), getDocTermOrds() try { FieldCache.DEFAULT.getInts(ar, "sorted", false); fail(); } catch (IllegalStateException expected) {} + try { + new DocTermOrds(ar, "sorted"); + fail(); + } catch (IllegalStateException expected) {} + binary = FieldCache.DEFAULT.getTerms(ar, "sorted"); binary.get(0, scratch); assertEquals("sorted value", scratch.utf8ToString()); @@ -491,6 +501,12 @@ public class TestFieldCache extends Luce sorted.get(0, scratch); assertEquals("sorted value", scratch.utf8ToString()); + SortedSetDocValues sortedSet = FieldCache.DEFAULT.getDocTermOrds(ar, "sorted"); + sortedSet.setDocument(0); + assertEquals(0, sortedSet.nextOrd()); + assertEquals(SortedSetDocValues.NO_MORE_ORDS, sortedSet.nextOrd()); + assertEquals(1, sortedSet.getValueCount()); + bits = FieldCache.DEFAULT.getDocsWithField(ar, "sorted"); assertTrue(bits instanceof Bits.MatchAllBits); @@ -508,9 +524,52 @@ public class TestFieldCache extends Luce fail(); } catch (IllegalStateException expected) {} + try { + FieldCache.DEFAULT.getDocTermOrds(ar, "numeric"); + fail(); + } catch (IllegalStateException expected) {} + + try { + new DocTermOrds(ar, "numeric"); + fail(); + } catch (IllegalStateException expected) {} + bits = FieldCache.DEFAULT.getDocsWithField(ar, "numeric"); assertTrue(bits instanceof Bits.MatchAllBits); + // SortedSet type: can be retrieved via getDocTermOrds() + if (defaultCodecSupportsSortedSet()) { + try { + FieldCache.DEFAULT.getInts(ar, "sortedset", false); + fail(); + } catch (IllegalStateException expected) {} + + try { + 
FieldCache.DEFAULT.getTerms(ar, "sortedset"); + fail(); + } catch (IllegalStateException expected) {} + + try { + FieldCache.DEFAULT.getTermsIndex(ar, "sortedset"); + fail(); + } catch (IllegalStateException expected) {} + + try { + new DocTermOrds(ar, "sortedset"); + fail(); + } catch (IllegalStateException expected) {} + + sortedSet = FieldCache.DEFAULT.getDocTermOrds(ar, "sortedset"); + sortedSet.setDocument(0); + assertEquals(0, sortedSet.nextOrd()); + assertEquals(1, sortedSet.nextOrd()); + assertEquals(SortedSetDocValues.NO_MORE_ORDS, sortedSet.nextOrd()); + assertEquals(2, sortedSet.getValueCount()); + + bits = FieldCache.DEFAULT.getDocsWithField(ar, "sortedset"); + assertTrue(bits instanceof Bits.MatchAllBits); + } + ir.close(); dir.close(); } @@ -557,6 +616,10 @@ public class TestFieldCache extends Luce sorted.get(0, scratch); assertTrue(scratch.bytes == BinaryDocValues.MISSING); + SortedSetDocValues sortedSet = cache.getDocTermOrds(ar, "bogusmultivalued"); + sortedSet.setDocument(0); + assertEquals(SortedSetDocValues.NO_MORE_ORDS, sortedSet.nextOrd()); + Bits bits = cache.getDocsWithField(ar, "bogusbits"); assertFalse(bits.get(0)); @@ -578,6 +641,7 @@ public class TestFieldCache extends Luce doc.add(new StoredField("bogusdoubles", "bogus")); doc.add(new StoredField("bogusterms", "bogus")); doc.add(new StoredField("bogustermsindex", "bogus")); + doc.add(new StoredField("bogusmultivalued", "bogus")); doc.add(new StoredField("bogusbits", "bogus")); iw.addDocument(doc); DirectoryReader ir = iw.getReader(); @@ -617,6 +681,10 @@ public class TestFieldCache extends Luce sorted.get(0, scratch); assertTrue(scratch.bytes == BinaryDocValues.MISSING); + SortedSetDocValues sortedSet = cache.getDocTermOrds(ar, "bogusmultivalued"); + sortedSet.setDocument(0); + assertEquals(SortedSetDocValues.NO_MORE_ORDS, sortedSet.nextOrd()); + Bits bits = cache.getDocsWithField(ar, "bogusbits"); assertFalse(bits.get(0)); Modified: lucene/dev/branches/branch_4x/lucene/core/src/test/org/apache/lucene/util/packed/TestPackedInts.java URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/core/src/test/org/apache/lucene/util/packed/TestPackedInts.java?rev=1448085&r1=1448084&r2=1448085&view=diff ============================================================================== --- lucene/dev/branches/branch_4x/lucene/core/src/test/org/apache/lucene/util/packed/TestPackedInts.java (original) +++ lucene/dev/branches/branch_4x/lucene/core/src/test/org/apache/lucene/util/packed/TestPackedInts.java Wed Feb 20 11:19:22 2013 @@ -805,42 +805,55 @@ public class TestPackedInts extends Luce } public void testAppendingLongBuffer() { - final long[] arr = new long[RandomInts.randomIntBetween(random(), 1, 2000000)]; - for (int bpv : new int[] {0, 1, 63, 64, RandomInts.randomIntBetween(random(), 2, 61)}) { - if (bpv == 0) { - Arrays.fill(arr, random().nextLong()); - } else if (bpv == 64) { + final long[] arr = new long[RandomInts.randomIntBetween(random(), 1, 1000000)]; + for (int bpv : new int[] {0, 1, 63, 64, RandomInts.randomIntBetween(random(), 2, 62)}) { + for (boolean monotonic : new boolean[] {true, false}) { + AbstractAppendingLongBuffer buf; + final int inc; + if (monotonic) { + buf = new MonotonicAppendingLongBuffer(); + inc = _TestUtil.nextInt(random(), -1000, 1000); + } else { + buf = new AppendingLongBuffer(); + inc = 0; + } + if (bpv == 0) { + arr[0] = random().nextLong(); + for (int i = 1; i < arr.length; ++i) { + arr[i] = arr[i-1] + inc; + } + } else if (bpv == 64) { + for (int i = 0; i < arr.length; 
Modified: lucene/dev/branches/branch_4x/lucene/facet/src/java/org/apache/lucene/facet/codecs/facet42/Facet42DocValuesConsumer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/facet/src/java/org/apache/lucene/facet/codecs/facet42/Facet42DocValuesConsumer.java?rev=1448085&r1=1448084&r2=1448085&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/facet/src/java/org/apache/lucene/facet/codecs/facet42/Facet42DocValuesConsumer.java (original)
+++ lucene/dev/branches/branch_4x/lucene/facet/src/java/org/apache/lucene/facet/codecs/facet42/Facet42DocValuesConsumer.java Wed Feb 20 11:19:22 2013
@@ -98,6 +98,11 @@ public class Facet42DocValuesConsumer ex
   }

   @Override
+  public void addSortedSetField(FieldInfo field, Iterable<BytesRef> values, Iterable<Number> docToOrdCount, Iterable<Number> ords) throws IOException {
+    throw new UnsupportedOperationException("FacetsDocValues can only handle binary fields");
+  }
+
+  @Override
   public void close() throws IOException {
     boolean success = false;
     try {
Modified: lucene/dev/branches/branch_4x/lucene/facet/src/java/org/apache/lucene/facet/codecs/facet42/Facet42DocValuesProducer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/facet/src/java/org/apache/lucene/facet/codecs/facet42/Facet42DocValuesProducer.java?rev=1448085&r1=1448084&r2=1448085&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/facet/src/java/org/apache/lucene/facet/codecs/facet42/Facet42DocValuesProducer.java (original)
+++ lucene/dev/branches/branch_4x/lucene/facet/src/java/org/apache/lucene/facet/codecs/facet42/Facet42DocValuesProducer.java Wed Feb 20 11:19:22 2013
@@ -29,6 +29,7 @@ import org.apache.lucene.index.IndexFile
 import org.apache.lucene.index.NumericDocValues;
 import org.apache.lucene.index.SegmentReadState;
 import org.apache.lucene.index.SortedDocValues;
+import org.apache.lucene.index.SortedSetDocValues;
 import org.apache.lucene.store.IndexInput;
 import org.apache.lucene.util.IOUtils;

@@ -75,6 +76,11 @@ class Facet42DocValuesProducer extends D
   }

   @Override
+  public SortedSetDocValues getSortedSet(FieldInfo field) throws IOException {
+    throw new UnsupportedOperationException("FacetsDocValues only implements binary");
+  }
+
+  @Override
   public void close() throws IOException {
   }
 }
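Both Facet42 changes follow the same pattern: this doc-values format only ever stores the facet module's binary payload, so the new sorted-set hooks are implemented as explicit rejections rather than silent no-ops. A hypothetical wiring sketch of where that matters: per-field codec routing decides which fields ever reach this format, so a sorted-set field routed here would surface the UnsupportedOperationException added above (the format name "Facet42" and the field name "$facets" are assumptions for illustration, not taken from this patch):

    import org.apache.lucene.codecs.DocValuesFormat;
    import org.apache.lucene.codecs.lucene42.Lucene42Codec;

    class FacetRoutingSketch extends Lucene42Codec {
      // Name-based SPI lookup; the registered name is assumed here.
      private final DocValuesFormat facetFormat = DocValuesFormat.forName("Facet42");

      @Override
      public DocValuesFormat getDocValuesFormatForField(String field) {
        // Only the dedicated facet field is sent to the binary-only format;
        // every other field keeps the codec's default doc-values format.
        return "$facets".equals(field) ? facetFormat : super.getDocValuesFormatForField(field);
      }
    }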