Return-Path:
X-Original-To: archive-asf-public-internal@cust-asf2.ponee.io
Delivered-To: archive-asf-public-internal@cust-asf2.ponee.io
Received: from cust-asf.ponee.io (cust-asf.ponee.io [163.172.22.183]) by cust-asf2.ponee.io (Postfix) with ESMTP id 1A8A7200AE2 for ; Fri, 27 May 2016 18:42:24 +0200 (CEST)
Received: by cust-asf.ponee.io (Postfix) id 1910E160A10; Fri, 27 May 2016 16:42:24 +0000 (UTC)
Delivered-To: archive-asf-public@cust-asf.ponee.io
Received: from mail.apache.org (hermes.apache.org [140.211.11.3]) by cust-asf.ponee.io (Postfix) with SMTP id BF42A160A3B for ; Fri, 27 May 2016 18:42:21 +0200 (CEST)
Received: (qmail 34710 invoked by uid 500); 27 May 2016 16:42:20 -0000
Mailing-List: contact commits-help@lucene.apache.org; run by ezmlm
Precedence: bulk
List-Help:
List-Unsubscribe:
List-Post:
List-Id:
Reply-To: dev@lucene.apache.org
Delivered-To: mailing list commits@lucene.apache.org
Received: (qmail 34191 invoked by uid 99); 27 May 2016 16:42:20 -0000
Received: from git1-us-west.apache.org (HELO git1-us-west.apache.org) (140.211.11.23) by apache.org (qpsmtpd/0.29) with ESMTP; Fri, 27 May 2016 16:42:20 +0000
Received: by git1-us-west.apache.org (ASF Mail Server at git1-us-west.apache.org, from userid 33) id 2BE79E0103; Fri, 27 May 2016 16:42:19 +0000 (UTC)
Content-Type: text/plain; charset="us-ascii"
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
From: yonik@apache.org
To: commits@lucene.apache.org
Date: Fri, 27 May 2016 16:42:22 -0000
Message-Id:
In-Reply-To: <88dcd42928bf4fb3a9c2feca8ce720ca@git.apache.org>
References: <88dcd42928bf4fb3a9c2feca8ce720ca@git.apache.org>
X-Mailer: ASF-Git Admin Mailer
Subject: [4/6] lucene-solr:branch_6x: SOLR-9160: Sync 6x and 7.0 move of UninvertingReader, SlowCompositeReaderWrapper for Solr (LUCENE-7283)
archived-at: Fri, 27 May 2016 16:42:24 -0000


http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/5525f429/solr/core/src/java/org/apache/solr/uninverting/FieldCacheImpl.java
----------------------------------------------------------------------
diff --git a/solr/core/src/java/org/apache/solr/uninverting/FieldCacheImpl.java b/solr/core/src/java/org/apache/solr/uninverting/FieldCacheImpl.java
new file mode 100644
index 0000000..e6a066d
--- /dev/null
+++ b/solr/core/src/java/org/apache/solr/uninverting/FieldCacheImpl.java
@@ -0,0 +1,1085 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ +package org.apache.solr.uninverting; + +import java.io.IOException; +import java.io.PrintStream; +import java.util.ArrayList; +import java.util.Collection; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.WeakHashMap; + +import org.apache.lucene.index.BinaryDocValues; +import org.apache.lucene.index.DocValues; +import org.apache.lucene.index.DocValuesType; +import org.apache.lucene.index.PostingsEnum; +import org.apache.lucene.index.FieldInfo; +import org.apache.lucene.index.IndexOptions; +import org.apache.lucene.index.LeafReader; +import org.apache.lucene.index.NumericDocValues; +import org.apache.lucene.index.PointValues; +import org.apache.lucene.index.PointValues.IntersectVisitor; +import org.apache.lucene.index.PointValues.Relation; +import org.apache.lucene.index.SegmentReader; +import org.apache.lucene.index.SortedDocValues; +import org.apache.lucene.index.SortedSetDocValues; +import org.apache.lucene.index.Terms; +import org.apache.lucene.index.TermsEnum; +import org.apache.lucene.search.DocIdSetIterator; +import org.apache.lucene.util.Accountable; +import org.apache.lucene.util.Accountables; +import org.apache.lucene.util.Bits; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.FixedBitSet; +import org.apache.lucene.util.PagedBytes; +import org.apache.lucene.util.RamUsageEstimator; +import org.apache.lucene.util.packed.GrowableWriter; +import org.apache.lucene.util.packed.PackedInts; +import org.apache.lucene.util.packed.PackedLongValues; + +/** + * Expert: The default cache implementation, storing all values in memory. + * A WeakHashMap is used for storage. + * + * @since lucene 1.4 + */ +class FieldCacheImpl implements FieldCache { + + private Map,Cache> caches; + FieldCacheImpl() { + init(); + } + + private synchronized void init() { + caches = new HashMap<>(6); + caches.put(Long.TYPE, new LongCache(this)); + caches.put(BinaryDocValues.class, new BinaryDocValuesCache(this)); + caches.put(SortedDocValues.class, new SortedDocValuesCache(this)); + caches.put(DocTermOrds.class, new DocTermOrdsCache(this)); + caches.put(DocsWithFieldCache.class, new DocsWithFieldCache(this)); + } + + @Override + public synchronized void purgeAllCaches() { + init(); + } + + @Override + public synchronized void purgeByCacheKey(Object coreCacheKey) { + for(Cache c : caches.values()) { + c.purgeByCacheKey(coreCacheKey); + } + } + + @Override + public synchronized CacheEntry[] getCacheEntries() { + List result = new ArrayList<>(17); + for(final Map.Entry,Cache> cacheEntry: caches.entrySet()) { + final Cache cache = cacheEntry.getValue(); + final Class cacheType = cacheEntry.getKey(); + synchronized(cache.readerCache) { + for (final Map.Entry> readerCacheEntry : cache.readerCache.entrySet()) { + final Object readerKey = readerCacheEntry.getKey(); + if (readerKey == null) continue; + final Map innerCache = readerCacheEntry.getValue(); + for (final Map.Entry mapEntry : innerCache.entrySet()) { + CacheKey entry = mapEntry.getKey(); + result.add(new CacheEntry(readerKey, entry.field, + cacheType, entry.custom, + mapEntry.getValue())); + } + } + } + } + return result.toArray(new CacheEntry[result.size()]); + } + + // per-segment fieldcaches don't purge until the shared core closes. 
+ final SegmentReader.CoreClosedListener purgeCore = new SegmentReader.CoreClosedListener() { + @Override + public void onClose(Object ownerCoreCacheKey) { + FieldCacheImpl.this.purgeByCacheKey(ownerCoreCacheKey); + } + }; + + private void initReader(LeafReader reader) { + reader.addCoreClosedListener(purgeCore); + } + + /** Expert: Internal cache. */ + abstract static class Cache { + + Cache(FieldCacheImpl wrapper) { + this.wrapper = wrapper; + } + + final FieldCacheImpl wrapper; + + final Map> readerCache = new WeakHashMap<>(); + + protected abstract Accountable createValue(LeafReader reader, CacheKey key, boolean setDocsWithField) + throws IOException; + + /** Remove this reader from the cache, if present. */ + public void purgeByCacheKey(Object coreCacheKey) { + synchronized(readerCache) { + readerCache.remove(coreCacheKey); + } + } + + /** Sets the key to the value for the provided reader; + * if the key is already set then this doesn't change it. */ + public void put(LeafReader reader, CacheKey key, Accountable value) { + final Object readerKey = reader.getCoreCacheKey(); + synchronized (readerCache) { + Map innerCache = readerCache.get(readerKey); + if (innerCache == null) { + // First time this reader is using FieldCache + innerCache = new HashMap<>(); + readerCache.put(readerKey, innerCache); + wrapper.initReader(reader); + } + if (innerCache.get(key) == null) { + innerCache.put(key, value); + } else { + // Another thread beat us to it; leave the current + // value + } + } + } + + public Object get(LeafReader reader, CacheKey key, boolean setDocsWithField) throws IOException { + Map innerCache; + Accountable value; + final Object readerKey = reader.getCoreCacheKey(); + synchronized (readerCache) { + innerCache = readerCache.get(readerKey); + if (innerCache == null) { + // First time this reader is using FieldCache + innerCache = new HashMap<>(); + readerCache.put(readerKey, innerCache); + wrapper.initReader(reader); + value = null; + } else { + value = innerCache.get(key); + } + if (value == null) { + value = new CreationPlaceholder(); + innerCache.put(key, value); + } + } + if (value instanceof CreationPlaceholder) { + synchronized (value) { + CreationPlaceholder progress = (CreationPlaceholder) value; + if (progress.value == null) { + progress.value = createValue(reader, key, setDocsWithField); + synchronized (readerCache) { + innerCache.put(key, progress.value); + } + + // Only check if key.custom (the parser) is + // non-null; else, we check twice for a single + // call to FieldCache.getXXX + if (key.custom != null && wrapper != null) { + final PrintStream infoStream = wrapper.getInfoStream(); + if (infoStream != null) { + printNewInsanity(infoStream, progress.value); + } + } + } + return progress.value; + } + } + return value; + } + + private void printNewInsanity(PrintStream infoStream, Object value) { + final FieldCacheSanityChecker.Insanity[] insanities = FieldCacheSanityChecker.checkSanity(wrapper); + for(int i=0;i 0; + + if (setDocsWithField) { + final int docCount = values.getDocCount(field); + assert docCount <= maxDoc; + if (docCount == maxDoc) { + // Fast case: all docs have this field: + this.docsWithField = new Bits.MatchAllBits(maxDoc); + setDocsWithField = false; + } + } + + final boolean doDocsWithField = setDocsWithField; + BytesRef scratch = new BytesRef(); + values.intersect(field, new IntersectVisitor() { + @Override + public void visit(int docID) throws IOException { + throw new AssertionError(); + } + + @Override + public void visit(int docID, byte[] 
packedValue) throws IOException { + scratch.bytes = packedValue; + scratch.length = packedValue.length; + visitTerm(scratch); + visitDoc(docID); + if (doDocsWithField) { + if (docsWithField == null) { + // Lazy init + docsWithField = new FixedBitSet(maxDoc); + } + ((FixedBitSet)docsWithField).set(docID); + } + } + + @Override + public Relation compare(byte[] minPackedValue, byte[] maxPackedValue) { + return Relation.CELL_CROSSES_QUERY; // inspect all byte-docid pairs + } + }); + } + + final void uninvertPostings(LeafReader reader, String field, boolean setDocsWithField) throws IOException { + final int maxDoc = reader.maxDoc(); + Terms terms = reader.terms(field); + if (terms != null) { + if (setDocsWithField) { + final int termsDocCount = terms.getDocCount(); + assert termsDocCount <= maxDoc; + if (termsDocCount == maxDoc) { + // Fast case: all docs have this field: + this.docsWithField = new Bits.MatchAllBits(maxDoc); + setDocsWithField = false; + } + } + + final TermsEnum termsEnum = termsEnum(terms); + + PostingsEnum docs = null; + FixedBitSet docsWithField = null; + while(true) { + final BytesRef term = termsEnum.next(); + if (term == null) { + break; + } + visitTerm(term); + docs = termsEnum.postings(docs, PostingsEnum.NONE); + while (true) { + final int docID = docs.nextDoc(); + if (docID == DocIdSetIterator.NO_MORE_DOCS) { + break; + } + visitDoc(docID); + if (setDocsWithField) { + if (docsWithField == null) { + // Lazy init + this.docsWithField = docsWithField = new FixedBitSet(maxDoc); + } + docsWithField.set(docID); + } + } + } + } + } + + /** @deprecated remove this when legacy numerics are removed */ + @Deprecated + protected abstract TermsEnum termsEnum(Terms terms) throws IOException; + protected abstract void visitTerm(BytesRef term); + protected abstract void visitDoc(int docID); + } + + // null Bits means no docs matched + void setDocsWithField(LeafReader reader, String field, Bits docsWithField, Parser parser) { + final int maxDoc = reader.maxDoc(); + final Bits bits; + if (docsWithField == null) { + bits = new Bits.MatchNoBits(maxDoc); + } else if (docsWithField instanceof FixedBitSet) { + final int numSet = ((FixedBitSet) docsWithField).cardinality(); + if (numSet >= maxDoc) { + // The cardinality of the BitSet is maxDoc if all documents have a value. 
+ assert numSet == maxDoc; + bits = new Bits.MatchAllBits(maxDoc); + } else { + bits = docsWithField; + } + } else { + bits = docsWithField; + } + caches.get(DocsWithFieldCache.class).put(reader, new CacheKey(field, parser), new BitsEntry(bits)); + } + + private static class HoldsOneThing { + private T it; + + public void set(T it) { + this.it = it; + } + + public T get() { + return it; + } + } + + private static class GrowableWriterAndMinValue { + GrowableWriterAndMinValue(GrowableWriter array, long minValue) { + this.writer = array; + this.minValue = minValue; + } + public GrowableWriter writer; + public long minValue; + } + + public Bits getDocsWithField(LeafReader reader, String field, Parser parser) throws IOException { + final FieldInfo fieldInfo = reader.getFieldInfos().fieldInfo(field); + if (fieldInfo == null) { + // field does not exist or has no value + return new Bits.MatchNoBits(reader.maxDoc()); + } else if (fieldInfo.getDocValuesType() != DocValuesType.NONE) { + return reader.getDocsWithField(field); + } + + if (parser instanceof PointParser) { + // points case + + } else { + // postings case + if (fieldInfo.getIndexOptions() == IndexOptions.NONE) { + return new Bits.MatchNoBits(reader.maxDoc()); + } + } + BitsEntry bitsEntry = (BitsEntry) caches.get(DocsWithFieldCache.class).get(reader, new CacheKey(field, parser), false); + return bitsEntry.bits; + } + + static class BitsEntry implements Accountable { + final Bits bits; + + BitsEntry(Bits bits) { + this.bits = bits; + } + + @Override + public long ramBytesUsed() { + long base = RamUsageEstimator.NUM_BYTES_OBJECT_REF; + if (bits instanceof Bits.MatchAllBits || bits instanceof Bits.MatchNoBits) { + return base; + } else { + return base + (bits.length() >>> 3); + } + } + } + + static final class DocsWithFieldCache extends Cache { + DocsWithFieldCache(FieldCacheImpl wrapper) { + super(wrapper); + } + + @Override + protected BitsEntry createValue(LeafReader reader, CacheKey key, boolean setDocsWithField /* ignored */) throws IOException { + final String field = key.field; + final Parser parser = (Parser) key.custom; + if (parser instanceof PointParser) { + return createValuePoints(reader, field); + } else { + return createValuePostings(reader, field); + } + } + + private BitsEntry createValuePoints(LeafReader reader, String field) throws IOException { + final int maxDoc = reader.maxDoc(); + PointValues values = reader.getPointValues(); + assert values != null; + assert values.size(field) > 0; + + final int docCount = values.getDocCount(field); + assert docCount <= maxDoc; + if (docCount == maxDoc) { + // Fast case: all docs have this field: + return new BitsEntry(new Bits.MatchAllBits(maxDoc)); + } + + // otherwise a no-op uninvert! + Uninvert u = new Uninvert(true) { + @Override + protected TermsEnum termsEnum(Terms terms) throws IOException { + throw new AssertionError(); + } + + @Override + protected void visitTerm(BytesRef term) {} + + @Override + protected void visitDoc(int docID) {} + }; + u.uninvert(reader, field, true); + return new BitsEntry(u.docsWithField); + } + + // TODO: it is dumb that uninverting code is duplicated here in this method!! 
+ private BitsEntry createValuePostings(LeafReader reader, String field) throws IOException { + final int maxDoc = reader.maxDoc(); + + // Visit all docs that have terms for this field + FixedBitSet res = null; + Terms terms = reader.terms(field); + if (terms != null) { + final int termsDocCount = terms.getDocCount(); + assert termsDocCount <= maxDoc; + if (termsDocCount == maxDoc) { + // Fast case: all docs have this field: + return new BitsEntry(new Bits.MatchAllBits(maxDoc)); + } + final TermsEnum termsEnum = terms.iterator(); + PostingsEnum docs = null; + while(true) { + final BytesRef term = termsEnum.next(); + if (term == null) { + break; + } + if (res == null) { + // lazy init + res = new FixedBitSet(maxDoc); + } + + docs = termsEnum.postings(docs, PostingsEnum.NONE); + // TODO: use bulk API + while (true) { + final int docID = docs.nextDoc(); + if (docID == DocIdSetIterator.NO_MORE_DOCS) { + break; + } + res.set(docID); + } + } + } + if (res == null) { + return new BitsEntry(new Bits.MatchNoBits(maxDoc)); + } + final int numSet = res.cardinality(); + if (numSet >= maxDoc) { + // The cardinality of the BitSet is maxDoc if all documents have a value. + assert numSet == maxDoc; + return new BitsEntry(new Bits.MatchAllBits(maxDoc)); + } + return new BitsEntry(res); + } + } + + @Override + public NumericDocValues getNumerics(LeafReader reader, String field, Parser parser, boolean setDocsWithField) throws IOException { + if (parser == null) { + throw new NullPointerException(); + } + final NumericDocValues valuesIn = reader.getNumericDocValues(field); + if (valuesIn != null) { + // Not cached here by FieldCacheImpl (cached instead + // per-thread by SegmentReader): + return valuesIn; + } else { + final FieldInfo info = reader.getFieldInfos().fieldInfo(field); + if (info == null) { + return DocValues.emptyNumeric(); + } else if (info.getDocValuesType() != DocValuesType.NONE) { + throw new IllegalStateException("Type mismatch: " + field + " was indexed as " + info.getDocValuesType()); + } + + if (parser instanceof PointParser) { + // points case + // no points in this segment + if (info.getPointDimensionCount() == 0) { + return DocValues.emptyNumeric(); + } + if (info.getPointDimensionCount() != 1) { + throw new IllegalStateException("Type mismatch: " + field + " was indexed with dimensions=" + info.getPointDimensionCount()); + } + PointValues values = reader.getPointValues(); + // no actual points for this field (e.g. 
all points deleted) + if (values == null || values.size(field) == 0) { + return DocValues.emptyNumeric(); + } + // not single-valued + if (values.size(field) != values.getDocCount(field)) { + throw new IllegalStateException("Type mismatch: " + field + " was indexed with multiple values, numValues=" + values.size(field) + ",numDocs=" + values.getDocCount(field)); + } + } else { + // postings case + // not indexed + if (info.getIndexOptions() == IndexOptions.NONE) { + return DocValues.emptyNumeric(); + } + } + return (NumericDocValues) caches.get(Long.TYPE).get(reader, new CacheKey(field, parser), setDocsWithField); + } + } + + static class LongsFromArray extends NumericDocValues implements Accountable { + private final PackedInts.Reader values; + private final long minValue; + + public LongsFromArray(PackedInts.Reader values, long minValue) { + this.values = values; + this.minValue = minValue; + } + + @Override + public long get(int docID) { + return minValue + values.get(docID); + } + + @Override + public long ramBytesUsed() { + return values.ramBytesUsed() + RamUsageEstimator.NUM_BYTES_OBJECT_REF + Long.BYTES; + } + } + + static final class LongCache extends Cache { + LongCache(FieldCacheImpl wrapper) { + super(wrapper); + } + + @Override + protected Accountable createValue(final LeafReader reader, CacheKey key, boolean setDocsWithField) + throws IOException { + + final Parser parser = (Parser) key.custom; + + final HoldsOneThing valuesRef = new HoldsOneThing<>(); + + Uninvert u = new Uninvert(parser instanceof PointParser) { + private long minValue; + private long currentValue; + private GrowableWriter values; + + @Override + public void visitTerm(BytesRef term) { + currentValue = parser.parseValue(term); + if (values == null) { + // Lazy alloc so for the numeric field case + // (which will hit a NumberFormatException + // when we first try the DEFAULT_INT_PARSER), + // we don't double-alloc: + int startBitsPerValue; + // Make sure than missing values (0) can be stored without resizing + if (currentValue < 0) { + minValue = currentValue; + startBitsPerValue = minValue == Long.MIN_VALUE ? 
64 : PackedInts.bitsRequired(-minValue); + } else { + minValue = 0; + startBitsPerValue = PackedInts.bitsRequired(currentValue); + } + values = new GrowableWriter(startBitsPerValue, reader.maxDoc(), PackedInts.FAST); + if (minValue != 0) { + values.fill(0, values.size(), -minValue); // default value must be 0 + } + valuesRef.set(new GrowableWriterAndMinValue(values, minValue)); + } + } + + @Override + public void visitDoc(int docID) { + values.set(docID, currentValue - minValue); + } + + @Override + protected TermsEnum termsEnum(Terms terms) throws IOException { + return parser.termsEnum(terms); + } + }; + + u.uninvert(reader, key.field, setDocsWithField); + + if (setDocsWithField) { + wrapper.setDocsWithField(reader, key.field, u.docsWithField, parser); + } + GrowableWriterAndMinValue values = valuesRef.get(); + if (values == null) { + return new LongsFromArray(new PackedInts.NullReader(reader.maxDoc()), 0L); + } + return new LongsFromArray(values.writer.getMutable(), values.minValue); + } + } + + public static class SortedDocValuesImpl implements Accountable { + private final PagedBytes.Reader bytes; + private final PackedLongValues termOrdToBytesOffset; + private final PackedInts.Reader docToTermOrd; + private final int numOrd; + + public SortedDocValuesImpl(PagedBytes.Reader bytes, PackedLongValues termOrdToBytesOffset, PackedInts.Reader docToTermOrd, int numOrd) { + this.bytes = bytes; + this.docToTermOrd = docToTermOrd; + this.termOrdToBytesOffset = termOrdToBytesOffset; + this.numOrd = numOrd; + } + + public SortedDocValues iterator() { + final BytesRef term = new BytesRef(); + return new SortedDocValues() { + + @Override + public int getValueCount() { + return numOrd; + } + + @Override + public int getOrd(int docID) { + // Subtract 1, matching the 1+ord we did when + // storing, so that missing values, which are 0 in the + // packed ints, are returned as -1 ord: + return (int) docToTermOrd.get(docID)-1; + } + + @Override + public BytesRef lookupOrd(int ord) { + if (ord < 0) { + throw new IllegalArgumentException("ord must be >=0 (got ord=" + ord + ")"); + } + bytes.fill(term, termOrdToBytesOffset.get(ord)); + return term; + } + }; + } + + @Override + public long ramBytesUsed() { + return bytes.ramBytesUsed() + + termOrdToBytesOffset.ramBytesUsed() + + docToTermOrd.ramBytesUsed() + + 3*RamUsageEstimator.NUM_BYTES_OBJECT_REF + + Integer.BYTES; + } + + @Override + public Collection getChildResources() { + List resources = new ArrayList<>(3); + resources.add(Accountables.namedAccountable("term bytes", bytes)); + resources.add(Accountables.namedAccountable("ord -> term", termOrdToBytesOffset)); + resources.add(Accountables.namedAccountable("doc -> ord", docToTermOrd)); + return Collections.unmodifiableList(resources); + } + } + + public SortedDocValues getTermsIndex(LeafReader reader, String field) throws IOException { + return getTermsIndex(reader, field, PackedInts.FAST); + } + + public SortedDocValues getTermsIndex(LeafReader reader, String field, float acceptableOverheadRatio) throws IOException { + SortedDocValues valuesIn = reader.getSortedDocValues(field); + if (valuesIn != null) { + // Not cached here by FieldCacheImpl (cached instead + // per-thread by SegmentReader): + return valuesIn; + } else { + final FieldInfo info = reader.getFieldInfos().fieldInfo(field); + if (info == null) { + return DocValues.emptySorted(); + } else if (info.getDocValuesType() != DocValuesType.NONE) { + // we don't try to build a sorted instance from numeric/binary doc + // values because dedup can be 
very costly + throw new IllegalStateException("Type mismatch: " + field + " was indexed as " + info.getDocValuesType()); + } else if (info.getIndexOptions() == IndexOptions.NONE) { + return DocValues.emptySorted(); + } + SortedDocValuesImpl impl = (SortedDocValuesImpl) caches.get(SortedDocValues.class).get(reader, new CacheKey(field, acceptableOverheadRatio), false); + return impl.iterator(); + } + } + + static class SortedDocValuesCache extends Cache { + SortedDocValuesCache(FieldCacheImpl wrapper) { + super(wrapper); + } + + @Override + protected Accountable createValue(LeafReader reader, CacheKey key, boolean setDocsWithField /* ignored */) + throws IOException { + + final int maxDoc = reader.maxDoc(); + + Terms terms = reader.terms(key.field); + + final float acceptableOverheadRatio = ((Float) key.custom).floatValue(); + + final PagedBytes bytes = new PagedBytes(15); + + int startTermsBPV; + + // TODO: use Uninvert? + if (terms != null) { + // Try for coarse estimate for number of bits; this + // should be an underestimate most of the time, which + // is fine -- GrowableWriter will reallocate as needed + long numUniqueTerms = terms.size(); + if (numUniqueTerms != -1L) { + if (numUniqueTerms > maxDoc) { + throw new IllegalStateException("Type mismatch: " + key.field + " was indexed with multiple values per document, use SORTED_SET instead"); + } + + startTermsBPV = PackedInts.bitsRequired(numUniqueTerms); + } else { + startTermsBPV = 1; + } + } else { + startTermsBPV = 1; + } + + PackedLongValues.Builder termOrdToBytesOffset = PackedLongValues.monotonicBuilder(PackedInts.COMPACT); + final GrowableWriter docToTermOrd = new GrowableWriter(startTermsBPV, maxDoc, acceptableOverheadRatio); + + int termOrd = 0; + + // TODO: use Uninvert? + + if (terms != null) { + final TermsEnum termsEnum = terms.iterator(); + PostingsEnum docs = null; + + while(true) { + final BytesRef term = termsEnum.next(); + if (term == null) { + break; + } + if (termOrd >= maxDoc) { + throw new IllegalStateException("Type mismatch: " + key.field + " was indexed with multiple values per document, use SORTED_SET instead"); + } + + termOrdToBytesOffset.add(bytes.copyUsingLengthPrefix(term)); + docs = termsEnum.postings(docs, PostingsEnum.NONE); + while (true) { + final int docID = docs.nextDoc(); + if (docID == DocIdSetIterator.NO_MORE_DOCS) { + break; + } + // Store 1+ ord into packed bits + docToTermOrd.set(docID, 1+termOrd); + } + termOrd++; + } + } + + // maybe an int-only impl? 
+ return new SortedDocValuesImpl(bytes.freeze(true), termOrdToBytesOffset.build(), docToTermOrd.getMutable(), termOrd); + } + } + + private static class BinaryDocValuesImpl implements Accountable { + private final PagedBytes.Reader bytes; + private final PackedInts.Reader docToOffset; + + public BinaryDocValuesImpl(PagedBytes.Reader bytes, PackedInts.Reader docToOffset) { + this.bytes = bytes; + this.docToOffset = docToOffset; + } + + public BinaryDocValues iterator() { + final BytesRef term = new BytesRef(); + return new BinaryDocValues() { + @Override + public BytesRef get(int docID) { + final long pointer = docToOffset.get(docID); + if (pointer == 0) { + term.length = 0; + } else { + bytes.fill(term, pointer); + } + return term; + } + }; + } + + @Override + public long ramBytesUsed() { + return bytes.ramBytesUsed() + docToOffset.ramBytesUsed() + 2*RamUsageEstimator.NUM_BYTES_OBJECT_REF; + } + + @Override + public Collection getChildResources() { + List resources = new ArrayList<>(2); + resources.add(Accountables.namedAccountable("term bytes", bytes)); + resources.add(Accountables.namedAccountable("addresses", docToOffset)); + return Collections.unmodifiableList(resources); + } + } + + // TODO: this if DocTermsIndex was already created, we + // should share it... + public BinaryDocValues getTerms(LeafReader reader, String field, boolean setDocsWithField) throws IOException { + return getTerms(reader, field, setDocsWithField, PackedInts.FAST); + } + + public BinaryDocValues getTerms(LeafReader reader, String field, boolean setDocsWithField, float acceptableOverheadRatio) throws IOException { + BinaryDocValues valuesIn = reader.getBinaryDocValues(field); + if (valuesIn == null) { + valuesIn = reader.getSortedDocValues(field); + } + + if (valuesIn != null) { + // Not cached here by FieldCacheImpl (cached instead + // per-thread by SegmentReader): + return valuesIn; + } + + final FieldInfo info = reader.getFieldInfos().fieldInfo(field); + if (info == null) { + return DocValues.emptyBinary(); + } else if (info.getDocValuesType() != DocValuesType.NONE) { + throw new IllegalStateException("Type mismatch: " + field + " was indexed as " + info.getDocValuesType()); + } else if (info.getIndexOptions() == IndexOptions.NONE) { + return DocValues.emptyBinary(); + } + + BinaryDocValuesImpl impl = (BinaryDocValuesImpl) caches.get(BinaryDocValues.class).get(reader, new CacheKey(field, acceptableOverheadRatio), setDocsWithField); + return impl.iterator(); + } + + static final class BinaryDocValuesCache extends Cache { + BinaryDocValuesCache(FieldCacheImpl wrapper) { + super(wrapper); + } + + @Override + protected Accountable createValue(LeafReader reader, CacheKey key, boolean setDocsWithField) + throws IOException { + + // TODO: would be nice to first check if DocTermsIndex + // was already cached for this field and then return + // that instead, to avoid insanity + + final int maxDoc = reader.maxDoc(); + Terms terms = reader.terms(key.field); + + final float acceptableOverheadRatio = ((Float) key.custom).floatValue(); + + final int termCountHardLimit = maxDoc; + + // Holds the actual term data, expanded. 
+ final PagedBytes bytes = new PagedBytes(15); + + int startBPV; + + if (terms != null) { + // Try for coarse estimate for number of bits; this + // should be an underestimate most of the time, which + // is fine -- GrowableWriter will reallocate as needed + long numUniqueTerms = terms.size(); + if (numUniqueTerms != -1L) { + if (numUniqueTerms > termCountHardLimit) { + numUniqueTerms = termCountHardLimit; + } + startBPV = PackedInts.bitsRequired(numUniqueTerms*4); + } else { + startBPV = 1; + } + } else { + startBPV = 1; + } + + final GrowableWriter docToOffset = new GrowableWriter(startBPV, maxDoc, acceptableOverheadRatio); + + // pointer==0 means not set + bytes.copyUsingLengthPrefix(new BytesRef()); + + if (terms != null) { + int termCount = 0; + final TermsEnum termsEnum = terms.iterator(); + PostingsEnum docs = null; + while(true) { + if (termCount++ == termCountHardLimit) { + // app is misusing the API (there is more than + // one term per doc); in this case we make best + // effort to load what we can (see LUCENE-2142) + break; + } + + final BytesRef term = termsEnum.next(); + if (term == null) { + break; + } + final long pointer = bytes.copyUsingLengthPrefix(term); + docs = termsEnum.postings(docs, PostingsEnum.NONE); + while (true) { + final int docID = docs.nextDoc(); + if (docID == DocIdSetIterator.NO_MORE_DOCS) { + break; + } + docToOffset.set(docID, pointer); + } + } + } + + final PackedInts.Reader offsetReader = docToOffset.getMutable(); + if (setDocsWithField) { + wrapper.setDocsWithField(reader, key.field, new Bits() { + @Override + public boolean get(int index) { + return offsetReader.get(index) != 0; + } + + @Override + public int length() { + return maxDoc; + } + }, null); + } + // maybe an int-only impl? + return new BinaryDocValuesImpl(bytes.freeze(true), offsetReader); + } + } + + // TODO: this if DocTermsIndex was already created, we + // should share it... + public SortedSetDocValues getDocTermOrds(LeafReader reader, String field, BytesRef prefix) throws IOException { + // not a general purpose filtering mechanism... + assert prefix == null || prefix == INT32_TERM_PREFIX || prefix == INT64_TERM_PREFIX; + + SortedSetDocValues dv = reader.getSortedSetDocValues(field); + if (dv != null) { + return dv; + } + + SortedDocValues sdv = reader.getSortedDocValues(field); + if (sdv != null) { + return DocValues.singleton(sdv); + } + + final FieldInfo info = reader.getFieldInfos().fieldInfo(field); + if (info == null) { + return DocValues.emptySortedSet(); + } else if (info.getDocValuesType() != DocValuesType.NONE) { + throw new IllegalStateException("Type mismatch: " + field + " was indexed as " + info.getDocValuesType()); + } else if (info.getIndexOptions() == IndexOptions.NONE) { + return DocValues.emptySortedSet(); + } + + // ok we need to uninvert. check if we can optimize a bit. + + Terms terms = reader.terms(field); + if (terms == null) { + return DocValues.emptySortedSet(); + } else { + // if #postings = #docswithfield we know that the field is "single valued enough". + // it's possible the same term might appear twice in the same document, but SORTED_SET discards frequency. 
+ // it's still ok with filtering (which we limit to numerics), it just means precisionStep = Inf + long numPostings = terms.getSumDocFreq(); + if (numPostings != -1 && numPostings == terms.getDocCount()) { + return DocValues.singleton(getTermsIndex(reader, field)); + } + } + + DocTermOrds dto = (DocTermOrds) caches.get(DocTermOrds.class).get(reader, new CacheKey(field, prefix), false); + return dto.iterator(reader); + } + + static final class DocTermOrdsCache extends Cache { + DocTermOrdsCache(FieldCacheImpl wrapper) { + super(wrapper); + } + + @Override + protected Accountable createValue(LeafReader reader, CacheKey key, boolean setDocsWithField /* ignored */) + throws IOException { + BytesRef prefix = (BytesRef) key.custom; + return new DocTermOrds(reader, null, key.field, prefix); + } + } + + private volatile PrintStream infoStream; + + public void setInfoStream(PrintStream stream) { + infoStream = stream; + } + + public PrintStream getInfoStream() { + return infoStream; + } +} + http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/5525f429/solr/core/src/java/org/apache/solr/uninverting/FieldCacheSanityChecker.java ---------------------------------------------------------------------- diff --git a/solr/core/src/java/org/apache/solr/uninverting/FieldCacheSanityChecker.java b/solr/core/src/java/org/apache/solr/uninverting/FieldCacheSanityChecker.java new file mode 100644 index 0000000..ec398f2 --- /dev/null +++ b/solr/core/src/java/org/apache/solr/uninverting/FieldCacheSanityChecker.java @@ -0,0 +1,425 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.solr.uninverting; + +import java.util.ArrayList; +import java.util.Collection; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; + +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.IndexReaderContext; +import org.apache.lucene.store.AlreadyClosedException; +import org.apache.lucene.util.MapOfSets; +import org.apache.solr.uninverting.FieldCache.CacheEntry; + +/** + * Provides methods for sanity checking that entries in the FieldCache + * are not wasteful or inconsistent. + *
+ *
+ * Lucene 2.9 Introduced numerous enhancements into how the FieldCache + * is used by the low levels of Lucene searching (for Sorting and + * ValueSourceQueries) to improve both the speed for Sorting, as well + * as reopening of IndexReaders. But these changes have shifted the + * usage of FieldCache from "top level" IndexReaders (frequently a + * MultiReader or DirectoryReader) down to the leaf level SegmentReaders. + * As a result, existing applications that directly access the FieldCache + * may find RAM usage increase significantly when upgrading to 2.9 or + * Later. This class provides an API for these applications (or their + * Unit tests) to check at run time if the FieldCache contains "insane" + * usages of the FieldCache. + *
+ * @lucene.experimental + * @see FieldCache + * @see FieldCacheSanityChecker.Insanity + * @see FieldCacheSanityChecker.InsanityType + */ +final class FieldCacheSanityChecker { + + public FieldCacheSanityChecker() { + /* NOOP */ + } + + /** + * Quick and dirty convenience method + * @see #check + */ + public static Insanity[] checkSanity(FieldCache cache) { + return checkSanity(cache.getCacheEntries()); + } + + /** + * Quick and dirty convenience method that instantiates an instance with + * "good defaults" and uses it to test the CacheEntrys + * @see #check + */ + public static Insanity[] checkSanity(CacheEntry... cacheEntries) { + FieldCacheSanityChecker sanityChecker = new FieldCacheSanityChecker(); + return sanityChecker.check(cacheEntries); + } + + + /** + * Tests a CacheEntry[] for indication of "insane" cache usage. + *
+ * NOTE:FieldCache CreationPlaceholder objects are ignored. + * (:TODO: is this a bad idea? are we masking a real problem?) + *
+ */ + public Insanity[] check(CacheEntry... cacheEntries) { + if (null == cacheEntries || 0 == cacheEntries.length) + return new Insanity[0]; + + // the indirect mapping lets MapOfSet dedup identical valIds for us + // + // maps the (valId) identityhashCode of cache values to + // sets of CacheEntry instances + final MapOfSets valIdToItems = new MapOfSets<>(new HashMap>(17)); + // maps ReaderField keys to Sets of ValueIds + final MapOfSets readerFieldToValIds = new MapOfSets<>(new HashMap>(17)); + // + + // any keys that we know result in more then one valId + final Set valMismatchKeys = new HashSet<>(); + + // iterate over all the cacheEntries to get the mappings we'll need + for (int i = 0; i < cacheEntries.length; i++) { + final CacheEntry item = cacheEntries[i]; + final Object val = item.getValue(); + + // It's OK to have dup entries, where one is eg + // float[] and the other is the Bits (from + // getDocWithField()) + if (val instanceof FieldCacheImpl.BitsEntry) { + continue; + } + + if (val instanceof FieldCache.CreationPlaceholder) + continue; + + final ReaderField rf = new ReaderField(item.getReaderKey(), + item.getFieldName()); + + final Integer valId = Integer.valueOf(System.identityHashCode(val)); + + // indirect mapping, so the MapOfSet will dedup identical valIds for us + valIdToItems.put(valId, item); + if (1 < readerFieldToValIds.put(rf, valId)) { + valMismatchKeys.add(rf); + } + } + + final List insanity = new ArrayList<>(valMismatchKeys.size() * 3); + + insanity.addAll(checkValueMismatch(valIdToItems, + readerFieldToValIds, + valMismatchKeys)); + insanity.addAll(checkSubreaders(valIdToItems, + readerFieldToValIds)); + + return insanity.toArray(new Insanity[insanity.size()]); + } + + /** + * Internal helper method used by check that iterates over + * valMismatchKeys and generates a Collection of Insanity + * instances accordingly. The MapOfSets are used to populate + * the Insanity objects. + * @see InsanityType#VALUEMISMATCH + */ + private Collection checkValueMismatch(MapOfSets valIdToItems, + MapOfSets readerFieldToValIds, + Set valMismatchKeys) { + + final List insanity = new ArrayList<>(valMismatchKeys.size() * 3); + + if (! valMismatchKeys.isEmpty() ) { + // we have multiple values for some ReaderFields + + final Map> rfMap = readerFieldToValIds.getMap(); + final Map> valMap = valIdToItems.getMap(); + for (final ReaderField rf : valMismatchKeys) { + final List badEntries = new ArrayList<>(valMismatchKeys.size() * 2); + for(final Integer value: rfMap.get(rf)) { + for (final CacheEntry cacheEntry : valMap.get(value)) { + badEntries.add(cacheEntry); + } + } + + CacheEntry[] badness = new CacheEntry[badEntries.size()]; + badness = badEntries.toArray(badness); + + insanity.add(new Insanity(InsanityType.VALUEMISMATCH, + "Multiple distinct value objects for " + + rf.toString(), badness)); + } + } + return insanity; + } + + /** + * Internal helper method used by check that iterates over + * the keys of readerFieldToValIds and generates a Collection + * of Insanity instances whenever two (or more) ReaderField instances are + * found that have an ancestry relationships. 
+ * + * @see InsanityType#SUBREADER + */ + private Collection checkSubreaders( MapOfSets valIdToItems, + MapOfSets readerFieldToValIds) { + + final List insanity = new ArrayList<>(23); + + Map> badChildren = new HashMap<>(17); + MapOfSets badKids = new MapOfSets<>(badChildren); // wrapper + + Map> viToItemSets = valIdToItems.getMap(); + Map> rfToValIdSets = readerFieldToValIds.getMap(); + + Set seen = new HashSet<>(17); + + Set readerFields = rfToValIdSets.keySet(); + for (final ReaderField rf : readerFields) { + + if (seen.contains(rf)) continue; + + List kids = getAllDescendantReaderKeys(rf.readerKey); + for (Object kidKey : kids) { + ReaderField kid = new ReaderField(kidKey, rf.fieldName); + + if (badChildren.containsKey(kid)) { + // we've already process this kid as RF and found other problems + // track those problems as our own + badKids.put(rf, kid); + badKids.putAll(rf, badChildren.get(kid)); + badChildren.remove(kid); + + } else if (rfToValIdSets.containsKey(kid)) { + // we have cache entries for the kid + badKids.put(rf, kid); + } + seen.add(kid); + } + seen.add(rf); + } + + // every mapping in badKids represents an Insanity + for (final ReaderField parent : badChildren.keySet()) { + Set kids = badChildren.get(parent); + + List badEntries = new ArrayList<>(kids.size() * 2); + + // put parent entr(ies) in first + { + for (final Integer value : rfToValIdSets.get(parent)) { + badEntries.addAll(viToItemSets.get(value)); + } + } + + // now the entries for the descendants + for (final ReaderField kid : kids) { + for (final Integer value : rfToValIdSets.get(kid)) { + badEntries.addAll(viToItemSets.get(value)); + } + } + + CacheEntry[] badness = new CacheEntry[badEntries.size()]; + badness = badEntries.toArray(badness); + + insanity.add(new Insanity(InsanityType.SUBREADER, + "Found caches for descendants of " + + parent.toString(), + badness)); + } + + return insanity; + + } + + /** + * Checks if the seed is an IndexReader, and if so will walk + * the hierarchy of subReaders building up a list of the objects + * returned by {@code seed.getCoreCacheKey()} + */ + private List getAllDescendantReaderKeys(Object seed) { + List all = new ArrayList<>(17); // will grow as we iter + all.add(seed); + for (int i = 0; i < all.size(); i++) { + final Object obj = all.get(i); + // TODO: We don't check closed readers here (as getTopReaderContext + // throws AlreadyClosedException), what should we do? Reflection? + if (obj instanceof IndexReader) { + try { + final List childs = + ((IndexReader) obj).getContext().children(); + if (childs != null) { // it is composite reader + for (final IndexReaderContext ctx : childs) { + all.add(ctx.reader().getCoreCacheKey()); + } + } + } catch (AlreadyClosedException ace) { + // ignore this reader + } + } + } + // need to skip the first, because it was the seed + return all.subList(1, all.size()); + } + + /** + * Simple pair object for using "readerKey + fieldName" a Map key + */ + private final static class ReaderField { + public final Object readerKey; + public final String fieldName; + public ReaderField(Object readerKey, String fieldName) { + this.readerKey = readerKey; + this.fieldName = fieldName; + } + @Override + public int hashCode() { + return System.identityHashCode(readerKey) * fieldName.hashCode(); + } + @Override + public boolean equals(Object that) { + if (! 
(that instanceof ReaderField)) return false; + + ReaderField other = (ReaderField) that; + return (this.readerKey == other.readerKey && + this.fieldName.equals(other.fieldName)); + } + @Override + public String toString() { + return readerKey.toString() + "+" + fieldName; + } + } + + /** + * Simple container for a collection of related CacheEntry objects that + * in conjunction with each other represent some "insane" usage of the + * FieldCache. + */ + public final static class Insanity { + private final InsanityType type; + private final String msg; + private final CacheEntry[] entries; + public Insanity(InsanityType type, String msg, CacheEntry... entries) { + if (null == type) { + throw new IllegalArgumentException + ("Insanity requires non-null InsanityType"); + } + if (null == entries || 0 == entries.length) { + throw new IllegalArgumentException + ("Insanity requires non-null/non-empty CacheEntry[]"); + } + this.type = type; + this.msg = msg; + this.entries = entries; + + } + /** + * Type of insane behavior this object represents + */ + public InsanityType getType() { return type; } + /** + * Description of hte insane behavior + */ + public String getMsg() { return msg; } + /** + * CacheEntry objects which suggest a problem + */ + public CacheEntry[] getCacheEntries() { return entries; } + /** + * Multi-Line representation of this Insanity object, starting with + * the Type and Msg, followed by each CacheEntry.toString() on its + * own line prefaced by a tab character + */ + @Override + public String toString() { + StringBuilder buf = new StringBuilder(); + buf.append(getType()).append(": "); + + String m = getMsg(); + if (null != m) buf.append(m); + + buf.append('\n'); + + CacheEntry[] ce = getCacheEntries(); + for (int i = 0; i < ce.length; i++) { + buf.append('\t').append(ce[i].toString()).append('\n'); + } + + return buf.toString(); + } + } + + /** + * An Enumeration of the different types of "insane" behavior that + * may be detected in a FieldCache. + * + * @see InsanityType#SUBREADER + * @see InsanityType#VALUEMISMATCH + * @see InsanityType#EXPECTED + */ + public final static class InsanityType { + private final String label; + private InsanityType(final String label) { + this.label = label; + } + @Override + public String toString() { return label; } + + /** + * Indicates an overlap in cache usage on a given field + * in sub/super readers. + */ + public final static InsanityType SUBREADER + = new InsanityType("SUBREADER"); + + /** + *
+ * Indicates entries have the same reader+fieldname but + * different cached values. This can happen if different datatypes, + * or parsers are used -- and while it's not necessarily a bug + * it's typically an indication of a possible problem. + *
+ *
+ * NOTE: Only the reader, fieldname, and cached value are actually + * tested -- if two cache entries have different parsers or datatypes but + * the cached values are the same Object (== not just equal()) this method + * does not consider that a red flag. This allows for subtle variations + * in the way a Parser is specified (null vs DEFAULT_LONG_PARSER, etc...) + *
+ */ + public final static InsanityType VALUEMISMATCH + = new InsanityType("VALUEMISMATCH"); + + /** + * Indicates an expected bit of "insanity". This may be useful for + * clients that wish to preserve/log information about insane usage + * but indicate that it was expected. + */ + public final static InsanityType EXPECTED + = new InsanityType("EXPECTED"); + } + + +} http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/5525f429/solr/core/src/java/org/apache/solr/uninverting/UninvertingReader.java ---------------------------------------------------------------------- diff --git a/solr/core/src/java/org/apache/solr/uninverting/UninvertingReader.java b/solr/core/src/java/org/apache/solr/uninverting/UninvertingReader.java new file mode 100644 index 0000000..4450cbb --- /dev/null +++ b/solr/core/src/java/org/apache/solr/uninverting/UninvertingReader.java @@ -0,0 +1,391 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.solr.uninverting; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Map; + +import org.apache.lucene.document.BinaryDocValuesField; // javadocs +import org.apache.lucene.document.NumericDocValuesField; // javadocs +import org.apache.lucene.document.SortedDocValuesField; // javadocs +import org.apache.lucene.document.SortedSetDocValuesField; // javadocs +import org.apache.lucene.document.StringField; // javadocs +import org.apache.lucene.index.BinaryDocValues; +import org.apache.lucene.index.DirectoryReader; +import org.apache.lucene.index.DocValuesType; +import org.apache.lucene.index.FieldInfo; +import org.apache.lucene.index.FieldInfos; +import org.apache.lucene.index.FilterDirectoryReader; +import org.apache.lucene.index.FilterLeafReader; +import org.apache.lucene.index.IndexOptions; +import org.apache.lucene.index.LeafReader; +import org.apache.lucene.index.NumericDocValues; +import org.apache.lucene.index.SortedDocValues; +import org.apache.lucene.index.SortedSetDocValues; +import org.apache.lucene.util.Bits; +import org.apache.solr.uninverting.FieldCache.CacheEntry; + +/** + * A FilterReader that exposes indexed values as if they also had + * docvalues. + *
+ * This is accomplished by "inverting the inverted index" or "uninversion". + *
+ * The uninversion process happens lazily: upon the first request for the + * field's docvalues (e.g. via {@link org.apache.lucene.index.LeafReader#getNumericDocValues(String)} + * or similar), it will create the docvalues on-the-fly if needed and cache it, + * based on the core cache key of the wrapped LeafReader. + */ +public class UninvertingReader extends FilterLeafReader { + + /** + * Specifies the type of uninversion to apply for the field. + */ + public static enum Type { + /** + * Single-valued Integer, (e.g. indexed with {@link org.apache.lucene.document.IntPoint}) + *
+ * Fields with this type act as if they were indexed with + * {@link NumericDocValuesField}. + */ + INTEGER_POINT, + /** + * Single-valued Integer, (e.g. indexed with {@link org.apache.lucene.document.LongPoint}) + *
+ * Fields with this type act as if they were indexed with + * {@link NumericDocValuesField}. + */ + LONG_POINT, + /** + * Single-valued Integer, (e.g. indexed with {@link org.apache.lucene.document.FloatPoint}) + *
+ * Fields with this type act as if they were indexed with + * {@link NumericDocValuesField}. + */ + FLOAT_POINT, + /** + * Single-valued Integer, (e.g. indexed with {@link org.apache.lucene.document.DoublePoint}) + *
+ * Fields with this type act as if they were indexed with + * {@link NumericDocValuesField}. + */ + DOUBLE_POINT, + /** + * Single-valued Integer, (e.g. indexed with {@link org.apache.lucene.document.LegacyIntField}) + *
+ * Fields with this type act as if they were indexed with + * {@link NumericDocValuesField}. + * @deprecated Index with points and use {@link #INTEGER_POINT} instead. + */ + @Deprecated + LEGACY_INTEGER, + /** + * Single-valued Long, (e.g. indexed with {@link org.apache.lucene.document.LegacyLongField}) + *
+ * Fields with this type act as if they were indexed with + * {@link NumericDocValuesField}. + * @deprecated Index with points and use {@link #LONG_POINT} instead. + */ + @Deprecated + LEGACY_LONG, + /** + * Single-valued Float, (e.g. indexed with {@link org.apache.lucene.document.LegacyFloatField}) + *
+ * Fields with this type act as if they were indexed with + * {@link NumericDocValuesField}. + * @deprecated Index with points and use {@link #FLOAT_POINT} instead. + */ + @Deprecated + LEGACY_FLOAT, + /** + * Single-valued Double, (e.g. indexed with {@link org.apache.lucene.document.LegacyDoubleField}) + *
+ * Fields with this type act as if they were indexed with + * {@link NumericDocValuesField}. + * @deprecated Index with points and use {@link #DOUBLE_POINT} instead. + */ + @Deprecated + LEGACY_DOUBLE, + /** + * Single-valued Binary, (e.g. indexed with {@link StringField}) + *
+ * Fields with this type act as if they were indexed with + * {@link BinaryDocValuesField}. + */ + BINARY, + /** + * Single-valued Binary, (e.g. indexed with {@link StringField}) + *
+ * Fields with this type act as if they were indexed with + * {@link SortedDocValuesField}. + */ + SORTED, + /** + * Multi-valued Binary, (e.g. indexed with {@link StringField}) + *
+ * Fields with this type act as if they were indexed with + * {@link SortedSetDocValuesField}. + */ + SORTED_SET_BINARY, + /** + * Multi-valued Integer, (e.g. indexed with {@link org.apache.lucene.document.LegacyIntField}) + *
+ * Fields with this type act as if they were indexed with + * {@link SortedSetDocValuesField}. + */ + SORTED_SET_INTEGER, + /** + * Multi-valued Float, (e.g. indexed with {@link org.apache.lucene.document.LegacyFloatField}) + *
+ * Fields with this type act as if they were indexed with + * {@link SortedSetDocValuesField}. + */ + SORTED_SET_FLOAT, + /** + * Multi-valued Long, (e.g. indexed with {@link org.apache.lucene.document.LegacyLongField}) + *
+ * Fields with this type act as if they were indexed with + * {@link SortedSetDocValuesField}. + */ + SORTED_SET_LONG, + /** + * Multi-valued Double, (e.g. indexed with {@link org.apache.lucene.document.LegacyDoubleField}) + *
+ * Fields with this type act as if they were indexed with + * {@link SortedSetDocValuesField}. + */ + SORTED_SET_DOUBLE + } + + /** + * Wraps a provided DirectoryReader. Note that for convenience, the returned reader + * can be used normally (e.g. passed to {@link DirectoryReader#openIfChanged(DirectoryReader)}) + * and so on. + */ + public static DirectoryReader wrap(DirectoryReader in, final Map mapping) throws IOException { + return new UninvertingDirectoryReader(in, mapping); + } + + static class UninvertingDirectoryReader extends FilterDirectoryReader { + final Map mapping; + + public UninvertingDirectoryReader(DirectoryReader in, final Map mapping) throws IOException { + super(in, new FilterDirectoryReader.SubReaderWrapper() { + @Override + public LeafReader wrap(LeafReader reader) { + return new UninvertingReader(reader, mapping); + } + }); + this.mapping = mapping; + } + + @Override + protected DirectoryReader doWrapDirectoryReader(DirectoryReader in) throws IOException { + return new UninvertingDirectoryReader(in, mapping); + } + } + + final Map mapping; + final FieldInfos fieldInfos; + + /** + * Create a new UninvertingReader with the specified mapping + *
+ * Expert: This should almost never be used. Use {@link #wrap(DirectoryReader, Map)} + * instead. + * + * @lucene.internal + */ + public UninvertingReader(LeafReader in, Map mapping) { + super(in); + this.mapping = mapping; + ArrayList filteredInfos = new ArrayList<>(); + for (FieldInfo fi : in.getFieldInfos()) { + DocValuesType type = fi.getDocValuesType(); + if (type == DocValuesType.NONE) { + Type t = mapping.get(fi.name); + if (t != null) { + if (t == Type.INTEGER_POINT || t == Type.LONG_POINT || t == Type.FLOAT_POINT || t == Type.DOUBLE_POINT) { + // type uses points + if (fi.getPointDimensionCount() == 0) { + continue; + } + } else { + // type uses inverted index + if (fi.getIndexOptions() == IndexOptions.NONE) { + continue; + } + } + switch(t) { + case INTEGER_POINT: + case LONG_POINT: + case FLOAT_POINT: + case DOUBLE_POINT: + case LEGACY_INTEGER: + case LEGACY_LONG: + case LEGACY_FLOAT: + case LEGACY_DOUBLE: + type = DocValuesType.NUMERIC; + break; + case BINARY: + type = DocValuesType.BINARY; + break; + case SORTED: + type = DocValuesType.SORTED; + break; + case SORTED_SET_BINARY: + case SORTED_SET_INTEGER: + case SORTED_SET_FLOAT: + case SORTED_SET_LONG: + case SORTED_SET_DOUBLE: + type = DocValuesType.SORTED_SET; + break; + default: + throw new AssertionError(); + } + } + } + filteredInfos.add(new FieldInfo(fi.name, fi.number, fi.hasVectors(), fi.omitsNorms(), + fi.hasPayloads(), fi.getIndexOptions(), type, fi.getDocValuesGen(), fi.attributes(), + fi.getPointDimensionCount(), fi.getPointNumBytes())); + } + fieldInfos = new FieldInfos(filteredInfos.toArray(new FieldInfo[filteredInfos.size()])); + } + + @Override + public FieldInfos getFieldInfos() { + return fieldInfos; + } + + @Override + public NumericDocValues getNumericDocValues(String field) throws IOException { + Type v = getType(field); + if (v != null) { + switch (v) { + case INTEGER_POINT: return FieldCache.DEFAULT.getNumerics(in, field, FieldCache.INT_POINT_PARSER, true); + case FLOAT_POINT: return FieldCache.DEFAULT.getNumerics(in, field, FieldCache.FLOAT_POINT_PARSER, true); + case LONG_POINT: return FieldCache.DEFAULT.getNumerics(in, field, FieldCache.LONG_POINT_PARSER, true); + case DOUBLE_POINT: return FieldCache.DEFAULT.getNumerics(in, field, FieldCache.DOUBLE_POINT_PARSER, true); + case LEGACY_INTEGER: return FieldCache.DEFAULT.getNumerics(in, field, FieldCache.LEGACY_INT_PARSER, true); + case LEGACY_FLOAT: return FieldCache.DEFAULT.getNumerics(in, field, FieldCache.LEGACY_FLOAT_PARSER, true); + case LEGACY_LONG: return FieldCache.DEFAULT.getNumerics(in, field, FieldCache.LEGACY_LONG_PARSER, true); + case LEGACY_DOUBLE: return FieldCache.DEFAULT.getNumerics(in, field, FieldCache.LEGACY_DOUBLE_PARSER, true); + } + } + return super.getNumericDocValues(field); + } + + @Override + public BinaryDocValues getBinaryDocValues(String field) throws IOException { + Type v = getType(field); + if (v == Type.BINARY) { + return FieldCache.DEFAULT.getTerms(in, field, true); + } else { + return in.getBinaryDocValues(field); + } + } + + @Override + public SortedDocValues getSortedDocValues(String field) throws IOException { + Type v = getType(field); + if (v == Type.SORTED) { + return FieldCache.DEFAULT.getTermsIndex(in, field); + } else { + return in.getSortedDocValues(field); + } + } + + @Override + public SortedSetDocValues getSortedSetDocValues(String field) throws IOException { + Type v = getType(field); + if (v != null) { + switch (v) { + case SORTED_SET_INTEGER: + case SORTED_SET_FLOAT: + return 
FieldCache.DEFAULT.getDocTermOrds(in, field, FieldCache.INT32_TERM_PREFIX); + case SORTED_SET_LONG: + case SORTED_SET_DOUBLE: + return FieldCache.DEFAULT.getDocTermOrds(in, field, FieldCache.INT64_TERM_PREFIX); + case SORTED_SET_BINARY: + return FieldCache.DEFAULT.getDocTermOrds(in, field, null); + } + } + return in.getSortedSetDocValues(field); + } + + @Override + public Bits getDocsWithField(String field) throws IOException { + Type v = getType(field); + if (v != null) { + switch (v) { + case INTEGER_POINT: return FieldCache.DEFAULT.getDocsWithField(in, field, FieldCache.INT_POINT_PARSER); + case FLOAT_POINT: return FieldCache.DEFAULT.getDocsWithField(in, field, FieldCache.FLOAT_POINT_PARSER); + case LONG_POINT: return FieldCache.DEFAULT.getDocsWithField(in, field, FieldCache.LONG_POINT_PARSER); + case DOUBLE_POINT: return FieldCache.DEFAULT.getDocsWithField(in, field, FieldCache.DOUBLE_POINT_PARSER); + case LEGACY_INTEGER: return FieldCache.DEFAULT.getDocsWithField(in, field, FieldCache.LEGACY_INT_PARSER); + case LEGACY_FLOAT: return FieldCache.DEFAULT.getDocsWithField(in, field, FieldCache.LEGACY_FLOAT_PARSER); + case LEGACY_LONG: return FieldCache.DEFAULT.getDocsWithField(in, field, FieldCache.LEGACY_LONG_PARSER); + case LEGACY_DOUBLE: return FieldCache.DEFAULT.getDocsWithField(in, field, FieldCache.LEGACY_DOUBLE_PARSER); + default: + return FieldCache.DEFAULT.getDocsWithField(in, field, null); + } + } else { + return in.getDocsWithField(field); + } + } + + /** + * Returns the field's uninversion type, or null + * if the field doesn't exist or doesn't have a mapping. + */ + private Type getType(String field) { + FieldInfo info = fieldInfos.fieldInfo(field); + if (info == null || info.getDocValuesType() == DocValuesType.NONE) { + return null; + } + return mapping.get(field); + } + + @Override + public Object getCoreCacheKey() { + return in.getCoreCacheKey(); + } + + @Override + public Object getCombinedCoreAndDeletesKey() { + return in.getCombinedCoreAndDeletesKey(); + } + + @Override + public String toString() { + return "Uninverting(" + in.toString() + ")"; + } + + /** + * Return information about the backing cache + * @lucene.internal + */ + public static String[] getUninvertedStats() { + CacheEntry[] entries = FieldCache.DEFAULT.getCacheEntries(); + String[] info = new String[entries.length]; + for (int i = 0; i < entries.length; i++) { + info[i] = entries[i].toString(); + } + return info; + } +} http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/5525f429/solr/core/src/java/org/apache/solr/uninverting/package-info.java ---------------------------------------------------------------------- diff --git a/solr/core/src/java/org/apache/solr/uninverting/package-info.java b/solr/core/src/java/org/apache/solr/uninverting/package-info.java new file mode 100644 index 0000000..d95e08f --- /dev/null +++ b/solr/core/src/java/org/apache/solr/uninverting/package-info.java @@ -0,0 +1,21 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Support for creating docvalues on-the-fly from the inverted index at runtime. + */ +package org.apache.solr.uninverting; http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/5525f429/solr/core/src/java/org/apache/solr/update/DeleteByQueryWrapper.java ---------------------------------------------------------------------- diff --git a/solr/core/src/java/org/apache/solr/update/DeleteByQueryWrapper.java b/solr/core/src/java/org/apache/solr/update/DeleteByQueryWrapper.java index 3d87161..778e4c6 100644 --- a/solr/core/src/java/org/apache/solr/update/DeleteByQueryWrapper.java +++ b/solr/core/src/java/org/apache/solr/update/DeleteByQueryWrapper.java @@ -29,8 +29,8 @@ import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Query; import org.apache.lucene.search.Scorer; import org.apache.lucene.search.Weight; -import org.apache.lucene.uninverting.UninvertingReader; import org.apache.solr.schema.IndexSchema; +import org.apache.solr.uninverting.UninvertingReader; /** * Allows access to uninverted docvalues by delete-by-queries. http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/5525f429/solr/core/src/java/org/apache/solr/update/VersionInfo.java ---------------------------------------------------------------------- diff --git a/solr/core/src/java/org/apache/solr/update/VersionInfo.java b/solr/core/src/java/org/apache/solr/update/VersionInfo.java index 5fe415c..bee30f5 100644 --- a/solr/core/src/java/org/apache/solr/update/VersionInfo.java +++ b/solr/core/src/java/org/apache/solr/update/VersionInfo.java @@ -24,7 +24,6 @@ import java.util.concurrent.locks.ReentrantReadWriteLock; import org.apache.lucene.index.LeafReader; import org.apache.lucene.index.LeafReaderContext; -import org.apache.lucene.index.SlowCompositeReaderWrapper; import org.apache.lucene.index.Terms; import org.apache.lucene.queries.function.FunctionValues; import org.apache.lucene.queries.function.ValueSource; @@ -34,6 +33,7 @@ import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.LegacyNumericUtils; import org.apache.solr.common.SolrException; import org.apache.solr.common.util.SuppressForbidden; +import org.apache.solr.index.SlowCompositeReaderWrapper; import org.apache.solr.schema.IndexSchema; import org.apache.solr.schema.SchemaField; import org.apache.solr.search.SolrIndexSearcher; http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/5525f429/solr/core/src/test/org/apache/solr/index/TestSlowCompositeReaderWrapper.java ---------------------------------------------------------------------- diff --git a/solr/core/src/test/org/apache/solr/index/TestSlowCompositeReaderWrapper.java b/solr/core/src/test/org/apache/solr/index/TestSlowCompositeReaderWrapper.java new file mode 100644 index 0000000..0685e55 --- /dev/null +++ b/solr/core/src/test/org/apache/solr/index/TestSlowCompositeReaderWrapper.java @@ -0,0 +1,95 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.solr.index; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; +import java.util.concurrent.atomic.AtomicInteger; + +import org.apache.lucene.document.Document; +import org.apache.lucene.index.DirectoryReader; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.LeafReader; +import org.apache.lucene.index.RandomIndexWriter; +import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.util.TestUtil; + +public class TestSlowCompositeReaderWrapper extends LuceneTestCase { + + public void testCoreListenerOnSlowCompositeReaderWrapper() throws IOException { + RandomIndexWriter w = new RandomIndexWriter(random(), newDirectory()); + final int numDocs = TestUtil.nextInt(random(), 1, 5); + for (int i = 0; i < numDocs; ++i) { + w.addDocument(new Document()); + if (random().nextBoolean()) { + w.commit(); + } + } + w.commit(); + w.close(); + + final IndexReader reader = DirectoryReader.open(w.w.getDirectory()); + final LeafReader leafReader = SlowCompositeReaderWrapper.wrap(reader); + + final int numListeners = TestUtil.nextInt(random(), 1, 10); + final List<LeafReader.CoreClosedListener> listeners = new ArrayList<>(); + AtomicInteger counter = new AtomicInteger(numListeners); + + for (int i = 0; i < numListeners; ++i) { + CountCoreListener listener = new CountCoreListener(counter, leafReader.getCoreCacheKey()); + listeners.add(listener); + leafReader.addCoreClosedListener(listener); + } + for (int i = 0; i < 100; ++i) { + leafReader.addCoreClosedListener(listeners.get(random().nextInt(listeners.size()))); + } + final int removed = random().nextInt(numListeners); + Collections.shuffle(listeners, random()); + for (int i = 0; i < removed; ++i) { + leafReader.removeCoreClosedListener(listeners.get(i)); + } + assertEquals(numListeners, counter.get()); + // make sure listeners are registered on the wrapped reader and that closing any of them has the same effect + if (random().nextBoolean()) { + reader.close(); + } else { + leafReader.close(); + } + assertEquals(removed, counter.get()); + w.w.getDirectory().close(); + } + + private static final class CountCoreListener implements LeafReader.CoreClosedListener { + + private final AtomicInteger count; + private final Object coreCacheKey; + + public CountCoreListener(AtomicInteger count, Object coreCacheKey) { + this.count = count; + this.coreCacheKey = coreCacheKey; + } + + @Override + public void onClose(Object coreCacheKey) { + assertSame(this.coreCacheKey, coreCacheKey); + count.decrementAndGet(); + } + + } +} http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/5525f429/solr/core/src/test/org/apache/solr/request/TestFaceting.java ---------------------------------------------------------------------- diff --git a/solr/core/src/test/org/apache/solr/request/TestFaceting.java 
b/solr/core/src/test/org/apache/solr/request/TestFaceting.java index 97dcedf..4dd49e1 100644 --- a/solr/core/src/test/org/apache/solr/request/TestFaceting.java +++ b/solr/core/src/test/org/apache/solr/request/TestFaceting.java @@ -25,12 +25,12 @@ import org.apache.lucene.index.DocValues; import org.apache.lucene.index.SortedSetDocValues; import org.apache.lucene.index.Term; import org.apache.lucene.index.TermsEnum; -import org.apache.lucene.uninverting.DocTermOrds; import org.apache.lucene.util.BytesRef; import org.apache.solr.SolrTestCaseJ4; import org.apache.solr.common.SolrException.ErrorCode; import org.apache.solr.common.params.FacetParams; import org.apache.solr.search.SolrIndexSearcher; +import org.apache.solr.uninverting.DocTermOrds; import org.apache.solr.util.RefCounted; import org.junit.After; import org.junit.BeforeClass; http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/5525f429/solr/core/src/test/org/apache/solr/search/TestSort.java ---------------------------------------------------------------------- diff --git a/solr/core/src/test/org/apache/solr/search/TestSort.java b/solr/core/src/test/org/apache/solr/search/TestSort.java index e874c37..8590b18 100644 --- a/solr/core/src/test/org/apache/solr/search/TestSort.java +++ b/solr/core/src/test/org/apache/solr/search/TestSort.java @@ -42,13 +42,12 @@ import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.LeafCollector; import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.search.Sort; -import org.apache.lucene.search.SortField; import org.apache.lucene.search.SortField.Type; +import org.apache.lucene.search.SortField; import org.apache.lucene.search.TopDocs; import org.apache.lucene.search.TopFieldCollector; import org.apache.lucene.store.Directory; import org.apache.lucene.store.RAMDirectory; -import org.apache.lucene.uninverting.UninvertingReader; import org.apache.lucene.util.BitDocIdSet; import org.apache.lucene.util.Bits; import org.apache.lucene.util.FixedBitSet; @@ -56,6 +55,7 @@ import org.apache.lucene.util.TestUtil; import org.apache.solr.SolrTestCaseJ4; import org.apache.solr.request.SolrQueryRequest; import org.apache.solr.schema.SchemaField; +import org.apache.solr.uninverting.UninvertingReader; import org.junit.BeforeClass; import org.slf4j.Logger; import org.slf4j.LoggerFactory;
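
For reference, a minimal usage sketch of the relocated org.apache.solr.uninverting.UninvertingReader (illustrative only, not part of the patch above): a mapping from field name to UninvertingReader.Type is built and the DirectoryReader is wrapped so that indexed-only fields expose synthetic docvalues at read time. The field names "popularity", "category" and "tags" are hypothetical; the Type chosen for each field must match how that field was indexed (points vs. inverted terms), as checked in the UninvertingReader constructor.

import java.io.IOException;
import java.util.HashMap;
import java.util.Map;

import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.store.Directory;
import org.apache.solr.uninverting.UninvertingReader;
import org.apache.solr.uninverting.UninvertingReader.Type;

public class UninvertingUsageSketch {

  /** Opens a reader over dir that exposes docvalues for the mapped, indexed-only fields. */
  public static DirectoryReader openUninverting(Directory dir) throws IOException {
    Map<String,Type> mapping = new HashMap<>();
    // Hypothetical field names, used only for illustration.
    mapping.put("popularity", Type.INTEGER_POINT);  // numeric field indexed as a point
    mapping.put("category", Type.SORTED);           // single-valued indexed string field
    mapping.put("tags", Type.SORTED_SET_BINARY);    // multi-valued indexed string field
    return UninvertingReader.wrap(DirectoryReader.open(dir), mapping);
  }
}

The returned DirectoryReader can then be used anywhere a reader with docvalues is expected (sorting, faceting, field collapsing); fields that already carry real docvalues, or that lack a mapping entry, pass through unchanged.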