From commits-return-102482-archive-asf-public=cust-asf.ponee.io@lucene.apache.org Sat Jul 28 06:49:36 2018 Return-Path: X-Original-To: archive-asf-public@cust-asf.ponee.io Delivered-To: archive-asf-public@cust-asf.ponee.io Received: from mail.apache.org (hermes.apache.org [140.211.11.3]) by mx-eu-01.ponee.io (Postfix) with SMTP id 6085D180679 for ; Sat, 28 Jul 2018 06:49:35 +0200 (CEST) Received: (qmail 71050 invoked by uid 500); 28 Jul 2018 04:49:32 -0000 Mailing-List: contact commits-help@lucene.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: dev@lucene.apache.org Delivered-To: mailing list commits@lucene.apache.org Received: (qmail 64003 invoked by uid 99); 28 Jul 2018 04:49:26 -0000 Received: from git1-us-west.apache.org (HELO git1-us-west.apache.org) (140.211.11.23) by apache.org (qpsmtpd/0.29) with ESMTP; Sat, 28 Jul 2018 04:49:26 +0000 Received: by git1-us-west.apache.org (ASF Mail Server at git1-us-west.apache.org, from userid 33) id 87FCAE1188; Sat, 28 Jul 2018 04:49:25 +0000 (UTC) Content-Type: text/plain; charset="us-ascii" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit From: shalin@apache.org To: commits@lucene.apache.org Date: Sat, 28 Jul 2018 04:49:53 -0000 Message-Id: In-Reply-To: <4cf611223e9848ea9aed676541fc733c@git.apache.org> References: <4cf611223e9848ea9aed676541fc733c@git.apache.org> X-Mailer: ASF-Git Admin Mailer Subject: [30/50] [abbrv] lucene-solr:jira/solr-11990: LUCENE-8425: Expose hard live docs on SegmentReader level LUCENE-8425: Expose hard live docs on SegmentReader level Today if soft deletes are used we expose a union of hard and soft deletes via LeafReader#getLiveDocs. Yet, if a user wants to take advantage of searching also soft-deleted documents the only option today is to search all documents even though some of them are hard deleted. The recommendation is to not mix those but in exceptional cases ie. 
when a document hits a non-aborting exception during indexing the document is marked as hard deleted which is the correct action. In order to filter those out having access to the hard live docs on the segment reader level allows to filter out these documents. Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/35fa0b4f Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/35fa0b4f Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/35fa0b4f Branch: refs/heads/jira/solr-11990 Commit: 35fa0b4f55f95ca0c8d8b21c77e78e478fba8e74 Parents: a254e7d Author: Simon Willnauer Authored: Tue Jul 24 13:41:11 2018 +0200 Committer: Simon Willnauer Committed: Wed Jul 25 09:34:36 2018 +0200 ---------------------------------------------------------------------- .../org/apache/lucene/index/ReaderPool.java | 2 +- .../apache/lucene/index/ReadersAndUpdates.java | 5 +- .../org/apache/lucene/index/SegmentReader.java | 49 +++++++------ .../lucene/index/StandardDirectoryReader.java | 16 +++-- .../apache/lucene/index/TestIndexWriter.java | 75 ++++++++++++++++++++ .../nrt/SegmentInfosSearcherManager.java | 2 +- 6 files changed, 118 insertions(+), 31 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/35fa0b4f/lucene/core/src/java/org/apache/lucene/index/ReaderPool.java ---------------------------------------------------------------------- diff --git a/lucene/core/src/java/org/apache/lucene/index/ReaderPool.java b/lucene/core/src/java/org/apache/lucene/index/ReaderPool.java index 980f4a1..b792be2 100644 --- a/lucene/core/src/java/org/apache/lucene/index/ReaderPool.java +++ b/lucene/core/src/java/org/apache/lucene/index/ReaderPool.java @@ -89,7 +89,7 @@ final class ReaderPool implements Closeable { LeafReaderContext leaf = leaves.get(i); SegmentReader segReader = (SegmentReader) leaf.reader(); SegmentReader 
newReader = new SegmentReader(segmentInfos.info(i), segReader, segReader.getLiveDocs(), - segReader.numDocs()); + segReader.getHardLiveDocs(), segReader.numDocs(), true); readerMap.put(newReader.getOriginalSegmentInfo(), new ReadersAndUpdates(segmentInfos.getIndexCreatedVersionMajor(), newReader, newPendingDeletes(newReader, newReader.getOriginalSegmentInfo()))); } http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/35fa0b4f/lucene/core/src/java/org/apache/lucene/index/ReadersAndUpdates.java ---------------------------------------------------------------------- diff --git a/lucene/core/src/java/org/apache/lucene/index/ReadersAndUpdates.java b/lucene/core/src/java/org/apache/lucene/index/ReadersAndUpdates.java index 3453447..b09338f 100644 --- a/lucene/core/src/java/org/apache/lucene/index/ReadersAndUpdates.java +++ b/lucene/core/src/java/org/apache/lucene/index/ReadersAndUpdates.java @@ -214,7 +214,7 @@ final class ReadersAndUpdates { // force new liveDocs Bits liveDocs = pendingDeletes.getLiveDocs(); if (liveDocs != null) { - return new SegmentReader(info, reader, liveDocs, pendingDeletes.numDocs()); + return new SegmentReader(info, reader, liveDocs, pendingDeletes.getHardLiveDocs(), pendingDeletes.numDocs(), true); } else { // liveDocs == null and reader != null. 
That can only be if there are no deletes assert reader.getLiveDocs() == null; @@ -645,7 +645,8 @@ final class ReadersAndUpdates { private SegmentReader createNewReaderWithLatestLiveDocs(SegmentReader reader) throws IOException { assert reader != null; assert Thread.holdsLock(this) : Thread.currentThread().getName(); - SegmentReader newReader = new SegmentReader(info, reader, pendingDeletes.getLiveDocs(), pendingDeletes.numDocs()); + SegmentReader newReader = new SegmentReader(info, reader, pendingDeletes.getLiveDocs(), + pendingDeletes.getHardLiveDocs(), pendingDeletes.numDocs(), true); boolean success2 = false; try { pendingDeletes.onNewReader(newReader, info); http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/35fa0b4f/lucene/core/src/java/org/apache/lucene/index/SegmentReader.java ---------------------------------------------------------------------- diff --git a/lucene/core/src/java/org/apache/lucene/index/SegmentReader.java b/lucene/core/src/java/org/apache/lucene/index/SegmentReader.java index 9373718..b368b96 100644 --- a/lucene/core/src/java/org/apache/lucene/index/SegmentReader.java +++ b/lucene/core/src/java/org/apache/lucene/index/SegmentReader.java @@ -51,6 +51,7 @@ public final class SegmentReader extends CodecReader { private final SegmentCommitInfo originalSi; private final LeafMetaData metaData; private final Bits liveDocs; + private final Bits hardLiveDocs; // Normally set to si.maxDoc - si.delDocCount, unless we // were created as an NRT reader from IW, in which case IW @@ -65,7 +66,7 @@ public final class SegmentReader extends CodecReader { final DocValuesProducer docValuesProducer; final FieldInfos fieldInfos; - + /** * Constructs a new SegmentReader with a new core. 
* @throws CorruptIndexException if the index is corrupt @@ -87,16 +88,16 @@ public final class SegmentReader extends CodecReader { try { if (si.hasDeletions()) { // NOTE: the bitvector is stored using the regular directory, not cfs - liveDocs = codec.liveDocsFormat().readLiveDocs(directory(), si, IOContext.READONCE); + hardLiveDocs = liveDocs = codec.liveDocsFormat().readLiveDocs(directory(), si, IOContext.READONCE); } else { assert si.getDelCount() == 0; - liveDocs = null; + hardLiveDocs = liveDocs = null; } numDocs = si.info.maxDoc() - si.getDelCount(); fieldInfos = initFieldInfos(); docValuesProducer = initDocValuesProducer(); - + assert assertLiveDocs(isNRT, hardLiveDocs, liveDocs); success = true; } finally { // With lock-less commits, it's entirely possible (and @@ -111,26 +112,9 @@ public final class SegmentReader extends CodecReader { } /** Create new SegmentReader sharing core from a previous - * SegmentReader and loading new live docs from a new - * deletes file. Used by openIfChanged. */ - SegmentReader(SegmentCommitInfo si, SegmentReader sr) throws IOException { - this(si, sr, - si.hasDeletions() ? si.info.getCodec().liveDocsFormat().readLiveDocs(si.info.dir, si, IOContext.READONCE) : null, - si.info.maxDoc() - si.getDelCount(), false); - } - - /** Create new SegmentReader sharing core from a previous - * SegmentReader and using the provided in-memory - * liveDocs. Used by IndexWriter to provide a new NRT - * reader */ - SegmentReader(SegmentCommitInfo si, SegmentReader sr, Bits liveDocs, int numDocs) throws IOException { - this(si, sr, liveDocs, numDocs, true); - } - - /** Create new SegmentReader sharing core from a previous * SegmentReader and using the provided liveDocs, and recording * whether those liveDocs were carried in ram (isNRT=true). 
*/ - SegmentReader(SegmentCommitInfo si, SegmentReader sr, Bits liveDocs, int numDocs, boolean isNRT) throws IOException { + SegmentReader(SegmentCommitInfo si, SegmentReader sr, Bits liveDocs, Bits hardLiveDocs, int numDocs, boolean isNRT) throws IOException { if (numDocs > si.info.maxDoc()) { throw new IllegalArgumentException("numDocs=" + numDocs + " but maxDoc=" + si.info.maxDoc()); } @@ -141,6 +125,8 @@ public final class SegmentReader extends CodecReader { this.originalSi = si; this.metaData = sr.getMetaData(); this.liveDocs = liveDocs; + this.hardLiveDocs = hardLiveDocs; + assert assertLiveDocs(isNRT, hardLiveDocs, liveDocs); this.isNRT = isNRT; this.numDocs = numDocs; this.core = sr.core; @@ -159,6 +145,15 @@ public final class SegmentReader extends CodecReader { } } + private static boolean assertLiveDocs(boolean isNRT, Bits hardLiveDocs, Bits liveDocs) { + if (isNRT) { + assert hardLiveDocs == null || liveDocs != null : " liveDocs must be non null if hardLiveDocs are non null"; + } else { + assert hardLiveDocs == liveDocs : "non-nrt case must have identical liveDocs"; + } + return true; + } + /** * init most recent DocValues for the current commit */ @@ -361,4 +356,14 @@ public final class SegmentReader extends CodecReader { SegmentCommitInfo getOriginalSegmentInfo() { return originalSi; } + + /** + * Returns the live docs that are not hard-deleted. This is an expert API to be used with + * soft-deletes to filter out document that hard deleted for instance due to aborted documents or to distinguish + * soft and hard deleted documents ie. a rolled back tombstone. 
+ * @lucene.experimental + */ + public Bits getHardLiveDocs() { + return hardLiveDocs; + } } http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/35fa0b4f/lucene/core/src/java/org/apache/lucene/index/StandardDirectoryReader.java ---------------------------------------------------------------------- diff --git a/lucene/core/src/java/org/apache/lucene/index/StandardDirectoryReader.java b/lucene/core/src/java/org/apache/lucene/index/StandardDirectoryReader.java index 3b1b72f..5b2b049 100644 --- a/lucene/core/src/java/org/apache/lucene/index/StandardDirectoryReader.java +++ b/lucene/core/src/java/org/apache/lucene/index/StandardDirectoryReader.java @@ -32,6 +32,7 @@ import java.util.concurrent.CopyOnWriteArraySet; import org.apache.lucene.store.AlreadyClosedException; import org.apache.lucene.store.Directory; import org.apache.lucene.store.IOContext; +import org.apache.lucene.util.Bits; import org.apache.lucene.util.IOUtils; /** Default implementation of {@link DirectoryReader}. */ @@ -174,16 +175,17 @@ public final class StandardDirectoryReader extends DirectoryReader { try { SegmentReader newReader; if (oldReader == null || commitInfo.info.getUseCompoundFile() != oldReader.getSegmentInfo().info.getUseCompoundFile()) { - // this is a new reader; in case we hit an exception we can decRef it safely newReader = new SegmentReader(commitInfo, infos.getIndexCreatedVersionMajor(), IOContext.READ); newReaders[i] = newReader; } else { if (oldReader.isNRT) { // We must load liveDocs/DV updates from disk: - newReaders[i] = new SegmentReader(commitInfo, oldReader); + Bits liveDocs = commitInfo.hasDeletions() ? 
commitInfo.info.getCodec().liveDocsFormat() + .readLiveDocs(commitInfo.info.dir, commitInfo, IOContext.READONCE) : null; + newReaders[i] = new SegmentReader(commitInfo, oldReader, liveDocs, liveDocs, + commitInfo.info.maxDoc() - commitInfo.getDelCount(), false); } else { - if (oldReader.getSegmentInfo().getDelGen() == commitInfo.getDelGen() && oldReader.getSegmentInfo().getFieldInfosGen() == commitInfo.getFieldInfosGen()) { // No change; this reader will be shared between @@ -197,10 +199,14 @@ public final class StandardDirectoryReader extends DirectoryReader { if (oldReader.getSegmentInfo().getDelGen() == commitInfo.getDelGen()) { // only DV updates - newReaders[i] = new SegmentReader(commitInfo, oldReader, oldReader.getLiveDocs(), oldReader.numDocs(), false); // this is not an NRT reader! + newReaders[i] = new SegmentReader(commitInfo, oldReader, oldReader.getLiveDocs(), + oldReader.getHardLiveDocs(), oldReader.numDocs(), false); // this is not an NRT reader! } else { // both DV and liveDocs have changed - newReaders[i] = new SegmentReader(commitInfo, oldReader); + Bits liveDocs = commitInfo.hasDeletions() ? 
commitInfo.info.getCodec().liveDocsFormat() + .readLiveDocs(commitInfo.info.dir, commitInfo, IOContext.READONCE) : null; + newReaders[i] = new SegmentReader(commitInfo, oldReader, liveDocs, liveDocs, + commitInfo.info.maxDoc() - commitInfo.getDelCount(), false); } } } http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/35fa0b4f/lucene/core/src/test/org/apache/lucene/index/TestIndexWriter.java ---------------------------------------------------------------------- diff --git a/lucene/core/src/test/org/apache/lucene/index/TestIndexWriter.java b/lucene/core/src/test/org/apache/lucene/index/TestIndexWriter.java index 85e6979..4861929 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestIndexWriter.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestIndexWriter.java @@ -3144,6 +3144,10 @@ public class TestIndexWriter extends LuceneTestCase { numSoftDeleted += info.getSoftDelCount(); } assertEquals(writer.maxDoc() - writer.numDocs(), numSoftDeleted); + for (LeafReaderContext context : reader.leaves()) { + LeafReader leaf = context.reader(); + assertNull(((SegmentReader) leaf).getHardLiveDocs()); + } writer.close(); reader.close(); dir.close(); @@ -3263,6 +3267,12 @@ public class TestIndexWriter extends LuceneTestCase { assertEquals(1, topDocs.totalHits); } } + if (mixDeletes == false) { + for (LeafReaderContext context : reader.leaves()) { + LeafReader leaf = context.reader(); + assertNull(((SegmentReader) leaf).getHardLiveDocs()); + } + } mergeAwaySoftDeletes.set(true); writer.addDocument(new Document()); // add a dummy doc to trigger a segment here writer.flush(); @@ -3524,4 +3534,69 @@ public class TestIndexWriter extends LuceneTestCase { w.close(); d.close(); } + + public void testSoftAndHardLiveDocs() throws IOException { + Directory dir = newDirectory(); + IndexWriterConfig indexWriterConfig = newIndexWriterConfig(); + String softDeletesField = "soft_delete"; + indexWriterConfig.setSoftDeletesField(softDeletesField); + IndexWriter writer = new 
IndexWriter(dir, indexWriterConfig); + Set uniqueDocs = new HashSet<>(); + for (int i = 0; i < 100; i++) { + int docId = random().nextInt(5); + uniqueDocs.add(docId); + Document doc = new Document(); + doc.add(new StringField("id", String.valueOf(docId), Field.Store.YES)); + if (docId % 2 == 0) { + writer.updateDocument(new Term("id", String.valueOf(docId)), doc); + } else { + writer.softUpdateDocument(new Term("id", String.valueOf(docId)), doc, + new NumericDocValuesField(softDeletesField, 0)); + } + if (random().nextBoolean()) { + assertHardLiveDocs(writer, uniqueDocs); + } + } + + if (random().nextBoolean()) { + writer.commit(); + } + assertHardLiveDocs(writer, uniqueDocs); + + + IOUtils.close(writer, dir); + } + + private void assertHardLiveDocs(IndexWriter writer, Set uniqueDocs) throws IOException { + try (DirectoryReader reader = DirectoryReader.open(writer)) { + assertEquals(uniqueDocs.size(), reader.numDocs()); + List leaves = reader.leaves(); + for (LeafReaderContext ctx : leaves) { + LeafReader leaf = ctx.reader(); + assertTrue(leaf instanceof SegmentReader); + SegmentReader sr = (SegmentReader) leaf; + if (sr.getHardLiveDocs() != null) { + Terms id = sr.terms("id"); + TermsEnum iterator = id.iterator(); + Bits hardLiveDocs = sr.getHardLiveDocs(); + Bits liveDocs = sr.getLiveDocs(); + for (Integer dId : uniqueDocs) { + boolean mustBeHardDeleted = dId % 2 == 0; + if (iterator.seekExact(new BytesRef(dId.toString()))) { + PostingsEnum postings = iterator.postings(null); + while (postings.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) { + if (liveDocs.get(postings.docID())) { + assertTrue(hardLiveDocs.get(postings.docID())); + } else if (mustBeHardDeleted) { + assertFalse(hardLiveDocs.get(postings.docID())); + } else { + assertTrue(hardLiveDocs.get(postings.docID())); + } + } + } + } + } + } + } + } } http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/35fa0b4f/lucene/replicator/src/java/org/apache/lucene/replicator/nrt/SegmentInfosSearcherManager.java 
---------------------------------------------------------------------- diff --git a/lucene/replicator/src/java/org/apache/lucene/replicator/nrt/SegmentInfosSearcherManager.java b/lucene/replicator/src/java/org/apache/lucene/replicator/nrt/SegmentInfosSearcherManager.java index a04464a..d18ee10 100644 --- a/lucene/replicator/src/java/org/apache/lucene/replicator/nrt/SegmentInfosSearcherManager.java +++ b/lucene/replicator/src/java/org/apache/lucene/replicator/nrt/SegmentInfosSearcherManager.java @@ -107,7 +107,7 @@ class SegmentInfosSearcherManager extends ReferenceManager { DirectoryReader r = StandardDirectoryReader.open(dir, currentInfos, subs); addReaderClosedListener(r); node.message("refreshed to version=" + currentInfos.getVersion() + " r=" + r); - return SearcherManager.getSearcher(searcherFactory, r, (DirectoryReader) old.getIndexReader()); + return SearcherManager.getSearcher(searcherFactory, r, old.getIndexReader()); } private void addReaderClosedListener(IndexReader r) {