lucene-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From sha...@apache.org
Subject [30/50] [abbrv] lucene-solr:jira/solr-11990: LUCENE-8425: Expose hard live docs on SegmentReader level
Date Sat, 28 Jul 2018 04:49:53 GMT
LUCENE-8425: Expose hard live docs on SegmentReader level

Today if soft deletes are used we expose a union of hard and soft deletes
via LeafReader#getLiveDocs. Yet, if a user wants to take advantage of
searching soft-deleted documents as well, the only option today is to search
all documents, even though some of them are hard deleted. The recommendation
is not to mix the two, but in exceptional cases, i.e. when a document hits a
non-aborting exception during indexing, the document is marked as hard
deleted, which is the correct action. Having access to the hard live docs
on the segment reader level makes it possible to filter out these
documents.


Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/35fa0b4f
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/35fa0b4f
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/35fa0b4f

Branch: refs/heads/jira/solr-11990
Commit: 35fa0b4f55f95ca0c8d8b21c77e78e478fba8e74
Parents: a254e7d
Author: Simon Willnauer <simonw@apache.org>
Authored: Tue Jul 24 13:41:11 2018 +0200
Committer: Simon Willnauer <simonw@apache.org>
Committed: Wed Jul 25 09:34:36 2018 +0200

----------------------------------------------------------------------
 .../org/apache/lucene/index/ReaderPool.java     |  2 +-
 .../apache/lucene/index/ReadersAndUpdates.java  |  5 +-
 .../org/apache/lucene/index/SegmentReader.java  | 49 +++++++------
 .../lucene/index/StandardDirectoryReader.java   | 16 +++--
 .../apache/lucene/index/TestIndexWriter.java    | 75 ++++++++++++++++++++
 .../nrt/SegmentInfosSearcherManager.java        |  2 +-
 6 files changed, 118 insertions(+), 31 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/35fa0b4f/lucene/core/src/java/org/apache/lucene/index/ReaderPool.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/index/ReaderPool.java b/lucene/core/src/java/org/apache/lucene/index/ReaderPool.java
index 980f4a1..b792be2 100644
--- a/lucene/core/src/java/org/apache/lucene/index/ReaderPool.java
+++ b/lucene/core/src/java/org/apache/lucene/index/ReaderPool.java
@@ -89,7 +89,7 @@ final class ReaderPool implements Closeable {
         LeafReaderContext leaf = leaves.get(i);
         SegmentReader segReader = (SegmentReader) leaf.reader();
         SegmentReader newReader = new SegmentReader(segmentInfos.info(i), segReader, segReader.getLiveDocs(),
-            segReader.numDocs());
+            segReader.getHardLiveDocs(), segReader.numDocs(), true);
         readerMap.put(newReader.getOriginalSegmentInfo(), new ReadersAndUpdates(segmentInfos.getIndexCreatedVersionMajor(),
             newReader, newPendingDeletes(newReader, newReader.getOriginalSegmentInfo())));
       }

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/35fa0b4f/lucene/core/src/java/org/apache/lucene/index/ReadersAndUpdates.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/index/ReadersAndUpdates.java b/lucene/core/src/java/org/apache/lucene/index/ReadersAndUpdates.java
index 3453447..b09338f 100644
--- a/lucene/core/src/java/org/apache/lucene/index/ReadersAndUpdates.java
+++ b/lucene/core/src/java/org/apache/lucene/index/ReadersAndUpdates.java
@@ -214,7 +214,7 @@ final class ReadersAndUpdates {
     // force new liveDocs
     Bits liveDocs = pendingDeletes.getLiveDocs();
     if (liveDocs != null) {
-      return new SegmentReader(info, reader, liveDocs, pendingDeletes.numDocs());
+      return new SegmentReader(info, reader, liveDocs, pendingDeletes.getHardLiveDocs(),
pendingDeletes.numDocs(), true);
     } else {
       // liveDocs == null and reader != null. That can only be if there are no deletes
       assert reader.getLiveDocs() == null;
@@ -645,7 +645,8 @@ final class ReadersAndUpdates {
   private SegmentReader createNewReaderWithLatestLiveDocs(SegmentReader reader) throws IOException
{
     assert reader != null;
     assert Thread.holdsLock(this) : Thread.currentThread().getName();
-    SegmentReader newReader = new SegmentReader(info, reader, pendingDeletes.getLiveDocs(),
pendingDeletes.numDocs());
+    SegmentReader newReader = new SegmentReader(info, reader, pendingDeletes.getLiveDocs(),
+        pendingDeletes.getHardLiveDocs(), pendingDeletes.numDocs(), true);
     boolean success2 = false;
     try {
       pendingDeletes.onNewReader(newReader, info);

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/35fa0b4f/lucene/core/src/java/org/apache/lucene/index/SegmentReader.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/index/SegmentReader.java b/lucene/core/src/java/org/apache/lucene/index/SegmentReader.java
index 9373718..b368b96 100644
--- a/lucene/core/src/java/org/apache/lucene/index/SegmentReader.java
+++ b/lucene/core/src/java/org/apache/lucene/index/SegmentReader.java
@@ -51,6 +51,7 @@ public final class SegmentReader extends CodecReader {
   private final SegmentCommitInfo originalSi;
   private final LeafMetaData metaData;
   private final Bits liveDocs;
+  private final Bits hardLiveDocs;
 
   // Normally set to si.maxDoc - si.delDocCount, unless we
   // were created as an NRT reader from IW, in which case IW
@@ -65,7 +66,7 @@ public final class SegmentReader extends CodecReader {
   
   final DocValuesProducer docValuesProducer;
   final FieldInfos fieldInfos;
-  
+
   /**
    * Constructs a new SegmentReader with a new core.
    * @throws CorruptIndexException if the index is corrupt
@@ -87,16 +88,16 @@ public final class SegmentReader extends CodecReader {
     try {
       if (si.hasDeletions()) {
         // NOTE: the bitvector is stored using the regular directory, not cfs
-        liveDocs = codec.liveDocsFormat().readLiveDocs(directory(), si, IOContext.READONCE);
+        hardLiveDocs = liveDocs = codec.liveDocsFormat().readLiveDocs(directory(), si, IOContext.READONCE);
       } else {
         assert si.getDelCount() == 0;
-        liveDocs = null;
+        hardLiveDocs = liveDocs = null;
       }
       numDocs = si.info.maxDoc() - si.getDelCount();
       
       fieldInfos = initFieldInfos();
       docValuesProducer = initDocValuesProducer();
-
+      assert assertLiveDocs(isNRT, hardLiveDocs, liveDocs);
       success = true;
     } finally {
       // With lock-less commits, it's entirely possible (and
@@ -111,26 +112,9 @@ public final class SegmentReader extends CodecReader {
   }
 
   /** Create new SegmentReader sharing core from a previous
-   *  SegmentReader and loading new live docs from a new
-   *  deletes file.  Used by openIfChanged. */
-  SegmentReader(SegmentCommitInfo si, SegmentReader sr) throws IOException {
-    this(si, sr,
-         si.hasDeletions() ? si.info.getCodec().liveDocsFormat().readLiveDocs(si.info.dir,
si, IOContext.READONCE) : null,
-         si.info.maxDoc() - si.getDelCount(), false);
-  }
-
-  /** Create new SegmentReader sharing core from a previous
-   *  SegmentReader and using the provided in-memory
-   *  liveDocs.  Used by IndexWriter to provide a new NRT
-   *  reader */
-  SegmentReader(SegmentCommitInfo si, SegmentReader sr, Bits liveDocs, int numDocs) throws
IOException {
-    this(si, sr, liveDocs, numDocs, true);
-  }
-    
-  /** Create new SegmentReader sharing core from a previous
    *  SegmentReader and using the provided liveDocs, and recording
    *  whether those liveDocs were carried in ram (isNRT=true). */
-  SegmentReader(SegmentCommitInfo si, SegmentReader sr, Bits liveDocs, int numDocs, boolean
isNRT) throws IOException {
+  SegmentReader(SegmentCommitInfo si, SegmentReader sr, Bits liveDocs, Bits hardLiveDocs,
int numDocs, boolean isNRT) throws IOException {
     if (numDocs > si.info.maxDoc()) {
       throw new IllegalArgumentException("numDocs=" + numDocs + " but maxDoc=" + si.info.maxDoc());
     }
@@ -141,6 +125,8 @@ public final class SegmentReader extends CodecReader {
     this.originalSi = si;
     this.metaData = sr.getMetaData();
     this.liveDocs = liveDocs;
+    this.hardLiveDocs = hardLiveDocs;
+    assert assertLiveDocs(isNRT, hardLiveDocs, liveDocs);
     this.isNRT = isNRT;
     this.numDocs = numDocs;
     this.core = sr.core;
@@ -159,6 +145,15 @@ public final class SegmentReader extends CodecReader {
     }
   }
 
+  private static boolean assertLiveDocs(boolean isNRT, Bits hardLiveDocs, Bits liveDocs)
{
+    if (isNRT) {
+      assert hardLiveDocs == null || liveDocs != null : " liveDocs must be non null if hardLiveDocs
are non null";
+    } else {
+      assert hardLiveDocs == liveDocs : "non-nrt case must have identical liveDocs";
+    }
+    return true;
+  }
+
   /**
    * init most recent DocValues for the current commit
    */
@@ -361,4 +356,14 @@ public final class SegmentReader extends CodecReader {
   SegmentCommitInfo getOriginalSegmentInfo() {
     return originalSi;
   }
+
+  /**
+   * Returns the live docs that are not hard-deleted. This is an expert API to be used with
+   * soft-deletes to filter out document that hard deleted for instance due to aborted documents
or to distinguish
+   * soft and hard deleted documents ie. a rolled back tombstone.
+   * @lucene.experimental
+   */
+  public Bits getHardLiveDocs() {
+    return hardLiveDocs;
+  }
 }

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/35fa0b4f/lucene/core/src/java/org/apache/lucene/index/StandardDirectoryReader.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/index/StandardDirectoryReader.java b/lucene/core/src/java/org/apache/lucene/index/StandardDirectoryReader.java
index 3b1b72f..5b2b049 100644
--- a/lucene/core/src/java/org/apache/lucene/index/StandardDirectoryReader.java
+++ b/lucene/core/src/java/org/apache/lucene/index/StandardDirectoryReader.java
@@ -32,6 +32,7 @@ import java.util.concurrent.CopyOnWriteArraySet;
 import org.apache.lucene.store.AlreadyClosedException;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.store.IOContext;
+import org.apache.lucene.util.Bits;
 import org.apache.lucene.util.IOUtils;
 
 /** Default implementation of {@link DirectoryReader}. */
@@ -174,16 +175,17 @@ public final class StandardDirectoryReader extends DirectoryReader {
       try {
         SegmentReader newReader;
         if (oldReader == null || commitInfo.info.getUseCompoundFile() != oldReader.getSegmentInfo().info.getUseCompoundFile())
{
-
           // this is a new reader; in case we hit an exception we can decRef it safely
           newReader = new SegmentReader(commitInfo, infos.getIndexCreatedVersionMajor(),
IOContext.READ);
           newReaders[i] = newReader;
         } else {
           if (oldReader.isNRT) {
             // We must load liveDocs/DV updates from disk:
-            newReaders[i] = new SegmentReader(commitInfo, oldReader);
+            Bits liveDocs = commitInfo.hasDeletions() ? commitInfo.info.getCodec().liveDocsFormat()
+                .readLiveDocs(commitInfo.info.dir, commitInfo, IOContext.READONCE) : null;
+            newReaders[i] = new SegmentReader(commitInfo, oldReader, liveDocs, liveDocs,
+                commitInfo.info.maxDoc() - commitInfo.getDelCount(), false);
           } else {
-            
             if (oldReader.getSegmentInfo().getDelGen() == commitInfo.getDelGen()
                 && oldReader.getSegmentInfo().getFieldInfosGen() == commitInfo.getFieldInfosGen())
{
               // No change; this reader will be shared between
@@ -197,10 +199,14 @@ public final class StandardDirectoryReader extends DirectoryReader {
 
               if (oldReader.getSegmentInfo().getDelGen() == commitInfo.getDelGen()) {
                 // only DV updates
-                newReaders[i] = new SegmentReader(commitInfo, oldReader, oldReader.getLiveDocs(),
oldReader.numDocs(), false); // this is not an NRT reader!
+                newReaders[i] = new SegmentReader(commitInfo, oldReader, oldReader.getLiveDocs(),
+                    oldReader.getHardLiveDocs(), oldReader.numDocs(), false); // this is
not an NRT reader!
               } else {
                 // both DV and liveDocs have changed
-                newReaders[i] = new SegmentReader(commitInfo, oldReader);
+                Bits liveDocs = commitInfo.hasDeletions() ? commitInfo.info.getCodec().liveDocsFormat()
+                    .readLiveDocs(commitInfo.info.dir, commitInfo, IOContext.READONCE) :
null;
+                newReaders[i] = new SegmentReader(commitInfo, oldReader, liveDocs, liveDocs,
+                    commitInfo.info.maxDoc() - commitInfo.getDelCount(), false);
               }
             }
           }

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/35fa0b4f/lucene/core/src/test/org/apache/lucene/index/TestIndexWriter.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/test/org/apache/lucene/index/TestIndexWriter.java b/lucene/core/src/test/org/apache/lucene/index/TestIndexWriter.java
index 85e6979..4861929 100644
--- a/lucene/core/src/test/org/apache/lucene/index/TestIndexWriter.java
+++ b/lucene/core/src/test/org/apache/lucene/index/TestIndexWriter.java
@@ -3144,6 +3144,10 @@ public class TestIndexWriter extends LuceneTestCase {
      numSoftDeleted += info.getSoftDelCount();
     }
     assertEquals(writer.maxDoc() - writer.numDocs(), numSoftDeleted);
+    for (LeafReaderContext context : reader.leaves()) {
+      LeafReader leaf = context.reader();
+      assertNull(((SegmentReader) leaf).getHardLiveDocs());
+    }
     writer.close();
     reader.close();
     dir.close();
@@ -3263,6 +3267,12 @@ public class TestIndexWriter extends LuceneTestCase {
         assertEquals(1, topDocs.totalHits);
       }
     }
+    if (mixDeletes == false) {
+      for (LeafReaderContext context : reader.leaves()) {
+        LeafReader leaf = context.reader();
+        assertNull(((SegmentReader) leaf).getHardLiveDocs());
+      }
+    }
     mergeAwaySoftDeletes.set(true);
     writer.addDocument(new Document()); // add a dummy doc to trigger a segment here
     writer.flush();
@@ -3524,4 +3534,69 @@ public class TestIndexWriter extends LuceneTestCase {
     w.close();
     d.close();
   }
+
+  public void testSoftAndHardLiveDocs() throws IOException {
+    Directory dir = newDirectory();
+    IndexWriterConfig indexWriterConfig = newIndexWriterConfig();
+    String softDeletesField = "soft_delete";
+    indexWriterConfig.setSoftDeletesField(softDeletesField);
+    IndexWriter writer = new IndexWriter(dir, indexWriterConfig);
+    Set<Integer> uniqueDocs = new HashSet<>();
+    for (int i = 0; i < 100; i++) {
+      int docId = random().nextInt(5);
+      uniqueDocs.add(docId);
+      Document doc = new Document();
+      doc.add(new StringField("id",  String.valueOf(docId), Field.Store.YES));
+      if (docId %  2 == 0) {
+        writer.updateDocument(new Term("id", String.valueOf(docId)), doc);
+      } else {
+        writer.softUpdateDocument(new Term("id", String.valueOf(docId)), doc,
+            new NumericDocValuesField(softDeletesField,  0));
+      }
+      if (random().nextBoolean()) {
+        assertHardLiveDocs(writer, uniqueDocs);
+      }
+    }
+
+    if (random().nextBoolean()) {
+      writer.commit();
+    }
+    assertHardLiveDocs(writer, uniqueDocs);
+
+
+    IOUtils.close(writer, dir);
+  }
+
+  private void assertHardLiveDocs(IndexWriter writer, Set<Integer> uniqueDocs) throws
IOException {
+    try (DirectoryReader reader = DirectoryReader.open(writer)) {
+      assertEquals(uniqueDocs.size(), reader.numDocs());
+      List<LeafReaderContext> leaves = reader.leaves();
+      for (LeafReaderContext ctx : leaves) {
+        LeafReader leaf = ctx.reader();
+        assertTrue(leaf instanceof SegmentReader);
+        SegmentReader sr = (SegmentReader) leaf;
+        if (sr.getHardLiveDocs() != null) {
+          Terms id = sr.terms("id");
+          TermsEnum iterator = id.iterator();
+          Bits hardLiveDocs = sr.getHardLiveDocs();
+          Bits liveDocs = sr.getLiveDocs();
+          for (Integer dId : uniqueDocs) {
+            boolean mustBeHardDeleted = dId % 2 == 0;
+            if (iterator.seekExact(new BytesRef(dId.toString()))) {
+              PostingsEnum postings = iterator.postings(null);
+              while (postings.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
+                if (liveDocs.get(postings.docID())) {
+                  assertTrue(hardLiveDocs.get(postings.docID()));
+                } else if (mustBeHardDeleted) {
+                  assertFalse(hardLiveDocs.get(postings.docID()));
+                } else {
+                  assertTrue(hardLiveDocs.get(postings.docID()));
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+  }
 }

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/35fa0b4f/lucene/replicator/src/java/org/apache/lucene/replicator/nrt/SegmentInfosSearcherManager.java
----------------------------------------------------------------------
diff --git a/lucene/replicator/src/java/org/apache/lucene/replicator/nrt/SegmentInfosSearcherManager.java
b/lucene/replicator/src/java/org/apache/lucene/replicator/nrt/SegmentInfosSearcherManager.java
index a04464a..d18ee10 100644
--- a/lucene/replicator/src/java/org/apache/lucene/replicator/nrt/SegmentInfosSearcherManager.java
+++ b/lucene/replicator/src/java/org/apache/lucene/replicator/nrt/SegmentInfosSearcherManager.java
@@ -107,7 +107,7 @@ class SegmentInfosSearcherManager extends ReferenceManager<IndexSearcher>
{
     DirectoryReader r = StandardDirectoryReader.open(dir, currentInfos, subs);
     addReaderClosedListener(r);
     node.message("refreshed to version=" + currentInfos.getVersion() + " r=" + r);
-    return SearcherManager.getSearcher(searcherFactory, r, (DirectoryReader) old.getIndexReader());
+    return SearcherManager.getSearcher(searcherFactory, r, old.getIndexReader());
   }
 
   private void addReaderClosedListener(IndexReader r) {


Mime
View raw message