lucene-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From dnh...@apache.org
Subject lucene-solr:branch_7x: LUCENE-8381: Do not count hard-deletes as soft-deletes in merges
Date Wed, 04 Jul 2018 18:42:51 GMT
Repository: lucene-solr
Updated Branches:
  refs/heads/branch_7x 4f9767ca0 -> c65fb8568


LUCENE-8381: Do not count hard-deletes as soft-deletes in merges

Today, if a document is soft-deleted and then hard-deleted, IW still
considers that doc as soft-deleted when wrapping readers for merges.

This change makes sure that IW excludes the hard-deleted documents from
a merge reader, and does not count them as soft-deletes.

Co-authored-by: Simon Willnauer <simonw@apache.org>


Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/c65fb856
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/c65fb856
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/c65fb856

Branch: refs/heads/branch_7x
Commit: c65fb8568a9d1b126c00f13c001644120e5cdf02
Parents: 4f9767c
Author: Nhat Nguyen <nhat.nguyen@elastic.co>
Authored: Tue Jul 3 22:56:46 2018 -0400
Committer: Nhat Nguyen <nhat.nguyen@elastic.co>
Committed: Wed Jul 4 14:22:04 2018 -0400

----------------------------------------------------------------------
 .../apache/lucene/index/FilterCodecReader.java  | 23 ++++++++++
 .../org/apache/lucene/index/IndexWriter.java    | 44 ++++++++++++++++----
 .../index/SoftDeletesRetentionMergePolicy.java  | 35 ++--------------
 .../TestSoftDeletesRetentionMergePolicy.java    | 38 +++++++++++++++++
 4 files changed, 101 insertions(+), 39 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/c65fb856/lucene/core/src/java/org/apache/lucene/index/FilterCodecReader.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/index/FilterCodecReader.java b/lucene/core/src/java/org/apache/lucene/index/FilterCodecReader.java
index fd36ecb..4187051 100644
--- a/lucene/core/src/java/org/apache/lucene/index/FilterCodecReader.java
+++ b/lucene/core/src/java/org/apache/lucene/index/FilterCodecReader.java
@@ -127,4 +127,27 @@ public abstract class FilterCodecReader extends CodecReader {
     in.checkIntegrity();
   }
 
+  /**
+   * Returns a filtered codec reader with the given live docs and numDocs.
+   */
+  static FilterCodecReader wrapLiveDocs(CodecReader reader, Bits liveDocs, int numDocs) {
+    return new FilterCodecReader(reader) {
+      @Override
+      public CacheHelper getCoreCacheHelper() {
+        return reader.getCoreCacheHelper();
+      }
+      @Override
+      public CacheHelper getReaderCacheHelper() {
+        return null; // we are altering live docs
+      }
+      @Override
+      public Bits getLiveDocs() {
+        return liveDocs;
+      }
+      @Override
+      public int numDocs() {
+        return numDocs;
+      }
+    };
+  }
 }

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/c65fb856/lucene/core/src/java/org/apache/lucene/index/IndexWriter.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/index/IndexWriter.java b/lucene/core/src/java/org/apache/lucene/index/IndexWriter.java
index c4c5c57..e8ed2bb 100644
--- a/lucene/core/src/java/org/apache/lucene/index/IndexWriter.java
+++ b/lucene/core/src/java/org/apache/lucene/index/IndexWriter.java
@@ -48,6 +48,7 @@ import org.apache.lucene.index.DocValuesUpdate.BinaryDocValuesUpdate;
 import org.apache.lucene.index.DocValuesUpdate.NumericDocValuesUpdate;
 import org.apache.lucene.index.FieldInfos.FieldNumbers;
 import org.apache.lucene.index.IndexWriterConfig.OpenMode;
+import org.apache.lucene.search.DocIdSetIterator;
 import org.apache.lucene.search.DocValuesFieldExistsQuery;
 import org.apache.lucene.search.MatchAllDocsQuery;
 import org.apache.lucene.search.Query;
@@ -4384,25 +4385,52 @@ public class IndexWriter implements Closeable, TwoPhaseCommit, Accountable,
 
       // Let the merge wrap readers
       List<CodecReader> mergeReaders = new ArrayList<>();
-      int numSoftDeleted = 0;
-      for (SegmentReader reader : merge.readers) {
+      int softDeleteCount = 0;
+      for (int r = 0; r < merge.readers.size(); r++) {
+        SegmentReader reader = merge.readers.get(r);
         CodecReader wrappedReader = merge.wrapForMerge(reader);
         validateMergeReader(wrappedReader);
-        mergeReaders.add(wrappedReader);
         if (softDeletesEnabled) {
           if (reader != wrappedReader) { // if we don't have a wrapped reader we won't preserve
any soft-deletes
-            Bits liveDocs = wrappedReader.getLiveDocs();
-            numSoftDeleted += PendingSoftDeletes.countSoftDeletes(
-                DocValuesFieldExistsQuery.getDocValuesDocIdSetIterator(config.getSoftDeletesField(),
wrappedReader),
-                liveDocs);
+            Bits hardLiveDocs = merge.hardLiveDocs.get(r);
+            Bits wrappedLiveDocs = wrappedReader.getLiveDocs();
+            int hardDeleteCount = 0;
+            DocIdSetIterator softDeletedDocs = DocValuesFieldExistsQuery.getDocValuesDocIdSetIterator(config.getSoftDeletesField(),
wrappedReader);
+            if (softDeletedDocs != null) {
+              int docId;
+              while ((docId = softDeletedDocs.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS)
{
+                if (wrappedLiveDocs == null || wrappedLiveDocs.get(docId)) {
+                  if (hardLiveDocs == null || hardLiveDocs.get(docId)) {
+                    softDeleteCount++;
+                  } else {
+                    hardDeleteCount++;
+                  }
+                }
+              }
+            }
+            // Wrap the wrapped reader again if we have excluded some hard-deleted docs
+            if (hardLiveDocs != null && hardDeleteCount > 0) {
+              Bits liveDocs = wrappedLiveDocs == null ? hardLiveDocs : new Bits() {
+                @Override
+                public boolean get(int index) {
+                  return hardLiveDocs.get(index) && wrappedLiveDocs.get(index);
+                }
+                @Override
+                public int length() {
+                  return hardLiveDocs.length();
+                }
+              };
+              wrappedReader = FilterCodecReader.wrapLiveDocs(wrappedReader, liveDocs, wrappedReader.numDocs() - hardDeleteCount);
+            }
           }
         }
+        mergeReaders.add(wrappedReader);
       }
       final SegmentMerger merger = new SegmentMerger(mergeReaders,
                                                      merge.info.info, infoStream, dirWrapper,
                                                      globalFieldNumberMap, 
                                                      context);
-      merge.info.setSoftDelCount(numSoftDeleted);
+      merge.info.setSoftDelCount(softDeleteCount);
       merge.checkAborted();
 
       merge.mergeStartNS = System.nanoTime();

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/c65fb856/lucene/core/src/java/org/apache/lucene/index/SoftDeletesRetentionMergePolicy.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/index/SoftDeletesRetentionMergePolicy.java
b/lucene/core/src/java/org/apache/lucene/index/SoftDeletesRetentionMergePolicy.java
index 36a2208..182af50 100644
--- a/lucene/core/src/java/org/apache/lucene/index/SoftDeletesRetentionMergePolicy.java
+++ b/lucene/core/src/java/org/apache/lucene/index/SoftDeletesRetentionMergePolicy.java
@@ -73,7 +73,7 @@ public final class SoftDeletesRetentionMergePolicy extends OneMergeWrappingMerge
   public boolean keepFullyDeletedSegment(IOSupplier<CodecReader> readerIOSupplier)
throws IOException {
     CodecReader reader = readerIOSupplier.get();
     /* we only need a single hit to keep it no need for soft deletes to be checked*/
-    Scorer scorer = getScorer(retentionQuerySupplier.get(), wrapLiveDocs(reader, null, reader.maxDoc()));
+    Scorer scorer = getScorer(retentionQuerySupplier.get(), FilterCodecReader.wrapLiveDocs(reader,
null, reader.maxDoc()));
     if (scorer != null) {
       DocIdSetIterator iterator = scorer.iterator();
       boolean atLeastOneHit = iterator.nextDoc() != DocIdSetIterator.NO_MORE_DOCS;
@@ -88,7 +88,7 @@ public final class SoftDeletesRetentionMergePolicy extends OneMergeWrappingMerge
     if (liveDocs == null) { // no deletes - just keep going
       return reader;
     }
-    CodecReader wrappedReader = wrapLiveDocs(reader, new Bits() { // only search deleted
+    CodecReader wrappedReader = FilterCodecReader.wrapLiveDocs(reader, new Bits() { // only
search deleted
       @Override
       public boolean get(int index) {
         return liveDocs.get(index) == false;
@@ -114,7 +114,7 @@ public final class SoftDeletesRetentionMergePolicy extends OneMergeWrappingMerge
         }
       }
       assert reader.numDocs() + numExtraLiveDocs <= reader.maxDoc() : "numDocs: " + reader.numDocs() + " numExtraLiveDocs: " + numExtraLiveDocs + " maxDoc: " + reader.maxDoc();
-      return wrapLiveDocs(reader, cloneLiveDocs, reader.numDocs() + numExtraLiveDocs);
+      return FilterCodecReader.wrapLiveDocs(reader, cloneLiveDocs, reader.numDocs() + numExtraLiveDocs);
     } else {
       return reader;
     }
@@ -144,33 +144,6 @@ public final class SoftDeletesRetentionMergePolicy extends OneMergeWrappingMerge
     return weight.scorer(reader.getContext());
   }
 
-  /**
-   * Returns a codec reader with the given live docs
-   */
-  private static CodecReader wrapLiveDocs(CodecReader reader, Bits liveDocs, int numDocs)
{
-    return new FilterCodecReader(reader) {
-      @Override
-      public CacheHelper getCoreCacheHelper() {
-        return reader.getCoreCacheHelper();
-      }
-
-      @Override
-      public CacheHelper getReaderCacheHelper() {
-        return null; // we are altering live docs
-      }
-
-      @Override
-      public Bits getLiveDocs() {
-        return liveDocs;
-      }
-
-      @Override
-      public int numDocs() {
-        return numDocs;
-      }
-    };
-  }
-
   @Override
   public int numDeletesToMerge(SegmentCommitInfo info, int delCount, IOSupplier<CodecReader>
readerSupplier) throws IOException {
     final int numDeletesToMerge = super.numDeletesToMerge(info, delCount, readerSupplier);
@@ -180,7 +153,7 @@ public final class SoftDeletesRetentionMergePolicy extends OneMergeWrappingMerge
         BooleanQuery.Builder builder = new BooleanQuery.Builder();
         builder.add(new DocValuesFieldExistsQuery(field), BooleanClause.Occur.FILTER);
         builder.add(retentionQuerySupplier.get(), BooleanClause.Occur.FILTER);
-        Scorer scorer = getScorer(builder.build(), wrapLiveDocs(reader, null, reader.maxDoc()));
+        Scorer scorer = getScorer(builder.build(), FilterCodecReader.wrapLiveDocs(reader,
null, reader.maxDoc()));
         if (scorer != null) {
           DocIdSetIterator iterator = scorer.iterator();
           Bits liveDocs = reader.getLiveDocs();

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/c65fb856/lucene/core/src/test/org/apache/lucene/index/TestSoftDeletesRetentionMergePolicy.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/test/org/apache/lucene/index/TestSoftDeletesRetentionMergePolicy.java
b/lucene/core/src/test/org/apache/lucene/index/TestSoftDeletesRetentionMergePolicy.java
index be1f7ac..bb36ba3 100644
--- a/lucene/core/src/test/org/apache/lucene/index/TestSoftDeletesRetentionMergePolicy.java
+++ b/lucene/core/src/test/org/apache/lucene/index/TestSoftDeletesRetentionMergePolicy.java
@@ -569,6 +569,44 @@ public class TestSoftDeletesRetentionMergePolicy extends LuceneTestCase
{
     IOUtils.close(writer, dir);
   }
 
+  public void testMergeSoftDeleteAndHardDelete() throws Exception {
+    Directory dir = newDirectory();
+    String softDelete = "soft_delete";
+    IndexWriterConfig config = newIndexWriterConfig()
+        .setSoftDeletesField(softDelete)
+        .setMergePolicy(new SoftDeletesRetentionMergePolicy("soft_delete",
+            MatchAllDocsQuery::new, new LogDocMergePolicy()));
+    config.setReaderPooling(true);
+    IndexWriter writer = new IndexWriter(dir, config);
+    Document d = new Document();
+    d.add(new StringField("id", "0", Field.Store.YES));
+    writer.addDocument(d);
+    d = new Document();
+    d.add(new StringField("id", "1", Field.Store.YES));
+    d.add(new NumericDocValuesField("soft_delete", 1));
+    writer.addDocument(d);
+    try (DirectoryReader reader = writer.getReader()) {
+      assertEquals(2, reader.maxDoc());
+      assertEquals(1, reader.numDocs());
+    }
+    while (true) {
+      try (DirectoryReader reader = writer.getReader()) {
+        TopDocs topDocs = new IndexSearcher(new NoDeletesWrapper(reader)).search(new TermQuery(new
Term("id", "1")), 1);
+        assertEquals(1, topDocs.totalHits);
+        if (writer.tryDeleteDocument(reader, topDocs.scoreDocs[0].doc) > 0) {
+          break;
+        }
+      }
+    }
+    writer.forceMergeDeletes(true);
+    assertEquals(1, writer.segmentInfos.size());
+    SegmentCommitInfo si = writer.segmentInfos.info(0);
+    assertEquals(0, si.getSoftDelCount()); // hard-delete should supersede the soft-delete
+    assertEquals(0, si.getDelCount());
+    assertEquals(1, si.info.maxDoc());
+    IOUtils.close(writer, dir);
+  }
+
   static void doUpdate(Term doc, IndexWriter writer, Field... fields) throws IOException
{
     long seqId = -1;
     do { // retry if we just committing a merge


Mime
View raw message