Return-Path: X-Original-To: apmail-cassandra-commits-archive@www.apache.org Delivered-To: apmail-cassandra-commits-archive@www.apache.org Received: from mail.apache.org (hermes.apache.org [140.211.11.3]) by minotaur.apache.org (Postfix) with SMTP id 17CE8DD7D for ; Tue, 13 Nov 2012 07:58:30 +0000 (UTC) Received: (qmail 33665 invoked by uid 500); 13 Nov 2012 07:58:28 -0000 Delivered-To: apmail-cassandra-commits-archive@cassandra.apache.org Received: (qmail 33622 invoked by uid 500); 13 Nov 2012 07:58:28 -0000 Mailing-List: contact commits-help@cassandra.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: dev@cassandra.apache.org Delivered-To: mailing list commits@cassandra.apache.org Received: (qmail 32497 invoked by uid 99); 13 Nov 2012 07:58:27 -0000 Received: from tyr.zones.apache.org (HELO tyr.zones.apache.org) (140.211.11.114) by apache.org (qpsmtpd/0.29) with ESMTP; Tue, 13 Nov 2012 07:58:27 +0000 Received: by tyr.zones.apache.org (Postfix, from userid 65534) id 4CD925557B; Tue, 13 Nov 2012 07:58:27 +0000 (UTC) Content-Type: text/plain; charset="us-ascii" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit From: slebresne@apache.org To: commits@cassandra.apache.org X-Mailer: ASF-Git Admin Mailer Subject: [4/5] git commit: Optimize mostRecentTomstone check in CC.collectAllData Message-Id: <20121113075827.4CD925557B@tyr.zones.apache.org> Date: Tue, 13 Nov 2012 07:58:27 +0000 (UTC) Optimize mostRecentTomstone check in CC.collectAllData patch by slebresne; reviewed by jbellis for CASSANDRA-4883 Project: http://git-wip-us.apache.org/repos/asf/cassandra/repo Commit: http://git-wip-us.apache.org/repos/asf/cassandra/commit/53943180 Tree: http://git-wip-us.apache.org/repos/asf/cassandra/tree/53943180 Diff: http://git-wip-us.apache.org/repos/asf/cassandra/diff/53943180 Branch: refs/heads/trunk Commit: 53943180a4a41ecc5e145d1e08b0ea4a9d849c8c Parents: 9118764 Author: Sylvain Lebresne Authored: Tue Nov 13 08:54:37 2012 +0100 Committer: Sylvain Lebresne Committed: Tue Nov 13 08:54:37 2012 +0100 ---------------------------------------------------------------------- CHANGES.txt | 1 + .../org/apache/cassandra/config/CFMetaData.java | 3 +- .../apache/cassandra/db/CollationController.java | 33 +++++++-------- src/java/org/apache/cassandra/db/DataTracker.java | 31 ++++++-------- .../cassandra/db/CollationControllerTest.java | 20 +++++---- 5 files changed, 43 insertions(+), 45 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/cassandra/blob/53943180/CHANGES.txt ---------------------------------------------------------------------- diff --git a/CHANGES.txt b/CHANGES.txt index 43fc53a..77cc8a8 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -8,6 +8,7 @@ * Separate tracing from Log4J (CASSANDRA-4861) * Exclude gcable tombstones from merkle-tree computation (CASSANDRA-4905) * Better printing of AbstractBounds for tracing (CASSANDRA-4931) + * Optimize mostRecentTomstone check in CC.collectAllData (CASSANDRA-4883) 1.2-beta2 http://git-wip-us.apache.org/repos/asf/cassandra/blob/53943180/src/java/org/apache/cassandra/config/CFMetaData.java ---------------------------------------------------------------------- diff --git a/src/java/org/apache/cassandra/config/CFMetaData.java b/src/java/org/apache/cassandra/config/CFMetaData.java index 921242a..c3a00b1 100644 --- a/src/java/org/apache/cassandra/config/CFMetaData.java +++ b/src/java/org/apache/cassandra/config/CFMetaData.java @@ -181,7 +181,8 @@ public final class CFMetaData + "thrift_version text," + "cql_version text," + "data_center text," - + "rack text" + + "rack text," + + "partitioner text," + ") WITH COMMENT='information about the local node'"); public static final CFMetaData TraceSessionsCf = compile(14, "CREATE TABLE " + Tracing.SESSIONS_CF + " (" http://git-wip-us.apache.org/repos/asf/cassandra/blob/53943180/src/java/org/apache/cassandra/db/CollationController.java ---------------------------------------------------------------------- diff --git a/src/java/org/apache/cassandra/db/CollationController.java b/src/java/org/apache/cassandra/db/CollationController.java index 80d0a2c..73c675f 100644 --- a/src/java/org/apache/cassandra/db/CollationController.java +++ b/src/java/org/apache/cassandra/db/CollationController.java @@ -241,13 +241,27 @@ public class CollationController } } + /* + * We can't eliminate full sstables based on the timestamp of what we've already read like + * in collectTimeOrderedData, but we still want to eliminate sstable whose maxTimestamp < mostRecentTombstone + * we've read. We still rely on the sstable ordering by maxTimestamp since if + * maxTimestamp_s1 > maxTimestamp_s0, + * we're guaranteed that s1 cannot have a row tombstone such that + * timestamp(tombstone) > maxTimestamp_s0 + * since we necessarily have + * timestamp(tombstone) <= maxTimestamp_s1 + * In othere words, iterating in maxTimestamp order allow to do our mostRecentTombstone elimination + * in one pass, and minimize the number of sstables for which we read a rowTombstone. + */ + Collections.sort(view.sstables, SSTable.maxTimestampComparator); + long mostRecentRowTombstone = Long.MIN_VALUE; for (SSTableReader sstable : view.sstables) { // if we've already seen a row tombstone with a timestamp greater // than the most recent update to this sstable, we can skip it if (sstable.getMaxTimestamp() < mostRecentRowTombstone) - continue; + break; OnDiskAtomIterator iter = filter.getSSTableColumnIterator(sstable); iterators.add(iter); @@ -262,23 +276,6 @@ public class CollationController } } - // If we saw a row tombstone, do a second pass through the iterators we - // obtained from the sstables and drop any whose maxTimestamp < that of the - // row tombstone - { - Iterator it = iterators.iterator(); - while (it.hasNext()) - { - OnDiskAtomIterator iter = it.next(); - if ((iter instanceof ISSTableColumnIterator) - && ((ISSTableColumnIterator) iter).getSStable().getMaxTimestamp() < mostRecentRowTombstone) - { - FileUtils.closeQuietly(iter); - it.remove(); - } - } - } - // we need to distinguish between "there is no data at all for this row" (BF will let us rebuild that efficiently) // and "there used to be data, but it's gone now" (we should cache the empty CF so we don't need to rebuild that slower) if (iterators.isEmpty()) http://git-wip-us.apache.org/repos/asf/cassandra/blob/53943180/src/java/org/apache/cassandra/db/DataTracker.java ---------------------------------------------------------------------- diff --git a/src/java/org/apache/cassandra/db/DataTracker.java b/src/java/org/apache/cassandra/db/DataTracker.java index 85da449..7e8887f 100644 --- a/src/java/org/apache/cassandra/db/DataTracker.java +++ b/src/java/org/apache/cassandra/db/DataTracker.java @@ -64,7 +64,7 @@ public class DataTracker return view.get().memtablesPendingFlush; } - public List getSSTables() + public Set getSSTables() { return view.get().sstables; } @@ -300,7 +300,7 @@ public class DataTracker { view.set(new View(new Memtable(cfstore), Collections.emptySet(), - Collections.emptyList(), + Collections.emptySet(), Collections.emptySet(), SSTableIntervalTree.empty())); } @@ -456,15 +456,10 @@ public class DataTracker public final Memtable memtable; public final Set memtablesPendingFlush; public final Set compacting; - // We can't use a SortedSet here because "the ordering maintained by a sorted set (whether or not an - // explicit comparator is provided) must be consistent with equals." In particular, - // ImmutableSortedSet will ignore any objects that compare equally with an existing Set member. - // Obviously, dropping sstables whose max column timestamp happens to be equal to another's - // is not acceptable for us. So, we use a List instead. - public final List sstables; + public final Set sstables; public final SSTableIntervalTree intervalTree; - View(Memtable memtable, Set pendingFlush, List sstables, Set compacting, SSTableIntervalTree intervalTree) + View(Memtable memtable, Set pendingFlush, Set sstables, Set compacting, SSTableIntervalTree intervalTree) { this.memtable = memtable; this.memtablesPendingFlush = pendingFlush; @@ -492,18 +487,18 @@ public class DataTracker public View replaceFlushed(Memtable flushedMemtable, SSTableReader newSSTable) { Set newPending = ImmutableSet.copyOf(Sets.difference(memtablesPendingFlush, Collections.singleton(flushedMemtable))); - List newSSTables = newSSTable == null - ? Collections.emptyList() + Set newSSTables = newSSTable == null + ? Collections.emptySet() : newSSTables(newSSTable); SSTableIntervalTree intervalTree = buildIntervalTree(newSSTables); - return new View(memtable, newPending, Collections.unmodifiableList(newSSTables), compacting, intervalTree); + return new View(memtable, newPending, newSSTables, compacting, intervalTree); } public View replace(Collection oldSSTables, Iterable replacements) { - List newSSTables = newSSTables(oldSSTables, replacements); + Set newSSTables = newSSTables(oldSSTables, replacements); SSTableIntervalTree intervalTree = buildIntervalTree(newSSTables); - return new View(memtable, memtablesPendingFlush, Collections.unmodifiableList(newSSTables), compacting, intervalTree); + return new View(memtable, memtablesPendingFlush, newSSTables, compacting, intervalTree); } public View markCompacting(Collection tomark) @@ -518,19 +513,19 @@ public class DataTracker return new View(memtable, memtablesPendingFlush, sstables, compactingNew, intervalTree); } - private List newSSTables(SSTableReader newSSTable) + private Set newSSTables(SSTableReader newSSTable) { assert newSSTable != null; // not performance-sensitive, don't obsess over doing a selection merge here return newSSTables(Collections.emptyList(), Collections.singletonList(newSSTable)); } - private List newSSTables(Collection oldSSTables, Iterable replacements) + private Set newSSTables(Collection oldSSTables, Iterable replacements) { ImmutableSet oldSet = ImmutableSet.copyOf(oldSSTables); int newSSTablesSize = sstables.size() - oldSSTables.size() + Iterables.size(replacements); assert newSSTablesSize >= Iterables.size(replacements) : String.format("Incoherent new size %d replacing %s by %s in %s", newSSTablesSize, oldSSTables, replacements, this); - List newSSTables = new ArrayList(newSSTablesSize); + Set newSSTables = new HashSet(newSSTablesSize); for (SSTableReader sstable : sstables) { if (!oldSet.contains(sstable)) @@ -538,7 +533,7 @@ public class DataTracker } Iterables.addAll(newSSTables, replacements); assert newSSTables.size() == newSSTablesSize : String.format("Expecting new size of %d, got %d while replacing %s by %s in %s", newSSTablesSize, newSSTables.size(), oldSSTables, replacements, this); - return newSSTables; + return ImmutableSet.copyOf(newSSTables); } @Override http://git-wip-us.apache.org/repos/asf/cassandra/blob/53943180/test/unit/org/apache/cassandra/db/CollationControllerTest.java ---------------------------------------------------------------------- diff --git a/test/unit/org/apache/cassandra/db/CollationControllerTest.java b/test/unit/org/apache/cassandra/db/CollationControllerTest.java index f469639..4a8e426 100644 --- a/test/unit/org/apache/cassandra/db/CollationControllerTest.java +++ b/test/unit/org/apache/cassandra/db/CollationControllerTest.java @@ -30,6 +30,8 @@ import org.apache.cassandra.db.filter.QueryPath; import org.apache.cassandra.utils.ByteBufferUtil; import org.junit.Test; +import org.apache.cassandra.io.sstable.SSTableReader; + public class CollationControllerTest extends SchemaLoader { @Test @@ -61,20 +63,22 @@ public class CollationControllerTest extends SchemaLoader store.forceBlockingFlush(); + // add yet one more mutation + rm = new RowMutation("Keyspace1", dk.key); + rm.add(path, ByteBufferUtil.bytes("foobar"), 30); + rm.apply(); + store.forceBlockingFlush(); + // A NamesQueryFilter goes down one code path (through collectTimeOrderedData()) + // It should only iterate the last flushed sstable, since it probably contains the most recent value for Column1 QueryFilter filter = QueryFilter.getNamesFilter(dk, path, ByteBufferUtil.bytes("Column1")); CollationController controller = new CollationController(store, false, filter, Integer.MIN_VALUE); controller.getTopLevelColumns(); assertEquals(1, controller.getSstablesIterated()); - + // SliceQueryFilter goes down another path (through collectAllData()) - // Add another mutation, with a lower timestamp then force another flush - // so we can assert that we're not reading every sstable - rm = new RowMutation("Keyspace1", dk.key); - rm.add(path, ByteBufferUtil.bytes("asdf"), 5); - rm.apply(); - store.forceBlockingFlush(); - + // We will read "only" the last sstable in that case, but because the 2nd sstable has a tombstone that is more + // recent than the maxTimestamp of the very first sstable we flushed, we should only read the 2 first sstables. filter = QueryFilter.getIdentityFilter(dk, path); controller = new CollationController(store, false, filter, Integer.MIN_VALUE); controller.getTopLevelColumns();