Return-Path: X-Original-To: apmail-cassandra-commits-archive@www.apache.org Delivered-To: apmail-cassandra-commits-archive@www.apache.org Received: from mail.apache.org (hermes.apache.org [140.211.11.3]) by minotaur.apache.org (Postfix) with SMTP id 11C47DD7D for ; Thu, 8 Nov 2012 14:59:40 +0000 (UTC) Received: (qmail 560 invoked by uid 500); 8 Nov 2012 14:59:39 -0000 Delivered-To: apmail-cassandra-commits-archive@cassandra.apache.org Received: (qmail 529 invoked by uid 500); 8 Nov 2012 14:59:39 -0000 Mailing-List: contact commits-help@cassandra.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: dev@cassandra.apache.org Delivered-To: mailing list commits@cassandra.apache.org Received: (qmail 508 invoked by uid 99); 8 Nov 2012 14:59:39 -0000 Received: from tyr.zones.apache.org (HELO tyr.zones.apache.org) (140.211.11.114) by apache.org (qpsmtpd/0.29) with ESMTP; Thu, 08 Nov 2012 14:59:39 +0000 Received: by tyr.zones.apache.org (Postfix, from userid 65534) id 1D355458FC; Thu, 8 Nov 2012 14:59:39 +0000 (UTC) Content-Type: text/plain; charset="us-ascii" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit From: slebresne@apache.org To: commits@cassandra.apache.org X-Mailer: ASF-Git Admin Mailer Subject: [2/2] git commit: Exclude gcable tombstones from merkle-tree computation Message-Id: <20121108145939.1D355458FC@tyr.zones.apache.org> Date: Thu, 8 Nov 2012 14:59:39 +0000 (UTC) Exclude gcable tombstones from merkle-tree computation patch by slebresne; reviewed by jbellis for CASSANDRA-4905 Project: http://git-wip-us.apache.org/repos/asf/cassandra/repo Commit: http://git-wip-us.apache.org/repos/asf/cassandra/commit/9a89eefe Tree: http://git-wip-us.apache.org/repos/asf/cassandra/tree/9a89eefe Diff: http://git-wip-us.apache.org/repos/asf/cassandra/diff/9a89eefe Branch: refs/heads/trunk Commit: 9a89eefe11828bd33e198f447071b06d9852df7d Parents: e27a955 Author: Sylvain Lebresne Authored: Thu Nov 8 11:46:30 2012 +0100 Committer: Sylvain Lebresne Committed: Thu Nov 8 15:58:18 2012 +0100 ---------------------------------------------------------------------- CHANGES.txt | 1 + .../org/apache/cassandra/db/ColumnFamilyStore.java | 5 ++ src/java/org/apache/cassandra/db/Directories.java | 12 +++++ .../cassandra/db/compaction/CompactionManager.java | 35 +++++++++++--- 4 files changed, 45 insertions(+), 8 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/cassandra/blob/9a89eefe/CHANGES.txt ---------------------------------------------------------------------- diff --git a/CHANGES.txt b/CHANGES.txt index 1af4353..9bc4478 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -1,6 +1,7 @@ 1.2-rc1 * Don't share slice query filter in CQL3 SelectStatement (CASSANDRA-4928) * Separate tracing from Log4J (CASSANDRA-4861) + * Exclude gcable tombstones from merkle-tree computation (CASSANDRA-4905) 1.2-beta2 * fp rate of 1.0 disables BF entirely; LCS defaults to 1.0 (CASSANDRA-4876) http://git-wip-us.apache.org/repos/asf/cassandra/blob/9a89eefe/src/java/org/apache/cassandra/db/ColumnFamilyStore.java ---------------------------------------------------------------------- diff --git a/src/java/org/apache/cassandra/db/ColumnFamilyStore.java b/src/java/org/apache/cassandra/db/ColumnFamilyStore.java index 382a7f5..199d07a 100644 --- a/src/java/org/apache/cassandra/db/ColumnFamilyStore.java +++ b/src/java/org/apache/cassandra/db/ColumnFamilyStore.java @@ -1586,6 +1586,11 @@ public class ColumnFamilyStore implements ColumnFamilyStoreMBean return directories.snapshotExists(snapshotName); } + public long getSnapshotCreationTime(String snapshotName) + { + return directories.snapshotCreationTime(snapshotName); + } + public void clearSnapshot(String snapshotName) { directories.clearSnapshot(snapshotName); http://git-wip-us.apache.org/repos/asf/cassandra/blob/9a89eefe/src/java/org/apache/cassandra/db/Directories.java ---------------------------------------------------------------------- diff --git a/src/java/org/apache/cassandra/db/Directories.java b/src/java/org/apache/cassandra/db/Directories.java index 714b363..877b5ed 100644 --- a/src/java/org/apache/cassandra/db/Directories.java +++ b/src/java/org/apache/cassandra/db/Directories.java @@ -437,6 +437,18 @@ public class Directories } } + // The snapshot must exist + public long snapshotCreationTime(String snapshotName) + { + for (File dir : sstableDirectories) + { + File snapshotDir = new File(dir, join(SNAPSHOT_SUBDIR, snapshotName)); + if (snapshotDir.exists()) + return snapshotDir.lastModified(); + } + throw new RuntimeException("Snapshot " + snapshotName + " doesn't exist"); + } + private static File getOrCreate(File base, String... subdirs) { File dir = subdirs == null || subdirs.length == 0 ? base : new File(base, join(subdirs)); http://git-wip-us.apache.org/repos/asf/cassandra/blob/9a89eefe/src/java/org/apache/cassandra/db/compaction/CompactionManager.java ---------------------------------------------------------------------- diff --git a/src/java/org/apache/cassandra/db/compaction/CompactionManager.java b/src/java/org/apache/cassandra/db/compaction/CompactionManager.java index a0e6ba0..a93f392 100644 --- a/src/java/org/apache/cassandra/db/compaction/CompactionManager.java +++ b/src/java/org/apache/cassandra/db/compaction/CompactionManager.java @@ -714,10 +714,17 @@ public class CompactionManager implements CompactionManagerMBean return; Collection sstables; - if (cfs.table.snapshotExists(validator.request.sessionid)) + int gcBefore; + if (cfs.snapshotExists(validator.request.sessionid)) { // If there is a snapshot created for the session then read from there. sstables = cfs.getSnapshotSSTableReader(validator.request.sessionid); + + // Computing gcbefore based on the current time wouldn't be very good because we know each replica will execute + // this at a different time (that's the whole purpose of repair with snaphsot). So instead we take the creation + // time of the snapshot, which should give us roughtly the same time on each replica (roughtly being in that case + // 'as good as in the non-snapshot' case) + gcBefore = (int)(cfs.getSnapshotCreationTime(validator.request.sessionid) / 1000) - cfs.metadata.getGcGraceSeconds(); } else { @@ -738,9 +745,10 @@ public class CompactionManager implements CompactionManagerMBean // we don't mark validating sstables as compacting in DataTracker, so we have to mark them referenced // instead so they won't be cleaned up if they do get compacted during the validation sstables = cfs.markCurrentSSTablesReferenced(); + gcBefore = getDefaultGcBefore(cfs); } - CompactionIterable ci = new ValidationCompactionIterable(cfs, sstables, validator.request.range); + CompactionIterable ci = new ValidationCompactionIterable(cfs, sstables, validator.request.range, gcBefore); CloseableIterator iter = ci.iterator(); metrics.beginCompaction(ci); try @@ -874,31 +882,42 @@ public class CompactionManager implements CompactionManagerMBean private static class ValidationCompactionIterable extends CompactionIterable { - public ValidationCompactionIterable(ColumnFamilyStore cfs, Collection sstables, Range range) + public ValidationCompactionIterable(ColumnFamilyStore cfs, Collection sstables, Range range, int gcBefore) { super(OperationType.VALIDATION, cfs.getCompactionStrategy().getScanners(sstables, range), - new ValidationCompactionController(cfs, sstables)); + new ValidationCompactionController(cfs, gcBefore)); } } /* - * Controller for validation compaction that never purges. + * Controller for validation compaction that always purges. * Note that we should not call cfs.getOverlappingSSTables on the provided * sstables because those sstables are not guaranteed to be active sstables * (since we can run repair on a snapshot). */ private static class ValidationCompactionController extends CompactionController { - public ValidationCompactionController(ColumnFamilyStore cfs, Collection sstables) + public ValidationCompactionController(ColumnFamilyStore cfs, int gcBefore) { - super(cfs, NO_GC, null); + super(cfs, gcBefore, null); } @Override public boolean shouldPurge(DecoratedKey key) { - return false; + /* + * The main reason we always purge is that including gcable tombstone would mean that the + * repair digest will depends on the scheduling of compaction on the different nodes. This + * is still not perfect because gcbefore is currently dependend on the current time at which + * the validation compaction start, which while not too bad for normal repair is broken for + * repair on snapshots. A better solution would be to agree on a gcbefore that all node would + * use, and we'll do that with CASSANDRA-4932. + * Note validation compaction includes all sstables, so we don't have the problem of purging + * a tombstone that could shadow a column in another sstable, but this is doubly not a concern + * since validation compaction is read-only. + */ + return true; } }