cassandra-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From slebre...@apache.org
Subject [2/2] git commit: Exclude gcable tombstones from merkle-tree computation
Date Thu, 08 Nov 2012 14:59:39 GMT
Exclude gcable tombstones from merkle-tree computation

patch by slebresne; reviewed by jbellis for CASSANDRA-4905


Project: http://git-wip-us.apache.org/repos/asf/cassandra/repo
Commit: http://git-wip-us.apache.org/repos/asf/cassandra/commit/9a89eefe
Tree: http://git-wip-us.apache.org/repos/asf/cassandra/tree/9a89eefe
Diff: http://git-wip-us.apache.org/repos/asf/cassandra/diff/9a89eefe

Branch: refs/heads/trunk
Commit: 9a89eefe11828bd33e198f447071b06d9852df7d
Parents: e27a955
Author: Sylvain Lebresne <sylvain@datastax.com>
Authored: Thu Nov 8 11:46:30 2012 +0100
Committer: Sylvain Lebresne <sylvain@datastax.com>
Committed: Thu Nov 8 15:58:18 2012 +0100

----------------------------------------------------------------------
 CHANGES.txt                                        |    1 +
 .../org/apache/cassandra/db/ColumnFamilyStore.java |    5 ++
 src/java/org/apache/cassandra/db/Directories.java  |   12 +++++
 .../cassandra/db/compaction/CompactionManager.java |   35 +++++++++++---
 4 files changed, 45 insertions(+), 8 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/cassandra/blob/9a89eefe/CHANGES.txt
----------------------------------------------------------------------
diff --git a/CHANGES.txt b/CHANGES.txt
index 1af4353..9bc4478 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,6 +1,7 @@
 1.2-rc1
  * Don't share slice query filter in CQL3 SelectStatement (CASSANDRA-4928)
  * Separate tracing from Log4J (CASSANDRA-4861)
+ * Exclude gcable tombstones from merkle-tree computation (CASSANDRA-4905)
 
 1.2-beta2
  * fp rate of 1.0 disables BF entirely; LCS defaults to 1.0 (CASSANDRA-4876)

http://git-wip-us.apache.org/repos/asf/cassandra/blob/9a89eefe/src/java/org/apache/cassandra/db/ColumnFamilyStore.java
----------------------------------------------------------------------
diff --git a/src/java/org/apache/cassandra/db/ColumnFamilyStore.java b/src/java/org/apache/cassandra/db/ColumnFamilyStore.java
index 382a7f5..199d07a 100644
--- a/src/java/org/apache/cassandra/db/ColumnFamilyStore.java
+++ b/src/java/org/apache/cassandra/db/ColumnFamilyStore.java
@@ -1586,6 +1586,11 @@ public class ColumnFamilyStore implements ColumnFamilyStoreMBean
         return directories.snapshotExists(snapshotName);
     }
 
+    public long getSnapshotCreationTime(String snapshotName)
+    {
+        return directories.snapshotCreationTime(snapshotName);
+    }
+
     public void clearSnapshot(String snapshotName)
     {
         directories.clearSnapshot(snapshotName);

http://git-wip-us.apache.org/repos/asf/cassandra/blob/9a89eefe/src/java/org/apache/cassandra/db/Directories.java
----------------------------------------------------------------------
diff --git a/src/java/org/apache/cassandra/db/Directories.java b/src/java/org/apache/cassandra/db/Directories.java
index 714b363..877b5ed 100644
--- a/src/java/org/apache/cassandra/db/Directories.java
+++ b/src/java/org/apache/cassandra/db/Directories.java
@@ -437,6 +437,18 @@ public class Directories
         }
     }
 
+    // The snapshot must exist
+    public long snapshotCreationTime(String snapshotName)
+    {
+        for (File dir : sstableDirectories)
+        {
+            File snapshotDir = new File(dir, join(SNAPSHOT_SUBDIR, snapshotName));
+            if (snapshotDir.exists())
+                return snapshotDir.lastModified();
+        }
+        throw new RuntimeException("Snapshot " + snapshotName + " doesn't exist");
+    }
+
     private static File getOrCreate(File base, String... subdirs)
     {
         File dir = subdirs == null || subdirs.length == 0 ? base : new File(base, join(subdirs));

http://git-wip-us.apache.org/repos/asf/cassandra/blob/9a89eefe/src/java/org/apache/cassandra/db/compaction/CompactionManager.java
----------------------------------------------------------------------
diff --git a/src/java/org/apache/cassandra/db/compaction/CompactionManager.java b/src/java/org/apache/cassandra/db/compaction/CompactionManager.java
index a0e6ba0..a93f392 100644
--- a/src/java/org/apache/cassandra/db/compaction/CompactionManager.java
+++ b/src/java/org/apache/cassandra/db/compaction/CompactionManager.java
@@ -714,10 +714,17 @@ public class CompactionManager implements CompactionManagerMBean
             return;
 
         Collection<SSTableReader> sstables;
-        if (cfs.table.snapshotExists(validator.request.sessionid))
+        int gcBefore;
+        if (cfs.snapshotExists(validator.request.sessionid))
         {
             // If there is a snapshot created for the session then read from there.
             sstables = cfs.getSnapshotSSTableReader(validator.request.sessionid);
+
+            // Computing gcbefore based on the current time wouldn't be very good because
we know each replica will execute
+            // this at a different time (that's the whole purpose of repair with snaphsot).
So instead we take the creation
+            // time of the snapshot, which should give us roughtly the same time on each
replica (roughtly being in that case
+            // 'as good as in the non-snapshot' case)
+            gcBefore = (int)(cfs.getSnapshotCreationTime(validator.request.sessionid) / 1000)
- cfs.metadata.getGcGraceSeconds();
         }
         else
         {
@@ -738,9 +745,10 @@ public class CompactionManager implements CompactionManagerMBean
             // we don't mark validating sstables as compacting in DataTracker, so we have
to mark them referenced
             // instead so they won't be cleaned up if they do get compacted during the validation
             sstables = cfs.markCurrentSSTablesReferenced();
+            gcBefore = getDefaultGcBefore(cfs);
         }
 
-        CompactionIterable ci = new ValidationCompactionIterable(cfs, sstables, validator.request.range);
+        CompactionIterable ci = new ValidationCompactionIterable(cfs, sstables, validator.request.range,
gcBefore);
         CloseableIterator<AbstractCompactedRow> iter = ci.iterator();
         metrics.beginCompaction(ci);
         try
@@ -874,31 +882,42 @@ public class CompactionManager implements CompactionManagerMBean
 
     private static class ValidationCompactionIterable extends CompactionIterable
     {
-        public ValidationCompactionIterable(ColumnFamilyStore cfs, Collection<SSTableReader>
sstables, Range<Token> range)
+        public ValidationCompactionIterable(ColumnFamilyStore cfs, Collection<SSTableReader>
sstables, Range<Token> range, int gcBefore)
         {
             super(OperationType.VALIDATION,
                   cfs.getCompactionStrategy().getScanners(sstables, range),
-                  new ValidationCompactionController(cfs, sstables));
+                  new ValidationCompactionController(cfs, gcBefore));
         }
     }
 
     /*
-     * Controller for validation compaction that never purges.
+     * Controller for validation compaction that always purges.
      * Note that we should not call cfs.getOverlappingSSTables on the provided
      * sstables because those sstables are not guaranteed to be active sstables
      * (since we can run repair on a snapshot).
      */
     private static class ValidationCompactionController extends CompactionController
     {
-        public ValidationCompactionController(ColumnFamilyStore cfs, Collection<SSTableReader>
sstables)
+        public ValidationCompactionController(ColumnFamilyStore cfs, int gcBefore)
         {
-            super(cfs, NO_GC, null);
+            super(cfs, gcBefore, null);
         }
 
         @Override
         public boolean shouldPurge(DecoratedKey key)
         {
-            return false;
+            /*
+             * The main reason we always purge is that including gcable tombstone would mean
that the
+             * repair digest will depends on the scheduling of compaction on the different
nodes. This
+             * is still not perfect because gcbefore is currently dependend on the current
time at which
+             * the validation compaction start, which while not too bad for normal repair
is broken for
+             * repair on snapshots. A better solution would be to agree on a gcbefore that
all node would
+             * use, and we'll do that with CASSANDRA-4932.
+             * Note validation compaction includes all sstables, so we don't have the problem
of purging
+             * a tombstone that could shadow a column in another sstable, but this is doubly
not a concern
+             * since validation compaction is read-only.
+             */
+            return true;
         }
     }
 


Mime
View raw message