From: slebresne@apache.org
To: commits@cassandra.apache.org
Date: Mon, 27 Mar 2017 10:04:40 -0000
Subject: [1/6] cassandra git commit: Dropping column results in "corrupt" SSTable

Repository: cassandra
Updated Branches:
  refs/heads/cassandra-3.0 631162271 -> 5262bb17b
  refs/heads/cassandra-3.11 ee7023e32 -> c0ac928d9
  refs/heads/trunk 23cd27fcf -> 3dabeeaa2


Dropping column results in "corrupt" SSTable

patch by Sylvain Lebresne; reviewed by Alex Petrov for CASSANDRA-13337


Project: http://git-wip-us.apache.org/repos/asf/cassandra/repo
Commit: http://git-wip-us.apache.org/repos/asf/cassandra/commit/5262bb17
Tree: http://git-wip-us.apache.org/repos/asf/cassandra/tree/5262bb17
Diff: http://git-wip-us.apache.org/repos/asf/cassandra/diff/5262bb17

Branch: refs/heads/cassandra-3.0
Commit: 5262bb17b46fc8c02f9f836ddf9317d0de2698cd
Parents: 6311622
Author: Sylvain Lebresne
Authored: Mon Mar 20 15:49:27 2017 +0100
Committer: Sylvain Lebresne
Committed: Mon Mar 27 11:58:32 2017 +0200

----------------------------------------------------------------------
 CHANGES.txt                                     |  1 +
 .../db/columniterator/SSTableIterator.java      | 78 ++++++++++++--------
 .../columniterator/SSTableReversedIterator.java |  3 +-
 .../cassandra/db/rows/RangeTombstoneMarker.java |  6 ++
 .../apache/cassandra/db/rows/Unfiltered.java    |  2 +
 .../cassandra/db/rows/UnfilteredSerializer.java | 43 +++++++++--
 .../cql3/validation/operations/AlterTest.java   | 51 +++++++++++++
 7 files changed, 144 insertions(+), 40 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/cassandra/blob/5262bb17/CHANGES.txt
----------------------------------------------------------------------
diff --git a/CHANGES.txt b/CHANGES.txt
index 2c5573a..0b1bb01 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,4 +1,5 @@
 3.0.13
+ * Dropping column results in "corrupt" SSTable (CASSANDRA-13337)
  * Bugs handling range tombstones in the sstable iterators (CASSANDRA-13340)
  * Fix CONTAINS filtering for null collections (CASSANDRA-13246)
  * Applying: Use a unique metric reservoir per test run when using Cassandra-wide metrics residing in MBeans (CASSANDRA-13216)


http://git-wip-us.apache.org/repos/asf/cassandra/blob/5262bb17/src/java/org/apache/cassandra/db/columniterator/SSTableIterator.java
----------------------------------------------------------------------
diff --git a/src/java/org/apache/cassandra/db/columniterator/SSTableIterator.java b/src/java/org/apache/cassandra/db/columniterator/SSTableIterator.java
index 9bcca48..fa337c0 100644
--- a/src/java/org/apache/cassandra/db/columniterator/SSTableIterator.java
+++ b/src/java/org/apache/cassandra/db/columniterator/SSTableIterator.java
@@ -123,20 +123,27 @@ public class SSTableIterator extends AbstractSSTableIterator
         {
             assert deserializer != null;
 
-            // We use a same reasoning as in handlePreSliceData regarding the strictness of the inequality below.
-            // We want to exclude deserialized unfiltered equal to end, because 1) we won't miss any rows since those
-            // woudn't be equal to a slice bound and 2) a end bound can be equal to a start bound
-            // (EXCL_END(x) == INCL_START(x) for instance) and in that case we don't want to return start bound because
-            // it's fundamentally excluded. And if the bound is a end (for a range tombstone), it means it's exactly
-            // our slice end, but in that case we will properly close the range tombstone anyway as part of our "close
-            // an open marker" code in hasNextInterna
-            if (!deserializer.hasNext() || deserializer.compareNextTo(end) >= 0)
-                return null;
-
-            Unfiltered next = deserializer.readNext();
-            if (next.kind() == Unfiltered.Kind.RANGE_TOMBSTONE_MARKER)
-                updateOpenMarker((RangeTombstoneMarker)next);
-            return next;
+            while (true)
+            {
+                // We use a same reasoning as in handlePreSliceData regarding the strictness of the inequality below.
+                // We want to exclude deserialized unfiltered equal to end, because 1) we won't miss any rows since those
+                // woudn't be equal to a slice bound and 2) a end bound can be equal to a start bound
+                // (EXCL_END(x) == INCL_START(x) for instance) and in that case we don't want to return start bound because
+                // it's fundamentally excluded. And if the bound is a end (for a range tombstone), it means it's exactly
+                // our slice end, but in that case we will properly close the range tombstone anyway as part of our "close
+                // an open marker" code in hasNextInterna
+                if (!deserializer.hasNext() || deserializer.compareNextTo(end) >= 0)
+                    return null;
+
+                Unfiltered next = deserializer.readNext();
+                // We may get empty row for the same reason expressed on UnfilteredSerializer.deserializeOne.
+                if (next.isEmpty())
+                    continue;
+
+                if (next.kind() == Unfiltered.Kind.RANGE_TOMBSTONE_MARKER)
+                    updateOpenMarker((RangeTombstoneMarker) next);
+                return next;
+            }
         }
 
         protected boolean hasNextInternal() throws IOException
@@ -256,24 +263,31 @@ public class SSTableIterator extends AbstractSSTableIterator
         @Override
        protected Unfiltered computeNext() throws IOException
         {
-            // Our previous read might have made us cross an index block boundary. If so, update our informations.
-            // If we read from the beginning of the partition, this is also what will initialize the index state.
-            indexState.updateBlock();
-
-            // Return the next unfiltered unless we've reached the end, or we're beyond our slice
-            // end (note that unless we're on the last block for the slice, there is no point
-            // in checking the slice end).
-            if (indexState.isDone()
-                || indexState.currentBlockIdx() > lastBlockIdx
-                || !deserializer.hasNext()
-                || (indexState.currentBlockIdx() == lastBlockIdx && deserializer.compareNextTo(end) >= 0))
-                return null;
-
-
-            Unfiltered next = deserializer.readNext();
-            if (next.kind() == Unfiltered.Kind.RANGE_TOMBSTONE_MARKER)
-                updateOpenMarker((RangeTombstoneMarker)next);
-            return next;
+            while (true)
+            {
+                // Our previous read might have made us cross an index block boundary. If so, update our informations.
+                // If we read from the beginning of the partition, this is also what will initialize the index state.
+                indexState.updateBlock();
+
+                // Return the next unfiltered unless we've reached the end, or we're beyond our slice
+                // end (note that unless we're on the last block for the slice, there is no point
+                // in checking the slice end).
+                if (indexState.isDone()
+                    || indexState.currentBlockIdx() > lastBlockIdx
+                    || !deserializer.hasNext()
+                    || (indexState.currentBlockIdx() == lastBlockIdx && deserializer.compareNextTo(end) >= 0))
+                    return null;
+
+
+                Unfiltered next = deserializer.readNext();
+                // We may get empty row for the same reason expressed on UnfilteredSerializer.deserializeOne.
+                if (next.isEmpty())
+                    continue;
+
+                if (next.kind() == Unfiltered.Kind.RANGE_TOMBSTONE_MARKER)
+                    updateOpenMarker((RangeTombstoneMarker) next);
+                return next;
+            }
         }
     }


http://git-wip-us.apache.org/repos/asf/cassandra/blob/5262bb17/src/java/org/apache/cassandra/db/columniterator/SSTableReversedIterator.java
----------------------------------------------------------------------
diff --git a/src/java/org/apache/cassandra/db/columniterator/SSTableReversedIterator.java b/src/java/org/apache/cassandra/db/columniterator/SSTableReversedIterator.java
index 49dc82a..4bb7fe8 100644
--- a/src/java/org/apache/cassandra/db/columniterator/SSTableReversedIterator.java
+++ b/src/java/org/apache/cassandra/db/columniterator/SSTableReversedIterator.java
@@ -213,7 +213,8 @@ public class SSTableReversedIterator extends AbstractSSTableIterator
                    && !stopReadingDisk())
             {
                 Unfiltered unfiltered = deserializer.readNext();
-                if (!isFirst || includeFirst)
+                // We may get empty row for the same reason expressed on UnfilteredSerializer.deserializeOne.
+                if (!unfiltered.isEmpty() && (!isFirst || includeFirst))
                     buffer.add(unfiltered);
 
                 isFirst = false;


http://git-wip-us.apache.org/repos/asf/cassandra/blob/5262bb17/src/java/org/apache/cassandra/db/rows/RangeTombstoneMarker.java
----------------------------------------------------------------------
diff --git a/src/java/org/apache/cassandra/db/rows/RangeTombstoneMarker.java b/src/java/org/apache/cassandra/db/rows/RangeTombstoneMarker.java
index 1cd5fb4..c4c9f7f 100644
--- a/src/java/org/apache/cassandra/db/rows/RangeTombstoneMarker.java
+++ b/src/java/org/apache/cassandra/db/rows/RangeTombstoneMarker.java
@@ -49,6 +49,12 @@ public interface RangeTombstoneMarker extends Unfiltered
 
     public RangeTombstoneMarker copy(AbstractAllocator allocator);
 
+    default public boolean isEmpty()
+    {
+        // There is no such thing as an empty marker
+        return false;
+    }
+
     /**
      * Utility class to help merging range tombstone markers coming from multiple inputs (UnfilteredRowIterators).
      *
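For readers skimming the patch: the three read paths touched above (the two SSTableIterator readers and SSTableReversedIterator's buffer loop) all apply the same fix, namely test Unfiltered.isEmpty() after readNext() and silently skip empty results instead of surfacing them. The following is a minimal, self-contained sketch of that skip-on-read pattern, not Cassandra code; the Item interface, the nextNonEmpty() helper and the lambda "rows" are hypothetical stand-ins for the real Unfiltered/deserializer machinery.

    // Sketch only: hypothetical stand-ins, not part of the patch.
    import java.util.Arrays;
    import java.util.Iterator;

    public class SkipEmptySketch
    {
        /** Stand-in for Unfiltered: something read from disk that may turn out to be empty. */
        interface Item
        {
            boolean isEmpty();
        }

        /** Returns the next non-empty item, or null once the source is exhausted. */
        static Item nextNonEmpty(Iterator<Item> source)
        {
            while (true)
            {
                if (!source.hasNext())
                    return null;

                Item next = source.next();
                // Skip items carrying no data (e.g. rows whose only cells belonged to a dropped column).
                if (next.isEmpty())
                    continue;

                return next;
            }
        }

        public static void main(String[] args)
        {
            // One "empty" row (previously reported as a corrupt sstable) followed by a normal row.
            Iterator<Item> source = Arrays.<Item>asList(() -> true, () -> false).iterator();

            Item item;
            while ((item = nextNonEmpty(source)) != null)
                System.out.println("returned a non-empty item");
            // Prints exactly once: the empty item is skipped rather than treated as corruption.
        }
    }

Keeping the skip at the read sites (and in UnfilteredSerializer.deserialize below) rather than throwing leaves the on-disk data untouched while guaranteeing iterators never hand an empty row to consumers.
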
http://git-wip-us.apache.org/repos/asf/cassandra/blob/5262bb17/src/java/org/apache/cassandra/db/rows/Unfiltered.java
----------------------------------------------------------------------
diff --git a/src/java/org/apache/cassandra/db/rows/Unfiltered.java b/src/java/org/apache/cassandra/db/rows/Unfiltered.java
index 9d96137..9511eeb 100644
--- a/src/java/org/apache/cassandra/db/rows/Unfiltered.java
+++ b/src/java/org/apache/cassandra/db/rows/Unfiltered.java
@@ -55,6 +55,8 @@ public interface Unfiltered extends Clusterable
      */
     public void validateData(CFMetaData metadata);
 
+    public boolean isEmpty();
+
     public String toString(CFMetaData metadata);
     public String toString(CFMetaData metadata, boolean fullDetails);
     public String toString(CFMetaData metadata, boolean includeClusterKeys, boolean fullDetails);


http://git-wip-us.apache.org/repos/asf/cassandra/blob/5262bb17/src/java/org/apache/cassandra/db/rows/UnfilteredSerializer.java
----------------------------------------------------------------------
diff --git a/src/java/org/apache/cassandra/db/rows/UnfilteredSerializer.java b/src/java/org/apache/cassandra/db/rows/UnfilteredSerializer.java
index dc6f187..bdc8388 100644
--- a/src/java/org/apache/cassandra/db/rows/UnfilteredSerializer.java
+++ b/src/java/org/apache/cassandra/db/rows/UnfilteredSerializer.java
@@ -350,9 +350,44 @@ public class UnfilteredSerializer
         return 1;
     }
 
+    /**
+     * Deserialize an {@link Unfiltered} from the provided input.
+     *
+     * @param in the input from which to deserialize.
+     * @param header serialization header corresponding to the serialized data.
+     * @param helper the helper to use for deserialization.
+     * @param builder a row builder, passed here so we don't allocate a new one for every new row.
+     * @return the deserialized {@link Unfiltered} or {@code null} if we've read the end of a partition. This method is
+     * guaranteed to never return empty rows.
+     */
     public Unfiltered deserialize(DataInputPlus in, SerializationHeader header, SerializationHelper helper, Row.Builder builder)
     throws IOException
     {
+        while (true)
+        {
+            Unfiltered unfiltered = deserializeOne(in, header, helper, builder);
+            if (unfiltered == null)
+                return null;
+
+            // Skip empty rows, see deserializeOne javadoc
+            if (!unfiltered.isEmpty())
+                return unfiltered;
+        }
+    }
+
+    /**
+     * Deserialize a single {@link Unfiltered} from the provided input.
+     *
+     * WARNING: this can return an empty row because it's possible there is a row serialized, but that row only
+     * contains data for dropped columns, see CASSANDRA-13337. But as most code expect rows to not be empty, this isn't
+     * meant to be exposed publicly.
+     *
+     * But as {@link UnfilteredRowIterator} should not return empty
+     * rows, this mean consumer of this method should make sure to skip said empty rows.
+     */
+    private Unfiltered deserializeOne(DataInputPlus in, SerializationHeader header, SerializationHelper helper, Row.Builder builder)
+    throws IOException
+    {
         // It wouldn't be wrong per-se to use an unsorted builder, but it would be inefficient so make sure we don't do it by mistake
         assert builder.isSorted();
 
@@ -374,13 +409,7 @@ public class UnfilteredSerializer
                 throw new IOException("Corrupt flags value for unfiltered partition (isStatic flag set): " + flags);
 
             builder.newRow(Clustering.serializer.deserialize(in, helper.version, header.clusteringTypes()));
-            Row row = deserializeRowBody(in, header, helper, flags, extendedFlags, builder);
-            // we do not write empty rows because Rows.collectStats(), called by BTW.applyToRow(), asserts that rows are not empty
-            // if we don't throw here, then later the very same assertion in Rows.collectStats() will fail compactions
-            // see BlackListingCompactionsTest and CASSANDRA-9530 for details
-            if (row.isEmpty())
-                throw new IOException("Corrupt empty row found in unfiltered partition");
-            return row;
+            return deserializeRowBody(in, header, helper, flags, extendedFlags, builder);
         }
     }


http://git-wip-us.apache.org/repos/asf/cassandra/blob/5262bb17/test/unit/org/apache/cassandra/cql3/validation/operations/AlterTest.java
----------------------------------------------------------------------
diff --git a/test/unit/org/apache/cassandra/cql3/validation/operations/AlterTest.java b/test/unit/org/apache/cassandra/cql3/validation/operations/AlterTest.java
index 245be30..c48ffe5 100644
--- a/test/unit/org/apache/cassandra/cql3/validation/operations/AlterTest.java
+++ b/test/unit/org/apache/cassandra/cql3/validation/operations/AlterTest.java
@@ -355,4 +355,55 @@ public class AlterTest extends CQLTester
             assertEquals(errorMsg, e.getMessage());
         }
     }
+
+    /**
+     * Test for CASSANDRA-13337. Checks that dropping a column when a sstable contains only data for that column
+     * works properly.
+     */
+    @Test
+    public void testAlterDropEmptySSTable() throws Throwable
+    {
+        createTable("CREATE TABLE %s(k int PRIMARY KEY, x int, y int)");
+
+        execute("UPDATE %s SET x = 1 WHERE k = 0");
+
+        flush();
+
+        execute("UPDATE %s SET x = 1, y = 1 WHERE k = 0");
+
+        flush();
+
+        execute("ALTER TABLE %s DROP x");
+
+        compact();
+
+        assertRows(execute("SELECT * FROM %s"), row(0, 1));
+    }
+
+    /**
+     * Similarly to testAlterDropEmptySSTable, checks we don't return empty rows from queries (testAlterDropEmptySSTable
+     * tests the compaction case).
+     */
+    @Test
+    public void testAlterOnlyColumnBehaviorWithFlush() throws Throwable
+    {
+        testAlterOnlyColumnBehaviorWithFlush(true);
+        testAlterOnlyColumnBehaviorWithFlush(false);
+    }
+
+    private void testAlterOnlyColumnBehaviorWithFlush(boolean flushAfterInsert) throws Throwable
+    {
+        createTable("CREATE TABLE %s(k int PRIMARY KEY, x int, y int)");
+
+        execute("UPDATE %s SET x = 1 WHERE k = 0");
+
+        assertRows(execute("SELECT * FROM %s"), row(0, 1, null));
+
+        if (flushAfterInsert)
+            flush();
+
+        execute("ALTER TABLE %s DROP x");
+
+        assertEmpty(execute("SELECT * FROM %s"));
+    }
 }
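
For context, the scenario the new tests cover can be reproduced against an unpatched 3.0 node roughly as follows. This is a sketch derived from testAlterDropEmptySSTable above: the ks.t keyspace/table names are made up, and nodetool flush/compact stand in for the CQLTester flush()/compact() helpers used in the test.

    CREATE TABLE ks.t (k int PRIMARY KEY, x int, y int);

    UPDATE ks.t SET x = 1 WHERE k = 0;         -- row 0 has data only for column x
    -- nodetool flush ks t                     -- first sstable now holds only x's data for this row

    UPDATE ks.t SET x = 1, y = 1 WHERE k = 0;
    -- nodetool flush ks t                     -- second sstable

    ALTER TABLE ks.t DROP x;                   -- the first sstable's only row is now effectively empty
    -- nodetool compact ks t                   -- before this patch: deserialization fails with
                                               -- "Corrupt empty row found in unfiltered partition"

    SELECT * FROM ks.t;                        -- with the patch: compaction succeeds and this returns (0, 1)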