couchdb-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
Subject [couchdb] 01/01: Optimize writing KV node append writes
Date Wed, 03 May 2017 17:57:15 GMT
This is an automated email from the ASF dual-hosted git repository.

davisp pushed a commit to branch COUCHDB-3298-optimize-writing-kv-nodes
in repository

commit e3b5f40d9130a6d347b02f31a74fd5300b41bc6e
Author: Paul J. Davis <>
AuthorDate: Wed May 3 12:27:08 2017 -0500

    Optimize writing KV node append writes
    As it turns out, the original change in COUCHDB-3298 ends up hurting
    disk usage when a view emits large amounts of data (i.e., more than
    half of the btree chunk size). The cause for this is that instead of
    writing single element nodes it would instead prefer to write kv nodes
    with three elements. While normally we might prefer this in memory, it
    turns out that with our append-only storage this causes significantly
    more garbage on disk.
    We can show this with a few trivial examples. Imagine we write KV's a
    through f. The two following patterns show the nodes as we write each
    new kv.
        Before 3298:
        [a, b]
        [a, b]', [c]
        [a, b]', [c, d]
        [a, b]', [c, d]', [e]
        [a, b]', [c, d]', [e, f]
        After 3298:
        [a, b]
        [a, b, c]
        [a, b]', [c, d]
        [a, b]', [c, d, e]
        [a, b]', [c, d]', [e, f]
    The thing to realize here is which of these nodes end up as garbage. In
    the first example we end up with [a], [a, b], [c], [c, d], and [e] nodes
    that have been orphaned. Whereas in the second case we end up with
    [a], [a, b], [a, b, c], [c, d], [c, d, e] as nodes that have been
    orphaned. A quick aside, the reason that [a, b] and [c, d] are orphaned
    is due to how a btree update works. For instance, when adding c, we read
    [a, b] into memory, append c, and then during our node write we call
    chunkify which gives us back [a, b], [c] which leads us to writing [a,
    b] a second time.
    This patch changes the write function to realize when we're merely
    appending KVs and saves us this extra write and generation of garbage.
    Its node patterns look like this:
        [a, b]
        [a, b], [c]
        [a, b], [c, d]
        [a, b], [c, d], [e]
        [a, b], [c, d], [e, f]
    Which means we only end up generating [a], [c], and [e] as garbage (with
    respect to kv nodes, kp nodes retain their historical behavior).
 src/couch/src/couch_btree.erl | 58 ++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 57 insertions(+), 1 deletion(-)

diff --git a/src/couch/src/couch_btree.erl b/src/couch/src/couch_btree.erl
index adbc92b..492e53b 100644
--- a/src/couch/src/couch_btree.erl
+++ b/src/couch/src/couch_btree.erl
@@ -396,7 +396,8 @@ modify_node(Bt, RootPointerInfo, Actions, QueryOutput) ->
         {LastKey, _LastValue} = element(tuple_size(NodeTuple), NodeTuple),
         {ok, [{LastKey, RootPointerInfo}], QueryOutput2};
     _Else2 ->
-        {ok, ResultList} = write_node(Bt, NodeType, NewNodeList),
+        {ok, ResultList} = write_node(
+                Bt, RootPointerInfo, NodeType, NodeList, NewNodeList),
         {ok, ResultList, QueryOutput2}
@@ -440,6 +441,61 @@ write_node(#btree{fd = Fd, compression = Comp} = Bt, NodeType, NodeList)
     {ok, ResultList}.
+% Don't make our append-only write optimization for
+% kp nodes.
+write_node(Bt, _OldNode, kp_node, _OldList, NewList) ->
+    write_node(Bt, kp_node, NewList);
+% If we're creating a new kv node then there's no
+% possibility for the optimization
+write_node(Bt, _OldNode, NodeType, [], NewList) ->
+    write_node(Bt, NodeType, NewList);
+% Disable the optimization for nodes that only
+% have a single element so we don't end up increasing
+% the number of reads when folding a btree
+write_node(Bt, _OldNode, NodeType, [_], NewList) ->
+    write_node(Bt, NodeType, NewList);
+% If a KV node has had a new key appended to the
+% end of its list we can instead take the appended
+% KVs and create a new node while reusing the old
+% node already on disk. This saves us both the effort
+% of writing data that's already on disk as well as
+% saves us the disk space that would have been
+% orphaned by not reusing the old node.
+write_node(Bt, OldNode, NodeType, OldList, NewList) ->
+    case is_append_only(OldList, NewList) of
+        false ->
+            write_node(Bt, NodeType, NewList);
+        {true, Suffix} ->
+            case old_node_full(OldList) of
+                true ->
+                    {ok, Results} = write_node(Bt, NodeType, Suffix),
+                    {OldLastKey, _} = lists:last(OldList),
+                    {ok, [{OldLastKey,OldNode} | Results]};
+                false ->
+                    write_node(Bt, NodeType, NewList)
+            end
+    end.
+% This function will blow up if OldList == NewList
+% on purpose as an assertion that modify_node
+% doesn't provide this input.
+is_append_only([], [_ | _] = Suffix) ->
+    {true, Suffix};
+is_append_only([KV1 | _], [KV2 | _]) when KV1 /= KV2 ->
+    false;
+is_append_only([KV | Rest1], [KV | Rest2]) ->
+    is_append_only(Rest1, Rest2).
+old_node_full(OldList) ->
+    ChunkThreshold = get_chunk_size(),
+    NodeSize = lists:foldl(fun(KV, Acc) ->
+        Acc + ?term_size(KV)
+    end, 0, OldList),
+    NodeSize >= ChunkThreshold.
 modify_kpnode(Bt, {}, _LowerBound, Actions, [], QueryOutput) ->
     modify_node(Bt, nil, Actions, QueryOutput);
 modify_kpnode(_Bt, NodeTuple, LowerBound, [], ResultNode, QueryOutput) ->

To stop receiving notification emails like this one, please contact
"" <>.

View raw message