Return-Path: X-Original-To: archive-asf-public-internal@cust-asf2.ponee.io Delivered-To: archive-asf-public-internal@cust-asf2.ponee.io Received: from cust-asf.ponee.io (cust-asf.ponee.io [163.172.22.183]) by cust-asf2.ponee.io (Postfix) with ESMTP id 55F18200C70 for ; Thu, 4 May 2017 21:30:13 +0200 (CEST) Received: by cust-asf.ponee.io (Postfix) id 549C5160B9B; Thu, 4 May 2017 19:30:13 +0000 (UTC) Delivered-To: archive-asf-public@cust-asf.ponee.io Received: from mail.apache.org (hermes.apache.org [140.211.11.3]) by cust-asf.ponee.io (Postfix) with SMTP id 75B2A160BB0 for ; Thu, 4 May 2017 21:30:12 +0200 (CEST) Received: (qmail 85751 invoked by uid 500); 4 May 2017 19:30:11 -0000 Mailing-List: contact commits-help@couchdb.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: dev@couchdb.apache.org Delivered-To: mailing list commits@couchdb.apache.org Received: (qmail 85742 invoked by uid 99); 4 May 2017 19:30:11 -0000 Received: from ec2-52-202-80-70.compute-1.amazonaws.com (HELO gitbox.apache.org) (52.202.80.70) by apache.org (qpsmtpd/0.29) with ESMTP; Thu, 04 May 2017 19:30:11 +0000 Received: by gitbox.apache.org (ASF Mail Server at gitbox.apache.org, from userid 33) id 8BFAC81166; Thu, 4 May 2017 19:30:10 +0000 (UTC) Date: Thu, 04 May 2017 19:30:11 +0000 To: "commits@couchdb.apache.org" Subject: [couchdb] 01/01: Opimize writing KV node append writes MIME-Version: 1.0 Content-Type: text/plain; charset=utf-8 Content-Transfer-Encoding: 8bit From: davisp@apache.org Reply-To: "commits@couchdb.apache.org" In-Reply-To: <149392621006.1551.13963925991724192720@gitbox.apache.org> References: <149392621006.1551.13963925991724192720@gitbox.apache.org> X-Git-Host: gitbox.apache.org X-Git-Repo: couchdb X-Git-Refname: refs/heads/COUCHDB-3298-optimize-writing-kv-nodes X-Git-Reftype: branch X-Git-Rev: 6b2ed293dee8e077f418f50bc0d0f24aba69c760 X-Git-NotificationType: diff X-Git-Multimail-Version: 1.3.dev Auto-Submitted: auto-generated Message-Id: <20170504193010.8BFAC81166@gitbox.apache.org> archived-at: Thu, 04 May 2017 19:30:13 -0000 This is an automated email from the ASF dual-hosted git repository. davisp pushed a commit to branch COUCHDB-3298-optimize-writing-kv-nodes in repository https://gitbox.apache.org/repos/asf/couchdb.git commit 6b2ed293dee8e077f418f50bc0d0f24aba69c760 Author: Paul J. Davis AuthorDate: Wed May 3 12:27:08 2017 -0500 Opimize writing KV node append writes As it turns out, the original change in COUCHDB-3298 ends up hurting disk usage when a view emits large amounts of data (i.e., more than half of the btree chunk size). The cause for this is that instead of writing single element nodes it would instead prefer to write kv nodes with three elements. While normally we might prefer this in memory, it turns out that our append only storage this causes a significantly more amount of trash on disk. We can show this with a few trivial examples. Imagine we write KV's a through f. The two following patterns show the nodes as we write each new kv. Before 3298: [] [a] [a, b] [a, b]', [c] [a, b]', [c, d] [a, b]', [c, d]', [e] [a, b]', [c, d]', [e, f] After 3298: [] [a] [a, b] [a, b, c] [a, b]', [c, d] [a, b]', [c, d, e] [a, b]', [c, d]', [e, f] The thing to realize here is which of these nodes end up as garbage. In the first example we end up with [a], [a, b], [c], [c, d], and [e] nodes that have been orphaned. Where as in the second case we end up with [a], [a, b], [a, b, c], [c, d], [c, d, e] as nodes that have been orphaned. A quick aside, the reason that [a, b] and [c, d] are orphaned is due to how a btree update works. For instance, when adding c, we read [a, b] into memory, append c, and then during our node write we call chunkify which gives us back [a, b], [c] which leads us to writing [a, b] a second time. This patch changes the write function to realize when we're merely appending KVs and saves us this extra write and generation of garbage. Its node patterns look like such: [] [a] [a, b] [a, b], [c] [a, b], [c, d] [a, b], [c, d], [e] [a, b], [c, d], [e, f] Which means we only end up generating [a], [c], and [e] as garbage (with respect to kv nodes, kp nodes retain their historical behavior). --- src/couch/src/couch_btree.erl | 69 ++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 68 insertions(+), 1 deletion(-) diff --git a/src/couch/src/couch_btree.erl b/src/couch/src/couch_btree.erl index adbc92b..cf29796 100644 --- a/src/couch/src/couch_btree.erl +++ b/src/couch/src/couch_btree.erl @@ -396,7 +396,8 @@ modify_node(Bt, RootPointerInfo, Actions, QueryOutput) -> {LastKey, _LastValue} = element(tuple_size(NodeTuple), NodeTuple), {ok, [{LastKey, RootPointerInfo}], QueryOutput2}; _Else2 -> - {ok, ResultList} = write_node(Bt, NodeType, NewNodeList), + {ok, ResultList} = write_node( + Bt, RootPointerInfo, NodeType, NodeList, NewNodeList), {ok, ResultList, QueryOutput2} end. @@ -440,6 +441,72 @@ write_node(#btree{fd = Fd, compression = Comp} = Bt, NodeType, NodeList) -> ], {ok, ResultList}. +% Don't make our append-only write optimization for +% kp nodes. +write_node(Bt, _OldNode, kp_node, _OldList, NewList) -> + write_node(Bt, kp_node, NewList); + +% If we're creating a new kv node then there's no +% possibility for the optimization +write_node(Bt, _OldNode, NodeType, [], NewList) -> + write_node(Bt, NodeType, NewList); + +% Disable the optimization for nodes that only +% have a single element so we don't end up increasing +% the number of reads when folding a btree +write_node(Bt, _OldNode, NodeType, [_], NewList) -> + write_node(Bt, NodeType, NewList); + +% If a KV node has had a new key appended to the +% end of its list we can instead take the appended +% KVs and create a new node while reusing the old +% node already on disk. This saves us both the effort +% of writing data that's already on disk as well as +% saves us the disk space that would have been +% orphaned by not reusing the old node. +write_node(Bt, OldNode, NodeType, OldList, NewList) -> + case is_append_only(OldList, NewList) of + false -> + write_node(Bt, NodeType, NewList); + {true, Suffix} -> + case old_node_full(OldList) of + true -> + {ok, Results} = write_node(Bt, NodeType, Suffix), + {OldLastKey, _} = lists:last(OldList), + {ok, [{OldLastKey,OldNode} | Results]}; + false -> + write_node(Bt, NodeType, NewList) + end + end. + +% This function will break if both input +% lists are empty. This is specifically to +% assert that this case is filtered out +% in `modify_node/4`. +% +% First clause finds our suff +is_append_only([], [_ | _] = Suffix) -> + {true, Suffix}; + +% We've removed keys from the node +is_append_only([_ | _], []) -> + false; + +% We've found a mismatch of keys in the node +is_append_only([KV1 | _], [KV2 | _]) when KV1 /= KV2 -> + false; + +% Current key is equal, keep going +is_append_only([KV | Rest1], [KV | Rest2]) -> + is_append_only(Rest1, Rest2). + +old_node_full(OldList) -> + ChunkThreshold = get_chunk_size(), + NodeSize = lists:foldl(fun(KV, Acc) -> + Acc + ?term_size(KV) + end, 0, OldList), + NodeSize >= ChunkThreshold. + modify_kpnode(Bt, {}, _LowerBound, Actions, [], QueryOutput) -> modify_node(Bt, nil, Actions, QueryOutput); modify_kpnode(_Bt, NodeTuple, LowerBound, [], ResultNode, QueryOutput) -> -- To stop receiving notification emails like this one, please contact "commits@couchdb.apache.org" .