Return-Path: X-Original-To: archive-asf-public-internal@cust-asf2.ponee.io Delivered-To: archive-asf-public-internal@cust-asf2.ponee.io Received: from cust-asf.ponee.io (cust-asf.ponee.io [163.172.22.183]) by cust-asf2.ponee.io (Postfix) with ESMTP id D4073200D2B for ; Thu, 2 Nov 2017 20:32:18 +0100 (CET) Received: by cust-asf.ponee.io (Postfix) id D26BC160BE5; Thu, 2 Nov 2017 19:32:18 +0000 (UTC) Delivered-To: archive-asf-public@cust-asf.ponee.io Received: from mail.apache.org (hermes.apache.org [140.211.11.3]) by cust-asf.ponee.io (Postfix) with SMTP id 250161609EB for ; Thu, 2 Nov 2017 20:32:17 +0100 (CET) Received: (qmail 87801 invoked by uid 500); 2 Nov 2017 19:32:17 -0000 Mailing-List: contact commits-help@couchdb.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: dev@couchdb.apache.org Delivered-To: mailing list commits@couchdb.apache.org Received: (qmail 87792 invoked by uid 99); 2 Nov 2017 19:32:17 -0000 Received: from ec2-52-202-80-70.compute-1.amazonaws.com (HELO gitbox.apache.org) (52.202.80.70) by apache.org (qpsmtpd/0.29) with ESMTP; Thu, 02 Nov 2017 19:32:17 +0000 Received: by gitbox.apache.org (ASF Mail Server at gitbox.apache.org, from userid 33) id 9CA2D81C26; Thu, 2 Nov 2017 19:32:14 +0000 (UTC) Date: Thu, 02 Nov 2017 19:32:18 +0000 To: "commits@couchdb.apache.org" Subject: [couchdb] 04/04: Optimize couch_key_tree:stem/2 MIME-Version: 1.0 Content-Type: text/plain; charset=utf-8 Content-Transfer-Encoding: 8bit From: davisp@apache.org In-Reply-To: <150965113467.13658.1019656453679493846@gitbox.apache.org> References: <150965113467.13658.1019656453679493846@gitbox.apache.org> X-Git-Host: gitbox.apache.org X-Git-Repo: couchdb X-Git-Refname: refs/heads/optimize-doc-updates X-Git-Reftype: branch X-Git-Rev: ad22a32da046c9316a01bfb4f3e5cb6d299dabbd X-Git-NotificationType: diff X-Git-Multimail-Version: 1.5.dev Auto-Submitted: auto-generated Message-Id: <20171102193215.9CA2D81C26@gitbox.apache.org> archived-at: Thu, 02 Nov 2017 19:32:19 -0000 This is an automated email from the ASF dual-hosted git repository. davisp pushed a commit to branch optimize-doc-updates in repository https://gitbox.apache.org/repos/asf/couchdb.git commit ad22a32da046c9316a01bfb4f3e5cb6d299dabbd Author: Paul J. Davis AuthorDate: Thu Nov 2 14:30:14 2017 -0500 Optimize couch_key_tree:stem/2 This changes the stemming algorithm to be a single pass O(N) operation rather than the existing O(N^2) operation. The old stemming algorithm works by taking the entire tree apart and then merging the resulting paths back into a single tree. The merging operation ends up causing the O(N^2) aspect because every branch has to be compared to every exisitng sub-tree. --- src/couch/src/couch_db_updater.erl | 9 +++++--- src/couch/src/couch_key_tree.erl | 44 +++++++++++++++++++++++++++++++++++++- 2 files changed, 49 insertions(+), 4 deletions(-) diff --git a/src/couch/src/couch_db_updater.erl b/src/couch/src/couch_db_updater.erl index bcddbe0..07ac0cf 100644 --- a/src/couch/src/couch_db_updater.erl +++ b/src/couch/src/couch_db_updater.erl @@ -876,8 +876,11 @@ stem_full_doc_info(#full_doc_info{rev_tree = Tree} = Info, Limit) -> Stemmed = couch_key_tree:stem(Tree, Limit), Info#full_doc_info{rev_tree = Stemmed}. -stem_full_doc_infos(#db{revs_limit=Limit}, DocInfos) -> - lists:map(fun(FDI) -> stem_full_doc_info(FDI, Limit) end, DocInfos). +full_stem_full_doc_infos(#db{revs_limit=Limit}, DocInfos) -> + lists:map(fun(#full_doc_info{rev_tree=Tree}=FDI) -> + Stemmed = couch_key_tree:full_stem(Tree, Limit), + FDI#full_doc_info{rev_tree=Stemmed} + DocInfos). update_docs_int(Db, DocsList, NonRepDocs, MergeConflicts, FullCommit) -> #db{ @@ -1125,7 +1128,7 @@ copy_docs(Db, #db{fd = DestFd} = NewDb, MixedInfos, Retry) -> } end, NewInfos0), - NewInfos = stem_full_doc_infos(Db, NewInfos1), + NewInfos = full_stem_full_doc_infos(Db, NewInfos1), RemoveSeqs = case Retry of nil -> diff --git a/src/couch/src/couch_key_tree.erl b/src/couch/src/couch_key_tree.erl index bc4076a..e509e89 100644 --- a/src/couch/src/couch_key_tree.erl +++ b/src/couch/src/couch_key_tree.erl @@ -62,7 +62,8 @@ mapfold/3, merge/3, merge/2, remove_leafs/2, -stem/2 +stem/2, +full_stem/2 ]). -include_lib("couch/include/couch_db.hrl"). @@ -470,6 +471,47 @@ map_leafs_simple(Fun, Pos, [{Key, Value, SubTree} | RestTree]) -> stem(Trees, Limit) -> + lists:map(fun(Tree) -> + stem_tree(Tree, Limit) + end, Trees). + + +stem_tree({Depth, Children}, Limit) -> + lists:map(fun(Child) -> + stem_tree(Depth, Child, Limit) + end, Children). + + +stem_tree(_Depth, {_Key, _Val, []} = Leaf, Limit) -> + {Limit - 1, Leaf, []}; + +stem_tree(Depth, {Key, Val, Children}, Limit) -> + FinalAcc = lists:foldl(fun(Child, {LimitPosAcc, ChildAcc, BranchAcc}) -> + case stem_tree(Depth + 1, Child, Limit) of + {LimitPos, NewChild, NewBranches} -> + NewLimitPosAcc = erlang:max(LimitPos, LimitPosAcc), + NewChildAcc = [NewChild | ChildAcc], + NewBranchAcc = NewBranches ++ BranchAcc, + {NewLimitPosAcc, NewChildAcc, NewBranchAcc}; + {LimitPos, NewBranches} -> + NewLimitPosAcc = erlang:max(LimitPos, LimitPosAcc), + NewBranchAcc = NewBranches ++ BranchAcc, + {NewLimitPosAcc, ChildAcc, NewBranchAcc} + end + end, {-1, [], []}, Children), + {FinalLimitPos, FinalChildren, FinalBranches} = FinalAcc, + NewChildren = lists:sort(FinalChildren) + case {FinalLimitPos, NewChildren} of + {N, _} when N > 0, length(NewChildren) > 0 -> + {FinalLimiPos - 1, {Key, Value, NewChildren}, FinalBranches}; + {0, _} when length(NewChildren) > 0 -> + {-1, [{Depth, NewChildren} | FinalBranches]}; + {N, []} when N < 0 -> + {FinalLimitPos - 1, FinalBranches} + end. + + +full_stem(Trees, Limit) -> % flatten each branch in a tree into a tree path, sort by starting rev # Paths = lists:sort(lists:map(fun({Pos, Path}) -> StemmedPath = lists:sublist(Path, Limit), -- To stop receiving notification emails like this one, please contact "commits@couchdb.apache.org" .