couchdb-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From dav...@apache.org
Subject [couchdb] 04/04: Optimize couch_key_tree:stem/2
Date Thu, 02 Nov 2017 19:32:18 GMT
This is an automated email from the ASF dual-hosted git repository.

davisp pushed a commit to branch optimize-doc-updates
in repository https://gitbox.apache.org/repos/asf/couchdb.git

commit ad22a32da046c9316a01bfb4f3e5cb6d299dabbd
Author: Paul J. Davis <paul.joseph.davis@gmail.com>
AuthorDate: Thu Nov 2 14:30:14 2017 -0500

    Optimize couch_key_tree:stem/2
    
    This changes the stemming algorithm to be a single pass O(N) operation
    rather than the existing O(N^2) operation.
    
    The old stemming algorithm works by taking the entire tree apart and
    then merging the resulting paths back into a single tree. The merging
    operation ends up causing the O(N^2) aspect because every branch has to
    be compared to every exisitng sub-tree.
---
 src/couch/src/couch_db_updater.erl |  9 +++++---
 src/couch/src/couch_key_tree.erl   | 44 +++++++++++++++++++++++++++++++++++++-
 2 files changed, 49 insertions(+), 4 deletions(-)

diff --git a/src/couch/src/couch_db_updater.erl b/src/couch/src/couch_db_updater.erl
index bcddbe0..07ac0cf 100644
--- a/src/couch/src/couch_db_updater.erl
+++ b/src/couch/src/couch_db_updater.erl
@@ -876,8 +876,11 @@ stem_full_doc_info(#full_doc_info{rev_tree = Tree} = Info, Limit) ->
     Stemmed = couch_key_tree:stem(Tree, Limit),
     Info#full_doc_info{rev_tree = Stemmed}.
 
-stem_full_doc_infos(#db{revs_limit=Limit}, DocInfos) ->
-    lists:map(fun(FDI) -> stem_full_doc_info(FDI, Limit) end, DocInfos).
+full_stem_full_doc_infos(#db{revs_limit=Limit}, DocInfos) ->
+    lists:map(fun(#full_doc_info{rev_tree=Tree}=FDI) ->
+        Stemmed = couch_key_tree:full_stem(Tree, Limit),
+        FDI#full_doc_info{rev_tree=Stemmed}
+    DocInfos).
 
 update_docs_int(Db, DocsList, NonRepDocs, MergeConflicts, FullCommit) ->
     #db{
@@ -1125,7 +1128,7 @@ copy_docs(Db, #db{fd = DestFd} = NewDb, MixedInfos, Retry) ->
         }
     end, NewInfos0),
 
-    NewInfos = stem_full_doc_infos(Db, NewInfos1),
+    NewInfos = full_stem_full_doc_infos(Db, NewInfos1),
     RemoveSeqs =
     case Retry of
     nil ->
diff --git a/src/couch/src/couch_key_tree.erl b/src/couch/src/couch_key_tree.erl
index bc4076a..e509e89 100644
--- a/src/couch/src/couch_key_tree.erl
+++ b/src/couch/src/couch_key_tree.erl
@@ -62,7 +62,8 @@ mapfold/3,
 merge/3,
 merge/2,
 remove_leafs/2,
-stem/2
+stem/2,
+full_stem/2
 ]).
 
 -include_lib("couch/include/couch_db.hrl").
@@ -470,6 +471,47 @@ map_leafs_simple(Fun, Pos, [{Key, Value, SubTree} | RestTree]) ->
 
 
 stem(Trees, Limit) ->
+    lists:map(fun(Tree) ->
+        stem_tree(Tree, Limit)
+    end, Trees).
+
+
+stem_tree({Depth, Children}, Limit) ->
+    lists:map(fun(Child) ->
+        stem_tree(Depth, Child, Limit)
+    end, Children).
+
+
+stem_tree(_Depth, {_Key, _Val, []} = Leaf, Limit) ->
+    {Limit - 1, Leaf, []};
+
+stem_tree(Depth, {Key, Val, Children}, Limit) ->
+    FinalAcc = lists:foldl(fun(Child, {LimitPosAcc, ChildAcc, BranchAcc}) ->
+        case stem_tree(Depth + 1, Child, Limit) of
+            {LimitPos, NewChild, NewBranches} ->
+                NewLimitPosAcc = erlang:max(LimitPos, LimitPosAcc),
+                NewChildAcc = [NewChild | ChildAcc],
+                NewBranchAcc = NewBranches ++ BranchAcc,
+                {NewLimitPosAcc, NewChildAcc, NewBranchAcc};
+            {LimitPos, NewBranches} ->
+                NewLimitPosAcc = erlang:max(LimitPos, LimitPosAcc),
+                NewBranchAcc = NewBranches ++ BranchAcc,
+                {NewLimitPosAcc, ChildAcc, NewBranchAcc}
+        end
+    end, {-1, [], []}, Children),
+    {FinalLimitPos, FinalChildren, FinalBranches} = FinalAcc,
+    NewChildren = lists:sort(FinalChildren)
+    case {FinalLimitPos, NewChildren} of
+        {N, _} when N > 0, length(NewChildren) > 0 ->
+            {FinalLimiPos - 1, {Key, Value, NewChildren}, FinalBranches};
+        {0, _} when length(NewChildren) > 0 ->
+            {-1, [{Depth, NewChildren} | FinalBranches]};
+        {N, []} when N < 0 ->
+            {FinalLimitPos - 1, FinalBranches}
+    end.
+
+
+full_stem(Trees, Limit) ->
     % flatten each branch in a tree into a tree path, sort by starting rev #
     Paths = lists:sort(lists:map(fun({Pos, Path}) ->
         StemmedPath = lists:sublist(Path, Limit),

-- 
To stop receiving notification emails like this one, please contact
"commits@couchdb.apache.org" <commits@couchdb.apache.org>.

Mime
View raw message