couchdb-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From dav...@apache.org
Subject [couchdb] 01/02: Optimize couch_key_tree:stem/2
Date Sat, 16 Jun 2018 21:57:05 GMT
This is an automated email from the ASF dual-hosted git repository.

davisp pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/couchdb.git

commit aebdbc452573f70f4e50d88af5814d0fbe936333
Author: Paul J. Davis <paul.joseph.davis@gmail.com>
AuthorDate: Wed Jun 13 13:15:11 2018 -0500

    Optimize couch_key_tree:stem/2
    
    This is two related optimizations for stemming revisions. The first
    optimization re-writes the stemming algorithm to drop it from an O(N^2)
    to O(N) operation by using a depth first search through the tree and
    tracking which revisions are within `revs_limit` revs from a leaf
    dropping any revision that exceeds that limit.
    
    The second optimization is just that we avoid calling stemming more
    often than necessary by switching away from using `merge/3` to `merge/2`
    and then calling `stem/2` only when necessary.
    
    Co-Authored-By: Nick Vatamaniuc <vatamane@apache.org>
---
 src/couch/src/couch_db.erl         | 10 ++----
 src/couch/src/couch_db_updater.erl | 33 +++++++++--------
 src/couch/src/couch_key_tree.erl   | 74 ++++++++++++++++++++++++++++++++++++++
 3 files changed, 95 insertions(+), 22 deletions(-)

diff --git a/src/couch/src/couch_db.erl b/src/couch/src/couch_db.erl
index 93ea07e..b47cc7e 100644
--- a/src/couch/src/couch_db.erl
+++ b/src/couch/src/couch_db.erl
@@ -870,16 +870,10 @@ prep_and_validate_replicated_updates(Db, [Bucket|RestBuckets], [OldInfo|RestOldI
             {[], AccErrors}, Bucket),
         prep_and_validate_replicated_updates(Db, RestBuckets, RestOldInfo, [ValidatedBucket
| AccPrepped], AccErrors3);
     #full_doc_info{rev_tree=OldTree} ->
-        RevsLimit = get_revs_limit(Db),
         OldLeafs = couch_key_tree:get_all_leafs_full(OldTree),
         OldLeafsLU = [{Start, RevId} || {Start, [{RevId, _}|_]} <- OldLeafs],
-        NewRevTree = lists:foldl(
-            fun(NewDoc, AccTree) ->
-                {NewTree, _} = couch_key_tree:merge(AccTree,
-                    couch_doc:to_path(NewDoc), RevsLimit),
-                NewTree
-            end,
-            OldTree, Bucket),
+        NewPaths = lists:map(fun couch_doc:to_path/1, Bucket),
+        NewRevTree = couch_key_tree:multi_merge(OldTree, NewPaths),
         Leafs = couch_key_tree:get_all_leafs_full(NewRevTree),
         LeafRevsFullDict = dict:from_list( [{{Start, RevId}, FullPath} || {Start, [{RevId,
_}|_]}=FullPath <- Leafs]),
         {ValidatedBucket, AccErrors3} =
diff --git a/src/couch/src/couch_db_updater.erl b/src/couch/src/couch_db_updater.erl
index a2de3bc..fba99a7 100644
--- a/src/couch/src/couch_db_updater.erl
+++ b/src/couch/src/couch_db_updater.erl
@@ -504,23 +504,24 @@ merge_rev_trees(Limit, MergeConflicts, [NewDocs|RestDocsList],
         [OldDocInfo|RestOldInfo], AccNewInfos, AccRemoveSeqs, AccSeq) ->
     erlang:put(last_id_merged, OldDocInfo#full_doc_info.id), % for debugging
     NewDocInfo0 = lists:foldl(fun({Client, NewDoc}, OldInfoAcc) ->
-        merge_rev_tree(OldInfoAcc, NewDoc, Client, Limit, MergeConflicts)
+        merge_rev_tree(OldInfoAcc, NewDoc, Client, MergeConflicts)
     end, OldDocInfo, NewDocs),
+    NewDocInfo1 = stem_full_doc_info(NewDocInfo0, Limit),
     % When MergeConflicts is false, we updated #full_doc_info.deleted on every
     % iteration of merge_rev_tree. However, merge_rev_tree does not update
     % #full_doc_info.deleted when MergeConflicts is true, since we don't need
     % to know whether the doc is deleted between iterations. Since we still
     % need to know if the doc is deleted after the merge happens, we have to
     % set it here.
-    NewDocInfo1 = case MergeConflicts of
+    NewDocInfo2 = case MergeConflicts of
         true ->
-            NewDocInfo0#full_doc_info{
-                deleted = couch_doc:is_deleted(NewDocInfo0)
+            NewDocInfo1#full_doc_info{
+                deleted = couch_doc:is_deleted(NewDocInfo1)
             };
         false ->
-            NewDocInfo0
+            NewDocInfo1
     end,
-    if NewDocInfo1 == OldDocInfo ->
+    if NewDocInfo2 == OldDocInfo ->
         % nothing changed
         merge_rev_trees(Limit, MergeConflicts, RestDocsList, RestOldInfo,
             AccNewInfos, AccRemoveSeqs, AccSeq);
@@ -529,7 +530,7 @@ merge_rev_trees(Limit, MergeConflicts, [NewDocs|RestDocsList],
         % important to note that the update_seq on OldDocInfo should
         % be identical to the value on NewDocInfo1.
         OldSeq = OldDocInfo#full_doc_info.update_seq,
-        NewDocInfo2 = NewDocInfo1#full_doc_info{
+        NewDocInfo3 = NewDocInfo2#full_doc_info{
             update_seq = AccSeq + 1
         },
         RemoveSeqs = case OldSeq of
@@ -537,10 +538,10 @@ merge_rev_trees(Limit, MergeConflicts, [NewDocs|RestDocsList],
             _ -> [OldSeq | AccRemoveSeqs]
         end,
         merge_rev_trees(Limit, MergeConflicts, RestDocsList, RestOldInfo,
-            [NewDocInfo2|AccNewInfos], RemoveSeqs, AccSeq+1)
+            [NewDocInfo3|AccNewInfos], RemoveSeqs, AccSeq+1)
     end.
 
-merge_rev_tree(OldInfo, NewDoc, Client, Limit, false)
+merge_rev_tree(OldInfo, NewDoc, Client, false)
         when OldInfo#full_doc_info.deleted ->
     % We're recreating a document that was previously
     % deleted. To check that this is a recreation from
@@ -574,7 +575,7 @@ merge_rev_tree(OldInfo, NewDoc, Client, Limit, false)
             % Merge our modified new doc into the tree
             #full_doc_info{rev_tree=OldTree} = OldInfo,
             NewTree0 = couch_doc:to_path(NewDoc2),
-            case couch_key_tree:merge(OldTree, NewTree0, Limit) of
+            case couch_key_tree:merge(OldTree, NewTree0) of
                 {NewTree1, new_leaf} ->
                     % We changed the revision id so inform the caller
                     send_result(Client, NewDoc, {ok, {OldPos+1, NewRevId}}),
@@ -589,7 +590,7 @@ merge_rev_tree(OldInfo, NewDoc, Client, Limit, false)
             send_result(Client, NewDoc, conflict),
             OldInfo
     end;
-merge_rev_tree(OldInfo, NewDoc, Client, Limit, false) ->
+merge_rev_tree(OldInfo, NewDoc, Client, false) ->
     % We're attempting to merge a new revision into an
     % undeleted document. To not be a conflict we require
     % that the merge results in extending a branch.
@@ -597,7 +598,7 @@ merge_rev_tree(OldInfo, NewDoc, Client, Limit, false) ->
     OldTree = OldInfo#full_doc_info.rev_tree,
     NewTree0 = couch_doc:to_path(NewDoc),
     NewDeleted = NewDoc#doc.deleted,
-    case couch_key_tree:merge(OldTree, NewTree0, Limit) of
+    case couch_key_tree:merge(OldTree, NewTree0) of
         {NewTree, new_leaf} when not NewDeleted ->
             OldInfo#full_doc_info{
                 rev_tree = NewTree,
@@ -615,14 +616,18 @@ merge_rev_tree(OldInfo, NewDoc, Client, Limit, false) ->
             send_result(Client, NewDoc, conflict),
             OldInfo
     end;
-merge_rev_tree(OldInfo, NewDoc, _Client, Limit, true) ->
+merge_rev_tree(OldInfo, NewDoc, _Client, true) ->
     % We're merging in revisions without caring about
     % conflicts. Most likely this is a replication update.
     OldTree = OldInfo#full_doc_info.rev_tree,
     NewTree0 = couch_doc:to_path(NewDoc),
-    {NewTree, _} = couch_key_tree:merge(OldTree, NewTree0, Limit),
+    {NewTree, _} = couch_key_tree:merge(OldTree, NewTree0),
     OldInfo#full_doc_info{rev_tree = NewTree}.
 
+stem_full_doc_info(#full_doc_info{rev_tree = Tree} = Info, Limit) ->
+    Stemmed = couch_key_tree:stem(Tree, Limit),
+    Info#full_doc_info{rev_tree = Stemmed}.
+
 update_docs_int(Db, DocsList, LocalDocs, MergeConflicts, FullCommit) ->
     UpdateSeq = couch_db_engine:get_update_seq(Db),
     RevsLimit = couch_db_engine:get_revs_limit(Db),
diff --git a/src/couch/src/couch_key_tree.erl b/src/couch/src/couch_key_tree.erl
index cd661e2..5d53615 100644
--- a/src/couch/src/couch_key_tree.erl
+++ b/src/couch/src/couch_key_tree.erl
@@ -59,6 +59,7 @@ get_key_leafs/2,
 map/2,
 map_leafs/2,
 mapfold/3,
+multi_merge/2,
 merge/3,
 merge/2,
 remove_leafs/2,
@@ -71,6 +72,15 @@ stem/2
 -type revtree() :: [tree()].
 
 
+%% @doc Merge multiple paths into the given tree.
+-spec multi_merge(revtree(), tree()) -> revtree().
+multi_merge(RevTree, Trees) ->
+    lists:foldl(fun(Tree, RevTreeAcc) ->
+        {NewRevTree, _} = merge(RevTreeAcc, Tree),
+        NewRevTree
+    end, RevTree, lists:sort(Trees)).
+
+
 %% @doc Merge a path into the given tree and then stem the result.
 %% Although Tree is of type tree(), it must not contain any branches.
 -spec merge(revtree(), tree() | path(), pos_integer()) ->
@@ -470,6 +480,70 @@ map_leafs_simple(Fun, Pos, [{Key, Value, SubTree} | RestTree]) ->
 
 
 stem(Trees, Limit) ->
+    try
+        {_, Branches} = lists:foldl(fun(Tree, {Seen, TreeAcc}) ->
+            {NewSeen, NewBranches} = stem_tree(Tree, Limit, Seen),
+            {NewSeen, NewBranches ++ TreeAcc}
+        end, {sets:new(), []}, Trees),
+        lists:sort(Branches)
+    catch throw:dupe_keys ->
+        repair_tree(Trees, Limit)
+    end.
+
+
+stem_tree({Depth, Child}, Limit, Seen) ->
+    case stem_tree(Depth, Child, Limit, Seen) of
+        {NewSeen, _, NewChild, NewBranches} ->
+            {NewSeen, [{Depth, NewChild} | NewBranches]};
+        {NewSeen, _, NewBranches} ->
+            {NewSeen, NewBranches}
+    end.
+
+
+stem_tree(_Depth, {Key, _Val, []} = Leaf, Limit, Seen) ->
+    {check_key(Key, Seen), Limit - 1, Leaf, []};
+
+stem_tree(Depth, {Key, Val, Children}, Limit, Seen0) ->
+    Seen1 = check_key(Key, Seen0),
+    FinalAcc = lists:foldl(fun(Child, Acc) ->
+        {SeenAcc, LimitPosAcc, ChildAcc, BranchAcc} = Acc,
+        case stem_tree(Depth + 1, Child, Limit, SeenAcc) of
+            {NewSeenAcc, LimitPos, NewChild, NewBranches} ->
+                NewLimitPosAcc = erlang:max(LimitPos, LimitPosAcc),
+                NewChildAcc = [NewChild | ChildAcc],
+                NewBranchAcc = NewBranches ++ BranchAcc,
+                {NewSeenAcc, NewLimitPosAcc, NewChildAcc, NewBranchAcc};
+            {NewSeenAcc, LimitPos, NewBranches} ->
+                NewLimitPosAcc = erlang:max(LimitPos, LimitPosAcc),
+                NewBranchAcc = NewBranches ++ BranchAcc,
+                {NewSeenAcc, NewLimitPosAcc, ChildAcc, NewBranchAcc}
+        end
+    end, {Seen1, -1, [], []}, Children),
+    {FinalSeen, FinalLimitPos, FinalChildren, FinalBranches} = FinalAcc,
+    case FinalLimitPos of
+        N when N > 0, length(FinalChildren) > 0 ->
+            FinalNode = {Key, Val, lists:reverse(FinalChildren)},
+            {FinalSeen, FinalLimitPos - 1, FinalNode, FinalBranches};
+        0 when length(FinalChildren) > 0 ->
+            NewBranches = lists:map(fun(Child) ->
+                {Depth + 1, Child}
+            end, lists:reverse(FinalChildren)),
+            {FinalSeen, -1, NewBranches ++ FinalBranches};
+        N when N < 0, length(FinalChildren) == 0 ->
+            {FinalSeen, FinalLimitPos - 1, FinalBranches}
+    end.
+
+
+check_key(Key, Seen) ->
+    case sets:is_element(Key, Seen) of
+        true ->
+            throw(dupe_keys);
+        false ->
+            sets:add_element(Key, Seen)
+    end.
+
+
+repair_tree(Trees, Limit) ->
     % flatten each branch in a tree into a tree path, sort by starting rev #
     Paths = lists:sort(lists:map(fun({Pos, Path}) ->
         StemmedPath = lists:sublist(Path, Limit),

-- 
To stop receiving notification emails like this one, please contact
davisp@apache.org.

Mime
View raw message