couchdb-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
Subject svn commit: r1164350 - in /couchdb/trunk: share/www/script/test/recreate_doc.js src/couchdb/couch_doc.erl src/couchdb/couch_key_tree.erl
Date Fri, 02 Sep 2011 04:31:11 GMT
Author: davisp
Date: Fri Sep  2 04:31:10 2011
New Revision: 1164350

Fix introduction of duplicates into _changes feed

When a document is updated the new update_seq is assigned as part of the
rev_tree merging in couch_db_updater:merge_rev_trees/7 based on the
condition of whether the new rev_tree is equal to the old tree. This
equality is done as a simple Erlang term comparison. If the trees are
not equal a new update_seq is assigned to the #full_doc_info{} record
that is stored in fulldocinfo_by_id_btree.

During replication it is possible that a document update merges into the
rev_tree internally without creating a leaf. If the terminal node of the
replicated path happens to land on a node with a value of ?REV_MISSING
the new document information will be preferred and replace the

This preference ends up causing the rev_tree comparison to evaluate to
false which ends up giving this document a new update_seq. Up until this
point everything is fine. We write a bit of extra data (that will be
cleared during compaction) because of a previous bug where we decided to
be cautious and avoid losing data due to a broken rev_tree merging
aglorithm. It is also important to note that this is the place were we
calculate the update_seq to remove from the docinfo_by_seq_tree.

After this point we get back to couch_db_udpater:update_docs_int/5 where
we eventually call couch_db_updater:new_index_entries/3 which creates
the new entries for the fulldocinfo_by_id_tree and docinfo_by_seq_btree.
At this point we end up creating a #doc_info{} record based on the
leaves in the rev_tree. As you recall, the update that caused the new
update_seq was not a leaf, at this point we create a #doc_info{} record
with an incorrect high_seq member pointing to the update_seq we are
about to remove from the docinfo_by_seq_tree (remember we calculated the
seq to remove before we consulted the leaves).

The end result is that we remove the same update_seq we insert. This
sets us up for the real bug. The next time we go to update this document
the same procedure is applied. But what happens is at the point we
calculate the seq to remove from docinfo_by_seq_tree, we calculate the
wrong value. Thus when the update continues we remove an update_seq that
doesn't exist in the tree and insert our new seq. But, the seq from the
previous update is still in the tree. Thus, our docinfo_by_seq_tree now
contains two entries pointing at the same document.

At this point, we run into the observed behavior of this bug that ends
up causing duplicate entries in views which then ends up throwing errors
when the view is compaction. These errors are also particularly nasty
because they bubble up the the couch_view gen_server which crashes and
spiders out crashing every couch_view_group process. That part probably
isn't important though.

There's a simple test include with the patch to illustrate the behavior
and maintain an assertion that it stays fixed.

Fixes COUCHDB-1265


Modified: couchdb/trunk/share/www/script/test/recreate_doc.js
--- couchdb/trunk/share/www/script/test/recreate_doc.js (original)
+++ couchdb/trunk/share/www/script/test/recreate_doc.js Fri Sep  2 04:31:10 2011
@@ -77,4 +77,45 @@ couchTests.recreate_doc = function(debug
   } catch (e) {
     T(e.error == "conflict");
+  db.deleteDb();
+  db.createDb();
+  // COUCHDB-1265
+  // Resuscitate an unavailable old revision and make sure that it
+  // doesn't introduce duplicates into the _changes feed.
+  var doc = {_id: "bar", count: 0};
+  T(;
+  var ghost = {_id: "bar", _rev: doc._rev, count: doc.count};
+  for(var i = 0; i < 2; i++) {
+    doc.count += 1;
+    T(;
+  }
+  // Compact so that the old revision to be resuscitated will be
+  // in the rev_tree as ?REV_MISSING
+  db.compact();
+  while( {}
+  // Saving the ghost here puts it back in the rev_tree in such
+  // a way as to create a new update_seq but without changing a
+  // leaf revision. This would cause the #full_doc_info{} and
+  // #doc_info{} records to diverge in their idea of what the
+  // doc's update_seq is and end up introducing a duplicate in
+  // the _changes feed the next time this doc is updated.
+  T(, {new_edits: false}).ok);
+  // The duplicate would have been introduce here becuase the #doc_info{}
+  // would not have been removed correctly.
+  T(;
+  // And finally assert that there are no duplicates in _changes.
+  var req = CouchDB.request("GET", "/test_suite_db/_changes");
+  var resp = JSON.parse(req.responseText);
+  var docids = {};
+  for(var i = 0; i < resp.results.length; i++) {
+    T(docids[resp.results[i].id] === undefined, "Duplicates in _changes feed.");
+    docids[resp.results[i].id] = true;
+  }

Modified: couchdb/trunk/src/couchdb/couch_doc.erl
--- couchdb/trunk/src/couchdb/couch_doc.erl (original)
+++ couchdb/trunk/src/couchdb/couch_doc.erl Fri Sep  2 04:31:10 2011
@@ -319,10 +319,19 @@ to_doc_info(FullDocInfo) ->
     {DocInfo, _Path} = to_doc_info_path(FullDocInfo),
-max_seq([], Max) ->
-    Max;
-max_seq([#rev_info{seq=Seq}|Rest], Max) ->
-    max_seq(Rest, if Max > Seq -> Max; true -> Seq end).
+max_seq(Tree) ->
+    FoldFun = fun({_Pos, _Key}, Value, _Type, MaxOldSeq) ->
+        case Value of
+            {_Deleted, _DiskPos, OldTreeSeq} ->
+                % Older versions didn't track data sizes.
+                erlang:max(MaxOldSeq, OldTreeSeq);
+            {_Deleted, _DiskPos, OldTreeSeq, _Size} ->
+                erlang:max(MaxOldSeq, OldTreeSeq);
+            _ ->
+                MaxOldSeq
+        end
+    end,
+    couch_key_tree:fold(FoldFun, 0, Tree).
 to_doc_info_path(#full_doc_info{id=Id,rev_tree=Tree}) ->
     RevInfosAndPath = [
@@ -342,7 +351,7 @@ to_doc_info_path(#full_doc_info{id=Id,re
         end, RevInfosAndPath),
     [{_RevInfo, WinPath}|_] = SortedRevInfosAndPath,
     RevInfos = [RevInfo || {RevInfo, _Path} <- SortedRevInfosAndPath],
-    {#doc_info{id=Id, high_seq=max_seq(RevInfos, 0), revs=RevInfos}, WinPath}.
+    {#doc_info{id=Id, high_seq=max_seq(Tree), revs=RevInfos}, WinPath}.

Modified: couchdb/trunk/src/couchdb/couch_key_tree.erl
--- couchdb/trunk/src/couchdb/couch_key_tree.erl (original)
+++ couchdb/trunk/src/couchdb/couch_key_tree.erl Fri Sep  2 04:31:10 2011
@@ -49,7 +49,7 @@
 -export([merge/3, find_missing/2, get_key_leafs/2, get_full_key_paths/2, get/2]).
 -export([get_all_leafs/1, count_leafs/1, remove_leafs/2, get_all_leafs_full/1, stem/2]).
--export([map/2, mapfold/3, map_leafs/2]).
+-export([map/2, mapfold/3, map_leafs/2, fold/3]).
@@ -320,6 +320,21 @@ count_leafs_simple([{_Key, _Value, SubTr
     count_leafs_simple(SubTree) + count_leafs_simple(RestTree).
+fold(_Fun, Acc, []) ->
+    Acc;
+fold(Fun, Acc0, [{Pos, Tree}|Rest]) ->
+    Acc1 = fold_simple(Fun, Acc0, Pos, [Tree]),
+    fold(Fun, Acc1, Rest).
+fold_simple(_Fun, Acc, _Pos, []) ->
+    Acc;
+fold_simple(Fun, Acc0, Pos, [{Key, Value, SubTree} | RestTree]) ->
+    Type = if SubTree == [] -> leaf; true -> branch end,
+    Acc1 = Fun({Pos, Key}, Value, Type, Acc0),
+    Acc2 = fold_simple(Fun, Acc1, Pos+1, SubTree),
+    fold_simple(Fun, Acc2, Pos, RestTree).
 map(_Fun, []) ->
 map(Fun, [{Pos, Tree}|Rest]) ->

View raw message