couchdb-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From rnew...@apache.org
Subject [38/49] fabric commit: updated refs/heads/windsor-merge to b1c0030
Date Fri, 01 Aug 2014 14:34:59 GMT
Consistently log fabric worker timeouts

Write a log line for each worker that did not return a response
when a fabric request times out. The format of the log line is:

    fabric_worker_timeout ENDPOINT,NODE,SHARD_NAME

This is intended to be easily consumable by downstream tools
(e.g., Splunk).

BugzID: 26984


Project: http://git-wip-us.apache.org/repos/asf/couchdb-fabric/repo
Commit: http://git-wip-us.apache.org/repos/asf/couchdb-fabric/commit/b1c0030f
Tree: http://git-wip-us.apache.org/repos/asf/couchdb-fabric/tree/b1c0030f
Diff: http://git-wip-us.apache.org/repos/asf/couchdb-fabric/diff/b1c0030f

Branch: refs/heads/windsor-merge
Commit: b1c0030fa3d960e9ea6e3109204ffee185459b8e
Parents: bfaa56b
Author: Mike Wallace <mikewallace1979@googlemail.com>
Authored: Sat Jan 11 22:11:20 2014 +0000
Committer: Robert Newson <rnewson@apache.org>
Committed: Fri Aug 1 15:33:43 2014 +0100

----------------------------------------------------------------------
 include/fabric.hrl              |  6 ++++++
 src/fabric_db_create.erl        | 10 +++++++++-
 src/fabric_db_delete.erl        |  7 ++++++-
 src/fabric_db_doc_count.erl     |  9 +++++++--
 src/fabric_db_info.erl          | 10 ++++++++++
 src/fabric_db_meta.erl          | 24 ++++++++++++++++++------
 src/fabric_dict.erl             |  3 +++
 src/fabric_doc_missing_revs.erl | 11 +++++++++--
 src/fabric_doc_open.erl         |  3 +++
 src/fabric_doc_open_revs.erl    |  3 +++
 src/fabric_doc_update.erl       |  4 +++-
 src/fabric_group_info.erl       |  9 +++++++--
 src/fabric_util.erl             | 16 ++++++++++------
 src/fabric_view_all_docs.erl    |  9 ++++++++-
 src/fabric_view_changes.erl     | 10 ++++++++++
 src/fabric_view_map.erl         | 10 +++++++++-
 src/fabric_view_reduce.erl      | 10 +++++++++-
 17 files changed, 130 insertions(+), 24 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/couchdb-fabric/blob/b1c0030f/include/fabric.hrl
----------------------------------------------------------------------
diff --git a/include/fabric.hrl b/include/fabric.hrl
index 94769bd..abbc4ad 100644
--- a/include/fabric.hrl
+++ b/include/fabric.hrl
@@ -32,5 +32,11 @@
     user_acc
 }).
 
+-record(stream_acc, {
+    workers,
+    start_fun,
+    replacements
+}).
+
 -record(view_row, {key, id, value, doc, worker}).
 -record(change, {key, id, value, deleted=false, doc, worker}).

http://git-wip-us.apache.org/repos/asf/couchdb-fabric/blob/b1c0030f/src/fabric_db_create.erl
----------------------------------------------------------------------
diff --git a/src/fabric_db_create.erl b/src/fabric_db_create.erl
index 5a8334f..8b9d32a 100644
--- a/src/fabric_db_create.erl
+++ b/src/fabric_db_create.erl
@@ -73,6 +73,9 @@ create_shard_files(Shards) ->
     try fabric_util:recv(Workers, #shard.ref, fun handle_message/3, Workers) of
     {error, file_exists} ->
         file_exists;
+    {timeout, DefunctWorkers} ->
+        fabric_util:log_timeout(DefunctWorkers, "create_db"),
+        {error, timeout};
     _ ->
         ok
     after
@@ -104,7 +107,12 @@ create_shard_db_doc(Doc) ->
     Workers = fabric_util:submit_jobs(Shards, create_shard_db_doc, [Doc]),
     Acc0 = {length(Shards), fabric_dict:init(Workers, nil)},
     try fabric_util:recv(Workers, #shard.ref, fun handle_db_update/3, Acc0) of
-    {timeout, _} ->
+    {timeout, {_, WorkersDict}} ->
+        DefunctWorkers = fabric_util:remove_done_workers(WorkersDict, nil),
+        fabric_util:log_timeout(
+            DefunctWorkers,
+            "create_shard_db_doc"
+        ),
         {error, timeout};
     Else ->
         Else

http://git-wip-us.apache.org/repos/asf/couchdb-fabric/blob/b1c0030f/src/fabric_db_delete.erl
----------------------------------------------------------------------
diff --git a/src/fabric_db_delete.erl b/src/fabric_db_delete.erl
index 2087f44..9ba55fb 100644
--- a/src/fabric_db_delete.erl
+++ b/src/fabric_db_delete.erl
@@ -43,7 +43,12 @@ delete_shard_db_doc(Doc) ->
     Workers = fabric_util:submit_jobs(Shards, delete_shard_db_doc, [Doc]),
     Acc0 = {length(Shards), fabric_dict:init(Workers, nil)},
     try fabric_util:recv(Workers, #shard.ref, fun handle_db_update/3, Acc0) of
-    {timeout, _} ->
+    {timeout, {_, WorkersDict}} ->
+        DefunctWorkers = fabric_util:remove_done_workers(WorkersDict, nil),
+        fabric_util:log_timeout(
+            DefunctWorkers,
+            "delete_shard_db_doc"
+        ),
         {error, timeout};
     Else ->
         Else

http://git-wip-us.apache.org/repos/asf/couchdb-fabric/blob/b1c0030f/src/fabric_db_doc_count.erl
----------------------------------------------------------------------
diff --git a/src/fabric_db_doc_count.erl b/src/fabric_db_doc_count.erl
index dcc32aa..a0fd3ec 100644
--- a/src/fabric_db_doc_count.erl
+++ b/src/fabric_db_doc_count.erl
@@ -23,8 +23,13 @@ go(DbName) ->
     Workers = fabric_util:submit_jobs(Shards, get_doc_count, []),
     RexiMon = fabric_util:create_monitors(Shards),
     Acc0 = {fabric_dict:init(Workers, nil), 0},
-    try
-        fabric_util:recv(Workers, #shard.ref, fun handle_message/3, Acc0)
+    try fabric_util:recv(Workers, #shard.ref, fun handle_message/3, Acc0) of
+    {timeout, {WorkersDict, _}} ->
+        DefunctWorkers = fabric_util:remove_done_workers(WorkersDict, nil),
+        fabric_util:log_timeout(DefunctWorkers, "get_doc_count"),
+        {error, timeout};
+    Else ->
+        Else
     after
         rexi_monitor:stop(RexiMon)
     end.

http://git-wip-us.apache.org/repos/asf/couchdb-fabric/blob/b1c0030f/src/fabric_db_info.erl
----------------------------------------------------------------------
diff --git a/src/fabric_db_info.erl b/src/fabric_db_info.erl
index de88632..8a41cde 100644
--- a/src/fabric_db_info.erl
+++ b/src/fabric_db_info.erl
@@ -26,6 +26,16 @@ go(DbName) ->
     try
         case fabric_util:recv(Workers, #shard.ref, Fun, Acc0) of
             {ok, Acc} -> {ok, Acc};
+            {timeout, {WorkersDict, _}} ->
+                DefunctWorkers = fabric_util:remove_done_workers(
+                    WorkersDict,
+                    nil
+                ),
+                fabric_util:log_timeout(
+                    DefunctWorkers,
+                    "get_db_info"
+                ),
+                {error, timeout};
             {error, Error} -> throw(Error)
         end
     after

http://git-wip-us.apache.org/repos/asf/couchdb-fabric/blob/b1c0030f/src/fabric_db_meta.erl
----------------------------------------------------------------------
diff --git a/src/fabric_db_meta.erl b/src/fabric_db_meta.erl
index 2d5ba8d..1550062 100644
--- a/src/fabric_db_meta.erl
+++ b/src/fabric_db_meta.erl
@@ -29,19 +29,22 @@ set_revs_limit(DbName, Limit, Options) ->
     Shards = mem3:shards(DbName),
     Workers = fabric_util:submit_jobs(Shards, set_revs_limit, [Limit, Options]),
     Handler = fun handle_revs_message/3,
-    Waiting = length(Workers) - 1,
-    case fabric_util:recv(Workers, #shard.ref, Handler, Waiting) of
+    Acc0 = {Workers, length(Workers) - 1},
+    case fabric_util:recv(Workers, #shard.ref, Handler, Acc0) of
     {ok, ok} ->
         ok;
+    {timeout, {DefunctWorkers, _}} ->
+        fabric_util:log_timeout(DefunctWorkers, "set_revs_limit"),
+        {error, timeout};
     Error ->
         Error
     end.
 
-handle_revs_message(ok, _, 0) ->
+handle_revs_message(ok, _, {_Workers, 0}) ->
     {stop, ok};
-handle_revs_message(ok, _, Waiting) ->
-    {ok, Waiting - 1};
-handle_revs_message(Error, _, _Waiting) ->
+handle_revs_message(ok, Worker, {Workers, Waiting}) ->
+    {ok, {lists:delete(Worker, Workers), Waiting - 1}};
+handle_revs_message(Error, _, _Acc) ->
     {error, Error}.
 
 
@@ -61,6 +64,9 @@ set_security(DbName, SecObj, Options) ->
             ok -> ok;
             Error -> Error
         end;
+    {timeout, #acc{workers=DefunctWorkers}} ->
+        fabric_util:log_timeout(DefunctWorkers, "set_security"),
+        {error, timeout};
     Error ->
         Error
     after
@@ -133,6 +139,12 @@ get_all_security(DbName, Options) ->
         {ok, SecObjs};
     {ok, _} ->
         {error, no_majority};
+    {timeout, #acc{workers=DefunctWorkers}} ->
+        fabric_util:log_timeout(
+            DefunctWorkers,
+            "get_all_security"
+        ),
+        {error, timeout};
     Error ->
         Error
     after

http://git-wip-us.apache.org/repos/asf/couchdb-fabric/blob/b1c0030f/src/fabric_dict.erl
----------------------------------------------------------------------
diff --git a/src/fabric_dict.erl b/src/fabric_dict.erl
index f88ca97..ec2e25c 100644
--- a/src/fabric_dict.erl
+++ b/src/fabric_dict.erl
@@ -52,3 +52,6 @@ filter(Fun, Dict) ->
 
 fold(Fun, Acc0, Dict) ->
     orddict:fold(Fun, Acc0, Dict).
+
+to_list(Dict) ->
+    orddict:to_list(Dict).

http://git-wip-us.apache.org/repos/asf/couchdb-fabric/blob/b1c0030f/src/fabric_doc_missing_revs.erl
----------------------------------------------------------------------
diff --git a/src/fabric_doc_missing_revs.erl b/src/fabric_doc_missing_revs.erl
index ec154ee..1687111 100644
--- a/src/fabric_doc_missing_revs.erl
+++ b/src/fabric_doc_missing_revs.erl
@@ -29,8 +29,15 @@ go(DbName, AllIdsRevs, Options) ->
     ResultDict = dict:from_list([{Id, {{nil,Revs},[]}} || {Id, Revs} <- AllIdsRevs]),
     RexiMon = fabric_util:create_monitors(Workers),
     Acc0 = {length(Workers), ResultDict, Workers},
-    try
-        fabric_util:recv(Workers, #shard.ref, fun handle_message/3, Acc0)
+    try fabric_util:recv(Workers, #shard.ref, fun handle_message/3, Acc0) of
+    {timeout, {_, _, DefunctWorkers}} ->
+        fabric_util:log_timeout(
+            DefunctWorkers,
+            "get_missing_revs"
+        ),
+        {error, timeout};
+    Else ->
+        Else
     after
         rexi_monitor:stop(RexiMon)
     end.

http://git-wip-us.apache.org/repos/asf/couchdb-fabric/blob/b1c0030f/src/fabric_doc_open.erl
----------------------------------------------------------------------
diff --git a/src/fabric_doc_open.erl b/src/fabric_doc_open.erl
index 26bd2a3..1805c80 100644
--- a/src/fabric_doc_open.erl
+++ b/src/fabric_doc_open.erl
@@ -47,6 +47,9 @@ go(DbName, Id, Options) ->
     {ok, #acc{}=Acc} ->
         Reply = handle_response(Acc),
         format_reply(Reply, SuppressDeletedDoc);
+    {timeout, #acc{workers=DefunctWorkers}} ->
+        fabric_util:log_timeout(DefunctWorkers, "open_doc"),
+        {error, timeout};
     Error ->
         Error
     after

http://git-wip-us.apache.org/repos/asf/couchdb-fabric/blob/b1c0030f/src/fabric_doc_open_revs.erl
----------------------------------------------------------------------
diff --git a/src/fabric_doc_open_revs.erl b/src/fabric_doc_open_revs.erl
index 31d7616..b24924d 100644
--- a/src/fabric_doc_open_revs.erl
+++ b/src/fabric_doc_open_revs.erl
@@ -47,6 +47,9 @@ go(DbName, Id, Revs, Options) ->
     try fabric_util:recv(Workers, #shard.ref, fun handle_message/3, State) of
     {ok, {ok, Reply}} ->
         {ok, Reply};
+    {timeout, #state{workers=DefunctWorkers}} ->
+        fabric_util:log_timeout(DefunctWorkers, "open_revs"),
+        {error, timeout};
     Else ->
         Else
     after

http://git-wip-us.apache.org/repos/asf/couchdb-fabric/blob/b1c0030f/src/fabric_doc_update.erl
----------------------------------------------------------------------
diff --git a/src/fabric_doc_update.erl b/src/fabric_doc_update.erl
index 9e2ce50..da8eeed 100644
--- a/src/fabric_doc_update.erl
+++ b/src/fabric_doc_update.erl
@@ -39,7 +39,9 @@ go(DbName, AllDocs0, Opts) ->
     {ok, {Health, Results}} when Health =:= ok; Health =:= accepted ->
         {Health, [R || R <- couch_util:reorder_results(AllDocs, Results), R =/= noreply]};
     {timeout, Acc} ->
-        {_, _, W1, _, DocReplDict} = Acc,
+        {_, _, W1, GroupedDocs1, DocReplDict} = Acc,
+        {DefunctWorkers, _} = lists:unzip(GroupedDocs1),
+        fabric_util:log_timeout(DefunctWorkers, "update_docs"),
         {Health, _, Resp} = dict:fold(fun force_reply/3, {ok, W1, []},
             DocReplDict),
         {Health, [R || R <- couch_util:reorder_results(AllDocs, Resp), R =/= noreply]};

http://git-wip-us.apache.org/repos/asf/couchdb-fabric/blob/b1c0030f/src/fabric_group_info.erl
----------------------------------------------------------------------
diff --git a/src/fabric_group_info.erl b/src/fabric_group_info.erl
index 5325f76..b5ee2b2 100644
--- a/src/fabric_group_info.erl
+++ b/src/fabric_group_info.erl
@@ -27,8 +27,13 @@ go(DbName, #doc{id=DDocId}) ->
     Workers = fabric_util:submit_jobs(Shards, group_info, [DDocId]),
     RexiMon = fabric_util:create_monitors(Shards),
     Acc0 = {fabric_dict:init(Workers, nil), []},
-    try
-        fabric_util:recv(Workers, #shard.ref, fun handle_message/3, Acc0)
+    try fabric_util:recv(Workers, #shard.ref, fun handle_message/3, Acc0) of
+    {timeout, {WorkersDict, _}} ->
+        DefunctWorkers = fabric_util:remove_done_workers(WorkersDict, nil),
+        fabric_util:log_timeout(DefunctWorkers, "group_info"),
+        {error, timeout};
+    Else ->
+        Else
     after
         rexi_monitor:stop(RexiMon)
     end.

http://git-wip-us.apache.org/repos/asf/couchdb-fabric/blob/b1c0030f/src/fabric_util.erl
----------------------------------------------------------------------
diff --git a/src/fabric_util.erl b/src/fabric_util.erl
index 5f59725..18ff578 100644
--- a/src/fabric_util.erl
+++ b/src/fabric_util.erl
@@ -17,6 +17,7 @@
         remove_down_workers/2, doc_id_and_rev/1]).
 -export([request_timeout/0, attachments_timeout/0, all_docs_timeout/0]).
 -export([stream_start/2, stream_start/4]).
+-export([log_timeout/2, remove_done_workers/2]).
 
 -compile({inline, [{doc_id_and_rev,1}]}).
 
@@ -25,12 +26,6 @@
 -include_lib("couch/include/couch_db.hrl").
 -include_lib("eunit/include/eunit.hrl").
 
--record(stream_acc, {
-    workers,
-    start_fun,
-    replacements
-}).
-
 remove_down_workers(Workers, BadNode) ->
     Filter = fun(#shard{node = Node}, _) -> Node =/= BadNode end,
     NewWorkers = fabric_dict:filter(Filter, Workers),
@@ -155,6 +150,15 @@ timeout(Type, Default) ->
         N -> list_to_integer(N)
     end.
 
+log_timeout(Workers, EndPoint) ->
+    lists:map(fun(#shard{node=Dest, name=Name}) ->
+        Fmt = "fabric_worker_timeout ~s,~p,~p",
+        ?LOG_ERROR(Fmt, [EndPoint, Dest, Name])
+    end, Workers).
+
+remove_done_workers(Workers, WaitingIndicator) ->
+    [W || {W, WI} <- fabric_dict:to_list(Workers), WI == WaitingIndicator].
+
 get_db(DbName) ->
     get_db(DbName, []).
 

http://git-wip-us.apache.org/repos/asf/couchdb-fabric/blob/b1c0030f/src/fabric_view_all_docs.erl
----------------------------------------------------------------------
diff --git a/src/fabric_view_all_docs.erl b/src/fabric_view_all_docs.erl
index d4dec09..e7cc67c 100644
--- a/src/fabric_view_all_docs.erl
+++ b/src/fabric_view_all_docs.erl
@@ -33,7 +33,14 @@ go(DbName, Options, #mrargs{keys=undefined} = QueryArgs, Callback, Acc)
->
                 after
                     fabric_util:cleanup(Workers)
                 end;
-            {timeout, _} ->
+            {timeout, NewState} ->
+                DefunctWorkers = fabric_util:remove_done_workers(
+                    NewState#stream_acc.workers, waiting
+                ),
+                fabric_util:log_timeout(
+                    DefunctWorkers,
+                    "all_docs"
+                ),
                 Callback({error, timeout}, Acc);
             {error, Error} ->
                 Callback({error, Error}, Acc)

http://git-wip-us.apache.org/repos/asf/couchdb-fabric/blob/b1c0030f/src/fabric_view_changes.erl
----------------------------------------------------------------------
diff --git a/src/fabric_view_changes.erl b/src/fabric_view_changes.erl
index 34670c9..44639d9 100644
--- a/src/fabric_view_changes.erl
+++ b/src/fabric_view_changes.erl
@@ -174,6 +174,16 @@ send_changes(DbName, ChangesArgs, Callback, PackedSeqs, AccIn, Timeout)
->
                 after
                     fabric_util:cleanup(Workers)
                 end;
+            {timeout, NewState} ->
+                DefunctWorkers = fabric_util:remove_done_workers(
+                    NewState#stream_acc.workers,
+                    waiting
+                ),
+                fabric_util:log_timeout(
+                    DefunctWorkers,
+                    "changes"
+                ),
+                throw({error, timeout});
             {error, Reason} ->
                 throw({error, Reason});
             Else ->

http://git-wip-us.apache.org/repos/asf/couchdb-fabric/blob/b1c0030f/src/fabric_view_map.erl
----------------------------------------------------------------------
diff --git a/src/fabric_view_map.erl b/src/fabric_view_map.erl
index 1201daf..1977888 100644
--- a/src/fabric_view_map.erl
+++ b/src/fabric_view_map.erl
@@ -40,7 +40,15 @@ go(DbName, DDoc, View, Args, Callback, Acc) ->
                 after
                     fabric_util:cleanup(Workers)
                 end;
-            {timeout, _} ->
+            {timeout, NewState} ->
+                DefunctWorkers = fabric_util:remove_done_workers(
+                    NewState#stream_acc.workers,
+                    waiting
+                ),
+                fabric_util:log_timeout(
+                    DefunctWorkers,
+                    "map_view"
+                ),
                 Callback({error, timeout}, Acc);
             {error, Error} ->
                 Callback({error, Error}, Acc)

http://git-wip-us.apache.org/repos/asf/couchdb-fabric/blob/b1c0030f/src/fabric_view_reduce.erl
----------------------------------------------------------------------
diff --git a/src/fabric_view_reduce.erl b/src/fabric_view_reduce.erl
index 2e0d1f2..583c8ff 100644
--- a/src/fabric_view_reduce.erl
+++ b/src/fabric_view_reduce.erl
@@ -41,7 +41,15 @@ go(DbName, DDoc, VName, Args, Callback, Acc, {red, {_, Lang, _}, _}=VInfo)
->
                 after
                     fabric_util:cleanup(Workers)
                 end;
-            {timeout, _} ->
+            {timeout, NewState} ->
+                DefunctWorkers = fabric_util:remove_done_workers(
+                    NewState#stream_acc.workers,
+                    waiting
+                ),
+                fabric_util:log_timeout(
+                    DefunctWorkers,
+                    "reduce_view"
+                ),
                 Callback({error, timeout}, Acc);
             {error, Error} ->
                 Callback({error, Error}, Acc)


Mime
View raw message