couchdb-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From vatam...@apache.org
Subject [14/50] couch-replicator commit: updated refs/heads/63012-scheduler to 27a5eae
Date Tue, 14 Mar 2017 19:26:09 GMT
Make sure jobs do not retry HTTP requests for too long

Replication HTTP requests have their own retry mechanism. If a request fails
it will be individually retried a few times in a row without crashing the
whole replication job. This can help with short / intermittent network failures.

However, longer retries in that part of the code will interact unfavorably
with the scheduler, as they make the job seem to run without crashing long enough
for the scheduler to consider it to be "healthy", when in reality the job might
have been wasting all that time retrying without making any real progress.

To fix, record the first time the job starts crashing in the #httpdb{} record
which is passed recursively between retry attempts. If a request has been retrying
for close to the current scheduler health threshold value, stop retrying and
crash the whole job. This ensures the scheduler will register the job as crashing
consecutively and will back it off as intended.


Project: http://git-wip-us.apache.org/repos/asf/couchdb-couch-replicator/repo
Commit: http://git-wip-us.apache.org/repos/asf/couchdb-couch-replicator/commit/c40b5c21
Tree: http://git-wip-us.apache.org/repos/asf/couchdb-couch-replicator/tree/c40b5c21
Diff: http://git-wip-us.apache.org/repos/asf/couchdb-couch-replicator/diff/c40b5c21

Branch: refs/heads/63012-scheduler
Commit: c40b5c21760e45ef91df93e73c56327c7605b236
Parents: 9584fcd
Author: Nick Vatamaniuc <vatamane@apache.org>
Authored: Fri Oct 28 01:32:28 2016 -0400
Committer: Nick Vatamaniuc <vatamane@apache.org>
Committed: Fri Oct 28 11:28:59 2016 -0400

----------------------------------------------------------------------
 src/couch_replicator_api_wrap.hrl |  3 ++-
 src/couch_replicator_httpc.erl    | 44 +++++++++++++++++++++++++++++-----
 2 files changed, 40 insertions(+), 7 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/couchdb-couch-replicator/blob/c40b5c21/src/couch_replicator_api_wrap.hrl
----------------------------------------------------------------------
diff --git a/src/couch_replicator_api_wrap.hrl b/src/couch_replicator_api_wrap.hrl
index eee04da..d15d214 100644
--- a/src/couch_replicator_api_wrap.hrl
+++ b/src/couch_replicator_api_wrap.hrl
@@ -24,7 +24,8 @@
     retries = 10,
     wait = 250,         % milliseconds
     httpc_pool = nil,
-    http_connections
+    http_connections,
+    first_error_timestamp = nil
 }).
 
 -record(oauth, {

http://git-wip-us.apache.org/repos/asf/couchdb-couch-replicator/blob/c40b5c21/src/couch_replicator_httpc.erl
----------------------------------------------------------------------
diff --git a/src/couch_replicator_httpc.erl b/src/couch_replicator_httpc.erl
index 4606e99..5fb7842 100644
--- a/src/couch_replicator_httpc.erl
+++ b/src/couch_replicator_httpc.erl
@@ -274,13 +274,45 @@ discard_message(ReqId, Worker, Count) ->
 maybe_retry(Error, Worker, #httpdb{retries = 0} = HttpDb, Params) ->
     report_error(Worker, HttpDb, Params, {error, Error});
 
-maybe_retry(Error, _Worker, #httpdb{retries = Retries, wait = Wait} = HttpDb,
+maybe_retry(Error, Worker, #httpdb{retries = Retries, wait = Wait} = HttpDb,
     Params) ->
-    ok = timer:sleep(Wait),
-    log_retry_error(Params, HttpDb, Wait, Error),
-    Wait2 = erlang:min(Wait * 2, ?MAX_WAIT),
-    NewHttpDb = HttpDb#httpdb{retries = Retries - 1, wait = Wait2},
-    throw({retry, NewHttpDb, Params}).
+    case total_error_time_exceeded(HttpDb) of
+        true ->
+            report_error(Worker, HttpDb, Params, {error, Error});
+        false ->
+            ok = timer:sleep(Wait),
+            log_retry_error(Params, HttpDb, Wait, Error),
+            Wait2 = erlang:min(Wait * 2, ?MAX_WAIT),
+            HttpDb1 = HttpDb#httpdb{retries = Retries - 1, wait = Wait2},
+            HttpDb2 = update_first_error_timestamp(HttpDb1),
+            throw({retry, HttpDb2, Params})
+    end.
+
+
+% When retrying, check to make sure the total time spent retrying a request is
+% below the current scheduler health threshold. The goal is to not exceed the
+% threshold, otherwise a job which keeps retrying for too long will still be
+% considered healthy.
+total_error_time_exceeded(#httpdb{first_error_timestamp = nil}) ->
+    false;
+
+total_error_time_exceeded(#httpdb{first_error_timestamp = ErrorTimestamp}) ->
+    HealthThresholdSec = couch_replicator_scheduler:health_threshold(),
+    % Threshold value is halved because in the calling code the next step
+    % is a doubling. Not halving here could mean sleeping too long and
+    % exceeding the health threshold.
+    ThresholdUSec = (HealthThresholdSec / 2) * 1000000,
+    timer:now_diff(os:timestamp(), ErrorTimestamp) > ThresholdUSec.
+
+
+% Remember the first time an error occurs. This value is used later to check
+% the total time spent retrying a request. Because retrying is recursive, on
+% a successful result the #httpdb{} record is reset back to the original value.
+update_first_error_timestamp(#httpdb{first_error_timestamp = nil} = HttpDb) ->
+    HttpDb#httpdb{first_error_timestamp = os:timestamp()};
+
+update_first_error_timestamp(HttpDb) ->
+    HttpDb.
 
 
 log_retry_error(Params, HttpDb, Wait, Error) ->


Mime
View raw message