Received: (from majordom@localhost) by hyperreal.org (8.8.5/8.8.5) id NAA04667; Thu, 11 Sep 1997 13:02:18 -0700 (PDT) Received: from twinlark.arctic.org (twinlark.arctic.org [204.62.130.91]) by hyperreal.org (8.8.5/8.8.5) with SMTP id NAA04642 for ; Thu, 11 Sep 1997 13:02:14 -0700 (PDT) Received: (qmail 20097 invoked by uid 500); 11 Sep 1997 20:03:33 -0000 Date: Thu, 11 Sep 1997 13:03:33 -0700 (PDT) From: Dean Gaudet To: new-httpd@apache.org Subject: Re: [PATCH] scoreboard fixes (take 2) In-Reply-To: Message-ID: MIME-Version: 1.0 Content-Type: TEXT/PLAIN; charset=US-ASCII Sender: new-httpd-owner@apache.org Precedence: bulk Reply-To: new-httpd@apache.org Alexei just committed a win32 fix that conflicts with this patch. Here's an update. Dean Index: CHANGES =================================================================== RCS file: /export/home/cvs/apachen/src/CHANGES,v retrieving revision 1.432 diff -u -r1.432 CHANGES --- CHANGES 1997/09/10 17:43:17 1.432 +++ CHANGES 1997/09/11 20:00:39 @@ -1,5 +1,12 @@ Changes with Apache 1.3b1 + *) API: the short_score record has been split into two pieces, one which + the parent writes on, and one which the child writes on. As part of + this change the get_scoreboard_info() function was removed, and + scoreboard_image was exported. This change fixes a race condition + in filebased scoreboard systems, and speeds up changes involving the + scoreboard in earlier 1.3 development. [Dean Gaudet] + *) API: New function child_terminate() triggers the child process to exit, while allowing the child finish what it needs to for the current request first. Index: main/http_main.c =================================================================== RCS file: /export/home/cvs/apachen/src/main/http_main.c,v retrieving revision 1.217 diff -u -r1.217 http_main.c --- http_main.c 1997/09/11 19:32:43 1.217 +++ http_main.c 1997/09/11 20:00:43 @@ -242,7 +242,7 @@ static int my_child_num; #endif -static scoreboard *scoreboard_image = NULL; +scoreboard *scoreboard_image = NULL; /* small utility macros to make things easier to read */ @@ -1424,7 +1424,7 @@ * anyway. */ -API_EXPORT(void) sync_scoreboard_image (void) +inline void sync_scoreboard_image (void) { #ifdef SCOREBOARD_FILE lseek (scoreboard_fd, 0L, 0); @@ -1434,7 +1434,7 @@ #endif /* MULTITHREAD */ -API_EXPORT(int) exists_scoreboard_image (void) +int exists_scoreboard_image (void) { return (scoreboard_image ? 1 : 0); } @@ -1459,17 +1459,14 @@ sync_scoreboard_image(); ss = &scoreboard_image->servers[child_num]; old_status = ss->status; - ss->x.pid = my_pid; ss->status = status; #ifdef OPTIMIZE_TIMEOUTS ++ss->cur_vtime; #endif #if defined(STATUS) -#ifdef OPTIMIZE_TIMEOUTS - ss->last_used = ss->last_rtime; /* close enough */ -#else - ss->last_used=time(NULL); +#ifndef OPTIMIZE_TIMEOUTS + ss->last_used = time(NULL); #endif if (status == SERVER_READY || status == SERVER_DEAD) { /* @@ -1514,11 +1511,6 @@ #endif } -API_EXPORT(short_score) get_scoreboard_info(int i) -{ - return (scoreboard_image->servers[i]); -} - #if defined(STATUS) void time_process_request (int child_num, int status) { @@ -1595,7 +1587,7 @@ int i; for (i = 0; i < max_daemons_limit; ++i) - if (scoreboard_image->servers[i].x.pid == pid) + if (scoreboard_image->parent[i].pid == pid) return i; return -1; @@ -1608,7 +1600,7 @@ sync_scoreboard_image(); for (i = 0; i < max_daemons_limit; ++i) { - int pid = scoreboard_image->servers[i].x.pid; + int pid = scoreboard_image->parent[i].pid; if (pid != my_pid && pid != 0) { int waitret = 0, @@ -1687,7 +1679,7 @@ for (n = 0; n < max_daemons_limit; ++n) { if (scoreboard_image->servers[n].status != SERVER_DEAD - && waitpid (scoreboard_image->servers[n].x.pid, &status, WNOHANG) + && waitpid (scoreboard_image->parent[n].pid, &status, WNOHANG) == -1 && errno == ECHILD) { sync_scoreboard_image (); @@ -1720,7 +1712,7 @@ if(scoreboard_image->servers[pi].status != SERVER_DEAD) { e[hi] = pi; - h[hi++] = (HANDLE)scoreboard_image->servers[pi].x.pid; + h[hi++] = (HANDLE)scoreboard_image->parent[pi].pid; } } @@ -1730,9 +1722,9 @@ if(rv == -1) err = GetLastError(); if((WAIT_OBJECT_0 <= (unsigned int)rv) && ((unsigned int)rv < (WAIT_OBJECT_0 + hi))) - return(scoreboard_image->servers[e[rv - WAIT_OBJECT_0]].x.pid); + return(scoreboard_image->parent[e[rv - WAIT_OBJECT_0]].pid); else if((WAIT_ABANDONED_0 <= (unsigned int)rv) && ((unsigned int)rv < (WAIT_ABANDONED_0 + hi))) - return(scoreboard_image->servers[e[rv - WAIT_ABANDONED_0]].x.pid); + return(scoreboard_image->parent[e[rv - WAIT_ABANDONED_0]].pid); } } @@ -2897,7 +2889,7 @@ } } -static int make_child(server_rec *s, int slot) +static int make_child(server_rec *s, int slot, time_t now) { int pid; @@ -2944,14 +2936,15 @@ child_main (slot); } - /* If the parent proceeds with a restart before the child has written - * their pid into the scoreboard we'll end up "forgetting" about the - * child. So we write the child pid into the scoreboard now. (This - * is safe, because the child is going to be writing the same value - * to the same word.) - * XXX: this needs to be sync'd to disk in the non shared memory stuff - */ - scoreboard_image->servers[slot].x.pid = pid; +#ifdef OPTIMIZE_TIMEOUTS + scoreboard_image->parent[slot].last_rtime = now; +#endif + scoreboard_image->parent[slot].pid = pid; +#ifdef SCOREBOARD_FILE + lseek(scoreboard_fd, XtOffsetOf(scoreboard, parent[slot]), 0); + force_write(scoreboard_fd, &scoreboard_image->parent[slot], + sizeof(parent_score)); +#endif return 0; } @@ -2961,12 +2954,13 @@ static void startup_children (int number_to_start) { int i; + time_t now = time(0); for (i = 0; number_to_start && i < daemons_limit; ++i ) { if (scoreboard_image->servers[i].status != SERVER_DEAD) { continue; } - if (make_child (server_conf, i) < 0) { + if (make_child (server_conf, i, now) < 0) { break; } --number_to_start; @@ -2991,26 +2985,30 @@ int i; int to_kill; int idle_count; - int free_head; - int *free_ptr; - int free_length; short_score *ss; -#ifdef OPTIMIZE_TIMEOUTS time_t now = time(0); -#endif + int free_length; + int free_slots[MAX_SPAWN_RATE]; + int last_non_dead; /* initialize the free_list */ - free_head = -1; - free_ptr = &free_head; free_length = 0; to_kill = -1; idle_count = 0; + last_non_dead = -1; sync_scoreboard_image (); for (i = 0; i < daemons_limit; ++i) { + if (i >= max_daemons_limit && free_length == idle_spawn_rate) break; ss = &scoreboard_image->servers[i]; switch (ss->status) { + /* We consider a starting server as idle because we started it + * at least a cycle ago, and if it still hasn't finished starting + * then we're just going to swamp things worse by forking more. + * So we hopefully won't need to fork more if we count it. + */ + case SERVER_STARTING: case SERVER_READY: ++idle_count; /* always kill the highest numbered child if we have to... @@ -3024,39 +3022,43 @@ case SERVER_DEAD: /* try to keep children numbers as low as possible */ if (free_length < idle_spawn_rate) { - *free_ptr = i; - free_ptr = &scoreboard_image->servers[i].x.free_list; + free_slots[free_length] = i; ++free_length; } break; } + if (ss->status != SERVER_DEAD) { + last_non_dead = i; #ifdef OPTIMIZE_TIMEOUTS - if (ss->status != SERVER_DEAD && ss->timeout_len) { - /* if it's a live server, with a live timeout then start checking - * its timeout */ - if (ss->cur_vtime != ss->last_vtime) { - /* it has made progress, so update its last_rtime, last_vtime */ - ss->last_rtime = now; - ss->last_vtime = ss->cur_vtime; - } else if (ss->last_rtime + ss->timeout_len < now) { - /* no progress, and the timeout length has been exceeded */ - ss->timeout_len = 0; - kill (ss->x.pid, SIGALRM); + if (ss->timeout_len) { + /* if it's a live server, with a live timeout then + * start checking its timeout */ + parent_score *ps = &scoreboard_image->parent[i]; + if (ss->cur_vtime != ps->last_vtime) { + /* it has made progress, so update its last_rtime, + * last_vtime */ + ps->last_rtime = now; + ps->last_vtime = ss->cur_vtime; + } else if (ps->last_rtime + ss->timeout_len < now) { + /* no progress, and the timeout length has been exceeded */ + ss->timeout_len = 0; + kill (ps->pid, SIGALRM); + } } - } #endif + } } + max_daemons_limit = last_non_dead + 1; if (idle_count > daemons_max_free) { /* kill off one child... we use SIGUSR1 because that'll cause it to * shut down gracefully, in case it happened to pick up a request * while we were counting */ - kill (scoreboard_image->servers[to_kill].x.pid, SIGUSR1); + kill (scoreboard_image->parent[to_kill].pid, SIGUSR1); idle_spawn_rate = 1; } else if (idle_count < daemons_min_free) { /* terminate the free list */ - *free_ptr = -1; - if (free_head == -1) { + if (free_length == 0) { /* only report this condition once */ static int reported = 0; @@ -3066,6 +3068,7 @@ " raising the MaxClients setting"); reported = 1; } + idle_spawn_rate = 1; } else { if (idle_spawn_rate >= 4) { aplog_error(APLOG_MARK, APLOG_ERR, server_conf, @@ -3073,12 +3076,8 @@ "to increase StartServers, or Min/MaxSpareServers)", idle_spawn_rate); } - i = 0; - while (i < idle_spawn_rate && free_head != -1) { - int slot = free_head; - free_head = scoreboard_image->servers[free_head].x.free_list; - make_child (server_conf, slot); - ++i; + for (i = 0; i < free_length; ++i) { + make_child (server_conf, free_slots[i], now); } /* the next time around we want to spawn twice as many if this * wasn't good enough, but not if we've just done a graceful @@ -3202,10 +3201,8 @@ /* we're still doing a 1-for-1 replacement of dead * children with new children */ - make_child (server_conf, child_slot); + make_child (server_conf, child_slot, time(0)); --remaining_children_to_start; - /* don't perform idle maintenance yet */ - continue; } } else if (is_graceful) { /* Great, we've probably just lost a slot in the @@ -3215,6 +3212,12 @@ aplog_error(APLOG_MARK, APLOG_WARNING, server_conf, "long lost child came home! (pid %d)", pid ); } + /* Don't perform idle maintenance when a child dies, + * only do it when there's a timeout. Remember only a + * finite number of children can die, and it's pretty + * pathological for a lot to die suddenly. + */ + continue; } else if (remaining_children_to_start) { /* we hit a 1 second timeout in which none of the previous * generation of children needed to be reaped... so assume Index: main/scoreboard.h =================================================================== RCS file: /export/home/cvs/apachen/src/main/scoreboard.h,v retrieving revision 1.29 diff -u -r1.29 scoreboard.h --- scoreboard.h 1997/09/11 19:32:44 1.29 +++ scoreboard.h 1997/09/11 20:00:43 @@ -74,6 +74,7 @@ #define SERVER_BUSY_LOG 6 /* Logging the request */ #define SERVER_BUSY_DNS 7 /* Looking up a hostname */ #define SERVER_GRACEFUL 8 /* server is gracefully finishing request */ +#define SERVER_NUM_STATUS 9 /* number of status settings */ /* A "virtual time" is simply a counter that indicates that a child is * making progress. The parent checks up on each child, and when they have @@ -90,14 +91,9 @@ */ typedef unsigned vtime_t; +/* stuff which the children generally write, and the parent mainly reads */ typedef struct { - union { - pid_t pid; /* if it's not DEAD then this is the pid */ - int free_list; /* otherwise this is scratch space */ - } x; #ifdef OPTIMIZE_TIMEOUTS - vtime_t last_vtime; /* the last vtime the parent has seen */ - time_t last_rtime; /* time(0) of the last change */ vtime_t cur_vtime; /* the child's current vtime */ unsigned short timeout_len; /* length of the timeout */ #endif @@ -119,7 +115,9 @@ #ifndef NO_TIMES struct tms times; #endif +#ifndef OPTIMIZE_TIMEOUTS time_t last_used; +#endif char client[32]; /* Keep 'em small... */ char request[64]; /* We just want an idea... */ char vhost[32]; /* What virtual host is being accessed? */ @@ -132,17 +130,28 @@ restart is required */ } global_score; +/* stuff which the parent generally writes and the children rarely read */ +typedef struct { + pid_t pid; +#ifdef OPTIMIZE_TIMEOUTS + time_t last_rtime; /* time(0) of the last change */ + vtime_t last_vtime; /* the last vtime the parent has seen */ +#endif +} parent_score; + typedef struct { short_score servers[HARD_SERVER_LIMIT]; + parent_score parent[HARD_SERVER_LIMIT]; global_score global; } scoreboard; #define SCOREBOARD_SIZE sizeof(scoreboard) -API_EXPORT(void) sync_scoreboard_image(void); -API_EXPORT(short_score) get_scoreboard_info(int x); -API_EXPORT(int) exists_scoreboard_image (void); +void sync_scoreboard_image(void); +int exists_scoreboard_image (void); + +extern scoreboard *scoreboard_image; /* for time_process_request() in http_main.c */ #define START_PREQUEST 1 Index: modules/standard/mod_log_config.c =================================================================== RCS file: /export/home/cvs/apachen/src/modules/standard/mod_log_config.c,v retrieving revision 1.36 diff -u -r1.36 mod_log_config.c --- mod_log_config.c 1997/08/23 02:59:45 1.36 +++ mod_log_config.c 1997/09/11 20:00:44 @@ -164,6 +164,7 @@ #include "httpd.h" #include "http_config.h" #include "http_core.h" /* For REMOTE_NAME */ +#include module MODULE_VAR_EXPORT config_log_module; @@ -175,6 +176,21 @@ static mode_t xfer_mode = ( S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH ); #endif +/* POSIX.1 defines PIPE_BUF as the maximum number of bytes that is + * guaranteed to be atomic when writing a pipe. And PIPE_BUF >= 512 + * is guaranteed. So we'll just guess 512 in the event the system + * doesn't have this. Now, for file writes there is actually no limit, + * the entire write is atomic. Whether all systems implement this + * correctly is another question entirely ... so we'll just use PIPE_BUF + * because it's probably a good guess as to what is implemented correctly + * everywhere. + */ +#ifdef PIPE_BUF +#define LOG_BUFSIZE PIPE_BUF +#else +#define LOG_BUFSIZE (512) +#endif + /* * multi_log_state is our per-(virtual)-server configuration. We store * an array of the logs we are going to use, each of type config_log_state. @@ -209,6 +225,10 @@ char *fname; array_header *format; int log_fd; +#ifdef BUFFERED_LOGS + int outcnt; + char outbuf[LOG_BUFSIZE]; +#endif } config_log_state; /* @@ -525,6 +545,16 @@ return cp ? cp : "-"; } +#ifdef BUFFERED_LOGS +static void flush_log (config_log_state *cls) +{ + if (cls->outcnt && cls->log_fd != -1) { + write (cls->log_fd, cls->outbuf, cls->outcnt); + cls->outcnt = 0; + } +} +#endif + static int config_log_transaction(request_rec *r, config_log_state *cls, array_header *default_format) { array_header *strsa; @@ -562,8 +592,20 @@ strcpy (s, strs[i]); s += strlen (strs[i]); } - - write(cls->log_fd, str, strlen(str)); + +#ifdef BUFFERED_LOGS + if (len + cls->outcnt > LOG_BUFSIZE) { + flush_log (cls); + } + if (len >= LOG_BUFSIZE) { + write (cls->log_fd, str, len); + } else { + memcpy (&cls->outbuf[cls->outcnt], str, len); + cls->outcnt += len; + } +#else + write (cls->log_fd, str, len); +#endif return OK; } @@ -606,7 +648,7 @@ (multi_log_state *)palloc(p, sizeof (multi_log_state)); mls->config_logs = - make_array(p, 5, sizeof (config_log_state)); + make_array(p, 1, sizeof (config_log_state)); mls->default_format = NULL; mls->server_config_logs = NULL; @@ -736,6 +778,9 @@ exit(1); } } +#ifdef BUFFERED_LOGS + cls->outcnt = 0; +#endif return cls; } @@ -784,6 +829,32 @@ for (s = s->next; s; s = s->next) open_multi_logs (s, p); } +#ifdef BUFFERED_LOGS +static void flush_all_logs (server_rec *s, pool *p) +{ + multi_log_state *mls; + array_header *log_list; + config_log_state *clsarray; + int i; + + for (; s; s = s->next) { + mls = get_module_config(s->module_config, &config_log_module); + log_list = NULL; + if (mls->config_logs->nelts) { + log_list = mls->config_logs; + } else if (mls->server_config_logs) { + log_list = mls->server_config_logs; + } + if (log_list) { + clsarray = (config_log_state *)log_list->elts; + for (i = 0; i < log_list->nelts; ++i) { + flush_log (&clsarray[i]); + } + } + } +} +#endif + module MODULE_VAR_EXPORT config_log_module = { STANDARD_MODULE_STUFF, init_config_log, /* initializer */ @@ -802,6 +873,10 @@ multi_log_transaction, /* logger */ NULL, /* header parser */ NULL, /* child_init */ - NULL, /* child_exit */ +#ifdef BUFFERED_LOGS + flush_all_logs, /* child_exit */ +#else + NULL, +#endif NULL /* post read-request */ }; Index: modules/standard/mod_status.c =================================================================== RCS file: /export/home/cvs/apachen/src/modules/standard/mod_status.c,v retrieving revision 1.62 diff -u -r1.62 mod_status.c --- mod_status.c 1997/09/11 19:32:49 1.62 +++ mod_status.c 1997/09/11 20:00:45 @@ -126,6 +126,7 @@ module MODULE_VAR_EXPORT status_module; +#ifdef STATUS /* Format the number of bytes nicely */ static void format_byte_out(request_rec *r,unsigned long bytes) @@ -149,6 +150,7 @@ else rprintf(r,"%.1f GB",(float)kbytes/MBYTE); } +#endif static void show_time(request_rec *r,time_t tsecs) { @@ -223,7 +225,8 @@ int no_table_report=0; server_rec *server = r->server; short_score score_record; - char status[]="??????????"; + parent_score ps_record; + char status[SERVER_NUM_STATUS]; char stat_buffer[HARD_SERVER_LIMIT]; clock_t tu,ts,tcu,tcs; @@ -290,9 +293,10 @@ sync_scoreboard_image(); for (i = 0; iservers[i]; + ps_record = scoreboard_image->parent[i]; res = score_record.status; - stat_buffer[i] = status[res]; + stat_buffer[i] = (res == SERVER_UNKNOWN) ? '?' : status[res]; if (res == SERVER_READY) ready++; else if (res != SERVER_DEAD && res != SERVER_UNKNOWN) @@ -300,8 +304,7 @@ #if defined(STATUS) lres = score_record.access_count; bytes= score_record.bytes_served; - if (lres!=0 || (score_record.status != SERVER_READY - && score_record.status != SERVER_DEAD)) + if (lres!=0 || (res != SERVER_READY && res != SERVER_DEAD)) { #ifndef NO_TIMES tu+=score_record.times.tms_utime; @@ -444,7 +447,8 @@ for (i = 0; iservers[i]; + ps_record = scoreboard_image->parent[i]; #if defined(NO_GETTIMEOFDAY) #ifndef NO_TIMES @@ -487,7 +491,7 @@ i,(int)conn_lres,my_lres,lres); else rprintf(r,"Server %d (%d): %d|%lu|%lu [", - i,(int)score_record.x.pid,(int)conn_lres,my_lres,lres); + i,(int)ps_record.pid,(int)conn_lres,my_lres,lres); switch (score_record.status) { @@ -533,7 +537,11 @@ score_record.times.tms_cutime/tick, score_record.times.tms_cstime/tick, #endif +#ifdef OPTIMIZE_TIMEOUTS + difftime(nowtime, ps_record.last_rtime), +#else difftime(nowtime, score_record.last_used), +#endif (long)req_time); format_byte_out(r,conn_bytes); rputs("|",r); @@ -552,7 +560,7 @@ i,(int)conn_lres,my_lres,lres); else rprintf(r,"%d%d%d/%lu/%lu", - i,(int)score_record.x.pid,(int)conn_lres,my_lres,lres); + i,(int)ps_record.pid,(int)conn_lres,my_lres,lres); switch (score_record.status) { @@ -597,7 +605,11 @@ score_record.times.tms_cutime + score_record.times.tms_cstime)/tick, #endif +#ifdef OPTIMIZE_TIMEOUTS + difftime(nowtime, ps_record.last_rtime), +#else difftime(nowtime, score_record.last_used), +#endif (long)req_time); rprintf(r,"%-1.1f%-2.2f%-2.2f\n", (float)conn_bytes/KBYTE, (float)my_bytes/MBYTE,